├── .lastrelease
├── .gitignore
├── docs
    ├── volk_logo.png
    ├── volk_logo_small.png
    ├── CMakeLists.txt
    ├── main_page.dox
    ├── using_volk.dox
    ├── DCO.txt
    └── CONTRIBUTING.md
├── .gitmodules
├── kernels
    ├── volk
    │   ├── asm
    │   │   ├── orc
    │   │   │   ├── volk_16u_byteswap_a_orc_impl.orc
    │   │   │   ├── volk_32f_x2_add_32f_a_orc_impl.orc
    │   │   │   ├── volk_32f_x2_max_32f_a_orc_impl.orc
    │   │   │   ├── volk_32f_x2_min_32f_a_orc_impl.orc
    │   │   │   ├── volk_32f_s32f_add_32f_a_orc_impl.orc
    │   │   │   ├── volk_32f_x2_divide_32f_a_orc_impl.orc
    │   │   │   ├── volk_32i_x2_and_32i_a_orc_impl.orc
    │   │   │   ├── volk_32i_x2_or_32i_a_orc_impl.orc
    │   │   │   ├── volk_32f_x2_multiply_32f_a_orc_impl.orc
    │   │   │   ├── volk_32f_x2_subtract_32f_a_orc_impl.orc
    │   │   │   ├── volk_8i_convert_16i_a_orc_impl.orc
    │   │   │   ├── volk_32f_s32f_multiply_32f_a_orc_impl.orc
    │   │   │   ├── volk_32f_s32f_normalize_a_orc_impl.orc
    │   │   │   ├── volk_16ic_deinterleave_16i_x2_a_orc_impl.orc
    │   │   │   ├── volk_16ic_deinterleave_real_8i_a_orc_impl.orc
    │   │   │   ├── volk_32fc_32f_multiply_32fc_a_orc_impl.orc
    │   │   │   ├── volk_8i_s32f_convert_32f_a_orc_impl.orc
    │   │   │   ├── volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl.orc
    │   │   │   └── volk_32fc_x2_multiply_32fc_a_orc_impl.orc
    │   │   ├── neon
    │   │   │   ├── volk_32f_s32f_multiply_32f_a_neonasm.s
    │   │   │   ├── volk_32f_x2_add_32f_a_neonasm.s
    │   │   │   ├── volk_32fc_x2_multiply_32fc_a_neonasm.s
    │   │   │   ├── volk_32f_x2_dot_prod_32f_a_neonasm.s
    │   │   │   ├── volk_16i_max_star_horizontal_16i.s
    │   │   │   ├── volk_32f_x2_add_32f_a_neonpipeline.s
    │   │   │   ├── volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s
    │   │   │   ├── volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests.s
    │   │   │   ├── volk_32fc_32f_dot_prod_32fc_a_neonasm.s
    │   │   │   ├── volk_32fc_32f_dot_prod_32fc_a_neonpipeline.s
    │   │   │   ├── volk_32fc_x2_dot_prod_32fc_a_neonasm.s
    │   │   │   └── volk_32f_x2_dot_prod_32f_a_neonasm_opts.s
    │   │   └── riscv
    │   │   │   ├── volk_32f_s32f_multiply_32f_sifive_u74.s
    │   │   │   └── volk_32fc_x2_dot_prod_32fc_sifive_u74.s
    │   ├── volk_32f_null_32f.h
    │   ├── volk_32fc_s32f_power_spectral_densitypuppet_32f.h
    │   ├── volk_32fc_s32f_x2_power_spectral_density_32f.h
    │   ├── volk_32f_s32f_power_32f.h
    │   ├── volk_32fc_s32f_power_32fc.h
    │   ├── volk_32u_popcntpuppet_32u.h
    │   ├── volk_64u_popcntpuppet_64u.h
    │   ├── volk_32u_popcnt.h
    │   ├── volk_16i_max_star_16i.h
    │   └── volk_32f_s32f_clamppuppet_32f.h
    └── README.md
├── cmake
    ├── Checks
    │   └── check-rvv-intrinsics.c
    ├── Toolchains
    │   ├── intel-sde.cmake
    │   ├── arm_cortex_a76_hardfp_native.cmake
    │   ├── arm_cortex_a8_softfp_native.cmake
    │   ├── arm_cortex_a15_hardfp_native.cmake
    │   ├── arm_cortex_a8_hardfp_native.cmake
    │   ├── arm_cortex_a9_hardfp_native.cmake
    │   ├── arm_cortex_a72_hardfp_native.cmake
    │   ├── arm_cortex_a53_hardfp_native.cmake
    │   ├── rv64gcv-linux-gnu.cmake
    │   ├── oe-sdk_cross.cmake
    │   ├── aarch64-linux-gnu.cmake
    │   └── arm-linux-gnueabihf.cmake
    ├── Modules
    │   ├── VolkConfigVersion.cmake.in
    │   ├── FindORC.cmake
    │   ├── VolkConfig.cmake.in
    │   └── VolkVersion.cmake
    ├── cmake_uninstall.cmake.in
    └── msvc
    │   ├── sys
    │       └── time.h
    │   └── config.h
├── python
    └── volk_modtool
    │   ├── __init__.py
    │   ├── CMakeLists.txt
    │   └── cfg.py
├── tmpl
    ├── volk.pc.in
    ├── volk_config_fixed.tmpl.h
    ├── volk_typedefs.tmpl.h
    ├── volk_machines.tmpl.c
    ├── volk_cpu.tmpl.h
    ├── volk_machines.tmpl.h
    ├── volk_machine_xxx.tmpl.c
    ├── volk_cpu.tmpl.c
    └── volk.tmpl.h
├── scripts
    ├── tools
    │   ├── update_citations.sh
    │   └── compare_volk_profiles
    ├── ci
    │   └── download_intel_sde.sh
    └── licensing
    │   └── count_contrib.sh
├── include
    └── volk
    │   ├── constants.h
    │   ├── saturation_arithmetic.h
    │   ├── volk_version.h.in
    │   ├── volk_prefs.h
    │   ├── volk_alloc.hh
    │   ├── volk_malloc.h
    │   ├── volk_avx2_fma_intrinsics.h
    │   ├── volk_sse3_intrinsics.h
    │   ├── volk_complex.h
    │   └── volk_rvv_intrinsics.h
├── .github
    └── workflows
    │   ├── check-pr-formatting.yml
    │   ├── publish_docs.yml
    │   ├── run-tests-rvv.yml
    │   └── android_build.yml
├── lib
    ├── constants.c.in
    ├── volk_rank_archs.h
    ├── volk_malloc.c
    ├── volk_prefs.c
    └── volk_rank_archs.c
├── apps
    ├── volk_profile.h
    ├── plot_best_vs_generic.py
    ├── volk_option_helpers.h
    ├── volk-config-info.cc
    └── CMakeLists.txt
├── .gitlab-ci.yml
├── tests
    ├── CMakeLists.txt
    ├── test_volk_32fc_x2_multiply_32fc.cc
    └── volk_test.cc
├── gen
    ├── volk_tmpl_utils.py
    ├── volk_compile_utils.py
    ├── volk_machine_defs.py
    ├── machines.xml
    └── volk_arch_defs.py
├── appveyor.yml
├── .travis.yml
├── .clang-format
├── .cmake-format.py
└── .mailmap


/.lastrelease:
--------------------------------------------------------------------------------
1 | v3.2.0
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | *.pyc
3 | *.pyo
4 | *build*/
5 | archives/
6 | 


--------------------------------------------------------------------------------
/docs/volk_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gnuradio/volk/HEAD/docs/volk_logo.png


--------------------------------------------------------------------------------
/docs/volk_logo_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gnuradio/volk/HEAD/docs/volk_logo_small.png


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "cpu_features"]
2 | 	path = cpu_features
3 | 	url = https://github.com/google/cpu_features.git
4 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_16u_byteswap_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_16u_byteswap_a_orc_impl
2 | .dest 2 dst uint16_t
3 | swapw dst, dst
4 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32f_x2_add_32f_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32f_x2_add_32f_a_orc_impl
2 | .dest 4 dst float
3 | .source 4 src1 float
4 | .source 4 src2 float
5 | addf dst, src1, src2
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32f_x2_max_32f_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32f_x2_max_32f_a_orc_impl
2 | .dest 4 dst float
3 | .source 4 src1 float
4 | .source 4 src2 float
5 | maxf dst, src1, src2
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32f_x2_min_32f_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32f_x2_min_32f_a_orc_impl
2 | .dest 4 dst float
3 | .source 4 src1 float
4 | .source 4 src2 float
5 | minf dst, src1, src2
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32f_s32f_add_32f_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32f_s32f_add_32f_a_orc_impl
2 | .dest 4 dst float
3 | .source 4 src1 float
4 | .floatparam 4 scalar
5 | addf dst, src1, scalar
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32f_x2_divide_32f_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32f_x2_divide_32f_a_orc_impl
2 | .dest 4 dst float
3 | .source 4 src1 float
4 | .source 4 src2 float
5 | divf dst, src1, src2
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32i_x2_and_32i_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32i_x2_and_32i_a_orc_impl
2 | .dest 4 dst int32_t
3 | .source 4 src1 int32_t
4 | .source 4 src2 int32_t
5 | andl dst, src1, src2
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32i_x2_or_32i_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32i_x2_or_32i_a_orc_impl
2 | .dest 4 dst int32_t
3 | .source 4 src1 int32_t
4 | .source 4 src2 int32_t
5 | orl dst, src1, src2
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32f_x2_multiply_32f_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32f_x2_multiply_32f_a_orc_impl
2 | .dest 4 dst float
3 | .source 4 src1 float
4 | .source 4 src2 float
5 | mulf dst, src1, src2
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32f_x2_subtract_32f_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32f_x2_subtract_32f_a_orc_impl
2 | .dest 4 dst float
3 | .source 4 src1 float
4 | .source 4 src2 float
5 | subf dst, src1, src2
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_8i_convert_16i_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_8i_convert_16i_a_orc_impl
2 | .source 1 src int8_t
3 | .dest 2 dst int16_t
4 | .temp 2 tmp
5 | convsbw tmp, src
6 | shlw dst, tmp, 8
7 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32f_s32f_multiply_32f_a_orc_impl
2 | .dest 4 dst float
3 | .source 4 src1 float
4 | .floatparam 4 scalar
5 | mulf dst, src1, scalar
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32f_s32f_normalize_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32f_s32f_normalize_a_orc_impl
2 | .source 4 src1 float
3 | .floatparam 4 invscalar
4 | .dest 4 dst float
5 | mulf dst, src1, invscalar
6 | 


--------------------------------------------------------------------------------
/cmake/Checks/check-rvv-intrinsics.c:
--------------------------------------------------------------------------------
1 | #if (__riscv_v_intrinsic >= 1000000 || __clang_major__ >= 18 || __GNUC__ >= 14)
2 | int main() { return 0; }
3 | #else
4 | #error "rvv intrinsics aren't supported"
5 | #endif
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_16ic_deinterleave_16i_x2_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_16ic_deinterleave_16i_x2_a_orc_impl
2 | .dest 2 idst int16_t
3 | .dest 2 qdst int16_t
4 | .source 4 src lv_16sc_t
5 | splitlw qdst, idst, src
6 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_16ic_deinterleave_real_8i_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_16ic_deinterleave_real_8i_a_orc_impl
2 | .dest 1 dst int8_t
3 | .source 4 src lv_16sc_t
4 | .temp 2 iw
5 | select0lw iw, src
6 | convhwb dst, iw
7 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32fc_32f_multiply_32fc_a_orc_impl.orc:
--------------------------------------------------------------------------------
1 | .function volk_32fc_32f_multiply_32fc_a_orc_impl
2 | .source 8 src1 lv_32fc_t
3 | .source 4 src2 float
4 | .dest 8 dst lv_32fc_t
5 | .temp 8 tmp
6 | mergelq tmp, src2, src2
7 | x2 mulf dst, src1, tmp
8 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_8i_s32f_convert_32f_a_orc_impl.orc:
--------------------------------------------------------------------------------
 1 | # Convert signed 8-bit samples to float and multiply each by `scalar`.
 2 | # The source is declared int8_t so the generated prototype agrees with the
 3 | # sign-extending convsbw/convswl chain below and with the signed "8i" input.
 4 | .function volk_8i_s32f_convert_32f_a_orc_impl
 5 | .source 1 src int8_t
 6 | .dest 4 dst float
 7 | .floatparam 4 scalar
 8 | .temp 4 flsrc
 9 | .temp 4 lsrc
10 | .temp 2 ssrc
11 | convsbw ssrc, src
12 | convswl lsrc, ssrc
13 | convlf flsrc, lsrc
14 | mulf dst, flsrc, scalar
15 | 


--------------------------------------------------------------------------------
/python/volk_modtool/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | #
 4 | # Copyright 2013, 2014 Free Software Foundation, Inc.
 5 | #
 6 | # This file is part of VOLK
 7 | #
 8 | # SPDX-License-Identifier: LGPL-3.0-or-later
 9 | #
10 | 
11 | from .cfg import volk_modtool_config
12 | from .volk_modtool_generate import volk_modtool
13 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl.orc:
--------------------------------------------------------------------------------
 1 | .function volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl
 2 | .dest 4 idst float
 3 | .dest 4 qdst float
 4 | .source 4 src lv_16sc_t
 5 | .floatparam 4 scalar
 6 | .temp 8 iql
 7 | .temp 8 iqf
 8 | 
 9 | x2 convswl iql, src
10 | x2 convlf iqf, iql
11 | x2 divf iqf, iqf, scalar
12 | splitql qdst, idst, iqf
13 | 


--------------------------------------------------------------------------------
/tmpl/volk.pc.in:
--------------------------------------------------------------------------------
 1 | prefix=@CMAKE_INSTALL_PREFIX@
 2 | exec_prefix=@CMAKE_INSTALL_PREFIX@
 3 | libdir=@CMAKE_INSTALL_FULL_LIBDIR@
 4 | includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 5 | LV_CXXFLAGS=@LV_CXXFLAGS@
 6 | 
 7 | 
 8 | Name: volk
 9 | Description: VOLK: Vector Optimized Library of Kernels
10 | Requires:
11 | Version: @SOVERSION@
12 | Libs: -L${libdir} -lvolk
13 | Cflags: -I${includedir} ${LV_CXXFLAGS}
14 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/intel-sde.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2019 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=knl")
10 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=knl")
11 | set(CMAKE_CROSSCOMPILING_EMULATOR
12 |     "$ENV{TRAVIS_BUILD_DIR}/cache/$ENV{SDE_VERSION}/sde64 -knl --")
13 | 


--------------------------------------------------------------------------------
/tmpl/volk_config_fixed.tmpl.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2011-2012 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_VOLK_CONFIG_FIXED_H
11 | #define INCLUDED_VOLK_CONFIG_FIXED_H
12 | 
13 | %for i, arch in enumerate(archs):
14 | #define LV_${arch.name.upper()} ${i}
15 | %endfor
16 | 
17 | #endif /*INCLUDED_VOLK_CONFIG_FIXED_H*/
18 | 


--------------------------------------------------------------------------------
/tmpl/volk_typedefs.tmpl.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2011-2012 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_VOLK_TYPEDEFS
11 | #define INCLUDED_VOLK_TYPEDEFS
12 | 
13 | #include <inttypes.h>
14 | #include <volk/volk_complex.h>
15 | 
16 | %for kern in kernels:
17 | typedef void (*${kern.pname})(${kern.arglist_types});
18 | %endfor
19 | 
20 | #endif /*INCLUDED_VOLK_TYPEDEFS*/
21 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/orc/volk_32fc_x2_multiply_32fc_a_orc_impl.orc:
--------------------------------------------------------------------------------
 1 | .function volk_32fc_x2_multiply_32fc_a_orc_impl
 2 | .source 8 src1 lv_32fc_t
 3 | .source 8 src2 lv_32fc_t
 4 | .dest 8 dst lv_32fc_t
 5 | .temp 8 iqprod
 6 | .temp 4 real
 7 | .temp 4 imag
 8 | .temp 4 ac
 9 | .temp 4 bd
10 | .temp 8 swapped
11 | x2 mulf iqprod, src1, src2
12 | splitql bd, ac, iqprod
13 | subf real, ac, bd
14 | swaplq swapped, src1
15 | x2 mulf iqprod, swapped, src2
16 | splitql bd, ac, iqprod
17 | addf imag, ac, bd
18 | mergelq dst, real, imag
19 | 


--------------------------------------------------------------------------------
/scripts/tools/update_citations.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Copyright 2022 Johannes Demel
 4 | #
 5 | # This script is part of VOLK.
 6 | #
 7 | # SPDX-License-Identifier: LGPL-3.0-or-later
 8 | #
 9 | # Find all contributors according to git and update `.zenodo.json` accordingly.
10 | 
11 | # Locate the helper script next to this script, whatever the caller's cwd is.
12 | script_full_path=$(dirname "$0")
13 | python_script="$script_full_path/run_citations_update.py"
14 | 
15 | contributors_list="$(git log --pretty="%an <%ae>" | sort | uniq)"
16 | 
17 | # Run a Python script to make things easier (quoted: the path may contain spaces).
18 | python3 "$python_script" "$contributors_list"
19 | 


--------------------------------------------------------------------------------
/tmpl/volk_machines.tmpl.c:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2011-2012 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #include <volk/volk_common.h>
11 | #include <volk/volk_typedefs.h>
12 | #include "volk_machines.h"
13 | 
14 | struct volk_machine *volk_machines[] = {
15 | %for machine in machines:
16 | #ifdef LV_MACHINE_${machine.name.upper()}
17 | &volk_machine_${machine.name},
18 | #endif
19 | %endfor
20 | };
21 | 
22 | unsigned int n_volk_machines = sizeof(volk_machines)/sizeof(*volk_machines);
23 | 


--------------------------------------------------------------------------------
/tmpl/volk_cpu.tmpl.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2011-2012 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_VOLK_CPU_H
11 | #define INCLUDED_VOLK_CPU_H
12 | 
13 | #include <volk/volk_common.h>
14 | 
15 | __VOLK_DECL_BEGIN
16 | 
17 | struct VOLK_CPU {
18 |     %for arch in archs:
19 |     int (*has_${arch.name}) ();
20 |     %endfor
21 | };
22 | 
23 | extern struct VOLK_CPU volk_cpu;
24 | 
25 | void volk_cpu_init ();
26 | unsigned int volk_get_lvarch ();
27 | 
28 | __VOLK_DECL_END
29 | 
30 | #endif /*INCLUDED_VOLK_CPU_H*/
31 | 


--------------------------------------------------------------------------------
/docs/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # Copyright 2022 Johannes Demel.
 4 | #
 5 | # SPDX-License-Identifier: LGPL-3.0-or-later
 6 | #
 7 | 
 8 | find_package(Doxygen)
 9 | if(DOXYGEN_FOUND)
10 | 
11 |     message(STATUS "Doxygen found. Building docs ...")
12 | 
13 |     configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${CMAKE_BINARY_DIR}/Doxyfile
14 |                    @ONLY)
15 | 
16 |     add_custom_target(
17 |         volk_doc
18 |         ${DOXYGEN_EXECUTABLE} ${CMAKE_BINARY_DIR}/Doxyfile
19 |         WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
20 |         COMMENT "Generating documentation with Doxygen"
21 |         VERBATIM)
22 | 
23 | endif(DOXYGEN_FOUND)
24 | 


--------------------------------------------------------------------------------
/include/volk/constants.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2006,2009,2013 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_VOLK_CONSTANTS_H
11 | #define INCLUDED_VOLK_CONSTANTS_H
12 | 
13 | #include <volk/volk_common.h>
14 | 
15 | __VOLK_DECL_BEGIN
16 | 
17 | VOLK_API const char* volk_prefix();
18 | VOLK_API const char* volk_version();
19 | VOLK_API const char* volk_c_compiler();
20 | VOLK_API const char* volk_compiler_flags();
21 | VOLK_API const char* volk_available_machines();
22 | 
23 | __VOLK_DECL_END
24 | 
25 | #endif /* INCLUDED_VOLK_CONSTANTS_H */
26 | 


--------------------------------------------------------------------------------
/python/volk_modtool/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2013, 2014 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | ########################################################################
10 | # Install python files and apps
11 | ########################################################################
12 | include(VolkPython)
13 | 
14 | volk_python_install(
15 |     FILES
16 |     __init__.py
17 |     cfg.py
18 |     volk_modtool_generate.py
19 |     DESTINATION
20 |     ${VOLK_PYTHON_DIR}/volk_modtool
21 |     COMPONENT
22 |     "volk")
23 | 
24 | volk_python_install(PROGRAMS volk_modtool DESTINATION ${VOLK_RUNTIME_DIR} COMPONENT
25 |                     "volk")
26 | 


--------------------------------------------------------------------------------
/.github/workflows/check-pr-formatting.yml:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2020, 2022 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | name: Check PR Formatting
10 | 
11 | on:
12 |   push:
13 |     paths-ignore:
14 |       - 'tmpl/**' # '**' is required to match the files below tmpl/
15 |       - 'include/volk/sse2neon.h'
16 |   pull_request:
17 |     paths-ignore:
18 |       - 'tmpl/**' # '**' is required to match the files below tmpl/
19 |       - 'include/volk/sse2neon.h'
20 | 
21 | jobs:
22 |   build:
23 |     runs-on: ubuntu-latest
24 | 
25 |     steps:
26 |     - uses: actions/checkout@v4
27 |     - uses: gnuradio/clang-format-lint-action@v0.5-4
28 |       with:
29 |         source: '.'
30 |         exclude: './tmpl,./include/volk/sse2neon.h'
31 |         extensions: 'c,cc,cpp,cxx,h,hh'
32 | 
33 | 


--------------------------------------------------------------------------------
/docs/main_page.dox:
--------------------------------------------------------------------------------
 1 | /*! \mainpage VOLK
 2 | 
 3 | Welcome to VOLK!
 4 | 
 5 | VOLK is the Vector-Optimized Library of Kernels. It is a library that contains
 6 | kernels of hand-written SIMD code for different mathematical operations. Since
 7 | each SIMD architecture can be very different and no compiler has yet come along
 8 | to handle vectorization properly or highly efficiently, VOLK approaches the
 9 | problem differently.
10 | 
11 | For each architecture or platform that a developer wishes to vectorize for, a
12 | new proto-kernel is added to VOLK. At runtime, VOLK will select the correct
13 | proto-kernel. In this way, the users of VOLK call a kernel for performing the
14 | operation that is platform/architecture agnostic. This allows us to write
15 | portable SIMD code.
16 | 
17 | */
18 | 


--------------------------------------------------------------------------------
/kernels/volk/volk_32f_null_32f.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2014 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #include <inttypes.h>
11 | #include <math.h>
12 | #include <stdio.h>
13 | 
14 | #ifndef INCLUDED_volk_32f_null_32f_a_H
15 | #define INCLUDED_volk_32f_null_32f_a_H
16 | 
17 | #ifdef LV_HAVE_GENERIC
18 | 
19 | static inline void
20 | volk_32f_null_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
21 | {
22 |     /* Copy num_points floats from aVector into bVector, one element at a time. */
23 |     unsigned int idx = 0;
24 | 
25 |     while (idx < num_points) {
26 |         bVector[idx] = aVector[idx];
27 |         idx++;
28 |     }
29 | }
30 | #endif /* LV_HAVE_GENERIC */
31 | 
32 | #endif /* INCLUDED_volk_32f_null_32f_a_H */
33 | 


--------------------------------------------------------------------------------
/cmake/Modules/VolkConfigVersion.cmake.in:
--------------------------------------------------------------------------------
 1 | # Copyright 2014, 2015, 2018, 2020 Free Software Foundation, Inc.
 2 | #
 3 | # This file is part of VOLK.
 4 | #
 5 | # SPDX-License-Identifier: LGPL-3.0-or-later
 6 | #
 7 | 
 8 | set(MAJOR_VERSION @VERSION_INFO_MAJOR_VERSION@)
 9 | set(MINOR_VERSION @VERSION_INFO_MINOR_VERSION@)
10 | set(MAINT_VERSION @VERSION_INFO_MAINT_VERSION@)
11 | 
12 | set(PACKAGE_VERSION ${MAJOR_VERSION}.${MINOR_VERSION}.${MAINT_VERSION})
13 | 
14 | if(${PACKAGE_FIND_VERSION_MAJOR} EQUAL ${MAJOR_VERSION})
15 |     if(${PACKAGE_FIND_VERSION_MINOR} EQUAL ${MINOR_VERSION})
16 |         if(NOT ${PACKAGE_FIND_VERSION_PATCH} GREATER ${MAINT_VERSION})
17 |             set(PACKAGE_VERSION_EXACT 1) # exact match for API version
18 |             set(PACKAGE_VERSION_COMPATIBLE 1) # compat for minor/patch version
19 |         endif()
20 |     endif()
21 | endif()
22 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/arm_cortex_a76_hardfp_native.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # This file is part of VOLK
 3 | #
 4 | # SPDX-License-Identifier: LGPL-3.0-or-later
 5 | #
 6 | 
 7 | ########################################################################
 8 | # Toolchain file for building native on a ARM Cortex A76 w/ NEON
 9 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
10 | ########################################################################
11 | set(CMAKE_CXX_COMPILER g++)
12 | set(CMAKE_C_COMPILER gcc)
13 | set(CMAKE_CXX_FLAGS
14 |     "-march=armv8.2-a -mtune=cortex-a76 -mfpu=neon-fp-armv8 -mfloat-abi=hard"
15 |     CACHE STRING "" FORCE)
16 | set(CMAKE_C_FLAGS
17 |     ${CMAKE_CXX_FLAGS}
18 |     CACHE STRING "" FORCE) #same flags for C sources
19 | set(CMAKE_ASM_FLAGS
20 |     "${CMAKE_CXX_FLAGS} -mthumb -g"
21 |     CACHE STRING "" FORCE) #same flags for asm sources
22 | 


--------------------------------------------------------------------------------
/lib/constants.c.in:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2013, 2014 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #if HAVE_CONFIG_H
11 | #include <config.h>
12 | #endif
13 | 
14 | #include <stdlib.h>
15 | #include <volk/constants.h>
16 | 
17 | /* Return the install prefix; the VOLK_PREFIX environment variable wins if set. */
18 | const char* volk_prefix()
19 | {
20 |   const char *env_prefix = getenv("VOLK_PREFIX");
21 | 
22 |   return (env_prefix != NULL) ? env_prefix : "@prefix@";
23 | }
24 | 
25 | /* Return the version string baked into this build. */
26 | const char* volk_version()
27 | {
28 |   return "@VERSION@";
29 | }
30 | 
31 | /* Return the C compiler version string baked into this build. */
32 | const char* volk_c_compiler()
33 | {
34 |   return "@cmake_c_compiler_version@";
35 | }
36 | 
37 | /* Return the compiler-info string baked into this build. */
38 | const char* volk_compiler_flags()
39 | {
40 |   return "@COMPILER_INFO@";
41 | }
42 | 
43 | /* Return the available-machines string baked into this build. */
44 | const char* volk_available_machines()
45 | {
46 |   return "@available_machines@";
47 | }
48 | 


--------------------------------------------------------------------------------
/apps/volk_profile.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2012-2014 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #include <stdbool.h> // for bool
11 | #include <iosfwd>    // for ofstream
12 | #include <string>    // for string
13 | #include <vector>    // for vector
14 | 
15 | class volk_test_results_t;
16 | 
17 | void read_results(std::vector<volk_test_results_t>* results);
18 | void read_results(std::vector<volk_test_results_t>* results, std::string path);
19 | void write_results(const std::vector<volk_test_results_t>* results, bool update_result);
20 | void write_results(const std::vector<volk_test_results_t>* results,
21 |                    bool update_result,
22 |                    const std::string path);
23 | void write_json(std::ofstream& json_file, std::vector<volk_test_results_t> results);
24 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/arm_cortex_a8_softfp_native.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2014, 2019 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | ########################################################################
10 | # Toolchain file for building native on a ARM Cortex A8 w/ NEON
11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
12 | ########################################################################
13 | set(CMAKE_CXX_COMPILER g++)
14 | set(CMAKE_C_COMPILER gcc)
15 | set(CMAKE_CXX_FLAGS
16 |     "-march=armv7-a -mtune=cortex-a8 -mfpu=neon -mfloat-abi=softfp"
17 |     CACHE STRING "" FORCE)
18 | set(CMAKE_C_FLAGS
19 |     ${CMAKE_CXX_FLAGS}
20 |     CACHE STRING "" FORCE) #same flags for C sources
21 | set(CMAKE_ASM_FLAGS
22 |     "${CMAKE_CXX_FLAGS} -g"
23 |     CACHE STRING "" FORCE) #same flags for asm sources
24 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/arm_cortex_a15_hardfp_native.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2014, 2018, 2019 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | ########################################################################
10 | # Toolchain file for building native on an ARM Cortex A15 w/ NEON
11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
12 | ########################################################################
13 | set(CMAKE_CXX_COMPILER g++)
14 | set(CMAKE_C_COMPILER gcc)
15 | set(CMAKE_CXX_FLAGS
16 |     "-march=armv7-a -mtune=cortex-a15 -mfpu=neon -mfloat-abi=hard"
17 |     CACHE STRING "" FORCE)
18 | set(CMAKE_C_FLAGS
19 |     ${CMAKE_CXX_FLAGS}
20 |     CACHE STRING "" FORCE) #same flags for C sources
21 | set(CMAKE_ASM_FLAGS
22 |     "${CMAKE_CXX_FLAGS} -g"
23 |     CACHE STRING "" FORCE) #same flags for asm sources
24 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/arm_cortex_a8_hardfp_native.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2014, 2018, 2019 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | ########################################################################
10 | # Toolchain file for building native on a ARM Cortex A8 w/ NEON
11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
12 | ########################################################################
13 | set(CMAKE_CXX_COMPILER g++)
14 | set(CMAKE_C_COMPILER gcc)
15 | set(CMAKE_CXX_FLAGS
16 |     "-march=armv7-a -mtune=cortex-a8 -mfpu=neon -mfloat-abi=hard"
17 |     CACHE STRING "" FORCE)
18 | set(CMAKE_C_FLAGS
19 |     ${CMAKE_CXX_FLAGS}
20 |     CACHE STRING "" FORCE) #same flags for C sources
21 | set(CMAKE_ASM_FLAGS
22 |     "${CMAKE_CXX_FLAGS} -g"
23 |     CACHE STRING "" FORCE) #same flags for asm sources
24 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/arm_cortex_a9_hardfp_native.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2014, 2018, 2019 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | ########################################################################
10 | # Toolchain file for building native on a ARM Cortex A9 w/ NEON
11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
12 | ########################################################################
13 | set(CMAKE_CXX_COMPILER g++)
14 | set(CMAKE_C_COMPILER gcc)
15 | set(CMAKE_CXX_FLAGS
16 |     "-march=armv7-a -mtune=cortex-a9 -mfpu=neon -mfloat-abi=hard"
17 |     CACHE STRING "" FORCE)
18 | set(CMAKE_C_FLAGS
19 |     ${CMAKE_CXX_FLAGS}
20 |     CACHE STRING "" FORCE) #same flags for C sources
21 | set(CMAKE_ASM_FLAGS
22 |     "${CMAKE_CXX_FLAGS} -g"
23 |     CACHE STRING "" FORCE) #same flags for asm sources
24 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/arm_cortex_a72_hardfp_native.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2014, 2018, 2019 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | ########################################################################
10 | # Toolchain file for building native on a ARM Cortex A72 w/ NEON
11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
12 | ########################################################################
13 | set(CMAKE_CXX_COMPILER g++)
14 | set(CMAKE_C_COMPILER gcc)
15 | set(CMAKE_CXX_FLAGS
16 |     "-march=armv8-a -mtune=cortex-a72 -mfpu=neon-fp-armv8 -mfloat-abi=hard"
17 |     CACHE STRING "" FORCE)
18 | set(CMAKE_C_FLAGS
19 |     ${CMAKE_CXX_FLAGS}
20 |     CACHE STRING "" FORCE) #same flags for C sources
21 | set(CMAKE_ASM_FLAGS
22 |     "${CMAKE_CXX_FLAGS} -mthumb -g"
23 |     CACHE STRING "" FORCE) #same flags for asm sources
24 | 


--------------------------------------------------------------------------------
/scripts/ci/download_intel_sde.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Copyright 2019, 2020 Christoph Mayer
 4 | #
 5 | # This script is part of VOLK.
 6 | #
 7 | # SPDX-License-Identifier: LGPL-3.0-or-later
 8 | 
 9 | set -e
10 | set -x
11 | 
12 | function test_sde
13 | { # prints "1" when ${SDE} is not a regular file, otherwise the exit status of a trial "${SDE} -- ls" run
14 |     if [ -f ${SDE} ]; then
15 |         ${SDE} -- ls > /dev/null
16 |         echo $?
17 |     else
18 |         echo "1"
19 |     fi
20 | }
21 | 
22 | mkdir -p cache
23 | cd cache
24 | 
25 | [ -z "${SDE_VERSION}" ] && SDE_VERSION=sde-external-8.50.0-2020-03-26-lin
26 | [ -z "${SDE_URL}" ] && SDE_URL=http://software.intel.com/content/dam/develop/external/us/en/protected/
27 | [ -z "${SDE}" ] && SDE=${SDE_VERSION}/sde64
28 | 
29 | 
30 | if [ _$(test_sde) == _0 ]; then
31 |     MSG="found working version: ${SDE_VERSION}"
32 | else
33 |     MSG="downloading: ${SDE_VERSION}"
34 |     wget ${SDE_URL}/${SDE_VERSION}.tar.bz2
35 |     tar xvf ${SDE_VERSION}.tar.bz2
36 | fi
37 | 
38 | echo $SDE
39 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/arm_cortex_a53_hardfp_native.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2014, 2018, 2019, 2021 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | ########################################################################
10 | # Toolchain file for building native on a ARM Cortex A53 w/ NEON
11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
12 | ########################################################################
13 | set(CMAKE_CXX_COMPILER g++)
14 | set(CMAKE_C_COMPILER gcc)
15 | set(CMAKE_CXX_FLAGS
16 |     "-march=armv8-a -mtune=cortex-a53 -mfpu=neon-fp-armv8 -mfloat-abi=hard"
17 |     CACHE STRING "" FORCE)
18 | set(CMAKE_C_FLAGS
19 |     ${CMAKE_CXX_FLAGS}
20 |     CACHE STRING "" FORCE) #same flags for C sources
21 | set(CMAKE_ASM_FLAGS
22 |     "${CMAKE_CXX_FLAGS} -mthumb -g"
23 |     CACHE STRING "" FORCE) #same flags for asm sources
24 | 


--------------------------------------------------------------------------------
/include/volk/saturation_arithmetic.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2016 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | 
11 | #ifndef INCLUDED_volk_saturation_arithmetic_H_
12 | #define INCLUDED_volk_saturation_arithmetic_H_
13 | 
14 | #include <limits.h>
15 | #include <stdint.h> /* int16_t, int32_t: keeps this header self-contained */
16 | 
17 | /* Add x and y in 32-bit precision and clamp the sum to [SHRT_MIN, SHRT_MAX]. */
18 | static inline int16_t sat_adds16i(int16_t x, int16_t y)
19 | {
20 |     const int32_t sum = (int32_t)x + (int32_t)y;
21 | 
22 |     if (sum < SHRT_MIN)
23 |         return SHRT_MIN;
24 |     if (sum > SHRT_MAX)
25 |         return SHRT_MAX;
26 |     return (int16_t)sum; /* in range here, so the narrowing cast is lossless */
27 | }
28 | 
29 | /* Multiply x by y in 32-bit precision and clamp the product to [SHRT_MIN, SHRT_MAX]. */
30 | static inline int16_t sat_muls16i(int16_t x, int16_t y)
31 | {
32 |     const int32_t product = (int32_t)x * (int32_t)y;
33 | 
34 |     if (product < SHRT_MIN)
35 |         return SHRT_MIN;
36 |     if (product > SHRT_MAX)
37 |         return SHRT_MAX;
38 |     return (int16_t)product; /* in range here, so the narrowing cast is lossless */
39 | }
40 | 
41 | #endif /* INCLUDED_volk_saturation_arithmetic_H_ */


--------------------------------------------------------------------------------
/include/volk/volk_version.h.in:
--------------------------------------------------------------------------------
 1 | /* -*- C -*- */
 2 | /*
 3 |  * Copyright 2020 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_VOLK_VERSION_H
11 | #define INCLUDED_VOLK_VERSION_H
12 | 
13 | #include <volk/volk_common.h>
14 | 
15 | __VOLK_DECL_BEGIN
16 | 
17 | /*
18 |  * define macros for the Volk version, which can then be used by any
19 |  * project that #include's this header, e.g., to determine whether
20 |  * some specific API is present and functional.
21 |  */
22 | 
23 | #define VOLK_VERSION_MAJOR @VERSION_INFO_MAJOR_VERSION@
24 | #define VOLK_VERSION_MINOR @VERSION_INFO_MINOR_VERSION@
25 | #define VOLK_VERSION_MAINT @VERSION_INFO_MAINT_VERSION@
26 | 
27 | /*
28 |  * VOLK_VERSION % 100 is the MAINT version
29 |  * (VOLK_VERSION / 100) % 100 is the MINOR version
30 |  * (VOLK_VERSION / 100) / 100 is the MAJOR version
31 |  */
32 | 
33 | #define VOLK_VERSION @VOLK_VERSION_DECIMAL@
34 | 
35 | __VOLK_DECL_END
36 | 
37 | #endif /* INCLUDED_VOLK_VERSION_H */
38 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/rv64gcv-linux-gnu.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2024 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | set(CMAKE_SYSTEM_NAME Linux)
10 | set(CMAKE_SYSTEM_PROCESSOR riscv64)
11 | 
12 | set(CMAKE_C_COMPILER $ENV{CC})
13 | set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER})
14 | set(CMAKE_CXX_COMPILER $ENV{CXX})
15 | 
16 | set(CMAKE_C_FLAGS "$ENV{CFLAGS} -march=rv64gcv" CACHE STRING "" FORCE)
17 | set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS} CACHE STRING "" FORCE)
18 | set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -g" CACHE STRING "" FORCE)
19 | 
20 | set(CMAKE_OBJCOPY
21 |     ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy
22 |     CACHE INTERNAL "objcopy tool")
23 | set(CMAKE_SIZE_UTIL
24 |     ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size
25 |     CACHE INTERNAL "size tool")
26 | 
27 | set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH})
28 | 
29 | set(QEMU_VLEN $ENV{VLEN})
30 | if(NOT QEMU_VLEN)
31 |     set(QEMU_VLEN "128")
32 | endif()
33 | 
34 | set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-riscv64-static -L /usr/riscv64-linux-gnu/ -cpu rv64,zba=true,zbb=true,v=on,vlen=${QEMU_VLEN},rvv_ta_all_1s=on,rvv_ma_all_1s=on")
35 | 


--------------------------------------------------------------------------------
/kernels/volk/volk_32fc_s32f_power_spectral_densitypuppet_32f.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2020 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | 
11 | #ifndef INCLUDED_volk_32fc_s32f_power_spectral_densitypuppet_32f_a_H
12 | #define INCLUDED_volk_32fc_s32f_power_spectral_densitypuppet_32f_a_H
13 | 
14 | 
15 | #include <volk/volk_32fc_s32f_x2_power_spectral_density_32f.h>
16 | 
17 | 
18 | #ifdef LV_HAVE_GENERIC
19 | /* Forwards to the x2 power-spectral-density generic kernel with its extra scalar fixed at 2.5 (NOTE(review): meaning of 2.5 is defined by that kernel - confirm). */
20 | static inline void
21 | volk_32fc_s32f_power_spectral_densitypuppet_32f_generic(float* logPowerOutput,
22 |                                                         const lv_32fc_t* complexFFTInput,
23 |                                                         const float normalizationFactor,
24 |                                                         unsigned int num_points)
25 | {
26 |     volk_32fc_s32f_x2_power_spectral_density_32f_generic(
27 |         logPowerOutput, complexFFTInput, normalizationFactor, 2.5, num_points);
28 | }
29 | 
30 | #endif /* LV_HAVE_GENERIC */
31 | 
32 | 
33 | #endif /* INCLUDED_volk_32fc_s32f_power_spectral_densitypuppet_32f_a_H */
34 | 


--------------------------------------------------------------------------------
/docs/using_volk.dox:
--------------------------------------------------------------------------------
 1 | /*! \page using_volk Using VOLK
 2 | 
 3 | Using VOLK in your code requires proper linking and including the correct headers. VOLK currently supports both C and C++ bindings.
 4 | 
 5 | VOLK provides both a pkgconfig and CMake module to help configuration and
 6 | linking. The pkgconfig file is installed to
 7 | $install_prefix/lib/pkgconfig/volk.pc. The CMake configuration module is in
 8 | $install_prefix/lib/cmake/volk/VolkConfig.cmake.
 9 | 
10 | The VOLK include directory (includedir in pkgconfig,
11 | VOLK_INCLUDE_DIRS in cmake module) contains the header volk/volk.h, which defines
12 | all of the symbols exposed by VOLK. Alternatively, individual kernel headers are
13 | in the same location.
14 | 
15 | In most cases it is sufficient to call the dispatcher for the kernel you are using.
16 | For example the following code will compute the dot product between the taps and
17 | input vector as part of a FIR filter.
18 | \code
19 | // assume there is sufficient history in the input buffer
20 | for(unsigned int ii=0; ii < input_length-ntaps; ++ii) {
21 |     volk_32fc_32f_dot_prod_32fc(filter_output[ii], input[ii-ntaps], taps, ntaps);
22 | }
23 | \endcode
24 | 
25 | */
26 | 
27 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/oe-sdk_cross.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2014 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | set(CMAKE_SYSTEM_NAME Linux)
10 | #set( CMAKE_C_COMPILER  $ENV{CC} )
11 | #set( CMAKE_CXX_COMPILER  $ENV{CXX} )
12 | string(REGEX MATCH "sysroots/([a-zA-Z0-9]+)" CMAKE_SYSTEM_PROCESSOR
13 |              $ENV{SDKTARGETSYSROOT})
14 | string(REGEX REPLACE "sysroots/" "" CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR})
15 | set(CMAKE_CXX_FLAGS
16 |     $ENV{CXXFLAGS}
17 |     CACHE STRING "" FORCE)
18 | set(CMAKE_C_FLAGS
19 |     $ENV{CFLAGS}
20 |     CACHE STRING "" FORCE) #same flags for C sources
21 | set(CMAKE_LDFLAGS_FLAGS
22 |     ${CMAKE_CXX_FLAGS}
23 |     CACHE STRING "" FORCE) # caches a copy of the C++ flags; NOTE(review): confirm what reads this variable
24 | set(CMAKE_LIBRARY_PATH $ENV{OECORE_TARGET_SYSROOT}/usr/lib) # sysroot comes from the SDK environment, as in the lines below
25 | set(CMAKE_FIND_ROOT_PATH $ENV{OECORE_TARGET_SYSROOT} $ENV{OECORE_NATIVE_SYSROOT})
26 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
27 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
28 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
29 | set(ORC_INCLUDE_DIRS $ENV{OECORE_TARGET_SYSROOT}/usr/include/orc-0.4)
30 | set(ORC_LIBRARY_DIRS $ENV{OECORE_TARGET_SYSROOT}/usr/lib)
31 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2018 - 2020, 2022 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | # This file is a template, and might need editing before it works on your project.
10 | # use the official gcc image, based on debian
11 | # can use versions as well, like gcc:5.2
12 | # see https://hub.docker.com/_/gcc/
13 | image: ubuntu:18.04
14 | 
15 | build:
16 |   stage: build
17 |   # instead of calling g++ directly you can also use some build toolkit like make
18 |   # install the necessary build tools when needed
19 |   before_script:
20 |      - apt update && apt -y install make cmake python python-pip && pip install mako
21 |   script:
22 |     - mkdir build && cd build && cmake .. && make -j
23 |   artifacts:
24 |     paths:
25 |       - build/
26 |   # depending on your build setup it's most likely a good idea to cache outputs to reduce the build time
27 |   # cache:
28 |   #   paths:
29 |   #     - "*.o"
30 | 
31 | # run tests using the binary built before
32 | test:
33 |   stage: test
34 |   before_script:
35 |     - apt update && apt -y install cmake python python-pip && pip install mako
36 |   script:
37 |     - cd build && ctest -V
38 | 
39 | 


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2022, 2024 Johannes Demel
 3 | #
 4 | # This file is part of VOLK.
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | if(NOT ENABLE_TESTING)
10 |     return()
11 | endif(NOT ENABLE_TESTING)
12 | 
13 | find_package(fmt)
14 | find_package(GTest)
15 | 
16 | if(NOT fmt_FOUND OR NOT GTest_FOUND) # both packages are needed to build volk_tests
17 |     message(WARNING "Missing fmtlib and/or googletest for this test suite")
18 |     return() # skip this test suite rather than failing the configure step
19 | endif(NOT fmt_FOUND OR NOT GTest_FOUND)
20 | 
21 | file(GLOB volk_test_files "test_*.cc")
22 | 
23 | add_executable(
24 |   volk_tests
25 |   volk_test.cc
26 |   ${volk_test_files}
27 | )
28 | 
29 | target_compile_features(volk_tests PUBLIC cxx_std_20)
30 | 
31 | target_link_libraries(volk_tests
32 |   PRIVATE
33 |     GTest::gtest_main
34 |     volk
35 |     fmt::fmt
36 | )
37 | 
38 | include(GoogleTest)
39 | gtest_discover_tests(volk_tests)
40 | 
41 | 
42 | target_include_directories(volk_tests
43 |     PRIVATE $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
44 |     PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
45 |     PRIVATE $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/lib>
46 |     PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lib>
47 |     PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
48 |     PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
49 | )
50 | 


--------------------------------------------------------------------------------
/lib/volk_rank_archs.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2011-2012 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_VOLK_RANK_ARCHS_H
11 | #define INCLUDED_VOLK_RANK_ARCHS_H
12 | 
13 | #include <stdbool.h>
14 | #include <stdlib.h>
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | int volk_get_index(const char* impl_names[], // list of implementations by name
21 |                    const size_t n_impls,     // number of implementations available
22 |                    const char* impl_name     // the implementation name to find
23 | );
24 | 
25 | int volk_rank_archs(const char* kern_name,    // name of the kernel to rank
26 |                     const char* impl_names[], // list of implementations by name
27 |                     const int* impl_deps,     // requirement mask per implementation
28 |                     const bool* alignment,    // alignment status of each implementation
29 |                     size_t n_impls,           // number of implementations available
30 |                     const bool align          // if false, filter aligned implementations
31 | );
32 | 
33 | #ifdef __cplusplus
34 | }
35 | #endif
36 | #endif /*INCLUDED_VOLK_RANK_ARCHS_H*/
37 | 


--------------------------------------------------------------------------------
/include/volk/volk_prefs.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2011, 2012, 2015, 2019, 2020 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_VOLK_PREFS_H
11 | #define INCLUDED_VOLK_PREFS_H
12 | 
13 | #include <stdbool.h>
14 | #include <stdlib.h>
15 | #include <volk/volk_common.h>
16 | 
17 | __VOLK_DECL_BEGIN
18 | 
19 | typedef struct volk_arch_pref {
20 |     char name[128];   // name of the kernel
21 |     char impl_a[128]; // best aligned impl
22 |     char impl_u[128]; // best unaligned impl
23 | } volk_arch_pref_t;
24 | 
25 | ////////////////////////////////////////////////////////////////////////
26 | // get path to volk_config profiling info; the second argument specifies
27 | // whether the config file should be tested for existence for reading.
28 | // on failure, \0 is written to the path argument (i.e. an empty string).
29 | ////////////////////////////////////////////////////////////////////////
30 | VOLK_API void volk_get_config_path(char*, bool);
31 | 
32 | ////////////////////////////////////////////////////////////////////////
33 | // load prefs into global prefs struct
34 | ////////////////////////////////////////////////////////////////////////
35 | VOLK_API size_t volk_load_preferences(volk_arch_pref_t**);
36 | 
37 | __VOLK_DECL_END
38 | 
39 | #endif // INCLUDED_VOLK_PREFS_H
40 | 


--------------------------------------------------------------------------------
/gen/volk_tmpl_utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Copyright 2012 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | import os
10 | import re
11 | import sys
12 | import argparse
13 | import volk_arch_defs
14 | import volk_machine_defs
15 | import volk_kernel_defs
16 | from mako.template import Template
17 | 
18 | 
19 | def __parse_tmpl(_tmpl, **kwargs):
20 |     defs = {
21 |         'archs': volk_arch_defs.archs,
22 |         'arch_dict': volk_arch_defs.arch_dict,
23 |         'machines': volk_machine_defs.machines,
24 |         'machine_dict': volk_machine_defs.machine_dict,
25 |         'kernels': volk_kernel_defs.kernels,
26 |     }
27 |     defs.update(kwargs)
28 |     _tmpl = """
29 | 
30 | /* this file was generated by volk template utils, do not edit! */
31 | 
32 | """ + _tmpl
33 |     return str(Template(_tmpl).render(**defs))
34 | 
35 | 
36 | def main():
37 |     parser = argparse.ArgumentParser()
38 |     parser.add_argument('--input', type=str)
39 |     parser.add_argument('--output', type=str)
40 |     args, extras = parser.parse_known_args()
41 | 
42 |     output = __parse_tmpl(open(args.input).read(), args=extras)
43 |     if args.output: open(args.output, 'w').write(output)
44 |     else: print(output)
45 | 
46 | 
47 | if __name__ == '__main__': 
48 |     main()
49 |     
50 | 


--------------------------------------------------------------------------------
/tmpl/volk_machines.tmpl.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2011-2012 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_LIBVOLK_MACHINES_H
11 | #define INCLUDED_LIBVOLK_MACHINES_H
12 | 
13 | #include <volk/volk_common.h>
14 | #include <volk/volk_typedefs.h>
15 | 
16 | #include <stdbool.h>
17 | #include <stdlib.h>
18 | 
19 | __VOLK_DECL_BEGIN
20 | 
21 | struct volk_machine {
22 |     const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format)
23 |     const char *name;
24 |     const size_t alignment; //the maximum byte alignment required for functions in this library
25 |     %for kern in kernels:
26 |     const char *${kern.name}_name;
27 |     const char *${kern.name}_impl_names[<%len_archs=len(archs)%>${len_archs}];
28 |     const int ${kern.name}_impl_deps[${len_archs}];
29 |     const bool ${kern.name}_impl_alignment[${len_archs}];
30 |     const ${kern.pname} ${kern.name}_impls[${len_archs}];
31 |     const size_t ${kern.name}_n_impls;
32 |     %endfor
33 | };
34 | 
35 | %for machine in machines:
36 | #ifdef LV_MACHINE_${machine.name.upper()}
37 | extern struct volk_machine volk_machine_${machine.name};
38 | #endif
39 | %endfor
40 | 
41 | __VOLK_DECL_END
42 | 
43 | #endif //INCLUDED_LIBVOLK_MACHINES_H
44 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/aarch64-linux-gnu.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2018, 2020 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | set(CMAKE_SYSTEM_NAME Linux)
10 | set(CMAKE_SYSTEM_PROCESSOR aarch64)
11 | 
12 | if(MINGW
13 |    OR CYGWIN
14 |    OR WIN32)
15 |     set(UTIL_SEARCH_CMD where)
16 | elseif(UNIX OR APPLE)
17 |     set(UTIL_SEARCH_CMD which)
18 | endif()
19 | 
20 | set(TOOLCHAIN_PREFIX aarch64-linux-gnu-)
21 | 
22 | execute_process(
23 |     COMMAND ${UTIL_SEARCH_CMD} ${TOOLCHAIN_PREFIX}gcc
24 |     OUTPUT_VARIABLE BINUTILS_PATH
25 |     OUTPUT_STRIP_TRAILING_WHITESPACE)
26 | 
27 | get_filename_component(ARM_TOOLCHAIN_DIR ${BINUTILS_PATH} DIRECTORY)
28 | 
29 | # The following is not needed on debian
30 | # Without that flag CMake is not able to pass test compilation check
31 | #set(CMAKE_EXE_LINKER_FLAGS_INIT "--specs=nosys.specs")
32 | 
33 | set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}gcc)
34 | set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER})
35 | set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}g++)
36 | 
37 | set(CMAKE_OBJCOPY
38 |     ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy
39 |     CACHE INTERNAL "objcopy tool")
40 | set(CMAKE_SIZE_UTIL
41 |     ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size
42 |     CACHE INTERNAL "size tool")
43 | 
44 | set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH})
45 | 
46 | set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-aarch64 -L /usr/aarch64-linux-gnu/")
47 | 


--------------------------------------------------------------------------------
/.github/workflows/publish_docs.yml:
--------------------------------------------------------------------------------
 1 | name: Publish docs
 2 | 
 3 | on: [push]
 4 | jobs:
 5 |   build-docs:
 6 |     name: Build VOLK docs
 7 |     runs-on: ubuntu-latest
 8 | 
 9 |     steps:
10 |       - uses: actions/checkout@v4
11 |         with:
12 |           submodules: "recursive"
13 |       - name: Install dependencies
14 |         run: sudo apt install python3-mako liborc-dev doxygen
15 |       - name: Configure
16 |         run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ..
17 |       - name: Build
18 |         run: cmake --build build --target volk_doc
19 |       - name: Setup SSH Keys
20 |         env:
21 |           SSH_AUTH_SOCK: /tmp/ssh_agent.sock
22 |         run: |
23 |           ssh-agent -a $SSH_AUTH_SOCK > /dev/null
24 |           ssh-add - <<< "${{ secrets.SSH_KEY }}"
25 |           mkdir $HOME/.ssh
26 |           echo -n "${{ secrets.SSH_KNOWN_HOST }}" > $HOME/.ssh/known_hosts
27 |       - name: Upload via SSH
28 |         env:
29 |           SSH_AUTH_SOCK: /tmp/ssh_agent.sock
30 |           TARGET_DIR: "${{ github.ref_type }}/${{ github.ref_name }}"
31 |         run: 'tar -cz build/html/ | ssh ${{ secrets.SSH_USER }}@${{ secrets.SSH_SERVER }} "mkdir -p /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); cd /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); tar --strip-components=2 -xzf -; rm -f /www/${{ env.TARGET_DIR }}/live; cd /www/${{ env.TARGET_DIR }}; ln -sf $(date +%Y.%m.%d) live;"'
32 |       - uses: actions/upload-artifact@v4
33 |         with:
34 |           name: volk_docs
35 |           path: build/html/
36 | 


--------------------------------------------------------------------------------
/docs/DCO.txt:
--------------------------------------------------------------------------------
 1 | Developer Certificate of Origin
 2 | Version 1.1
 3 | 
 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
 5 | 1 Letterman Drive
 6 | Suite D4700
 7 | San Francisco, CA, 94129
 8 | 
 9 | Everyone is permitted to copy and distribute verbatim copies of this
10 | license document, but changing it is not allowed.
11 | 
12 | 
13 | Developer's Certificate of Origin 1.1
14 | 
15 | By making a contribution to this project, I certify that:
16 | 
17 | (a) The contribution was created in whole or in part by me and I
18 |     have the right to submit it under the open source license
19 |     indicated in the file; or
20 | 
21 | (b) The contribution is based upon previous work that, to the best
22 |     of my knowledge, is covered under an appropriate open source
23 |     license and I have the right under that license to submit that
24 |     work with modifications, whether created in whole or in part
25 |     by me, under the same open source license (unless I am
26 |     permitted to submit under a different license), as indicated
27 |     in the file; or
28 | 
29 | (c) The contribution was provided directly to me by some other
30 |     person who certified (a), (b) or (c) and I have not modified
31 |     it.
32 | 
33 | (d) I understand and agree that this project and the contribution
34 |     are public and that a record of the contribution (including all
35 |     personal information I submit with it, including my sign-off) is
36 |     maintained indefinitely and may be redistributed consistent with
37 |     this project or the open source license(s) involved.
38 | 


--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016, 2017, 2019, 2020, 2022 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | image: Visual Studio 2022
 9 | cache:
10 |   - packages -> appveyor.yml
11 | environment:
12 |   # Build matrix: each entry defines the variables used by the install/build steps below.
13 |   matrix:
14 |     - job_name: VS 17 2022 / python 3.12
15 |       APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2022
16 |       CMAKE_GENERATOR: Visual Studio 17 2022
17 |       PYTHON: 'C:\Python312-x64' # single-quoted so the backslash stays literal; "\P" is an escape in double quotes
18 | 
19 | install:
20 |     # Prepend the selected Python to the PATH of this build
21 |     - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH%
22 |     # Display version information about selected python and pip
23 |     - python --version
24 |     - python -c "import sys, platform, struct;
25 |       print(sys.platform, platform.machine(), struct.calcsize('P')*8)"
26 |     - pip --version
27 |     - pip install mako
28 | before_build:
29 |     - git submodule update --init --recursive
30 |     - cmake -G "%CMAKE_GENERATOR%" -A x64 -DCMAKE_BUILD_TYPE:STRING=Release -DENABLE_ORC:BOOL=OFF -DENABLE_TESTING:BOOL=ON .
31 | build_script:
32 |     - cmake --build . --config Release --target INSTALL
33 | test_script:
34 |     - ctest -V --output-on-failure -C Release
35 | after_test:
36 |     - cd "C:\Program Files"
37 |     - 7z a "C:\libvolk-x64-%VC_VERSION%.zip" volk
38 |     - mkdir dlls
39 |     - cd dlls
40 |     - 7z a "C:\libvolk-x64-deps-%VC_VERSION%.zip" *
41 |     - appveyor PushArtifact C:\libvolk-x64-%VC_VERSION%.zip
42 |     - appveyor PushArtifact C:\libvolk-x64-deps-%VC_VERSION%.zip
43 | 


--------------------------------------------------------------------------------
/cmake/Toolchains/arm-linux-gnueabihf.cmake:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2018 - 2020 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | set(CMAKE_SYSTEM_NAME Linux) # target operating system of the cross build
10 | set(CMAKE_SYSTEM_PROCESSOR arm) # target processor family
11 | 
12 | if(MINGW
13 |    OR CYGWIN
14 |    OR WIN32)
15 |     set(UTIL_SEARCH_CMD where) # executable lookup command on Windows-like hosts
16 | elseif(UNIX OR APPLE)
17 |     set(UTIL_SEARCH_CMD which) # executable lookup command on Unix-like hosts
18 | endif()
19 | 
20 | set(TOOLCHAIN_PREFIX arm-linux-gnueabihf-) # prefix shared by all cross tool names below
21 | 
22 | execute_process(
23 |     COMMAND ${UTIL_SEARCH_CMD} ${TOOLCHAIN_PREFIX}gcc
24 |     OUTPUT_VARIABLE BINUTILS_PATH
25 |     OUTPUT_STRIP_TRAILING_WHITESPACE) # BINUTILS_PATH = what the lookup printed for the cross gcc
26 | 
27 | get_filename_component(ARM_TOOLCHAIN_DIR ${BINUTILS_PATH} DIRECTORY) # directory part of that path
28 | 
29 | # The linker flag below is kept for reference only; it is not needed on debian.
30 | # Where it is needed, CMake is not able to pass its test compilation check without it.
31 | #set(CMAKE_EXE_LINKER_FLAGS_INIT "--specs=nosys.specs")
32 | 
33 | set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}gcc)
34 | set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) # assembly sources go through the same gcc driver
35 | set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}g++)
36 | ## the following is needed for CheckCSourceCompiles used in lib/CMakeLists.txt
37 | set(CMAKE_C_FLAGS
38 |     "-mfpu=neon"
39 |     CACHE STRING "" FORCE) # FORCE replaces any CMAKE_C_FLAGS value already in the cache
40 | set(CMAKE_ASM_FLAGS
41 |     "${CMAKE_C_FLAGS}"
42 |     CACHE STRING "" FORCE) # assembler gets the same flags as the C compiler
43 | 
44 | set(CMAKE_OBJCOPY
45 |     ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy
46 |     CACHE INTERNAL "objcopy tool")
47 | set(CMAKE_SIZE_UTIL
48 |     ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size
49 |     CACHE INTERNAL "size tool")
50 | 
51 | set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH}) # NOTE(review): this is the gcc executable path found above, not a directory - confirm intent
52 | 
53 | set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-arm -L /usr/arm-linux-gnueabihf/") # command used to run target binaries on the host
54 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_a_neonasm.s:
--------------------------------------------------------------------------------
 1 | @ static inline void volk_32f_s32f_multiply_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
 2 | 	.global	volk_32f_s32f_multiply_32f_a_neonasm
 3 | volk_32f_s32f_multiply_32f_a_neonasm:
 4 | 	@ r0 - cVector: pointer to output array
 5 | 	@ r1 - aVector: pointer to input array 1
 6 | 	@ r2 - bVector per the prototype above; NOTE(review): the vdup below takes the scalar from d0[0] instead - confirm
 7 | 	@ r3 - num_points: number of items to process
 8 | 
 9 | 	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12}	@ prologue - save register states
10 | @ NOTE(review): every load, multiply and store below is commented out, so as written
11 | @ this routine only sets up registers and returns without writing to cVector.
12 |     @ quarter_points = num_points / 4
13 | 	movs r11, r3, lsr #2
14 | 	beq .loop2 @ quarter_points == 0: skip the vector setup
15 | 
16 |     @ r10 = num_points; NOTE(review): this comment used to say quarter_points (r11) - confirm which is meant
17 | 	mov r10, r3
18 |     @ copy address of input vector
19 |     mov r4, r1
20 |     @ copy address of output vector
21 |     mov r5, r0
22 | 
23 |     @ load the scalar to a quad register
24 |     @ vmov.32 d2[0], r2
25 |     @ The scalar might be in s0, not totally sure
26 |     vdup.32 q2, d0[0]
27 | 
28 |     @ this is giving fits. Current theory is hf has something to do with it
29 |     .loop1:
30 |     @  vld1.32 {q1}, [r4:128]! @ aVal
31 |     @  vmul.f32 q3, q1, q2
32 |     @  vst1.32	{q3}, [r5:128]! @ cVal
33 |     @
34 |     @  subs r10, r10, #1
35 |     @  bne	.loop1	@ first loop
36 | 
37 |     @ number = quarter_points * 4
38 |     mov	r10, r11, asl #2
39 | 
40 |     .loop2:
41 |     @   cmp	num_points, number
42 |     @   bls	.done
43 |     @
44 |     @   vld1.32 {d0[0]}, [aVector]!
45 |     @   vmul.f32 s2, s0, s4
46 |     @   vst1.32 {d1[0]}, [cVector]!
47 |     @   add number, number, #1
48 |     @   b .loop2
49 | 
50 | .done:
51 | 	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ epilogue - restore register states
52 | 	bx	lr
53 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s:
--------------------------------------------------------------------------------
 1 | @ static inline void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
 2 | 	.global	volk_32f_x2_add_32f_a_neonasm
 3 | volk_32f_x2_add_32f_a_neonasm:
 4 | 	@ r0 - cVector: pointer to output array
 5 | 	@ r1 - aVector: pointer to input array 1
 6 | 	@ r2 - bVector: pointer to input array 2
 7 | 	@ r3 - num_points: number of items to process
 8 | 	cVector .req r0
 9 | 	aVector .req r1
10 | 	bVector .req r2
11 | 	num_points .req r3
12 | 	quarterPoints .req r7
13 | 	number .req r8
14 | 	aVal .req q0 @ d0-d1
15 | 	bVal .req q1 @ d2-d3
16 | 	cVal .req q2 @ d4-d5
17 | 
18 | 	@ AAPCS Section 5.1.1
19 | 	@ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP
20 | 	stmfd	sp!, {r7, r8, sl}	@ prologue - save register states
21 | 
22 | 	mov	number, #0	@ zero the counter BEFORE branching: .loop2 reads it even when loop1 is skipped
23 | 	movs quarterPoints, num_points, lsr #2	@ quarterPoints = num_points / 4, Z set when it is 0
24 | 	beq .loop2 @ fewer than 4 points: only the scalar tail loop runs
25 | 
26 | .loop1:	@ vector loop: 4 floats per pass, pointers carry a 128-bit alignment qualifier
27 | 	pld [aVector, #128] @ pre-load hint - this is implementation specific!
28 | 	pld [bVector, #128] @ pre-load hint - this is implementation specific!
29 | 
30 | 	vld1.32	{d0-d1}, [aVector:128]!	@ aVal
31 | 	add	number, number, #1
32 | 	vld1.32	{d2-d3}, [bVector:128]!	@ bVal
33 | 	vadd.f32 cVal, bVal, aVal
34 | 	cmp	number, quarterPoints
35 | 	vst1.32	{d4-d5}, [cVector:128]!	@ cVal
36 | 
37 | 	blt	.loop1	@ repeat while number < quarterPoints (flags come from the cmp above)
38 | 
39 | 	mov	number, quarterPoints, asl #2	@ number = points handled so far
40 | 
41 | .loop2:	@ scalar tail: one float per pass until number reaches num_points
42 | 	cmp	num_points, number
43 | 	bls	.done
44 | 
45 | 	vld1.32 {d0[0]}, [aVector]!	@ s0 = next a
46 | 	vld1.32 {d0[1]}, [bVector]!	@ s1 = next b
47 | 	vadd.f32 s2, s1, s0
48 | 	vst1.32 {d1[0]}, [cVector]!	@ d1[0] is s2
49 | 	add number, number, #1
50 | 	b .loop2
51 | 
52 | .done:
53 | 	ldmfd	sp!, {r7, r8, sl} @ epilogue - restore register states
54 | 	bx	lr
55 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_a_neonasm.s:
--------------------------------------------------------------------------------
 1 | @ static inline void volk_32fc_x2_multiply_32fc_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
 2 | 	.global	volk_32fc_x2_multiply_32fc_a_neonasm
 3 | volk_32fc_x2_multiply_32fc_a_neonasm:
 4 |     push    {r4, r5, r6, r7, r8, r9, r14} @ save r4-r9 and lr
 5 |     lsrs    r7, r3, #2 @ r7 = num_points / 4; Z is set when that is zero
 6 |     @ r0 is c vector
 7 |     @ r1 is a vector
 8 |     @ r2 is b vector
 9 |     @ r3 is num_points
10 |     @ r7 is quarter_points
11 |     beq     .smallvector
12 |     mov     r5, #0 @ r5 = number of 4-point groups done
13 | .mainloop:
14 |    vld2.32   {d24-d27}, [r1]!  @ ar=q12, ai=q13
15 |    add       r5, r5, #1
16 |    cmp       r5, r7 @ flags are consumed by the bne that ends the loop
17 |    vld2.32   {d20-d23}, [r2]!  @ br=q10, bi=q11
18 |    pld       [r1]
19 |    pld       [r2]
20 |    vmul.f32  q0, q12, q10 @ q0 = ar*br
21 |    vmul.f32  q1, q13, q11 @ q1 = ai*bi
22 |    vmul.f32  q2, q12, q11 @ q2 = ar*bi
23 |    vmul.f32  q3, q13, q10 @ q3 = ai*br
24 |    vsub.f32  q9, q0, q1  @ real
25 |    vadd.f32  q10, q2, q3  @ imag
26 |    vst2.32   {q9-q10}, [r0]! @ store real and imag interleaved
27 |    bne     .mainloop
28 | 
29 | .smallvector:
30 |    lsl     r5, r7, #2   @ r5 = quarter_points * 4
31 |    cmp     r3, r5       @ any points left beyond quarter_points * 4?
32 |    bls     .done
33 | .tailcase:
34 |    add    r5, r5, #1    @ r5 +=1 <- number++
35 |    vld1.32    d1, [r1]! @ s2, s3 = ar, ai
36 |    vld1.32    d0, [r2]! @ s0, s1 = br, bi
37 |    vmul.f32   s4, s0, s2 @ s4 = br*ar
38 |    vmul.f32   s5, s0, s3 @ s5 = br*ai
39 |    vmls.f32   s4, s1, s3 @ s4 = s4 - bi*ai
40 |    vmla.f32   s5, s1, s2 @ s5 = s5 + bi*ar
41 |    @vst2.32    d2[0], [r0]!
42 |    vst1.32    {d2}, [r0]! @ d2 = s4, s5 = real, imag
43 |    cmp     r3, r5       @ r3 == r5? num_points == number?
44 |    bne     .tailcase
45 | .done:
46 |    pop     {r4, r5, r6, r7, r8, r9, r15} @ restore r4-r9; the saved lr goes into pc, which returns
47 |    bx lr @ not reached: the pop above already loaded pc
48 | 


--------------------------------------------------------------------------------
/apps/plot_best_vs_generic.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Copyright 2019 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | # This script is used to compare the generic kernels to the highest performing kernel, for each operation
10 | # Run:
11 | #   ./volk_profile -j volk_results.json
12 | # Then run this script under python3
13 | 
14 | import matplotlib.pyplot as plt
15 | import numpy as np
16 | import json
17 | 
18 | filename = 'volk_results.json'
19 | 
20 | operations = []
21 | metrics = []
22 | with open(filename) as json_file:
23 |     data = json.load(json_file)
24 |     for test in data['volk_tests']:
25 |         if ('generic' in test['results']) or ('u_generic' in test['results']): # some dont have a generic kernel
26 |             operations.append(test['name'][5:]) # remove volk_ prefix that they all have
27 |             extension_performance = []
28 |             for key, val in test['results'].items():
29 |                 if key not in ['generic', 'u_generic']: # exclude generic results, when trying to find fastest time
30 |                     extension_performance.append(val['time'])
31 |             try:
32 |                 generic_time = test['results']['generic']['time']
33 |             except:
34 |                 generic_time = test['results']['u_generic']['time']
35 |             metrics.append(extension_performance[np.argmin(extension_performance)]/generic_time)
36 | 
37 | 
38 | plt.bar(np.arange(len(metrics)), metrics)
39 | plt.hlines(1.0, -1, len(metrics), colors='r', linestyles='dashed')
40 | plt.axis([-1, len(metrics), 0, 2])
41 | plt.xticks(np.arange(len(operations)), operations, rotation=90)
42 | plt.ylabel('Time taken of fastest kernel relative to generic kernel')
43 | plt.tight_layout()
44 | plt.show()
45 | 


--------------------------------------------------------------------------------
/cmake/Modules/FindORC.cmake:
--------------------------------------------------------------------------------
 1 | # Copyright 2014, 2019, 2020 Free Software Foundation, Inc.
 2 | #
 3 | # This file is part of VOLK.
 4 | #
 5 | # SPDX-License-Identifier: LGPL-3.0-or-later
 6 | #
 7 | 
 8 | find_package(PkgConfig)
 9 | pkg_check_modules(PC_ORC "orc-0.4 > 0.4.11") # results are only used as HINTS for the searches below
10 | 
11 | include(GNUInstallDirs) # provides CMAKE_INSTALL_LIBDIR used in the search paths
12 | 
13 | find_program(
14 |     ORCC_EXECUTABLE orcc
15 |     HINTS ${PC_ORC_TOOLSDIR}
16 |     PATHS ${ORC_ROOT}/bin ${CMAKE_INSTALL_PREFIX}/bin) # NOTE(review): nothing visible here sets PC_ORC_TOOLSDIR - confirm
17 | 
18 | find_path(
19 |     ORC_INCLUDE_DIR
20 |     NAMES orc/orc.h
21 |     HINTS ${PC_ORC_INCLUDEDIR}
22 |     PATHS ${ORC_ROOT}/include ${CMAKE_INSTALL_PREFIX}/include
23 |     PATH_SUFFIXES orc-0.4) # directory that contains orc/orc.h
24 | 
25 | find_path(
26 |     ORC_LIBRARY_DIR
27 |     NAMES ${CMAKE_SHARED_LIBRARY_PREFIX}orc-0.4${CMAKE_SHARED_LIBRARY_SUFFIX}
28 |     HINTS ${PC_ORC_LIBDIR}
29 |     PATHS ${ORC_ROOT}/${CMAKE_INSTALL_LIBDIR}
30 |           ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) # directory that contains the shared library file
31 | 
32 | find_library(
33 |     ORC_LIB orc-0.4
34 |     HINTS ${PC_ORC_LIBRARY_DIRS}
35 |     PATHS ${ORC_ROOT}/${CMAKE_INSTALL_LIBDIR}
36 |           ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) # full path of the orc-0.4 library
37 | 
38 | find_library(
39 |     ORC_LIBRARY_STATIC liborc-0.4.a
40 |     HINTS ${PC_ORC_LIBRARY_DIRS}
41 |     PATHS ${ORC_ROOT}/${CMAKE_INSTALL_LIBDIR}
42 |           ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) # static archive, looked up by its exact file name
43 | 
44 | list(APPEND ORC_LIBRARY ${ORC_LIB}) # ORC_LIBRARY is the variable checked and exported below
45 | 
46 | set(ORC_INCLUDE_DIRS ${ORC_INCLUDE_DIR})
47 | set(ORC_LIBRARIES ${ORC_LIBRARY})
48 | set(ORC_LIBRARY_DIRS ${ORC_LIBRARY_DIR})
49 | set(ORC_LIBRARIES_STATIC ${ORC_LIBRARY_STATIC})
50 | 
51 | include(FindPackageHandleStandardArgs)
52 | find_package_handle_standard_args(ORC "orc files" ORC_LIBRARY ORC_INCLUDE_DIR
53 |                                   ORCC_EXECUTABLE) # second argument is the custom failure message
54 | 
55 | mark_as_advanced(ORC_INCLUDE_DIR ORC_LIBRARY ORCC_EXECUTABLE)
56 | 


--------------------------------------------------------------------------------
/cmake/Modules/VolkConfig.cmake.in:
--------------------------------------------------------------------------------
 1 | # Copyright 2016, 2018 - 2020 Free Software Foundation, Inc.
 2 | #
 3 | # This file is part of VOLK.
 4 | #
 5 | # SPDX-License-Identifier: LGPL-3.0-or-later
 6 | #
 7 | 
 8 | get_filename_component(VOLK_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) # directory holding this config file
 9 | 
10 | if(NOT TARGET Volk::volk)
11 |     include("${VOLK_CMAKE_DIR}/VolkTargets.cmake") # expected to define the Volk::volk target tested below
12 | endif()
13 | 
14 | # Record in VOLK_FOUND whether a compatible Volk was found -- this could
15 | # mean a correct enough version or any version, depending on how
16 | # find_package was called.
17 | if(NOT TARGET Volk::volk)
18 |     set(VOLK_FOUND FALSE)
19 | else()
20 |     set(VOLK_FOUND TRUE)
21 | endif()
22 | 
23 | # cache whether a compatible Volk was found for
24 | # use anywhere in the calling project
25 | set(VOLK_FOUND
26 |     ${VOLK_FOUND}
27 |     CACHE BOOL "Whether a compatible Volk was found" FORCE)
28 | 
29 | if(VOLK_FOUND)
30 |     # use the new target library, regardless of whether new or old style
31 |     # we still need to set a variable with the library name so that there
32 |     # is a variable to reference in the using-project's cmake scripts!
33 |     set(VOLK_LIBRARIES
34 |         Volk::volk
35 |         CACHE STRING "Volk Library" FORCE)
36 | 
37 |     # INTERFACE_INCLUDE_DIRECTORIES should always be set
38 |     get_target_property(VOLK_INCLUDE_DIRS Volk::volk INTERFACE_INCLUDE_DIRECTORIES)
39 |     set(VOLK_INCLUDE_DIRS
40 |         ${VOLK_INCLUDE_DIRS}
41 |         CACHE STRING "Volk Include Directories" FORCE)
42 | 
43 |     # for backward compatibility with old-CMake non-target project finding
44 |     include(FindPackageHandleStandardArgs)
45 |     find_package_handle_standard_args(Volk DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
46 |     mark_as_advanced(VOLK_LIBRARIES VOLK_INCLUDE_DIRS) # hide both cache entries from the default GUI view
47 | endif(VOLK_FOUND)
48 | 


--------------------------------------------------------------------------------
/cmake/cmake_uninstall.cmake.in:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Free Software Foundation, Inc.
 2 | #
 3 | # This file is part of VOLK.
 4 | #
 5 | # SPDX-License-Identifier: LGPL-3.0-or-later
 6 | #
 7 | 
 8 | # https://gitlab.kitware.com/cmake/community/-/wikis/FAQ#can-i-do-make-uninstall-with-cmake
 9 | 
10 | # Removes every file recorded in install_manifest.txt by a previous install.
11 | # The @...@ placeholders are substituted when this template is configured.
12 | if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
13 |     message(
14 |         FATAL_ERROR
15 |             "Cannot find install manifest: \"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt\""
16 |     )
17 | endif()
18 | 
19 | # The manifest lists one installed path per line; turn it into a CMake list.
20 | file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
21 | string(REGEX REPLACE "\n" ";" files "${files}")
22 | 
23 | # Delete every listed path, honouring DESTDIR. The removal runs through
24 | # execute_process(): the exec_program() command used here before is deprecated,
25 | # and CMake no longer accepts it once policy CMP0153 is set to NEW.
26 | foreach(file ${files})
27 |     message(STATUS "Uninstalling \"$ENV{DESTDIR}${file}\"")
28 |     # EXISTS is false for a dangling symlink, so IS_SYMLINK is checked as well;
29 |     # both cases are removed the same way.
30 |     if(EXISTS "$ENV{DESTDIR}${file}" OR IS_SYMLINK "$ENV{DESTDIR}${file}")
31 |         # rm_out captures the tool's standard output; only the exit code is examined.
32 |         execute_process(
33 |             COMMAND "@CMAKE_COMMAND@" -E remove "$ENV{DESTDIR}${file}"
34 |             OUTPUT_VARIABLE rm_out
35 |             RESULT_VARIABLE rm_retval)
36 |         if(NOT "${rm_retval}" STREQUAL 0)
37 |             message(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"")
38 |         endif()
39 |     else()
40 |         message(STATUS "File \"$ENV{DESTDIR}${file}\" does not exist.")
41 |     endif()
42 | endforeach()
43 | 


--------------------------------------------------------------------------------
/gen/volk_compile_utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Copyright 2012 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | import argparse
10 | import volk_arch_defs
11 | import volk_machine_defs
12 | 
13 | 
14 | def do_arch_flags_list(compiler):
15 |     output = list()
16 |     for arch in volk_arch_defs.archs:
17 |         if not arch.is_supported(compiler): continue
18 |         fields = [arch.name] + arch.get_flags(compiler)
19 |         output.append(','.join(fields))
20 |     print(';'.join(output))
21 | 
22 | 
23 | def do_machines_list(arch_names):
24 |     output = list()
25 |     for machine in volk_machine_defs.machines:
26 |         machine_arch_set = set(machine.arch_names)
27 |         if set(arch_names).intersection(machine_arch_set) == machine_arch_set:
28 |             output.append(machine.name)
29 |     print(';'.join(output))
30 | 
31 | 
32 | def do_machine_flags_list(compiler, machine_name):
33 |     output = list()
34 |     machine = volk_machine_defs.machine_dict[machine_name]
35 |     for arch in machine.archs:
36 |         output.extend(arch.get_flags(compiler))
37 |     print(' '.join(output))
38 | 
39 | 
40 | def main():
41 |     parser = argparse.ArgumentParser()
42 |     parser.add_argument('--mode', type=str)
43 |     parser.add_argument('--compiler', type=str)
44 |     parser.add_argument('--archs', type=str)
45 |     parser.add_argument('--machine', type=str)
46 |     args = parser.parse_args()
47 | 
48 |     if args.mode == 'arch_flags': return do_arch_flags_list(args.compiler.lower())
49 |     if args.mode == 'machines': return do_machines_list(args.archs.split(';'))
50 |     if args.mode == 'machine_flags': return do_machine_flags_list(args.compiler.lower(), args.machine)
51 | 
52 | if __name__ == '__main__': 
53 |     main()
54 |     
55 | 


--------------------------------------------------------------------------------
/include/volk/volk_alloc.hh:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*- */
 2 | /*
 3 |  * Copyright 2019 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_VOLK_ALLOC_H
11 | #define INCLUDED_VOLK_ALLOC_H
12 | 
13 | #include <cstdlib>
14 | #include <limits>
15 | #include <new>
16 | #include <vector>
17 | 
18 | #include <volk/volk.h>
19 | 
20 | namespace volk {
21 | 
22 | /*!
23 |  * \brief C++11 allocator backed by volk_malloc and volk_free
24 |  *
25 |  * \details
26 |  *   adapted from https://en.cppreference.com/w/cpp/named_req/Alloc
27 |  */
28 | template <class T>
29 | struct alloc {
30 |     using value_type = T;
31 | 
32 |     alloc() = default;
33 | 
34 |     //! Converting constructor from alloc<U>; there is no state to copy.
35 |     template <class U>
36 |     constexpr alloc(alloc<U> const&) noexcept {}
37 | 
38 |     //! Allocates room for n objects of T at volk_get_alignment() alignment; throws
39 |     //! std::bad_alloc when n * sizeof(T) would overflow or volk_malloc returns null.
40 |     T* allocate(std::size_t n)
41 |     {
42 |         constexpr std::size_t max_count = std::numeric_limits<std::size_t>::max() / sizeof(T);
43 |         void* raw = n > max_count ? nullptr : volk_malloc(n * sizeof(T), volk_get_alignment());
44 |         if (raw == nullptr)
45 |             throw std::bad_alloc();
46 |         return static_cast<T*>(raw);
47 |     }
48 | 
49 |     //! Hands p back to volk_free(); the element count is not used.
50 |     void deallocate(T* p, std::size_t) noexcept { volk_free(p); }
51 | };
52 | 
53 | //! volk::alloc has no data members, so any two instances compare equal,
54 | //! whatever their value types: memory obtained through one can be
55 | //! released through another.
56 | template <class T, class U>
57 | bool operator==(alloc<T> const&, alloc<U> const&) { return true; }
58 | 
59 | //! Counterpart of operator==: two volk::alloc instances are never
60 | //! reported as different.
61 | template <class T, class U>
62 | bool operator!=(alloc<T> const&, alloc<U> const&) { return false; }
63 | 
64 | 
65 | 
66 | /*!
67 |  * \brief alias for a std::vector whose allocator is volk::alloc
68 |  *
69 |  * \details
70 |  * example code:
71 |  *   volk::vector<float> v(100); // vector using volk_malloc, volk_free
72 |  */
73 | template <class T>
74 | using vector = std::vector<T, alloc<T>>;
75 | 
76 | } // namespace volk
77 | #endif // INCLUDED_VOLK_ALLOC_H
78 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm.s:
--------------------------------------------------------------------------------
 1 | @ static inline void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
 2 | 	.global	volk_32f_x2_dot_prod_32f_a_neonasm
 3 | volk_32f_x2_dot_prod_32f_a_neonasm:
 4 | 	@ r0 - cVector: pointer to the single float that receives the result
 5 | 	@ r1 - aVector: pointer to input array 1
 6 | 	@ r2 - bVector: pointer to input array 2
 7 | 	@ r3 - num_points: number of items to process
 8 | 	cVector .req r0
 9 | 	aVector .req r1
10 | 	bVector .req r2
11 | 	num_points .req r3
12 | 	quarterPoints .req r7
13 | 	number .req r8
14 | 	aVal .req q0 @ d0-d1
15 | 	bVal .req q1 @ d2-d3
16 | 	cVal .req q2 @ d4-d5
17 | 
18 | 	@ AAPCS Section 5.1.1
19 | 	@ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP
20 | 	stmfd	sp!, {r7, r8, sl}	@ prologue - save register states
21 | 
22 | 	veor.32 q0, q0, q0	@ q0 = 0: four running partial sums in s0-s3
23 | 	mov	number, #0	@ zero the counter BEFORE branching: .loop2 reads it even when loop1 is skipped
24 | 	movs quarterPoints, num_points, lsr #2	@ quarterPoints = num_points / 4, Z set when it is 0
25 | 	beq .loop2 @ fewer than 4 points: only the scalar tail loop runs
26 | 
27 | .loop1:	@ vector loop: 4 products accumulated per pass
28 | 	pld [aVector, #128] @ pre-load hint - this is implementation specific!
29 | 	pld [bVector, #128] @ pre-load hint - this is implementation specific!
30 | 
31 | 	vld1.32	{q1}, [aVector:128]!	@ next 4 floats of a
32 | 	vld1.32	{q2}, [bVector:128]!	@ next 4 floats of b
33 | 	vmla.f32 q0, q1, q2	@ q0 += a * b, lane by lane
34 | 
35 | 	add	number, number, #1
36 | 	cmp	number, quarterPoints
37 | 	blt	.loop1	@ repeat while number < quarterPoints
38 | 
39 | 	@ fold the four partial sums into s0; strange order comes from trying to schedule instructions
40 | 	vadd.f32 s0, s0, s1
41 | 	vadd.f32 s2, s2, s3
42 | 	mov	number, quarterPoints, asl #2	@ number = points handled so far
43 | 	vadd.f32 s0, s0, s2
44 | 
45 | .loop2:	@ scalar tail: s0 += a * b, one point per pass (s2 and s3 are scratch here)
46 | 	cmp	num_points, number
47 | 	bls	.done
48 | 
49 | 	vld1.32 {d1[0]}, [aVector]!	@ s2 = next a
50 | 	vld1.32 {d1[1]}, [bVector]!	@ s3 = next b
51 | 	vmla.f32 s0, s2, s3
52 | 	add number, number, #1
53 | 	b .loop2
54 | 
55 | .done:
56 | 	vstr s0, [cVector]	@ write the dot product
57 | 	ldmfd	sp!, {r7, r8, sl} @ epilogue - restore register states
58 | 	bx	lr
59 | 


--------------------------------------------------------------------------------
/tmpl/volk_machine_xxx.tmpl.c:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2011-2012 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | <% this_machine = machine_dict[args[0]] %>
11 | <% arch_names = this_machine.arch_names %>
12 | 
13 | %for arch in this_machine.archs:
14 | #define LV_HAVE_${arch.name.upper()} 1
15 | %endfor
16 | 
17 | #include <volk/volk_common.h>
18 | #include "volk_machines.h"
19 | #include <volk/volk_config_fixed.h>
20 | 
21 | #ifdef HAVE_CONFIG_H
22 | #include "config.h"
23 | #endif
24 | 
25 | %for kern in kernels:
26 | #include <volk/${kern.name}.h>
27 | %endfor
28 | 
29 | struct volk_machine volk_machine_${this_machine.name} = {
30 | <% make_arch_have_list = (' | '.join(['(1 << LV_%s)'%a.name.upper() for a in this_machine.archs])) %>    ${make_arch_have_list},
31 | <% this_machine_name = "\""+this_machine.name+"\"" %>    ${this_machine_name},
32 |     ${this_machine.alignment},
33 | ##// one group of fields per kernel follows, in the order of the kernels list
34 |     %for kern in kernels:
35 | <% impls = kern.get_impls(arch_names) %>
36 | ##// kernel name as a C string literal
37 | <% kern_name = "\""+kern.name+"\"" %>    ${kern_name},
38 | ##// names of the implementations returned by get_impls() for this machine's archs
39 | <% make_impl_name_list = "{"+', '.join(['"%s"'%i.name for i in impls])+"}" %>    ${make_impl_name_list},
40 | ##// per implementation: its arch dependencies OR'ed together as (1 << LV_<ARCH>) bits
41 | <% make_impl_deps_list = "{"+', '.join([' | '.join(['(1 << LV_%s)'%d.upper() for d in i.deps]) for i in impls])+"}" %>    ${make_impl_deps_list},
42 | ##// per implementation: true when i.is_aligned is set, false otherwise
43 | <% make_impl_align_list = "{"+', '.join(['true' if i.is_aligned else 'false' for i in impls])+"}" %>    ${make_impl_align_list},
44 | ##// per implementation: the function named <kernel name>_<implementation name>
45 | <% make_impl_fcn_list = "{"+', '.join(['%s_%s'%(kern.name, i.name) for i in impls])+"}" %>    ${make_impl_fcn_list},
46 | ##// number of implementations listed above
47 | <% len_impls = len(impls) %>    ${len_impls},
48 |     %endfor
49 | };
50 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s:
--------------------------------------------------------------------------------
 1 | @ static inline void volk_16i_max_star_horizontal_16i_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
 2 | 	.global	volk_16i_max_star_horizontal_16i_a_neonasm
 3 | volk_16i_max_star_horizontal_16i_a_neonasm:
 4 | 	@ r0 - cVector: pointer to output array
 5 | 	@ r1 - aVector: pointer to input array 1
 6 | 	@ r2 - num_points: number of items to process
 7 | 	@ NOTE(review): the prototype above lists float pointers and a bVector, but the code reads 16-bit items from r1 only - confirm
 8 |     pld     [r1, #128]
 9 |     push    {r4, r5, r6}    @ preserve register states
10 |     lsrs    r5, r2, #4      @ 1/16th points = num_points/16
11 |     vmov.i32    q12, #0     @ q12 = [0,0,0,0]
12 |     beq .smallvector        @ less than 16 elements in vector (flags come from the lsrs above)
13 |     mov r4, r1              @ r4 = aVector
14 |     mov r12, r0             @ gcc calls this ip
15 |     mov r3, #0              @ number = 0
16 | 
17 | .loop1:
18 |     vld2.16 {d16-d19}, [r4]! @ aVector, interleaved load: q8 = even items, q9 = odd items
19 |     pld [r4, #128]
20 |     add r3, r3, #1           @ number += 1
21 |     cmp r3, r5               @ number == 1/16th points? (tested by the bne that ends the loop)
22 |     vsub.i16    q10, q8, q9  @ q10 = q8 - q9
23 |     vcge.s16    q11, q10, #0 @ q11 = all-ones lanes where the difference >= 0
24 |     vcgt.s16    q10, q12, q10 @ q10 = all-ones lanes where the difference < 0
25 |     vand.i16    q11, q8, q11 @ keep q8 where the difference >= 0
26 |     vand.i16    q10, q9, q10 @ keep q9 where the difference < 0
27 |     vadd.i16    q10, q11, q10 @ add results to get max
28 |     vst1.16 {d20-d21}, [r12]! @ store the results
29 |     bne .loop1               @ more groups of 16 items left
30 |     add r1, r1, r3, lsl #5   @ advance input pointer by 32 bytes per group done
31 |     add r0, r0, r3, lsl #4   @ advance output pointer by 16 bytes per group done
32 | .smallvector:
33 |     ands    r2, r2, #15      @ r2 = num_points % 16
34 |     beq .end
35 |     mov r3, #0               @ r3 = leftover items consumed so far
36 | .loop3:
37 |     ldrh    r4, [r1]         @ r4 = first item of the pair
38 |     bic r5, r3, #1           @ r5 = r3 with bit 0 cleared, used as output byte offset
39 |     ldrh    ip, [r1, #2]     @ ip = second item of the pair
40 |     add r3, r3, #2
41 |     add r1, r1, #4           @ step over the pair just read
42 |     rsb r6, ip, r4           @ r6 = r4 - ip
43 |     sxth    r6, r6           @ sign-extend the low 16 bits of the difference
44 |     cmp r6, #0
45 |     movgt   ip, r4           @ difference > 0: the first item is the result
46 |     cmp r3, r2
47 |     strh    ip, [r0, r5]
48 |     bcc .loop3               @ loop while r3 < r2 (unsigned), flags from the cmp above
49 | .end:
50 |     pop {r4, r5, r6}
51 |     bx  lr
52 | 


--------------------------------------------------------------------------------
/.github/workflows/run-tests-rvv.yml:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2020 - 2022 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | name: Run VOLK tests on different RVV configurations
10 | 
11 | on: [push, pull_request]
12 | 
13 | jobs:
14 |   Tests:
15 |     runs-on: ubuntu-24.04
16 |     steps:
17 |     - uses: actions/checkout@v4
18 |       with:
19 |         submodules: "recursive"
20 |     - name: Install packages
21 |       run: |
22 |         sudo apt-get update -q -y
23 |         sudo apt-get install -y python3-mako cmake qemu-user-static g++-14-riscv64-linux-gnu clang-18
24 |         mkdir build
25 |         cd build
26 |     - name: Test gcc-14 VLEN=128
27 |       run: |
28 |         cd build; rm -rf *
29 |         CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=128 \
30 |         cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
31 |         make -j$(nproc)
32 |         ARGS=-V make test
33 |     - name: Test gcc-14 VLEN=256
34 |       run: |
35 |         cd build; rm -rf *
36 |         CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=256 \
37 |         cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release
38 |         make -j$(nproc)
39 |         ARGS=-V make test
40 |     - name: Test clang-18 VLEN=512
41 |       run: |
42 |         cd build; rm -rf *
43 |         CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=512 \
44 |         cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
45 |         make -j$(nproc)
46 |         ARGS=-V make test
47 |     - name: Test clang-18 VLEN=1024
48 |       run: |
49 |         cd build; rm -rf *
50 |         CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=1024 \
51 |         cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release
52 |         make -j$(nproc)
53 |         ARGS=-V make test
54 | 
55 | 
56 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s:
--------------------------------------------------------------------------------
 1 | @ static inline void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
 2 | 	.global	volk_32f_x2_add_32f_a_neonpipeline
 3 | volk_32f_x2_add_32f_a_neonpipeline:
 4 | 	@ r0 - cVector: pointer to output array
 5 | 	@ r1 - aVector: pointer to input array 1
 6 | 	@ r2 - bVector: pointer to input array 2
 7 | 	@ r3 - num_points: number of items to process
 8 | 	cVector .req r0
 9 | 	aVector .req r1
10 | 	bVector .req r2
11 | 	num_points .req r3
12 | 	quarterPoints .req r7
13 | 	number .req r8
14 | 	aVal .req q0 @ d0-d1
15 | 	bVal .req q1 @ d2-d3
16 | 	cVal .req q2 @ d4-d5
17 | 
18 | 	stmfd	sp!, {r7, r8, sl}	@ prologue - save register states
19 | 
20 | 	pld [aVector, #128] @ pre-load hint - this is implementation specific!
21 | 	pld [bVector, #128] @ pre-load hint - this is implementation specific!
22 | 	mov	number, #0	@ zero the counter BEFORE branching: .loop2 reads it even when the vector part is skipped
23 | 	movs quarterPoints, num_points, lsr #2	@ quarterPoints = num_points / 4, Z set when it is 0
24 | 	beq .loop2 @ fewer than 4 points: only the scalar tail loop runs
25 | 
26 | 	mov number, quarterPoints	@ number now counts the vector groups still to load
27 | 
28 | 	@ Optimizing for pipeline: the first group is loaded ahead of the loop
29 | 	vld1.32	{d0-d1}, [aVector:128]!	@ aVal
30 | 	vld1.32	{d2-d3}, [bVector:128]!	@ bVal
31 | 	subs number, number, #1
32 | 	beq .flushpipe	@ exactly one group: nothing more to load
33 | 
34 | .loop1:	@ add the group already loaded while fetching the next one
35 | 	pld [aVector, #128] @ pre-load hint - this is implementation specific!
36 | 	pld [bVector, #128] @ pre-load hint - this is implementation specific!
37 | 	vadd.f32 cVal, bVal, aVal
38 | 	vld1.32 {d0-d1}, [aVector:128]! @ aVal
39 | 	vld1.32 {d2-d3}, [bVector:128]! @ bVal
40 | 	vst1.32	{d4-d5}, [cVector:128]! @ cVal
41 | 
42 | 	subs number, number, #1
43 | 	bne	.loop1	@ first loop
44 | 
45 | .flushpipe:
46 | 	@ One more time: add and store the last group that was loaded
47 | 	vadd.f32 cVal, bVal, aVal
48 | 	vst1.32	{d4-d5}, [cVector:128]! @ cVal
49 | 
50 | 	mov	number, quarterPoints, asl #2	@ number = points handled so far
51 | 
52 | .loop2:	@ scalar tail: one float per pass until number reaches num_points
53 | 	cmp	num_points, number
54 | 	bls	.done
55 | 
56 | 	vld1.32 {d0[0]}, [aVector]!	@ s0 = next a
57 | 	vld1.32 {d0[1]}, [bVector]!	@ s1 = next b
58 | 	vadd.f32 s2, s1, s0
59 | 	vst1.32 {d1[0]}, [cVector]!	@ d1[0] is s2
60 | 	add number, number, #1
61 | 	b .loop2
62 | 
63 | .done:
64 | 	ldmfd	sp!, {r7, r8, sl} @ epilogue - restore register states
65 | 	bx	lr
66 | 


--------------------------------------------------------------------------------
/scripts/tools/compare_volk_profiles:
--------------------------------------------------------------------------------
 1 | #!/bin/env python3
 2 | # Copyright 2022, 2025 Marcus Müller
 3 | # SPDX-License-Identifier: GPL-3.0
 4 | # Takes in a list of volk profiles, ignores empty and identical lines, prints a table of differences.
 5 | # Hacky as hell
 6 | 
 7 | from sys import argv
 8 | 
 9 | 
10 | def keepline(line: str) -> bool:
11 |     if not line:
12 |         return False
13 |     line = line.strip()
14 |     if line.startswith("#"):
15 |         return False
16 |     return True
17 | 
18 | 
19 | def kernel(line: str) -> str:
20 |     return line.split(" ")[0]
21 | 
22 | 
23 | def impls(line: str) -> tuple[str, str]:
24 |     return tuple(line.strip().split(" ")[1:])
25 | 
26 | 
27 | machines = [
28 |     {kernel(line): impls(line) for line in open(f_name) if keepline(line)}
29 |     for f_name in argv[1:]
30 | ]
31 | kernels = [set(d.keys()) for d in machines]
32 | common_kernels = [
33 |     kernel for kernel in kernels[0] if all((kernel in ks for ks in kernels[1:]))
34 | ]
35 | 
36 | differing_kernels = dict()
37 | for kernel in common_kernels:
38 |     first_impl = machines[0][kernel]
39 |     if all(machine[kernel] == first_impl for machine in machines[1:]):
40 |         continue
41 |     differing_kernels[kernel] = {
42 |         argv[idx + 1]: machine[kernel] for idx, machine in enumerate(machines)
43 |     }
44 | 
45 | max_kernel_len = max(len(kernel) for kernel in common_kernels)
46 | max_impl_len = max(
47 |     max(max(len(alignment) for alignment in impl) for impl in kernel.values())
48 |     for kernel in differing_kernels.values()
49 | )
50 | 
51 | print(
52 |     f"|{'Kernel':<{max_kernel_len}}|"
53 |     + "|".join(
54 |         f"{fname + ' a':<{max_impl_len}}|{fname + ' u':<{max_impl_len}}"
55 |         for fname in argv[1:]
56 |     )
57 |     + "|"
58 | )
59 | for kernel, impls in differing_kernels.items():
60 |     print(
61 |         f"|{kernel:<{max_kernel_len}}|"
62 |         + "|".join(
63 |             "|".join(
64 |                 f"{impl:<{max_impl_len}}" for impl in differing_kernels[kernel][fname]
65 |             )
66 |             for fname in argv[1:]
67 |         )
68 |         + "|"
69 |     )
70 | 


--------------------------------------------------------------------------------
/scripts/licensing/count_contrib.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Script to check the list of git submitters against the table of re-submitting
 3 | # users from the AUTHORFILE. Requires the authors to be listed in
 4 | # | ... | ... | email@address.com |
 5 | # format.
 6 | #
 7 | # We can add another table of "git committers who are exempt from the need to
 8 | # relicense due to their contributions being under an acceptable license
 9 | # already" if we need; no changes to this script would be necessary.
10 | #
11 | # This script is part of VOLK.
12 | #
13 | # Copyright 2021 Marcus Müller
14 | # SPDX-License-Identifier: MPL-2.0
15 | 
# Resolve the repository root; the author table defaults to the file kept there
# unless a path is given as the first argument.
rootdir=$(git rev-parse --show-toplevel)
if [[ "$#" -lt 1 ]]
then
    authorfile="$rootdir/AUTHORS_RESUBMITTING_UNDER_LGPL_LICENSE.md"
else
    authorfile="$1"
fi
if [[ ! -r "$authorfile" ]]
then
    echo "$authorfile: file not readable"
    # Same status bash reports for `exit -1`, spelled as a valid exit code.
    exit 255
fi

allfiles=$(git ls-files "$rootdir")
# Third column of every table row of the author file, i.e. the email address.
lgplers="$(sed -ne 's/^|[^|]*|[^|]*| \([^|]*\)|/\1/ip' "$authorfile")"
lgplers="$lgplers 32478819+fritterhoff@users.noreply.github.com douggeiger@users.noreply.github.com"
# "<line count> <author email>" per author, gathered from git blame over all
# tracked files. Paths are quoted so names containing blanks survive.
authorcounts="$(echo "$allfiles" | while IFS= read -r f; do git blame --line-porcelain --ignore-rev 092a59997a1e1d5f421a0a5f87ee655ad173b93f -- "$f" 2>/dev/null | sed -ne 's/^author-mail <\([^>]*\)>/\1/p'; done | sort -f | uniq -ic | sort -n)"

total_loc=0
# Must be the same variable the loop and the summary use; initialising a
# differently spelled name left it empty whenever no author was missing,
# which made the bc expression below a syntax error.
missingloc=0

while read -r line
do
    # Nothing to count on an empty line (e.g. when git blame produced no output).
    [[ -z "$line" ]] && continue
    authoremail=$(echo "$line" | sed 's/^ *\([[:digit:]]*\) *\([^, ]*\)$/\2/g')
    authorlines=$(echo "$line" | sed 's/^ *\([[:digit:]]*\) *\([^, ]*\)$/\1/g')
    total_loc=$(( authorlines + total_loc ))
    # -F: match the address literally (a '.' must not act as a wildcard).
    # This is still a case-insensitive substring match against the list.
    if ! grep -qiF -- "$authoremail" <<< "$lgplers"
    then
        echo "missing: \"$authoremail\" (${authorlines} LOC)"
        missingloc=$(( missingloc + authorlines ))
    fi
done < <(echo "$authorcounts")

# Avoid a division by zero in bc when no line could be attributed at all.
if [[ "$total_loc" -gt 0 ]]
then
    percentage=$(echo "scale=2; 100.0 * $missingloc/$total_loc" | bc)
else
    percentage=0
fi
echo "Missing $missingloc of $total_loc LOC in total ($percentage%)"

if [[ "$missingloc" -gt 0 ]]
then
   # Same status bash reports for `exit -2`.
   exit 254
fi
exit 0
57 | 


--------------------------------------------------------------------------------
/cmake/msvc/sys/time.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2018, 2020 Free Software Foundation, Inc.
 3 |  *
 4 |  * This file is part of VOLK
 5 |  *
 6 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 7 |  */
 8 | 
 9 | #ifndef _MSC_VER // [
10 | #error "Use this header only with Microsoft Visual C++ compilers!"
11 | #endif // _MSC_VER ]
12 | 
13 | #ifndef _MSC_SYS_TIME_H_
14 | #define _MSC_SYS_TIME_H_
15 | 
16 | // prevent windows.h from clobbering min and max functions with macros
17 | #ifndef NOMINMAX
18 | #define NOMINMAX
19 | #endif
20 | 
21 | // https://learn.microsoft.com/en-us/archive/msdn-technet-forums/430449b3-f6dd-4e18-84de-eebd26a8d668
22 | #include < time.h >
23 | #include <windows.h> //I've omitted this line.
24 | #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
25 | #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
26 | #else
27 | #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
28 | #endif
29 | 
/*
 * Local definition of struct timespec, compiled only when _MSC_VER < 1900.
 * NOTE(review): presumably newer MSVC headers already provide this type -
 * confirm before changing the version guard.
 */
#if _MSC_VER < 1900
struct timespec {

    time_t tv_sec; /* Seconds since 00:00:00 GMT, 1 January 1970 */

    long tv_nsec; /* Additional nanoseconds since tv_sec */
};
#endif
42 | 
/* Time zone information that gettimeofday() fills in when its tz argument is non-NULL. */
struct timezone {
    int tz_minuteswest; /* minutes W of Greenwich */
    int tz_dsttime;     /* type of dst correction */
};
47 | 
48 | static inline int gettimeofday(struct timeval* tv, struct timezone* tz)
49 | {
50 |     FILETIME ft;
51 |     unsigned __int64 tmpres = 0;
52 |     static int tzflag;
53 | 
54 |     if (NULL != tv) {
55 |         GetSystemTimeAsFileTime(&ft);
56 | 
57 |         tmpres |= ft.dwHighDateTime;
58 |         tmpres <<= 32;
59 |         tmpres |= ft.dwLowDateTime;
60 | 
61 |         /*converting file time to unix epoch*/
62 |         tmpres -= DELTA_EPOCH_IN_MICROSECS;
63 |         tv->tv_sec = (long)(tmpres / 1000000UL);
64 |         tv->tv_usec = (long)(tmpres % 1000000UL);
65 |     }
66 | 
67 |     if (NULL != tz) {
68 |         if (!tzflag) {
69 |             _tzset();
70 |             tzflag++;
71 |         }
72 |         tz->tz_minuteswest = _timezone / 60;
73 |         tz->tz_dsttime = _daylight;
74 |     }
75 | 
76 |     return 0;
77 | }
78 | 
79 | #endif //_MSC_SYS_TIME_H_
80 | 


--------------------------------------------------------------------------------
/include/volk/volk_malloc.h:
--------------------------------------------------------------------------------
 1 | /* -*- c -*- */
 2 | /*
 3 |  * Copyright 2014, 2020 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_VOLK_MALLOC_H
11 | #define INCLUDED_VOLK_MALLOC_H
12 | 
13 | #include <stdlib.h>
14 | #include <volk/volk_common.h>
15 | 
16 | __VOLK_DECL_BEGIN
17 | 
/*!
 * \brief Allocate \p size bytes of data aligned to \p alignment.
 *
 * \details
 * We use C11 and want to rely on C11 library features,
 * namely we use `aligned_alloc` to allocate aligned memory.
 * see: https://en.cppreference.com/w/c/memory/aligned_alloc
 *
 * Not all platforms support this feature.
 * For Apple Clang, we fall back to `posix_memalign`.
 * see: https://linux.die.net/man/3/aligned_alloc
 * For MSVC, we fall back to `_aligned_malloc`.
 * see:
 * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=msvc-170
 *
 * Because of the ways in which volk_malloc may allocate memory, it is
 * important to always free volk_malloc pointers using volk_free.
 * Mainly, in case MSVC is used. Consult corresponding documentation
 * in case you use MSVC.
 *
 * The implementation rounds \p size up to the next multiple of
 * \p alignment before allocating, so slightly more memory than requested
 * may be reserved.
 *
 * \param size The number of bytes to allocate.
 * \param alignment The byte alignment of the allocated memory.
 * \return pointer to aligned memory, or NULL if \p size or \p alignment
 * is 0 or if the underlying allocation fails.
 */
VOLK_API void* volk_malloc(size_t size, size_t alignment);
43 | 
/*!
 * \brief Frees memory allocated by volk_malloc.
 *
 * \details
 * We rely on C11 syntax and compilers and just call `free` in case
 * memory was allocated with `aligned_alloc` or `posix_memalign`.
 * Thus, in this case `volk_free` inherits the same behavior `free` exhibits.
 * see: https://en.cppreference.com/w/c/memory/free
 * In case `_aligned_malloc` was used, we call `_aligned_free`.
 * see:
 * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=msvc-170
 *
 * \param aptr The aligned pointer allocated by volk_malloc.
 */
VOLK_API void volk_free(void* aptr);
59 | 
60 | __VOLK_DECL_END
61 | 
62 | #endif /* INCLUDED_VOLK_MALLOC_H */
63 | 


--------------------------------------------------------------------------------
/apps/volk_option_helpers.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2018-2020 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef VOLK_VOLK_OPTION_HELPERS_H
11 | #define VOLK_VOLK_OPTION_HELPERS_H
12 | 
13 | #include <limits.h>
14 | #include <cstring>
15 | #include <map>
16 | #include <string>
17 | #include <vector>
18 | 
// Kind of an option_t entry. The first five values mirror the callback
// signatures accepted by the option_t constructors (no argument, int, bool,
// std::string, float); STRING matches the constructor taking a plain string.
// NOTE(review): the mapping from constructor to value is set in the .cc file,
// which is not visible here - confirm.
typedef enum {
    VOID_CALLBACK,
    INT_CALLBACK,
    BOOL_CALLBACK,
    STRING_CALLBACK,
    FLOAT_CALLBACK,
    STRING,
} VOLK_OPTYPE;
27 | 
// Description of a single command line option: its long and short spelling,
// a message, and either a callback or a string value.
class option_t
{
public:
    // Option with a callback that takes no argument.
    option_t(std::string t_longform,
             std::string t_shortform,
             std::string t_msg,
             void (*t_callback)());
    // Option with a callback that takes an int.
    option_t(std::string t_longform,
             std::string t_shortform,
             std::string t_msg,
             void (*t_callback)(int));
    // Option with a callback that takes a float.
    option_t(std::string t_longform,
             std::string t_shortform,
             std::string t_msg,
             void (*t_callback)(float));
    // Option with a callback that takes a bool.
    option_t(std::string t_longform,
             std::string t_shortform,
             std::string t_msg,
             void (*t_callback)(bool));
    // Option with a callback that takes a std::string.
    option_t(std::string t_longform,
             std::string t_shortform,
             std::string t_msg,
             void (*t_callback)(std::string));
    // Option without a callback, carrying a string value instead.
    option_t(std::string t_longform,
             std::string t_shortform,
             std::string t_msg,
             std::string t_printval);

    std::string longform;    // long spelling of the option
    std::string shortform;   // short spelling of the option
    std::string msg;         // message text; presumably the help text - confirm
    VOLK_OPTYPE option_type; // kind of option, see VOLK_OPTYPE
    std::string printval;    // string value given to the last constructor
    // Single slot typed void(*)() although the constructors accept callbacks
    // with different parameter types.
    // NOTE(review): presumably the callback is cast on store and cast back
    // according to option_type before it is called - confirm in the .cc file.
    void (*callback)();
};
63 | 
// Collection of option_t entries for one program, with entry points to add
// options, parse a command line and print help.
// NOTE(review): behavior notes below are inferred from the declarations only;
// the definitions are not visible here.
class option_list
{
public:
    // program_name: stored in d_program_name.
    option_list(std::string program_name);
    // Reports whether option_name is present; presumably looked up in
    // d_present_options - confirm.
    bool present(std::string option_name);

    // Registers an option; presumably appended to d_internal_list - confirm.
    void add(option_t opt);

    // Processes the argc/argv pair as received by main().
    void parse(int argc, char** argv);

    // Presumably prints usage information for the registered options - confirm.
    void help();

private:
    std::string d_program_name;
    std::vector<option_t> d_internal_list;
    std::map<std::string, int> d_present_options;
};
81 | 
82 | 
83 | #endif // VOLK_VOLK_OPTION_HELPERS_H
84 | 


--------------------------------------------------------------------------------
/cmake/msvc/config.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2012, 2017, 2020 Free Software Foundation, Inc.
 3 |  *
 4 |  * This file is part of VOLK
 5 |  *
 6 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 7 |  */
 8 | 
 9 | #ifndef _MSC_VER // [
10 | #error "Use this header only with Microsoft Visual C++ compilers!"
11 | #endif // _MSC_VER ]
12 | 
13 | #ifndef _MSC_CONFIG_H_ // [
14 | #define _MSC_CONFIG_H_
15 | 
16 | ////////////////////////////////////////////////////////////////////////
17 | // enable inline functions for C code
18 | ////////////////////////////////////////////////////////////////////////
19 | #ifndef __cplusplus
20 | #define inline __inline
21 | #endif
22 | 
23 | ////////////////////////////////////////////////////////////////////////
24 | // signed size_t
25 | ////////////////////////////////////////////////////////////////////////
26 | #include <stddef.h>
27 | typedef ptrdiff_t ssize_t;
28 | 
29 | ////////////////////////////////////////////////////////////////////////
30 | // rint functions
31 | ////////////////////////////////////////////////////////////////////////
/*
 * Stand-ins for the C99 rint family, compiled only when _MSC_VER < 1800.
 *
 * All of them round by adding 0.5 (positive input) or subtracting 0.5
 * (otherwise) and then truncating / taking floor or ceil. Halfway cases
 * therefore round away from zero (e.g. 2.5 -> 3), whereas the C99 functions
 * follow the current rounding mode, which by default rounds ties to even
 * (2.5 -> 2). Values outside the range of the result type are not checked.
 */
#if _MSC_VER < 1800
#include <math.h>
/* Round to nearest and convert to long. */
static inline long lrint(double x) { return (long)(x > 0.0 ? x + 0.5 : x - 0.5); }
static inline long lrintf(float x) { return (long)(x > 0.0f ? x + 0.5f : x - 0.5f); }
/* Round to nearest and convert to long long. */
static inline long long llrint(double x)
{
    return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);
}
static inline long long llrintf(float x)
{
    return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);
}
/* Round to nearest, result stays a floating point value. */
static inline double rint(double x) { return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5); }
static inline float rintf(float x)
{
    return (x > 0.0f) ? floorf(x + 0.5f) : ceilf(x - 0.5f);
}
#endif
50 | 
51 | ////////////////////////////////////////////////////////////////////////
52 | // math constants
53 | ////////////////////////////////////////////////////////////////////////
54 | #if _MSC_VER < 1800
55 | #include <math.h>
56 | #define INFINITY HUGE_VAL
57 | #endif
58 | 
59 | ////////////////////////////////////////////////////////////////////////
60 | // random and srandom
61 | ////////////////////////////////////////////////////////////////////////
62 | #include <stdlib.h>
/* random(): forwards to rand(), so results are limited to rand()'s range (0..RAND_MAX). */
static inline long int random(void) { return rand(); }
/* srandom(): forwards the seed to srand(). */
static inline void srandom(unsigned int seed) { srand(seed); }
65 | 
66 | #endif // _MSC_CONFIG_H_ ]
67 | 


--------------------------------------------------------------------------------
/gen/volk_machine_defs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Copyright 2012 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | from volk_arch_defs import arch_dict
10 | 
11 | machines = list()
12 | machine_dict = dict()
13 | 
class machine_class(object):
    """A named machine together with the architectures it is built from.

    Attributes:
        name: the machine name, also used as its ``repr``.
        arch_names: the non-empty architecture names, in the given order.
        archs: the ``arch_dict`` entries for ``arch_names``, same order.
        alignment: the largest ``alignment`` among ``archs``.
    """

    def __init__(self, name, archs):
        self.name = name
        # Falsy entries (empty names) are placeholders and are dropped.
        self.arch_names = [entry for entry in archs if entry]
        self.archs = [arch_dict[entry] for entry in self.arch_names]
        self.alignment = max(arch.alignment for arch in self.archs)

    def __repr__(self):
        return self.name
27 | 
def register_machine(name, archs):
    """Register machine ``name``, expanding ``'|'`` alternatives in ``archs``.

    An entry such as ``'a|b'`` stands for alternatives: one machine is
    registered per alternative, with ``'_' + alternative`` appended to the
    name. An empty alternative (as in ``'a|'``) registers a variant without
    that entry under the unchanged name. Expansion is recursive, so several
    such entries yield every combination. Entries without alternatives lead
    to a single ``machine_class`` being stored in ``machines`` and
    ``machine_dict``.
    """
    for pos, arch_name in enumerate(archs):
        if '|' not in arch_name:
            continue
        # Expand the first entry with alternatives; the recursive calls take
        # care of any later ones.
        before, after = archs[:pos], archs[pos + 1:]
        for alternative in arch_name.split('|'):
            if alternative:
                register_machine(name + '_' + alternative, before + [alternative] + after)
            else:
                register_machine(name, before + after)
        return

    # No alternatives left: this is a concrete machine.
    machine = machine_class(name=name, archs=archs)
    machines.append(machine)
    machine_dict[machine.name] = machine
40 | 
41 | ########################################################################
42 | # register the machines
43 | ########################################################################
44 | #TODO skip the XML and put it here
from xml.dom import minidom
import os
gendir = os.path.dirname(__file__)
machines_xml = minidom.parse(os.path.join(gendir, 'machines.xml')).getElementsByTagName('machine')
for machine_xml in machines_xml:
    # XML attributes of <machine> become keyword arguments as they are.
    kwargs = dict()
    for attr in machine_xml.attributes.keys():
        kwargs[attr] = machine_xml.attributes[attr].value
    # Child elements contribute "<tag>text</tag>" as kwargs[tag] = text.
    for node in machine_xml.childNodes:
        # Only element nodes have a tagName; whitespace text and comments
        # between the elements are skipped explicitly instead of relying on a
        # bare `except` that would also hide genuine errors.
        if node.nodeType != node.ELEMENT_NODE:
            continue
        first_child = machine_xml.getElementsByTagName(node.tagName)[0].firstChild
        # Elements that are empty or whose first child is not character data
        # carry no value and are ignored.
        if first_child is None or not hasattr(first_child, 'data'):
            continue
        kwargs[node.tagName] = first_child.data
    # The architecture list is whitespace separated.
    kwargs['archs'] = kwargs['archs'].split()
    register_machine(**kwargs)

if __name__ == '__main__':
    print(machines)
64 | 


--------------------------------------------------------------------------------
/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2012, 2014 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | /*!
11 |  * \page volk_32fc_s32f_x2_power_spectral_density_32f
12 |  *
13 |  * \b Overview
14 |  *
15 |  * Calculates the log10 power value divided by the RBW for each input point.
16 |  *
17 |  * <b>Dispatcher Prototype</b>
18 |  * \code
19 |  * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const
20 |  * lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned
21 |  * int num_points) \endcode
22 |  *
23 |  * \b Inputs
 * \li complexFFTInput: The complex data output from the FFT point.
 * \li normalizationFactor: This value is divided against all the input values before the
 * power is calculated.
 * \li rbw: The resolution bandwidth of the fft spectrum.
 * \li num_points: The number of fft data points.
28 |  *
29 |  * \b Outputs
30 |  * \li logPowerOutput: The 10.0 * log10((r*r + i*i)/RBW) for each data point.
31 |  *
32 |  * \b Example
33 |  * \code
34 |  * int N = 10000;
35 |  *
36 |  * volk_32fc_s32f_x2_power_spectral_density_32f();
37 |  *
38 |  * volk_free(x);
39 |  * \endcode
40 |  */
41 | 
42 | #ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
43 | #define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
44 | 
45 | #include <inttypes.h>
46 | #include <math.h>
47 | #include <stdio.h>
48 | 
49 | #ifdef LV_HAVE_GENERIC
50 | 
51 | static inline void
52 | volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput,
53 |                                                      const lv_32fc_t* complexFFTInput,
54 |                                                      const float normalizationFactor,
55 |                                                      const float rbw,
56 |                                                      unsigned int num_points)
57 | {
58 |     if (rbw != 1.0)
59 |         volk_32fc_s32f_power_spectrum_32f(
60 |             logPowerOutput, complexFFTInput, normalizationFactor * sqrt(rbw), num_points);
61 |     else
62 |         volk_32fc_s32f_power_spectrum_32f(
63 |             logPowerOutput, complexFFTInput, normalizationFactor, num_points);
64 | }
65 | 
66 | #endif /* LV_HAVE_GENERIC */
67 | 
68 | #endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H */
69 | 


--------------------------------------------------------------------------------
/kernels/README.md:
--------------------------------------------------------------------------------
 1 | # How to create custom kernel dispatchers
 2 | 
A kernel dispatcher is a kernel implementation that calls other kernel implementations.
 4 | By default, a dispatcher is generated by the build system for every kernel such that:
 5 |   * the best aligned implementation is called when all pointer arguments are aligned,
 6 |   * and otherwise the best unaligned implementation is called.
 7 | 
 8 | The author of a VOLK kernel may create a custom dispatcher,
 9 | to be called in place of the automatically generated one.
10 | A custom dispatcher may be useful to handle head and tail cases,
11 | or to implement different alignment and bounds checking logic.
12 | 
13 | ## Code for an example dispatcher w/ tail case
14 | 
15 | ```cpp
16 | #include <volk/volk_common.h>
17 | 
18 | #ifdef LV_HAVE_DISPATCHER
19 | 
20 | static inline void volk_32f_x2_add_32f_dispatcher(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
21 | {
22 |     const unsigned int num_points_r = num_points%4;
23 |     const unsigned int num_points_x = num_points - num_points_r;
24 | 
25 |     if (volk_is_aligned(VOLK_OR_PTR(cVector, VOLK_OR_PTR(aVector, bVector))))
26 |     {
27 |         volk_32f_x2_add_32f_a(cVector, aVector, bVector, num_points_x);
28 |     }
29 |     else
30 |     {
31 |         volk_32f_x2_add_32f_u(cVector, aVector, bVector, num_points_x);
32 |     }
33 | 
34 |     volk_32f_x2_add_32f_g(cVector+num_points_x, aVector+num_points_x, bVector+num_points_x, num_points_r);
35 | }
36 | 
37 | #endif //LV_HAVE_DISPATCHER
38 | ```
39 | 
40 | ## Code for an example dispatcher w/ tail case and accumulator
41 | 
42 | ```cpp
43 | #include <volk/volk_common.h>
44 | 
45 | #ifdef LV_HAVE_DISPATCHER
46 | 
47 | static inline void volk_32f_x2_dot_prod_32f_dispatcher(float * result, const float * input, const float * taps, unsigned int num_points)
48 | {
49 |     const unsigned int num_points_r = num_points%16;
50 |     const unsigned int num_points_x = num_points - num_points_r;
51 | 
52 |     if (volk_is_aligned(VOLK_OR_PTR(input, taps)))
53 |     {
54 |         volk_32f_x2_dot_prod_32f_a(result, input, taps, num_points_x);
55 |     }
56 |     else
57 |     {
58 |         volk_32f_x2_dot_prod_32f_u(result, input, taps, num_points_x);
59 |     }
60 | 
61 |     float result_tail = 0;
62 |     volk_32f_x2_dot_prod_32f_g(&result_tail, input+num_points_x, taps+num_points_x, num_points_r);
63 | 
64 |     *result += result_tail;
65 | }
66 | 
67 | #endif //LV_HAVE_DISPATCHER
68 | ```
69 | 


--------------------------------------------------------------------------------
/kernels/volk/volk_32f_s32f_power_32f.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2012, 2014 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | /*!
11 |  * \page volk_32f_s32f_power_32f
12 |  *
13 |  * \b Overview
14 |  *
15 |  * Takes each input vector value to the specified power and stores the
16 |  * results in the return vector.
17 |  *
18 |  * <b>Dispatcher Prototype</b>
19 |  * \code
20 |  * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power,
21 |  * unsigned int num_points) \endcode
22 |  *
23 |  * \b Inputs
24 |  * \li aVector: The input vector of floats.
25 |  * \li power: The power to raise the input value to.
26 |  * \li num_points: The number of data points.
27 |  *
28 |  * \b Outputs
29 |  * \li cVector: The output vector.
30 |  *
31 |  * \b Example
32 |  * Square the numbers (0,9)
33 |  * \code
34 |  *   int N = 10;
35 |  *   unsigned int alignment = volk_get_alignment();
36 |  *   float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
37 |  *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
38 |  *
39 |  *
40 |  *   for(unsigned int ii = 0; ii < N; ++ii){
41 |  *       increasing[ii] = (float)ii;
42 |  *   }
43 |  *
 *   // Exponent applied to every input value (2.0 squares the inputs)
45 |  *   float scale = 2.0f;
46 |  *
47 |  *   volk_32f_s32f_power_32f(out, increasing, scale, N);
48 |  *
49 |  *   for(unsigned int ii = 0; ii < N; ++ii){
50 |  *       printf("out[%u] = %f\n", ii, out[ii]);
51 |  *   }
52 |  *
53 |  *   volk_free(increasing);
54 |  *   volk_free(out);
55 |  * \endcode
56 |  */
57 | 
58 | #ifndef INCLUDED_volk_32f_s32f_power_32f_a_H
59 | #define INCLUDED_volk_32f_s32f_power_32f_a_H
60 | 
61 | #include <inttypes.h>
62 | #include <math.h>
63 | #include <stdio.h>
64 | 
65 | #ifdef LV_HAVE_GENERIC
66 | 
67 | static inline void volk_32f_s32f_power_32f_generic(float* cVector,
68 |                                                    const float* aVector,
69 |                                                    const float power,
70 |                                                    unsigned int num_points)
71 | {
72 |     float* cPtr = cVector;
73 |     const float* aPtr = aVector;
74 |     unsigned int number = 0;
75 | 
76 |     for (number = 0; number < num_points; number++) {
77 |         *cPtr++ = powf((*aPtr++), power);
78 |     }
79 | }
80 | #endif /* LV_HAVE_GENERIC */
81 | 
82 | 
83 | #endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */
84 | 


--------------------------------------------------------------------------------
/kernels/volk/volk_32fc_s32f_power_32fc.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2012, 2014 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | /*!
11 |  * \page volk_32fc_s32f_power_32fc
12 |  *
13 |  * \b Overview
14 |  *
 * Raises each value of the complex input vector to the specified real
 * power and stores the results in the complex output vector.
18 |  *
19 |  * <b>Dispatcher Prototype</b>
20 |  * \code
21 |  * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
22 |  * float power, unsigned int num_points) \endcode
23 |  *
24 |  * \b Inputs
25 |  * \li aVector: The complex input vector.
26 |  * \li power: The power value to be applied to each data point.
27 |  * \li num_points: The number of samples.
28 |  *
29 |  * \b Outputs
 * \li cVector: The complex output vector.
31 |  *
32 |  * \b Example
33 |  * \code
34 |  * int N = 10000;
35 |  *
36 |  * volk_32fc_s32f_power_32fc();
37 |  *
38 |  * volk_free(x);
39 |  * \endcode
40 |  */
41 | 
42 | #ifndef INCLUDED_volk_32fc_s32f_power_32fc_a_H
43 | #define INCLUDED_volk_32fc_s32f_power_32fc_a_H
44 | 
45 | #include <inttypes.h>
46 | #include <math.h>
47 | #include <stdio.h>
48 | 
49 | //! raise a complex float to a real float power
50 | static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp,
51 |                                                         const float power)
52 | {
53 |     const float arg = power * atan2f(lv_creal(exp), lv_cimag(exp));
54 |     const float mag =
55 |         powf(lv_creal(exp) * lv_creal(exp) + lv_cimag(exp) * lv_cimag(exp), power / 2);
56 |     return mag * lv_cmake(-cosf(arg), sinf(arg));
57 | }
58 | 
59 | #ifdef LV_HAVE_GENERIC
60 | 
61 | static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector,
62 |                                                      const lv_32fc_t* aVector,
63 |                                                      const float power,
64 |                                                      unsigned int num_points)
65 | {
66 |     lv_32fc_t* cPtr = cVector;
67 |     const lv_32fc_t* aPtr = aVector;
68 |     unsigned int number = 0;
69 | 
70 |     for (number = 0; number < num_points; number++) {
71 |         *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
72 |     }
73 | }
74 | 
75 | #endif /* LV_HAVE_GENERIC */
76 | 
77 | 
78 | #endif /* INCLUDED_volk_32fc_s32f_power_32fc_a_H */
79 | 


--------------------------------------------------------------------------------
/include/volk/volk_avx2_fma_intrinsics.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | /*
11 |  * This file is intended to hold AVX2 FMA intrinsics.
12 |  * They should be used in VOLK kernels to avoid copy-paste.
13 |  */
14 | 
15 | #ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
16 | #define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
17 | #include <immintrin.h>
18 | 
/*
 * Approximate arctan(x) via polynomial expansion
 * on the interval [-1, 1]
 *
 * Maximum relative error ~6.5e-7
 * Polynomial evaluated via Horner's method
 *
 * The polynomial is odd: a1*x + a3*x^3 + ... + a13*x^13. It is evaluated as
 * x * Q(x*x), where Q is built up from the highest coefficient (a13) down to
 * a1 with one fused multiply-add per coefficient. Operates on all eight
 * float lanes of x independently.
 */
static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
{
    /* Coefficients of the odd powers, written as hexadecimal float literals. */
    const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
    const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);

    const __m256 x_times_x = _mm256_mul_ps(x, x);
    __m256 arctan;
    /* Horner steps: arctan = x_times_x * arctan + next lower coefficient. */
    arctan = a13;
    arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
    arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
    arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
    arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
    arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
    arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
    /* Final multiplication by x restores the odd powers. */
    arctan = _mm256_mul_ps(x, arctan);

    return arctan;
}
49 | 
/*
 * Approximate arcsin(x) via polynomial expansion
 * P(u) such that asin(x) = x * P(x^2) on |x| <= 0.5
 *
 * Maximum relative error ~1.5e-6
 * Polynomial evaluated via Horner's method
 *
 * P(u) = c0 + c1*u + c2*u^2 + c3*u^3 is built up from c3 down to c0 with one
 * fused multiply-add per coefficient. Operates on all eight float lanes of x
 * independently; no range check on x is performed here.
 */
static inline __m256 _mm256_arcsin_poly_avx2_fma(const __m256 x)
{
    /* Coefficients of P, written as hexadecimal float literals. */
    const __m256 c0 = _mm256_set1_ps(0x1.ffffcep-1f);
    const __m256 c1 = _mm256_set1_ps(0x1.55b648p-3f);
    const __m256 c2 = _mm256_set1_ps(0x1.24d192p-4f);
    const __m256 c3 = _mm256_set1_ps(0x1.0a788p-4f);

    const __m256 u = _mm256_mul_ps(x, x);
    /* Horner steps: p = u * p + next lower coefficient. */
    __m256 p = c3;
    p = _mm256_fmadd_ps(u, p, c2);
    p = _mm256_fmadd_ps(u, p, c1);
    p = _mm256_fmadd_ps(u, p, c0);

    /* asin(x) is approximated by x * P(x^2). */
    return _mm256_mul_ps(x, p);
}
72 | 
73 | #endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
74 | 


--------------------------------------------------------------------------------
/.github/workflows/android_build.yml:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2022 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | name: Build on Android NDK
10 | 
11 | on: [push, pull_request]
12 | 
13 | jobs:
14 |   build:
15 |     name: Build on Android NDK ${{ matrix.arch.name }}
16 | 
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         arch: 
21 |           - { name: armeabi-v7a, allow_fail: false }
22 |           - { name: arm64-v8a, allow_fail: false }
23 |           - { name: x86, allow_fail: false }
24 |           - { name: x86_64, allow_fail: false }
25 | 
26 |     runs-on: ubuntu-latest
27 | 
28 |     steps:
29 |     - uses: actions/checkout@v4
30 |       with:
31 |         submodules: 'recursive'
32 | 
33 |     - name: Update repositories
34 |       run: sudo apt update 
35 |     
36 |     # All dependencies
37 |     - name: Install dependencies
38 |       run: sudo apt install -y cmake python3-mako
39 | 
40 |     # Setup Java
41 |     - uses: actions/setup-java@v4
42 |       with:
43 |         distribution: 'temurin'
44 |         java-version: '17'
45 | 
46 |     # Setup Android SDK, and auto-accept licenses
47 |     - name: Install Android SDK
48 |       run: wget --quiet --output-document=android-sdk.zip https://dl.google.com/android/repository/commandlinetools-linux-11076708_latest.zip && mkdir android-sdk-linux && unzip -qq android-sdk.zip -d android-sdk-linux && export ANDROID_HOME=./android-sdk-linux && echo y | $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=android-sdk-linux --update && (echo y; echo y; echo y; echo y; echo y; echo y; echo y; echo y) | $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=android-sdk-linux --licenses
49 | 
50 |     # Call SDKManager to install the Android NDK
51 |     - name: Install Android NDK
52 |       run: $GITHUB_WORKSPACE/android-sdk-linux/cmdline-tools/bin/sdkmanager --sdk_root=$GITHUB_WORKSPACE/android-sdk-linux --install "ndk;27.2.12479018" --channel=3
53 | 
54 |     # Setup build directory
55 |     - name: Setup ${{ matrix.arch.name }}
56 |       shell: bash
57 |       run: cd $GITHUB_WORKSPACE/ && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-sdk-linux/ndk/27.2.12479018/build/cmake/android.toolchain.cmake -DANDROID_ABI=${{ matrix.arch.name }} -DANDROID_PLATFORM=android-34 ..
58 | 
59 |     # Build
60 |     - name: Build ${{ matrix.arch.name }}
61 |       shell: bash
62 |       run: cd $GITHUB_WORKSPACE/build && make
63 |       continue-on-error: ${{ matrix.arch.allow_fail }}
64 | 


--------------------------------------------------------------------------------
/lib/volk_malloc.c:
--------------------------------------------------------------------------------
 1 | /* -*- c -*- */
 2 | /*
 3 |  * Copyright 2014 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 | 
14 | #include <volk/volk_malloc.h>
15 | 
16 | /*
17 |  * C11 features:
18 |  * see: https://en.cppreference.com/w/c/memory/aligned_alloc
19 |  *
20 |  * MSVC is broken
21 |  * see:
22 |  * https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170
23 |  * This section:
24 |  * C11 The Universal CRT implemented the parts of the
25 |  * C11 Standard Library that are required by C++17,
26 |  * with the exception of C99 strftime() E/O alternative
27 |  * conversion specifiers, C11 fopen() exclusive mode,
28 |  * and C11 aligned_alloc(). The latter is unlikely to
29 |  * be implemented, because C11 specified aligned_alloc()
30 |  * in a way that's incompatible with the Microsoft
31 |  * implementation of free():
32 |  * namely, that free() must be able to handle highly aligned allocations.
33 |  *
34 |  * We must work around this problem because MSVC is non-compliant!
35 |  */
36 | 
37 | 
38 | void* volk_malloc(size_t size, size_t alignment)
39 | {
40 |     if ((size == 0) || (alignment == 0)) {
41 |         return NULL;
42 |     }
43 |     // Round size up to a multiple of alignment to satisfy ASAN (the GCC address
44 |     // sanitizer). Calling 'volk_malloc' might therefore result in the allocation of
45 |     // more memory than requested. Any allocation size change here will in general not
46 |     // impact the end result since initial size alignment is required either way.
47 |     if (size % alignment) {
48 |         size += alignment - (size % alignment);
49 |     }
50 | #if HAVE_POSIX_MEMALIGN
51 |     // quoting posix_memalign() man page:
52 |     // "alignment must be a power of two and a multiple of sizeof(void *)"
53 |     // volk_get_alignment() could return 1 for some machines (e.g. generic_orc)
54 |     if (alignment == 1) {
55 |         return malloc(size);
56 |     }
57 |     void* ptr;
58 |     int err = posix_memalign(&ptr, alignment, size);
59 |     if (err != 0) { // a non-zero return value signals failure; hand back NULL instead
60 |         ptr = NULL;
61 |     }
62 | #elif defined(_MSC_VER) || defined(__MINGW32__)
63 |     void* ptr = _aligned_malloc(size, alignment);
64 | #else
65 |     void* ptr = aligned_alloc(alignment, size);
66 | #endif
67 |     return ptr;
68 | }
69 | 
70 | void volk_free(void* ptr)
71 | {
72 | #if defined(_MSC_VER) || defined(__MINGW32__)
73 |     _aligned_free(ptr); // counterpart of the _aligned_malloc() call in volk_malloc()
74 | #else
75 |     free(ptr);
76 | #endif
77 | }
78 | 


--------------------------------------------------------------------------------
/kernels/volk/volk_32u_popcntpuppet_32u.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2014 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_volk_32u_popcntpuppet_32u_H
11 | #define INCLUDED_volk_32u_popcntpuppet_32u_H
12 | 
13 | #include <stdint.h>
14 | #include <volk/volk_32u_popcnt.h>
15 | 
16 | #ifdef LV_HAVE_GENERIC
17 | static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector,
18 |                                                      const uint32_t* inVector,
19 |                                                      unsigned int num_points)
20 | {
21 |     for (size_t i = 0; i < num_points; ++i) {
22 |         volk_32u_popcnt_generic(outVector + i, inVector[i]);
23 |     }
24 | }
25 | #endif /* LV_HAVE_GENERIC */
26 | 
27 | #ifdef LV_HAVE_SSE4_2
28 | static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector,
29 |                                                       const uint32_t* inVector,
30 |                                                       unsigned int num_points)
31 | {
32 |     for (size_t i = 0; i < num_points; ++i) {
33 |         volk_32u_popcnt_a_sse4_2(outVector + i, inVector[i]);
34 |     }
35 | }
36 | #endif /* LV_HAVE_SSE4_2 */
37 | 
38 | #ifdef LV_HAVE_NEON
39 | static inline void volk_32u_popcntpuppet_32u_neon(uint32_t* outVector,
40 |                                                   const uint32_t* inVector,
41 |                                                   unsigned int num_points)
42 | {
43 |     for (size_t i = 0; i < num_points; ++i) {
44 |         volk_32u_popcnt_neon(outVector + i, inVector[i]);
45 |     }
46 | }
47 | #endif /* LV_HAVE_NEON */
48 | 
49 | #ifdef LV_HAVE_RVV
50 | static inline void volk_32u_popcntpuppet_32u_rvv(uint32_t* outVector,
51 |                                                  const uint32_t* inVector,
52 |                                                  unsigned int num_points)
53 | {
54 |     for (size_t i = 0; i < num_points; ++i) {
55 |         volk_32u_popcnt_rvv(outVector + i, inVector[i]);
56 |     }
57 | }
58 | #endif /* LV_HAVE_RVV */
59 | 
60 | #ifdef LV_HAVE_RVA22V
61 | static inline void volk_32u_popcntpuppet_32u_rva22(uint32_t* outVector,
62 |                                                    const uint32_t* inVector,
63 |                                                    unsigned int num_points)
64 | {
65 |     for (size_t i = 0; i < num_points; ++i) {
66 |         volk_32u_popcnt_rva22(outVector + i, inVector[i]);
67 |     }
68 | }
69 | #endif /* LV_HAVE_RVA22V */
70 | 
71 | #endif /* INCLUDED_volk_32u_popcntpuppet_32u_H */
72 | 


--------------------------------------------------------------------------------
/kernels/volk/volk_64u_popcntpuppet_64u.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2014 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_volk_64u_popcntpuppet_64u_H
11 | #define INCLUDED_volk_64u_popcntpuppet_64u_H
12 | 
13 | #include <stdint.h>
14 | #include <string.h>
15 | #include <volk/volk_64u_popcnt.h>
16 | 
17 | #ifdef LV_HAVE_GENERIC
18 | static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector,
19 |                                                      const uint64_t* inVector,
20 |                                                      unsigned int num_points)
21 | {
22 |     for (size_t i = 0; i < num_points; ++i) {
23 |         volk_64u_popcnt_generic(outVector + i, inVector[i]);
24 |     }
25 | }
26 | #endif /* LV_HAVE_GENERIC */
27 | 
28 | #if LV_HAVE_SSE4_2 && LV_HAVE_64
29 | static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector,
30 |                                                       const uint64_t* inVector,
31 |                                                       unsigned int num_points)
32 | {
33 |     for (size_t i = 0; i < num_points; ++i) {
34 |         volk_64u_popcnt_a_sse4_2(outVector + i, inVector[i]);
35 |     }
36 | }
37 | #endif /* LV_HAVE_SSE4_2 && LV_HAVE_64 */
38 | 
39 | #ifdef LV_HAVE_NEON
40 | static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector,
41 |                                                   const uint64_t* inVector,
42 |                                                   unsigned int num_points)
43 | {
44 |     for (size_t i = 0; i < num_points; ++i) {
45 |         volk_64u_popcnt_neon(outVector + i, inVector[i]);
46 |     }
47 | }
48 | #endif /* LV_HAVE_NEON */
49 | 
50 | #ifdef LV_HAVE_RVV
51 | static inline void volk_64u_popcntpuppet_64u_rvv(uint64_t* outVector,
52 |                                                  const uint64_t* inVector,
53 |                                                  unsigned int num_points)
54 | {
55 |     for (size_t i = 0; i < num_points; ++i) {
56 |         volk_64u_popcnt_rvv(outVector + i, inVector[i]);
57 |     }
58 | }
59 | #endif /* LV_HAVE_RVV */
60 | 
61 | #ifdef LV_HAVE_RVA22V
62 | static inline void volk_64u_popcntpuppet_64u_rva22(uint64_t* outVector,
63 |                                                    const uint64_t* inVector,
64 |                                                    unsigned int num_points)
65 | {
66 |     for (size_t i = 0; i < num_points; ++i) {
67 |         volk_64u_popcnt_rva22(outVector + i, inVector[i]);
68 |     }
69 | }
70 | #endif /* LV_HAVE_RVA22V */
71 | 
72 | #endif /* INCLUDED_volk_64u_popcntpuppet_64u_H */
73 | 


--------------------------------------------------------------------------------
/gen/machines.xml:
--------------------------------------------------------------------------------
 1 | <grammar>
 2 | 
 3 | <machine name="generic">
 4 | <archs>generic orc|</archs>
 5 | </machine>
 6 | 
 7 | <machine name="neon">
 8 | <archs>generic neon orc|</archs>
 9 | </machine>
10 | 
11 | <machine name="neonv7">
12 | <archs>generic neon neonv7 softfp|hardfp orc|</archs>
13 | </machine>
14 | 
15 | <machine name="neonv8">
16 | <archs>generic neon neonv8 orc|</archs>
17 | </machine>
18 | 
19 | <!-- trailing | bar means generate without either for MSVC -->
20 | <machine name="sse2">
21 | <archs>generic 32|64| mmx| sse sse2 orc|</archs>
22 | </machine>
23 | 
24 | <machine name="sse3">
25 | <archs>generic 32|64| mmx| sse sse2 sse3 orc|</archs>
26 | </machine>
27 | 
28 | <machine name="ssse3">
29 | <archs>generic 32|64| mmx| sse sse2 sse3 ssse3 orc|</archs>
30 | </machine>
31 | 
32 | <machine name="sifive_u74">
33 | <archs>generic riscv64 orc|</archs>
34 | </machine>
35 | 
36 | <machine name="rv64gcv">
37 | <archs>generic riscv64 rvv rvvseg orc|</archs>
38 | </machine>
39 | 
40 | <!--machine name="rva22v">
41 | <archs>generic riscv64 rvv rvvseg rva22v orc|</archs>
42 | </machine-->
43 | 
44 | <!--machine name="rva23">
45 | <archs>generic riscv64 rvv rvvseg rva22v rva23 orc|</archs>
46 | </machine-->
47 | 
48 | <machine name="sse4_a">
49 | <archs>generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc|</archs>
50 | </machine>
51 | 
52 | <machine name="sse4_1">
53 | <archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 orc|</archs>
54 | </machine>
55 | 
56 | <machine name="sse4_2">
57 | <archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount orc|</archs>
58 | </machine>
59 | 
60 | <!-- trailing | bar means generate without either for MSVC -->
61 | <machine name="avx">
62 | <archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs>
63 | </machine>
64 | 
65 | <!-- trailing | bar means generate without either for MSVC -->
66 | <machine name="avx2">
67 | <archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 orc|</archs>
68 | </machine>
69 | 
70 | <!-- trailing | bar means generate without either for MSVC -->
71 | <machine name="avx512f">
72 | <archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f orc|</archs>
73 | </machine>
74 | 
75 | <!-- trailing | bar means generate without either for MSVC -->
76 | <machine name="avx512cd">
77 | <archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc|</archs>
78 | </machine>
79 | 
80 | <!-- trailing | bar means generate without either for MSVC -->
81 | <machine name="avx512dq">
82 | <archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc|</archs>
83 | </machine>
84 | 
85 | </grammar>
86 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/riscv/volk_32f_s32f_multiply_32f_sifive_u74.s:
--------------------------------------------------------------------------------
 1 |         .text
 2 |         .align 2
 3 |         .type volk_32f_s32f_multiply_32f_sifive_u74, @function
 4 |         .global volk_32f_s32f_multiply_32f_sifive_u74
 5 | 
 6 | volk_32f_s32f_multiply_32f_sifive_u74:
 7 |         # Input:
 8 |         # a0  out
 9 |         # a1  in
10 |         # fa0 scalar
11 |         # a2  size
12 | 
13 |         # Main loop is 8x unrolled.
14 | 
15 |         # Split counter into main and final loop.
16 |         # a5  main loop counter
17 |         # a2  closing loop counter
18 |         srli    a5,a2,3
19 |         andi    a2,a2,7
20 |         slli    a5,a5,5
21 |         beqz    a5,.dolastloop
22 |         add     a5,a0,a5
23 | 
24 |         .align 2
25 | .loop:
26 |         flw     fa1,0(a1)
27 |         addi    a0,a0,32      # increment output (free, running on pipeline A)
28 | 
29 |         flw     fa2,4(a1)
30 |         flw     fa3,8(a1)
31 |         flw     fa4,12(a1)
32 |         flw     fa5,16(a1)
33 |         flw     fa6,20(a1)
34 |         flw     fa7,24(a1)
35 |         flw     ft8,28(a1)
36 |         addi    a1,a1,32      # increment input (free, running on pipeline A)
37 | 
38 |         fmul.s  fa1,fa1,fa0
39 |         fmul.s  fa2,fa2,fa0
40 |         fmul.s  fa3,fa3,fa0
41 |         fmul.s  fa4,fa4,fa0
42 |         fmul.s  fa5,fa5,fa0
43 |         fmul.s  fa6,fa6,fa0
44 |         fmul.s  fa7,fa7,fa0
45 |         fmul.s  ft8,ft8,fa0
46 | 
47 |         fsw     fa1,-32(a0)
48 |         fsw     fa2,-28(a0)
49 |         fsw     fa3,-24(a0)
50 |         fsw     fa4,-20(a0)
51 |         fsw     fa5,-16(a0)
52 |         fsw     fa6,-12(a0)
53 |         fsw     fa7,-8(a0)
54 |         fsw     ft8,-4(a0)
55 | 
56 |         bne    a5,a0,.loop
57 | 
58 |         .align 2
59 | .dolastloop:
60 |         # TODO: is branch assumed to be taken or not?
61 |         beqz    a2,.done
62 | 
63 |         # Everything below is less optimized. In theory we could split
64 |         # this into more partial unrolled loops, but it's at most 7
65 |         # iterations, so not clear that it's worth it.
66 | 
67 |         # make a2 a pointer to the last entry.
68 |         slli    a2,a2,2
69 |         add     a2,a0,a2   # Stall!
70 | 
71 |         .align 2
72 | .lastloop:
73 |         flw     fa5,0(a1)     # Latency: 2
74 |         addi    a0,a0,4       # Increment out
75 |         fmul.s  fa5,fa5,fa0   # Stalled for a cycle or two. Latency: 5
76 |         addi    a1,a1,4       # Increment in
77 |         fsw     fa5,-4(a0)    # Stalled for a couple of cycles waiting for mul.
78 |         bne     a2,a0,.lastloop
79 | 
80 | 	.align 2
81 | .done:
82 |         ret
83 | 	.size	volk_32f_s32f_multiply_32f_sifive_u74, .-volk_32f_s32f_multiply_32f_sifive_u74
84 | 


--------------------------------------------------------------------------------
/gen/volk_arch_defs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Copyright 2012 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | 
10 | archs = list()
11 | arch_dict = dict()
12 | 
13 | class arch_class(object):
14 |     def __init__(self, flags, checks, **kwargs):
15 |         for key, cast, failval in (
16 |             ('name', str, None),
17 |             ('environment', str, None),
18 |             ('include', str, None),
19 |             ('alignment', int, 1)
20 |         ):
21 |             try: setattr(self, key, cast(kwargs[key]))
22 |             except: setattr(self, key, failval)
23 |         self.checks = checks
24 |         assert(self.name)
25 |         self._flags = flags
26 | 
27 |     def is_supported(self, compiler):
28 |         if not self._flags.keys(): return True
29 |         return compiler in self._flags.keys()
30 | 
31 |     def get_flags(self, compiler):
32 |         try: return self._flags[compiler]
33 |         except KeyError: return list()
34 | 
35 |     def __repr__(self): return self.name
36 | 
37 | def register_arch(**kwargs):
38 |     arch = arch_class(**kwargs)
39 |     archs.append(arch)
40 |     arch_dict[arch.name] = arch
41 | 
42 | ########################################################################
43 | # register the arches
44 | ########################################################################
45 | #TODO skip the XML and put it here
46 | from xml.dom import minidom
47 | import os
48 | gendir = os.path.dirname(__file__)
49 | archs_xml = minidom.parse(os.path.join(gendir, 'archs.xml')).getElementsByTagName('arch')
50 | for arch_xml in archs_xml:
51 |     kwargs = dict()
52 |     for attr in arch_xml.attributes.keys():
53 |         kwargs[attr] = arch_xml.attributes[attr].value
54 |     for node in arch_xml.childNodes:
55 |         try:
56 |             name = node.tagName
57 |             val = arch_xml.getElementsByTagName(name)[0].firstChild.data
58 |             kwargs[name] = val
59 |         except: pass
60 |     checks = list()
61 |     for check_xml in arch_xml.getElementsByTagName("check"):
62 |         name = check_xml.attributes["name"].value
63 |         params = list()
64 |         for param_xml in check_xml.getElementsByTagName("param"):
65 |             params.append(param_xml.firstChild.data)
66 |         checks.append([name, params])
67 |     flags = dict()
68 |     for flag_xml in arch_xml.getElementsByTagName("flag"):
69 |         name = flag_xml.attributes["compiler"].value
70 |         if name not in flags: flags[name] = list()
71 |         flags[name].append(flag_xml.firstChild.data)
72 |     register_arch(flags=flags, checks=checks, **kwargs)
73 | 
74 | if __name__ == '__main__':
75 |     print(archs)
76 |     
77 | 


--------------------------------------------------------------------------------
/include/volk/volk_sse3_intrinsics.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2015 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | /*
11 |  * This file is intended to hold helper functions built from SSE3 intrinsics.
12 |  * They should be used in VOLK kernels to avoid copy-pasta.
13 |  */
14 | 
15 | #ifndef INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_
16 | #define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_
17 | #include <pmmintrin.h>
18 | 
19 | static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y)
20 | {
21 |     __m128 yl, yh, tmp1, tmp2;
22 |     yl = _mm_moveldup_ps(y);        // Load yl with cr,cr,dr,dr
23 |     yh = _mm_movehdup_ps(y);        // Load yh with ci,ci,di,di
24 |     tmp1 = _mm_mul_ps(x, yl);       // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
25 |     x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
26 |     tmp2 = _mm_mul_ps(x, yh);       // tmp2 = ai*ci,ar*ci,bi*di,br*di
27 |     return _mm_addsub_ps(tmp1,
28 |                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
29 | }
30 | 
31 | static inline __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
32 | {
33 |     const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
34 |     y = _mm_xor_ps(y, conjugator); // conjugate y
35 |     return _mm_complexmul_ps(x, y);
36 | }
37 | 
38 | static inline __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
39 | {
40 |     cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
41 |     cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
42 |     return _mm_hadd_ps(cplxValue1, cplxValue2);      // Add the I2 and Q2 values
43 | }
44 | 
45 | static inline __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
46 | {
47 |     return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
48 | }
49 | 
50 | static inline __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0,
51 |                                                   const __m128 symbols1,
52 |                                                   const __m128 points0,
53 |                                                   const __m128 points1,
54 |                                                   const __m128 scalar)
55 | {
56 |     /*
57 |      * Calculate: |y - x|^2 * SNR_lin
58 |      * Consider 'symbolsX' and 'pointsX' to be complex float
59 |      * 'symbolsX' are 'y' and 'pointsX' are 'x'
60 |      */
61 |     const __m128 diff0 = _mm_sub_ps(symbols0, points0);
62 |     const __m128 diff1 = _mm_sub_ps(symbols1, points1);
63 |     const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1);
64 |     return _mm_mul_ps(norms, scalar);
65 | }
66 | 
67 | #endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */
68 | 


--------------------------------------------------------------------------------
/apps/volk-config-info.cc:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2013, 2016, 2018 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #if HAVE_CONFIG_H
11 | #include <config.h>
12 | #endif
13 | 
14 | #include <volk/constants.h> // for volk_available_machines, volk_c_com...
15 | #include <iostream>         // for operator<<, endl, cout, ostream
16 | #include <string>           // for string
17 | 
18 | #include "volk/volk.h"           // for volk_get_alignment, volk_get_machine
19 | #include "volk_option_helpers.h" // for option_list, option_t
20 | 
21 | void print_alignment()
22 | {
23 |     std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl;
24 | }
25 | 
26 | void print_malloc()
27 | {
28 |     // Mirror the preprocessor branches of volk_malloc() in lib/volk_malloc.c so that
29 |     // the reported implementation is the allocator that is actually compiled in.
30 |     std::cout << "Used malloc implementation: ";
31 | #if HAVE_POSIX_MEMALIGN
32 |     std::cout << "posix_memalign" << std::endl;
33 | #elif defined(_MSC_VER) || defined(__MINGW32__)
34 |     std::cout << "_aligned_malloc" << std::endl;
35 | #else
36 |     std::cout << "C11 aligned_alloc" << std::endl;
37 | #endif
38 | }
39 | 
40 | 
41 | int main(int argc, char** argv)
42 | {
43 | 
44 |     option_list our_options("volk-config-info");
45 |     our_options.add(
46 |         option_t("prefix", "", "print the VOLK installation prefix", volk_prefix()));
47 |     our_options.add(
48 |         option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler()));
49 |     our_options.add(
50 |         option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags()));
51 |     our_options.add(option_t(
52 |         "all-machines", "", "print VOLK machines built", volk_available_machines()));
53 |     our_options.add(option_t("avail-machines",
54 |                              "",
55 |                              "print VOLK machines on the current "
56 |                              "platform",
57 |                              volk_list_machines));
58 |     our_options.add(option_t("machine",
59 |                              "",
60 |                              "print the current VOLK machine that will be used",
61 |                              volk_get_machine()));
62 |     our_options.add(
63 |         option_t("alignment", "", "print the memory alignment", print_alignment));
64 |     our_options.add(option_t("malloc",
65 |                              "",
66 |                              "print the malloc implementation used in volk_malloc",
67 |                              print_malloc));
68 |     our_options.add(option_t("version", "v", "print the VOLK version", volk_version()));
69 | 
70 |     our_options.parse(argc, argv);
71 | 
72 |     return 0;
73 | }
74 | 


--------------------------------------------------------------------------------
/cmake/Modules/VolkVersion.cmake:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Free Software Foundation, Inc.
 2 | #
 3 | # This file is part of VOLK.
 4 | #
 5 | # SPDX-License-Identifier: LGPL-3.0-or-later
 6 | #
 7 | 
 8 | if(DEFINED __INCLUDED_VOLK_VERSION_CMAKE)
 9 |     return()
10 | endif()
11 | set(__INCLUDED_VOLK_VERSION_CMAKE TRUE)
12 | 
13 | #eventually, replace version.sh and fill in the variables below
14 | set(MAJOR_VERSION ${VERSION_INFO_MAJOR_VERSION})
15 | set(MINOR_VERSION ${VERSION_INFO_MINOR_VERSION})
16 | set(MAINT_VERSION ${VERSION_INFO_MAINT_VERSION})
17 | 
18 | ########################################################################
19 | # Extract the version string from git describe.
20 | ########################################################################
21 | find_package(Git)
22 | 
23 | if(GIT_FOUND AND EXISTS ${CMAKE_SOURCE_DIR}/.git)
24 |     message(STATUS "Extracting version information from git describe...")
25 |     execute_process(
26 |         COMMAND ${GIT_EXECUTABLE} describe --always --abbrev=8 --long
27 |         OUTPUT_VARIABLE GIT_DESCRIBE
28 |         OUTPUT_STRIP_TRAILING_WHITESPACE
29 |         WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
30 | else()
31 |     if(NOT VOLK_GIT_COUNT)
32 |         set(VOLK_GIT_COUNT "0")
33 |     endif()
34 | 
35 |     if(NOT VOLK_GIT_HASH)
36 |         set(VOLK_GIT_HASH "unknown")
37 |     endif()
38 | 
39 |     set(GIT_DESCRIBE
40 |         "v${MAJOR_VERSION}.${MINOR_VERSION}-${VOLK_GIT_COUNT}-${VOLK_GIT_HASH}")
41 | endif()
42 | 
43 | ########################################################################
44 | # Use the logic below to set the version constants
45 | ########################################################################
46 | if("${MINOR_VERSION}" STREQUAL "git")
47 |     # VERSION: 1.0git-xxx-gxxxxxxxx
48 |     # DOCVER:  1.0git
49 |     # SOVERSION:  1.0git
50 |     set(VERSION "${GIT_DESCRIBE}")
51 |     set(DOCVER "${MAJOR_VERSION}.0${MINOR_VERSION}")
52 |     set(SOVERSION "${MAJOR_VERSION}.0${MINOR_VERSION}")
53 |     set(RC_MINOR_VERSION "0")
54 |     set(RC_MAINT_VERSION "0")
55 | elseif("${MAINT_VERSION}" STREQUAL "git")
56 |     # VERSION: 1.xgit-xxx-gxxxxxxxx
57 |     # DOCVER:  1.xgit
58 |     # SOVERSION:  1.xgit
59 |     set(VERSION "${GIT_DESCRIBE}")
60 |     set(DOCVER "${MAJOR_VERSION}.${MINOR_VERSION}${MAINT_VERSION}")
61 |     set(SOVERSION "${MAJOR_VERSION}.${MINOR_VERSION}${MAINT_VERSION}")
62 |     math(EXPR RC_MINOR_VERSION "${MINOR_VERSION} - 1")
63 |     set(RC_MAINT_VERSION "0")
64 | else()
65 |     # This is a numbered release.
66 |     # VERSION: 1.1{.x}
67 |     # DOCVER:  1.1{.x}
68 |     # SOVERSION:  1.1
69 |     set(VERSION "${MAJOR_VERSION}.${MINOR_VERSION}.${MAINT_VERSION}")
70 |     set(DOCVER "${VERSION}")
71 |     set(SOVERSION "${MAJOR_VERSION}.${MINOR_VERSION}")
72 |     set(RC_MINOR_VERSION ${MINOR_VERSION})
73 |     set(RC_MAINT_VERSION ${MAINT_VERSION})
74 | endif()
75 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s:
--------------------------------------------------------------------------------
 1 | @ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points)
 2 | 	.global	volk_32fc_32f_dot_prod_32fc_a_neonasmvmla
 3 | volk_32fc_32f_dot_prod_32fc_a_neonasmvmla:
 4 | 	@ r0 - result: pointer to output array (32fc)
 5 | 	@ r1 - input: pointer to input array 1 (32fc)
 6 | 	@ r2 - taps: pointer to input array 2 (32f)
 7 | 	@ r3 - num_points: number of items to process
 8 | 
 9 | 	result .req r0
10 | 	input .req r1
11 | 	taps .req r2
12 | 	num_points .req r3
13 | 	quarterPoints .req r7
14 | 	number .req r8
15 | 	@ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
16 |     @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
17 |     @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
18 |     @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
19 | 	realAccQ   .req q0 @ d0-d1/s0-s3
20 | 	compAccQ   .req q1 @ d2-d3/s4-s7
21 | 	realAccS   .req s0 @ d0[0]
22 | 	compAccS   .req s4 @ d2[0]
23 | 	tapsVal    .req q2 @ d4-d5
24 | 	outVal     .req q3 @ d6-d7
25 |     realMul    .req q8 @ d16-d17
26 |     compMul    .req q9 @ d18-d19
27 |     inRealVal  .req q10 @ d20-d21
28 | 	inCompVal  .req q11 @ d22-d23
29 | 
30 | 	stmfd	sp!, {r7, r8, sl}	@ prologue - save register states
31 | 
32 | 	veor realAccQ, realAccQ @ zero out accumulators
33 | 	veor compAccQ, compAccQ @ zero out accumulators
34 | 	movs quarterPoints, num_points, lsr #2
35 | 	mov number, quarterPoints @ set before the branch: .loop2 compares against number; mov (no s) keeps the flags from movs
36 | 	beq .loop2 @ if zero into quarterPoints
37 | 
38 | 
39 | .loop1:
40 | 	@ do work here
41 | 	pld [taps, #128] @ pre-load hint - this is implementation specific!
42 | 	pld [input, #128] @ pre-load hint - this is implementation specific!
43 | 	vld1.32 {tapsVal}, [taps:128]! @ tapsVal
44 | 	vld2.32 {inRealVal-inCompVal}, [input:128]! @ inRealVal, inCompVal
45 | 	vmla.f32 realAccQ, tapsVal, inRealVal
46 | 	vmla.f32 compAccQ, tapsVal, inCompVal
47 | 	subs number, number, #1
48 | 	bne	.loop1	@ first loop
49 | 
50 |     @ Sum up across realAccQ and compAccQ
51 |     vadd.f32 d0, d0, d1      @ realAccQ +-> d0
52 |     vadd.f32 d2, d2, d3      @ compAccQ +-> d2
53 |     vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
54 |     vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
55 |     @ critical values are now in s0 (realAccS), s4 (compAccS)
56 | 	mov	number, quarterPoints, asl #2
57 | .loop2:
58 | 	cmp	num_points, number
59 | 	bls	.done
60 | 
61 | 	vld1.32 {d4[0]}, [taps]! @ s8
62 | 	vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
63 | 	vmla.f32 realAccS, s8, s10 @ d0[0]
64 | 	vmla.f32 compAccS, s8, s12 @ d2[0]
65 | 
66 | 	add number, number, #1
67 | 	b .loop2
68 | 
69 | .done:
70 |     vst1.32 {d0[0]}, [result]! @ realAccS
71 |     vst1.32 {d2[0]}, [result]  @ compAccS
72 | 
73 | 	ldmfd	sp!, {r7, r8, sl} @ epilogue - restore register states
74 | 	bx	lr
75 | 


--------------------------------------------------------------------------------
/apps/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2011-2013 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | 
 9 | ########################################################################
10 | # Setup profiler
11 | ########################################################################
12 | 
13 | # POSIX_MEMALIGN: If we have to fall back to `posix_memalign`.
14 | if(HAVE_POSIX_MEMALIGN)
15 |     message(STATUS "Use `posix_memalign` for aligned malloc!")
16 |     add_definitions(-DHAVE_POSIX_MEMALIGN)
17 | endif(HAVE_POSIX_MEMALIGN)
18 | 
19 | # MAKE volk_profile
20 | add_executable(
21 |     volk_profile
22 |     ${CMAKE_CURRENT_SOURCE_DIR}/volk_profile.cc ${PROJECT_SOURCE_DIR}/lib/qa_utils.cc
23 |     ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc)
24 | target_compile_features(volk_profile PUBLIC cxx_std_17)
25 | 
26 | if(MSVC)
27 |     target_include_directories(
28 |         volk_profile PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/cmake/msvc>)
29 | endif(MSVC)
30 | 
31 | target_include_directories(
32 |     volk_profile
33 |     PRIVATE $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
34 |     PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
35 |     PRIVATE $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/lib>
36 |     PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lib>
37 |     PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
38 |     PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
39 | 
40 | if(ENABLE_STATIC_LIBS)
41 |     target_link_libraries(volk_profile PRIVATE volk_static fmt::fmt)
42 |     set_target_properties(volk_profile PROPERTIES LINK_FLAGS "-static")
43 | else()
44 |     target_link_libraries(volk_profile PRIVATE volk fmt::fmt)
45 | endif()
46 | 
47 | install(
48 |     TARGETS volk_profile
49 |     DESTINATION bin
50 |     COMPONENT "volk")
51 | 
52 | # MAKE volk-config-info
53 | add_executable(volk-config-info volk-config-info.cc
54 |                                 ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc)
55 | target_compile_features(volk-config-info PUBLIC cxx_std_17)
56 | 
57 | if(ENABLE_STATIC_LIBS)
58 |     target_link_libraries(volk-config-info volk_static)
59 |     set_target_properties(volk-config-info PROPERTIES LINK_FLAGS "-static")
60 | else()
61 |     target_link_libraries(volk-config-info volk)
62 | endif()
63 | 
64 | install(
65 |     TARGETS volk-config-info
66 |     DESTINATION bin
67 |     COMPONENT "volk")
68 | 
69 | # Launch volk_profile if requested to do so
70 | if(ENABLE_PROFILING)
71 |     if(DEFINED VOLK_CONFIGPATH)
72 |         set(VOLK_CONFIG_ARG "-p${VOLK_CONFIGPATH}")
73 |         set(VOLK_CONFIG "${VOLK_CONFIGPATH}/volk_config")
74 |     endif()
75 | 
76 |     add_custom_command(
77 |         OUTPUT ${VOLK_CONFIG}
78 |         COMMAND volk_profile "${VOLK_CONFIG_ARG}"
79 |         DEPENDS volk_profile
80 |         COMMENT "Launching profiler, this may take a few minutes...")
81 |     add_custom_target(volk-profile-run ALL DEPENDS ${VOLK_CONFIG})
82 | 
83 | endif()
84 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2018 - 2022 Free Software Foundation, Inc.
 3 | #
 4 | # This file is part of VOLK
 5 | #
 6 | # SPDX-License-Identifier: LGPL-3.0-or-later
 7 | #
 8 | language: cpp
 9 | 
10 | os: linux
11 | dist: bionic
12 | 
13 | addons:
14 |   apt:
15 |     packages: &common_packages
16 |       - python3-mako
17 |       - liborc-dev
18 | 
19 | env:
20 |   global:
21 |     - SDE_VERSION=sde-external-8.50.0-2020-03-26-lin
22 |     - SDE_URL=http://software.intel.com/content/dam/develop/external/us/en/protected/
23 | 
24 | matrix:
25 |   include:
26 |     # Job 1 ... gcc-7 with Intel SDE
27 |     - name: Linux x86 Intel SDE GCC 7
28 |       env: MATRIX_EVAL="CC=gcc-7 && CXX=g++-7 CMAKE_ARG=-DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/intel-sde.cmake"
29 |       addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages]}}
30 |       cache:
31 |         directories:
32 |           - ${TRAVIS_BUILD_DIR}/cache
33 |       before_script:
34 |         - cd ${TRAVIS_BUILD_DIR} && ./scripts/ci/download_intel_sde.sh
35 | 
36 |     # Job 4 ... gcc-6
37 |     - name: Linux x86 GCC 6
38 |       env: MATRIX_EVAL="CC=gcc-6 && CXX=g++-6"
39 |       addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages, g++-6]}}
40 | 
41 |     # Job 7 ... ARMv7 cross compile
42 |     - name: Linux ARMv7 Qemu GCC 7
43 |       env: MATRIX_EVAL="CMAKE_ARG=-DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/arm-linux-gnueabihf.cmake"
44 |       addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages, g++-arm-linux-gnueabihf, qemu-user]}}
45 | 
46 |     # Job 8 ... ARMv8 (aarch64) cross compile
47 |     - name: Linux ARMv8 (aarch64) Qemu GCC 7
48 |       env: MATRIX_EVAL="CMAKE_ARG=-DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/aarch64-linux-gnu.cmake"
49 |       addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages, g++-aarch64-linux-gnu, qemu-user]}}
50 | 
51 |     # Job 9 ... clang
52 |     - name: Linux x86 Clang 6
53 |       env: MATRIX_EVAL="CC=\"clang -fprofile-instr-generate -fcoverage-mapping\" && CXX=\"clang++ -fprofile-instr-generate -fcoverage-mapping\""
54 |       addons: {apt: {packages: [*common_packages, ]}}
55 | 
56 |     - name: Linux ARMv8 (aarch64) GCC 7
57 |       arch: arm64
58 |       env: MATRIX_EVAL="CC=gcc-7 && CXX=g++-7"
59 |       addons: {apt: {packages: [*common_packages, ]}}
60 | 
61 |     - name: Linux ARMv8 (aarch64) Clang 6
62 |       arch: arm64
63 |       env: MATRIX_EVAL="CC=clang && CXX=clang++"
64 |       addons: {apt: {packages: [*common_packages, ]}}
65 | 
66 | script:
67 |   - eval "${MATRIX_EVAL}"
68 |   - lscpu
69 |   - git submodule update --init --recursive
70 |   - mkdir build && cd build
71 |   - cmake ${CMAKE_ARG} ../
72 |   - make
73 |   - echo $(./apps/volk-config-info --malloc) && echo $(./apps/volk-config-info --alignment) && echo "All compiled VOLK machines:" $(./apps/volk-config-info --all-machines) && echo "Available VOLK machines:" $(./apps/volk-config-info --avail-machines)
74 |   - ctest -V
75 | 


--------------------------------------------------------------------------------
/tmpl/volk_cpu.tmpl.c:
--------------------------------------------------------------------------------
  1 | /* -*- c++ -*- */
  2 | /*
  3 |  * Copyright 2011-2012 Free Software Foundation, Inc.
  4 |  *
  5 |  * This file is part of VOLK
  6 |  *
  7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
  8 |  */
  9 | 
 10 | #include <volk/volk_cpu.h>
 11 | #include <volk/volk_config_fixed.h>
 12 | #include <stdlib.h>
 13 | #include <string.h>
 14 | 
 15 | 
 16 | #if defined(VOLK_CPU_FEATURES)
 17 | #include "cpu_features_macros.h"
 18 | #if defined(CPU_FEATURES_ARCH_X86)
 19 | #include "cpuinfo_x86.h"
 20 | #elif defined(CPU_FEATURES_ARCH_ARM)
 21 | #include "cpuinfo_arm.h"
 22 | #elif defined(CPU_FEATURES_ARCH_AARCH64)
 23 | #include "cpuinfo_aarch64.h"
 24 | #elif defined(CPU_FEATURES_ARCH_MIPS)
 25 | #include "cpuinfo_mips.h"
 26 | #elif defined(CPU_FEATURES_ARCH_PPC)
 27 | #include "cpuinfo_ppc.h"
 28 | #elif defined(CPU_FEATURES_ARCH_RISCV)
 29 | #include "cpuinfo_riscv.h"
 30 | #endif
 31 | 
 32 | // This is required for MSVC
 33 | #if defined(__cplusplus)
 34 | using namespace cpu_features;
 35 | #endif
 36 | #endif
 37 | 
 38 | 
 39 | struct VOLK_CPU volk_cpu;
 40 | 
 41 | %for arch in archs:
 42 | static int i_can_has_${arch.name} (void) { // generated once per arch: 0 as soon as a compiled-in feature check fails
 43 |     %for check, params in arch.checks:
 44 |         %if "neon" in arch.name:
 45 | #if defined(CPU_FEATURES_ARCH_ARM)
 46 |     if (GetArmInfo().features.${check} == 0){ return 0; }
 47 | #endif
 48 |         %elif "mips" in arch.name:
 49 | #if defined(CPU_FEATURES_ARCH_MIPS)
 50 |     if (GetMipsInfo().features.${check} == 0){ return 0; }
 51 | #endif
 52 |         %elif "riscv" in arch.name or arch.name[:2] == "rv":
 53 | #if defined(CPU_FEATURES_ARCH_RISCV)
 54 |     if (GetRiscvInfo().features.${check} == 0){ return 0; }
 55 | #endif
 56 |         %else:
 57 | #if defined(CPU_FEATURES_ARCH_X86)
 58 |     if (GetX86Info().features.${check} == 0){ return 0; }
 59 | #endif
 60 |         %endif
 61 |     %endfor
 62 |     return 1; // also reached when no check is compiled in (matching CPU_FEATURES_ARCH_* macro undefined)
 63 | }
 64 | 
 65 | %endfor
 66 | 
 67 | #if defined(HAVE_FENV_H) /* pick a round-to-nearest setter for this toolchain */
 68 |     #include <fenv.h> /* must come first: this header is what defines FE_TONEAREST */
 69 |     #if defined(FE_TONEAREST)
 70 |         static inline void set_float_rounding(void){
 71 |             fesetround(FE_TONEAREST);
 72 |         }
 73 |     #else
 74 |         static inline void set_float_rounding(void){
 75 |             //do nothing: <fenv.h> on this platform does not offer FE_TONEAREST
 76 |         }
 77 |     #endif
 78 | #elif defined(_MSC_VER)
 79 |     #include <float.h>
 80 |     static inline void set_float_rounding(void){
 81 |         unsigned int cwrd;
 82 |         _controlfp_s(&cwrd, 0, 0); // mask 0: only reads the current control word
 83 |         _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC); // set the rounding-control field to nearest
 84 |     }
 85 | #else
 86 |     static inline void set_float_rounding(void){
 87 |         //do nothing: no rounding-mode API was detected for this toolchain
 88 |     }
 89 | #endif
 90 | 
 91 | 
 92 | void volk_cpu_init() { // fills every volk_cpu.has_<arch> slot, then calls set_float_rounding()
 93 |     %for arch in archs:
 94 |     volk_cpu.has_${arch.name} = &i_can_has_${arch.name}; // probe generated above for this arch
 95 |     %endfor
 96 |     set_float_rounding();
 97 | }
 98 | 
 99 | unsigned int volk_get_lvarch() { // returns the sum of has_<arch>() << LV_<ARCH> over all archs
100 |     unsigned int retval = 0;
101 |     volk_cpu_init(); // re-run on every call, so the has_* pointers are always populated here
102 |     %for arch in archs:
103 |     retval += volk_cpu.has_${arch.name}() << LV_${arch.name.upper()}; // acts as a bitmask assuming the LV_* indices are distinct - TODO confirm
104 |     %endfor
105 |     return retval;
106 | }
107 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests.s:
--------------------------------------------------------------------------------
 1 | @ static inline void volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)@
 2 | .global	volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests
 3 | volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests:
 4 |     push    {r4, r5, r6, r7, r8, r9, sl, fp, lr}    @ save core registers and the return address
 5 |     vpush   {d8-d15}                                @ save d8-d15 (q4-q7 are used as accumulators)
 6 |     lsrs    fp, r3, #3      @ fp = num_points / 8; sets Z for the beq below
 7 |     sub     sp, sp, #52     @ 0x34 bytes of stack scratch space
 8 |     mov     r9, r3          @ r9 = num_points
 9 |     mov     sl, r0          @ sl = cVector (result pointer)
10 |     mov     r7, r1          @ r7 = aVector
11 |     mov     r8, r2          @ r8 = bVector
12 |     vorr    q0, q7, q7      @ NOTE(review): q0-q5 receive a copy of the incoming q7, not zero
13 |     vorr    q1, q7, q7
14 |     vorr    q2, q7, q7
15 |     vorr    q3, q7, q7
16 |     vorr    q4, q7, q7
17 |     vorr    q5, q7, q7
18 |     veor    q6, q7, q7      @ q6 = 0 (the only accumulator that is actually cleared)
19 |     vorr    q7, q7, q7      @ leaves q7 unchanged
20 |     beq     .smallvector    @ fewer than 8 points: skip the vector loop
21 |     mov     r4, r1
22 |     mov     ip, r2
23 |     mov     r3, #0          @ r3 = loop iteration counter
24 | .mainloop:
25 |     @mov     r6, ip
26 |     @mov     r5, r4
27 |     vld4.32 {d24,d26,d28,d30}, [r6]!    @ NOTE(review): r6 is never written in this routine before this load (mov above is commented out)
28 |     @add     ip, ip, #64     @ 0x40
29 |     @add     r4, r4, #64     @ 0x40
30 |     vld4.32 {d16,d18,d20,d22}, [r5]!    @ NOTE(review): same for r5
31 |     add     r3, r3, #1
32 |     vld4.32 {d25,d27,d29,d31}, [r6]!
33 |     vld4.32 {d17,d19,d21,d23}, [r5]!
34 |     vmla.f32        q6, q8, q12
35 |     vmla.f32        q0, q9, q12
36 |     cmp     r3, fp          @ done after fp iterations
37 |     vmls.f32        q5, q13, q9
38 |     vmla.f32        q2, q13, q8
39 |     vmla.f32        q7, q10, q14
40 |     vmla.f32        q1, q11, q14
41 |     vmls.f32        q4, q15, q11
42 |     vmla.f32        q3, q15, q10
43 |     bne     .mainloop
44 |     lsl     r3, fp, #6      @ r3 = iterations * 64 bytes
45 |     add     r8, r8, r3      @ advance both input pointers past the part covered by the loop count
46 |     add     r7, r7, r3
47 | .smallvector:
48 |     vadd.f32        q3, q2, q3
49 |     add     r3, sp, #16     @ r3 = stack address the partial sums are spilled to
50 |     lsl     r4, fp, #3      @ r4 = number of points covered by the vector loop
51 |     vadd.f32        q4, q5, q4
52 |     cmp     r9, r4          @ flags are consumed by the bls before .tailcase
53 |     vadd.f32        q6, q6, q7
54 |     vadd.f32        q1, q0, q1
55 |     vadd.f32        q8, q6, q4
56 |     vadd.f32        q9, q1, q3
57 |     vst2.32 {d16-d19}, [r3 :64]     @ spill q8/q9 to sp+16 .. sp+47
58 |     vldr    s15, [sp, #24]
59 |     vldr    s16, [sp, #16]
60 |     vldr    s17, [sp, #20]
61 |     vadd.f32        s16, s16, s15
62 |     vldr    s11, [sp, #28]
63 |     vldr    s12, [sp, #40]  @ 0x28
64 |     vldr    s13, [sp, #44]  @ 0x2c
65 |     vldr    s14, [sp, #32]
66 |     vldr    s15, [sp, #36]  @ 0x24
67 |     vadd.f32        s17, s17, s11
68 |     vadd.f32        s16, s16, s12
69 |     vadd.f32        s17, s17, s13
70 |     vadd.f32        s16, s16, s14
71 |     vadd.f32        s17, s17, s15
72 |     vstr    s16, [sl]       @ store the two reduced sums to cVector[0] and cVector[1]
73 |     vstr    s17, [sl, #4]
74 |     bls     .epilog         @ num_points <= points already covered: no tail
75 |     add     r5, sp, #8      @ r5 = stack slot handed to __mulsc3 in r0
76 | .tailcase:
77 |     ldr     r3, [r7], #8    @ load one 8-byte element from each input, post-incrementing
78 |     mov     r0, r5
79 |     ldr     r1, [r8], #8
80 |     add     r4, r4, #1      @ one more point consumed
81 |     ldr     ip, [r7, #-4]
82 |     ldr     r2, [r8, #-4]
83 |     str     ip, [sp]
84 |     bl      __mulsc3        @ presumably the compiler runtime's complex-float multiply; its result is read back from sp+8
85 |     vldr    s14, [sp, #8]
86 |     vldr    s15, [sp, #12]
87 |     vadd.f32        s16, s16, s14
88 |     cmp     r4, r9          @ loop until every point has been consumed
89 |     vadd.f32        s17, s17, s15
90 |     vstr    s16, [sl]
91 |     vstr    s17, [sl, #4]
92 |     bne     .tailcase
93 | .epilog:
94 |     add     sp, sp, #52     @ 0x34 - release the scratch space
95 |     vpop    {d8-d15}
96 |     pop     {r4, r5, r6, r7, r8, r9, sl, fp, pc}    @ restore registers and return
97 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s:
--------------------------------------------------------------------------------
 1 | @ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points) {
 2 |     .global	volk_32fc_32f_dot_prod_32fc_a_neonasm
 3 |     volk_32fc_32f_dot_prod_32fc_a_neonasm:
 4 |     @ r0 - result: pointer to output array (32fc)
 5 |     @ r1 - input: pointer to input array 1 (32fc)
 6 |     @ r2 - taps: pointer to input array 2 (32f)
 7 |     @ r3 - num_points: number of items to process
 8 |     
 9 |     result .req r0
10 |     input .req r1
11 |     taps .req r2
12 |     num_points .req r3
13 |     quarterPoints .req r7
14 |     number .req r8
15 |     @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
16 |     @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
17 |     @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
18 |     @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
19 |     realAccQ   .req q0 @ d0-d1/s0-s3
20 |     compAccQ   .req q1 @ d2-d3/s4-s7
21 |     realAccS   .req s0 @ d0[0]
22 |     compAccS   .req s4 @ d2[0]
23 |     tapsVal    .req q2 @ d4-d5
24 |     outVal     .req q3 @ d6-d7
25 |     realMul    .req q8 @ d16-d17
26 |     compMul    .req q9 @ d18-d19
27 |     inRealVal  .req q10 @ d20-d21
28 |     inCompVal  .req q11 @ d22-d23
29 |     
30 |     stmfd	sp!, {r7, r8, sl}	@ prologue - save register states
31 |     
32 |     veor realAccQ, realAccQ @ zero out accumulators
33 |     veor compAccQ, compAccQ @ zero out accumulators
34 |     movs quarterPoints, num_points, lsr #2 @ quarterPoints = num_points / 4, sets Z when it is zero
35 |     mov number, quarterPoints @ set before the branch so .loop2 starts from 0 when num_points < 4; mov leaves the flags from movs intact
36 |     beq .loop2 @ no full group of four: go straight to the scalar tail
37 |     
38 | 
39 | .loop1:
40 |     @ do work here
41 |     @pld [taps, #128] @ pre-load hint - this is implementation specific!
42 |     @pld [input, #128] @ pre-load hint - this is implementation specific!
43 |     vld1.32 {d4-d5}, [taps:128]! @ tapsVal
44 |     vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal
45 |     vmul.f32 realMul, tapsVal, inRealVal
46 |     vmul.f32 compMul, tapsVal, inCompVal
47 |     vadd.f32 realAccQ, realAccQ, realMul
48 |     vadd.f32 compAccQ, compAccQ, compMul
49 |     subs number, number, #1
50 |     bne	.loop1	@ first loop
51 | 
52 |     @ Sum up across realAccQ and compAccQ
53 |     vpadd.f32 d0, d0, d1      @ realAccQ +-> d0
54 |     vpadd.f32 d2, d2, d3      @ compAccQ +-> d2
55 |     vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
56 |     vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
57 |     @ critical values are now in s0 (realAccS), s4 (compAccS)
58 | 	mov	number, quarterPoints, asl #2 @ number = count of points already handled by .loop1
59 | 
60 | .loop2:
61 |     cmp	num_points, number
62 |     bls	.done
63 |     
64 |     vld1.32 {d4[0]}, [taps]! @ s8
65 |     vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
66 |     vmul.f32 s5, s8, s10 @ s5/s6 are free to use as temporaries: the sums live in s0 and s4
67 |     vmul.f32 s6, s8, s12
68 |     vadd.f32 realAccS, realAccS, s5
69 |     vadd.f32 compAccS, compAccS, s6
70 |     
71 |     add number, number, #1
72 |     b .loop2
73 | 
74 | .done:
75 |     vst1.32 {d0[0]}, [result]! @ realAccS
76 |     vst1.32 {d2[0]}, [result]  @ compAccS
77 | 
78 |     ldmfd	sp!, {r7, r8, sl} @ epilogue - restore register states
79 |     bx	lr
80 | 


--------------------------------------------------------------------------------
/include/volk/volk_complex.h:
--------------------------------------------------------------------------------
  1 | /* -*- c++ -*- */
  2 | /*
  3 |  * Copyright 2010, 2011, 2015, 2018, 2020, 2021 Free Software Foundation, Inc.
  4 |  *
  5 |  * This file is part of VOLK
  6 |  *
  7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
  8 |  */
  9 | 
 10 | #ifndef INCLUDED_VOLK_COMPLEX_H
 11 | #define INCLUDED_VOLK_COMPLEX_H
 12 | 
 13 | /*!
 14 |  * \brief Provide typedefs and operators for all complex types in C and C++.
 15 |  *
 16 |  * The typedefs encompass all signed integer and floating point types.
 17 |  * Each operator function is intended to work across all data types.
 18 |  * Under C++, these operators are defined as inline templates.
 19 |  * Under C, these operators are defined as preprocessor macros.
 20 |  * The use of macros makes the operators agnostic to the type.
 21 |  *
 22 |  * The following operator functions are defined:
 23 |  * - lv_cmake - make a complex type from components
 24 |  * - lv_creal - get the real part of the complex number
 25 |  * - lv_cimag - get the imaginary part of the complex number
 26 |  * - lv_conj - take the conjugate of the complex number
 27 |  */
 28 | 
 29 | #ifdef __cplusplus
 30 | 
 31 | #include <stdint.h>
 32 | #include <complex>
 33 | 
 34 | typedef std::complex<int8_t> lv_8sc_t;
 35 | typedef std::complex<int16_t> lv_16sc_t;
 36 | typedef std::complex<int32_t> lv_32sc_t;
 37 | typedef std::complex<int64_t> lv_64sc_t;
 38 | typedef std::complex<float> lv_32fc_t;
 39 | typedef std::complex<double> lv_64fc_t;
 40 | 
 41 | template <typename T>
 42 | inline std::complex<T> lv_cmake(const T& r, const T& i)
 43 | {
 44 |     return std::complex<T>(r, i); // r becomes the real part, i the imaginary part
 45 | }
 46 | 
 47 | template <typename T>
 48 | inline typename T::value_type lv_creal(const T& x)
 49 | {
 50 |     return x.real(); // T is the complex type itself; the result has its component type
 51 | }
 52 | 
 53 | template <typename T>
 54 | inline typename T::value_type lv_cimag(const T& x)
 55 | {
 56 |     return x.imag(); // T is the complex type itself; the result has its component type
 57 | }
 58 | 
 59 | template <typename T>
 60 | inline T lv_conj(const T& x)
 61 | {
 62 |     return std::conj(x); // delegates to the standard library conjugate
 63 | }
 64 | 
 65 | #else /* __cplusplus */
 66 | 
 67 | #include <complex.h>
 68 | #include <tgmath.h>
 69 | 
 70 | typedef char complex lv_8sc_t;
 71 | typedef short complex lv_16sc_t;
 72 | typedef long complex lv_32sc_t;
 73 | typedef long long complex lv_64sc_t;
 74 | typedef float complex lv_32fc_t;
 75 | typedef double complex lv_64fc_t;
 76 | 
 77 | #define lv_cmake(r, i) ((r) + _Complex_I * (i))
 78 | 
 79 | // When GNUC is available, use the complex extensions.
 80 | // The extensions always return the correct value type.
 81 | // https://gcc.gnu.org/onlinedocs/gcc/Complex.html
 82 | #ifdef __GNUC__
 83 | 
 84 | #define lv_creal(x) (__real__(x))
 85 | 
 86 | #define lv_cimag(x) (__imag__(x))
 87 | 
 88 | #define lv_conj(x) (~(x))
 89 | 
 90 | // When not available, use the c99 complex function family,
 91 | // which always returns double regardless of the input type,
 92 | // unless we have C99 and thus tgmath.h overriding functions
 93 | // with type-generic versions.
 94 | #else /* __GNUC__ */
 95 | 
 96 | #define lv_creal(x) (creal(x))
 97 | 
 98 | #define lv_cimag(x) (cimag(x))
 99 | 
100 | #define lv_conj(x) (conj(x))
101 | 
102 | #endif /* __GNUC__ */
103 | 
104 | #endif /* __cplusplus */
105 | 
106 | #endif /* INCLUDED_VOLK_COMPLEX_H */
107 | 


--------------------------------------------------------------------------------
/python/volk_modtool/cfg.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | #
 4 | # Copyright 2013, 2014 Free Software Foundation, Inc.
 5 | #
 6 | # This file is part of VOLK
 7 | #
 8 | # SPDX-License-Identifier: LGPL-3.0-or-later
 9 | #
10 | 
11 | import configparser
12 | import sys
13 | import os
14 | import re
15 | 
16 | 
17 | class volk_modtool_config(object):
18 |     def key_val_sub(self, num, stuff, section):
19 |         return re.sub(r'\$' + 'k' + str(num), stuff[num][0], (re.sub(r'\$' + str(num), stuff[num][1], section[1][num])));
20 | 
21 |     def verify(self):
22 |         for i in self.verification:
23 |             self.verify_section(i)
24 |     def remap(self):
25 |         for i in self.remapification:
26 |             self.verify_section(i)
27 | 
28 |     def verify_section(self, section):
29 |         stuff = self.cfg.items(section[0])
30 |         for i in range(len(section[1])):
31 |             eval(self.key_val_sub(i, stuff, section))
32 |             try:
33 |                val = eval(self.key_val_sub(i, stuff, section))
34 |                if val == False:
35 |                    raise ValueError
36 |             except ValueError:
37 |                 raise ValueError('Verification function returns False... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))
38 |             except:
39 |                 raise IOError('bad configuration... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))
40 | 
41 | 
42 |     def __init__(self, cfg=None):
43 |         self.config_name = 'config'
44 |         self.config_defaults = ['name', 'destination', 'base']
45 |         self.config_defaults_remap = ['1',
46 |                                       'self.cfg.set(self.config_name, \'$k1\', os.path.realpath(os.path.expanduser(\'$1\')))',
47 |                                       'self.cfg.set(self.config_name, \'$k2\', os.path.realpath(os.path.expanduser(\'$2\')))']
48 | 
49 |         self.config_defaults_verify = ['re.match(\'[a-zA-Z0-9]+$\', \'$0\')',
50 |                                        'os.path.exists(\'$1\')',
51 |                                        'os.path.exists(\'$2\')']
52 |         self.remapification = [(self.config_name, self.config_defaults_remap)]
53 |         self.verification = [(self.config_name, self.config_defaults_verify)]
54 |         default = os.path.join(os.getcwd(), 'volk_modtool.cfg')
55 |         icfg = configparser.RawConfigParser()
56 |         if cfg:
57 |             icfg.read(cfg)
58 |         elif os.path.exists(default):
59 |             icfg.read(default)
60 |         else:
61 |             print("Initializing config file...")
62 |             icfg.add_section(self.config_name)
63 |             for kn in self.config_defaults:
64 |                 rv = input("%s: "%(kn))
65 |                 icfg.set(self.config_name, kn, rv)
66 |         self.cfg = icfg
67 |         self.remap()
68 |         self.verify()
69 | 
70 | 
71 | 
72 |     def read_map(self, name, inp):
73 |         if self.cfg.has_section(name):
74 |             self.cfg.remove_section(name)
75 |         self.cfg.add_section(name)
76 |         for i in inp:
77 |             self.cfg.set(name, i, inp[i])
78 | 
79 |     def get_map(self, name):
80 |         retval = {}
81 |         stuff = self.cfg.items(name)
82 |         for i in stuff:
83 |             retval[i[0]] = i[1]
84 |         return retval
85 | 


--------------------------------------------------------------------------------
/lib/volk_prefs.c:
--------------------------------------------------------------------------------
  1 | /* -*- c++ -*- */
  2 | /*
  3 |  * Copyright 2011, 2012, 2015, 2016, 2019, 2020 Free Software Foundation, Inc.
  4 |  *
  5 |  * This file is part of VOLK
  6 |  *
  7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
  8 |  */
  9 | 
 10 | #include <stdbool.h>
 11 | #include <stdio.h>
 12 | #include <stdlib.h>
 13 | #include <string.h>
 14 | #if defined(_MSC_VER)
 15 | #include <io.h>
 16 | #define access _access
 17 | #define F_OK 0
 18 | #else
 19 | #include <unistd.h>
 20 | #endif
 21 | #include <volk/volk_prefs.h>
 22 | 
 23 | void volk_get_config_path(char* path, bool read)
 24 | { // Writes the first usable config file location into path, or an empty string if there is none.
 25 |     if (!path)
 26 |         return;
 27 |     const char* suffix = "/.volk/volk_config";
 28 |     const char* suffix2 = "/volk/volk_config"; // non-hidden
 29 |     char* home = NULL;
 30 |     // path must provide 512 writable bytes; when read is true the file must already exist
 31 |     // allows config redirection via env variable
 32 |     home = getenv("VOLK_CONFIGPATH");
 33 |     if (home != NULL) {
 34 |         // snprintf never writes past 512 bytes and always terminates, unlike strncpy + strcat
 35 |         int len = snprintf(path, 512, "%s%s", home, suffix2);
 36 |         if (len >= 0 && len < 512 && (!read || access(path, F_OK) != -1)) {
 37 |             return;
 38 |         }
 39 |     }
 40 | 
 41 |     // check for user-local config file
 42 |     home = getenv("HOME");
 43 |     if (home != NULL) {
 44 |         // a candidate that would not fit in the buffer is skipped instead of truncated
 45 |         int len = snprintf(path, 512, "%s%s", home, suffix);
 46 |         if (len >= 0 && len < 512 && (!read || (access(path, F_OK) != -1))) {
 47 |             return;
 48 |         }
 49 |     }
 50 | 
 51 |     // check for config file in APPDATA (Windows)
 52 |     home = getenv("APPDATA");
 53 |     if (home != NULL) {
 54 |         // same bounded construction as above
 55 |         int len = snprintf(path, 512, "%s%s", home, suffix);
 56 |         if (len >= 0 && len < 512 && (!read || (access(path, F_OK) != -1))) {
 57 |             return;
 58 |         }
 59 |     }
 60 | 
 61 |     // check for system-wide config file (only offered when it already exists, even for writing)
 62 |     if (access("/etc/volk/volk_config", F_OK) != -1) {
 63 |         // "/etc" plus suffix2 is far shorter than 512 bytes, so no length check is needed
 64 |         snprintf(path, 512, "%s%s", "/etc", suffix2);
 65 |         if (!read || (access(path, F_OK) != -1)) {
 66 |             return;
 67 |         }
 68 |     }
 69 | 
 70 |     // If still no path was found set path[0] to '\0' and fall through
 71 |     path[0] = 0;
 72 |     return;
 73 | }
 74 | 
 75 | size_t volk_load_preferences(volk_arch_pref_t** prefs_res)
 76 | { // Parses the config file into a realloc-grown array stored in *prefs_res; returns the entry count.
 77 |     FILE* config_file;
 78 |     char path[512], line[512];
 79 |     size_t n_arch_prefs = 0;
 80 |     volk_arch_pref_t* prefs = NULL;
 81 | 
 82 |     // get the config path
 83 |     volk_get_config_path(path, true);
 84 |     if (!path[0])
 85 |         return n_arch_prefs; // no prefs found (*prefs_res is left untouched)
 86 |     config_file = fopen(path, "r");
 87 |     if (!config_file)
 88 |         return n_arch_prefs; // no prefs found (*prefs_res is left untouched)
 89 | 
 90 |     // read one "name impl_a impl_u" entry per line, growing the array by one slot per line
 91 |     while (fgets(line, sizeof(line), config_file) != NULL) {
 92 |         void* new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs));
 93 |         if (!new_prefs) {
 94 |             printf("volk_load_preferences: bad malloc\n");
 95 |             break; // keep the entries parsed so far
 96 |         }
 97 |         prefs = (volk_arch_pref_t*)new_prefs;
 98 |         volk_arch_pref_t* p = prefs + n_arch_prefs; // %127s below assumes 128-byte fields - TODO confirm in volk_prefs.h
 99 |         if (sscanf(line, "%127s %127s %127s", p->name, p->impl_a, p->impl_u) == 3 &&
100 |             !strncmp(p->name, "volk_", 5)) {
101 |             n_arch_prefs++; // accept the slot; a rejected line leaves it to be overwritten
102 |         }
103 |     }
104 |     fclose(config_file);
105 |     *prefs_res = prefs;
106 |     return n_arch_prefs;
107 | }
108 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
  1 | ---
  2 | Language: Cpp
  3 | # BasedOnStyle: LLVM
  4 | AccessModifierOffset: -4
  5 | AlignAfterOpenBracket: Align
  6 | AlignConsecutiveAssignments: false
  7 | AlignConsecutiveDeclarations: false
  8 | AlignEscapedNewlinesLeft: true
  9 | AlignOperands:   true
 10 | AlignTrailingComments: true
 11 | AllowAllParametersOfDeclarationOnNextLine: true
 12 | AllowShortBlocksOnASingleLine: false
 13 | AllowShortCaseLabelsOnASingleLine: false
 14 | AllowShortFunctionsOnASingleLine: All
 15 | AllowShortIfStatementsOnASingleLine: false
 16 | AllowShortLoopsOnASingleLine: false
 17 | AlwaysBreakAfterDefinitionReturnType: None
 18 | AlwaysBreakAfterReturnType: None
 19 | AlwaysBreakBeforeMultilineStrings: false
 20 | AlwaysBreakTemplateDeclarations: true
 21 | BinPackArguments: false
 22 | BinPackParameters: false
 23 | BreakBeforeBraces: Custom
 24 | BraceWrapping:
 25 |   AfterClass:      true
 26 |   AfterControlStatement: false
 27 |   AfterEnum:       false
 28 |   AfterFunction:   true
 29 |   AfterNamespace:  false
 30 |   AfterObjCDeclaration: false
 31 |   AfterStruct:     false
 32 |   AfterUnion:      false
 33 |   BeforeCatch:     false
 34 |   BeforeElse:      false
 35 |   IndentBraces:    false
 36 | BreakBeforeBinaryOperators: None
 37 | BreakBeforeTernaryOperators: true
 38 | BreakConstructorInitializersBeforeComma: false
 39 | BreakAfterJavaFieldAnnotations: false
 40 | BreakStringLiterals: true
 41 | ColumnLimit:     90
 42 | CommentPragmas:  '^ IWYU pragma:'
 43 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
 44 | ConstructorInitializerIndentWidth: 4
 45 | ContinuationIndentWidth: 4
 46 | Cpp11BracedListStyle: false
 47 | DerivePointerAlignment: false
 48 | DisableFormat:   false
 49 | ExperimentalAutoDetectBinPacking: false
 50 | ForEachMacros:
 51 |   - foreach
 52 |   - Q_FOREACH
 53 |   - BOOST_FOREACH
 54 | IncludeCategories:
 55 |   - Regex:           '^"(gnuradio)/'
 56 |     Priority:        1
 57 |   - Regex:           '^<(gnuradio)/'
 58 |     Priority:        2
 59 |   - Regex:           '^<(boost)/'
 60 |     Priority:        98
 61 |   - Regex:           '^<[a-z]*>$'
 62 |     Priority:        99
 63 |   - Regex:           '^".*"$'
 64 |     Priority:        0
 65 |   - Regex:           '.*'
 66 |     Priority:        10
 67 | 
 68 | IncludeIsMainRegex: '(Test)?$'
 69 | IndentCaseLabels: false
 70 | IndentWidth: 4
 71 | IndentWrappedFunctionNames: false
 72 | JavaScriptQuotes: Leave
 73 | JavaScriptWrapImports: true
 74 | KeepEmptyLinesAtTheStartOfBlocks: true
 75 | MacroBlockBegin: ''
 76 | MacroBlockEnd:   ''
 77 | MaxEmptyLinesToKeep: 2
 78 | NamespaceIndentation: None
 79 | ObjCBlockIndentWidth: 2
 80 | ObjCSpaceAfterProperty: false
 81 | ObjCSpaceBeforeProtocolList: true
 82 | PenaltyBreakBeforeFirstCallParameter: 19
 83 | PenaltyBreakComment: 300
 84 | PenaltyBreakFirstLessLess: 120
 85 | PenaltyBreakString: 1000
 86 | PenaltyExcessCharacter: 1000000
 87 | PenaltyReturnTypeOnItsOwnLine: 60
 88 | PointerAlignment: Left
 89 | ReflowComments:  true
 90 | SortIncludes:    true
 91 | SpaceAfterCStyleCast: false
 92 | SpaceAfterTemplateKeyword: true
 93 | SpaceBeforeAssignmentOperators: true
 94 | SpaceBeforeParens: ControlStatements
 95 | SpaceInEmptyParentheses: false
 96 | SpacesBeforeTrailingComments: 1
 97 | SpacesInAngles:  false
 98 | SpacesInContainerLiterals: true
 99 | SpacesInCStyleCastParentheses: false
100 | SpacesInParentheses: false
101 | SpacesInSquareBrackets: false
102 | Standard:        Cpp11
103 | TabWidth:        8
104 | UseTab:          Never
105 | 
106 | 
107 | 


--------------------------------------------------------------------------------
/kernels/volk/volk_32u_popcnt.h:
--------------------------------------------------------------------------------
  1 | /* -*- c++ -*- */
  2 | /*
  3 |  * Copyright 2012, 2014 Free Software Foundation, Inc.
  4 |  *
  5 |  * This file is part of VOLK
  6 |  *
  7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
  8 |  */
  9 | 
 10 | /*!
 11 |  * \page volk_32u_popcnt
 12 |  *
 13 |  * \b Overview
 14 |  *
 15 |  * Computes the population count (popcnt), or Hamming weight, of a
 16 |  * binary string. This kernel takes in a single unsigned 32-bit value
 17 |  * and returns the count of 1's that the value contains.
 18 |  *
 19 |  * <b>Dispatcher Prototype</b>
 20 |  * \code
 21 |  * void volk_32u_popcnt(uint32_t* ret, const uint32_t value)
 22 |  * \endcode
 23 |  *
 24 |  * \b Inputs
 25 |  * \li value: The input value.
 26 |  *
 27 |  * \b Outputs
 28 |  * \li ret: The return value containing the popcnt.
 29 |  *
 30 |  * \b Example
 31 |  * \code
 32 |     int N = 10;
 33 |     unsigned int alignment = volk_get_alignment();
 34 | 
 35 |     uint32_t bitstring = 0x55555555;
 36 |     uint32_t hamming_distance = 0;
 37 | 
 38 |     volk_32u_popcnt(&hamming_distance, bitstring);
 39 |     printf("hamming distance of %x = %i\n", bitstring, hamming_distance);
 40 |  * \endcode
 41 |  */
 42 | 
 43 | #ifndef INCLUDED_VOLK_32u_POPCNT_A16_H
 44 | #define INCLUDED_VOLK_32u_POPCNT_A16_H
 45 | 
 46 | #include <inttypes.h>
 47 | #include <stdio.h>
 48 | 
 49 | #ifdef LV_HAVE_GENERIC
 50 | 
 51 | static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
 52 | {
 53 |     // Branch-free bit count: add neighbouring bit fields of doubling width.
 54 |     uint32_t bits = value;
 55 | 
 56 |     bits = ((bits >> 1) & 0x55555555) + (bits & 0x55555555); // 2-bit sums
 57 |     bits = ((bits >> 2) & 0x33333333) + (bits & 0x33333333); // 4-bit sums
 58 |     bits = 0x0F0F0F0F & (bits + (bits >> 4));                // per-byte sums
 59 |     bits += bits >> 8;                                       // fold adjacent bytes
 60 |     bits = 0x0000003F & (bits + (bits >> 16));               // total, 0..32
 61 | 
 62 |     *ret = bits;
 63 | }
 64 | 
 65 | #endif /*LV_HAVE_GENERIC*/
 66 | 
 67 | 
 68 | #ifdef LV_HAVE_NEON
 69 | #include <arm_neon.h>
 70 | 
 71 | static inline void volk_32u_popcnt_neon(uint32_t* ret, const uint32_t value)
 72 | {
 73 |     // Duplicate value into both 32-bit lanes of a 64-bit vector, viewed as 8 bytes
 74 |     uint8x8_t input = vreinterpret_u8_u32(vdup_n_u32(value));
 75 |     // Count bits in each byte
 76 |     uint8x8_t counts = vcnt_u8(input);
 77 |     // Sum across bytes (only the first 4 matter for a 32-bit value)
 78 |     // Use vpaddl to widen and add pairwise: 8x8 -> 4x16 -> 2x32
 79 |     uint16x4_t sum16 = vpaddl_u8(counts);
 80 |     uint32x2_t sum32 = vpaddl_u16(sum16);
 81 |     // Extract the lower 32-bit element which contains the sum of the lower 4 bytes
 82 |     *ret = vget_lane_u32(sum32, 0);
 83 | }
 84 | #endif /* LV_HAVE_NEON */
 85 | 
 86 | 
 87 | #ifdef LV_HAVE_SSE4_2
 88 | 
 89 | #include <nmmintrin.h>
 90 | 
 91 | static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
 92 | {
 93 |     *ret = _mm_popcnt_u32(value); // single popcount intrinsic from <nmmintrin.h>
 94 | }
 95 | 
 96 | #endif /*LV_HAVE_SSE4_2*/
 97 | 
 98 | #ifdef LV_HAVE_RVV
 99 | #include <riscv_vector.h>
100 | 
101 | static inline void volk_32u_popcnt_rvv(uint32_t* ret, const uint32_t value)
102 | {
103 |     *ret = __riscv_vcpop(__riscv_vreinterpret_b4(__riscv_vmv_s_x_u64m1(value, 1)), 32); // value goes into element 0, is reinterpreted as a mask, and vcpop counts set bits among the first 32
104 | }
105 | #endif /*LV_HAVE_RVV*/
106 | 
107 | #ifdef LV_HAVE_RVA22V
108 | #include <riscv_bitmanip.h>
109 | 
/*
 * RISC-V scalar bit-manipulation population count: writes the number of set
 * bits of `value` to *ret using the __riscv_cpop_32 intrinsic.
 */
static inline void volk_32u_popcnt_rva22(uint32_t* ret, const uint32_t value)
{
    const uint32_t bit_count = __riscv_cpop_32(value);
    *ret = bit_count;
}
114 | #endif /*LV_HAVE_RVA22V*/
115 | 
116 | #endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/
117 | 


--------------------------------------------------------------------------------
/kernels/volk/volk_16i_max_star_16i.h:
--------------------------------------------------------------------------------
  1 | /* -*- c++ -*- */
  2 | /*
  3 |  * Copyright 2012, 2014 Free Software Foundation, Inc.
  4 |  *
  5 |  * This file is part of VOLK
  6 |  *
  7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
  8 |  */
  9 | 
 10 | /*!
 11 |  * \page volk_16i_max_star_16i
 12 |  *
 13 |  * \b Deprecation
 14 |  *
 15 |  * This kernel is deprecated.
 16 |  *
 17 |  * \b Overview
 18 |  *
 19 |  * <FIXME>
 20 |  *
 21 |  * <b>Dispatcher Prototype</b>
 22 |  * \code
 23 |  * void volk_16i_max_star_16i(short* target, short* src0, unsigned int num_points);
 24 |  * \endcode
 25 |  *
 26 |  * \b Inputs
 27 |  * \li src0: The input vector.
 * \li num_points: The number of 16-bit data points in src0.
 29 |  *
 30 |  * \b Outputs
 31 |  * \li target: The output value of the max* operation.
 32 |  *
 33 |  * \b Example
 34 |  * \code
 35 |  * int N = 10000;
 36 |  *
 37 |  * volk_16i_max_star_16i();
 38 |  *
 39 |  * volk_free(x);
 40 |  * volk_free(t);
 41 |  * \endcode
 42 |  */
 43 | 
 44 | #ifndef INCLUDED_volk_16i_max_star_16i_a_H
 45 | #define INCLUDED_volk_16i_max_star_16i_a_H
 46 | 
 47 | #include <inttypes.h>
 48 | #include <stdio.h>
 49 | 
 50 | #ifdef LV_HAVE_SSSE3
 51 | 
 52 | #include <emmintrin.h>
 53 | #include <tmmintrin.h>
 54 | #include <xmmintrin.h>
 55 | 
 56 | static inline void
 57 | volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points)
 58 | {
 59 |     const unsigned int num_bytes = num_points * 2;
 60 | 
 61 |     short candidate = src0[0];
 62 |     short cands[8];
 63 |     __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
 64 | 
 65 |     __m128i* p_src0;
 66 | 
 67 |     p_src0 = (__m128i*)src0;
 68 | 
 69 |     int bound = num_bytes >> 4;
 70 |     int leftovers = (num_bytes >> 1) & 7;
 71 | 
 72 |     int i = 0;
 73 | 
 74 |     xmm1 = _mm_setzero_si128();
 75 |     xmm0 = _mm_setzero_si128();
 76 |     //_mm_insert_epi16(xmm0, candidate, 0);
 77 | 
 78 |     xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
 79 | 
 80 |     for (i = 0; i < bound; ++i) {
 81 |         xmm1 = _mm_load_si128(p_src0);
 82 |         p_src0 += 1;
 83 |         // xmm2 = _mm_sub_epi16(xmm1, xmm0);
 84 | 
 85 |         xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
 86 |         xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
 87 |         xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
 88 | 
 89 |         xmm6 = _mm_xor_si128(xmm4, xmm5);
 90 | 
 91 |         xmm3 = _mm_and_si128(xmm3, xmm0);
 92 |         xmm4 = _mm_and_si128(xmm6, xmm1);
 93 | 
 94 |         xmm0 = _mm_add_epi16(xmm3, xmm4);
 95 |     }
 96 | 
 97 |     _mm_store_si128((__m128i*)cands, xmm0);
 98 | 
 99 |     for (i = 0; i < 8; ++i) {
100 |         candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
101 |     }
102 | 
103 |     for (i = 0; i < leftovers; ++i) {
104 |         candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0)
105 |                         ? candidate
106 |                         : src0[(bound << 3) + i];
107 |     }
108 | 
109 |     target[0] = candidate;
110 | }
111 | 
112 | #endif /*LV_HAVE_SSSE3*/
113 | 
114 | #ifdef LV_HAVE_GENERIC
115 | 
/*
 * Portable max* reduction over 16-bit samples.
 *
 * target     - receives the single result in target[0]
 * src0       - input samples; src0[0] is always read as the starting value
 * num_points - number of samples in src0
 *
 * The winner of each pairwise comparison is decided by the sign of the
 * difference truncated to 16 bits, i.e. a wrap-around comparison.
 */
static inline void
volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points)
{
    // The sample count is derived via the byte count, as in the other variants
    const unsigned int byte_count = num_points * 2;
    const int sample_count = byte_count >> 1;

    short best = src0[0];
    int idx = 1;
    while (idx < sample_count) {
        const short sample = src0[idx];
        if ((short)(best - sample) <= 0) {
            best = sample;
        }
        ++idx;
    }

    target[0] = best;
}
131 | 
132 | #endif /*LV_HAVE_GENERIC*/
133 | 
134 | 
135 | #endif /*INCLUDED_volk_16i_max_star_16i_a_H*/
136 | 


--------------------------------------------------------------------------------
/.cmake-format.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Marcus Müller, 2024 Johannes Demel
 2 | # SPDX-License-Identifier: LGPL-3.0-or-later
 3 | 
 4 | class _clang_format_options:
 5 |     def __init__(self, clangfile=None):
 6 |         if not clangfile:
 7 |             clangfile = ".clang-format"
 8 |         self.lines = []
 9 |         with open(clangfile, encoding="utf-8") as opened:
10 |             for line in opened:
11 |                 if line.strip().startswith("#"):
12 |                     continue
13 |                 self.lines.append(line.rstrip().split(":"))
14 | 
15 |     def __getitem__(self, string):
16 |         path = string.split(".")
17 |         value = None
18 |         for crumble in path:
19 |             for line in self.lines:
20 |                 if line[0].strip() == crumble:
21 |                     if len(line) > 1:
22 |                         value = line[1].strip().rstrip()
23 |                     break
24 |         return value
25 | 
26 | 
27 | _clang_format = _clang_format_options()
28 | 
29 | # ----------------------------------
30 | # Options affecting listfile parsing
31 | # ----------------------------------
32 | with section("parse"):
33 |     additional_commands = {
34 |         'gr_python_install': {
35 |             'flags': [],
36 |             'kwargs': {
37 |                 "PROGRAMS": "*",
38 |                 "FILES": "*",
39 |                 "DESTINATION": "*"
40 |             }
41 |         },
42 |     }
43 | 
44 | with section("markup"):
45 |     first_comment_is_literal = True
46 |     enable_markup = False
47 | 
48 | with section("format"):
49 |     # Disable formatting entirely, making cmake-format a no-op
50 |     disable = False
51 | 
52 |     # How wide to allow formatted cmake files
53 |     line_width = int(_clang_format["ColumnLimit"])
54 | 
55 |     # How many spaces to tab for indent
56 |     tab_size = int(_clang_format["IndentWidth"])
57 | 
58 |     # If true, lines are indented using tab characters (utf-8 0x09) instead of
59 |     # <tab_size> space characters (utf-8 0x20). In cases where the layout would
60 |     # require a fractional tab character, the behavior of the  fractional
61 |     # indentation is governed by <fractional_tab_policy>
62 |     use_tabchars = _clang_format["UseTab"] in ("ForIndentation",
63 |                                                "ForContinuationAndIndentation",
64 |                                                "Always")
65 | 
66 |     # If true, separate flow control names from their parentheses with a space
67 |     separate_ctrl_name_with_space = False
68 | 
69 |     # If true, separate function names from parentheses with a space
70 |     separate_fn_name_with_space = False
71 | 
72 |     # If a statement is wrapped to more than one line, than dangle the closing
73 |     # parenthesis on its own line.
74 |     dangle_parens = False
75 | 
76 |     # If the statement spelling length (including space and parenthesis) is
77 |     # smaller than this amount, then force reject nested layouts.
78 |     min_prefix_chars = tab_size
79 | 
80 |     # If the statement spelling length (including space and parenthesis) is larger
81 |     # than the tab width by more than this amount, then force reject un-nested
82 |     # layouts.
83 |     max_prefix_chars = 3 * tab_size
84 | 
85 |     # What style line endings to use in the output.
86 |     line_ending = "unix"
87 | 
88 |     # Format command names consistently as 'lower' or 'upper' case
89 |     command_case = "canonical"
90 | 
91 |     # Format keywords consistently as 'lower' or 'upper' case
92 |     keyword_case = "upper"
93 | 
94 | 
95 | with section("lint"):
96 |     max_arguments = 6
97 |     max_localvars = 20
98 |     max_statements = 75
99 | 


--------------------------------------------------------------------------------
/tests/test_volk_32fc_x2_multiply_32fc.cc:
--------------------------------------------------------------------------------
  1 | /* -*- c++ -*- */
  2 | /*
  3 |  * Copyright 2022 Johannes Demel
  4 |  *
  5 |  * This file is part of VOLK
  6 |  *
  7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
  8 |  */
  9 | 
 10 | #include "volk_test.h"
 11 | #include <fmt/chrono.h>
 12 | #include <fmt/core.h>
 13 | #include <fmt/ranges.h>
 14 | #include <gtest/gtest-param-test.h>
 15 | #include <gtest/gtest.h>
 16 | #include <volk/volk.h>
 17 | #include <volk/volk_alloc.hh>
 18 | #include <chrono>
 19 | 
 20 | class volk_32fc_x2_multiply_32fc_test : public VolkTest
 21 | {
 22 | protected:
 23 |     void SetUp() override
 24 |     {
 25 |         initialize_test(GetParam());
 26 |         initialize_data(vector_length);
 27 |     }
 28 | 
 29 |     void initialize_data(const size_t length)
 30 |     {
 31 |         vector_length = length;
 32 |         vec0 = volk::vector<lv_32fc_t>(length);
 33 |         vec1 = volk::vector<lv_32fc_t>(length);
 34 |         result = volk::vector<lv_32fc_t>(length);
 35 |         for (size_t i = 0; i < length; ++i) {
 36 |             vec0[i] = std::complex<float>(i * 3.14, i * 0.45);
 37 |             vec1[i] = std::complex<float>(i * -2.78, i * 5.44);
 38 |         }
 39 | 
 40 |         expected = volk::vector<lv_32fc_t>(length);
 41 |         for (size_t i = 0; i < length; ++i) {
 42 |             expected[i] = vec0[i] * vec1[i];
 43 |         }
 44 | 
 45 |         // This is a hacky solution to have unaligned tests.
 46 |         ua_result = result;
 47 |         ua_result.at(0) = expected.at(0);
 48 |     }
 49 | 
 50 |     void execute_aligned(const std::string impl_name)
 51 |     {
 52 |         volk_32fc_x2_multiply_32fc_manual(
 53 |             result.data(), vec0.data(), vec1.data(), vector_length, impl_name.c_str());
 54 |         EXPECT_TRUE(AreComplexFloatingPointArraysAlmostEqual(expected, result));
 55 |     }
 56 | 
 57 |     void execute_unaligned(const std::string impl_name)
 58 |     {
 59 |         volk_32fc_x2_multiply_32fc_manual(ua_result.data() + 1,
 60 |                                           vec0.data() + 1,
 61 |                                           vec1.data() + 1,
 62 |                                           vector_length - 1,
 63 |                                           impl_name.c_str());
 64 |         EXPECT_TRUE(AreComplexFloatingPointArraysAlmostEqual(expected, ua_result));
 65 |     }
 66 | 
 67 |     volk::vector<lv_32fc_t> vec0;
 68 |     volk::vector<lv_32fc_t> vec1;
 69 |     volk::vector<lv_32fc_t> result;
 70 |     volk::vector<lv_32fc_t> expected;
 71 | 
 72 |     volk::vector<lv_32fc_t> ua_result;
 73 | };
 74 | 
 75 | TEST_P(volk_32fc_x2_multiply_32fc_test, run)
 76 | {
 77 |     fmt::print("test {} implementation: {:>12}, size={} ...",
 78 |                is_aligned_implementation ? "aligned" : "unaligned",
 79 |                implementation_name,
 80 |                vector_length);
 81 |     auto start = std::chrono::steady_clock::now();
 82 | 
 83 |     if (is_aligned_implementation) {
 84 |         execute_aligned(implementation_name);
 85 |     } else {
 86 |         execute_unaligned(implementation_name);
 87 |     }
 88 | 
 89 | 
 90 |     std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
 91 |     fmt::print("\tduration={}\n", elapsed);
 92 | }
 93 | 
 94 | 
 95 | INSTANTIATE_TEST_SUITE_P(
 96 |     volk_32fc_x2_multiply_32fc,
 97 |     volk_32fc_x2_multiply_32fc_test,
 98 |     testing::Combine(testing::ValuesIn(get_kernel_implementation_name_list(
 99 |                          volk_32fc_x2_multiply_32fc_get_func_desc())),
100 |                      testing::ValuesIn(default_vector_sizes)),
101 |     generate_volk_test_name());
102 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonpipeline.s:
--------------------------------------------------------------------------------
 1 | @ static inline void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points) {
 2 | 	.global	volk_32fc_32f_dot_prod_32fc_a_neonpipeline
 3 | volk_32fc_32f_dot_prod_32fc_a_neonpipeline:
 4 | 	@ r0 - result: pointer to output array (32fc)
 5 | 	@ r1 - input: pointer to input array 1 (32fc)
 6 | 	@ r2 - taps: pointer to input array 2 (32f)
 7 | 	@ r3 - num_points: number of items to process
 8 | 
 9 | 	result .req r0
10 | 	input .req r1
11 | 	taps .req r2
12 | 	num_points .req r3
13 | 	quarterPoints .req r7
14 | 	number .req r8
15 | 	@ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
16 |     @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
17 |     @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
18 |     @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
19 | 	realAccQ   .req q0 @ d0-d1/s0-s3
20 | 	compAccQ   .req q1 @ d2-d3/s4-s7
21 | 	realAccS   .req s0 @ d0[0]
22 | 	compAccS   .req s4 @ d2[0]
23 | 	tapsVal    .req q2 @ d4-d5
24 | 	outVal     .req q3 @ d6-d7
25 |     realMul    .req q8 @ d8-d9
26 |     compMul    .req q9 @ d16-d17
27 |     inRealVal  .req q10 @ d18-d19
28 | 	inCompVal  .req q11 @ d20-d21
29 | 
30 | 	stmfd	sp!, {r7, r8, sl}	@ prologue - save register states
31 | 
32 |     pld [taps, #128] @ pre-load hint - this is implementation specific!
33 | 	pld [input, #128] @ pre-load hint - this is implementation specific!
34 | 
35 | 	veor realAccQ, realAccQ @ zero out accumulators
36 | 	veor compAccQ, compAccQ @ zero out accumulators
37 | 	movs quarterPoints, num_points, lsr #2
38 | 	beq .loop2 @ if zero into quarterPoints
39 | 
40 | 	@mov number, quarterPoints
41 | 	mov number, #0
42 | 	@ Optimizing for pipeline
43 | 	vld1.32 {tapsVal}, [taps:128]! @ tapsVal
44 | 	vld2.32 {inRealVal-inCompVal}, [input:128]! @ inRealVal, inCompVal
45 | 	add number, number, #1
46 | 
47 | .loop1:
48 | 	@ do work here
49 | 	pld [taps, #128] @ pre-load hint - this is implementation specific!
50 | 	pld [input, #128] @ pre-load hint - this is implementation specific!
51 | 	vmul.f32 realMul, tapsVal, inRealVal
52 | 	vmul.f32 compMul, tapsVal, inCompVal
53 | 	vadd.f32 realAccQ, realAccQ, realMul
54 | 	vadd.f32 compAccQ, compAccQ, compMul
55 | 	vld1.32 {tapsVal}, [taps:128]! @ tapsVal
56 | 	vld2.32 {inRealVal-inCompVal}, [input:128]! @ inRealVal, inCompVal
57 | 
58 | 	@subs number, number, #1
59 | 	@bls	.loop1	@ first loop
60 |     add number, number, #1
61 |     cmp number, quarterPoints
62 |     blt .loop1
63 | 
64 | 	vmul.f32 realMul, tapsVal, inRealVal
65 | 	vmul.f32 compMul, tapsVal, inCompVal
66 | 	vadd.f32 realAccQ, realAccQ, realMul
67 | 	vadd.f32 compAccQ, compAccQ, compMul
68 | 
69 |     @ Sum up across realAccQ and compAccQ
70 |     vadd.f32 d0, d0, d1      @ realAccQ +-> d0
71 |     vadd.f32 d2, d2, d3      @ compAccQ +-> d2
72 |     vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
73 |     vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
74 | 
75 |     @ critical values are now in s0 (realAccS), s4 (realAccQ)
76 | 	mov	number, quarterPoints, asl #2
77 | 	cmp num_points, number
78 | 	beq	.done
79 | 
80 | .loop2:
81 | 	vld1.32 {d4[0]}, [taps]! @ s8
82 | 	vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
83 | 	vmul.f32 s5, s8, s10
84 | 	vmul.f32 s6, s8, s12
85 | 	vadd.f32 realAccS, realAccS, s5
86 | 	vadd.f32 compAccS, compAccS, s6
87 | 
88 | 	add number, number, #1
89 |     cmp number, num_points
90 | 	blt .loop2
91 | 
92 | .done:
93 |     vst1.32 {d0[0]}, [result]! @ realAccS
94 |     vst1.32 {d2[0]}, [result]  @ compAccS
95 | 
96 | 	ldmfd	sp!, {r7, r8, sl} @ epilogue - restore register states
97 | 	bx	lr
98 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_a_neonasm.s:
--------------------------------------------------------------------------------
@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
@
@ Complex dot product of aVector and bVector, written to cVector[0..1] (re, im).
@ Structure: a NEON loop counted in eighths of num_points (.loop1), a reduction
@ of the vector accumulators to one complex value (.smallvector), and a scalar
@ tail that multiplies the remaining points through __mulsc3 (.loop2).
@ The code was derived from compiler output (see the remarks below); the
@ NOTE(review) comments mark places that look inconsistent and need checking
@ against a known-good implementation before this routine is relied upon.
	.global	volk_32fc_x2_dot_prod_32fc_neonasm
volk_32fc_x2_dot_prod_32fc_neonasm:
    push    {r4, r5, r6, r7, r8, lr}
    vpush   {q0-q7}
    vpush   {q8-q15}
    mov r8, r3          @ hold on to num_points (r8)
    @ zero out accumulators -- leave 1 reg in alu
    @ (x eor x is zero whatever q15 holds, so q8-q15 all become 0)
    veor    q8, q15, q15
    mov r7, r0          @ (r7) is cVec
    veor    q9, q15, q15
    mov r5, r1          @ (r5) is aVec
    veor    q10, q15, q15
    mov r6, r2          @ (r6) is bVec
    veor    q11, q15, q15
    lsrs    r3, r3, #3  @ eighth_points (r3) = num_points/8
    veor    q12, q15, q15
    mov r12, r2         @ (r12) is bVec
    veor    q13, q15, q15
    mov r4, r1          @ (r4) is aVec
    veor    q14, q15, q15
    veor    q15, q15, q15
    @ the flags tested here were set by the lsrs above
    beq .smallvector @ nathan optimized this file based on an objdump
    @ but I don't understand this jump. Seems like it should go to loop2
    @ and smallvector (really vector reduction) shouldn't need to be a label
    mov r2, #0          @ 0 out r2 (now number)
.loop1:
    add r2, r2, #1      @ increment number
    @ NOTE(review): this vld4 names only the even registers d0,d2,d4,d6, yet
    @ the multiplies below use q0-q3 (d0-d7). The odd halves d1,d3,d5,d7 do not
    @ appear to be loaded anywhere in this loop -- confirm.
    vld4.32 {d0,d2,d4,d6}, [r12]! @ q0-q3
    cmp r2, r3          @ is number < eighth_points
    @pld [r12, #64]
    @ NOTE(review): same observation for d9,d11,d13,d15 of q4-q7 -- confirm.
    vld4.32 {d8,d10,d12,d14}, [r4]! @ q4-q7
    @pld [r4, #64]
    vmla.f32    q12, q4, q0 @ real (re*re)
    vmla.f32    q14, q4, q1 @ imag (re*im)
    vmls.f32    q15, q5, q1 @ real (im*im)
    vmla.f32    q13, q5, q0 @ imag (im*re)

    vmla.f32    q8, q2, q6 @ real (re*re)
    vmla.f32    q9, q2, q7 @ imag (re*im)
    vmls.f32    q10, q3, q7 @ real (im*im)
    vmla.f32    q11, q3, q6 @ imag (im*re)
    bne .loop1          @ flags still come from the cmp r2, r3 above
    lsl r2, r3, #3      @ r2 = eighth_points * 8
    @ NOTE(review): r5/r6 are advanced by eighth_points*8 *bytes*, while one
    @ complex float is 8 bytes; presumably the tail pointers should skip
    @ eighth_points*8 points instead -- confirm.
    add r6, r6, r2      @ bVec = bVec + eighth_points -- whyyyyy gcc?!?
    add r5, r5, r2      @ aVec = aVec + eighth_points
    @ q12-q13 were original real accumulators
    @ q14-q15 were original imag accumulators
    @ reduce 8 accumulators down to 2 (1 real, 1 imag)
    @ NOTE(review): the sum written to q11 below is never read again; the final
    @ imaginary sum uses q9 and q14 only -- confirm whether q11 was intended.
    vadd.f32    q8, q10, q8 @ real + real
    vadd.f32    q11, q11, q9 @ imag + imag
    vadd.f32    q12, q12, q15 @ real + real
    vadd.f32    q14, q14, q13 @ imag + imag
    vadd.f32    q8, q8, q12
    vadd.f32    q9, q9, q14
.smallvector:
    @ Horizontal reduction of q8 (real) and q9 (imag) through memory.
    @ r4 = number of points consumed by the vector loop; the comparison made
    @ here is the one evaluated by the "bls .done" further down.
    lsl r4, r3, #3
    cmp r8, r4
    @ NOTE(review): no stack space is reserved before this store, so it writes
    @ over the 32 bytes at sp that the earlier "vpush {q8-q15}" filled -- confirm
    @ this is acceptable.
    vst2.32 {d16-d19}, [sp :64] @ whaaaaat? no way this is necessary!
    @ The loads below read the stored words back: offsets 0/8/16/24 are summed
    @ into s17 and offsets 4/12/20/28 into s16.
    vldr    s15, [sp, #8]
    vldr    s17, [sp]
    vldr    s16, [sp, #4]
    vadd.f32    s17, s17, s15
    vldr    s11, [sp, #12]
    vldr    s12, [sp, #24]
    vldr    s13, [sp, #28]
    vldr    s14, [sp, #16]
    vldr    s15, [sp, #20]
    vadd.f32    s16, s16, s11
    vadd.f32    s17, s17, s12
    vadd.f32    s16, s16, s13
    vadd.f32    s17, s17, s14
    vadd.f32    s16, s16, s15
    vstr    s17, [r7]       @ cVec[0] (s17)
    vstr    s16, [r7, #4]   @ cVec[1] (s16)
    bls .done               @ num_points <= points already processed: finished
.loop2:
    @ Scalar tail: one complex product per iteration, added to s17/s16 and
    @ written back to cVec every time.
    mov r3, r6
    add r6, r6, #8
    vldr    s0, [r3]        @ two words from bVec ...
    vldr    s1, [r6, #-4]
    mov r3, r5
    add r5, r5, #8
    vldr    s2, [r3]        @ ... and two words from aVec
    vldr    s3, [r5, #-4]
    @ presumably arguments in s0-s3 and result in s0/s1 (hard-float calling
    @ convention) -- confirm against the target ABI
    bl  __mulsc3            @ GCC/Clang built-in. Portability?
    add r4, r4, #1
    cmp r4, r8
    vadd.f32    s17, s17, s0
    vadd.f32    s16, s16, s1
    vstr    s17, [r7]
    vstr    s16, [r7, #4]
    bne .loop2              @ until r4 == num_points
.done: 
    vpop    {q8-q15}
    vpop    {q0-q7}
    pop {r4, r5, r6, r7, r8, pc}
98 | 
99 | 


--------------------------------------------------------------------------------
/docs/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to VOLK
 2 | 
 3 | Welcome! You are reading about how to contribute code to VOLK. First of
 4 | all, we are very happy that you're about to contribute, and welcome your
 5 | submissions! We hope many more will come.
 6 | 
 7 | In this document, we will explain the main things to consider when submitting
 8 | pull requests against VOLK. Reading this first will help a lot with
 9 | streamlining the process of getting your code merged.
10 | 
There is also a [wiki-based version of this file][wikicontrib], which contains
more detail. VOLK is part of the GNU Radio project and, as such, follows the
same contribution guidelines. This file is [adapted from the GNU Radio checklist][gnuradiocontrib].
14 | 
15 | ## What about non-code contributions?
16 | 
17 | Those are at least as important as code contributions: Emails to the mailing
18 | list, answers on Stack Overflow, Wiki page edits, examples... We very much
19 | appreciate those. However, this document is specifically about contributing
20 | code.
21 | 
22 | ## DCO Signed?
23 | 
24 | Any code contributions going into VOLK will become part of an LGPL-licensed
25 | (former contributions are GPL-licensed), open source repository. It is therefore
26 | imperative that code submissions belong to the authors, and that submitters have
27 | the authority to merge that code into the public VOLK codebase.
28 | 
29 | For that purpose, we use the [Developer's Certificate of Origin](DCO.txt). It
30 | is the same document used by other projects. Signing the DCO states that there
31 | are no legal reasons to not merge your code.
32 | 
33 | To sign the DCO, suffix your git commits with a "Signed-off-by" line. When
34 | using the command line, you can use `git commit -s` to automatically add this
35 | line. If there were multiple authors of the code, or other types of
36 | stakeholders, make sure that all are listed, each with a separate Signed-off-by
37 | line.
38 | 
39 | ## Coding Guidelines
40 | 
41 | We have codified our coding guidelines in [GNU Radio GREP1][grep1]. Please read them, 
42 | and stick to them. For C/C++ code, use clang-format. For Python, PEP8 is your friend
43 | (but again, check the actual coding guidelines).
44 | 
45 | ## Git commit messages are very important
46 | 
47 | We follow standard git commit message guidelines, similar to many other open
48 | source projects. See the [coding guidelines][grep1] for more details. In a
49 | nutshell:
50 | - Keep the lines below 72 characters
51 | - Subject line has the component prepended (e.g., `kernelname:`)
52 | - Avoid empty git commit messages
53 | - The git commit message explains the change, the code only explains the current
54 |   state
55 | 
56 | ## Unit Tests
57 | 
58 | VOLK unit tests compare the results of each kernel version to the generic version.
59 | Keep the generic kernel version as simple as possible and verify your optimized
60 | kernels against the generic version.
61 | 
62 | ## The Buddy Principle: Submit One, Review One
63 | 
64 | When you've submitted a pull request, please take the time to review another
65 | one. This helps make sure that there are always a number of reviews at least
66 | equal to the number of pull requests, which means the maintainers don't get
67 | overwhelmed when a lot is being contributed.
68 | 
69 | ## Standard command line options
70 | 
71 | When writing programs that are executable from the command line,
72 | please follow existing examples regarding their command line arguments, and
73 | reuse them.
74 | 
75 | [grep1]: https://github.com/gnuradio/greps/blob/master/grep-0001-coding-guidelines.md
76 | [wikicontrib]: https://wiki.gnuradio.org/index.php/Development
77 | [gr-devs]: https://github.com/orgs/gnuradio/teams/gr-devs
78 | [gnuradiocontrib]: https://github.com/gnuradio/gnuradio/blob/master/CONTRIBUTING.md
79 | 


--------------------------------------------------------------------------------
/tests/volk_test.cc:
--------------------------------------------------------------------------------
  1 | /* -*- c++ -*- */
  2 | /*
  3 |  * Copyright 2022 Johannes Demel
  4 |  *
  5 |  * This file is part of VOLK
  6 |  *
  7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
  8 |  */
  9 | 
 10 | #include <fmt/core.h>
 11 | #include <fmt/ranges.h>
 12 | #include <gtest/gtest.h>
 13 | #include <volk/volk.h>
 14 | #include <algorithm>
 15 | #include <tuple>
 16 | 
 17 | 
 18 | template <class T>
 19 | ::testing::AssertionResult AreComplexFloatingPointArraysAlmostEqual(const T& expected,
 20 |                                                                     const T& actual)
 21 | {
 22 |     ::testing::AssertionResult result = ::testing::AssertionFailure();
 23 |     if (expected.size() != actual.size()) {
 24 |         return result << "expected result size=" << expected.size()
 25 |                       << " differs from actual size=" << actual.size();
 26 |     }
 27 |     const unsigned long length = expected.size();
 28 | 
 29 |     int errorsFound = 0;
 30 |     const char* separator = " ";
 31 |     for (unsigned long index = 0; index < length; index++) {
 32 |         auto expected_real = ::testing::internal::FloatingPoint(expected[index].real());
 33 |         auto expected_imag = ::testing::internal::FloatingPoint(expected[index].imag());
 34 |         auto actual_real = ::testing::internal::FloatingPoint(actual[index].real());
 35 |         auto actual_imag = ::testing::internal::FloatingPoint(actual[index].imag());
 36 |         if (not expected_real.AlmostEquals(actual_real) or
 37 |             not expected_imag.AlmostEquals(actual_imag))
 38 | 
 39 |         {
 40 |             if (errorsFound == 0) {
 41 |                 result << "Differences found:";
 42 |             }
 43 |             if (errorsFound < 3) {
 44 |                 result << separator << expected[index] << " != " << actual[index] << " @ "
 45 |                        << index;
 46 |                 separator = ",\n";
 47 |             }
 48 |             errorsFound++;
 49 |         }
 50 |     }
 51 |     if (errorsFound > 0) {
 52 |         result << separator << errorsFound << " differences in total";
 53 |         return result;
 54 |     }
 55 |     return ::testing::AssertionSuccess();
 56 | }
 57 | 
 58 | std::vector<std::string> get_kernel_implementation_name_list(const volk_func_desc_t desc)
 59 | {
 60 |     std::vector<std::string> names;
 61 |     for (size_t i = 0; i < desc.n_impls; i++) {
 62 |         names.push_back(std::string(desc.impl_names[i]));
 63 |     }
 64 |     std::sort(names.begin(), names.end());
 65 |     return names;
 66 | }
 67 | 
 68 | bool is_aligned_implementation_name(const std::string& name)
 69 | {
 70 |     return name.rfind("a_", 0) == 0;
 71 | }
 72 | 
 73 | std::tuple<std::vector<std::string>, std::vector<std::string>>
 74 | separate_implementations_by_alignment(const std::vector<std::string>& names)
 75 | {
 76 |     std::vector<std::string> aligned;
 77 |     std::vector<std::string> unaligned;
 78 |     for (auto name : names) {
 79 |         if (is_aligned_implementation_name(name)) {
 80 |             aligned.push_back(name);
 81 |         } else {
 82 |             unaligned.push_back(name);
 83 |         }
 84 |     }
 85 |     return { aligned, unaligned };
 86 | }
 87 | 
 88 | std::vector<std::string>
 89 | get_aligned_kernel_implementation_names(const volk_func_desc_t desc)
 90 | {
 91 |     auto impls = get_kernel_implementation_name_list(desc);
 92 |     auto [aligned, unaligned] = separate_implementations_by_alignment(impls);
 93 |     return aligned;
 94 | }
 95 | 
 96 | std::vector<std::string>
 97 | get_unaligned_kernel_implementation_names(const volk_func_desc_t desc)
 98 | {
 99 |     auto impls = get_kernel_implementation_name_list(desc);
100 |     auto [aligned, unaligned] = separate_implementations_by_alignment(impls);
101 |     return unaligned;
102 | }
103 | 


--------------------------------------------------------------------------------
/.mailmap:
--------------------------------------------------------------------------------
 1 | Alexandre Rouma <alexandre.rouma@gmail.com> AlexandreRouma <alexandre.rouma@gmail.com>
 2 | Alexey Slokva <alesha72003@ya.ru> alesha72003 <alesha72003@ya.ru>
 3 | Andrey Rodionov <rodionovamp@mail.ru> dernasherbrezon <rodionovamp@mail.ru>
 4 | <ben@hilburn.dev> <ben.hilburn@ettus.com>
 5 | <ben@hilburn.dev> <bhilburn@gmail.com>
 6 | <carles.fernandez@gmail.com> <carlesfernandez@gmail.com>
 7 | Christoph Mayer <hcab14@gmail.com> cmayer <Christoph.Mayer@cern.ch>
 8 | Christoph Mayer <hcab14@gmail.com> hcab14 <hcab14@gmail.com>
 9 | <damian.miralles@aero.org> <damian.miralles@colorado.edu>
10 | <damian.miralles@aero.org> <dmiralles2009@gmail.com>
11 | Douglas Geiger <doug.geiger@bioradiation.net> Doug <douggeiger@users.noreply.github.com>
12 | Douglas Geiger <doug.geiger@bioradiation.net> Doug Geiger <doug.geiger@bioradiation.net>
13 | Douglas Geiger <doug.geiger@bioradiation.net> Douglas Geiger <douglas.geiger@nrl.navy.mil>
14 | <douglas.j.anderson@gmail.com> <djanderson@users.noreply.github.com>
15 | Federico Larroca <flarroca@fing.edu.uy> git-artes <flarroca@fing.edu.uy>
16 | Geof Nieboer <gnieboer@gcndevelopment.com> gnieboer <gnieboer@corpcomm.net>
17 | Jam M. Hernandez Quiceno <jamarck96@gmail.com> Jam Quiceno <jam_quiceno@partech.com>
18 | <jcorgan@corganenterprises.com> <johnathan@corganlabs.com>
19 | Johannes Demel <jdemel@gnuradio.org> jdemel <demel@ant.uni-bremen.de>
20 | Johannes Demel <jdemel@gnuradio.org> jdemel <demel@uni-bremen.de>
21 | Johannes Demel <jdemel@gnuradio.org> jdemel <jdemel@gnuradio.org>
22 | Johannes Demel <jdemel@gnuradio.org> jdemel <johannes@demels.de>
23 | Johannes Demel <jdemel@gnuradio.org> Johannes Demel <demel@ant.uni-bremen.de>
24 | Johannes Demel <jdemel@gnuradio.org> Johannes Demel <demel@uni-bremen.de>
25 | Johannes Demel <jdemel@gnuradio.org> Johannes Demel <ufcsy@student.kit.edu>
26 | John Sallay <jasallay@gmail.com> jsallay <31416796+jsallay@users.noreply.github.com>
27 | <magnuslundmark@gmail.com> <magnus@skysense.io>
28 | Marc Lichtman <marcll@vt.edu> Marc L <marcll@vt.edu>
29 | <marcus@hostalia.de> <mmueller@gnuradio.org>
30 | <marcus@hostalia.de> <mueller@kit.edu>
31 | Marcus Müller <marcus@hostalia.de> Marcus Mueller <marcus@hostalia.de>
32 | <michael.dickens@ettus.com> <157892+michaelld@users.noreply.github.com>
33 | <michael.dickens@ettus.com> <mlk@alum.mit.edu>
34 | Michael Dickens <michael.dickens@ettus.com> Michael L Dickens <mlk@alum.mit.edu>
35 | Michael Dickens <michael.dickens@ettus.com> Micheal Dickens <michael.dickens@ettus.com>
36 | Mike Piscopo <ghostop14@gmail.com> ghostop14 <ghostop14@gmail.com>
37 | <nate.ewest@gmail.com> <nathan@pepper>
38 | <nate.ewest@gmail.com> <nathan.west@nrl.navy.mil>
39 | <nate.ewest@gmail.com> <nathan.west@okstate.edu>
40 | <nate.ewest@gmail.com> <nwest@deepsig.io>
41 | Nathan West <nate.ewest@gmail.com> Nathan West <nathan.west@gnuradio.org>
42 | <n.corgan@gmail.com> <nick.corgan@ettus.com>
43 | Nicholas McCarthy <namccart@gmail.com> namccart <namccart@gmail.com>
44 | Nicholas McCarthy <namccart@gmail.com> Nick McCarthy <namccart@gmail.com>
45 | Nick Foster <nick@nerdnetworks.org> Nick Foster <bistromath@gmail.com>
46 | Nick Foster <nick@nerdnetworks.org> Nick Foster <nick@ettus.com>
47 | Olaf Bernstein <camel-cdr@protonmail.com>
48 | Pascal Giard <evilynux@gmail.com> Pascal Giard <pascal.giard@lacime.etsmtl.ca>
49 | <philip@balister.org> <philip@opensdr.com>
50 | Philip Balister <philip@balister.org> root <philip@balister.org>
51 | Philip Balister <philip@balister.org> root <root@usrp-e1xx.(none)>
52 | Rick Farina <zerochaos@gentoo.org> Rick Farina (Zero_Chaos) <zerochaos@gentoo.org>
53 | <ritterho@hm.edu> <32478819+fritterhoff@users.noreply.github.com>
54 | Ryan Volz <ryan.volz@gmail.com> Ryan Volz <rvolz@mit.edu>
55 | Sam Lane <sl01172@surrey.ac.uk> Sam Lane <9569766+SJ-Innovation@users.noreply.github.com>
56 | <tom@trondeau.com> <trondeau@vt.edu>
57 | 


--------------------------------------------------------------------------------
/lib/volk_rank_archs.c:
--------------------------------------------------------------------------------
  1 | /* -*- c++ -*- */
  2 | /*
  3 |  * Copyright 2011-2012 Free Software Foundation, Inc.
  4 |  *
  5 |  * This file is part of VOLK
  6 |  *
  7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
  8 |  */
  9 | 
 10 | 
 11 | #include <stdio.h>
 12 | #include <stdlib.h>
 13 | #include <string.h>
 14 | 
 15 | #include <volk/volk_prefs.h>
 16 | #include <volk_rank_archs.h>
 17 | 
 18 | int volk_get_index(const char* impl_names[], // list of implementations by name
 19 |                    const size_t n_impls,     // number of implementations available
 20 |                    const char* impl_name     // the implementation name to find
 21 | )
 22 | {
 23 |     unsigned int i;
 24 |     for (i = 0; i < n_impls; i++) {
 25 |         if (!strncmp(impl_names[i], impl_name, 20)) {
 26 |             return i;
 27 |         }
 28 |     }
 29 |     // TODO return -1;
 30 |     // something terrible should happen here
 31 |     fprintf(stderr, "Volk warning: no arch found, returning generic impl\n");
 32 |     return volk_get_index(impl_names, n_impls, "generic"); // but we'll fake it for now
 33 | }
 34 | 
 35 | int volk_rank_archs(const char* kern_name,    // name of the kernel to rank
 36 |                     const char* impl_names[], // list of implementations by name
 37 |                     const int* impl_deps,     // requirement mask per implementation
 38 |                     const bool* alignment,    // alignment status of each implementation
 39 |                     size_t n_impls,           // number of implementations available
 40 |                     const bool align          // if false, filter aligned implementations
 41 | )
 42 | {
 43 |     size_t i;
 44 |     static volk_arch_pref_t* volk_arch_prefs;
 45 |     static size_t n_arch_prefs = 0;
 46 |     static int prefs_loaded = 0;
 47 |     if (!prefs_loaded) {
 48 |         n_arch_prefs = volk_load_preferences(&volk_arch_prefs);
 49 |         prefs_loaded = 1;
 50 |     }
 51 | 
 52 |     // If we've defined VOLK_GENERIC to be anything, always return the
 53 |     // 'generic' kernel. Used in GR's QA code.
 54 |     char* gen_env = getenv("VOLK_GENERIC");
 55 |     if (gen_env) {
 56 |         return volk_get_index(impl_names, n_impls, "generic");
 57 |     }
 58 | 
 59 |     // If we've defined the kernel name as an environment variable, always return
 60 |     // the 'overridden' kernel. Used for manually overring config kernels at runtime.
 61 |     char* override_env = getenv(kern_name);
 62 |     if (override_env) {
 63 |         return volk_get_index(impl_names, n_impls, override_env);
 64 |     }
 65 | 
 66 |     // now look for the function name in the prefs list
 67 |     for (i = 0; i < n_arch_prefs; i++) {
 68 |         if (!strncmp(kern_name,
 69 |                      volk_arch_prefs[i].name,
 70 |                      sizeof(volk_arch_prefs[i].name))) // found it
 71 |         {
 72 |             const char* impl_name =
 73 |                 align ? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u;
 74 |             return volk_get_index(impl_names, n_impls, impl_name);
 75 |         }
 76 |     }
 77 | 
 78 |     // return the best index with the largest deps
 79 |     size_t best_index_a = 0;
 80 |     size_t best_index_u = 0;
 81 |     int best_value_a = -1;
 82 |     int best_value_u = -1;
 83 |     for (i = 0; i < n_impls; i++) {
 84 |         const signed val = impl_deps[i];
 85 |         if (alignment[i] && val > best_value_a) {
 86 |             best_index_a = i;
 87 |             best_value_a = val;
 88 |         }
 89 |         if (!alignment[i] && val > best_value_u) {
 90 |             best_index_u = i;
 91 |             best_value_u = val;
 92 |         }
 93 |     }
 94 | 
 95 |     // when align and we found a best aligned, use it
 96 |     if (align && best_value_a != -1)
 97 |         return best_index_a;
 98 | 
 99 |     // otherwise return the best unaligned
100 |     return best_index_u;
101 | }
102 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/riscv/volk_32fc_x2_dot_prod_32fc_sifive_u74.s:
--------------------------------------------------------------------------------
        .text
        .align 2
        .type   volk_32fc_x2_dot_prod_32fc_sifive_u74, @function
        .global volk_32fc_x2_dot_prod_32fc_sifive_u74

        #
        # RISC-V implementation using only I and F sets.
        # About 41% less CPU use than GCC, measured with volk_profile,
        # and a test gnuradio graph using Freq XLAT FIR filter.
        #
        # The generic C code is also 2x unrolled, but its main flaw
        # seems to be not properly fusing into fmadd and fnmsub.
        #
        # Focus of this hand coded assembly:
        # * Better use of fused multiply.
        # * Try to maximize space between write and read.
        #
        # Instruction order has been done manually and benchmarked,
        # and may not be optimal.
        #
        # Register use in this routine:
        # * ft0, ft2, ft4, ft6 are partial sums that end up in the
        #   word stored at 0(a0).
        # * ft1, ft3, ft5, ft7 are partial sums that end up in the
        #   word stored at 4(a0).
        # * a5 holds the value of a1 at which the main loop stops.
        #
volk_32fc_x2_dot_prod_32fc_sifive_u74:
        # a0: out
        # a1: in
        # a2: taps
        # a3: number of points

        # Calculate end of main loop.
        # a4 = a3 & 1, then a4 = a3 ^ a4, i.e. a3 with its low bit cleared.
        # a5 = a1 + a4*8: each point is two 4-byte floats.
        and     a4,a3,1
        xor     a4,a3,a4
        slli    a5,a4,3
        add     a5,a5,a1

        # Output regs.
        # All eight accumulators start at zero.
        fmv.w.x ft0,zero
        fmv.w.x ft1,zero
        fmv.w.x ft2,zero
        fmv.w.x ft3,zero
        fmv.w.x ft4,zero
        fmv.w.x ft5,zero
        fmv.w.x ft6,zero
        fmv.w.x ft7,zero
        # Skip the main loop when there is no full pair to process.
        beq     a1,a5,.endloop

        # Main loop two complexes at a time.
.loop:
        # Load input in order of when it'll be used.
        # flw has 2 cycle latency, 1 cycle repeat.
        flw     ft8,0(a1)               # in0
        flw     ft9,0(a2)               # tp0
        flw     ft10,4(a2)              # tp1
        flw     ft11,4(a1)              # in1

        # None of the fused multiply-adds have a write-read stall.
        # FMA, like mul and add, have 5 cycle latency, 1 cycle repeat.
        # The loads for the second complex are interleaved with the
        # multiply-adds of the first.
        fmadd.s  ft0,ft8, ft9, ft0      # in0*tp0
        flw      fa0,8(a1)              # in0
        fmadd.s  ft1,ft8, ft10,ft1      # in0*tp1
        flw      fa1,8(a2)              # tp0
        fnmsub.s ft2,ft11,ft10,ft2      # -in1*tp1
        flw      fa2,12(a2)             # tp1
        fmadd.s  ft3,ft11,ft9, ft3      # in1*tp0
        flw      fa3,12(a1)             # in1

        # Second complex of the pair; both pointers advance by 16 bytes.
        fmadd.s  ft4,fa0,fa1,ft4        # in0*tp0
        addi     a1,a1,16               # free ride in pipeline A.
        fmadd.s  ft5,fa0,fa2,ft5        # in0*tp1
        addi     a2,a2,16               # free ride in pipeline A.
        fnmsub.s ft6,fa3,fa2,ft6        # -in1*tp1
        fmadd.s  ft7,fa3,fa1,ft7        # in1*tp0
        bne      a1,a5,.loop

.endloop:
        # Check if odd number of inputs.
        andi    a3,a3,1
        beqz    a3,.done

        # Do odd one complex.
        # Accumulates into ft4..ft7, the same registers the second
        # complex of each pair uses in the main loop.
        flw     fa0,0(a1) # in0
        flw     fa1,0(a2) # tp0
        flw     fa2,4(a2) # tp1
        flw     fa3,4(a1) # in1

        fmadd.s  ft4,fa0,fa1,ft4   # in0*tp0
        fmadd.s  ft5,fa0,fa2,ft5   # in0*tp1
        fnmsub.s ft6,fa3,fa2,ft6   # -in1*tp1
        fmadd.s  ft7,fa3,fa1,ft7   # in1*tp0
.done:
        # Some one-time stalling here.
        # Latency 5, repeat 1.
        # Fold the partial sums: ft0 += ft2 + ft4 + ft6, ft1 += ft3 + ft5 + ft7.
        fadd.s  ft0,ft0,ft2
        fadd.s  ft1,ft1,ft3
        fadd.s  ft0,ft0,ft4
        fadd.s  ft1,ft1,ft5
        fadd.s  ft0,ft0,ft6
        fadd.s  ft1,ft1,ft7
        # fsw has latency 4, repeat 1.
        fsw     ft0,0(a0)
        fsw     ft1,4(a0)
        ret

        .size volk_32fc_x2_dot_prod_32fc_sifive_u74, .-volk_32fc_x2_dot_prod_32fc_sifive_u74
102 | 


--------------------------------------------------------------------------------
/kernels/volk/volk_32f_s32f_clamppuppet_32f.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | #ifndef INCLUDED_volk_32f_s32f_clamppuppet_32f_H
11 | #define INCLUDED_volk_32f_s32f_clamppuppet_32f_H
12 | 
13 | #include <volk/volk_32f_s32f_x2_clamp_32f.h>
14 | 
#ifdef LV_HAVE_GENERIC
/*!
 * Single-scalar wrapper around volk_32f_s32f_x2_clamp_32f_generic.
 *
 * Forwards out, in and num_points unchanged, passing `min` as the third
 * argument and its negation as the fourth.
 */
static inline void
volk_32f_s32f_clamppuppet_32f_generic(float* out,
                                      const float* in,
                                      const float min,
                                      unsigned int num_points)
{
    const float negated_min = -min;
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, negated_min, num_points);
}
#endif /* LV_HAVE_GENERIC */
24 | 
#ifdef LV_HAVE_AVX2
/*!
 * Single-scalar wrapper around volk_32f_s32f_x2_clamp_32f_a_avx2.
 *
 * Forwards out, in and num_points unchanged, passing `min` as the third
 * argument and its negation as the fourth.
 */
static inline void
volk_32f_s32f_clamppuppet_32f_a_avx2(float* out,
                                     const float* in,
                                     const float min,
                                     unsigned int num_points)
{
    const float negated_min = -min;
    volk_32f_s32f_x2_clamp_32f_a_avx2(out, in, min, negated_min, num_points);
}
#endif /* LV_HAVE_AVX2 */
34 | 
#ifdef LV_HAVE_SSE4_1
/*!
 * Single-scalar wrapper around volk_32f_s32f_x2_clamp_32f_a_sse4_1.
 *
 * Forwards out, in and num_points unchanged, passing `min` as the third
 * argument and its negation as the fourth.
 */
static inline void
volk_32f_s32f_clamppuppet_32f_a_sse4_1(float* out,
                                       const float* in,
                                       const float min,
                                       unsigned int num_points)
{
    const float negated_min = -min;
    volk_32f_s32f_x2_clamp_32f_a_sse4_1(out, in, min, negated_min, num_points);
}
#endif /* LV_HAVE_SSE4_1 */
44 | 
#ifdef LV_HAVE_AVX2
/*!
 * Single-scalar wrapper around volk_32f_s32f_x2_clamp_32f_u_avx2.
 *
 * Forwards out, in and num_points unchanged, passing `min` as the third
 * argument and its negation as the fourth.
 */
static inline void
volk_32f_s32f_clamppuppet_32f_u_avx2(float* out,
                                     const float* in,
                                     const float min,
                                     unsigned int num_points)
{
    const float negated_min = -min;
    volk_32f_s32f_x2_clamp_32f_u_avx2(out, in, min, negated_min, num_points);
}
#endif /* LV_HAVE_AVX2 */
54 | 
#ifdef LV_HAVE_SSE4_1
/*!
 * Single-scalar wrapper around volk_32f_s32f_x2_clamp_32f_u_sse4_1.
 *
 * Forwards out, in and num_points unchanged, passing `min` as the third
 * argument and its negation as the fourth.
 */
static inline void
volk_32f_s32f_clamppuppet_32f_u_sse4_1(float* out,
                                       const float* in,
                                       const float min,
                                       unsigned int num_points)
{
    const float negated_min = -min;
    volk_32f_s32f_x2_clamp_32f_u_sse4_1(out, in, min, negated_min, num_points);
}
#endif /* LV_HAVE_SSE4_1 */
64 | 
#ifdef LV_HAVE_NEON
/*!
 * Single-scalar wrapper around volk_32f_s32f_x2_clamp_32f_neon.
 *
 * Forwards out, in and num_points unchanged, passing `min` as the third
 * argument and its negation as the fourth.
 */
static inline void
volk_32f_s32f_clamppuppet_32f_neon(float* out,
                                   const float* in,
                                   const float min,
                                   unsigned int num_points)
{
    const float negated_min = -min;
    volk_32f_s32f_x2_clamp_32f_neon(out, in, min, negated_min, num_points);
}
#endif /* LV_HAVE_NEON */
74 | 
#ifdef LV_HAVE_NEONV8
/*!
 * Single-scalar wrapper around volk_32f_s32f_x2_clamp_32f_neonv8.
 *
 * Forwards out, in and num_points unchanged, passing `min` as the third
 * argument and its negation as the fourth.
 */
static inline void
volk_32f_s32f_clamppuppet_32f_neonv8(float* out,
                                     const float* in,
                                     const float min,
                                     unsigned int num_points)
{
    const float negated_min = -min;
    volk_32f_s32f_x2_clamp_32f_neonv8(out, in, min, negated_min, num_points);
}
#endif /* LV_HAVE_NEONV8 */
84 | 
#ifdef LV_HAVE_RVV
/*!
 * Single-scalar wrapper around volk_32f_s32f_x2_clamp_32f_rvv.
 *
 * Forwards out, in and num_points unchanged, passing `min` as the third
 * argument and its negation as the fourth.
 */
static inline void
volk_32f_s32f_clamppuppet_32f_rvv(float* out,
                                  const float* in,
                                  const float min,
                                  unsigned int num_points)
{
    const float negated_min = -min;
    volk_32f_s32f_x2_clamp_32f_rvv(out, in, min, negated_min, num_points);
}
#endif /* LV_HAVE_RVV */
94 | 
95 | #endif /* INCLUDED_volk_32f_s32f_clamppuppet_32f_H */
96 | 


--------------------------------------------------------------------------------
/include/volk/volk_rvv_intrinsics.h:
--------------------------------------------------------------------------------
 1 | /* -*- c++ -*- */
 2 | /*
 3 |  * Copyright 2024 Free Software Foundation, Inc.
 4 |  *
 5 |  * This file is part of VOLK
 6 |  *
 7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
 8 |  */
 9 | 
10 | /*
 * This file is intended to hold helper macros built on top of RVV intrinsics.
 * They should be used in VOLK kernels to avoid copy-paste.
13 |  */
14 | 
15 | #ifndef INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
16 | #define INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
17 | #include <riscv_vector.h>
18 | 
/*
 * RISCV_SHRINK2(op, T, S, v)
 *
 * Expands to __riscv_<op>(part0, part1, vl), where part0 and part1 are
 * extracted from `v` with __riscv_vget_<T><S>m1(v, 0) and (v, 1), and vl
 * is __riscv_vsetvlmax_e<S>m1().
 *
 * op: intrinsic name suffix pasted after "__riscv_".
 * T, S: tokens pasted into the vget name; S is also pasted into the
 *       vsetvlmax name. (Presumably T is the type letter and S the
 *       element width in bits - confirm against callers.)
 * v: source vector; it appears twice in the expansion, so pass an
 *    expression without side effects.
 */
#define RISCV_SHRINK2(op, T, S, v)              \
    __riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
                 __riscv_vget_##T##S##m1(v, 1), \
                 __riscv_vsetvlmax_e##S##m1())
23 | 
/*
 * RISCV_SHRINK4(op, T, S, v)
 *
 * Combines the four m1 parts of `v` (indices 0..3, extracted with
 * __riscv_vget_<T><S>m1) pairwise with __riscv_<op>:
 *     op(op(part0, part1), op(part2, part3))
 * Every call uses __riscv_vsetvlmax_e<S>m1() as its vl argument.
 *
 * `v` appears four times in the expansion, so pass an expression without
 * side effects.
 */
#define RISCV_SHRINK4(op, T, S, v)                           \
    __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
                              __riscv_vget_##T##S##m1(v, 1), \
                              __riscv_vsetvlmax_e##S##m1()), \
                 __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
                              __riscv_vget_##T##S##m1(v, 3), \
                              __riscv_vsetvlmax_e##S##m1()), \
                 __riscv_vsetvlmax_e##S##m1())
32 | 
/*
 * RISCV_SHRINK8(op, T, S, v)
 *
 * Combines the eight m1 parts of `v` (indices 0..7, extracted with
 * __riscv_vget_<T><S>m1) with __riscv_<op> as a balanced tree:
 *     op(op(op(p0, p1), op(p2, p3)), op(op(p4, p5), op(p6, p7)))
 * Every call uses __riscv_vsetvlmax_e<S>m1() as its vl argument.
 *
 * `v` appears eight times in the expansion, so pass an expression
 * without side effects.
 */
#define RISCV_SHRINK8(op, T, S, v)                                        \
    __riscv_##op(__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
                                           __riscv_vget_##T##S##m1(v, 1), \
                                           __riscv_vsetvlmax_e##S##m1()), \
                              __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
                                           __riscv_vget_##T##S##m1(v, 3), \
                                           __riscv_vsetvlmax_e##S##m1()), \
                              __riscv_vsetvlmax_e##S##m1()),              \
                 __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 4), \
                                           __riscv_vget_##T##S##m1(v, 5), \
                                           __riscv_vsetvlmax_e##S##m1()), \
                              __riscv_##op(__riscv_vget_##T##S##m1(v, 6), \
                                           __riscv_vget_##T##S##m1(v, 7), \
                                           __riscv_vsetvlmax_e##S##m1()), \
                              __riscv_vsetvlmax_e##S##m1()),              \
                 __riscv_vsetvlmax_e##S##m1())
49 | 
/*
 * RISCV_PERM4(f, v, vidx)
 *
 * Calls f(part, vidx, vl) on each of the four u8m1 parts of `v`
 * (extracted with __riscv_vget_u8m1, indices 0..3), with
 * vl = __riscv_vsetvlmax_e8m1(), and reassembles the four results with
 * __riscv_vcreate_v_u8m1_u8m4.
 *
 * The same `vidx` is passed to every call. `v` and `vidx` each appear
 * four times in the expansion, so pass expressions without side effects.
 */
#define RISCV_PERM4(f, v, vidx)                                     \
    __riscv_vcreate_v_u8m1_u8m4(                                    \
        f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \
        f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \
        f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \
        f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()))
56 | 
/*
 * RISCV_LUT4(f, vtbl, v)
 *
 * Calls f(vtbl, part, vl) on each of the four u8m1 parts of `v`
 * (extracted with __riscv_vget_u8m1, indices 0..3), with
 * vl = __riscv_vsetvlmax_e8m1(), and reassembles the four results with
 * __riscv_vcreate_v_u8m1_u8m4.
 *
 * Compared with RISCV_PERM4, the part taken from `v` is the second
 * argument of f and the shared operand `vtbl` is the first. `vtbl` and
 * `v` each appear four times in the expansion, so pass expressions
 * without side effects.
 */
#define RISCV_LUT4(f, vtbl, v)                                      \
    __riscv_vcreate_v_u8m1_u8m4(                                    \
        f(vtbl, __riscv_vget_u8m1(v, 0), __riscv_vsetvlmax_e8m1()), \
        f(vtbl, __riscv_vget_u8m1(v, 1), __riscv_vsetvlmax_e8m1()), \
        f(vtbl, __riscv_vget_u8m1(v, 2), __riscv_vsetvlmax_e8m1()), \
        f(vtbl, __riscv_vget_u8m1(v, 3), __riscv_vsetvlmax_e8m1()))
63 | 
/*
 * RISCV_PERM8(f, v, vidx)
 *
 * Calls f(part, vidx, vl) on each of the eight u8m1 parts of `v`
 * (extracted with __riscv_vget_u8m1, indices 0..7), with
 * vl = __riscv_vsetvlmax_e8m1(), and reassembles the eight results with
 * __riscv_vcreate_v_u8m1_u8m8.
 *
 * The same `vidx` is passed to every call. `v` and `vidx` each appear
 * eight times in the expansion, so pass expressions without side effects.
 */
#define RISCV_PERM8(f, v, vidx)                                     \
    __riscv_vcreate_v_u8m1_u8m8(                                    \
        f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \
        f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \
        f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \
        f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()), \
        f(__riscv_vget_u8m1(v, 4), vidx, __riscv_vsetvlmax_e8m1()), \
        f(__riscv_vget_u8m1(v, 5), vidx, __riscv_vsetvlmax_e8m1()), \
        f(__riscv_vget_u8m1(v, 6), vidx, __riscv_vsetvlmax_e8m1()), \
        f(__riscv_vget_u8m1(v, 7), vidx, __riscv_vsetvlmax_e8m1()))
74 | 
/*
 * RISCV_VMFLTZ(T, v, vl)
 *
 * Reinterprets `v` with __riscv_vreinterpret_i<T> and compares the result
 * against 0 with __riscv_vmslt over `vl` elements, i.e. a signed-integer
 * "less than zero" test on the raw bits of `v`.
 *
 * NOTE(review): the name suggests `v` is a float vector; if so, this tests
 * the sign bit, which would also be set for -0.0 - confirm that callers
 * are fine with that.
 */
#define RISCV_VMFLTZ(T, v, vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl)
76 | 
77 | #endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */
78 | 


--------------------------------------------------------------------------------
/tmpl/volk.tmpl.h:
--------------------------------------------------------------------------------
  1 | /* -*- c++ -*- */
  2 | /*
  3 |  * Copyright 2011-2020 Free Software Foundation, Inc.
  4 |  *
  5 |  * This file is part of VOLK
  6 |  *
  7 |  * SPDX-License-Identifier: LGPL-3.0-or-later
  8 |  */
  9 | 
 10 | #ifndef INCLUDED_VOLK_RUNTIME
 11 | #define INCLUDED_VOLK_RUNTIME
 12 | 
 13 | #include <volk/volk_typedefs.h>
 14 | #include <volk/volk_config_fixed.h>
 15 | #include <volk/volk_common.h>
 16 | #include <volk/volk_complex.h>
 17 | #include <volk/volk_malloc.h>
 18 | #include <volk/volk_version.h>
 19 | 
 20 | #include <stdlib.h>
 21 | #include <stdbool.h>
 22 | 
 23 | __VOLK_DECL_BEGIN
 24 | 
/*!
 * Description of one kernel's set of implementations, as returned by the
 * <kernel>_get_func_desc() functions declared further down.
 *
 * NOTE(review): the three arrays are presumably parallel and n_impls
 * entries long - confirm against the generated source.
 */
typedef struct volk_func_desc
{
    //! Implementation names
    const char **impl_names;
    //! One int per implementation (presumably its dependency mask - confirm)
    const int *impl_deps;
    //! One bool per implementation (presumably true for aligned variants - confirm)
    const bool *impl_alignment;
    //! Number of implementations
    size_t n_impls;
} volk_func_desc_t;
 32 | 
 33 | //! Prints a list of machines available
 34 | VOLK_API void volk_list_machines(void);
 35 | 
 36 | //! Returns the name of the machine this instance will use
 37 | VOLK_API const char* volk_get_machine(void);
 38 | 
 39 | //! Get the machine alignment in bytes
 40 | VOLK_API size_t volk_get_alignment(void);
 41 | 
/*!
 * The VOLK_OR_PTR macro is a convenience macro
 * for checking the alignment of a set of pointers.
 * It casts both pointers to intptr_t, ORs them bitwise and
 * casts the result to const void *.
 * Example usage:
 * volk_is_aligned(VOLK_OR_PTR(VOLK_OR_PTR(p0, p1), p2))
 */
#define VOLK_OR_PTR(ptr0, ptr1) \
    (const void *)(((intptr_t)(ptr0)) | ((intptr_t)(ptr1)))
 50 | 
 51 | /*!
 52 |  * Is the pointer on a machine alignment boundary?
 53 |  *
 54 |  * Note: for performance reasons, this function
 55 |  * is not usable until another volk API call is made
 56 |  * which will perform certain initialization tasks.
 57 |  *
 58 |  * \param ptr the pointer to some memory buffer
 59 |  * \return 1 for alignment boundary, else 0
 60 |  */
 61 | VOLK_API bool volk_is_aligned(const void *ptr);
 62 | 
// Just drop the deprecated attribute in case we are on Windows. Clang and GCC support `__attribute__`.
// We just assume the compiler and the system are tied together as far as Mako templates are concerned.
<%
# Kernels whose declarations below are emitted with __attribute__((deprecated)).
deprecated_kernels = ('volk_16i_x5_add_quad_16i_x4', 'volk_16i_branch_4_state_8',
                      'volk_16i_max_star_16i', 'volk_16i_max_star_horizontal_16i',
                      'volk_16i_permute_and_scalar_add', 'volk_16i_x4_quad_max_star_16i',
                      'volk_32fc_s32fc_multiply_32fc', 'volk_32fc_s32fc_x2_rotator_32fc',
                      'volk_32fc_x2_s32fc_multiply_conjugate_add_32fc')
from platform import system
# platform.system() describes the machine rendering this template. On
# Windows the tuple is emptied, so no declaration gets the attribute.
if system() == 'Windows':
    deprecated_kernels = ()
%>
 75 | %for kern in kernels:
 76 | 
 77 | % if kern.name in deprecated_kernels:
 78 | //! A function pointer to the dispatcher implementation
 79 | extern VOLK_API ${kern.pname} ${kern.name} __attribute__((deprecated));
 80 | 
 81 | //! A function pointer to the fastest aligned implementation
 82 | extern VOLK_API ${kern.pname} ${kern.name}_a __attribute__((deprecated));
 83 | 
 84 | //! A function pointer to the fastest unaligned implementation
 85 | extern VOLK_API ${kern.pname} ${kern.name}_u __attribute__((deprecated));
 86 | 
 87 | //! Call into a specific implementation given by name
 88 | extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name) __attribute__((deprecated));
 89 | 
 90 | //! Get description parameters for this kernel
 91 | extern VOLK_API volk_func_desc_t ${kern.name}_get_func_desc(void) __attribute__((deprecated));
 92 | % else:
 93 | //! A function pointer to the dispatcher implementation
 94 | extern VOLK_API ${kern.pname} ${kern.name};
 95 | 
 96 | //! A function pointer to the fastest aligned implementation
 97 | extern VOLK_API ${kern.pname} ${kern.name}_a;
 98 | 
 99 | //! A function pointer to the fastest unaligned implementation
100 | extern VOLK_API ${kern.pname} ${kern.name}_u;
101 | 
102 | //! Call into a specific implementation given by name
103 | extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name);
104 | 
105 | //! Get description parameters for this kernel
106 | extern VOLK_API volk_func_desc_t ${kern.name}_get_func_desc(void);
107 | % endif
108 | 
109 | %endfor
110 | 
111 | __VOLK_DECL_END
112 | 
113 | #endif /*INCLUDED_VOLK_RUNTIME*/
114 | 


--------------------------------------------------------------------------------
/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm_opts.s:
--------------------------------------------------------------------------------
@ static inline void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
@
@ Structure of the routine:
@   .mainloop     processes 16 floats from each input per iteration into
@                 the four accumulators q0-q3.
@   .smallvector  reduces q0-q3 to the single float s15.
@   .dlabel5      scalar loop over the first r11 leftover points.
@   .dlabel9      4-floats-per-iteration loop over the next leftover points.
@   .dlabel7      scalar loop over whatever is still left.
@   .done         stores s15 to [r0] and returns.
    @ r0 = cVector
    @ r1 = aVector
    @ r2 = bVector
    @ r3 = num_points
    .global    volk_32f_x2_dot_prod_32f_a_neonasm_opts
volk_32f_x2_dot_prod_32f_a_neonasm_opts:
     push    {r4, r5, r6, r7, r8, r9, r10, r11}
    @ sixteenth_points = num_points / 16
    @ (lsrs also sets the flags tested by the beq below)
     lsrs       r8, r3, #4
     sub        r13, r13, #16 @ reserve 16 bytes of stack; released at .done, not otherwise referenced in this routine
    @ 0 out neon accumulators
     veor       q0, q3, q3
     veor       q1, q3, q3
     veor       q2, q3, q3
     veor       q3, q3, q3
     beq        .smallvector @ if less than 16 points skip main loop
     mov        r7, r2  @ copy input ptrs
     mov        r6, r1  @ copy input ptrs
     mov        r5, #0  @ loop counter
.mainloop:
    @ Each iteration loads 64 bytes (16 floats) through r6 and through r7.
     vld4.32    {d16,d18,d20,d22}, [r6]!
     add        r5, r5, #1 @ inc loop counter
     cmp        r5, r8     @ loop counter < sixteenth_points?
     vld4.32    {d24,d26,d28,d30}, [r7]!
     vld4.32    {d17,d19,d21,d23}, [r6]!
     vld4.32    {d25,d27,d29,d31}, [r7]!
     vmla.f32   q3, q8, q12
     vmla.f32   q0, q13, q9
     vmla.f32   q1, q14, q10
     vmla.f32   q2, q15, q11
     bne        .mainloop
     lsl        r12, r8, #6 @ r12 = r8*64 = bytes consumed by the main loop
     add        r1, r1, r12
     add        r2, r2, r12
.smallvector: @ actually this can be skipped for small vectors
     vadd.f32   q3, q3, q0
     lsl        r8, r8, #4 @ sixteenth_points * 16
     cmp        r3, r8     @ num_points < sixteenth_points*16?
     vadd.f32   q2, q1, q2
     vadd.f32   q3, q2, q3 @ sum of 4 accumulators in to q3
     vadd.f32   s15, s12, s15 @ q3 is s12-s15, so reduce to a single float
     vadd.f32   s15, s15, s13
     vadd.f32   s15, s15, s14
     bls        .done      @ if vector is multiple of 16 then finish
     sbfx       r11, r1, #2, #1 @ check alignment: r11 = bit 2 of r1, sign-extended (0 or -1)
     rsb        r9, r8, r3 @ r9 = r3 - r8 = points still to process
     and        r11, r11, #3 @ r11 = 0 or 3
     mov        r6, r1
     cmp        r11, r9
     movcs      r11, r9 @ r11 = min(r11, r9)
     cmp        r9, #3
     movls      r11, r9 @ 3 or fewer points left: handle all of them in .dlabel5
     cmp        r11, #0
     beq        .nothingtodo
     mov        r5, r2
     mov        r12, r8 @ r12 counts points processed so far
.dlabel5:
     add        r12, r12, #1
     vldmia     r6!, {s14}
     rsb        r4, r8, r12
     vldmia     r5!, {s13}
     cmp        r4, r11
     vmla.f32   s15, s13, s14
     mov        r7, r6
     mov        r4, r5
     bcc        .dlabel5
     cmp        r9, r11
     beq        .done
.dlabel8:
     rsb        r9, r11, r9 @ r9 = points left after the .dlabel5 loop
     lsr        r8, r9, #2 @ r8 = number of 4-float groups
     lsls       r10, r8, #2 @ r10 = points covered by those groups
     beq        .dlabel6
     lsl        r6, r11, #2
     veor       q8, q8, q8
     add        r1, r1, r6
     add        r6, r2, r6
     mov        r5, #0
.dlabel9:
     add        r5, r5, #1
     vld1.32    {d20-d21}, [r6]!
     cmp        r5, r8
     vld1.64    {d18-d19}, [r1 :64]!
     vmla.f32   q8, q10, q9
     bcc        .dlabel9
    @ Reduce q8 to one float and add it to s15.
     vadd.f32   d16, d16, d17
     lsl        r2, r10, #2
     veor       q9, q9, q9
     add        r7, r7, r2
     vpadd.f32  d6, d16, d16
     add        r4, r4, r2
     cmp        r9, r10
     add        r12, r12, r10
     vadd.f32   s15, s15, s12
     beq        .done
.dlabel6:
     mov        r2, r7
.dlabel7:
     add        r12, r12, #1
     vldmia     r2!, {s13}
     cmp        r3, r12
     vldmia     r4!, {s14}
     vmla.f32   s15, s13, s14
     bhi        .dlabel7
.done:
     vstr       s15, [r0]
     add        r13, r13, #16
     pop        {r4, r5, r6, r7, r8, r9, r10, r11}
     bx         lr @ lr is the return address
.nothingtodo:
    @ No leading scalar points: set up r12, r4 and r7 the way the
    @ .dlabel5 loop would have left them, then continue at .dlabel8.
     mov        r12, r8
     mov        r4, r2
     mov        r7, r1
     b          .dlabel8
116 | 
117 | 


--------------------------------------------------------------------------------