├── .github
└── workflows
│ └── ci.yml
├── .gitignore
├── CMakeLists.txt
├── CMakeSettings.json
├── GettingStarted
├── CMakeLists.txt
├── GettingStarted.cpp
├── GettingStarted.vcxproj
├── GettingStarted.vcxproj.filters
└── GettingStarted.vcxproj.user
├── LICENSE
├── LeaveOneOutRegression
├── LeaveOneOutRegression.cpp
├── LeaveOneOutRegression.vcxproj
└── LeaveOneOutRegression.vcxproj.user
├── README.md
├── VariadicReduction
├── CMakeLists.txt
├── VariadicReducrion.vcxproj
├── VariadicReducrion.vcxproj.filters
├── VariadicReduction.cpp
├── VariadicReduction.sln
└── VariadicReduction.vcxproj
├── VectorTest
├── DR3_tests.cpp
├── TestAccumulator.cpp
├── TestAllocator.cpp
├── TestCurve.cpp
├── TestFilterSelect.cpp
├── TestFilterTransform.cpp
├── TestScan.cpp
├── TestSpan.cpp
├── TestViews.cpp
├── Test_binary_unitary_operations.cpp
├── Unroll_operators.cpp
├── VTune Profiler Results
│ └── VectorTest
│ │ └── VectorTest.vtuneproj
├── VectorTest.log
├── VectorTest.vcxproj
├── VectorTest.vcxproj.user
├── dr3TestUtil.h
├── packages.config
├── pch.cpp
├── pch.h
├── test.cpp
├── testNamespace.cpp
├── testNamespace.h
└── test_precise_accumulation.cpp
├── Vectorisation.sln
├── Vectorisation
├── CMakeLists.txt
├── Output-Build.txt
├── TextFile1.txt
├── VCL
│ ├── LICENSE
│ ├── README.md
│ ├── dispatch_example1.cpp
│ ├── dispatch_example2.cpp
│ ├── instrset.h
│ ├── instrset_detect.cpp
│ ├── vector_convert.h
│ ├── vectorclass.h
│ ├── vectorf128.h
│ ├── vectorf256.h
│ ├── vectorf256e.h
│ ├── vectorf512.h
│ ├── vectorf512e.h
│ ├── vectori128.h
│ ├── vectori256.h
│ ├── vectori256e.h
│ ├── vectori512.h
│ ├── vectori512e.h
│ ├── vectori512s.h
│ ├── vectori512se.h
│ ├── vectormath_common.h
│ ├── vectormath_exp.h
│ ├── vectormath_hyp.h
│ ├── vectormath_lib.h
│ └── vectormath_trig.h
├── VecX
│ ├── accumulate_transform.h
│ ├── alloc_policy.cpp
│ ├── alloc_policy.h
│ ├── alloc_policy_imp.h
│ ├── apply_operation.h
│ ├── binary_unitary_operations.h
│ ├── binned_accumulator.h
│ ├── boolean_operations.h
│ ├── conditional_select_eval.h
│ ├── dr3.h
│ ├── error_utils.h
│ ├── filter_pipe_and_join.h
│ ├── filter_select.h
│ ├── instruction_traits.h
│ ├── math_ops.h
│ ├── operations.h
│ ├── sampler.h
│ ├── scan.h
│ ├── span.h
│ ├── target_name_space.h
│ ├── transform.h
│ ├── unroll_operators.h
│ ├── vcl_latest.h
│ ├── vec.cpp
│ ├── vec.h
│ ├── vec_bool.h
│ ├── vec_bool_d.h
│ ├── vec_d.h
│ ├── vec_double.h
│ ├── vec_view.h
│ └── zip_utils.h
├── Vectorisation.cpp
├── Vectorisation.log
├── Vectorisation.sln
├── Vectorisation.vcxproj
├── Vectorisation.vcxproj.filters
├── Vectorisation.vcxproj.user
├── intel_Libs
│ ├── libirc.lib
│ ├── svml_disp.lib
│ ├── svml_dispmd.lib
│ ├── svml_dispmt.lib
│ └── svmlpatch.lib
├── intrinsic_utils.h
├── packages.config
└── pch.h
├── accumulateExample
├── CMakeLists.txt
├── accumulateExample.log
├── accumulateExample.vcxproj
├── accumulateExample.vcxproj.filters
├── accumulateExample.vcxproj.user
├── accumulate_example.cpp
├── gnormcpp.cpp
└── norm.h
├── cumNormalExample
├── CMakeLists.txt
├── cumNormal.h
├── cumNormalExample.cpp
├── cumNormalExample.vcxproj
├── cumNormalExample.vcxproj.filters
└── cumNormalExample.vcxproj.user
├── dancingAVX512
├── AVX512Dance.cpp
├── AVX512Dance.h
├── CMakeLists.txt
├── dancingAVX512.cpp
├── dancingAVX512.vcxproj
└── dancingAVX512.vcxproj.user
├── docs
├── BlackScholesVecXX.mp4
├── Build.md
└── cppCon2022.pdf
├── inverseCumNormalExample
├── CMakeLists.txt
├── cdfNormalInverse.cpp
├── cdfNormalInverse.h
├── inverseCumNormalExample.cpp
├── inverseCumNormalExample.log
├── inverseCumNormalExample.vcxproj
├── inverseCumNormalExample.vcxproj.filters
└── inverseCumNormalExample.vcxproj.user
├── lattice
├── CMakeLists.txt
├── americanCrankNicholsonPricer.cpp
├── americanFinitDiffPricer.cpp
├── americanImplicitFiniteDiff.cpp
├── americanTrinomialPricer.cpp
├── americanTrinomialPricerUpAndOut.cpp
├── euroTrinomial.cpp
├── euroTrinomialPricerWithInit.cpp
├── europeanBinomialPricer.cpp
├── lattice.cpp
├── lattice.vcxproj
├── lattice.vcxproj.user
├── lattice_tools.cpp
├── lattice_tools.h
├── pricers.h
└── utils.h
└── scratch
├── CMakeLists.txt
├── scratch.cpp
├── scratch.vcxproj
└── scratch.vcxproj.user
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
# CI workflow: configures and builds the CMake project on Windows and Ubuntu
# for pushes and pull requests targeting the "main" branch.
1 | name: Build and Unittest
2 |
3 | on:
4 | push:
5 | branches: [ "main" ]
6 | pull_request:
7 | branches: [ "main" ]
8 |
9 | env:
10 | # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
11 | BUILD_TYPE: Release
12 |
13 | jobs:
14 | example_matrix:
15 | strategy:
16 | matrix:
# One job per entry; each step below is gated on matrix.os.
17 | os: [windows-latest, ubuntu-latest]
18 | runs-on: ${{ matrix.os }}
19 | steps:
20 | - uses: actions/checkout@v3
21 | - uses: symbitic/install-cmake@master
22 |
23 | - name: CMake Configure and Build on Ubuntu
# NOTE(review): 'Ubuntu-latest' differs in case from the matrix value
# 'ubuntu-latest'. Expression string comparison ignores case, so the step
# still runs, but matching the matrix spelling would be less surprising.
24 | if: matrix.os == 'Ubuntu-latest'
25 | run: |
26 | cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
27 | cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
28 |
29 | - name: CMake Configure and Build on Windows
30 | if: matrix.os == 'windows-latest'
31 | run: |
# Sets up the MSVC environment from a hard-coded VS2022 Enterprise path
# before invoking CMake; runs under cmd (see `shell` below).
32 | call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x86_amd64
33 | cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
34 | cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
35 | shell: cmd
36 |
# The test step is currently disabled, so this workflow only verifies the build.
37 | #- name: Test
38 | # working-directory: ${{github.workspace}}/build
39 | # # Execute tests defined by the CMake configuration.
40 | # # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
41 | # run: ctest -C ${{env.BUILD_TYPE}}
42 |
43 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # This .gitignore file was automatically created by Microsoft(R) Visual Studio.
3 | ################################################################################
4 |
5 | /build
6 | /build_vcc
7 |
8 | /packages
9 | /.vs/DRCubed/v16/TestStore/0
10 | /accumulateExample/clang-cl
11 | /accumulateExample/ICC2022
12 | /accumulateExample/Release
13 | /GettingStarted/clang-cl
14 | /GettingStarted/ICC2022
15 | /GettingStarted/Release
16 | /GettingStarted/x64/Release
17 | /.vs/Vectorisation/v16/ipch/AutoPCH/4e9dfb20cefae0b2
18 | /accumulateExample/x64/Release
19 | /inverseCumNormalExample/clang-cl
20 | /inverseCumNormalExample/ICC2022
21 | /inverseCumNormalExample/Release
22 | /inverseCumNormalExample/x64/Release
23 | /Vectorisation/.vs/Vectorisation/v16
24 | /Vectorisation/clang-cl
25 | /Vectorisation/Debug
26 | /Vectorisation/ICC2022
27 | /Vectorisation/packages/Microsoft.googletest.v140.windesktop.msvcstl.static.rt-dyn.1.8.0
28 | /Vectorisation/packages/Microsoft.googletest.v140.windesktop.msvcstl.static.rt-dyn.1.8.1.4/build/native
29 | /Vectorisation/Release
30 | /Vectorisation/x64/Release
31 | /VectorTest/clang-cl
32 | /VectorTest/ICC2022
33 | /VectorTest/Release
34 | /VectorTest/x64/Release
35 | /x64/Release
36 | /Vectorisation/packages/Microsoft.googletest.v140.windesktop.msvcstl.static.rt-dyn.1.8.1.4
37 | /.vs/Vectorisation/v16
38 | /.vs
39 | /accumulateExample/x64
40 | /GettingStarted/x64
41 | /inverseCumNormalExample/x64
42 | /Vectorisation/x64
43 | /VectorTest/x64
44 | /x64
45 | /accumulateExample/My Advisor Results - accumulateExample
46 | /accumulateExample/VTune Profiler Results/accumulateExample
47 | /cumNormalExample/x64
48 | /inverseCumNormalExample/My Advisor Results - inverseCumNormalExample
49 | /inverseCumNormalExample/VTune Profiler Results/inverseCumNormalExample
50 | /GettingStarted/My Advisor Results - GettingStarted
51 | /accumulateExample/My Inspector Results - accumulateExample
52 | /GettingStarted/My Inspector Results - GettingStarted
53 | /VectorTest/My Advisor Results - VectorTest
54 | /VectorTest/My Inspector Results - VectorTest
55 | /out/build/x64-Debug
56 | /lattice/x64
57 | /dancingAVX512/x64
58 | /Vectorisation/cmake-build-debug
59 | /lattice/cmake-build-debug
60 | /curveExample/Intel® VTune™ Profiler Results/curveExample
61 | /curveExample/My Advisor Results - curveExample
62 | /scratch/x64
63 | /cmake-build-debug
64 | /.idea
65 | /curveExample/x64
66 | /scratch/r000mi2
67 | /scratch/r001mi2
68 | /scratch/r002mi3
69 | /scratch/r003mi2
70 | /scratch/scratch.inspxeproj
71 | /debug.log
72 | /cmake-build-release/.cmake/api/v1/reply
73 | /cmake-build-release/.cmake/api/v1/query
74 | /cmake-build-release/accumulateExample/CMakeFiles/accumulateExample.dir/accumulate_example.cpp.obj
75 | /cmake-build-release
76 | /VariadicReducrion/e000
77 | /VariadicReducrion/Intel® VTune™ Profiler Results/VariadicReducrion
78 | /VariadicReducrion/x64
79 | /DawnCache
80 | /config
81 | /GPUCache
82 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
# Top-level build script for the DR3 project: sets the C++ standard and pulls
# in the library and example sub-projects.
1 | cmake_minimum_required(VERSION 3.17)
2 |
3 | project(DR3)
4 |
# Require C++17 for every target defined below.
5 | set(CMAKE_CXX_STANDARD 17)
6 | set(CMAKE_CXX_STANDARD_REQUIRED True)
7 |
8 |
# Vectorisation is added first; the example CMakeLists visible in this dump
# (GettingStarted, VariadicReduction) link against a target of that name.
# NOTE(review): the VectorTest and LeaveOneOutRegression directories are not
# added here — confirm that is intentional.
9 | add_subdirectory(Vectorisation)
10 | add_subdirectory(accumulateExample)
11 | add_subdirectory(cumNormalExample)
12 | add_subdirectory(inverseCumNormalExample)
13 | add_subdirectory(dancingAVX512)
14 | add_subdirectory(lattice)
15 | add_subdirectory(GettingStarted)
16 | add_subdirectory(scratch)
17 | add_subdirectory(VariadicReduction)
18 |
19 |
--------------------------------------------------------------------------------
/CMakeSettings.json:
--------------------------------------------------------------------------------
1 | {
2 | "configurations": [
3 | {
4 | "name": "x64-Debug",
5 | "generator": "Ninja",
6 | "configurationType": "Debug",
7 | "inheritEnvironments": [ "msvc_x64_x64" ],
8 | "buildRoot": "${projectDir}\\out\\build\\${name}",
9 | "installRoot": "${projectDir}\\out\\install\\${name}",
10 | "cmakeCommandArgs": "",
11 | "buildCommandArgs": "",
12 | "ctestCommandArgs": ""
13 | },
14 | {
15 | "name": "Linux-GCC-Release",
16 | "generator": "Ninja",
17 | "configurationType": "RelWithDebInfo",
18 | "cmakeExecutable": "cmake",
19 | "remoteCopySourcesExclusionList": [ ".vs", ".git", "out" ],
20 | "cmakeCommandArgs": "",
21 | "buildCommandArgs": "",
22 | "ctestCommandArgs": "",
23 | "inheritEnvironments": [ "linux_x64" ],
24 | "remoteMachineName": "${defaultRemoteMachineName}",
25 | "remoteCMakeListsRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/src",
26 | "remoteBuildRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/out/build/${name}",
27 | "remoteInstallRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/out/install/${name}",
28 | "remoteCopySources": true,
29 | "rsyncCommandArgs": "-t --delete --delete-excluded",
30 | "remoteCopyBuildOutput": false,
31 | "remoteCopySourcesMethod": "rsync"
32 | }
33 | ]
34 | }
--------------------------------------------------------------------------------
/GettingStarted/CMakeLists.txt:
--------------------------------------------------------------------------------
# Builds the GettingStarted example executable from its single source file.
1 | add_executable(GettingStarted GettingStarted.cpp)
2 |
# Links against the Vectorisation target.
3 | target_link_libraries(GettingStarted PUBLIC Vectorisation)
4 |
# Adds the project's binary directory to the include path.
5 | target_include_directories(GettingStarted PUBLIC
6 | "${PROJECT_BINARY_DIR}"
7 | )
--------------------------------------------------------------------------------
/GettingStarted/GettingStarted.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/GettingStarted/GettingStarted.vcxproj.user:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/LeaveOneOutRegression/LeaveOneOutRegression.vcxproj.user:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DR3
2 |
3 | To get full use of the repo, you need a modern processor which has AVX512 or AVX2 instructions.
4 | If your processor only has AVX2, you need to change the target instruction sets in the projects to AVX2, and don't generate AVX512
5 | code in the projects because your machine won't run it.
6 |
7 | The projects build with GCC, clang, IC2022 and VS2019.
8 | In visual c++ select x64 and solution configuration for IC2022, release, debug and clang
9 |
10 | The getting started project shows some example use cases for vectors, filters and views, together with an experimental
11 | vectorised forward AAD example for computing option sensitivities.
12 |
13 | The accumulate example shows some of the use cases given in the cppCon2022 talk.
14 | Additionally, it gives an example of error correction using Kahan accumulation.
15 |
16 | The examples build and run with VS2019, clang and Intel compilers. The target instruction set
17 | generated by the framework can be changed by changing the namespace. The namespaces cover double and float
18 | types; VecDb is a pair of doubles. Uncomment the required namespace and build the example.
19 |
20 | //using namespace DRC::VecDb;
21 |
22 | //using namespace DRC::VecD2D; //sse2 double
23 |
24 | using namespace DRC::VecD4D; //avx2 double
25 |
26 | //using namespace DRC::VecF8F; // avx2 float
27 |
28 | //using namespace DRC::VecD8D; //avx512 double
29 |
30 | //using namespace DRC::VecF16F; //avx512 float
31 |
32 |
33 | For a machine supporting AVX512, ensure all the Visual Studio projects are set to use the enhanced instruction set:
34 | Configuration Properties > C/C++ > Code Generation > Enable Enhanced Instruction Set, set to /arch:AVX512.
35 | If your machine doesn't support this, reduce to AVX2 or SSE2, and don't select a namespace in the code requiring a more advanced instruction
36 | set.
37 |
38 | Uncomment one of the `using namespace` lines to select the instruction set that you wish to run.
39 | Those ending in F have float as the underlying type; those ending in D have double.
40 |
41 | The project is set to compile using the AVX512 enhanced instruction set. The namespace selection
42 | chooses the type of the intrinsics that are used to instantiate lambdas.
43 |
44 | If your hardware does not support AVX512, choose the next level down (AVX2) and avoid using the namespaces
45 | DRC::VecD8D or DRC::VecF16F, which will cause generation of code with instructions that your computer doesn't support.
46 |
47 | Check Device Manager > Processors to determine which processor you have, and check it against the Intel web site:
48 | https://ark.intel.com/content/www/us/en/ark/products/123550/intel-xeon-silver-4114-processor-13-75m-cache-2-20-ghz.html
49 | or
50 | https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html
51 |
52 |
53 | The getting started project shows the usage of vectors, lambdas and filters.
54 |
55 | The accumulateExample builds the performance examples covered in the cppCon2022 talk.
56 | They give the user the chance to switch between ICC, clang and VS2019 builds, and to change the
57 | instruction set used via the using declaration.
58 |
59 | The inverseCumNormalExample gives the performance example shown in cppCon2022, although there might be some slight
60 | performance regression in one or two of the examples. It's instructive to run the examples after building with the
61 | different compilers and choosing different instruction sets for the lambdas (via the namespace).
62 |
63 | The AVX512Dance function runs a routine which finds the max value in an array, using AVX2 and AVX512. By monitoring the
64 | power usage with something like Open Hardware Monitor, it's possible to see that the AVX512 instructions use less
65 | energy to do the compute than the AVX2 ones (on this Xeon Silver 4114).
66 |
67 | VectorTest is a selection of tests using googletest.
68 | The main library is Vectorisation. This references a local copy of the VCL2 library, which has a slight change to enable
69 | VCL2 to be used with the Intel IC2022 compiler.
70 |
71 |
72 | ## Building DR3
73 |
74 | See [docs/Build.md](docs/Build.md) for instructions on how to build DR3 from source and a list of supported platforms.
75 |
76 |
--------------------------------------------------------------------------------
/VariadicReduction/CMakeLists.txt:
--------------------------------------------------------------------------------
# Builds the variadic reduction example. Note the target is named
# VariadicExample although the directory and source are VariadicReduction.
1 | add_executable(VariadicExample VariadicReduction.cpp)
2 |
# Links against the Vectorisation target.
3 | target_link_libraries(VariadicExample PUBLIC Vectorisation)
4 |
# Adds the project's binary directory to the include path.
5 | target_include_directories(VariadicExample PUBLIC
6 | "${PROJECT_BINARY_DIR}"
7 | )
8 |
--------------------------------------------------------------------------------
/VariadicReduction/VariadicReducrion.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 |
--------------------------------------------------------------------------------
/VariadicReduction/VariadicReduction.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.7.34202.233
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "VariadicReduction", "VariadicReduction.vcxproj", "{271CF3D5-72FF-4657-9325-4206B8D5C84F}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | clang-cl23|x64 = clang-cl23|x64
11 | clang-cl23|x86 = clang-cl23|x86
12 | Debug|x64 = Debug|x64
13 | Debug|x86 = Debug|x86
14 | ICC2023|x64 = ICC2023|x64
15 | ICC2023|x86 = ICC2023|x86
16 | Release|x64 = Release|x64
17 | Release|x86 = Release|x86
18 | Release-23|x64 = Release-23|x64
19 | Release-23|x86 = Release-23|x86
20 | EndGlobalSection
21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
22 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x64.ActiveCfg = clang-cl23|x64
23 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x64.Build.0 = clang-cl23|x64
24 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x86.ActiveCfg = clang-cl23|Win32
25 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x86.Build.0 = clang-cl23|Win32
26 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x64.ActiveCfg = Debug|x64
27 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x64.Build.0 = Debug|x64
28 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x86.ActiveCfg = Debug|Win32
29 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x86.Build.0 = Debug|Win32
30 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x64.ActiveCfg = ICC2023|x64
31 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x64.Build.0 = ICC2023|x64
32 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x86.ActiveCfg = ICC2023|Win32
33 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x86.Build.0 = ICC2023|Win32
34 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x64.ActiveCfg = Release|x64
35 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x64.Build.0 = Release|x64
36 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x86.ActiveCfg = Release|Win32
37 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x86.Build.0 = Release|Win32
38 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x64.ActiveCfg = Release-23|x64
39 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x64.Build.0 = Release-23|x64
40 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x86.ActiveCfg = Release-23|Win32
41 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x86.Build.0 = Release-23|Win32
42 | EndGlobalSection
43 | GlobalSection(SolutionProperties) = preSolution
44 | HideSolutionNode = FALSE
45 | EndGlobalSection
46 | GlobalSection(ExtensibilityGlobals) = postSolution
47 | SolutionGuid = {347FE8D5-D275-4584-8F15-DD105566C258}
48 | EndGlobalSection
49 | EndGlobal
50 |
--------------------------------------------------------------------------------
/VectorTest/TestAllocator.cpp:
--------------------------------------------------------------------------------
1 | #include "pch.h"
2 |
3 | #include "../Vectorisation/VecX/vec.h"
4 | #include "../Vectorisation/VecX/operations.h"
5 | #include "../Vectorisation/VecX/vec_bool_d.h"
6 | #include "../Vectorisation/VecX/vec_double.h"
7 | #include "../Vectorisation/VecX/alloc_policy.h"
8 | #include "../Vectorisation/VecX/alloc_policy_imp.h"
9 | #include "../Vectorisation/VecX/target_name_space.h"
10 |
11 |
12 | TEST(TestCaseAlloc, fillup_empty_last) {
13 | EXPECT_EQ(1, 1);
14 | EXPECT_TRUE(true);
15 |
16 | PoolStrat myPool(10);
17 | int MAX_EL = 20;
18 | std::vector pAlloc;
19 |
20 | int pos = myPool.pos();
21 | for (int i = 0; i < MAX_EL; ++i)
22 | {
23 | double* p = myPool.alloc();
24 | pos = myPool.pos();
25 |
26 | (*p) = i;
27 | pAlloc.push_back(p);
28 | }
29 |
30 | int szx =myPool.size();
31 | pos = myPool.pos();
32 |
33 | for (int k = pos; k > 0; --k)
34 | {
35 | double* pback = pAlloc.back();
36 | pAlloc.pop_back();
37 | myPool.free(pback);
38 |
39 | pos = myPool.pos();
40 |
41 | }
42 |
43 |
44 | }
45 |
46 |
47 | TEST(TestCaseAlloc, fillup_empty_secondlast) {
48 | EXPECT_EQ(1, 1);
49 | EXPECT_TRUE(true);
50 |
51 | PoolStrat myPool(10);
52 | int MAX_EL = 20;
53 | std::vector pAlloc;
54 |
55 | int pos = myPool.pos();
56 | for (int i = 0; i < MAX_EL; ++i)
57 | {
58 | double* p = myPool.alloc();
59 | pos = myPool.pos();
60 |
61 | (*p) = i;
62 | pAlloc.push_back(p);
63 | }
64 |
65 | int szx = myPool.size();
66 | pos = myPool.pos();
67 |
68 | for (int k = pos; k > 1; --k)
69 | {
70 | double* pback = pAlloc[k - 2];
71 | //pAlloc.pop_back();
72 | myPool.free(pback);
73 |
74 | pos = myPool.pos();
75 |
76 | }
77 |
78 | //all ok
79 | //add one
80 | auto newOne = myPool.alloc();
81 | (*newOne) = 88;
82 |
83 | myPool.free(newOne);
84 | myPool.free(pAlloc[MAX_EL - 1]);
85 |
86 | //all deleted
87 | auto newOnetoo = myPool.alloc();
88 | (*newOnetoo) = 99;
89 | //one element 99
90 |
91 |
92 | for (int i = 0; i < 3; ++i)
93 | {
94 | auto newOnetoo = myPool.alloc();
95 | (*newOnetoo) = 44 + i;
96 | }
97 |
98 |
99 | //needto test some vakues
100 | }
101 |
102 |
103 | using namespace DRC::VecD4D;
104 |
105 | TEST(TestCaseAlloc, monkyBusinessBuffer) {
106 | EXPECT_EQ(1, 1);
107 | EXPECT_TRUE(true);
108 |
109 |
110 | std::vector mix(21,1.0);
111 | VecXX Vec2(mix);
112 |
113 |
114 | auto d = Vec2;
115 | auto a = d;
116 | auto b = a;
117 | auto c = b;
118 |
119 |
120 | a *= -1.0;
121 | auto w = log(-a);
122 | std::vector cach(w.size());
123 | for (size_t i = 0; i < w.size(); i++)
124 | {
125 | cach[i] = w[i];
126 | }
127 | auto aa = -b;
128 |
129 | //operation above should not change
130 | for (size_t i = 0; i < w.size(); i++)
131 | {
132 | double cacI = cach[i];
133 | double wI = w[i];
134 | EXPECT_EQ(cacI, wI);
135 | }
136 |
137 |
138 |
139 | }
--------------------------------------------------------------------------------
/VectorTest/TestCurve.cpp:
--------------------------------------------------------------------------------
1 | #include "pch.h"
2 |
3 |
4 | #include "../../Vectorisation/ExampleVectors/curve.h"
5 | #include "../Vectorisation/VecX/operations.h"
6 | #include "../Vectorisation/VecX/vec_bool_d.h"
7 | #include "../Vectorisation/VecX/vec_double.h"
8 | #include "../Vectorisation/VecX/alloc_policy.h"
9 |
// Vector type aliases used by the curve test.
// NOTE(review): VecD and Vec are used here without template argument lists,
// which appear to have been lost in this copy of the file — restore them from
// the original source before building.
10 | typedef VecD VecxD;
11 | typedef VecD Vecx;
12 | typedef Vec VecXX;
13 |
// Exercises Curve / Curve2: a scalar curve is queried at 0.0 and 0.5 with
// checked results, then vector-valued curves are built and queried without
// checking the results.
// NOTE(review): several declarations below appear to have lost their template
// argument lists in this copy of the file (e.g. `Curve2>` and the bare
// `ZeroInterp`); restore them from the original source.
14 | TEST(TestCaseCurve, Test1) {
15 | EXPECT_EQ(1, 1);
16 | EXPECT_TRUE(true);
17 |
18 | std::vector values{ 0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0 };
19 | std::vector dates = { 0,1,2,3,4,5,6,7,8,9,10 };
20 | std::vector datesD = { 0,1,2,3,4,5,6,7,8,9,10 };
21 |
22 |
23 |
24 |
// Scalar curve: values are passed first and dates second here, the opposite
// order to the setValues calls further down (hence the original remark).
25 | Curve< double, double> testCurve;
26 | testCurve.setValues(begin(values), end(values), begin(datesD), end(datesD)); //wrong way round
27 |
28 | auto val = testCurve.valueAt(0.0);
29 |
30 | EXPECT_EQ(val, 0.0);
31 | val = testCurve.valueAt(0.5);
32 | EXPECT_EQ(val, 0.5);
33 |
34 |
35 | ///////////////////////////
// Eleven vector values, one per date, each constructed from
// (i * 0.001 + 0.06, 100).
36 | std::vector< VecXX> vecVals;
37 | for (int i = 0; i < 11; i++)
38 | {
39 | VecXX vals(i * 0.001 + 0.06, 100);
40 | vecVals.push_back(vals);
41 |
42 | }
43 |
44 |
// Vector-valued curve; the queried values are not checked.
45 | {
46 | using ZeroCrv = Curve< double, VecXX, ZeroInterp >;
47 |
48 | ZeroCrv testCurve2;
49 | testCurve2.setValues(begin(datesD), end(datesD), begin(vecVals), end(vecVals));
50 |
51 | auto valV = testCurve2.valueAt(0.0);
52 |
53 | auto valV2 = testCurve2.valueAt(0.5);
54 | }
55 |
56 |
// Same data again through Curve2 (constructed with 10), followed by 10000
// repeated lookups at 0.5; results are not checked.
57 | {
58 | std::vector values{ 0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0 };
59 | std::vector dates = { 0,1,2,3,4,5,6,7,8,9,10 };
60 | std::vector datesD = { 0,1,2,3,4,5,6,7,8,9,10 };
61 |
62 | std::vector< VecXX> vecVals;
63 | for (int i = 0; i < 11; i++)
64 | {
65 | VecXX vals(i * 0.001 + 0.06, 100);
66 | vecVals.push_back(vals);
67 |
68 | }
69 |
70 |
71 | Curve2> testCurve2(10);
72 | testCurve2.setValues(begin(datesD), end(datesD), begin(vecVals), end(vecVals));
73 |
74 | auto valV = testCurve2.valueAt(0.0);
75 |
76 | auto valV2 = testCurve2.valueAt(0.5);
77 |
78 |
79 | for (long l = 0; l < 10000; l++)
80 | {
81 | auto valV3 = testCurve2.valueAt(0.5);
82 | }
83 | }
84 |
85 | //EXPECT_EQ(val, 0.0);
86 | //val = testCurve.valueAt(0.5);
87 | //EXPECT_EQ(val, 0.5);
88 | }
--------------------------------------------------------------------------------
/VectorTest/TestFilterTransform.cpp:
--------------------------------------------------------------------------------
1 | #include "pch.h"
2 |
3 | #include "../Vectorisation/VecX/vec.h"
4 | #include "../Vectorisation/VecX/operations.h"
5 | #include "../Vectorisation/VecX/vec_bool_d.h"
6 | #include "../Vectorisation/VecX/vec_double.h"
7 | #include "../Vectorisation/VecX/alloc_policy.h"
8 | #include "../Vectorisation/VecX/vec_d.h"
9 | #include "../Vectorisation/VecX/vec_bool.h"
10 | #include "../Vectorisation/VecX/vec_view.h"
11 |
12 | #include "../Vectorisation/VecX/target_name_space.h"
13 |
14 |
15 | #include "../Vectorisation/VecX/dr3.h"
16 | #include "dr3TestUtil.h"
17 |
18 | #include
19 | #include "testNamespace.h"
20 |
21 |
22 | void testFilterTransform(int SZ )
23 | {
24 |
25 | auto onlyJlambda = [=](auto x) { return (asNumber(j) > (x - asNumber(0.0001)) && (asNumber(j) < x + asNumber(0.00001))); };
26 | std::vector input(SZ,asNumber( 0.0));
27 | std::iota(begin(input), end(input), asNumber(0.0));
28 |
29 | VecXX testVec(input);
30 | auto trueLambdaS = [&](auto x) { return x; };
31 | auto falseLambdaS = [&](auto x) { return -x; };
32 |
33 |
34 | for (int j = 0; j < SZ; ++j)
35 | {
36 | auto onlyJlambda = [=](auto x) { return (asNumber(j) > (x - asNumber(0.0001)) && (asNumber(j) < x + asNumber(0.00001))); };
37 | VecXX res = filterTransform(onlyJlambda, testVec, trueLambdaS, falseLambdaS);
38 |
39 | for (int k = 0; k < SZ; k++)
40 | {
41 | if( k==j)
42 | {
43 | EXPECT_NUMERIC_EQ(res[k], asNumber( k));
44 | }
45 | else
46 | {
47 | EXPECT_NUMERIC_EQ(res[k], asNumber(-k));
48 | }
49 | }
50 | }
51 |
52 | }
53 |
54 |
55 |
56 |
57 | TEST(TestFilterTransform, testTransformEachPoint)
58 | {
59 |
60 | for (int SZ = 3; SZ < 33; SZ++)
61 | {
62 | testFilterTransform(SZ);
63 | }
64 |
65 |
66 | testFilterTransform(34);
67 | testFilterTransform(65);
68 | testFilterTransform(63);
69 | testFilterTransform(64);
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/VectorTest/TestScan.cpp:
--------------------------------------------------------------------------------
1 | #include "pch.h"
2 |
3 |
4 |
5 | #include "../Vectorisation/VecX/vec.h"
6 | #include "../Vectorisation/VecX/operations.h"
7 | #include "../Vectorisation/VecX/vec_bool_d.h"
8 | #include "../Vectorisation/VecX/vec_double.h"
9 | #include "../Vectorisation/VecX/alloc_policy.h"
10 |
11 | #include "../Vectorisation/VecX/vec_d.h"
12 | #include "../Vectorisation/VecX/vec_bool.h"
13 | #include "../Vectorisation/VecX/vec_view.h"
14 |
15 | #include "../Vectorisation/VecX/target_name_space.h"
16 |
17 |
18 | #include "../Vectorisation/VecX/dr3.h"
19 | #include "../Vectorisation/VecX/scan.h"
20 | #include "../Vectorisation/VecX/instruction_traits.h"
21 |
22 |
23 | #include "testNamespace.h"
24 | #include "dr3TestUtil.h"
25 |
26 | #include
27 |
28 | #include
29 |
30 |
31 |
32 |
33 |
34 | void testScan(int SZ)
35 | {
36 |
37 |
38 | std::vector input(SZ, asNumber(0.0));
39 | std::iota(begin(input), end(input), asNumber(0.0));
40 |
41 | VecXX testVec(input);
42 | auto add = [](auto x, auto y) {return x + y; };
43 |
44 |
45 | for (int j = 0; j < SZ; ++j)
46 | {
47 |
48 | auto res = scan( testVec, add);
49 |
50 | std::vector dbg = res;
51 |
52 | auto expected = testVec[0];
53 |
54 | EXPECT_NUMERIC_EQ(expected, res[0]);
55 |
56 | for (int k = 1; k < SZ; k++)
57 | {
58 | expected += testVec[k] ;
59 | EXPECT_NUMERIC_EQ(expected, res[k]);
60 | }
61 | }
62 |
63 |
64 |
65 |
66 | }
67 |
68 |
69 |
// Tolerance selected by the floating-point type of the (unused) argument:
// 10^(4-16) for long double and double, 10^(4-8) for float.
// The value is computed in double in every overload; the float overload also
// returns double.
long double getErr(long double)
{
    const int exponent = 4 - 16;
    return std::pow(10, exponent);
}

double getErr(double)
{
    const int exponent = 4 - 16;
    return std::pow(10, exponent);
}

double getErr(float)
{
    const int exponent = 4 - 8;
    return std::pow(10, exponent);
}
84 |
85 |
86 | void testScan1(int SZ ,double start)
87 | {
88 |
89 |
90 | std::vector input(SZ, asNumber(0.0));
91 | std::iota(begin(input), end(input), asNumber(start));
92 |
93 |
94 | Numeric err = getErr(Numeric(0.));
95 |
96 | VecXX testVec(input);
97 | auto add = [](auto x, auto y) {return x + y; };
98 |
99 |
100 | for (int j = 0; j < SZ; ++j)
101 | {
102 |
103 | auto res = scan(testVec, add);
104 |
105 | std::vector dbg = res;
106 |
107 | std::vector expected;
108 | std::inclusive_scan(cbegin(input), cend(input), std::back_inserter( expected));
109 |
110 | EXPECT_NEAR(expected[0], res[0], err);
111 |
112 | for (int k = 1; k < SZ; k++)
113 | {
114 | auto relErr = err * std::max(Numeric(1.), std::abs(Numeric(expected[k])));
115 | EXPECT_NEAR(expected[k], res[k], relErr);
116 |
117 | }
118 | }
119 |
120 | }
121 |
122 |
123 |
124 |
125 |
126 |
127 | TEST(TestScan, scanShortVector)
128 | {
129 |
130 | for (int SZ = 3; SZ < 33; SZ++)
131 | {
132 | testScan(SZ);
133 | }
134 |
135 | for (int SZ = 3; SZ < 133; SZ++)
136 | {
137 | testScan1(SZ,3.14);
138 | }
139 |
140 | }
141 |
142 |
143 |
144 |
145 |
146 | void testTransformScan1(int SZ, double start)
147 | {
148 |
149 |
150 | std::vector input(SZ, asNumber(0.0));
151 | std::iota(begin(input), end(input), asNumber(start));
152 |
153 |
154 | Numeric err = getErr(Numeric(0.));
155 |
156 | VecXX testVec(input);
157 | auto SQR = [](auto x) { return x * x; };
158 |
159 | auto sqrVec = transform( [](auto x) {return x * x; }, testVec);
160 | std::vector< Numeric> sq = sqrVec;
161 | auto add = [](auto x, auto y) {return x + y; };
162 |
163 |
164 | for (int j = 0; j < SZ; ++j)
165 | {
166 |
167 | auto res = ApplyTransformScan(testVec, add, SQR);
168 |
169 | std::vector dbg = res;
170 |
171 | std::vector expected;
172 | std::inclusive_scan(cbegin(sq), cend(sq), std::back_inserter(expected));
173 |
174 | EXPECT_NEAR(expected[0], res[0], err);
175 |
176 | for (int k = 1; k < SZ; k++)
177 | {
178 | auto relErr = err * std::max(Numeric(1.), std::abs(Numeric(expected[k])));
179 | EXPECT_NEAR(expected[k], res[k], relErr);
180 |
181 | }
182 | }
183 |
184 | }
185 |
186 |
187 |
188 |
189 |
190 |
191 | TEST(TestTransformScanTransform, transformScanShortVector)
192 | {
193 |
194 | for (int SZ = 3; SZ < 33; SZ++)
195 | {
196 | testTransformScan1(SZ,0);
197 | }
198 |
199 | for (int SZ = 3; SZ < 133; SZ++)
200 | {
201 | testTransformScan1(SZ, 3.14);
202 | }
203 |
204 | }
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 | void testTransformScan2(int SZ, double start)
213 | {
214 |
215 |
216 | std::vector input(SZ, asNumber(0.0));
217 | std::iota(begin(input), end(input), asNumber(start));
218 |
219 |
220 | Numeric err = getErr(Numeric(0.));
221 |
222 | VecXX testVec(input);
223 |
224 | VecXX testVec1 = testVec + 1.0;
225 |
226 | auto MULT = [](auto x,auto y) { return x * y; };
227 |
228 | auto multVec = testVec * testVec1;
229 |
230 |
231 | std::vector< Numeric> prod = multVec;
232 | auto add = [](auto x, auto y) {return x + y; };
233 |
234 |
235 | for (int j = 0; j < SZ; ++j)
236 | {
237 |
238 | auto res = ApplyTransformScan(testVec, testVec1, add, MULT);
239 |
240 | std::vector dbg = res;
241 |
242 | std::vector expected;
243 | std::inclusive_scan(cbegin(prod), cend(prod), std::back_inserter(expected));
244 |
245 | EXPECT_NEAR(expected[0], res[0], err);
246 |
247 | for (int k = 1; k < SZ; k++)
248 | {
249 | auto relErr = err * std::max(Numeric(1.), std::abs(Numeric(expected[k])));
250 | EXPECT_NEAR(expected[k], res[k], relErr);
251 |
252 | }
253 | }
254 |
255 | }
256 |
257 |
258 |
259 |
260 |
261 |
262 | TEST(TestTransformScanTransform, transformScanShortVectorBinary)
263 | {
264 |
265 | for (int SZ = 3; SZ < 33; SZ++)
266 | {
267 | testTransformScan2(SZ, 0);
268 | }
269 |
270 | for (int SZ = 3; SZ < 133; SZ++)
271 | {
272 | testTransformScan2(SZ, 3.14);
273 | }
274 |
275 | }
276 |
277 |
278 |
--------------------------------------------------------------------------------
/VectorTest/VectorTest.log:
--------------------------------------------------------------------------------
1 | C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Current\Bin\Microsoft.Common.CurrentVersion.targets(820,5): error : The BaseOutputPath/OutputPath property is not set for project 'VectorTest.vcxproj'. Please check to make sure that you have specified a valid combination of Configuration and Platform for this project. Configuration='Debug' Platform='ARM64'. This error may also appear if some other project is trying to follow a project-to-project reference to this project, this project has been unloaded or is not included in the solution, and the referencing project does not build using the same or an equivalent Configuration or Platform.
2 |
--------------------------------------------------------------------------------
/VectorTest/VectorTest.vcxproj.user:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/VectorTest/dr3TestUtil.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "pch.h"
3 |
4 |
5 | #include "../Vectorisation/VecX/vec.h"
6 | #include "testNamespace.h"
7 |
// Conversion helpers: each overload casts its argument to Numeric, the
// floating-point type the test build is configured for.
Numeric asNumber(long double x);

Numeric asNumber(double x);

Numeric asNumber(float x);

Numeric asNumber(int x);

// Equality expectations, overloaded on operand type. Each forwards to a
// gtest macro: EXPECT_DOUBLE_EQ for long double and double,
// EXPECT_FLOAT_EQ for float, EXPECT_EQ for int.
void EXPECT_NUMERIC_EQ(long double x, long double y);

void EXPECT_NUMERIC_EQ(double x, double y);

void EXPECT_NUMERIC_EQ(float x, float y);

void EXPECT_NUMERIC_EQ(int x, int y);
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/VectorTest/packages.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/VectorTest/pch.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // pch.cpp
3 | //
4 |
5 | //#include "pch.h"
6 |
--------------------------------------------------------------------------------
/VectorTest/pch.h:
--------------------------------------------------------------------------------
1 | //
2 | // pch.h
3 | //
4 |
5 | #pragma once
6 |
7 | #include "gtest/gtest.h"
8 |
--------------------------------------------------------------------------------
/VectorTest/testNamespace.cpp:
--------------------------------------------------------------------------------
1 | #include "testNamespace.h"
2 |
3 | Numeric asNumber(long double x)
4 | {
5 | return static_cast(x);
6 | }
7 |
8 | Numeric asNumber(double x)
9 | {
10 | return static_cast(x);
11 | }
12 |
13 | Numeric asNumber(float x)
14 | {
15 | return static_cast(x);
16 | }
17 |
18 |
19 | Numeric asNumber(int x)
20 | {
21 | return static_cast(x);
22 | }
23 |
24 |
// Equality expectation for long double operands, forwarded to EXPECT_DOUBLE_EQ.
// NOTE(review): the operands are presumably compared at double precision - confirm.
void EXPECT_NUMERIC_EQ(long double x, long double y)
{
    EXPECT_DOUBLE_EQ(x, y);
}


// Equality expectation for double operands, forwarded to EXPECT_DOUBLE_EQ.
void EXPECT_NUMERIC_EQ(double x, double y)
{
    EXPECT_DOUBLE_EQ(x, y);
}


// Equality expectation for float operands, forwarded to EXPECT_FLOAT_EQ.
void EXPECT_NUMERIC_EQ(float x, float y)
{
    EXPECT_FLOAT_EQ(x, y);
}


// Equality expectation for int operands, forwarded to EXPECT_EQ (exact match).
void EXPECT_NUMERIC_EQ(int x, int y)
{
    EXPECT_EQ(x, y);
}
47 |
48 |
--------------------------------------------------------------------------------
/VectorTest/testNamespace.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "pch.h"
3 |
4 |
5 | #include "../Vectorisation/VecX/vec.h"
6 | #include "../Vectorisation/VecX/target_name_space.h"
7 | #include "../Vectorisation/VecX/instruction_traits.h"
8 |
9 | //using namespace DRC::VecDb;
10 | //using namespace DRC::VecLDb;
11 |
12 | //using namespace DRC::VecF4F;
13 | //using namespace DRC::VecD2D;
14 | using namespace DRC::VecD4D;
15 | //using namespace DRC::VecF8F;
16 |
17 | //using namespace DRC::VecD8D;
18 | //using namespace DRC::VecF16F;
19 |
20 |
21 | using Numeric = InstructionTraits::FloatType;
22 | #include "dr3TestUtil.h"
23 |
--------------------------------------------------------------------------------
/VectorTest/test_precise_accumulation.cpp:
--------------------------------------------------------------------------------
1 | #include "pch.h"
2 |
3 |
4 | #include "../Vectorisation/VecX/vec.h"
5 | #include "../Vectorisation/VecX/operations.h"
6 | #include "../Vectorisation/VecX/vec_bool_d.h"
7 | #include "../Vectorisation/VecX/vec_double.h"
8 | #include "../Vectorisation/VecX/alloc_policy.h"
9 | #include "../Vectorisation/VecX/accumulate_transform.h"
10 | #include "../Vectorisation/VecX/target_name_space.h"
11 |
12 | #include "../Vectorisation/VecX/dr3.h"
13 | #include "testNamespace.h"
14 | #include "dr3TestUtil.h"
15 |
16 | #include
17 |
18 |
19 | auto getVecBig(int SZ, std::vector& stl)
20 | {
21 | std::vector v(SZ, asNumber(1.0/3.0));
22 | int i = 0;
23 |
24 | VecXX test(v);
25 | stl = v;
26 | return test;
27 |
28 | }
29 |
30 |
31 | void evalPrecAccumulate(int startLen, int endLen)
32 | {
33 |
34 | Numeric testEpsilon = 1e-10;
35 |
36 | for (int SZ = startLen; SZ <= endLen; SZ++)
37 | {
38 | std::vector v;
39 | VecXX test = getVecBig(SZ, v);
40 | using BINNED_ACCUMULATOR = BinsT;
41 | auto binned_Sum = reduce< BINNED_ACCUMULATOR >(test, BinnedAdd);
42 |
43 | EXPECT_NEAR(double(SZ / 3.0), binned_Sum, testEpsilon);
44 | }
45 |
46 | }
47 |
48 | TEST(TestPreciseAccumulator, simpleSummation)
49 | {
50 | EXPECT_EQ(1, 1);
51 | EXPECT_TRUE(true);
52 |
53 | //eval over multiple lengths
54 | evalPrecAccumulate(957, 1043);
55 |
56 | //eval over very small lengths
57 | evalPrecAccumulate(3, 23);
58 |
59 | }
60 |
61 |
62 | TEST(TestBin, simpleSummation2)
63 | {
64 | EXPECT_EQ(1, 1);
65 | EXPECT_TRUE(true);
66 |
67 |
68 | BinsT bin;
69 |
70 |
71 |
72 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
73 | EXPECT_EQ(bin.bigSummV.extract(0), 0.0);
74 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
75 | EXPECT_EQ(bin.tinyV.extract(0), 0.0);
76 |
77 |
78 | VecXX::INS testVal =1.0e-16;
79 | bin += testVal;
80 |
81 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
82 | EXPECT_EQ(bin.bigSummV.extract(0), 0.0);
83 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
84 | EXPECT_EQ(bin.tinyV.extract(0), 1.0e-16);
85 |
86 | bin += testVal;
87 |
88 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
89 | EXPECT_EQ(bin.bigSummV.extract(0), 0.0);
90 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
91 | EXPECT_EQ(bin.tinyV.extract(0), 2.0e-16);
92 |
93 |
94 | bin += testVal / 2;
95 |
96 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
97 | EXPECT_EQ(bin.bigSummV.extract(0), 0.0);
98 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
99 | EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16);
100 |
101 | //further tests for the other bins
102 |
103 | testVal = 1.0;
104 |
105 | bin += testVal;
106 |
107 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
108 | EXPECT_EQ(bin.bigSummV.extract(0), 1.0);
109 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
110 | EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16);
111 |
112 | bin += testVal;
113 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
114 | EXPECT_EQ(bin.bigSummV.extract(0), 2.0);
115 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
116 | EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16);
117 |
118 | bin += testVal / 2;
119 |
120 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
121 | EXPECT_EQ(bin.bigSummV.extract(0), 2.0);
122 | EXPECT_EQ(bin.smallSumV.extract(0), 0.5);
123 | EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16);
124 |
125 |
126 |
127 |
128 |
129 | BinsT bin2;
130 |
131 | auto oneThird = 1.0 / 3.0;
132 |
133 |
134 | bin2 += 1.0e-3 * oneThird;
135 |
136 |
137 | EXPECT_EQ(bin2.veryBigSummV.extract(0), 0.0);
138 | EXPECT_EQ(bin2.bigSummV.extract(0), 0.0);
139 | // EXPECT_EQ(bin2.smallSumV.extract(0), 1.0/3.0 *1.0e-3);
140 | // EXPECT_EQ(bin2.tinyV.extract(0), 0.0);
141 |
142 | auto sum = bin2.hsum();
143 |
144 | bin2 = bin2 *100000.0;
145 |
146 | sum = bin2.hsum();
147 |
148 | /*
149 |
150 | //eval over multiple lengths
151 | evalPrecAccumulate(957, 1043);
152 |
153 | //eval over very small lengths
154 | evalPrecAccumulate(3, 23);
155 | */
156 | }
157 |
158 |
--------------------------------------------------------------------------------
/Vectorisation/CMakeLists.txt:
--------------------------------------------------------------------------------
# Static library built from the two VecX translation units.
add_library(Vectorisation STATIC
    VecX/alloc_policy.cpp
    VecX/vec.cpp)

if (MSVC)
    # add_compile_options(/W4 /WX)
else()
    # PUBLIC: the flag is also applied to targets that link against Vectorisation.
    target_compile_options(Vectorisation PUBLIC "-march=native")

    # or -mavx/-mavx2/-mavx512f (and -march= options that imply them with relevant tuning settings)
    #target_compile_options(Vectorisation PUBLIC "--std=c++17")
    #target_compile_options(Vectorisation PUBLIC "-mavx2")
    #target_compile_options(Vectorisation PUBLIC "-mfma")
endif()
15 |
16 |
--------------------------------------------------------------------------------
/Vectorisation/TextFile1.txt:
--------------------------------------------------------------------------------
1 |
Vec4f a(0.0f, 0.5f, 1.0f, 1.5f); // define vector
Vec4f b = sin(a);                // sin function
// b = (0.0000f, 0.4794f, 0.8415f, 0.9975f)
5 |
6 |
--------------------------------------------------------------------------------
/Vectorisation/VCL/README.md:
--------------------------------------------------------------------------------
1 | # version2
2 | Vector Class Library, latest version
3 |
4 | This is a C++ class library for using the Single Instruction Multiple Data (SIMD) instructions to improve performance on modern microprocessors with the x86 or x86/64 instruction set on Windows, Linux, and Mac platforms. There are no plans to support ARM or other instruction sets.
5 |
6 | [Latest release](https://github.com/vectorclass/version2/releases)
7 |
8 | [Download manual](https://github.com/vectorclass/manual/raw/master/vcl_manual.pdf)
9 |
10 | [Add-on packages for particular applications](https://github.com/vectorclass/add-on)
11 |
12 | [Getting-started video.](https://www.youtube.com/watch?v=TKjYdLIMTrI) Video blogger Christopher Rose has made this nice video telling how to get started with the Vector Class Library.
13 |
14 | **Help:** You may ask for programming help on [StackOverflow](https://stackoverflow.com) using the tag vector-class-library.
15 |
--------------------------------------------------------------------------------
/Vectorisation/VCL/dispatch_example1.cpp:
--------------------------------------------------------------------------------
1 | /************************* dispatch_example1.cpp ***************************
2 | Author: Agner Fog
3 | Date created: 2012-05-30
4 | Last modified: 2020-02-25
5 | Version: 2.01.00
6 | Project: vector class library
7 |
8 | Description: Example of automatic CPU dispatching.
9 | This shows how to compile vector code in multiple versions, each
10 | optimized for a different instruction set. The optimal version is
11 | selected by a dispatcher at run time.
12 |
13 | There are two examples of automatic dispatching:
14 |
15 | dispatch_example1.cpp: Uses separate function names for each version.
16 | This is useful for simple cases with one or a few functions.
17 |
18 | dispatch_example2.cpp: Uses separate namespaces for each version.
19 | This is the recommended method for cases with multiple functions,
20 | classes, objects, etc.
21 |
22 | The code has two sections:
23 |
24 | Dispatched code: This code is compiled multiple times to generate multiple instances
25 | of the compiled code, each one optimized for a different instruction set. The
26 | dispatched code section contains the speed-critical part of the program.
27 |
28 | Common code: This code is compiled only once, using the lowest instruction set.
29 | The common code section contains the dispatcher, startup code, user interface, and
30 | other parts of the program that do not need advanced optimization.
31 |
32 | To compile this code, do as in this example:
33 |
34 | # Example of compiling dispatch example with Gnu or Clang compiler:
35 | # Compile dispatch_example1.cpp four times for different instruction sets:
36 |
37 | # Compile for AVX
38 | clang++ -O2 -m64 -mavx -std=c++17 -c dispatch_example1.cpp -od7.o
39 |
40 | # Compile for AVX2
41 | clang++ -O2 -m64 -mavx2 -mfma -std=c++17 -c dispatch_example1.cpp -od8.o
42 |
43 | # Compile for AVX512
44 | clang++ -O2 -m64 -mavx512f -mfma -mavx512vl -mavx512bw -mavx512dq -std=c++17 -c dispatch_example1.cpp -od10.o
45 |
46 | # The last compilation uses the lowest supported instruction set (SSE2)
47 | # This includes the main program, and links all versions together:
48 | # (Change test.exe to test in Linux and Mac)
49 | clang++ -O2 -m64 -msse2 -std=c++17 dispatch_example1.cpp instrset_detect.cpp d7.o d8.o d10.o -otest.exe
50 |
51 | # Run the program
52 | ./test.exe
53 |
54 | (c) Copyright 2012-2020 Agner Fog.
55 | Apache License version 2.0 or later.
56 | ******************************************************************************/
57 |
58 | /* The different instruction sets are defined in instrset_detect.cpp:
59 | 2: SSE2
60 | 3: SSE3
61 | 4: SSSE3 (Supplementary SSE3)
62 | 5: SSE4.1
63 | 6: SSE4.2
64 | 7: AVX
65 | 8: AVX2
66 | 9: AVX512F
67 | 10: AVX512VL + AVX512BW + AVX512DQ
68 | */
69 |
70 |
#include <stdio.h>
72 | #include "vectorclass.h"
73 |
74 | // Define function type
75 | // Change this to fit the entry function. Should not contain vector types:
76 | typedef float MyFuncType(float const []);
77 |
78 | // function prototypes for each version
79 | MyFuncType myfunc_SSE2, myfunc_AVX, myfunc_AVX2, myfunc_AVX512;
80 |
81 | // function prototypes for common entry point and dispatcher
82 | MyFuncType myfunc, myfunc_dispatch;
83 |
84 | // Define name of entry function depending on which instruction set we compile for
85 | #if INSTRSET >= 10 // AVX512VL
86 | #define FUNCNAME myfunc_AVX512
87 | #elif INSTRSET >= 8 // AVX2
88 | #define FUNCNAME myfunc_AVX2
89 | #elif INSTRSET >= 7 // AVX
90 | #define FUNCNAME myfunc_AVX
91 | #elif INSTRSET == 2
92 | #define FUNCNAME myfunc_SSE2 // SSE2
93 | #else
94 | #error Unsupported instruction set
95 | #endif
96 |
97 | /******************************************************************************
98 | Dispatched code
99 |
100 | Everything in this section is compiled multiple times, with one version for
101 | each instruction set. Speed-critical vector code belongs here.
102 | ******************************************************************************/
103 |
104 | // This is the dispatched function that is compiled in multiple versions with different names.
105 | // Make sure this function is static to prevent clash with other versions having the same name.
106 | // The function cannot be member of a class.
107 | static float sum (float const f[]) {
108 | // This example adds 16 floats
109 | Vec16f a; // vector of 16 floats
110 | a.load(f); // load array into vector
111 | return horizontal_add(a); // return sum of 16 elements
112 | }
113 |
114 | // -----------------------------------------------------------------------------
115 | // Entry function
116 | // -----------------------------------------------------------------------------
117 | // This is the entry function that is accessed through the dispatcher.
118 | // This serves as the interface between the common code and the dispatched code.
119 | // The entry function cannot be member of a class.
120 | // The entry function must use arrays rather than vectors for input and output.
// FUNCNAME expands to the myfunc_* name chosen by the INSTRSET checks above,
// so each compilation of this file emits one differently named entry point.
// It forwards the array unchanged to the file-local sum().
float FUNCNAME (float const f[]) {
    return sum(f);
}
124 |
125 |
126 | /**********************************************************************************
127 | Common code
128 |
129 | Everything in this section is compiled only once, using the lowest instruction set.
130 |
131 | The dispatcher must be placed here. Program main(), user interface, and other
132 | less critical parts of the code are also placed in the common code section.
133 | **********************************************************************************/
134 |
135 | #if INSTRSET == 2
136 | // The common code is only included in the lowest of the compiled versions
137 |
138 |
139 | // ---------------------------------------------------------------------------------
// Dispatcher
141 | // ---------------------------------------------------------------------------------
142 | // This function pointer initially points to the dispatcher.
143 | // After the first call, it points to the selected version of the entry function
144 | MyFuncType * myfunc_pointer = &myfunc_dispatch; // function pointer
145 |
146 | // Dispatcher
147 | float myfunc_dispatch(float const f[]) {
148 | int iset = instrset_detect(); // Detect supported instruction set
149 | // Choose which version of the entry function we want to point to:
150 | if (iset >= 10) myfunc_pointer = &myfunc_AVX512; // AVX512 version
151 | else if (iset >= 8) myfunc_pointer = &myfunc_AVX2; // AVX2 version
152 | else if (iset >= 7) myfunc_pointer = &myfunc_AVX; // AVX version
153 | else if (iset >= 2) myfunc_pointer = &myfunc_SSE2; // SSE2 version
154 | else {
155 | // Error: lowest instruction set not supported.
156 | // Put any appropriate error handler here
157 | fprintf(stderr, "\nError: Instruction set SSE2 not supported on this computer");
158 | return 0.f;
159 | }
160 | // continue in dispatched version of the function
161 | return (*myfunc_pointer)(f);
162 | }
163 |
164 |
165 | // Call the entry function through the function pointer.
166 | // The first time this function is called, it goes through the dispatcher.
167 | // The dispatcher will change the function pointer so that all subsequent
168 | // calls go directly to the optimal version of the entry function
// Public entry point: calls whatever myfunc_pointer currently targets
// (the dispatcher until it has replaced the pointer, a concrete version afterwards).
inline float myfunc(float const f[]) {
    return (*myfunc_pointer)(f);                // go to dispatched version
}
172 |
173 |
174 | // ---------------------------------------------------------------------------------
175 | // Program main
176 | // ---------------------------------------------------------------------------------
177 | int main() {
178 |
179 | // array of 16 floats
180 | float const a[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
181 |
182 | float sum = myfunc(a); // call function with dispatching
183 |
184 | printf("\nsum = %8.2f \n", sum); // print result (= 136.00)
185 |
186 | return 0;
187 | }
188 |
189 | #endif // INSTRSET == 2
190 |
--------------------------------------------------------------------------------
/Vectorisation/VCL/instrset_detect.cpp:
--------------------------------------------------------------------------------
1 | /************************** instrset_detect.cpp ****************************
2 | * Author: Agner Fog
3 | * Date created: 2012-05-30
4 | * Last modified: 2019-08-01
5 | * Version: 2.00.00
6 | * Project: vector class library
7 | * Description:
8 | * Functions for checking which instruction sets are supported.
9 | *
10 | * (c) Copyright 2012-2019 Agner Fog.
11 | * Apache License version 2.0 or later.
12 | ******************************************************************************/
13 |
14 | #include "instrset.h"
15 |
16 | #ifdef VCL_NAMESPACE
17 | namespace VCL_NAMESPACE {
18 | #endif
19 |
20 |
21 | // Define interface to xgetbv instruction
// Executes XGETBV for the register index ctr and returns the 64-bit result.
// Three compiler-specific paths: the _xgetbv intrinsic, Gnu/AT&T inline
// assembly, or MASM-style inline assembly. The assembly paths combine EAX
// (low 32 bits) and EDX (high 32 bits).
static inline uint64_t xgetbv (int ctr) {
#if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
    // Microsoft or Intel compiler supporting _xgetbv intrinsic

    return uint64_t(_xgetbv(ctr));                    // intrinsic function for XGETBV

#elif defined(__GNUC__) || defined (__clang__)         // use inline assembly, Gnu/AT&T syntax

    uint32_t a, d;
    __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
    return a | (uint64_t(d) << 32);

#else  // #elif defined (_WIN32)                       // other compiler. try inline assembly with masm/intel/MS syntax
    uint32_t a, d;
    __asm {
        mov ecx, ctr
        _emit 0x0f
        _emit 0x01
        _emit 0xd0 ; // xgetbv
        mov a, eax
        mov d, edx
    }
    return a | (uint64_t(d) << 32);

#endif
}
48 |
49 | /* find supported instruction set
50 | return value:
51 | 0 = 80386 instruction set
52 | 1 or above = SSE (XMM) supported by CPU (not testing for OS support)
53 | 2 or above = SSE2
54 | 3 or above = SSE3
55 | 4 or above = Supplementary SSE3 (SSSE3)
56 | 5 or above = SSE4.1
57 | 6 or above = SSE4.2
58 | 7 or above = AVX supported by CPU and operating system
59 | 8 or above = AVX2
60 | 9 or above = AVX512F
61 | 10 or above = AVX512VL, AVX512BW, AVX512DQ
62 | */
// Returns the highest supported instruction-set level (0..10).
// The feature bits are tested in ascending order and the function returns the
// level reached at the first missing feature. The result is stored in a
// function-local static, so later calls return it without re-running CPUID.
int instrset_detect(void) {

    static int iset = -1;                                  // remember value for next call
    if (iset >= 0) {
        return iset;                                       // called before
    }
    iset = 0;                                              // default value
    int abcd[4] = {0,0,0,0};                               // cpuid results
    cpuid(abcd, 0);                                        // call cpuid function 0
    if (abcd[0] == 0) return iset;                         // no further cpuid function supported
    cpuid(abcd, 1);                                        // call cpuid function 1 for feature flags
    if ((abcd[3] & (1 <<  0)) == 0) return iset;           // no floating point
    if ((abcd[3] & (1 << 23)) == 0) return iset;           // no MMX
    if ((abcd[3] & (1 << 15)) == 0) return iset;           // no conditional move
    if ((abcd[3] & (1 << 24)) == 0) return iset;           // no FXSAVE
    if ((abcd[3] & (1 << 25)) == 0) return iset;           // no SSE
    iset = 1;                                              // 1: SSE supported
    if ((abcd[3] & (1 << 26)) == 0) return iset;           // no SSE2
    iset = 2;                                              // 2: SSE2 supported
    if ((abcd[2] & (1 <<  0)) == 0) return iset;           // no SSE3
    iset = 3;                                              // 3: SSE3 supported
    if ((abcd[2] & (1 <<  9)) == 0) return iset;           // no SSSE3
    iset = 4;                                              // 4: SSSE3 supported
    if ((abcd[2] & (1 << 19)) == 0) return iset;           // no SSE4.1
    iset = 5;                                              // 5: SSE4.1 supported
    if ((abcd[2] & (1 << 23)) == 0) return iset;           // no POPCNT
    if ((abcd[2] & (1 << 20)) == 0) return iset;           // no SSE4.2
    iset = 6;                                              // 6: SSE4.2 supported
    if ((abcd[2] & (1 << 27)) == 0) return iset;           // no OSXSAVE
    if ((xgetbv(0) & 6) != 6)       return iset;           // AVX not enabled in O.S.
    if ((abcd[2] & (1 << 28)) == 0) return iset;           // no AVX
    iset = 7;                                              // 7: AVX supported
    cpuid(abcd, 7);                                        // call cpuid leaf 7 for feature flags
    if ((abcd[1] & (1 <<  5)) == 0) return iset;           // no AVX2
    iset = 8;                                              // 8: AVX2 supported
    if ((abcd[1] & (1 << 16)) == 0) return iset;           // no AVX512
    cpuid(abcd, 0xD);                                      // call cpuid leaf 0xD for feature flags
    if ((abcd[0] & 0x60) != 0x60)   return iset;           // no AVX512
    iset = 9;                                              // 9: AVX512F supported
    cpuid(abcd, 7);                                        // call cpuid leaf 7 for feature flags
    if ((abcd[1] & (1 << 31)) == 0) return iset;           // no AVX512VL
    if ((abcd[1] & 0x40020000) != 0x40020000) return iset; // no AVX512BW, AVX512DQ
    iset = 10;                                             // 10: AVX512VL + AVX512BW + AVX512DQ supported
    return iset;
}
108 |
109 | // detect if CPU supports the FMA3 instruction set
110 | bool hasFMA3(void) {
111 | if (instrset_detect() < 7) return false; // must have AVX
112 | int abcd[4]; // cpuid results
113 | cpuid(abcd, 1); // call cpuid function 1
114 | return ((abcd[2] & (1 << 12)) != 0); // ecx bit 12 indicates FMA3
115 | }
116 |
117 | // detect if CPU supports the FMA4 instruction set
118 | bool hasFMA4(void) {
119 | if (instrset_detect() < 7) return false; // must have AVX
120 | int abcd[4]; // cpuid results
121 | cpuid(abcd, 0x80000001); // call cpuid function 0x80000001
122 | return ((abcd[2] & (1 << 16)) != 0); // ecx bit 16 indicates FMA4
123 | }
124 |
125 | // detect if CPU supports the XOP instruction set
126 | bool hasXOP(void) {
127 | if (instrset_detect() < 7) return false; // must have AVX
128 | int abcd[4]; // cpuid results
129 | cpuid(abcd, 0x80000001); // call cpuid function 0x80000001
130 | return ((abcd[2] & (1 << 11)) != 0); // ecx bit 11 indicates XOP
131 | }
132 |
133 | // detect if CPU supports the F16C instruction set
134 | bool hasF16C(void) {
135 | if (instrset_detect() < 7) return false; // must have AVX
136 | int abcd[4]; // cpuid results
137 | cpuid(abcd, 1); // call cpuid function 1
138 | return ((abcd[2] & (1 << 29)) != 0); // ecx bit 29 indicates F16C
139 | }
140 |
141 | // detect if CPU supports the AVX512ER instruction set
142 | bool hasAVX512ER(void) {
143 | if (instrset_detect() < 9) return false; // must have AVX512F
144 | int abcd[4]; // cpuid results
145 | cpuid(abcd, 7); // call cpuid function 7
146 | return ((abcd[1] & (1 << 27)) != 0); // ebx bit 27 indicates AVX512ER
147 | }
148 |
149 | // detect if CPU supports the AVX512VBMI instruction set
150 | bool hasAVX512VBMI(void) {
151 | if (instrset_detect() < 10) return false; // must have AVX512BW
152 | int abcd[4]; // cpuid results
153 | cpuid(abcd, 7); // call cpuid function 7
154 | return ((abcd[2] & (1 << 1)) != 0); // ecx bit 1 indicates AVX512VBMI
155 | }
156 |
157 | // detect if CPU supports the AVX512VBMI2 instruction set
158 | bool hasAVX512VBMI2(void) {
159 | if (instrset_detect() < 10) return false; // must have AVX512BW
160 | int abcd[4]; // cpuid results
161 | cpuid(abcd, 7); // call cpuid function 7
162 | return ((abcd[2] & (1 << 6)) != 0); // ecx bit 6 indicates AVX512VBMI2
163 | }
164 |
165 | #ifdef VCL_NAMESPACE
166 | }
167 | #endif
168 |
--------------------------------------------------------------------------------
/Vectorisation/VCL/vectorclass.h:
--------------------------------------------------------------------------------
1 | /**************************** vectorclass.h ********************************
2 | * Author: Agner Fog
3 | * Date created: 2012-05-30
4 | * Last modified: 2020-04-11
5 | * Version: 2.01.02
6 | * Project: vector class library
7 | * Home: https://github.com/vectorclass
8 | * Description:
9 | * Header file defining vector classes as interface to intrinsic functions
10 | * in x86 and x86-64 microprocessors with SSE2 and later instruction sets.
11 | *
12 | * Instructions:
13 | * Use Gnu, Clang, Intel or Microsoft C++ compiler. Compile for the desired
14 | * instruction set, which must be at least SSE2. Specify the supported
15 | * instruction set by a command line define, e.g. __SSE4_1__ if the
16 | * compiler does not automatically do so.
17 | * For detailed instructions, see vcl_manual.pdf
18 | *
19 | * Each vector object is represented internally in the CPU as a vector
20 | * register with 128, 256 or 512 bits.
21 | *
22 | * This header file includes the appropriate header files depending on the
23 | * selected instruction set.
24 | *
25 | * (c) Copyright 2012-2020 Agner Fog.
26 | * Apache License version 2.0 or later.
27 | ******************************************************************************/
28 | #ifndef VECTORCLASS_H
29 | #define VECTORCLASS_H 20102
30 |
31 | // Maximum vector size, bits. Allowed values are 128, 256, 512
32 | #ifndef MAX_VECTOR_SIZE
33 | #define MAX_VECTOR_SIZE 512
34 | #endif
35 |
36 | // Determine instruction set, and define platform-dependent functions
37 | #include "instrset.h" // Select supported instruction set
38 |
39 | #if INSTRSET < 2 // instruction set SSE2 is the minimum
40 | #error Please compile for the SSE2 instruction set or higher
41 | #else
42 |
43 | // Select appropriate .h files depending on instruction set
44 | #include "vectori128.h" // 128-bit integer vectors
45 | #include "vectorf128.h" // 128-bit floating point vectors
46 |
47 | #if MAX_VECTOR_SIZE >= 256
48 | #if INSTRSET >= 8
49 | #include "vectori256.h" // 256-bit integer vectors, requires AVX2 instruction set
50 | #else
51 | #include "vectori256e.h" // 256-bit integer vectors, emulated
52 | #endif // INSTRSET >= 8
53 | #if INSTRSET >= 7
54 | #include "vectorf256.h" // 256-bit floating point vectors, requires AVX instruction set
55 | #else
56 | #include "vectorf256e.h" // 256-bit floating point vectors, emulated
57 | #endif // INSTRSET >= 7
58 | #endif // MAX_VECTOR_SIZE >= 256
59 |
60 | #if MAX_VECTOR_SIZE >= 512
61 | #if INSTRSET >= 9
62 | #include "vectori512.h" // 512-bit vectors of 32 and 64 bit integers, requires AVX512F instruction set
63 | #include "vectorf512.h" // 512-bit floating point vectors, requires AVX512F instruction set
64 | #else
65 | #include "vectori512e.h" // 512-bit integer vectors, emulated
66 | #include "vectorf512e.h" // 512-bit floating point vectors, emulated
67 | #endif // INSTRSET >= 9
68 | #if INSTRSET >= 10
69 | #include "vectori512s.h" // 512-bit vectors of 8 and 16 bit integers, requires AVX512BW instruction set
70 | #else
71 | #include "vectori512se.h" // 512-bit vectors of 8 and 16 bit integers, emulated
72 | #endif
73 | #endif // MAX_VECTOR_SIZE >= 512
74 |
75 | #include "vector_convert.h" // conversion between different vector sizes
76 |
77 | #endif // INSTRSET >= 2
78 |
79 |
80 | #else // VECTORCLASS_H
81 |
82 | #if VECTORCLASS_H < 20000
83 | #error Mixed versions of vector class library
84 | #endif
85 |
86 | #endif // VECTORCLASS_H
87 |
--------------------------------------------------------------------------------
/Vectorisation/VecX/alloc_policy.cpp:
--------------------------------------------------------------------------------
1 | /**************************** alloc_policy.cpp *******************************
2 | * Author: Andrew Drakeford
3 | * Date created: 2021-04-10
4 | * Last modified: 2021-04-10
5 | * Version: 1.0
6 | * Project: DR Cubed
7 | * Description:
8 | *
9 | * (c) Copyright 2019 Andrew Drakeford
10 | * Apache License version 2.0 or later.
11 | *****************************************************************************/
12 | #include "alloc_policy.h"
13 | #include "alloc_policy_imp.h"
14 | #include
15 |
16 | template<>
17 | int AllAllocators::lastSize_N = -1;
18 | template<>
19 | int AllAllocators::lastSize_N = -1;
20 | template<>
21 | int AllAllocators::lastSize_N = -1;
22 | template<>
23 | int AllAllocators::lastSize_N = -1;
24 |
25 | template<>
26 | AllocPolicy* AllAllocators::pAllocPolicy = nullptr;
27 | template<>
28 | AllocPolicy* AllAllocators::pAllocPolicy = nullptr;
29 | template<>
30 | AllocPolicy* AllAllocators::pAllocPolicy = nullptr;
31 | template<>
32 | AllocPolicy* AllAllocators::pAllocPolicy = nullptr;
33 | template<>
34 | std::unordered_map*> AllAllocators::m_map_sizeToAllocPolicy = std::unordered_map*>();
35 | template<>
36 | std::unordered_map*> AllAllocators::m_map_sizeToAllocPolicy = std::unordered_map*>();
37 | template<>
38 | std::unordered_map*> AllAllocators::m_map_sizeToAllocPolicy = std::unordered_map*>();
39 | template<>
40 | std::unordered_map*> AllAllocators::m_map_sizeToAllocPolicy = std::unordered_map*>();
41 |
42 |
43 |
44 | void freePool(size_t N, long double* pOld)
45 | {
46 | return freeT(N, pOld);
47 | }
48 |
49 |
50 | void freePool(size_t N, double* pOld)
51 | {
52 | return freeT(N, pOld);
53 | }
54 |
55 | void freePool(size_t N, float* pOld)
56 | {
57 | return freeT(N, pOld);
58 | }
59 |
60 | void freePool(size_t N, unsigned int* pOld)
61 | {
62 | return freeT(N, pOld);
63 | }
64 |
65 | void allocPool(size_t& N, long double*& pMem)
66 | {
67 | allocT(N, pMem);
68 | }
69 |
70 | void allocPool(size_t& N, double*& pMem)
71 | {
72 | allocT(N, pMem);
73 | }
74 |
75 | void allocPool(size_t& N, float*& pMem)
76 | {
77 | allocT(N, pMem);
78 | }
79 |
80 | void allocPool(size_t& N, unsigned int*& pMem)
81 | {
82 | allocT(N, pMem);
83 | }
84 |
85 | int getAllignedSize(size_t N, long double* pOld)
86 | {
87 | return getAllignedSizeT(N, pOld);
88 | }
89 |
90 | int getAllignedSize(size_t N, double* pOld)
91 | {
92 | return getAllignedSizeT(N, pOld);
93 | }
94 |
95 | int getAllignedSize(size_t N, float* pOld)
96 | {
97 | return getAllignedSizeT(N, pOld);
98 | }
99 |
100 | int getAllignedSize(size_t N, unsigned int* pOld)
101 | {
102 | return getAllignedSizeT(N, pOld);
103 | }
104 | void freeAllAllocators(long double)
105 | {
106 | AllAllocators::freeAll();
107 | }
108 | void freeAllAllocators(double)
109 | {
110 | AllAllocators::freeAll();
111 | }
112 | void freeAllAllocators(float)
113 | {
114 | AllAllocators::freeAll();
115 | }
116 | void freeAllAllocators(unsigned int)
117 | {
118 | AllAllocators::freeAll();
119 | }
--------------------------------------------------------------------------------
/Vectorisation/VecX/alloc_policy.h:
--------------------------------------------------------------------------------
1 | /**************************** alloc_policy.h *******************************
2 | * Author: Andrew Drakeford
3 | * Date created: 2021-04-10
4 | * Last modified: 2021-04-10
5 | * Version: 1.0
6 | * Project: DR Cubed
7 | * Description:
8 | *
9 | * (c) Copyright 2019 Andrew Drakeford
10 | * Apache License version 2.0 or later.
11 | *****************************************************************************/
12 | #pragma once
13 |
14 | #include
15 |
16 | void freePool(size_t N, long double* pOld);
17 | void freePool(size_t N, double* pOld);
18 | void freePool(size_t N, float* pOld);
19 | void freePool(size_t N, unsigned int* pOld);
20 |
// Obtain a pooled block. N is in/out (rewritten to the aligned element count)
// and pMem receives the address of the block. Parameter names now match the
// definitions in alloc_policy.cpp (two of them were previously called pOld).
void allocPool(size_t& N, long double*& pMem);
void allocPool(size_t& N, double*& pMem);
void allocPool(size_t& N, float*& pMem);
void allocPool(size_t& N, unsigned int*& pMem);
25 |
26 | int getAllignedSize(size_t N, long double* pOld);
27 | int getAllignedSize(size_t N, double* pOld);
28 | int getAllignedSize(size_t N, float* pOld);
29 | int getAllignedSize(size_t N, unsigned int* pOld);
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/Vectorisation/VecX/alloc_policy_imp.h:
--------------------------------------------------------------------------------
1 | /**************************** alloc_policy_imp.h *******************************
2 | * Author: Andrew Drakeford
3 | * Date created: 2021-04-10
4 | * Last modified: 2021-04-10
5 | * Version: 1.0
6 | * Project: DR Cubed
7 | * Description:
8 | *
9 | * (c) Copyright 2019 Andrew Drakeford
10 | * Apache License version 2.0 or later.
11 | *****************************************************************************/
12 | #pragma once
13 | #include
14 | #include
15 |
16 |
17 | //need a function to reduce size pools to a minimum
18 | // get rid of magic numbers of byte sizes etc
19 |
// Cache-line size in bytes; NumOnCacheLine divides it by sizeof(T).
const int BytesOnCacheLine = 64;
// Number of blocks carved out when a pool is first created.
const long MemPoolInitialIncrement = 16;
// Factor applied to the increment each time the pool runs out of blocks.
const long MemPoolScaleFactor = 2;
// Byte boundary to which the first block of every slab is advanced.
const int ByteAllignment = 64;

/**
 * Stack-like pool of equally sized blocks of T.
 *
 * Memory is obtained in slabs (std::vector<T>), each cut into blocks of
 * m_vecSize elements. m_memPool lists every block; the first m_pos entries are
 * the blocks currently handed out and the rest are free. alloc() hands out
 * m_memPool[m_pos]; free() normally pops the top entry, and otherwise moves the
 * freed block up to the top slot before popping.
 *
 * The pool is neither copyable nor movable.
 */
template<typename T>
class PoolStrat
{
public:

    PoolStrat(const PoolStrat&) = delete;
    PoolStrat& operator=(const PoolStrat&) = delete;
    PoolStrat& operator=(PoolStrat&&) = delete;
    PoolStrat(PoolStrat&&) = delete;

    /// Creates a pool of blocks of vecSz elements and allocates the first slab.
    explicit PoolStrat(int vecSz)
        : m_pos(0), m_sz(0), m_incrementSize(MemPoolInitialIncrement), m_vecSize(vecSz)
    {
        addToPool(static_cast<int>(m_incrementSize));
    }

    /// Releases every slab; all blocks handed out become invalid.
    ~PoolStrat()
    {
        for (std::vector<T>* slab : m_allocatedVecs)
        {
            delete slab;
        }
    }

    /// Returns the next free block, growing the pool first when needed.
    T* alloc()
    {
        // The bound is m_sz - 1, so the last slot is kept in reserve and growth
        // happens one block early (behaviour kept from the original code).
        while (m_pos >= (m_sz - 1))
        {
            m_incrementSize *= MemPoolScaleFactor;
            addToPool(static_cast<int>(m_incrementSize));
        }
        return m_memPool[m_pos++];
    }

    /**
     * Returns a block to the pool.
     * Null pointers, pointers that are not currently handed out (including a
     * repeated free of the same block) and calls on an empty pool are ignored.
     */
    void free(T* pToFree)
    {
        if ((m_pos <= 0) || (nullptr == pToFree))
        {
            return;
        }

        // Usual case: blocks are released in reverse order of allocation.
        if (m_memPool[m_pos - 1] == pToFree)
        {
            m_pos--;
            return;
        }

        // Out-of-order release: search only the live blocks below the top.
        // The search used to start at index m_pos, which is a free slot; a
        // second free of that block then decremented m_pos and released a
        // block that was still in use.
        for (long i = m_pos - 2; i >= 0; --i)
        {
            if (m_memPool[i] == pToFree)
            {
                // Bubble the freed block up to the top live slot, keeping the
                // relative order of the others, then pop it.
                for (long k = i; k < m_pos - 1; ++k)
                {
                    std::swap(m_memPool[k], m_memPool[k + 1]);
                }
                m_pos--;
                return;
            }
        }
    }

    /// Allocates one slab and appends numElements blocks cut from it.
    void addToPool(int numElements)
    {
        // The slab is over-allocated by offsetAlgn elements so the start can be
        // advanced to the next ByteAllignment boundary.
        const size_t offsetAlgn = ByteAllignment;
        const size_t slabLength = static_cast<size_t>(numElements) * static_cast<size_t>(m_vecSize) + offsetAlgn;
        std::vector<T>* pVecsMem = new std::vector<T>(slabLength);
        m_allocatedVecs.push_back(pVecsMem);

        T* pstrtPt = &((*pVecsMem)[0]);
        while ((reinterpret_cast<size_t>(pstrtPt)) % offsetAlgn) pstrtPt++;

        for (int i = 0; i < numElements; i++)
        {
            m_memPool.push_back(pstrtPt);
            pstrtPt += m_vecSize;
        }

        m_sz += numElements;
    }

    /// Number of blocks currently handed out.
    inline long pos() const
    {
        return m_pos;
    }

    /// Total number of blocks owned by the pool.
    inline long size() const
    {
        return m_sz;
    }

    /// Read-only access to the slabs backing the pool.
    const std::vector<std::vector<T>* >& getAllocVecs() const
    {
        return m_allocatedVecs; // the return statement was missing
    }

private:
    long m_pos;                   // index of the next block to hand out
    long m_sz;                    // total number of blocks in m_memPool
    std::vector<T*> m_memPool;    // all blocks; [0, m_pos) are in use
    long m_incrementSize;         // next number of vectors for allocation
    long m_vecSize;               // size of element vector considering alignment and padding
    std::vector<std::vector<T>* > m_allocatedVecs; // owning pointers to the slabs

};
156 |
157 |
158 | //////////////////////////////////////////
159 |
160 |
161 | template
162 | class AllocPolicy
163 | {
164 | int m_vec_size;
165 | PoolStrat* m_pool;
166 | public:
167 | int size() const
168 | {
169 | return m_vec_size;
170 | }
171 |
172 | AllocPolicy(int size) :m_vec_size(size)
173 | {
174 | m_pool = new PoolStrat(size);
175 | }
176 | ~AllocPolicy()
177 | {
178 | delete m_pool;
179 | }
180 |
181 |
182 | inline T* alloc()
183 | {
184 | return m_pool->alloc();
185 | }
186 |
187 | inline void free(T* pElement)
188 | {
189 | m_pool->free(pElement);
190 | }
191 |
192 | };
193 |
194 |
195 |
196 | template
197 | class AllAllocators
198 | {
199 | static int lastSize_N;
200 | static AllocPolicy* pAllocPolicy;
201 | static std::unordered_map*> m_map_sizeToAllocPolicy;
202 |
203 |
204 | static void setUpPolicy(int size_N)
205 | {
206 | auto itr = m_map_sizeToAllocPolicy.find(size_N);
207 | if (m_map_sizeToAllocPolicy.end() == itr)
208 | {
209 | pAllocPolicy = new AllocPolicy(size_N);
210 | m_map_sizeToAllocPolicy[size_N] = pAllocPolicy;
211 | }
212 | }
213 |
214 |
215 |
216 | public:
217 |
218 | static void removePolicy(int size_N)
219 | {
220 | auto itr = m_map_sizeToAllocPolicy.find(size_N);
221 | if (m_map_sizeToAllocPolicy.end() != itr)
222 | {
223 | auto policyPtr = m_map_sizeToAllocPolicy[size_N];
224 | delete policyPtr;
225 | m_map_sizeToAllocPolicy.erase(itr);
226 | }
227 |
228 | }
229 |
230 | static void freeAll()
231 | {
232 | for (auto& item : m_map_sizeToAllocPolicy)
233 | {
234 | delete item.second;
235 | }
236 | m_map_sizeToAllocPolicy.clear();
237 | }
238 |
239 |
240 | static T* alloc(int size_N)
241 | {
242 | if (lastSize_N == size_N)
243 | {
244 | return pAllocPolicy->alloc();
245 | }
246 |
247 | setUpPolicy(size_N);
248 |
249 | pAllocPolicy = m_map_sizeToAllocPolicy[size_N];
250 | lastSize_N = size_N;
251 | return pAllocPolicy->alloc();
252 | }
253 |
254 |
255 |
256 | static void free(size_t size_N, T* pMem)
257 | {
258 | int sz_N = static_cast(size_N);
259 |
260 | if (lastSize_N == sz_N)
261 | {
262 | return pAllocPolicy->free(pMem);
263 | }
264 |
265 | setUpPolicy(sz_N);
266 | pAllocPolicy = m_map_sizeToAllocPolicy[sz_N];
267 | lastSize_N = sz_N;
268 | return pAllocPolicy->free(pMem);
269 |
270 | }
271 |
272 |
273 | };
274 |
275 | template< typename T>
276 | struct NumOnCacheLine
277 | {
278 | static inline int size()
279 | {
280 | return BytesOnCacheLine / sizeof(T);
281 | }
282 | };
283 |
284 |
285 | template
286 | int getAllignedSizeT(size_t N, T*)
287 | {
288 | const int M = NumOnCacheLine::size();
289 | size_t res = (N % M == 0) ? N : (N / M + 1) * M;
290 | return static_cast(res);
291 | }
292 |
293 |
294 |
295 | template< typename T>
296 | void allocT(size_t& N, T*& pMem)
297 | {
298 | int n = getAllignedSize(N, pMem);
299 | N = static_cast(n);
300 | pMem = AllAllocators::alloc(n);
301 | }
302 |
303 | template< typename T>
304 | void freeT(size_t N, T* pOld)
305 | {
306 | //find element and mark as unused
307 | return AllAllocators::free(N, pOld);
308 |
309 | }
310 | void freeAllAllocators(long double);
311 | void freeAllAllocators(double);
312 | void freeAllAllocators(float);
313 | void freeAllAllocators(unsigned int);
314 |
315 |
/// Scope guard: its destructor calls freeAllAllocators for element type T.
template<typename T>
class AllAllocatorsGuard
{
public:
    ~AllAllocatorsGuard()
    {
        // T() is only a tag selecting the overload for this element type.
        freeAllAllocators(T());
    }

};
326 |
327 |
328 |
329 |
--------------------------------------------------------------------------------
/Vectorisation/VecX/apply_operation.h:
--------------------------------------------------------------------------------
1 | /**************************** apply_operation.h *******************************
2 | * Author: Andrew Drakeford
3 | * Date created: 2021-04-10
4 | * Last modified: 2021-04-10
5 | * Version: 1.0
6 | * Project: DR Cubed
7 | * Description:
8 | *
9 | * (c) Copyright 2019 Andrew Drakeford
10 | * Apache License version 2.0 or later.
11 | *****************************************************************************/
12 | #pragma once
13 | #include "vec.h"
14 | #include "vec_double.h"
15 | #include "instruction_traits.h"
16 | #include "boolean_operations.h"
17 | #include "accumulate_transform.h"
18 | #include "binary_unitary_operations.h"
19 | #include "math_ops.h"
20 | #include "filter_select.h"
21 | #include "conditional_select_eval.h"
22 | #include "vec_view.h"
23 | #include "vcl_latest.h"
24 |
25 | #include
26 |
27 |
28 |
29 | template
30 | static INS_VEC cdfnormD(INS_VEC x)
31 | {
32 |
33 | auto asNumber = [](auto x)
34 | {
35 | return static_cast::FloatType>(x);
36 | };
37 |
38 | // https://mathworld.wolfram.com/Erfc.html
39 | constexpr typename InstructionTraits::FloatType invRootPi = asNumber(0.564189583547756);
40 | constexpr typename InstructionTraits::FloatType invRootTwo =asNumber( 0.707106781186548);
41 | return invRootTwo * invRootPi*exp(-0.5*x*x);
42 | }
43 |
44 |
45 | /**/
46 | template
47 | static INS_VEC cdfnorm(const INS_VEC& z)
48 | {
49 |
50 | auto asNumber = [](auto x)
51 | {
52 | return static_cast::FloatType>(x);
53 | };
54 |
55 | auto asInsVec = [&](auto x){ return INS_VEC(asNumber(x) ); };
56 |
57 |
58 | // https://mathworld.wolfram.com/Erfc.html
59 | INS_VEC b1 = asInsVec(0.31938153);
60 | INS_VEC b2 = asInsVec(-0.356563782);
61 | INS_VEC b3 = asInsVec(1.781477937);
62 | INS_VEC b4 = asInsVec(-1.821255978);
63 | INS_VEC b5 = asInsVec(1.330274429);
64 | INS_VEC p = asInsVec(0.2316419);
65 | INS_VEC c2 = asInsVec(0.3989423);
66 |
67 | // const auto cond1 = (z > asInsVec(6.0));
68 | // INS_VEC x = select(cond1, asInsVec(1.0), z);
69 | // x = x;
70 |
71 | // INS_VEC y = select( (z < asInsVec(-6.0)),asInsVec(0.0), z);
72 | // y = y;
73 | INS_VEC a = abs(z);
74 | INS_VEC t = asInsVec(1.0) / (asInsVec(1.0) + a*p);
75 | INS_VEC b = c2*exp((-z)*(z / asInsVec(2.0)));
76 | INS_VEC n = ((((b5*t + b4)*t + b3)*t + b2)*t + b1)*t;
77 | n = asInsVec(1.0) - b*n;
78 | n = select( (z < asInsVec(0.0) ), asInsVec(1.0) - n,n);
79 | return n;
80 | }
81 |
82 |
83 |
84 | template
85 | Vec cdfnorm(const Vec& X)
86 | {
87 | using FLOAT = typename InstructionTraits::FloatType;
88 |
89 | auto asNumber = []( auto x) constexpr
90 | {
91 | return static_cast(x);
92 | };
93 |
94 | auto centralLambda = [&](auto z)
95 | {
96 |
97 | constexpr FLOAT N[] = { FLOAT(3.52624965998911e-02) , FLOAT(0.700383064443688), FLOAT(6.37396220353165), FLOAT(33.912866078383), FLOAT(112.079291497871), FLOAT(221.213596169931), FLOAT(220.206867912376) };
98 | constexpr FLOAT M[] = { FLOAT(8.83883476483184e-02), FLOAT(1.75566716318264), FLOAT(16.064177579207), FLOAT(86.7807322029461) , FLOAT(296.564248779674), FLOAT(637.333633378831), FLOAT(793.826512519948),FLOAT(440.413735824752) };
99 |
100 | auto inv_dc = 1.0 / mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(M[0], z, M[1]), z, M[2]), z, M[3]), z, M[4]), z, M[5]), z, M[6]), z, M[7]);
101 | auto n_c = mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(N[0], z, N[1]), z, N[2]), z, N[3]), z, N[4]), z, N[5]), z, N[6]);
102 |
103 | return n_c * inv_dc;
104 | };
105 |
106 |
107 | auto outerLambda = [&](auto z)
108 | {
109 | constexpr FLOAT inv_RT2PI(0.39894228040143267793994605993438);
110 | constexpr FLOAT d[] = { FLOAT(20.) , FLOAT(13.), FLOAT(200.), FLOAT(78.), FLOAT(300.), FLOAT(39.) };
111 | constexpr FLOAT n[] = { FLOAT(20.), FLOAT(13.), FLOAT(180.), FLOAT(65.), FLOAT(160.) };
112 |
113 | auto d_outer = mul_add(mul_add(mul_add(mul_add(mul_add((d[0] * z), z, d[1]), z, d[2]), z, d[3]), z, d[4]), z, d[5]);
114 | auto inv_d_outer = inv_RT2PI / d_outer;
115 |
116 | auto n_outer = mul_add(mul_add(mul_add(mul_add((n[0] * z), z, n[1]), z, n[2]), z, n[3]), z, n[4]);
117 | return n_outer * inv_d_outer;
118 | };
119 |
120 |
121 |
122 | auto onePass = [=](auto x)
123 | {
124 | auto z = abs(x);
125 | auto e = exp(-z * z * asNumber(0.5) );
126 | auto central = centralLambda(z);
127 | auto SPLIT = asNumber(7.42);// 7106781186547; // appears to give less error
128 | auto condAllDone = (x * x < SPLIT* SPLIT);
129 |
130 | if (horizontal_and(condAllDone))
131 | {
132 | central *= e;
133 | return select(x <= asNumber(0.0), central, asNumber(1.0) - central);
134 | }
135 |
136 | auto outer = outerLambda(z);
137 | auto RES = select((z < SPLIT), central, outer);
138 | RES *= e;
139 | return select(x <= asNumber(0.0), RES, asNumber(1.0) - RES);
140 |
141 | };
142 |
143 | return ApplyTransformUR_X(X, onePass);
144 |
145 | }
146 |
147 |
148 |
149 | template
150 | VecD cdfnorm(const VecD& rhs)
151 | {
152 | return VecD(cdfnorm(rhs.value()), rhs.derivative()*cdfnormD(rhs.value()));
153 | }
154 |
155 | //to do replace with WS 16 digit impl
156 | template
157 | Vec cdfnorminv(const Vec& X)
158 | {
159 |
160 | auto asNumber = [](auto x) constexpr
161 | {
162 | return static_cast::FloatType>(x);
163 | };
164 |
165 |
166 | /// acklams inverse cdf normal
167 | static typename InstructionTraits::FloatType a[] = { asNumber(0.0), asNumber( -3.969683028665376e+01), asNumber(2.209460984245205e+02), asNumber(-2.759285104469687e+02), asNumber(1.383577518672690e+02), asNumber(-3.066479806614716e+01) , asNumber(2.506628277459239e+00)};
168 | static typename InstructionTraits::FloatType b[] = { asNumber(0.0), asNumber(-5.447609879822406e+01), asNumber(1.615858368580409e+02), asNumber(-1.556989798598866e+02), asNumber(6.680131188771972e+01), asNumber(-1.328068155288572e+01) };
169 | static typename InstructionTraits::FloatType c[] = { asNumber(0.0), asNumber(-7.784894002430293e-03), asNumber(-3.223964580411365e-01), asNumber(-2.400758277161838e+00), asNumber(-2.549732539343734e+00), asNumber(4.374664141464968e+00), asNumber(2.938163982698783e+00) };
170 | static typename InstructionTraits::FloatType d[] = { asNumber(0.0), asNumber(7.784695709041462e-03), asNumber(3.224671290700398e-01), asNumber(2.445134137142996e+00), asNumber(3.754408661907416e+00) };
171 |
172 | auto aclambdaMain = [=](auto p)
173 | {
174 | auto X = p;
175 | auto q = p - asNumber(0.5);
176 | auto r = q * q;
177 | X = (((((a[1] * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * r + a[6]) * q /
178 | (((((b[1] * r + b[2]) * r + b[3]) * r + b[4]) * r + b[5]) * r + asNumber(1.));
179 |
180 | return X;
181 | };
182 |
183 |
184 | auto aclambdaLow = [=](auto initVal, auto p)
185 | {
186 | const auto p_low = asNumber(0.02425);
187 | auto condLo = (asNumber(0.0) < p) && (p < p_low);
188 |
189 | if (!horizontal_or(condLo))
190 | return initVal;
191 |
192 | auto q = sqrt(asNumber (-2.0) * log(p));
193 | auto X = (((((c[1] * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) * q + c[6]) /
194 | ((((d[1] * q + d[2]) * q + d[3]) * q + d[4]) * q + asNumber(1.0));
195 |
196 | return select(condLo, X, initVal);
197 |
198 | };
199 |
200 |
201 | auto aclambdaHi = [=](auto initVal, auto p)
202 | {
203 | const auto p_low = asNumber(0.02425);
204 | const auto p_high = asNumber(1.) - p_low;
205 | auto condHi = (p_high < p) && (p < asNumber(1.));
206 | if (!horizontal_or(condHi))
207 | return initVal;
208 |
209 | auto q = sqrt(asNumber(-2.0) * log(asNumber(1.) - p));
210 | const auto X = -(((((c[1] * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) * q + c[6]) /
211 | ((((d[1] * q + d[2]) * q + d[3]) * q + d[4]) * q + 1.0);
212 | return select(condHi, X, initVal);
213 | };
214 |
215 |
216 |
217 | auto res = ApplyUnitaryOperation1(X, aclambdaMain);
218 | SparseUpdateWithLambda1(res, X, aclambdaLow);
219 | SparseUpdateWithLambda1(res, X, aclambdaHi);
220 |
221 | return res;
222 | }
223 | //
224 |
--------------------------------------------------------------------------------
/Vectorisation/VecX/binned_accumulator.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "dr3.h"
3 | #include "instruction_traits.h"
4 | #include
5 |
6 |
7 | template
8 | struct BinsT
9 | {
10 | using INS = INS_T;
11 |
12 | inline static constexpr bool isDbl = std::is_same::FloatType >::value;
13 |
14 | inline static const INS_T TINY_C{ isDbl ? pow(1024.0 , -10.0) : 1.0/ 8388608.0f * 1.0 / 8388608.0f };
15 | inline static const INS_T VERY_SMALL_C{ isDbl ? pow(1024.0,-5.0) : 1.0 / 8388608.0f };
16 | inline static const INS_T SMALL_C{ isDbl ? 1.0 : 1.0f };
17 | inline static const INS_T BIG_C{ isDbl ? pow(1024.0, 5.0) : 8388608.0f };
18 |
19 |
20 | static inline auto roundIt(INS_T X, INS_T LEVEL)
21 | {
22 | auto INV_LEVEL = 1.0l / LEVEL;
23 | auto big = (LEVEL * truncate(X * INV_LEVEL));
24 | auto small = X - big;
25 | return std::pair(big, small);
26 | };
27 |
28 |
29 | INS_T m_scaleFactor{ InstructionTraits::oneValue };
30 | INS_T veryBigSummV{ InstructionTraits::nullValue };
31 | INS_T bigSummV{ InstructionTraits::nullValue };
32 | INS_T smallSumV{ InstructionTraits::nullValue };
33 | INS_T tinyV{ InstructionTraits::nullValue };
34 |
35 |
36 | INS_T TINY{ TINY_C };
37 | INS_T VERY_SMALL{ VERY_SMALL_C };
38 | INS_T SMALL{ SMALL_C };
39 | INS_T BIG{ BIG_C };
40 |
41 |
42 |
43 | BinsT() :
44 | m_scaleFactor{ InstructionTraits::oneValue },
45 | TINY{ m_scaleFactor * TINY_C },
46 | VERY_SMALL{ m_scaleFactor * VERY_SMALL_C },
47 | SMALL{ m_scaleFactor * SMALL_C },
48 | BIG{ m_scaleFactor * BIG_C }
49 | {}
50 |
51 |
52 |
53 |
54 |
55 | BinsT(typename InstructionTraits::FloatType x, typename InstructionTraits::FloatType scaleFactor = InstructionTraits::oneValue) :
56 | m_scaleFactor{ scaleFactor },
57 | TINY{ m_scaleFactor * TINY_C },
58 | VERY_SMALL{ m_scaleFactor * VERY_SMALL_C },
59 | SMALL{ m_scaleFactor * SMALL_C },
60 | BIG{ m_scaleFactor * BIG_C }
61 | {
62 |
63 | INS_T MASK(InstructionTraits::nullValue);
64 | MASK.insert(0, InstructionTraits::oneValue);
65 |
66 | set(MASK * x);
67 |
68 | }
69 |
70 |
71 | void set(INS_T x)
72 | {
73 | auto resRoundVeryBig = roundIt(x, BIG);
74 | auto resRoundBig = roundIt(resRoundVeryBig.second, SMALL);
75 | auto resRoundSmall = roundIt(resRoundBig.second, VERY_SMALL);
76 |
77 | veryBigSummV = resRoundVeryBig.first;
78 | bigSummV = resRoundBig.first;
79 | smallSumV = resRoundSmall.first;
80 | tinyV = resRoundSmall.second;
81 | }
82 |
83 | BinsT(INS_T x)
84 | {
85 | set(x);
86 | }
87 |
88 | BinsT& operator *(INS_T rhs)
89 | {
90 |
91 | veryBigSummV *= rhs;
92 | bigSummV *= rhs;
93 | smallSumV *= rhs;
94 | tinyV *= rhs;
95 |
96 | return *this;
97 | }
98 |
99 |
100 | BinsT(BinsT&& x) noexcept
101 | {
102 | veryBigSummV = x.veryBigSummV;
103 | bigSummV = x.bigSummV;
104 | smallSumV = x.smallSumV;
105 | tinyV = x.tinyV;
106 |
107 | m_scaleFactor = x.m_scaleFactor;
108 | TINY = x.TINY;
109 | VERY_SMALL = x.VERY_SMALL;
110 | SMALL = x.SMALL;
111 | BIG = x.BIG;
112 |
113 |
114 | };
115 |
116 |
117 | BinsT& operator =(const BinsT& x)
118 | {
119 | veryBigSummV = x.veryBigSummV;
120 | bigSummV = x.bigSummV;
121 | smallSumV = x.smallSumV;
122 | tinyV = x.tinyV;
123 |
124 | m_scaleFactor = x.m_scaleFactor;
125 | TINY = x.TINY;
126 | VERY_SMALL = x.VERY_SMALL;
127 | SMALL = x.SMALL;
128 | BIG = x.BIG;
129 |
130 |
131 | return *this;
132 | };
133 |
134 |
135 | BinsT& operator += (const BinsT& rhs)
136 | {
137 | auto resRoundTiny = roundIt(tinyV + rhs.tinyV, VERY_SMALL);
138 | tinyV = resRoundTiny.second;
139 |
140 |
141 | auto smallRound = roundIt(smallSumV + rhs.smallSumV + resRoundTiny.first, SMALL);
142 | smallSumV = smallRound.second;
143 | auto bigRound = roundIt(smallRound.first + bigSummV + rhs.bigSummV, BIG);
144 | bigSummV = bigRound.second;
145 | veryBigSummV = bigRound.first + veryBigSummV + rhs.veryBigSummV;
146 |
147 | return *this;
148 | }
149 |
150 |
151 |
152 | auto hsum()
153 | {
154 | auto lambdaBinSum = [this]() {return (((horizontal_add(tinyV)) + horizontal_add(smallSumV)) + horizontal_add(bigSummV)) + horizontal_add(veryBigSummV); };
155 | return lambdaBinSum();
156 | }
157 |
158 |
159 | };
160 |
161 |
162 |
163 |
164 | static auto BinnedAdd = [](auto& bin, auto x) mutable
165 | {
166 | bin += x;
167 | using INS_T = decltype(x);
168 | auto NULL_Vec = INS_T(InstructionTraits::nullValue);
169 | return NULL_Vec;
170 |
171 | };
--------------------------------------------------------------------------------
/Vectorisation/VecX/error_utils.h:
--------------------------------------------------------------------------------
1 | /**************************** error_utils.h *******************************
2 | * Author: Andrew Drakeford
3 | * Date created: 2021-04-10
4 | * Last modified: 2021-04-10
5 | * Version: 1.0
6 | * Project: DR Cubed
7 | * Description:
8 | *
9 | * (c) Copyright 2019 Andrew Drakeford
10 | * Apache License version 2.0 or later.
11 | *****************************************************************************/
12 | #pragma once
13 | #include "vec.h"
14 | #include "vec_view.h"
15 | #include "span.h"
16 | #include
17 | #include
18 |
19 | //ignore for unused from sutter
/// Does nothing; exists so a value can be named as deliberately unused.
template<class T> void ignore(const T&) { }
21 |
/**
 * Validates a vector-like object: accepted when it has at least one element
 * or reports itself as scalar.
 * @return true when valid.
 * @throws std::runtime_error otherwise (after assert(false), which stops
 *         builds that have assertions enabled first).
 */
template<typename VEC>
bool check_vector( const VEC& rhs)
{
    auto rhsSz = rhs.size();

    if ( ( rhsSz > 0) || rhs.isScalar() )
    {
        return true;
    }
    else
    {
        //std::
        assert(false);
        throw std::runtime_error("bad vector size of non scalar");
    }
}
38 |
/**
 * Validates two operands of the same type for an element-wise operation.
 * Each operand must pass check_vector; the pair is accepted when the sizes are
 * equal and non-zero, or when either operand is scalar.
 * @throws std::runtime_error on a size mismatch (after assert(false)).
 */
template<typename VEC>
bool check_pair(const VEC& lhs, const VEC& rhs)
{
    check_vector(lhs);
    check_vector(rhs);

    if ( (lhs.size() == rhs.size() ) && (lhs.size() > 0 ) )
    {
        return true;
    }


    if (rhs.isScalar() || lhs.isScalar())
    {
        return true;
    }
    else
    {

        assert(false);
        throw std::runtime_error("bad vector size");
    }
}
62 |
/**
 * Same check as check_pair for operands of two different types: accepted when
 * the sizes are equal or either operand is scalar.
 * Unlike check_pair, equal sizes are accepted without requiring size > 0.
 * @throws std::runtime_error on a size mismatch (after assert(false)).
 */
template<typename VEC1, typename VEC2>
bool check_pair_different_type(const VEC1& lhs, const VEC2& rhs)
{
    check_vector(lhs);
    check_vector(rhs);

    if (lhs.size() == rhs.size())
        return true;
    if (rhs.isScalar() || lhs.isScalar())
    {
        return true;
    }
    else
    {
        //std::
        assert(false);
        throw std::runtime_error("bad vector size");
    }
}
82 |
83 |
84 |
85 |
86 | ////////////// views ////////////////
87 | template
88 | bool check_vector(const VecView& /*rhs*/)
89 | {
90 | //TO DO
91 | /*
92 | auto rhsSz = rhs.size();
93 |
94 | if ((rhsSz > 0) || rhs.isScalar())
95 | {
96 | return true;
97 | }
98 | else
99 | {
100 | //std::assert(false);
101 | throw std::exception("bad vector size of non scalar");
102 | }
103 | */
104 | return true;
105 | }
106 |
107 | template
108 | bool check_vector(const Vec& rhs)
109 | {
110 | auto rhsSz = rhs.size();
111 |
112 | if ((rhsSz > 0) || rhs.isScalar())
113 | {
114 | return true;
115 | }
116 | else
117 | {
118 | //std::
119 | assert(false);
120 | throw std::runtime_error("bad vector size of non scalar");
121 | }
122 | }
123 |
124 |
125 |
126 | template
127 | bool check_vector(const VecD& rhs)
128 | {
129 | // Always return true
130 | return true;
131 | /*
132 | auto rhsSz = rhs.size();
133 |
134 | if ((rhsSz > 0) || rhs.isScalar())
135 | {
136 | return true;
137 | }
138 | else
139 | {
140 | //std::
141 | assert(false);
142 | throw std::exception("bad vector size of non scalar");
143 | }
144 | */
145 | }
146 |
147 | template
148 | bool check_vector_for_filter(const Vec& rhs)
149 | {
150 | auto rhsSz = rhs.size();
151 |
152 | if ((rhsSz > 0) || !rhs.isScalar()) //no scalar vectors for filtering to views
153 | {
154 | return true;
155 | }
156 | else
157 | {
158 | //std::
159 | assert(false);
160 | throw std::runtime_error("bad vector size of non scalar");
161 | }
162 | }
163 |
164 |
165 | template
166 | bool check_vector_for_filter(const VecView&/* rhs*/)
167 | {
168 | return true;// views can be empty
169 | /*
170 | auto rhsSz = rhs.size();
171 |
172 | if ((rhsSz > 0) || !rhs.isScalar()) //no scalar vectors for filtering to views
173 | {
174 | return true;
175 | }
176 | else
177 | {
178 | //std::assert(false);
179 | throw std::exception("bad vector size of non scalar");
180 | }
181 | */
182 | }
183 |
184 |
185 | template
186 | bool check_vector_for_filter(const Span&/* rhs*/)
187 | {
188 | return true;// views can be empty
189 | /*
190 | auto rhsSz = rhs.size();
191 |
192 | if ((rhsSz > 0) || !rhs.isScalar()) //no scalar vectors for filtering to views
193 | {
194 | return true;
195 | }
196 | else
197 | {
198 | //std::assert(false);
199 | throw std::exception("bad vector size of non scalar");
200 | }
201 | */
202 | }
203 |
204 |
205 |
206 |
207 | template
208 | bool check_view_pair(const Vec& lhs, const Vec& rhs)
209 | {
210 | check_vector_for_filter(lhs);
211 | check_vector_for_filter(rhs);
212 |
213 | if (lhs.size() == rhs.size())
214 | return true;
215 |
216 | //std::
217 | assert(false);
218 | throw std::runtime_error("bad vector size");
219 |
220 | }
221 |
222 | template
223 | bool check_view_pair(const VecView& lhs, const VecView& rhs)
224 | {
225 | check_vector_for_filter(lhs);
226 | check_vector_for_filter(rhs);
227 |
228 | if (lhs.size() == rhs.size())
229 | return true;
230 |
231 | //std::
232 | assert(false);
233 | throw std::runtime_error("bad vector size");
234 |
235 | }
--------------------------------------------------------------------------------
/Vectorisation/VecX/filter_pipe_and_join.h:
--------------------------------------------------------------------------------
1 | /**************************** filter_pipe_and_join.h *******************************
2 | * Author: Andrew Drakeford
3 | * Date created: 2021-04-10
4 | * Last modified: 2021-04-10
5 | * Version: 1.0
6 | * Project: DR Cubed
7 | * Description:
8 | *
9 | * (c) Copyright 2019 Andrew Drakeford
10 | * Apache License version 2.0 or later.
11 | *****************************************************************************/
12 | #pragma once
13 | #include "filter_select.h"
14 |
15 |
16 | /*
17 | Use "|" for joining filters and ">" for joining operations
18 | use braces around sets of filters to control evaluation order
19 |
20 | */
21 |
22 | namespace PIPE
23 | {
24 |
25 | template< typename INS_VEC>
26 | VecView operator |(const Vec& rhs, const VecBool& condition)
27 | {
28 | return ApplyFilter(condition, rhs);
29 | }
30 |
31 |
32 | template< typename INS_VEC, typename OP>
33 | VecView operator |(const VecView& rhs, OP& condition)
34 | {
35 | return ApplyFilter(condition, rhs);
36 | }
37 |
38 | template< typename INS_VEC, typename OP>
39 | VecView operator |(const Vec& rhs, OP& condition)
40 | {
41 | return ApplyFilter(condition, rhs);
42 | }
43 |
44 | template< typename INS_VEC, typename OP>
45 | VecView operator |(Vec& rhs, OP& condition)
46 | {
47 | return ApplyFilter(condition, rhs);
48 | }
49 |
50 |
51 | // vector checks are applied inside ApplyUnitaryOperation
52 | template< typename INS_VEC, typename OP>
53 | VecView operator > ( VecView rhs, OP& oper)
54 | {
55 | ApplyUnitaryOperation( rhs, oper);
56 | return rhs;
57 | }
58 |
59 |
60 | template< typename INS_VEC, typename OP>
61 | VecView& operator > ( OP& oper, VecView& rhs)
62 | {
63 | ApplyUnitaryOperation(oper, rhs);
64 | return rhs;
65 | }
66 |
67 | template< typename INS_VEC, typename OP>
68 | VecView operator > (OP& oper, const VecView& rhs)
69 | {
70 | return ApplyUnitaryOperation(oper, rhs);
71 | }
72 |
73 |
74 | template< typename INS_VEC, typename OP>
75 | VecView& operator > (Vec& rhs, OP& oper)
76 | {
77 | ApplyUnitaryOperation(oper, rhs);
78 | return rhs;
79 | }
80 |
81 |
82 | template< typename INS_VEC, typename OP>
83 | VecView operator > (const Vec& rhs, OP& oper)
84 | {
85 | return ApplyUnitaryOperation(oper, rhs);
86 | }
87 |
88 |
89 | template< typename INS_VEC, typename OP>
90 | VecView operator > ( OP& oper , const Vec& rhs )
91 | {
92 | return ApplyUnitaryOperation(oper, rhs);
93 | }
94 |
95 |
96 | template< typename INS_VEC>
97 | Vec operator |(const VecView& rhs, Vec& out)
98 | {
99 | auto outRes(out);
100 | rhs.writeView(outRes);
101 | return outRes;
102 | }
103 |
104 |
105 | template< typename INS_VEC>
106 | Vec operator |(VecView& rhs, const Vec& out)
107 | {
108 | auto outRes(out);
109 | rhs.writeView(outRes);
110 | return outRes;
111 | }
112 |
113 |
114 | struct WriteOut
115 | {};
116 |
117 | template< typename INS_VEC>
118 | void operator |(const VecView& rhs, WriteOut& out)
119 | {
120 | //writes back to source to do
121 | rhs.writeView(out);
122 | }
123 |
124 | }// namespace PIPE
125 |
126 | /*
127 | These expression templates are for use at register level combinations of operations
128 | */
129 | namespace JOIN
130 | {
131 |
132 |
/**
 * Composition of two callables, both stored by value.
 * One argument:  returns m_lhs(m_rhs(val)).
 * Two arguments: returns m_rhs(rhs_arg, m_lhs(lhs_arg)) - the form used when
 * the composite is passed to an accumulate.
 */
template< typename LHS, typename RHS>
struct CatOperation
{
    CatOperation(const LHS& lhs, const RHS& rhs) :m_lhs(lhs), m_rhs(rhs) {}

    template<typename X>
    inline auto operator()(const X& val) noexcept
    {
        return m_lhs(m_rhs(val));
    }

    //for use with accumulate
    template<typename X>
    inline auto operator()(const X& lhs_arg, const X& rhs_arg) noexcept
    {
        return m_rhs(rhs_arg, m_lhs(lhs_arg));
    }
    LHS m_lhs;
    RHS m_rhs;
};
153 |
154 |
155 | template
156 | CatOperation< RHS, LHS> operator | (const LHS& lhs, const RHS& rhs)
157 | {
158 | return CatOperation(rhs,lhs);
159 | }
160 |
161 | /* Examples
162 |
163 | Boolean expression template conjunction for boolean lambdas
164 |
165 | auto isLessThanMinus10 = [](auto x) { return x < -10; };
166 | auto isGreaterThan10 = [](auto x) { return x > 10 };
167 | auto isLessThan20 = [](auto x) { return x < 20 };
168 |
169 | we can create simple logical conjunctions of boolean lambdas
170 |
171 | auto betweenTenAndTwenty = isGreaterThan10 && isLessThan20;
172 |
173 | auto isOutsideTenTwenty = !betweenTenAndTwenty;
174 |
175 | auto hasAbsGreaterThanTen = isLessThanMinus10 || isGreaterThan10;
176 |
177 | */
178 |
/*
Wraps a callable (stored by value) and returns the logical negation of its
result: calling the wrapper with val yields !m_rhs(val).

The call operator is noexcept, so an exception escaping the wrapped callable
terminates the program.
*/
template< typename RHS>
struct NegateOperation
{
    NegateOperation(const RHS& rhs) : m_rhs(rhs) {}

    // INS_VEC is deduced from the argument.
    template< typename INS_VEC>
    inline auto operator()(const INS_VEC& val) noexcept
    {
        return !m_rhs(val);
    }

    RHS m_rhs;
};


/*
Builds a NegateOperation around rhs, so that (!pred)(x) evaluates !pred(x).
*/
template< typename RHS>
NegateOperation< RHS> operator ! (const RHS& rhs)
{
    return NegateOperation< RHS>(rhs);
}
198 |
199 |
200 |
201 |
202 |
/*
Disjunction of two callables, both stored by value: calling the wrapper with
val yields m_lhs(val) || m_rhs(val).

When both results are plain bool the built-in || applies and m_rhs is not
called if m_lhs(val) is true; for class-type results whichever operator||
is found for those types is used instead.

The call operator is noexcept, so an exception escaping either callable
terminates the program.
*/
template< typename LHS, typename RHS>
struct OROperation
{
    OROperation(const LHS& lhs, const RHS& rhs) :m_lhs(lhs), m_rhs(rhs) {}

    // X is deduced from the argument.
    template< typename X>
    inline auto operator()(const X& val) noexcept
    {
        return m_lhs(val) || m_rhs(val);
    }

    LHS m_lhs;
    RHS m_rhs;
};


/*
Builds an OROperation from two callables, so that (a || b)(x) evaluates
a(x) || b(x).
*/
template< typename LHS, typename RHS>
OROperation< LHS, RHS> operator || (const LHS& lhs, const RHS& rhs)
{
    return OROperation< LHS, RHS>(lhs, rhs);
}
223 |
224 |
225 |
/*
Conjunction of two callables, both stored by value: calling the wrapper with
val yields m_lhs(val) && m_rhs(val).

When both results are plain bool the built-in && applies and m_rhs is not
called if m_lhs(val) is false; for class-type results whichever operator&&
is found for those types is used instead.

The call operator is noexcept, so an exception escaping either callable
terminates the program.
*/
template< typename LHS, typename RHS>
struct AndOperation
{
    AndOperation(const LHS& lhs, const RHS& rhs) :m_lhs(lhs), m_rhs(rhs) {}

    // X is deduced from the argument.
    template< typename X>
    inline auto operator()(const X& val) noexcept
    {
        return m_lhs(val) && m_rhs(val);
    }

    LHS m_lhs;
    RHS m_rhs;

};


/*
Builds an AndOperation from two callables, so that (a && b)(x) evaluates
a(x) && b(x).
*/
template< typename LHS, typename RHS>
AndOperation< LHS, RHS> operator && (const LHS& lhs, const RHS& rhs)
{
    return AndOperation< LHS, RHS>(lhs, rhs);
}
248 |
249 |
250 | }//namespace JOIN
251 |
252 |
--------------------------------------------------------------------------------
/Vectorisation/VecX/instruction_traits.h:
--------------------------------------------------------------------------------
1 | /**************************** instruction_traits.h *******************************
2 | * Author: Andrew Drakeford
3 | * Date created: 2021-04-10
4 | * Last modified: 2021-04-10
5 | * Version: 1.0
6 | * Project: DR Cubed
7 | * Description:
8 | *
9 | * (c) Copyright 2019 Andrew Drakeford
10 | * Apache License version 2.0 or later.
11 | *****************************************************************************/
12 | #pragma once
13 | #pragma warning(suppress:4984)
14 |
15 | #include "vec.h"
16 | #include "vec_double.h"
17 | #include "../VCL/vectormath_common.h"
18 |
19 |
20 | template class Vec;
21 | template class VecBool;
22 | template class VecD;
23 | template< typename INS_VEC> class VecView;
24 |
25 |
// Primary InstructionTraits template: the values used for any INS_VEC that
// has no explicit specialization. It describes a width-2, double-valued
// configuration with VecBoolD as the boolean type, unaligned load/store,
// no scatter and no compact boolean storage.
// Unlike the specializations below it declares no IdxType.
26 | template< typename INS_VEC>
27 | struct InstructionTraits
28 | {
29 | using BoolType = VecBoolD;
30 | using FloatType = double;
31 | static constexpr int width = 2;
32 | static constexpr double nullValue = 0.0;
33 | static constexpr double oneValue =1.0;
34 | static constexpr bool alignedLoadStore = false;
35 | static constexpr bool boolTypeIsAlignedLoadStore = false;
36 | static constexpr bool useScatter = false;
37 | static constexpr uint32_t limit = 100000;
38 |
39 | static constexpr bool isCompact = false;
40 | using RegBoolType = VecBoolD;
41 | using MemBoolType = VecDouble;
42 |
43 | };
44 |
45 |
46 |
47 |
// Explicit specialization: width 2, FloatType double, VecBoolD booleans,
// aligned load/store for both values and booleans, not compact.
// NOTE(review): the specialization argument is missing from this listing;
// MemBoolType suggests it is VecDouble - confirm.
48 | template<>
49 | struct InstructionTraits
50 | {
51 | using IdxType = Vec2q;
52 | using BoolType = VecBoolD;
53 | using FloatType = double;
54 | static constexpr int width = 2;
55 | static constexpr double nullValue = 0.0;
56 | static constexpr double oneValue = 1.0;
57 | static constexpr bool alignedLoadStore = true;
58 | static constexpr bool useScatter = false;
59 | static constexpr uint32_t limit = 100000;
60 | static constexpr bool boolTypeIsAlignedLoadStore = true;
61 |
62 | static constexpr bool isCompact = false;
63 | using RegBoolType = VecBoolD;
64 | using MemBoolType = VecDouble;
65 | };
66 |
67 |
68 |
69 |
// Explicit specialization: width 2, FloatType long double, VecBoolD booleans,
// unaligned value load/store, not compact.
// NOTE(review): the specialization argument is missing from this listing;
// MemBoolType suggests it is VecLDouble - confirm.
// NOTE(review): nullValue and oneValue are declared double although FloatType
// is long double - confirm this is intended.
70 | template<>
71 | struct InstructionTraits
72 | {
73 | using IdxType = Vec2q;
74 | using BoolType = VecBoolD;
75 | using FloatType = long double;
76 | static constexpr int width = 2;
77 | static constexpr double nullValue = 0.0;
78 | static constexpr double oneValue = 1.0;
79 | static constexpr bool alignedLoadStore = false;
80 | static constexpr bool useScatter = false;
81 | static constexpr uint32_t limit = 100000;
82 | static constexpr bool boolTypeIsAlignedLoadStore = true;
83 |
84 | static constexpr bool isCompact = false;
85 | using RegBoolType = VecBoolD;
86 | using MemBoolType = VecLDouble;
87 | };
88 |
89 |
90 |
// Explicit specialization: width 2, FloatType double, Vec2db booleans,
// aligned value load/store, no scatter, not compact.
// NOTE(review): the specialization argument is missing from this listing;
// MemBoolType suggests it is Vec2d - confirm.
91 | template<>
92 | struct InstructionTraits
93 | {
94 | using IdxType = Vec2q;
95 | using BoolType = Vec2db;
96 | using FloatType = double;
97 | static constexpr int width = 2;
98 | static constexpr double nullValue = 0.0;
99 | static constexpr double oneValue = 1.0;
100 | static constexpr bool alignedLoadStore = true;
101 | static constexpr bool boolTypeIsAlignedLoadStore = false;
102 | static constexpr bool useScatter = false;
103 | static constexpr uint32_t limit = 100000;
104 |
105 | static constexpr bool isCompact = false;
106 | using RegBoolType = Vec2db;
107 | using MemBoolType = Vec2d;
108 |
109 |
110 | };
111 |
112 |
// Explicit specialization: width 4, FloatType float, Vec4fb booleans,
// aligned value load/store, no scatter, not compact.
// NOTE(review): the specialization argument is missing from this listing;
// MemBoolType suggests it is Vec4f - confirm.
113 | template<>
114 | struct InstructionTraits
115 | {
116 | using IdxType = Vec4i;
117 | using BoolType = Vec4fb;
118 | using FloatType = float;
119 | static constexpr int width = 4;
120 | static constexpr bool alignedLoadStore = true;
121 | static constexpr bool boolTypeIsAlignedLoadStore = false;
122 | static constexpr float nullValue = 0.f;
123 | static constexpr float oneValue = 1.f;
124 | static constexpr bool useScatter = false;
125 | static constexpr uint32_t limit = 100000;
126 |
127 | static constexpr bool isCompact = false;
128 | using RegBoolType = Vec4fb;
129 | using MemBoolType = Vec4f;
130 | };
131 |
132 |
133 |
134 |
135 |
// Explicit specialization: width 4, FloatType double, Vec4db booleans,
// aligned value load/store, scatter enabled, not compact.
// NOTE(review): the specialization argument is missing from this listing;
// MemBoolType suggests it is Vec4d - confirm.
136 | template<>
137 | struct InstructionTraits
138 | {
139 | using IdxType = Vec4i;
140 | using BoolType = Vec4db;
141 | using FloatType = double;
142 | static constexpr int width = 4;
143 | static constexpr bool alignedLoadStore = true;
144 | static constexpr bool boolTypeIsAlignedLoadStore = false;
145 | static constexpr double nullValue = 0.0;
146 | static constexpr double oneValue = 1.0;
147 | static constexpr bool useScatter = true;
148 | static constexpr uint32_t limit = 100000;
149 |
150 | static constexpr bool isCompact = false;
151 | using RegBoolType = Vec4db;
152 | using MemBoolType = Vec4d;
153 | };
154 |
155 |
// Explicit specialization: width 8, FloatType float, Vec8fb booleans,
// aligned value load/store, scatter enabled, not compact.
// NOTE(review): the specialization argument is missing from this listing;
// MemBoolType suggests it is Vec8f - confirm.
156 | template<>
157 | struct InstructionTraits
158 | {
159 | using IdxType = Vec8i;
160 | using BoolType = Vec8fb;
161 | using FloatType = float;
162 |
163 | static constexpr int width = 8;
164 | static constexpr float nullValue = 0.f;
165 | static constexpr float oneValue = 1.f;
166 | static constexpr bool alignedLoadStore = true;
167 | static constexpr bool boolTypeIsAlignedLoadStore = false;
168 | static constexpr bool useScatter = true;
169 | static constexpr uint32_t limit = 100000;
170 |
171 | static constexpr bool isCompact = false;
172 | using RegBoolType = Vec8fb;
173 | using MemBoolType = Vec8f;
174 |
175 | };
176 |
177 |
178 |
// Explicit specialization: width 8, FloatType double, Vec8db booleans,
// unaligned value load/store, scatter enabled.
// isCompact is true here, which makes boolConvert route through the
// boolCompact* helpers, and limit is 1000000 rather than the 100000 used by
// the narrower specializations.
// NOTE(review): the specialization argument is missing from this listing;
// MemBoolType suggests it is Vec8d - confirm.
179 | template<>
180 | struct InstructionTraits
181 | {
182 | using IdxType = Vec8i;
183 |
184 | using BoolType = Vec8db;
185 |
186 | using FloatType = double;
187 | static constexpr int width = 8;
188 | static constexpr double nullValue = 0.0;
189 | static constexpr double oneValue = 1.0;
190 | static constexpr bool alignedLoadStore = false;
191 | static constexpr bool boolTypeIsAlignedLoadStore = false;
192 | static constexpr bool useScatter = true;
193 | static constexpr uint32_t limit = 1000000;
194 |
195 | static constexpr bool isCompact = true;
196 | using RegBoolType = Vec8db;
197 | using MemBoolType = Vec8d;
198 |
199 | };
200 |
201 |
202 |
// Explicit specialization: width 16, FloatType float, Vec16fb booleans,
// unaligned value load/store, scatter enabled.
// isCompact is true here, which makes boolConvert route through the
// boolCompact* helpers, and limit is 1000000.
// NOTE(review): the specialization argument is missing from this listing;
// MemBoolType suggests it is Vec16f - confirm.
203 | template<>
204 | struct InstructionTraits
205 | {
206 | using IdxType = Vec16i;
207 | using BoolType = Vec16fb;
208 | using FloatType = float;
209 | static constexpr int width = 16;
210 | static constexpr float nullValue = 0.f;
211 | static constexpr float oneValue = 1.f;
212 | static constexpr bool alignedLoadStore = false;
213 | static constexpr bool boolTypeIsAlignedLoadStore = false;
214 | static constexpr bool useScatter = true;
215 | static constexpr uint32_t limit = 1000000;
216 |
217 | static constexpr bool isCompact = true;
218 | using RegBoolType = Vec16fb;
219 | using MemBoolType = Vec16f;
220 | };
221 |
222 |
223 |
// Takes a register boolean (RegBoolType) and returns its memory form
// (MemBoolType) by forwarding to boolCompactConvert.
// NOTE(review): the template parameter list and the InstructionTraits
// arguments are missing from this listing - restore them from the real header.
224 | template
225 | inline typename InstructionTraits::MemBoolType boolCompactSave(typename InstructionTraits::RegBoolType regVal )
226 | {
227 | return boolCompactConvert(regVal);
228 | }
229 |
230 |
// Generic conversion from RegBoolType to MemBoolType via static_cast.
// The non-template overloads that follow handle Vec8db/Vec16fb and
// Vec8d/Vec16f explicitly.
// NOTE(review): the template parameter list and the InstructionTraits /
// static_cast arguments are missing from this listing - restore them from the
// real header.
231 | template
232 | inline typename InstructionTraits::MemBoolType boolCompactConvert(typename InstructionTraits::RegBoolType regVal)
233 | {
234 | return static_cast::MemBoolType>(regVal);
235 |
236 | }
237 |
238 |
239 |
240 |
// Expands a Vec8db mask into a Vec8d: select() chooses per lane between
// -nan8d() and a zero vector.
// Assumes VCL's select(mask, a, b) yields a where the mask is set, so set
// lanes become -nan8d() and clear lanes become 0 - confirm against the VCL manual.
241 | inline Vec8d boolCompactConvert(Vec8db regVal)
242 | {
243 | Vec8d const b = 0.;
244 | return select(regVal, -nan8d(), b);
245 | }
246 |
247 |
248 |
// Expands a Vec16fb mask into a Vec16f: select() chooses per lane between
// -nan16f() and a zero vector.
// Assumes VCL's select(mask, a, b) yields a where the mask is set, so set
// lanes become -nan16f() and clear lanes become 0 - confirm against the VCL manual.
249 | inline Vec16f boolCompactConvert(Vec16fb regVal)
250 | {
251 | Vec16f const b = 0.f;
252 | return select(regVal, -nan16f(), b);
253 | }
254 |
255 |
// Collapses a Vec8d back into a Vec8db mask: the result is the negation of the
// lane-wise comparison (allZeros == regVal), where allZeros is a Vec8d
// initialised from `false`.
256 | inline Vec8db boolCompactConvert(Vec8d regVal)
257 | {
258 | Vec8d allZeros = false;
259 | Vec8db ret = !(allZeros == regVal);
260 | return ret;
261 | }
262 |
263 |
// Collapses a Vec16f back into a Vec16fb mask: the result is the negation of
// the lane-wise comparison (allZeros == regVal), where allZeros is a Vec16f
// initialised from `false`.
264 | inline Vec16fb boolCompactConvert(Vec16f regVal)
265 | {
266 | Vec16f allZeros = false;
267 | Vec16fb ret = !(allZeros == regVal);
268 | return ret;
269 | }
270 |
271 |
272 |
273 | //for save
// Takes a register boolean (RegBoolType). The branch is chosen at compile time
// from InstructionTraits::isCompact: when it is false the value is returned
// unchanged, otherwise it is passed to boolCompactSave< TRAIT>.
// NOTE(review): the template parameter list and the InstructionTraits
// arguments are missing from this listing; the body shows the parameter is
// named TRAIT.
274 | template
275 | inline auto boolConvert(typename InstructionTraits::RegBoolType regVal)
276 | {
277 | if constexpr (! InstructionTraits::isCompact )
278 | {
279 | return regVal;
280 | }
281 | else
282 | {
283 | return boolCompactSave< TRAIT>(regVal);
284 | }
285 | }
286 |
287 |
288 | //for load
// Takes a memory-form boolean (MemBoolType). The branch is chosen at compile
// time from InstructionTraits::isCompact: when it is false the value is
// returned unchanged, otherwise it is passed to boolCompactConvert.
// NOTE(review): the template parameter list and the InstructionTraits
// arguments are missing from this listing - restore them from the real header.
289 | template
290 | inline auto boolConvert(typename InstructionTraits::MemBoolType regVal)
291 | {
292 | if constexpr (!InstructionTraits::isCompact)
293 | {
294 | return regVal;
295 | }
296 | else
297 | {
298 | return boolCompactConvert(regVal);
299 | }
300 | }
301 |
302 |
303 |
304 |
--------------------------------------------------------------------------------
/Vectorisation/VecX/target_name_space.h:
--------------------------------------------------------------------------------
1 | /**************************** target_name_space.h *******************************
2 | * Author: Andrew Drakeford
3 | * Date created: 2021-04-10
4 | * Last modified: 2021-04-10
5 | * Version: 1.0
6 | * Project: DR Cubed
7 | * Description:
8 | *
9 | * (c) Copyright 2019 Andrew Drakeford
10 | * Apache License version 2.0 or later.
11 | *****************************************************************************/
12 | #pragma once
13 |
14 | #include "vec.h"
15 | #include "vec_bool.h"
16 | #include "vec_d.h"
17 | #include "vec_bool_d.h"
18 | #include "vec_double.h"
19 | #include "vec_view.h"
20 | #include "apply_operation.h"
21 | #include "span.h"
22 |
23 | namespace DRC
24 | {
25 |
26 |
27 |
28 | namespace VecLDb
29 | {
30 | using VecxD = VecD;
31 | using Vecx = VecD;
32 | using VecXX = Vec;
33 | using VecVW = VecView;
34 | using VecBL = VecBool;
35 | using SpanXX = Span;
36 | using StrdSpanXX = StridedSpan;
37 | };
38 | // experimental
39 |
40 |
41 | namespace VecDb
42 | {
43 | using VecxD = VecD;
44 | using Vecx = VecD;
45 | using VecXX = Vec;
46 | using VecVW = VecView;
47 | using VecBL = VecBool;
48 | using SpanXX = Span;
49 | using StrdSpanXX = StridedSpan;
50 | };
51 |
52 |
53 |
54 | namespace VecD2D
55 | {
56 | using VecxD = VecD;
57 | using Vecx = VecD;
58 | using VecXX = Vec;
59 | using VecVW = VecView;
60 | using VecBL = VecBool;
61 | using SpanXX = Span;
62 | using StrdSpanXX = StridedSpan;
63 | };
64 |
65 |
66 | namespace VecD4D
67 | {
68 | using VecxD = VecD;
69 | using Vecx = VecD;
70 | using VecXX = Vec;
71 | using VecVW = VecView;
72 | using VecBL = VecBool;
73 | using SpanXX = Span;
74 | using StrdSpanXX = StridedSpan;
75 | };
76 |
77 | namespace VecD8D
78 | {
79 | using VecxD = VecD