├── .github └── workflows │ └── ci.yml ├── .gitignore ├── CMakeLists.txt ├── CMakeSettings.json ├── GettingStarted ├── CMakeLists.txt ├── GettingStarted.cpp ├── GettingStarted.vcxproj ├── GettingStarted.vcxproj.filters └── GettingStarted.vcxproj.user ├── LICENSE ├── LeaveOneOutRegression ├── LeaveOneOutRegression.cpp ├── LeaveOneOutRegression.vcxproj └── LeaveOneOutRegression.vcxproj.user ├── README.md ├── VariadicReduction ├── CMakeLists.txt ├── VariadicReducrion.vcxproj ├── VariadicReducrion.vcxproj.filters ├── VariadicReduction.cpp ├── VariadicReduction.sln └── VariadicReduction.vcxproj ├── VectorTest ├── DR3_tests.cpp ├── TestAccumulator.cpp ├── TestAllocator.cpp ├── TestCurve.cpp ├── TestFilterSelect.cpp ├── TestFilterTransform.cpp ├── TestScan.cpp ├── TestSpan.cpp ├── TestViews.cpp ├── Test_binary_unitary_operations.cpp ├── Unroll_operators.cpp ├── VTune Profiler Results │ └── VectorTest │ │ └── VectorTest.vtuneproj ├── VectorTest.log ├── VectorTest.vcxproj ├── VectorTest.vcxproj.user ├── dr3TestUtil.h ├── packages.config ├── pch.cpp ├── pch.h ├── test.cpp ├── testNamespace.cpp ├── testNamespace.h └── test_precise_accumulation.cpp ├── Vectorisation.sln ├── Vectorisation ├── CMakeLists.txt ├── Output-Build.txt ├── TextFile1.txt ├── VCL │ ├── LICENSE │ ├── README.md │ ├── dispatch_example1.cpp │ ├── dispatch_example2.cpp │ ├── instrset.h │ ├── instrset_detect.cpp │ ├── vector_convert.h │ ├── vectorclass.h │ ├── vectorf128.h │ ├── vectorf256.h │ ├── vectorf256e.h │ ├── vectorf512.h │ ├── vectorf512e.h │ ├── vectori128.h │ ├── vectori256.h │ ├── vectori256e.h │ ├── vectori512.h │ ├── vectori512e.h │ ├── vectori512s.h │ ├── vectori512se.h │ ├── vectormath_common.h │ ├── vectormath_exp.h │ ├── vectormath_hyp.h │ ├── vectormath_lib.h │ └── vectormath_trig.h ├── VecX │ ├── accumulate_transform.h │ ├── alloc_policy.cpp │ ├── alloc_policy.h │ ├── alloc_policy_imp.h │ ├── apply_operation.h │ ├── binary_unitary_operations.h │ ├── binned_accumulator.h │ ├── 
boolean_operations.h │ ├── conditional_select_eval.h │ ├── dr3.h │ ├── error_utils.h │ ├── filter_pipe_and_join.h │ ├── filter_select.h │ ├── instruction_traits.h │ ├── math_ops.h │ ├── operations.h │ ├── sampler.h │ ├── scan.h │ ├── span.h │ ├── target_name_space.h │ ├── transform.h │ ├── unroll_operators.h │ ├── vcl_latest.h │ ├── vec.cpp │ ├── vec.h │ ├── vec_bool.h │ ├── vec_bool_d.h │ ├── vec_d.h │ ├── vec_double.h │ ├── vec_view.h │ └── zip_utils.h ├── Vectorisation.cpp ├── Vectorisation.log ├── Vectorisation.sln ├── Vectorisation.vcxproj ├── Vectorisation.vcxproj.filters ├── Vectorisation.vcxproj.user ├── intel_Libs │ ├── libirc.lib │ ├── svml_disp.lib │ ├── svml_dispmd.lib │ ├── svml_dispmt.lib │ └── svmlpatch.lib ├── intrinsic_utils.h ├── packages.config └── pch.h ├── accumulateExample ├── CMakeLists.txt ├── accumulateExample.log ├── accumulateExample.vcxproj ├── accumulateExample.vcxproj.filters ├── accumulateExample.vcxproj.user ├── accumulate_example.cpp ├── gnormcpp.cpp └── norm.h ├── cumNormalExample ├── CMakeLists.txt ├── cumNormal.h ├── cumNormalExample.cpp ├── cumNormalExample.vcxproj ├── cumNormalExample.vcxproj.filters └── cumNormalExample.vcxproj.user ├── dancingAVX512 ├── AVX512Dance.cpp ├── AVX512Dance.h ├── CMakeLists.txt ├── dancingAVX512.cpp ├── dancingAVX512.vcxproj └── dancingAVX512.vcxproj.user ├── docs ├── BlackScholesVecXX.mp4 ├── Build.md └── cppCon2022.pdf ├── inverseCumNormalExample ├── CMakeLists.txt ├── cdfNormalInverse.cpp ├── cdfNormalInverse.h ├── inverseCumNormalExample.cpp ├── inverseCumNormalExample.log ├── inverseCumNormalExample.vcxproj ├── inverseCumNormalExample.vcxproj.filters └── inverseCumNormalExample.vcxproj.user ├── lattice ├── CMakeLists.txt ├── americanCrankNicholsonPricer.cpp ├── americanFinitDiffPricer.cpp ├── americanImplicitFiniteDiff.cpp ├── americanTrinomialPricer.cpp ├── americanTrinomialPricerUpAndOut.cpp ├── euroTrinomial.cpp ├── euroTrinomialPricerWithInit.cpp ├── europeanBinomialPricer.cpp ├── 
lattice.cpp ├── lattice.vcxproj ├── lattice.vcxproj.user ├── lattice_tools.cpp ├── lattice_tools.h ├── pricers.h └── utils.h └── scratch ├── CMakeLists.txt ├── scratch.cpp ├── scratch.vcxproj └── scratch.vcxproj.user /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Build and Unittest 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) 11 | BUILD_TYPE: Release 12 | 13 | jobs: 14 | example_matrix: 15 | strategy: 16 | matrix: 17 | os: [windows-latest, ubuntu-latest] 18 | runs-on: ${{ matrix.os }} 19 | steps: 20 | - uses: actions/checkout@v3 21 | - uses: symbitic/install-cmake@master 22 | 23 | - name: CMake Configure and Build on Ubuntu 24 | if: matrix.os == 'Ubuntu-latest' 25 | run: | 26 | cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} 27 | cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} 28 | 29 | - name: CMake Configure and Build on Windows 30 | if: matrix.os == 'windows-latest' 31 | run: | 32 | call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 33 | cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} 34 | cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} 35 | shell: cmd 36 | 37 | #- name: Test 38 | # working-directory: ${{github.workspace}}/build 39 | # # Execute tests defined by the CMake configuration. 
40 | # # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail 41 | # run: ctest -C ${{env.BUILD_TYPE}} 42 | 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # This .gitignore file was automatically created by Microsoft(R) Visual Studio. 3 | ################################################################################ 4 | 5 | /build 6 | /build_vcc 7 | 8 | /packages 9 | /.vs/DRCubed/v16/TestStore/0 10 | /accumulateExample/clang-cl 11 | /accumulateExample/ICC2022 12 | /accumulateExample/Release 13 | /GettingStarted/clang-cl 14 | /GettingStarted/ICC2022 15 | /GettingStarted/Release 16 | /GettingStarted/x64/Release 17 | /.vs/Vectorisation/v16/ipch/AutoPCH/4e9dfb20cefae0b2 18 | /accumulateExample/x64/Release 19 | /inverseCumNormalExample/clang-cl 20 | /inverseCumNormalExample/ICC2022 21 | /inverseCumNormalExample/Release 22 | /inverseCumNormalExample/x64/Release 23 | /Vectorisation/.vs/Vectorisation/v16 24 | /Vectorisation/clang-cl 25 | /Vectorisation/Debug 26 | /Vectorisation/ICC2022 27 | /Vectorisation/packages/Microsoft.googletest.v140.windesktop.msvcstl.static.rt-dyn.1.8.0 28 | /Vectorisation/packages/Microsoft.googletest.v140.windesktop.msvcstl.static.rt-dyn.1.8.1.4/build/native 29 | /Vectorisation/Release 30 | /Vectorisation/x64/Release 31 | /VectorTest/clang-cl 32 | /VectorTest/ICC2022 33 | /VectorTest/Release 34 | /VectorTest/x64/Release 35 | /x64/Release 36 | /Vectorisation/packages/Microsoft.googletest.v140.windesktop.msvcstl.static.rt-dyn.1.8.1.4 37 | /.vs/Vectorisation/v16 38 | /.vs 39 | /accumulateExample/x64 40 | /GettingStarted/x64 41 | /inverseCumNormalExample/x64 42 | /Vectorisation/x64 43 | /VectorTest/x64 44 | /x64 45 | /accumulateExample/My Advisor Results - accumulateExample 46 | /accumulateExample/VTune 
Profiler Results/accumulateExample 47 | /cumNormalExample/x64 48 | /inverseCumNormalExample/My Advisor Results - inverseCumNormalExample 49 | /inverseCumNormalExample/VTune Profiler Results/inverseCumNormalExample 50 | /GettingStarted/My Advisor Results - GettingStarted 51 | /accumulateExample/My Inspector Results - accumulateExample 52 | /GettingStarted/My Inspector Results - GettingStarted 53 | /VectorTest/My Advisor Results - VectorTest 54 | /VectorTest/My Inspector Results - VectorTest 55 | /out/build/x64-Debug 56 | /lattice/x64 57 | /dancingAVX512/x64 58 | /Vectorisation/cmake-build-debug 59 | /lattice/cmake-build-debug 60 | /curveExample/Intel® VTune™ Profiler Results/curveExample 61 | /curveExample/My Advisor Results - curveExample 62 | /scratch/x64 63 | /cmake-build-debug 64 | /.idea 65 | /curveExample/x64 66 | /scratch/r000mi2 67 | /scratch/r001mi2 68 | /scratch/r002mi3 69 | /scratch/r003mi2 70 | /scratch/scratch.inspxeproj 71 | /debug.log 72 | /cmake-build-release/.cmake/api/v1/reply 73 | /cmake-build-release/.cmake/api/v1/query 74 | /cmake-build-release/accumulateExample/CMakeFiles/accumulateExample.dir/accumulate_example.cpp.obj 75 | /cmake-build-release 76 | /VariadicReducrion/e000 77 | /VariadicReducrion/Intel® VTune™ Profiler Results/VariadicReducrion 78 | /VariadicReducrion/x64 79 | /DawnCache 80 | /config 81 | /GPUCache 82 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.17) 2 | 3 | project(DR3) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | set(CMAKE_CXX_STANDARD_REQUIRED True) 7 | 8 | 9 | add_subdirectory(Vectorisation) 10 | add_subdirectory(accumulateExample) 11 | add_subdirectory(cumNormalExample) 12 | add_subdirectory(inverseCumNormalExample) 13 | add_subdirectory(dancingAVX512) 14 | add_subdirectory(lattice) 15 | add_subdirectory(GettingStarted) 16 | add_subdirectory(scratch) 17 | 
add_subdirectory(VariadicReduction) 18 | 19 | -------------------------------------------------------------------------------- /CMakeSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "x64-Debug", 5 | "generator": "Ninja", 6 | "configurationType": "Debug", 7 | "inheritEnvironments": [ "msvc_x64_x64" ], 8 | "buildRoot": "${projectDir}\\out\\build\\${name}", 9 | "installRoot": "${projectDir}\\out\\install\\${name}", 10 | "cmakeCommandArgs": "", 11 | "buildCommandArgs": "", 12 | "ctestCommandArgs": "" 13 | }, 14 | { 15 | "name": "Linux-GCC-Release", 16 | "generator": "Ninja", 17 | "configurationType": "RelWithDebInfo", 18 | "cmakeExecutable": "cmake", 19 | "remoteCopySourcesExclusionList": [ ".vs", ".git", "out" ], 20 | "cmakeCommandArgs": "", 21 | "buildCommandArgs": "", 22 | "ctestCommandArgs": "", 23 | "inheritEnvironments": [ "linux_x64" ], 24 | "remoteMachineName": "${defaultRemoteMachineName}", 25 | "remoteCMakeListsRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/src", 26 | "remoteBuildRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/out/build/${name}", 27 | "remoteInstallRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/out/install/${name}", 28 | "remoteCopySources": true, 29 | "rsyncCommandArgs": "-t --delete --delete-excluded", 30 | "remoteCopyBuildOutput": false, 31 | "remoteCopySourcesMethod": "rsync" 32 | } 33 | ] 34 | } -------------------------------------------------------------------------------- /GettingStarted/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(GettingStarted GettingStarted.cpp) 2 | 3 | target_link_libraries(GettingStarted PUBLIC Vectorisation) 4 | 5 | target_include_directories(GettingStarted PUBLIC 6 | "${PROJECT_BINARY_DIR}" 7 | ) -------------------------------------------------------------------------------- /GettingStarted/GettingStarted.vcxproj.filters: 
-------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /GettingStarted/GettingStarted.vcxproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | -------------------------------------------------------------------------------- /LeaveOneOutRegression/LeaveOneOutRegression.vcxproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DR3 2 | 3 | To get full use of the repo, you need a modern processor which has AVX512 or AVX2 instructions. 4 | If your processor only has AVX2, you need to change target instruction sets in the projects to AVX2, and don’t generate AVX512 5 | in the projects because your machine wont run them. 6 | 7 | The projects build with GCC, clang, IC2022 and VS2019. 8 | In visual c++ select x64 and solution configuration for IC2022, release, debug and clang 9 | 10 | The getting started project shows some example use cases for vectors, filters and views, together with an experimental 11 | vectorised forward AAD getting option sensitivities. 12 | 13 | The accumulate example shows some of the use cases given in the cppCon2022 talk. 
14 | Additionally it gives an example of error correction in Kahan accumulation. 15 | 16 | The examples build and run with VS2019, clang and Intel compilers. The target instruction set 17 | generated by the framework can be changed by changing the namespace. These are double and float 18 | types; VecDb is a pair of doubles. Uncomment the namespace and build the example. 19 | 20 | //using namespace DRC::VecDb; 21 | 22 | //using namespace DRC::VecD2D; //sse2 double 23 | 24 | using namespace DRC::VecD4D; //avx2 double 25 | 26 | //using namespace DRC::VecF8F; // avx2 float 27 | 28 | //using namespace DRC::VecD8D; //avx512 double 29 | 30 | //using namespace DRC::VecF16F; //avx512 float 31 | 32 | 33 | For a machine supporting AVX512, ensure all the Visual Studio projects are set to use the enhanced instruction set: 34 | Configuration Properties > C/C++ > Code Generation > Enable Enhanced Instruction Set: /arch:AVX512 35 | If your machine doesn't support this, reduce to AVX2 or SSE2, and don't select a namespace in the code requiring more advanced instruction 36 | sets. 37 | 38 | Uncomment one of the using namespace lines to select the instruction set that you wish to run. 39 | Those ending in F have float type as underlying, those ending with D have a double. 40 | 41 | The project is set to compile using the AVX512 enhanced instruction set. The namespace selection 42 | chooses the type of the intrinsics that are used to instantiate lambdas. 43 | 44 | If your hardware does not support AVX512, choose the next level down, AVX2, and avoid using namespaces 45 | DRC::VecD8D or DRC::VecF16F which will cause generation of code with instructions that your computer doesn't support.
46 | 47 | Check Device Manager / Processors to determine what processor you have and check it against the web site 48 | https://ark.intel.com/content/www/us/en/ark/products/123550/intel-xeon-silver-4114-processor-13-75m-cache-2-20-ghz.html 49 | or 50 | https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html 51 | 52 | 53 | The getting started project shows the usage of vectors, lambdas and filters. 54 | 55 | The accumulateExample builds performance examples covered in the cppCon2022 talk. 56 | They give the user the chance to change between ICC, clang and VS2019 builds and to change the 57 | instruction set used via the using declaration. 58 | 59 | The inverseCumNormalExample gives the performance example shown in cppCon2022, although there might be some slight 60 | performance regression on one or two of the examples. It's instructive to run the examples after building with the 61 | different compilers and choosing different instruction sets for the lambdas (via namespace). 62 | 63 | The AVX512Dance function runs a routine which finds the max value in an array, using AVX2 and AVX512. By monitoring the 64 | power usage using something like Open Hardware Monitor, it's possible to see that the AVX512 instructions use less 65 | energy to do the compute than the AVX2 ones (on this Xeon Silver 4114). 66 | 67 | VectorTest is a selection of tests using googletest. 68 | The main library is Vectorisation. This references a local copy of the VCL2 library. It has a slight change to enable 69 | VCL2 to be used with the Intel IC2022 compiler. 70 | 71 | 72 | ## Building DR3 73 | 74 | See [docs/Build.md](docs/Build.md) for instructions on how to build DR3 from source and a list of supported platforms.
75 | 76 | -------------------------------------------------------------------------------- /VariadicReduction/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(VariadicExample VariadicReduction.cpp) 2 | 3 | target_link_libraries(VariadicExample PUBLIC Vectorisation) 4 | 5 | target_include_directories(VariadicExample PUBLIC 6 | "${PROJECT_BINARY_DIR}" 7 | ) 8 | -------------------------------------------------------------------------------- /VariadicReduction/VariadicReducrion.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | -------------------------------------------------------------------------------- /VariadicReduction/VariadicReduction.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.7.34202.233 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "VariadicReduction", "VariadicReduction.vcxproj", "{271CF3D5-72FF-4657-9325-4206B8D5C84F}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | clang-cl23|x64 = clang-cl23|x64 11 | clang-cl23|x86 = clang-cl23|x86 12 | Debug|x64 = Debug|x64 13 | Debug|x86 = Debug|x86 14 | ICC2023|x64 = ICC2023|x64 15 | ICC2023|x86 = ICC2023|x86 16 | Release|x64 = Release|x64 17 | Release|x86 = Release|x86 18 | Release-23|x64 = Release-23|x64 19 | Release-23|x86 = 
Release-23|x86 20 | EndGlobalSection 21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 22 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x64.ActiveCfg = clang-cl23|x64 23 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x64.Build.0 = clang-cl23|x64 24 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x86.ActiveCfg = clang-cl23|Win32 25 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x86.Build.0 = clang-cl23|Win32 26 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x64.ActiveCfg = Debug|x64 27 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x64.Build.0 = Debug|x64 28 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x86.ActiveCfg = Debug|Win32 29 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x86.Build.0 = Debug|Win32 30 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x64.ActiveCfg = ICC2023|x64 31 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x64.Build.0 = ICC2023|x64 32 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x86.ActiveCfg = ICC2023|Win32 33 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x86.Build.0 = ICC2023|Win32 34 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x64.ActiveCfg = Release|x64 35 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x64.Build.0 = Release|x64 36 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x86.ActiveCfg = Release|Win32 37 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x86.Build.0 = Release|Win32 38 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x64.ActiveCfg = Release-23|x64 39 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x64.Build.0 = Release-23|x64 40 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x86.ActiveCfg = Release-23|Win32 41 | {271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x86.Build.0 = Release-23|Win32 42 | EndGlobalSection 43 | GlobalSection(SolutionProperties) = preSolution 44 | HideSolutionNode = FALSE 45 | EndGlobalSection 46 | GlobalSection(ExtensibilityGlobals) = postSolution 47 | SolutionGuid = {347FE8D5-D275-4584-8F15-DD105566C258} 48 | EndGlobalSection 
49 | EndGlobal 50 | -------------------------------------------------------------------------------- /VectorTest/TestAllocator.cpp: -------------------------------------------------------------------------------- 1 | #include "pch.h" 2 | 3 | #include "../Vectorisation/VecX/vec.h" 4 | #include "../Vectorisation/VecX/operations.h" 5 | #include "../Vectorisation/VecX/vec_bool_d.h" 6 | #include "../Vectorisation/VecX/vec_double.h" 7 | #include "../Vectorisation/VecX/alloc_policy.h" 8 | #include "../Vectorisation/VecX/alloc_policy_imp.h" 9 | #include "../Vectorisation/VecX/target_name_space.h" 10 | 11 | /* Allocates MAX_EL (20) doubles from a PoolStrat constructed with 10, then frees them newest-first; the only assertions are the placeholder EXPECT_EQ(1, 1) / EXPECT_TRUE(true), so pool state is not verified. */ 12 | TEST(TestCaseAlloc, fillup_empty_last) { 13 | EXPECT_EQ(1, 1); 14 | EXPECT_TRUE(true); 15 | 16 | PoolStrat myPool(10); 17 | int MAX_EL = 20; 18 | std::vector pAlloc; 19 | 20 | int pos = myPool.pos(); 21 | for (int i = 0; i < MAX_EL; ++i) 22 | { 23 | double* p = myPool.alloc(); 24 | pos = myPool.pos(); 25 | 26 | (*p) = i; 27 | pAlloc.push_back(p); 28 | } 29 | 30 | int szx =myPool.size(); 31 | pos = myPool.pos(); 32 | 33 | for (int k = pos; k > 0; --k) 34 | { 35 | double* pback = pAlloc.back(); 36 | pAlloc.pop_back(); 37 | myPool.free(pback); 38 | 39 | pos = myPool.pos(); 40 | 41 | } 42 | 43 | 44 | } 45 | 46 | /* Allocates 20 doubles, frees pAlloc[k - 2] for k = pos down to 2, then allocates and frees one more element plus pAlloc[MAX_EL - 1] before allocating again; no assertion inspects the pool or the stored values. */ 47 | TEST(TestCaseAlloc, fillup_empty_secondlast) { 48 | EXPECT_EQ(1, 1); 49 | EXPECT_TRUE(true); 50 | 51 | PoolStrat myPool(10); 52 | int MAX_EL = 20; 53 | std::vector pAlloc; 54 | 55 | int pos = myPool.pos(); 56 | for (int i = 0; i < MAX_EL; ++i) 57 | { 58 | double* p = myPool.alloc(); 59 | pos = myPool.pos(); 60 | 61 | (*p) = i; 62 | pAlloc.push_back(p); 63 | } 64 | 65 | int szx = myPool.size(); 66 | pos = myPool.pos(); 67 | 68 | for (int k = pos; k > 1; --k) 69 | { 70 | double* pback = pAlloc[k - 2]; 71 | //pAlloc.pop_back(); 72 | myPool.free(pback); 73 | 74 | pos = myPool.pos(); 75 | 76 | } 77 | 78 | //all ok 79 | //add one 80 | auto newOne = myPool.alloc(); 81 | (*newOne) = 88; 82 | 83 | myPool.free(newOne); 84 | myPool.free(pAlloc[MAX_EL - 1]); 85 | 86 |
//all deleted 87 | auto newOnetoo = myPool.alloc(); 88 | (*newOnetoo) = 99; 89 | //one element 99 90 | 91 | 92 | for (int i = 0; i < 3; ++i) 93 | { 94 | auto newOnetoo = myPool.alloc(); 95 | (*newOnetoo) = 44 + i; 96 | } 97 | 98 | 99 | //need to test some values 100 | } 101 | 102 | 103 | using namespace DRC::VecD4D; 104 | /* Copies a VecXX of 21 ones through d, a, b and c, negates a in place, stores w = log(-a) element by element in cach, then evaluates -b and expects every w[i] to still equal its cached value. */ 105 | TEST(TestCaseAlloc, monkyBusinessBuffer) { 106 | EXPECT_EQ(1, 1); 107 | EXPECT_TRUE(true); 108 | 109 | 110 | std::vector mix(21,1.0); 111 | VecXX Vec2(mix); 112 | 113 | 114 | auto d = Vec2; 115 | auto a = d; 116 | auto b = a; 117 | auto c = b; 118 | 119 | 120 | a *= -1.0; 121 | auto w = log(-a); 122 | std::vector cach(w.size()); 123 | for (size_t i = 0; i < w.size(); i++) 124 | { 125 | cach[i] = w[i]; 126 | } 127 | auto aa = -b; 128 | 129 | //operation above should not change 130 | for (size_t i = 0; i < w.size(); i++) 131 | { 132 | double cacI = cach[i]; 133 | double wI = w[i]; 134 | EXPECT_EQ(cacI, wI); 135 | } 136 | 137 | 138 | 139 | } -------------------------------------------------------------------------------- /VectorTest/TestCurve.cpp: -------------------------------------------------------------------------------- 1 | #include "pch.h" 2 | 3 | 4 | #include "../../Vectorisation/ExampleVectors/curve.h" 5 | #include "../Vectorisation/VecX/operations.h" 6 | #include "../Vectorisation/VecX/vec_bool_d.h" 7 | #include "../Vectorisation/VecX/vec_double.h" 8 | #include "../Vectorisation/VecX/alloc_policy.h" 9 | 10 | typedef VecD VecxD; 11 | typedef VecD Vecx; 12 | typedef Vec VecXX; 13 | /* Builds a Curve< double, double> from the 0..10 sequences and expects valueAt(0.0) == 0.0 and valueAt(0.5) == 0.5; the VecXX-valued curves further down are only exercised, their results are not asserted. NOTE(review): the inline remark flags the setValues argument order as swapped; both sequences hold 0..10, so this presumably does not affect the expectations - confirm against curve.h. */ 14 | TEST(TestCaseCurve, Test1) { 15 | EXPECT_EQ(1, 1); 16 | EXPECT_TRUE(true); 17 | 18 | std::vector values{ 0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0 }; 19 | std::vector dates = { 0,1,2,3,4,5,6,7,8,9,10 }; 20 | std::vector datesD = { 0,1,2,3,4,5,6,7,8,9,10 }; 21 | 22 | 23 | 24 | 25 | Curve< double, double> testCurve; 26 | testCurve.setValues(begin(values), end(values), begin(datesD), end(datesD)); //wrong way round 27 | 28 | auto val =
testCurve.valueAt(0.0); 29 | 30 | EXPECT_EQ(val, 0.0); 31 | val = testCurve.valueAt(0.5); 32 | EXPECT_EQ(val, 0.5); 33 | 34 | 35 | /////////////////////////// 36 | std::vector< VecXX> vecVals; 37 | for (int i = 0; i < 11; i++) 38 | { 39 | VecXX vals(i * 0.001 + 0.06, 100); 40 | vecVals.push_back(vals); 41 | 42 | } 43 | 44 | /* Curve of VecXX values using ZeroInterp: setValues and valueAt are called but nothing is asserted. */ 45 | { 46 | using ZeroCrv = Curve< double, VecXX, ZeroInterp >; 47 | 48 | ZeroCrv testCurve2; 49 | testCurve2.setValues(begin(datesD), end(datesD), begin(vecVals), end(vecVals)); 50 | 51 | auto valV = testCurve2.valueAt(0.0); 52 | 53 | auto valV2 = testCurve2.valueAt(0.5); 54 | } 55 | 56 | /* Same inputs through Curve2 constructed with 10, followed by 10000 repeated valueAt(0.5) calls; nothing is asserted. */ 57 | { 58 | std::vector values{ 0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0 }; 59 | std::vector dates = { 0,1,2,3,4,5,6,7,8,9,10 }; 60 | std::vector datesD = { 0,1,2,3,4,5,6,7,8,9,10 }; 61 | 62 | std::vector< VecXX> vecVals; 63 | for (int i = 0; i < 11; i++) 64 | { 65 | VecXX vals(i * 0.001 + 0.06, 100); 66 | vecVals.push_back(vals); 67 | 68 | } 69 | 70 | 71 | Curve2> testCurve2(10); 72 | testCurve2.setValues(begin(datesD), end(datesD), begin(vecVals), end(vecVals)); 73 | 74 | auto valV = testCurve2.valueAt(0.0); 75 | 76 | auto valV2 = testCurve2.valueAt(0.5); 77 | 78 | 79 | for (long l = 0; l < 10000; l++) 80 | { 81 | auto valV3 = testCurve2.valueAt(0.5); 82 | } 83 | } 84 | 85 | //EXPECT_EQ(val, 0.0); 86 | //val = testCurve.valueAt(0.5); 87 | //EXPECT_EQ(val, 0.5); 88 | } -------------------------------------------------------------------------------- /VectorTest/TestFilterTransform.cpp: -------------------------------------------------------------------------------- 1 | #include "pch.h" 2 | 3 | #include "../Vectorisation/VecX/vec.h" 4 | #include "../Vectorisation/VecX/operations.h" 5 | #include "../Vectorisation/VecX/vec_bool_d.h" 6 | #include "../Vectorisation/VecX/vec_double.h" 7 | #include "../Vectorisation/VecX/alloc_policy.h" 8 | #include "../Vectorisation/VecX/vec_d.h" 9 | #include "../Vectorisation/VecX/vec_bool.h" 10 | #include
"../Vectorisation/VecX/vec_view.h" 11 | 12 | #include "../Vectorisation/VecX/target_name_space.h" 13 | 14 | 15 | #include "../Vectorisation/VecX/dr3.h" 16 | #include "dr3TestUtil.h" 17 | 18 | #include 19 | #include "testNamespace.h" 20 | 21 | /* For an input 0, 1, ..., SZ-1 and each index j, calls filterTransform with a predicate that is true only for values within a small tolerance of j, and expects res[k] == k where k == j and res[k] == -k everywhere else. */ 22 | void testFilterTransform(int SZ ) 23 | { 24 | 25 | /* the selecting predicate depends on j, so it is created inside the loop below */ 26 | std::vector input(SZ,asNumber( 0.0)); 27 | std::iota(begin(input), end(input), asNumber(0.0)); 28 | 29 | VecXX testVec(input); 30 | auto trueLambdaS = [&](auto x) { return x; }; 31 | auto falseLambdaS = [&](auto x) { return -x; }; 32 | 33 | 34 | for (int j = 0; j < SZ; ++j) 35 | { 36 | auto onlyJlambda = [=](auto x) { return (asNumber(j) > (x - asNumber(0.0001)) && (asNumber(j) < x + asNumber(0.00001))); }; 37 | VecXX res = filterTransform(onlyJlambda, testVec, trueLambdaS, falseLambdaS); 38 | 39 | for (int k = 0; k < SZ; k++) 40 | { 41 | if( k==j) 42 | { 43 | EXPECT_NUMERIC_EQ(res[k], asNumber( k)); 44 | } 45 | else 46 | { 47 | EXPECT_NUMERIC_EQ(res[k], asNumber(-k)); 48 | } 49 | } 50 | } 51 | 52 | } 53 | 54 | 55 | 56 | /* Runs testFilterTransform for every SZ in [3, 33) and then for 34, 65, 63 and 64. */ 57 | TEST(TestFilterTransform, testTransformEachPoint) 58 | { 59 | 60 | for (int SZ = 3; SZ < 33; SZ++) 61 | { 62 | testFilterTransform(SZ); 63 | } 64 | 65 | 66 | testFilterTransform(34); 67 | testFilterTransform(65); 68 | testFilterTransform(63); 69 | testFilterTransform(64); 70 | 71 | } 72 | -------------------------------------------------------------------------------- /VectorTest/TestScan.cpp: -------------------------------------------------------------------------------- 1 | #include "pch.h" 2 | 3 | 4 | 5 | #include "../Vectorisation/VecX/vec.h" 6 | #include "../Vectorisation/VecX/operations.h" 7 | #include "../Vectorisation/VecX/vec_bool_d.h" 8 | #include "../Vectorisation/VecX/vec_double.h" 9 | #include "../Vectorisation/VecX/alloc_policy.h" 10 | 11 | #include "../Vectorisation/VecX/vec_d.h" 12 | #include
"../Vectorisation/VecX/vec_bool.h" 13 | #include "../Vectorisation/VecX/vec_view.h" 14 | 15 | #include "../Vectorisation/VecX/target_name_space.h" 16 | 17 | 18 | #include "../Vectorisation/VecX/dr3.h" 19 | #include "../Vectorisation/VecX/scan.h" 20 | #include "../Vectorisation/VecX/instruction_traits.h" 21 | 22 | 23 | #include "testNamespace.h" 24 | #include "dr3TestUtil.h" 25 | 26 | #include 27 | 28 | #include 29 | 30 | 31 | 32 | 33 | /* Fills a vector with 0, 1, ..., SZ-1, wraps it in a VecXX and checks scan(testVec, add) against a running sum built one element at a time; the outer j loop repeats the identical check SZ times. */ 34 | void testScan(int SZ) 35 | { 36 | 37 | 38 | std::vector input(SZ, asNumber(0.0)); 39 | std::iota(begin(input), end(input), asNumber(0.0)); 40 | 41 | VecXX testVec(input); 42 | auto add = [](auto x, auto y) {return x + y; }; 43 | 44 | 45 | for (int j = 0; j < SZ; ++j) 46 | { 47 | 48 | auto res = scan( testVec, add); 49 | 50 | std::vector dbg = res; 51 | 52 | auto expected = testVec[0]; 53 | 54 | EXPECT_NUMERIC_EQ(expected, res[0]); 55 | 56 | for (int k = 1; k < SZ; k++) 57 | { 58 | expected += testVec[k] ; 59 | EXPECT_NUMERIC_EQ(expected, res[k]); 60 | } 61 | } 62 | 63 | 64 | 65 | 66 | } 67 | 68 | 69 | /* Tolerance chosen by overload on the floating-point type of the argument: 10^(4-16) for long double and double, 10^(4-8) for float; the argument value itself is unused. */ 70 | long double getErr(long double) 71 | { 72 | return std::pow(10, 4 - 16); 73 | } 74 | 75 | double getErr(double) 76 | { 77 | return std::pow(10, 4 - 16); 78 | } 79 | 80 | double getErr(float) 81 | { 82 | return std::pow(10, 4 - 8); 83 | } 84 | 85 | /* Scan check for the sequence start, start+1, ...: the reference is std::inclusive_scan over the input, compared with EXPECT_NEAR using err for element 0 and err * max(1, |expected[k]|) for the remaining elements. */ 86 | void testScan1(int SZ ,double start) 87 | { 88 | 89 | 90 | std::vector input(SZ, asNumber(0.0)); 91 | std::iota(begin(input), end(input), asNumber(start)); 92 | 93 | 94 | Numeric err = getErr(Numeric(0.)); 95 | 96 | VecXX testVec(input); 97 | auto add = [](auto x, auto y) {return x + y; }; 98 | 99 | 100 | for (int j = 0; j < SZ; ++j) 101 | { 102 | 103 | auto res = scan(testVec, add); 104 | 105 | std::vector dbg = res; 106 | 107 | std::vector expected; 108 | std::inclusive_scan(cbegin(input), cend(input), std::back_inserter( expected)); 109 | 110 | EXPECT_NEAR(expected[0], res[0], err); 111 | 112 | for (int k = 1; k < SZ; k++) 113 | { 114 | auto relErr = err *
std::max(Numeric(1.), std::abs(Numeric(expected[k]))); 115 | EXPECT_NEAR(expected[k], res[k], relErr); 116 | 117 | } 118 | } 119 | 120 | } 121 | 122 | 123 | 124 | 125 | 126 | /* Runs testScan for SZ in [3, 33) and testScan1 for SZ in [3, 133) with a start value of 3.14. */ 127 | TEST(TestScan, scanShortVector) 128 | { 129 | 130 | for (int SZ = 3; SZ < 33; SZ++) 131 | { 132 | testScan(SZ); 133 | } 134 | 135 | for (int SZ = 3; SZ < 133; SZ++) 136 | { 137 | testScan1(SZ,3.14); 138 | } 139 | 140 | } 141 | 142 | 143 | 144 | 145 | /* Checks ApplyTransformScan(testVec, add, SQR) against std::inclusive_scan over the squared inputs (obtained via transform), using EXPECT_NEAR with err for element 0 and err * max(1, |expected[k]|) afterwards. */ 146 | void testTransformScan1(int SZ, double start) 147 | { 148 | 149 | 150 | std::vector input(SZ, asNumber(0.0)); 151 | std::iota(begin(input), end(input), asNumber(start)); 152 | 153 | 154 | Numeric err = getErr(Numeric(0.)); 155 | 156 | VecXX testVec(input); 157 | auto SQR = [](auto x) { return x * x; }; 158 | 159 | auto sqrVec = transform( [](auto x) {return x * x; }, testVec); 160 | std::vector< Numeric> sq = sqrVec; 161 | auto add = [](auto x, auto y) {return x + y; }; 162 | 163 | 164 | for (int j = 0; j < SZ; ++j) 165 | { 166 | 167 | auto res = ApplyTransformScan(testVec, add, SQR); 168 | 169 | std::vector dbg = res; 170 | 171 | std::vector expected; 172 | std::inclusive_scan(cbegin(sq), cend(sq), std::back_inserter(expected)); 173 | 174 | EXPECT_NEAR(expected[0], res[0], err); 175 | 176 | for (int k = 1; k < SZ; k++) 177 | { 178 | auto relErr = err * std::max(Numeric(1.), std::abs(Numeric(expected[k]))); 179 | EXPECT_NEAR(expected[k], res[k], relErr); 180 | 181 | } 182 | } 183 | 184 | } 185 | 186 | 187 | 188 | 189 | 190 | /* Runs testTransformScan1 for SZ in [3, 33) starting at 0 and for SZ in [3, 133) starting at 3.14. */ 191 | TEST(TestTransformScanTransform, transformScanShortVector) 192 | { 193 | 194 | for (int SZ = 3; SZ < 33; SZ++) 195 | { 196 | testTransformScan1(SZ,0); 197 | } 198 | 199 | for (int SZ = 3; SZ < 133; SZ++) 200 | { 201 | testTransformScan1(SZ, 3.14); 202 | } 203 | 204 | } 205 | 206 | 207 | 208 | 209 | 210 | 211 | /* Binary variant: checks ApplyTransformScan(testVec, testVec1, add, MULT), where testVec1 = testVec + 1.0, against std::inclusive_scan over the element-wise product testVec * testVec1. */ 212 | void testTransformScan2(int SZ, double start) 213 | { 214 | 215 | 216 | std::vector input(SZ, asNumber(0.0)); 217 | std::iota(begin(input), end(input), asNumber(start)); 218 | 219 | 220 | Numeric
err = getErr(Numeric(0.)); 221 | 222 | VecXX testVec(input); 223 | 224 | VecXX testVec1 = testVec + 1.0; 225 | 226 | auto MULT = [](auto x,auto y) { return x * y; }; 227 | 228 | auto multVec = testVec * testVec1; 229 | 230 | 231 | std::vector< Numeric> prod = multVec; 232 | auto add = [](auto x, auto y) {return x + y; }; 233 | 234 | 235 | for (int j = 0; j < SZ; ++j) 236 | { 237 | 238 | auto res = ApplyTransformScan(testVec, testVec1, add, MULT); 239 | 240 | std::vector dbg = res; 241 | 242 | std::vector expected; 243 | std::inclusive_scan(cbegin(prod), cend(prod), std::back_inserter(expected)); 244 | 245 | EXPECT_NEAR(expected[0], res[0], err); 246 | 247 | for (int k = 1; k < SZ; k++) 248 | { 249 | auto relErr = err * std::max(Numeric(1.), std::abs(Numeric(expected[k]))); 250 | EXPECT_NEAR(expected[k], res[k], relErr); 251 | 252 | } 253 | } 254 | 255 | } 256 | 257 | 258 | 259 | 260 | 261 | /* Runs testTransformScan2 for SZ in [3, 33) starting at 0 and for SZ in [3, 133) starting at 3.14. */ 262 | TEST(TestTransformScanTransform, transformScanShortVectorBinary) 263 | { 264 | 265 | for (int SZ = 3; SZ < 33; SZ++) 266 | { 267 | testTransformScan2(SZ, 0); 268 | } 269 | 270 | for (int SZ = 3; SZ < 133; SZ++) 271 | { 272 | testTransformScan2(SZ, 3.14); 273 | } 274 | 275 | } 276 | 277 | 278 | -------------------------------------------------------------------------------- /VectorTest/VectorTest.log: -------------------------------------------------------------------------------- 1 | C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Current\Bin\Microsoft.Common.CurrentVersion.targets(820,5): error : The BaseOutputPath/OutputPath property is not set for project 'VectorTest.vcxproj'. Please check to make sure that you have specified a valid combination of Configuration and Platform for this project. Configuration='Debug' Platform='ARM64'.
This error may also appear if some other project is trying to follow a project-to-project reference to this project, this project has been unloaded or is not included in the solution, and the referencing project does not build using the same or an equivalent Configuration or Platform. 2 | -------------------------------------------------------------------------------- /VectorTest/VectorTest.vcxproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | -------------------------------------------------------------------------------- /VectorTest/dr3TestUtil.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "pch.h" 3 | 4 | 5 | #include "../Vectorisation/VecX/vec.h" 6 | #include "testNamespace.h" 7 | 8 | Numeric asNumber(long double x); 9 | 10 | Numeric asNumber(double x); 11 | 12 | Numeric asNumber(float x); 13 | 14 | Numeric asNumber(int x); 15 | 16 | void EXPECT_NUMERIC_EQ(long double x, long double y); 17 | 18 | void EXPECT_NUMERIC_EQ(double x, double y); 19 | 20 | void EXPECT_NUMERIC_EQ(float x, float y); 21 | 22 | void EXPECT_NUMERIC_EQ(int x, int y); 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /VectorTest/packages.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | -------------------------------------------------------------------------------- /VectorTest/pch.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // pch.cpp 3 | // 4 | 5 | //#include "pch.h" 6 | -------------------------------------------------------------------------------- /VectorTest/pch.h: -------------------------------------------------------------------------------- 1 | // 2 | // pch.h 3 | // 4 | 5 | #pragma once 6 | 7 | #include "gtest/gtest.h" 8 | 
-------------------------------------------------------------------------------- /VectorTest/testNamespace.cpp: -------------------------------------------------------------------------------- 1 | #include "testNamespace.h" 2 | /* asNumber overloads: convert each supported arithmetic argument type to the test suite's Numeric type. */ 3 | Numeric asNumber(long double x) 4 | { 5 | return static_cast<Numeric>(x); 6 | } 7 | 8 | Numeric asNumber(double x) 9 | { 10 | return static_cast<Numeric>(x); 11 | } 12 | 13 | Numeric asNumber(float x) 14 | { 15 | return static_cast<Numeric>(x); 16 | } 17 | 18 | 19 | Numeric asNumber(int x) 20 | { 21 | return static_cast<Numeric>(x); 22 | } 23 | 24 | /* EXPECT_NUMERIC_EQ overloads: forward to the gtest equality macro for the argument type; the long double overload uses EXPECT_DOUBLE_EQ, the same macro as the double overload. */ 25 | void EXPECT_NUMERIC_EQ(long double x, long double y) 26 | { 27 | EXPECT_DOUBLE_EQ(x, y); 28 | } 29 | 30 | 31 | void EXPECT_NUMERIC_EQ(double x, double y) 32 | { 33 | EXPECT_DOUBLE_EQ(x, y); 34 | } 35 | 36 | 37 | void EXPECT_NUMERIC_EQ(float x, float y) 38 | { 39 | EXPECT_FLOAT_EQ(x, y); 40 | } 41 | 42 | 43 | void EXPECT_NUMERIC_EQ(int x, int y) 44 | { 45 | EXPECT_EQ(x, y); 46 | } 47 | 48 | -------------------------------------------------------------------------------- /VectorTest/testNamespace.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "pch.h" 3 | 4 | 5 | #include "../Vectorisation/VecX/vec.h" 6 | #include "../Vectorisation/VecX/target_name_space.h" 7 | #include "../Vectorisation/VecX/instruction_traits.h" 8 | 9 | //using namespace DRC::VecDb; 10 | //using namespace DRC::VecLDb; 11 | 12 | //using namespace DRC::VecF4F; 13 | //using namespace DRC::VecD2D; 14 | using namespace DRC::VecD4D; 15 | //using namespace DRC::VecF8F; 16 | 17 | //using namespace DRC::VecD8D; 18 | //using namespace DRC::VecF16F; 19 | 20 | /* NOTE(review): InstructionTraits is written without a template argument list here; if it is a class template the argument appears to be missing from this listing -- confirm against instruction_traits.h. */ 21 | using Numeric = InstructionTraits::FloatType; 22 | #include "dr3TestUtil.h" 23 | -------------------------------------------------------------------------------- /VectorTest/test_precise_accumulation.cpp: -------------------------------------------------------------------------------- 1 | #include "pch.h" 2 | 3 | 4 | #include
"../Vectorisation/VecX/vec.h" 5 | #include "../Vectorisation/VecX/operations.h" 6 | #include "../Vectorisation/VecX/vec_bool_d.h" 7 | #include "../Vectorisation/VecX/vec_double.h" 8 | #include "../Vectorisation/VecX/alloc_policy.h" 9 | #include "../Vectorisation/VecX/accumulate_transform.h" 10 | #include "../Vectorisation/VecX/target_name_space.h" 11 | 12 | #include "../Vectorisation/VecX/dr3.h" 13 | #include "testNamespace.h" 14 | #include "dr3TestUtil.h" 15 | 16 | #include 17 | 18 | 19 | auto getVecBig(int SZ, std::vector& stl) 20 | { 21 | std::vector v(SZ, asNumber(1.0/3.0)); 22 | int i = 0; 23 | 24 | VecXX test(v); 25 | stl = v; 26 | return test; 27 | 28 | } 29 | 30 | 31 | void evalPrecAccumulate(int startLen, int endLen) 32 | { 33 | 34 | Numeric testEpsilon = 1e-10; 35 | 36 | for (int SZ = startLen; SZ <= endLen; SZ++) 37 | { 38 | std::vector v; 39 | VecXX test = getVecBig(SZ, v); 40 | using BINNED_ACCUMULATOR = BinsT; 41 | auto binned_Sum = reduce< BINNED_ACCUMULATOR >(test, BinnedAdd); 42 | 43 | EXPECT_NEAR(double(SZ / 3.0), binned_Sum, testEpsilon); 44 | } 45 | 46 | } 47 | 48 | TEST(TestPreciseAccumulator, simpleSummation) 49 | { 50 | EXPECT_EQ(1, 1); 51 | EXPECT_TRUE(true); 52 | 53 | //eval over multiple lengths 54 | evalPrecAccumulate(957, 1043); 55 | 56 | //eval over very small lengths 57 | evalPrecAccumulate(3, 23); 58 | 59 | } 60 | 61 | 62 | TEST(TestBin, simpleSummation2) 63 | { 64 | EXPECT_EQ(1, 1); 65 | EXPECT_TRUE(true); 66 | 67 | 68 | BinsT bin; 69 | 70 | 71 | 72 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0); 73 | EXPECT_EQ(bin.bigSummV.extract(0), 0.0); 74 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0); 75 | EXPECT_EQ(bin.tinyV.extract(0), 0.0); 76 | 77 | 78 | VecXX::INS testVal =1.0e-16; 79 | bin += testVal; 80 | 81 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0); 82 | EXPECT_EQ(bin.bigSummV.extract(0), 0.0); 83 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0); 84 | EXPECT_EQ(bin.tinyV.extract(0), 1.0e-16); 85 | 86 | bin += testVal; 87 | 88 | 
EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0); 89 | EXPECT_EQ(bin.bigSummV.extract(0), 0.0); 90 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0); 91 | EXPECT_EQ(bin.tinyV.extract(0), 2.0e-16); 92 | 93 | 94 | bin += testVal / 2; 95 | 96 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0); 97 | EXPECT_EQ(bin.bigSummV.extract(0), 0.0); 98 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0); 99 | EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16); 100 | 101 | //further tests for the other bins 102 | 103 | testVal = 1.0; 104 | 105 | bin += testVal; 106 | 107 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0); 108 | EXPECT_EQ(bin.bigSummV.extract(0), 1.0); 109 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0); 110 | EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16); 111 | 112 | bin += testVal; 113 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0); 114 | EXPECT_EQ(bin.bigSummV.extract(0), 2.0); 115 | EXPECT_EQ(bin.smallSumV.extract(0), 0.0); 116 | EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16); 117 | 118 | bin += testVal / 2; 119 | 120 | EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0); 121 | EXPECT_EQ(bin.bigSummV.extract(0), 2.0); 122 | EXPECT_EQ(bin.smallSumV.extract(0), 0.5); 123 | EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16); 124 | 125 | 126 | 127 | 128 | 129 | BinsT bin2; 130 | 131 | auto oneThird = 1.0 / 3.0; 132 | 133 | 134 | bin2 += 1.0e-3 * oneThird; 135 | 136 | 137 | EXPECT_EQ(bin2.veryBigSummV.extract(0), 0.0); 138 | EXPECT_EQ(bin2.bigSummV.extract(0), 0.0); 139 | // EXPECT_EQ(bin2.smallSumV.extract(0), 1.0/3.0 *1.0e-3); 140 | // EXPECT_EQ(bin2.tinyV.extract(0), 0.0); 141 | 142 | auto sum = bin2.hsum(); 143 | 144 | bin2 = bin2 *100000.0; 145 | 146 | sum = bin2.hsum(); 147 | 148 | /* 149 | 150 | //eval over multiple lengths 151 | evalPrecAccumulate(957, 1043); 152 | 153 | //eval over very small lengths 154 | evalPrecAccumulate(3, 23); 155 | */ 156 | } 157 | 158 | -------------------------------------------------------------------------------- /Vectorisation/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_library(Vectorisation STATIC 2 | VecX/alloc_policy.cpp 3 | VecX/vec.cpp) 4 | 5 | if (MSVC) 6 | # add_compile_options(/W4 /WX) 7 | else() 8 | target_compile_options(Vectorisation PUBLIC "-march=native") 9 | 10 | # or -mavx/-mavx2/-mavx512f (and -march= options that imply them with relevant tuning settings) 11 | #target_compile_options(Vectorisation PUBLIC "--std=c++17") 12 | #target_compile_options(Vectorisation PUBLIC "-mavx2") 13 | #target_compile_options(Vectorisation PUBLIC "-mfma") 14 | endif() 15 | 16 | -------------------------------------------------------------------------------- /Vectorisation/TextFile1.txt: -------------------------------------------------------------------------------- 1 | 2 | Vec4f a(0.0f, 0.5f, 1.0f, 1.5f); // define vector 3 | Vec4f b = sin(a); // sin function 4 | // b = (0.0000f, 0.4794f, 0.8415f, 0.9975f) 5 | 6 | -------------------------------------------------------------------------------- /Vectorisation/VCL/README.md: -------------------------------------------------------------------------------- 1 | # version2 2 | Vector Class Library, latest version 3 | 4 | This is a C++ class library for using the Single Instruction Multiple Data (SIMD) instructions to improve performance on modern microprocessors with the x86 or x86/64 instruction set on Windows, Linux, and Mac platforms. There are no plans to support ARM or other instruction sets.
5 | 6 | [Latest release](https://github.com/vectorclass/version2/releases) 7 | 8 | [Download manual](https://github.com/vectorclass/manual/raw/master/vcl_manual.pdf) 9 | 10 | [Add-on packages for particular applications](https://github.com/vectorclass/add-on) 11 | 12 | [Getting-started video.](https://www.youtube.com/watch?v=TKjYdLIMTrI) Video blogger Christopher Rose has made this nice video telling how to get started with the Vector Class Library. 13 | 14 | **Help:** You may ask for programming help on [StackOverflow](https://stackoverflow.com) using the tag vector-class-library. 15 | -------------------------------------------------------------------------------- /Vectorisation/VCL/dispatch_example1.cpp: -------------------------------------------------------------------------------- 1 | /************************* dispatch_example1.cpp *************************** 2 | Author: Agner Fog 3 | Date created: 2012-05-30 4 | Last modified: 2020-02-25 5 | Version: 2.01.00 6 | Project: vector class library 7 | 8 | Description: Example of automatic CPU dispatching. 9 | This shows how to compile vector code in multiple versions, each 10 | optimized for a different instruction set. The optimal version is 11 | selected by a dispatcher at run time. 12 | 13 | There are two examples of automatic dispatching: 14 | 15 | dispatch_example1.cpp: Uses separate function names for each version. 16 | This is useful for simple cases with one or a few functions. 17 | 18 | dispatch_example2.cpp: Uses separate namespaces for each version. 19 | This is the recommended method for cases with multiple functions, 20 | classes, objects, etc. 21 | 22 | The code has two sections: 23 | 24 | Dispatched code: This code is compiled multiple times to generate multiple instances 25 | of the compiled code, each one optimized for a different instruction set. The 26 | dispatched code section contains the speed-critical part of the program. 
27 | 28 | Common code: This code is compiled only once, using the lowest instruction set. 29 | The common code section contains the dispatcher, startup code, user interface, and 30 | other parts of the program that do not need advanced optimization. 31 | 32 | To compile this code, do as in this example: 33 | 34 | # Example of compiling dispatch example with Gnu or Clang compiler: 35 | # Compile dispatch_example1.cpp four times for different instruction sets: 36 | 37 | # Compile for AVX 38 | clang++ -O2 -m64 -mavx -std=c++17 -c dispatch_example1.cpp -od7.o 39 | 40 | # Compile for AVX2 41 | clang++ -O2 -m64 -mavx2 -mfma -std=c++17 -c dispatch_example1.cpp -od8.o 42 | 43 | # Compile for AVX512 44 | clang++ -O2 -m64 -mavx512f -mfma -mavx512vl -mavx512bw -mavx512dq -std=c++17 -c dispatch_example1.cpp -od10.o 45 | 46 | # The last compilation uses the lowest supported instruction set (SSE2) 47 | # This includes the main program, and links all versions together: 48 | # (Change test.exe to test in Linux and Mac) 49 | clang++ -O2 -m64 -msse2 -std=c++17 dispatch_example1.cpp instrset_detect.cpp d7.o d8.o d10.o -otest.exe 50 | 51 | # Run the program 52 | ./test.exe 53 | 54 | (c) Copyright 2012-2020 Agner Fog. 55 | Apache License version 2.0 or later. 56 | ******************************************************************************/ 57 | 58 | /* The different instruction sets are defined in instrset_detect.cpp: 59 | 2: SSE2 60 | 3: SSE3 61 | 4: SSSE3 (Supplementary SSE3) 62 | 5: SSE4.1 63 | 6: SSE4.2 64 | 7: AVX 65 | 8: AVX2 66 | 9: AVX512F 67 | 10: AVX512VL + AVX512BW + AVX512DQ 68 | */ 69 | 70 | 71 | #include 72 | #include "vectorclass.h" 73 | 74 | // Define function type 75 | // Change this to fit the entry function. 
Should not contain vector types: 76 | typedef float MyFuncType(float const []); 77 | 78 | // function prototypes for each version 79 | MyFuncType myfunc_SSE2, myfunc_AVX, myfunc_AVX2, myfunc_AVX512; 80 | 81 | // function prototypes for common entry point and dispatcher 82 | MyFuncType myfunc, myfunc_dispatch; 83 | 84 | // Define name of entry function depending on which instruction set we compile for 85 | #if INSTRSET >= 10 // AVX512VL 86 | #define FUNCNAME myfunc_AVX512 87 | #elif INSTRSET >= 8 // AVX2 88 | #define FUNCNAME myfunc_AVX2 89 | #elif INSTRSET >= 7 // AVX 90 | #define FUNCNAME myfunc_AVX 91 | #elif INSTRSET == 2 92 | #define FUNCNAME myfunc_SSE2 // SSE2 93 | #else 94 | #error Unsupported instruction set 95 | #endif 96 | 97 | /****************************************************************************** 98 | Dispatched code 99 | 100 | Everything in this section is compiled multiple times, with one version for 101 | each instruction set. Speed-critical vector code belongs here. 102 | ******************************************************************************/ 103 | 104 | // This is the dispatched function that is compiled in multiple versions with different names. 105 | // Make sure this function is static to prevent clash with other versions having the same name. 106 | // The function cannot be member of a class. 107 | static float sum (float const f[]) { 108 | // This example adds 16 floats 109 | Vec16f a; // vector of 16 floats 110 | a.load(f); // load array into vector 111 | return horizontal_add(a); // return sum of 16 elements 112 | } 113 | 114 | // ----------------------------------------------------------------------------- 115 | // Entry function 116 | // ----------------------------------------------------------------------------- 117 | // This is the entry function that is accessed through the dispatcher. 118 | // This serves as the interface between the common code and the dispatched code. 
119 | // The entry function cannot be member of a class. 120 | // The entry function must use arrays rather than vectors for input and output. 121 | float FUNCNAME (float const f[]) { 122 | return sum(f); 123 | } 124 | 125 | 126 | /********************************************************************************** 127 | Common code 128 | 129 | Everything in this section is compiled only once, using the lowest instruction set. 130 | 131 | The dispatcher must be placed here. Program main(), user interface, and other 132 | less critical parts of the code are also placed in the common code section. 133 | **********************************************************************************/ 134 | 135 | #if INSTRSET == 2 136 | // The common code is only included in the lowest of the compiled versions 137 | 138 | 139 | // --------------------------------------------------------------------------------- 140 | // Dispatcher 141 | // --------------------------------------------------------------------------------- 142 | // This function pointer initially points to the dispatcher. 143 | // After the first call, it points to the selected version of the entry function 144 | MyFuncType * myfunc_pointer = &myfunc_dispatch; // function pointer 145 | 146 | // Dispatcher 147 | float myfunc_dispatch(float const f[]) { 148 | int iset = instrset_detect(); // Detect supported instruction set 149 | // Choose which version of the entry function we want to point to: 150 | if (iset >= 10) myfunc_pointer = &myfunc_AVX512; // AVX512 version 151 | else if (iset >= 8) myfunc_pointer = &myfunc_AVX2; // AVX2 version 152 | else if (iset >= 7) myfunc_pointer = &myfunc_AVX; // AVX version 153 | else if (iset >= 2) myfunc_pointer = &myfunc_SSE2; // SSE2 version 154 | else { 155 | // Error: lowest instruction set not supported.
156 | // Put any appropriate error handler here 157 | fprintf(stderr, "\nError: Instruction set SSE2 not supported on this computer"); 158 | return 0.f; 159 | } 160 | // continue in dispatched version of the function 161 | return (*myfunc_pointer)(f); 162 | } 163 | 164 | 165 | // Call the entry function through the function pointer. 166 | // The first time this function is called, it goes through the dispatcher. 167 | // The dispatcher will change the function pointer so that all subsequent 168 | // calls go directly to the optimal version of the entry function 169 | inline float myfunc(float const f[]) { 170 | return (*myfunc_pointer)(f); // go to dispatched version 171 | } 172 | 173 | 174 | // --------------------------------------------------------------------------------- 175 | // Program main 176 | // --------------------------------------------------------------------------------- 177 | int main() { 178 | 179 | // array of 16 floats 180 | float const a[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; 181 | 182 | float sum = myfunc(a); // call function with dispatching 183 | 184 | printf("\nsum = %8.2f \n", sum); // print result (= 136.00) 185 | 186 | return 0; 187 | } 188 | 189 | #endif // INSTRSET == 2 190 | -------------------------------------------------------------------------------- /Vectorisation/VCL/instrset_detect.cpp: -------------------------------------------------------------------------------- 1 | /************************** instrset_detect.cpp **************************** 2 | * Author: Agner Fog 3 | * Date created: 2012-05-30 4 | * Last modified: 2019-08-01 5 | * Version: 2.00.00 6 | * Project: vector class library 7 | * Description: 8 | * Functions for checking which instruction sets are supported. 9 | * 10 | * (c) Copyright 2012-2019 Agner Fog. 11 | * Apache License version 2.0 or later. 
12 | ******************************************************************************/ 13 | 14 | #include "instrset.h" 15 | 16 | #ifdef VCL_NAMESPACE 17 | namespace VCL_NAMESPACE { 18 | #endif 19 | 20 | 21 | // Define interface to xgetbv instruction 22 | static inline uint64_t xgetbv (int ctr) { 23 | #if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) 24 | // Microsoft or Intel compiler supporting _xgetbv intrinsic 25 | 26 | return uint64_t(_xgetbv(ctr)); // intrinsic function for XGETBV 27 | 28 | #elif defined(__GNUC__) || defined (__clang__) // use inline assembly, Gnu/AT&T syntax 29 | 30 | uint32_t a, d; 31 | __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : ); 32 | return a | (uint64_t(d) << 32); 33 | 34 | #else // #elif defined (_WIN32) // other compiler. try inline assembly with masm/intel/MS syntax 35 | uint32_t a, d; 36 | __asm { 37 | mov ecx, ctr 38 | _emit 0x0f 39 | _emit 0x01 40 | _emit 0xd0 ; // xgetbv 41 | mov a, eax 42 | mov d, edx 43 | } 44 | return a | (uint64_t(d) << 32); 45 | 46 | #endif 47 | } 48 | 49 | /* find supported instruction set 50 | return value: 51 | 0 = 80386 instruction set 52 | 1 or above = SSE (XMM) supported by CPU (not testing for OS support) 53 | 2 or above = SSE2 54 | 3 or above = SSE3 55 | 4 or above = Supplementary SSE3 (SSSE3) 56 | 5 or above = SSE4.1 57 | 6 or above = SSE4.2 58 | 7 or above = AVX supported by CPU and operating system 59 | 8 or above = AVX2 60 | 9 or above = AVX512F 61 | 10 or above = AVX512VL, AVX512BW, AVX512DQ 62 | */ 63 | int instrset_detect(void) { 64 | 65 | static int iset = -1; // remember value for next call 66 | if (iset >= 0) { 67 | return iset; // called before 68 | } 69 | iset = 0; // default value 70 | int abcd[4] = {0,0,0,0}; // cpuid results 71 | cpuid(abcd, 0); // call cpuid function 0 72 | if (abcd[0] == 0) return iset; // no further cpuid function supported 73 | cpuid(abcd, 1); // call cpuid function 1 for feature flags 74 | if 
((abcd[3] & (1 << 0)) == 0) return iset; // no floating point 75 | if ((abcd[3] & (1 << 23)) == 0) return iset; // no MMX 76 | if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move 77 | if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE 78 | if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE 79 | iset = 1; // 1: SSE supported 80 | if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2 81 | iset = 2; // 2: SSE2 supported 82 | if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3 83 | iset = 3; // 3: SSE3 supported 84 | if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3 85 | iset = 4; // 4: SSSE3 supported 86 | if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1 87 | iset = 5; // 5: SSE4.1 supported 88 | if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT 89 | if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2 90 | iset = 6; // 6: SSE4.2 supported 91 | if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE 92 | if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S. 
93 | if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX 94 | iset = 7; // 7: AVX supported 95 | cpuid(abcd, 7); // call cpuid leaf 7 for feature flags 96 | if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2 97 | iset = 8; 98 | if ((abcd[1] & (1 << 16)) == 0) return iset; // no AVX512 99 | cpuid(abcd, 0xD); // call cpuid leaf 0xD for feature flags 100 | if ((abcd[0] & 0x60) != 0x60) return iset; // no AVX512 101 | iset = 9; 102 | cpuid(abcd, 7); // call cpuid leaf 7 for feature flags 103 | if ((abcd[1] & (1 << 31)) == 0) return iset; // no AVX512VL 104 | if ((abcd[1] & 0x40020000) != 0x40020000) return iset; // no AVX512BW, AVX512DQ 105 | iset = 10; 106 | return iset; 107 | } 108 | 109 | // detect if CPU supports the FMA3 instruction set 110 | bool hasFMA3(void) { 111 | if (instrset_detect() < 7) return false; // must have AVX 112 | int abcd[4]; // cpuid results 113 | cpuid(abcd, 1); // call cpuid function 1 114 | return ((abcd[2] & (1 << 12)) != 0); // ecx bit 12 indicates FMA3 115 | } 116 | 117 | // detect if CPU supports the FMA4 instruction set 118 | bool hasFMA4(void) { 119 | if (instrset_detect() < 7) return false; // must have AVX 120 | int abcd[4]; // cpuid results 121 | cpuid(abcd, 0x80000001); // call cpuid function 0x80000001 122 | return ((abcd[2] & (1 << 16)) != 0); // ecx bit 16 indicates FMA4 123 | } 124 | 125 | // detect if CPU supports the XOP instruction set 126 | bool hasXOP(void) { 127 | if (instrset_detect() < 7) return false; // must have AVX 128 | int abcd[4]; // cpuid results 129 | cpuid(abcd, 0x80000001); // call cpuid function 0x80000001 130 | return ((abcd[2] & (1 << 11)) != 0); // ecx bit 11 indicates XOP 131 | } 132 | 133 | // detect if CPU supports the F16C instruction set 134 | bool hasF16C(void) { 135 | if (instrset_detect() < 7) return false; // must have AVX 136 | int abcd[4]; // cpuid results 137 | cpuid(abcd, 1); // call cpuid function 1 138 | return ((abcd[2] & (1 << 29)) != 0); // ecx bit 29 indicates F16C 139 | } 140 | 
141 | // detect if CPU supports the AVX512ER instruction set 142 | bool hasAVX512ER(void) { 143 | if (instrset_detect() < 9) return false; // must have AVX512F 144 | int abcd[4]; // cpuid results 145 | cpuid(abcd, 7); // call cpuid function 7 146 | return ((abcd[1] & (1 << 27)) != 0); // ebx bit 27 indicates AVX512ER 147 | } 148 | 149 | // detect if CPU supports the AVX512VBMI instruction set 150 | bool hasAVX512VBMI(void) { 151 | if (instrset_detect() < 10) return false; // must have AVX512BW 152 | int abcd[4]; // cpuid results 153 | cpuid(abcd, 7); // call cpuid function 7 154 | return ((abcd[2] & (1 << 1)) != 0); // ecx bit 1 indicates AVX512VBMI 155 | } 156 | 157 | // detect if CPU supports the AVX512VBMI2 instruction set 158 | bool hasAVX512VBMI2(void) { 159 | if (instrset_detect() < 10) return false; // must have AVX512BW 160 | int abcd[4]; // cpuid results 161 | cpuid(abcd, 7); // call cpuid function 7 162 | return ((abcd[2] & (1 << 6)) != 0); // ecx bit 6 indicates AVX512VBMI2 163 | } 164 | 165 | #ifdef VCL_NAMESPACE 166 | } 167 | #endif 168 | -------------------------------------------------------------------------------- /Vectorisation/VCL/vectorclass.h: -------------------------------------------------------------------------------- 1 | /**************************** vectorclass.h ******************************** 2 | * Author: Agner Fog 3 | * Date created: 2012-05-30 4 | * Last modified: 2020-04-11 5 | * Version: 2.01.02 6 | * Project: vector class library 7 | * Home: https://github.com/vectorclass 8 | * Description: 9 | * Header file defining vector classes as interface to intrinsic functions 10 | * in x86 and x86-64 microprocessors with SSE2 and later instruction sets. 11 | * 12 | * Instructions: 13 | * Use Gnu, Clang, Intel or Microsoft C++ compiler. Compile for the desired 14 | * instruction set, which must be at least SSE2. Specify the supported 15 | * instruction set by a command line define, e.g. 
__SSE4_1__ if the 16 | * compiler does not automatically do so. 17 | * For detailed instructions, see vcl_manual.pdf 18 | * 19 | * Each vector object is represented internally in the CPU as a vector 20 | * register with 128, 256 or 512 bits. 21 | * 22 | * This header file includes the appropriate header files depending on the 23 | * selected instruction set. 24 | * 25 | * (c) Copyright 2012-2020 Agner Fog. 26 | * Apache License version 2.0 or later. 27 | ******************************************************************************/ 28 | #ifndef VECTORCLASS_H 29 | #define VECTORCLASS_H 20102 30 | 31 | // Maximum vector size, bits. Allowed values are 128, 256, 512 32 | #ifndef MAX_VECTOR_SIZE 33 | #define MAX_VECTOR_SIZE 512 34 | #endif 35 | 36 | // Determine instruction set, and define platform-dependent functions 37 | #include "instrset.h" // Select supported instruction set 38 | 39 | #if INSTRSET < 2 // instruction set SSE2 is the minimum 40 | #error Please compile for the SSE2 instruction set or higher 41 | #else 42 | 43 | // Select appropriate .h files depending on instruction set 44 | #include "vectori128.h" // 128-bit integer vectors 45 | #include "vectorf128.h" // 128-bit floating point vectors 46 | 47 | #if MAX_VECTOR_SIZE >= 256 48 | #if INSTRSET >= 8 49 | #include "vectori256.h" // 256-bit integer vectors, requires AVX2 instruction set 50 | #else 51 | #include "vectori256e.h" // 256-bit integer vectors, emulated 52 | #endif // INSTRSET >= 8 53 | #if INSTRSET >= 7 54 | #include "vectorf256.h" // 256-bit floating point vectors, requires AVX instruction set 55 | #else 56 | #include "vectorf256e.h" // 256-bit floating point vectors, emulated 57 | #endif // INSTRSET >= 7 58 | #endif // MAX_VECTOR_SIZE >= 256 59 | 60 | #if MAX_VECTOR_SIZE >= 512 61 | #if INSTRSET >= 9 62 | #include "vectori512.h" // 512-bit vectors of 32 and 64 bit integers, requires AVX512F instruction set 63 | #include "vectorf512.h" // 512-bit floating point vectors, requires AVX512F 
instruction set 64 | #else 65 | #include "vectori512e.h" // 512-bit integer vectors, emulated 66 | #include "vectorf512e.h" // 512-bit floating point vectors, emulated 67 | #endif // INSTRSET >= 9 68 | #if INSTRSET >= 10 69 | #include "vectori512s.h" // 512-bit vectors of 8 and 16 bit integers, requires AVX512BW instruction set 70 | #else 71 | #include "vectori512se.h" // 512-bit vectors of 8 and 16 bit integers, emulated 72 | #endif 73 | #endif // MAX_VECTOR_SIZE >= 512 74 | 75 | #include "vector_convert.h" // conversion between different vector sizes 76 | 77 | #endif // INSTRSET >= 2 78 | 79 | 80 | #else // VECTORCLASS_H 81 | 82 | #if VECTORCLASS_H < 20000 83 | #error Mixed versions of vector class library 84 | #endif 85 | 86 | #endif // VECTORCLASS_H 87 | -------------------------------------------------------------------------------- /Vectorisation/VecX/alloc_policy.cpp: -------------------------------------------------------------------------------- 1 | /**************************** alloc_policy.cpp ******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 
11 | *****************************************************************************/ 12 | #include "alloc_policy.h" 13 | #include "alloc_policy_imp.h" 14 | #include 15 | 16 | template<> 17 | int AllAllocators::lastSize_N = -1; 18 | template<> 19 | int AllAllocators::lastSize_N = -1; 20 | template<> 21 | int AllAllocators::lastSize_N = -1; 22 | template<> 23 | int AllAllocators::lastSize_N = -1; 24 | 25 | template<> 26 | AllocPolicy* AllAllocators::pAllocPolicy = nullptr; 27 | template<> 28 | AllocPolicy* AllAllocators::pAllocPolicy = nullptr; 29 | template<> 30 | AllocPolicy* AllAllocators::pAllocPolicy = nullptr; 31 | template<> 32 | AllocPolicy* AllAllocators::pAllocPolicy = nullptr; 33 | template<> 34 | std::unordered_map*> AllAllocators::m_map_sizeToAllocPolicy = std::unordered_map*>(); 35 | template<> 36 | std::unordered_map*> AllAllocators::m_map_sizeToAllocPolicy = std::unordered_map*>(); 37 | template<> 38 | std::unordered_map*> AllAllocators::m_map_sizeToAllocPolicy = std::unordered_map*>(); 39 | template<> 40 | std::unordered_map*> AllAllocators::m_map_sizeToAllocPolicy = std::unordered_map*>(); 41 | 42 | 43 | 44 | void freePool(size_t N, long double* pOld) 45 | { 46 | return freeT(N, pOld); 47 | } 48 | 49 | 50 | void freePool(size_t N, double* pOld) 51 | { 52 | return freeT(N, pOld); 53 | } 54 | 55 | void freePool(size_t N, float* pOld) 56 | { 57 | return freeT(N, pOld); 58 | } 59 | 60 | void freePool(size_t N, unsigned int* pOld) 61 | { 62 | return freeT(N, pOld); 63 | } 64 | 65 | void allocPool(size_t& N, long double*& pMem) 66 | { 67 | allocT(N, pMem); 68 | } 69 | 70 | void allocPool(size_t& N, double*& pMem) 71 | { 72 | allocT(N, pMem); 73 | } 74 | 75 | void allocPool(size_t& N, float*& pMem) 76 | { 77 | allocT(N, pMem); 78 | } 79 | 80 | void allocPool(size_t& N, unsigned int*& pMem) 81 | { 82 | allocT(N, pMem); 83 | } 84 | 85 | int getAllignedSize(size_t N, long double* pOld) 86 | { 87 | return getAllignedSizeT(N, pOld); 88 | } 89 | 90 | int 
getAllignedSize(size_t N, double* pOld) 91 | { 92 | return getAllignedSizeT(N, pOld); 93 | } 94 | 95 | int getAllignedSize(size_t N, float* pOld) 96 | { 97 | return getAllignedSizeT(N, pOld); 98 | } 99 | 100 | int getAllignedSize(size_t N, unsigned int* pOld) 101 | { 102 | return getAllignedSizeT(N, pOld); 103 | } 104 | void freeAllAllocators(long double) 105 | { 106 | AllAllocators::freeAll(); 107 | } 108 | void freeAllAllocators(double) 109 | { 110 | AllAllocators::freeAll(); 111 | } 112 | void freeAllAllocators(float) 113 | { 114 | AllAllocators::freeAll(); 115 | } 116 | void freeAllAllocators(unsigned int) 117 | { 118 | AllAllocators::freeAll(); 119 | } -------------------------------------------------------------------------------- /Vectorisation/VecX/alloc_policy.h: -------------------------------------------------------------------------------- 1 | /**************************** alloc_policy.h ******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 
#pragma once

#include <cstddef> // size_t. NOTE(review): original header name was lost in extraction.

// Return a buffer of N elements, previously obtained from allocPool, to the
// pool that serves its element type.
void freePool(size_t N, long double* pOld);
void freePool(size_t N, double* pOld);
void freePool(size_t N, float* pOld);
void freePool(size_t N, unsigned int* pOld);

// Obtain a pooled buffer. N and pMem are both in/out: the definitions forward
// to allocT, which overwrites N with the padded element count and pMem with
// the buffer address.
void allocPool(size_t& N, long double*& pMem);
void allocPool(size_t& N, double*& pMem);
void allocPool(size_t& N, float*& pMem);
void allocPool(size_t& N, unsigned int*& pMem);

// Padded element count for N elements of the pointed-to type; the pointer is
// used for overload selection only.
int getAllignedSize(size_t N, long double* pOld);
int getAllignedSize(size_t N, double* pOld);
int getAllignedSize(size_t N, float* pOld);
int getAllignedSize(size_t N, unsigned int* pOld);
#pragma once
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>


//need a function to reduce size pools to a minimum
// get rid of magic numbers of byte sizes etc

const int BytesOnCacheLine = 64;
const long MemPoolInitialIncrement = 16;
const long MemPoolScaleFactor = 2;
const int ByteAllignment = 64;

/**
 * Stack-like pool of equally sized buffers of T.
 *
 * Memory is obtained in slabs (one std::vector<T> per slab); each slab is cut
 * into buffers of m_vecSize elements whose first element is ByteAllignment
 * aligned. m_memPool lists every buffer; entries [0, m_pos) are handed out,
 * entries [m_pos, m_sz) are available.
 */
template <typename T>
class PoolStrat
{
public:

    PoolStrat(const PoolStrat&) = delete;
    PoolStrat& operator=(const PoolStrat&) = delete;
    PoolStrat& operator=(PoolStrat&&) = delete;
    PoolStrat(PoolStrat&&) = delete;

    /// Creates the pool with an initial slab of MemPoolInitialIncrement buffers.
    /// @param vecSz number of T elements in each buffer
    explicit PoolStrat(int vecSz) : m_vecSize(vecSz)
    {
        addToPool(static_cast<int>(m_incrementSize));
    }

    ~PoolStrat()
    {
        for (auto* slab : m_allocatedVecs)
        {
            delete slab;
        }
    }

    /// Hands out the next available buffer, growing the pool first when needed.
    /// The slab added on growth doubles in size each time.
    T* alloc()
    {
        // The pool grows while one slot is still unused, so m_pos always
        // indexes a valid entry of m_memPool.
        if (m_pos >= (m_sz - 1))
        {
            m_incrementSize *= MemPoolScaleFactor;
            addToPool(static_cast<int>(m_incrementSize));
        }
        return m_memPool[m_pos++];
    }

    /// Returns a buffer to the pool.
    /// Null pointers and pointers that are not currently handed out are ignored.
    void free(T* pToFree)
    {
        if ((m_pos <= 0) || (nullptr == pToFree))
        {
            return;
        }

        // Typical case: buffers are released in reverse order of allocation.
        const long top = m_pos - 1;
        if (m_memPool[top] == pToFree)
        {
            m_pos = top;
            return;
        }

        // Out-of-order release: look only at the handed-out entries below the
        // top. Slot m_pos and above are already available, so matching one of
        // them would be a double free and must not release a live buffer.
        const auto inUseBegin = m_memPool.begin();
        const auto inUseEnd = inUseBegin + m_pos;
        const auto hit = std::find(inUseBegin, inUseEnd - 1, pToFree);
        if (hit == inUseEnd - 1)
        {
            return;
        }

        // Move the released entry to the top of the in-use range, keeping the
        // relative order of the others, then drop it from that range.
        std::rotate(hit, hit + 1, inUseEnd);
        --m_pos;
    }

    /// Allocates one more slab and registers numElements aligned buffers from it.
    void addToPool(int numElements)
    {
        //m_vecSize for double 64 byte align ie cache line
        const std::size_t offsetAlgn = ByteAllignment;
        // offsetAlgn extra elements leave room to advance to an aligned start.
        const std::size_t slabElements = static_cast<std::size_t>(numElements) * m_vecSize + offsetAlgn;

        auto slab = std::make_unique<std::vector<T>>(slabElements);
        m_allocatedVecs.push_back(slab.get());
        std::vector<T>* pVecsMem = slab.release(); // now owned through m_allocatedVecs

        T* pstrtPt = pVecsMem->data();
        while ((reinterpret_cast<std::uintptr_t>(pstrtPt)) % offsetAlgn) pstrtPt++;

        m_memPool.reserve(m_memPool.size() + static_cast<std::size_t>(numElements));
        for (int i = 0; i < numElements; i++)
        {
            m_memPool.push_back(pstrtPt);
            pstrtPt += m_vecSize;
        }

        m_sz += numElements;
    }

    /// Number of buffers currently handed out.
    inline long pos() const
    {
        return m_pos;
    }

    /// Total number of buffers registered in the pool.
    inline long size() const
    {
        return m_sz;
    }

    /// The slabs backing the pool, in allocation order.
    const std::vector<std::vector<T>* >& getAllocVecs() const
    {
        return m_allocatedVecs;
    }

private:
    long m_pos = 0;
    long m_sz = 0;
    std::vector<T*> m_memPool;
    long m_incrementSize = MemPoolInitialIncrement; // next number of vectors for allocation
    long m_vecSize; //size of element vector considering allignment and padding
    std::vector<std::vector<T>* > m_allocatedVecs;

};


//////////////////////////////////////////


/**
 * Owns the PoolStrat that serves buffers of one fixed size.
 * Not copyable: the pool is held through a unique_ptr.
 */
template <typename T>
class AllocPolicy
{
    int m_vec_size;
    std::unique_ptr<PoolStrat<T>> m_pool;
public:
    int size() const
    {
        return m_vec_size;
    }

    AllocPolicy(int size) : m_vec_size(size), m_pool(std::make_unique<PoolStrat<T>>(size))
    {
    }

    inline T* alloc()
    {
        return m_pool->alloc();
    }

    inline void free(T* pElement)
    {
        m_pool->free(pElement);
    }

};


/**
 * Per-element-type registry mapping a buffer size to its AllocPolicy.
 *
 * lastSize_N / pAllocPolicy cache the most recently used policy so repeated
 * requests for the same size skip the map lookup. The static members are
 * declared here only; they must be defined for every T that is used.
 */
template <typename T>
class AllAllocators
{
    static int lastSize_N;
    static AllocPolicy<T>* pAllocPolicy;
    static std::unordered_map<int, AllocPolicy<T>*> m_map_sizeToAllocPolicy;

    /// Makes the policy for size_N the cached one, creating it on first use.
    static void setUpPolicy(int size_N)
    {
        auto itr = m_map_sizeToAllocPolicy.find(size_N);
        if (m_map_sizeToAllocPolicy.end() == itr)
        {
            auto policy = std::make_unique<AllocPolicy<T>>(size_N);
            itr = m_map_sizeToAllocPolicy.emplace(size_N, policy.get()).first;
            policy.release(); // the map entry owns it from here on
        }
        pAllocPolicy = itr->second;
        lastSize_N = size_N;
    }

    /// Drops the cached policy so no deleted object can be reached through it.
    static void resetCache()
    {
        pAllocPolicy = nullptr;
        lastSize_N = -1;
    }

public:

    /// Destroys the pool for size_N, if there is one.
    static void removePolicy(int size_N)
    {
        auto itr = m_map_sizeToAllocPolicy.find(size_N);
        if (m_map_sizeToAllocPolicy.end() == itr)
        {
            return;
        }

        if (lastSize_N == size_N)
        {
            resetCache();
        }
        delete itr->second;
        m_map_sizeToAllocPolicy.erase(itr);
    }

    /// Destroys every pool of this element type.
    static void freeAll()
    {
        resetCache();
        for (auto& item : m_map_sizeToAllocPolicy)
        {
            delete item.second;
        }
        m_map_sizeToAllocPolicy.clear();
    }

    /// Returns a buffer of size_N elements from the matching pool.
    static T* alloc(int size_N)
    {
        if (lastSize_N != size_N)
        {
            setUpPolicy(size_N);
        }
        return pAllocPolicy->alloc();
    }

    /// Gives pMem back to the pool for size_N; a size with no pool is ignored.
    static void free(size_t size_N, T* pMem)
    {
        const int sz_N = static_cast<int>(size_N);

        if (lastSize_N != sz_N)
        {
            const auto itr = m_map_sizeToAllocPolicy.find(sz_N);
            if (m_map_sizeToAllocPolicy.end() == itr)
            {
                return; // nothing was ever allocated with this size
            }
            pAllocPolicy = itr->second;
            lastSize_N = sz_N;
        }
        pAllocPolicy->free(pMem);
    }

};

/// Number of T elements that fit in BytesOnCacheLine bytes.
template <typename T>
struct NumOnCacheLine
{
    static inline int size()
    {
        return static_cast<int>(BytesOnCacheLine / sizeof(T));
    }
};


/// Rounds N up to a multiple of NumOnCacheLine<T>::size().
/// The pointer argument only carries the element type.
template <typename T>
int getAllignedSizeT(size_t N, T*)
{
    const int M = NumOnCacheLine<T>::size();
    if (M <= 1)
    {
        // Element as large as (or larger than) a cache line: nothing to pad,
        // and M == 0 must not reach the modulo below.
        return static_cast<int>(N);
    }
    const size_t perLine = static_cast<size_t>(M);
    const size_t res = (N % perLine == 0) ? N : (N / perLine + 1) * perLine;
    return static_cast<int>(res);
}


/// Pads N in place and stores a pooled buffer of that size in pMem.
template <typename T>
void allocT(size_t& N, T*& pMem)
{
    // Calls the template directly so this header does not depend on the
    // per-type getAllignedSize overloads having been declared first.
    const int n = getAllignedSizeT(N, pMem);
    N = static_cast<size_t>(n);
    pMem = AllAllocators<T>::alloc(n);
}

/// Returns pOld (a buffer of N elements) to its pool.
template <typename T>
void freeT(size_t N, T* pOld)
{
    //find element and mark as unused
    AllAllocators<T>::free(N, pOld);
}

void freeAllAllocators(long double);
void freeAllAllocators(double);
void freeAllAllocators(float);
void freeAllAllocators(unsigned int);


/// Scope guard: destroys all pools of element type T when it goes out of scope.
template <typename T>
class AllAllocatorsGuard
{
public:
    ~AllAllocatorsGuard()
    {
        freeAllAllocators(T());
    }

};
11 | *****************************************************************************/ 12 | #pragma once 13 | #include "vec.h" 14 | #include "vec_double.h" 15 | #include "instruction_traits.h" 16 | #include "boolean_operations.h" 17 | #include "accumulate_transform.h" 18 | #include "binary_unitary_operations.h" 19 | #include "math_ops.h" 20 | #include "filter_select.h" 21 | #include "conditional_select_eval.h" 22 | #include "vec_view.h" 23 | #include "vcl_latest.h" 24 | 25 | #include 26 | 27 | 28 | 29 | template 30 | static INS_VEC cdfnormD(INS_VEC x) 31 | { 32 | 33 | auto asNumber = [](auto x) 34 | { 35 | return static_cast::FloatType>(x); 36 | }; 37 | 38 | // https://mathworld.wolfram.com/Erfc.html 39 | constexpr typename InstructionTraits::FloatType invRootPi = asNumber(0.564189583547756); 40 | constexpr typename InstructionTraits::FloatType invRootTwo =asNumber( 0.707106781186548); 41 | return invRootTwo * invRootPi*exp(-0.5*x*x); 42 | } 43 | 44 | 45 | /**/ 46 | template 47 | static INS_VEC cdfnorm(const INS_VEC& z) 48 | { 49 | 50 | auto asNumber = [](auto x) 51 | { 52 | return static_cast::FloatType>(x); 53 | }; 54 | 55 | auto asInsVec = [&](auto x){ return INS_VEC(asNumber(x) ); }; 56 | 57 | 58 | // https://mathworld.wolfram.com/Erfc.html 59 | INS_VEC b1 = asInsVec(0.31938153); 60 | INS_VEC b2 = asInsVec(-0.356563782); 61 | INS_VEC b3 = asInsVec(1.781477937); 62 | INS_VEC b4 = asInsVec(-1.821255978); 63 | INS_VEC b5 = asInsVec(1.330274429); 64 | INS_VEC p = asInsVec(0.2316419); 65 | INS_VEC c2 = asInsVec(0.3989423); 66 | 67 | // const auto cond1 = (z > asInsVec(6.0)); 68 | // INS_VEC x = select(cond1, asInsVec(1.0), z); 69 | // x = x; 70 | 71 | // INS_VEC y = select( (z < asInsVec(-6.0)),asInsVec(0.0), z); 72 | // y = y; 73 | INS_VEC a = abs(z); 74 | INS_VEC t = asInsVec(1.0) / (asInsVec(1.0) + a*p); 75 | INS_VEC b = c2*exp((-z)*(z / asInsVec(2.0))); 76 | INS_VEC n = ((((b5*t + b4)*t + b3)*t + b2)*t + b1)*t; 77 | n = asInsVec(1.0) - b*n; 78 | n = select( (z < 
asInsVec(0.0) ), asInsVec(1.0) - n,n); 79 | return n; 80 | } 81 | 82 | 83 | 84 | template 85 | Vec cdfnorm(const Vec& X) 86 | { 87 | using FLOAT = typename InstructionTraits::FloatType; 88 | 89 | auto asNumber = []( auto x) constexpr 90 | { 91 | return static_cast(x); 92 | }; 93 | 94 | auto centralLambda = [&](auto z) 95 | { 96 | 97 | constexpr FLOAT N[] = { FLOAT(3.52624965998911e-02) , FLOAT(0.700383064443688), FLOAT(6.37396220353165), FLOAT(33.912866078383), FLOAT(112.079291497871), FLOAT(221.213596169931), FLOAT(220.206867912376) }; 98 | constexpr FLOAT M[] = { FLOAT(8.83883476483184e-02), FLOAT(1.75566716318264), FLOAT(16.064177579207), FLOAT(86.7807322029461) , FLOAT(296.564248779674), FLOAT(637.333633378831), FLOAT(793.826512519948),FLOAT(440.413735824752) }; 99 | 100 | auto inv_dc = 1.0 / mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(M[0], z, M[1]), z, M[2]), z, M[3]), z, M[4]), z, M[5]), z, M[6]), z, M[7]); 101 | auto n_c = mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(N[0], z, N[1]), z, N[2]), z, N[3]), z, N[4]), z, N[5]), z, N[6]); 102 | 103 | return n_c * inv_dc; 104 | }; 105 | 106 | 107 | auto outerLambda = [&](auto z) 108 | { 109 | constexpr FLOAT inv_RT2PI(0.39894228040143267793994605993438); 110 | constexpr FLOAT d[] = { FLOAT(20.) , FLOAT(13.), FLOAT(200.), FLOAT(78.), FLOAT(300.), FLOAT(39.) }; 111 | constexpr FLOAT n[] = { FLOAT(20.), FLOAT(13.), FLOAT(180.), FLOAT(65.), FLOAT(160.) 
}; 112 | 113 | auto d_outer = mul_add(mul_add(mul_add(mul_add(mul_add((d[0] * z), z, d[1]), z, d[2]), z, d[3]), z, d[4]), z, d[5]); 114 | auto inv_d_outer = inv_RT2PI / d_outer; 115 | 116 | auto n_outer = mul_add(mul_add(mul_add(mul_add((n[0] * z), z, n[1]), z, n[2]), z, n[3]), z, n[4]); 117 | return n_outer * inv_d_outer; 118 | }; 119 | 120 | 121 | 122 | auto onePass = [=](auto x) 123 | { 124 | auto z = abs(x); 125 | auto e = exp(-z * z * asNumber(0.5) ); 126 | auto central = centralLambda(z); 127 | auto SPLIT = asNumber(7.42);// 7106781186547; // appears to give less error 128 | auto condAllDone = (x * x < SPLIT* SPLIT); 129 | 130 | if (horizontal_and(condAllDone)) 131 | { 132 | central *= e; 133 | return select(x <= asNumber(0.0), central, asNumber(1.0) - central); 134 | } 135 | 136 | auto outer = outerLambda(z); 137 | auto RES = select((z < SPLIT), central, outer); 138 | RES *= e; 139 | return select(x <= asNumber(0.0), RES, asNumber(1.0) - RES); 140 | 141 | }; 142 | 143 | return ApplyTransformUR_X(X, onePass); 144 | 145 | } 146 | 147 | 148 | 149 | template 150 | VecD cdfnorm(const VecD& rhs) 151 | { 152 | return VecD(cdfnorm(rhs.value()), rhs.derivative()*cdfnormD(rhs.value())); 153 | } 154 | 155 | //to do replace with WS 16 digit impl 156 | template 157 | Vec cdfnorminv(const Vec& X) 158 | { 159 | 160 | auto asNumber = [](auto x) constexpr 161 | { 162 | return static_cast::FloatType>(x); 163 | }; 164 | 165 | 166 | /// acklams inverse cdf normal 167 | static typename InstructionTraits::FloatType a[] = { asNumber(0.0), asNumber( -3.969683028665376e+01), asNumber(2.209460984245205e+02), asNumber(-2.759285104469687e+02), asNumber(1.383577518672690e+02), asNumber(-3.066479806614716e+01) , asNumber(2.506628277459239e+00)}; 168 | static typename InstructionTraits::FloatType b[] = { asNumber(0.0), asNumber(-5.447609879822406e+01), asNumber(1.615858368580409e+02), asNumber(-1.556989798598866e+02), asNumber(6.680131188771972e+01), asNumber(-1.328068155288572e+01) }; 
169 | static typename InstructionTraits::FloatType c[] = { asNumber(0.0), asNumber(-7.784894002430293e-03), asNumber(-3.223964580411365e-01), asNumber(-2.400758277161838e+00), asNumber(-2.549732539343734e+00), asNumber(4.374664141464968e+00), asNumber(2.938163982698783e+00) }; 170 | static typename InstructionTraits::FloatType d[] = { asNumber(0.0), asNumber(7.784695709041462e-03), asNumber(3.224671290700398e-01), asNumber(2.445134137142996e+00), asNumber(3.754408661907416e+00) }; 171 | 172 | auto aclambdaMain = [=](auto p) 173 | { 174 | auto X = p; 175 | auto q = p - asNumber(0.5); 176 | auto r = q * q; 177 | X = (((((a[1] * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * r + a[6]) * q / 178 | (((((b[1] * r + b[2]) * r + b[3]) * r + b[4]) * r + b[5]) * r + asNumber(1.)); 179 | 180 | return X; 181 | }; 182 | 183 | 184 | auto aclambdaLow = [=](auto initVal, auto p) 185 | { 186 | const auto p_low = asNumber(0.02425); 187 | auto condLo = (asNumber(0.0) < p) && (p < p_low); 188 | 189 | if (!horizontal_or(condLo)) 190 | return initVal; 191 | 192 | auto q = sqrt(asNumber (-2.0) * log(p)); 193 | auto X = (((((c[1] * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) * q + c[6]) / 194 | ((((d[1] * q + d[2]) * q + d[3]) * q + d[4]) * q + asNumber(1.0)); 195 | 196 | return select(condLo, X, initVal); 197 | 198 | }; 199 | 200 | 201 | auto aclambdaHi = [=](auto initVal, auto p) 202 | { 203 | const auto p_low = asNumber(0.02425); 204 | const auto p_high = asNumber(1.) - p_low; 205 | auto condHi = (p_high < p) && (p < asNumber(1.)); 206 | if (!horizontal_or(condHi)) 207 | return initVal; 208 | 209 | auto q = sqrt(asNumber(-2.0) * log(asNumber(1.) 
- p)); 210 | const auto X = -(((((c[1] * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) * q + c[6]) / 211 | ((((d[1] * q + d[2]) * q + d[3]) * q + d[4]) * q + 1.0); 212 | return select(condHi, X, initVal); 213 | }; 214 | 215 | 216 | 217 | auto res = ApplyUnitaryOperation1(X, aclambdaMain); 218 | SparseUpdateWithLambda1(res, X, aclambdaLow); 219 | SparseUpdateWithLambda1(res, X, aclambdaHi); 220 | 221 | return res; 222 | } 223 | // 224 | -------------------------------------------------------------------------------- /Vectorisation/VecX/binned_accumulator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "dr3.h" 3 | #include "instruction_traits.h" 4 | #include 5 | 6 | 7 | template 8 | struct BinsT 9 | { 10 | using INS = INS_T; 11 | 12 | inline static constexpr bool isDbl = std::is_same::FloatType >::value; 13 | 14 | inline static const INS_T TINY_C{ isDbl ? pow(1024.0 , -10.0) : 1.0/ 8388608.0f * 1.0 / 8388608.0f }; 15 | inline static const INS_T VERY_SMALL_C{ isDbl ? pow(1024.0,-5.0) : 1.0 / 8388608.0f }; 16 | inline static const INS_T SMALL_C{ isDbl ? 1.0 : 1.0f }; 17 | inline static const INS_T BIG_C{ isDbl ? 
pow(1024.0, 5.0) : 8388608.0f }; 18 | 19 | 20 | static inline auto roundIt(INS_T X, INS_T LEVEL) 21 | { 22 | auto INV_LEVEL = 1.0l / LEVEL; 23 | auto big = (LEVEL * truncate(X * INV_LEVEL)); 24 | auto small = X - big; 25 | return std::pair(big, small); 26 | }; 27 | 28 | 29 | INS_T m_scaleFactor{ InstructionTraits::oneValue }; 30 | INS_T veryBigSummV{ InstructionTraits::nullValue }; 31 | INS_T bigSummV{ InstructionTraits::nullValue }; 32 | INS_T smallSumV{ InstructionTraits::nullValue }; 33 | INS_T tinyV{ InstructionTraits::nullValue }; 34 | 35 | 36 | INS_T TINY{ TINY_C }; 37 | INS_T VERY_SMALL{ VERY_SMALL_C }; 38 | INS_T SMALL{ SMALL_C }; 39 | INS_T BIG{ BIG_C }; 40 | 41 | 42 | 43 | BinsT() : 44 | m_scaleFactor{ InstructionTraits::oneValue }, 45 | TINY{ m_scaleFactor * TINY_C }, 46 | VERY_SMALL{ m_scaleFactor * VERY_SMALL_C }, 47 | SMALL{ m_scaleFactor * SMALL_C }, 48 | BIG{ m_scaleFactor * BIG_C } 49 | {} 50 | 51 | 52 | 53 | 54 | 55 | BinsT(typename InstructionTraits::FloatType x, typename InstructionTraits::FloatType scaleFactor = InstructionTraits::oneValue) : 56 | m_scaleFactor{ scaleFactor }, 57 | TINY{ m_scaleFactor * TINY_C }, 58 | VERY_SMALL{ m_scaleFactor * VERY_SMALL_C }, 59 | SMALL{ m_scaleFactor * SMALL_C }, 60 | BIG{ m_scaleFactor * BIG_C } 61 | { 62 | 63 | INS_T MASK(InstructionTraits::nullValue); 64 | MASK.insert(0, InstructionTraits::oneValue); 65 | 66 | set(MASK * x); 67 | 68 | } 69 | 70 | 71 | void set(INS_T x) 72 | { 73 | auto resRoundVeryBig = roundIt(x, BIG); 74 | auto resRoundBig = roundIt(resRoundVeryBig.second, SMALL); 75 | auto resRoundSmall = roundIt(resRoundBig.second, VERY_SMALL); 76 | 77 | veryBigSummV = resRoundVeryBig.first; 78 | bigSummV = resRoundBig.first; 79 | smallSumV = resRoundSmall.first; 80 | tinyV = resRoundSmall.second; 81 | } 82 | 83 | BinsT(INS_T x) 84 | { 85 | set(x); 86 | } 87 | 88 | BinsT& operator *(INS_T rhs) 89 | { 90 | 91 | veryBigSummV *= rhs; 92 | bigSummV *= rhs; 93 | smallSumV *= rhs; 94 | tinyV *= rhs; 95 | 96 
| return *this; 97 | } 98 | 99 | 100 | BinsT(BinsT&& x) noexcept 101 | { 102 | veryBigSummV = x.veryBigSummV; 103 | bigSummV = x.bigSummV; 104 | smallSumV = x.smallSumV; 105 | tinyV = x.tinyV; 106 | 107 | m_scaleFactor = x.m_scaleFactor; 108 | TINY = x.TINY; 109 | VERY_SMALL = x.VERY_SMALL; 110 | SMALL = x.SMALL; 111 | BIG = x.BIG; 112 | 113 | 114 | }; 115 | 116 | 117 | BinsT& operator =(const BinsT& x) 118 | { 119 | veryBigSummV = x.veryBigSummV; 120 | bigSummV = x.bigSummV; 121 | smallSumV = x.smallSumV; 122 | tinyV = x.tinyV; 123 | 124 | m_scaleFactor = x.m_scaleFactor; 125 | TINY = x.TINY; 126 | VERY_SMALL = x.VERY_SMALL; 127 | SMALL = x.SMALL; 128 | BIG = x.BIG; 129 | 130 | 131 | return *this; 132 | }; 133 | 134 | 135 | BinsT& operator += (const BinsT& rhs) 136 | { 137 | auto resRoundTiny = roundIt(tinyV + rhs.tinyV, VERY_SMALL); 138 | tinyV = resRoundTiny.second; 139 | 140 | 141 | auto smallRound = roundIt(smallSumV + rhs.smallSumV + resRoundTiny.first, SMALL); 142 | smallSumV = smallRound.second; 143 | auto bigRound = roundIt(smallRound.first + bigSummV + rhs.bigSummV, BIG); 144 | bigSummV = bigRound.second; 145 | veryBigSummV = bigRound.first + veryBigSummV + rhs.veryBigSummV; 146 | 147 | return *this; 148 | } 149 | 150 | 151 | 152 | auto hsum() 153 | { 154 | auto lambdaBinSum = [this]() {return (((horizontal_add(tinyV)) + horizontal_add(smallSumV)) + horizontal_add(bigSummV)) + horizontal_add(veryBigSummV); }; 155 | return lambdaBinSum(); 156 | } 157 | 158 | 159 | }; 160 | 161 | 162 | 163 | 164 | static auto BinnedAdd = [](auto& bin, auto x) mutable 165 | { 166 | bin += x; 167 | using INS_T = decltype(x); 168 | auto NULL_Vec = INS_T(InstructionTraits::nullValue); 169 | return NULL_Vec; 170 | 171 | }; -------------------------------------------------------------------------------- /Vectorisation/VecX/error_utils.h: -------------------------------------------------------------------------------- 1 | /**************************** error_utils.h 
******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 11 | *****************************************************************************/ 12 | #pragma once 13 | #include "vec.h" 14 | #include "vec_view.h" 15 | #include "span.h" 16 | #include 17 | #include 18 | 19 | //ignore for unused from sutter 20 | template void ignore(const T&) { } 21 | 22 | template 23 | bool check_vector( const VEC& rhs) 24 | { 25 | auto rhsSz = rhs.size(); 26 | 27 | if ( ( rhsSz > 0) || rhs.isScalar() ) 28 | { 29 | return true; 30 | } 31 | else 32 | { 33 | //std:: 34 | assert(false); 35 | throw std::runtime_error("bad vector size of non scalar"); 36 | } 37 | } 38 | 39 | template 40 | bool check_pair(const VEC& lhs, const VEC& rhs) 41 | { 42 | check_vector(lhs); 43 | check_vector(rhs); 44 | 45 | if ( (lhs.size() == rhs.size() ) && (lhs.size() > 0 ) ) 46 | { 47 | return true; 48 | } 49 | 50 | 51 | if (rhs.isScalar() || lhs.isScalar()) 52 | { 53 | return true; 54 | } 55 | else 56 | { 57 | 58 | assert(false); 59 | throw std::runtime_error("bad vector size"); 60 | } 61 | } 62 | 63 | template 64 | bool check_pair_different_type(const VEC1& lhs, const VEC2& rhs) 65 | { 66 | check_vector(lhs); 67 | check_vector(rhs); 68 | 69 | if (lhs.size() == rhs.size()) 70 | return true; 71 | if (rhs.isScalar() || lhs.isScalar()) 72 | { 73 | return true; 74 | } 75 | else 76 | { 77 | //std:: 78 | assert(false); 79 | throw std::runtime_error("bad vector size"); 80 | } 81 | } 82 | 83 | 84 | 85 | 86 | ////////////// views //////////////// 87 | template 88 | bool check_vector(const VecView& /*rhs*/) 89 | { 90 | //TO DO 91 | /* 92 | auto rhsSz = rhs.size(); 93 | 94 | if ((rhsSz > 0) || rhs.isScalar()) 95 | { 96 | return true; 97 | } 98 | else 99 | { 100 | //std::assert(false); 101 | throw 
std::exception("bad vector size of non scalar"); 102 | } 103 | */ 104 | return true; 105 | } 106 | 107 | template 108 | bool check_vector(const Vec& rhs) 109 | { 110 | auto rhsSz = rhs.size(); 111 | 112 | if ((rhsSz > 0) || rhs.isScalar()) 113 | { 114 | return true; 115 | } 116 | else 117 | { 118 | //std:: 119 | assert(false); 120 | throw std::runtime_error("bad vector size of non scalar"); 121 | } 122 | } 123 | 124 | 125 | 126 | template 127 | bool check_vector(const VecD& rhs) 128 | { 129 | // Always return true 130 | return true; 131 | /* 132 | auto rhsSz = rhs.size(); 133 | 134 | if ((rhsSz > 0) || rhs.isScalar()) 135 | { 136 | return true; 137 | } 138 | else 139 | { 140 | //std:: 141 | assert(false); 142 | throw std::exception("bad vector size of non scalar"); 143 | } 144 | */ 145 | } 146 | 147 | template 148 | bool check_vector_for_filter(const Vec& rhs) 149 | { 150 | auto rhsSz = rhs.size(); 151 | 152 | if ((rhsSz > 0) || !rhs.isScalar()) //no scalar vectors for filtering to views 153 | { 154 | return true; 155 | } 156 | else 157 | { 158 | //std:: 159 | assert(false); 160 | throw std::runtime_error("bad vector size of non scalar"); 161 | } 162 | } 163 | 164 | 165 | template 166 | bool check_vector_for_filter(const VecView&/* rhs*/) 167 | { 168 | return true;// views can be empty 169 | /* 170 | auto rhsSz = rhs.size(); 171 | 172 | if ((rhsSz > 0) || !rhs.isScalar()) //no scalar vectors for filtering to views 173 | { 174 | return true; 175 | } 176 | else 177 | { 178 | //std::assert(false); 179 | throw std::exception("bad vector size of non scalar"); 180 | } 181 | */ 182 | } 183 | 184 | 185 | template 186 | bool check_vector_for_filter(const Span&/* rhs*/) 187 | { 188 | return true;// views can be empty 189 | /* 190 | auto rhsSz = rhs.size(); 191 | 192 | if ((rhsSz > 0) || !rhs.isScalar()) //no scalar vectors for filtering to views 193 | { 194 | return true; 195 | } 196 | else 197 | { 198 | //std::assert(false); 199 | throw std::exception("bad vector size of 
non scalar"); 200 | } 201 | */ 202 | } 203 | 204 | 205 | 206 | 207 | template 208 | bool check_view_pair(const Vec& lhs, const Vec& rhs) 209 | { 210 | check_vector_for_filter(lhs); 211 | check_vector_for_filter(rhs); 212 | 213 | if (lhs.size() == rhs.size()) 214 | return true; 215 | 216 | //std:: 217 | assert(false); 218 | throw std::runtime_error("bad vector size"); 219 | 220 | } 221 | 222 | template 223 | bool check_view_pair(const VecView& lhs, const VecView& rhs) 224 | { 225 | check_vector_for_filter(lhs); 226 | check_vector_for_filter(rhs); 227 | 228 | if (lhs.size() == rhs.size()) 229 | return true; 230 | 231 | //std:: 232 | assert(false); 233 | throw std::runtime_error("bad vector size"); 234 | 235 | } -------------------------------------------------------------------------------- /Vectorisation/VecX/filter_pipe_and_join.h: -------------------------------------------------------------------------------- 1 | /**************************** filter_pipe_and_join.h ******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 
11 | *****************************************************************************/ 12 | #pragma once 13 | #include "filter_select.h" 14 | 15 | 16 | /* 17 | Use "|" for joining filters and ">" for joining operations 18 | use braces around sets of filters to control evaluation order 19 | 20 | */ 21 | 22 | namespace PIPE 23 | { 24 | 25 | template< typename INS_VEC> 26 | VecView operator |(const Vec& rhs, const VecBool& condition) 27 | { 28 | return ApplyFilter(condition, rhs); 29 | } 30 | 31 | 32 | template< typename INS_VEC, typename OP> 33 | VecView operator |(const VecView& rhs, OP& condition) 34 | { 35 | return ApplyFilter(condition, rhs); 36 | } 37 | 38 | template< typename INS_VEC, typename OP> 39 | VecView operator |(const Vec& rhs, OP& condition) 40 | { 41 | return ApplyFilter(condition, rhs); 42 | } 43 | 44 | template< typename INS_VEC, typename OP> 45 | VecView operator |(Vec& rhs, OP& condition) 46 | { 47 | return ApplyFilter(condition, rhs); 48 | } 49 | 50 | 51 | // vector checks are applied inside ApplyUnitaryOperation 52 | template< typename INS_VEC, typename OP> 53 | VecView operator > ( VecView rhs, OP& oper) 54 | { 55 | ApplyUnitaryOperation( rhs, oper); 56 | return rhs; 57 | } 58 | 59 | 60 | template< typename INS_VEC, typename OP> 61 | VecView& operator > ( OP& oper, VecView& rhs) 62 | { 63 | ApplyUnitaryOperation(oper, rhs); 64 | return rhs; 65 | } 66 | 67 | template< typename INS_VEC, typename OP> 68 | VecView operator > (OP& oper, const VecView& rhs) 69 | { 70 | return ApplyUnitaryOperation(oper, rhs); 71 | } 72 | 73 | 74 | template< typename INS_VEC, typename OP> 75 | VecView& operator > (Vec& rhs, OP& oper) 76 | { 77 | ApplyUnitaryOperation(oper, rhs); 78 | return rhs; 79 | } 80 | 81 | 82 | template< typename INS_VEC, typename OP> 83 | VecView operator > (const Vec& rhs, OP& oper) 84 | { 85 | return ApplyUnitaryOperation(oper, rhs); 86 | } 87 | 88 | 89 | template< typename INS_VEC, typename OP> 90 | VecView operator > ( OP& oper , const Vec& 
rhs ) 91 | { 92 | return ApplyUnitaryOperation(oper, rhs); 93 | } 94 | 95 | 96 | template< typename INS_VEC> 97 | Vec operator |(const VecView& rhs, Vec& out) 98 | { 99 | auto outRes(out); 100 | rhs.writeView(outRes); 101 | return outRes; 102 | } 103 | 104 | 105 | template< typename INS_VEC> 106 | Vec operator |(VecView& rhs, const Vec& out) 107 | { 108 | auto outRes(out); 109 | rhs.writeView(outRes); 110 | return outRes; 111 | } 112 | 113 | 114 | struct WriteOut 115 | {}; 116 | 117 | template< typename INS_VEC> 118 | void operator |(const VecView& rhs, WriteOut& out) 119 | { 120 | //writes back to source to do 121 | rhs.writeView(out); 122 | } 123 | 124 | }// namespace PIPE 125 | 126 | /* 127 | These expression templates are for use at register level combinations of operations 128 | */ 129 | namespace JOIN 130 | { 131 | 132 | 133 | template< typename LHS, typename RHS> 134 | struct CatOperation 135 | { 136 | CatOperation(const LHS& lhs, const RHS& rhs) :m_lhs(lhs), m_rhs(rhs) {} 137 | 138 | template 139 | inline auto operator()(const X& val) noexcept 140 | { 141 | return m_lhs(m_rhs(val)); 142 | } 143 | 144 | //for use with accumulate 145 | template 146 | inline auto operator()(const X& lhs_arg, const X& rhs_arg) noexcept 147 | { 148 | return m_rhs(rhs_arg, m_lhs(lhs_arg)); 149 | } 150 | LHS m_lhs; 151 | RHS m_rhs; 152 | }; 153 | 154 | 155 | template 156 | CatOperation< RHS, LHS> operator | (const LHS& lhs, const RHS& rhs) 157 | { 158 | return CatOperation(rhs,lhs); 159 | } 160 | 161 | /* Examples 162 | 163 | Boolean expression template conjuction for boolean lambdas 164 | 165 | auto isLessThanMinus10 = [](auto x) { return x < -10 }; 166 | auto isGreaterThan10 = [](auto x) { return x > 10 }; 167 | auto isLessThan20 = [](auto x) { return x < 20 }; 168 | 169 | we can create simple logical conjunctions of boolean lambdas 170 | 171 | auto betweenTenAndTwenty = isGreaterThan10 && isLessThan20; 172 | 173 | auto isOutsideTenTwenty = !betweenTenAndTwenty; 174 | 175 | auto 
hasAbsGreaterThanTen = isLessThanMinus10 || isGreaterThan10; 176 | 177 | */ 178 | 179 | template< typename RHS> 180 | struct NegateOperation 181 | { 182 | NegateOperation(const RHS& rhs) : m_rhs(rhs) {} 183 | 184 | template 185 | inline auto operator()(const INS_VEC& val) noexcept 186 | { 187 | return !m_rhs(val); 188 | } 189 | RHS m_rhs; 190 | }; 191 | 192 | 193 | template< typename RHS> 194 | NegateOperation operator ! (const RHS& rhs) 195 | { 196 | return NegateOperation< RHS>(rhs); 197 | } 198 | 199 | 200 | 201 | 202 | 203 | template< typename LHS, typename RHS> 204 | struct OROperation 205 | { 206 | OROperation(const LHS& lhs, const RHS& rhs) :m_lhs(lhs), m_rhs(rhs) {} 207 | 208 | template 209 | inline auto operator()(const X& val) noexcept 210 | { 211 | return m_lhs(val) || m_rhs(val); 212 | } 213 | LHS m_lhs; 214 | RHS m_rhs; 215 | }; 216 | 217 | 218 | template< typename LHS, typename RHS> 219 | OROperation< LHS, RHS> operator || (const LHS& lhs, const RHS& rhs) 220 | { 221 | return OROperation(lhs, rhs); 222 | } 223 | 224 | 225 | 226 | template< typename LHS, typename RHS> 227 | struct AndOperation 228 | { 229 | AndOperation(const LHS& lhs, const RHS& rhs) :m_lhs(lhs), m_rhs(rhs) {} 230 | 231 | template 232 | inline auto operator()(const X& val) noexcept 233 | { 234 | return m_lhs(val) && m_rhs(val); 235 | } 236 | 237 | LHS m_lhs; 238 | RHS m_rhs; 239 | 240 | }; 241 | 242 | 243 | template< typename LHS, typename RHS> 244 | AndOperation< LHS, RHS> operator && (const LHS& lhs, const RHS& rhs) 245 | { 246 | return AndOperation(lhs, rhs); 247 | } 248 | 249 | 250 | }//namespace JOIN 251 | 252 | -------------------------------------------------------------------------------- /Vectorisation/VecX/instruction_traits.h: -------------------------------------------------------------------------------- 1 | /**************************** instruction_traits.h ******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 
2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 11 | *****************************************************************************/ 12 | #pragma once 13 | #pragma warning(suppress:4984) 14 | 15 | #include "vec.h" 16 | #include "vec_double.h" 17 | #include "../VCL/vectormath_common.h" 18 | 19 | 20 | template class Vec; 21 | template class VecBool; 22 | template class VecD; 23 | template< typename INS_VEC> class VecView; 24 | 25 | 26 | template< typename INS_VEC> 27 | struct InstructionTraits 28 | { 29 | using BoolType = VecBoolD; 30 | using FloatType = double; 31 | static constexpr int width = 2; 32 | static constexpr double nullValue = 0.0; 33 | static constexpr double oneValue =1.0; 34 | static constexpr bool alignedLoadStore = false; 35 | static constexpr bool boolTypeIsAlignedLoadStore = false; 36 | static constexpr bool useScatter = false; 37 | static constexpr uint32_t limit = 100000; 38 | 39 | static constexpr bool isCompact = false; 40 | using RegBoolType = VecBoolD; 41 | using MemBoolType = VecDouble; 42 | 43 | }; 44 | 45 | 46 | 47 | 48 | template<> 49 | struct InstructionTraits 50 | { 51 | using IdxType = Vec2q; 52 | using BoolType = VecBoolD; 53 | using FloatType = double; 54 | static constexpr int width = 2; 55 | static constexpr double nullValue = 0.0; 56 | static constexpr double oneValue = 1.0; 57 | static constexpr bool alignedLoadStore = true; 58 | static constexpr bool useScatter = false; 59 | static constexpr uint32_t limit = 100000; 60 | static constexpr bool boolTypeIsAlignedLoadStore = true; 61 | 62 | static constexpr bool isCompact = false; 63 | using RegBoolType = VecBoolD; 64 | using MemBoolType = VecDouble; 65 | }; 66 | 67 | 68 | 69 | 70 | template<> 71 | struct InstructionTraits 72 | { 73 | using IdxType = Vec2q; 74 | using BoolType = VecBoolD; 75 | using FloatType = long double; 76 | static constexpr int width = 2; 77 | 
static constexpr double nullValue = 0.0; 78 | static constexpr double oneValue = 1.0; 79 | static constexpr bool alignedLoadStore = false; 80 | static constexpr bool useScatter = false; 81 | static constexpr uint32_t limit = 100000; 82 | static constexpr bool boolTypeIsAlignedLoadStore = true; 83 | 84 | static constexpr bool isCompact = false; 85 | using RegBoolType = VecBoolD; 86 | using MemBoolType = VecLDouble; 87 | }; 88 | 89 | 90 | 91 | template<> 92 | struct InstructionTraits 93 | { 94 | using IdxType = Vec2q; 95 | using BoolType = Vec2db; 96 | using FloatType = double; 97 | static constexpr int width = 2; 98 | static constexpr double nullValue = 0.0; 99 | static constexpr double oneValue = 1.0; 100 | static constexpr bool alignedLoadStore = true; 101 | static constexpr bool boolTypeIsAlignedLoadStore = false; 102 | static constexpr bool useScatter = false; 103 | static constexpr uint32_t limit = 100000; 104 | 105 | static constexpr bool isCompact = false; 106 | using RegBoolType = Vec2db; 107 | using MemBoolType = Vec2d; 108 | 109 | 110 | }; 111 | 112 | 113 | template<> 114 | struct InstructionTraits 115 | { 116 | using IdxType = Vec4i; 117 | using BoolType = Vec4fb; 118 | using FloatType = float; 119 | static constexpr int width = 4; 120 | static constexpr bool alignedLoadStore = true; 121 | static constexpr bool boolTypeIsAlignedLoadStore = false; 122 | static constexpr float nullValue = 0.f; 123 | static constexpr float oneValue = 1.f; 124 | static constexpr bool useScatter = false; 125 | static constexpr uint32_t limit = 100000; 126 | 127 | static constexpr bool isCompact = false; 128 | using RegBoolType = Vec4fb; 129 | using MemBoolType = Vec4f; 130 | }; 131 | 132 | 133 | 134 | 135 | 136 | template<> 137 | struct InstructionTraits 138 | { 139 | using IdxType = Vec4i; 140 | using BoolType = Vec4db; 141 | using FloatType = double; 142 | static constexpr int width = 4; 143 | static constexpr bool alignedLoadStore = true; 144 | static constexpr bool 
boolTypeIsAlignedLoadStore = false; 145 | static constexpr double nullValue = 0.0; 146 | static constexpr double oneValue = 1.0; 147 | static constexpr bool useScatter = true; 148 | static constexpr uint32_t limit = 100000; 149 | 150 | static constexpr bool isCompact = false; 151 | using RegBoolType = Vec4db; 152 | using MemBoolType = Vec4d; 153 | }; 154 | 155 | 156 | template<> 157 | struct InstructionTraits 158 | { 159 | using IdxType = Vec8i; 160 | using BoolType = Vec8fb; 161 | using FloatType = float; 162 | 163 | static constexpr int width = 8; 164 | static constexpr float nullValue = 0.f; 165 | static constexpr float oneValue = 1.f; 166 | static constexpr bool alignedLoadStore = true; 167 | static constexpr bool boolTypeIsAlignedLoadStore = false; 168 | static constexpr bool useScatter = true; 169 | static constexpr uint32_t limit = 100000; 170 | 171 | static constexpr bool isCompact = false; 172 | using RegBoolType = Vec8fb; 173 | using MemBoolType = Vec8f; 174 | 175 | }; 176 | 177 | 178 | 179 | template<> 180 | struct InstructionTraits 181 | { 182 | using IdxType = Vec8i; 183 | 184 | using BoolType = Vec8db; 185 | 186 | using FloatType = double; 187 | static constexpr int width = 8; 188 | static constexpr double nullValue = 0.0; 189 | static constexpr double oneValue = 1.0; 190 | static constexpr bool alignedLoadStore = false; 191 | static constexpr bool boolTypeIsAlignedLoadStore = false; 192 | static constexpr bool useScatter = true; 193 | static constexpr uint32_t limit = 1000000; 194 | 195 | static constexpr bool isCompact = true; 196 | using RegBoolType = Vec8db; 197 | using MemBoolType = Vec8d; 198 | 199 | }; 200 | 201 | 202 | 203 | template<> 204 | struct InstructionTraits 205 | { 206 | using IdxType = Vec16i; 207 | using BoolType = Vec16fb; 208 | using FloatType = float; 209 | static constexpr int width = 16; 210 | static constexpr float nullValue = 0.f; 211 | static constexpr float oneValue = 1.f; 212 | static constexpr bool alignedLoadStore = 
false; 213 | static constexpr bool boolTypeIsAlignedLoadStore = false; 214 | static constexpr bool useScatter = true; 215 | static constexpr uint32_t limit = 1000000; 216 | 217 | static constexpr bool isCompact = true; 218 | using RegBoolType = Vec16fb; 219 | using MemBoolType = Vec16f; 220 | }; 221 | 222 | 223 | 224 | template 225 | inline typename InstructionTraits::MemBoolType boolCompactSave(typename InstructionTraits::RegBoolType regVal ) 226 | { 227 | return boolCompactConvert(regVal); 228 | } 229 | 230 | 231 | template 232 | inline typename InstructionTraits::MemBoolType boolCompactConvert(typename InstructionTraits::RegBoolType regVal) 233 | { 234 | return static_cast::MemBoolType>(regVal); 235 | 236 | } 237 | 238 | 239 | 240 | 241 | inline Vec8d boolCompactConvert(Vec8db regVal) 242 | { 243 | Vec8d const b = 0.; 244 | return select(regVal, -nan8d(), b); 245 | } 246 | 247 | 248 | 249 | inline Vec16f boolCompactConvert(Vec16fb regVal) 250 | { 251 | Vec16f const b = 0.f; 252 | return select(regVal, -nan16f(), b); 253 | } 254 | 255 | 256 | inline Vec8db boolCompactConvert(Vec8d regVal) 257 | { 258 | Vec8d allZeros = false; 259 | Vec8db ret = !(allZeros == regVal); 260 | return ret; 261 | } 262 | 263 | 264 | inline Vec16fb boolCompactConvert(Vec16f regVal) 265 | { 266 | Vec16f allZeros = false; 267 | Vec16fb ret = !(allZeros == regVal); 268 | return ret; 269 | } 270 | 271 | 272 | 273 | //for save 274 | template 275 | inline auto boolConvert(typename InstructionTraits::RegBoolType regVal) 276 | { 277 | if constexpr (! 
InstructionTraits::isCompact ) 278 | { 279 | return regVal; 280 | } 281 | else 282 | { 283 | return boolCompactSave< TRAIT>(regVal); 284 | } 285 | } 286 | 287 | 288 | //for load 289 | template 290 | inline auto boolConvert(typename InstructionTraits::MemBoolType regVal) 291 | { 292 | if constexpr (!InstructionTraits::isCompact) 293 | { 294 | return regVal; 295 | } 296 | else 297 | { 298 | return boolCompactConvert(regVal); 299 | } 300 | } 301 | 302 | 303 | 304 | -------------------------------------------------------------------------------- /Vectorisation/VecX/target_name_space.h: -------------------------------------------------------------------------------- 1 | /**************************** target_name_space.h ******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 
11 | *****************************************************************************/ 12 | #pragma once 13 | 14 | #include "vec.h" 15 | #include "vec_bool.h" 16 | #include "vec_d.h" 17 | #include "vec_bool_d.h" 18 | #include "vec_double.h" 19 | #include "vec_view.h" 20 | #include "apply_operation.h" 21 | #include "span.h" 22 | 23 | namespace DRC 24 | { 25 | 26 | 27 | 28 | namespace VecLDb 29 | { 30 | using VecxD = VecD; 31 | using Vecx = VecD; 32 | using VecXX = Vec; 33 | using VecVW = VecView; 34 | using VecBL = VecBool; 35 | using SpanXX = Span; 36 | using StrdSpanXX = StridedSpan; 37 | }; 38 | // experimental 39 | 40 | 41 | namespace VecDb 42 | { 43 | using VecxD = VecD; 44 | using Vecx = VecD; 45 | using VecXX = Vec; 46 | using VecVW = VecView; 47 | using VecBL = VecBool; 48 | using SpanXX = Span; 49 | using StrdSpanXX = StridedSpan; 50 | }; 51 | 52 | 53 | 54 | namespace VecD2D 55 | { 56 | using VecxD = VecD; 57 | using Vecx = VecD; 58 | using VecXX = Vec; 59 | using VecVW = VecView; 60 | using VecBL = VecBool; 61 | using SpanXX = Span; 62 | using StrdSpanXX = StridedSpan; 63 | }; 64 | 65 | 66 | namespace VecD4D 67 | { 68 | using VecxD = VecD; 69 | using Vecx = VecD; 70 | using VecXX = Vec; 71 | using VecVW = VecView; 72 | using VecBL = VecBool; 73 | using SpanXX = Span; 74 | using StrdSpanXX = StridedSpan; 75 | }; 76 | 77 | namespace VecD8D 78 | { 79 | using VecxD = VecD ; 80 | using Vecx = VecD ; 81 | using VecXX =Vec ; 82 | using VecVW = VecView ; 83 | using VecBL = VecBool ; 84 | using SpanXX = Span ; 85 | using StrdSpanXX =StridedSpan ; 86 | }; 87 | 88 | namespace VecF16F 89 | { 90 | using VecxD = VecD; 91 | using Vecx = VecD; 92 | using VecXX = Vec; 93 | using VecVW = VecView; 94 | using VecBL = VecBool; 95 | using SpanXX = Span; 96 | using StrdSpanXX = StridedSpan; 97 | }; 98 | 99 | namespace VecF8F 100 | { 101 | using VecxD = VecD; 102 | using Vecx = VecD; 103 | using VecXX = Vec; 104 | using VecVW = VecView; 105 | using VecBL = VecBool; 106 | using 
SpanXX = Span; 107 | using StrdSpanXX = StridedSpan; 108 | }; 109 | 110 | 111 | namespace VecF4F 112 | { 113 | using VecxD = VecD; 114 | using Vecx = VecD; 115 | using VecXX = Vec; 116 | using VecVW = VecView; 117 | using VecBL = VecBool; 118 | using SpanXX = Span; 119 | using StrdSpanXX = StridedSpan; 120 | }; 121 | 122 | 123 | 124 | }// namespace DRC -------------------------------------------------------------------------------- /Vectorisation/VecX/transform.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "vec.h" 4 | #include "binary_unitary_operations.h" 5 | #include "conditional_select_eval.h" 6 | 7 | 8 | //Unitary lambdas 9 | //unrolled version defaults to unroll x4 10 | template 11 | Vec ApplyTransform(LAMBDA& lambda, const Vec& inputVec) 12 | { 13 | return ApplyUnitaryOperation(inputVec, lambda); 14 | } 15 | 16 | //not unrolled x1 17 | template 18 | Vec ApplyTransform1(LAMBDA& lambda, const Vec& inputVec) 19 | { 20 | return ApplyUnitaryOperation1(inputVec, lambda); 21 | } 22 | 23 | //inplace transforms 24 | template 25 | void ApplyTransformM(LAMBDA& lambda, Vec& inputVec) 26 | { 27 | ApplyUnitaryOperationM(inputVec, lambda); 28 | } 29 | 30 | //not unrolled x1 31 | template 32 | void ApplyTransform1(LAMBDA& lambda, Vec& inputVec) 33 | { 34 | ApplyUnitaryOperation1(inputVec, lambda); 35 | } 36 | 37 | 38 | //unrolled version defaults to unroll x4 39 | template 40 | Vec ApplyTransform(LAMBDA& lambda, const Vec& inputVecLHS, const Vec& inputVecRHS) 41 | { 42 | return ApplyBinaryOperation(inputVecLHS, inputVecRHS, lambda); 43 | } 44 | 45 | template 46 | Vec ApplyTransform(LAMBDA& lambda, typename InstructionTraits::FloatType LHS, const Vec& inputVecRHS) 47 | { 48 | return ApplyBinaryOperation(LHS, inputVecRHS, lambda); 49 | } 50 | 51 | template 52 | Vec ApplyTransform(LAMBDA& lambda, const Vec& inputVecLHS, typename InstructionTraits::FloatType RHS) 53 | { 54 | return 
ApplyBinaryOperation(inputVecLHS, RHS, lambda); 55 | } 56 | 57 | // not unrolled 58 | template 59 | Vec ApplyTransform1(LAMBDA& lambda, const Vec& inputVecLHS, const Vec& inputVecRHS) 60 | { 61 | return ApplyBinaryOperation1(inputVecLHS, inputVecRHS, lambda); 62 | } 63 | 64 | //conversion 65 | template 66 | Vec ApplyTransform1(LAMBDA& lambda, typename InstructionTraits::FloatType LHS, const Vec& inputVecRHS) 67 | { 68 | return ApplyBinaryOperation1(LHS, inputVecRHS, lambda); 69 | } 70 | 71 | template 72 | Vec ApplyTransform1(LAMBDA& lambda, const Vec& inputVecLHS, typename InstructionTraits::FloatType RHS) 73 | { 74 | return ApplyBinaryOperation1(inputVecLHS, RHS, lambda); 75 | } 76 | 77 | /////////////////////////////// in place binary unroll x 4 //////////////// 78 | 79 | template 80 | void ApplyTransformM(LAMBDA& lambda, Vec& inputVecLHS, const Vec& inputVecRHS) 81 | { 82 | ApplyBinaryOperationMMXY(inputVecLHS, inputVecRHS , lambda); 83 | } 84 | 85 | template 86 | void ApplyTransformM(LAMBDA& lambda, Vec& inputVecLHS, typename InstructionTraits::FloatType RHS) 87 | { 88 | ApplyBinaryOperationMMMX(inputVecLHS, RHS, lambda); 89 | } 90 | 91 | 92 | template 93 | void ApplyTransformM(LAMBDA& lambda, typename InstructionTraits::FloatType LHS, Vec& inputVecRHS) 94 | { 95 | ApplyBinaryOperationMMM(LHS, inputVecRHS , lambda); 96 | } 97 | 98 | 99 | // ApplySparseTransform takes a boolean condition lambda to determine if the lambda should be used 100 | // to calculate a given value in the vector 101 | // if so it applies the transform lambda and then blends it into the target vector 102 | template< typename LAMBDA, typename INS_VEC, typename CONDITION_LAMBDA> 103 | void ApplySparseTransform(const Vec& inputVec, Vec& updateResult, LAMBDA& oper, CONDITION_LAMBDA& selectionOp) 104 | { 105 | ApplySparseUnitaryOperationU(inputVec, updateResult,oper, selectionOp); 106 | } 107 | 108 | 109 | ///////////////////// TO DO make this the interface ///////////////////// 110 | 111 | 
template< typename INS_VEC, typename OP> 112 | typename InstructionTraits::FloatType ApplyReduce(const Vec& rhs1, OP& oper, typename InstructionTraits::FloatType initVal, bool singularInit = true) 113 | { 114 | return ApplyAccumulate2(rhs1, oper, initVal, singularInit); 115 | 116 | } 117 | 118 | 119 | template< typename INS_VEC, typename OP, typename OPT> 120 | typename InstructionTraits::FloatType ApplyTransformReduce(const Vec& rhs1, OPT& operTransform, OP& operAcc, typename InstructionTraits::FloatType initVal = InstructionTraits::nullValue, bool singularInit = true) 121 | { 122 | return ApplyTransformAccumulateUR(rhs1, operTransform, operAcc, initVal, singularInit); 123 | } 124 | 125 | 126 | ///////////////////////////////// 127 | 128 | 129 | template< typename INS_VEC, typename BOOL_OPER> 130 | Vec ApplySelection(BOOL_OPER& COND, const Vec& testData, const Vec& lhs, const Vec& rhs) 131 | { 132 | return ApplySelectionOperationC(COND, testData, lhs, rhs); 133 | } 134 | 135 | 136 | template< typename INS_VEC, typename BOOL_OPER> 137 | Vec ApplySelection(BOOL_OPER& COND, const Vec& testData, typename InstructionTraits::FloatType trueVal, typename InstructionTraits::FloatType falseVal) 138 | { 139 | return ApplySelectionOperationC (COND, testData, trueVal, falseVal); 140 | } 141 | 142 | template< typename INS_VEC, typename BOOL_OPER, typename TRUE_OPER, typename FALSE_OPER> 143 | Vec ApplySelectionF(BOOL_OPER& COND, const Vec& testData, TRUE_OPER& trueOper, FALSE_OPER& falseOper) 144 | { 145 | return ApplySelectionOperationFunc(COND, testData, trueOper, falseOper); 146 | } 147 | 148 | 149 | /* 150 | applies the tesFunc to the vector val, if the func returns true it applies the trueLambda to the value otherwise it applies the falseLambda 151 | can be slow if true/false Lambda functions are not heavyweight 152 | */ 153 | template< typename INS_VEC, typename BOOL_TEST_OP, typename TRUE_LAMBDA, typename FALSE_LAMBDA> 154 | Vec ApplySplitCalculate( BOOL_TEST_OP& 
testFunc, const Vec& val, TRUE_LAMBDA& trueLambda, FALSE_LAMBDA& falseLambda) 155 | { 156 | return splitConditionalCalculate(val, testFunc, trueLambda, falseLambda); 157 | } 158 | 159 | 160 | /* 161 | applies the tesFunc to the view val, if the func returns true it applies the trueLambda to the value otherwise it applies the falseLambda 162 | can be slow if true/false Lambda functions are not heavyweight 163 | */ 164 | template< typename INS_VEC, typename BOOL_TEST_OP, typename TRUE_LAMBDA, typename FALSE_LAMBDA> 165 | VecView ApplySplitCalculate( BOOL_TEST_OP& testFunc, const VecView& val, TRUE_LAMBDA& trueLambda, FALSE_LAMBDA& falseLambda) 166 | { 167 | return splitConditionalCalculate(val, testFunc, trueLambda, falseLambda); 168 | } 169 | 170 | 171 | /////////////////////////// filters ////////////////////// 172 | 173 | 174 | template 175 | void ApplyTransformM(LAMBDA& lambda, VecView& inputVec) 176 | { 177 | return ApplyUnitaryOperation(lambda, inputVec); 178 | } 179 | 180 | template 181 | VecView ApplyTransform(LAMBDA& lambda, const VecView& inputVec) 182 | { 183 | return ApplyUnitaryOperation(lambda, inputVec); 184 | } 185 | 186 | 187 | template 188 | VecView ApplyTransformV(LAMBDA& lambda, const Vec& inputVec) 189 | { 190 | return ApplyUnitaryOperation(lambda, inputVec); 191 | } 192 | 193 | 194 | /* 195 | applies the OP to the view in and scatter, writes the results to the corresponding elements of the result vector. 
196 | */ 197 | template< typename INS_VEC, typename OP> 198 | void ApplyTransformWrite(OP& oper, const VecView& view, Vec& out) 199 | { 200 | ApplyUnitaryOperationWrite(oper, view, out); 201 | } 202 | -------------------------------------------------------------------------------- /Vectorisation/VecX/vcl_latest.h: -------------------------------------------------------------------------------- 1 | /**************************** vcl_latest.h ******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 11 | *****************************************************************************/ 12 | #pragma once 13 | 14 | 15 | #include "../VCL/vectormath_common.h" 16 | #include "../VCL/vectorclass.h" 17 | #include "../VCL/vectormath_exp.h" 18 | #include "../VCL/vectormath_trig.h" 19 | #include "../VCL/vectormath_hyp.h" -------------------------------------------------------------------------------- /Vectorisation/VecX/vec.cpp: -------------------------------------------------------------------------------- 1 | /**************************** vec.cpp ******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 
11 | *****************************************************************************/ 12 | //#include "vec.h" 13 | //#include "alloc_policy.h" 14 | //#include 15 | /* 16 | const double InstructionTraits::nullValue = 0.0; 17 | const double InstructionTraits::oneValue = 1.0; 18 | 19 | const double InstructionTraits::nullValue = 0.0; 20 | const double InstructionTraits::oneValue = 1.0; 21 | 22 | const double InstructionTraits::nullValue = 0.0; 23 | const double InstructionTraits::oneValue = 1.0; 24 | 25 | const double InstructionTraits::nullValue = 0.0; 26 | const double InstructionTraits::oneValue = 1.0; 27 | 28 | const float InstructionTraits::nullValue = 0.0f; 29 | const float InstructionTraits::oneValue = 1.0f; 30 | 31 | const float InstructionTraits::nullValue = 0.0f; 32 | const float InstructionTraits::oneValue = 1.0f; 33 | 34 | const float InstructionTraits::nullValue = 0.0f; 35 | const float InstructionTraits::oneValue = 1.0f; 36 | */ 37 | -------------------------------------------------------------------------------- /Vectorisation/VecX/vec.h: -------------------------------------------------------------------------------- 1 | /**************************** vec.h ******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 
11 | *****************************************************************************/ 12 | #pragma once 13 | #include "instruction_traits.h" 14 | #include "alloc_policy.h" 15 | #include "apply_operation.h" 16 | #include "vec_view.h" 17 | #include "span.h" 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | 24 | 25 | template 26 | class Vec 27 | { 28 | public: 29 | 30 | friend class VecView< INS_VEC>; 31 | 32 | typedef INS_VEC INS; 33 | using SCALA_TYPE = typename InstructionTraits::FloatType; 34 | 35 | //convert to scalar 36 | template 37 | static SCALA_TYPE scalar(const T& val) 38 | { 39 | return static_cast(val); 40 | } 41 | 42 | //convert to register 43 | template 44 | static INS reg(T& val) 45 | { 46 | INS vec(static_cast(val)); 47 | return vec; 48 | } 49 | 50 | 51 | private: 52 | 53 | typename InstructionTraits::FloatType m_scalarVal; 54 | bool m_isScalar; 55 | 56 | typename InstructionTraits::FloatType* m_pData; 57 | 58 | int m_size; // number of elements represented 59 | size_t m_implSize;// actual size of allocated block 60 | 61 | public: 62 | 63 | Vec():m_scalarVal(0.),m_isScalar(true) 64 | { 65 | m_size = 0; 66 | m_implSize = 0; 67 | m_pData = nullptr; 68 | } 69 | 70 | 71 | //not explicit allow conversions 72 | Vec( typename InstructionTraits::FloatType scalarVal):m_scalarVal(scalarVal),m_isScalar(true) 73 | { 74 | m_size = 0; 75 | m_implSize = 0; 76 | m_pData = nullptr; 77 | } 78 | 79 | 80 | Vec& operator =(typename InstructionTraits::FloatType scalarVal) 81 | { 82 | m_isScalar=true; 83 | m_scalarVal=scalarVal; 84 | if (m_pData != nullptr) 85 | { 86 | freePool(m_implSize, m_pData); 87 | } 88 | m_size = 0; 89 | m_implSize = 0; 90 | m_pData = nullptr; 91 | return *this; 92 | } 93 | 94 | 95 | Vec(const std::vector< typename InstructionTraits::FloatType > & ctr) 96 | { 97 | 98 | int sz = static_cast(std::distance(ctr.begin(), ctr.end()) ); 99 | m_size =sz; 100 | m_implSize = sz; 101 | allocPool(m_implSize,m_pData); 102 | 103 | auto repeatedPaddingValue = 
ctr.at(sz-1); 104 | for(auto s =sz; s < static_cast(m_implSize);s++) 105 | { 106 | m_pData[s] =repeatedPaddingValue; 107 | } 108 | 109 | std::copy(cbegin(ctr),cend(ctr),m_pData); 110 | 111 | m_isScalar = false; 112 | m_scalarVal = InstructionTraits::nullValue; 113 | } 114 | 115 | explicit Vec(int sz):m_size(sz), m_implSize(sz) 116 | { 117 | allocPool(m_implSize, m_pData); 118 | m_isScalar = false; 119 | m_scalarVal = InstructionTraits::nullValue; 120 | } 121 | 122 | 123 | 124 | Vec(typename InstructionTraits::FloatType val, int sz) :m_size(sz), m_implSize(sz) 125 | { 126 | allocPool(m_implSize, m_pData); 127 | m_isScalar = false; 128 | m_scalarVal = InstructionTraits::nullValue; 129 | 130 | std::fill_n(start(), sz, val); 131 | 132 | } 133 | 134 | 135 | ~Vec() 136 | { 137 | if(m_pData != nullptr) 138 | { 139 | freePool(m_implSize,m_pData); 140 | } 141 | } 142 | 143 | Vec(const Vec& rhs): m_scalarVal(rhs.m_scalarVal), m_isScalar(rhs.m_isScalar), m_size(rhs.m_size), m_implSize(rhs.m_implSize) 144 | { 145 | m_pData = nullptr; 146 | 147 | if( !m_isScalar) 148 | { 149 | m_implSize = m_size; 150 | allocPool(m_implSize,m_pData); 151 | std::copy(rhs.m_pData, rhs.m_pData+ m_implSize , m_pData); 152 | } 153 | } 154 | 155 | Vec& operator=(const Vec& rhs) 156 | { 157 | if (&rhs != this) 158 | { 159 | if (m_pData != nullptr) 160 | { 161 | freePool(m_implSize, m_pData); 162 | m_pData = nullptr; 163 | m_size = 0; 164 | m_implSize = 0; 165 | } 166 | 167 | m_isScalar = rhs.m_isScalar; 168 | m_scalarVal = rhs.m_scalarVal; 169 | 170 | if( !m_isScalar) 171 | { 172 | m_size= rhs.m_size; 173 | m_implSize = m_size; 174 | allocPool(m_implSize,m_pData); 175 | std::copy(rhs.m_pData, rhs.m_pData+ m_implSize , m_pData); 176 | } 177 | } 178 | 179 | return *this; 180 | } 181 | 182 | 183 | Vec(Vec&& rhs) noexcept 184 | { 185 | m_implSize = 0; 186 | m_isScalar = true; 187 | m_scalarVal = InstructionTraits::nullValue; 188 | m_size = 0; 189 | m_pData = nullptr; 190 | *this = std::move(rhs); 191 | } 
192 | 193 | Vec& operator=( Vec&& rhs) noexcept 194 | { 195 | if (&rhs != this) 196 | { 197 | std::swap(m_isScalar , rhs.m_isScalar); 198 | std::swap(m_scalarVal , rhs.m_scalarVal); 199 | std::swap( m_implSize , rhs.m_implSize); 200 | std::swap(m_size, rhs.m_size); 201 | std::swap(m_pData, rhs.m_pData); 202 | } 203 | return *this; 204 | } 205 | 206 | 207 | //explicit 208 | operator std::vector::FloatType>() 209 | { 210 | return std::vector::FloatType>(begin(), end()); 211 | } 212 | 213 | 214 | typename InstructionTraits::FloatType& operator[](size_t pos) 215 | { 216 | return m_pData[pos]; 217 | } 218 | 219 | typename InstructionTraits::FloatType operator[](size_t pos) const 220 | { 221 | return m_pData[pos]; 222 | } 223 | 224 | 225 | inline typename InstructionTraits::FloatType* start() const 226 | { 227 | return m_pData; 228 | } 229 | 230 | inline typename InstructionTraits::FloatType* data() 231 | { 232 | return m_pData; 233 | } 234 | 235 | 236 | inline int size() const 237 | { 238 | return m_size; 239 | } 240 | 241 | 242 | inline int paddedSize() const 243 | { 244 | return static_cast(m_implSize); 245 | } 246 | 247 | inline bool isScalar() const 248 | { 249 | return m_isScalar; 250 | } 251 | 252 | inline typename InstructionTraits::FloatType getScalarValue() const 253 | { 254 | return m_scalarVal; 255 | } 256 | 257 | inline void setScalarValue( typename InstructionTraits::FloatType val) 258 | { 259 | m_scalarVal = val; 260 | } 261 | 262 | inline typename InstructionTraits::FloatType* begin() const 263 | { 264 | return start(); 265 | } 266 | 267 | inline typename InstructionTraits::FloatType* end() const 268 | { 269 | return start() + static_cast(m_size); 270 | } 271 | 272 | inline static INS_VEC reg(typename InstructionTraits::FloatType val) 273 | { 274 | return INS_VEC(val); 275 | } 276 | 277 | }; 278 | 279 | 280 | 281 | 282 | template 283 | bool isScalar(const Vec & X) 284 | { 285 | return X.isScalar(); 286 | } 287 | 288 | template 289 | bool isScalar( Vec& X) 
290 | { 291 | return X.isScalar(); 292 | } 293 | 294 | 295 | template 296 | bool isScalar(const VecView& X) 297 | { 298 | return X.isScalar(); 299 | } 300 | 301 | template 302 | bool isScalar( VecView& X) 303 | { 304 | return X.isScalar(); 305 | } 306 | 307 | 308 | template 309 | bool isScalar(const T&) 310 | { 311 | return true; 312 | } 313 | 314 | template 315 | bool isScalar( T&) 316 | { 317 | return true; 318 | } 319 | 320 | 321 | template 322 | bool isScalar(const Span& X) 323 | { 324 | return false; 325 | } 326 | 327 | template< template typename VEC_TYPE, typename INS_VEC, typename OP> 328 | void getScalarValue(const VEC_TYPE& rhs1, typename InstructionTraits::FloatType& val) 329 | { 330 | 331 | if (isScalar(rhs1)) 332 | { 333 | val = rhs1.getScalarValue(); 334 | } 335 | } -------------------------------------------------------------------------------- /Vectorisation/VecX/vec_bool.h: -------------------------------------------------------------------------------- 1 | /**************************** vec_bool.h ******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 
11 | *****************************************************************************/ 12 | #pragma once 13 | 14 | #include 15 | #include 16 | #include "alloc_policy.h" 17 | #include "apply_operation.h" 18 | 19 | template 20 | class VecBool 21 | { 22 | private: 23 | 24 | 25 | typename InstructionTraits::FloatType* m_pData; 26 | size_t m_size; 27 | size_t m_implSize; 28 | 29 | bool m_scalarVal; 30 | bool m_isScalar = false; 31 | 32 | public: 33 | explicit VecBool(bool scalar) : m_pData(nullptr), m_size(0), m_implSize(0), m_scalarVal(scalar),m_isScalar(true) 34 | { 35 | 36 | } 37 | 38 | VecBool(int sz) :m_size(sz), m_implSize(sz), m_scalarVal(0), m_isScalar(false) 39 | { 40 | allocPool(m_implSize, m_pData); 41 | } 42 | 43 | 44 | ~VecBool() 45 | { 46 | if (m_pData != nullptr) 47 | { 48 | freePool(m_size, m_pData); 49 | } 50 | } 51 | 52 | VecBool(const VecBool& rhs) 53 | { 54 | m_isScalar = rhs.m_isScalar; 55 | m_scalarVal = rhs.m_scalarVal; 56 | m_size = rhs.m_size; 57 | m_implSize = m_size; 58 | allocPool(m_implSize, m_pData); 59 | std::copy(rhs.m_pData, rhs.m_pData + m_implSize, m_pData); 60 | } 61 | 62 | 63 | VecBool& operator=(const VecBool& rhs) 64 | { 65 | if (&rhs != this) 66 | { 67 | m_isScalar = rhs.m_isScalar; 68 | m_scalarVal = rhs.m_scalarVal; 69 | m_size = rhs.m_size; 70 | m_implSize = rhs.m_implSize; 71 | std::copy(rhs.m_pData, rhs.m_pData + m_implSize, m_pData); 72 | } 73 | return *this; 74 | } 75 | 76 | 77 | VecBool(VecBool&& rhs) noexcept 78 | { 79 | m_isScalar = rhs.m_isScalar; 80 | m_scalarVal = rhs.m_scalarVal; 81 | m_implSize = 0; 82 | m_implSize= rhs.m_implSize; 83 | m_size = rhs.size(); 84 | m_pData = nullptr; 85 | *this = std::move(rhs); 86 | } 87 | 88 | VecBool& operator=(VecBool&& rhs) noexcept 89 | { 90 | if (&rhs != this) 91 | { 92 | std::swap(m_implSize, rhs.m_implSize); 93 | std::swap(m_size, rhs.m_size); 94 | std::swap(m_pData, rhs.m_pData); 95 | std::swap(m_scalarVal,rhs.m_scalarVal); 96 | std::swap(m_isScalar, rhs.m_isScalar); 97 | 98 | } 
99 | return *this; 100 | } 101 | 102 | 103 | inline typename InstructionTraits::FloatType* start() const 104 | { 105 | return m_pData; 106 | } 107 | 108 | inline size_t size() const 109 | { 110 | return m_size; 111 | 112 | } 113 | 114 | inline size_t paddedSize() const 115 | { 116 | return m_implSize; 117 | } 118 | 119 | inline bool getScalarValue() const 120 | { 121 | return m_scalarVal; 122 | } 123 | 124 | inline bool isScalar() const 125 | { 126 | return m_isScalar; 127 | } 128 | 129 | inline bool operator[](int j)const 130 | { 131 | return m_pData[j]; 132 | } 133 | 134 | 135 | void setAt(int j, bool val) 136 | { 137 | m_pData[j] = val; 138 | } 139 | 140 | }; 141 | 142 | -------------------------------------------------------------------------------- /Vectorisation/VecX/vec_bool_d.h: -------------------------------------------------------------------------------- 1 | /**************************** vec_bool_d.h ******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 
11 | *****************************************************************************/ 12 | #pragma once 13 | #include "vec_double.h" 14 | 15 | static inline double asDouble( bool bVal) 16 | { 17 | if(!bVal) return 0.0; 18 | return static_cast(0xFFFFFFFFFFFFFFFF); 19 | } 20 | 21 | 22 | static inline bool asBool( double val) 23 | { 24 | if( 0.0 == val) return false; 25 | return true; 26 | 27 | } 28 | 29 | 30 | class VecBoolD 31 | { 32 | private: 33 | 34 | double m_data[2]; 35 | public: 36 | VecBoolD() { m_data[0] = asDouble(false); m_data[1] = asDouble(false);} 37 | 38 | VecBoolD( bool d0,bool d1){ m_data[0] = asDouble(d0); m_data[1] = asDouble(d1);} 39 | 40 | VecBoolD& operator = ( const VecBoolD& rhs) 41 | { 42 | m_data[0] =rhs.m_data[0]; m_data[1] =rhs.m_data[1]; 43 | return *this; 44 | } 45 | 46 | VecBoolD(bool d0) { m_data[0] = asDouble(d0); m_data[1] = asDouble(d0); } 47 | 48 | VecBoolD& operator = (bool rhs) 49 | { 50 | m_data[0] = asDouble(rhs); m_data[1] = asDouble(rhs); 51 | return *this; 52 | } 53 | 54 | VecBoolD& load_a ( const double* p) 55 | { 56 | m_data[0] =p[0]; 57 | m_data[1] =p[1]; 58 | return *this; 59 | } 60 | 61 | 62 | void store_a( double* p) 63 | { 64 | p[0] = m_data[0]; 65 | p[1] =m_data[1]; 66 | } 67 | 68 | bool extract(size_t idx) const 69 | { 70 | return asBool(m_data[idx]); 71 | } 72 | 73 | 74 | void insert(size_t idx,bool val) 75 | { 76 | m_data[idx]= asDouble(val); 77 | } 78 | 79 | 80 | bool operator [] (size_t index) const 81 | { 82 | return extract(index); 83 | } 84 | 85 | static int size() { return 2;} 86 | 87 | inline bool isScalar() const 88 | { 89 | return false; 90 | } 91 | 92 | 93 | }; 94 | 95 | 96 | 97 | static inline bool horizontal_or(VecBoolD const & a) 98 | { 99 | return a[0] || a[1]; 100 | } 101 | 102 | static inline bool horizontal_and(VecBoolD const & a) 103 | { 104 | return a[0] && a[1]; 105 | } 106 | 107 | 108 | static inline VecBoolD operator &&(VecBoolD const& a, const VecBoolD& b) 109 | { 110 | return VecBoolD( 
static_cast(a[0]) & static_cast(b[0]), static_cast(a[1]) & static_cast(b[1])); 111 | } 112 | 113 | 114 | static inline VecBoolD operator ||(VecBoolD const& a, const VecBoolD& b) 115 | { 116 | return VecBoolD(static_cast(a[0]) | static_cast(b[0]), static_cast(a[1]) | static_cast(b[1]) ); 117 | } 118 | 119 | 120 | static inline VecBoolD operator !(VecBoolD const& a) 121 | { 122 | return VecBoolD(!a[0] , !a[1]); 123 | } 124 | -------------------------------------------------------------------------------- /Vectorisation/VecX/vec_d.h: -------------------------------------------------------------------------------- 1 | /**************************** vec_d.h ******************************* 2 | * Author: Andrew Drakeford 3 | * Date created: 2021-04-10 4 | * Last modified: 2021-04-10 5 | * Version: 1.0 6 | * Project: DR Cubed 7 | * Description: 8 | * 9 | * (c) Copyright 2019 Andrew Drakeford 10 | * Apache License version 2.0 or later. 11 | *****************************************************************************/ 12 | #pragma once 13 | #include 14 | #include 15 | #include "vec.h" 16 | #include "vec_bool.h" 17 | 18 | 19 | 20 | template 21 | class VecD 22 | { 23 | private: 24 | public: 25 | 26 | Vec< INS_VEC> val; 27 | Vec< INS_VEC> deriv; 28 | 29 | public: 30 | 31 | VecD() 32 | {} 33 | 34 | static VecD< INS_VEC> makeDVecZero(const Vec& value) 35 | { 36 | if (value.isScalar()) 37 | { 38 | return VecD(value.getScalarValue(), InstructionTraits::nullValue); 39 | } 40 | std::vector< typename InstructionTraits::FloatType> zeros(value.size(), InstructionTraits::nullValue); 41 | return VecD(value, Vec< INS_VEC>(zeros)); 42 | } 43 | 44 | static VecD< INS_VEC> makeDVecOnes(const Vec& value) 45 | { 46 | if (value.isScalar()) 47 | { 48 | return VecD(value.getScalarValue(), InstructionTraits::oneValue); 49 | } 50 | std::vector< typename InstructionTraits::FloatType> ones(value.size(), InstructionTraits::oneValue); 51 | return VecD(value, ones); 52 | } 53 | 54 | 55 | static VecD< 
INS_VEC> makeDVecOnes(const typename InstructionTraits::FloatType& value, int sz) 56 | { 57 | Vec< INS_VEC> values(value, sz); 58 | Vec< INS_VEC> ones(InstructionTraits::oneValue, sz); 59 | return VecD(values, ones); 60 | } 61 | 62 | static VecD< INS_VEC> makeDVecZero(const typename InstructionTraits::FloatType& value, int sz) 63 | { 64 | Vec< INS_VEC> values(value, sz); 65 | Vec< INS_VEC> zeros(InstructionTraits::nullValue, sz); 66 | return VecD(values, zeros); 67 | } 68 | 69 | static VecD< INS_VEC> makeDVecOnesV(const typename InstructionTraits::FloatType& value, int sz) 70 | { 71 | Vec< INS_VEC> values(value, sz); 72 | std::vector< typename InstructionTraits::FloatType> ones(value.size(), InstructionTraits::oneValue); 73 | return VecD(values, ones); 74 | } 75 | 76 | static VecD< INS_VEC> makeDVecZeroV(const typename InstructionTraits::FloatType& value, int sz) 77 | { 78 | Vec< INS_VEC> values(value, sz); 79 | std::vector< typename InstructionTraits::FloatType> nulls(value.size(), InstructionTraits::nullValue); 80 | return VecD(values, nulls); 81 | } 82 | 83 | explicit VecD(typename InstructionTraits::FloatType scalarVal) 84 | :val(scalarVal), deriv(InstructionTraits::nullValue) 85 | { 86 | 87 | } 88 | 89 | 90 | VecD(typename InstructionTraits::FloatType scalarVal, typename InstructionTraits::FloatType derivVal) 91 | :val(scalarVal), deriv(derivVal) 92 | { 93 | 94 | } 95 | 96 | 97 | 98 | VecD(const std::vector< typename InstructionTraits::FloatType> & ctr) :val(ctr), deriv(InstructionTraits::nullValue, ctr.size()) 99 | { 100 | 101 | } 102 | 103 | 104 | VecD(const Vec& value, const Vec& derivative) : val(value), deriv(derivative) 105 | {} 106 | 107 | 108 | VecD(Vec&& value, Vec&& derivative) : 109 | val(std::forward< Vec>(value)), 110 | deriv(std::forward>(derivative)) 111 | {} 112 | 113 | 114 | VecD(Vec&& value) : 115 | val(std::forward< Vec>(value)) 116 | { 117 | if (!val.isScalar()) 118 | { 119 | deriv(InstructionTraits::nullVal, value.size()); 120 | } 121 | 
else 122 | { 123 | deriv(InstructionTraits::nullVal); 124 | } 125 | } 126 | 127 | 128 | VecD(Vec&& value, const Vec& d) : 129 | val(std::forward< Vec>(value)), deriv(d) 130 | { 131 | } 132 | 133 | 134 | //explicit 135 | VecD(const Vec& value) : 136 | val(value), deriv(value.isScalar()? InstructionTraits::nullValue : Vec< INS_VEC>(InstructionTraits::nullValue,value.size() ) ) 137 | { 138 | } 139 | 140 | 141 | explicit VecD(size_t sz) :val(sz), deriv(sz) 142 | { 143 | 144 | } 145 | 146 | 147 | typename InstructionTraits::FloatType& operator[](size_t pos) 148 | { 149 | return val[pos]; 150 | } 151 | 152 | typename InstructionTraits::FloatType operator[](size_t pos) const 153 | { 154 | return val[pos]; 155 | } 156 | 157 | 158 | inline typename InstructionTraits::FloatType* start() const 159 | { 160 | return val.start(); 161 | } 162 | 163 | 164 | inline size_t size() const 165 | { 166 | return val.size(); 167 | } 168 | 169 | 170 | 171 | inline int paddedSize() const 172 | { 173 | return static_cast(val.paddedSize()); 174 | } 175 | 176 | inline bool isScalar() const 177 | { 178 | return val.isScalar(); 179 | } 180 | 181 | inline typename InstructionTraits::FloatType getScalarValue() const 182 | { 183 | return val.getScalarValue(); 184 | } 185 | 186 | inline void setScalarValue(typename InstructionTraits::FloatType newVal) 187 | { 188 | val.setScalarValue(newVal); 189 | } 190 | 191 | 192 | inline typename InstructionTraits::FloatType getScalarDeriv() const 193 | { 194 | return deriv.getScalarValue(); 195 | } 196 | 197 | inline void setScalarDeriv(typename InstructionTraits::FloatType newVal) 198 | { 199 | deriv.setScalarValue(newVal); 200 | } 201 | 202 | 203 | 204 | inline const Vec< INS_VEC>& value() const 205 | { 206 | return val; 207 | } 208 | 209 | inline const Vec< INS_VEC>& derivative() const 210 | { 211 | return deriv; 212 | } 213 | 214 | inline Vec< INS_VEC>& value() 215 | { 216 | return val; 217 | } 218 | 219 | inline Vec< INS_VEC>& derivative() 220 | { 221 | 
return deriv; 222 | } 223 | 224 | 225 | }; 226 | 227 | 228 | template 229 | VecD D(const Vec& rhs) 230 | { 231 | return VecD::makeDVecOnes(rhs); 232 | } 233 | 234 | 235 | template 236 | VecD C(const Vec& rhs) 237 | { 238 | return VecD::makeDVecZero(rhs); 239 | } 240 | 241 | -------------------------------------------------------------------------------- /Vectorisation/Vectorisation.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/Vectorisation.cpp -------------------------------------------------------------------------------- /Vectorisation/Vectorisation.log: -------------------------------------------------------------------------------- 1 | C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Current\Bin\Microsoft.Common.CurrentVersion.targets(820,5): error : The BaseOutputPath/OutputPath property is not set for project 'Vectorisation.vcxproj'. Please check to make sure that you have specified a valid combination of Configuration and Platform for this project. Configuration='Debug' Platform='ARM64'. This error may also appear if some other project is trying to follow a project-to-project reference to this project, this project has been unloaded or is not included in the solution, and the referencing project does not build using the same or an equivalent Configuration or Platform. 
2 | -------------------------------------------------------------------------------- /Vectorisation/Vectorisation.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {06d8baf8-80f2-40a5-98f6-1cf93bfb3eaa} 14 | 15 | 16 | 17 | 18 | VecWrapper 19 | 20 | 21 | VecWrapper 22 | 23 | 24 | VecWrapper 25 | 26 | 27 | VecWrapper 28 | 29 | 30 | VecWrapper 31 | 32 | 33 | VecWrapper 34 | 35 | 36 | VecWrapper 37 | 38 | 39 | VecWrapper 40 | 41 | 42 | Header Files 43 | 44 | 45 | Header Files 46 | 47 | 48 | Header Files 49 | 50 | 51 | Header Files 52 | 53 | 54 | Header Files 55 | 56 | 57 | Header Files 58 | 59 | 60 | Header Files 61 | 62 | 63 | Header Files 64 | 65 | 66 | Header Files 67 | 68 | 69 | Header Files 70 | 71 | 72 | Header Files 73 | 74 | 75 | Header Files 76 | 77 | 78 | Header Files 79 | 80 | 81 | Header Files 82 | 83 | 84 | Header Files 85 | 86 | 87 | Header Files 88 | 89 | 90 | Header Files 91 | 92 | 93 | Header Files 94 | 95 | 96 | Header Files 97 | 98 | 99 | Header Files 100 | 101 | 102 | 103 | 104 | VecWrapper 105 | 106 | 107 | VecWrapper 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /Vectorisation/Vectorisation.vcxproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | -------------------------------------------------------------------------------- /Vectorisation/intel_Libs/libirc.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/intel_Libs/libirc.lib 
-------------------------------------------------------------------------------- /Vectorisation/intel_Libs/svml_disp.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/intel_Libs/svml_disp.lib -------------------------------------------------------------------------------- /Vectorisation/intel_Libs/svml_dispmd.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/intel_Libs/svml_dispmd.lib -------------------------------------------------------------------------------- /Vectorisation/intel_Libs/svml_dispmt.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/intel_Libs/svml_dispmt.lib -------------------------------------------------------------------------------- /Vectorisation/intel_Libs/svmlpatch.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/intel_Libs/svmlpatch.lib -------------------------------------------------------------------------------- /Vectorisation/intrinsic_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | /**************************** unroll_operators.h ******************************* 3 | * Author: Andrew Drakeford 4 | * Date created: 2021-04-10 5 | * Last modified: 2021-04-10 6 | * Version: 1.0 7 | * Project: DR Cubed 8 | * Description: 9 | * 10 | * (c) Copyright 2019 Andrew Drakeford 11 | * Apache License version 2.0 or later. 
12 | *****************************************************************************/ 13 | 14 | 15 | #include "../Vectorisation/VecX/instruction_traits.h" 16 | #include 17 | 18 | /* 19 | //_MM_MANTISSA_NORM_ENUM n; 20 | //_MM_MANTISSA_SIGN_ENUM m; 21 | 22 | static inline __m256d getMantissa(__m256d x)// const int n, const int m) 23 | { 24 | 25 | return _mm256_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero); 26 | } 27 | 28 | static inline __m512d getMantissa(__m512d x, int n, const int m) 29 | { 30 | return _mm512_getmant_pd(x,n,m); 31 | } 32 | 33 | 34 | 35 | static inline __m256d getExponent(__m256d x) 36 | { 37 | 38 | return _mm256_getexp_pd(x); 39 | } 40 | 41 | static inline __m512d getExponent(__m512d x) 42 | { 43 | return _mm512_getexp_pd(x); 44 | } 45 | 46 | static inline __m128d getExponent(__m128d x) 47 | { 48 | return _mm_getexp_pd(x); 49 | } 50 | 51 | static inline __m256 getExponent(__m256 x) 52 | { 53 | 54 | return _mm256_getexp_ps(x); 55 | } 56 | 57 | static inline __m512 getExponent(__m512 x) 58 | { 59 | return _mm512_getexp_ps(x); 60 | } 61 | 62 | static inline __m128 getExponent(__m128 x) 63 | { 64 | return _mm_getexp_ps(x); 65 | } 66 | 67 | */ -------------------------------------------------------------------------------- /Vectorisation/packages.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | -------------------------------------------------------------------------------- /Vectorisation/pch.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/pch.h -------------------------------------------------------------------------------- /accumulateExample/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(accumulateExample accumulate_example.cpp) 2 | 3 | target_link_libraries(accumulateExample PUBLIC 
Vectorisation) 4 | 5 | target_include_directories(accumulateExample PUBLIC 6 | "${PROJECT_BINARY_DIR}" 7 | ) -------------------------------------------------------------------------------- /accumulateExample/accumulateExample.log: -------------------------------------------------------------------------------- 1 | C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Current\Bin\Microsoft.Common.CurrentVersion.targets(820,5): error : The BaseOutputPath/OutputPath property is not set for project 'accumulateExample.vcxproj'. Please check to make sure that you have specified a valid combination of Configuration and Platform for this project. Configuration='Debug' Platform='ARM64'. This error may also appear if some other project is trying to follow a project-to-project reference to this project, this project has been unloaded or is not included in the solution, and the referencing project does not build using the same or an equivalent Configuration or Platform. 2 | -------------------------------------------------------------------------------- /accumulateExample/accumulateExample.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | Header Files 30 | 31 | 32 | -------------------------------------------------------------------------------- /accumulateExample/accumulateExample.vcxproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | -------------------------------------------------------------------------------- 
/accumulateExample/norm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | double qnorm5(double p); 5 | double qnorm6(double p); 6 | double qnorm7(double p); -------------------------------------------------------------------------------- /cumNormalExample/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(cumNormalExample cumNormalExample.cpp) 2 | 3 | target_link_libraries(cumNormalExample PUBLIC Vectorisation) 4 | 5 | target_include_directories(cumNormalExample PUBLIC 6 | "${PROJECT_BINARY_DIR}" 7 | ) -------------------------------------------------------------------------------- /cumNormalExample/cumNormal.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | 5 | 6 | 7 | 8 | #include "../Vectorisation/VecX/vec.h" 9 | #include "../Vectorisation/VecX/operations.h" 10 | #include "../Vectorisation/VecX/apply_operation.h" 11 | #include "../Vectorisation/VecX/vec_d.h" 12 | #include "../Vectorisation/VecX/vec_bool.h" 13 | #include "../Vectorisation/VecX/vec_view.h" 14 | 15 | #include "../Vectorisation/VecX/target_name_space.h" 16 | 17 | #include 18 | 19 | 20 | 21 | 22 | 23 | //using namespace DRC::VecDb; 24 | //using namespace DRC::VecD2D; 25 | using namespace DRC::VecD4D; 26 | //using namespace DRC::VecD8D; 27 | //using namespace DRC::VecF16F; 28 | //using namespace DRC::VecF8F; 29 | 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | 44 | 45 | double getnull(double); 46 | 47 | 48 | 49 | /* phi: cumulative normal distribution for one value or SIMD register x. With z = abs(x) it evaluates a "central" rational polynomial and an "outer" rational expression, selects central where z < 7.42 and outer elsewhere, scales by exp(-z*z/2), and returns that result for x <= 0 and one minus it for x > 0. Both branches are always evaluated; select() only picks the lanes. */ 50 | template 51 | VEC phi(VEC x) 52 | { 53 | // https://stackoverflow.com/questions/2328258/cumulative-normal-distribution-function-in-c-c 54 | // references a Wests's implementation in Willmot. 
55 | 56 | const VEC z = abs(x); 57 | 58 | 59 | constexpr double N0 = 220.206867912376; 60 | constexpr double N1 = 221.213596169931; 61 | constexpr double N2 = 112.079291497871; 62 | constexpr double N3 = 33.912866078383; 63 | constexpr double N4 = 6.37396220353165; 64 | constexpr double N5 = 0.700383064443688; 65 | constexpr double N6 = 3.52624965998911e-02; 66 | constexpr double M0 = 440.413735824752; 67 | constexpr double M1 = 793.826512519948; 68 | constexpr double M2 = 637.333633378831; 69 | constexpr double M3 = 296.564248779674; 70 | constexpr double M4 = 86.7807322029461; 71 | constexpr double M5 = 16.064177579207; 72 | constexpr double M6 = 1.75566716318264; 73 | constexpr double M7 = 8.83883476483184e-02; 74 | 75 | 76 | VEC n_c = (((((N6 * z + N5) * z + N4) * z + N3) * z + N2) * z + N1) * z + N0; 77 | VEC d_c = ((((((M7 * z + M6) * z + M5) * z + M4) * z + M3) * z + M2) * z + M1) * z + M0; 78 | VEC central = n_c / d_c; 79 | 80 | constexpr double inv_RT2PI(0.39894228040143267793994605993438); 81 | VEC d_outer = (((((20. * z) * z + 13.) * z + 200.) * z + 78.) * z + 300.) * z + 39.; 82 | VEC n_outer = ((((20. * z) * z + 13.) * z + 180.) * z + 65.) 
* z + 160.; 83 | VEC outer = inv_RT2PI * n_outer / d_outer; 84 | 85 | 86 | VEC e = exp(-z * z * 0.5); 87 | 88 | // static const double SPLIT = 7.07106781186547; //orig 89 | const VEC SPLIT(7.42);// 7106781186547; //play appears to give less error 90 | 91 | VEC RES = select((z < SPLIT), central, outer); 92 | RES *= e; 93 | 94 | return select(x <= VEC(0.0), RES, 1.0 - RES); 95 | 96 | } 97 | 98 | 99 | 100 | /* calcCDFNormal: applies the same central/outer approximation as phi() to every element of X by passing the onePass lambda to ApplyTransformUR_XX. The N[] and M[] tables hold the phi() coefficients in highest-power-first order. Returns the transformed vector. */ 101 | template 102 | VecXX calcCDFNormal(const VecXX& X) 103 | { 104 | //TO DO FMA 105 | auto centralLambda = [&](auto z) 106 | { 107 | 108 | const static double N[] = { 3.52624965998911e-02 , 0.700383064443688, 6.37396220353165, 33.912866078383, 112.079291497871, 221.213596169931, 220.206867912376 }; 109 | const static double M[] = { 8.83883476483184e-02, 1.75566716318264, 16.064177579207, 86.7807322029461 , 296.564248779674, 637.333633378831, 793.826512519948,440.413735824752 }; 110 | 111 | auto n_c = (((((N[0] * z + N[1]) * z + N[2]) * z + N[3]) * z + N[4]) * z + N[5]) * z + N[6]; 112 | auto d_c = ((((((M[0] * z + M[1]) * z + M[2]) * z + M[3]) * z + M[4]) * z + M[5]) * z + M[6]) * z + M[7]; 113 | return n_c / d_c; 114 | }; 115 | 116 | 117 | auto outerLambda = [](auto z) 118 | { 119 | constexpr double inv_RT2PI(0.39894228040143267793994605993438); 120 | auto d_outer = (((((20. * z) * z + 13.) * z + 200.) * z + 78.) * z + 300.) * z + 39.; 121 | auto n_outer = ((((20. * z) * z + 13.) * z + 180.) * z + 65.) 
* z + 160.; 122 | return inv_RT2PI * n_outer / d_outer; 123 | }; 124 | 125 | 126 | auto onePass = [=](auto x) 127 | { 128 | auto z = abs(x); 129 | auto e = exp(-z * z * 0.5); 130 | auto outer = outerLambda(z); 131 | auto central = centralLambda(z); 132 | auto SPLIT =7.42;// 7106781186547; //play appears to give less error 133 | auto RES = select((z < SPLIT), central, outer); 134 | RES *= e; 135 | return select(x <= 0.0, RES, 1.0 - RES); 136 | 137 | }; 138 | 139 | auto res = ApplyTransformUR_XX(X, onePass); 140 | return res; 141 | 142 | } 143 | 144 | 145 | 146 | 147 | /* calcCDFNormalFMA: variant of calcCDFNormal whose polynomials are written as nested mul_add calls and whose divisions are turned into one reciprocal followed by a multiply. onePass takes an early exit: when horizontal_and reports that every lane satisfies x*x < SPLIT*SPLIT, the outer expression is never evaluated. The lambda is applied with ApplyTransformUR_X (the ApplyTransformUR_XX call is left commented out). */ 148 | template 149 | VecXX calcCDFNormalFMA(const VecXX& X) 150 | { 151 | //TO DO FMA 152 | auto centralLambda = [&](auto z) 153 | { 154 | 155 | constexpr double N[] = { 3.52624965998911e-02 , 0.700383064443688, 6.37396220353165, 33.912866078383, 112.079291497871, 221.213596169931, 220.206867912376 }; 156 | constexpr double M[] = { 8.83883476483184e-02, 1.75566716318264, 16.064177579207, 86.7807322029461 , 296.564248779674, 637.333633378831, 793.826512519948,440.413735824752 }; 157 | 158 | auto inv_dc = 1.0/ mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(M[0], z, M[1]), z, M[2]), z, M[3]), z, M[4]), z, M[5]), z, M[6]), z, M[7]); 159 | auto n_c = mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(N[0], z, N[1]), z, N[2]), z, N[3]), z, N[4]), z, N[5]), z, N[6]); 160 | 161 | return n_c * inv_dc; 162 | }; 163 | 164 | 165 | auto outerLambda = [](auto z) 166 | { 167 | constexpr double inv_RT2PI(0.39894228040143267793994605993438); 168 | constexpr double d[] = { 20. , 13., 200., 78., 300., 39. }; 169 | constexpr double n[] = { 20., 13., 180., 65., 160. 
}; 170 | 171 | auto d_outer = mul_add(mul_add(mul_add(mul_add(mul_add((d[0]* z), z , d[1]), z , d[2]),z , d[3]), z , d[4]), z ,d[5]); 172 | auto inv_d_outer = inv_RT2PI / d_outer; 173 | 174 | auto n_outer = mul_add(mul_add(mul_add(mul_add((n[0] * z), z , n[1]), z , n[2]), z , n[3]), z , n[4]); 175 | return n_outer * inv_d_outer; 176 | }; 177 | 178 | 179 | 180 | auto onePass = [=](auto x) 181 | { 182 | auto z = abs(x); 183 | auto e = exp(-z * z * 0.5); 184 | auto central = centralLambda(z); 185 | auto SPLIT = 7.42;// 7106781186547; //play appears to give less error 186 | auto condAllDone = (x * x < SPLIT* SPLIT); 187 | 188 | if (horizontal_and(condAllDone)) 189 | { 190 | central *= e; 191 | return select(x <= 0.0, central, 1.0 - central); 192 | } 193 | 194 | auto outer = outerLambda(z); 195 | auto RES = select((z < SPLIT), central, outer); 196 | RES *= e; 197 | return select(x <= 0.0, RES, 1.0 - RES); 198 | 199 | }; 200 | 201 | //auto res = ApplyTransformUR_XX(X, onePass); 202 | auto res = ApplyTransformUR_X(X, onePass); 203 | 204 | 205 | return res; 206 | 207 | } 208 | -------------------------------------------------------------------------------- /cumNormalExample/cumNormalExample.vcxproj.filters: -------------------------------------------------------------------------------- 1 | ﻿ 2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | 23 | 24 | Header Files 25 | 26 | 27 | -------------------------------------------------------------------------------- /cumNormalExample/cumNormalExample.vcxproj.user: -------------------------------------------------------------------------------- 1 | ﻿ 2 | 3 | 4 | 
-------------------------------------------------------------------------------- /dancingAVX512/AVX512Dance.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | 16 | #include "../Vectorisation/VecX/dr3.h" 17 | 18 | 19 | const double billion = 1000000000.0; 20 | 21 | 22 | using Calc_Values = std::map; 23 | using Mapped_Performance_Results = std::map >; // array size v vector 24 | using Mapped_Stats = std::map >; // size -.pair ( throughput , std dev of through put) 25 | 26 | struct RunResults 27 | { 28 | Mapped_Performance_Results m_raw_results; 29 | Calc_Values m_calc_results; 30 | double time; 31 | }; 32 | 33 | class TimerGuard 34 | { 35 | double& m_runTime; 36 | std::chrono::high_resolution_clock::time_point m_startTme; 37 | 38 | public: 39 | TimerGuard(double& runTime) : m_runTime(runTime), m_startTme(std::chrono::high_resolution_clock::now()) { runTime = 0.; } 40 | 41 | ~TimerGuard() 42 | { 43 | auto endTime = std::chrono::high_resolution_clock::now(); 44 | auto runtime = endTime - m_startTme; 45 | m_runTime = runtime.count() / billion; 46 | } 47 | }; 48 | 49 | 50 | auto getRandomShuffledVectorxxx(int SZ, int instance_number = 0) 51 | { 52 | static std::map > vectors; 53 | int key = 10 * SZ + instance_number; 54 | //store vectors with key 10 times size and add on 0-9 integer for instance of different random vector 55 | 56 | if (SZ < 0) 57 | { 58 | vectors.clear(); 59 | SZ = 0; 60 | } 61 | 62 | 63 | if (vectors.find(key) != vectors.end()) 64 | { 65 | return vectors[key]; 66 | } 67 | else 68 | { 69 | std::vector v(SZ, double(6.66)); 70 | for (int i = 0; i < SZ; i++) { v[i] += double(SZ / 2) + i; } 71 | std::random_device rd; 72 | std::mt19937 g(rd()); 73 | std::shuffle(begin(v), end(v), g); 74 | vectors[key] = v; 75 | return v; 76 | } 77 | } 78 | 79 | 80 | auto 
runFunctionOverDifferentSize = [](int testRepeats, int vec_start_size, int vec_stepSZ, int vec_maxSZ, const auto& func, long testLoopSZ) 81 | { 82 | 83 | RunResults results; 84 | 85 | for (int j = 0; j < testRepeats; ++j) 86 | { 87 | int VEC_SZ = vec_start_size; 88 | for (; VEC_SZ < vec_maxSZ; VEC_SZ += vec_stepSZ) 89 | { 90 | func(VEC_SZ, testLoopSZ); 91 | } 92 | } 93 | return results; 94 | }; 95 | 96 | 97 | 98 | 99 | void doAVXMax512Dance() 100 | { 101 | 102 | const int maxVectorSize = 4400; 103 | const int minVectorSize = 800; 104 | const long TEST_LOOP_SZ = 10000; 105 | const int vectorStepSize = 8; 106 | const int repeatRuns = 13; 107 | 108 | getRandomShuffledVectorxxx(-1); // reset random input vectors 109 | 110 | 111 | //avx512 lambda 112 | auto DR3_avx512 = [&](int SZ, long TEST_LOOP_SZ) 113 | { 114 | using namespace DRC::VecD8D; 115 | 116 | double time = 0.; 117 | volatile double res = 0.; 118 | 119 | auto mxDbl = [](auto lhs, auto rhs) { return iff(lhs > rhs, lhs, rhs); }; 120 | 121 | auto v1 = getRandomShuffledVectorxxx(SZ, 0); 122 | VecXX vec(v1); 123 | 124 | 125 | for (long l = 0; l < TEST_LOOP_SZ; l++) 126 | { 127 | res = reduce(vec, mxDbl); 128 | } 129 | 130 | return std::make_pair(res, time); 131 | }; 132 | 133 | 134 | auto DR3_avx2 = [&](int SZ, long TEST_LOOP_SZ) 135 | { 136 | using namespace DRC::VecD4D; 137 | 138 | double time = 0.; 139 | volatile double res = 0.; 140 | 141 | auto mxDbl = [](auto lhs, auto rhs) { return iff(lhs > rhs, lhs, rhs); }; 142 | 143 | auto v1 = getRandomShuffledVectorxxx(SZ, 0); 144 | VecXX vec(v1); 145 | 146 | 147 | for (long l = 0; l < TEST_LOOP_SZ; l++) 148 | { 149 | res = reduce(vec, mxDbl); 150 | } 151 | 152 | 153 | return std::make_pair(res, time); 154 | 155 | }; 156 | 157 | 158 | 159 | 160 | auto DR3_sse2 = [&](int SZ, long TEST_LOOP_SZ) 161 | { 162 | using namespace DRC::VecD2D; 163 | 164 | double time = 0.; 165 | volatile double res = 0.; 166 | 167 | auto mxDbl = [](auto lhs, auto rhs) { return iff(lhs > rhs, 
lhs, rhs); }; 168 | 169 | auto v1 = getRandomShuffledVectorxxx(SZ, 0); 170 | VecXX vec(v1); 171 | 172 | 173 | for (long l = 0; l < TEST_LOOP_SZ; l++) 174 | { 175 | res = reduce(vec, mxDbl); 176 | } 177 | 178 | return std::make_pair(res, time); 179 | 180 | }; 181 | 182 | 183 | 184 | auto DR3_stl = [&](int SZ, long TEST_LOOP_SZ) 185 | { 186 | using namespace DRC::VecD2D; 187 | 188 | double time = 0.; 189 | volatile double res = 0.; 190 | 191 | auto v1 = getRandomShuffledVectorxxx(SZ, 0); 192 | 193 | 194 | for (long l = 0; l < TEST_LOOP_SZ; l++) 195 | { 196 | res = *std::max_element(begin(v1), end(v1)); 197 | } 198 | 199 | return std::make_pair(res, time); 200 | }; 201 | 202 | 203 | 204 | 205 | using namespace std::chrono_literals; 206 | 207 | for (;;) 208 | { 209 | 210 | double time = 0.0; 211 | constexpr int NUM_BURSTS = 3; 212 | constexpr auto SLEEP_TIME = 20000ms; 213 | 214 | 215 | 216 | 217 | 218 | //STL 219 | for (int K = 0; K < NUM_BURSTS; K++) 220 | { 221 | time = 0.; 222 | std::cout << "1/3rd the work using STL max " << K + 1 << "of " << NUM_BURSTS << std::endl; 223 | { TimerGuard timer(time); 224 | auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_stl, TEST_LOOP_SZ / 3); 225 | } 226 | std::cout << "STL " << K + 1 << " of " << NUM_BURSTS << " " << time << " seconds now sleep" << std::endl; 227 | std::this_thread::sleep_for(SLEEP_TIME); 228 | } 229 | 230 | std::this_thread::sleep_for(SLEEP_TIME); 231 | 232 | 233 | 234 | //SSE2 235 | for (int K = 0; K < NUM_BURSTS; K++) 236 | { 237 | time = 0.; 238 | std::cout << "SSE2 " << K + 1 << " of " << NUM_BURSTS << std::endl; 239 | { TimerGuard timer(time); 240 | auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_sse2, TEST_LOOP_SZ); 241 | } 242 | std::cout << "SSE2 " << K + 1 << " of " << NUM_BURSTS << " " << time << " seconds now sleep" << std::endl; 243 | 
std::this_thread::sleep_for(SLEEP_TIME); 244 | } 245 | 246 | std::this_thread::sleep_for(SLEEP_TIME); 247 | 248 | 249 | //AVX2 250 | for (int K = 0; K < NUM_BURSTS; K++) 251 | { 252 | time = 0.; 253 | std::cout << "AVX2 " << K + 1 << " of " << NUM_BURSTS << std::endl; 254 | { TimerGuard timer(time); 255 | auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_avx2, TEST_LOOP_SZ); 256 | } 257 | std::cout << "AVX2 " << K + 1 << " of " << NUM_BURSTS << " " << time << " seconds now sleep" << std::endl; 258 | std::this_thread::sleep_for(SLEEP_TIME); 259 | 260 | } 261 | 262 | std::this_thread::sleep_for(SLEEP_TIME); 263 | 264 | //AVX512 265 | for (int K = 0; K < NUM_BURSTS; K++) 266 | { 267 | time = 0.; 268 | std::cout << "AVX512 " << K + 1 << " of " << NUM_BURSTS << std::endl; 269 | { 270 | TimerGuard timer(time); 271 | auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_avx512, TEST_LOOP_SZ); 272 | } 273 | std::cout << "AVX512 " << K + 1 << " of " << NUM_BURSTS << " " << time << " seconds now sleep" << std::endl; 274 | std::this_thread::sleep_for(SLEEP_TIME); 275 | } 276 | 277 | std::this_thread::sleep_for(SLEEP_TIME); 278 | 279 | 280 | 281 | 282 | 283 | } 284 | 285 | } 286 | 287 | 288 | 289 | 290 | -------------------------------------------------------------------------------- /dancingAVX512/AVX512Dance.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | void doAVXMax512Dance(); 5 | 6 | //return iff((x - (VecXX::INS(2.0) * floor(x * VecXX::INS(0.5)))) >= VecXX::INS(1.0), //auto oneIfOddLmbda = [&](auto x) { iff(x > VecXX::INS(0.5 * SZ), -------------------------------------------------------------------------------- /dancingAVX512/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(dancingAVX512 dancingAVX512.cpp 
AVX512Dance.cpp) 2 | 3 | target_link_libraries(dancingAVX512 PUBLIC Vectorisation) 4 | 5 | target_include_directories(dancingAVX512 PUBLIC 6 | "${PROJECT_BINARY_DIR}" 7 | ) 8 | 9 | -------------------------------------------------------------------------------- /dancingAVX512/dancingAVX512.cpp: -------------------------------------------------------------------------------- 1 | // dancingAVX512.cpp : This file contains the 'main' function. Program execution begins and ends there. 2 | // 3 | 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | 18 | #include "../Vectorisation/VecX/dr3.h" 19 | #include "../Vectorisation/VecX/accumulate_transform.h" 20 | #include "../Vectorisation/VecX/error_utils.h" 21 | 22 | 23 | #include "../Vectorisation/VecX/zip_utils.h" 24 | 25 | //#include "norm.h" 26 | 27 | #include "AVX512Dance.h" 28 | 29 | // use namespace DRC::VecD8D run this and watch power consumption 30 | // switches between AVX2 and AVX512 implementations 31 | // AVX512 uses less energy in this case 32 | /* Entry point: the only thing it does is call doAVXMax512Dance(), declared in AVX512Dance.h. */ 33 | int main() 34 | { 35 | // std::cout << "Hello World!\n"; 36 | doAVXMax512Dance(); 37 | } 38 | 39 | -------------------------------------------------------------------------------- /dancingAVX512/dancingAVX512.vcxproj.user: -------------------------------------------------------------------------------- 1 | ﻿ 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/BlackScholesVecXX.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/docs/BlackScholesVecXX.mp4 -------------------------------------------------------------------------------- /docs/Build.md: -------------------------------------------------------------------------------- 1 | # Supported platforms 2 | OS: Windows, Ubuntu 3 | Compiler: 
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>

/// Stub transform: ignores its argument and always returns 0.0.
double getnull(double)
{
    return 0.0;
}

namespace
{
    /// Horner evaluation of a polynomial whose coefficients are stored
    /// highest order first: ((c[0]*x + c[1])*x + c[2])*x + ...
    template <std::size_t Count>
    long double evalPoly(const long double (&coeff)[Count], long double x)
    {
        long double acc = coeff[0];
        for (std::size_t i = 1; i < Count; ++i)
        {
            acc = acc * x + coeff[i];
        }
        return acc;
    }
}

// @WichuraQuantile
// see R implementation
/// Quantile (inverse CDF) of the standard normal distribution, evaluated with
/// three rational approximations: one for the central region |p - 0.5| <= 0.425
/// and two for the tails.
///
/// @param p probability, expected in [0, 1]
/// @return  z such that Phi(z) = p; -infinity for p == 0, +infinity for p == 1,
///          and NaN when p is NaN or lies outside [0, 1]
long double qnorm8(long double p)
{
    // Coefficient tables, highest order first. The denominators carry their
    // constant term 1 as the last entry so one Horner helper serves all six.
    static constexpr long double a[] = { 2509.0809287301226727, 33430.575583588128105, 67265.770927008700853, 45921.953931549871457, 13731.693765509461125, 1971.5909503065514427, 133.14166789178437745, 3.387132872796366608 };
    static constexpr long double b[] = { 5226.495278852854561, 28729.085735721942674, 39307.89580009271061, 21213.794301586595867, 5394.1960214247511077, 687.1870074920579083, 42.313330701600911252, 1. };
    static constexpr long double c[] = { 7.7454501427834140764e-4, .0227238449892691845833, .24178072517745061177, 1.27045825245236838258, 3.64784832476320460504, 5.7694972214606914055, 4.6303378461565452959, 1.42343711074968357734 };
    static constexpr long double d[] = { 1.05075007164441684324e-9, 5.475938084995344946e-4, .0151986665636164571966, .14810397642748007459, .68976733498510000455, 1.6763848301838038494, 2.05319162663775882187, 1. };
    static constexpr long double e[] = { 2.01033439929228813265e-7, 2.71155556874348757815e-5, .0012426609473880784386, .026532189526576123093, .29656057182850489123, 1.7848265399172913358, 5.4637849111641143699, 6.6579046435011037772 };
    static constexpr long double f[] = { 2.04426310338993978564e-15, 1.4215117583164458887e-7, 1.8463183175100546818e-5, 7.868691311456132591e-4, .0148753612908506148525, .13692988092273580531, .59983220655588793769, 1. };

    // Domain handling: without it p == 0 or p == 1 reaches log(0) and the
    // far-tail rational evaluates to inf/inf.
    if (std::isnan(p) || p < 0.0L || p > 1.0L)
    {
        return std::numeric_limits<long double>::quiet_NaN();
    }
    if (p == 0.0L)
    {
        return -std::numeric_limits<long double>::infinity();
    }
    if (p == 1.0L)
    {
        return std::numeric_limits<long double>::infinity();
    }

    const long double q = p - 0.5;

    // std:: qualified calls guarantee the long double overloads are selected.
    if (std::fabs(q) <= .425)
    {
        const long double r = .180625 - q * q;
        return q * evalPoly(a, r) / evalPoly(b, r);
    }

    // closer than 0.075 from {0,1} boundary: r = sqrt(-log(min(p, 1-p)))
    long double r = std::sqrt(-std::log(std::min(p, 1.0L - p)));

    long double val = 0.0;
    if (r <= 5.)
    {
        // <==> min(p,1-p) >= exp(-25) ~= 1.3888e-11
        r += -1.6;
        val = evalPoly(c, r) / evalPoly(d, r);
    }
    else
    {
        // very close to 0 or 1
        r += -5.;
        val = evalPoly(e, r) / evalPoly(f, r);
    }

    // The tail formulas give the magnitude; the sign follows the side of 0.5.
    return (q < 0.0) ? -val : val;
}
2 | -------------------------------------------------------------------------------- /inverseCumNormalExample/inverseCumNormalExample.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | Source Files 23 | 24 | 25 | 26 | 27 | Header Files 28 | 29 | 30 | -------------------------------------------------------------------------------- /inverseCumNormalExample/inverseCumNormalExample.vcxproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | -------------------------------------------------------------------------------- /lattice/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(lattice lattice.cpp 2 | americanCrankNicholsonPricer.cpp 3 | americanFinitDiffPricer.cpp 4 | americanImplicitFiniteDiff.cpp 5 | americanTrinomialPricer.cpp 6 | americanTrinomialPricerUpAndOut.cpp 7 | europeanBinomialPricer.cpp 8 | euroTrinomial.cpp 9 | euroTrinomialPricerWithInit.cpp 10 | lattice_tools.cpp 11 | ) 12 | 13 | target_link_libraries(lattice PUBLIC Vectorisation) 14 | 15 | target_include_directories(lattice PUBLIC 16 | "${PROJECT_BINARY_DIR}" 17 | ) 18 | 19 | -------------------------------------------------------------------------------- /lattice/americanCrankNicholsonPricer.cpp: -------------------------------------------------------------------------------- 1 | #include "../Vectorisation/VecX/dr3.h" 2 | #include "utils.h" 3 | #include "pricers.h" 4 | 5 | 6 | //still broken ??? 
7 | double americanCrankNicholsonPricer(double S, double K, double sig, double r, double T, int N) 8 | { 9 | 10 | //dividend yield 11 | double y = 0.;// 0.03;// 0.0;// 0.03;// 0.03; //div yield 12 | VecXX terminalAssetPrices(1.0, 2 * N + 1); 13 | 14 | double Dt = T / N; 15 | double Dx = sig * std::sqrt(1.0 * Dt);// 0.2;// 16 | double v = r - y - 0.5 * sig * sig; 17 | 18 | 19 | VecXX::SCALA_TYPE pu = -0.25 * Dt * ((sig * sig) / (Dx * Dx) + v / Dx); 20 | VecXX::SCALA_TYPE pd = -0.25 * Dt * ((sig * sig) / (Dx * Dx) - v / Dx); 21 | VecXX::SCALA_TYPE pm = 1. + 0.5 * Dt * (sig * sig) / (Dx * Dx) + 0.5 * r * Dt; 22 | 23 | std::vector vdbg; 24 | //Pay off functions 25 | 26 | //call 27 | auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); }; 28 | 29 | //put 30 | //auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); }; 31 | 32 | //set up underlying asset prices at maturity 33 | double last = S * exp(-(N + 1) * Dx); 34 | double edx = exp(Dx); 35 | for (auto& el : terminalAssetPrices) 36 | { 37 | last *= edx; 38 | el = last; 39 | } 40 | 41 | //option vakue at maturity 42 | 43 | auto excerciseValue = transform(payOffFunc, terminalAssetPrices); 44 | 45 | 46 | //derivative boundary condition 47 | double lambda_L = -1. * (terminalAssetPrices[1] - terminalAssetPrices[0]); 48 | double lambda_U = 0.0; 49 | 50 | auto odd_slice = excerciseValue; 51 | 52 | //set up slices 53 | auto even_slice = odd_slice * 0.0; 54 | even_slice[0] = odd_slice[0]; 55 | 56 | int J = 2 * N; 57 | int k = 0; 58 | 59 | VecXX pmp(1.0, J + 1); 60 | VecXX pp(1.0, J + 1); 61 | 62 | for (; k <= N; k += 2) 63 | { 64 | 65 | // SOLVE IMPLICIT TRIDIAGONAL IN LINE //SUB BOUNDARY CONDITION AT J = -n INTO J = -n+1 66 | 67 | pmp[1] = pm + pd; 68 | pp[1] = -pu * odd_slice[2] - (pm - 2.) 
* odd_slice[1] - pd * odd_slice[0] + pd * lambda_L; 69 | 70 | 71 | // eliminate upper diagonal 72 | for (int j = 2; j < J; ++j) 73 | { 74 | pmp[j] = pm - pu * pd / pmp[j - 1]; 75 | pp[j] = -pu * odd_slice[j + 1] - (pm - 2.0) * odd_slice[j] - pd * odd_slice[j - 1] - pp[j - 1] * pd / pmp[j - 1]; 76 | } 77 | 78 | even_slice[J] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]); 79 | even_slice[J - 1] = even_slice[J] - lambda_U; 80 | 81 | 82 | // back substitution 83 | for (int j = J - 1; j >= 0; j--) 84 | { 85 | even_slice[j] = (pp[j] - pu * even_slice[j + 1]) / pmp[j]; 86 | } 87 | 88 | 89 | even_slice[0] = odd_slice[0]; 90 | //vdbg = even_slice; 91 | /* 92 | //american condition 93 | for (int j = 0; j < (J+1); j++) 94 | { 95 | even_slice[j] = std::max(even_slice[j], excerciseValue[j]); 96 | } 97 | */ 98 | 99 | // vdbg = even_slice; 100 | 101 | // vdbg = odd_slice; 102 | // vdbg = pmp; 103 | // vdbg = pp; 104 | 105 | 106 | // calc odd slice now 107 | ////////////////////////////////////////////////////////////////////// 108 | 109 | pmp[1] = pm + pd; 110 | pp[1] = -pu * even_slice[2] - (pm - 2.) 
* even_slice[1] - pd * even_slice[0] + pd * lambda_L; 111 | 112 | 113 | // eliminate upper diagonal 114 | for (int j = 2; j < J; ++j) 115 | { 116 | pmp[j] = pm - pu * pd / pmp[j - 1]; 117 | pp[j] = -pu * even_slice[j + 1] - (pm - 2.0) * even_slice[j] - pd * even_slice[j - 1] - pp[j - 1] * pd / pmp[j - 1]; 118 | } 119 | 120 | odd_slice[J] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]); 121 | odd_slice[J - 1] = odd_slice[J] - lambda_U; 122 | 123 | 124 | // back substitution 125 | for (int j = J - 1; j >= 0; j--) 126 | { 127 | odd_slice[j] = (pp[j] - pu * odd_slice[j + 1]) / pmp[j]; 128 | } 129 | 130 | odd_slice[0] = even_slice[0]; 131 | /* 132 | //american condition 133 | for (int j = 0; j < (J + 1); j++) 134 | { 135 | odd_slice[j] = std::max(odd_slice[j], excerciseValue[j]); 136 | } 137 | */ 138 | 139 | // vdbg = odd_slice; 140 | // vdbg = even_slice; 141 | // vdbg = pmp; 142 | // vdbg = pp; 143 | 144 | } 145 | 146 | return odd_slice[N]; 147 | } 148 | 149 | -------------------------------------------------------------------------------- /lattice/americanFinitDiffPricer.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "../Vectorisation/VecX/dr3.h" 3 | #include "utils.h" 4 | #include "pricers.h" 5 | 6 | 7 | double americanFiniteDiffPricer(double S, double K, double sig, double r, double T, int N) 8 | { 9 | 10 | //dividend yield 11 | double y = 0.0; 12 | VecXX terminalAssetPrices(1.0, 2 * N + 1); 13 | 14 | double Dt = T / N; 15 | double Dx = sig * std::sqrt(2.0 * Dt); 16 | double v = r - y - 0.5 * sig * sig; 17 | 18 | 19 | VecXX::INS pu = 0.5 * Dt * ((sig * sig) / (Dx * Dx) + v / Dx); 20 | VecXX::INS pd = 0.5 * Dt * ((sig * sig) / (Dx * Dx) - v / Dx); 21 | VecXX::INS pm = 1. 
- Dt * (sig * sig) / (Dx * Dx) - r * Dt; 22 | 23 | 24 | 25 | TrinomialSampler sampler; 26 | //introduces offset variables so that we can get vectorised versions 27 | // of X[i+1], X[i] and x[i-1] 28 | // under the hood these do unaligned loads into registers so taht we can still 29 | // apply the vectorised 30 | 31 | auto trinomialRollBack = [=](TrinomialSampler& sampler) 32 | { 33 | const auto& X1 = sampler.X_1.value; 34 | const auto& X0 = sampler.X_0.value; 35 | const auto& X_1 = sampler.X_Minus_1.value; 36 | return (X1 * pu + X0 * pm + X_1 * pd); 37 | }; 38 | 39 | //Pay off functions 40 | //call 41 | auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); }; 42 | 43 | //put 44 | //auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); }; 45 | 46 | //set up underlying asset prices at maturity 47 | double last = S * exp(-(N + 1) * Dx); 48 | double edx = exp(Dx); 49 | for (auto& el : terminalAssetPrices) 50 | { 51 | last *= edx; 52 | el = last; 53 | } 54 | 55 | auto excerciseValue = transform(payOffFunc, terminalAssetPrices); 56 | auto odd_slice = excerciseValue; 57 | 58 | UnitarySampler identity_sampler; //identity pas through 59 | 60 | //this is the american part of the option excercise 61 | auto applyEarlyExcercise = [=](UnitarySampler& sampler, auto excercisePrice) 62 | { 63 | auto optPrice = sampler.X_0.value; //.get<0>(); 64 | return max(optPrice, excercisePrice); 65 | }; 66 | 67 | auto even_slice = odd_slice * 0.0; 68 | 69 | int J = 2 * N; 70 | int k = 0; 71 | for (; k < N; k += 2) 72 | { 73 | 74 | transform(odd_slice, even_slice, trinomialRollBack, sampler, 0, J); 75 | 76 | //apply boundary condition 77 | even_slice[0] = even_slice[1] + terminalAssetPrices[1] - terminalAssetPrices[0]; 78 | even_slice[J] = even_slice[J - 1]; 79 | // transform to get early excercise for american exercise , iderntity sampler just passes values straight through 80 | transform(even_slice, excerciseValue, even_slice, applyEarlyExcercise, identity_sampler, 0, 
J); 81 | 82 | 83 | transform(even_slice, odd_slice, trinomialRollBack, sampler, 0, J); 84 | //boundary condition 85 | odd_slice[0] = odd_slice[1] + terminalAssetPrices[1] - terminalAssetPrices[0]; 86 | odd_slice[J] = odd_slice[J - 1]; 87 | transform(odd_slice, excerciseValue, odd_slice, applyEarlyExcercise, identity_sampler, 0, J); 88 | 89 | } 90 | 91 | return odd_slice[N]; 92 | } 93 | -------------------------------------------------------------------------------- /lattice/americanImplicitFiniteDiff.cpp: -------------------------------------------------------------------------------- 1 | #include "../Vectorisation/VecX/dr3.h" 2 | #include "../Vectorisation/VecX/zip_utils.h" 3 | #include "utils.h" 4 | #include "pricers.h" 5 | 6 | 7 | double americanImplicitFiniteDiffPricerFast(double S, double K, double sig, double r, double T, int N) 8 | { 9 | 10 | 11 | double y = 0.0;//dividend yield 12 | 13 | VecXX terminalAssetPrices(1.0, 2 * N + 1); 14 | 15 | double Dt = T / N; 16 | double Dx = sig * std::sqrt(2.0 * Dt); 17 | double v = r - y - 0.5 * sig * sig; 18 | 19 | 20 | VecXX::SCALA_TYPE pu = -0.5 * Dt * ((sig * sig) / (Dx * Dx) + v / Dx); 21 | VecXX::SCALA_TYPE pd = -0.5 * Dt * ((sig * sig) / (Dx * Dx) - v / Dx); 22 | VecXX::SCALA_TYPE pm = 1. 
+ Dt * (sig * sig) / (Dx * Dx) + r * Dt; 23 | 24 | 25 | std::vector vdbg; 26 | //Pay off functions 27 | 28 | //call 29 | auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); }; 30 | 31 | //put 32 | //auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); }; 33 | 34 | //set up underlying asset prices at maturity 35 | double last = S * exp(-(N + 1) * Dx); 36 | double edx = exp(Dx); 37 | for (auto& el : terminalAssetPrices) 38 | { 39 | last *= edx; 40 | el = last; 41 | } 42 | 43 | //option value at maturity 44 | auto excerciseValue = transform(payOffFunc, terminalAssetPrices); 45 | 46 | auto american = [](auto X, auto Y) { return select(X > Y, X, Y); }; 47 | 48 | 49 | //derivative boundary condition 50 | double lambda_L = -1. * (terminalAssetPrices[1] - terminalAssetPrices[0]); 51 | double lambda_U = 0.0; 52 | 53 | auto odd_slice = excerciseValue; 54 | // vdbg = odd_slice; 55 | 56 | 57 | auto even_slice = odd_slice * 0.0; 58 | 59 | int J = 2 * N; 60 | int k = 0; 61 | 62 | VecXX pmp(1.0, J + 1); 63 | VecXX pp(1.0, J + 1); 64 | 65 | //////////// 66 | //LOOP HOIST BITS FROM IMPLICIT TRIDIAGONAL 67 | 68 | pmp[1] = pm + pd; 69 | pp[1] = odd_slice[1] + pd * lambda_L; 70 | 71 | 72 | auto pu_pd = pu * pd; 73 | 74 | // eliminate upper diagonal 75 | for (int j = 2; j < J; ++j) 76 | { 77 | pmp[j] = pm - pu_pd / pmp[j - 1]; 78 | } 79 | 80 | auto inv_pmp = 1.0 / pmp; 81 | 82 | auto pd_inv_pmp = pd * inv_pmp; 83 | 84 | ///////////// 85 | 86 | for (; k < N; k += 2) 87 | { 88 | 89 | // SOLVE IMPLICIT TRIDIAGONAL IN LINE SUB BOUNDARY CONDITION AT J = -n INTO J = -n+1 90 | 91 | //pmp[1] = pm + pd; 92 | pp[1] = odd_slice[1] + pd * lambda_L; 93 | 94 | // eliminate upper diagonal 95 | for (int j = 2; j < J; ++j) 96 | { 97 | 98 | pp[j] = odd_slice[j] - pp[j - 1] * pd_inv_pmp[j - 1]; 99 | } 100 | 101 | even_slice[1] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]); 102 | even_slice[J - 1] = even_slice[J] - lambda_U; 103 | 104 | 105 | // back substitution 106 
| for (int j = J - 2; j != 0; j--) 107 | { 108 | even_slice[j] = (pp[j] - pu * even_slice[j + 1]) * inv_pmp[j]; 109 | } 110 | 111 | //american excercise bit 112 | even_slice = transform(american, even_slice, (const VecXX&)excerciseValue); 113 | 114 | 115 | // now calculate the odd slice 116 | 117 | pp[1] = even_slice[1] + pd * lambda_L; 118 | 119 | // eliminate upper diagonal 120 | for (int j = 2; j < J; ++j) 121 | { 122 | pp[j] = even_slice[j] - pp[j - 1] * pd_inv_pmp[j - 1]; 123 | } 124 | 125 | odd_slice[1] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]); 126 | odd_slice[J - 1] = odd_slice[J] - lambda_U; 127 | 128 | 129 | // back substitution 130 | for (int j = J - 2; j != 0; j--) 131 | { 132 | odd_slice[j] = (pp[j] - pu * odd_slice[j + 1]) * inv_pmp[j]; 133 | } 134 | 135 | //american excercise bit 136 | odd_slice = transform(american, odd_slice, (const VecXX&)excerciseValue); 137 | } 138 | 139 | return odd_slice[N]; 140 | } 141 | 142 | double americanImplicitFiniteDiffPricer(double S, double K, double sig, double r, double T, int N) 143 | { 144 | 145 | //dividend yield 146 | double y = 0.0;// 0.03;// 0.03; //div yield 147 | VecXX terminalAssetPrices(1.0, 2 * N + 1); 148 | 149 | double Dt = T / N; 150 | double Dx = sig * std::sqrt(2.0 * Dt); 151 | double v = r - y - 0.5 * sig * sig; 152 | 153 | //double u = Dx; 154 | //double d = 1. / u; 155 | 156 | 157 | VecXX::SCALA_TYPE pu = -0.5 * Dt * ((sig * sig) / (Dx * Dx) + v / Dx); 158 | VecXX::SCALA_TYPE pd = -0.5 * Dt * ((sig * sig) / (Dx * Dx) - v / Dx); 159 | VecXX::SCALA_TYPE pm = 1. 
+ Dt * (sig * sig) / (Dx * Dx) + r * Dt; 160 | 161 | 162 | std::vector vdbg; 163 | //Pay off functions 164 | 165 | //call 166 | auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); }; 167 | 168 | //put 169 | //auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); }; 170 | 171 | //set up underlying asset prices at maturity 172 | double last = S * exp(-(N + 1) * Dx); 173 | double edx = exp(Dx); 174 | for (auto& el : terminalAssetPrices) 175 | { 176 | last *= edx; 177 | el = last; 178 | } 179 | 180 | //option value at maturity 181 | 182 | auto excerciseValue = transform(payOffFunc, terminalAssetPrices); 183 | 184 | //derivative boundary condition 185 | double lambda_L = -1. * (terminalAssetPrices[1] - terminalAssetPrices[0]); 186 | double lambda_U = 0.0; 187 | 188 | auto odd_slice = excerciseValue; 189 | vdbg = odd_slice; 190 | 191 | 192 | auto even_slice = odd_slice * 0.0; 193 | 194 | int J = 2 * N; 195 | int k = 0; 196 | 197 | VecXX pmp(1.0, J + 1); 198 | VecXX pp(1.0, J + 1); 199 | 200 | for (; k < N; k += 2) 201 | { 202 | 203 | // SOLVE IMPLICIT TRIDIAGONAL IN LINE 204 | //SUB BOUNDARY CONDITION AT J = -n INTO J = -n+1 205 | pmp[1] = pm + pd; 206 | pp[1] = odd_slice[1] + pd * lambda_L; 207 | 208 | 209 | 210 | // eliminate upper diagonal 211 | for (int j = 2; j < J; ++j) 212 | { 213 | pmp[j] = pm - pu * pd / pmp[j - 1]; 214 | pp[j] = odd_slice[j] - pp[j - 1] * pd / pmp[j - 1]; 215 | } 216 | 217 | even_slice[1] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]); 218 | even_slice[J - 1] = even_slice[J] - lambda_U; 219 | 220 | 221 | // back substitution 222 | for (int j = J - 2; j != 0; j--) 223 | { 224 | even_slice[j] = (pp[j] - pu * even_slice[j + 1]) / pmp[j]; 225 | } 226 | 227 | 228 | 229 | // american 230 | for (int j = 0; j < (J + 1); j++) 231 | { 232 | even_slice[j] = std::max(even_slice[j], excerciseValue[j]); 233 | } 234 | 235 | 236 | //even_slice = transform(even_slice, excerciseValue); 237 | 238 | // calc odd slice now 239 | 
////////////////////////////////////////////////////////////////////// 240 | 241 | pmp[1] = pm + pd; 242 | pp[1] = even_slice[1] + pd * lambda_L; 243 | 244 | 245 | // eliminate upper diagonal 246 | for (int j = 2; j < J; ++j) 247 | { 248 | pmp[j] = pm - pu * pd / pmp[j - 1]; 249 | pp[j] = even_slice[j] - pp[j - 1] * pd / pmp[j - 1]; 250 | } 251 | 252 | odd_slice[1] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]); 253 | odd_slice[J - 1] = odd_slice[J] - lambda_U; 254 | 255 | 256 | // back substitution 257 | for (int j = J - 2; j != 0; j--) 258 | { 259 | odd_slice[j] = (pp[j] - pu * odd_slice[j + 1]) / pmp[j]; 260 | } 261 | 262 | 263 | //american 264 | for (int j = 0; j < (J + 1); j++) 265 | { 266 | odd_slice[j] = std::max(odd_slice[j], excerciseValue[j]); 267 | } 268 | 269 | 270 | } 271 | 272 | return odd_slice[N]; 273 | } 274 | -------------------------------------------------------------------------------- /lattice/americanTrinomialPricer.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "../Vectorisation/VecX/dr3.h" 3 | #include "utils.h" 4 | #include "pricers.h" 5 | 6 | 7 | double americanTrinomialPricer(double S, double K, double sig, double r, double T, int N) 8 | { 9 | 10 | double y = 0.0;// 0.03; //div yield 11 | 12 | 13 | VecXX terminalAssetPrices(1.0, 2 * N + 1); 14 | 15 | double Dt = T / N; 16 | double Dx = sig * std::sqrt(2.0 * Dt); 17 | double v = r - y - 0.5 * sig * sig; 18 | 19 | 20 | VecXX::INS pu = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) + (v * Dt) / Dx); 21 | VecXX::INS pd = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) - (v * Dt) / Dx); 22 | VecXX::INS pm = 1. 
- (Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx); 23 | 24 | 25 | VecXX::INS disc = exp(-r * Dt); 26 | TrinomialSampler sampler; 27 | 28 | 29 | auto trinomialRollBack = [=](TrinomialSampler& sampler) 30 | { 31 | auto X1 = sampler.X_Minus_1.value; 32 | auto X0 = sampler.X_0.value; 33 | auto X_1 = sampler.X_1.value; 34 | return disc * (X1 * pu + X0 * pm + X_1 * pd); 35 | }; 36 | 37 | 38 | //call 39 | auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); }; 40 | 41 | //put 42 | //auto payOffFunc = [=](auto X) { return select(X < K, K -X , 0.0); }; 43 | 44 | //set up underlying asset prices at maturity 45 | double last = S * exp(-(N + 1) * Dx); 46 | double edx = exp(Dx); 47 | for (auto& el : terminalAssetPrices) 48 | { 49 | last *= edx; 50 | el = last; 51 | } 52 | 53 | auto excerciseValue = transform(payOffFunc, terminalAssetPrices); 54 | auto odd_slice = excerciseValue; 55 | 56 | UnitarySampler identity_sampler; //identity just passes through 57 | 58 | auto applyEarlyExcercise = [=](UnitarySampler& sampler, auto excercisePrice) 59 | { 60 | auto optPrice = sampler.X_0.value; //.get<0>(); 61 | return max(optPrice, excercisePrice); 62 | }; 63 | 64 | 65 | auto even_slice = odd_slice; 66 | 67 | int j = 2 * N + 1 - 1; 68 | int i = 0; 69 | for (; i < N; i += 2) 70 | { 71 | transform(odd_slice, even_slice, trinomialRollBack, sampler, i, j); 72 | // transform to get early excercise for american bit , iderntity sampler just passes values straight through 73 | transform(even_slice, excerciseValue, even_slice, applyEarlyExcercise, identity_sampler, i, j); 74 | 75 | transform(even_slice, odd_slice, trinomialRollBack, sampler, i + 1, j - 1); 76 | transform(odd_slice, excerciseValue, odd_slice, applyEarlyExcercise, identity_sampler, i + 1, j - 1); 77 | 78 | j -= 2; 79 | } 80 | 81 | return odd_slice[N]; 82 | } 83 | -------------------------------------------------------------------------------- /lattice/americanTrinomialPricerUpAndOut.cpp: 
-------------------------------------------------------------------------------- 1 | #include "../Vectorisation/VecX/dr3.h" 2 | #include "utils.h" 3 | #include "pricers.h" 4 | 5 | 6 | 7 | double americanTrinomialPricerUpAndOut(double S, double K, double sig, double r, double T, double H, double rebate, int N) 8 | { 9 | 10 | double y = 0.0;// 0.03; //div yield 11 | 12 | 13 | VecXX terminalAssetPrices(1.0, 2 * N + 1); 14 | 15 | double Dt = T / N; 16 | double Dx = sig * std::sqrt(2.0 * Dt); 17 | double v = r - y - 0.5 * sig * sig; 18 | 19 | //double u = Dx; 20 | //double d = 1. / u; 21 | 22 | 23 | VecXX::INS pu = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) + (v * Dt) / Dx); 24 | VecXX::INS pd = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) - (v * Dt) / Dx); 25 | VecXX::INS pm = 1. - (Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx); 26 | 27 | 28 | VecXX::INS disc = exp(-r * Dt); 29 | TrinomialSampler sampler; 30 | 31 | auto trinomialRollBack = [=](TrinomialSampler& sampler) 32 | { 33 | 34 | auto X1 = sampler.X_Minus_1.value; 35 | auto X0 = sampler.X_0.value; 36 | auto X_1 = sampler.X_1.value; 37 | 38 | return disc * (X1 * pu + X0 * pm + X_1 * pd); 39 | }; 40 | 41 | 42 | //auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); }; //call 43 | auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); }; //put 44 | 45 | //set up underlying asset prices at maturity 46 | double last = S * exp(-(N + 1) * Dx); 47 | double edx = exp(Dx); 48 | for (auto& el : terminalAssetPrices) 49 | { 50 | last *= edx; 51 | el = last; 52 | } 53 | 54 | 55 | 56 | auto excerciseValue = transform(payOffFunc, terminalAssetPrices); 57 | auto odd_slice = excerciseValue; 58 | 59 | UnitarySampler identity_sampler; //identity just 60 | 61 | auto applyEarlyExcercise = [=](UnitarySampler& sampler, auto excercisePrice) 62 | { 63 | auto optPrice = sampler.X_0.value; //.get<0>(); 64 | return max(optPrice, excercisePrice); 65 | }; 66 | 67 | 68 | auto applyBarrier = 
[=](UnitarySampler& sampler, auto stockPrice) 69 | { 70 | auto optPrice = sampler.X_0.value;//.get<0>(); 71 | return select(stockPrice < H, optPrice, rebate); 72 | }; 73 | 74 | 75 | auto even_slice = odd_slice; 76 | 77 | int j = 2 * N + 1 - 1; 78 | int i = 0; 79 | for (; i < N; i += 2) 80 | { 81 | transform(odd_slice, even_slice, trinomialRollBack, sampler, i, j); 82 | // transform to get early excercise for american bit , iderntity sampler just passes values straight through 83 | // transform(even_slice, excerciseValue, even_slice, applyEarlyExcercise, identity_sampler, i, j); 84 | transform(even_slice, terminalAssetPrices, even_slice, applyBarrier, identity_sampler, i, j); 85 | 86 | transform(even_slice, odd_slice, trinomialRollBack, sampler, i + 1, j - 1); 87 | // transform(odd_slice, excerciseValue, odd_slice, applyEarlyExcercise, identity_sampler, i + 1, j - 1); 88 | transform(odd_slice, terminalAssetPrices, odd_slice, applyBarrier, identity_sampler, i + 1, j - 1); 89 | 90 | j -= 2; 91 | } 92 | 93 | ignore(applyEarlyExcercise); 94 | 95 | return odd_slice[N]; 96 | 97 | } 98 | 99 | -------------------------------------------------------------------------------- /lattice/euroTrinomial.cpp: -------------------------------------------------------------------------------- 1 | #include "../Vectorisation/VecX/dr3.h" 2 | 3 | #include "utils.h" 4 | #include "pricers.h" 5 | 6 | 7 | 8 | double europeanTrinomialPricer(double S, double K, double sig, double r, double T, int N) 9 | { 10 | 11 | double y = 0.0;// 0.03; //div yield 12 | 13 | 14 | VecXX terminalAssetPrices(1.0, 2 * N + 1); 15 | 16 | double Dt = T / N; 17 | double Dx = sig * std::sqrt(3.0 * Dt); 18 | 19 | double v = r - y - 0.5 * sig * sig; 20 | 21 | 22 | 23 | VecXX::INS pu = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) + (v * Dt) / Dx); 24 | VecXX::INS pd = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) - (v * Dt) / Dx); 25 | VecXX::INS pm = 1. 
- (Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx); 26 | 27 | VecXX::INS disc = exp(-r * Dt); 28 | 29 | TrinomialSampler sampler; 30 | 31 | auto trinomialRollBack = [=](TrinomialSampler& sampler) 32 | { 33 | 34 | auto X1 = sampler.X_Minus_1.value; 35 | auto X0 = sampler.X_0.value; 36 | auto X_1 = sampler.X_1.value; 37 | 38 | return disc * (X1 * pu + X0 * pm + X_1 * pd); 39 | }; 40 | 41 | 42 | auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); }; 43 | 44 | //set up underlying asset prices at maturity 45 | double last = S * exp(-(N + 1) * Dx); 46 | double edx = exp(Dx); 47 | for (auto& el : terminalAssetPrices) 48 | { 49 | last *= edx; 50 | el = last; 51 | } 52 | 53 | auto odd_slice = transform(payOffFunc, terminalAssetPrices); 54 | auto even_slice = odd_slice; 55 | 56 | int j = 2 * N + 1 - 1; 57 | int i = 0; 58 | for (; i < N; i += 2) 59 | { 60 | transform(odd_slice, even_slice, trinomialRollBack, sampler, i, j); 61 | transform(even_slice, odd_slice, trinomialRollBack, sampler, i + 1, j - 1); 62 | j -= 2; 63 | } 64 | 65 | return odd_slice[N]; 66 | } 67 | 68 | 69 | 70 | double europeanTrinomialPricer1(double S, double K, double sig, double r, double T, int N) 71 | { 72 | 73 | double y = 0.0;// 0.03; //div yield 74 | 75 | VecXX terminalAssetPrices(1.0, 2 * N + 1); 76 | 77 | double Dt = T / N; 78 | double Dx = sig * std::sqrt(2.0 * Dt); 79 | double v = r - y - 0.5 * sig * sig; 80 | 81 | //double u = Dx; 82 | //double d = 1. / u; 83 | 84 | 85 | VecXX::INS pu = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) + (v * Dt) / Dx); 86 | VecXX::INS pd = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) - (v * Dt) / Dx); 87 | VecXX::INS pm = 1. 
- (Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx); 88 | 89 | 90 | VecXX::INS disc = exp(-r * Dt); 91 | TrinomialSampler sampler; 92 | 93 | auto trinomialRollBack = [=](TrinomialSampler& sampler) 94 | { 95 | const auto& X1 = sampler.X_1.value; 96 | const auto& X0 = sampler.X_0.value; 97 | const auto& X_1 = sampler.X_Minus_1.value; 98 | return disc * (X1 * pu + X0 * pm + X_1 * pd); 99 | }; 100 | 101 | //call 102 | auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); }; 103 | 104 | //put 105 | //auto payOffFunc = [=](auto X) { return select(X < K, K -X , 0.0); }; 106 | 107 | //set up underlying asset prices at maturity 108 | double last = S * exp(-(N + 1) * Dx); 109 | double edx = exp(Dx); 110 | for (auto& el : terminalAssetPrices) 111 | { 112 | last *= edx; 113 | el = last; 114 | } 115 | 116 | auto excerciseValue = transform(payOffFunc, terminalAssetPrices); 117 | auto odd_slice = excerciseValue; 118 | 119 | UnitarySampler identity_sampler; //identity just 120 | 121 | // auto applyEarlyExcercise = [=](UnitarySampler& sampler, auto excercisePrice) 122 | // { 123 | // auto optPrice = sampler.get<0>(); 124 | // return max(optPrice, excercisePrice); 125 | // }; 126 | 127 | 128 | auto even_slice = odd_slice; 129 | 130 | int j = 2 * N + 1 - 1; 131 | int i = 0; 132 | for (; i < N; i += 2) 133 | { 134 | transform(odd_slice, even_slice, trinomialRollBack, sampler, i, j); 135 | // transform to get early excercise for american bit , iderntity sampler just passes values straight through 136 | //transform(even_slice, excerciseValue, even_slice, applyEarlyExcercise, identity_sampler, i, j); 137 | 138 | transform(even_slice, odd_slice, trinomialRollBack, sampler, i + 1, j - 1); 139 | //transform(odd_slice, excerciseValue, odd_slice, applyEarlyExcercise, identity_sampler, i + 1, j - 1); 140 | 141 | j -= 2; 142 | } 143 | 144 | return odd_slice[N]; 145 | } 146 | -------------------------------------------------------------------------------- 
/lattice/euroTrinomialPricerWithInit.cpp: -------------------------------------------------------------------------------- 1 | #include "../Vectorisation/VecX/dr3.h" 2 | 3 | #include "utils.h" 4 | #include "pricers.h" 5 | 6 | 7 | double euroTrinomialPricerWithInit(double S, double K, double sig, double r, double T, int N) 8 | { 9 | 10 | double y = 0.0;// 0.03; //div yield 11 | VecXX terminalAssetPrices(1.0, 2 * N + 1); 12 | 13 | double Dt = T / N; 14 | double Dx = sig * std::sqrt(2.0 * Dt); 15 | double v = r - y - 0.5 * sig * sig; 16 | 17 | VecXX::INS pu = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) + (v * Dt) / Dx); 18 | VecXX::INS pd = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) - (v * Dt) / Dx); 19 | VecXX::INS pm = 1. - (Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx); 20 | 21 | 22 | VecXX::INS disc = exp(-r * Dt); 23 | TrinomialSampler sampler; 24 | 25 | auto trinomialRollBack = [=](TrinomialSampler& sampler) 26 | { 27 | const auto& X1 = sampler.X_1.value; 28 | const auto& X0 = sampler.X_0.value; 29 | const auto& X_1 = sampler.X_Minus_1.value; 30 | return disc * (X1 * pu + X0 * pm + X_1 * pd); 31 | }; 32 | 33 | //call 34 | auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); }; 35 | 36 | //put 37 | //auto payOffFunc = [=](auto X) { return select(X < K, K -X , 0.0); }; 38 | 39 | //set up underlying asset prices at maturity 40 | double last = S * exp(-(N + 1) * Dx); 41 | double edx = exp(Dx); 42 | for (auto& el : terminalAssetPrices) 43 | { 44 | last *= edx; 45 | el = last; 46 | } 47 | 48 | auto excerciseValue = transform(payOffFunc, terminalAssetPrices); 49 | auto odd_slice = excerciseValue; 50 | 51 | 52 | UnitarySampler identity_sampler; //identity just 53 | 54 | auto applyEarlyExcercise = [=](UnitarySampler& sampler, auto excercisePrice) 55 | { 56 | auto optPrice = sampler.X_0.value; //.get<0>(); 57 | return max(optPrice, excercisePrice); 58 | }; 59 | 60 | 61 | auto even_slice = odd_slice; 62 | 63 | 64 | /// blacks 
initialisation 65 | VecXX::INS invK = 1.0 / K; 66 | VecXX::INS discountedRate = exp(-r * Dt); 67 | 68 | VecXX::INS rootT = sqrt(Dt); 69 | VecXX::INS sigmaRootT = rootT * sig; 70 | VecXX::INS invSigmaRootT = 1.0 / sigmaRootT; 71 | VecXX::INS halfSigmaSqrd_t = (0.5 * sig * sig + r) * Dt; 72 | 73 | VecXX::INS Strike = K; 74 | 75 | auto blackScholeInit = [&](VecXX::INS S) 76 | { 77 | VecXX::INS S_invK = S * invK; 78 | VecXX::INS log_sK = log(S_invK); 79 | 80 | VecXX::INS d1 = invSigmaRootT * (log_sK + halfSigmaSqrd_t); 81 | VecXX::INS d2 = d1 - sigmaRootT; 82 | VecXX::INS normD1 = cdfnorm(d1); 83 | VecXX::INS normD2 = cdfnorm(d2); 84 | VecXX::INS C = S * normD1 - Strike * discountedRate * normD2; 85 | return C; 86 | 87 | }; 88 | 89 | int j = 2 * N + 1 - 1; 90 | int i = 0; 91 | 92 | //use BS transform and normal for first pair of slices 93 | even_slice = transform(blackScholeInit, terminalAssetPrices); 94 | 95 | //even_slice = transform(payOffFunc, terminalAssetPrices); 96 | 97 | //std::vector dbg = even_slice; 98 | 99 | transform(even_slice, odd_slice, trinomialRollBack, sampler, i + 1, j - 1); 100 | 101 | i += 2; 102 | j -= 2; 103 | 104 | for (; i < N; i += 2) 105 | { 106 | transform(odd_slice, even_slice, trinomialRollBack, sampler, i, j); 107 | // transform to get early excercise for american bit , identity sampler just passes values straight through 108 | //transform(even_slice, excerciseValue, even_slice, applyEarlyExcercise, identity_sampler, i, j); 109 | 110 | transform(even_slice, odd_slice, trinomialRollBack, sampler, i + 1, j - 1); 111 | //transform(odd_slice, excerciseValue, odd_slice, applyEarlyExcercise, identity_sampler, i + 1, j - 1); 112 | 113 | j -= 2; 114 | } 115 | 116 | ignore(applyEarlyExcercise); 117 | 118 | return odd_slice[N]; 119 | 120 | } 121 | 122 | -------------------------------------------------------------------------------- /lattice/europeanBinomialPricer.cpp: 
-------------------------------------------------------------------------------- 1 | #include "../Vectorisation/VecX/dr3.h" 2 | #include "utils.h" 3 | #include "pricers.h" 4 | 5 | 6 | double europeanBinomialPricer(double S, double K, double sig, double r, double T, int N) 7 | { 8 | 9 | VecXX terminalAssetPrices(1.0, N + 1); 10 | 11 | double Dt = T / N; 12 | double u = std::exp(sig * std::sqrt(Dt)); 13 | double d = 1. / u; 14 | 15 | 16 | VecXX::INS pu = (exp(r * Dt) - d) / (u - d); 17 | VecXX::INS oneMinusP = (1.0 - pu); 18 | VecXX::INS disc = exp(-r * Dt); 19 | 20 | BinomialSampler sampler; 21 | 22 | auto binomialRollBack = [=](BinomialSampler& sampler) 23 | { 24 | const auto& X1 = sampler.X_1.value; 25 | const auto& X0 = sampler.X_0.value; 26 | return disc * (X1 * pu + X0 * oneMinusP); 27 | }; 28 | 29 | 30 | auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); }; 31 | // auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); }; 32 | 33 | //set up underlying asset prices at maturity 34 | double last = S * std::pow(d, N + 2); 35 | for (auto& el : terminalAssetPrices) 36 | { 37 | last *= (u * u); 38 | el = last; 39 | } 40 | 41 | 42 | auto odd_slice = transform(payOffFunc, terminalAssetPrices); 43 | auto even_slice = odd_slice; 44 | 45 | int j = N + 1; 46 | for (int i = 0; i < N / 2; ++i) 47 | { 48 | transform(odd_slice, even_slice, binomialRollBack, sampler, 0, j); 49 | transform(even_slice, odd_slice, binomialRollBack, sampler, 0, j - 1); 50 | j -= 2; 51 | 52 | } 53 | return odd_slice[0];; 54 | } 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /lattice/lattice.vcxproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | -------------------------------------------------------------------------------- /lattice/lattice_tools.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | //lattice 
tools 4 | 5 | void doScan(); 6 | 7 | void testSampler(); 8 | 9 | void doStridedSpan(); 10 | 11 | void doTransformWithASpan(); 12 | 13 | void doZipping(); 14 | 15 | void doMatrix(); 16 | -------------------------------------------------------------------------------- /lattice/pricers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | double europeanBinomialPricer(double S, double K, double sig, double r, double T, int N); 4 | 5 | double europeanTrinomialPricer(double S, double K, double sig, double r, double T, int N); 6 | 7 | double europeanTrinomialPricer1(double S, double K, double sig, double r, double T, int N); 8 | 9 | double americanTrinomialPricer(double S, double K, double sig, double r, double T, int N); 10 | 11 | double americanFiniteDiffPricer(double S, double K, double sig, double r, double T, int N); 12 | 13 | double americanImplicitFiniteDiffPricer(double S, double K, double sig, double r, double T, int N); 14 | 15 | double americanImplicitFiniteDiffPricerFast(double S, double K, double sig, double r, double T, int N); 16 | 17 | double euroTrinomialPricerWithInit(double S, double K, double sig, double r, double T, int N); 18 | 19 | double americanCrankNicholsonPricer(double S, double K, double sig, double r, double T, int N); 20 | 21 | double americanTrinomialPricerUpAndOut(double S, double K, double sig, double r, double T, double H, double rebate, int N); -------------------------------------------------------------------------------- /lattice/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | #include "../Vectorisation/VecX/error_utils.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | //using namespace DRC::VecLDb; 18 | //using namespace DRC::VecDb; 19 | //using namespace DRC::VecD2D; //sse2 double 20 | //using namespace 
DRC::VecD4D; //avx2 double 21 | //using namespace DRC::VecF8F; // avx2 float 22 | using namespace DRC::VecD8D; //avx512 double 23 | //using namespace DRC::VecF16F; //avx512 float 24 | 25 | using FLOAT = typename InstructionTraits::FloatType; 26 | 27 | 28 | constexpr double billion = 1000000000.0; 29 | 30 | 31 | template 32 | bool vectorsEqual(const std::vector& C1, const std::vector& C2, const std::vector& C3) 33 | { 34 | bool testOK = true; 35 | const double ERR = 1e-13; //for examples 36 | if (C1.size() != C2.size()) 37 | { 38 | return false; 39 | } 40 | 41 | if (C3.size() != C2.size()) 42 | { 43 | return false; 44 | } 45 | 46 | for (size_t i = 0; i < C3.size(); i++) 47 | { 48 | auto err1 = fabs((C1[i] - C2[i]) / (C2[i] + C1[i])); 49 | auto err2 = fabs((C1[i] - C3[i]) / (C1[i] + C3[i])); 50 | 51 | if ((err1 > ERR) || (err2 > ERR)) 52 | { 53 | testOK = false; 54 | std::cout << "\n err diff@ " << i << " err1 =" << err1 << ", err2 = " << err2 << "\n"; 55 | break; 56 | } 57 | } 58 | 59 | return testOK; 60 | 61 | } 62 | 63 | 64 | 65 | template 66 | bool vectorsEqualD(const std::vector& C1, const std::vector& C2, const std::vector& C3, const std::vector& input, T ERR = 1e-13) 67 | { 68 | 69 | 70 | ERR = getErr(C1); 71 | 72 | bool testOK = true; 73 | 74 | if (C1.size() != C2.size()) 75 | { 76 | std::cout << "wrong size C1,C2" << C1.size() << ", " << C2.size() << std::endl; 77 | return false; 78 | } 79 | 80 | if (C3.size() != C2.size()) 81 | { 82 | std::cout << "wrong size C2,C3" << C2.size() << ", " << C3.size() << std::endl; 83 | return false; 84 | } 85 | 86 | for (size_t i = 0; i < C3.size(); i++) 87 | { 88 | auto err1 = fabs((C1[i] - C2[i]) / (fabs(C2[i]) + fabs(C1[i]))); 89 | auto err2 = fabs((C1[i] - C3[i]) / (fabs(C1[i]) + fabs(C3[i]))); 90 | 91 | if ((err1 > ERR) || (err2 > ERR)) 92 | { 93 | testOK = false; 94 | std::cout << "\n err diff@ " << i << " err1 =" << err1 << ", err2 = " << err2 << "\n"; 95 | std::cout << "\n val @ " << i << " C1[i] =" << C1[i] << ", 
C2[i] = " << C2[i] << ", C3[i] = " << C3[i] << "input val=" << input[i] << "\n"; 96 | std::cout << std::endl; 97 | break; 98 | } 99 | } 100 | 101 | return testOK; 102 | 103 | } 104 | 105 | 106 | 107 | 108 | 109 | 110 | static auto numOps = [](int TEST_LOOP_SZ, int SZ) { return static_cast(double(TEST_LOOP_SZ) * double(SZ)); }; 111 | 112 | 113 | 114 | using Calc_Values = std::map; 115 | using Calc_Values_V = std::map >; 116 | using Mapped_Performance_Results = std::map >; // array size v vector 117 | using Mapped_Stats = std::map >; // size -.pair ( throughput , std dev of through put) 118 | 119 | struct RunResults 120 | { 121 | Mapped_Performance_Results m_raw_results; 122 | Calc_Values m_calc_results; 123 | }; 124 | 125 | struct RunResultsVec 126 | { 127 | Mapped_Performance_Results m_raw_results; 128 | Calc_Values_V m_calc_results; 129 | }; 130 | 131 | class TimerGuard 132 | { 133 | double& m_runTime; 134 | std::chrono::steady_clock::time_point m_startTme; 135 | 136 | public: 137 | TimerGuard(double& runTime) : m_runTime(runTime), m_startTme(std::chrono::steady_clock::now()) { runTime = 0.; } 138 | 139 | ~TimerGuard() 140 | { 141 | auto endTime = std::chrono::steady_clock::now(); 142 | auto runtime = endTime - m_startTme; 143 | m_runTime = runtime.count() / billion; 144 | } 145 | }; 146 | 147 | 148 | static auto runFunctionOverDifferentSize = [](int testRepeats, int vec_start_size, int vec_stepSZ, int vec_maxSZ, const auto& func, long testLoopSZ) 149 | { 150 | 151 | RunResults results; 152 | 153 | for (int j = 0; j < testRepeats; ++j) 154 | { 155 | int VEC_SZ = vec_start_size; 156 | for (; VEC_SZ < vec_maxSZ; VEC_SZ += vec_stepSZ) 157 | { 158 | auto res = func(VEC_SZ, testLoopSZ); 159 | auto calculation_rate = res.second; 160 | auto calc_value = res.first; 161 | results.m_raw_results[VEC_SZ].push_back(calculation_rate); 162 | if (j == 0) 163 | { 164 | results.m_calc_results[VEC_SZ] = static_cast(calc_value); 165 | } 166 | } 167 | } 168 | return results; 169 | }; 
170 | 171 | 172 | static auto runFunctionOverDifferentSizeVec = [](int testRepeats, int vec_start_size, int vec_stepSZ, int vec_maxSZ, const auto& func, long testLoopSZ) 173 | { 174 | 175 | RunResultsVec results; 176 | 177 | for (int j = 0; j < testRepeats; ++j) 178 | { 179 | int VEC_SZ = vec_start_size; 180 | for (; VEC_SZ < vec_maxSZ; VEC_SZ += vec_stepSZ) 181 | { 182 | auto res = func(VEC_SZ, testLoopSZ); 183 | auto calculation_rate = res.second; 184 | auto calc_value = res.first; 185 | results.m_raw_results[VEC_SZ].push_back(calculation_rate); 186 | 187 | if (j == 0) 188 | { 189 | std::vector tmp = res.first; 190 | results.m_calc_results[VEC_SZ] = tmp; 191 | } 192 | 193 | } 194 | } 195 | return results; 196 | }; 197 | 198 | 199 | 200 | static auto performanceStats = [](const Mapped_Performance_Results& raw_results) 201 | { 202 | 203 | Mapped_Stats stats; 204 | 205 | for (const auto& item : raw_results) 206 | { 207 | double sum = 0; 208 | double sum_sqrd = 0; 209 | double N = 0.0; 210 | for (const auto run_rate : item.second) 211 | { 212 | sum += run_rate; 213 | sum_sqrd += (run_rate * run_rate); 214 | N++; 215 | } 216 | 217 | double avg = sum / N; 218 | double varSqrd = sum_sqrd + (avg * avg * N) - (2.0 * avg * sum); 219 | double var = std::sqrt(varSqrd / (N - 1.)); 220 | 221 | stats[item.first] = { avg ,var }; 222 | 223 | } 224 | return stats; 225 | }; 226 | 227 | 228 | -------------------------------------------------------------------------------- /scratch/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(ScratchExample scratch.cpp) 2 | 3 | target_link_libraries(ScratchExample PUBLIC Vectorisation) 4 | 5 | target_include_directories(ScratchExample PUBLIC 6 | "${PROJECT_BINARY_DIR}" 7 | ) -------------------------------------------------------------------------------- /scratch/scratch.cpp: -------------------------------------------------------------------------------- 1 | // scratch.cpp : This 
file contains the 'main' function. Program execution begins and ends there. 2 | // 3 | 4 | #include "../Vectorisation/VecX/dr3.h" 5 | 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | 16 | //using namespace DRC::VecD2D; 17 | //using namespace DRC::VecF4F; 18 | //using namespace DRC::VecD4D; 19 | using namespace DRC::VecD8D; 20 | //using namespace DRC::VecF16F; 21 | //using namespace DRC::VecF8F; 22 | 23 | 24 | 25 | void doAddWithoutCancellation() 26 | { 27 | 28 | using BINNED_ACCUMULATOR = BinsT; 29 | using FLOAT = InstructionTraits::FloatType; 30 | FLOAT oneThird = static_cast(1.0 / 3.0); 31 | 32 | std::cout << "create empty bin, value 0.0 \n"; 33 | BINNED_ACCUMULATOR bin; 34 | std::cout << "add one third to it , value ="; 35 | 36 | bin += oneThird; 37 | std::cout << std::setprecision(8) << bin.hsum() << "\n"; 38 | 39 | std::cout << "add one hundred thousand to it , value ="; 40 | bin += 100000.0f; 41 | 42 | auto t = bin.hsum(); 43 | 44 | std::cout << std::setprecision(8)<< t << std::endl; 45 | 46 | std::cout << "dd minus one hundred thousand to it , value ="; 47 | bin += -100000.0f; 48 | t=bin.hsum(); 49 | 50 | std::cout << std::setprecision(8) << t << std::endl; 51 | 52 | std::cout << "no cancellation! \n \n \n"; 53 | 54 | 55 | } 56 | 57 | /* 58 | Example summation using std reduction, and for loop 59 | then pairwise_reduce and reduce with Kahan summation 60 | and finally using binned summation. 61 | 62 | Generally for large sums, we get the same rounding error for the for loop and 63 | std accumulation/ reduce 64 | 65 | However, with pairwise reduce and kahan accumulation, we tend to last digit level precision. 66 | 67 | Unfortunately, when we add pairs of large +ve and -ve numbers which cancel each other 68 | they destroy accuracy of intermediate sums. In the code we turn this on by 69 | setting BIG_CANCELLATION = true. 
70 | 71 | The cancellation flag does not change the theoretical value of the sum, 72 | the sums should return the same result as when CANCELLATION =false. 73 | However, we find that only the binned arithmetic scheme can achieve this 74 | sort of stability with this example. 75 | 76 | The input data set is randomly permuted and the sum re calculated. The ideal result is that we get the same 77 | answer irrespective of the ordering. The actual results differ to varying degrees. 78 | 79 | Both pairwise and kahan summation can significantly reduce rounding errors, however 80 | binned summation tends to work much better if we have significant cancellation. 81 | For loops and std::accumulate /reduce are generally less accurate. 82 | 83 | */ 84 | 85 | void setCancelInput(float& flt) 86 | { 87 | flt = 100000.f; 88 | } 89 | 90 | void setCancelInput(double& dbl) 91 | { 92 | dbl = 1000000000000.0; 93 | } 94 | 95 | 96 | int main() 97 | { 98 | //simple binned sum example 99 | doAddWithoutCancellation(); 100 | 101 | long SZ = 10000 * 1024 ; // size of data set to be summed 102 | using FLOAT = InstructionTraits::FloatType; 103 | FLOAT initVal = static_cast(1.0 / 3.0); 104 | 105 | VecXX data(initVal, SZ); 106 | double scale = 1.0;// us power of 2 eg 1.0 / 1024.0 * 1.0 / 1024.0 * 1.0 / 1024.0; 107 | data *= scale; 108 | 109 | bool USE_BIG_CANCELLATION = false; 110 | 111 | for( int C = 0; C < 2;C++) // iterate using cancellation data set 112 | { 113 | if (C >0) { USE_BIG_CANCELLATION = true;} 114 | 115 | int i = 0; 116 | 117 | 118 | auto mixed = data; 119 | long count = 0; 120 | 121 | //make data members slightly different and add 122 | //cancelling values if required 123 | for (auto &x: mixed) { 124 | count++; 125 | x += count * 0.0001f; 126 | FLOAT a; 127 | setCancelInput(a); 128 | FLOAT b = -a; 129 | 130 | if (!USE_BIG_CANCELLATION) { 131 | a = 0.0; 132 | b = 0.0; 133 | } 134 | 135 | if ((count > 17) && (count % 17 == 0)) //every 17'th element set up cancellation 136 | { 137 | 
auto c = mixed[count] + mixed[count - 1] + mixed[count - 2]; 138 | mixed[count] = c; 139 | mixed[count - 1] = b; 140 | mixed[count - 2] = a; 141 | } 142 | ignore(x); 143 | } 144 | 145 | 146 | double multiplr = 1.0;// pow(1024., 4); 147 | mixed *= multiplr; 148 | 149 | //std::vector scaledVec = mixed; //for debug observation 150 | // run ten permutations of data set and do summation 151 | for (int kkk = 0; kkk < 10; kkk++) 152 | { 153 | std::random_device rd; 154 | std::mt19937 g(rd()); 155 | 156 | std::shuffle(mixed.begin(), mixed.end(), g); 157 | // std::vector obs= mixed; 158 | 159 | auto std_acc = std::accumulate(mixed.begin(), mixed.end(), static_cast(0.0)); 160 | auto std_reduce = std::reduce(mixed.begin(), mixed.end(), static_cast(0.0)); 161 | 162 | auto sumIt = [](auto x, auto y) { return x + y; }; 163 | auto sumPairwiseDr3 = ApplyAccumulate2UR_X_pairwise(mixed, sumIt); 164 | 165 | auto DRCubedAccum = ApplyAccumulate2UR_X(mixed, sumIt); 166 | 167 | FLOAT trad_for_loop = 0.0f; 168 | for (auto x: mixed) { 169 | trad_for_loop += x; 170 | } 171 | 172 | //correcting summation lambda 173 | auto NULL_Vec = VecXX::INS(0.0); 174 | auto KhanAddV = [c = NULL_Vec](auto sum, auto rhs) mutable { 175 | auto y = rhs - c; 176 | auto t = sum + y; 177 | c = (t - sum); 178 | c = c - y; 179 | sum = t; 180 | return t; 181 | }; 182 | 183 | 184 | auto sumKahan = reduce(mixed, KhanAddV); 185 | auto KahanAddD = [cc = static_cast(0.0)](auto sum, auto rhs) mutable { 186 | auto y = rhs - cc; 187 | auto t = sum + y; 188 | cc = (t - sum); 189 | cc = cc - y; 190 | sum = t; 191 | return t; 192 | }; 193 | 194 | 195 | double std_accumulate_Kahan = std::accumulate(mixed.begin(), mixed.end(), static_cast(0.0), KahanAddD); 196 | double std_reduce_Kahan = std::reduce(mixed.begin(), mixed.end(), static_cast(0.0), KahanAddD); 197 | 198 | NULL_Vec = VecXX::INS(0.0); 199 | auto sumPairwiseWithKahan = ApplyAccumulate2UR_X_pairwise(mixed, KhanAddV); 200 | 201 | // reduce with binned accumulator 202 | 
auto scale = 1.0;// pow(1024.0, 2); 203 | using BINNED_ACCUMULATOR = BinsT; 204 | BINNED_ACCUMULATOR Bin(0.0, scale);// 205 | 206 | 207 | 208 | auto binned_Sum = reduceWithAccumulator(Bin, mixed, BinnedAdd); 209 | 210 | 211 | ///binned_Sum = mult; 212 | 213 | std::cout << "\nUsing Significant Cancellation Data = " << std::boolalpha << USE_BIG_CANCELLATION << " \n"; 214 | std::cout << "shuffled version " << ++i << "\n" << std::setprecision(16) 215 | << trad_for_loop << "\t for loop sum \n" 216 | << std_acc << "\t std::accumulate sum \n" 217 | << std_reduce << "\t std::reduce \n" 218 | << DRCubedAccum << "\t accumulate DR3 \n" 219 | << std_accumulate_Kahan << "\t std_accumulate_Kahan \n" 220 | << std_reduce_Kahan << "\t std_reduce_Kahan \n" 221 | << sumPairwiseDr3 << "\t sum pairwise \n" 222 | << sumKahan << "\t sum Kahan acc \n" 223 | << sumPairwiseWithKahan << " \t pairwise_sum using Kahan acc \n" 224 | << binned_Sum << "\t binned sum acc \n \n \n \n"; 225 | 226 | 227 | } 228 | } 229 | } 230 | 231 | -------------------------------------------------------------------------------- /scratch/scratch.vcxproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | --------------------------------------------------------------------------------