├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── CMakeLists.txt
├── CMakeSettings.json
├── GettingStarted
    ├── CMakeLists.txt
    ├── GettingStarted.cpp
    ├── GettingStarted.vcxproj
    ├── GettingStarted.vcxproj.filters
    └── GettingStarted.vcxproj.user
├── LICENSE
├── LeaveOneOutRegression
    ├── LeaveOneOutRegression.cpp
    ├── LeaveOneOutRegression.vcxproj
    └── LeaveOneOutRegression.vcxproj.user
├── README.md
├── VariadicReduction
    ├── CMakeLists.txt
    ├── VariadicReducrion.vcxproj
    ├── VariadicReducrion.vcxproj.filters
    ├── VariadicReduction.cpp
    ├── VariadicReduction.sln
    └── VariadicReduction.vcxproj
├── VectorTest
    ├── DR3_tests.cpp
    ├── TestAccumulator.cpp
    ├── TestAllocator.cpp
    ├── TestCurve.cpp
    ├── TestFilterSelect.cpp
    ├── TestFilterTransform.cpp
    ├── TestScan.cpp
    ├── TestSpan.cpp
    ├── TestViews.cpp
    ├── Test_binary_unitary_operations.cpp
    ├── Unroll_operators.cpp
    ├── VTune Profiler Results
    │   └── VectorTest
    │   │   └── VectorTest.vtuneproj
    ├── VectorTest.log
    ├── VectorTest.vcxproj
    ├── VectorTest.vcxproj.user
    ├── dr3TestUtil.h
    ├── packages.config
    ├── pch.cpp
    ├── pch.h
    ├── test.cpp
    ├── testNamespace.cpp
    ├── testNamespace.h
    └── test_precise_accumulation.cpp
├── Vectorisation.sln
├── Vectorisation
    ├── CMakeLists.txt
    ├── Output-Build.txt
    ├── TextFile1.txt
    ├── VCL
    │   ├── LICENSE
    │   ├── README.md
    │   ├── dispatch_example1.cpp
    │   ├── dispatch_example2.cpp
    │   ├── instrset.h
    │   ├── instrset_detect.cpp
    │   ├── vector_convert.h
    │   ├── vectorclass.h
    │   ├── vectorf128.h
    │   ├── vectorf256.h
    │   ├── vectorf256e.h
    │   ├── vectorf512.h
    │   ├── vectorf512e.h
    │   ├── vectori128.h
    │   ├── vectori256.h
    │   ├── vectori256e.h
    │   ├── vectori512.h
    │   ├── vectori512e.h
    │   ├── vectori512s.h
    │   ├── vectori512se.h
    │   ├── vectormath_common.h
    │   ├── vectormath_exp.h
    │   ├── vectormath_hyp.h
    │   ├── vectormath_lib.h
    │   └── vectormath_trig.h
    ├── VecX
    │   ├── accumulate_transform.h
    │   ├── alloc_policy.cpp
    │   ├── alloc_policy.h
    │   ├── alloc_policy_imp.h
    │   ├── apply_operation.h
    │   ├── binary_unitary_operations.h
    │   ├── binned_accumulator.h
    │   ├── boolean_operations.h
    │   ├── conditional_select_eval.h
    │   ├── dr3.h
    │   ├── error_utils.h
    │   ├── filter_pipe_and_join.h
    │   ├── filter_select.h
    │   ├── instruction_traits.h
    │   ├── math_ops.h
    │   ├── operations.h
    │   ├── sampler.h
    │   ├── scan.h
    │   ├── span.h
    │   ├── target_name_space.h
    │   ├── transform.h
    │   ├── unroll_operators.h
    │   ├── vcl_latest.h
    │   ├── vec.cpp
    │   ├── vec.h
    │   ├── vec_bool.h
    │   ├── vec_bool_d.h
    │   ├── vec_d.h
    │   ├── vec_double.h
    │   ├── vec_view.h
    │   └── zip_utils.h
    ├── Vectorisation.cpp
    ├── Vectorisation.log
    ├── Vectorisation.sln
    ├── Vectorisation.vcxproj
    ├── Vectorisation.vcxproj.filters
    ├── Vectorisation.vcxproj.user
    ├── intel_Libs
    │   ├── libirc.lib
    │   ├── svml_disp.lib
    │   ├── svml_dispmd.lib
    │   ├── svml_dispmt.lib
    │   └── svmlpatch.lib
    ├── intrinsic_utils.h
    ├── packages.config
    └── pch.h
├── accumulateExample
    ├── CMakeLists.txt
    ├── accumulateExample.log
    ├── accumulateExample.vcxproj
    ├── accumulateExample.vcxproj.filters
    ├── accumulateExample.vcxproj.user
    ├── accumulate_example.cpp
    ├── gnormcpp.cpp
    └── norm.h
├── cumNormalExample
    ├── CMakeLists.txt
    ├── cumNormal.h
    ├── cumNormalExample.cpp
    ├── cumNormalExample.vcxproj
    ├── cumNormalExample.vcxproj.filters
    └── cumNormalExample.vcxproj.user
├── dancingAVX512
    ├── AVX512Dance.cpp
    ├── AVX512Dance.h
    ├── CMakeLists.txt
    ├── dancingAVX512.cpp
    ├── dancingAVX512.vcxproj
    └── dancingAVX512.vcxproj.user
├── docs
    ├── BlackScholesVecXX.mp4
    ├── Build.md
    └── cppCon2022.pdf
├── inverseCumNormalExample
    ├── CMakeLists.txt
    ├── cdfNormalInverse.cpp
    ├── cdfNormalInverse.h
    ├── inverseCumNormalExample.cpp
    ├── inverseCumNormalExample.log
    ├── inverseCumNormalExample.vcxproj
    ├── inverseCumNormalExample.vcxproj.filters
    └── inverseCumNormalExample.vcxproj.user
├── lattice
    ├── CMakeLists.txt
    ├── americanCrankNicholsonPricer.cpp
    ├── americanFinitDiffPricer.cpp
    ├── americanImplicitFiniteDiff.cpp
    ├── americanTrinomialPricer.cpp
    ├── americanTrinomialPricerUpAndOut.cpp
    ├── euroTrinomial.cpp
    ├── euroTrinomialPricerWithInit.cpp
    ├── europeanBinomialPricer.cpp
    ├── lattice.cpp
    ├── lattice.vcxproj
    ├── lattice.vcxproj.user
    ├── lattice_tools.cpp
    ├── lattice_tools.h
    ├── pricers.h
    └── utils.h
└── scratch
    ├── CMakeLists.txt
    ├── scratch.cpp
    ├── scratch.vcxproj
    └── scratch.vcxproj.user


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: Build and Unittest
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "main" ]
 6 |   pull_request:
 7 |     branches: [ "main" ]
 8 | 
 9 | env:
10 |   # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
11 |   BUILD_TYPE: Release
12 | 
13 | jobs:
14 |   example_matrix:
15 |     strategy:
16 |       matrix:
17 |         os: [windows-latest, ubuntu-latest]
18 |     runs-on: ${{ matrix.os }}
19 |     steps:
20 |     - uses: actions/checkout@v3
21 |     - uses:  symbitic/install-cmake@master
22 | 
23 |     - name: CMake Configure and Build on Ubuntu
24 |       if: matrix.os == 'ubuntu-latest'  # spelled exactly like the matrix.os entry
25 |       run: |
26 |         cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
27 |         cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
28 |   
29 |     - name: CMake  Configure and Build on Windows
30 |       if: matrix.os == 'windows-latest'
31 |       run: |
32 |         call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x86_amd64
33 |         cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
34 |         cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
35 |       shell: cmd
36 |     
37 |     #- name: Test
38 |     #  working-directory: ${{github.workspace}}/build
39 |     #  # Execute tests defined by the CMake configuration.  
40 |     #  # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
41 |     #  run: ctest -C ${{env.BUILD_TYPE}}
42 |      
43 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | ﻿################################################################################
 2 | # This .gitignore file was automatically created by Microsoft(R) Visual Studio.
 3 | ################################################################################
 4 | 
 5 | /build
 6 | /build_vcc
 7 | 
 8 | /packages
 9 | /.vs/DRCubed/v16/TestStore/0
10 | /accumulateExample/clang-cl
11 | /accumulateExample/ICC2022
12 | /accumulateExample/Release
13 | /GettingStarted/clang-cl
14 | /GettingStarted/ICC2022
15 | /GettingStarted/Release
16 | /GettingStarted/x64/Release
17 | /.vs/Vectorisation/v16/ipch/AutoPCH/4e9dfb20cefae0b2
18 | /accumulateExample/x64/Release
19 | /inverseCumNormalExample/clang-cl
20 | /inverseCumNormalExample/ICC2022
21 | /inverseCumNormalExample/Release
22 | /inverseCumNormalExample/x64/Release
23 | /Vectorisation/.vs/Vectorisation/v16
24 | /Vectorisation/clang-cl
25 | /Vectorisation/Debug
26 | /Vectorisation/ICC2022
27 | /Vectorisation/packages/Microsoft.googletest.v140.windesktop.msvcstl.static.rt-dyn.1.8.0
28 | /Vectorisation/packages/Microsoft.googletest.v140.windesktop.msvcstl.static.rt-dyn.1.8.1.4/build/native
29 | /Vectorisation/Release
30 | /Vectorisation/x64/Release
31 | /VectorTest/clang-cl
32 | /VectorTest/ICC2022
33 | /VectorTest/Release
34 | /VectorTest/x64/Release
35 | /x64/Release
36 | /Vectorisation/packages/Microsoft.googletest.v140.windesktop.msvcstl.static.rt-dyn.1.8.1.4
37 | /.vs/Vectorisation/v16
38 | /.vs
39 | /accumulateExample/x64
40 | /GettingStarted/x64
41 | /inverseCumNormalExample/x64
42 | /Vectorisation/x64
43 | /VectorTest/x64
44 | /x64
45 | /accumulateExample/My Advisor Results - accumulateExample
46 | /accumulateExample/VTune Profiler Results/accumulateExample
47 | /cumNormalExample/x64
48 | /inverseCumNormalExample/My Advisor Results - inverseCumNormalExample
49 | /inverseCumNormalExample/VTune Profiler Results/inverseCumNormalExample
50 | /GettingStarted/My Advisor Results - GettingStarted
51 | /accumulateExample/My Inspector Results - accumulateExample
52 | /GettingStarted/My Inspector Results - GettingStarted
53 | /VectorTest/My Advisor Results - VectorTest
54 | /VectorTest/My Inspector Results - VectorTest
55 | /out/build/x64-Debug
56 | /lattice/x64
57 | /dancingAVX512/x64
58 | /Vectorisation/cmake-build-debug
59 | /lattice/cmake-build-debug
60 | /curveExample/Intel® VTune™ Profiler Results/curveExample
61 | /curveExample/My Advisor Results - curveExample
62 | /scratch/x64
63 | /cmake-build-debug
64 | /.idea
65 | /curveExample/x64
66 | /scratch/r000mi2
67 | /scratch/r001mi2
68 | /scratch/r002mi3
69 | /scratch/r003mi2
70 | /scratch/scratch.inspxeproj
71 | /debug.log
72 | /cmake-build-release/.cmake/api/v1/reply
73 | /cmake-build-release/.cmake/api/v1/query
74 | /cmake-build-release/accumulateExample/CMakeFiles/accumulateExample.dir/accumulate_example.cpp.obj
75 | /cmake-build-release
76 | /VariadicReducrion/e000
77 | /VariadicReducrion/Intel® VTune™ Profiler Results/VariadicReducrion
78 | /VariadicReducrion/x64
79 | /DawnCache
80 | /config
81 | /GPUCache
82 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.17)
 2 | 
 3 | project(DR3)
 4 | # Targets defined below default to C++17; REQUIRED makes CMake fail rather than fall back to an older standard.
 5 | set(CMAKE_CXX_STANDARD 17)
 6 | set(CMAKE_CXX_STANDARD_REQUIRED True)
 7 | 
 8 | # Sub-projects, one directory each, added in the order listed.
 9 | add_subdirectory(Vectorisation)
10 | add_subdirectory(accumulateExample)
11 | add_subdirectory(cumNormalExample)
12 | add_subdirectory(inverseCumNormalExample)
13 | add_subdirectory(dancingAVX512)
14 | add_subdirectory(lattice)
15 | add_subdirectory(GettingStarted)
16 | add_subdirectory(scratch)
17 | add_subdirectory(VariadicReduction)
18 | 
19 | 


--------------------------------------------------------------------------------
/CMakeSettings.json:
--------------------------------------------------------------------------------
 1 | ﻿{
 2 |   "configurations": [
 3 |     {
 4 |       "name": "x64-Debug",
 5 |       "generator": "Ninja",
 6 |       "configurationType": "Debug",
 7 |       "inheritEnvironments": [ "msvc_x64_x64" ],
 8 |       "buildRoot": "${projectDir}\\out\\build\\${name}",
 9 |       "installRoot": "${projectDir}\\out\\install\\${name}",
10 |       "cmakeCommandArgs": "",
11 |       "buildCommandArgs": "",
12 |       "ctestCommandArgs": ""
13 |     },
14 |     {
15 |       "name": "Linux-GCC-Release",
16 |       "generator": "Ninja",
17 |       "configurationType": "RelWithDebInfo",
18 |       "cmakeExecutable": "cmake",
19 |       "remoteCopySourcesExclusionList": [ ".vs", ".git", "out" ],
20 |       "cmakeCommandArgs": "",
21 |       "buildCommandArgs": "",
22 |       "ctestCommandArgs": "",
23 |       "inheritEnvironments": [ "linux_x64" ],
24 |       "remoteMachineName": "${defaultRemoteMachineName}",
25 |       "remoteCMakeListsRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/src",
26 |       "remoteBuildRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/out/build/${name}",
27 |       "remoteInstallRoot": "$HOME/.vs/${projectDirName}/${workspaceHash}/out/install/${name}",
28 |       "remoteCopySources": true,
29 |       "rsyncCommandArgs": "-t --delete --delete-excluded",
30 |       "remoteCopyBuildOutput": false,
31 |       "remoteCopySourcesMethod": "rsync"
32 |     }
33 |   ]
34 | }


--------------------------------------------------------------------------------
/GettingStarted/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(GettingStarted GettingStarted.cpp)
2 | 
3 | target_link_libraries(GettingStarted PUBLIC Vectorisation)
4 | 
5 | target_include_directories(GettingStarted PUBLIC
6 |                            "${PROJECT_BINARY_DIR}"
7 |                            )


--------------------------------------------------------------------------------
/GettingStarted/GettingStarted.vcxproj.filters:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup>
 4 |     <Filter Include="Source Files">
 5 |       <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
 6 |       <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
 7 |     </Filter>
 8 |     <Filter Include="Header Files">
 9 |       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
10 |       <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
11 |     </Filter>
12 |     <Filter Include="Resource Files">
13 |       <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
14 |       <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
15 |     </Filter>
16 |   </ItemGroup>
17 |   <ItemGroup>
18 |     <ClCompile Include="GettingStarted.cpp">
19 |       <Filter>Source Files</Filter>
20 |     </ClCompile>
21 |   </ItemGroup>
22 |   <ItemGroup>
23 |     <Library Include="..\Vectorisation\intel_Libs\libirc.lib" />
24 |     <Library Include="..\Vectorisation\intel_Libs\svml_disp.lib" />
25 |     <Library Include="..\Vectorisation\intel_Libs\svmlpatch.lib" />
26 |   </ItemGroup>
27 | </Project>


--------------------------------------------------------------------------------
/GettingStarted/GettingStarted.vcxproj.user:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup />
4 | </Project>


--------------------------------------------------------------------------------
/LeaveOneOutRegression/LeaveOneOutRegression.vcxproj.user:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup />
4 | </Project>


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DR3 
 2 | 
 3 | To get full use of the repo, you need a modern processor which has AVX512 or AVX2 instructions. 
 4 | If your processor only has AVX2, you need to change the target instruction set in the projects to AVX2 and avoid generating
 5 | AVX512 code, because your machine won't run it.
 6 |  
 7 | The projects build with GCC, clang, IC2022 and VS2019.
 8 | In Visual C++, select x64 and a solution configuration for IC2022, Release, Debug or clang.
 9 | 
10 | The getting started project shows some example use cases for vectors, filters and views, together with an experimental
11 | vectorised forward AAD getting option sensitivities.
12 | 
13 | The accumulate example shows some of the use cases given in the cppCon2022 talk.
14 | Additionally, it gives an example of error correction using Kahan accumulation.
15 | 
16 | The examples build and run with VS2019, clang and Intel compilers. The target instruction set 
17 | generated by the framework can be changed by changing the namespace. These are double and float
18 | types; VecDb is a pair of doubles. Uncomment the namespace and build the example.
19 | 
20 | //using namespace DRC::VecDb;
21 | 
22 | //using namespace DRC::VecD2D;  //sse2   double
23 | 
24 | using namespace DRC::VecD4D;	//avx2   double
25 | 
26 | //using namespace DRC::VecF8F;	// avx2  float
27 | 
28 | //using namespace DRC::VecD8D;  //avx512 double
29 | 
30 | //using namespace DRC::VecF16F; //avx512   float
31 | 
32 | 
33 | For a machine supporting AVX512, ensure all the Visual Studio projects are set to use the enhanced instruction set: 
34 | Configuration Properties / C++ / Instruction Set / Enable Enhanced Instruction Set, set to /arch:AVX512.
35 | If your machine doesn't support this, reduce to AVX2 or SSE2, and don't select a namespace in the code requiring a more advanced instruction
36 | set.
37 | 
38 | Uncomment one of the using namespace lines to select the instruction set that you wish to run.
39 | Those ending in F have float as the underlying type; those ending in D have double.
40 | 
41 | The project is set to compile using the AVX512 enhanced instruction set. The namespace selection 
42 | chooses the type of the intrinsics that are used to instantiate lambdas.
43 | 
44 | If your hardware does not support AVX512, choose the next level down (AVX2) and avoid using namespaces 
45 | DRC::VecD8D or DRC::VecF16F, which will cause generation of code with instructions that your computer doesn't support. 
46 | 
47 | Check Device Manager / Processors to determine what processor you have, and check it against the web site 
48 | https://ark.intel.com/content/www/us/en/ark/products/123550/intel-xeon-silver-4114-processor-13-75m-cache-2-20-ghz.html
49 | or
50 | https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html
51 | 
52 | 
53 | The getting started project shows the usage of vectors, lambdas and filters.
54 | 
55 | The accumulateExample builds performance examples covered in the cppCon2022 talk. 
56 | They give the user the chance to change between ICC, clang and VS2019 builds and to change the
57 | instruction set used via the using declaration.
58 | 
59 | The inverseCumNormalExample gives the performance example shown in cppCon2022, although there might be some slight
60 | performance regression on one or two of the examples. It's instructive to run the examples after building with the
61 | different compilers and choosing different instruction sets for the lambdas (via namespace).
62 | 
63 | The AVX512Dance function runs a routine which finds the max value in an array, using AVX2 and AVX512. By monitoring the
64 | power usage using something like Open Hardware Monitor, it's possible to see that using the AVX512 instructions uses less
65 | energy to do the compute than AVX2 (on this Silver 4114 Xeon).
66 | 
67 | VectorTest is a selection of tests using googletest.  
68 | The main library is Vectorisation. This references a local copy of the VCL2 library, which has a slight change to enable
69 | VCL2 to be used with the Intel IC2022 compiler.
70 | 
71 | 
72 | ## Building DR3
73 | 
74 | See [docs/Build.md](docs/Build.md) for instructions on how to build DR3 from source and a list of supported platforms.
75 | 
76 | 


--------------------------------------------------------------------------------
/VariadicReduction/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(VariadicExample VariadicReduction.cpp)
2 | 
3 | target_link_libraries(VariadicExample PUBLIC Vectorisation)
4 | 
5 | target_include_directories(VariadicExample PUBLIC
6 |         "${PROJECT_BINARY_DIR}"
7 | )
8 | 


--------------------------------------------------------------------------------
/VariadicReduction/VariadicReducrion.vcxproj.filters:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup>
 4 |     <Filter Include="Source Files">
 5 |       <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
 6 |       <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
 7 |     </Filter>
 8 |     <Filter Include="Header Files">
 9 |       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
10 |       <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
11 |     </Filter>
12 |     <Filter Include="Resource Files">
13 |       <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
14 |       <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
15 |     </Filter>
16 |   </ItemGroup>
17 |   <ItemGroup>
18 |     <ClCompile Include="VariadicReduction.cpp">
19 |       <Filter>Source Files</Filter>
20 |     </ClCompile>
21 |   </ItemGroup>
22 | </Project>


--------------------------------------------------------------------------------
/VariadicReduction/VariadicReduction.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 12.00
 3 | # Visual Studio Version 17
 4 | VisualStudioVersion = 17.7.34202.233
 5 | MinimumVisualStudioVersion = 10.0.40219.1
 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "VariadicReduction", "VariadicReduction.vcxproj", "{271CF3D5-72FF-4657-9325-4206B8D5C84F}"
 7 | EndProject
 8 | Global
 9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		clang-cl23|x64 = clang-cl23|x64
11 | 		clang-cl23|x86 = clang-cl23|x86
12 | 		Debug|x64 = Debug|x64
13 | 		Debug|x86 = Debug|x86
14 | 		ICC2023|x64 = ICC2023|x64
15 | 		ICC2023|x86 = ICC2023|x86
16 | 		Release|x64 = Release|x64
17 | 		Release|x86 = Release|x86
18 | 		Release-23|x64 = Release-23|x64
19 | 		Release-23|x86 = Release-23|x86
20 | 	EndGlobalSection
21 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
22 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x64.ActiveCfg = clang-cl23|x64
23 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x64.Build.0 = clang-cl23|x64
24 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x86.ActiveCfg = clang-cl23|Win32
25 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.clang-cl23|x86.Build.0 = clang-cl23|Win32
26 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x64.ActiveCfg = Debug|x64
27 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x64.Build.0 = Debug|x64
28 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x86.ActiveCfg = Debug|Win32
29 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Debug|x86.Build.0 = Debug|Win32
30 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x64.ActiveCfg = ICC2023|x64
31 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x64.Build.0 = ICC2023|x64
32 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x86.ActiveCfg = ICC2023|Win32
33 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.ICC2023|x86.Build.0 = ICC2023|Win32
34 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x64.ActiveCfg = Release|x64
35 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x64.Build.0 = Release|x64
36 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x86.ActiveCfg = Release|Win32
37 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release|x86.Build.0 = Release|Win32
38 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x64.ActiveCfg = Release-23|x64
39 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x64.Build.0 = Release-23|x64
40 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x86.ActiveCfg = Release-23|Win32
41 | 		{271CF3D5-72FF-4657-9325-4206B8D5C84F}.Release-23|x86.Build.0 = Release-23|Win32
42 | 	EndGlobalSection
43 | 	GlobalSection(SolutionProperties) = preSolution
44 | 		HideSolutionNode = FALSE
45 | 	EndGlobalSection
46 | 	GlobalSection(ExtensibilityGlobals) = postSolution
47 | 		SolutionGuid = {347FE8D5-D275-4584-8F15-DD105566C258}
48 | 	EndGlobalSection
49 | EndGlobal
50 | 


--------------------------------------------------------------------------------
/VectorTest/TestAllocator.cpp:
--------------------------------------------------------------------------------
  1 | #include "pch.h"
  2 | 
  3 | #include "../Vectorisation/VecX/vec.h"
  4 | #include "../Vectorisation/VecX/operations.h"
  5 | #include "../Vectorisation/VecX/vec_bool_d.h"
  6 | #include "../Vectorisation/VecX/vec_double.h"
  7 | #include  "../Vectorisation/VecX/alloc_policy.h"
  8 | #include  "../Vectorisation/VecX/alloc_policy_imp.h"
  9 | #include "../Vectorisation/VecX/target_name_space.h"
 10 | 
 11 | 
 12 | // Allocates 20 doubles from a PoolStrat<double> constructed with 10, then frees
 13 | // pos() of them in newest-first order. Smoke test only: no pool state is asserted.
 14 | TEST(TestCaseAlloc, fillup_empty_last) {
 15 |   EXPECT_EQ(1, 1);
 16 |   EXPECT_TRUE(true);
 17 | 
 18 |   PoolStrat<double> myPool(10);
 19 |   const int elementCount = 20;
 20 |   std::vector<double*> allocated;
 21 | 
 22 |   int cursor = myPool.pos();
 23 |   int idx = 0;
 24 |   while (idx < elementCount)
 25 |   {
 26 |     double* slot = myPool.alloc();
 27 |     cursor = myPool.pos();
 28 |     *slot = idx;
 29 |     allocated.push_back(slot);
 30 |     ++idx;
 31 |   }
 32 | 
 33 |   int poolSize = myPool.size();
 34 |   cursor = myPool.pos();
 35 | 
 36 |   // Release in reverse allocation order; the count is pos() as read after the fill.
 37 |   for (int remaining = cursor; remaining > 0; --remaining)
 38 |   {
 39 |     double* last = allocated.back();
 40 |     allocated.pop_back();
 41 |     myPool.free(last);
 42 |     cursor = myPool.pos();
 43 |   }
 44 | }
 45 | 
 46 | 
 47 | // Allocates 20 doubles from a PoolStrat<double> constructed with 10, frees the
 48 | // tracked pointers from index pos()-2 down to 0, then mixes further alloc/free
 49 | // calls. Smoke test only: no pool contents are asserted yet.
 50 | TEST(TestCaseAlloc, fillup_empty_secondlast) {
 51 | 	EXPECT_EQ(1, 1);
 52 | 	EXPECT_TRUE(true);
 53 | 
 54 | 	PoolStrat<double> myPool(10);
 55 | 	const int elementCount = 20;
 56 | 	std::vector<double*> allocated;
 57 | 
 58 | 	int cursor = myPool.pos();
 59 | 	int idx = 0;
 60 | 	while (idx < elementCount)
 61 | 	{
 62 | 		double* slot = myPool.alloc();
 63 | 		cursor = myPool.pos();
 64 | 		*slot = idx;
 65 | 		allocated.push_back(slot);
 66 | 		++idx;
 67 | 	}
 68 | 
 69 | 	int poolSize = myPool.size();
 70 | 	cursor = myPool.pos();
 71 | 
 72 | 	// Walk down from index pos()-2 to 0; index pos()-1 and above are not touched here.
 73 | 	for (int victim = cursor - 2; victim >= 0; --victim)
 74 | 	{
 75 | 		myPool.free(allocated[victim]);
 76 | 		cursor = myPool.pos();
 77 | 	}
 78 | 
 79 | 	// Allocate one more element and hand it straight back.
 80 | 	auto extra = myPool.alloc();
 81 | 	*extra = 88;
 82 | 	myPool.free(extra);
 83 | 
 84 | 	// Free the final pointer recorded during the fill.
 85 | 	myPool.free(allocated[elementCount - 1]);
 86 | 
 87 | 	// Original note: everything is deleted at this point; add one element (99).
 88 | 	auto fresh = myPool.alloc();
 89 | 	*fresh = 99;
 90 | 
 91 | 	// Three more allocations holding 44, 45, 46.
 92 | 	for (int extraIdx = 0; extraIdx < 3; ++extraIdx)
 93 | 	{
 94 | 		auto more = myPool.alloc();
 95 | 		*more = 44 + extraIdx;
 96 | 	}
 97 | 
 98 | 
 99 | 	// TODO: still need to test some values
100 | }
101 | 
102 | 
103 | using namespace DRC::VecD4D;
104 | 
105 | // Checks that values read from w = log(-a) are not disturbed by a later,
106 | // unrelated temporary (-b) built from a copy of the same source vector.
107 | TEST(TestCaseAlloc, monkyBusinessBuffer) {
108 | 	EXPECT_EQ(1, 1);
109 | 	EXPECT_TRUE(true);
110 | 
111 | 	std::vector<double> ones(21, 1.0);
112 | 	VecXX Vec2(ones);
113 | 
114 | 	// Chain of copies, kept in the original order.
115 | 	auto d = Vec2;
116 | 	auto a = d;
117 | 	auto b = a;
118 | 	auto c = b;
119 | 
120 | 	a *= -1.0;
121 | 	auto w = log(-a);
122 | 
123 | 	// Snapshot w element by element before creating another temporary.
124 | 	std::vector<double> snapshot(w.size());
125 | 	for (size_t n = 0; n < w.size(); ++n)
126 | 	{
127 | 		snapshot[n] = w[n];
128 | 	}
129 | 
130 | 	auto aa = -b;
131 | 
132 | 	// The negation above must leave w unchanged.
133 | 	for (size_t n = 0; n < w.size(); ++n)
134 | 	{
135 | 		const double before = snapshot[n];
136 | 		const double after = w[n];
137 | 		EXPECT_EQ(before, after);
138 | 	}
139 | }


--------------------------------------------------------------------------------
/VectorTest/TestCurve.cpp:
--------------------------------------------------------------------------------
 1 | #include "pch.h"
 2 | 
 3 | 
 4 | #include "../../Vectorisation/ExampleVectors/curve.h"
 5 | #include "../Vectorisation/VecX/operations.h"
 6 | #include "../Vectorisation/VecX/vec_bool_d.h"
 7 | #include "../Vectorisation/VecX/vec_double.h"
 8 | #include  "../Vectorisation/VecX/alloc_policy.h"
 9 | 
10 | typedef VecD<VecDouble>  VecxD;
11 | typedef VecD<VecDouble>  Vecx;
12 | typedef Vec<VecDouble>  VecXX;
13 | 
14 | TEST(TestCaseCurve, Test1) {
15 | 	EXPECT_EQ(1, 1);
16 | 	EXPECT_TRUE(true);
17 | 	// Part 1: scalar Curve<double, double> where each value equals its date (0..10).
18 | 	std::vector<double>  values{ 0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0 };
19 | 	std::vector <long>   dates = { 0,1,2,3,4,5,6,7,8,9,10 };
20 | 	std::vector <double>   datesD = { 0,1,2,3,4,5,6,7,8,9,10 };
21 | 	// NOTE: dates (long) is not used anywhere in this test.
22 | 
23 | 	// setValues receives values first here, but dates first in the two vector-valued
24 | 	// cases below; this is harmless only because both vectors hold 0..10.
25 | 	Curve< double, double> testCurve;
26 | 	testCurve.setValues(begin(values), end(values), begin(datesD), end(datesD)); //wrong way round
27 | 
28 | 	auto val = testCurve.valueAt(0.0);
29 | 
30 | 	EXPECT_EQ(val, 0.0);
31 | 	val = testCurve.valueAt(0.5);
32 | 	EXPECT_EQ(val, 0.5);
33 | 	// NOTE(review): expecting 0.5 at 0.5 presumably relies on Curve's default interpolation - confirm in curve.h.
34 | 
35 | 	// Part 2: inputs for the vector-valued curves: 11 objects built as VecXX(i * 0.001 + 0.06, 100).
36 | 	std::vector< VecXX>  vecVals;
37 | 	for (int i = 0; i < 11; i++)
38 | 	{
39 | 		VecXX vals(i * 0.001 + 0.06, 100);
40 | 		vecVals.push_back(vals);
41 | 
42 | 	}
43 | 
44 | 	// Curve with ZeroInterp: valueAt is evaluated at 0.0 and 0.5; the results are not asserted.
45 | 	{
46 | 		using ZeroCrv = Curve< double, VecXX, ZeroInterp<double, VecXX> >;
47 | 
48 | 		ZeroCrv testCurve2;
49 | 		testCurve2.setValues(begin(datesD), end(datesD), begin(vecVals), end(vecVals));
50 | 
51 | 		auto valV = testCurve2.valueAt(0.0);
52 | 
53 | 		auto valV2 = testCurve2.valueAt(0.5);
54 | 	}
55 | 	// Part 3: the same data through Curve2 with ZeroInterpCached (constructed with 10).
56 | 	// valueAt(0.5) is then called 10000 more times; nothing is asserted.
57 | 	{
58 | 		std::vector<double>  values{ 0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0 };
59 | 		std::vector <long>   dates = { 0,1,2,3,4,5,6,7,8,9,10 };
60 | 		std::vector <double>   datesD = { 0,1,2,3,4,5,6,7,8,9,10 };
61 | 		// NOTE: the locals in this block shadow the identically named outer ones.
62 | 		std::vector< VecXX>  vecVals;
63 | 		for (int i = 0; i < 11; i++)
64 | 		{
65 | 			VecXX vals(i * 0.001 + 0.06, 100);
66 | 			vecVals.push_back(vals);
67 | 
68 | 		}
69 | 
70 | 
71 | 		Curve2<double, VecXX, ZeroInterpCached<double, VecXX>>  testCurve2(10);
72 | 		testCurve2.setValues(begin(datesD), end(datesD), begin(vecVals), end(vecVals));
73 | 
74 | 		auto valV = testCurve2.valueAt(0.0);
75 | 
76 | 		auto valV2 = testCurve2.valueAt(0.5);
77 | 
78 | 
79 | 		for (long l = 0; l < 10000; l++)
80 | 		{
81 | 			auto valV3 = testCurve2.valueAt(0.5);
82 | 		}
83 | 	}
84 | 	// Disabled checks kept from the original:
85 | 	//EXPECT_EQ(val, 0.0);
86 | 	//val = testCurve.valueAt(0.5);
87 | 	//EXPECT_EQ(val, 0.5);
88 | }


--------------------------------------------------------------------------------
/VectorTest/TestFilterTransform.cpp:
--------------------------------------------------------------------------------
 1 | #include "pch.h"
 2 | 
 3 | #include "../Vectorisation/VecX/vec.h"
 4 | #include "../Vectorisation/VecX/operations.h"
 5 | #include "../Vectorisation/VecX/vec_bool_d.h"
 6 | #include "../Vectorisation/VecX/vec_double.h"
 7 | #include  "../Vectorisation/VecX/alloc_policy.h"
 8 | #include "../Vectorisation/VecX/vec_d.h"
 9 | #include "../Vectorisation/VecX/vec_bool.h"
10 | #include "../Vectorisation/VecX/vec_view.h"
11 | 
12 | #include "../Vectorisation/VecX/target_name_space.h"
13 | 
14 | 
15 | #include "../Vectorisation/VecX/dr3.h"
16 | #include "dr3TestUtil.h"
17 | 
18 | #include <numeric>
19 | #include "testNamespace.h"
20 | 
21 | 
22 | // Exercises filterTransform on a vector holding 0, 1, ..., SZ-1: for each index j
23 | // a predicate matching only the value j is applied, and the result is expected to
24 | // keep element j (true branch, identity) while every other element is negated.
25 | void testFilterTransform(int SZ )
26 | {
27 | 	std::vector<Numeric> input(SZ,asNumber( 0.0));
28 | 	std::iota(begin(input), end(input), asNumber(0.0));
29 | 
30 | 	VecXX testVec(input);
31 | 	auto trueLambdaS = [&](auto x) { return x; };
32 | 	auto falseLambdaS = [&](auto x) { return -x; };
33 | 
34 | 	for (int j = 0; j < SZ; ++j)
35 | 	{
36 | 		// Accept only x with j - 0.00001 < x < j + 0.0001, i.e. the element equal to j.
37 | 		auto onlyJlambda = [=](auto x) { return (asNumber(j) > (x - asNumber(0.0001)) && (asNumber(j) < x + asNumber(0.00001))); };
38 | 		VecXX res =  filterTransform(onlyJlambda, testVec, trueLambdaS, falseLambdaS);
39 | 
40 | 		for (int k = 0; k < SZ; k++)
41 | 		{
42 | 			if( k==j)
43 | 			{
44 | 				EXPECT_NUMERIC_EQ(res[k], asNumber( k));
45 | 			}
46 | 			else
47 | 			{
48 | 				EXPECT_NUMERIC_EQ(res[k],  asNumber(-k));
49 | 			}
50 | 		}
51 | 	}
52 | }
53 | 
54 | 
55 | 
56 | 
57 | // Runs testFilterTransform for every size from 3 to 32, then for the
58 | // sizes 34, 65, 63 and 64, in that order.
59 | TEST(TestFilterTransform, testTransformEachPoint)
60 | {
61 | 	int size = 3;
62 | 	while (size < 33)
63 | 	{
64 | 		testFilterTransform(size++);
65 | 	}
66 | 	const int extraSizes[] = { 34, 65, 63, 64 };
67 | 	for (int extraSize : extraSizes)
68 | 	{
69 | 		testFilterTransform(extraSize);
70 | 	}
71 | }
72 | 


--------------------------------------------------------------------------------
/VectorTest/TestScan.cpp:
--------------------------------------------------------------------------------
  1 | #include "pch.h"
  2 | 
  3 | 
  4 | 
  5 | #include "../Vectorisation/VecX/vec.h"
  6 | #include "../Vectorisation/VecX/operations.h"
  7 | #include "../Vectorisation/VecX/vec_bool_d.h"
  8 | #include "../Vectorisation/VecX/vec_double.h"
  9 | #include  "../Vectorisation/VecX/alloc_policy.h"
 10 | 
 11 | #include "../Vectorisation/VecX/vec_d.h"
 12 | #include "../Vectorisation/VecX/vec_bool.h"
 13 | #include "../Vectorisation/VecX/vec_view.h"
 14 | 
 15 | #include "../Vectorisation/VecX/target_name_space.h"
 16 | 
 17 | 
 18 | #include "../Vectorisation/VecX/dr3.h"
 19 | #include "../Vectorisation/VecX/scan.h"
 20 | #include "../Vectorisation/VecX/instruction_traits.h"
 21 | 
 22 | 
 23 | #include "testNamespace.h"
 24 | #include "dr3TestUtil.h"
 25 | 
 26 | #include <algorithm>
 27 | 
 28 | #include <numeric>
 29 | 
 30 | 
 31 | 
 32 | 
 33 | 
 34 | void testScan(int SZ)
 35 | {
 36 | 
 37 | 
 38 | 	std::vector<Numeric> input(SZ, asNumber(0.0));
 39 | 	std::iota(begin(input), end(input), asNumber(0.0));
 40 | 
 41 | 	VecXX testVec(input);
 42 | 	auto add = [](auto x, auto y) {return x + y; };
 43 | 
 44 | 
 45 | 	for (int j = 0; j < SZ; ++j)
 46 | 	{
 47 | 
 48 | 		auto res = scan( testVec, add);
 49 | 
 50 | 		std::vector<Numeric> dbg = res;
 51 | 
 52 | 		auto expected = testVec[0];
 53 | 
 54 | 		EXPECT_NUMERIC_EQ(expected, res[0]);
 55 | 
 56 | 		for (int k = 1; k < SZ; k++)
 57 | 		{
 58 | 			expected += testVec[k] ;
 59 | 			EXPECT_NUMERIC_EQ(expected, res[k]);	
 60 | 		}
 61 | 	}
 62 | 
 63 | 
 64 | 
 65 | 
 66 | }
 67 | 
 68 | 
 69 | 
 70 | long double getErr(long double)
 71 | {
 72 | 	return std::pow(10, 4 - 16);
 73 | }
 74 | 
 75 | double getErr(double)
 76 | {
 77 | 	return std::pow(10, 4 - 16);
 78 | }
 79 | 
 80 | double getErr(float)
 81 | {
 82 | 	return std::pow(10, 4 - 8);
 83 | }
 84 | 
 85 | 
 86 | void testScan1(int SZ ,double start)
 87 | {
 88 | 
 89 | 
 90 | 	std::vector<Numeric> input(SZ, asNumber(0.0));
 91 | 	std::iota(begin(input), end(input), asNumber(start));
 92 | 
 93 | 	
 94 | 	Numeric err = getErr(Numeric(0.));
 95 | 
 96 | 	VecXX testVec(input);
 97 | 	auto add = [](auto x, auto y) {return x + y; };
 98 | 
 99 | 
100 | 	for (int j = 0; j < SZ; ++j)
101 | 	{
102 | 
103 | 		auto res = scan(testVec, add);
104 | 
105 | 		std::vector<Numeric> dbg = res;
106 | 	
107 | 		std::vector<Numeric> expected;
108 | 		std::inclusive_scan(cbegin(input), cend(input), std::back_inserter( expected));
109 | 
110 | 		EXPECT_NEAR(expected[0], res[0], err);
111 | 
112 | 		for (int k = 1; k < SZ; k++)
113 | 		{
114 | 			auto relErr = err * std::max(Numeric(1.), std::abs(Numeric(expected[k])));
115 | 			EXPECT_NEAR(expected[k], res[k], relErr);
116 | 			
117 | 		}
118 | 	}
119 | 
120 | }
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | TEST(TestScan, scanShortVector)
128 | {
129 | 
130 | 	for (int SZ = 3; SZ < 33; SZ++)
131 | 	{
132 | 		testScan(SZ);
133 | 	}
134 | 
135 | 	for (int SZ = 3; SZ < 133; SZ++)
136 | 	{
137 | 		testScan1(SZ,3.14);
138 | 	}
139 | 
140 | }
141 | 
142 | 
143 | 
144 | 
145 | 
146 | void testTransformScan1(int SZ, double start)
147 | {
148 | 
149 | 
150 | 	std::vector<Numeric> input(SZ, asNumber(0.0));
151 | 	std::iota(begin(input), end(input), asNumber(start));
152 | 
153 | 
154 | 	Numeric err = getErr(Numeric(0.));
155 | 
156 | 	VecXX testVec(input);
157 | 	auto SQR = [](auto x) { return x * x; };
158 | 
159 | 	auto sqrVec = transform( [](auto x) {return x * x; }, testVec);
160 | 	std::vector< Numeric> sq = sqrVec;
161 | 	auto add = [](auto x, auto y) {return x + y; };
162 | 
163 | 
164 | 	for (int j = 0; j < SZ; ++j)
165 | 	{
166 | 
167 | 		auto res = ApplyTransformScan(testVec, add, SQR);
168 | 
169 | 		std::vector<Numeric> dbg = res;
170 | 
171 | 		std::vector<Numeric> expected;
172 | 		std::inclusive_scan(cbegin(sq), cend(sq), std::back_inserter(expected));
173 | 
174 | 		EXPECT_NEAR(expected[0], res[0], err);
175 | 
176 | 		for (int k = 1; k < SZ; k++)
177 | 		{
178 | 			auto relErr = err * std::max(Numeric(1.), std::abs(Numeric(expected[k])));
179 | 			EXPECT_NEAR(expected[k], res[k], relErr);
180 | 
181 | 		}
182 | 	}
183 | 
184 | }
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | TEST(TestTransformScanTransform, transformScanShortVector)
192 | {
193 | 
194 | 	for (int SZ = 3; SZ < 33; SZ++)
195 | 	{
196 | 		testTransformScan1(SZ,0);
197 | 	}
198 | 
199 | 	for (int SZ = 3; SZ < 133; SZ++)
200 | 	{
201 | 		testTransformScan1(SZ, 3.14);
202 | 	}
203 | 
204 | }
205 | 
206 | 
207 | 
208 | 
209 | 
210 | 
211 | 
212 | void testTransformScan2(int SZ, double start)
213 | {
214 | 
215 | 
216 | 	std::vector<Numeric> input(SZ, asNumber(0.0));
217 | 	std::iota(begin(input), end(input), asNumber(start));
218 | 
219 | 
220 | 	Numeric err = getErr(Numeric(0.));
221 | 
222 | 	VecXX testVec(input);
223 | 
224 | 	VecXX testVec1 = testVec + 1.0;
225 | 
226 | 	auto MULT = [](auto x,auto y) { return x * y; };
227 | 
228 | 	auto multVec = testVec * testVec1;
229 | 
230 | 
231 | 	std::vector< Numeric> prod = multVec;
232 | 	auto add = [](auto x, auto y) {return x + y; };
233 | 
234 | 
235 | 	for (int j = 0; j < SZ; ++j)
236 | 	{
237 | 
238 | 		auto res = ApplyTransformScan(testVec, testVec1, add, MULT);
239 | 
240 | 		std::vector<Numeric> dbg = res;
241 | 
242 | 		std::vector<Numeric> expected;
243 | 		std::inclusive_scan(cbegin(prod), cend(prod), std::back_inserter(expected));
244 | 
245 | 		EXPECT_NEAR(expected[0], res[0], err);
246 | 
247 | 		for (int k = 1; k < SZ; k++)
248 | 		{
249 | 			auto relErr = err * std::max(Numeric(1.), std::abs(Numeric(expected[k])));
250 | 			EXPECT_NEAR(expected[k], res[k], relErr);
251 | 
252 | 		}
253 | 	}
254 | 
255 | }
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | TEST(TestTransformScanTransform, transformScanShortVectorBinary)
263 | {
264 | 
265 | 	for (int SZ = 3; SZ < 33; SZ++)
266 | 	{
267 | 		testTransformScan2(SZ, 0);
268 | 	}
269 | 
270 | 	for (int SZ = 3; SZ < 133; SZ++)
271 | 	{
272 | 		testTransformScan2(SZ, 3.14);
273 | 	}
274 | 
275 | }
276 | 
277 | 
278 | 


--------------------------------------------------------------------------------
/VectorTest/VectorTest.log:
--------------------------------------------------------------------------------
1 | ﻿C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Current\Bin\Microsoft.Common.CurrentVersion.targets(820,5): error : The BaseOutputPath/OutputPath property is not set for project 'VectorTest.vcxproj'.  Please check to make sure that you have specified a valid combination of Configuration and Platform for this project.  Configuration='Debug'  Platform='ARM64'.  This error may also appear if some other project is trying to follow a project-to-project reference to this project, this project has been unloaded or is not included in the solution, and the referencing project does not build using the same or an equivalent Configuration or Platform.
2 | 


--------------------------------------------------------------------------------
/VectorTest/VectorTest.vcxproj.user:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup />
4 | </Project>


--------------------------------------------------------------------------------
/VectorTest/dr3TestUtil.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "pch.h"
 3 | 
 4 | 
 5 | #include "../Vectorisation/VecX/vec.h"
 6 | #include "testNamespace.h"
 7 | 
 8 | Numeric asNumber(long double x);
 9 | 
10 | Numeric asNumber(double x);
11 | 
12 | Numeric asNumber(float x);
13 | 
14 | Numeric asNumber(int x);
15 | 
16 | void EXPECT_NUMERIC_EQ(long double x, long double y);
17 | 
18 | void EXPECT_NUMERIC_EQ(double x, double y);
19 | 
20 | void EXPECT_NUMERIC_EQ(float x, float y);
21 | 
22 | void EXPECT_NUMERIC_EQ(int x, int y);
23 | 
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/VectorTest/packages.config:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <packages>
3 |   <package id="Microsoft.googletest.v140.windesktop.msvcstl.static.rt-dyn" version="1.8.1.7" targetFramework="native" />
4 | </packages>


--------------------------------------------------------------------------------
/VectorTest/pch.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // pch.cpp
3 | //
4 | 
5 | //#include "pch.h"
6 | 


--------------------------------------------------------------------------------
/VectorTest/pch.h:
--------------------------------------------------------------------------------
1 | //
2 | // pch.h
3 | //
4 | 
5 | #pragma once
6 | 
7 | #include "gtest/gtest.h"
8 | 


--------------------------------------------------------------------------------
/VectorTest/testNamespace.cpp:
--------------------------------------------------------------------------------
 1 | #include "testNamespace.h"
 2 | 
 3 | Numeric asNumber(long double x)
 4 | {
 5 | 	return static_cast<Numeric>(x);
 6 | }
 7 | 
 8 | Numeric asNumber(double x)
 9 | {
10 | 	return static_cast<Numeric>(x);
11 | }
12 | 
13 | Numeric asNumber(float x)
14 | {
15 | 	return static_cast<Numeric>(x);
16 | }
17 | 
18 | 
19 | Numeric asNumber(int x)
20 | {
21 | 	return static_cast<Numeric>(x);
22 | }
23 | 
24 | 
25 | void EXPECT_NUMERIC_EQ(long double x, long double y)
26 | {
27 | 	EXPECT_DOUBLE_EQ(x, y);
28 | }
29 | 
30 | 
31 | void EXPECT_NUMERIC_EQ(double x, double y)
32 | {
33 | 	EXPECT_DOUBLE_EQ(x, y);
34 | }
35 | 
36 | 
37 | void EXPECT_NUMERIC_EQ(float x, float y)
38 | {
39 | 	EXPECT_FLOAT_EQ(x, y);
40 | }
41 | 
42 | 
43 | void EXPECT_NUMERIC_EQ(int x, int y)
44 | {
45 | 	EXPECT_EQ(x, y);
46 | }
47 | 
48 | 


--------------------------------------------------------------------------------
/VectorTest/testNamespace.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "pch.h"
 3 | 
 4 | 
 5 | #include "../Vectorisation/VecX/vec.h"
 6 | #include "../Vectorisation/VecX/target_name_space.h"
 7 | #include "../Vectorisation/VecX/instruction_traits.h"
 8 | 
 9 | //using namespace DRC::VecDb;
10 | //using namespace DRC::VecLDb;
11 | 
12 | //using namespace DRC::VecF4F;
13 | //using namespace DRC::VecD2D;
14 | using namespace DRC::VecD4D;
15 | //using namespace DRC::VecF8F;
16 | 
17 | //using namespace DRC::VecD8D;
18 | //using namespace DRC::VecF16F;
19 | 
20 | 
21 | using Numeric = InstructionTraits<VecXX::INS>::FloatType;
22 | #include "dr3TestUtil.h"
23 | 


--------------------------------------------------------------------------------
/VectorTest/test_precise_accumulation.cpp:
--------------------------------------------------------------------------------
  1 | #include "pch.h"
  2 | 
  3 | 
  4 | #include "../Vectorisation/VecX/vec.h"
  5 | #include "../Vectorisation/VecX/operations.h"
  6 | #include "../Vectorisation/VecX/vec_bool_d.h"
  7 | #include "../Vectorisation/VecX/vec_double.h"
  8 | #include  "../Vectorisation/VecX/alloc_policy.h"
  9 | #include  "../Vectorisation/VecX/accumulate_transform.h"
 10 | #include "../Vectorisation/VecX/target_name_space.h"
 11 | 
 12 | #include "../Vectorisation/VecX/dr3.h"
 13 | #include "testNamespace.h"
 14 | #include "dr3TestUtil.h"
 15 | 
 16 | #include <numeric>
 17 | 
 18 | 
 19 | auto  getVecBig(int SZ, std::vector<Numeric>& stl)
 20 | {
 21 | 	std::vector<Numeric>  v(SZ, asNumber(1.0/3.0));
 22 | 	int i = 0;
 23 | 	
 24 | 	VecXX test(v);
 25 | 	stl = v;
 26 | 	return  test;
 27 | 
 28 | }
 29 | 
 30 | 
 31 | void evalPrecAccumulate(int startLen, int endLen)
 32 | {
 33 | 
 34 | 	Numeric testEpsilon = 1e-10;
 35 | 	
 36 | 	for (int SZ = startLen; SZ <= endLen; SZ++)
 37 | 	{
 38 | 		std::vector<Numeric> v;
 39 | 		VecXX test = getVecBig(SZ, v);
 40 | 		using BINNED_ACCUMULATOR = BinsT<VecXX::INS>;
 41 | 		auto binned_Sum = reduce< BINNED_ACCUMULATOR >(test, BinnedAdd);
 42 | 
 43 | 		EXPECT_NEAR(double(SZ / 3.0), binned_Sum, testEpsilon);
 44 | 	}
 45 | 
 46 | }
 47 | 
 48 | TEST(TestPreciseAccumulator, simpleSummation) 
 49 | {
 50 | 	EXPECT_EQ(1, 1);
 51 | 	EXPECT_TRUE(true);
 52 | 
 53 | 	//eval over multiple lengths
 54 | 	evalPrecAccumulate(957, 1043);
 55 | 
 56 | 	//eval over very small lengths
 57 | 	evalPrecAccumulate(3, 23);
 58 | 
 59 | }
 60 | 
 61 | 
 62 | TEST(TestBin, simpleSummation2) // Adds values of different magnitude to a BinsT accumulator and checks lane 0 of each bin after every step.
 63 | {
 64 | 	EXPECT_EQ(1, 1);
 65 | 	EXPECT_TRUE(true);
 66 | 
 67 | 	// Accumulator under test; it is only ever inspected through extract(0) on its four bins.
 68 | 	BinsT<VecXX::INS> bin; 
 69 | 
 70 | 
 71 | 	// A freshly constructed accumulator is expected to read 0.0 in every bin.
 72 |     EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
 73 | 	EXPECT_EQ(bin.bigSummV.extract(0), 0.0);
 74 | 	EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
 75 | 	EXPECT_EQ(bin.tinyV.extract(0), 0.0);
 76 | 
 77 | 	// Step 1: 1e-16 is expected to be collected by tinyV only.
 78 | 	VecXX::INS testVal =1.0e-16;
 79 | 	bin += testVal;
 80 | 
 81 | 	EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
 82 | 	EXPECT_EQ(bin.bigSummV.extract(0), 0.0);
 83 | 	EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
 84 | 	EXPECT_EQ(bin.tinyV.extract(0), 1.0e-16);
 85 | 	// Step 2: a second 1e-16 is expected to accumulate in the same bin.
 86 | 	bin += testVal;
 87 | 
 88 | 	EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
 89 | 	EXPECT_EQ(bin.bigSummV.extract(0), 0.0);
 90 | 	EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
 91 | 	EXPECT_EQ(bin.tinyV.extract(0), 2.0e-16);
 92 | 
 93 | 	// Step 3: half of 1e-16 is expected to stay in tinyV as well.
 94 | 	bin += testVal / 2;
 95 | 
 96 | 	EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
 97 | 	EXPECT_EQ(bin.bigSummV.extract(0), 0.0);
 98 | 	EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
 99 | 	EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16);
100 | 
101 | 	//further tests for the other bins
102 | 	// Step 4: 1.0 is expected to go to bigSummV and leave tinyV at its earlier total.
103 | 	testVal = 1.0;
104 | 
105 | 	bin += testVal;
106 | 
107 | 	EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
108 | 	EXPECT_EQ(bin.bigSummV.extract(0), 1.0);
109 | 	EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
110 | 	EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16);
111 | 	// Step 5: adding 1.0 again is expected to bring bigSummV to 2.0.
112 | 	bin += testVal;
113 | 	EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
114 | 	EXPECT_EQ(bin.bigSummV.extract(0), 2.0);
115 | 	EXPECT_EQ(bin.smallSumV.extract(0), 0.0);
116 | 	EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16);
117 | 	// Step 6: 0.5 (testVal / 2) is expected to be routed to smallSumV; bigSummV stays at 2.0.
118 | 	bin += testVal / 2;
119 | 	
120 | 	EXPECT_EQ(bin.veryBigSummV.extract(0), 0.0);
121 | 	EXPECT_EQ(bin.bigSummV.extract(0), 2.0);
122 | 	EXPECT_EQ(bin.smallSumV.extract(0), 0.5);
123 | 	EXPECT_EQ(bin.tinyV.extract(0), 2.5e-16);
124 | 
125 | 	
126 | 
127 | 
128 | 	// Second accumulator: after adding (1/3) * 1e-3 only the two large bins are asserted; the smallSumV/tinyV checks below are disabled.
129 | 	BinsT<VecXX::INS> bin2;
130 | 
131 | 	auto oneThird = 1.0 / 3.0;
132 | 
133 | 
134 | 	bin2 += 1.0e-3 * oneThird;
135 | 
136 | 
137 | 	EXPECT_EQ(bin2.veryBigSummV.extract(0), 0.0);
138 | 	EXPECT_EQ(bin2.bigSummV.extract(0), 0.0);
139 | //	EXPECT_EQ(bin2.smallSumV.extract(0), 1.0/3.0 *1.0e-3);
140 | //	EXPECT_EQ(bin2.tinyV.extract(0), 0.0);
141 | 	// hsum() is called before and after scaling bin2 by 100000.0, but neither result is asserted.
142 | 	auto sum = bin2.hsum();
143 | 
144 | 	bin2 = bin2 *100000.0;
145 | 
146 | 	sum = bin2.hsum();
147 | 	// The block below is disabled; the same two calls run in TestPreciseAccumulator.simpleSummation.
148 | 	/*
149 | 
150 | 	//eval over multiple lengths
151 | 	evalPrecAccumulate(957, 1043);
152 | 
153 | 	//eval over very small lengths
154 | 	evalPrecAccumulate(3, 23);
155 | 	*/
156 | }
157 | 
158 | 


--------------------------------------------------------------------------------
/Vectorisation/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_library(Vectorisation STATIC
 2 |             VecX/alloc_policy.cpp
 3 |             VecX/vec.cpp)
 4 | 
 5 | if (MSVC)    
 6 |     # add_compile_options(/W4 /WX)
 7 | else()
 8 |     target_compile_options(Vectorisation PUBLIC "-march=native")
 9 | 
10 |     #  or -mavx/-mavx2/-mavx512f (and -march= options that imply them with relevant tuning settings)
11 |     #target_compile_options(Vectorisation PUBLIC "--std=c++17")
12 |     #target_compile_options(Vectorisation PUBLIC "-mavx2")
13 |     #target_compile_options(Vectorisation PUBLIC "-mfma")
14 | endif()
15 | 
16 |             


--------------------------------------------------------------------------------
/Vectorisation/TextFile1.txt:
--------------------------------------------------------------------------------
1 | 
2 | Vec4f a(0.0f, 0.5f, 1.0f, 1.5f); // define vector
3 | Vec4f b = sin(a);                // sin function
4 | // b = (0.0000f, 0.4794f, 0.8415f, 0.9975f)
5 | 
6 | 


--------------------------------------------------------------------------------
/Vectorisation/VCL/README.md:
--------------------------------------------------------------------------------
 1 | # version2
 2 | Vector Class Library, latest version
 3 | 
 4 | This is a C++ class library for using the Single Instruction Multiple Data (SIMD) instructions to improve performance on modern microprocessors with the x86 or x86/64 instruction set on Windows, Linux, and Mac platforms. There are no plans to support ARM or other instruction sets.
 5 | 
 6 | [Latest release](https://github.com/vectorclass/version2/releases)
 7 | 
 8 | [Download manual](https://github.com/vectorclass/manual/raw/master/vcl_manual.pdf)
 9 | 
10 | [Add-on packages for particular applications](https://github.com/vectorclass/add-on)
11 | 
12 | [Getting-started video.](https://www.youtube.com/watch?v=TKjYdLIMTrI) Video blogger Christopher Rose has made this nice video telling how to get started with the Vector Class Library.
13 | 
14 | **Help:** You may ask for programming help on [StackOverflow](https://stackoverflow.com) using the tag vector-class-library.
15 | 


--------------------------------------------------------------------------------
/Vectorisation/VCL/dispatch_example1.cpp:
--------------------------------------------------------------------------------
  1 | /*************************  dispatch_example1.cpp   ***************************
  2 | Author:        Agner Fog
  3 | Date created:  2012-05-30
  4 | Last modified: 2020-02-25
  5 | Version:       2.01.00
  6 | Project:       vector class library
  7 | 
  8 | Description:   Example of automatic CPU dispatching.
  9 |                This shows how to compile vector code in multiple versions, each
 10 |                optimized for a different instruction set. The optimal version is
 11 |                selected by a dispatcher at run time.
 12 | 
 13 | There are two examples of automatic dispatching:
 14 | 
 15 | dispatch_example1.cpp: Uses separate function names for each version.
 16 |                        This is useful for simple cases with one or a few functions.
 17 | 
 18 | dispatch_example2.cpp: Uses separate namespaces for each version.
 19 |                        This is the recommended method for cases with multiple functions,
 20 |                        classes, objects, etc.
 21 | 
 22 | The code has two sections: 
 23 | 
 24 | Dispatched code: This code is compiled multiple times to generate multiple instances
 25 | of the compiled code, each one optimized for a different instruction set. The
 26 | dispatched code section contains the speed-critical part of the program.
 27 | 
 28 | Common code: This code is compiled only once, using the lowest instruction set.
 29 | The common code section contains the dispatcher, startup code, user interface, and 
 30 | other parts of the program that do not need advanced optimization.
 31 | 
 32 | To compile this code, do as in this example:
 33 | 
 34 | # Example of compiling dispatch example with Gnu or Clang compiler:
 35 | # Compile dispatch_example1.cpp four times for different instruction sets:
 36 | 
 37 | # Compile for AVX
 38 | clang++ -O2 -m64 -mavx -std=c++17 -c dispatch_example1.cpp -od7.o
 39 | 
 40 | # Compile for AVX2
 41 | clang++ -O2 -m64 -mavx2 -mfma -std=c++17 -c dispatch_example1.cpp -od8.o
 42 | 
 43 | # Compile for AVX512
 44 | clang++ -O2 -m64 -mavx512f -mfma -mavx512vl -mavx512bw -mavx512dq -std=c++17 -c dispatch_example1.cpp -od10.o
 45 | 
 46 | # The last compilation uses the lowest supported instruction set (SSE2)
 47 | # This includes the main program, and links all versions together:
 48 | # (Change test.exe to test in Linux and Mac)
 49 | clang++ -O2 -m64 -msse2 -std=c++17 dispatch_example1.cpp instrset_detect.cpp d7.o d8.o d10.o -otest.exe
 50 | 
 51 | # Run the program
 52 | ./test.exe
 53 | 
 54 | (c) Copyright 2012-2020 Agner Fog.
 55 | Apache License version 2.0 or later.
 56 | ******************************************************************************/
 57 | 
 58 | /* The different instruction sets are defined in instrset_detect.cpp:
 59 | 2:  SSE2
 60 | 3:  SSE3
 61 | 4:  SSSE3 (Supplementary SSE3)
 62 | 5:  SSE4.1
 63 | 6:  SSE4.2
 64 | 7:  AVX
 65 | 8:  AVX2
 66 | 9:  AVX512F
 67 | 10: AVX512VL + AVX512BW + AVX512DQ
 68 | */
 69 | 
 70 | 
 71 | #include <stdio.h>
 72 | #include "vectorclass.h"
 73 | 
 74 | // Define function type
 75 | // Change this to fit the entry function. Should not contain vector types:
 76 | typedef float MyFuncType(float const []);
 77 | 
 78 | // function prototypes for each version
 79 | MyFuncType  myfunc_SSE2, myfunc_AVX, myfunc_AVX2, myfunc_AVX512;
 80 | 
 81 | // function prototypes for common entry point and dispatcher
 82 | MyFuncType  myfunc, myfunc_dispatch;
 83 | 
 84 | // Define name of entry function depending on which instruction set we compile for
 85 | #if   INSTRSET >= 10                   // AVX512VL
 86 | #define FUNCNAME myfunc_AVX512
 87 | #elif INSTRSET >= 8                    // AVX2
 88 | #define FUNCNAME myfunc_AVX2
 89 | #elif INSTRSET >= 7                    // AVX
 90 | #define FUNCNAME myfunc_AVX
 91 | #elif INSTRSET == 2
 92 | #define FUNCNAME myfunc_SSE2           // SSE2
 93 | #else
 94 | #error Unsupported instruction set
 95 | #endif
 96 | 
 97 | /******************************************************************************
 98 |                              Dispatched code
 99 | 
100 | Everything in this section is compiled multiple times, with one version for
101 | each instruction set. Speed-critical vector code belongs here.
102 | ******************************************************************************/
103 | 
104 | // This is the dispatched function that is compiled in multiple versions with different names.
105 | // Make sure this function is static to prevent clash with other versions having the same name.
106 | // The function cannot be member of a class.
107 | static float sum (float const f[]) {
108 |     // This example adds 16 floats
109 |     Vec16f a;                          // vector of 16 floats
110 |     a.load(f);                         // load array into vector
111 |     return horizontal_add(a);          // return sum of 16 elements
112 | }
113 | 
114 | // -----------------------------------------------------------------------------
115 | //                       Entry function
116 | // -----------------------------------------------------------------------------
117 | // This is the entry function that is accessed through the dispatcher.
118 | // This serves as the interface between the common code and the dispatched code.
119 | // The entry function cannot be member of a class.
120 | // The entry function must use arrays rather than vectors for input and output.
121 | float FUNCNAME (float const f[]) {
122 |     return sum(f);
123 | }
124 | 
125 | 
126 | /**********************************************************************************
127 |                              Common code
128 | 
129 | Everything in this section is compiled only once, using the lowest instruction set. 
130 | 
131 | The dispatcher must be placed here. Program main(), user interface, and other
132 | less critical parts of the code are also placed in the common code section.
133 | **********************************************************************************/
134 | 
135 | #if INSTRSET == 2
136 | // The common code is only included in the lowest of the compiled versions
137 | 
138 | 
139 | // ---------------------------------------------------------------------------------
140 | //                       Dispatcher
141 | // ---------------------------------------------------------------------------------
142 | // This function pointer initially points to the dispatcher.
143 | // After the first call, it points to the selected version of the entry function
144 | MyFuncType * myfunc_pointer = &myfunc_dispatch;            // function pointer
145 | 
146 | // Dispatcher
147 | float myfunc_dispatch(float const f[]) {
148 |     int iset = instrset_detect();                          // Detect supported instruction set
149 |     // Choose which version of the entry function we want to point to:
150 |     if      (iset >= 10) myfunc_pointer = &myfunc_AVX512;  // AVX512 version
151 |     else if (iset >=  8) myfunc_pointer = &myfunc_AVX2;    // AVX2 version
152 |     else if (iset >=  7) myfunc_pointer = &myfunc_AVX;     // AVX version
153 |     else if (iset >=  2) myfunc_pointer = &myfunc_SSE2;    // SSE2 version
154 |     else {
155 |         // Error: lowest instruction set not supported.
156 |         // Put any appropriate error handler here
157 |         fprintf(stderr, "\nError: Instruction set SSE2 not supported on this computer");
158 |         return 0.f;
159 |     }
160 |     // continue in dispatched version of the function
161 |     return (*myfunc_pointer)(f);
162 | }
163 | 
164 | 
165 | // Call the entry function through the function pointer.
166 | // The first time this function is called, it goes through the dispatcher.
167 | // The dispatcher will change the function pointer so that all subsequent
168 | // calls go directly to the optimal version of the entry function
169 | inline float myfunc(float const f[]) {
170 |     return (*myfunc_pointer)(f);                 // go to dispatched version
171 | }
172 | 
173 | 
174 | // ---------------------------------------------------------------------------------
175 | //                       Program main
176 | // ---------------------------------------------------------------------------------
177 | int main() {
178 | 
179 |     // array of 16 floats
180 |     float const a[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
181 | 
182 |     float sum = myfunc(a);                       // call function with dispatching
183 | 
184 |     printf("\nsum = %8.2f \n", sum);             // print result (= 136.00)
185 | 
186 |     return 0;
187 | }
188 | 
189 | #endif  // INSTRSET == 2
190 | 


--------------------------------------------------------------------------------
/Vectorisation/VCL/instrset_detect.cpp:
--------------------------------------------------------------------------------
  1 | /**************************  instrset_detect.cpp   ****************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2012-05-30
  4 | * Last modified: 2019-08-01
  5 | * Version:       2.00.00
  6 | * Project:       vector class library
  7 | * Description:
  8 | * Functions for checking which instruction sets are supported.
  9 | *
 10 | * (c) Copyright 2012-2019 Agner Fog.
 11 | * Apache License version 2.0 or later.
 12 | ******************************************************************************/
 13 | 
 14 | #include "instrset.h"
 15 | 
 16 | #ifdef VCL_NAMESPACE
 17 | namespace VCL_NAMESPACE {
 18 | #endif
 19 | 
 20 | 
 21 | // Define interface to xgetbv instruction: returns the 64-bit value read for register index ctr.
 22 | static inline uint64_t xgetbv (int ctr) {
 23 | #if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
 24 |     // Microsoft or Intel compiler supporting _xgetbv intrinsic
 25 |     // ctr is passed straight to the intrinsic and the result is widened to uint64_t.
 26 |     return uint64_t(_xgetbv(ctr));                    // intrinsic function for XGETBV
 27 | 
 28 | #elif defined(__GNUC__) ||  defined (__clang__)       // use inline assembly, Gnu/AT&T syntax
 29 |    // ctr goes in through the "c" constraint; the two 32-bit halves come back through "=a" and "=d".
 30 |    uint32_t a, d;
 31 |    __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
 32 |    return a | (uint64_t(d) << 32);                   // a is the low half, d the high half
 33 | 
 34 | #else  // #elif defined (_WIN32)                      // other compiler. try inline assembly with masm/intel/MS syntax
 35 |    uint32_t a, d;
 36 |     __asm {
 37 |         mov ecx, ctr
 38 |         _emit 0x0f
 39 |         _emit 0x01
 40 |         _emit 0xd0 ; // xgetbv
 41 |         mov a, eax
 42 |         mov d, edx
 43 |     }
 44 |    return a | (uint64_t(d) << 32);                   // same combination as the GNU branch: eax low, edx high
 45 | 
 46 | #endif
 47 | }
 48 | 
 49 | /* find supported instruction set
 50 |     return value:
 51 |     0           = 80386 instruction set
 52 |     1  or above = SSE (XMM) supported by CPU (not testing for OS support)
 53 |     2  or above = SSE2
 54 |     3  or above = SSE3
 55 |     4  or above = Supplementary SSE3 (SSSE3)
 56 |     5  or above = SSE4.1
 57 |     6  or above = SSE4.2
 58 |     7  or above = AVX supported by CPU and operating system
 59 |     8  or above = AVX2
 60 |     9  or above = AVX512F
 61 |    10  or above = AVX512VL, AVX512BW, AVX512DQ
 62 | */
 63 | int instrset_detect(void) {
 64 |     // Checks run in ascending order of level; the first missing feature returns the level reached so far.
 65 |     static int iset = -1;                                  // remember value for next call
 66 |     if (iset >= 0) {
 67 |         return iset;                                       // called before
 68 |     }
 69 |     iset = 0;                                              // default value
 70 |     int abcd[4] = {0,0,0,0};                               // cpuid results
 71 |     cpuid(abcd, 0);                                        // call cpuid function 0
 72 |     if (abcd[0] == 0) return iset;                         // no further cpuid function supported
 73 |     cpuid(abcd, 1);                                        // call cpuid function 1 for feature flags
 74 |     if ((abcd[3] & (1 <<  0)) == 0) return iset;           // no floating point
 75 |     if ((abcd[3] & (1 << 23)) == 0) return iset;           // no MMX
 76 |     if ((abcd[3] & (1 << 15)) == 0) return iset;           // no conditional move
 77 |     if ((abcd[3] & (1 << 24)) == 0) return iset;           // no FXSAVE
 78 |     if ((abcd[3] & (1 << 25)) == 0) return iset;           // no SSE
 79 |     iset = 1;                                              // 1: SSE supported
 80 |     if ((abcd[3] & (1 << 26)) == 0) return iset;           // no SSE2
 81 |     iset = 2;                                              // 2: SSE2 supported
 82 |     if ((abcd[2] & (1 <<  0)) == 0) return iset;           // no SSE3
 83 |     iset = 3;                                              // 3: SSE3 supported
 84 |     if ((abcd[2] & (1 <<  9)) == 0) return iset;           // no SSSE3
 85 |     iset = 4;                                              // 4: SSSE3 supported
 86 |     if ((abcd[2] & (1 << 19)) == 0) return iset;           // no SSE4.1
 87 |     iset = 5;                                              // 5: SSE4.1 supported
 88 |     if ((abcd[2] & (1 << 23)) == 0) return iset;           // no POPCNT
 89 |     if ((abcd[2] & (1 << 20)) == 0) return iset;           // no SSE4.2
 90 |     iset = 6;                                              // 6: SSE4.2 supported
 91 |     if ((abcd[2] & (1 << 27)) == 0) return iset;           // no OSXSAVE
 92 |     if ((xgetbv(0) & 6) != 6)       return iset;           // AVX not enabled in O.S.
 93 |     if ((abcd[2] & (1 << 28)) == 0) return iset;           // no AVX
 94 |     iset = 7;                                              // 7: AVX supported
 95 |     cpuid(abcd, 7);                                        // call cpuid leaf 7 for feature flags
 96 |     if ((abcd[1] & (1 <<  5)) == 0) return iset;           // no AVX2
 97 |     iset = 8;                                              // 8: AVX2 supported
 98 |     if ((abcd[1] & (1 << 16)) == 0) return iset;           // no AVX512
 99 |     cpuid(abcd, 0xD);                                      // call cpuid leaf 0xD for feature flags
100 |     if ((abcd[0] & 0x60) != 0x60)   return iset;           // no AVX512
101 |     iset = 9;                                              // 9: AVX512F supported
102 |     cpuid(abcd, 7);                                        // call cpuid leaf 7 for feature flags
103 |     if ((abcd[1] & (1 << 31)) == 0) return iset;           // no AVX512VL
104 |     if ((abcd[1] & 0x40020000) != 0x40020000) return iset; // no AVX512BW, AVX512DQ
105 |     iset = 10;                                             // 10: AVX512VL, AVX512BW, AVX512DQ supported
106 |     return iset;                                           // highest level: every check above passed
107 | }
108 | 
109 | // detect if CPU supports the FMA3 instruction set
110 | bool hasFMA3(void) {
111 |     if (instrset_detect() < 7) return false;               // must have AVX
112 |     int abcd[4];                                           // cpuid results
113 |     cpuid(abcd, 1);                                        // call cpuid function 1
114 |     return ((abcd[2] & (1 << 12)) != 0);                   // ecx bit 12 indicates FMA3
115 | }
116 | 
117 | // detect if CPU supports the FMA4 instruction set
118 | bool hasFMA4(void) {
119 |     if (instrset_detect() < 7) return false;               // must have AVX
120 |     int abcd[4];                                           // cpuid results
121 |     cpuid(abcd, 0x80000001);                               // call cpuid function 0x80000001
122 |     return ((abcd[2] & (1 << 16)) != 0);                   // ecx bit 16 indicates FMA4
123 | }
124 | 
125 | // detect if CPU supports the XOP instruction set
126 | bool hasXOP(void) {
127 |     if (instrset_detect() < 7) return false;               // must have AVX
128 |     int abcd[4];                                           // cpuid results
129 |     cpuid(abcd, 0x80000001);                               // call cpuid function 0x80000001
130 |     return ((abcd[2] & (1 << 11)) != 0);                   // ecx bit 11 indicates XOP
131 | }
132 | 
133 | // detect if CPU supports the F16C instruction set
134 | bool hasF16C(void) {
135 |     if (instrset_detect() < 7) return false;               // must have AVX
136 |     int abcd[4];                                           // cpuid results
137 |     cpuid(abcd, 1);                                        // call cpuid function 1
138 |     return ((abcd[2] & (1 << 29)) != 0);                   // ecx bit 29 indicates F16C
139 | }
140 | 
141 | // detect if CPU supports the AVX512ER instruction set
142 | bool hasAVX512ER(void) {
143 |     if (instrset_detect() < 9) return false;               // must have AVX512F
144 |     int abcd[4];                                           // cpuid results
145 |     cpuid(abcd, 7);                                        // call cpuid function 7
146 |     return ((abcd[1] & (1 << 27)) != 0);                   // ebx bit 27 indicates AVX512ER
147 | }
148 | 
149 | // detect if CPU supports the AVX512VBMI instruction set
150 | bool hasAVX512VBMI(void) {
151 |     if (instrset_detect() < 10) return false;              // must have AVX512BW
152 |     int abcd[4];                                           // cpuid results
153 |     cpuid(abcd, 7);                                        // call cpuid function 7
154 |     return ((abcd[2] & (1 << 1)) != 0);                    // ecx bit 1 indicates AVX512VBMI
155 | }
156 | 
157 | // detect if CPU supports the AVX512VBMI2 instruction set
158 | bool hasAVX512VBMI2(void) {
159 |     if (instrset_detect() < 10) return false;              // must have AVX512BW
160 |     int abcd[4];                                           // cpuid results
161 |     cpuid(abcd, 7);                                        // call cpuid function 7
162 |     return ((abcd[2] & (1 << 6)) != 0);                    // ecx bit 6 indicates AVX512VBMI2
163 | }
164 | 
165 | #ifdef VCL_NAMESPACE
166 | }
167 | #endif
168 | 


--------------------------------------------------------------------------------
/Vectorisation/VCL/vectorclass.h:
--------------------------------------------------------------------------------
 1 | /****************************  vectorclass.h   ********************************
 2 | * Author:        Agner Fog
 3 | * Date created:  2012-05-30
 4 | * Last modified: 2020-04-11
 5 | * Version:       2.01.02
 6 | * Project:       vector class library
 7 | * Home:          https://github.com/vectorclass
 8 | * Description:
 9 | * Header file defining vector classes as interface to intrinsic functions
10 | * in x86 and x86-64 microprocessors with SSE2 and later instruction sets.
11 | *
12 | * Instructions:
13 | * Use Gnu, Clang, Intel or Microsoft C++ compiler. Compile for the desired
14 | * instruction set, which must be at least SSE2. Specify the supported
15 | * instruction set by a command line define, e.g. __SSE4_1__ if the
16 | * compiler does not automatically do so.
17 | * For detailed instructions, see vcl_manual.pdf
18 | *
19 | * Each vector object is represented internally in the CPU as a vector
20 | * register with 128, 256 or 512 bits.
21 | *
22 | * This header file includes the appropriate header files depending on the
23 | * selected instruction set.
24 | *
25 | * (c) Copyright 2012-2020 Agner Fog.
26 | * Apache License version 2.0 or later.
27 | ******************************************************************************/
28 | #ifndef VECTORCLASS_H
29 | #define VECTORCLASS_H  20102
30 | 
31 | // Maximum vector size, bits. Allowed values are 128, 256, 512
32 | #ifndef MAX_VECTOR_SIZE
33 | #define MAX_VECTOR_SIZE 512
34 | #endif
35 | 
36 | // Determine instruction set, and define platform-dependent functions
37 | #include "instrset.h"        // Select supported instruction set
38 | 
39 | #if INSTRSET < 2             // instruction set SSE2 is the minimum
40 | #error Please compile for the SSE2 instruction set or higher
41 | #else
42 | 
43 | // Select appropriate .h files depending on instruction set
44 | #include "vectori128.h"      // 128-bit integer vectors
45 | #include "vectorf128.h"      // 128-bit floating point vectors
46 | 
47 | #if MAX_VECTOR_SIZE >= 256
48 | #if INSTRSET >= 8
49 | #include "vectori256.h"      // 256-bit integer vectors, requires AVX2 instruction set
50 | #else
51 | #include "vectori256e.h"     // 256-bit integer vectors, emulated
52 | #endif  // INSTRSET >= 8
53 | #if INSTRSET >= 7
54 | #include "vectorf256.h"      // 256-bit floating point vectors, requires AVX instruction set
55 | #else
56 | #include "vectorf256e.h"     // 256-bit floating point vectors, emulated
57 | #endif  //  INSTRSET >= 7
58 | #endif  //  MAX_VECTOR_SIZE >= 256
59 | 
60 | #if MAX_VECTOR_SIZE >= 512
61 | #if INSTRSET >= 9
62 | #include "vectori512.h"      // 512-bit vectors of 32 and 64 bit integers, requires AVX512F instruction set
63 | #include "vectorf512.h"      // 512-bit floating point vectors, requires AVX512F instruction set
64 | #else
65 | #include "vectori512e.h"     // 512-bit integer vectors, emulated
66 | #include "vectorf512e.h"     // 512-bit floating point vectors, emulated
67 | #endif  //  INSTRSET >= 9
68 | #if INSTRSET >= 10
69 | #include "vectori512s.h"     // 512-bit vectors of 8 and 16 bit integers, requires AVX512BW instruction set
70 | #else
71 | #include "vectori512se.h"    // 512-bit vectors of 8 and 16 bit integers, emulated
72 | #endif
73 | #endif  //  MAX_VECTOR_SIZE >= 512
74 | 
75 | #include "vector_convert.h"  // conversion between different vector sizes
76 | 
77 | #endif  // INSTRSET >= 2
78 | 
79 | 
80 | #else   // VECTORCLASS_H
81 | 
82 | #if VECTORCLASS_H < 20000
83 | #error Mixed versions of vector class library
84 | #endif
85 | 
86 | #endif  // VECTORCLASS_H
87 | 


--------------------------------------------------------------------------------
/Vectorisation/VecX/alloc_policy.cpp:
--------------------------------------------------------------------------------
  1 | /****************************  alloc_policy.cpp   *******************************
  2 | * Author:        Andrew Drakeford
  3 | * Date created:  2021-04-10
  4 | * Last modified: 2021-04-10
  5 | * Version:       1.0
  6 | * Project:       DR Cubed
  7 | * Description:
  8 | *
  9 | * (c) Copyright 2019 Andrew Drakeford
 10 | * Apache License version 2.0 or later.
 11 | *****************************************************************************/
 12 | #include "alloc_policy.h"
 13 | #include "alloc_policy_imp.h"
 14 | #include <unordered_map>
 15 | 
 16 | template<>
 17 | int AllAllocators<long double>::lastSize_N = -1;
 18 | template<>
 19 | int AllAllocators<double>::lastSize_N = -1;
 20 | template<>
 21 | int AllAllocators<float>::lastSize_N = -1;
 22 | template<>
 23 | int AllAllocators<unsigned int>::lastSize_N = -1;
 24 | 
 25 | template<>
 26 | AllocPolicy<long double>* AllAllocators<long double>::pAllocPolicy = nullptr;
 27 | template<>
 28 | AllocPolicy<double>* AllAllocators<double>::pAllocPolicy = nullptr;
 29 | template<>
 30 | AllocPolicy<float>* AllAllocators<float>::pAllocPolicy = nullptr;
 31 | template<>
 32 | AllocPolicy<unsigned int>* AllAllocators<unsigned int>::pAllocPolicy = nullptr;
 33 | template<>
 34 | std::unordered_map<int, AllocPolicy<long double>*>  AllAllocators<long double>::m_map_sizeToAllocPolicy = std::unordered_map<int, AllocPolicy<long double>*>();
 35 | template<>
 36 | std::unordered_map<int, AllocPolicy<double>*>  AllAllocators<double>::m_map_sizeToAllocPolicy = std::unordered_map<int, AllocPolicy<double>*>();
 37 | template<>
 38 | std::unordered_map<int, AllocPolicy<float>*>  AllAllocators<float>::m_map_sizeToAllocPolicy = std::unordered_map<int, AllocPolicy<float>*>();
 39 | template<>
 40 | std::unordered_map<int, AllocPolicy<unsigned int>*>  AllAllocators<unsigned int>::m_map_sizeToAllocPolicy = std::unordered_map<int, AllocPolicy<unsigned int>*>();
 41 | 
 42 | 
 43 | 
 44 | void freePool(size_t N, long double* pOld)
 45 | {
 46 | 	return freeT(N, pOld);
 47 | }
 48 | 
 49 | 
 50 | void freePool(size_t N, double* pOld)
 51 | {
 52 | 	return freeT(N, pOld);
 53 | }
 54 | 
 55 | void freePool(size_t N, float* pOld)
 56 | {
 57 | 	return freeT(N, pOld);
 58 | }
 59 | 
 60 | void freePool(size_t N, unsigned int* pOld)
 61 | {
 62 | 	return freeT(N, pOld);
 63 | }
 64 | 
 65 | void allocPool(size_t& N, long double*& pMem)
 66 | {
 67 | 	allocT(N, pMem);
 68 | }
 69 | 
 70 | void allocPool(size_t& N, double*& pMem)
 71 | {
 72 | 	allocT(N, pMem);
 73 | }
 74 | 
 75 | void allocPool(size_t& N, float*& pMem)
 76 | {
 77 | 	allocT(N, pMem);
 78 | }
 79 | 
 80 | void allocPool(size_t& N, unsigned int*& pMem)
 81 | {
 82 | 	allocT(N, pMem);
 83 | }
 84 | 
 85 | int  getAllignedSize(size_t N, long double* pOld)
 86 | {
 87 | 	return getAllignedSizeT(N, pOld);
 88 | }
 89 | 
 90 | int  getAllignedSize(size_t N, double* pOld)
 91 | {
 92 | 	return getAllignedSizeT(N, pOld);
 93 | }
 94 | 
 95 | int  getAllignedSize(size_t N, float* pOld)
 96 | {
 97 | 	return getAllignedSizeT(N, pOld);
 98 | }
 99 | 
100 | int  getAllignedSize(size_t N, unsigned int* pOld)
101 | {
102 | 	return getAllignedSizeT(N, pOld);
103 | }
104 | void freeAllAllocators(long double)
105 | {
106 | 	AllAllocators<long double>::freeAll();
107 | }
108 | void freeAllAllocators(double)
109 | {
110 | 	AllAllocators<double>::freeAll();
111 | }
112 | void freeAllAllocators(float)
113 | {
114 | 	AllAllocators<float>::freeAll();
115 | }
116 | void freeAllAllocators(unsigned int)
117 | {
118 | 	AllAllocators<unsigned int>::freeAll();
119 | }


--------------------------------------------------------------------------------
/Vectorisation/VecX/alloc_policy.h:
--------------------------------------------------------------------------------
 1 | /****************************  alloc_policy.h   *******************************
 2 | * Author:        Andrew Drakeford
 3 | * Date created:  2021-04-10
 4 | * Last modified: 2021-04-10
 5 | * Version:       1.0
 6 | * Project:       DR Cubed
 7 | * Description:
 8 | *
 9 | * (c) Copyright 2019 Andrew Drakeford
10 | * Apache License version 2.0 or later.
11 | *****************************************************************************/
12 | #pragma once
13 | 
14 | #include <cstddef>
15 | 
// Return the block pOld, obtained for an element count of N, to the pool
// that serves that element type.
void freePool(size_t N, long double* pOld);
void freePool(size_t N, double* pOld);
void freePool(size_t N, float* pOld);
void freePool(size_t N, unsigned int* pOld);

// Obtain a block from the pool for the element type.
//   N    : in  - requested element count
//          out - element count after padding (see getAllignedSize)
//   pMem : out - address of the block
void allocPool(size_t& N, long double*& pMem);
void allocPool(size_t& N, double*& pMem);
void allocPool(size_t& N, float*& pMem);
void allocPool(size_t& N, unsigned int*& pMem);

// Element count N rounded up to a whole number of cache lines for the
// pointed-to element type. pOld is used only to select the overload.
int  getAllignedSize(size_t N, long double* pOld);
int  getAllignedSize(size_t N, double* pOld);
int  getAllignedSize(size_t N, float* pOld);
int  getAllignedSize(size_t N, unsigned int* pOld);
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 
36 | 
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/Vectorisation/VecX/alloc_policy_imp.h:
--------------------------------------------------------------------------------
  1 | /****************************  alloc_policy_imp.h   *******************************
  2 | * Author:        Andrew Drakeford
  3 | * Date created:  2021-04-10
  4 | * Last modified: 2021-04-10
  5 | * Version:       1.0
  6 | * Project:       DR Cubed
  7 | * Description:
  8 | *
  9 | * (c) Copyright 2019 Andrew Drakeford
 10 | * Apache License version 2.0 or later.
 11 | *****************************************************************************/
 12 | #pragma once
 13 | #include <vector>
 14 | #include <unordered_map>
 15 | 
 16 | 
 17 | //need a function to reduce size pools to  a minimum
 18 | // get rid of magic numbers of byte sizes etc
 19 | 
 20 | const int BytesOnCacheLine = 64;
 21 | const long MemPoolInitialIncrement = 16;
 22 | const long  MemPoolScaleFactor = 2;
 23 | const int  ByteAllignment = 64;
 24 | 
 25 | template <typename T>
 26 | class PoolStrat
 27 | {
 28 | public:
 29 | 
 30 | 	PoolStrat(const PoolStrat&) = delete;
 31 | 	PoolStrat& operator=(const PoolStrat&) = delete;
 32 | 	PoolStrat& operator=( PoolStrat&&) = delete;
 33 | 	PoolStrat(PoolStrat&&) = delete;
 34 | 
 35 | 
 36 | 	explicit PoolStrat(int vecSz) :m_vecSize(vecSz)
 37 | 	{
 38 | 		m_sz = 0;
 39 | 		m_incrementSize = MemPoolInitialIncrement;// 16;
 40 | 		addToPool(m_incrementSize);
 41 | 		m_pos = 0;
 42 | 	}
 43 | 
 44 | 	~PoolStrat()
 45 | 	{
 46 | 		for (auto vec : m_allocatedVecs)
 47 | 		{
 48 | 			delete vec;
 49 | 		}
 50 | 	}
 51 | 
 52 | 
 53 | 
 54 | 	T* alloc()
 55 | 	{
 56 | 		if (m_pos < (m_sz - 1))
 57 | 		{
 58 | 			T* ret = m_memPool[m_pos];
 59 | 			m_pos++;
 60 | 			return ret;
 61 | 		}
 62 | 		else
 63 | 		{
 64 | 			m_incrementSize *= MemPoolScaleFactor;
 65 | 			addToPool(m_incrementSize);
 66 | 			return alloc();
 67 | 		}
 68 | 	}
 69 | 
 70 | 
 71 | 	void free(T* pToFree)
 72 | 	{
 73 | 		//typically this should be next one down from top of stack
 74 | 		if ((m_pos <= 0) || (nullptr == pToFree))
 75 | 		{
 76 | 			return;
 77 | 		}
 78 | 
 79 | 		if (m_memPool[m_pos - 1] == pToFree)
 80 | 		{
 81 | 			//pToFree[0] = 666;
 82 | 			m_pos--;
 83 | 			return;
 84 | 		}
 85 | 
 86 | 		//search for values of i > 0
 87 | 		int i = m_pos;
 88 | 		if (i >= static_cast<int>(m_memPool.size()))
 89 | 		{
 90 | 			i = static_cast<int>(m_memPool.size()) - 1;
 91 | 		}
 92 | 		int maxPos = i;
 93 | 
 94 | 		for (; i > -1; i--)
 95 | 		{
 96 | 			if (m_memPool[i] == pToFree)
 97 | 			{
 98 | 				//swap to be freed with top element and  decrement//bubble to the top
 99 | 				for (int k = i; k < maxPos - 1; k++)
100 | 				{
101 | 					std::swap(m_memPool[k], m_memPool[k + 1]);
102 | 				}
103 | 				//pToFree[0] = 666;
104 | 				m_pos--;
105 | 				return;
106 | 			}
107 | 		}
108 | 
109 | 	}
110 | 
111 | 
112 | 	void addToPool(int numElements)
113 | 	{
114 | 		//m_vecSize for double 64 byte align ie cache line
115 | 		size_t offsetAlgn = ByteAllignment;// 64;//   16;
116 | 		std::vector<T>* pVecsMem = new std::vector<T>((long)(numElements)*m_vecSize + offsetAlgn);
117 | 		m_allocatedVecs.push_back(pVecsMem);
118 | 
119 | 		T* pstrtPt = &((*pVecsMem)[0]);
120 | 		while ((reinterpret_cast<long long>(pstrtPt)) % offsetAlgn) pstrtPt++;
121 | 
122 | 		for (int i = 0; i < numElements; i++)
123 | 		{
124 | 			m_memPool.push_back(pstrtPt);
125 | 			pstrtPt += m_vecSize;
126 | 		}
127 | 
128 | 		m_sz += numElements;
129 | 
130 | 	}
131 | 
132 | 	inline long pos() const
133 | 	{
134 | 		return m_pos;
135 | 	}
136 | 
137 | 	inline long size() const
138 | 	{
139 | 		return m_sz;
140 | 	}
141 | 
142 | 	const std::vector<std::vector<T>* >& getAllocVecs() const
143 | 	{
144 | 		m_allocatedVecs;
145 | 	}
146 | 
147 | private:
148 | 	long m_pos;
149 | 	long m_sz;
150 | 	std::vector<T*>  m_memPool;
151 | 	long m_incrementSize; // next number of vectors for allocation
152 | 	long m_vecSize; //size of element vector considering allignment and padding 
153 | 	std::vector<std::vector<T>* >  m_allocatedVecs;
154 | 
155 | };
156 | 
157 | 
158 | //////////////////////////////////////////
159 | 
160 | 
161 | template <typename T>
162 | class AllocPolicy
163 | {
164 | 	int m_vec_size;
165 | 	PoolStrat<T>* m_pool;
166 | public:
167 | 	int size() const
168 | 	{
169 | 		return m_vec_size;
170 | 	}
171 | 
172 | 	AllocPolicy(int size) :m_vec_size(size)
173 | 	{
174 | 		m_pool = new PoolStrat<T>(size);
175 | 	}
176 | 	~AllocPolicy()
177 | 	{
178 | 		delete m_pool;
179 | 	}
180 | 
181 | 
182 | 	inline T* alloc()
183 | 	{
184 | 		return m_pool->alloc();
185 | 	}
186 | 
187 | 	inline void free(T* pElement)
188 | 	{
189 | 		m_pool->free(pElement);
190 | 	}
191 | 
192 | };
193 | 
194 | 
195 | 
196 | template <typename T = double>
197 | class AllAllocators
198 | {
199 | 	static int lastSize_N;
200 | 	static AllocPolicy<T>* pAllocPolicy;
201 | 	static std::unordered_map<int, AllocPolicy<T>*>  m_map_sizeToAllocPolicy;
202 | 
203 | 
204 | 	static 	void setUpPolicy(int size_N)
205 | 	{
206 | 		auto itr = m_map_sizeToAllocPolicy.find(size_N);
207 | 		if (m_map_sizeToAllocPolicy.end() == itr)
208 | 		{
209 | 			pAllocPolicy = new AllocPolicy<T>(size_N);
210 | 			m_map_sizeToAllocPolicy[size_N] = pAllocPolicy;
211 | 		}
212 | 	}
213 | 
214 | 
215 | 
216 | public:
217 | 
218 | 	static 	void removePolicy(int size_N)
219 | 	{
220 | 		auto itr = m_map_sizeToAllocPolicy.find(size_N);
221 | 		if (m_map_sizeToAllocPolicy.end() != itr)
222 | 		{
223 | 			auto policyPtr = m_map_sizeToAllocPolicy[size_N];
224 | 			delete policyPtr;
225 | 			m_map_sizeToAllocPolicy.erase(itr);
226 | 		}
227 | 		
228 | 	}
229 | 
230 | 	static 	void freeAll()
231 | 	{
232 | 		for (auto& item : m_map_sizeToAllocPolicy)
233 | 		{
234 | 			delete item.second;
235 | 		}
236 | 		m_map_sizeToAllocPolicy.clear();
237 | 	}
238 | 
239 | 
240 | 	static T* alloc(int size_N)
241 | 	{
242 | 		if (lastSize_N == size_N)
243 | 		{
244 | 			return  pAllocPolicy->alloc();
245 | 		}
246 | 
247 | 		setUpPolicy(size_N);
248 | 
249 | 		pAllocPolicy = m_map_sizeToAllocPolicy[size_N];
250 | 		lastSize_N = size_N;
251 | 		return pAllocPolicy->alloc();
252 | 	}
253 | 
254 | 
255 | 
256 | 	static void  free(size_t size_N, T* pMem)
257 | 	{
258 | 		int sz_N = static_cast<int>(size_N);
259 | 
260 | 		if (lastSize_N == sz_N)
261 | 		{
262 | 			return  pAllocPolicy->free(pMem);
263 | 		}
264 | 
265 | 		setUpPolicy(sz_N);
266 | 		pAllocPolicy = m_map_sizeToAllocPolicy[sz_N];
267 | 		lastSize_N = sz_N;
268 | 		return pAllocPolicy->free(pMem);
269 | 
270 | 	}
271 | 
272 | 
273 | };
274 | 
275 | template< typename T>
276 | struct NumOnCacheLine
277 | {
278 | 	static inline int size()
279 | 	{
280 | 		return BytesOnCacheLine / sizeof(T);
281 | 	}
282 | };
283 | 
284 | 
285 | template<typename T>
286 | int  getAllignedSizeT(size_t N, T*)
287 | {
288 | 	const int M = NumOnCacheLine<T>::size();
289 | 	size_t res = (N % M == 0) ? N : (N / M + 1) * M;
290 | 	return static_cast<int>(res);
291 | }
292 | 
293 | 
294 | 
295 | template< typename T>
296 | void allocT(size_t& N, T*& pMem)
297 | {
298 | 	int n = getAllignedSize(N, pMem);
299 | 	N = static_cast<size_t>(n);
300 | 	pMem = AllAllocators<T>::alloc(n);
301 | }
302 | 
303 | template< typename T>
304 | void freeT(size_t N, T* pOld)
305 | {
306 | 	//find element and mark as unused 
307 | 	return AllAllocators<T>::free(N, pOld);
308 | 
309 | }
// Release every pool held for one element type. The argument is a tag:
// only its type is used, to select which AllAllocators<T> is emptied.
void freeAllAllocators(long double);
void freeAllAllocators(double);
void freeAllAllocators(float);
void freeAllAllocators(unsigned int);
314 | 
315 | 
// Scope guard: when an instance is destroyed, all pools for element type T
// are released through freeAllAllocators.
template <typename T = double>
class AllAllocatorsGuard
{
public:
	~AllAllocatorsGuard()
	{
		const T typeTag = T(); // value is irrelevant, it only selects the overload
		freeAllAllocators(typeTag);
	}

};
326 | 
327 | 
328 | 
329 | 


--------------------------------------------------------------------------------
/Vectorisation/VecX/apply_operation.h:
--------------------------------------------------------------------------------
  1 | ﻿/****************************  apply_operation.h  *******************************
  2 | * Author:        Andrew Drakeford
  3 | * Date created:  2021-04-10
  4 | * Last modified: 2021-04-10
  5 | * Version:       1.0
  6 | * Project:       DR Cubed
  7 | * Description:
  8 | *
  9 | * (c) Copyright 2019 Andrew Drakeford
 10 | * Apache License version 2.0 or later.
 11 | *****************************************************************************/
 12 | #pragma once
 13 | #include "vec.h"
 14 | #include "vec_double.h"
 15 | #include "instruction_traits.h"
 16 | #include "boolean_operations.h"
 17 | #include "accumulate_transform.h"
 18 | #include "binary_unitary_operations.h"
 19 | #include "math_ops.h"
 20 | #include "filter_select.h"
 21 | #include "conditional_select_eval.h"
 22 | #include "vec_view.h"
 23 | #include "vcl_latest.h"
 24 | 
 25 | #include <type_traits>
 26 | 
 27 | 
 28 | 
 29 | template<typename INS_VEC>
 30 | static INS_VEC cdfnormD(INS_VEC x)
 31 | {
 32 | 
 33 | 	auto asNumber = [](auto x)
 34 | 	{
 35 | 		return static_cast<typename InstructionTraits<INS_VEC>::FloatType>(x);
 36 | 	};
 37 | 
 38 | 	//   https://mathworld.wolfram.com/Erfc.html
 39 | 	constexpr typename  InstructionTraits<INS_VEC>::FloatType invRootPi = asNumber(0.564189583547756);
 40 | 	constexpr typename InstructionTraits<INS_VEC>::FloatType invRootTwo =asNumber( 0.707106781186548);
 41 | 	return invRootTwo * invRootPi*exp(-0.5*x*x);
 42 | }
 43 | 
 44 | 
 45 | /**/
 46 | template<typename INS_VEC>
 47 | static INS_VEC cdfnorm(const INS_VEC& z)  
 48 | {
 49 | 
 50 | 	auto asNumber = [](auto x)
 51 | 	{
 52 | 		return static_cast<typename InstructionTraits<INS_VEC>::FloatType>(x);
 53 | 	};
 54 | 
 55 | 	auto asInsVec = [&](auto x){  return  INS_VEC(asNumber(x) ); };
 56 | 
 57 | 
 58 | 	//   https://mathworld.wolfram.com/Erfc.html
 59 | 	INS_VEC b1 = asInsVec(0.31938153);
 60 | 	INS_VEC b2 = asInsVec(-0.356563782);
 61 | 	INS_VEC b3 = asInsVec(1.781477937);
 62 | 	INS_VEC b4 = asInsVec(-1.821255978);
 63 | 	INS_VEC b5 = asInsVec(1.330274429);
 64 | 	INS_VEC p = asInsVec(0.2316419);	
 65 | 	INS_VEC c2 = asInsVec(0.3989423);
 66 | 
 67 | //	const auto  cond1 = (z > asInsVec(6.0));
 68 | //	INS_VEC x = select(cond1, asInsVec(1.0), z);
 69 | //	x = x;
 70 | 	
 71 | //	INS_VEC y = select( (z < asInsVec(-6.0)),asInsVec(0.0), z);
 72 | //	y = y;
 73 | 	INS_VEC a = abs(z);
 74 | 	INS_VEC t = asInsVec(1.0) / (asInsVec(1.0) + a*p);
 75 | 	INS_VEC b = c2*exp((-z)*(z / asInsVec(2.0)));
 76 | 	INS_VEC n = ((((b5*t + b4)*t + b3)*t + b2)*t + b1)*t;
 77 | 	n = asInsVec(1.0) - b*n;
 78 | 	n = select( (z < asInsVec(0.0) ), asInsVec(1.0) - n,n);
 79 | 	return n;
 80 | }
 81 | 
 82 | 
 83 | 
// Standard normal CDF applied to every element of X.
//
// Each SIMD register x is handled by the 'onePass' lambda below:
//   * z = |x| and e = exp(-z*z/2) are computed once;
//   * a "central" rational approximation (polynomial N over polynomial M) is
//     used where |x| < SPLIT, an "outer" rational approximation elsewhere;
//   * the chosen value times e is returned for x <= 0 and 1 minus it for x > 0.
// The lambda is applied to X through ApplyTransformUR_X (defined elsewhere).
template<typename INS_VEC>
Vec<INS_VEC> cdfnorm(const Vec<INS_VEC>& X)
{
	using FLOAT =  typename InstructionTraits<INS_VEC>::FloatType;

	// converts a literal to the scalar lane type
	 auto asNumber = []( auto x) constexpr
	{
		return static_cast<FLOAT>(x);
	};

	// Central region: ratio of two polynomials in z, with coefficients listed
	// from the highest power down and nested through mul_add
	// (presumably mul_add(a, b, c) == a*b + c - confirm against its definition).
	auto centralLambda = [&](auto z)
	{

		constexpr FLOAT N[] = { FLOAT(3.52624965998911e-02) , FLOAT(0.700383064443688),   FLOAT(6.37396220353165), FLOAT(33.912866078383),  FLOAT(112.079291497871),  FLOAT(221.213596169931), FLOAT(220.206867912376) };
		constexpr FLOAT M[] = { FLOAT(8.83883476483184e-02), FLOAT(1.75566716318264), FLOAT(16.064177579207), FLOAT(86.7807322029461) , FLOAT(296.564248779674),  FLOAT(637.333633378831), FLOAT(793.826512519948),FLOAT(440.413735824752) };

		// reciprocal of the denominator polynomial (8 coefficients, M[0..7])
		auto inv_dc = 1.0 / mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(M[0], z, M[1]), z, M[2]), z, M[3]), z, M[4]), z, M[5]), z, M[6]), z, M[7]);
		// numerator polynomial (7 coefficients, N[0..6])
		auto n_c = mul_add(mul_add(mul_add(mul_add(mul_add(mul_add(N[0], z, N[1]), z, N[2]), z, N[3]), z, N[4]), z, N[5]), z, N[6]);

		return n_c * inv_dc;
	};


	// Outer region: a second ratio of polynomials, scaled by the constant
	// inv_RT2PI (0.3989...).
	auto outerLambda = [&](auto z)
	{
		constexpr FLOAT inv_RT2PI(0.39894228040143267793994605993438);
		constexpr FLOAT  d[] = { FLOAT(20.) , FLOAT(13.), FLOAT(200.), FLOAT(78.), FLOAT(300.), FLOAT(39.) };
		constexpr FLOAT  n[] = { FLOAT(20.), FLOAT(13.), FLOAT(180.), FLOAT(65.), FLOAT(160.) };

		auto d_outer = mul_add(mul_add(mul_add(mul_add(mul_add((d[0] * z), z, d[1]), z, d[2]), z, d[3]), z, d[4]), z, d[5]);
		auto inv_d_outer = inv_RT2PI / d_outer;

		auto n_outer = mul_add(mul_add(mul_add(mul_add((n[0] * z), z, n[1]), z, n[2]), z, n[3]), z, n[4]);
		return   n_outer * inv_d_outer;
	};



	// Evaluates one SIMD register; captures the helper lambdas by copy.
	auto onePass = [=](auto x)
	{
		auto z = abs(x);
		auto e = exp(-z * z * asNumber(0.5) );
		auto central = centralLambda(z);
		auto  SPLIT = asNumber(7.42);// 7106781186547; // appears to give less error
		// per-lane test for |x| < SPLIT, written with squares
		auto condAllDone = (x * x < SPLIT* SPLIT);

		// Fast path, presumably taken when every lane is inside the central
		// region (horizontal_and is defined elsewhere): the outer
		// approximation is skipped entirely.
		if (horizontal_and(condAllDone))
		{
			central *= e;
			return select(x <= asNumber(0.0), central, asNumber(1.0) - central);
		}

		// Mixed register: pick central or outer per lane, then apply the
		// common exponential factor and the reflection for positive x.
		auto outer = outerLambda(z);
		auto RES = select((z < SPLIT), central, outer);
		RES *= e;
		return select(x <= asNumber(0.0), RES, asNumber(1.0) - RES);

	};

	return ApplyTransformUR_X(X, onePass);

}
146 | 
147 | 
148 | 
149 | template<typename INS_VEC>
150 | VecD<INS_VEC> cdfnorm(const VecD<INS_VEC>& rhs)
151 | {
152 | 	return VecD<INS_VEC>(cdfnorm(rhs.value()), rhs.derivative()*cdfnormD(rhs.value()));
153 | }
154 | 
155 | //to do replace with WS 16 digit impl
// Inverse of the standard normal CDF applied to every element of X, using
// Acklam's piecewise rational approximation:
//   * central region : rational function of q = p - 0.5 and r = q*q
//   * low tail       : 0 < p < 0.02425, rational function of sqrt(-2*log(p))
//   * high tail      : 1 - 0.02425 < p < 1, the negated rational function of
//                      sqrt(-2*log(1 - p))
// The central formula is evaluated first; the two tail lambdas are then
// passed to SparseUpdateWithLambda1 (defined elsewhere - presumably it
// overwrites only the elements that need it; confirm against its definition).
template<typename INS_VEC>
Vec<INS_VEC> cdfnorminv(const Vec<INS_VEC>& X)
{

	// converts a literal to the scalar lane type
	auto asNumber = [](auto x) constexpr
	{
		return static_cast<typename InstructionTraits<INS_VEC>::FloatType>(x);
	};


	/// acklams inverse cdf normal
	// Coefficient tables. Index 0 is a 0.0 placeholder that is never read:
	// the formulas below use a[1..6], b[1..5], c[1..6] and d[1..4].
	// Being function-local statics, the tables are initialised once per
	// template instantiation.
	static typename InstructionTraits<INS_VEC>::FloatType a[] = { asNumber(0.0), asNumber( -3.969683028665376e+01), asNumber(2.209460984245205e+02), asNumber(-2.759285104469687e+02), asNumber(1.383577518672690e+02), asNumber(-3.066479806614716e+01) ,  asNumber(2.506628277459239e+00)};
	static typename InstructionTraits<INS_VEC>::FloatType b[] = { asNumber(0.0), asNumber(-5.447609879822406e+01),  asNumber(1.615858368580409e+02), asNumber(-1.556989798598866e+02), asNumber(6.680131188771972e+01), asNumber(-1.328068155288572e+01) };
	static typename InstructionTraits<INS_VEC>::FloatType c[] = { asNumber(0.0), asNumber(-7.784894002430293e-03), asNumber(-3.223964580411365e-01), asNumber(-2.400758277161838e+00), asNumber(-2.549732539343734e+00), asNumber(4.374664141464968e+00), asNumber(2.938163982698783e+00) };
	static typename InstructionTraits<INS_VEC>::FloatType d[] = { asNumber(0.0), asNumber(7.784695709041462e-03), asNumber(3.224671290700398e-01),  asNumber(2.445134137142996e+00), asNumber(3.754408661907416e+00) };

	// Central-region formula, evaluated for every lane.
	auto aclambdaMain = [=](auto p)
	{
		auto X = p;
		auto q = p - asNumber(0.5);
		auto r = q * q;
		X = (((((a[1] * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * r + a[6]) * q /
			(((((b[1] * r + b[2]) * r + b[3]) * r + b[4]) * r + b[5]) * r + asNumber(1.));

		return X;
	};


	// Low-tail correction: lanes with 0 < p < p_low get the tail formula,
	// all other lanes keep initVal.
	auto aclambdaLow = [=](auto initVal, auto p)
	{
		const auto p_low = asNumber(0.02425);
		auto condLo = (asNumber(0.0) < p) && (p < p_low);

		// no lane in the low tail: skip the sqrt/log work
		if (!horizontal_or(condLo))
			return initVal;

		auto q = sqrt(asNumber (-2.0) * log(p));
		auto X = (((((c[1] * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) * q + c[6]) /
			((((d[1] * q + d[2]) * q + d[3]) * q + d[4]) * q + asNumber(1.0));

		return select(condLo, X, initVal);

	};


	// High-tail correction: lanes with p_high < p < 1 get the negated tail
	// formula evaluated at 1 - p, all other lanes keep initVal.
	auto aclambdaHi = [=](auto initVal, auto p)
	{
		const auto p_low = asNumber(0.02425);
		const auto p_high = asNumber(1.) - p_low;
		auto condHi = (p_high < p) && (p < asNumber(1.));
		// no lane in the high tail: skip the sqrt/log work
		if (!horizontal_or(condHi))
			return initVal;

		auto q = sqrt(asNumber(-2.0) * log(asNumber(1.) - p));
		// NOTE(review): the denominator adds a plain 1.0 here, whereas the
		// low-tail lambda uses asNumber(1.0).
		const auto X = -(((((c[1] * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) * q + c[6]) /
			((((d[1] * q + d[2]) * q + d[3]) * q + d[4]) * q + 1.0);
		return select(condHi, X, initVal);
	};



	auto res = ApplyUnitaryOperation1(X, aclambdaMain);
	SparseUpdateWithLambda1(res, X, aclambdaLow);
	SparseUpdateWithLambda1(res, X, aclambdaHi);

	return res;
}
223 | //
224 | 


--------------------------------------------------------------------------------
/Vectorisation/VecX/binned_accumulator.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include "dr3.h"
  3 | #include "instruction_traits.h"
  4 | #include <type_traits>
  5 | 
  6 | 
  7 | template<typename INS_T>
  8 | struct BinsT
  9 | {
 10 |     using  INS = INS_T;
 11 | 
 12 |     inline static constexpr bool isDbl = std::is_same<double, typename InstructionTraits<INS_T>::FloatType >::value;
 13 | 
 14 |     inline static const INS_T TINY_C{ isDbl ? pow(1024.0 , -10.0) : 1.0/ 8388608.0f * 1.0 / 8388608.0f }; 
 15 |     inline static const INS_T VERY_SMALL_C{ isDbl ? pow(1024.0,-5.0) : 1.0 / 8388608.0f };
 16 |     inline static const INS_T SMALL_C{ isDbl ? 1.0 : 1.0f };
 17 |     inline static const INS_T BIG_C{ isDbl ? pow(1024.0, 5.0) :  8388608.0f }; 
 18 | 
 19 | 
 20 |     static inline auto roundIt(INS_T X, INS_T LEVEL)
 21 |     {
 22 |         auto INV_LEVEL = 1.0l / LEVEL;
 23 |         auto big = (LEVEL * truncate(X * INV_LEVEL));
 24 |         auto small = X - big;
 25 |         return std::pair(big, small);
 26 |     };
 27 | 
 28 | 
 29 |     INS_T m_scaleFactor{ InstructionTraits<INS_T>::oneValue };
 30 |     INS_T veryBigSummV{ InstructionTraits<INS_T>::nullValue };
 31 |     INS_T bigSummV{ InstructionTraits<INS_T>::nullValue };
 32 |     INS_T smallSumV{ InstructionTraits<INS_T>::nullValue };
 33 |     INS_T tinyV{ InstructionTraits<INS_T>::nullValue };
 34 | 
 35 | 
 36 |     INS_T TINY{ TINY_C };
 37 |     INS_T VERY_SMALL{ VERY_SMALL_C };
 38 |     INS_T SMALL{ SMALL_C };
 39 |     INS_T BIG{ BIG_C };
 40 | 
 41 | 
 42 | 
 43 |     BinsT() :
 44 |         m_scaleFactor{ InstructionTraits<INS_T>::oneValue },
 45 |         TINY{ m_scaleFactor * TINY_C },
 46 |         VERY_SMALL{ m_scaleFactor * VERY_SMALL_C },
 47 |         SMALL{ m_scaleFactor * SMALL_C },
 48 |         BIG{ m_scaleFactor * BIG_C }
 49 |     {}
 50 | 
 51 | 
 52 | 
 53 | 
 54 | 
 55 |     BinsT(typename InstructionTraits<INS_T>::FloatType x, typename InstructionTraits<INS_T>::FloatType scaleFactor = InstructionTraits<INS_T>::oneValue) :
 56 |         m_scaleFactor{ scaleFactor },
 57 |         TINY{ m_scaleFactor * TINY_C },
 58 |         VERY_SMALL{ m_scaleFactor * VERY_SMALL_C },
 59 |         SMALL{ m_scaleFactor * SMALL_C },
 60 |         BIG{ m_scaleFactor * BIG_C }
 61 |     {
 62 |         // Scalar seed: the thresholds are the *_C constants times scaleFactor (initialiser list above).
 63 |         // Lane 0 of the seed vector carries 1 * x, every other lane carries 0 * x; set() then bins it.
 64 |         using Traits = InstructionTraits<INS_T>;
 65 |         INS_T laneZeroOnly(Traits::nullValue);
 66 |         laneZeroOnly.insert(0, Traits::oneValue);
 67 |         set(laneZeroOnly * x);
 68 |     }
 69 | 
 70 | 
 71 |     void set(INS_T x)
 72 |     {
 73 |         // Peel x apart from the coarsest threshold down; each step keeps the whole multiples and passes the remainder on.
 74 |         const auto [multiplesOfBig, belowBig] = roundIt(x, BIG);
 75 |         const auto [multiplesOfSmall, belowSmall] = roundIt(belowBig, SMALL);
 76 |         const auto [multiplesOfVerySmall, residue] = roundIt(belowSmall, VERY_SMALL);
 77 |         veryBigSummV = multiplesOfBig;
 78 |         bigSummV = multiplesOfSmall;
 79 |         smallSumV = multiplesOfVerySmall;
 80 |         tinyV = residue;
 81 |     }
 82 | 
 83 |     BinsT(INS_T x) // non-explicit converting constructor: bins every lane of x using the default member thresholds
 84 |     {
 85 |         set(x);
 86 |     }
 87 | 
 88 |     BinsT& operator *(INS_T rhs)
 89 |     {
 90 |         // Scales every bin by rhs IN PLACE and returns *this, i.e. this behaves like operator*=; the bins are not re-split afterwards.
 91 |         veryBigSummV *= rhs;
 92 |         bigSummV *= rhs;
 93 |         smallSumV *= rhs;
 94 |         tinyV *= rhs;
 95 | 
 96 |         return *this;
 97 |     }
 98 |   
 99 | 
100 |     BinsT(BinsT&& x) noexcept
101 |     {
102 |         veryBigSummV = x.veryBigSummV;
103 |         bigSummV = x.bigSummV;
104 |         smallSumV = x.smallSumV;
105 |         tinyV = x.tinyV;
106 | 
107 |         m_scaleFactor = x.m_scaleFactor;
108 |         TINY = x.TINY;
109 |         VERY_SMALL = x.VERY_SMALL;
110 |         SMALL = x.SMALL;
111 |         BIG = x.BIG;
112 | 
113 | 
114 |     };
115 | 
116 | 
117 |     BinsT& operator =(const BinsT& x) 
118 |     {
119 |         // Member-wise copy assignment: scaling state and thresholds first, then the four bins.
120 |         m_scaleFactor = x.m_scaleFactor;
121 |         TINY = x.TINY;
122 |         VERY_SMALL = x.VERY_SMALL;
123 |         SMALL = x.SMALL;
124 |         BIG = x.BIG;
125 | 
126 |         veryBigSummV = x.veryBigSummV;
127 |         bigSummV = x.bigSummV;
128 |         smallSumV = x.smallSumV;
129 |         tinyV = x.tinyV;
130 | 
131 |         return *this;
132 |     };
133 | 
134 | 
135 |     BinsT& operator += (const BinsT& rhs)
136 |     {
137 |         // Adds bin by bin from the finest upward; whole multiples of each threshold are carried into the next coarser bin.
138 |         const auto [tinyCarry, tinyRest] = roundIt(tinyV + rhs.tinyV, VERY_SMALL);
139 |         tinyV = tinyRest;
140 | 
141 |         const auto [smallCarry, smallRest] = roundIt(smallSumV + rhs.smallSumV + tinyCarry, SMALL);
142 |         smallSumV = smallRest;
143 | 
144 |         const auto [bigCarry, bigRest] = roundIt(smallCarry + bigSummV + rhs.bigSummV, BIG);
145 |         bigSummV = bigRest;
146 |         veryBigSummV = bigCarry + veryBigSummV + rhs.veryBigSummV; // the top bin absorbs the final carry and is never split
147 |         return *this;
148 |     }
149 | 
150 | 
151 | 
152 |     auto hsum()
153 |     {
154 |         // Collapses each bin across its lanes, then adds the four totals starting with the finest bin.
155 |         return ((horizontal_add(tinyV) + horizontal_add(smallSumV)) + horizontal_add(bigSummV)) + horizontal_add(veryBigSummV);
156 |     }
157 | 
158 |    
159 | };
160 | 
161 | 
162 | 
163 | 
164 | static auto BinnedAdd = [](auto& bin, auto x) mutable
165 | {
166 |     // Accumulation step: folds x into bin through bin's operator+=.
167 |     // The value handed back is a vector of x's type filled with nullValue; the running total lives in bin.
168 |     using INS_T = decltype(x);
169 |     bin += x;
170 |     return INS_T(InstructionTraits<INS_T>::nullValue);
171 | };


--------------------------------------------------------------------------------
/Vectorisation/VecX/error_utils.h:
--------------------------------------------------------------------------------
  1 | /****************************  error_utils.h   *******************************
  2 | * Author:        Andrew Drakeford
  3 | * Date created:  2021-04-10
  4 | * Last modified: 2021-04-10
  5 | * Version:       1.0
  6 | * Project:       DR Cubed
  7 | * Description:
  8 | *
  9 | * (c) Copyright 2019 Andrew Drakeford
 10 | * Apache License version 2.0 or later.
 11 | *****************************************************************************/
 12 | #pragma once
 13 | #include "vec.h"
 14 | #include "vec_view.h"
 15 | #include "span.h"
 16 | #include <exception>
 17 | #include <cassert>
 18 | 
 19 | // No-op sink: pass a value here to mark it as deliberately unused (the original note attributes the idiom to Sutter).
 20 | template<class T> void ignore(const T&) { }
 21 | 
 22 | template<typename VEC>
 23 | bool check_vector( const VEC& rhs)
 24 | {
 25 | 	// An operand is usable when it holds at least one element or is flagged
 26 | 	// as a scalar. isScalar() is only consulted when size() is not positive.
 27 | 	const bool hasElements = rhs.size() > 0;
 28 | 	if (hasElements || rhs.isScalar())
 29 | 	{
 30 | 		return true;
 31 | 	}
 32 | 
 33 | 	// Otherwise: trip the assert in debug builds, and throw so that builds
 34 | 	// compiled with NDEBUG still report the problem.
 35 | 	assert(false);
 36 | 	throw std::runtime_error("bad vector size of non scalar");
 37 | }
 38 | 
 39 | template<typename VEC>
 40 | bool check_pair(const VEC& lhs, const VEC& rhs)
 41 | {
 42 | 	// Each operand must be valid on its own; check_vector throws otherwise.
 43 | 	check_vector(lhs);
 44 | 	check_vector(rhs);
 45 | 
 46 | 	// Compatible case 1: both operands have the same, non-zero length.
 47 | 	const bool sameNonEmptyLength = (lhs.size() == rhs.size()) && (lhs.size() > 0);
 48 | 	if (sameNonEmptyLength)
 49 | 	{
 50 | 		return true;
 51 | 	}
 52 | 
 53 | 	// Compatible case 2: at least one operand is a scalar.
 54 | 	if (!(rhs.isScalar() || lhs.isScalar()))
 55 | 	{
 56 | 		assert(false);
 57 | 		throw std::runtime_error("bad vector size");
 58 | 	}
 59 | 
 60 | 	return true;
 61 | }
 62 | 
 63 | template<typename VEC1, typename VEC2>
 64 | bool check_pair_different_type(const VEC1& lhs, const VEC2& rhs)
 65 | {
 66 | 	// Same contract as check_pair, but the two operands may be of different
 67 | 	// container types. Each one is validated individually first.
 68 | 	check_vector(lhs);
 69 | 	check_vector(rhs);
 70 | 
 71 | 	// Equal lengths, or a scalar on either side, make the pair compatible.
 72 | 	const bool compatible = (lhs.size() == rhs.size()) || rhs.isScalar() || lhs.isScalar();
 73 | 	if (compatible)
 74 | 	{
 75 | 		return true;
 76 | 	}
 77 | 
 78 | 	// Mismatch: assert in debug builds, throw regardless.
 79 | 	assert(false);
 80 | 	throw std::runtime_error("bad vector size");
 81 | }
 82 | 
 83 | 
 84 | 
 85 | 
 86 | //////////////  views ////////////////
 87 | template<typename INS_VEC>
 88 | bool check_vector(const VecView<INS_VEC>& /*rhs*/)
 89 | {
 90 | 	//TODO: views are currently accepted unconditionally (always returns true); the intended validation is sketched in the disabled code below.
 91 | 	/*
 92 | 	auto rhsSz = rhs.size();
 93 | 
 94 | 	if ((rhsSz > 0) || rhs.isScalar())
 95 | 	{
 96 | 		return true;
 97 | 	}
 98 | 	else
 99 | 	{
100 | 		//std::assert(false);
101 | 		throw std::exception("bad vector size of non scalar");
102 | 	}
103 | 	*/
104 | 	return true;
105 | }
106 | 
107 | template<typename INS_VEC>
108 | bool check_vector(const Vec<INS_VEC>& rhs)
109 | {
110 | 	// Vec-specific overload with the same rule as the generic check_vector:
111 | 	// accept a vector that has elements, or one flagged as a scalar.
112 | 	const bool hasElements = rhs.size() > 0;
113 | 	if (hasElements || rhs.isScalar())
114 | 	{
115 | 		return true;
116 | 	}
117 | 
118 | 	// Empty and not a scalar: assert in debug builds, and throw so that
119 | 	// builds compiled with NDEBUG still report it.
120 | 	assert(false);
121 | 	throw std::runtime_error("bad vector size of non scalar");
122 | }
123 | 
124 | 
125 | 
126 | template<typename INS_VEC>
127 | bool check_vector(const VecD<INS_VEC>& /*rhs*/)
128 | {
129 | 	// VecD operands are accepted unconditionally: this overload always returns true.
130 | 	// The parameter is left unnamed, as in the VecView overload, so the unused
131 | 	// argument does not produce compiler warnings. Disabled validation, for reference:
132 | 	/*
133 | 	auto rhsSz = rhs.size();
134 | 	if ((rhsSz > 0) || rhs.isScalar())
135 | 	{
136 | 		return true;
137 | 	}
138 | 	else
139 | 	{
140 | 		assert(false);
141 | 		throw std::exception("bad vector size of non scalar");
142 | 	}
143 | 	*/
144 | 	return true;
145 | }
146 | 
147 | template<typename INS_VEC>
148 | bool check_vector_for_filter(const Vec<INS_VEC>& rhs)
149 | {
150 | 	// Filtering to a view does not accept scalar vectors. The only input rejected
151 | 	// is one that reports no elements AND is flagged as a scalar; note the
152 | 	// message text below speaks of a "non scalar" although that is not the case hit.
153 | 	const bool emptyScalar = !(rhs.size() > 0) && rhs.isScalar();
154 | 	if (emptyScalar)
155 | 	{
156 | 		assert(false);
157 | 		throw std::runtime_error("bad vector size of non scalar");
158 | 	}
159 | 
160 | 	// Everything else passes, including an empty vector that is not a scalar.
161 | 	return true;
162 | }
163 | 
164 | 
165 | template<typename INS_VEC>
166 | bool check_vector_for_filter(const VecView<INS_VEC>&/* rhs*/)
167 | {
168 | 	return true;// views can be empty, so every view is accepted; the disabled code below is the check used for Vec
169 | 	/*
170 | 	auto rhsSz = rhs.size();
171 | 
172 | 	if ((rhsSz > 0) || !rhs.isScalar()) //no scalar vectors for filtering to views
173 | 	{
174 | 		return true;
175 | 	}
176 | 	else
177 | 	{
178 | 		//std::assert(false);
179 | 		throw std::exception("bad vector size of non scalar");
180 | 	}
181 | 	*/
182 | }
183 | 
184 | 
185 | template<typename INS_VEC>
186 | bool check_vector_for_filter(const Span<INS_VEC>&/* rhs*/)
187 | {
188 | 	return true;// every span is accepted without inspection (the original note reads "views can be empty")
189 | 	/*
190 | 	auto rhsSz = rhs.size();
191 | 
192 | 	if ((rhsSz > 0) || !rhs.isScalar()) //no scalar vectors for filtering to views
193 | 	{
194 | 		return true;
195 | 	}
196 | 	else
197 | 	{
198 | 		//std::assert(false);
199 | 		throw std::exception("bad vector size of non scalar");
200 | 	}
201 | 	*/
202 | }
203 | 
204 | 
205 | 
206 | 
207 | template<typename INS_VEC>
208 | bool check_view_pair(const Vec<INS_VEC>& lhs, const Vec<INS_VEC>& rhs)
209 | {
210 | 	// Both inputs must pass the filter-specific check (it throws on failure).
211 | 	check_vector_for_filter(lhs);
212 | 	check_vector_for_filter(rhs);
213 | 	// The pair is only usable when the two lengths match exactly.
214 | 	if (lhs.size() != rhs.size())
215 | 	{
216 | 		assert(false);
217 | 		throw std::runtime_error("bad vector size");
218 | 	}
219 | 	return true;
220 | }
221 | 
222 | template<typename INS_VEC>
223 | bool check_view_pair(const VecView<INS_VEC>& lhs, const VecView<INS_VEC>& rhs)
224 | {
225 | 	// View flavour: the per-view checks are run first, then the sizes are compared.
226 | 	check_vector_for_filter(lhs);
227 | 	check_vector_for_filter(rhs);
228 | 	// Views of different length cannot be paired.
229 | 	if (lhs.size() != rhs.size())
230 | 	{
231 | 		assert(false);
232 | 		throw std::runtime_error("bad vector size");
233 | 	}
234 | 	return true;
235 | }


--------------------------------------------------------------------------------
/Vectorisation/VecX/filter_pipe_and_join.h:
--------------------------------------------------------------------------------
  1 | /****************************  filter_pipe_and_join.h   *******************************
  2 | * Author:        Andrew Drakeford
  3 | * Date created:  2021-04-10
  4 | * Last modified: 2021-04-10
  5 | * Version:       1.0
  6 | * Project:       DR Cubed
  7 | * Description:
  8 | *
  9 | * (c) Copyright 2019 Andrew Drakeford
 10 | * Apache License version 2.0 or later.
 11 | *****************************************************************************/
 12 | #pragma once
 13 | #include "filter_select.h"
 14 | 
 15 | 
 16 | /*
 17 |  Use "|"  for joining filters and ">" for joining operations
 18 |  use braces around sets of filters to control evaluation order
 19 | 
 20 | */
 21 | 
 22 | namespace PIPE
 23 | {
 24 | // Filter stage: hands a boolean mask vector and the source vector to ApplyFilter. Note that the LEFT operand is named rhs throughout this namespace.
 25 | template< typename INS_VEC>
 26 | VecView<INS_VEC> operator |(const Vec<INS_VEC>& rhs, const VecBool<INS_VEC>& condition)
 27 | {
 28 | 	return ApplyFilter(condition, rhs);
 29 | }
 30 | 
 31 | // Filter stage on an existing view: condition is any object ApplyFilter accepts (taken by non-const reference, so it must be an lvalue).
 32 | template< typename INS_VEC, typename OP>
 33 | VecView<INS_VEC> operator |(const VecView<INS_VEC>& rhs, OP& condition)
 34 | {
 35 | 	return ApplyFilter(condition, rhs);
 36 | }
 37 | // Filter stage on a const vector.
 38 | template< typename INS_VEC, typename OP>
 39 | VecView<INS_VEC> operator |(const Vec<INS_VEC>& rhs, OP& condition)
 40 | {
 41 | 	return ApplyFilter(condition, rhs);
 42 | }
 43 | // Filter stage on a mutable vector.
 44 | template< typename INS_VEC, typename OP>
 45 | VecView<INS_VEC> operator |(Vec<INS_VEC>& rhs, OP& condition)
 46 | {
 47 | 	return ApplyFilter(condition, rhs);
 48 | }
 49 | 
 50 | // Transform stage on a view taken BY VALUE: the local copy is passed to ApplyUnitaryOperation and then returned.
 51 | // vector checks are applied inside ApplyUnitaryOperation
 52 | template< typename INS_VEC, typename OP>
 53 | VecView<INS_VEC> operator > (  VecView<INS_VEC> rhs, OP& oper)
 54 | {
 55 | 	ApplyUnitaryOperation( rhs, oper);
 56 | 	return rhs;
 57 | }
 58 | 
 59 | // Operation-on-the-left form: calls ApplyUnitaryOperation(oper, rhs), ignores its result and returns the same view by reference.
 60 | template< typename INS_VEC, typename OP>
 61 | VecView<INS_VEC>& operator > ( OP& oper, VecView<INS_VEC>& rhs)
 62 | {
 63 | 	ApplyUnitaryOperation(oper, rhs);
 64 | 	return rhs;
 65 | }
 66 | // Const-view form: returns the result of ApplyUnitaryOperation(oper, rhs) as a VecView.
 67 | template< typename INS_VEC, typename OP>
 68 | VecView<INS_VEC> operator > (OP& oper, const VecView<INS_VEC>& rhs)
 69 | {
 70 | 	return ApplyUnitaryOperation(oper, rhs);
 71 | }
 72 | 
 73 | // NOTE(review): declared to return VecView<INS_VEC>& but the statement below returns the Vec<INS_VEC>& argument - confirm this overload instantiates and whether Vec<INS_VEC>& was intended.
 74 | template< typename INS_VEC, typename OP>
 75 | VecView<INS_VEC>& operator > (Vec<INS_VEC>& rhs, OP& oper)
 76 | {
 77 | 	ApplyUnitaryOperation(oper, rhs);
 78 | 	return rhs;
 79 | }
 80 | 
 81 | // Transform stage on a const vector: returns the result of ApplyUnitaryOperation(oper, rhs) as a VecView.
 82 | template< typename INS_VEC, typename OP>
 83 | VecView<INS_VEC> operator > (const Vec<INS_VEC>& rhs, OP& oper)
 84 | {
 85 | 	return ApplyUnitaryOperation(oper, rhs);
 86 | }
 87 | 
 88 | // Same as the overload above with the operation written on the left.
 89 | template< typename INS_VEC, typename OP>
 90 | VecView<INS_VEC> operator > ( OP& oper , const Vec<INS_VEC>& rhs )
 91 | {
 92 | 	return ApplyUnitaryOperation(oper, rhs);
 93 | }
 94 | 
 95 | // Output stage: copy-constructs a vector from out, writes the view into that copy with writeView and returns the copy.
 96 | template< typename INS_VEC>
 97 | Vec<INS_VEC> operator |(const VecView<INS_VEC>& rhs, Vec<INS_VEC>& out)
 98 | {
 99 | 	auto outRes(out);
100 | 	rhs.writeView(outRes);
101 | 	return outRes;
102 | }
103 | 
104 | // Output stage, mutable view / const target: identical body to the overload above.
105 | template< typename INS_VEC>
106 | Vec<INS_VEC> operator |(VecView<INS_VEC>& rhs, const Vec<INS_VEC>& out)
107 | {
108 | 	auto outRes(out);
109 | 	rhs.writeView(outRes);
110 | 	return outRes;
111 | }
112 | 
113 | // Empty tag type; piping a view into a WriteOut object selects the overload below.
114 | struct WriteOut
115 | {};
116 | 
117 | template< typename INS_VEC>
118 | void operator |(const VecView<INS_VEC>& rhs, WriteOut& out)
119 | {
120 | 	//TODO (original author): write back to the source; for now the tag object itself is passed to writeView
121 | 	rhs.writeView(out);
122 | }
123 | 
124 | }// namespace PIPE
125 | 
126 | /*
127 |  These expression templates are for use at register level combinations of operations
128 | */
129 | namespace  JOIN
130 | {
131 | 
132 | 	// Composition of two callables, both stored by value. Unary call: m_lhs(m_rhs(val)). Binary call: m_rhs(rhs_arg, m_lhs(lhs_arg)).
133 | 	template< typename LHS, typename RHS>
134 | 	struct CatOperation
135 | 	{
136 | 		CatOperation(const LHS& lhs, const RHS& rhs) :m_lhs(lhs), m_rhs(rhs) {}
137 | 
138 | 		template <typename X>
139 | 		inline  auto operator()(const X& val) noexcept
140 | 		{
141 | 			return m_lhs(m_rhs(val));
142 | 		}
143 | 
144 | 		//for use with accumulate
145 | 		template <typename X>
146 | 		inline  auto operator()(const X& lhs_arg, const X& rhs_arg) noexcept
147 | 		{
148 | 			return m_rhs(rhs_arg, m_lhs(lhs_arg));
149 | 		}
150 | 		LHS m_lhs;
151 | 		RHS m_rhs;
152 | 	};
153 | 
154 | 	// a | b builds CatOperation<B, A>(b, a), so calling the result with val evaluates b(a(val)): the left operand runs first.
155 | 	template<typename LHS, typename RHS>
156 | 	CatOperation< RHS, LHS> operator | (const LHS& lhs, const RHS& rhs)
157 | 	{
158 | 		return CatOperation<RHS, LHS>(rhs,lhs);
159 | 	}
160 | 
161 | 	/*  Examples
162 | 
163 | 		Boolean expression template conjunction for boolean lambdas
164 | 
165 | 		auto isLessThanMinus10 = [](auto x) { return x < -10; };
166 | 		auto isGreaterThan10 = [](auto x) { return x > 10; };
167 | 		auto isLessThan20 = [](auto x) { return x < 20; };
168 | 
169 | 		we can create simple logical conjunctions of boolean lambdas
170 | 
171 | 		auto betweenTenAndTwenty = isGreaterThan10 && isLessThan20;
172 | 
173 | 		auto isOutsideTenTwenty = !betweenTenAndTwenty;
174 | 
175 | 		auto hasAbsGreaterThanTen = isLessThanMinus10 || isGreaterThan10;
176 | 
177 | 	*/
178 | 	// Wraps a callable (stored by value); calling the wrapper returns !m_rhs(val).
179 | 	template<  typename RHS>
180 | 	struct NegateOperation
181 | 	{
182 | 		NegateOperation(const RHS& rhs) : m_rhs(rhs) {}
183 | 
184 | 		template <typename INS_VEC>
185 | 		inline  auto operator()(const INS_VEC& val) noexcept
186 | 		{
187 | 			return !m_rhs(val);
188 | 		}
189 | 		RHS m_rhs;
190 | 	};
191 | 
192 | 	// Unconstrained operator!: wraps ANY operand type for which this template is visible in a NegateOperation.
193 | 	template<  typename RHS>
194 | 	NegateOperation<RHS> operator ! (const RHS& rhs)
195 | 	{
196 | 		return NegateOperation< RHS>(rhs);
197 | 	}
198 | 
199 | 
200 | 
201 | 
202 | 	// Pairs two callables (stored by value); calling the pair returns m_lhs(val) || m_rhs(val).
203 | 	template< typename LHS, typename RHS>
204 | 	struct OROperation
205 | 	{
206 | 		OROperation(const LHS& lhs, const RHS& rhs) :m_lhs(lhs), m_rhs(rhs) {}
207 | 
208 | 		template <typename X>
209 | 		inline  auto operator()(const X& val) noexcept
210 | 		{
211 | 			return m_lhs(val) || m_rhs(val);
212 | 		}
213 | 		LHS m_lhs;
214 | 		RHS m_rhs;
215 | 	};
216 | 
217 | 	// lhs || rhs builds an OROperation that keeps the operands in the order written.
218 | 	template< typename LHS, typename RHS>
219 | 	OROperation<  LHS, RHS> operator || (const LHS& lhs, const RHS& rhs)
220 | 	{
221 | 		return OROperation<LHS, RHS>(lhs, rhs);
222 | 	}
223 | 
224 | 
225 | 	// Pairs two callables (stored by value); calling the pair returns m_lhs(val) && m_rhs(val).
226 | 	template< typename LHS, typename RHS>
227 | 	struct AndOperation
228 | 	{
229 | 		AndOperation(const LHS& lhs, const RHS& rhs) :m_lhs(lhs), m_rhs(rhs) {}
230 | 
231 | 		template <typename X>
232 | 		inline  auto operator()(const X& val) noexcept
233 | 		{
234 | 			return m_lhs(val) && m_rhs(val);
235 | 		}
236 | 
237 | 		LHS m_lhs;
238 | 		RHS m_rhs;
239 | 
240 | 	};
241 | 
242 | 	// lhs && rhs builds an AndOperation that keeps the operands in the order written.
243 | 	template< typename LHS, typename RHS>
244 | 	AndOperation<  LHS, RHS> operator && (const LHS& lhs, const RHS& rhs)
245 | 	{
246 | 		return AndOperation<LHS, RHS>(lhs, rhs);
247 | 	}
248 | 
249 | 
250 | }//namespace JOIN
251 | 
252 | 


--------------------------------------------------------------------------------
/Vectorisation/VecX/instruction_traits.h:
--------------------------------------------------------------------------------
  1 | /****************************  instruction_traits.h   *******************************
  2 | * Author:        Andrew Drakeford
  3 | * Date created:  2021-04-10
  4 | * Last modified: 2021-04-10
  5 | * Version:       1.0
  6 | * Project:       DR Cubed
  7 | * Description:
  8 | *
  9 | * (c) Copyright 2019 Andrew Drakeford
 10 | * Apache License version 2.0 or later.
 11 | *****************************************************************************/
 12 | #pragma once
 13 | #pragma warning(suppress:4984)
 14 | 
 15 | #include "vec.h"
 16 | #include "vec_double.h"
 17 | #include "../VCL/vectormath_common.h"
 18 | 
 19 | 
 20 | template<typename INS_VEC> class  Vec;
 21 | template<typename INS_VEC> class  VecBool;
 22 | template<typename INS_VEC> class  VecD;
 23 | template< typename INS_VEC> class VecView;
 24 | 
 25 | 
 26 | template< typename INS_VEC> // Primary template: fallback traits for any INS_VEC without an explicit specialisation in this header (it declares no IdxType).
 27 | struct InstructionTraits
 28 | {
 29 | 	using BoolType = VecBoolD;
 30 | 	using FloatType = double;
 31 | 	static constexpr int width = 2;
 32 | 	static constexpr double nullValue = 0.0;
 33 | 	static constexpr double oneValue  =1.0;
 34 | 	static constexpr  bool alignedLoadStore = false;
 35 | 	static constexpr  bool boolTypeIsAlignedLoadStore = false;
 36 | 	static constexpr  bool useScatter = false;
 37 | 	static constexpr uint32_t limit = 100000;
 38 | 	// isCompact is the switch read by boolConvert: when true, masks go through boolCompactSave / boolCompactConvert.
 39 | 	static constexpr  bool isCompact = false;
 40 | 	using RegBoolType = VecBoolD; // parameter type of the "for save" boolConvert overload
 41 | 	using MemBoolType = VecDouble; // parameter type of the "for load" boolConvert overload
 42 | 
 43 | };
 44 | 
 45 | 
 46 | 
 47 | 
 48 | template<> // VecDouble: same values as the primary template, plus IdxType and with both aligned-load/store flags set to true.
 49 | struct InstructionTraits<VecDouble>
 50 | {
 51 | 	using IdxType = Vec2q;
 52 | 	using BoolType = VecBoolD;
 53 | 	using FloatType = double;
 54 | 	static constexpr int width = 2;
 55 | 	static constexpr double nullValue = 0.0;
 56 | 	static constexpr double oneValue = 1.0;
 57 | 	static constexpr  bool alignedLoadStore = true;
 58 | 	static constexpr  bool useScatter = false;
 59 | 	static constexpr uint32_t limit = 100000;
 60 | 	static constexpr  bool boolTypeIsAlignedLoadStore = true;
 61 | 
 62 | 	static constexpr  bool isCompact = false;
 63 | 	using RegBoolType = VecBoolD;
 64 | 	using MemBoolType = VecDouble;
 65 | };
 66 | 
 67 | 
 68 | 
 69 | 
 70 | template<> // VecLDouble: FloatType is long double, while nullValue and oneValue are declared as double.
 71 | struct InstructionTraits<VecLDouble>
 72 | {
 73 | 	using IdxType = Vec2q;
 74 | 	using BoolType = VecBoolD;
 75 | 	using FloatType = long double;
 76 | 	static constexpr int width = 2;
 77 | 	static constexpr double nullValue = 0.0;
 78 | 	static constexpr double oneValue = 1.0;
 79 | 	static constexpr  bool alignedLoadStore = false;
 80 | 	static constexpr  bool useScatter = false;
 81 | 	static constexpr uint32_t limit = 100000;
 82 | 	static constexpr  bool boolTypeIsAlignedLoadStore = true;
 83 | 
 84 | 	static constexpr  bool isCompact = false;
 85 | 	using RegBoolType = VecBoolD;
 86 | 	using MemBoolType = VecLDouble;
 87 | };
 88 | 
 89 | 
 90 | 
 91 | template<> // Vec2d: width 2, element type double, mask type Vec2db.
 92 | struct InstructionTraits<Vec2d>
 93 | {
 94 | 	using IdxType = Vec2q;
 95 | 	using BoolType = Vec2db;
 96 | 	using FloatType = double;
 97 | 	static constexpr int width = 2;
 98 | 	static constexpr double nullValue = 0.0;
 99 | 	static constexpr double oneValue = 1.0;
100 | 	static constexpr  bool alignedLoadStore = true;
101 | 	static constexpr  bool boolTypeIsAlignedLoadStore = false;
102 | 	static constexpr  bool useScatter = false;
103 | 	static constexpr uint32_t limit = 100000;
104 | 
105 | 	static constexpr  bool isCompact = false;
106 | 	using RegBoolType = Vec2db;
107 | 	using MemBoolType = Vec2d;
108 | 
109 | 
110 | };
111 | 
112 | 
113 | template<> // Vec4f: width 4, element type float, mask type Vec4fb.
114 | struct InstructionTraits<Vec4f>
115 | {
116 | 	using IdxType = Vec4i;
117 | 	using BoolType = Vec4fb;
118 | 	using FloatType = float;
119 | 	static constexpr int width = 4;
120 | 	static constexpr  bool alignedLoadStore = true;
121 | 	static constexpr  bool boolTypeIsAlignedLoadStore = false;
122 | 	static constexpr float nullValue = 0.f;
123 | 	static constexpr float oneValue = 1.f;
124 | 	static constexpr  bool useScatter = false;
125 | 	static constexpr uint32_t limit = 100000;
126 | 
127 | 	static constexpr  bool isCompact = false;
128 | 	using RegBoolType = Vec4fb;
129 | 	using MemBoolType = Vec4f;
130 | };
131 | 
132 | 
133 | 
134 | 
135 | 
136 | template<> // Vec4d: width 4, element type double, mask type Vec4db; first specialisation in this header with useScatter set to true.
137 | struct InstructionTraits<Vec4d>
138 | {
139 | 	using IdxType = Vec4i;
140 | 	using BoolType = Vec4db;
141 | 	using FloatType = double;
142 | 	static constexpr int width = 4;
143 | 	static constexpr  bool alignedLoadStore = true;
144 | 	static constexpr  bool boolTypeIsAlignedLoadStore = false;
145 | 	static constexpr double nullValue = 0.0;
146 | 	static constexpr double oneValue = 1.0;
147 | 	static constexpr  bool useScatter = true;
148 | 	static constexpr uint32_t limit = 100000;
149 | 
150 | 	static constexpr  bool isCompact = false;
151 | 	using RegBoolType = Vec4db;
152 | 	using MemBoolType = Vec4d;
153 | };
154 | 
155 | 
156 | template<> // Vec8f: width 8, element type float, mask type Vec8fb.
157 | struct InstructionTraits<Vec8f>
158 | {
159 | 	using IdxType = Vec8i;
160 | 	using BoolType = Vec8fb;
161 | 	using FloatType = float;
162 | 
163 | 	static constexpr int width = 8;
164 | 	static constexpr float nullValue = 0.f;
165 | 	static constexpr float oneValue = 1.f;
166 | 	static constexpr  bool alignedLoadStore = true;
167 | 	static constexpr  bool boolTypeIsAlignedLoadStore = false;
168 | 	static constexpr  bool useScatter = true;
169 | 	static constexpr uint32_t limit = 100000;
170 | 
171 | 	static constexpr  bool isCompact = false;
172 | 	using RegBoolType = Vec8fb;
173 | 	using MemBoolType = Vec8f;
174 | 
175 | };
176 | 
177 | 
178 | 
179 | template<> // Vec8d: width 8, element type double; limit is 1000000 here, against 100000 in the narrower specialisations.
180 | struct InstructionTraits<Vec8d>
181 | {
182 | 	using IdxType = Vec8i;
183 | 
184 | 	using BoolType = Vec8db;
185 | 
186 | 	using FloatType = double;
187 | 	static constexpr int width = 8;
188 | 	static constexpr double nullValue = 0.0;
189 | 	static constexpr double oneValue = 1.0;
190 | 	static constexpr  bool alignedLoadStore = false;
191 | 	static constexpr  bool boolTypeIsAlignedLoadStore = false;
192 | 	static constexpr  bool useScatter = true;
193 | 	static constexpr uint32_t limit = 1000000;
194 | 	// isCompact is true: boolConvert routes masks through the Vec8db <-> Vec8d boolCompactConvert overloads.
195 | 	static constexpr  bool isCompact =  true;
196 | 	using RegBoolType = Vec8db;
197 | 	using MemBoolType = Vec8d;
198 | 
199 | };
200 | 
201 | 
202 | 
203 | template<> // Vec16f: width 16, element type float; limit is 1000000 here, against 100000 in the narrower specialisations.
204 | struct InstructionTraits<Vec16f>
205 | {
206 | 	using IdxType = Vec16i;
207 | 	using BoolType = Vec16fb;
208 | 	using FloatType = float;
209 | 	static constexpr int width = 16;
210 | 	static constexpr float nullValue = 0.f;
211 | 	static constexpr float oneValue = 1.f;
212 | 	static constexpr  bool alignedLoadStore = false;
213 | 	static constexpr  bool boolTypeIsAlignedLoadStore = false;
214 | 	static constexpr  bool useScatter = true;
215 | 	static constexpr uint32_t limit = 1000000;
216 | 	// isCompact is true: boolConvert routes masks through the Vec16fb <-> Vec16f boolCompactConvert overloads.
217 | 	static constexpr  bool isCompact = true; 
218 | 	using RegBoolType = Vec16fb;
219 | 	using MemBoolType = Vec16f;
220 | };
221 | 
222 | 
223 | 
224 | template<typename TRAIT> // Save-side helper: forwards the register mask to an overload of boolCompactConvert chosen by the argument type.
225 | inline  typename InstructionTraits<TRAIT>::MemBoolType boolCompactSave(typename InstructionTraits<TRAIT>::RegBoolType regVal )
226 | {
227 | 	return boolCompactConvert(regVal);
228 | }
229 | 
230 | 
231 | template<typename TRAIT> // Generic conversion: a plain static_cast from RegBoolType to MemBoolType. TRAIT cannot be deduced from the argument, so it must be named explicitly.
232 | inline  typename InstructionTraits<TRAIT>::MemBoolType boolCompactConvert(typename InstructionTraits<TRAIT>::RegBoolType regVal)
233 | {
234 | 	return  static_cast<typename InstructionTraits<TRAIT>::MemBoolType>(regVal);
235 | 
236 | }
237 | 
238 | 
239 | 
240 | 
241 | inline  Vec8d boolCompactConvert(Vec8db regVal)
242 | {
243 | 	// Expand a compact mask: lanes that are set receive -nan8d(), the others 0.
244 | 	return select(regVal, -nan8d(), Vec8d(0.));
245 | }
246 | 
247 | 
248 | 
249 | inline  Vec16f boolCompactConvert(Vec16fb regVal)
250 | {
251 | 	// Expand a compact mask: lanes that are set receive -nan16f(), the others 0.
252 | 	return select(regVal, -nan16f(), Vec16f(0.f));
253 | }
254 | 
255 | 
256 | inline  Vec8db boolCompactConvert(Vec8d regVal)
257 | {
258 | 	// Rebuild the mask: a lane is set unless it compares equal to zero.
259 | 	const Vec8d zeroLanes = false;
260 | 	return !(zeroLanes == regVal);
261 | }
262 | 
263 | 
264 | inline  Vec16fb boolCompactConvert(Vec16f regVal)
265 | {
266 | 	// Rebuild the mask: a lane is set unless it compares equal to zero.
267 | 	const Vec16f zeroLanes = false;
268 | 	return !(zeroLanes == regVal);
269 | }
270 | 
271 | 
272 | 
273 | //for save: compact instruction sets convert the register mask via boolCompactSave, all others pass it through unchanged
274 | template<typename TRAIT  >
275 | inline  auto boolConvert(typename  InstructionTraits<TRAIT>::RegBoolType regVal)
276 | {
277 | 	if constexpr (InstructionTraits<TRAIT>::isCompact)
278 | 	{
279 | 		return  boolCompactSave< TRAIT>(regVal);
280 | 	}
281 | 	else
282 | 	{
283 | 		return regVal;
284 | 	}
285 | }
286 | 
287 | 
288 | //for load: compact instruction sets convert the stored value back via boolCompactConvert, all others pass it through unchanged
289 | template<typename TRAIT  >
290 | inline  auto boolConvert(typename  InstructionTraits<TRAIT>::MemBoolType regVal)
291 | {
292 | 	if constexpr (InstructionTraits<TRAIT>::isCompact)
293 | 	{
294 | 		return  boolCompactConvert(regVal);
295 | 	}
296 | 	else
297 | 	{
298 | 		return regVal;
299 | 	}
300 | }
301 | 
302 | 
303 | 
304 | 


--------------------------------------------------------------------------------
/Vectorisation/VecX/target_name_space.h:
--------------------------------------------------------------------------------
  1 | /****************************  target_name_space.h   *******************************
  2 | * Author:        Andrew Drakeford
  3 | * Date created:  2021-04-10
  4 | * Last modified: 2021-04-10
  5 | * Version:       1.0
  6 | * Project:       DR Cubed
  7 | * Description:
  8 | *
  9 | * (c) Copyright 2019 Andrew Drakeford
 10 | * Apache License version 2.0 or later.
 11 | *****************************************************************************/
 12 | #pragma once
 13 | 
 14 | #include "vec.h"
 15 | #include "vec_bool.h"
 16 | #include "vec_d.h"
 17 | #include "vec_bool_d.h"
 18 | #include "vec_double.h"
 19 | #include "vec_view.h"
 20 | #include "apply_operation.h"
 21 | #include "span.h"
 22 | 
 23 | namespace DRC
 24 | {
 25 | 	// Each nested namespace binds the same seven alias names (VecxD, Vecx, VecXX, VecVW, VecBL, SpanXX, StrdSpanXX)
 26 | 	// to one instruction type, so a using-directive for exactly one of them selects the target. Vecx and VecxD are the same type.
 27 | 
 28 | 	namespace VecLDb
 29 | 	{
 30 | 		using VecxD = VecD<VecLDouble>;
 31 | 		using Vecx = VecD<VecLDouble>;
 32 | 		using VecXX = Vec<VecLDouble>;
 33 | 		using VecVW = VecView<VecLDouble>;
 34 | 		using VecBL = VecBool<VecLDouble>;
 35 | 		using SpanXX = Span<VecLDouble>;
 36 | 		using StrdSpanXX = StridedSpan<VecLDouble>;
 37 | 	};
 38 | 	// experimental
 39 | 	
 40 | 
 41 | 	namespace VecDb
 42 | 	{
 43 | 		using VecxD = VecD<VecDouble>;
 44 | 		using Vecx = VecD<VecDouble>;
 45 | 		using VecXX = Vec<VecDouble>;
 46 | 		using VecVW = VecView<VecDouble>;
 47 | 		using VecBL = VecBool<VecDouble>;
 48 | 		using SpanXX = Span<VecDouble>;
 49 | 		using StrdSpanXX = StridedSpan<VecDouble>;
 50 | 	};
 51 | 
 52 | 
 53 | 	// Aliases over the VCL type Vec2d.
 54 | 	namespace VecD2D
 55 | 	{
 56 | 		using  VecxD = VecD<Vec2d>;
 57 | 		using  Vecx = VecD<Vec2d>;
 58 | 		using  VecXX = Vec<Vec2d>;
 59 | 		using  VecVW = VecView<Vec2d>;
 60 | 		using  VecBL = VecBool<Vec2d>;
 61 | 		using  SpanXX = Span<Vec2d>;
 62 | 		using  StrdSpanXX = StridedSpan<Vec2d>;
 63 | 	};
 64 | 
 65 | 	// Aliases over the VCL type Vec4d.
 66 | 	namespace VecD4D
 67 | 	{
 68 | 		using  VecxD = VecD<Vec4d>;
 69 | 		using  Vecx = VecD<Vec4d>;
 70 | 		using  VecXX = Vec<Vec4d>;
 71 | 		using  VecVW = VecView<Vec4d>;
 72 | 		using  VecBL = VecBool<Vec4d>;
 73 | 		using  SpanXX = Span<Vec4d>;
 74 | 		using  StrdSpanXX = StridedSpan<Vec4d>;
 75 | 	};
 76 | 	// Aliases over the VCL type Vec8d.
 77 | 	namespace VecD8D
 78 | 	{
 79 | 		using  VecxD = VecD<Vec8d>  ;
 80 | 		using  Vecx = VecD<Vec8d> ;
 81 | 		using  VecXX =Vec<Vec8d> ;
 82 | 		using  VecVW = VecView<Vec8d> ;
 83 | 		using  VecBL = VecBool<Vec8d> ;
 84 | 		using  SpanXX = Span<Vec8d> ;
 85 | 		using  StrdSpanXX =StridedSpan<Vec8d> ;
 86 | 	};
 87 | 	// Aliases over the VCL type Vec16f.
 88 | 	namespace VecF16F
 89 | 	{
 90 | 		using  VecxD = VecD<Vec16f>;
 91 | 		using  Vecx = VecD<Vec16f>;
 92 | 		using  VecXX = Vec<Vec16f>;
 93 | 		using  VecVW = VecView<Vec16f>;
 94 | 		using  VecBL = VecBool<Vec16f>;
 95 | 		using  SpanXX = Span<Vec16f>;
 96 | 		using  StrdSpanXX = StridedSpan<Vec16f>;
 97 | 	};
 98 | 	// Aliases over the VCL type Vec8f.
 99 | 	namespace VecF8F
100 | 	{
101 | 		using  VecxD = VecD<Vec8f>;
102 | 		using  Vecx = VecD<Vec8f>;
103 | 		using  VecXX = Vec<Vec8f>;
104 | 		using  VecVW = VecView<Vec8f>;
105 | 		using  VecBL = VecBool<Vec8f>;
106 | 		using  SpanXX = Span<Vec8f>;
107 | 		using  StrdSpanXX = StridedSpan<Vec8f>;
108 | 	};
109 | 
110 | 	// Aliases over the VCL type Vec4f.
111 | 	namespace VecF4F
112 | 	{
113 | 		using  VecxD = VecD<Vec4f>;
114 | 		using  Vecx = VecD<Vec4f>;
115 | 		using  VecXX = Vec<Vec4f>;
116 | 		using  VecVW = VecView<Vec4f>;
117 | 		using  VecBL = VecBool<Vec4f>;
118 | 		using  SpanXX = Span<Vec4f>;
119 | 		using  StrdSpanXX = StridedSpan<Vec4f>;
120 | 	};
121 | 
122 | 
123 | 
124 | }// namespace DRC


--------------------------------------------------------------------------------
/Vectorisation/VecX/transform.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "vec.h"
  4 | #include "binary_unitary_operations.h"
  5 | #include "conditional_select_eval.h"
  6 | 
  7 | 
  8 | //Unitary lambdas
  9 | //Returns the Vec produced by ApplyUnitaryOperation(inputVec, lambda); the original note describes this variant as unrolled, defaulting to x4.
 10 | template<typename LAMBDA, typename INS_VEC>
 11 | Vec<INS_VEC>  ApplyTransform(LAMBDA& lambda, const Vec<INS_VEC>& inputVec)
 12 | {
 13 | 	return ApplyUnitaryOperation(inputVec, lambda); 
 14 | }
 15 | 
 16 | //Not-unrolled (x1) counterpart of ApplyTransform: returns the Vec produced by ApplyUnitaryOperation1(inputVec, lambda).
 17 | template<typename LAMBDA, typename INS_VEC>
 18 | Vec<INS_VEC>  ApplyTransform1(LAMBDA& lambda, const Vec<INS_VEC>& inputVec)
 19 | {
 20 | 	return ApplyUnitaryOperation1(inputVec, lambda);
 21 | }
 22 | 
 23 | //inplace transforms: the vector is taken by non-const reference and handed to ApplyUnitaryOperationM; nothing is returned.
 24 | template<typename LAMBDA, typename INS_VEC>
 25 | void ApplyTransformM(LAMBDA& lambda,  Vec<INS_VEC>& inputVec)
 26 | {
 27 | 	ApplyUnitaryOperationM(inputVec, lambda);
 28 | }
 29 | 
 30 | //not unrolled x1. NOTE(review): any value returned by ApplyUnitaryOperation1 is discarded here, and for a non-const Vec lvalue this void overload is preferred over the value-returning ApplyTransform1 - confirm that is intended.
 31 | template<typename LAMBDA, typename INS_VEC>
 32 | void  ApplyTransform1(LAMBDA& lambda, Vec<INS_VEC>& inputVec)
 33 | {
 34 | 	ApplyUnitaryOperation1(inputVec, lambda);
 35 | }
 36 | 
 37 | 
 38 | //Binary transform of two vectors: returns ApplyBinaryOperation(inputVecLHS, inputVecRHS, lambda). The original note describes it as unrolled, defaulting to x4.
 39 | template<typename LAMBDA, typename INS_VEC>
 40 | Vec<INS_VEC>  ApplyTransform(LAMBDA& lambda, const Vec<INS_VEC>& inputVecLHS, const Vec<INS_VEC>& inputVecRHS)
 41 | {
 42 | 	return ApplyBinaryOperation(inputVecLHS, inputVecRHS, lambda);
 43 | }
 44 | 
 45 | template<typename LAMBDA, typename INS_VEC> // binary transform with a single FloatType value as the left operand
 46 | Vec<INS_VEC>  ApplyTransform(LAMBDA& lambda, typename InstructionTraits<INS_VEC>::FloatType LHS, const Vec<INS_VEC>& inputVecRHS)
 47 | {
 48 | 	return ApplyBinaryOperation(LHS, inputVecRHS, lambda);
 49 | }
 50 | 
 51 | template<typename LAMBDA, typename INS_VEC> // binary transform with a single FloatType value as the right operand
 52 | Vec<INS_VEC>  ApplyTransform(LAMBDA& lambda, const Vec<INS_VEC>& inputVecLHS, typename InstructionTraits<INS_VEC>::FloatType RHS)
 53 | {
 54 | 	return ApplyBinaryOperation(inputVecLHS, RHS, lambda);
 55 | }
 56 | 
 57 | // Not-unrolled binary transform of two vectors: returns ApplyBinaryOperation1(inputVecLHS, inputVecRHS, lambda).
 58 | template<typename LAMBDA, typename INS_VEC>
 59 | Vec<INS_VEC>  ApplyTransform1(LAMBDA& lambda, const Vec<INS_VEC>& inputVecLHS, const Vec<INS_VEC>& inputVecRHS)
 60 | {
 61 | 	return ApplyBinaryOperation1(inputVecLHS, inputVecRHS, lambda);
 62 | }
 63 | 
 64 | //"conversion" overload (original wording): not-unrolled binary transform with a single FloatType value as the left operand
 65 | template<typename LAMBDA, typename INS_VEC>
 66 | Vec<INS_VEC>  ApplyTransform1(LAMBDA& lambda, typename InstructionTraits<INS_VEC>::FloatType LHS, const Vec<INS_VEC>& inputVecRHS)
 67 | {
 68 | 	return ApplyBinaryOperation1(LHS, inputVecRHS, lambda);
 69 | }
 70 | 
 71 | template<typename LAMBDA, typename INS_VEC> // not-unrolled binary transform with a single FloatType value as the right operand
 72 | Vec<INS_VEC>  ApplyTransform1(LAMBDA& lambda, const Vec<INS_VEC>& inputVecLHS, typename InstructionTraits<INS_VEC>::FloatType RHS)
 73 | {
 74 | 	return ApplyBinaryOperation1(inputVecLHS, RHS, lambda);
 75 | }
 76 | 
 77 | /////////////////////////////// in place binary unroll x 4 ////////////////
 78 | 
 79 | template<typename LAMBDA, typename INS_VEC> // in-place binary form: the left vector is mutable, the right vector is const; forwards to ApplyBinaryOperationMMXY
 80 | void ApplyTransformM(LAMBDA& lambda,  Vec<INS_VEC>& inputVecLHS, const Vec<INS_VEC>& inputVecRHS)
 81 | {
 82 | 	ApplyBinaryOperationMMXY<INS_VEC, LAMBDA>(inputVecLHS, inputVecRHS , lambda);
 83 | }
 84 | 
 85 | template<typename LAMBDA, typename INS_VEC> // in-place binary form: mutable left vector with a single FloatType value on the right; forwards to ApplyBinaryOperationMMMX
 86 | void ApplyTransformM(LAMBDA& lambda, Vec<INS_VEC>& inputVecLHS, typename InstructionTraits<INS_VEC>::FloatType RHS)
 87 | {
 88 | 	ApplyBinaryOperationMMMX<INS_VEC, LAMBDA>(inputVecLHS, RHS, lambda);
 89 | }
 90 | 
 91 | 
 92 | template<typename LAMBDA, typename INS_VEC> // in-place binary form: single FloatType value on the left with a mutable right vector; forwards to ApplyBinaryOperationMMM
 93 | void ApplyTransformM(LAMBDA& lambda, typename InstructionTraits<INS_VEC>::FloatType LHS,  Vec<INS_VEC>& inputVecRHS)
 94 | {
 95 | 	ApplyBinaryOperationMMM<INS_VEC, LAMBDA>(LHS, inputVecRHS , lambda);
 96 | }
 97 | 
 98 | 
 99 | //    ApplySparseTransform  takes a boolean condition lambda to determine if the lambda should be used
100 | //    to calculate a given value in the vector
101 | //    if so it applies the transform lambda and then blends it into the target vector 
102 | template< typename LAMBDA, typename INS_VEC, typename CONDITION_LAMBDA>
103 | void ApplySparseTransform(const Vec<INS_VEC>& inputVec, Vec<INS_VEC>& updateResult, LAMBDA& oper, CONDITION_LAMBDA& selectionOp)
104 | {
105 | 	ApplySparseUnitaryOperationU(inputVec, updateResult,oper, selectionOp);
106 | }
107 | 
108 | 
109 | ///////////////////// TO DO  make this the interface /////////////////////
110 | 
111 | template< typename INS_VEC, typename OP>
112 | typename InstructionTraits<INS_VEC>::FloatType ApplyReduce(const Vec<INS_VEC>& rhs1, OP& oper, typename InstructionTraits<INS_VEC>::FloatType initVal, bool singularInit = true)
113 | {
114 | 	return ApplyAccumulate2(rhs1, oper, initVal, singularInit);
115 | 
116 | }
117 | 
118 | 
119 | template< typename INS_VEC, typename OP, typename OPT>
120 | typename InstructionTraits<INS_VEC>::FloatType ApplyTransformReduce(const Vec<INS_VEC>& rhs1, OPT& operTransform, OP& operAcc, typename InstructionTraits<INS_VEC>::FloatType initVal = InstructionTraits<INS_VEC>::nullValue, bool singularInit = true)
121 | {
122 | 	return ApplyTransformAccumulateUR(rhs1, operTransform, operAcc, initVal, singularInit);
123 | }
124 | 
125 | 
126 | /////////////////////////////////
127 | 
128 | 
129 | template< typename INS_VEC, typename BOOL_OPER>
130 | Vec<INS_VEC> ApplySelection(BOOL_OPER& COND, const Vec<INS_VEC>& testData, const Vec<INS_VEC>& lhs, const Vec<INS_VEC>& rhs)
131 | {
132 | 	return ApplySelectionOperationC<INS_VEC, BOOL_OPER>(COND, testData, lhs, rhs);
133 | } 
134 | 
135 | 
136 | template< typename INS_VEC, typename BOOL_OPER>
137 | Vec<INS_VEC> ApplySelection(BOOL_OPER& COND, const Vec<INS_VEC>& testData, typename InstructionTraits<INS_VEC>::FloatType trueVal, typename InstructionTraits<INS_VEC>::FloatType falseVal)
138 | {
139 | 	return ApplySelectionOperationC<INS_VEC, BOOL_OPER > (COND, testData, trueVal, falseVal);
140 | }
141 | 
142 | template< typename INS_VEC, typename BOOL_OPER, typename TRUE_OPER, typename FALSE_OPER>
143 | Vec<INS_VEC> ApplySelectionF(BOOL_OPER& COND, const Vec<INS_VEC>& testData, TRUE_OPER& trueOper, FALSE_OPER& falseOper)
144 | {
145 | 	return ApplySelectionOperationFunc<INS_VEC, BOOL_OPER, TRUE_OPER, FALSE_OPER >(COND, testData, trueOper, falseOper);
146 | }
147 | 
148 | 
149 | /*
150 |  applies the tesFunc to the vector val, if the func returns true  it applies the trueLambda to the value otherwise it applies the falseLambda
151 |  can be slow if true/false Lambda functions are not heavyweight
152 | */
153 | template< typename INS_VEC, typename  BOOL_TEST_OP, typename  TRUE_LAMBDA, typename  FALSE_LAMBDA>
154 | Vec<INS_VEC> ApplySplitCalculate( BOOL_TEST_OP& testFunc, const Vec<INS_VEC>& val, TRUE_LAMBDA& trueLambda, FALSE_LAMBDA& falseLambda)
155 | {
156 | 	return splitConditionalCalculate(val, testFunc, trueLambda, falseLambda);
157 | }
158 | 
159 | 
160 | /*
161 |  applies the tesFunc to the view val, if the func returns true  it applies the trueLambda to the value otherwise it applies the falseLambda
162 |  can be slow if true/false Lambda functions are not heavyweight
163 | */
164 | template< typename INS_VEC, typename  BOOL_TEST_OP, typename  TRUE_LAMBDA, typename  FALSE_LAMBDA>
165 | VecView<INS_VEC> ApplySplitCalculate( BOOL_TEST_OP& testFunc, const VecView<INS_VEC>& val, TRUE_LAMBDA& trueLambda, FALSE_LAMBDA& falseLambda)
166 | {
167 | 	return splitConditionalCalculate(val, testFunc, trueLambda, falseLambda);
168 | }
169 | 
170 | 
171 | /////////////////////////// filters //////////////////////
172 | 
173 | 
174 | template<typename LAMBDA, typename INS_VEC>
175 | void ApplyTransformM(LAMBDA& lambda, VecView<INS_VEC>& inputVec)
176 | {
177 | 	return ApplyUnitaryOperation(lambda, inputVec);
178 | }
179 | 
180 | template<typename LAMBDA, typename INS_VEC>
181 | VecView<INS_VEC> ApplyTransform(LAMBDA& lambda, const VecView<INS_VEC>& inputVec)
182 | {
183 | 	return ApplyUnitaryOperation(lambda, inputVec);
184 | }
185 | 
186 | 
187 | template<typename LAMBDA, typename INS_VEC>
188 | VecView<INS_VEC> ApplyTransformV(LAMBDA& lambda, const Vec<INS_VEC>& inputVec)
189 | {
190 | 	return ApplyUnitaryOperation(lambda, inputVec);
191 | }
192 | 
193 | 
194 | /*
195 | applies the OP to the view in and  scatter,  writes the results to the corresponding elements  of the result vector.
196 | */
197 | template< typename INS_VEC, typename OP>
198 | void ApplyTransformWrite(OP& oper, const VecView<INS_VEC>& view, Vec<INS_VEC>& out)
199 | {
200 | 	ApplyUnitaryOperationWrite(oper, view, out);
201 | }
202 | 


--------------------------------------------------------------------------------
/Vectorisation/VecX/vcl_latest.h:
--------------------------------------------------------------------------------
 1 | /****************************  vcl_latest.h   *******************************
 2 | * Author:        Andrew Drakeford
 3 | * Date created:  2021-04-10
 4 | * Last modified: 2021-04-10
 5 | * Version:       1.0
 6 | * Project:       DR Cubed
 7 | * Description:
 8 | *
 9 | * (c) Copyright 2019 Andrew Drakeford
10 | * Apache License version 2.0 or later.
11 | *****************************************************************************/
12 | #pragma once
13 | 
14 | 
15 | #include "../VCL/vectormath_common.h"
16 | #include "../VCL/vectorclass.h"
17 | #include "../VCL/vectormath_exp.h"
18 | #include "../VCL/vectormath_trig.h"
19 | #include "../VCL/vectormath_hyp.h"


--------------------------------------------------------------------------------
/Vectorisation/VecX/vec.cpp:
--------------------------------------------------------------------------------
 1 | /****************************  vec.cpp  *******************************
 2 | * Author:        Andrew Drakeford
 3 | * Date created:  2021-04-10
 4 | * Last modified: 2021-04-10
 5 | * Version:       1.0
 6 | * Project:       DR Cubed
 7 | * Description:
 8 | *
 9 | * (c) Copyright 2019 Andrew Drakeford
10 | * Apache License version 2.0 or later.
11 | *****************************************************************************/
12 | //#include "vec.h"
13 | //#include "alloc_policy.h"
14 | //#include <algorithm>
15 | /*
16 | const double InstructionTraits<VecDouble>::nullValue = 0.0;
17 | const double InstructionTraits<VecDouble>::oneValue = 1.0;
18 | 
19 | const double InstructionTraits<Vec2d>::nullValue = 0.0;
20 | const double InstructionTraits<Vec2d>::oneValue = 1.0;
21 | 
22 | const double InstructionTraits<Vec4d>::nullValue = 0.0;
23 | const double InstructionTraits<Vec4d>::oneValue = 1.0;
24 | 
25 | const double InstructionTraits<Vec8d>::nullValue = 0.0;
26 | const double InstructionTraits<Vec8d>::oneValue = 1.0;
27 | 
28 | const float InstructionTraits<Vec16f>::nullValue = 0.0f;
29 | const float InstructionTraits<Vec16f>::oneValue = 1.0f;
30 | 
31 | const float InstructionTraits<Vec8f>::nullValue = 0.0f;
32 | const float InstructionTraits<Vec8f>::oneValue = 1.0f;
33 | 
34 | const float InstructionTraits<Vec4f>::nullValue = 0.0f;
35 | const float InstructionTraits<Vec4f>::oneValue = 1.0f;
36 | */
37 | 


--------------------------------------------------------------------------------
/Vectorisation/VecX/vec.h:
--------------------------------------------------------------------------------
  1 | /****************************  vec.h  *******************************
  2 | * Author:        Andrew Drakeford
  3 | * Date created:  2021-04-10
  4 | * Last modified: 2021-04-10
  5 | * Version:       1.0
  6 | * Project:       DR Cubed
  7 | * Description:
  8 | *
  9 | * (c) Copyright 2019 Andrew Drakeford
 10 | * Apache License version 2.0 or later.
 11 | *****************************************************************************/
 12 | #pragma once
 13 | #include "instruction_traits.h"
 14 | #include "alloc_policy.h"
 15 | #include "apply_operation.h"
 16 | #include "vec_view.h"
 17 | #include "span.h"
 18 | 
 19 | #include <algorithm>
 20 | #include <iterator>
 21 | #include <vector>
 22 | 
 23 | 
 24 | 
 25 | template <typename INS_VEC>
 26 | class Vec
 27 | {
 28 | public:
 29 | 
 30 | 	friend class VecView< INS_VEC>;
 31 | 
 32 | 	typedef INS_VEC INS;
 33 | 	using SCALA_TYPE = typename InstructionTraits<INS_VEC>::FloatType;
 34 | 
 35 | 	//convert to scalar
 36 | 	template<typename T>
 37 | 	static SCALA_TYPE scalar(const T& val)
 38 | 	{
 39 | 		return static_cast<SCALA_TYPE>(val);
 40 | 	}
 41 | 
 42 | 	//convert to register 
 43 | 	template<typename T>
 44 | 	static INS reg(T& val)
 45 | 	{
 46 | 		INS vec(static_cast<SCALA_TYPE>(val));
 47 | 		return vec;
 48 | 	}
 49 | 
 50 | 
 51 | private:
 52 | 
 53 | 	typename InstructionTraits<INS_VEC>::FloatType 	 m_scalarVal;
 54 | 	bool m_isScalar;
 55 | 
 56 | 	typename InstructionTraits<INS_VEC>::FloatType*  m_pData;
 57 | 
 58 | 	int m_size; // number of elements represented
 59 | 	size_t m_implSize;// actual size of allocated block
 60 | 
 61 | public:
 62 | 
 63 | 	Vec():m_scalarVal(0.),m_isScalar(true)
 64 | 	{
 65 | 		m_size = 0;
 66 | 		m_implSize = 0;
 67 | 		m_pData = nullptr;
 68 | 	}
 69 | 
 70 | 
 71 | 	//not explicit  allow conversions
 72 | 	Vec( typename InstructionTraits<INS_VEC>::FloatType scalarVal):m_scalarVal(scalarVal),m_isScalar(true)
 73 | 	{
 74 | 		m_size = 0;
 75 | 		m_implSize = 0;
 76 | 		m_pData = nullptr;
 77 | 	}
 78 | 
 79 | 
 80 | 	Vec& operator =(typename InstructionTraits<INS_VEC>::FloatType scalarVal) 
 81 | 	{
 82 | 		m_isScalar=true;
 83 | 		m_scalarVal=scalarVal;
 84 | 		if (m_pData != nullptr)
 85 | 		{
 86 | 			freePool(m_implSize, m_pData);
 87 | 		}
 88 | 		m_size = 0;
 89 | 		m_implSize = 0;
 90 | 		m_pData = nullptr;
 91 | 		return *this;
 92 | 	}
 93 | 	
 94 | 
 95 | 	Vec(const std::vector<  typename InstructionTraits<INS_VEC>::FloatType > & ctr)
 96 | 	{
 97 | 
 98 | 		int sz = static_cast<int>(std::distance(ctr.begin(), ctr.end()) );
 99 | 		m_size =sz;
100 | 		m_implSize = sz;
101 | 		allocPool(m_implSize,m_pData);
102 | 
103 | 		auto repeatedPaddingValue = ctr.at(sz-1);
104 | 		for(auto s =sz; s < static_cast<int>(m_implSize);s++)
105 | 		{
106 | 			m_pData[s] =repeatedPaddingValue;
107 | 		}
108 | 
109 | 		std::copy(cbegin(ctr),cend(ctr),m_pData);
110 | 
111 | 		m_isScalar = false;
112 | 		m_scalarVal = InstructionTraits<INS_VEC>::nullValue;
113 | 	}
114 | 
115 | 	explicit Vec(int sz):m_size(sz), m_implSize(sz)
116 | 	{
117 | 		allocPool(m_implSize, m_pData);
118 | 		m_isScalar = false;
119 | 		m_scalarVal = InstructionTraits<INS_VEC>::nullValue;
120 | 	}
121 | 
122 | 
123 | 
124 | 	Vec(typename InstructionTraits<INS_VEC>::FloatType val, int sz) :m_size(sz), m_implSize(sz)
125 | 	{
126 | 		allocPool(m_implSize, m_pData);
127 | 		m_isScalar = false;
128 | 		m_scalarVal = InstructionTraits<INS_VEC>::nullValue;
129 | 
130 | 		std::fill_n(start(), sz, val);
131 | 
132 | 	}
133 | 
134 | 
135 | 	~Vec()
136 | 	{
137 | 		if(m_pData != nullptr)
138 | 		{
139 | 			freePool(m_implSize,m_pData);
140 | 		}
141 | 	}
142 | 
143 | 	Vec(const Vec& rhs):  m_scalarVal(rhs.m_scalarVal), m_isScalar(rhs.m_isScalar), m_size(rhs.m_size), m_implSize(rhs.m_implSize)
144 | 	{
145 | 		m_pData = nullptr;
146 | 
147 | 		if( !m_isScalar)
148 | 		{
149 | 			m_implSize = m_size;
150 | 			allocPool(m_implSize,m_pData);
151 | 			std::copy(rhs.m_pData, rhs.m_pData+ m_implSize , m_pData);
152 | 		}
153 | 	}
154 | 
155 | 	Vec& operator=(const Vec& rhs)
156 | 	{
157 | 		if (&rhs != this)
158 | 		{
159 | 			if (m_pData != nullptr)
160 | 			{
161 | 				freePool(m_implSize, m_pData);
162 | 				m_pData = nullptr;
163 | 				m_size = 0;
164 | 				m_implSize = 0;
165 | 			}
166 | 
167 | 			m_isScalar = rhs.m_isScalar;
168 | 			m_scalarVal = rhs.m_scalarVal;
169 | 
170 | 			if( !m_isScalar)
171 | 			{
172 | 				m_size= rhs.m_size;
173 | 				m_implSize = m_size;
174 | 				allocPool(m_implSize,m_pData);
175 | 				std::copy(rhs.m_pData, rhs.m_pData+ m_implSize , m_pData);
176 | 			}
177 | 		}
178 | 
179 | 	   return *this;
180 | 	}
181 | 
182 | 
183 | 	Vec(Vec&& rhs) noexcept
184 | 	{
185 | 		m_implSize = 0;
186 | 		m_isScalar = true;
187 | 		m_scalarVal = InstructionTraits<INS_VEC>::nullValue;
188 | 		m_size = 0;
189 | 		m_pData = nullptr;
190 | 		*this = std::move(rhs);
191 | 	}
192 | 
193 | 	Vec& operator=( Vec&& rhs) noexcept
194 | 	{
195 | 		if (&rhs != this)
196 | 		{
197 | 			std::swap(m_isScalar , rhs.m_isScalar);
198 | 			std::swap(m_scalarVal , rhs.m_scalarVal);
199 | 			std::swap( m_implSize , rhs.m_implSize);
200 | 			std::swap(m_size, rhs.m_size);
201 | 			std::swap(m_pData, rhs.m_pData);
202 | 		}
203 | 		return *this;
204 | 	}
205 | 
206 | 	
207 | 	//explicit
208 | 	operator std::vector<typename InstructionTraits<INS_VEC>::FloatType>()
209 | 	{
210 | 		return std::vector<typename InstructionTraits<INS_VEC>::FloatType>(begin(), end());
211 | 	}
212 | 
213 | 
214 | 	typename InstructionTraits<INS_VEC>::FloatType& operator[](size_t pos) 
215 | 	{
216 | 		return m_pData[pos];
217 | 	}
218 | 
219 | 	typename InstructionTraits<INS_VEC>::FloatType operator[](size_t pos) const
220 | 	{
221 | 		return m_pData[pos];
222 | 	}
223 | 
224 | 
225 | 	inline typename InstructionTraits<INS_VEC>::FloatType* start() const
226 | 	{
227 | 		return m_pData;
228 | 	}
229 | 
230 | 	inline typename InstructionTraits<INS_VEC>::FloatType* data() 
231 | 	{
232 | 		return m_pData;
233 | 	}
234 | 
235 | 
236 | 	inline int size() const
237 | 	{
238 | 		return m_size;
239 | 	}
240 | 
241 | 	
242 | 	inline int paddedSize() const
243 | 	{
244 | 		return  static_cast<int>(m_implSize);
245 | 	}
246 | 
247 | 	inline bool isScalar() const
248 | 	{
249 | 		return m_isScalar;
250 | 	}
251 | 
252 | 	inline typename InstructionTraits<INS_VEC>::FloatType getScalarValue() const
253 | 	{
254 | 		return m_scalarVal;
255 | 	}
256 | 
257 | 	inline void setScalarValue( typename InstructionTraits<INS_VEC>::FloatType val)
258 | 	{
259 | 		m_scalarVal = val;
260 | 	}
261 | 
262 | 	inline typename InstructionTraits<INS_VEC>::FloatType* begin() const
263 | 	{
264 | 		return start();
265 | 	}
266 | 	
267 | 	inline typename InstructionTraits<INS_VEC>::FloatType* end() const
268 | 	{
269 | 		return start() + static_cast<size_t>(m_size);
270 | 	}
271 | 
272 | 	inline static  INS_VEC reg(typename InstructionTraits<INS_VEC>::FloatType val)
273 | 	{
274 | 		return INS_VEC(val);
275 | 	}
276 | 
277 | };
278 | 
279 | 
280 | 
281 | 
282 | template<typename T>
283 | bool isScalar(const Vec<T> & X)
284 | {
285 | 	return  X.isScalar();
286 | }
287 | 
288 | template<typename T>
289 | bool isScalar( Vec<T>& X)
290 | {
291 | 	return  X.isScalar();
292 | }
293 | 
294 | 
295 | template<typename T>
296 | bool isScalar(const VecView<T>& X)
297 | {
298 | 	return  X.isScalar();
299 | }
300 | 
301 | template<typename T>
302 | bool isScalar( VecView<T>& X)
303 | {
304 | 	return  X.isScalar();
305 | }
306 | 
307 | 
// Generic fallback (const reference): any type without a more specific
// overload is reported as a scalar.
template<typename T>
bool isScalar(const T& /*unused*/)
{
	constexpr bool treatedAsScalar = true;
	return treatedAsScalar;
}
313 | 
// Generic fallback (non-const reference): any type without a more specific
// overload is reported as a scalar.
template<typename T>
bool isScalar(T& /*unused*/)
{
	constexpr bool treatedAsScalar = true;
	return treatedAsScalar;
}
319 | 
320 | 
321 | template<typename T>
322 | bool isScalar(const Span<T>& X)
323 | {
324 | 	return false;
325 | }
326 | 
327 | template<  template <class> typename VEC_TYPE, typename INS_VEC, typename OP>
328 | void  getScalarValue(const VEC_TYPE<INS_VEC>& rhs1, typename InstructionTraits<INS_VEC>::FloatType& val)
329 | {
330 | 
331 | 	if (isScalar(rhs1)) 
332 | 	{
333 | 		val = rhs1.getScalarValue();
334 | 	}
335 | }


--------------------------------------------------------------------------------
/Vectorisation/VecX/vec_bool.h:
--------------------------------------------------------------------------------
  1 | /****************************  vec_bool.h  *******************************
  2 | * Author:        Andrew Drakeford
  3 | * Date created:  2021-04-10
  4 | * Last modified: 2021-04-10
  5 | * Version:       1.0
  6 | * Project:       DR Cubed
  7 | * Description:
  8 | *
  9 | * (c) Copyright 2019 Andrew Drakeford
 10 | * Apache License version 2.0 or later.
 11 | *****************************************************************************/
 12 | #pragma once
 13 | 
 14 | #include <algorithm>
 15 | #include <iterator>
 16 | #include "alloc_policy.h"
 17 | #include "apply_operation.h"
 18 | 
 19 | template <typename INS_VEC>
 20 | class VecBool
 21 | {
 22 | private:
 23 | 
 24 | 
 25 | 	typename InstructionTraits<INS_VEC>::FloatType* m_pData;
 26 | 	size_t m_size;
 27 | 	size_t m_implSize;
 28 | 
 29 | 	bool m_scalarVal;
 30 | 	bool m_isScalar = false;
 31 | 
 32 | public:
 33 | 	explicit VecBool(bool scalar) : m_pData(nullptr), m_size(0), m_implSize(0),  m_scalarVal(scalar),m_isScalar(true)
 34 | 	{
 35 | 		
 36 | 	}
 37 | 
 38 | 	VecBool(int sz) :m_size(sz), m_implSize(sz), m_scalarVal(0), m_isScalar(false)
 39 | 	{
 40 | 		allocPool(m_implSize, m_pData);
 41 | 	}
 42 | 
 43 | 
 44 | 	~VecBool()
 45 | 	{
 46 | 		if (m_pData != nullptr)
 47 | 		{
 48 | 			freePool(m_size, m_pData);
 49 | 		}
 50 | 	}
 51 | 
 52 | 	VecBool(const VecBool& rhs)
 53 | 	{
 54 | 		m_isScalar = rhs.m_isScalar;
 55 | 		m_scalarVal = rhs.m_scalarVal;
 56 | 		m_size = rhs.m_size;
 57 | 		m_implSize = m_size;
 58 | 		allocPool(m_implSize, m_pData);
 59 | 		std::copy(rhs.m_pData, rhs.m_pData + m_implSize, m_pData);
 60 | 	}
 61 | 
 62 | 
 63 | 	VecBool& operator=(const VecBool& rhs)
 64 | 	{
 65 | 		if (&rhs != this)
 66 | 		{
 67 | 			m_isScalar = rhs.m_isScalar;
 68 | 			m_scalarVal = rhs.m_scalarVal;
 69 | 			m_size = rhs.m_size;
 70 | 			m_implSize = rhs.m_implSize;
 71 | 			std::copy(rhs.m_pData, rhs.m_pData + m_implSize, m_pData);
 72 | 		}
 73 | 		return *this;
 74 | 	}
 75 | 
 76 | 
 77 | 	VecBool(VecBool&& rhs) noexcept
 78 | 	{
 79 | 		m_isScalar = rhs.m_isScalar;
 80 | 		m_scalarVal = rhs.m_scalarVal;
 81 | 		m_implSize = 0;
 82 | 		m_implSize= rhs.m_implSize;
 83 | 		m_size = rhs.size();
 84 | 		m_pData = nullptr;
 85 | 		*this = std::move(rhs);
 86 | 	}
 87 | 
 88 | 	VecBool& operator=(VecBool&& rhs) noexcept
 89 | 	{
 90 | 		if (&rhs != this)
 91 | 		{
 92 | 			std::swap(m_implSize, rhs.m_implSize);
 93 | 			std::swap(m_size, rhs.m_size);
 94 | 			std::swap(m_pData, rhs.m_pData);
 95 | 			std::swap(m_scalarVal,rhs.m_scalarVal);
 96 | 			std::swap(m_isScalar, rhs.m_isScalar);
 97 | 			
 98 | 		}
 99 | 		return *this;
100 | 	}
101 | 
102 | 
103 | 	inline typename InstructionTraits<INS_VEC>::FloatType* start() const
104 | 	{
105 | 		return m_pData;
106 | 	}
107 | 
108 | 	inline size_t size() const
109 | 	{
110 | 		return m_size;
111 | 
112 | 	}
113 | 
114 | 	inline size_t paddedSize() const
115 | 	{
116 | 		return m_implSize;
117 | 	}
118 | 
119 | 	inline bool getScalarValue() const
120 | 	{
121 | 		return m_scalarVal;
122 | 	}
123 | 
124 | 	inline bool isScalar() const
125 | 	{
126 | 		return m_isScalar;
127 | 	}
128 | 
129 | 	inline bool operator[](int j)const
130 | 	{
131 | 		return m_pData[j];
132 | 	}
133 | 
134 | 
135 | 	void setAt(int j, bool val)
136 | 	{
137 | 		m_pData[j] = val; 
138 | 	}
139 | 
140 | };
141 | 
142 | 


--------------------------------------------------------------------------------
/Vectorisation/VecX/vec_bool_d.h:
--------------------------------------------------------------------------------
  1 | /****************************  vec_bool_d.h  *******************************
  2 | * Author:        Andrew Drakeford
  3 | * Date created:  2021-04-10
  4 | * Last modified: 2021-04-10
  5 | * Version:       1.0
  6 | * Project:       DR Cubed
  7 | * Description:
  8 | *
  9 | * (c) Copyright 2019 Andrew Drakeford
 10 | * Apache License version 2.0 or later.
 11 | *****************************************************************************/
 12 | #pragma once
 13 | #include "vec_double.h"
 14 | 
 15 | static inline double asDouble( bool bVal)
 16 | {
 17 | 	if(!bVal) return 0.0;
 18 | 	return static_cast<double>(0xFFFFFFFFFFFFFFFF);
 19 | }
 20 | 
 21 | 
 22 | static inline bool asBool( double val)
 23 | {
 24 | 	if( 0.0 == val) return false;
 25 | 	return true;
 26 | 	
 27 | }
 28 | 
 29 | 
 30 | class VecBoolD
 31 | {
 32 | private:
 33 | 
 34 | 	 double m_data[2];
 35 | public:
 36 | 	VecBoolD()	{ m_data[0] = asDouble(false);	m_data[1] = asDouble(false);}
 37 | 
 38 | 	VecBoolD( bool d0,bool d1){ m_data[0] = asDouble(d0);	m_data[1] = asDouble(d1);}
 39 | 
 40 | 	VecBoolD& operator = ( const VecBoolD& rhs)
 41 | 	{
 42 | 		 m_data[0] =rhs.m_data[0];	m_data[1] =rhs.m_data[1];
 43 | 		 return *this;
 44 | 	}
 45 | 
 46 | 	VecBoolD(bool d0) { m_data[0] = asDouble(d0);	m_data[1] = asDouble(d0); }
 47 | 
 48 | 	VecBoolD& operator = (bool rhs)
 49 | 	{
 50 | 		m_data[0] = asDouble(rhs);	m_data[1] = asDouble(rhs);
 51 | 		return *this;
 52 | 	}
 53 | 
 54 | 	VecBoolD& load_a ( const double* p)
 55 | 	{
 56 | 		 m_data[0] =p[0];
 57 | 		 m_data[1] =p[1];
 58 | 		 return *this;
 59 | 	}
 60 | 
 61 | 
 62 |    void store_a( double* p)
 63 | 	{
 64 | 		p[0] = m_data[0];
 65 | 		p[1] =m_data[1];
 66 | 	}
 67 | 
 68 |    bool extract(size_t idx) const
 69 |    {
 70 | 	    return  asBool(m_data[idx]);
 71 |    }
 72 | 
 73 | 
 74 |    void insert(size_t idx,bool val)
 75 |    {
 76 | 	  m_data[idx]= asDouble(val);
 77 |    }
 78 | 
 79 | 
 80 |     bool operator [] (size_t index) const
 81 | 	{
 82 |         return extract(index);
 83 | 	}
 84 | 
 85 |    static int size() { return 2;}
 86 | 
 87 |    inline bool isScalar() const
 88 |    {
 89 | 	   return false;
 90 |    }
 91 | 
 92 | 	
 93 | };
 94 | 
 95 | 
 96 | 
 97 | static inline bool horizontal_or(VecBoolD const & a)
 98 | {
 99 | 	return a[0] || a[1];
100 | }
101 | 
102 | static inline bool horizontal_and(VecBoolD const & a)
103 | {
104 | 	return a[0] && a[1];
105 | }
106 | 
107 | 
108 | static inline VecBoolD operator  &&(VecBoolD const& a, const VecBoolD& b)
109 | {
110 | 	return  VecBoolD( static_cast<long>(a[0]) & static_cast<long>(b[0]), static_cast<long>(a[1]) & static_cast<long>(b[1]));
111 | }
112 | 
113 | 
114 | static inline VecBoolD operator  ||(VecBoolD const& a, const VecBoolD& b)
115 | {
116 | 	return  VecBoolD(static_cast<long>(a[0]) | static_cast<long>(b[0]), static_cast<long>(a[1]) | static_cast<long>(b[1]) );
117 | }
118 | 
119 | 
120 | static inline VecBoolD operator !(VecBoolD const& a)
121 | {
122 | 	return  VecBoolD(!a[0] , !a[1]);
123 | }
124 | 


--------------------------------------------------------------------------------
/Vectorisation/VecX/vec_d.h:
--------------------------------------------------------------------------------
  1 | /****************************  vec_d.h  *******************************
  2 | * Author:        Andrew Drakeford
  3 | * Date created:  2021-04-10
  4 | * Last modified: 2021-04-10
  5 | * Version:       1.0
  6 | * Project:       DR Cubed
  7 | * Description:
  8 | *
  9 | * (c) Copyright 2019 Andrew Drakeford
 10 | * Apache License version 2.0 or later.
 11 | *****************************************************************************/
 12 | #pragma once
 13 | #include <algorithm>
 14 | #include <iterator>
 15 | #include "vec.h"
 16 | #include "vec_bool.h"
 17 | 
 18 | 
 19 | 
 20 | template <typename INS_VEC>
 21 | class VecD
 22 | {
 23 | private:
 24 | public:
 25 | 
 26 | 	Vec< INS_VEC> val;
 27 | 	Vec< INS_VEC> deriv;
 28 | 
 29 | public:
 30 | 
 31 | 	VecD()
 32 | 	{}
 33 | 
 34 | 	static VecD< INS_VEC> makeDVecZero(const Vec<INS_VEC>&  value)
 35 | 	{
 36 | 		if (value.isScalar())
 37 | 		{
 38 | 			return  VecD(value.getScalarValue(), InstructionTraits<INS_VEC>::nullValue);
 39 | 		}
 40 | 		std::vector< typename InstructionTraits<INS_VEC>::FloatType> zeros(value.size(), InstructionTraits<INS_VEC>::nullValue);
 41 | 		return VecD(value, Vec< INS_VEC>(zeros));
 42 | 	}
 43 | 
 44 | 	static VecD< INS_VEC> makeDVecOnes(const Vec<INS_VEC>&  value)
 45 | 	{
 46 | 		if (value.isScalar())
 47 | 		{
 48 | 			return  VecD(value.getScalarValue(), InstructionTraits<INS_VEC>::oneValue);
 49 | 		}
 50 | 		std::vector< typename InstructionTraits<INS_VEC>::FloatType> ones(value.size(), InstructionTraits<INS_VEC>::oneValue);
 51 | 		return VecD(value, ones);
 52 | 	}
 53 | 
 54 | 
 55 | 	static VecD< INS_VEC> makeDVecOnes(const typename InstructionTraits<INS_VEC>::FloatType&  value, int sz)
 56 | 	{
 57 | 		Vec< INS_VEC> values(value, sz);
 58 | 		Vec< INS_VEC> ones(InstructionTraits<INS_VEC>::oneValue, sz);
 59 | 		return VecD(values, ones);
 60 | 	}
 61 | 
 62 | 	static VecD< INS_VEC> makeDVecZero(const typename InstructionTraits<INS_VEC>::FloatType&  value, int sz)
 63 | 	{
 64 | 		Vec< INS_VEC> values(value, sz);
 65 | 		Vec< INS_VEC> zeros(InstructionTraits<INS_VEC>::nullValue, sz);
 66 | 		return VecD(values, zeros);
 67 | 	}
 68 | 
 69 | 	static VecD< INS_VEC> makeDVecOnesV(const typename InstructionTraits<INS_VEC>::FloatType&  value, int sz)
 70 | 	{
 71 | 		Vec< INS_VEC> values(value, sz);
 72 | 		std::vector< typename InstructionTraits<INS_VEC>::FloatType> ones(value.size(), InstructionTraits<INS_VEC>::oneValue);
 73 | 		return VecD(values, ones);
 74 | 	}
 75 | 
 76 | 	static VecD< INS_VEC> makeDVecZeroV(const typename InstructionTraits<INS_VEC>::FloatType&  value, int sz)
 77 | 	{
 78 | 		Vec< INS_VEC> values(value, sz);
 79 | 		std::vector< typename InstructionTraits<INS_VEC>::FloatType> nulls(value.size(), InstructionTraits<INS_VEC>::nullValue);
 80 | 		return VecD(values, nulls);
 81 | 	}
 82 | 
 83 | 	explicit VecD(typename InstructionTraits<INS_VEC>::FloatType scalarVal)
 84 | 		:val(scalarVal), deriv(InstructionTraits<INS_VEC>::nullValue)
 85 | 	{
 86 | 
 87 | 	}
 88 | 
 89 | 
 90 | 	VecD(typename InstructionTraits<INS_VEC>::FloatType scalarVal, typename InstructionTraits<INS_VEC>::FloatType derivVal)
 91 | 		:val(scalarVal), deriv(derivVal)
 92 | 	{
 93 | 
 94 | 	}
 95 | 
 96 | 
 97 | 
 98 | 	VecD(const std::vector< typename InstructionTraits<INS_VEC>::FloatType> & ctr) :val(ctr), deriv(InstructionTraits<INS_VEC>::nullValue, ctr.size())
 99 | 	{
100 | 
101 | 	}
102 | 
103 | 
104 | 	VecD(const Vec<INS_VEC>&  value, const Vec<INS_VEC>&  derivative) : val(value), deriv(derivative)
105 | 	{}
106 | 
107 | 
108 | 	VecD(Vec<INS_VEC>&&  value, Vec<INS_VEC>&&  derivative) :
109 | 		val(std::forward< Vec<INS_VEC>>(value)),
110 | 		deriv(std::forward<Vec<INS_VEC>>(derivative))
111 | 	{}
112 | 
113 | 
114 | 	VecD(Vec<INS_VEC>&&  value) :
115 | 		val(std::forward< Vec<INS_VEC>>(value))
116 | 	{
117 | 		if (!val.isScalar())
118 | 		{
119 | 			deriv(InstructionTraits<INS_VEC>::nullVal, value.size());
120 | 		}
121 | 		else
122 | 		{
123 | 			deriv(InstructionTraits<INS_VEC>::nullVal);
124 | 		}
125 | 	}
126 | 
127 | 
128 | 	VecD(Vec<INS_VEC>&&  value, const Vec<INS_VEC>&  d) :
129 | 		val(std::forward< Vec<INS_VEC>>(value)), deriv(d)
130 | 	{
131 | 	}
132 | 
133 | 
134 | 	//explicit
135 | 	VecD(const Vec<INS_VEC>& value) :
136 | 		val(value), deriv(value.isScalar()? InstructionTraits<INS_VEC>::nullValue : Vec< INS_VEC>(InstructionTraits<INS_VEC>::nullValue,value.size()  ) )
137 | 	{
138 | 	}
139 | 	
140 | 
141 | 	explicit VecD(size_t sz) :val(sz), deriv(sz)
142 | 	{
143 | 
144 | 	}
145 | 
146 | 
147 | 	typename InstructionTraits<INS_VEC>::FloatType& operator[](size_t pos)
148 | 	{
149 | 		return val[pos];
150 | 	}
151 | 
152 | 	typename InstructionTraits<INS_VEC>::FloatType operator[](size_t pos) const
153 | 	{
154 | 		return val[pos];
155 | 	}
156 | 
157 | 
158 | 	inline typename InstructionTraits<INS_VEC>::FloatType* start() const
159 | 	{
160 | 		return val.start();
161 | 	}
162 | 
163 | 
164 | 	inline size_t size() const
165 | 	{
166 | 		return val.size();
167 | 	}
168 | 
169 | 
170 | 
171 | 	inline int  paddedSize() const
172 | 	{
173 | 		return static_cast<int>(val.paddedSize());
174 | 	}
175 | 
176 | 	inline bool isScalar() const
177 | 	{
178 | 		return val.isScalar();
179 | 	}
180 | 
181 | 	inline typename InstructionTraits<INS_VEC>::FloatType getScalarValue() const
182 | 	{
183 | 		return val.getScalarValue();
184 | 	}
185 | 
186 | 	inline void setScalarValue(typename InstructionTraits<INS_VEC>::FloatType newVal)
187 | 	{
188 | 		val.setScalarValue(newVal);
189 | 	}
190 | 
191 | 
192 | 	inline typename InstructionTraits<INS_VEC>::FloatType getScalarDeriv() const
193 | 	{
194 | 		return deriv.getScalarValue();
195 | 	}
196 | 
197 | 	inline void setScalarDeriv(typename InstructionTraits<INS_VEC>::FloatType newVal)
198 | 	{
199 | 		deriv.setScalarValue(newVal);
200 | 	}
201 | 
202 | 
203 | 
204 | 	inline const Vec< INS_VEC>& value() const
205 | 	{
206 | 		return val;
207 | 	}
208 | 
209 | 	inline const Vec< INS_VEC>& derivative() const
210 | 	{
211 | 		return deriv;
212 | 	}
213 | 
214 | 	inline  Vec< INS_VEC>& value()
215 | 	{
216 | 		return val;
217 | 	}
218 | 
219 | 	inline  Vec< INS_VEC>& derivative()
220 | 	{
221 | 		return deriv;
222 | 	}
223 | 
224 | 
225 | };
226 | 
227 | 
228 | template <typename INS_VEC>
229 | VecD<INS_VEC> D(const Vec<INS_VEC>& rhs)
230 | {
231 | 	return VecD<INS_VEC>::makeDVecOnes(rhs);
232 | }
233 | 
234 | 
235 | template <typename INS_VEC>
236 | VecD<INS_VEC> C(const Vec<INS_VEC>& rhs)
237 | {
238 | 	return VecD<INS_VEC>::makeDVecZero(rhs);
239 | }
240 | 
241 | 


--------------------------------------------------------------------------------
/Vectorisation/Vectorisation.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/Vectorisation.cpp


--------------------------------------------------------------------------------
/Vectorisation/Vectorisation.log:
--------------------------------------------------------------------------------
1 | ﻿C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Current\Bin\Microsoft.Common.CurrentVersion.targets(820,5): error : The BaseOutputPath/OutputPath property is not set for project 'Vectorisation.vcxproj'.  Please check to make sure that you have specified a valid combination of Configuration and Platform for this project.  Configuration='Debug'  Platform='ARM64'.  This error may also appear if some other project is trying to follow a project-to-project reference to this project, this project has been unloaded or is not included in the solution, and the referencing project does not build using the same or an equivalent Configuration or Platform.
2 | 


--------------------------------------------------------------------------------
/Vectorisation/Vectorisation.vcxproj.filters:
--------------------------------------------------------------------------------
  1 | ﻿<?xml version="1.0" encoding="utf-8"?>
  2 | <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  3 |   <ItemGroup>
  4 |     <Filter Include="Source Files">
  5 |       <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
  6 |       <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
  7 |     </Filter>
  8 |     <Filter Include="Header Files">
  9 |       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
 10 |       <Extensions>h;hh;hpp;hxx;hm;inl;inc;ipp;xsd</Extensions>
 11 |     </Filter>
 12 |     <Filter Include="VecWrapper">
 13 |       <UniqueIdentifier>{06d8baf8-80f2-40a5-98f6-1cf93bfb3eaa}</UniqueIdentifier>
 14 |     </Filter>
 15 |   </ItemGroup>
 16 |   <ItemGroup>
 17 |     <ClInclude Include="VecX\vec.h">
 18 |       <Filter>VecWrapper</Filter>
 19 |     </ClInclude>
 20 |     <ClInclude Include="VecX\vec_bool_d.h">
 21 |       <Filter>VecWrapper</Filter>
 22 |     </ClInclude>
 23 |     <ClInclude Include="VecX\vec_double.h">
 24 |       <Filter>VecWrapper</Filter>
 25 |     </ClInclude>
 26 |     <ClInclude Include="VecX\alloc_policy.h">
 27 |       <Filter>VecWrapper</Filter>
 28 |     </ClInclude>
 29 |     <ClInclude Include="VecX\apply_operation.h">
 30 |       <Filter>VecWrapper</Filter>
 31 |     </ClInclude>
 32 |     <ClInclude Include="VecX\vec_d.h">
 33 |       <Filter>VecWrapper</Filter>
 34 |     </ClInclude>
 35 |     <ClInclude Include="VecX\vec_bool.h">
 36 |       <Filter>VecWrapper</Filter>
 37 |     </ClInclude>
 38 |     <ClInclude Include="VecX\vec_view.h">
 39 |       <Filter>VecWrapper</Filter>
 40 |     </ClInclude>
 41 |     <ClInclude Include="VecX\operations.h">
 42 |       <Filter>Header Files</Filter>
 43 |     </ClInclude>
 44 |     <ClInclude Include="VecX\target_name_space.h">
 45 |       <Filter>Header Files</Filter>
 46 |     </ClInclude>
 47 |     <ClInclude Include="VecX\instruction_traits.h">
 48 |       <Filter>Header Files</Filter>
 49 |     </ClInclude>
 50 |     <ClInclude Include="VecX\math_ops.h">
 51 |       <Filter>Header Files</Filter>
 52 |     </ClInclude>
 53 |     <ClInclude Include="VecX\boolean_operations.h">
 54 |       <Filter>Header Files</Filter>
 55 |     </ClInclude>
 56 |     <ClInclude Include="VecX\accumulate_transform.h">
 57 |       <Filter>Header Files</Filter>
 58 |     </ClInclude>
 59 |     <ClInclude Include="VecX\binary_unitary_operations.h">
 60 |       <Filter>Header Files</Filter>
 61 |     </ClInclude>
 62 |     <ClInclude Include="VecX\filter_select.h">
 63 |       <Filter>Header Files</Filter>
 64 |     </ClInclude>
 65 |     <ClInclude Include="VecX\vcl_latest.h">
 66 |       <Filter>Header Files</Filter>
 67 |     </ClInclude>
 68 |     <ClInclude Include="VecX\unroll_operators.h">
 69 |       <Filter>Header Files</Filter>
 70 |     </ClInclude>
 71 |     <ClInclude Include="VecX\conditional_select_eval.h">
 72 |       <Filter>Header Files</Filter>
 73 |     </ClInclude>
 74 |     <ClInclude Include="VecX\alloc_policy_imp.h">
 75 |       <Filter>Header Files</Filter>
 76 |     </ClInclude>
 77 |     <ClInclude Include="VecX\filter_pipe_and_join.h">
 78 |       <Filter>Header Files</Filter>
 79 |     </ClInclude>
 80 |     <ClInclude Include="VecX\error_utils.h">
 81 |       <Filter>Header Files</Filter>
 82 |     </ClInclude>
 83 |     <ClInclude Include="VecX\transform.h">
 84 |       <Filter>Header Files</Filter>
 85 |     </ClInclude>
 86 |     <ClInclude Include="VecX\dr3.h">
 87 |       <Filter>Header Files</Filter>
 88 |     </ClInclude>
 89 |     <ClInclude Include="VecX\zip_utils.h">
 90 |       <Filter>Header Files</Filter>
 91 |     </ClInclude>
 92 |     <ClInclude Include="VecX\scan.h">
 93 |       <Filter>Header Files</Filter>
 94 |     </ClInclude>
 95 |     <ClInclude Include="intrinsic_utils.h">
 96 |       <Filter>Header Files</Filter>
 97 |     </ClInclude>
 98 |     <ClInclude Include="VecX\binned_accumulator.h">
 99 |       <Filter>Header Files</Filter>
100 |     </ClInclude>
101 |   </ItemGroup>
102 |   <ItemGroup>
103 |     <ClCompile Include="VecX\vec.cpp">
104 |       <Filter>VecWrapper</Filter>
105 |     </ClCompile>
106 |     <ClCompile Include="VecX\alloc_policy.cpp">
107 |       <Filter>VecWrapper</Filter>
108 |     </ClCompile>
109 |   </ItemGroup>
110 |   <ItemGroup>
111 |     <None Include="packages.config" />
112 |     <None Include="Vectorisation.ruleset" />
113 |     <None Include="..\VectorTest\VectorTest.vcxproj" />
114 |     <None Include="..\accumulateExample\accumulateExample.vcxproj" />
115 |     <None Include="..\GettingStarted\GettingStarted.vcxproj" />
116 |     <None Include="..\inverseCumNormalExample\inverseCumNormalExample.vcxproj" />
117 |   </ItemGroup>
118 | </Project>


--------------------------------------------------------------------------------
/Vectorisation/Vectorisation.vcxproj.user:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup />
4 | </Project>


--------------------------------------------------------------------------------
/Vectorisation/intel_Libs/libirc.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/intel_Libs/libirc.lib


--------------------------------------------------------------------------------
/Vectorisation/intel_Libs/svml_disp.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/intel_Libs/svml_disp.lib


--------------------------------------------------------------------------------
/Vectorisation/intel_Libs/svml_dispmd.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/intel_Libs/svml_dispmd.lib


--------------------------------------------------------------------------------
/Vectorisation/intel_Libs/svml_dispmt.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/intel_Libs/svml_dispmt.lib


--------------------------------------------------------------------------------
/Vectorisation/intel_Libs/svmlpatch.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/intel_Libs/svmlpatch.lib


--------------------------------------------------------------------------------
/Vectorisation/intrinsic_utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | /****************************  intrinsic_utils.h   *******************************
 3 | * Author:        Andrew Drakeford
 4 | * Date created:  2021-04-10
 5 | * Last modified: 2021-04-10
 6 | * Version:       1.0
 7 | * Project:       DR Cubed
 8 | * Description:
 9 | *
10 | * (c) Copyright 2019 Andrew Drakeford
11 | * Apache License version 2.0 or later.
12 | *****************************************************************************/
13 | 
14 | 
15 | #include "../Vectorisation/VecX/instruction_traits.h"
16 | #include <immintrin.h>
17 | 
18 | /*
19 | //_MM_MANTISSA_NORM_ENUM n;
20 | //_MM_MANTISSA_SIGN_ENUM m;
21 | 
22 | static inline __m256d  getMantissa(__m256d x)// const int n, const int m) 
23 | {
24 | 
25 | 	return _mm256_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero);
26 | }
27 | 
28 | static inline __m512d  getMantissa(__m512d x, int n, const int m)
29 | {
30 | 	return _mm512_getmant_pd(x,n,m);
31 | }
32 | 
33 | 
34 | 
35 | static inline __m256d  getExponent(__m256d x)
36 | {
37 | 
38 | 	return _mm256_getexp_pd(x);
39 | }
40 | 
41 | static inline __m512d  getExponent(__m512d x)
42 | {
43 | 	return _mm512_getexp_pd(x);
44 | }
45 | 
46 | static inline __m128d  getExponent(__m128d x)
47 | {
48 | 	return _mm_getexp_pd(x);
49 | }
50 | 
51 | static inline __m256  getExponent(__m256 x)
52 | {
53 | 
54 | 	return _mm256_getexp_ps(x);
55 | }
56 | 
57 | static inline __m512  getExponent(__m512 x)
58 | {
59 | 	return _mm512_getexp_ps(x);
60 | }
61 | 
62 | static inline __m128  getExponent(__m128 x)
63 | {
64 | 	return _mm_getexp_ps(x);
65 | }
66 | 
67 | */


--------------------------------------------------------------------------------
/Vectorisation/packages.config:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <packages>
3 |   <package id="Microsoft.googletest.v140.windesktop.msvcstl.static.rt-dyn" version="1.8.0" targetFramework="native" />
4 | </packages>


--------------------------------------------------------------------------------
/Vectorisation/pch.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/Vectorisation/pch.h


--------------------------------------------------------------------------------
/accumulateExample/CMakeLists.txt:
--------------------------------------------------------------------------------
# Demo executable built from a single translation unit.
add_executable(accumulateExample
    accumulate_example.cpp
)

# Link against Vectorisation.
target_link_libraries(accumulateExample
    PUBLIC
        Vectorisation
)

# Put the top-level build tree on the include path.
target_include_directories(accumulateExample
    PUBLIC
        "${PROJECT_BINARY_DIR}"
)


--------------------------------------------------------------------------------
/accumulateExample/accumulateExample.log:
--------------------------------------------------------------------------------
1 | ﻿C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Current\Bin\Microsoft.Common.CurrentVersion.targets(820,5): error : The BaseOutputPath/OutputPath property is not set for project 'accumulateExample.vcxproj'.  Please check to make sure that you have specified a valid combination of Configuration and Platform for this project.  Configuration='Debug'  Platform='ARM64'.  This error may also appear if some other project is trying to follow a project-to-project reference to this project, this project has been unloaded or is not included in the solution, and the referencing project does not build using the same or an equivalent Configuration or Platform.
2 | 


--------------------------------------------------------------------------------
/accumulateExample/accumulateExample.vcxproj.filters:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup>
 4 |     <Filter Include="Source Files">
 5 |       <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
 6 |       <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
 7 |     </Filter>
 8 |     <Filter Include="Header Files">
 9 |       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
10 |       <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
11 |     </Filter>
12 |     <Filter Include="Resource Files">
13 |       <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
14 |       <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
15 |     </Filter>
16 |   </ItemGroup>
17 |   <ItemGroup>
18 |     <ClCompile Include="accumulate_example.cpp">
19 |       <Filter>Source Files</Filter>
20 |     </ClCompile>
21 |   </ItemGroup>
22 |   <ItemGroup>
23 |     <Library Include="..\Vectorisation\intel_Libs\libirc.lib" />
24 |     <Library Include="..\Vectorisation\intel_Libs\svmlpatch.lib" />
25 |     <Library Include="..\Vectorisation\intel_Libs\svml_disp.lib" />
26 |   </ItemGroup>
27 |   <ItemGroup>
28 |     <ClInclude Include="AVX512Dance.h">
29 |       <Filter>Header Files</Filter>
30 |     </ClInclude>
31 |   </ItemGroup>
32 | </Project>


--------------------------------------------------------------------------------
/accumulateExample/accumulateExample.vcxproj.user:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup />
4 | </Project>


--------------------------------------------------------------------------------
/accumulateExample/norm.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | 
4 | double qnorm5(double p);
5 | double qnorm6(double p);
6 | double qnorm7(double p);


--------------------------------------------------------------------------------
/cumNormalExample/CMakeLists.txt:
--------------------------------------------------------------------------------
# Demo executable built from a single translation unit.
add_executable(cumNormalExample
    cumNormalExample.cpp
)

# Link against Vectorisation.
target_link_libraries(cumNormalExample
    PUBLIC
        Vectorisation
)

# Put the top-level build tree on the include path.
target_include_directories(cumNormalExample
    PUBLIC
        "${PROJECT_BINARY_DIR}"
)


--------------------------------------------------------------------------------
/cumNormalExample/cumNormal.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | #include "../Vectorisation/VecX/vec.h"
  9 | #include "../Vectorisation/VecX/operations.h"
 10 | #include "../Vectorisation/VecX/apply_operation.h"
 11 | #include "../Vectorisation/VecX/vec_d.h"
 12 | #include "../Vectorisation/VecX/vec_bool.h"
 13 | #include "../Vectorisation/VecX/vec_view.h"
 14 | 
 15 | #include "../Vectorisation/VecX/target_name_space.h"
 16 | 
 17 | #include <immintrin.h>
 18 | 
 19 | 
 20 | 
 21 | 
 22 | 
 23 | //using namespace DRC::VecDb;
 24 | //using namespace DRC::VecD2D;
 25 | using namespace DRC::VecD4D;
 26 | //using namespace DRC::VecD8D;
 27 | //using namespace DRC::VecF16F;
 28 | //using namespace DRC::VecF8F;
 29 | 
 30 | 
 31 | #include <algorithm>
 32 | #include <random>
 33 | #include <numeric>
 34 | #include <iterator>
 35 | #include <iostream>
 36 | #include <vector>
 37 | #include <chrono>
 38 | #include <iomanip>  
 39 | #include <chrono>
 40 | #include <iostream>
 41 | #include <functional>
 42 | 
 43 | 
 44 | 
 45 | double getnull(double);
 46 | 
 47 | 
 48 | 
 49 | 
 50 | template<typename VEC>
 51 | VEC phi(VEC x)
 52 | {
 53 | 	// https://stackoverflow.com/questions/2328258/cumulative-normal-distribution-function-in-c-c
 54 | 	// references a Wests's implementation in Willmot.
 55 | 
 56 |  	const VEC z = abs(x);
 57 | 
 58 | 
 59 | 	constexpr double N0 = 220.206867912376;
 60 | 	constexpr double N1 = 221.213596169931;
 61 | 	constexpr double N2 = 112.079291497871;
 62 | 	constexpr double N3 = 33.912866078383;
 63 | 	constexpr double N4 = 6.37396220353165;
 64 | 	constexpr double N5 = 0.700383064443688;
 65 | 	constexpr double N6 = 3.52624965998911e-02;
 66 | 	constexpr double M0 = 440.413735824752;
 67 | 	constexpr double M1 = 793.826512519948;
 68 | 	constexpr double M2 = 637.333633378831;
 69 | 	constexpr double M3 = 296.564248779674;
 70 | 	constexpr double M4 = 86.7807322029461;
 71 | 	constexpr double M5 = 16.064177579207;
 72 | 	constexpr double M6 = 1.75566716318264;
 73 | 	constexpr double M7 = 8.83883476483184e-02;
 74 | 
 75 | 
 76 | 	VEC n_c = (((((N6 * z + N5) * z + N4) * z + N3) * z + N2) * z + N1) * z + N0;
 77 | 	VEC d_c = ((((((M7 * z + M6) * z + M5) * z + M4) * z + M3) * z + M2) * z + M1) * z + M0;
 78 | 	VEC  central = n_c / d_c;
 79 | 
 80 | 	constexpr double inv_RT2PI(0.39894228040143267793994605993438);
 81 | 	VEC d_outer = (((((20. * z) * z + 13.) * z + 200.) * z + 78.) * z + 300.) * z + 39.;
 82 | 	VEC n_outer = ((((20. * z) * z + 13.) * z + 180.) * z + 65.) * z + 160.;
 83 | 	VEC outer = inv_RT2PI * n_outer / d_outer;
 84 | 
 85 | 	
 86 | 	VEC e = exp(-z * z * 0.5);
 87 | 
 88 | 	 //   static const double SPLIT = 7.07106781186547; //orig
 89 | 	const VEC SPLIT(7.42);// 7106781186547; //play  appears to give less error
 90 | 
 91 | 	VEC RES = select((z < SPLIT), central, outer);
 92 | 	RES *= e;
 93 | 
 94 | 	return select(x <= VEC(0.0), RES, 1.0 - RES);
 95 | 
 96 | }
 97 | 
 98 | 
 99 | 
100 | 
template <typename VecXX>
VecXX calcCDFNormal(const VecXX& X)
{
	// Applies the cumulative-normal approximation to X through
	// ApplyTransformUR_XX.  Below the split point a ratio of two polynomials is
	// used, above it a second ratio; both are scaled by exp(-z*z/2).

	// Central rational approximation, evaluated with Horner's scheme.
	auto centralRatio = [](auto z)
	{
		static const double N[] = { 3.52624965998911e-02 , 0.700383064443688,   6.37396220353165, 33.912866078383,  112.079291497871,  221.213596169931, 220.206867912376 };
		static const double M[] = { 8.83883476483184e-02, 1.75566716318264, 16.064177579207, 86.7807322029461 , 296.564248779674,  637.333633378831, 793.826512519948,440.413735824752 };

		auto num1 = N[0] * z + N[1];
		auto num2 = num1 * z + N[2];
		auto num3 = num2 * z + N[3];
		auto num4 = num3 * z + N[4];
		auto num5 = num4 * z + N[5];
		auto num6 = num5 * z + N[6];

		auto den1 = M[0] * z + M[1];
		auto den2 = den1 * z + M[2];
		auto den3 = den2 * z + M[3];
		auto den4 = den3 * z + M[4];
		auto den5 = den4 * z + M[5];
		auto den6 = den5 * z + M[6];
		auto den7 = den6 * z + M[7];

		return num6 / den7;
	};

	// Outer rational approximation used at and beyond the split point.
	auto tailRatio = [](auto z)
	{
		constexpr double inv_RT2PI(0.39894228040143267793994605993438);

		auto den1 = (20. * z) * z + 13.;
		auto den2 = den1 * z + 200.;
		auto den3 = den2 * z + 78.;
		auto den4 = den3 * z + 300.;
		auto den5 = den4 * z + 39.;

		auto num1 = (20. * z) * z + 13.;
		auto num2 = num1 * z + 180.;
		auto num3 = num2 * z + 65.;
		auto num4 = num3 * z + 160.;

		return inv_RT2PI * num4 / den5;
	};

	// Full evaluation for one element/register: both approximations are
	// computed and the appropriate one is selected per lane.
	auto onePass = [centralRatio, tailRatio](auto x)
	{
		auto absX = abs(x);
		auto gauss = exp(-absX * absX * 0.5);
		auto tail = tailRatio(absX);
		auto central = centralRatio(absX);
		auto splitPoint = 7.42; // original split was 7.07106781186547; 7.42 appears to give less error
		auto lowerTail = select((absX < splitPoint), central, tail);
		lowerTail *= gauss;
		// lowerTail holds the result for x <= 0; take the complement otherwise.
		return select(x <= 0.0, lowerTail, 1.0 - lowerTail);
	};

	return ApplyTransformUR_XX(X, onePass);

}
143 | 
144 | 
145 | 
146 | 
147 | 
template <typename VecXX>
VecXX calcCDFNormalFMA(const VecXX& X)
{
	// Variant of calcCDFNormal whose polynomials are evaluated with mul_add,
	// and which skips the outer approximation when every lane of the current
	// argument lies inside the split point.  Applied to X through
	// ApplyTransformUR_X.

	// Central rational approximation: numerator times reciprocal denominator.
	auto centralRatio = [](auto z)
	{
		constexpr double N[] = { 3.52624965998911e-02 , 0.700383064443688,   6.37396220353165, 33.912866078383,  112.079291497871,  221.213596169931, 220.206867912376 };
		constexpr double M[] = { 8.83883476483184e-02, 1.75566716318264, 16.064177579207, 86.7807322029461 , 296.564248779674,  637.333633378831, 793.826512519948,440.413735824752 };

		auto den1 = mul_add(M[0], z, M[1]);
		auto den2 = mul_add(den1, z, M[2]);
		auto den3 = mul_add(den2, z, M[3]);
		auto den4 = mul_add(den3, z, M[4]);
		auto den5 = mul_add(den4, z, M[5]);
		auto den6 = mul_add(den5, z, M[6]);
		auto denomRecip = 1.0 / mul_add(den6, z, M[7]);

		auto num1 = mul_add(N[0], z, N[1]);
		auto num2 = mul_add(num1, z, N[2]);
		auto num3 = mul_add(num2, z, N[3]);
		auto num4 = mul_add(num3, z, N[4]);
		auto num5 = mul_add(num4, z, N[5]);
		auto numer = mul_add(num5, z, N[6]);

		return numer * denomRecip;
	};

	// Outer rational approximation; 1/sqrt(2*pi) is folded into the reciprocal
	// of the denominator.
	auto tailRatio = [](auto z)
	{
		constexpr double inv_RT2PI(0.39894228040143267793994605993438);
		constexpr double  d[] = { 20. , 13., 200., 78., 300., 39. };
		constexpr double  n[] = { 20., 13., 180., 65., 160. };

		auto den1 = mul_add((d[0] * z), z, d[1]);
		auto den2 = mul_add(den1, z, d[2]);
		auto den3 = mul_add(den2, z, d[3]);
		auto den4 = mul_add(den3, z, d[4]);
		auto denom = mul_add(den4, z, d[5]);
		auto scaledRecip = inv_RT2PI / denom;

		auto num1 = mul_add((n[0] * z), z, n[1]);
		auto num2 = mul_add(num1, z, n[2]);
		auto num3 = mul_add(num2, z, n[3]);
		auto numer = mul_add(num3, z, n[4]);

		return numer * scaledRecip;
	};

	auto onePass = [centralRatio, tailRatio](auto x)
	{
		auto absX = abs(x);
		auto gauss = exp(-absX * absX * 0.5);
		auto central = centralRatio(absX);
		auto splitPoint = 7.42; // original split was 7.07106781186547; 7.42 appears to give less error
		auto allInsideSplit = (x * x < splitPoint * splitPoint);

		if (!horizontal_and(allInsideSplit))
		{
			// At least one lane needs the outer approximation: evaluate it
			// and blend per lane.
			auto tail = tailRatio(absX);
			auto lowerTail = select((absX < splitPoint), central, tail);
			lowerTail *= gauss;
			return select(x <= 0.0, lowerTail, 1.0 - lowerTail);
		}

		// Fast path: every lane uses the central approximation.
		central *= gauss;
		return select(x <= 0.0, central, 1.0 - central);
	};

	return ApplyTransformUR_X(X, onePass);

}
208 | 


--------------------------------------------------------------------------------
/cumNormalExample/cumNormalExample.vcxproj.filters:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup>
 4 |     <Filter Include="Source Files">
 5 |       <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
 6 |       <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
 7 |     </Filter>
 8 |     <Filter Include="Header Files">
 9 |       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
10 |       <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
11 |     </Filter>
12 |     <Filter Include="Resource Files">
13 |       <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
14 |       <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
15 |     </Filter>
16 |   </ItemGroup>
17 |   <ItemGroup>
18 |     <ClCompile Include="cumNormalExample.cpp">
19 |       <Filter>Source Files</Filter>
20 |     </ClCompile>
21 |   </ItemGroup>
22 |   <ItemGroup>
23 |     <ClInclude Include="cumNormal.h">
24 |       <Filter>Header Files</Filter>
25 |     </ClInclude>
26 |   </ItemGroup>
27 | </Project>


--------------------------------------------------------------------------------
/cumNormalExample/cumNormalExample.vcxproj.user:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup />
4 | </Project>


--------------------------------------------------------------------------------
/dancingAVX512/AVX512Dance.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #include <algorithm>
  4 | #include <random>
  5 | #include <numeric>
  6 | #include <iterator>
  7 | #include <iostream>
  8 | #include <vector>
  9 | #include <chrono>
 10 | #include <iomanip>  
 11 | #include <thread>
 12 | #include <map>
 13 | 
 14 | 
 15 | 
 16 | #include "../Vectorisation/VecX/dr3.h"
 17 | 
 18 | 
 19 | const double billion = 1000000000.0;
 20 | 
 21 | 
 22 | using Calc_Values = std::map<int, double>;
 23 | using  Mapped_Performance_Results = std::map<int, std::vector<double> >; // array size  v vector<throughput for runs>
 24 | using Mapped_Stats = std::map<int, std::pair<double, double> >; // size -.pair ( throughput ,  std dev of through put)
 25 | 
 26 | struct RunResults
 27 | {
 28 | 	Mapped_Performance_Results m_raw_results;
 29 | 	Calc_Values  m_calc_results;
 30 | 	double time;
 31 | };
 32 | 
 33 | class TimerGuard
 34 | {
 35 | 	double& m_runTime;
 36 | 	std::chrono::high_resolution_clock::time_point  m_startTme;
 37 | 
 38 | public:
 39 | 	TimerGuard(double& runTime) : m_runTime(runTime), m_startTme(std::chrono::high_resolution_clock::now()) { runTime = 0.; }
 40 | 
 41 | 	~TimerGuard()
 42 | 	{
 43 | 		auto endTime = std::chrono::high_resolution_clock::now();
 44 | 		auto runtime = endTime - m_startTme;
 45 | 		m_runTime = runtime.count() / billion;
 46 | 	}
 47 | };
 48 | 
 49 | 
 50 | auto getRandomShuffledVectorxxx(int SZ, int instance_number = 0)
 51 | {
 52 | 	static std::map<int, std::vector<double> > vectors;
 53 | 	int key = 10 * SZ + instance_number;
 54 | 	//store vectors with key 10 times size  and add on 0-9 integer for instance of different random vector
 55 | 
 56 | 	if (SZ < 0)
 57 | 	{
 58 | 		vectors.clear();
 59 | 		SZ = 0;
 60 | 	}
 61 | 
 62 | 
 63 | 	if (vectors.find(key) != vectors.end())
 64 | 	{
 65 | 		return vectors[key];
 66 | 	}
 67 | 	else
 68 | 	{
 69 | 		std::vector<double>  v(SZ, double(6.66));
 70 | 		for (int i = 0; i < SZ; i++) { v[i] += double(SZ / 2) + i; }
 71 | 		std::random_device rd;
 72 | 		std::mt19937 g(rd());
 73 | 		std::shuffle(begin(v), end(v), g);
 74 | 		vectors[key] = v;
 75 | 		return v;
 76 | 	}
 77 | }
 78 | 
 79 | 
 80 | auto runFunctionOverDifferentSize = [](int testRepeats, int vec_start_size, int vec_stepSZ, int vec_maxSZ, const auto& func, long testLoopSZ)
 81 | {
 82 | 
 83 | 	RunResults results;
 84 | 
 85 | 	for (int j = 0; j < testRepeats; ++j)
 86 | 	{
 87 | 		int VEC_SZ = vec_start_size;
 88 | 		for (; VEC_SZ < vec_maxSZ; VEC_SZ += vec_stepSZ)
 89 | 		{
 90 | 			func(VEC_SZ, testLoopSZ);
 91 | 		}
 92 | 	}
 93 | 	return results;
 94 | };
 95 | 
 96 | 
 97 | 
 98 | 
 99 | void doAVXMax512Dance()
100 | {
101 | 
102 | 	const int maxVectorSize = 4400;
103 | 	const int minVectorSize = 800;
104 | 	const long TEST_LOOP_SZ = 10000;
105 | 	const int vectorStepSize = 8;
106 | 	const int repeatRuns = 13;
107 | 
108 | 	getRandomShuffledVectorxxx(-1); // reset  random input vectors
109 | 
110 | 
111 | 	//avx512 lambda
112 | 	auto DR3_avx512 = [&](int SZ, long TEST_LOOP_SZ)
113 | 	{
114 | 		using namespace DRC::VecD8D;
115 | 
116 | 		double time = 0.;
117 | 		volatile  double res = 0.;
118 | 
119 | 		auto mxDbl = [](auto lhs, auto rhs) { return iff(lhs > rhs, lhs, rhs); };
120 | 
121 | 		auto v1 = getRandomShuffledVectorxxx(SZ, 0);
122 | 		VecXX vec(v1);
123 | 
124 | 
125 | 		for (long l = 0; l < TEST_LOOP_SZ; l++)
126 | 		{
127 | 			res = reduce(vec, mxDbl);
128 | 		}
129 | 
130 | 		return std::make_pair(res, time);
131 | 	};
132 | 
133 | 
134 | 	auto DR3_avx2 = [&](int SZ, long TEST_LOOP_SZ)
135 | 	{
136 | 		using namespace DRC::VecD4D;
137 | 
138 | 		double time = 0.;
139 | 		volatile  double res = 0.;
140 | 
141 | 		auto mxDbl = [](auto lhs, auto rhs) { return iff(lhs > rhs, lhs, rhs); };
142 | 
143 | 		auto v1 = getRandomShuffledVectorxxx(SZ, 0);
144 | 		VecXX vec(v1);
145 | 
146 | 
147 | 		for (long l = 0; l < TEST_LOOP_SZ; l++)
148 | 		{
149 | 			res = reduce(vec, mxDbl);
150 | 		}
151 | 
152 | 
153 | 		return std::make_pair(res, time);
154 | 
155 | 	};
156 | 
157 | 
158 | 
159 | 
160 | 	auto DR3_sse2 = [&](int SZ, long TEST_LOOP_SZ)
161 | 	{
162 | 		using namespace DRC::VecD2D;
163 | 
164 | 		double time = 0.;
165 | 		volatile  double res = 0.;
166 | 
167 | 		auto mxDbl = [](auto lhs, auto rhs) { return iff(lhs > rhs, lhs, rhs); };
168 | 
169 | 		auto v1 = getRandomShuffledVectorxxx(SZ, 0);
170 | 		VecXX vec(v1);
171 | 
172 | 
173 | 		for (long l = 0; l < TEST_LOOP_SZ; l++)
174 | 		{
175 | 			res = reduce(vec, mxDbl);
176 | 		}
177 | 
178 | 		return std::make_pair(res, time);
179 | 
180 | 	};
181 | 
182 | 
183 | 
184 | 	auto DR3_stl = [&](int SZ, long TEST_LOOP_SZ)
185 | 	{
186 | 		using namespace DRC::VecD2D;
187 | 
188 | 		double time = 0.;
189 | 		volatile  double res = 0.;
190 | 
191 | 		auto v1 = getRandomShuffledVectorxxx(SZ, 0);
192 | 
193 | 
194 | 		for (long l = 0; l < TEST_LOOP_SZ; l++)
195 | 		{
196 | 			res = *std::max_element(begin(v1), end(v1));
197 | 		}
198 | 
199 | 		return std::make_pair(res, time);
200 | 	};
201 | 
202 | 
203 | 
204 | 
205 | 	using namespace std::chrono_literals;
206 | 
207 | 	for (;;)
208 | 	{
209 | 
210 | 		double time = 0.0;
211 | 		constexpr int NUM_BURSTS = 3;
212 | 		constexpr auto SLEEP_TIME = 20000ms;
213 | 
214 | 
215 | 
216 | 
217 | 
218 | 		//STL
219 | 		for (int K = 0; K < NUM_BURSTS; K++)
220 | 		{
221 | 			time = 0.;
222 | 			std::cout << "1/3rd the work using STL max " << K + 1 << "of " << NUM_BURSTS << std::endl;
223 | 			{	TimerGuard timer(time);
224 | 			auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_stl, TEST_LOOP_SZ / 3);
225 | 			}
226 | 			std::cout << "STL " << K + 1 << " of  " << NUM_BURSTS << "    " << time << " seconds   now sleep" << std::endl;
227 | 			std::this_thread::sleep_for(SLEEP_TIME);
228 | 		}
229 | 
230 | 		std::this_thread::sleep_for(SLEEP_TIME);
231 | 
232 | 
233 | 
234 | 		//SSE2
235 | 		for (int K = 0; K < NUM_BURSTS; K++)
236 | 		{
237 | 			time = 0.;
238 | 			std::cout << "SSE2 " << K + 1 << " of  " << NUM_BURSTS << std::endl;
239 | 			{	TimerGuard timer(time);
240 | 			auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_sse2, TEST_LOOP_SZ);
241 | 			}
242 | 			std::cout << "SSE2 " << K + 1 << " of  " << NUM_BURSTS << "    " << time << " seconds   now sleep" << std::endl;
243 | 			std::this_thread::sleep_for(SLEEP_TIME);
244 | 		}
245 | 
246 | 		std::this_thread::sleep_for(SLEEP_TIME);
247 | 
248 | 
249 | 		//AVX2 
250 | 		for (int K = 0; K < NUM_BURSTS; K++)
251 | 		{
252 | 			time = 0.;
253 | 			std::cout << "AVX2 " << K + 1 << " of  " << NUM_BURSTS << std::endl;
254 | 			{	TimerGuard timer(time);
255 | 			auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_avx2, TEST_LOOP_SZ);
256 | 			}
257 | 			std::cout << "AVX2 " << K + 1 << " of  " << NUM_BURSTS << "    " << time << " seconds   now sleep" << std::endl;
258 | 			std::this_thread::sleep_for(SLEEP_TIME);
259 | 
260 | 		}
261 | 
262 | 		std::this_thread::sleep_for(SLEEP_TIME);
263 | 
264 | 		//AVX512
265 | 		for (int K = 0; K < NUM_BURSTS; K++)
266 | 		{
267 | 			time = 0.;
268 | 			std::cout << "AVX512 " << K + 1 << " of  " << NUM_BURSTS << std::endl;
269 | 			{
270 | 				TimerGuard timer(time);
271 | 				auto dr3_raw_results = runFunctionOverDifferentSize(repeatRuns, minVectorSize, vectorStepSize, maxVectorSize, DR3_avx512, TEST_LOOP_SZ);
272 | 			}
273 | 			std::cout << "AVX512 " << K + 1 << " of  " << NUM_BURSTS << "    " << time << " seconds   now sleep" << std::endl;
274 | 			std::this_thread::sleep_for(SLEEP_TIME);
275 | 		}
276 | 
277 | 		std::this_thread::sleep_for(SLEEP_TIME);
278 | 
279 | 
280 | 
281 | 
282 | 
283 | 	}
284 | 
285 | }
286 | 
287 | 
288 | 
289 | 
290 | 


--------------------------------------------------------------------------------
/dancingAVX512/AVX512Dance.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | 
4 | void doAVXMax512Dance();
5 | 
6 |  //return  iff((x - (VecXX::INS(2.0) * floor(x * VecXX::INS(0.5)))) >= VecXX::INS(1.0),			//auto oneIfOddLmbda = [&](auto x) { iff(x > VecXX::INS(0.5 * SZ), 


--------------------------------------------------------------------------------
/dancingAVX512/CMakeLists.txt:
--------------------------------------------------------------------------------
# Demo executable: entry point plus the benchmark driver.
add_executable(dancingAVX512
    dancingAVX512.cpp
    AVX512Dance.cpp
)

# Link against Vectorisation.
target_link_libraries(dancingAVX512
    PUBLIC
        Vectorisation
)

# Put the top-level build tree on the include path.
target_include_directories(dancingAVX512
    PUBLIC
        "${PROJECT_BINARY_DIR}"
)
8 | 
9 |         


--------------------------------------------------------------------------------
/dancingAVX512/dancingAVX512.cpp:
--------------------------------------------------------------------------------
 1 | // dancingAVX512.cpp : This file contains the 'main' function. Program execution begins and ends there.
 2 | //
 3 | 
 4 | 
 5 | #include <algorithm>
 6 | #include <random>
 7 | #include <numeric>
 8 | #include <iterator>
 9 | #include <iostream>
10 | #include <vector>
11 | #include <chrono>
12 | #include <iomanip>  
13 | #include <thread>
14 | #include <map>
15 | #include <cstring>
16 | 
17 | 
18 | #include "../Vectorisation/VecX/dr3.h"
19 | #include "../Vectorisation/VecX/accumulate_transform.h"
20 | #include "../Vectorisation/VecX/error_utils.h"
21 | 
22 | 
23 | #include "../Vectorisation/VecX/zip_utils.h"
24 | 
25 | //#include "norm.h"
26 | 
27 | #include "AVX512Dance.h"
28 | 
29 | // use namespace DRC::VecD8D  run this and watch power consumption
30 | // switches between AVX2 and AVX512  implementations
31 | // AVX512 uses less energy in this case
32 | 
33 | int main()
34 | {
35 |   //  std::cout << "Hello World!\n";
36 |     doAVXMax512Dance();
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/dancingAVX512/dancingAVX512.vcxproj.user:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup />
4 | </Project>


--------------------------------------------------------------------------------
/docs/BlackScholesVecXX.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/docs/BlackScholesVecXX.mp4


--------------------------------------------------------------------------------
/docs/Build.md:
--------------------------------------------------------------------------------
 1 | # Supported platforms    
 2 | OS: Windows, Ubuntu   
 3 | Compiler: Visual Studio, gcc, clang   
 4 | CPU:   
 5 | 
 6 | # Build commands
 7 | ```
 8 | mkdir build; cd build    
 9 | cmake .. -DCMAKE_BUILD_TYPE=Release
10 | cmake --build . --config Release
11 | ```
12 | 


--------------------------------------------------------------------------------
/docs/cppCon2022.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andyD123/DR3/88e9c310ea9ba3cae0f536fe01109a18222e713c/docs/cppCon2022.pdf


--------------------------------------------------------------------------------
/inverseCumNormalExample/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(inverseCumNormalExample inverseCumNormalExample.cpp cdfNormalInverse.cpp)
2 | # Link the executable against Vectorisation (presumably the library target from Vectorisation/CMakeLists.txt).
3 | target_link_libraries(inverseCumNormalExample PUBLIC Vectorisation)
4 | # Put the project's binary (build) directory on the include search path.
5 | target_include_directories(inverseCumNormalExample PUBLIC
6 |                            "${PROJECT_BINARY_DIR}"
7 |                            )
8 | 
9 |         


--------------------------------------------------------------------------------
/inverseCumNormalExample/cdfNormalInverse.cpp:
--------------------------------------------------------------------------------
  1 | #include "cdfNormalInverse.h"
  2 | 
  3 | /*
  4 | #include "../Vectorisation/VecX/vec.h"
  5 | #include "../Vectorisation/VecX/operations.h"
  6 | #include "../Vectorisation/VecX/apply_operation.h"
  7 | #include "../Vectorisation/VecX/vec_d.h"
  8 | #include "../Vectorisation/VecX/vec_bool.h"
  9 | #include "../Vectorisation/VecX/vec_view.h"
 10 | #include "../Vectorisation/VecX/target_name_space.h"
 11 | 
 12 | */
 13 | #include <algorithm>
 14 | #include <numeric>
 15 | /*
 16 | #include <immintrin.h>
 17 | 
 18 | #include <random>
 19 | 
 20 | #include <iterator>
 21 | #include <iostream>
 22 | #include <vector>
 23 | #include <chrono>
 24 | #include <iomanip>  
 25 | #include <functional>
 26 | */
 27 | 
 28 | 
 29 | // Stub that ignores its argument and always returns 0.0.
 30 | double getnull(double)
 31 | {
 32 | 	return 0.0;
 33 | }
 34 | 
 35 | 
 36 | // Quantile function (inverse CDF) of the standard normal distribution.
 37 | // Rational approximations over three regions of p, following Wichura's algorithm
 38 | // (@WichuraQuantile; see the R implementation). p == 0 and p == 1 return -inf and +inf;
 39 | // p outside [0, 1] (or NaN) yields NaN.
 40 | long double qnorm8(long double p)
 41 | {
 42 | 	// Coefficient tables: a/b for the central region, c/d for the intermediate tail, e/f for the far tail.
 43 | 	static const long double a[] = { 2509.0809287301226727 , 33430.575583588128105, 67265.770927008700853, 45921.953931549871457, 13731.693765509461125,  1971.5909503065514427, 133.14166789178437745,3.387132872796366608 };
 44 | 	static const long double b[] = { 5226.495278852854561, 28729.085735721942674,    39307.89580009271061, 21213.794301586595867, 5394.1960214247511077,   687.1870074920579083, 42.313330701600911252 };
 45 | 	static const long double c[] = { 7.7454501427834140764e-4 , .0227238449892691845833 ,.24178072517745061177, 1.27045825245236838258 ,  3.64784832476320460504, 5.7694972214606914055, 4.6303378461565452959, 1.42343711074968357734 };
 46 | 	static const long double d[] = { 1.05075007164441684324e-9 , 5.475938084995344946e-4, .0151986665636164571966, .14810397642748007459, .68976733498510000455,  1.6763848301838038494,  2.05319162663775882187,1. };
 47 | 	static const long double e[] = { 2.01033439929228813265e-7 ,   2.71155556874348757815e-5,   .0012426609473880784386, .026532189526576123093, .29656057182850489123,   1.7848265399172913358, 5.4637849111641143699, 6.6579046435011037772 };
 48 | 	static const long double f[] = { 2.04426310338993978564e-15 , 1.4215117583164458887e-7, 1.8463183175100546818e-5,  7.868691311456132591e-4, .0148753612908506148525,.13692988092273580531, .59983220655588793769, 1. };
 49 | 
 50 | 	// Exact boundaries: without this guard, log(0) below leads to inf/inf == NaN.
 51 | 	if (p == 0.0) return -HUGE_VALL;
 52 | 	if (p == 1.0) return HUGE_VALL;
 53 | 
 54 | 	long double val = 0.0;
 55 | 	long double   q = p - 0.5;
 56 | 
 57 | 	// fabsl/sqrtl/logl keep the arithmetic in long double; the unqualified fabs/sqrt/log
 58 | 	// can resolve to the double-only C functions and silently truncate.
 59 | 	if (fabsl(q) <= .425)
 60 | 	{
 61 | 		long double r = .180625 - q * q;
 62 | 		val =
 63 | 			q * (((((((r * a[0] +
 64 | 				a[1]) * r + a[2]) * r +
 65 | 				a[3]) * r + a[4]) * r +
 66 | 				a[5]) * r + a[6]) * r +
 67 | 				a[7])
 68 | 			/ (((((((r * b[0] +
 69 | 				b[1]) * r + b[2]) * r +
 70 | 				b[3]) * r + b[4]) * r +
 71 | 				b[5]) * r + b[6]) * r + 1.);
 72 | 	}
 73 | 	else
 74 | 	{ // closer than 0.075 from {0,1} boundary
 75 | 		// r = min(p, 1-p) < 0.075
 76 | 		long double  r = std::min(p, 1 - p);
 77 | 		r = sqrtl(-logl(r));
 78 | 
 79 | 		// <==> min(p,1-p) >= exp(-25) ~= 1.3888e-11
 80 | 		if (r <= 5.)
 81 | 		{
 82 | 			r += -1.6;
 83 | 			val = (((((((r * c[0] + c[1]) * r + c[2]) * r + c[3]) * r + c[4]) * r + c[5]) * r + c[6]) * r + c[7])
 84 | 				/ (((((((r * d[0] + d[1]) * r + d[2]) * r + d[3]) * r + d[4]) * r + d[5]) * r + d[6]) * r + 1.);
 85 | 		}
 86 | 		else
 87 | 		{ // very close to  0 or 1
 88 | 			r += -5.;
 89 | 			val = (((((((r * e[0] + e[1]) * r + e[2]) * r + e[3]) * r + e[4]) * r + e[5]) * r + e[6]) * r + e[7])
 90 | 				/ (((((((r * f[0] + f[1]) * r + f[2]) * r + f[3]) * r + f[4]) * r + f[5]) * r + f[6]) * r + 1.);
 91 | 		}
 92 | 
 93 | 		// The tail polynomials give the magnitude; the sign follows the side of the median.
 94 | 		long double valMult = (q < 0.0) ? -1.0 : 1.0;
 95 | 		val *= valMult;
 96 | 	}
 97 | 
 98 | 	return val;
 99 | }
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/inverseCumNormalExample/inverseCumNormalExample.log:
--------------------------------------------------------------------------------
1 | ﻿C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Current\Bin\Microsoft.Common.CurrentVersion.targets(820,5): error : The BaseOutputPath/OutputPath property is not set for project 'inverseCumNormalExample.vcxproj'.  Please check to make sure that you have specified a valid combination of Configuration and Platform for this project.  Configuration='Debug'  Platform='ARM64'.  This error may also appear if some other project is trying to follow a project-to-project reference to this project, this project has been unloaded or is not included in the solution, and the referencing project does not build using the same or an equivalent Configuration or Platform.
2 | 


--------------------------------------------------------------------------------
/inverseCumNormalExample/inverseCumNormalExample.vcxproj.filters:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup>
 4 |     <Filter Include="Source Files">
 5 |       <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
 6 |       <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
 7 |     </Filter>
 8 |     <Filter Include="Header Files">
 9 |       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
10 |       <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
11 |     </Filter>
12 |     <Filter Include="Resource Files">
13 |       <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
14 |       <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
15 |     </Filter>
16 |   </ItemGroup>
17 |   <ItemGroup>
18 |     <ClCompile Include="inverseCumNormalExample.cpp">
19 |       <Filter>Source Files</Filter>
20 |     </ClCompile>
21 |     <ClCompile Include="cdfNormalInverse.cpp">
22 |       <Filter>Source Files</Filter>
23 |     </ClCompile>
24 |   </ItemGroup>
25 |   <ItemGroup>
26 |     <ClInclude Include="cdfNormalInverse.h">
27 |       <Filter>Header Files</Filter>
28 |     </ClInclude>
29 |   </ItemGroup>
30 | </Project>


--------------------------------------------------------------------------------
/inverseCumNormalExample/inverseCumNormalExample.vcxproj.user:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup />
4 | </Project>


--------------------------------------------------------------------------------
/lattice/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(lattice lattice.cpp
 2 |   americanCrankNicholsonPricer.cpp
 3 |   americanFinitDiffPricer.cpp
 4 |   americanImplicitFiniteDiff.cpp
 5 |   americanTrinomialPricer.cpp
 6 |   americanTrinomialPricerUpAndOut.cpp
 7 |   europeanBinomialPricer.cpp
 8 |   euroTrinomial.cpp
 9 |   euroTrinomialPricerWithInit.cpp
10 |   lattice_tools.cpp
11 |   )
12 | # Link the executable against Vectorisation (presumably the library target from Vectorisation/CMakeLists.txt).
13 | target_link_libraries(lattice PUBLIC Vectorisation)
14 | # Put the project's binary (build) directory on the include search path.
15 | target_include_directories(lattice PUBLIC
16 |                            "${PROJECT_BINARY_DIR}"
17 |                            )
18 | 
19 |         


--------------------------------------------------------------------------------
/lattice/americanCrankNicholsonPricer.cpp:
--------------------------------------------------------------------------------
  1 | #include "../Vectorisation/VecX/dr3.h"
  2 | #include "utils.h"
  3 | #include "pricers.h"
  4 | // Finite-difference pricer named for the Crank-Nicolson scheme, on a log-price grid of 2*N+1 nodes.
  5 | // Uses the call payoff; both early-exercise ("american condition") loops below are commented out.
  6 | // NOTE: the original author flagged this routine as "still broken ???" -- treat results as unverified.
  7 | double americanCrankNicholsonPricer(double S, double K, double sig, double r, double T, int N)
  8 | {
  9 | 	// Returns the value at grid node N, the node whose asset price is set to S below.
 10 | 	//dividend yield
 11 | 	double y = 0.;// 0.03;// 0.0;// 0.03;// 0.03; //div yield
 12 | 	VecXX terminalAssetPrices(1.0, 2 * N + 1);
 13 | 	// Time step Dt, log-price grid spacing Dx, and v = r - y - sig^2/2.
 14 | 	double Dt = T / N;
 15 | 	double Dx = sig * std::sqrt(1.0 * Dt);// 0.2;//
 16 | 	double v = r - y - 0.5 * sig * sig;
 17 | 
 18 | 	// Coefficients applied to the upper (pu), lower (pd) and middle (pm) neighbours in the tridiagonal system.
 19 | 	VecXX::SCALA_TYPE pu = -0.25 * Dt * ((sig * sig) / (Dx * Dx) + v / Dx);
 20 | 	VecXX::SCALA_TYPE pd = -0.25 * Dt * ((sig * sig) / (Dx * Dx) - v / Dx);
 21 | 	VecXX::SCALA_TYPE pm = 1. + 0.5 * Dt * (sig * sig) / (Dx * Dx) + 0.5 * r * Dt;
 22 | 	// vdbg is a debug scratch vector; in this function it is only referenced from commented-out lines.
 23 | 	std::vector<FLOAT> vdbg;
 24 | 	//Pay off functions
 25 | 
 26 | 	//call
 27 | 	auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); };
 28 | 
 29 | 	//put
 30 | 	//auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); };
 31 | 
 32 | 	// Asset prices at maturity: element i becomes S * exp((i - N) * Dx), so element N equals S.
 33 | 	double last = S * exp(-(N + 1) * Dx);
 34 | 	double edx = exp(Dx);
 35 | 	for (auto& el : terminalAssetPrices)
 36 | 	{
 37 | 		last *= edx;
 38 | 		el = last;
 39 | 	}
 40 | 
 41 | 	//option value at maturity
 42 | 
 43 | 	auto excerciseValue = transform(payOffFunc, terminalAssetPrices);
 44 | 
 45 | 	// NOTE(review): lambda_L = -(S[1] - S[0]) with lambda_U = 0 looks like a put-style boundary while the call payoff is active -- confirm.
 46 | 	//derivative boundary condition
 47 | 	double  lambda_L = -1. * (terminalAssetPrices[1] - terminalAssetPrices[0]);
 48 | 	double  lambda_U = 0.0;
 49 | 
 50 | 	auto odd_slice = excerciseValue;
 51 | 
 52 | 	//set up slices
 53 | 	auto even_slice = odd_slice * 0.0;
 54 | 	even_slice[0] = odd_slice[0];
 55 | 
 56 | 	int J = 2 * N;
 57 | 	int k = 0;
 58 | 	// Work arrays for the tridiagonal solve: pmp holds the modified diagonal, pp the modified right-hand side.
 59 | 	VecXX pmp(1.0, J + 1);
 60 | 	VecXX pp(1.0, J + 1);
 61 | 	// Each pass performs two solves (odd -> even, then even -> odd). NOTE(review): the sibling pricers loop with k < N, this one with k <= N.
 62 | 	for (; k <= N; k += 2)
 63 | 	{
 64 | 
 65 | 		// Solve the implicit tridiagonal system in line: substitute the boundary condition at j = -n into j = -n+1.
 66 | 		
 67 | 		pmp[1] = pm + pd;
 68 | 		pp[1] = -pu * odd_slice[2] - (pm - 2.) * odd_slice[1] - pd * odd_slice[0] + pd * lambda_L;
 69 | 
 70 | 
 71 | 		// eliminate upper diagonal
 72 | 		for (int j = 2; j < J; ++j)
 73 | 		{
 74 | 			pmp[j] = pm - pu * pd / pmp[j - 1];
 75 | 			pp[j] = -pu * odd_slice[j + 1] - (pm - 2.0) * odd_slice[j] - pd * odd_slice[j - 1] - pp[j - 1] * pd / pmp[j - 1];
 76 | 		}
 77 | 		// Upper boundary: value at node J from the last eliminated row, then node J-1 from the derivative condition.
 78 | 		even_slice[J] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]);
 79 | 		even_slice[J - 1] = even_slice[J] - lambda_U;
 80 | 
 81 | 		// NOTE(review): the loop below starts at J-1 (overwriting the line above) and runs to j = 0, where pmp[0]/pp[0] still hold their initial 1.0.
 82 | 		// back substitution
 83 | 		for (int j = J - 1; j >= 0; j--)
 84 | 		{
 85 | 			even_slice[j] = (pp[j] - pu * even_slice[j + 1]) / pmp[j];
 86 | 		}
 87 | 
 88 | 		// Node 0 is then reset to the previous slice's value, discarding the j = 0 result above.
 89 | 		even_slice[0] = odd_slice[0];
 90 | 		//vdbg = even_slice;
 91 | /*
 92 | 		//american condition
 93 | 		for (int j = 0; j < (J+1); j++)
 94 | 		{
 95 | 			even_slice[j] = std::max(even_slice[j], excerciseValue[j]);
 96 | 		}
 97 | */
 98 | 
 99 | //	vdbg = even_slice;
100 | 
101 | //	vdbg = odd_slice;
102 | //	vdbg = pmp;
103 | //	vdbg = pp;
104 | 
105 | 
106 | 	// Second solve of the pass: same steps with the roles of even_slice and odd_slice swapped.
107 | //////////////////////////////////////////////////////////////////////
108 | 
109 | 		pmp[1] = pm + pd;
110 | 		pp[1] = -pu * even_slice[2] - (pm - 2.) * even_slice[1] - pd * even_slice[0] + pd * lambda_L;
111 | 
112 | 
113 | 		// eliminate upper diagonal
114 | 		for (int j = 2; j < J; ++j)
115 | 		{
116 | 			pmp[j] = pm - pu * pd / pmp[j - 1];
117 | 			pp[j] = -pu * even_slice[j + 1] - (pm - 2.0) * even_slice[j] - pd * even_slice[j - 1] - pp[j - 1] * pd / pmp[j - 1];
118 | 		}
119 | 
120 | 		odd_slice[J] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]);
121 | 		odd_slice[J - 1] = odd_slice[J] - lambda_U;
122 | 
123 | 
124 | 		// back substitution
125 | 		for (int j = J - 1; j >= 0; j--)
126 | 		{
127 | 			odd_slice[j] = (pp[j] - pu * odd_slice[j + 1]) / pmp[j];
128 | 		}
129 | 
130 | 		odd_slice[0] = even_slice[0];
131 | 		/*
132 | 		//american condition
133 | 		for (int j = 0; j < (J + 1); j++)
134 | 		{
135 | 			odd_slice[j] = std::max(odd_slice[j], excerciseValue[j]);
136 | 		}
137 | 			*/
138 | 
139 | 			//	vdbg = odd_slice;
140 | 			//	vdbg = even_slice;
141 | 			//	vdbg = pmp;
142 | 			//	vdbg = pp;
143 | 
144 | 	}
145 | 
146 | 	return odd_slice[N];
147 | }
148 | 
149 | 


--------------------------------------------------------------------------------
/lattice/americanFinitDiffPricer.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "../Vectorisation/VecX/dr3.h"
 3 | #include "utils.h"
 4 | #include "pricers.h"
 5 | // Explicit finite-difference pricer with an early-exercise step, on a log-price grid of 2*N+1 nodes.
 6 | // Uses the call payoff and returns the value at grid node N, whose asset price is set to S below.
 7 | double americanFiniteDiffPricer(double S, double K, double sig, double r, double T, int N)
 8 | {
 9 | 
10 | 	//dividend yield
11 | 	double y = 0.0;
12 | 	VecXX terminalAssetPrices(1.0, 2 * N + 1);
13 | 	// Time step Dt, log-price grid spacing Dx, and v = r - y - sig^2/2.
14 | 	double Dt = T / N;
15 | 	double Dx = sig * std::sqrt(2.0 * Dt);
16 | 	double v = r - y - 0.5 * sig * sig;
17 | 
18 | 	// Weights for the upper (pu), lower (pd) and middle (pm) neighbours; pm also carries the -r*Dt term.
19 | 	VecXX::INS pu = 0.5 * Dt * ((sig * sig) / (Dx * Dx) + v / Dx);
20 | 	VecXX::INS pd = 0.5 * Dt * ((sig * sig) / (Dx * Dx) - v / Dx);
21 | 	VecXX::INS pm = 1. - Dt * (sig * sig) / (Dx * Dx) - r * Dt;
22 | 
23 | 
24 | 
25 | 	TrinomialSampler<VecXX::INS> sampler;
26 | 	// The sampler introduces offset variables so that we get vectorised versions
27 | 	// of X[i+1], X[i] and X[i-1].
28 | 	// Under the hood these do unaligned loads into registers so that we can still
29 | 	// apply the vectorised roll-back.
30 | 	// NOTE(review): here X_1 is weighted by pu; the trinomial pricers weight X_Minus_1 by pu -- confirm which is intended.
31 | 	auto trinomialRollBack = [=](TrinomialSampler<VecXX::INS>& sampler)
32 | 	{
33 | 		const auto& X1 = sampler.X_1.value;
34 | 		const auto& X0 = sampler.X_0.value;
35 | 		const auto& X_1 = sampler.X_Minus_1.value;
36 | 		return  (X1 * pu + X0 * pm + X_1 * pd);
37 | 	};
38 | 
39 | 	//Pay off functions
40 | 	//call
41 | 	auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); };
42 | 
43 | 	//put
44 | 	//auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); };
45 | 
46 | 	// Asset prices at maturity: element i becomes S * exp((i - N) * Dx), so element N equals S.
47 | 	double last = S * exp(-(N + 1) * Dx);
48 | 	double edx = exp(Dx);
49 | 	for (auto& el : terminalAssetPrices)
50 | 	{
51 | 		last *= edx;
52 | 		el = last;
53 | 	}
54 | 	// Payoff at maturity; it is also the early-exercise value compared against at every step.
55 | 	auto excerciseValue = transform(payOffFunc, terminalAssetPrices);
56 | 	auto odd_slice = excerciseValue;
57 | 
58 | 	UnitarySampler<VecXX::INS> identity_sampler; //identity pass through
59 | 
60 | 	// This is the American part: take the larger of the rolled-back value and the exercise value.
61 | 	auto applyEarlyExcercise = [=](UnitarySampler<VecXX::INS>& sampler, auto excercisePrice)
62 | 	{
63 | 		auto optPrice = sampler.X_0.value; //.get<0>();
64 | 		return max(optPrice, excercisePrice);
65 | 	};
66 | 
67 | 	auto even_slice = odd_slice * 0.0;
68 | 	// Each pass performs two time steps (odd -> even, then even -> odd), so an odd N yields N + 1 steps.
69 | 	int J = 2 * N;
70 | 	int k = 0;
71 | 	for (; k < N; k += 2)
72 | 	{
73 | 		// Roll back one step; presumably the trailing 0, J arguments bound the index range processed -- TODO confirm.
74 | 		transform(odd_slice, even_slice, trinomialRollBack, sampler, 0, J);
75 | 		// NOTE(review): slope -1 at the lower edge and a flat upper edge look put-style while the call payoff is active -- confirm.
76 | 		//apply boundary condition
77 | 		even_slice[0] = even_slice[1] + terminalAssetPrices[1] - terminalAssetPrices[0];
78 | 		even_slice[J] = even_slice[J - 1];
79 | 		// Apply early exercise; the identity sampler just passes values straight through.
80 | 		transform(even_slice, excerciseValue, even_slice, applyEarlyExcercise, identity_sampler, 0, J);
81 | 
82 | 		// Second time step of the pass, from even_slice back into odd_slice.
83 | 		transform(even_slice, odd_slice, trinomialRollBack, sampler, 0, J);
84 | 		//boundary condition
85 | 		odd_slice[0] = odd_slice[1] + terminalAssetPrices[1] - terminalAssetPrices[0];
86 | 		odd_slice[J] = odd_slice[J - 1];
87 | 		transform(odd_slice, excerciseValue, odd_slice, applyEarlyExcercise, identity_sampler, 0, J);
88 | 
89 | 	}
90 | 
91 | 	return odd_slice[N];
92 | }
93 | 


--------------------------------------------------------------------------------
/lattice/americanImplicitFiniteDiff.cpp:
--------------------------------------------------------------------------------
  1 | #include "../Vectorisation/VecX/dr3.h"
  2 | #include "../Vectorisation/VecX/zip_utils.h"
  3 | #include "utils.h"
  4 | #include "pricers.h"
  5 | // Implicit finite-difference pricer with early exercise; variant of americanImplicitFiniteDiffPricer that
  6 | // hoists the slice-independent part of the tridiagonal elimination (pmp and its reciprocals) out of the time loop.
  7 | double americanImplicitFiniteDiffPricerFast(double S, double K, double sig, double r, double T, int N)
  8 | {
  9 | 	// Uses the call payoff and returns the value at grid node N, whose asset price is set to S below.
 10 | 
 11 | 	double y = 0.0;//dividend yield
 12 | 
 13 | 	VecXX terminalAssetPrices(1.0, 2 * N + 1);
 14 | 	// Time step Dt, log-price grid spacing Dx, and v = r - y - sig^2/2.
 15 | 	double Dt = T / N;
 16 | 	double Dx = sig * std::sqrt(2.0 * Dt);
 17 | 	double v = r - y - 0.5 * sig * sig;
 18 | 
 19 | 	// Coefficients applied to the upper (pu), lower (pd) and middle (pm) neighbours in the tridiagonal system.
 20 | 	VecXX::SCALA_TYPE pu = -0.5 * Dt * ((sig * sig) / (Dx * Dx) + v / Dx);
 21 | 	VecXX::SCALA_TYPE pd = -0.5 * Dt * ((sig * sig) / (Dx * Dx) - v / Dx);
 22 | 	VecXX::SCALA_TYPE pm = 1. + Dt * (sig * sig) / (Dx * Dx) + r * Dt;
 23 | 
 24 | 	// vdbg is a debug scratch vector; in this function it is only referenced from a commented-out line.
 25 | 	std::vector<FLOAT> vdbg;
 26 | 	//Pay off functions
 27 | 
 28 | 	//call
 29 | 	auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); };
 30 | 
 31 | 	//put
 32 | 	//auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); };
 33 | 
 34 | 	// Asset prices at maturity: element i becomes S * exp((i - N) * Dx), so element N equals S.
 35 | 	double last = S * exp(-(N + 1) * Dx);
 36 | 	double edx = exp(Dx);
 37 | 	for (auto& el : terminalAssetPrices)
 38 | 	{
 39 | 		last *= edx;
 40 | 		el = last;
 41 | 	}
 42 | 
 43 | 	//option value at maturity
 44 | 	auto excerciseValue = transform(payOffFunc, terminalAssetPrices);
 45 | 	// Early-exercise rule: element-wise larger of continuation value X and exercise value Y.
 46 | 	auto american = [](auto X, auto Y) { return select(X > Y, X, Y); };
 47 | 
 48 | 	// NOTE(review): lambda_L = -(S[1] - S[0]) with lambda_U = 0 looks like a put-style boundary while the call payoff is active -- confirm.
 49 | 	//derivative boundary condition
 50 | 	double  lambda_L = -1. * (terminalAssetPrices[1] - terminalAssetPrices[0]);
 51 | 	double  lambda_U = 0.0;
 52 | 
 53 | 	auto odd_slice = excerciseValue;
 54 | 	//	vdbg = odd_slice;
 55 | 
 56 | 
 57 | 	auto even_slice = odd_slice * 0.0;
 58 | 
 59 | 	int J = 2 * N;
 60 | 	int k = 0;
 61 | 	// Work arrays for the tridiagonal solve: pmp holds the modified diagonal, pp the modified right-hand side.
 62 | 	VecXX pmp(1.0, J + 1);
 63 | 	VecXX pp(1.0, J + 1);
 64 | 
 65 | 	////////////
 66 | 	// Loop-hoisted parts of the implicit tridiagonal solve: pmp depends only on pu, pd and pm, not on the slice.
 67 | 
 68 | 	pmp[1] = pm + pd;
 69 | 	pp[1] = odd_slice[1] + pd * lambda_L;
 70 | 
 71 | 
 72 | 	auto pu_pd = pu * pd;
 73 | 
 74 | 	// eliminate upper diagonal
 75 | 	for (int j = 2; j < J; ++j)
 76 | 	{
 77 | 		pmp[j] = pm - pu_pd / pmp[j - 1];
 78 | 	}
 79 | 	// Precompute reciprocals so the per-slice loops multiply instead of divide.
 80 | 	auto inv_pmp = 1.0 / pmp;
 81 | 
 82 | 	auto pd_inv_pmp = pd * inv_pmp;
 83 | 
 84 | 	/////////////
 85 | 	// Each pass performs two solves (odd -> even, then even -> odd), so an odd N yields N + 1 time steps.
 86 | 	for (; k < N; k += 2)
 87 | 	{
 88 | 
 89 | 		// Solve the implicit tridiagonal system in line: substitute the boundary condition at j = -n into j = -n+1.
 90 | 
 91 | 		//pmp[1] = pm + pd;
 92 | 		pp[1] = odd_slice[1] + pd * lambda_L;
 93 | 
 94 | 		// eliminate upper diagonal
 95 | 		for (int j = 2; j < J; ++j)
 96 | 		{
 97 | 
 98 | 			pp[j] = odd_slice[j] - pp[j - 1] * pd_inv_pmp[j - 1];
 99 | 		}
100 | 		// NOTE(review): writes index 1 (overwritten by the back substitution) where the Crank-Nicolson pricer writes index J; even_slice[J] is read next but never set by the solve -- confirm.
101 | 		even_slice[1] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]);
102 | 		even_slice[J - 1] = even_slice[J] - lambda_U;
103 | 
104 | 
105 | 		// back substitution
106 | 		for (int j = J - 2; j != 0; j--)
107 | 		{
108 | 			even_slice[j] = (pp[j] - pu * even_slice[j + 1]) * inv_pmp[j];
109 | 		}
110 | 
111 | 		// American exercise: keep the larger of the solved value and the exercise value.
112 | 		even_slice = transform(american, even_slice, (const VecXX&)excerciseValue);
113 | 
114 | 
115 | 		// Now calculate the odd slice: same steps with the roles of the two slices swapped.
116 | 
117 | 		pp[1] = even_slice[1] + pd * lambda_L;
118 | 
119 | 		// eliminate upper diagonal
120 | 		for (int j = 2; j < J; ++j)
121 | 		{
122 | 			pp[j] = even_slice[j] - pp[j - 1] * pd_inv_pmp[j - 1];
123 | 		}
124 | 		// NOTE(review): same index-1 versus index-J question as for even_slice above.
125 | 		odd_slice[1] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]);
126 | 		odd_slice[J - 1] = odd_slice[J] - lambda_U;
127 | 
128 | 
129 | 		// back substitution
130 | 		for (int j = J - 2; j != 0; j--)
131 | 		{
132 | 			odd_slice[j] = (pp[j] - pu * odd_slice[j + 1]) * inv_pmp[j];
133 | 		}
134 | 
135 | 		// American exercise: keep the larger of the solved value and the exercise value.
136 | 		odd_slice = transform(american, odd_slice, (const VecXX&)excerciseValue);
137 | 	}
138 | 
139 | 	return odd_slice[N];
140 | }
141 | // Implicit finite-difference pricer with early exercise; recomputes the full tridiagonal elimination for every slice.
142 | double americanImplicitFiniteDiffPricer(double S, double K, double sig, double r, double T, int N)
143 | {
144 | 	// Uses the call payoff and returns the value at grid node N, whose asset price is set to S below.
145 | 	//dividend yield
146 | 	double y = 0.0;// 0.03;// 0.03; //div yield
147 | 	VecXX terminalAssetPrices(1.0, 2 * N + 1);
148 | 	// Time step Dt, log-price grid spacing Dx, and v = r - y - sig^2/2.
149 | 	double Dt = T / N;
150 | 	double Dx = sig * std::sqrt(2.0 * Dt);
151 | 	double v = r - y - 0.5 * sig * sig;
152 | 
153 | 	//double  u = Dx;
154 | 	//double d = 1. / u;
155 | 
156 | 	// Coefficients applied to the upper (pu), lower (pd) and middle (pm) neighbours in the tridiagonal system.
157 | 	VecXX::SCALA_TYPE pu = -0.5 * Dt * ((sig * sig) / (Dx * Dx) + v / Dx);
158 | 	VecXX::SCALA_TYPE pd = -0.5 * Dt * ((sig * sig) / (Dx * Dx) - v / Dx);
159 | 	VecXX::SCALA_TYPE pm = 1. + Dt * (sig * sig) / (Dx * Dx) + r * Dt;
160 | 
161 | 	// vdbg is a debug copy; it is assigned once below and not read again in this function.
162 | 	std::vector<FLOAT> vdbg;
163 | 	//Pay off functions
164 | 
165 | 	//call
166 | 	auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); };
167 | 
168 | 	//put
169 | 	//auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); };
170 | 
171 | 	// Asset prices at maturity: element i becomes S * exp((i - N) * Dx), so element N equals S.
172 | 	double last = S * exp(-(N + 1) * Dx);
173 | 	double edx = exp(Dx);
174 | 	for (auto& el : terminalAssetPrices)
175 | 	{
176 | 		last *= edx;
177 | 		el = last;
178 | 	}
179 | 
180 | 	//option value at maturity
181 | 
182 | 	auto excerciseValue = transform(payOffFunc, terminalAssetPrices);
183 | 	// NOTE(review): lambda_L = -(S[1] - S[0]) with lambda_U = 0 looks like a put-style boundary while the call payoff is active -- confirm.
184 | 	//derivative boundary condition
185 | 	double  lambda_L = -1. * (terminalAssetPrices[1] - terminalAssetPrices[0]);
186 | 	double  lambda_U = 0.0;
187 | 
188 | 	auto odd_slice = excerciseValue;
189 | 	vdbg = odd_slice;
190 | 
191 | 
192 | 	auto even_slice = odd_slice * 0.0;
193 | 
194 | 	int J = 2 * N;
195 | 	int k = 0;
196 | 	// Work arrays for the tridiagonal solve: pmp holds the modified diagonal, pp the modified right-hand side.
197 | 	VecXX pmp(1.0, J + 1);
198 | 	VecXX pp(1.0, J + 1);
199 | 	// Each pass performs two solves (odd -> even, then even -> odd), so an odd N yields N + 1 time steps.
200 | 	for (; k < N; k += 2)
201 | 	{
202 | 
203 | 		// Solve the implicit tridiagonal system in line:
204 | 		// substitute the boundary condition at j = -n into j = -n+1.
205 | 		pmp[1] = pm + pd;
206 | 		pp[1] = odd_slice[1] + pd * lambda_L;
207 | 
208 | 
209 | 
210 | 		// eliminate upper diagonal
211 | 		for (int j = 2; j < J; ++j)
212 | 		{
213 | 			pmp[j] = pm - pu * pd / pmp[j - 1];
214 | 			pp[j] = odd_slice[j] - pp[j - 1] * pd / pmp[j - 1];
215 | 		}
216 | 		// NOTE(review): writes index 1 (overwritten by the back substitution) where the Crank-Nicolson pricer writes index J; even_slice[J] is read next but never set by the solve -- confirm.
217 | 		even_slice[1] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]);
218 | 		even_slice[J - 1] = even_slice[J] - lambda_U;
219 | 
220 | 
221 | 		// back substitution
222 | 		for (int j = J - 2; j != 0; j--)
223 | 		{
224 | 			even_slice[j] = (pp[j] - pu * even_slice[j + 1]) / pmp[j];
225 | 		}
226 | 
227 | 
228 | 
229 | 		// American exercise: keep the larger of the solved value and the exercise value at every node.
230 | 		for (int j = 0; j < (J + 1); j++)
231 | 		{
232 | 			even_slice[j] = std::max(even_slice[j], excerciseValue[j]);
233 | 		}
234 | 
235 | 
236 | 		//even_slice =	transform(even_slice, excerciseValue);
237 | 
238 | 		// Second solve of the pass: same steps with the roles of even_slice and odd_slice swapped.
239 | 		//////////////////////////////////////////////////////////////////////
240 | 
241 | 		pmp[1] = pm + pd;
242 | 		pp[1] = even_slice[1] + pd * lambda_L;
243 | 
244 | 
245 | 		// eliminate upper diagonal
246 | 		for (int j = 2; j < J; ++j)
247 | 		{
248 | 			pmp[j] = pm - pu * pd / pmp[j - 1];
249 | 			pp[j] = even_slice[j] - pp[j - 1] * pd / pmp[j - 1];
250 | 		}
251 | 		// NOTE(review): same index-1 versus index-J question as for even_slice above.
252 | 		odd_slice[1] = (pp[J - 1] + pmp[J - 1] * lambda_U) / (pu + pmp[J - 1]);
253 | 		odd_slice[J - 1] = odd_slice[J] - lambda_U;
254 | 
255 | 
256 | 		// back substitution
257 | 		for (int j = J - 2; j != 0; j--)
258 | 		{
259 | 			odd_slice[j] = (pp[j] - pu * odd_slice[j + 1]) / pmp[j];
260 | 		}
261 | 
262 | 
263 | 		// American exercise: keep the larger of the solved value and the exercise value at every node.
264 | 		for (int j = 0; j < (J + 1); j++)
265 | 		{
266 | 			odd_slice[j] = std::max(odd_slice[j], excerciseValue[j]);
267 | 		}
268 | 
269 | 
270 | 	}
271 | 
272 | 	return odd_slice[N];
273 | }
274 | 


--------------------------------------------------------------------------------
/lattice/americanTrinomialPricer.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "../Vectorisation/VecX/dr3.h"
 3 | #include "utils.h"
 4 | #include "pricers.h"
 5 | // Trinomial-tree pricer with early exercise, on a log-price grid of 2*N+1 nodes.
 6 | // Uses the call payoff and returns the value at grid node N, whose asset price is set to S below.
 7 | double americanTrinomialPricer(double S, double K, double sig, double r, double T, int N)
 8 | {
 9 | 
10 | 	double y = 0.0;// 0.03; //div yield
11 | 
12 | 
13 | 	VecXX terminalAssetPrices(1.0, 2 * N + 1);
14 | 	// Time step Dt, log-price grid spacing Dx, and v = r - y - sig^2/2.
15 | 	double Dt = T / N;
16 | 	double Dx = sig * std::sqrt(2.0 * Dt);
17 | 	double v = r - y - 0.5 * sig * sig;
18 | 
19 | 	// Branch weights pu, pd and pm; by construction pu + pm + pd == 1.
20 | 	VecXX::INS pu = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) + (v * Dt) / Dx);
21 | 	VecXX::INS pd = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) - (v * Dt) / Dx);
22 | 	VecXX::INS pm = 1. - (Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx);
23 | 
24 | 	// One-step discount factor.
25 | 	VecXX::INS disc = exp(-r * Dt);
26 | 	TrinomialSampler<VecXX::INS> sampler;
27 | 
28 | 	// NOTE(review): the local X1 reads sampler.X_Minus_1 and X_1 reads sampler.X_1, so pu weights X_Minus_1 -- confirm the intended orientation.
29 | 	auto trinomialRollBack = [=](TrinomialSampler<VecXX::INS>& sampler)
30 | 	{
31 | 		auto X1 = sampler.X_Minus_1.value;
32 | 		auto X0 = sampler.X_0.value;
33 | 		auto X_1 = sampler.X_1.value;
34 | 		return disc * (X1 * pu + X0 * pm + X_1 * pd);
35 | 	};
36 | 
37 | 
38 | 	//call
39 | 	auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); };
40 | 
41 | 	//put
42 | 	//auto payOffFunc = [=](auto X) { return select(X < K, K -X , 0.0); };
43 | 
44 | 	// Asset prices at maturity: element i becomes S * exp((i - N) * Dx), so element N equals S.
45 | 	double last = S * exp(-(N + 1) * Dx);
46 | 	double edx = exp(Dx);
47 | 	for (auto& el : terminalAssetPrices)
48 | 	{
49 | 		last *= edx;
50 | 		el = last;
51 | 	}
52 | 	// Payoff at maturity; it is also the early-exercise value compared against at every step.
53 | 	auto excerciseValue = transform(payOffFunc, terminalAssetPrices);
54 | 	auto odd_slice = excerciseValue;
55 | 
56 | 	UnitarySampler<VecXX::INS> identity_sampler; //identity just  passes through
57 | 	// Early exercise: take the larger of the rolled-back value and the exercise value.
58 | 	auto applyEarlyExcercise = [=](UnitarySampler<VecXX::INS>& sampler, auto excercisePrice)
59 | 	{
60 | 		auto optPrice = sampler.X_0.value; //.get<0>();
61 | 		return max(optPrice, excercisePrice);
62 | 	};
63 | 
64 | 
65 | 	auto even_slice = odd_slice;
66 | 	// i and j are passed to transform as index bounds (presumably the range processed -- TODO confirm); the window narrows by one node per side per step.
67 | 	int j = 2 * N + 1 - 1;
68 | 	int i = 0;
69 | 	for (; i < N; i += 2)
70 | 	{
71 | 		transform(odd_slice, even_slice, trinomialRollBack, sampler, i, j);
72 | 		// Apply early exercise; the identity sampler just passes values straight through.
73 | 		transform(even_slice, excerciseValue, even_slice, applyEarlyExcercise, identity_sampler, i, j);
74 | 		// Second time step of the pass, from even_slice back into odd_slice on the narrower window.
75 | 		transform(even_slice, odd_slice, trinomialRollBack, sampler, i + 1, j - 1);
76 | 		transform(odd_slice, excerciseValue, odd_slice, applyEarlyExcercise, identity_sampler, i + 1, j - 1);
77 | 
78 | 		j -= 2;
79 | 	}
80 | 
81 | 	return odd_slice[N];
82 | }
83 | 


--------------------------------------------------------------------------------
/lattice/americanTrinomialPricerUpAndOut.cpp:
--------------------------------------------------------------------------------
 1 | #include "../Vectorisation/VecX/dr3.h"
 2 | #include "utils.h"
 3 | #include "pricers.h"
 4 | 
 5 | // Trinomial-tree pricer for an up-and-out barrier option with barrier H and rebate, using the put payoff.
 6 | // NOTE: despite the name, the early-exercise transforms are commented out, so no early exercise is applied.
 7 | double americanTrinomialPricerUpAndOut(double S, double K, double sig, double r, double T, double H, double rebate, int N)
 8 | {
 9 | 
10 | 	double y = 0.0;// 0.03; //div yield
11 | 
12 | 
13 | 	VecXX terminalAssetPrices(1.0, 2 * N + 1);
14 | 	// Time step Dt, log-price grid spacing Dx, and v = r - y - sig^2/2.
15 | 	double Dt = T / N;
16 | 	double Dx = sig * std::sqrt(2.0 * Dt);
17 | 	double v = r - y - 0.5 * sig * sig;
18 | 
19 | 	//double  u = Dx;
20 | 	//double d = 1. / u;
21 | 
22 | 	// Branch weights pu, pd and pm; by construction pu + pm + pd == 1.
23 | 	VecXX::INS pu = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) + (v * Dt) / Dx);
24 | 	VecXX::INS pd = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) - (v * Dt) / Dx);
25 | 	VecXX::INS pm = 1. - (Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx);
26 | 
27 | 	// One-step discount factor.
28 | 	VecXX::INS disc = exp(-r * Dt);
29 | 	TrinomialSampler<VecXX::INS> sampler;
30 | 	// NOTE(review): the local X1 reads sampler.X_Minus_1 and X_1 reads sampler.X_1, so pu weights X_Minus_1 -- confirm the intended orientation.
31 | 	auto trinomialRollBack = [=](TrinomialSampler<VecXX::INS>& sampler)
32 | 	{
33 | 
34 | 		auto X1 = sampler.X_Minus_1.value;
35 | 		auto X0 = sampler.X_0.value;
36 | 		auto X_1 = sampler.X_1.value;
37 | 
38 | 		return disc * (X1 * pu + X0 * pm + X_1 * pd);
39 | 	};
40 | 
41 | 
42 | 	//auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); }; //call
43 | 	auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); };  //put
44 | 
45 | 	// Asset prices at maturity: element i becomes S * exp((i - N) * Dx), so element N equals S.
46 | 	double last = S * exp(-(N + 1) * Dx);
47 | 	double edx = exp(Dx);
48 | 	for (auto& el : terminalAssetPrices)
49 | 	{
50 | 		last *= edx;
51 | 		el = last;
52 | 	}
53 | 
54 | 
55 | 	// Payoff at maturity. NOTE(review): the barrier is not applied to this initial slice -- confirm that is intended.
56 | 	auto excerciseValue = transform(payOffFunc, terminalAssetPrices);
57 | 	auto odd_slice = excerciseValue;
58 | 
59 | 	UnitarySampler<VecXX::INS> identity_sampler; //identity sampler, just passes values through
60 | 	// Early-exercise rule; currently unused because the calls that apply it are commented out below.
61 | 	auto applyEarlyExcercise = [=](UnitarySampler<VecXX::INS>& sampler, auto excercisePrice)
62 | 	{
63 | 		auto optPrice = sampler.X_0.value; //.get<0>();
64 | 		return max(optPrice, excercisePrice);
65 | 	};
66 | 
67 | 	// Barrier rule: keep the option value where the stock price is below H, otherwise substitute the rebate.
68 | 	auto applyBarrier = [=](UnitarySampler<VecXX::INS>& sampler, auto stockPrice)
69 | 	{
70 | 		auto optPrice = sampler.X_0.value;//.get<0>();
71 | 		return select(stockPrice < H, optPrice, rebate);
72 | 	};
73 | 
74 | 
75 | 	auto even_slice = odd_slice;
76 | 	// i and j are passed to transform as index bounds (presumably the range processed -- TODO confirm); the window narrows by one node per side per step.
77 | 	int j = 2 * N + 1 - 1;
78 | 	int i = 0;
79 | 	for (; i < N; i += 2)
80 | 	{
81 | 		transform(odd_slice, even_slice, trinomialRollBack, sampler, i, j);
82 | 		// Early exercise is disabled here; only the barrier is applied after each roll-back.
83 | 	//	transform(even_slice, excerciseValue, even_slice, applyEarlyExcercise, identity_sampler, i, j);
84 | 		transform(even_slice, terminalAssetPrices, even_slice, applyBarrier, identity_sampler, i, j);
85 | 		// Second time step of the pass, from even_slice back into odd_slice on the narrower window.
86 | 		transform(even_slice, odd_slice, trinomialRollBack, sampler, i + 1, j - 1);
87 | 		//	transform(odd_slice, excerciseValue, odd_slice, applyEarlyExcercise, identity_sampler, i + 1, j - 1);
88 | 		transform(odd_slice, terminalAssetPrices, odd_slice, applyBarrier, identity_sampler, i + 1, j - 1);
89 | 
90 | 		j -= 2;
91 | 	}
92 | 	// Presumably silences the unused-variable warning for the disabled early-exercise lambda -- TODO confirm.
93 | 	ignore(applyEarlyExcercise);
94 | 
95 | 	return odd_slice[N];
96 | 
97 | }
98 | 
99 | 


--------------------------------------------------------------------------------
/lattice/euroTrinomial.cpp:
--------------------------------------------------------------------------------
  1 | #include "../Vectorisation/VecX/dr3.h"
  2 | 
  3 | #include "utils.h"
  4 | #include "pricers.h"
  5 | 
  6 | // European trinomial-tree pricer (call payoff) on a log-price grid of 2*N+1 nodes with spacing sig*sqrt(3*Dt).
  7 | // Returns the value at grid node N, whose asset price is set to S below.
  8 | double europeanTrinomialPricer(double S, double K, double sig, double r, double T, int N)
  9 | {
 10 | 
 11 | 	double y = 0.0;// 0.03; //div yield
 12 | 
 13 | 
 14 | 	VecXX terminalAssetPrices(1.0, 2 * N + 1);
 15 | 	// Time step Dt, log-price grid spacing Dx, and v = r - y - sig^2/2.
 16 | 	double Dt = T / N;
 17 | 	double Dx = sig * std::sqrt(3.0 * Dt);
 18 | 
 19 | 	double v = r - y - 0.5 * sig * sig;
 20 | 
 21 | 
 22 | 	// Branch weights pu, pd and pm; by construction pu + pm + pd == 1.
 23 | 	VecXX::INS pu = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) + (v * Dt) / Dx);
 24 | 	VecXX::INS pd = 0.5 * ((Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx) - (v * Dt) / Dx);
 25 | 	VecXX::INS pm = 1. - (Dt * sig * sig + v * v * Dt * Dt) / (Dx * Dx);
 26 | 	// One-step discount factor.
 27 | 	VecXX::INS disc = exp(-r * Dt);
 28 | 
 29 | 	TrinomialSampler<VecXX::INS> sampler;
 30 | 	// NOTE(review): the local X1 reads sampler.X_Minus_1 and X_1 reads sampler.X_1, so pu weights X_Minus_1 -- confirm the intended orientation.
 31 | 	auto trinomialRollBack = [=](TrinomialSampler<VecXX::INS>& sampler)
 32 | 	{
 33 | 
 34 | 		auto X1 = sampler.X_Minus_1.value;
 35 | 		auto X0 = sampler.X_0.value;
 36 | 		auto X_1 = sampler.X_1.value;
 37 | 
 38 | 		return disc * (X1 * pu + X0 * pm + X_1 * pd);
 39 | 	};
 40 | 
 41 | 	// Call payoff.
 42 | 	auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); };
 43 | 
 44 | 	// Asset prices at maturity: element i becomes S * exp((i - N) * Dx), so element N equals S.
 45 | 	double last = S * exp(-(N + 1) * Dx);
 46 | 	double edx = exp(Dx);
 47 | 	for (auto& el : terminalAssetPrices)
 48 | 	{
 49 | 		last *= edx;
 50 | 		el = last;
 51 | 	}
 52 | 	// Both slices start from the payoff at maturity; roll-back alternates between them.
 53 | 	auto odd_slice = transform(payOffFunc, terminalAssetPrices);
 54 | 	auto even_slice = odd_slice;
 55 | 	// i and j are passed to transform as index bounds (presumably the range processed -- TODO confirm); the window narrows by one node per side per step.
 56 | 	int j = 2 * N + 1 - 1;
 57 | 	int i = 0;
 58 | 	for (; i < N; i += 2)
 59 | 	{
 60 | 		transform(odd_slice, even_slice, trinomialRollBack, sampler, i, j);
 61 | 		transform(even_slice, odd_slice, trinomialRollBack, sampler, i + 1, j - 1);
 62 | 		j -= 2;
 63 | 	}
 64 | 
 65 | 	return odd_slice[N];
 66 | }
 67 | 
 68 | 
 69 | 
 70 | // European call valued on a recombining trinomial lattice, using a log-price
 71 | // step of sig * sqrt(2 * dt) and weighting the X_1 sample with the up probability.
 72 | // S seeds the terminal asset-price grid, K is the payoff strike, r drives drift
 73 | // and discounting, and T / N is the time step.
 74 | // Returns element N of the buffer written by the last roll-back call.
 75 | double europeanTrinomialPricer1(double S, double K, double sig, double r, double T, int N)
 76 | {
 77 | 
 78 | 	const double divYield = 0.0;// 0.03; //div yield
 79 | 	const double dt = T / N;
 80 | 	const double dx = sig * std::sqrt(2.0 * dt);
 81 | 	const double drift = r - divYield - 0.5 * sig * sig;
 82 | 
 83 | 	//double  u = Dx;
 84 | 	//double d = 1. / u;
 85 | 
 86 | 	// pieces shared by the three branch probabilities
 87 | 	const double spread = (dt * sig * sig + drift * drift * dt * dt) / (dx * dx);
 88 | 	const double tilt = (drift * dt) / dx;
 89 | 
 90 | 	const VecXX::INS probUp = 0.5 * (spread + tilt);
 91 | 	const VecXX::INS probDown = 0.5 * (spread - tilt);
 92 | 	const VecXX::INS probMid = 1. - spread;
 93 | 	const VecXX::INS discount = exp(-r * dt);
 94 | 
 95 | 	// discounted, probability-weighted blend of the three sampled values
 96 | 	auto trinomialRollBack = [=](TrinomialSampler<VecXX::INS>& nodes)
 97 | 	{
 98 | 		const auto& atPlus1 = nodes.X_1.value;
 99 | 		const auto& at0 = nodes.X_0.value;
100 | 		const auto& atMinus1 = nodes.X_Minus_1.value;
101 | 		return discount * (atPlus1 * probUp + at0 * probMid + atMinus1 * probDown);
102 | 	};
103 | 
104 | 	//call
105 | 	auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); };
106 | 
107 | 	//put
108 | 	//auto payOffFunc = [=](auto X) { return select(X < K, K -X , 0.0); };
109 | 
110 | 	//set up underlying asset prices at maturity: each node is exp(dx) times its predecessor
111 | 	VecXX terminalAssetPrices(1.0, 2 * N + 1);
112 | 	const double growth = exp(dx);
113 | 	double level = S * exp(-(N + 1) * dx);
114 | 	for (auto& node : terminalAssetPrices)
115 | 	{
116 | 		level *= growth;
117 | 		node = level;
118 | 	}
119 | 
120 | 	auto excerciseValue = transform(payOffFunc, terminalAssetPrices);
121 | 	auto odd_slice = excerciseValue;
122 | 	auto even_slice = odd_slice;
123 | 
124 | 	UnitarySampler<VecXX::INS> identity_sampler; // only referenced by the disabled early-exercise code below
125 | 	TrinomialSampler<VecXX::INS> sampler;
126 | 
127 | //	auto applyEarlyExcercise = [=](UnitarySampler<VecXX::INS>& sampler, auto excercisePrice)
128 | //	{
129 | //		auto optPrice = sampler.get<0>();
130 | //		return max(optPrice, excercisePrice);
131 | //	};
132 | 
133 | 	// two roll-back steps per pass; the two trailing index arguments move inward by one per step
134 | 	for (int lo = 0, hi = 2 * N; lo < N; lo += 2, hi -= 2)
135 | 	{
136 | 		transform(odd_slice, even_slice, trinomialRollBack, sampler, lo, hi);
137 | 		// disabled early-exercise step for an American option; the identity sampler just passes values straight through
138 | 		//transform(even_slice, excerciseValue, even_slice, applyEarlyExcercise, identity_sampler, lo, hi);
139 | 
140 | 		transform(even_slice, odd_slice, trinomialRollBack, sampler, lo + 1, hi - 1);
141 | 		//transform(odd_slice, excerciseValue, odd_slice, applyEarlyExcercise, identity_sampler, lo + 1, hi - 1);
142 | 	}
143 | 
144 | 	return odd_slice[N];
145 | }
146 | 


--------------------------------------------------------------------------------
/lattice/euroTrinomialPricerWithInit.cpp:
--------------------------------------------------------------------------------
  1 | #include "../Vectorisation/VecX/dr3.h"
  2 | 
  3 | #include "utils.h"
  4 | #include "pricers.h"
  5 | 
  6 | 
  7 | // European call on a recombining trinomial lattice in which the first roll-back
  8 | // step from maturity is replaced by a closed-form Black-Scholes style call value
  9 | // evaluated over one time step dt.
 10 | // S seeds the terminal asset-price grid, K is the strike, sig the volatility input,
 11 | // r drives drift and discounting, and T / N is the time step.
 12 | // Returns element N of the buffer written by the last roll-back call.
 13 | double euroTrinomialPricerWithInit(double S, double K, double sig, double r, double T, int N)
 14 | {
 15 | 
 16 | 	const double divYield = 0.0;// 0.03; //div yield
 17 | 	const double dt = T / N;
 18 | 	const double dx = sig * std::sqrt(2.0 * dt);
 19 | 	const double drift = r - divYield - 0.5 * sig * sig;
 20 | 
 21 | 	// pieces shared by the three branch probabilities
 22 | 	const double spread = (dt * sig * sig + drift * drift * dt * dt) / (dx * dx);
 23 | 	const double tilt = (drift * dt) / dx;
 24 | 
 25 | 	const VecXX::INS probUp = 0.5 * (spread + tilt);
 26 | 	const VecXX::INS probDown = 0.5 * (spread - tilt);
 27 | 	const VecXX::INS probMid = 1. - spread;
 28 | 	const VecXX::INS discount = exp(-r * dt);
 29 | 
 30 | 	// discounted, probability-weighted blend of the three sampled values
 31 | 	auto trinomialRollBack = [=](TrinomialSampler<VecXX::INS>& nodes)
 32 | 	{
 33 | 		const auto& atPlus1 = nodes.X_1.value;
 34 | 		const auto& at0 = nodes.X_0.value;
 35 | 		const auto& atMinus1 = nodes.X_Minus_1.value;
 36 | 		return discount * (atPlus1 * probUp + at0 * probMid + atMinus1 * probDown);
 37 | 	};
 38 | 
 39 | 	//call
 40 | 	auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); };
 41 | 
 42 | 	//put
 43 | 	//auto payOffFunc = [=](auto X) { return select(X < K, K -X , 0.0); };
 44 | 
 45 | 	//set up underlying asset prices at maturity: each node is exp(dx) times its predecessor
 46 | 	VecXX terminalAssetPrices(1.0, 2 * N + 1);
 47 | 	const double growth = exp(dx);
 48 | 	double level = S * exp(-(N + 1) * dx);
 49 | 	for (auto& node : terminalAssetPrices)
 50 | 	{
 51 | 		level *= growth;
 52 | 		node = level;
 53 | 	}
 54 | 
 55 | 	auto excerciseValue = transform(payOffFunc, terminalAssetPrices);
 56 | 	auto odd_slice = excerciseValue;
 57 | 	auto even_slice = odd_slice;
 58 | 
 59 | 	UnitarySampler<VecXX::INS> identity_sampler; // only referenced by the disabled early-exercise calls below
 60 | 
 61 | 	// only referenced by the disabled early-exercise calls; passed to ignore() further down
 62 | 	auto applyEarlyExcercise = [=](UnitarySampler<VecXX::INS>& sampler, auto excercisePrice)
 63 | 	{
 64 | 		auto optPrice = sampler.X_0.value; //.get<0>();
 65 | 		return max(optPrice, excercisePrice);
 66 | 	};
 67 | 
 68 | 	/// blacks initialisation 
 69 | 	// constants of the closed-form value, all based on the single time step dt
 70 | 	const VecXX::INS invK = 1.0 / K;
 71 | 	const VecXX::INS discountedRate = exp(-r * dt);
 72 | 	const VecXX::INS rootT = sqrt(dt);
 73 | 	const VecXX::INS sigmaRootT = rootT * sig;
 74 | 	const VecXX::INS invSigmaRootT = 1.0 / sigmaRootT;
 75 | 	const VecXX::INS halfSigmaSqrd_t = (0.5 * sig * sig + r) * dt;
 76 | 	const VecXX::INS Strike = K;
 77 | 
 78 | 	// closed-form call value for a vector of asset prices
 79 | 	auto blackScholeInit = [&](VecXX::INS spot)
 80 | 	{
 81 | 		const VecXX::INS moneyness = log(spot * invK);
 82 | 
 83 | 		const VecXX::INS d1 = invSigmaRootT * (moneyness + halfSigmaSqrd_t);
 84 | 		const VecXX::INS d2 = d1 - sigmaRootT;
 85 | 		const VecXX::INS normD1 = cdfnorm(d1);
 86 | 		const VecXX::INS normD2 = cdfnorm(d2);
 87 | 
 88 | 		VecXX::INS value = spot * normD1 - Strike * discountedRate * normD2;
 89 | 		return value;
 90 | 	};
 91 | 
 92 | 	TrinomialSampler<VecXX::INS> sampler;
 93 | 
 94 | 	//use BS transform and normal for first pair of slices
 95 | 	even_slice = transform(blackScholeInit, terminalAssetPrices);
 96 | 
 97 | 	//even_slice = transform(payOffFunc, terminalAssetPrices);
 98 | 
 99 | 	//std::vector<double> dbg = even_slice;
100 | 
101 | 	// first lattice roll-back, reading the closed-form slice
102 | 	transform(even_slice, odd_slice, trinomialRollBack, sampler, 1, 2 * N - 1);
103 | 
104 | 	// remaining steps, two per pass; the two trailing index arguments
105 | 	// move inward by one per step
106 | 	for (int lo = 2, hi = 2 * N - 2; lo < N; lo += 2, hi -= 2)
107 | 	{
108 | 		transform(odd_slice, even_slice, trinomialRollBack, sampler, lo, hi);
109 | 		// disabled early-exercise step for an American option; the identity sampler just passes values straight through
110 | 		//transform(even_slice, excerciseValue, even_slice, applyEarlyExcercise, identity_sampler, lo, hi);
111 | 
112 | 		transform(even_slice, odd_slice, trinomialRollBack, sampler, lo + 1, hi - 1);
113 | 		//transform(odd_slice, excerciseValue, odd_slice, applyEarlyExcercise, identity_sampler, lo + 1, hi - 1);
114 | 	}
115 | 
116 | 	ignore(applyEarlyExcercise);
117 | 
118 | 	return odd_slice[N];
119 | 
120 | }
121 | 
122 | 


--------------------------------------------------------------------------------
/lattice/europeanBinomialPricer.cpp:
--------------------------------------------------------------------------------
 1 | #include "../Vectorisation/VecX/dr3.h"
 2 | #include "utils.h"
 3 | #include "pricers.h"
 4 | 
 5 | 
 6 | // European call priced on a recombining binomial lattice.
 7 | // S seeds the terminal price grid, K is the payoff strike, sig sets the up factor
 8 | // u = exp(sig * sqrt(Dt)), r drives the up probability and discounting, Dt = T / N.
 9 | // Roll-back runs two steps per loop pass; an odd N gets one extra single step so
10 | // that N steps are applied in total (the paired loop alone covers only N - 1 of them).
11 | double europeanBinomialPricer(double S, double K, double sig, double r, double T, int N)
12 | {
13 | 	VecXX terminalAssetPrices(1.0, N + 1);
14 | 
15 | 	double Dt = T / N;
16 | 	double  u = std::exp(sig * std::sqrt(Dt));
17 | 	double d = 1. / u;
18 | 
19 | 	VecXX::INS pu = (exp(r * Dt) - d) / (u - d);
20 | 	VecXX::INS oneMinusP = (1.0 - pu);
21 | 	VecXX::INS disc = exp(-r * Dt);
22 | 	BinomialSampler<VecXX::INS> sampler;
23 | 	auto binomialRollBack = [=](BinomialSampler<VecXX::INS>& sampler)
24 | 	{
25 | 		const auto& X1 = sampler.X_1.value;
26 | 		const auto& X0 = sampler.X_0.value;
27 | 		return disc * (X1 * pu + X0 * oneMinusP);
28 | 	};
29 | 
30 | 	auto payOffFunc = [=](auto X) { return select(X > K, X - K, 0.0); };
31 | 	//	auto payOffFunc = [=](auto X) { return select(X < K, K - X, 0.0); };
32 | 
33 | 	//set up underlying asset prices at maturity: first node S * d^N, each next node u^2 higher
34 | 	double last = S * std::pow(d, N + 2);
35 | 	for (auto& el : terminalAssetPrices)
36 | 	{
37 | 		last *= (u * u);
38 | 		el = last;
39 | 	}
40 | 
41 | 	auto odd_slice = transform(payOffFunc, terminalAssetPrices);
42 | 	auto even_slice = odd_slice;
43 | 
44 | 	int j = N + 1;
45 | 	for (int i = 0; i < N / 2; ++i, j -= 2)
46 | 	{
47 | 		transform(odd_slice, even_slice, binomialRollBack, sampler, 0, j);
48 | 		transform(even_slice, odd_slice, binomialRollBack, sampler, 0, j - 1);
49 | 	}
50 | 	if (N % 2 != 1) { return odd_slice[0]; }
51 | 	// odd N: the paired loop applied N - 1 steps (j is now 2); apply the last one
52 | 	transform(odd_slice, even_slice, binomialRollBack, sampler, 0, j);
53 | 	return even_slice[0];
54 | }
55 | 
56 | 
57 | 
58 | 


--------------------------------------------------------------------------------
/lattice/lattice.vcxproj.user:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup />
4 | </Project>


--------------------------------------------------------------------------------
/lattice/lattice_tools.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | //lattice tools
 4 | 
 5 | void doScan();
 6 | 
 7 | void testSampler();
 8 | 
 9 | void doStridedSpan();
10 | 
11 | void doTransformWithASpan();
12 | 
13 | void doZipping();
14 | 
15 | void doMatrix();
16 | 


--------------------------------------------------------------------------------
/lattice/pricers.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | // Option pricers. In the lattice implementations S seeds the asset-price grid, K is the payoff strike, sig the volatility input, r the rate used for discounting and T / N the time step; presumably the same for the others - TODO confirm.
 3 | double europeanBinomialPricer(double S, double K, double sig, double r, double T, int N);
 4 | // trinomial lattice with log-price step sig * sqrt(3 * T / N)
 5 | double europeanTrinomialPricer(double S, double K, double sig, double r, double T, int N);
 6 | // trinomial lattice with log-price step sig * sqrt(2 * T / N)
 7 | double europeanTrinomialPricer1(double S, double K, double sig, double r, double T, int N);
 8 | 
 9 | double americanTrinomialPricer(double S, double K, double sig, double r, double T, int N);
10 | 
11 | double americanFiniteDiffPricer(double S, double K, double sig, double r, double T, int N);
12 | 
13 | double americanImplicitFiniteDiffPricer(double S, double K, double sig, double r, double T, int N);
14 | 
15 | double americanImplicitFiniteDiffPricerFast(double S, double K, double sig, double r, double T, int N);
16 | // trinomial lattice whose first roll-back step is replaced by a closed-form Black-Scholes style value
17 | double euroTrinomialPricerWithInit(double S, double K, double sig, double r, double T, int N);
18 | 
19 | double americanCrankNicholsonPricer(double S, double K, double sig, double r, double T, int N);
20 | // NOTE(review): H is presumably the barrier level and rebate the value substituted once the asset price is not below H - confirm against the implementation
21 | double americanTrinomialPricerUpAndOut(double S, double K, double sig, double r, double T, double H, double rebate, int N);


--------------------------------------------------------------------------------
/lattice/utils.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | 
  4 | #include "../Vectorisation/VecX/error_utils.h"
  5 | #include <vector>
  6 | #include <chrono>
  7 | #include <map>
  8 | #include <iostream>
  9 | 
 10 | #include <algorithm>
 11 | #include <random>
 12 | #include <numeric>
 13 | #include <iterator>
 14 | #include <iomanip>  
 15 | #include <cstring>
 16 | 
 17 | //using namespace DRC::VecLDb;
 18 | //using namespace DRC::VecDb;
 19 | //using namespace DRC::VecD2D;  //sse2   double
 20 | //using namespace DRC::VecD4D;	//avx2   double
 21 | //using namespace DRC::VecF8F;	// avx2  float
 22 | using namespace DRC::VecD8D;  //avx512 double
 23 | //using namespace DRC::VecF16F; //avx512   float
 24 | 
 25 | using FLOAT = typename InstructionTraits<VecXX::INS>::FloatType;
 26 | 
 27 | 
 28 | constexpr double billion = 1000000000.0;
 29 | 
 30 | 
 31 | // Returns true when C1, C2 and C3 have equal sizes and every element of C2 and C3
 32 | // matches C1 to a relative error of 1e-13; the first mismatch is printed to std::cout.
 33 | // Relative error is |a - b| / (|a| + |b|). Two zeros (0/0) or two NaNs compare
 34 | // equal; a NaN on one side only is reported as a mismatch.
 35 | template<typename T>
 36 | bool vectorsEqual(const std::vector<T>& C1, const std::vector<T>& C2, const std::vector<T>& C3)
 37 | {
 38 | 	const double ERR = 1e-13; //for examples
 39 | 	if ((C1.size() != C2.size()) || (C3.size() != C2.size()))
 40 | 	{
 41 | 		return false;
 42 | 	}
 43 | 
 44 | 	// x != x holds only for NaN, so this is true when exactly one argument is NaN
 45 | 	auto oneSidedNaN = [](const T& a, const T& b) { return (a != a) != (b != b); };
 46 | 
 47 | 	for (size_t i = 0; i < C3.size(); i++)
 48 | 	{
 49 | 		auto err1 = fabs((C1[i] - C2[i]) / (fabs(C2[i]) + fabs(C1[i])));
 50 | 		auto err2 = fabs((C1[i] - C3[i]) / (fabs(C1[i]) + fabs(C3[i])));
 51 | 		const bool nanMismatch = oneSidedNaN(C1[i], C2[i]) || oneSidedNaN(C1[i], C3[i]);
 52 | 
 53 | 		// a NaN error never compares greater than ERR, hence the explicit NaN test
 54 | 		if ((err1 > ERR) || (err2 > ERR) || nanMismatch)
 55 | 		{
 56 | 			std::cout << "\n err diff@ " << i << " err1 =" << err1 << ", err2 = " << err2 << "\n";
 57 | 			return false;
 58 | 		}
 59 | 	}
 60 | 	return true;
 61 | }
 62 | 
 63 | 
 64 | 
 65 | template<typename T>
 66 | bool vectorsEqualD(const std::vector<T>& C1, const std::vector<T>& C2, const std::vector<T>& C3, const std::vector<T>& input, T ERR = 1e-13)
 67 | {
 68 | 
 69 | 
 70 | 	ERR = getErr(C1);
 71 | 
 72 | 	bool  testOK = true;
 73 | 
 74 | 	if (C1.size() != C2.size())
 75 | 	{
 76 | 		std::cout << "wrong size C1,C2" << C1.size() << ", " << C2.size() << std::endl;
 77 | 		return false;
 78 | 	}
 79 | 
 80 | 	if (C3.size() != C2.size())
 81 | 	{
 82 | 		std::cout << "wrong size C2,C3" << C2.size() << ", " << C3.size() << std::endl;
 83 | 		return false;
 84 | 	}
 85 | 
 86 | 	for (size_t i = 0; i < C3.size(); i++)
 87 | 	{
 88 | 		auto err1 = fabs((C1[i] - C2[i]) / (fabs(C2[i]) + fabs(C1[i])));
 89 | 		auto err2 = fabs((C1[i] - C3[i]) / (fabs(C1[i]) + fabs(C3[i])));
 90 | 
 91 | 		if ((err1 > ERR) || (err2 > ERR))
 92 | 		{
 93 | 			testOK = false;
 94 | 			std::cout << "\n err diff@ " << i << " err1 =" << err1 << ", err2 = " << err2 << "\n";
 95 | 			std::cout << "\n val @ " << i << " C1[i] =" << C1[i] << ", C2[i] = " << C2[i] << ", C3[i] = " << C3[i] << "input val=" << input[i] << "\n";
 96 | 			std::cout << std::endl;
 97 | 			break;
 98 | 		}
 99 | 	}
100 | 
101 | 	return testOK;
102 | 
103 | }
104 | 
105 | 
106 | 
107 | 
108 | // Product of TEST_LOOP_SZ and SZ, multiplied in double precision and then
109 | // converted back to int.
110 | static auto numOps = [](int TEST_LOOP_SZ, int SZ) { const double total = static_cast<double>(TEST_LOOP_SZ) * static_cast<double>(SZ); return static_cast<int>(total); };
111 | 
112 | 
113 | 
114 | using Calc_Values = std::map<int, FLOAT>; // vector size -> single calculated value
115 | using Calc_Values_V = std::map<int, std::vector<FLOAT> >; // vector size -> calculated vector of values
116 | using  Mapped_Performance_Results = std::map<int, std::vector<double> >; // vector size -> one throughput figure per run
117 | using Mapped_Stats = std::map<int, std::pair<double, double> >; // vector size -> pair (mean throughput, std dev of throughput)
118 | // Per-size run rates plus one scalar result per size, as filled by runFunctionOverDifferentSize.
119 | struct RunResults
120 | {
121 | 	Mapped_Performance_Results m_raw_results;
122 | 	Calc_Values  m_calc_results;
123 | };
124 | // Per-size run rates plus one result vector per size, as filled by runFunctionOverDifferentSizeVec.
125 | struct RunResultsVec
126 | {
127 | 	Mapped_Performance_Results m_raw_results;
128 | 	Calc_Values_V  m_calc_results;
129 | };
130 | 
131 | // Scoped timer: zeroes runTime on construction and, on destruction, stores the
132 | // elapsed std::chrono::steady_clock time in seconds into the referenced double.
133 | class TimerGuard
134 | {
135 | 	double& m_runTime;
136 | 	std::chrono::steady_clock::time_point  m_startTme;
137 | public:
138 | 	TimerGuard(double& runTime) : m_runTime(runTime), m_startTme(std::chrono::steady_clock::now()) { runTime = 0.; }
139 | 	~TimerGuard()
140 | 	{
141 | 		// duration<double> rescales the clock's native tick to seconds, so no nanosecond tick is assumed
142 | 		const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - m_startTme;
143 | 		m_runTime = elapsed.count();
144 | 	}
145 | };
146 | 
147 | 
148 | // Calls func(size, testLoopSZ) for size = vec_start_size, vec_start_size + vec_stepSZ, ... while size < vec_maxSZ,
149 | // and repeats that sweep testRepeats times. From each call, .second is appended to the per-size
150 | // rate list; .first is stored (cast to FLOAT) during the first sweep only.
151 | static auto runFunctionOverDifferentSize = [](int testRepeats, int vec_start_size, int vec_stepSZ, int vec_maxSZ, const auto& func, long testLoopSZ)
152 | {
153 | 	RunResults collected;
154 | 
155 | 	for (int sweep = 0; sweep < testRepeats; ++sweep)
156 | 	{
157 | 		const bool firstSweep = (sweep == 0);
158 | 		for (int vecSize = vec_start_size; vecSize < vec_maxSZ; vecSize += vec_stepSZ)
159 | 		{
160 | 			auto outcome = func(vecSize, testLoopSZ);
161 | 			collected.m_raw_results[vecSize].push_back(outcome.second);
162 | 			if (firstSweep)
163 | 			{
164 | 				collected.m_calc_results[vecSize] = static_cast<FLOAT>(outcome.first);
165 | 			}
166 | 		}
167 | 	}
168 | 	return collected;
169 | };
170 | 
171 | 
172 | // Vector-result counterpart of the size sweep: calls func(size, testLoopSZ) for each size in
173 | // [vec_start_size, vec_maxSZ) stepping by vec_stepSZ, testRepeats times; .second is the rate, .first the values.
174 | static auto runFunctionOverDifferentSizeVec = [](int testRepeats, int vec_start_size, int vec_stepSZ, int vec_maxSZ, const auto& func, long testLoopSZ)
175 | {
176 | 	RunResultsVec results;
177 | 
178 | 	for (int j = 0; j < testRepeats; ++j)
179 | 	{
180 | 		for (int VEC_SZ = vec_start_size; VEC_SZ < vec_maxSZ; VEC_SZ += vec_stepSZ)
181 | 		{
182 | 			auto res = func(VEC_SZ, testLoopSZ);
183 | 			results.m_raw_results[VEC_SZ].push_back(res.second);
184 | 
185 | 			if (j == 0)
186 | 			{
187 | 				// res.first is converted only here, on the first sweep, so later
188 | 				// sweeps never copy the calculated values
189 | 				std::vector<FLOAT> tmp = res.first;
190 | 				results.m_calc_results[VEC_SZ].swap(tmp);
191 | 			}
192 | 		}
193 | 	}
194 | 
195 | 	return results;
196 | };
197 | 
198 | 
199 | 
200 | // For every vector size: mean of the recorded run rates and their sample standard deviation
201 | // (n - 1 divisor). A size with fewer than two runs gets a deviation of 0 rather than the NaN
202 | // that 0/0 would give; a size with no runs gets a mean of 0.
203 | static auto performanceStats = [](const Mapped_Performance_Results& raw_results)
204 | {
205 | 	Mapped_Stats stats;
206 | 	for (const auto& item : raw_results)
207 | 	{
208 | 		const auto& rates = item.second;
209 | 		const double N = static_cast<double>(rates.size());
210 | 		const double avg = rates.empty() ? 0.0 : std::accumulate(rates.begin(), rates.end(), 0.0) / N;
211 | 
212 | 		// two-pass sum of squared deviations: cannot go negative, so the sqrt below never sees a negative argument
213 | 		double sumSqDev = 0.0;
214 | 		for (const double run_rate : rates)
215 | 		{
216 | 			const double dev = run_rate - avg;
217 | 			sumSqDev += dev * dev;
218 | 		}
219 | 
220 | 		// the n - 1 divisor needs at least two samples
221 | 		const double stdDev = (rates.size() > 1) ? std::sqrt(sumSqDev / (N - 1.)) : 0.0;
222 | 		stats[item.first] = { avg, stdDev };
223 | 	}
224 | 	return stats;
225 | };
226 | 
227 | 
228 | 


--------------------------------------------------------------------------------
/scratch/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(ScratchExample scratch.cpp) # executable built from scratch.cpp alone
2 | # link against the Vectorisation target
3 | target_link_libraries(ScratchExample PUBLIC Vectorisation)
4 | # add the top-level project's binary directory to the include path
5 | target_include_directories(ScratchExample PUBLIC
6 |         "${PROJECT_BINARY_DIR}"
7 | )


--------------------------------------------------------------------------------
/scratch/scratch.cpp:
--------------------------------------------------------------------------------
  1 | // scratch.cpp : This file contains the 'main' function. Program execution begins and ends there.
  2 | //
  3 | 
  4 | #include "../Vectorisation/VecX/dr3.h"
  5 | 
  6 | 
  7 | #include <random>
  8 | #include <numeric>
  9 | #include <iostream>
 10 | #include <vector>
 11 | #include <iomanip>
 12 | #include <functional>
 13 | 
 14 | 
 15 | 
 16 | //using namespace DRC::VecD2D; 
 17 | //using namespace DRC::VecF4F;
 18 | //using namespace DRC::VecD4D;
 19 | using namespace DRC::VecD8D;
 20 | //using namespace DRC::VecF16F;
 21 | //using namespace DRC::VecF8F;
 22 | 
 23 | 
 24 | 
 25 | // Small console demo of the binned accumulator: starts from an empty bin, adds one
 26 | // third, then +100000 and -100000, printing bin.hsum() (8 significant digits)
 27 | // after each addition.
 28 | void doAddWithoutCancellation()
 29 | {
 30 | 
 31 |     using BINNED_ACCUMULATOR = BinsT<VecXX::INS>;
 32 |     using FLOAT = InstructionTraits<VecXX::INS>::FloatType;
 33 |     FLOAT oneThird = static_cast<FLOAT>(1.0 / 3.0);
 34 | 
 35 |     std::cout << "create empty bin, value 0.0 \n";
 36 |     BINNED_ACCUMULATOR bin;
 37 | 
 38 |     std::cout << "add one third to it , value =";
 39 |     bin += oneThird;
 40 |     std::cout << std::setprecision(8) << bin.hsum() << "\n";
 41 | 
 42 |     std::cout << "add one hundred thousand to it , value =";
 43 |     bin += 100000.0f;
 44 |     auto afterAdd = bin.hsum();
 45 |     std::cout << std::setprecision(8) << afterAdd << std::endl;
 46 | 
 47 |     // prompt spelled out in full ("add minus ..."), matching the two prompts above
 48 |     std::cout << "add minus one hundred thousand to it , value =";
 49 |     bin += -100000.0f;
 50 |     auto afterCancel = bin.hsum();
 51 |     std::cout << std::setprecision(8) << afterCancel << std::endl;
 52 | 
 53 |     std::cout << "no cancellation! \n \n \n";
 54 | 
 55 | }
 56 | 
 57 | /*
 58 |  Example summation using std reduction, and for loop
 59 |  then pairwise_reduce  and  reduce with Kahan summation 
 60 |  and finally using binned summation.
 61 | 
 62 |  Generally for large sums, we get the same rounding error for  the for loop and
 63 |  std accumulation/ reduce
 64 | 
 65 |  However, with pairwise reduce and Kahan accumulation, we tend to get precision down to the last digit.
 66 | 
 67 |  Unfortunately, when we add pairs of large +ve and -ve numbers which cancel each other,
 68 |  they destroy the accuracy of intermediate sums. In the code we turn this on by
 69 |  setting USE_BIG_CANCELLATION = true.
 70 | 
 71 |  The cancellation flag does not change the theoretical value of the sum;
 72 |  the sums should return the same result as when USE_BIG_CANCELLATION = false.
 73 |  However, we find that only the binned arithmetic scheme can achieve this
 74 |  sort of stability with this example.
 75 | 
 76 |  The input data set is randomly permuted and the sum recalculated. The ideal result is that we get the same
 77 |  answer irrespective of the ordering. The actual results differ to varying degrees.
 78 | 
 79 |  Both pairwise and Kahan summation can significantly reduce rounding errors; however,
 80 |  binned summation tends to work much better if we have significant cancellation.
 81 |  For loops and std::accumulate / std::reduce are generally less accurate.
 82 | 
 83 | */
 84 | 
 85 | // Overloads that write the large value used for the cancelling +/- pair in main:
 86 | // 1e5 for float, 1e12 for double.
 87 | void setCancelInput(float& flt)
 88 | {
 89 |     flt = 1.0e5f;
 90 | }
 91 | 
 92 | void setCancelInput(double& dbl)
 93 | {
 94 |     dbl = 1.0e12;
 95 | }
 96 | // Compares summation schemes (plain loop, std::accumulate / std::reduce, DR3 accumulate,
 97 | // pairwise, Kahan-compensated and binned) on ten random shuffles of one data set,
 98 | // first without and then with large cancelling +/- pairs mixed in, and prints every
 99 | // sum with 16 significant digits.
100 | int main()
101 | {
102 |    //simple binned sum example
103 |     doAddWithoutCancellation();
104 | 
105 |     long SZ = 10000 * 1024 ; // size of data set to be summed
106 |     using FLOAT = InstructionTraits<VecXX::INS>::FloatType;
107 |     FLOAT initVal = static_cast<FLOAT>(1.0 / 3.0);
108 | 
109 |     VecXX data(initVal, SZ);
110 |     double scale = 1.0;// us power of 2   eg 1.0 / 1024.0 * 1.0 / 1024.0 * 1.0 / 1024.0;
111 |     data *= scale;
112 | 
113 |     // Kahan-compensated adders come from factories so that every reduction starts with
114 |     // a zero compensation term, whether the callee copies the functor or references it
115 |     auto makeKahanV = []() {
116 |         return [c = VecXX::INS(0.0)](auto sum, auto rhs) mutable {
117 |             auto y = rhs - c;
118 |             auto t = sum + y;
119 |             c = (t - sum) - y;
120 |             return t;
121 |         };
122 |     };
123 |     auto makeKahanD = []() {
124 |         return [cc = static_cast<FLOAT>(0.0)](auto sum, auto rhs) mutable {
125 |             auto y = rhs - cc;
126 |             auto t = sum + y;
127 |             cc = (t - sum) - y;
128 |             return t;
129 |         };
130 |     };
131 | 
132 |     for( int C = 0; C < 2;C++) // iterate using cancellation data set
133 |     {
134 |         const bool USE_BIG_CANCELLATION = (C > 0);
135 | 
136 |         // cancelling pair (+a, -a); both stay zero when cancellation is switched off
137 |         FLOAT a = 0.0;
138 |         FLOAT b = 0.0;
139 |         if (USE_BIG_CANCELLATION) {
140 |             setCancelInput(a);
141 |             b = -a;
142 |         }
143 | 
144 |         int i = 0;
145 |         auto mixed = data;
146 |         long count = 0;
147 | 
148 |         //make data members slightly different and add
149 |         //cancelling values if required
150 |         for (auto &x: mixed) {
151 |             count++;
152 |             x += count * 0.0001f;
153 | 
154 |             //every 17'th element set up cancellation; count < SZ keeps mixed[count] inside the data
155 |             if ((count > 17) && (count % 17 == 0) && (count < SZ))
156 |             {
157 |                 auto c = mixed[count] + mixed[count - 1] + mixed[count - 2];
158 |                 mixed[count] = c;
159 |                 mixed[count - 1] = b;
160 |                 mixed[count - 2] = a;
161 |             }
162 |         }
163 | 
164 |         double multiplr = 1.0;// pow(1024., 4);
165 |         mixed *= multiplr;
166 | 
167 |         //std::vector<FLOAT> scaledVec = mixed; //for debug observation
168 |         // run ten permutations of  data set and do summation
169 |         for (int kkk = 0; kkk < 10; kkk++)
170 |         {
171 |             std::random_device rd;
172 |             std::mt19937 g(rd());
173 | 
174 |             std::shuffle(mixed.begin(), mixed.end(), g);
175 |             // std::vector<FLOAT> obs= mixed;
176 | 
177 |             auto std_acc = std::accumulate(mixed.begin(), mixed.end(), static_cast<FLOAT>(0.0));
178 |             auto std_reduce = std::reduce(mixed.begin(), mixed.end(), static_cast<FLOAT>(0.0));
179 | 
180 |             auto sumIt = [](auto x, auto y) { return x + y; };
181 |             auto sumPairwiseDr3 = ApplyAccumulate2UR_X_pairwise(mixed, sumIt);
182 | 
183 |             auto DRCubedAccum = ApplyAccumulate2UR_X(mixed, sumIt);
184 | 
185 |             FLOAT trad_for_loop = 0.0f;
186 |             for (auto x: mixed) {
187 |                 trad_for_loop += x;
188 |             }
189 | 
190 |             //correcting summation  lambda
191 |             auto KhanAddV = makeKahanV();
192 |             auto sumKahan = reduce(mixed, KhanAddV);
193 | 
194 |             // NOTE(review): std::reduce expects an associative, commutative operation; a stateful
195 |             // Kahan adder does not meet that - kept because the demo prints its result
196 |             auto KahanAddD = makeKahanD();
197 |             double std_accumulate_Kahan = std::accumulate(mixed.begin(), mixed.end(), static_cast<FLOAT>(0.0), KahanAddD);
198 |             double std_reduce_Kahan = std::reduce(mixed.begin(), mixed.end(), static_cast<FLOAT>(0.0), KahanAddD);
199 | 
200 |             // separate adder: the one handed to reduce() above may carry a non-zero
201 |             // compensation term if reduce() took it by reference
202 |             auto KhanAddPairwise = makeKahanV();
203 |             auto sumPairwiseWithKahan = ApplyAccumulate2UR_X_pairwise(mixed, KhanAddPairwise);
204 | 
205 |             // reduce with binned accumulator
206 |             auto binScale = 1.0;// pow(1024.0, 2);
207 |             using BINNED_ACCUMULATOR = BinsT<VecXX::INS>;
208 |             BINNED_ACCUMULATOR Bin(0.0, binScale);//
209 | 
210 |             auto binned_Sum = reduceWithAccumulator(Bin, mixed, BinnedAdd);
211 | 
212 |             ///binned_Sum = mult;
213 | 
214 |             std::cout << "\nUsing Significant Cancellation Data = " << std::boolalpha << USE_BIG_CANCELLATION << "  \n";
215 |             std::cout << "shuffled version " << ++i << "\n" << std::setprecision(16)
216 |                       << trad_for_loop << "\t for loop sum   \n"
217 |                       << std_acc << "\t std::accumulate sum  \n"
218 |                       << std_reduce << "\t std::reduce  \n"
219 |                       << DRCubedAccum << "\t accumulate DR3 \n"
220 |                       << std_accumulate_Kahan << "\t std_accumulate_Kahan \n"
221 |                       << std_reduce_Kahan << "\t std_reduce_Kahan \n"
222 |                       << sumPairwiseDr3 << "\t sum pairwise  \n"
223 |                       << sumKahan << "\t sum Kahan acc \n"
224 |                       << sumPairwiseWithKahan << " \t pairwise_sum  using Kahan acc \n"
225 |                       << binned_Sum << "\t binned sum acc \n \n \n \n";
226 | 
227 |         }
228 |     }
229 | }
230 | 
231 | 


--------------------------------------------------------------------------------
/scratch/scratch.vcxproj.user:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup />
4 | </Project>


--------------------------------------------------------------------------------