├── example ├── Sphere.cpp ├── SyntheticData.h ├── Sphere.h ├── SyntheticData.cpp └── main.cpp ├── src ├── Loss.cpp ├── Optimization.cpp ├── Loss.h ├── Optimization.h ├── Dual.cpp └── Dual.h ├── LICENSE ├── CMakeLists.txt └── README.md /example/Sphere.cpp: -------------------------------------------------------------------------------- 1 | #include "Sphere.h" 2 | 3 | template struct Sphere>; 4 | -------------------------------------------------------------------------------- /example/SyntheticData.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | Eigen::MatrixXd 8 | GeneratePointsOnSphereSurface( 9 | const int samples, 10 | const Eigen::Vector3d center, 11 | const double radius, 12 | const double noise = 0.); 13 | -------------------------------------------------------------------------------- /src/Loss.cpp: -------------------------------------------------------------------------------- 1 | #include "Loss.h" 2 | 3 | namespace PistolsAtDawn { 4 | 5 | // Implements dp/dx as the primal part, and d^2p/dx^2 in the first dual slot 6 | Loss::Dual 7 | Loss::operator () (const float x_) const 8 | { 9 | Dual x = Dual(x_, 0); 10 | return x * inv_c2 * pow(x * x * inv_c2b + 1.f, d_exp); 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /example/Sphere.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Dual.h" 4 | #include 5 | 6 | template 7 | struct Sphere 8 | { 9 | const Float radius; 10 | 11 | Sphere() 12 | : radius{ 1.f } 13 | {} 14 | 15 | Sphere(const double radius) 16 | : radius{ (Float) radius } 17 | {} 18 | 19 | inline 20 | Float 21 | operator() ( 22 | const Eigen::Matrix & parameters_, 23 | const Eigen::Matrix & datum_) const 24 | { 25 | 26 | const auto R2 = (parameters_ - datum_).squaredNorm(); 27 | const auto R = sqrt(R2); 28 | const auto dr = 
Float(R) - radius; 29 | return dr; 30 | } 31 | }; 32 | -------------------------------------------------------------------------------- /src/Optimization.cpp: -------------------------------------------------------------------------------- 1 | #include "Optimization.h" 2 | 3 | namespace PistolsAtDawn { 4 | 5 | template 6 | void 7 | Optimizer::Reset(const int num_data, const int num_parameters) 8 | { 9 | parameters_.resize(num_parameters, 1); 10 | parameters_.setZero(); 11 | datum_.resize(num_data, 1); 12 | datum_.setZero(); 13 | state.resize(num_data, 1); 14 | state.setZero(); 15 | J_row.resize(1, num_parameters); 16 | J_row.setZero(); 17 | 18 | iteration = 0; 19 | } 20 | 21 | template 22 | void 23 | Optimizer::LoadDefaults() 24 | { 25 | max_iterations = 50; 26 | learning_rate = 1.0; 27 | measurement_error = 1.0; 28 | } 29 | 30 | template struct Optimizer; 31 | 32 | } // namespace PistolsAtDawn 33 | -------------------------------------------------------------------------------- /src/Loss.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "Dual.h" 5 | 6 | namespace PistolsAtDawn { 7 | 8 | // Implements Barron's loss function 9 | // https://arxiv.org/pdf/1701.03077.pdf 10 | class Loss 11 | { 12 | using Dual = PistolsAtDawn::Dual_Float<1>; 13 | public: 14 | const float a; 15 | const float c; 16 | 17 | private: 18 | float inv_c2, inv_c2b, d_exp; 19 | public: 20 | Loss(const float a = 0.f, const float c = 1.f, const float eps = 1e-5) 21 | : a{ a } 22 | , c{ c } 23 | { 24 | const auto b = std::fabs(2.f - a) + eps; 25 | const auto c2 = c * c; 26 | const auto d = (a < 0) ? 
a - eps : a + eps; 27 | inv_c2 = 1.f / c2; 28 | inv_c2b = inv_c2 / b; 29 | d_exp = 0.5f * d - 1.f; 30 | } 31 | 32 | Dual 33 | operator () (const float x_) const; 34 | }; 35 | 36 | } -------------------------------------------------------------------------------- /example/SyntheticData.cpp: -------------------------------------------------------------------------------- 1 | #define _USE_MATH_DEFINES 2 | #include 3 | #include "SyntheticData.h" 4 | // Returns a 3 x N matrix whose columns are points spread over a sphere of the given center and radius (golden-angle spiral), each coordinate optionally perturbed by zero-mean Gaussian noise of standard deviation `noise`. A `noise` <= 0 adds no perturbation. Points with a non-finite coordinate are dropped, so N <= samples. The generator is seeded with 0, so the output is repeatable. 5 | Eigen::MatrixXd 6 | GeneratePointsOnSphereSurface( 7 | const int samples, 8 | const Eigen::Vector3d center, 9 | const double radius, 10 | const double noise) 11 | { 12 | std::vector temp; 13 | 14 | std::mt19937_64 rng(0); 15 | std::normal_distribution normal(0, noise > 0. ? noise : 1.); // stddev must be strictly positive; the placeholder 1 is never sampled 16 | const auto jitter = [&]() { return noise > 0. ? normal(rng) : 0.; }; // only draw from the distribution when noise was requested 17 | const auto offset = 2. / samples; 18 | const auto increment = M_PI * (3. - sqrt(5.)); 19 | 20 | for (int i = 0; i < samples; i++) { 21 | const auto y = ((i * offset) - 1) + (offset / 2); 22 | const auto r = sqrt(1 - y*y); 23 | const auto phi = i * increment; 24 | const auto x = cos(phi) * r; 25 | const auto z = sin(phi) * r; 26 | const auto xx = x * radius + center[0]; 27 | const auto yy = y * radius + center[1]; 28 | const auto zz = z * radius + center[2]; 29 | if (std::isfinite(xx) && std::isfinite(yy) && std::isfinite(zz)) { 30 | temp.emplace_back(xx + jitter(), yy + jitter(), zz + jitter()); 31 | } 32 | } 33 | 34 | Eigen::MatrixXd out; 35 | out.resize(3, temp.size()); 36 | for (unsigned int i = 0; i < temp.size(); i++) { 37 | out.col(i) = temp[i]; 38 | } 39 | 40 | return out; 41 | } 42 | -------------------------------------------------------------------------------- /example/main.cpp: -------------------------------------------------------------------------------- 1 | #include "Sphere.h" 2 | #include "SyntheticData.h" 3 | #include "Optimization.h" 4 | 5 | /* 6 | 7 | This example fits a sphere of constant radius to a 8 | noisy set of 3d points. 
9 | 10 | In order to fit a different model to different data, 11 | all the user must do is create a different class 12 | which implements the same operator() as the Sphere 13 | class does in this example. 14 | 15 | On each iteration of the optimization the Optimizer 16 | object loops over the data and calls that operator() 17 | function with the current model parameters and one 18 | column of data matrix. (Each column represents a 19 | different data point.) 20 | 21 | */ 22 | 23 | int main() 24 | { 25 | using Dual = PistolsAtDawn::Dual_Float<1>; 26 | 27 | const int data_points = 100; 28 | const Eigen::Vector3d sphere_center = Eigen::Vector3d(2, 3, 5); 29 | const double radius = 1.; 30 | const double noise = 0.1; 31 | 32 | const Eigen::MatrixXd data = GeneratePointsOnSphereSurface(data_points, sphere_center, radius, noise); 33 | 34 | Eigen::VectorXd parameters = Eigen::Vector3d::Zero(); 35 | 36 | auto sphere = Sphere(radius); 37 | 38 | std::cout << "First Residual: " << (parameters - sphere_center).norm() << std::endl; 39 | 40 | PistolsAtDawn::Optimizer opt; 41 | opt.Optimize(sphere, parameters, data); 42 | 43 | std::cout << "Final Residual: " << (parameters - sphere_center).norm() << std::endl; 44 | 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Christopher Parker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | All changes and improvements to the Software, 
or derived works of similar or general purpose, must be made publicly available within 3 months. All such changes and improvements must also be reflected in a pull request to the GitHub repository “csp256/PistolsAtDawn” within the same 3 months. This pull request must include a summary of changes, and include all relevant compilation instructions. 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(PistolsAtDawn) 3 | 4 | set(CMAKE_CXX_STANDARD 11) 5 | set(CMAKE_DEBUG_POSTFIX "-dbg") 6 | 7 | find_package(Eigen3 REQUIRED NO_MODULE) 8 | # find_package for eigen does not work sometimes and does not add library 9 | if (NOT TARGET Eigen3::Eigen) 10 | include_directories(${EIGEN3_INCLUDE_DIR}) 11 | endif() 12 | 13 | add_library(PistolsAtDawn 14 | src/Dual.cpp 15 | src/Loss.cpp 16 | src/Optimization.cpp) 17 | 18 | target_include_directories(PistolsAtDawn PUBLIC 19 | $ 20 | $) 21 | 22 | if (TARGET Eigen3::Eigen) 23 | # we can relay on it to provide transitive dependency 24 | target_link_libraries(PistolsAtDawn PUBLIC Eigen3::Eigen) 25 | endif() 26 | 27 | add_library(PistolsAtDawn::PistolsAtDawn ALIAS PistolsAtDawn) 28 | 29 | #### Installation of library 30 | 
install(DIRECTORY src/ DESTINATION include 31 | FILES_MATCHING PATTERN "*.h") 32 | 33 | install(TARGETS PistolsAtDawn 34 | ARCHIVE DESTINATION lib/static 35 | LIBRARY DESTINATION lib 36 | RUNTIME DESTINATION bin 37 | PUBLIC_HEADER DESTINATION include) 38 | 39 | ### Other (tests, examples, etc) 40 | 41 | option(BUILD_EXAMPLE "Build example" ON) 42 | 43 | if (BUILD_EXAMPLE) 44 | add_executable(sphere_example 45 | example/main.cpp 46 | example/Sphere.cpp 47 | example/SyntheticData.cpp 48 | ) 49 | target_link_libraries(sphere_example PistolsAtDawn::PistolsAtDawn) 50 | endif() 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PistolsAtDawn 2 | 3 | Easy optimization framework with explicitly vectorized automatic differentiation. 4 | 5 | Think of it as "Ceres-light". The only dependency is Eigen, so installation is painless. 6 | 7 | Fitting a model to data is as simple as writing a function to compute the residual (error). See the example on sphere-fitting. 8 | 9 | # Features 10 | 11 | * Robustified Gauss-Newton, as per "Bundle Adjustment, A Modern Synthesis" 12 | * Barron loss function. 13 | * to be continued 14 | 15 | # Design Philosophy 16 | 17 | 1. Require the absolute bare minimum of the user. 18 | 2. Insist on clean code that is easy to read and modify. 19 | 3. Default behavior should be user friendly, but flexible and configurable. 20 | 4. Autodiff functionality should stand alone and be very high performance. 21 | 5. Introduce advanced features with pedagogy and erudition in mind. 22 | 23 | # Automatic Differentiation 24 | 25 | The forward mode, dual number approach is used. 26 | 27 | All operations involving the dual part are explicitly vectorized using SSE. 28 | 29 | Currently, only single precision is supported for model evaluation, but this will change. I will replace the SSE intrinsics with a SIMD wrapper (taking suggestions!) 
to enable AVX, NEON, double precision, etc. 30 | 31 | # Build 32 | 33 | You can either copy `src/` to your code or use CMake to build the example. 34 | Ensure you have CMake (>= 3.5) and Eigen3 installed. Then you can type: 35 | ``` 36 | mkdir build && cd build 37 | cmake .. 38 | make -j 39 | ``` 40 | and you should expect it to create `./sphere_example` (inside the `build` directory) that you can run. 41 | 42 | This is a draft CMake for quick setup, with partial support for `make install` or embedding as part of another project (with `add_subdirectory`). 43 | Tested on macOS/clang and Ubuntu 16.04 gcc5. 44 | 45 | # License 46 | 47 | Modified MIT. Free for all uses, even commercial, with one caveat: 48 | 49 | All changes and improvements to the Software, 50 | or derived works of similar or general purpose, 51 | must be made publicly available within 3 months. 52 | 53 | All such changes and improvements must also be 54 | reflected in a pull request to the GitHub 55 | repository “csp256/PistolsAtDawn” within the 56 | same 3 months. 57 | 58 | This pull request must include a 59 | summary of changes, and include all relevant 60 | compilation instructions. 61 | 62 | While the license does not require it, I encourage users of PistolsAtDawn to let me know how they are using it, and to cite it in any related academic papers. 63 | 64 | # Need Help? 65 | 66 | Please open an issue if you have any questions. 
67 | -------------------------------------------------------------------------------- /src/Optimization.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "Dual.h" 6 | #include "Loss.h" 7 | 8 | namespace PistolsAtDawn { 9 | 10 | template 11 | struct Optimizer 12 | { 13 | static constexpr int kSlots = 1; 14 | 15 | using Dual = Dual_Float; 16 | using Duel = Dual_Float; 17 | using PistolsAtDawn = Dual_Float; 18 | 19 | Eigen::Matrix parameters_; 20 | Eigen::Matrix datum_; 21 | Eigen::Matrix state; 22 | Eigen::Matrix J_row; 23 | 24 | int iteration = 0; 25 | int max_iterations; 26 | Type learning_rate; 27 | Type measurement_error; 28 | Loss barron_loss; 29 | 30 | void 31 | Reset(const int num_data, const int num_parameters); 32 | 33 | void 34 | LoadDefaults(); 35 | 36 | Optimizer() 37 | : barron_loss(0.f, 1.f) 38 | { LoadDefaults(); } 39 | 40 | // See "Bundle adjustment, a modern synthesis", page 17, equation 10 41 | template 42 | inline 43 | void 44 | Optimize( 45 | F & f, 46 | Eigen::VectorXd & parameters, 47 | const Eigen::MatrixXd & data) 48 | { 49 | Reset(data.cols(), parameters.rows()); 50 | 51 | for (int i = 0; i < parameters_.rows(); i++) { 52 | parameters_(i, 0) = Dual((float) parameters[i], i); 53 | } 54 | 55 | Eigen::Matrix H; 56 | Eigen::Matrix g; 57 | H.resize(parameters.rows(), parameters.rows()); 58 | g.resize(parameters.rows(), 1); 59 | 60 | for (int count = 0; count < max_iterations; count++) { 61 | for (int i = 0; i < parameters.rows(); i++) { 62 | parameters_(i, 0).data.x = (float) parameters[i]; 63 | } 64 | 65 | H.setZero(); 66 | g.setZero(); 67 | Type DeltaZ; 68 | const Type W = 1 / (measurement_error * measurement_error); 69 | 70 | for (int i = 0; i < data.cols(); i++) { 71 | const auto & datum_ = data.col(i).template cast(); 72 | Dual f_ = f(parameters_.col(0), datum_); 73 | DeltaZ = f_.Primal(); 74 | for (int j = 0; j < parameters.rows(); j++) { 75 | 
J_row[j] = (Type)f_.Partial(j); 76 | } 77 | const auto W_DeltaZ = W * DeltaZ; 78 | const auto DeltaZ_W_DeltaZ = DeltaZ * W_DeltaZ; 79 | const auto loss = barron_loss((float) DeltaZ_W_DeltaZ); 80 | auto InnerTerm = [&]() 81 | { 82 | const bool negative_curvature = 83 | (loss.Partial(0) * DeltaZ_W_DeltaZ) < (-0.5 * loss.Primal()); 84 | 85 | if (negative_curvature) { 86 | // Unsafe to apply acceleration term without rescaling (see Triggs, eq 11) 87 | // so we just set it to 0 as an approximation (I think this is what Ceres does?) 88 | return loss.Primal() * W; 89 | } else { 90 | // Can safely add acceleration term because we have positive curvature 91 | return loss.Primal() * W + 2. * loss.Partial(0) * W_DeltaZ * W_DeltaZ; 92 | } 93 | }; 94 | g -= loss.Primal() * J_row.transpose() * W * DeltaZ; 95 | H += J_row.transpose() * InnerTerm() * J_row; 96 | } // loop over data 97 | 98 | Eigen::BDCSVD> solver(H, Eigen::ComputeFullU | Eigen::ComputeFullV); 99 | 100 | parameters.col(0) += (learning_rate * solver.solve(g)).eval(); 101 | 102 | //std::cout << parameters.transpose() << std::endl; 103 | } // main optimization loop 104 | 105 | } // Optimizer::Optimize() 106 | 107 | }; // struct Optimizer 108 | 109 | } // namespace PistolsAtDawn 110 | -------------------------------------------------------------------------------- /src/Dual.cpp: -------------------------------------------------------------------------------- 1 | #include "Dual.h" 2 | 3 | // I should really make these all inline and move them into the header. 
4 | 5 | #define forSlots for (int i = 0; i < kSlots; i++) 6 | 7 | namespace PistolsAtDawn { 8 | 9 | Float4 operator + (const float c, const Float4 & x) { return x + c; } 10 | Float4 operator - (const float c, const Float4 & x) { return _mm_sub_ps(_mm_set_ps1(c), x.q); } 11 | Float4 operator * (const float c, const Float4 & x) { return x * c; } 12 | Float4 operator / (const float c, const Float4 & x) { return _mm_mul_ps(_mm_set_ps1(c), _mm_rcp_ps(x.q)); } 13 | Float4 operator+(const Float4 & x) { return x; } 14 | Float4 operator-(const Float4 & x) { return Float4(_mm_sub_ps(_mm_setzero_ps(), x.q)); } 15 | 16 | // ***************************** 17 | 18 | template Dual_Float operator+(const float lhs, const Dual_Float & rhs) { return rhs + lhs; } 19 | template Dual_Float operator-(const float lhs, const Dual_Float & rhs) { return -rhs + lhs; } 20 | template Dual_Float operator*(const float lhs, const Dual_Float & rhs) { return rhs * lhs; } 21 | template Dual_Float operator/(const float lhs, const Dual_Float & rhs) { /* scalar / dual: primal is lhs / x; by the quotient rule each partial is -lhs / x^2 times the partial of rhs (never lhs divided by the partial itself) */ Dual_Float out; out.data.x = lhs / rhs.data.x; const float inv = 1.f / rhs.data.x; const float scale = -lhs * inv * inv; forSlots{ out.data.dx[i] = rhs.data.dx[i] * scale; } return out; } 22 | template bool operator< (const T lhs, const Dual_Float & rhs) { return lhs < rhs.data.x; } 23 | template bool operator<=(const T lhs, const Dual_Float & rhs) { return lhs <= rhs.data.x; } 24 | template bool operator> (const T lhs, const Dual_Float & rhs) { return lhs > rhs.data.x; } 25 | template bool operator>=(const T lhs, const Dual_Float & rhs) { return lhs >= rhs.data.x; } 26 | template bool operator==(const T lhs, const Dual_Float & rhs) { return lhs == rhs.data.x; } 27 | template bool operator!=(const T lhs, const Dual_Float & rhs) { return lhs != rhs.data.x; } 28 | template Dual_Float operator+(const Dual_Float & x) { return Dual_Float(x); } 29 | template Dual_Float operator-(const Dual_Float & x) { Dual_Float y{ -x.data.x }; forSlots{ y.data.dx[i] = -x.data.dx[i]; } return y; } 30 | template Dual_Float sin(const Dual_Float & x) { 
Dual_Float out{ sin(x.data.x) }; const float c = cos(x.data.x); for (int i = 0; i < kSlots; i++) { out.data.dx[i] = x.data.dx[i] * c; } return out; } 31 | template Dual_Float cos(const Dual_Float & x) { Dual_Float out{ cos(x.data.x) }; const float ns = -sin(x.data.x); forSlots{ out.data.dx[i] = x.data.dx[i] * ns; } return out; } 32 | // template Dual_Float tan(const Dual_Float & x) { Dual_Float out{ asin(x.data.x) }; const float t = sec( x.data.x ); const float t2 = t * t; forSlots{ out.data.dx[i] = x.data.dx[i] * t2; } return out; } 33 | template Dual_Float asin(const Dual_Float & x) { Dual_Float out{ asin(x.data.x) }; const float t = 1.f / sqrt(1.f - x.data.x*x.data.x); forSlots{ out.data.dx[i] = x.data.dx[i] * t; } return out; } 34 | template Dual_Float acos(const Dual_Float & x) { Dual_Float out{ acos(x.data.x) }; const float t = -1.f / sqrt(1.f - x.data.x*x.data.x); forSlots{ out.data.dx[i] = x.data.dx[i] * t; } return out; } 35 | template Dual_Float atan(const Dual_Float & x) { Dual_Float out{ atan(x.data.x) }; const float t = 1.f / (1.f + x.data.x*x.data.x); forSlots{ out.data.dx[i] = x.data.dx[i] * t; } return out; } 36 | template Dual_Float exp(const Dual_Float & x) { Dual_Float out{ exp(x.data.x) }; /*const Float4 e = out.data.x;*/ forSlots{ out.data.dx[i] = x.data.dx[i] * out.data.x; } return out; } 37 | template Dual_Float log(const Dual_Float & x) { Dual_Float out{ log(x.data.x) }; const Float4 inv = _mm_rcp_ps(_mm_set1_ps(x.data.x)); forSlots{ out.data.dx[i] = x.data.dx[i] * inv; } return out; } 38 | 39 | } // namespace PistolsAtDawn 40 | 41 | #undef forSlots 42 | -------------------------------------------------------------------------------- /src/Dual.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef _MSC_VER 4 | #include 5 | #else 6 | #include 7 | #endif 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace PistolsAtDawn { 14 | 15 | #define forSlots for (int i = 0; i < 
kSlots; i++) 16 | 17 | struct Float4 18 | { 19 | __m128 q; 20 | Float4() : q{ 0 } {} 21 | Float4(const __m128 q) : q{ q } {} 22 | Float4(const float x[4]) : q{ x[0],x[1],x[2],x[3] } {} 23 | Float4(const float a, const float b, const float c, const float d) : q{ a,b,c,d } {} 24 | // Float4(const int x) : q{ 0 } { Track(x); } 25 | void Track(const uint64_t x) { float pf[4]; memcpy(pf, &q, sizeof(q)); pf[x] = 1.f; memcpy(&q, pf, sizeof(q)); } 26 | // void Track(const uint64_t x) { union u { __m128 p; float pf[4]; }; ((u*) &q )->pf[x] = 1.0f; /* oh, is this not working right?? */ } 27 | float Partial(const uint64_t x) const { float pf[4]; memcpy(pf, &q, sizeof(q)); return pf[x]; } 28 | // float Partial(const uint64_t x) const { union u { __m128 p; float pf[4]; }; return ((u*)&q)->pf[x]; } 29 | void Print() const { std::cout << "{ " << Partial(0) << " " << Partial(1) << " " << Partial(2) << " " << Partial(3) << " } "; } 30 | void Reset(void) { q = _mm_setzero_ps(); } 31 | // float operator[](const int i) const { float uf[4]; memcpy(uf, &q, sizeof(q)); return uf[i]; } 32 | Float4 & operator+=(const Float4 & rhs) { q = _mm_add_ps(q, rhs.q); return *this; } 33 | Float4 & operator-=(const Float4 & rhs) { q = _mm_sub_ps(q, rhs.q); return *this; } 34 | Float4 & operator*=(const Float4 & rhs) { q = _mm_mul_ps(q, rhs.q); return *this; } 35 | Float4 & operator/=(const Float4 & rhs) { q = _mm_div_ps(q, rhs.q); return *this; } 36 | Float4 operator+ (const Float4 & rhs) const { return Float4(*this) += rhs; } 37 | Float4 operator- (const Float4 & rhs) const { return Float4(*this) -= rhs; } 38 | Float4 operator* (const Float4 & rhs) const { return Float4(*this) *= rhs; } 39 | Float4 operator/ (const Float4 & rhs) const { return Float4(*this) /= rhs; } 40 | Float4 & operator+=(const float rhs) { q = _mm_add_ps(q, _mm_set_ps1(rhs)); return *this; } 41 | Float4 & operator-=(const float rhs) { q = _mm_sub_ps(q, _mm_set_ps1(rhs)); return *this; } 42 | Float4 & operator*=(const float rhs) 
{ q = _mm_mul_ps(q, _mm_set_ps1(rhs)); return *this; } 43 | Float4 & operator/=(const float rhs) { q = _mm_mul_ps(q, _mm_rcp_ps(_mm_set1_ps(rhs))); return *this; } 44 | /* 45 | Though the above gets good codegen, technically rcp is an approximation. 46 | Intel says: 47 | The relative error for this approximation is: 48 | |Relative Error| ≤ 1.5 ∗ 2^−12 49 | You can use the below implementation instead, if it makes you feel better. 50 | Also, search for other places the "reciprocal and multiply" idiom is used! 51 | Though you should note that this *ONLY* impacts the partial derivatives. 52 | (Primal part never has data dependency on dual part.) 53 | */ 54 | // Float4 & operator/=(const float rhs) { q = _mm_div_ps(q, _mm_set_ps1(rhs)); return *this; } 55 | Float4 operator+ (const float rhs) const { return Float4(*this) += rhs; } 56 | Float4 operator- (const float rhs) const { return Float4(*this) -= rhs; } 57 | Float4 operator* (const float rhs) const { return Float4(*this) *= rhs; } 58 | Float4 operator/ (const float rhs) const { return Float4(*this) /= rhs; } 59 | Float4 & operator+=(const __m128 rhs) { q = _mm_add_ps(q, rhs); return *this; } 60 | Float4 & operator-=(const __m128 rhs) { q = _mm_sub_ps(q, rhs); return *this; } 61 | Float4 & operator*=(const __m128 rhs) { q = _mm_mul_ps(q, rhs); return *this; } 62 | Float4 & operator/=(const __m128 rhs) { q = _mm_div_ps(q, rhs); return *this; } 63 | Float4 operator+ (const __m128 rhs) const { return Float4(*this) += rhs; } 64 | Float4 operator- (const __m128 rhs) const { return Float4(*this) -= rhs; } 65 | Float4 operator* (const __m128 rhs) const { return Float4(*this) *= rhs; } 66 | Float4 operator/ (const __m128 rhs) const { return Float4(*this) /= rhs; } 67 | Float4 & operator= (const Float4 & rhs) { q = rhs.q; return *this; } 68 | 69 | }; // struct Float4 70 | 71 | Float4 operator + (const float c, const Float4 & x); 72 | Float4 operator - (const float c, const Float4 & x); 73 | Float4 operator * (const float c, 
const Float4 & x); 74 | Float4 operator / (const float c, const Float4 & x); 75 | Float4 operator + (const Float4 & x); 76 | Float4 operator - (const Float4 & x); 77 | 78 | /* ************************************************************************************************************************ 79 | Data Type 80 | ************************************************************************************************************************* */ 81 | 82 | struct DummyType {}; 83 | namespace Internal { 84 | 85 | template 86 | struct Data 87 | { 88 | Float4 dx[kSlots]; 89 | float x; 90 | // char padding[12]; 91 | Data() {} 92 | Data(const float & x) : x{ x } { } 93 | Data(const Data & rhs) : x{ rhs.x } { forSlots{ dx[i] = rhs.dx[i]; } } 94 | void Track(const uint64_t i) { dx[i / 4].Track(i % 4); } 95 | }; 96 | 97 | template 98 | struct Data 99 | { 100 | union { 101 | DummyType dx[1]; 102 | float x; 103 | // DummyType padding[1]; 104 | }; 105 | Data() {} 106 | Data(const float & x) : x{ x } {} 107 | Data(const Data & x) : x{ x.x } {} 108 | }; 109 | 110 | } // namespace Internal 111 | 112 | 113 | /* ************************************************************************************************************************ 114 | Dual Type 115 | ************************************************************************************************************************* */ 116 | 117 | 118 | template 119 | struct Dual_Float 120 | { 121 | typedef float InternalType; 122 | static constexpr int kSlots = kSlots_; 123 | Internal::Data data; 124 | static constexpr float ln2 = 0.6931471805599453f; 125 | Dual_Float() {} 126 | Dual_Float(const float x) : data{ x } {} 127 | Dual_Float(const float x, const int slot) : data{ x } { data.Track(slot); } 128 | Dual_Float(const Dual_Float & d) : data{ d.data } {} 129 | void Print(void) const { std::cout << data.x << " { "; forSlots{ data.dx[i].Print(); } std::cout << "}" << std::endl; } 130 | void Track(const uint64_t slot) { data.Track(slot); } 131 | 
float Primal() const { return data.x; } 132 | float & PrimalReference() { return data.x; } 133 | float Partial(const uint64_t slot) const { const Float4 & dx = data.dx[slot / 4]; return dx.Partial(slot % 4); } 134 | Dual_Float & operator+=(const Dual_Float & rhs) { forSlots{ data.dx[i] += rhs.data.dx[i]; } data.x += rhs.data.x; return *this; }; 135 | Dual_Float & operator-=(const Dual_Float & rhs) { forSlots{ data.dx[i] -= rhs.data.dx[i]; } data.x -= rhs.data.x; return *this; }; 136 | Dual_Float & operator*=(const Dual_Float & rhs) { forSlots{ data.dx[i] = data.dx[i] * rhs.data.x + rhs.data.dx[i] * data.x; } data.x *= rhs.data.x; return *this; } 137 | Dual_Float & operator/=(const Dual_Float & rhs) { const float inv = 1.f / rhs.data.x, inv2 = inv * inv; forSlots{ data.dx[i] = (data.dx[i] * rhs.data.x - rhs.data.dx[i] * data.x) * inv2; } data.x *= inv; return *this; } 138 | Dual_Float operator+ (const Dual_Float & rhs) const { return Dual_Float(*this) += rhs; } 139 | Dual_Float operator- (const Dual_Float & rhs) const { return Dual_Float(*this) -= rhs; } 140 | Dual_Float operator* (const Dual_Float & rhs) const { return Dual_Float(*this) *= rhs; } 141 | Dual_Float operator/ (const Dual_Float & rhs) const { return Dual_Float(*this) /= rhs; } 142 | Dual_Float & operator+=(const float rhs) { data.x += rhs; return *this; } 143 | Dual_Float & operator-=(const float rhs) { data.x -= rhs; return *this; } 144 | Dual_Float & operator*=(const float rhs) { forSlots{ data.dx[i] *= rhs; } data.x *= rhs; return *this; } 145 | Dual_Float & operator/=(const float rhs) { const float inv = 1.f / rhs; forSlots{ data.dx[i] *= inv; } data.x *= inv; return *this; } 146 | Dual_Float operator+ (const float rhs) const { return Dual_Float(*this) += rhs; } 147 | Dual_Float operator- (const float rhs) const { return Dual_Float(*this) -= rhs; } 148 | Dual_Float operator* (const float rhs) const { return Dual_Float(*this) *= rhs; } 149 | Dual_Float operator/ (const float rhs) const { return 
Dual_Float(*this) /= rhs; } 150 | bool operator< (const Dual_Float & rhs) const { return data.x < rhs.data.x; } 151 | bool operator<=(const Dual_Float & rhs) const { return data.x <= rhs.data.x; } 152 | bool operator> (const Dual_Float & rhs) const { return data.x > rhs.data.x; } 153 | bool operator>=(const Dual_Float & rhs) const { return data.x >= rhs.data.x; } 154 | bool operator==(const Dual_Float & rhs) const { return data.x == rhs.data.x; } 155 | bool operator!=(const Dual_Float & rhs) const { return data.x != rhs.data.x; } 156 | Dual_Float & operator=(const Dual_Float & rhs) { data.x = rhs.data.x; forSlots{ data.dx[i] = rhs.data.dx[i]; } return *this; } 157 | bool operator< (const float rhs) const { return data.x < rhs; } 158 | bool operator<=(const float rhs) const { return data.x <= rhs; } 159 | bool operator> (const float rhs) const { return data.x > rhs; } 160 | bool operator>=(const float rhs) const { return data.x >= rhs; } 161 | bool operator==(const float rhs) const { return data.x == rhs; } 162 | bool operator!=(const float rhs) const { return data.x != rhs; } 163 | Dual_Float & operator=(const float rhs) { data.x = rhs; forSlots{ data.dx[i].Reset(); } return *this; } 164 | bool operator< (const int rhs) const { return data.x < rhs; } 165 | bool operator<=(const int rhs) const { return data.x <= rhs; } 166 | bool operator> (const int rhs) const { return data.x > rhs; } 167 | bool operator>=(const int rhs) const { return data.x >= rhs; } 168 | bool operator==(const int rhs) const { return data.x == rhs; } 169 | bool operator!=(const int rhs) const { return data.x != rhs; } 170 | Dual_Float & operator=(const int rhs) { data.x = (float)rhs; forSlots{ data.dx[i].Reset(); } return *this; } 171 | // operator float() const { return (float) data.x; } 172 | // operator double() const { return (double) data.x; } 173 | 174 | }; // struct Dual_Float 175 | 176 | template Dual_Float operator+(const float lhs, const Dual_Float & rhs); 177 | template Dual_Float 
operator-(const float lhs, const Dual_Float & rhs); 178 | template Dual_Float operator*(const float lhs, const Dual_Float & rhs); 179 | template Dual_Float operator/(const float lhs, const Dual_Float & rhs); 180 | template bool operator< (const T lhs, const Dual_Float & rhs); 181 | template bool operator<=(const T lhs, const Dual_Float & rhs); 182 | template bool operator> (const T lhs, const Dual_Float & rhs); 183 | template bool operator>=(const T lhs, const Dual_Float & rhs); 184 | template bool operator==(const T lhs, const Dual_Float & rhs); 185 | template bool operator!=(const T lhs, const Dual_Float & rhs); 186 | template Dual_Float operator+(const Dual_Float & x); 187 | template Dual_Float operator-(const Dual_Float & x); 188 | 189 | /* ************************************************************************************************************************ 190 | Special functions 191 | ************************************************************************************************************************* */ 192 | 193 | template Dual_Float sin(const Dual_Float & x); 194 | template Dual_Float cos(const Dual_Float & x); 195 | // template Dual_Float tan(const Dual_Float & x) { Dual_Float out{ asin(x.data.x) }; const float t = sec( x.data.x ); const float t2 = t * t; forSlots{ out.data.dx[i] = x.data.dx[i] * t2; } return out; } 196 | template Dual_Float asin(const Dual_Float & x); 197 | template Dual_Float acos(const Dual_Float & x); 198 | template Dual_Float atan(const Dual_Float & x); 199 | template Dual_Float exp(const Dual_Float & x); 200 | template Dual_Float log(const Dual_Float & x); 201 | // Square root of a dual number: primal is sqrtf(x); each partial is scaled by d/dx sqrt(x) = 1 / (2 sqrt(x)). When the root is not positive (x == 0, or NaN from a negative x) the partials are set to 0 instead of producing inf/NaN lanes. 202 | template inline Dual_Float sqrt(const Dual_Float & x) { Dual_Float out{ sqrtf(x.data.x) }; const float scale = (out.data.x > 0.f) ? 0.5f / out.data.x : 0.f; forSlots{ out.data.dx[i] = x.data.dx[i] * scale; } return out; } 203 | 204 | /* Still missing one of the pow's! 
*/ 205 | template inline Dual_Float pow(const Dual_Float & x, const float k) { Dual_Float out{ powf(x.data.x, k) }; const Float4 c = k * out.data.x * _mm_rcp_ps(_mm_set1_ps(x.data.x)); forSlots{ out.data.dx[i] = c * x.data.dx[i]; } return out; } 206 | //template inline Dual_Float pow(const Dual_Float & x, const Dual_Float & k) { Dual_Float out = Dual_Float{ pow(x.data.x, k.data.x) }; const float c = k.data.x * out.data.x / x.data.x; const float d = out.data.x * log(x.data.x); forSlots{ out.data.dx[i] = c * x.data.dx[i] + d * k.data.dx[i]; } return out; } 207 | 208 | template struct Dual_Float<1>; 209 | 210 | } // namespace PistolsAtDawn 211 | 212 | #undef forSlots 213 | --------------------------------------------------------------------------------