├── CuMatrix
│   ├── Buffers
│   │   ├── GenericBuffer.h
│   │   └── ManagedBuffer.h
│   ├── Geometry
│   │   └── Geometry.h
│   ├── Interface
│   │   └── EigenInterface.h
│   └── MatrixOps
│       ├── CuMatrix.h
│       ├── CuMatrixDefs.h
│       ├── CuMatrixVis.h
│       └── VectorTypes.h
├── Examples
│   └── P01_UseMangeBuffer
│       ├── CMakeLists.txt
│       ├── Timer.h
│       ├── main.cpp
│       ├── main.cu
│       └── main.cuh
├── LICENSE
├── README.md
└── cmake
    ├── CuMatrixConfig.cmake
    └── FindCuda.cmake
/CuMatrix/Buffers/GenericBuffer.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "cuda_runtime.h"
3 | #include <cstdint>
4 | enum class CudaDataType : int32_t
5 | {
6 |     //! 32-bit floating point format.
7 |     kFLOAT = 0,
8 | 
9 |     //! IEEE 16-bit floating-point format.
10 |     kHALF = 1,
11 | 
12 |     //! 8-bit integer representing a quantized floating-point value.
13 |     kINT8 = 2,
14 | 
15 |     //! Signed 32-bit integer format.
16 |     kINT32 = 3,
17 | 
18 |     //! 8-bit boolean. 0 = false, 1 = true, other values undefined.
19 |     kBOOL = 4,
20 | 
21 |     //! 64-bit pointer (on x64 systems).
22 |     kPOINTER = 5,
23 | 
24 |     //! variable size
25 |     kSTRUCT = 6
26 | };
27 | 
28 | 
29 | //!
30 | //! \brief The GenericBuffer class is a templated class for buffers. Adapted from the TensorRT samples.
31 | //!
32 | //! \details This templated RAII (Resource Acquisition Is Initialization) class handles the allocation,
33 | //! deallocation, and querying of buffers on both the device and the host.
34 | //! It can handle data of arbitrary types because it stores byte buffers.
35 | //! The template parameters AllocFunc and FreeFunc are used for the
36 | //! allocation and deallocation of the buffer.
37 | //! AllocFunc must be a functor that takes in (void** ptr, size_t size)
38 | //! and returns bool. ptr is a pointer to where the allocated buffer address should be stored.
39 | //! size is the amount of memory in bytes to allocate.
40 | //! The boolean indicates whether or not the memory allocation was successful.
41 | //! FreeFunc must be a functor that takes in (void* ptr) and returns void.
42 | //! ptr is the allocated buffer address. It must work with nullptr input.
43 | //!
44 | //!
45 | template <typename AllocFunc, typename FreeFunc>
46 | class GenericBuffer
47 | {
48 | public:
49 |     //!
50 |     //! \brief Construct an empty buffer.
51 |     //!
52 |     //!
53 |     GenericBuffer(size_t elementSize, CudaDataType type = CudaDataType::kFLOAT)
54 |         : mSize(0)
55 |         , mCapacity(0)
56 |         , mElementSize(elementSize)
57 |         , mType(type)
58 |         , mBuffer(nullptr)
59 |         , mOwnership(true)
60 |     {
61 |     }
62 | 
63 |     GenericBuffer(GenericBuffer&& buf)
64 |         : mSize(buf.mSize)
65 |         , mCapacity(buf.mCapacity)
66 |         , mElementSize(buf.getElementSize())
67 |         , mType(buf.mType)
68 |         , mBuffer(buf.mBuffer)
69 |         , mOwnership(buf.getOwnerShip())
70 |     {
71 |         buf.mSize = 0;
72 |         buf.mCapacity = 0;
73 |         buf.mType = CudaDataType::kFLOAT;
74 |         buf.mBuffer = nullptr;
75 |     }
76 | 
77 |     // takeOwnership will be ignored if pPreAllocBuf is nullptr
78 |     GenericBuffer(size_t size, CudaDataType type, size_t elementSize, void* pPreAllocBuf = nullptr, bool takeOwnership = false)
79 |         : mSize(size)
80 |         , mCapacity(size)
81 |         , mElementSize(elementSize)
82 |         , mType(type)
83 |     {
84 |         if (pPreAllocBuf == nullptr && size)
85 |         {
86 |             initializeWithSpace(size, type);
87 |         }
88 |         else
89 |         {
90 |             mOwnership = takeOwnership;
91 |             mBuffer = pPreAllocBuf;
92 | 
93 |         }
94 |     }
95 | 
96 |     ////!
97 |     ////! \brief Construct a buffer with the specified allocation size in bytes.
98 |     ////!
99 | //GenericBuffer(size_t size, CudaDataType type) 100 | // : mSize(size) 101 | // , mCapacity(size) 102 | // , mType(type) 103 | // , mOwnership(true) 104 | //{ 105 | // if (!allocFn(&mBuffer, this->nbBytes())) 106 | // { 107 | // throw std::bad_alloc(); 108 | // } 109 | //} 110 | 111 | void initializeWithSpace(size_t size, CudaDataType type) { 112 | mSize = size; 113 | mCapacity = size; 114 | mType = type; 115 | mOwnership = true; 116 | 117 | if (!allocFn(&mBuffer, this->nbBytes())) 118 | { 119 | throw std::bad_alloc(); 120 | } 121 | } 122 | 123 | GenericBuffer& operator=(GenericBuffer&& buf) 124 | { 125 | if (this != &buf) 126 | { 127 | freeBuf(); 128 | 129 | mSize = buf.mSize; 130 | mCapacity = buf.mCapacity; 131 | mType = buf.mType; 132 | mBuffer = buf.mBuffer; 133 | // Reset buf. 134 | buf.mSize = 0; 135 | buf.mCapacity = 0; 136 | buf.mBuffer = nullptr; 137 | } 138 | return *this; 139 | } 140 | 141 | inline uint32_t getElementSize() const 142 | { 143 | 144 | return mElementSize; 145 | } 146 | 147 | //! 148 | //! \brief Returns pointer to underlying array. 149 | //! 150 | void* data() 151 | { 152 | return mBuffer; 153 | } 154 | 155 | //! 156 | //! \brief Returns pointer to underlying array. 157 | //! 158 | const void* data() const 159 | { 160 | return mBuffer; 161 | } 162 | 163 | //! 164 | //! \brief Returns the size (in number of elements) of the buffer. 165 | //! 166 | size_t size() const 167 | { 168 | return mSize; 169 | } 170 | 171 | //! 172 | //! \brief Returns the size (in bytes) of the buffer. 173 | //! 174 | size_t nbBytes() const 175 | { 176 | return this->size() * getElementSize(); 177 | } 178 | 179 | size_t nbBytes(size_t numElements) const 180 | { 181 | return numElements * getElementSize(); 182 | } 183 | 184 | //! 185 | //! \brief Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity. 186 | //! 187 | void resize(size_t newSize) 188 | { 189 | mSize = newSize; 190 | if (mCapacity < newSize) 191 | { 192 | freeBuf(); 193 | if (!allocFn(&mBuffer, this->nbBytes())) 194 | { 195 | throw std::bad_alloc{}; 196 | } 197 | mCapacity = newSize; 198 | 199 | mOwnership = true; 200 | } 201 | } 202 | 203 | ////! 204 | ////! \brief Overload of resize that accepts Dims 205 | ////! 
206 |     //void resize(const Dims& dims)
207 |     //{
208 |     //    return this->resize(volume(dims));
209 |     //}
210 | 
211 |     void freeBuf() {
212 |         if (mOwnership)
213 |         {
214 |             freeFn(mBuffer);
215 |         }
216 |     }
217 | 
218 |     ~GenericBuffer()
219 |     {
220 |         freeBuf();
221 |     }
222 | 
223 |     bool getOwnerShip() {
224 |         return mOwnership;
225 |     }
226 | 
227 | protected:
228 |     // mSize: number of elements in use; mCapacity: number of elements the current allocation can hold
229 |     size_t mSize{ 0 }, mCapacity{ 0 };
230 |     size_t mElementSize{ 0 };
231 |     CudaDataType mType;
232 |     void* mBuffer;
233 |     AllocFunc allocFn;
234 |     FreeFunc freeFn;
235 |     bool mOwnership;
236 | 
237 | };
238 | 
239 | class DeviceAllocator
240 | {
241 | public:
242 |     bool operator()(void** ptr, size_t size) const
243 |     {
244 |         auto retVal = cudaMalloc(ptr, size);
245 |         CUDA_CHECK_RET(retVal);
246 |         return retVal == cudaSuccess;
247 |     }
248 | };
249 | 
250 | class DeviceFree
251 | {
252 | public:
253 |     void operator()(void* ptr) const
254 |     {
255 |         CUDA_CHECK_RET(cudaFree(ptr));
256 |     }
257 | };
258 | 
259 | class ManagedAllocator
260 | {
261 | public:
262 |     bool operator()(void** ptr, size_t size) const
263 |     {
264 |         auto retVal = cudaMallocManaged(ptr, size);
265 |         CUDA_CHECK_RET(retVal);
266 |         return retVal == cudaSuccess;
267 |     }
268 | };
269 | 
270 | class ManagedFree
271 | {
272 | public:
273 |     void operator()(void* ptr) const
274 |     {
275 |         CUDA_CHECK_RET(cudaFree(ptr));
276 |     }
277 | };
278 | 
279 | class HostAllocator
280 | {
281 | public:
282 |     bool operator()(void** ptr, size_t size) const
283 |     {
284 |         CUDA_CHECK_RET(cudaMallocHost(ptr, size));
285 |         //cudaHostAlloc
286 |         return *ptr != nullptr;
287 |     }
288 | };
289 | 
290 | class HostFree
291 | {
292 | public:
293 |     void operator()(void* ptr) const
294 |     {
295 |         //free(ptr);
296 |         CUDA_CHECK_RET(cudaFreeHost(ptr));
297 |     }
298 | };
299 | 
300 | 
301 | using DeviceBuffer = GenericBuffer<DeviceAllocator, DeviceFree>;
302 | using HostBuffer = GenericBuffer<HostAllocator, HostFree>;
303 | 
304 | template <typename Class, typename AllocFunc, typename FreeFunc>
305 | class ClassBuffer
306 | {
307 | public:
308 |     typedef std::shared_ptr<ClassBuffer> SharedPtr;
309 |     typedef ClassBuffer* Ptr;
310 | 
311 |     ClassBuffer(bool callConstructor = false) {
312 |         if (!allocFn(&data, sizeof(Class)))
313 |         {
314 |             throw std::bad_alloc();
315 |         }
316 | 
317 |         if (callConstructor)
318 |         {
319 |             constructor();
320 |         }
321 | 
322 |     }
323 | 
324 |     ~ClassBuffer()
325 |     {
326 |         freeFn(data);
327 |     }
328 | 
329 |     void constructor() {
330 |         new(data) Class();
331 |     }
332 | 
333 |     Class* getData() {
334 |         return (Class *) data;
335 |     }
336 | 
337 |     Class* operator->() const
338 |     {
339 |         return (Class*)data;
340 |     }
341 | protected:
342 |     void* data;
343 |     AllocFunc allocFn;
344 |     FreeFunc freeFn;
345 | };
346 | 
347 | template <typename Class>
348 | using ManagedClassBuffer = ClassBuffer<Class, ManagedAllocator, ManagedFree>;
349 | template <typename Class>
350 | class DeviceClassBuffer : public ClassBuffer<Class, DeviceAllocator, DeviceFree>
351 | {
352 | public:
353 | 
354 |     void fromCPU(Class* pObj) {
355 |         CUDA_CHECK_RET(cudaMemcpy(this->data, pObj, sizeof(Class), cudaMemcpyHostToDevice));
356 |     }
357 | };
358 | 
--------------------------------------------------------------------------------
/CuMatrix/Buffers/ManagedBuffer.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "cuda_runtime.h"
3 | #include "../MatrixOps/CuMatrixDefs.h"
4 | #include "GenericBuffer.h"
5 | 
6 | ////! 32-bit floating point format.
7 | //kFLOAT = 0,
8 | 
9 | ////! IEEE 16-bit floating-point format.
10 | //kHALF = 1,
11 | 
12 | ////! 8-bit integer representing a quantized floating-point value.
13 | //kINT8 = 2,
14 | 
15 | ////! Signed 32-bit integer format.
16 | //kINT32 = 3, 17 | 18 | ////! 8-bit boolean. 0 = false, 1 = true, other values undefined. 19 | //kBOOL = 4 20 | struct TypeSelceter 21 | { 22 | //static CudaDataType selectTypes(long v) { return CudaDataType::kINT32; } 23 | //static CudaDataType selectTypes(int32_t v) { return CudaDataType::kINT32; } 24 | //static CudaDataType selectTypes(float v) {return CudaDataType::kFLOAT; } } 25 | //static CudaDataType selectTypes(short v) { return CudaDataType::kHALF; } 26 | //static CudaDataType selectTypes(unsigned char v) { return CudaDataType::kINT8; } 27 | //static CudaDataType selectTypes(int8_t v) { return CudaDataType::kINT8; } 28 | //static CudaDataType selectTypes(bool v) { return CudaDataType::kBOOL; } 29 | //static CudaDataType selectTypes(void* v) { return CudaDataType::kPOINTER; } 30 | 31 | template 32 | static CudaDataType selectTypes() { 33 | if constexpr (std::is_pointer_v) return CudaDataType::kPOINTER; 34 | else if constexpr (std::is_same_v) return CudaDataType::kINT32; 35 | else if constexpr (std::is_same_v) return CudaDataType::kINT32; 36 | else if constexpr (std::is_same_v) return CudaDataType::kFLOAT; 37 | else if constexpr (std::is_same_v) return CudaDataType::kHALF; 38 | else if constexpr (std::is_same_v) return CudaDataType::kINT8; 39 | else if constexpr (std::is_same_v) return CudaDataType::kINT8; 40 | else if constexpr (std::is_same_v) return CudaDataType::kBOOL; 41 | else return CudaDataType::kSTRUCT; 42 | } 43 | 44 | template 45 | static size_t getTypeSize() { 46 | if constexpr (std::is_pointer_v) return 8; 47 | else if constexpr (std::is_same_v) return 4; 48 | else if constexpr (std::is_same_v) return 4; 49 | else if constexpr (std::is_same_v) return 4; 50 | else if constexpr (std::is_same_v) return 2; 51 | else if constexpr (std::is_same_v) return 1; 52 | else if constexpr (std::is_same_v) return 1; 53 | else if constexpr (std::is_same_v) return 1; 54 | else return sizeof(T); 55 | } 56 | }; 57 | 58 | template 59 | class ManagedBuffer 60 | { 61 | public: 62 | typedef std::shared_ptr> SharedPtr; 63 | typedef ManagedBuffer* Ptr; 64 | 65 | ManagedBuffer(size_t in_size, bool in_useCPUBuf = false, T* in_cpuBuffer = nullptr, bool in_cpuBufferOwnership = false) 66 | : size(in_size) 67 | , gpuBuffer(in_size, TypeSelceter::selectTypes(), TypeSelceter::getTypeSize()) 68 | , cpuBuffer(in_useCPUBuf ? in_size : 0, TypeSelceter::selectTypes(), TypeSelceter::getTypeSize(), 69 | in_cpuBuffer, (in_useCPUBuf && in_cpuBuffer != nullptr)? 
in_cpuBufferOwnership : false ) 70 | { 71 | if (in_cpuBuffer != nullptr) 72 | { 73 | // std::cout << "Registering address: " << in_cpuBuffer << std::endl; 74 | //CUDA_CHECK_RET(cudaHostRegister(in_cpuBuffer, cpuBuffer.nbBytes(), cudaHostRegisterDefault)); 75 | cudaHostRegister(in_cpuBuffer, cpuBuffer.nbBytes(), cudaHostRegisterDefault); 76 | 77 | } 78 | }; 79 | 80 | void enableCPU() { 81 | cpuBuffer.resize(getSize()); 82 | } 83 | 84 | T* getGPUBuffer() { 85 | return (T*)gpuBuffer.data(); 86 | } 87 | T* getCPUBuffer() { 88 | return (T*)cpuBuffer.data(); 89 | } 90 | 91 | // when stream = 0, sync option won't work 92 | inline void toCPU(bool sync = true, cudaStream_t stream = 0); 93 | inline void toGPU(bool sync = true, cudaStream_t stream = 0); 94 | 95 | // only copy the first numElements elements 96 | inline void toCPU(size_t numElements, bool sync = true, cudaStream_t stream = 0); 97 | inline void toGPU(size_t numElements, bool sync = true, cudaStream_t stream = 0); 98 | 99 | inline void copyToExternalCPUBuffer(void* pExternalCPUBuffer); 100 | 101 | // return the number of elements, not the memory size messured by bytes 102 | size_t getSize() { 103 | return size; 104 | } 105 | 106 | size_t nBytes() { 107 | return gpuBuffer.nbBytes(); 108 | } 109 | private: 110 | 111 | size_t size = 0; 112 | 113 | DeviceBuffer gpuBuffer; 114 | HostBuffer cpuBuffer; 115 | }; 116 | 117 | template 118 | inline void ManagedBuffer::toCPU(bool sync, cudaStream_t stream) 119 | { 120 | if (getCPUBuffer() == nullptr) 121 | { 122 | enableCPU(); 123 | } 124 | 125 | if (sync) 126 | { 127 | CUDA_CHECK_RET(cudaMemcpy( 128 | cpuBuffer.data(), gpuBuffer.data(), gpuBuffer.nbBytes(), cudaMemcpyDeviceToHost)); 129 | } 130 | else 131 | { 132 | CUDA_CHECK_RET(cudaMemcpyAsync( 133 | cpuBuffer.data(), gpuBuffer.data(), gpuBuffer.nbBytes(), cudaMemcpyDeviceToHost, stream)); 134 | } 135 | } 136 | 137 | template 138 | inline void ManagedBuffer::toCPU(size_t numElements, bool sync, cudaStream_t stream) 139 | { 140 | if (getCPUBuffer() == nullptr) 141 | { 142 | enableCPU(); 143 | } 144 | 145 | if (sync) 146 | { 147 | CUDA_CHECK_RET(cudaMemcpy( 148 | cpuBuffer.data(), gpuBuffer.data(), gpuBuffer.nbBytes(numElements), cudaMemcpyDeviceToHost)); 149 | } 150 | else 151 | { 152 | CUDA_CHECK_RET(cudaMemcpyAsync( 153 | cpuBuffer.data(), gpuBuffer.data(), gpuBuffer.nbBytes(numElements), cudaMemcpyDeviceToHost, stream)); 154 | } 155 | } 156 | 157 | template 158 | inline void ManagedBuffer::copyToExternalCPUBuffer(void* pExternalCPUBuffer) 159 | { 160 | CUDA_CHECK_RET(cudaMemcpy( 161 | pExternalCPUBuffer, gpuBuffer.data(), gpuBuffer.nbBytes(), cudaMemcpyDeviceToHost)); 162 | } 163 | 164 | template 165 | inline void ManagedBuffer::toGPU(bool sync, cudaStream_t stream) 166 | { 167 | if (sync) 168 | { 169 | CUDA_CHECK_RET(cudaMemcpy( 170 | gpuBuffer.data(), cpuBuffer.data(), cpuBuffer.nbBytes(), cudaMemcpyHostToDevice)); 171 | } 172 | else 173 | { 174 | CUDA_CHECK_RET(cudaMemcpyAsync( 175 | gpuBuffer.data(), cpuBuffer.data(), cpuBuffer.nbBytes(), cudaMemcpyHostToDevice, stream)); 176 | } 177 | 178 | } 179 | 180 | template 181 | inline void ManagedBuffer::toGPU(size_t numElements, bool sync, cudaStream_t stream) 182 | { 183 | if (sync) 184 | { 185 | CUDA_CHECK_RET(cudaMemcpy( 186 | gpuBuffer.data(), cpuBuffer.data(), cpuBuffer.nbBytes(numElements), cudaMemcpyHostToDevice)); 187 | } 188 | else 189 | { 190 | CUDA_CHECK_RET(cudaMemcpyAsync( 191 | gpuBuffer.data(), cpuBuffer.data(), cpuBuffer.nbBytes(numElements), cudaMemcpyHostToDevice, stream)); 192 | 
} 193 | } -------------------------------------------------------------------------------- /CuMatrix/Geometry/Geometry.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cuda_runtime.h" 3 | #include "../MatrixOps/CuMatrix.h" 4 | #include "../MatrixOps/CuMatrixDefs.h" 5 | #include "device_launch_parameters.h" 6 | 7 | namespace CuMatrix 8 | { 9 | template 10 | GPU_CPU_INLINE_FUNC void faceOrientedArea(const DType* a, const DType* b, const DType* c, DType* out) 11 | { 12 | DType AB[3]; 13 | vec3Minus(b, a, AB); 14 | DType AC[3]; 15 | vec3Minus(c, a, AC); 16 | 17 | vec3CrossProduct(AB, AC, out); 18 | } 19 | 20 | template 21 | GPU_CPU_INLINE_FUNC void faceNormal(const DType* a, const DType* b, const DType* c, DType* out) 22 | { 23 | faceOrientedArea(a, b, c, out); 24 | vec3Normalize(out); 25 | } 26 | 27 | template 28 | GPU_CPU_INLINE_FUNC void faceNormal(const DType* allVertsArray, const int32_t* faceVIds, DType* out) 29 | { 30 | const DType* a = allVertsArray + PointVecStride * faceVIds[0]; 31 | const DType* b = allVertsArray + PointVecStride * faceVIds[1]; 32 | const DType* c = allVertsArray + PointVecStride * faceVIds[2]; 33 | 34 | faceOrientedNormal(a, b, c, out); 35 | } 36 | 37 | template 38 | GPU_CPU_INLINE_FUNC DType tetOrientedVolume(const DType* allVertsArray, const int32_t* tetVIds) { 39 | const DType* tvs[4] = { 40 | allVertsArray + PointVecStride * tetVIds[0], 41 | allVertsArray + PointVecStride * tetVIds[1], 42 | allVertsArray + PointVecStride * tetVIds[2], 43 | allVertsArray + PointVecStride * tetVIds[3], 44 | }; 45 | 46 | DType AB[3]; 47 | vec3Minus(tvs[1], tvs[0], AB); 48 | DType AC[3]; 49 | vec3Minus(tvs[2], tvs[0], AC); 50 | DType AD[3]; 51 | vec3Minus(tvs[3], tvs[0], AD); 52 | 53 | DType tetOrientedVol = vec3TripleProduct(AB, AC, AD); 54 | 55 | return tetOrientedVol; 56 | } 57 | 58 | template 59 | GPU_CPU_INLINE_FUNC DType tetOrientedVolume(const DType* v1, const DType* v2, const DType* v3, const DType* v4) { 60 | 61 | DType AB[3]; 62 | vec3Minus(v2, v1, AB); 63 | DType AC[3]; 64 | vec3Minus(v3, v1, AC); 65 | DType AD[3]; 66 | vec3Minus(v4, v1, AD); 67 | 68 | DType tetOrientedVol = vec3TripleProduct(AB, AC, AD); 69 | 70 | return tetOrientedVol; 71 | } 72 | 73 | template 74 | GPU_CPU_INLINE_FUNC void tetCentroid(DType* p, const DType* allVertsArray, const int32_t* tetVIds) { 75 | vec3Set(p, DType(0.f)); 76 | 77 | const DType* tvs[4] = { 78 | allVertsArray + PointVecStride * tetVIds[0], 79 | allVertsArray + PointVecStride * tetVIds[1], 80 | allVertsArray + PointVecStride * tetVIds[2], 81 | allVertsArray + PointVecStride * tetVIds[3], 82 | }; 83 | vec3Add(p, tvs[0], p); 84 | vec3Add(p, tvs[1], p); 85 | vec3Add(p, tvs[2], p); 86 | vec3Add(p, tvs[3], p); 87 | 88 | vec3Mul(p, 0.25f, p); 89 | 90 | } 91 | 92 | template 93 | GPU_CPU_INLINE_FUNC bool tetPointInTet(const DType* p, const DType* allVertsArray, const int32_t* tetVIds) { 94 | const DType* tvs[4] = { 95 | allVertsArray + PointVecStride * tetVIds[0], 96 | allVertsArray + PointVecStride * tetVIds[1], 97 | allVertsArray + PointVecStride * tetVIds[2], 98 | allVertsArray + PointVecStride * tetVIds[3], 99 | }; 100 | 101 | DType AB[3]; 102 | vec3Minus(tvs[1], tvs[0], AB); 103 | DType AC[3]; 104 | vec3Minus(tvs[2], tvs[0], AC); 105 | DType AD[3]; 106 | vec3Minus(tvs[3], tvs[0], AD); 107 | 108 | DType tetOrientedVol = vec3TripleProduct(AB, AC, AD); 109 | 110 | const int32_t order[4][3] = { { 1, 2, 3 },{ 2, 0, 3 },{ 0, 1, 3 },{ 1, 0, 2 } }; 111 | 112 | for (int32_t i = 0; 
i < 4; ++i) { 113 | 114 | DType v1[3]; // = vs4[order[i][1]] - vs4[order[i][0]]; // HalfEdgeVec(pHE1); 115 | vec3Minus(tvs[order[i][1]], tvs[order[i][0]], v1); 116 | 117 | DType v2[3]; // = vs4[order[i][2]] - vs4[order[i][1]]; // HalfEdgeVec(pHE2); 118 | vec3Minus(tvs[order[i][2]], tvs[order[i][1]], v2); 119 | 120 | DType vp[3]; 121 | vec3Minus(p, tvs[order[i][0]], vp); 122 | 123 | if (vec3TripleProduct(vp, v1, v2) * tetOrientedVol >= 0) 124 | { 125 | return false; 126 | } 127 | } 128 | 129 | return true; 130 | } 131 | 132 | template 133 | GPU_CPU_INLINE_FUNC bool tetPointBarycentricsInTet(const DType* p, const DType* allVertsArray, const int32_t* tetVIds, DType* barycentrics) { 134 | const DType* tvs[4] = { 135 | allVertsArray + PointVecStride * tetVIds[0], 136 | allVertsArray + PointVecStride * tetVIds[1], 137 | allVertsArray + PointVecStride * tetVIds[2], 138 | allVertsArray + PointVecStride * tetVIds[3], 139 | }; 140 | 141 | DType AB[3]; 142 | vec3Minus(tvs[1], tvs[0], AB); 143 | DType AC[3]; 144 | vec3Minus(tvs[2], tvs[0], AC); 145 | DType AD[3]; 146 | vec3Minus(tvs[3], tvs[0], AD); 147 | 148 | DType tetOrientedVol = vec3TripleProduct(AB, AC, AD); 149 | 150 | const int32_t order[4][3] = { { 1, 2, 3 },{ 2, 0, 3 },{ 0, 1, 3 },{ 1, 0, 2 } }; 151 | 152 | for (int32_t i = 0; i < 3; ++i) { 153 | 154 | DType v1[3]; // = vs4[order[i][1]] - vs4[order[i][0]]; // HalfEdgeVec(pHE1); 155 | vec3Minus(tvs[order[i][1]], tvs[order[i][0]], v1); 156 | 157 | DType v2[3]; // = vs4[order[i][2]] - vs4[order[i][1]]; // HalfEdgeVec(pHE2); 158 | vec3Minus(tvs[order[i][2]], tvs[order[i][1]], v2); 159 | 160 | DType vp[3]; 161 | vec3Minus(p, tvs[order[i][0]], vp); 162 | 163 | DType subTetOrientedVol = (vec3TripleProduct(vp, v1, v2)); 164 | 165 | barycentrics[i] = - subTetOrientedVol / tetOrientedVol; 166 | } 167 | barycentrics[3] = 1.f - barycentrics[0] - barycentrics[1] - barycentrics[2]; 168 | return true; 169 | } 170 | 171 | template 172 | GPU_CPU_INLINE_FUNC void triangleOrientedArea(DType* allVertsArray, int32_t v1, int32_t v2, int32_t v3, DType* orientedArea) { 173 | DType vec1[3]; // = vs4[order[i][1]] - vs4[order[i][0]]; // HalfEdgeVec(pHE1); 174 | vec3Minus(allVertsArray + PointVecStride * v2, allVertsArray + PointVecStride * v1, vec1); 175 | 176 | DType vec2[3]; // = vs4[order[i][2]] - vs4[order[i][1]]; // HalfEdgeVec(pHE2); 177 | vec3Minus(allVertsArray + PointVecStride * v3, allVertsArray + PointVecStride * v2, vec2); 178 | vec3CrossProduct(vec1, vec2, orientedArea); 179 | } 180 | 181 | // from Building an Orthonormal Basis, Revisited 182 | template 183 | void buildOrthonormalBasis(const DType* n, DType* b1, DType* b2) 184 | { 185 | if (n[2] < 0.) 
{ 186 | const DType a = 1.0f / (1.0f - n[2]); 187 | const DType b = n[0] * n[1] * a; 188 | b1[0] = 1.0f - n[0] * n[0] * a; 189 | b1[1] = -b; 190 | b1[2] = n[0]; 191 | 192 | b2[0] = b; 193 | b2[1] = n[1] * n[1] * a - 1.0f; 194 | b2[2] = -n[1]; 195 | } 196 | else { 197 | const DType a = 1.0f / (1.0f + n[2]); 198 | const DType b = -n[0] * n[1] * a; 199 | b1[0] = 1.0f - n[0] * n[0] * a; 200 | b1[1] = b; 201 | b1[2] = -n[0]; 202 | 203 | b2[0] = b; 204 | b2[1] = 1.0f - n[1] * n[1] * a; 205 | b2[2] = -n[1]; 206 | } 207 | } 208 | 209 | 210 | }; -------------------------------------------------------------------------------- /CuMatrix/Interface/EigenInterface.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | //#define EIGEN_NO_CUDA 3 | #include 4 | 5 | -------------------------------------------------------------------------------- /CuMatrix/MatrixOps/CuMatrix.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cuda_runtime.h" 3 | #include "CuMatrixDefs.h" 4 | #include "device_launch_parameters.h" 5 | #define SQR(x) ((x) * (x)) 6 | #define CUBE(x) ((x) * (x) * (x)) 7 | 8 | #ifndef __CUDACC__ 9 | #include 10 | #else 11 | #include 12 | #endif 13 | 14 | namespace CuMatrix 15 | { 16 | /* 17 | Assuming all the matrix to be column major; 18 | */ 19 | 20 | template 21 | GPU_CPU_INLINE_FUNC DType* vecPtr(DType* buffer, int vecPos, int stride) { 22 | return buffer + vecPos * stride; 23 | } 24 | 25 | template 26 | GPU_CPU_INLINE_FUNC void vec3Set(DType* v, const DType val) { 27 | v[0] = val; 28 | v[1] = val; 29 | v[2] = val; 30 | } 31 | 32 | template 33 | GPU_CPU_INLINE_FUNC void vec3Set(DType* v, const DType val1, const DType val2, const DType val3) { 34 | v[0] = val1; 35 | v[1] = val2; 36 | v[2] = val3; 37 | } 38 | 39 | template 40 | GPU_CPU_INLINE_FUNC void vec3Set(DType* out, const DType* in) { 41 | out[0] = in[0]; 42 | out[1] = in[1]; 43 | out[2] = in[2]; 44 | } 45 | 46 | template 47 | GPU_CPU_INLINE_FUNC void vec3Add(const DType* v1, const DType* v2, DType* result) { 48 | result[0] = v1[0] + v2[0]; 49 | result[1] = v1[1] + v2[1]; 50 | result[2] = v1[2] + v2[2]; 51 | } 52 | 53 | template 54 | GPU_CPU_INLINE_FUNC void vec3Minus(const DType* v1, const DType* v2, DType* result) { 55 | result[0] = v1[0] - v2[0]; 56 | result[1] = v1[1] - v2[1]; 57 | result[2] = v1[2] - v2[2]; 58 | } 59 | 60 | 61 | template 62 | GPU_CPU_INLINE_FUNC void vec3Mul(const DType* v1, const DType a, DType* result) { 63 | result[0] = v1[0] * a; 64 | result[1] = v1[1] * a; 65 | result[2] = v1[2] * a; 66 | } 67 | 68 | template 69 | GPU_CPU_INLINE_FUNC void vec3MulAddTo(const DType* v1, const DType a, DType* result) { 70 | result[0] += v1[0] * a; 71 | result[1] += v1[1] * a; 72 | result[2] += v1[2] * a; 73 | } 74 | 75 | // result = a + l * v 76 | template 77 | GPU_CPU_INLINE_FUNC void vec3lerp(const DType* a, const DType l, const DType* v, DType* result) { 78 | result[0] = a[0] + l * v[0]; 79 | result[1] = a[1] + l * v[1]; 80 | result[2] = a[2] + l * v[2]; 81 | } 82 | 83 | template 84 | GPU_CPU_INLINE_FUNC DType vec3DotProduct(const DType* v1, const DType* v2) { 85 | return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; 86 | } 87 | 88 | template 89 | GPU_CPU_INLINE_FUNC void vec3CrossProduct(const DType* v1, const DType* v2, DType* result) { 90 | result[0] = v1[1] * v2[2] - v1[2] * v2[1]; 91 | result[1] = v1[2] * v2[0] - v1[0] * v2[2]; 92 | result[2] = v1[0] * v2[1] - v1[1] * v2[0]; 93 | 94 | } 95 | 96 | template 97 | 
GPU_CPU_INLINE_FUNC DType vec3Norm(const DType* v) { 98 | return sqrtf(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); 99 | } 100 | 101 | 102 | template 103 | GPU_CPU_INLINE_FUNC DType vec3NormSquare(const DType* v) { 104 | return (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); 105 | } 106 | 107 | template 108 | GPU_CPU_INLINE_FUNC void vec3Normalize(DType* v) { 109 | const DType norm = vec3Norm(v); 110 | 111 | v[0] /= norm; 112 | v[1] /= norm; 113 | v[2] /= norm; 114 | } 115 | 116 | // aka mixed product 117 | template 118 | GPU_CPU_INLINE_FUNC DType vec3TripleProduct(const DType* v1, const DType* v2, const DType* v3) { 119 | DType crossProduct[3]; 120 | // AB* (AC ^ AD); 121 | vec3CrossProduct(v2, v3, crossProduct); 122 | 123 | return vec3DotProduct(v1, crossProduct); 124 | } 125 | 126 | 127 | template 128 | GPU_CPU_INLINE_FUNC void vec3OuterProduct(const DType* v1, const DType* v2, DType * mat) { 129 | for (int iCol = 0; iCol < 3; iCol++) 130 | { 131 | for (int iRow = 0; iRow < 3; iRow++) { 132 | mat[iRow + 3 * iCol] = v1[iRow] * v2[iCol]; 133 | } 134 | } 135 | } 136 | 137 | /* 138 | * Vec2 139 | */ 140 | template 141 | GPU_CPU_INLINE_FUNC DType vec2CrossProduct(const DType* v1, const DType* v2) { 142 | return v1[0] * v2[1] - v1[1] * v2[0]; 143 | 144 | } 145 | 146 | template 147 | GPU_CPU_INLINE_FUNC DType vec2Norm(const DType* v) { 148 | return sqrtf(v[0] * v[0] + v[1] * v[1]);; 149 | 150 | } 151 | 152 | template 153 | GPU_CPU_INLINE_FUNC DType mat3IJ(const DType* m, const int32_t row, const int32_t col) { 154 | return m[(3 * col) + row]; 155 | } 156 | 157 | template 158 | GPU_CPU_INLINE_FUNC void mat3VecProduct(const DType* m, const DType* v, DType* result) { 159 | result[0] = m[0] * v[0]; 160 | result[1] = m[1] * v[0]; 161 | result[2] = m[2] * v[0]; 162 | 163 | result[0] += m[3] * v[1]; 164 | result[1] += m[4] * v[1]; 165 | result[2] += m[5] * v[1]; 166 | 167 | result[0] += m[6] * v[2]; 168 | result[1] += m[7] * v[2]; 169 | result[2] += m[8] * v[2]; 170 | 171 | } 172 | 173 | template 174 | GPU_CPU_INLINE_FUNC void mat3MatProduct(const DType* inA, const DType* inB, DType* outC) { 175 | mat3VecProduct(inA, inB, outC); 176 | mat3VecProduct(inA, inB+3, outC+3); 177 | mat3VecProduct(inA, inB+6, outC+6); 178 | } 179 | 180 | template 181 | GPU_CPU_INLINE_FUNC DType mat3Determinant(const DType* m) { 182 | const DType a11 = m[0]; const DType a12 = m[3]; const DType a13 = m[6]; 183 | const DType a21 = m[1]; const DType a22 = m[4]; const DType a23 = m[7]; 184 | const DType a31 = m[2]; const DType a32 = m[5]; const DType a33 = m[8]; 185 | return a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 - a13 * a22 * a31 - a12 * a21 * a33 - a11 * a23 * a32; 186 | } 187 | 188 | template 189 | GPU_CPU_INLINE_FUNC DType mat3FNormSquare(const DType* m) { 190 | const DType a11 = m[0]; const DType a12 = m[3]; const DType a13 = m[6]; 191 | const DType a21 = m[1]; const DType a22 = m[4]; const DType a23 = m[7]; 192 | const DType a31 = m[2]; const DType a32 = m[5]; const DType a33 = m[8]; 193 | return a11 * a11 + a12 * a12 + a13 * a13 194 | + a21 * a21 + a22 * a22 + a23 * a23 195 | + a31 * a31 + a32 * a32 + a33 * a33; 196 | } 197 | 198 | 199 | template 200 | GPU_CPU_INLINE_FUNC bool solve3x3(const DType* m, const DType * b, DType* out) 201 | { 202 | const DType a11 = m[0]; const DType a12 = m[3]; const DType a13 = m[6]; 203 | const DType a21 = m[1]; const DType a22 = m[4]; const DType a23 = m[7]; 204 | const DType a31 = m[2]; const DType a32 = m[5]; const DType a33 = m[8]; 205 | 206 | const DType i11 = a33 * a22 - a32 * 
a23; 207 | const DType i12 = -(a33 * a12 - a32 * a13); 208 | const DType i13 = a23 * a12 - a22 * a13; 209 | 210 | const DType det = (a11 * i11 + a21 * i12 + a31 * i13); 211 | 212 | if (IS_ZERO_APPROX(det)) 213 | { 214 | return false; 215 | } 216 | 217 | const DType deti = 1.0 / det; 218 | 219 | const DType i21 = -(a33 * a21 - a31 * a23); 220 | const DType i22 = a33 * a11 - a31 * a13; 221 | const DType i23 = -(a23 * a11 - a21 * a13); 222 | 223 | const DType i31 = a32 * a21 - a31 * a22; 224 | const DType i32 = -(a32 * a11 - a31 * a12); 225 | const DType i33 = a22 * a11 - a21 * a12; 226 | 227 | out[0] = deti * (i11 * b[0] + i12 * b[1] + i13 * b[2]); 228 | out[1] = deti * (i21 * b[0] + i22 * b[1] + i23 * b[2]); 229 | out[2] = deti * (i31 * b[0] + i32 * b[1] + i33 * b[2]); 230 | 231 | return true; 232 | } 233 | 234 | template 235 | GPU_CPU_INLINE_FUNC bool solve3x3_psd_stable(const DType* m, const DType* b, DType* out) 236 | { 237 | const DType a11 = m[0]; const DType a12 = m[3]; const DType a13 = m[6]; 238 | const DType a21 = m[1]; const DType a22 = m[4]; const DType a23 = m[7]; 239 | const DType a31 = m[2]; const DType a32 = m[5]; const DType a33 = m[8]; 240 | 241 | const DType i11 = a33 * a22 - a32 * a23; 242 | const DType i12 = -(a33 * a12 - a32 * a13); 243 | const DType i13 = a23 * a12 - a22 * a13; 244 | 245 | const DType det = (a11 * i11 + a21 * i12 + a31 * i13); 246 | 247 | if (abs(det) < CMP_EPSILON * (abs(a11 * i11) + abs(a21 * i12) + abs(a31 * i13))) 248 | { 249 | out[0] = b[0]; 250 | out[1] = b[1]; 251 | out[2] = b[2]; 252 | return false; 253 | } 254 | 255 | const DType deti = 1.0 / det; 256 | 257 | const DType i21 = -(a33 * a21 - a31 * a23); 258 | const DType i22 = a33 * a11 - a31 * a13; 259 | const DType i23 = -(a23 * a11 - a21 * a13); 260 | 261 | const DType i31 = a32 * a21 - a31 * a22; 262 | const DType i32 = -(a32 * a11 - a31 * a12); 263 | const DType i33 = a22 * a11 - a21 * a12; 264 | 265 | out[0] = deti * (i11 * b[0] + i12 * b[1] + i13 * b[2]); 266 | out[1] = deti * (i21 * b[0] + i22 * b[1] + i23 * b[2]); 267 | out[2] = deti * (i31 * b[0] + i32 * b[1] + i33 * b[2]); 268 | 269 | //DType inv[9]; 270 | //// ( 4 8 - 5 7 5 6 - 3 8 3 7 - 4 6 ) 271 | //// ( 2 7 - 1 8 0 8 - 2 6 1 6 - 0 7 ) / det 272 | //// ( 1 5 - 2 4 2 3 - 0 5 0 4 - 1 3 ) 273 | 274 | //inv[0] = (m[4] * m[8] - m[5] * m[7]); 275 | //inv[3] = (m[5] * m[6] - m[3] * m[8]); 276 | //inv[6] = (m[3] * m[7] - m[4] * m[6]); 277 | 278 | //DType det = m[0] * inv[0] + m[1] * inv[3] + m[2] * inv[6]; 279 | //if (det < CMP_EPSILON * (abs(inv[0]) + abs(inv[3]) + abs(inv[6]))) 280 | //{ 281 | // out[0] = b[0]; 282 | // out[1] = b[1]; 283 | // out[2] = b[2]; 284 | // return false; 285 | //} 286 | 287 | //inv[1] = (m[2] * m[7] - m[1] * m[8]); 288 | //inv[2] = (m[1] * m[5] - m[2] * m[4]); 289 | // 290 | //inv[4] = (m[0] * m[8] - m[2] * m[6]); 291 | //inv[5] = (m[2] * m[3] - m[0] * m[5]); 292 | 293 | //inv[7] = (m[1] * m[6] - m[0] * m[7]); 294 | //inv[8] = (m[0] * m[4] - m[1] * m[3]); 295 | 296 | //CuMatrix:mat3VecProduct(inv, b, out); 297 | //CuMatrix::vec3Mul(out, 1.f / det, out); 298 | 299 | 300 | 301 | return true; 302 | } 303 | 304 | template 305 | struct Mat9x9Abstract { 306 | // column major 307 | DType* data; 308 | 309 | GPU_CPU_INLINE_FUNC Mat9x9Abstract(DType* data_in) : data(data_in) {}; 310 | 311 | GPU_CPU_INLINE_FUNC DType* col(int iCol) { return data + iCol * 9; } 312 | GPU_CPU_INLINE_FUNC DType& operator() (int iRow, int iCol) { return data[iCol * 9 + iRow]; } 313 | GPU_CPU_INLINE_FUNC const DType& operator() (int iRow, int 
iCol) const { return data[iCol * 9 + iRow]; } 314 | 315 | GPU_CPU_INLINE_FUNC void multiplyBy(const DType mul) { 316 | for (size_t iCol = 0; iCol < 9; iCol++) 317 | { 318 | for (size_t iRow = 0; iRow < 9; iRow++) { 319 | data[iCol * 9 + iRow] *= mul; 320 | } 321 | } 322 | } 323 | }; 324 | 325 | 326 | template 327 | struct Mat9x9Static 328 | { 329 | DType data[81]; 330 | 331 | GPU_CPU_INLINE_FUNC DType* col(int iCol) { return data + iCol * 9; } 332 | GPU_CPU_INLINE_FUNC DType& operator() (int iRow, int iCol) { return data[iCol * 9 + iRow]; } 333 | GPU_CPU_INLINE_FUNC const DType& operator() (int iRow, int iCol) const { return data[iCol * 9 + iRow]; } 334 | 335 | GPU_CPU_INLINE_FUNC void multiplyBy(const DType mul) { 336 | for (size_t iCol = 0; iCol < 9; iCol++) 337 | { 338 | for (size_t iRow = 0; iRow < 9; iRow++) { 339 | data[iCol * 9 + iRow] *= mul; 340 | } 341 | } 342 | } 343 | 344 | }; 345 | 346 | template class MatType> 347 | GPU_CPU_INLINE_FUNC void vec9OuterProduct(const DType* v1, const DType* v2, MatType& mat) { 348 | for (int iCol = 0; iCol < 9; iCol++) 349 | { 350 | for (int iRow = 0; iRow < 9; iRow++) { 351 | mat(iRow, iCol) = v1[iRow] * v2[iCol]; 352 | } 353 | } 354 | } 355 | 356 | template 357 | GPU_CPU_INLINE_FUNC void vec9Mul(const DType* v1, const DType a, DType* result) { 358 | result[0] = v1[0] * a; 359 | result[1] = v1[1] * a; 360 | result[2] = v1[2] * a; 361 | result[3] = v1[3] * a; 362 | result[4] = v1[4] * a; 363 | result[5] = v1[5] * a; 364 | result[6] = v1[6] * a; 365 | result[7] = v1[7] * a; 366 | result[8] = v1[8] * a; 367 | } 368 | 369 | template 370 | GPU_CPU_INLINE_FUNC void vec9Add(const DType* v1, const DType* v2, DType* result) { 371 | result[0] = v1[0] + v2[0]; 372 | result[1] = v1[1] + v2[1]; 373 | result[2] = v1[2] + v2[2]; 374 | result[3] = v1[3] + v2[3]; 375 | result[4] = v1[4] + v2[4]; 376 | result[5] = v1[5] + v2[5]; 377 | result[6] = v1[6] + v2[6]; 378 | result[7] = v1[7] + v2[7]; 379 | result[8] = v1[8] + v2[8]; 380 | } 381 | 382 | template 383 | GPU_CPU_INLINE_FUNC void vec9MulAddTo(const DType* v1, const DType a, DType* result) { 384 | result[0] += v1[0] * a; 385 | result[1] += v1[1] * a; 386 | result[2] += v1[2] * a; 387 | 388 | result[3] += v1[3] * a; 389 | result[4] += v1[4] * a; 390 | result[5] += v1[5] * a; 391 | 392 | result[6] += v1[6] * a; 393 | result[7] += v1[7] * a; 394 | result[8] += v1[8] * a; 395 | } 396 | 397 | // m1 : r x c, column major 398 | template 399 | GPU_CPU_INLINE_FUNC DType& accessMatElement(DType* m, int i, int j) 400 | { 401 | assert(i < r && j < c); 402 | return m[r * j + i]; 403 | } 404 | 405 | // m1 : r x c, column major 406 | template 407 | GPU_CPU_INLINE_FUNC const DType& accessMatElement(const DType* m, int i, int j) 408 | { 409 | assert(i < r && j < c); 410 | return m[r * j + i]; 411 | } 412 | 413 | // m1 : r1 x c1r2, m2: c1r2 x c2, result: r1 x c2 414 | // all are column major 415 | template 416 | GPU_CPU_INLINE_FUNC void matMulMxN(const DType* m1, const DType* m2, DType* result) 417 | { 418 | for (int r = 0; r < r1; r++) 419 | { 420 | for (int c = 0; c < c2; c++) { 421 | accessMatElement(result, r, c) = 0.f; 422 | for (int i = 0; i < c1r2; i++) { 423 | accessMatElement(result, r, c) += 424 | accessMatElement(m1, r, i) 425 | * accessMatElement(m2, i, c); 426 | } 427 | } 428 | } 429 | } 430 | 431 | }; 432 | 433 | 434 | 435 | template 436 | __global__ void parallel_for_3x3_matOps(DType* matsFlatten, int numMats, Func func) { 437 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; 438 | i < numMats; 439 | i += 
blockDim.x * gridDim.x) 440 | { 441 | func(matsFlatten + 9 * i, i); 442 | } 443 | } 444 | 445 | 446 | // multiplying 2 mat with abitary dimensions 447 | template 448 | __global__ void multiplicateMatrixOnDevice(DType* array_A, DType* array_B, DType* array_C, int M_p, int K_p, int N_p) 449 | { 450 | int ix = threadIdx.x + blockDim.x * blockIdx.x;//row number 451 | int iy = threadIdx.y + blockDim.y * blockIdx.y;//col number 452 | 453 | if (ix < N_p && iy < M_p) 454 | { 455 | DType sum = 0; 456 | for (int k = 0; k < K_p; k++) 457 | { 458 | sum += array_A[iy * K_p + k] * array_B[k * N_p + ix]; 459 | } 460 | array_C[iy * N_p + ix] = sum; 461 | } 462 | } 463 | 464 | 465 | -------------------------------------------------------------------------------- /CuMatrix/MatrixOps/CuMatrixDefs.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cuda_runtime.h" 3 | #include 4 | 5 | #define GPU_CPU_INLINE_FUNC __forceinline__ __device__ __host__ 6 | #define GPU_CPU_FUNC_NO_INLINE __device__ __host__ 7 | 8 | #define SQR(x) ((x)*(x)) 9 | #define CUBE(x) ((x)*(x)*(x)) 10 | 11 | #ifdef __CUDACC__ 12 | #define HOST_FUNC __host__ 13 | #define DEVICE_FUNC __device__ 14 | #define HOST_INLINE_FUNC __host__ __forceinline__ 15 | #define DEVICE_INLINE_FUNC __device__ __forceinline__ 16 | #define HOST_DEVICE_FUNC __host__ __device__ 17 | #define HOST_DEVICE_INLINE_FUNC __host__ __device__ __forceinline__ 18 | #else 19 | #define HOST_FUNC 20 | #define DEVICE_FUNC 21 | #define HOST_INLINE_FUNC 22 | #define DEVICE_INLINE_FUNC 23 | #define HOST_DEVICE_FUNC 24 | #define HOST_DEVICE_INLINE_FUNC 25 | #endif 26 | 27 | 28 | // to avoid the triple bracket when calling global function 29 | // nvcc does not seem to like variadic macros, so we have to define 30 | // one for each kernel parameter list: 31 | #ifdef __CUDACC__ 32 | #define KERNEL_ARGS2(grid, block) <<< grid, block >>> 33 | #define KERNEL_ARGS3(grid, block, sh_mem) <<< grid, block, sh_mem >>> 34 | #define KERNEL_ARGS4(grid, block, sh_mem, stream) <<< grid, block, sh_mem, stream >>> 35 | #else 36 | #define KERNEL_ARGS2(grid, block) 37 | #define KERNEL_ARGS3(grid, block, sh_mem) 38 | #define KERNEL_ARGS4(grid, block, sh_mem, stream) 39 | #endif 40 | 41 | #define CUDA_CHECK_RET(status) \ 42 | do \ 43 | { \ 44 | auto ret = (status); \ 45 | if (ret != 0) \ 46 | { \ 47 | std::cerr << "Cuda failure: " << ret << " - " << cudaGetErrorString(ret) << std::endl; \ 48 | abort(); \ 49 | } \ 50 | } while (0) 51 | 52 | #define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__) 53 | template 54 | void check(T err, const char* const func, const char* const file, 55 | const int line) 56 | { 57 | if (err != cudaSuccess) 58 | { 59 | std::cerr << "CUDA Runtime Error at: " << file << ":" << line 60 | << std::endl; 61 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 62 | // We don't exit when we encounter CUDA errors in this example. 63 | // std::exit(EXIT_FAILURE); 64 | } 65 | } 66 | 67 | #define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__) 68 | inline void checkLast(const char* const file, const int line) 69 | { 70 | cudaError_t err{ cudaGetLastError() }; 71 | if (err != cudaSuccess) 72 | { 73 | std::cerr << "CUDA Runtime Error at: " << file << ":" << line 74 | << std::endl; 75 | std::cerr << cudaGetErrorString(err) << std::endl; 76 | // We don't exit when we encounter CUDA errors in this example. 
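// (A typical call-site sketch for these checks; `someKernel`, `grid`, `block`, and `args` are placeholders,
//  not names defined by this library:
//      someKernel KERNEL_ARGS2(grid, block) (args);
//      CHECK_LAST_CUDA_ERROR();                        // surfaces launch/configuration failures
//      CHECK_CUDA_ERROR(cudaDeviceSynchronize());      // surfaces asynchronous execution errors
//  )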
77 | // std::exit(EXIT_FAILURE); 78 | } 79 | } 80 | 81 | #define CMP_EPSILON 0.00001f 82 | #define CMP_EPSILON2 (CMP_EPSILON * CMP_EPSILON) 83 | #define IS_ZERO_APPROX(x) (fabs(x) < CMP_EPSILON) -------------------------------------------------------------------------------- /CuMatrix/MatrixOps/CuMatrixVis.h: -------------------------------------------------------------------------------- 1 | #include "CuMatrix.h" 2 | 3 | namespace CuMatrix 4 | { 5 | template 6 | GPU_CPU_INLINE_FUNC void printMat3(const DType* mat) { 7 | printf("%-7f %-7f %-7f\n%-7f %-7f %-7f\n%-7f %-7f %-7f\n", 8 | mat[0], mat[3], mat[6], 9 | mat[1], mat[4], mat[7], 10 | mat[2], mat[5], mat[8]); 11 | } 12 | 13 | template 14 | GPU_CPU_INLINE_FUNC void printMat(const DType* mat, size_t rows, size_t cols) { 15 | for (int i = 0; i < rows; i++) { 16 | for (int j = 0; j < cols - 1; j++) { 17 | printf("%-7f ", mat[i * cols + j]); 18 | } 19 | printf("%-7f\n", mat[i * cols + cols - 1]); 20 | } 21 | printf("\n"); 22 | } 23 | 24 | 25 | template 26 | GPU_CPU_INLINE_FUNC void printFloatVec(const DType* vec, size_t size) { 27 | for (int i = 0; i < size; i++) { 28 | printf("%f ", vec[i]); 29 | } 30 | printf("\n"); 31 | } 32 | 33 | template 34 | GPU_CPU_INLINE_FUNC void printIntVec(const DType* vec, size_t size) { 35 | for (int i = 0; i < size; i++) { 36 | printf("%d ", vec[i]); 37 | } 38 | printf("\n"); 39 | } 40 | 41 | inline __host__ __device__ void printCharVec(const int8_t* v, size_t size) { 42 | printf("Printing int vector of size %d\n", size); 43 | printf("vector address %p\n", v); 44 | for (int i = 0; i < size; i++) { 45 | printf("%d ", int(v[i])); 46 | } 47 | printf("\n"); 48 | } 49 | } -------------------------------------------------------------------------------- /CuMatrix/MatrixOps/VectorTypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "CuMatrixDefs.h" 3 | #include 4 | 5 | typedef unsigned char uchar; 6 | typedef unsigned short ushort; 7 | 8 | template struct vector_helper { }; 9 | template<> struct vector_helper { typedef float ftype; typedef int itype; }; 10 | template<> struct vector_helper { typedef float2 ftype; typedef int2 itype; }; 11 | template<> struct vector_helper { typedef float4 ftype; typedef int4 itype; }; 12 | template<> struct vector_helper { typedef float ftype; typedef int itype; }; 13 | template<> struct vector_helper { typedef float2 ftype; typedef int2 itype; }; 14 | template<> struct vector_helper { typedef float4 ftype; typedef int4 itype; }; 15 | template<> struct vector_helper { typedef float ftype; typedef int itype; }; 16 | template<> struct vector_helper { typedef float2 ftype; typedef int2 itype; }; 17 | template<> struct vector_helper { typedef float4 ftype; typedef int4 itype; }; 18 | 19 | #define floatT typename vector_helper::ftype 20 | #define intT typename vector_helper::itype 21 | 22 | template inline __device__ V to_floatN(const T& a) { return (V)a; } 23 | template inline __device__ T from_floatN(const V& a) { return (T)a; } 24 | 25 | // arithmetic operators fo the built-in vector types 26 | #define OPERATORS2(T) \ 27 | template DEVICE_INLINE_FUNC T operator+(const T &a, const V &b) { return make_ ## T (a.x + b.x, a.y + b.y); } \ 28 | template DEVICE_INLINE_FUNC T operator-(const T &a, const V &b) { return make_ ## T (a.x - b.x, a.y - b.y); } \ 29 | template DEVICE_INLINE_FUNC T operator*(const T &a, V b) { return make_ ## T (a.x * b, a.y * b); } \ 30 | template DEVICE_INLINE_FUNC T operator/(const T &a, V b) { 
return make_ ## T (a.x / b, a.y / b); } \ 31 | template DEVICE_INLINE_FUNC T operator>>(const T &a, V b) { return make_ ## T (a.x >> b, a.y >> b); } \ 32 | template DEVICE_INLINE_FUNC T operator<<(const T &a, V b) { return make_ ## T (a.x << b, a.y << b); } \ 33 | template DEVICE_INLINE_FUNC T &operator+=(T &a, const V &b) { a.x += b.x; a.y += b.y; return a; } \ 34 | template DEVICE_INLINE_FUNC void vec_set(T &a, const V &b) { a.x = b.x; a.y = b.y; } \ 35 | template DEVICE_INLINE_FUNC void vec_set_scalar(T &a, V b) { a.x = b; a.y = b; } \ 36 | template<> DEVICE_INLINE_FUNC float2 to_floatN(const T &a) { return make_float2(a.x, a.y); } \ 37 | template<> DEVICE_INLINE_FUNC T from_floatN(const float2 &a) { return make_ ## T(a.x, a.y); } 38 | 39 | #define OPERATORS4(T) \ 40 | template DEVICE_INLINE_FUNC T operator+(const T &a, const V &b) { return make_ ## T (a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } \ 41 | template DEVICE_INLINE_FUNC T operator-(const T &a, const V &b) { return make_ ## T (a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } \ 42 | template DEVICE_INLINE_FUNC T operator*(const T &a, V b) { return make_ ## T (a.x * b, a.y * b, a.z * b, a.w * b); } \ 43 | template DEVICE_INLINE_FUNC T operator/(const T &a, V b) { return make_ ## T (a.x / b, a.y / b, a.z / b, a.w / b); } \ 44 | template DEVICE_INLINE_FUNC T operator>>(const T &a, V b) { return make_ ## T (a.x >> b, a.y >> b, a.z >> b, a.w >> b); } \ 45 | template DEVICE_INLINE_FUNC T operator<<(const T &a, V b) { return make_ ## T (a.x << b, a.y << b, a.z << b, a.w << b); } \ 46 | template DEVICE_INLINE_FUNC T &operator+=(T &a, const V &b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } \ 47 | template DEVICE_INLINE_FUNC T &operator-=(T &a, const V &b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } \ 48 | template DEVICE_INLINE_FUNC void vec_set(T &a, const V &b) { a.x = b.x; a.y = b.y; a.z = b.z; a.w = b.w; } \ 49 | template DEVICE_INLINE_FUNC void vec_set_scalar(T &a, V b) { a.x = b; a.y = b; a.z = b; a.w = b; } \ 50 | template<> DEVICE_INLINE_FUNC float4 to_floatN(const T &a) { return make_float4(a.x, a.y, a.z, a.w); } \ 51 | template<> DEVICE_INLINE_FUNC T from_floatN(const float4 &a) { return make_ ## T(a.x, a.y, a.z, a.w); } \ 52 | DEVICE_INLINE_FUNC float normSquare(const T& a) { return SQR(a.x) + SQR(a.y) + SQR(a.z) + SQR(a.w); } \ 53 | DEVICE_INLINE_FUNC float norm(const T& a) { return sqrtf(normSquare(a)); } \ 54 | DEVICE_INLINE_FUNC float normVec3Square(const T& a) { return SQR(a.x) + SQR(a.y) + SQR(a.z); } \ 55 | DEVICE_INLINE_FUNC float normVec3(const T& a) { return sqrtf(normVec3Square(a)); } \ 56 | DEVICE_INLINE_FUNC float disVec3Square(const T& a, const T &b) { return normVec3Square(a - b); } \ 57 | DEVICE_INLINE_FUNC float disVec3(const T& a, const T &b) { return sqrtf(disVec3Square(a, b)); } 58 | 59 | OPERATORS2(int2) 60 | OPERATORS2(uchar2) 61 | OPERATORS2(ushort2) 62 | OPERATORS2(float2) 63 | OPERATORS4(int4) 64 | OPERATORS4(uchar4) 65 | OPERATORS4(ushort4) 66 | OPERATORS4(float4) 67 | 68 | template DEVICE_INLINE_FUNC void vec_set(int& a, V b) { a = b; } 69 | template DEVICE_INLINE_FUNC void vec_set(float& a, V b) { a = b; } 70 | template DEVICE_INLINE_FUNC void vec_set(uchar& a, V b) { a = b; } 71 | template DEVICE_INLINE_FUNC void vec_set(ushort& a, V b) { a = b; } 72 | template DEVICE_INLINE_FUNC void vec_set_scalar(int& a, V b) { a = b; } 73 | template DEVICE_INLINE_FUNC void vec_set_scalar(float& a, V b) { a = b; } 74 | template DEVICE_INLINE_FUNC void vec_set_scalar(uchar& a, 
V b) { a = b; } 75 | template DEVICE_INLINE_FUNC void vec_set_scalar(ushort& a, V b) { a = b; } 76 | 77 | template 78 | inline __device__ T lerp_scalar(T v0, T v1, float t) { 79 | return t * v1 + (1.0f - t) * v0; 80 | } 81 | 82 | template<> 83 | inline __device__ float2 lerp_scalar(float2 v0, float2 v1, float t) { 84 | return make_float2( 85 | lerp_scalar(v0.x, v1.x, t), 86 | lerp_scalar(v0.y, v1.y, t) 87 | ); 88 | } 89 | 90 | template<> 91 | inline __device__ float4 lerp_scalar(float4 v0, float4 v1, float t) { 92 | return make_float4( 93 | lerp_scalar(v0.x, v1.x, t), 94 | lerp_scalar(v0.y, v1.y, t), 95 | lerp_scalar(v0.z, v1.z, t), 96 | lerp_scalar(v0.w, v1.w, t) 97 | ); 98 | } 99 | 100 | namespace CuMatrix { 101 | 102 | namespace detail 103 | { 104 | template struct vector_of; 105 | template<> struct vector_of { using type = float4; }; 106 | template<> struct vector_of { using type = double4; }; 107 | } 108 | template 109 | using vector_of_t = typename detail::vector_of::type; 110 | 111 | // 16 bytes aligned 3d vector, with 4th component for padding; 112 | // the 4th component is not used in the vector operations, but it can be customized for other purposes 113 | template 114 | struct Vec3a 115 | { 116 | HOST_DEVICE_INLINE_FUNC Vec3a(const Vec3a& v) { 117 | d.x = v.x(); 118 | d.y = v.y(); 119 | d.z = v.z(); 120 | } 121 | HOST_DEVICE_INLINE_FUNC Vec3a(const DType* data) { 122 | d.x = data[0]; 123 | d.y = data[1]; 124 | d.z = data[2]; 125 | } 126 | HOST_DEVICE_INLINE_FUNC Vec3a(DType x, DType y, DType z) { d.x = x; d.y = y; d.z = z; } 127 | HOST_DEVICE_INLINE_FUNC Vec3a(DType x, DType y, DType z, DType w) { d.x = x; d.y = y; d.z = z; d.w = w; } 128 | HOST_DEVICE_INLINE_FUNC Vec3a(const vector_of_t& v) { d = v; } 129 | HOST_DEVICE_INLINE_FUNC Vec3a(DType val) { d.x = val; d.y = val; d.z = val; } 130 | HOST_DEVICE_INLINE_FUNC Vec3a() {} 131 | 132 | HOST_DEVICE_INLINE_FUNC void set3(DType x, DType y, DType z) { d.x = x; d.y = y; d.z = z; } 133 | HOST_DEVICE_INLINE_FUNC void set3(const DType* data) { d.x = data[0]; d.y = data[1]; d.z = data[2]; } 134 | HOST_DEVICE_INLINE_FUNC void set3(const Vec3a& v) { d.x = v.x(); d.y = v.y(); d.z = v.z(); } 135 | 136 | HOST_DEVICE_INLINE_FUNC DType& x() { return d.x; } 137 | HOST_DEVICE_INLINE_FUNC DType& y() { return d.y; } 138 | HOST_DEVICE_INLINE_FUNC DType& z() { return d.z; } 139 | HOST_DEVICE_INLINE_FUNC DType& w() { return d.w; } 140 | 141 | HOST_DEVICE_INLINE_FUNC const DType& x() const { return d.x; } 142 | HOST_DEVICE_INLINE_FUNC const DType& y() const { return d.y; } 143 | HOST_DEVICE_INLINE_FUNC const DType& z() const { return d.z; } 144 | HOST_DEVICE_INLINE_FUNC const DType& w() const { return d.w; } 145 | 146 | HOST_DEVICE_INLINE_FUNC DType& operator[](int i) { return (&d.x)[i]; } 147 | const DType& operator[](int i) const { return (&d.x)[i]; } 148 | 149 | HOST_DEVICE_INLINE_FUNC Vec3a operator+(const Vec3a& v) const { 150 | return Vec3a(d.x + v.x(), d.y + v.y(), d.z + v.z()); 151 | } 152 | HOST_DEVICE_INLINE_FUNC Vec3a operator-(const Vec3a& v) const 153 | { 154 | return Vec3a(d.x - v.x(), d.y - v.y(), d.z - v.z()); 155 | } 156 | HOST_DEVICE_INLINE_FUNC Vec3a operator*(DType v) const 157 | { 158 | return Vec3a(d.x * v, d.y * v, d.z * v); 159 | } 160 | HOST_DEVICE_INLINE_FUNC Vec3a operator/(DType v) const { 161 | return Vec3a(d.x / v, d.y / v, d.z / v); 162 | } 163 | HOST_DEVICE_INLINE_FUNC Vec3a& operator+=(const Vec3a& v) 164 | { 165 | set3(d.x + v.x(), d.y + v.y(), d.z + v.z()); 166 | return *this; 167 | } 168 | HOST_DEVICE_INLINE_FUNC 
Vec3a& operator-=(const Vec3a& v)
169 |     {
170 |         set3(d.x - v.x(), d.y - v.y(), d.z - v.z());
171 |         return *this;
172 |     }
173 | 
174 |     HOST_DEVICE_INLINE_FUNC DType normSquare() const { return SQR(d.x) + SQR(d.y) + SQR(d.z); }
175 |     HOST_DEVICE_INLINE_FUNC DType norm() const { return sqrtf(normSquare()); }
176 |     HOST_DEVICE_INLINE_FUNC DType disSquare(const Vec3a& v) const { return (v - *this).normSquare(); }
177 |     HOST_DEVICE_INLINE_FUNC DType dis(const Vec3a& v) const { return sqrtf(disSquare(v)); }
178 | 
179 |     HOST_DEVICE_INLINE_FUNC DType dot(const Vec3a& v) const { return d.x * v.x() + d.y * v.y() + d.z * v.z(); }
180 |     HOST_DEVICE_INLINE_FUNC Vec3a cross(const Vec3a& v) const
181 |     {
182 |         return Vec3a(d.y * v.z() - d.z * v.y(), d.z * v.x() - d.x * v.z(), d.x * v.y() - d.y * v.x());
183 |     }
184 | 
185 | 
186 |     HOST_DEVICE_INLINE_FUNC void setW(DType w) { d.w = w; }
187 |     HOST_DEVICE_INLINE_FUNC DType getW() const { return d.w; }
188 | 
189 |     HOST_DEVICE_INLINE_FUNC const vector_of_t<DType>& getData() const { return d; }
190 |     HOST_DEVICE_INLINE_FUNC const DType* getDataPtr() const { return &d.x; }
191 | 
192 |     HOST_DEVICE_INLINE_FUNC void print() const {
193 |         printf("(%f, %f, %f)", d.x, d.y, d.z);
194 |     }
195 | private:
196 |     vector_of_t<DType> d;
197 | };
198 | 
199 | typedef Vec3a<float> Vec3af;
200 | typedef Vec3a<double> Vec3ad;
201 | 
202 | }
--------------------------------------------------------------------------------
/Examples/P01_UseMangeBuffer/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
2 | 
3 | project(P01_UseManagedBuffer LANGUAGES CXX CUDA)
4 | 
5 | find_package(CuMatrix REQUIRED PATHS ${CMAKE_CURRENT_LIST_DIR}/../../cmake)
6 | find_package(Eigen3 REQUIRED)
7 | 
8 | ## Use C++17
9 | set (CMAKE_CXX_STANDARD 17)
10 | set (CMAKE_CUDA_STANDARD 17)
11 | set (CMAKE_CUDA_ARCHITECTURES 52)
12 | 
13 | # SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --expt-extended-lambda; --extended-lambda; )
14 | 
15 | # message(CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS})
16 | 
17 | include_directories(
18 |     ${CU_MATRIX_INCLUDE_DIR}
19 |     ${EIGEN3_INCLUDE_DIR}
20 | )
21 | 
22 | file(GLOB SRC
23 |     "*.h"
24 |     "*.cpp"
25 |     "*.c"
26 |     "*.cu"
27 | )
28 | 
29 | add_executable(P01_UseManagedBuffer
30 |     ${SRC}
31 | )
32 | set(CMAKE_CUDA_ARCHITECTURES 52)
33 | target_link_libraries(P01_UseManagedBuffer ${CU_MATRIX_LIBS})
34 | #arget_Compile_options(P01_UseManagedBuffer PUBLIC $<$COMPILE_LANGUAGE:CUDA>:--extended-lambda)
35 | target_compile_options(P01_UseManagedBuffer PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:
36 |     --extended-lambda
37 |     --default-stream per-thread
38 | >)
39 | target_include_directories(P01_UseManagedBuffer PUBLIC
40 |     ${CU_MATRIX_INCLUDE_DIR}
41 |     ${EIGEN3_INCLUDE_DIR}
42 | )
--------------------------------------------------------------------------------
/Examples/P01_UseMangeBuffer/Timer.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <chrono>
3 | 
4 | #define TICK(name)\
5 | auto name##_t1 = std::chrono::high_resolution_clock::now();
6 | 
7 | #define TOCK(name)\
8 | auto name##_t2 = std::chrono::high_resolution_clock::now();\
9 | name = std::chrono::duration_cast<std::chrono::microseconds>(name##_t2 - name##_t1).count() / 1000.0;
10 | 
11 | 
--------------------------------------------------------------------------------
/Examples/P01_UseMangeBuffer/main.cpp:
--------------------------------------------------------------------------------
1 | #include "main.cuh"
2 | #include "Timer.h"
3 | #include <iostream>
4 | 
5 | void init3x3Mats(Eigen::VectorXf & matFlatten, size_t numMats) {
6 | 
7 |     for (size_t i = 0; i < numMats; i++)
8 |     {
9 |         Eigen::Map<Eigen::Matrix3f> mat(matFlatten.data() + i * 9);
10 |         mat = Eigen::Matrix3f::Random();
11 |     }
12 | }
13 | 
14 | int main() {
15 |     size_t numMats = 100000000;
16 |     size_t matW = 3;
17 | 
18 |     Eigen::VectorXf matFlatten(numMats * matW * matW);
19 | 
20 |     init3x3Mats(matFlatten, numMats);
21 | 
22 |     ManagedBuffer<float> matsbuf(matFlatten.size(), true, matFlatten.data());
23 |     matsbuf.toGPU();
24 | 
25 |     ManagedBuffer<float> detsbuf(numMats, true);
26 | 
27 |     const int numThreads = 512;
28 | 
29 |     double gpuTime = 0.;
30 |     TICK(gpuTime);
31 |     computeDetsGPU(matsbuf, detsbuf, numMats, numThreads);
32 |     detsbuf.toCPU();
33 |     TOCK(gpuTime);
34 | 
35 | 
36 |     double cpuTime = 0.;
37 |     TICK(cpuTime);
38 |     std::vector<float> detsCpu(numMats);
39 | 
40 |     for (size_t i = 0; i < numMats; i++)
41 |     {
42 |         Eigen::Map<Eigen::Matrix3f> mat(matFlatten.data() + i * 9);
43 |         detsCpu[i] = mat.determinant();
44 |         //std::cout << "Mat: " << mat << "\n";
45 |         /*std::cout << "GPU: " << detsbuf.getCPUBuffer()[i] << " | CPU:" << mat.determinant()
46 |             << " | GPU func computed on CPU: " << CuMatrix::mat3Determinant(mat.data()) << "\n";*/
47 |     }
48 |     TOCK(cpuTime);
49 | 
50 |     for (size_t i = 0; i < numMats; i++)
51 |     {
52 |         if (std::abs(detsCpu[i] - detsbuf.getCPUBuffer()[i]) > 1e-6)
53 |         {
54 |             std::cout << "Bug at " << i << "th mat!\n";
55 |             std::cout << "GPU: " << detsbuf.getCPUBuffer()[i] << " | CPU:" << detsCpu[i] << "\n";
56 | 
57 |             getchar();
58 |         }
59 |     }
60 |     std::cout << detsCpu.size() << "\n";
61 |     std::cout << "GPU time: " << gpuTime << " ms | CPU time: " << cpuTime << " ms | CPU time / 16: " << cpuTime / 16 << "ms\n";
62 | 
63 |     return 0;
64 | }
--------------------------------------------------------------------------------
/Examples/P01_UseMangeBuffer/main.cu:
--------------------------------------------------------------------------------
1 | #include "main.cuh"
2 | #include "CuMatrix/MatrixOps/CuMatrix.h"
3 | 
4 | void computeDetsGPU(ManagedBuffer<float>& matsbuf, ManagedBuffer<float>& detsbuf, int numMats, int numThreads){
5 |     parallel_for_3x3_matOps KERNEL_ARGS2((numMats + numThreads - 1) / numThreads, numThreads) (matsbuf.getGPUBuffer(), numMats,
6 |         [dets = detsbuf.getGPUBuffer()] __device__ (float* mat, int iMat)
7 |         {
8 |             dets[iMat] = CuMatrix::mat3Determinant(mat);
9 |         });
10 | }
11 | 
--------------------------------------------------------------------------------
/Examples/P01_UseMangeBuffer/main.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "CuMatrix/Buffers/ManagedBuffer.h"
3 | #include "CuMatrix/Interface/EigenInterface.h"
4 | 
5 | void computeDetsGPU(ManagedBuffer<float>& matsbuf, ManagedBuffer<float>& detsbuf, int numMats, int numThreads);
6 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CuMatrix
A CUDA matrix computation library specialized for small-matrix operations (3x3, 4x4, 1x3, 1x4, etc.). It provides buffer management and is compatible with Eigen.
--------------------------------------------------------------------------------
/cmake/CuMatrixConfig.cmake:
--------------------------------------------------------------------------------
include(${CMAKE_CURRENT_LIST_DIR}/FindCuda.cmake)

if (CUDA_FOUND)
    message("CUDA found successfully!\n")
    set(CUDA_FOUND ON)
else()
    message("CUDA NOT FOUND\n")
endif (CUDA_FOUND)

# Fail the configure step if no CUDA backend is installed.
if(NOT CUDA_FOUND)
    message(FATAL_ERROR
        "CUDA must be installed
        CUDA_FOUND ${CUDA_FOUND}\n")
endif()


SET (CU_MATRIX_INCLUDE_DIR
    ${CUDA_INCLUDE_DIRS}
    ${CMAKE_CURRENT_LIST_DIR}/../
)

SET (CU_MATRIX_LIBS
    ${CUDA_LIBRARIES}
)


SET (CU_MATRIX_SOURCE_CPP
    #
)

--------------------------------------------------------------------------------
/cmake/FindCuda.cmake:
--------------------------------------------------------------------------------
find_package(CUDA)

if(CUDA_FOUND)
    message("CUDA available!")
    message("CUDA Libs: ${CUDA_LIBRARIES}")
    message("CUDA Headers: ${CUDA_INCLUDE_DIRS}")

    # Append the cuBLAS and NPP component libraries to CUDA_LIBRARIES.
    list(APPEND CUDA_LIBRARIES ${CUDA_cublas_LIBRARY} ${CUDA_npps_LIBRARY} ${CUDA_nppig_LIBRARY})
    message("All together now (libs): ${CUDA_LIBRARIES}")

else()
    message("CUDA NOT Available")

endif()
--------------------------------------------------------------------------------
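
The CU_MATRIX_* variables exported by cmake/CuMatrixConfig.cmake are intended to be consumed from a dependent project's CMakeLists.txt. Below is a minimal sketch of such a consumer; it is not a copy of Examples/P01_UseMangeBuffer/CMakeLists.txt, and the target name my_app as well as the relative path to the CuMatrix checkout are illustrative assumptions.

cmake_minimum_required(VERSION 3.18)
# Assumption: the CUDA toolchain is installed and discoverable.
project(my_app LANGUAGES CXX CUDA)

# Pull in the CuMatrix config script; the checkout location shown here is hypothetical.
include(${CMAKE_CURRENT_LIST_DIR}/external/CuMatrix/cmake/CuMatrixConfig.cmake)

add_executable(my_app main.cpp main.cu ${CU_MATRIX_SOURCE_CPP})

# CU_MATRIX_INCLUDE_DIR holds the CUDA headers plus the repository root,
# so includes such as "CuMatrix/Buffers/ManagedBuffer.h" resolve directly.
target_include_directories(my_app PRIVATE ${CU_MATRIX_INCLUDE_DIR})
target_link_libraries(my_app PRIVATE ${CU_MATRIX_LIBS})

# The __device__ lambda in main.cu needs extended-lambda support from nvcc.
target_compile_options(my_app PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)

Because CU_MATRIX_SOURCE_CPP is currently empty, the library is effectively header-only; ${CU_MATRIX_LIBS} still supplies the CUDA runtime and component libraries to link against.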