├── README.md ├── bvh.cpp ├── bvh.h ├── camera.cpp ├── camera.h ├── cuda_pathtracer.cu ├── cuda_pathtracer.h ├── cutil_math.h ├── dragonDOF2.png ├── dragonDOF3.png ├── dragonDOF4.png ├── geometry.h ├── golddragon3.png ├── golddragon4.png ├── linear_algebra.h ├── loader.cpp ├── loader.h └── main.cpp /README.md: -------------------------------------------------------------------------------- 1 | GPU path tracing tutorial 3 2 | Implementing a BVH acceleration structure on the GPU 3 | by Sam lapere, 2016 4 | 5 | More info and screenshots on 6 | 7 | http://raytracey.blogspot.co.nz/2016/01/gpu-path-tracing-tutorial-3-take-your.html 8 | 9 | BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras 10 | (http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html) 11 | 12 | Interactive camera with depth of field and plastic (coat) material based on CUDA path tracer code 13 | by Peter Kutz and Yining Karl Li (https://github.com/peterkutz/GPUPathTracer) 14 | 15 | Phong metal code based on "Realistic Ray Tracing" by Peter Shirley 16 | 17 | Features: 18 | - Fast interactive GPU path tracer 19 | - progressive rendering 20 | - support for diffuse, specular (mirror), refractive, acrylic/coat and metal Phong materials 21 | - support for spheres and triangle meshes 22 | - BVH acceleration structure built with SAH (Surface Area Heuristic) and binning 23 | - interactive camera with mouse and keyboard controls 24 | - anti-aliasing 25 | - depth-of-field 26 | 27 | 28 | Instructions for compiling with Visual Studio 2013/2015: 29 | 30 | - install the CUDA 6.5/7/7.5 toolkit and choose integration with Visual Studio 31 | - open VS2013/2015 (Express or any other version such as the free Community version) 32 | - click New Project... 
33 | - select Visual C++, then General, then Empty Project 34 | - right click on the project, select Build Dependencies > Build Customizations 35 | - select the CUDA 6.5 (or 7 or 7.5) checkbox, click OK 36 | - in the project explorer window, right click on Source Files, select Add, C++ file, then change the name from "Source.cpp" to "cuda_pathtracer.cu" 37 | - in the project explorer window, right click on the newly created cuda_pathtracer.cu file, select CUDA C++ 38 | - paste the code from cuda_pathtracer.cu in the file 39 | - add the other .h (header) and .cpp files to the project 40 | - right click on the project name, select Properties 41 | - under Linker > Input > Additional Dependencies, add "cudart.lib" and "glew32.lib" (glew32.lib should be automatically found when the CUDA toolkit is installed, if not, you can manually add the path to Linker > General > Additional Library Directories, the path is something like "%NVSDKCOMPUTE_ROOT%\C\common\lib") 42 | - disable SAFESEH by selecting NO in Linker > Advanced > Image Has Safe Exception Handlers 43 | - select Build > Rebuild Solution 44 | - run the program (at the moment there is no CUDA error checking, but so far everything has worked fine even when running the program for prolonged periods) 45 | 46 | Screenshots produced with this code: 47 | 48 | ![Image description](https://github.com/straaljager/GPU-path-tracing-tutorial-3/blob/master/dragonDOF2.png) 49 | 50 | ![Image description](https://github.com/straaljager/GPU-path-tracing-tutorial-3/blob/master/dragonDOF3.png) 51 | 52 | ![Image description](https://github.com/straaljager/GPU-path-tracing-tutorial-3/blob/master/dragonDOF4.png) 53 | 54 | ![Image description](https://github.com/straaljager/GPU-path-tracing-tutorial-3/blob/master/golddragon3.png) 55 | 56 | ![Image description](https://github.com/straaljager/GPU-path-tracing-tutorial-3/blob/master/golddragon4.png) 57 | 58 | -------------------------------------------------------------------------------- 
/bvh.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016 3 | * BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras, 4 | * http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 | */ 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include "bvh.h" 29 | #include "geometry.h" 30 | #include "cuda_pathtracer.h" 31 | 32 | using namespace std; 33 | 34 | // report progress during BVH construction 35 | #define PROGRESS_REPORT 36 | #ifdef PROGRESS_REPORT 37 | #define REPORT(x) x 38 | #define REPORTPRM(x) x, 39 | #else 40 | #define REPORT(x) 41 | #define REPORTPRM(x) 42 | #endif 43 | 44 | unsigned g_reportCounter = 0; 45 | 46 | // The BVH 47 | BVHNode* g_pSceneBVH = NULL; 48 | 49 | // the cache-friendly version of the BVH, to be stored in a file 50 | unsigned g_triIndexListNo = 0; 51 | int* g_triIndexList = NULL; 52 | unsigned g_pCFBVH_No = 0; 53 | CacheFriendlyBVHNode* g_pCFBVH = NULL; 54 | 55 | 56 | ////////////////////////////////////////////////// 57 | // First, the "pure" implementation of the BVH 58 | 
////////////////////////////////////////////////// 59 | 60 | // Work item for creation of BVH: 61 | struct BBoxTmp { 62 | // Bottom point (ie minx,miny,minz) 63 | Vector3Df _bottom; 64 | // Top point (ie maxx,maxy,maxz) 65 | Vector3Df _top; 66 | // Center point, ie 0.5*(top+bottom) 67 | Vector3Df _center; // = bbox centroid 68 | // Triangle 69 | const Triangle *_pTri; // pointer to the single triangle this box bounds 70 | BBoxTmp() 71 | : 72 | _bottom(FLT_MAX, FLT_MAX, FLT_MAX), 73 | _top(-FLT_MAX, -FLT_MAX, -FLT_MAX), 74 | _pTri(NULL) 75 | {} 76 | }; 77 | 78 | // BVH CONSTRUCTION 79 | // This builds the BVH, finding optimal split planes for each depth 80 | // uses binning: divide the work bounding box into a number of equally sized "bins" along one of the axes 81 | // choose axis and splitting plane resulting in least cost (determined by surface area heuristic or SAH) 82 | // SAH (surface area heuristic): the larger the surface area of a bounding box, the costlier it is to raytrace 83 | // find the bbox with the minimum surface area 84 | // 85 | // I strongly recommend reading Ingo Wald's 2007 paper "On fast SAH based BVH construction", 86 | // http://www.sci.utah.edu/~wald/Publications/2007/ParallelBVHBuild/fastbuild.pdf, to understand the code below 87 | 88 | 89 | typedef std::vector<BBoxTmp> BBoxEntries; // vector of triangle bounding boxes needed during BVH construction 90 | 91 | // recursive building of BVH nodes 92 | // work is the working list (std::vector<>) of triangle bounding boxes 93 | 94 | BVHNode *Recurse(BBoxEntries& work, REPORTPRM(float pct = 0.) int depth = 0) 95 | { 96 | 97 | REPORT(float pctSpan = 11. 
/ pow(3.f, depth);) 98 | 99 | // terminate recursion case: 100 | // if work set has less then 4 elements (triangle bounding boxes), create a leaf node 101 | // and create a list of the triangles contained in the node 102 | 103 | if (work.size() < 4) { 104 | 105 | BVHLeaf *leaf = new BVHLeaf; 106 | for (BBoxEntries::iterator it = work.begin(); it != work.end(); it++) 107 | leaf->_triangles.push_back(it->_pTri); 108 | return leaf; 109 | } 110 | 111 | // else, work size > 4, divide node further into smaller nodes 112 | // start by finding the working list's bounding box (top and bottom) 113 | 114 | Vector3Df bottom(FLT_MAX, FLT_MAX, FLT_MAX); 115 | Vector3Df top(-FLT_MAX, -FLT_MAX, -FLT_MAX); 116 | 117 | // loop over all bboxes in current working list, expanding/growing the working list bbox 118 | for (unsigned i = 0; i < work.size(); i++) { // meer dan 4 bboxen in work 119 | BBoxTmp& v = work[i]; 120 | bottom = min3(bottom, v._bottom); 121 | top = max3(top, v._top); 122 | } 123 | 124 | // SAH, surface area heuristic calculation 125 | // find surface area of bounding box by multiplying the dimensions of the working list's bounding box 126 | float side1 = top.x - bottom.x; // length bbox along X-axis 127 | float side2 = top.y - bottom.y; // length bbox along Y-axis 128 | float side3 = top.z - bottom.z; // length bbox along Z-axis 129 | 130 | // the current bbox has a cost of (number of triangles) * surfaceArea of C = N * SA 131 | float minCost = work.size() * (side1*side2 + side2*side3 + side3*side1); 132 | 133 | float bestSplit = FLT_MAX; // best split along axis, will indicate no split with better cost found (below) 134 | 135 | int bestAxis = -1; 136 | 137 | // Try all 3 axises X, Y, Z 138 | for (int j = 0; j < 3; j++) { // 0 = X, 1 = Y, 2 = Z axis 139 | 140 | int axis = j; 141 | 142 | // we will try dividing the triangles based on the current axis, 143 | // and we will try split values from "start" to "stop", one "step" at a time. 
144 | float start, stop, step; 145 | 146 | // X-axis 147 | if (axis == 0) { 148 | start = bottom.x; 149 | stop = top.x; 150 | } 151 | // Y-axis 152 | else if (axis == 1) { 153 | start = bottom.y; 154 | stop = top.y; 155 | } 156 | // Z-axis 157 | else { 158 | start = bottom.z; 159 | stop = top.z; 160 | } 161 | 162 | // In that axis, do the bounding boxes in the work queue "span" across, (meaning distributed over a reasonable distance)? 163 | // Or are they all already "packed" on the axis? Meaning that they are too close to each other 164 | if (fabsf(stop - start)<1e-4) 165 | // BBox side along this axis too short, we must move to a different axis! 166 | continue; // go to next axis 167 | 168 | // Binning: Try splitting at a uniform sampling (at equidistantly spaced planes) that gets smaller the deeper we go: 169 | // size of "sampling grid": 1024 (depth 0), 512 (depth 1), etc 170 | // each bin has size "step" 171 | step = (stop - start) / (1024. / (depth + 1.)); 172 | 173 | #ifdef PROGRESS_REPORT 174 | // Progress report variables... 
175 | float pctStart = pct + j*pctSpan; // j is axis 176 | float pctStep = pctSpan / ((stop - start - 2 * step) / step); 177 | #endif 178 | 179 | // for each bin (equally spaced bins of size "step"): 180 | for (float testSplit = start + step; testSplit < stop - step; testSplit += step) { 181 | 182 | #ifdef PROGRESS_REPORT 183 | if ((1023 & g_reportCounter++) == 0) { 184 | std::printf("\b\b\b%02d%%", int(pctStart)); fflush(stdout); 185 | } 186 | pctStart += pctStep; 187 | #endif 188 | 189 | // Create left and right bounding box 190 | Vector3Df lbottom(FLT_MAX, FLT_MAX, FLT_MAX); 191 | Vector3Df ltop(-FLT_MAX, -FLT_MAX, -FLT_MAX); 192 | 193 | Vector3Df rbottom(FLT_MAX, FLT_MAX, FLT_MAX); 194 | Vector3Df rtop(-FLT_MAX, -FLT_MAX, -FLT_MAX); 195 | 196 | // The number of triangles in the left and right bboxes (needed to calculate SAH cost function) 197 | int countLeft = 0, countRight = 0; 198 | 199 | // For each test split (or bin), allocate triangles in remaining work list based on their bbox centers 200 | // this is a fast O(N) pass, no triangle sorting needed (yet) 201 | for (unsigned i = 0; i_triangles.push_back(it->_pTri); // put triangles of working list in leaf's triangle list 265 | return leaf; 266 | } 267 | 268 | // Otherwise, create BVH inner node with L and R child nodes, split with the optimal value we found above 269 | 270 | BBoxEntries left; 271 | BBoxEntries right; // BBoxEntries is a vector/list of BBoxTmp 272 | Vector3Df lbottom(FLT_MAX, FLT_MAX, FLT_MAX); 273 | Vector3Df ltop(-FLT_MAX, -FLT_MAX, -FLT_MAX); 274 | Vector3Df rbottom(FLT_MAX, FLT_MAX, FLT_MAX); 275 | Vector3Df rtop(-FLT_MAX, -FLT_MAX, -FLT_MAX); 276 | 277 | // distribute the triangles in the left or right child nodes 278 | // for each triangle in the work set 279 | for (int i = 0; i < (int)work.size(); i++) { 280 | 281 | // create temporary bbox for triangle 282 | BBoxTmp& v = work[i]; 283 | 284 | // compute bbox center 285 | float value; 286 | if (bestAxis == 0) value = v._center.x; 287 | 
else if (bestAxis == 1) value = v._center.y; 288 | else value = v._center.z; 289 | 290 | if (value < bestSplit) { // add temporary bbox v from work list to left BBoxentries list, 291 | // becomes new working list of triangles in next step 292 | 293 | left.push_back(v); 294 | lbottom = min3(lbottom, v._bottom); 295 | ltop = max3(ltop, v._top); 296 | } 297 | else { 298 | 299 | // Add triangle bbox v from working list to right BBoxentries, 300 | // becomes new working list of triangles in next step 301 | right.push_back(v); 302 | rbottom = min3(rbottom, v._bottom); 303 | rtop = max3(rtop, v._top); 304 | } 305 | } // end loop for each triangle in working set 306 | 307 | // create inner node 308 | BVHInner *inner = new BVHInner; 309 | 310 | #ifdef PROGRESS_REPORT 311 | if ((1023 & g_reportCounter++) == 0) { 312 | std::printf("\b\b\b%2d%%", int(pct + 3.f*pctSpan)); // Update progress indicator 313 | fflush(stdout); 314 | } 315 | #endif 316 | // recursively build the left child 317 | inner->_left = Recurse(left, REPORTPRM(pct + 3.f*pctSpan) depth + 1); 318 | inner->_left->_bottom = lbottom; 319 | inner->_left->_top = ltop; 320 | 321 | #ifdef PROGRESS_REPORT 322 | if ((1023 & g_reportCounter++) == 0) { 323 | std::printf("\b\b\b%2d%%", int(pct + 6.f*pctSpan)); // Update progress indicator 324 | fflush(stdout); 325 | } 326 | #endif 327 | // recursively build the right child 328 | inner->_right = Recurse(right, REPORTPRM(pct + 6.f*pctSpan) depth + 1); 329 | inner->_right->_bottom = rbottom; 330 | inner->_right->_top = rtop; 331 | 332 | return inner; 333 | } // end of Recurse() function, returns the rootnode (when all recursion calls have finished) 334 | 335 | BVHNode *CreateBVH() 336 | { 337 | /* Summary: 338 | 1. Create work BBox 339 | 2. Create BBox for every triangle and compute bounds 340 | 3. Expand bounds work BBox to fit all triangle bboxes 341 | 4. Compute triangle bbox centre and add triangle to working list 342 | 5. Build BVH tree with Recurse() 343 | 6. 
Return root node 344 | */ 345 | 346 | std::vector<BBoxTmp> work; /* one entry per scene triangle */ 347 | Vector3Df bottom(FLT_MAX, FLT_MAX, FLT_MAX); 348 | Vector3Df top(-FLT_MAX, -FLT_MAX, -FLT_MAX); 349 | 350 | puts("Gathering bounding box info from all triangles..."); 351 | // for each triangle 352 | for (unsigned j = 0; j < g_trianglesNo; j++) { 353 | 354 | const Triangle& triangle = g_triangles[j]; 355 | 356 | // create a new temporary bbox per triangle 357 | BBoxTmp b; 358 | b._pTri = &triangle; 359 | 360 | // loop over triangle vertices and pick smallest vertex for bottom of triangle bbox 361 | b._bottom = min3(b._bottom, g_vertices[triangle._idx1]); // index of vertex 362 | b._bottom = min3(b._bottom, g_vertices[triangle._idx2]); 363 | b._bottom = min3(b._bottom, g_vertices[triangle._idx3]); 364 | 365 | // loop over triangle vertices and pick largest vertex for top of triangle bbox 366 | b._top = max3(b._top, g_vertices[triangle._idx1]); 367 | b._top = max3(b._top, g_vertices[triangle._idx2]); 368 | b._top = max3(b._top, g_vertices[triangle._idx3]); 369 | 370 | // expand working list bbox by largest and smallest triangle bbox bounds 371 | bottom = min3(bottom, b._bottom); 372 | top = max3(top, b._top); 373 | 374 | // compute triangle bbox center: (bbox top + bbox bottom) * 0.5 375 | b._center = (b._top + b._bottom) * 0.5f; 376 | 377 | // add triangle bbox to working list 378 | work.push_back(b); 379 | } 380 | 381 | // ...and pass it to the recursive function that creates the SAH AABB BVH 382 | // (Surface Area Heuristic, Axis-Aligned Bounding Boxes, Bounding Volume Hierarchy) 383 | 384 | std::printf("Creating Bounding Volume Hierarchy data... 
"); fflush(stdout); 385 | BVHNode* root = Recurse(work); // builds BVH and returns root node 386 | printf("\b\b\b100%%\n"); 387 | 388 | root->_bottom = bottom; // bottom is bottom of bbox bounding all triangles in the scene 389 | root->_top = top; 390 | 391 | return root; 392 | } 393 | 394 | // the following functions are required to create the cache-friendly BVH 395 | 396 | // recursively count bboxes 397 | int CountBoxes(BVHNode *root) 398 | { 399 | if (!root->IsLeaf()) { 400 | BVHInner *p = dynamic_cast<BVHInner*>(root); 401 | return 1 + CountBoxes(p->_left) + CountBoxes(p->_right); 402 | } 403 | else 404 | return 1; 405 | } 406 | 407 | // recursively count triangles 408 | unsigned CountTriangles(BVHNode *root) 409 | { 410 | if (!root->IsLeaf()) { 411 | BVHInner *p = dynamic_cast<BVHInner*>(root); 412 | return CountTriangles(p->_left) + CountTriangles(p->_right); 413 | } 414 | else { 415 | BVHLeaf *p = dynamic_cast<BVHLeaf*>(root); 416 | return (unsigned)p->_triangles.size(); 417 | } 418 | } 419 | 420 | // recursively count depth 421 | void CountDepth(BVHNode *root, int depth, int& maxDepth) 422 | { 423 | if (maxDepth < depth) /* remember the deepest level reached so far */ 424 | maxDepth = depth; 425 | if (!root->IsLeaf()) { 426 | BVHInner *p = dynamic_cast<BVHInner*>(root); 427 | CountDepth(p->_left, depth + 1, maxDepth); 428 | CountDepth(p->_right, depth + 1, maxDepth); 429 | } 430 | } 431 | 432 | // Writes in the g_pCFBVH and g_triIndexList arrays, 433 | // creating a cache-friendly version of the BVH 434 | void PopulateCacheFriendlyBVH( 435 | const Triangle *pFirstTriangle, 436 | BVHNode *root, 437 | unsigned& idxBoxes, 438 | unsigned &idxTriList) 439 | { 440 | unsigned currIdxBoxes = idxBoxes; 441 | g_pCFBVH[currIdxBoxes]._bottom = root->_bottom; 442 | g_pCFBVH[currIdxBoxes]._top = root->_top; 443 | 444 | //DEPTH FIRST APPROACH (left first until complete) 445 | if (!root->IsLeaf()) { // inner node 446 | BVHInner *p = dynamic_cast<BVHInner*>(root); 447 | // recursively populate left and right 448 | int idxLeft = ++idxBoxes; 449 | PopulateCacheFriendlyBVH(pFirstTriangle, p->_left, idxBoxes, idxTriList); 
450 | int idxRight = ++idxBoxes; 451 | PopulateCacheFriendlyBVH(pFirstTriangle, p->_right, idxBoxes, idxTriList); 452 | g_pCFBVH[currIdxBoxes].u.inner._idxLeft = idxLeft; 453 | g_pCFBVH[currIdxBoxes].u.inner._idxRight = idxRight; 454 | } 455 | 456 | else { // leaf 457 | BVHLeaf *p = dynamic_cast<BVHLeaf*>(root); 458 | unsigned count = (unsigned)p->_triangles.size(); 459 | g_pCFBVH[currIdxBoxes].u.leaf._count = 0x80000000 | count; // highest bit set indicates a leaf node (inner node if highest bit is 0) 460 | g_pCFBVH[currIdxBoxes].u.leaf._startIndexInTriIndexList = idxTriList; 461 | /* store each triangle as its index relative to pFirstTriangle */ 462 | for (std::list<const Triangle*>::iterator it = p->_triangles.begin(); it != p->_triangles.end(); it++) 463 | { 464 | g_triIndexList[idxTriList++] = *it - pFirstTriangle; 465 | } 466 | } 467 | } 468 | 469 | void CreateCFBVH() 470 | { 471 | if (!g_pSceneBVH) { 472 | puts("Internal bug in CreateCFBVH, please report it..."); fflush(stdout); 473 | exit(1); 474 | } 475 | 476 | unsigned idxTriList = 0; 477 | unsigned idxBoxes = 0; 478 | 479 | g_triIndexListNo = CountTriangles(g_pSceneBVH); 480 | g_triIndexList = new int[g_triIndexListNo]; 481 | 482 | g_pCFBVH_No = CountBoxes(g_pSceneBVH); 483 | g_pCFBVH = new CacheFriendlyBVHNode[g_pCFBVH_No]; // array 484 | 485 | PopulateCacheFriendlyBVH(&g_triangles[0], g_pSceneBVH, idxBoxes, idxTriList); 486 | 487 | if ((idxBoxes != g_pCFBVH_No - 1) || (idxTriList != g_triIndexListNo)) { 488 | puts("Internal bug in CreateCFBVH, please report it..."); fflush(stdout); 489 | exit(1); 490 | } 491 | 492 | int maxDepth = 0; 493 | CountDepth(g_pSceneBVH, 0, maxDepth); 494 | if (maxDepth >= BVH_STACK_SIZE) { 495 | printf("Max depth of BVH was %d\n", maxDepth); 496 | puts("Recompile with BVH_STACK_SIZE set to more than that..."); fflush(stdout); 497 | exit(1); 498 | } 499 | } 500 | 501 | // The gateway - creates the "pure" BVH, and then copies the results in the cache-friendly one 502 | void UpdateBoundingVolumeHierarchy(const char *filename) 503 | { 504 | if (!g_pSceneBVH) { 505 | 
std::string BVHcacheFilename(filename); 506 | BVHcacheFilename += ".bvh"; 507 | FILE *fp = fopen(BVHcacheFilename.c_str(), "rb"); 508 | if (!fp) { 509 | // No cached BVH data - we need to calculate them 510 | Clock me; 511 | g_pSceneBVH = CreateBVH(); 512 | printf("Building the BVH%s took %.2f seconds\n", 513 | #ifdef SIMD_SSE 514 | " with SSE", // SIMD SSE building has been removed for the tutorial 515 | #else 516 | "", 517 | #endif 518 | me.readMS() / 1000.); 519 | 520 | // Now that the BVH has been created, copy its data into a more cache-friendly format 521 | // (CacheFriendlyBVHNode occupies exactly 32 bytes, i.e. a cache-line) 522 | CreateCFBVH(); 523 | 524 | // Now store the results, if possible... 525 | fp = fopen(BVHcacheFilename.c_str(), "wb"); 526 | if (!fp) return; /* caching is best-effort; the in-memory BVH is already built */ 527 | if (1 != fwrite(&g_pCFBVH_No, sizeof(unsigned), 1, fp)) { fclose(fp); return; } /* close the handle on every failed write */ 528 | if (1 != fwrite(&g_triIndexListNo, sizeof(unsigned), 1, fp)) { fclose(fp); return; } 529 | if (g_pCFBVH_No != fwrite(g_pCFBVH, sizeof(CacheFriendlyBVHNode), g_pCFBVH_No, fp)) { fclose(fp); return; } 530 | if (g_triIndexListNo != fwrite(g_triIndexList, sizeof(int), g_triIndexListNo, fp)) { fclose(fp); return; } 531 | fclose(fp); 532 | } 533 | else { // BVH has been built already and stored in a file, read the file 534 | puts("Cache exists, reading the pre-calculated BVH data..."); 535 | if (1 != fread(&g_pCFBVH_No, sizeof(unsigned), 1, fp)) { fclose(fp); return; } /* close the handle on every failed read */ 536 | if (1 != fread(&g_triIndexListNo, sizeof(unsigned), 1, fp)) { fclose(fp); return; } 537 | g_pCFBVH = new CacheFriendlyBVHNode[g_pCFBVH_No]; 538 | g_triIndexList = new int[g_triIndexListNo]; 539 | if (g_pCFBVH_No != fread(g_pCFBVH, sizeof(CacheFriendlyBVHNode), g_pCFBVH_No, fp)) { fclose(fp); return; } 540 | if (g_triIndexListNo != fread(g_triIndexList, sizeof(int), g_triIndexListNo, fp)) { fclose(fp); return; } 541 | fclose(fp); 542 | } 543 | } 544 | } 545 | -------------------------------------------------------------------------------- /bvh.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CUDA based triangle mesh 
path tracer using BVH acceleration by Sam lapere, 2016 3 | * BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras, 4 | * http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 | */ 20 | #ifndef __BVH_H_ 21 | #define __BVH_H_ 22 | 23 | #include <list> 24 | #include "linear_algebra.h" 25 | #include "geometry.h" 26 | 27 | 28 | // The nice version of the BVH - a shallow hierarchy of inner and leaf nodes 29 | struct BVHNode { 30 | Vector3Df _bottom; 31 | Vector3Df _top; 32 | virtual bool IsLeaf() = 0; // pure virtual 33 | }; 34 | 35 | struct BVHInner : BVHNode { 36 | BVHNode *_left; 37 | BVHNode *_right; 38 | virtual bool IsLeaf() { return false; } 39 | }; 40 | 41 | struct BVHLeaf : BVHNode { 42 | std::list<const Triangle*> _triangles; /* triangles held by this leaf */ 43 | virtual bool IsLeaf() { return true; } 44 | }; 45 | 46 | struct CacheFriendlyBVHNode { 47 | // bounding box 48 | Vector3Df _bottom; 49 | Vector3Df _top; 50 | 51 | // parameters for leafnodes and innernodes occupy same space (union) to save memory 52 | // top bit discriminates between leafnode and innernode 53 | // no pointers, but indices (int): faster 54 | 55 | union { 56 | // inner node - stores indexes to array of CacheFriendlyBVHNode 57 | struct { 58 | unsigned _idxLeft; 59 
| unsigned _idxRight; 60 | } inner; 61 | // leaf node: stores triangle count and starting index in triangle list 62 | struct { 63 | unsigned _count; // Top-most bit set, leafnode if set, innernode otherwise 64 | unsigned _startIndexInTriIndexList; 65 | } leaf; 66 | } u; 67 | }; 68 | 69 | // The ugly, cache-friendly form of the BVH: 32 bytes 70 | void CreateCFBVH(); // CacheFriendlyBVH 71 | 72 | // The single-point entrance to the BVH - call only this 73 | void UpdateBoundingVolumeHierarchy(const char *filename); 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /camera.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016 3 | * Interactive camera with depth of field based on CUDA path tracer code 4 | * by Peter Kutz and Yining Karl Li, https://github.com/peterkutz/GPUPathTracer 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 | */ 20 | #include "camera.h" 21 | 22 | InteractiveCamera::InteractiveCamera() 23 | { 24 | centerPosition = Vector3Df(0, 0, 0); 25 | yaw = 0; 26 | pitch = 0.3; 27 | radius = 4; 28 | apertureRadius = 0.04; // 0.04 29 | focalDistance = 4.0f; 30 | 31 | resolution = make_float2(512, 512); // width, height 32 | fov = make_float2(40, 40); 33 | } 34 | 35 | InteractiveCamera::~InteractiveCamera() {} 36 | 37 | void InteractiveCamera::changeYaw(float m){ 38 | yaw += m; 39 | fixYaw(); 40 | } 41 | 42 | void InteractiveCamera::changePitch(float m){ 43 | pitch += m; 44 | fixPitch(); 45 | } 46 | 47 | void InteractiveCamera::changeRadius(float m){ 48 | radius += radius * m; // Change proportional to current radius. Assuming radius isn't allowed to go to zero. 49 | fixRadius(); 50 | } 51 | 52 | void InteractiveCamera::changeAltitude(float m){ 53 | centerPosition.y += m; 54 | //fixCenterPosition(); 55 | } 56 | 57 | void InteractiveCamera::goForward(float m){ 58 | centerPosition += viewDirection * m; 59 | } 60 | 61 | void InteractiveCamera::strafe(float m){ 62 | Vector3Df strafeAxis = cross(viewDirection, Vector3Df(0, 1, 0)); 63 | strafeAxis.normalize(); 64 | centerPosition += strafeAxis * m; 65 | } 66 | 67 | void InteractiveCamera::rotateRight(float m){ 68 | float yaw2 = yaw; 69 | yaw2 += m; 70 | float pitch2 = pitch; 71 | float xDirection = sin(yaw2) * cos(pitch2); 72 | float yDirection = sin(pitch2); 73 | float zDirection = cos(yaw2) * cos(pitch2); 74 | Vector3Df directionToCamera = Vector3Df(xDirection, yDirection, zDirection); 75 | viewDirection = directionToCamera * (-1.0); 76 | } 77 | 78 | void InteractiveCamera::changeApertureDiameter(float m){ 79 | apertureRadius += (apertureRadius + 0.01) * m; // Change proportional to current apertureRadius. 
80 | fixApertureRadius(); 81 | } 82 | 83 | 84 | void InteractiveCamera::changeFocalDistance(float m){ 85 | focalDistance += m; 86 | fixFocalDistance(); 87 | } 88 | 89 | /* Stores the new resolution and re-derives fov.y from the current fov.x through setFOVX. */ 90 | void InteractiveCamera::setResolution(float x, float y){ 91 | resolution = make_float2(x, y); 92 | setFOVX(fov.x); 93 | } 94 | /* Converts an angle from radians to degrees. */ 95 | float radiansToDegrees(float radians) { 96 | float degrees = radians * 180.0 / M_PI; 97 | return degrees; 98 | } 99 | /* Converts an angle from degrees to radians. */ 100 | float degreesToRadians(float degrees) { 101 | float radians = degrees / 180.0 * M_PI; 102 | return radians; 103 | } 104 | /* Sets the horizontal field of view (in degrees) and derives the vertical one from the resolution's height/width ratio. */ 105 | void InteractiveCamera::setFOVX(float fovx){ 106 | fov.x = fovx; 107 | fov.y = radiansToDegrees(atan(tan(degreesToRadians(fovx) * 0.5) * (resolution.y / resolution.x)) * 2.0); 108 | // resolution.x and resolution.y are floats, so the ratio above is a float division 109 | } 110 | /* Fills renderCamera from the interactive state: the eye is placed at distance radius from centerPosition along the yaw/pitch direction and looks back towards centerPosition. Also refreshes the viewDirection member. */ 111 | void InteractiveCamera::buildRenderCamera(Camera* renderCamera){ 112 | float xDirection = sin(yaw) * cos(pitch); 113 | float yDirection = sin(pitch); 114 | float zDirection = cos(yaw) * cos(pitch); 115 | Vector3Df directionToCamera = Vector3Df(xDirection, yDirection, zDirection); 116 | viewDirection = directionToCamera * (-1.0); 117 | Vector3Df eyePosition = centerPosition +directionToCamera * radius; 118 | //Vector3Df eyePosition = centerPosition; // rotate camera from stationary viewpoint 119 | 120 | 121 | renderCamera->position = eyePosition; 122 | renderCamera->view = viewDirection; 123 | renderCamera->up = Vector3Df(0, 1, 0); 124 | renderCamera->resolution = make_float2(resolution.x, resolution.y); 125 | renderCamera->fov = make_float2(fov.x, fov.y); 126 | renderCamera->apertureRadius = apertureRadius; 127 | renderCamera->focalDistance = focalDistance; 128 | } 129 | 130 | float mod(float x, float y) { // floored modulo: the result takes the sign of y, so for y > 0 it lies in [0, y) up to float rounding 131 | return x - y * floorf(x / y); 132 | } 133 | 134 | void InteractiveCamera::fixYaw() { 135 | yaw = mod(yaw, 2 * M_PI); // Normalize the yaw. 
136 | } 137 | 138 | float clamp2(float n, float low, float high) { 139 | n = fminf(n, high); 140 | n = fmaxf(n, low); 141 | return n; 142 | } 143 | 144 | void InteractiveCamera::fixPitch() { 145 | float padding = 0.05; 146 | pitch = clamp2(pitch, -PI_OVER_TWO + padding, PI_OVER_TWO - padding); // Limit the pitch. 147 | } 148 | 149 | void InteractiveCamera::fixRadius() { 150 | float minRadius = 0.2; 151 | float maxRadius = 100.0; 152 | radius = clamp2(radius, minRadius, maxRadius); 153 | } 154 | 155 | void InteractiveCamera::fixApertureRadius() { 156 | float minApertureRadius = 0.0; 157 | float maxApertureRadius = 25.0; 158 | apertureRadius = clamp2(apertureRadius, minApertureRadius, maxApertureRadius); 159 | } 160 | 161 | void InteractiveCamera::fixFocalDistance() { 162 | float minFocalDist = 0.2; 163 | float maxFocalDist = 100.0; 164 | focalDistance = clamp2(focalDistance, minFocalDist, maxFocalDist); 165 | } 166 | -------------------------------------------------------------------------------- /camera.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016 3 | * Interactive camera with depth of field based on CUDA path tracer code 4 | * by Peter Kutz and Yining Karl Li, https://github.com/peterkutz/GPUPathTracer 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 | */ 20 | #ifndef __CAMERA_H__ 21 | #define __CAMERA_H__ 22 | 23 | #include "geometry.h" 24 | #include "linear_algebra.h" 25 | #include 26 | 27 | #define M_PI 3.14159265358979323846 /* pi */ 28 | #define PI_OVER_TWO 1.5707963267948966192313216916397514420985 29 | 30 | // Camera struct, used to store interactive camera data, copied to the GPU and used by CUDA for each frame 31 | struct Camera { 32 | float2 resolution; 33 | Vector3Df position; 34 | Vector3Df view; 35 | Vector3Df up; 36 | float2 fov; 37 | float apertureRadius; 38 | float focalDistance; 39 | }; 40 | 41 | // class for interactive camera object, updated on the CPU for each frame and copied into Camera struct 42 | class InteractiveCamera 43 | { 44 | private: 45 | 46 | Vector3Df centerPosition; 47 | Vector3Df viewDirection; 48 | float yaw; 49 | float pitch; 50 | float radius; 51 | float apertureRadius; 52 | float focalDistance; 53 | 54 | void fixYaw(); 55 | void fixPitch(); 56 | void fixRadius(); 57 | void fixApertureRadius(); 58 | void fixFocalDistance(); 59 | 60 | public: 61 | InteractiveCamera(); 62 | virtual ~InteractiveCamera(); 63 | void changeYaw(float m); 64 | void changePitch(float m); 65 | void changeRadius(float m); 66 | void changeAltitude(float m); 67 | void changeFocalDistance(float m); 68 | void strafe(float m); 69 | void goForward(float m); 70 | void rotateRight(float m); 71 | void changeApertureDiameter(float m); 72 | void setResolution(float x, float y); 73 | void setFOVX(float fovx); 74 | 75 | void buildRenderCamera(Camera* renderCamera); 76 | 77 | float2 resolution; 78 | float2 fov; 79 | }; 80 | 81 | 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /cuda_pathtracer.cu: 
-------------------------------------------------------------------------------- 1 | /* 2 | * CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016 3 | * BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras, 4 | * http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html 5 | * Interactive camera with depth of field based on CUDA path tracer code 6 | * by Peter Kutz and Yining Karl Li, https://github.com/peterkutz/GPUPathTracer 7 | * 8 | * This program is free software; you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation; either version 2 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 
17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program; if not, write to the Free Software 20 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include "device_launch_parameters.h" 32 | #include "cutil_math.h" 33 | #include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\extras\CUPTI\include\GL\glew.h" 34 | #include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\extras\CUPTI\include\GL\glut.h" 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include "cuda_pathtracer.h" 41 | 42 | #define M_PI 3.1415926535897932384626422832795028841971f 43 | #define TWO_PI 6.2831853071795864769252867665590057683943f 44 | #define NUDGE_FACTOR 1e-3f // epsilon 45 | #define samps 1 // samples 46 | #define BVH_STACK_SIZE 32 47 | #define SCREEN_DIST (height*2) 48 | 49 | int texturewidth = 0; 50 | int textureheight = 0; 51 | int total_number_of_triangles; 52 | 53 | __device__ int depth = 0; 54 | 55 | 56 | // Textures for vertices, triangles and BVH data 57 | // (see CudaRender() below, as well as main() to see the data setup process) 58 | texture g_triIdxListTexture; 59 | texture g_pCFBVHlimitsTexture; 60 | texture g_pCFBVHindexesOrTrilistsTexture; 61 | texture g_trianglesTexture; 62 | 63 | Vertex* cudaVertices; 64 | float* cudaTriangleIntersectionData; 65 | int* cudaTriIdxList = NULL; 66 | float* cudaBVHlimits = NULL; 67 | int* cudaBVHindexesOrTrilists = NULL; 68 | Triangle* cudaTriangles = NULL; 69 | Camera* cudaRendercam = NULL; 70 | 71 | 72 | struct Ray { 73 | float3 orig; // ray origin 74 | float3 dir; // ray direction 75 | __device__ Ray(float3 o_, float3 d_) : orig(o_), dir(d_) {} 76 | }; 77 | 78 | enum Refl_t { DIFF, METAL, SPEC, REFR, COAT }; // material types 79 | 80 | struct Sphere { 81 | 82 | float rad; // radius 83 | float3 pos, 
emi, col; // position, emission, color 84 | Refl_t refl; // reflection type (DIFFuse, SPECular, REFRactive) 85 | 86 | __device__ float intersect(const Ray &r) const { // returns distance, 0 if nohit 87 | 88 | // Ray/sphere intersection 89 | // Quadratic formula required to solve ax^2 + bx + c = 0 90 | // Solution x = (-b +- sqrt(b*b - 4ac)) / 2a 91 | // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0 92 | 93 | float3 op = pos - r.orig; // 94 | float t, epsilon = 0.01f; 95 | float b = dot(op, r.dir); 96 | float disc = b*b - dot(op, op) + rad*rad; // discriminant 97 | if (disc<0) return 0; else disc = sqrtf(disc); 98 | return (t = b - disc)>epsilon ? t : ((t = b + disc)>epsilon ? t : 0); 99 | } 100 | 101 | }; 102 | 103 | __device__ Sphere spheres[] = { 104 | 105 | // sun 106 | { 1.6, { 0.0f, 2.8, 0 }, { 6, 4, 2 }, { 0.f, 0.f, 0.f }, DIFF }, // 37, 34, 30 X: links rechts Y: op neer 107 | //{ 1600, { 3000.0f, 10, 6000 }, { 17, 14, 10 }, { 0.f, 0.f, 0.f }, DIFF }, 108 | 109 | // horizon sun2 110 | // { 1560, { 3500.0f, 0, 7000 }, { 50, 25, 2.5 }, { 0.f, 0.f, 0.f }, DIFF }, // 150, 75, 7.5 111 | 112 | // sky 113 | //{ 10000, { 50.0f, 40.8f, -1060 }, { 0.1, 0.3, 0.55 }, { 0.175f, 0.175f, 0.25f }, DIFF }, // 0.0003, 0.01, 0.15, or brighter: 0.2, 0.3, 0.6 114 | { 10000, { 50.0f, 40.8f, -1060 }, { 0.51, 0.51, 0.51 }, { 0.175f, 0.175f, 0.25f }, DIFF }, 115 | 116 | // ground 117 | { 100000, { 0.0f, -100001.1, 0 }, { 0, 0, 0 }, { 0.5f, 0.0f, 0.0f }, COAT }, 118 | { 100000, { 0.0f, -100001.2, 0 }, { 0, 0, 0 }, { 0.3f, 0.3f, 0.3f }, DIFF }, // double shell to prevent light leaking 119 | 120 | // horizon brightener 121 | { 110000, { 50.0f, -110048.5, 0 }, { 3.6, 2.0, 0.2 }, { 0.f, 0.f, 0.f }, DIFF }, 122 | // mountains 123 | //{ 4e4, { 50.0f, -4e4 - 30, -3000 }, { 0, 0, 0 }, { 0.2f, 0.2f, 0.2f }, DIFF }, 124 | // white Mirr 125 | { 1.1, { 1.6, 0, 1.0 }, { 0, 0.0, 0 }, { 0.9f, .9f, 0.9f }, SPEC } 126 | // Glass 127 | //{ 0.3, { 0.0f, -0.4, 4 }, { .0, 0., .0 }, { 0.9f, 0.9f, 0.9f 
}, DIFF }, 128 | // Glass2 129 | //{ 22, { 87.0f, 22, 24 }, { 0, 0, 0 }, { 0.9f, 0.9f, 0.9f }, SPEC }, 130 | }; 131 | 132 | 133 | // Create OpenGL BGR value for assignment in OpenGL VBO buffer 134 | __device__ int getColor(Vector3Df& p) // converts Vector3Df colour to int 135 | { 136 | return (((unsigned)p.z) << 16) | (((unsigned)p.y) << 8) | (((unsigned)p.x)); 137 | } 138 | 139 | // Helper function, that checks whether a ray intersects a bounding box (BVH node) 140 | __device__ bool RayIntersectsBox(const Vector3Df& originInWorldSpace, const Vector3Df& rayInWorldSpace, int boxIdx) 141 | { 142 | // set Tnear = - infinity, Tfar = infinity 143 | // 144 | // For each pair of planes P associated with X, Y, and Z do: 145 | // (example using X planes) 146 | // if direction Xd = 0 then the ray is parallel to the X planes, so 147 | // if origin Xo is not between the slabs ( Xo < Xl or Xo > Xh) then 148 | // return false 149 | // else, if the ray is not parallel to the plane then 150 | // begin 151 | // compute the intersection distance of the planes 152 | // T1 = (Xl - Xo) / Xd 153 | // T2 = (Xh - Xo) / Xd 154 | // If T1 > T2 swap (T1, T2) /* since T1 intersection with near plane */ 155 | // If T1 > Tnear set Tnear =T1 /* want largest Tnear */ 156 | // If T2 < Tfar set Tfar="T2" /* want smallest Tfar */ 157 | // If Tnear > Tfar box is missed so 158 | // return false 159 | // If Tfar < 0 box is behind ray 160 | // return false 161 | // end 162 | // end of for loop 163 | 164 | float Tnear, Tfar; 165 | Tnear = -FLT_MAX; 166 | Tfar = FLT_MAX; 167 | 168 | float2 limits; 169 | 170 | // box intersection routine 171 | #define CHECK_NEAR_AND_FAR_INTERSECTION(c) \ 172 | if (rayInWorldSpace.##c == 0.f) { \ 173 | if (originInWorldSpace.##c < limits.x) return false; \ 174 | if (originInWorldSpace.##c > limits.y) return false; \ 175 | } else { \ 176 | float T1 = (limits.x - originInWorldSpace.##c)/rayInWorldSpace.##c; \ 177 | float T2 = (limits.y - 
originInWorldSpace.##c)/rayInWorldSpace.##c; \ 178 | if (T1>T2) { float tmp=T1; T1=T2; T2=tmp; } \ 179 | if (T1 > Tnear) Tnear = T1; \ 180 | if (T2 < Tfar) Tfar = T2; \ 181 | if (Tnear > Tfar) return false; \ 182 | if (Tfar < 0.f) return false; \ 183 | } 184 | 185 | limits = tex1Dfetch(g_pCFBVHlimitsTexture, 3 * boxIdx); // box.bottom._x/top._x placed in limits.x/limits.y 186 | //limits = make_float2(cudaBVHlimits[6 * boxIdx + 0], cudaBVHlimits[6 * boxIdx + 1]); 187 | CHECK_NEAR_AND_FAR_INTERSECTION(x) 188 | limits = tex1Dfetch(g_pCFBVHlimitsTexture, 3 * boxIdx + 1); // box.bottom._y/top._y placed in limits.x/limits.y 189 | //limits = make_float2(cudaBVHlimits[6 * boxIdx + 2], cudaBVHlimits[6 * boxIdx + 3]); 190 | CHECK_NEAR_AND_FAR_INTERSECTION(y) 191 | limits = tex1Dfetch(g_pCFBVHlimitsTexture, 3 * boxIdx + 2); // box.bottom._z/top._z placed in limits.x/limits.y 192 | //limits = make_float2(cudaBVHlimits[6 * boxIdx + 4], cudaBVHlimits[6 * boxIdx + 5]); 193 | CHECK_NEAR_AND_FAR_INTERSECTION(z) 194 | 195 | // If Box survived all above tests, return true with intersection point Tnear and exit point Tfar. 196 | return true; 197 | } 198 | 199 | 200 | ////////////////////////////////////////// 201 | // BVH intersection routine // 202 | // using CUDA texture memory // 203 | ////////////////////////////////////////// 204 | 205 | // there are 3 forms of the BVH: a "pure" BVH, a cache-friendly BVH (taking up less memory space than the pure BVH) 206 | // and a "textured" BVH which stores its data in CUDA texture memory (which is cached). The last one is gives the 207 | // best performance and is used here. 

// Finds the closest triangle hit by a ray by walking the BVH stored in texture memory.
//
//   origin, ray          - ray origin and direction in world space
//   avoidSelf            - index of a triangle to ignore (the one the ray starts on), or -1 for none
//   pBestTriIdx          - out: index of the closest triangle hit, -1 if none
//   pointHitInWorldSpace - out: hit point (only written when a triangle is hit)
//   kAB, kBC, kCA        - out: signed distances of the hit point to the three edge planes
//   hitdist              - out: distance from origin to the hit (only written when a triangle is hit)
//
// The raw pointer parameters and boxnormal are not used: all data is read
// through the textures. They are kept so that callers need not change.
//
// Returns true if a triangle was hit. Returns false early if the traversal
// stack would overflow; in that case the output parameters hold the best hit
// found up to that point.
__device__ bool BVH_IntersectTriangles(
    int* cudaBVHindexesOrTrilists, const Vector3Df& origin, const Vector3Df& ray, unsigned avoidSelf,
    int& pBestTriIdx, Vector3Df& pointHitInWorldSpace, float& kAB, float& kBC, float& kCA, float& hitdist,
    float* cudaBVHlimits, float* cudaTriangleIntersectionData, int* cudaTriIdxList, Vector3Df& boxnormal)
{
    // in the loop below, maintain the closest triangle and the point where we hit it:
    pBestTriIdx = -1;

    // squared distance to the closest hit so far, start from infinity
    float bestTriDist = FLT_MAX;

    // create a stack for each ray
    // the stack is just a fixed size array of indices to BVH nodes
    int stack[BVH_STACK_SIZE];
    int stackIdx = 0;
    stack[stackIdx++] = 0; // start at the root node

    // while the stack is not empty
    while (stackIdx) {

        // pop a BVH node (or AABB, Axis Aligned Bounding Box) from the stack
        int boxIdx = stack[--stackIdx];

        // fetch the data (indices to childnodes or index in triangle list + trianglecount) associated with this node
        uint4 data = tex1Dfetch(g_pCFBVHindexesOrTrilistsTexture, boxIdx);

        // the highest bit of data.x tells the node type:
        // 0 = inner node, 1 = leaf node (the remaining 31 bits then hold the triangle count)
        if (!(data.x & 0x80000000)) { // INNER NODE

            // only descend if the ray intersects this node's box
            if (!RayIntersectsBox(origin, ray, boxIdx))
                continue;

            // give up if the two children do not fit on the stack;
            // checked BEFORE pushing so that we never write past the end of the array
            if (stackIdx + 2 > BVH_STACK_SIZE)
                return false;

            stack[stackIdx++] = data.y; // right child node index
            stack[stackIdx++] = data.z; // left child node index (popped first)
            continue;
        }

        // LEAF NODE
        // data.w is start index in triangle list
        // data.x stores number of triangles in leafnode (the bitwise AND operation extracts the triangle number)
        const unsigned firstTri = data.w;
        const unsigned triCount = data.x & 0x7fffffff;

        // loop over every triangle in the leaf node
        for (unsigned i = firstTri; i < firstTri + triCount; i++) {

            // fetch the index of the current triangle
            int idx = tex1Dfetch(g_triIdxListTexture, i).x;

            // check if triangle is the same as the one intersected by previous ray
            // to avoid self-reflections/refractions
            if (avoidSelf == (unsigned)idx)
                continue;

            // use the pre-computed triangle intersection data, 5 float4 per triangle:
            // [0] center (not needed here), [1] normal + d, [2..4] edge planes e1/d1, e2/d2, e3/d3
            float4 normal = tex1Dfetch(g_trianglesTexture, 5 * idx + 1);

            float k = dot(normal, ray);
            if (k == 0.0f)
                continue; // this triangle is parallel to the ray, ignore it.

            // distance along the ray to the triangle's plane
            float s = (normal.w - dot(normal, origin)) / k;
            if (s <= NUDGE_FACTOR) // plane is behind the origin or closer than epsilon
                continue;

            Vector3Df hit = ray * s;
            hit += origin;

            // ray triangle intersection
            // Is the intersection of the ray with the triangle's plane INSIDE the triangle?
            // It is if it lies on the inner side of all three edge planes.
            float4 ee1 = tex1Dfetch(g_trianglesTexture, 5 * idx + 2);
            float kt1 = dot(ee1, hit) - ee1.w;
            if (kt1 < 0.0f) continue;

            float4 ee2 = tex1Dfetch(g_trianglesTexture, 5 * idx + 3);
            float kt2 = dot(ee2, hit) - ee2.w;
            if (kt2 < 0.0f) continue;

            float4 ee3 = tex1Dfetch(g_trianglesTexture, 5 * idx + 4);
            float kt3 = dot(ee3, hit) - ee3.w;
            if (kt3 < 0.0f) continue;

            // ray intersects triangle, "hit" is the world space coordinate of the intersection.
            // is this intersection closer than all the others?
            float hitZ = distancesq(origin, hit);
            if (hitZ < bestTriDist) {

                // maintain the closest hit
                bestTriDist = hitZ;
                hitdist = sqrtf(bestTriDist);
                pBestTriIdx = idx;
                pointHitInWorldSpace = hit;

                // store distances to the three edges (for texturing, not used for now)
                kAB = kt1;
                kBC = kt2;
                kCA = kt3;
            }
        }
    }

    return pBestTriIdx != -1;
}

//////////////////////
// PATH TRACING
//////////////////////

// Mirror reflection of direction `dir` about the surface normal `n`.
__device__ Vector3Df mirrorDirection(const Vector3Df& dir, const Vector3Df& n)
{
    return dir - n * 2.0f * dot(n, dir);
}

// Draws a cosine weighted random direction on the hemisphere around `normal`
// (consumes two random numbers from randstate). Based on smallpt by Kevin Beason.
__device__ Vector3Df sampleCosineHemisphere(curandState* randstate, const Vector3Df& normal)
{
    // pick two random numbers
    float phi = 2 * M_PI * curand_uniform(randstate);
    float r2 = curand_uniform(randstate);
    float r2s = sqrtf(r2);

    // compute orthonormal coordinate frame uvw with hitpoint as origin
    Vector3Df w = normal; w.normalize();
    Vector3Df u = cross((fabsf(w.x) > 0.1f ? Vector3Df(0, 1, 0) : Vector3Df(1, 0, 0)), w); u.normalize();
    Vector3Df v = cross(w, u);

    // compute cosine weighted random ray direction on hemisphere
    Vector3Df dir = u*cosf(phi)*r2s + v*sinf(phi)*r2s + w*sqrtf(1 - r2);
    dir.normalize();
    return dir;
}

// Traces one light path through the scene (BVH triangles + hard-coded spheres)
// and returns the radiance that arrives along it.
//
//   randstate          - per-thread random number generator state
//   originInWorldSpace - origin of the primary ray
//   rayInWorldSpace    - direction of the primary ray
//   avoidSelf          - triangle index to ignore for the first segment, -1 for none
//   pTriangles         - triangle array, used to look up the normal of the triangle hit
// The remaining pointers are forwarded to BVH_IntersectTriangles.
__device__ Vector3Df path_trace(curandState *randstate, Vector3Df originInWorldSpace, Vector3Df rayInWorldSpace, int avoidSelf,
    Triangle *pTriangles, int* cudaBVHindexesOrTrilists, float* cudaBVHlimits, float* cudaTriangleIntersectionData, int* cudaTriIdxList)
{

    // colour mask
    Vector3Df mask = Vector3Df(1.0f, 1.0f, 1.0f);
    // accumulated colour
    Vector3Df accucolor = Vector3Df(0.0f, 0.0f, 0.0f);

    const int numspheres = sizeof(spheres) / sizeof(Sphere);

    for (int bounces = 0; bounces < 5; bounces++) { // iteration over up to 5 path segments (instead of recursion in CPU code)

        int sphere_id = -1;
        int triangle_id = -1;
        int pBestTriIdx = -1;
        int geomtype = -1; // -1 = nothing hit, 1 = sphere, 2 = triangle
        Vector3Df pointHitInWorldSpace;
        float kAB = 0.f, kBC = 0.f, kCA = 0.f; // distances from the 3 edges of the triangle (from where we hit it), to be used for texturing

        float d = 1e20f;
        float scene_t = 1e20f;      // distance to the closest hit so far
        float hitdistance = 1e20f;  // distance to the closest triangle, unchanged if no triangle is hit
        Vector3Df f = Vector3Df(0, 0, 0);
        Vector3Df emit = Vector3Df(0, 0, 0);
        Vector3Df x;  // intersection point
        Vector3Df n;  // normal
        Vector3Df nl; // oriented normal
        Vector3Df boxnormal = Vector3Df(0, 0, 0);
        Vector3Df dw; // ray direction of next path segment
        Refl_t refltype = DIFF;

        float3 rayorig = make_float3(originInWorldSpace.x, originInWorldSpace.y, originInWorldSpace.z);
        float3 raydir = make_float3(rayInWorldSpace.x, rayInWorldSpace.y, rayInWorldSpace.z);

        // intersect all triangles in the scene stored in BVH
        BVH_IntersectTriangles(
            cudaBVHindexesOrTrilists, originInWorldSpace, rayInWorldSpace, avoidSelf,
            pBestTriIdx, pointHitInWorldSpace, kAB, kBC, kCA, hitdistance, cudaBVHlimits,
            cudaTriangleIntersectionData, cudaTriIdxList, boxnormal);

        // intersect all spheres in the scene
        // keep track of distance from origin to closest intersection point
        const Ray ray(rayorig, raydir);
        for (int i = numspheres - 1; i >= 0; i--) {
            d = spheres[i].intersect(ray);
            if (d != 0.0f && d < scene_t) { scene_t = d; sphere_id = i; geomtype = 1; }
        }

        // a triangle wins if it is closer than the closest sphere
        if (hitdistance < scene_t && hitdistance > 0.002f) // EPSILON
        {
            scene_t = hitdistance;
            triangle_id = pBestTriIdx;
            geomtype = 2;
        }

        // If the surface we actually landed on is a triangle, ignore that
        // triangle for the next segment so that we don't get self-shadow or
        // self-reflection from it. If a sphere was closer, no triangle must be
        // skipped (the triangle found behind the sphere is still a valid target
        // for the next ray).
        avoidSelf = (geomtype == 2) ? pBestTriIdx : -1;

        // Nothing hit: the path ends here. Light gathered on earlier segments
        // is kept (returned after the loop).
        if (geomtype == -1)
            break;

        // SPHERES:
        if (geomtype == 1) {

            Sphere &sphere = spheres[sphere_id]; // hit object with closest intersection
            x = originInWorldSpace + rayInWorldSpace * scene_t; // intersection point on object
            n = Vector3Df(x.x - sphere.pos.x, x.y - sphere.pos.y, x.z - sphere.pos.z); // normal
            n.normalize();
            nl = dot(n, rayInWorldSpace) < 0 ? n : n * -1; // correctly oriented normal
            f = Vector3Df(sphere.col.x, sphere.col.y, sphere.col.z); // object colour
            refltype = sphere.refl;
            emit = Vector3Df(sphere.emi.x, sphere.emi.y, sphere.emi.z); // object emission
            accucolor += (mask * emit);
        }

        // TRIANGLES:
        if (geomtype == 2) {

            const Triangle *pBestTri = &pTriangles[triangle_id];

            x = pointHitInWorldSpace; // intersection point
            n = pBestTri->_normal; // normal
            n.normalize();
            nl = dot(n, rayInWorldSpace) < 0 ? n : n * -1; // correctly oriented normal

            f = Vector3Df(0.9f, 0.3f, 0.0f); // hardcoded triangle colour
            //f = pBestTri->_colorf;
            refltype = COAT;
            emit = Vector3Df(0, 0, 0); // object emission
            accucolor += (mask * emit);
        }

        // basic material system, all parameters are hard-coded (such as phong exponent, index of refraction)
        switch (refltype) {

        // diffuse material, based on smallpt by Kevin Beason
        case DIFF: {
            dw = sampleCosineHemisphere(randstate, nl);

            // offset origin next path segment to prevent self intersection
            pointHitInWorldSpace = x + nl * 0.01f; // scene size dependent

            // multiply mask with colour of object
            mask *= f;
            break;
        }

        // Phong metal material from "Realistic Ray Tracing", P. Shirley
        case METAL: {
            // compute random perturbation of ideal reflection vector
            // the higher the phong exponent, the closer the perturbed vector is to the ideal reflection direction
            float phi = 2 * M_PI * curand_uniform(randstate);
            float r2 = curand_uniform(randstate);
            float phongexponent = 20;
            float cosTheta = powf(1 - r2, 1.0f / (phongexponent + 1));
            float sinTheta = sqrtf(1 - cosTheta * cosTheta);

            // create orthonormal basis uvw around reflection vector with hitpoint as origin
            // w is ray direction for ideal reflection
            Vector3Df w = mirrorDirection(rayInWorldSpace, n); w.normalize();
            Vector3Df u = cross((fabsf(w.x) > 0.1f ? Vector3Df(0, 1, 0) : Vector3Df(1, 0, 0)), w); u.normalize();
            Vector3Df v = cross(w, u); // v is normalised by default

            // compute the perturbed reflection direction
            dw = u * cosf(phi) * sinTheta + v * sinf(phi) * sinTheta + w * cosTheta;
            dw.normalize();

            // offset origin next path segment to prevent self intersection
            pointHitInWorldSpace = x + w * 0.01f; // scene size dependent

            // multiply mask with colour of object
            mask *= f;
            break;
        }

        // specular material (perfect mirror)
        case SPEC: {
            // compute reflected ray direction (angle of reflection equals angle of incidence)
            dw = mirrorDirection(rayInWorldSpace, n);

            // offset origin next path segment to prevent self intersection
            pointHitInWorldSpace = x + nl * 0.01f; // scene size dependent

            // multiply mask with colour of object
            mask *= f;
            break;
        }

        // COAT material based on https://github.com/peterkutz/GPUPathTracer
        // randomly select diffuse or specular reflection
        // looks okay-ish but inaccurate (no Fresnel calculation yet)
        case COAT: {
            float rouletteRandomFloat = curand_uniform(randstate);
            float threshold = 0.05f;
            Vector3Df specularColor = Vector3Df(1, 1, 1); // hard-coded
            bool reflectFromSurface = (rouletteRandomFloat < threshold);

            if (reflectFromSurface) { // calculate perfectly specular reflection

                // Ray reflected from the surface. Trace a ray in the reflection direction.
                // TODO: Use Russian roulette instead of simple multipliers! (Selecting between diffuse sample and no sample (absorption) in this case.)
                mask *= specularColor;
                dw = mirrorDirection(rayInWorldSpace, n);
            }
            else { // calculate perfectly diffuse reflection

                dw = sampleCosineHemisphere(randstate, nl);

                // multiply mask with colour of object
                mask *= f;
                //mask *= make_Vector3Df(0.15f, 0.15f, 0.15f); // gold metal
            }

            // offset origin next path segment to prevent self intersection
            pointHitInWorldSpace = x + nl * 0.01f; // scene size dependent
            break;
        }

        // perfectly refractive material (glass, water)
        case REFR: {
            bool into = dot(n, nl) > 0; // is ray entering or leaving refractive material?
            float nc = 1.0f;  // Index of Refraction air
            float nt = 1.5f;  // Index of Refraction glass/water
            float nnt = into ? nc / nt : nt / nc;  // IOR ratio of refractive materials
            float ddn = dot(rayInWorldSpace, nl);
            float cos2t = 1.0f - nnt*nnt * (1.f - ddn*ddn);

            if (cos2t < 0.0f) // total internal reflection
            {
                dw = mirrorDirection(rayInWorldSpace, n);

                // offset origin next path segment to prevent self intersection
                pointHitInWorldSpace = x + nl * 0.01f; // scene size dependent
            }
            else // cos2t > 0
            {
                // compute direction of transmission ray
                Vector3Df tdir = rayInWorldSpace * nnt;
                tdir -= n * ((into ? 1 : -1) * (ddn*nnt + sqrtf(cos2t)));
                tdir.normalize();

                // Schlick's approximation of the Fresnel reflectance.
                // R0 = ((nt - nc) / (nt + nc))^2 - the denominator must be
                // parenthesised, otherwise the value is multiplied by (nt + nc).
                float R0 = (nt - nc)*(nt - nc) / ((nt + nc)*(nt + nc));
                float c = 1.f - (into ? -ddn : dot(tdir, n));
                float Re = R0 + (1.f - R0) * c * c * c * c * c;
                float Tr = 1 - Re; // Transmission
                float P = .25f + .5f * Re;  // probability of choosing the reflection ray
                float RP = Re / P;          // weight of the reflection ray
                float TP = Tr / (1.f - P);  // weight of the transmission ray

                // randomly choose reflection or transmission ray; the choice
                // must be made with probability P, because RP and TP are the
                // weights that compensate for exactly that probability
                if (curand_uniform(randstate) < P) // reflection ray
                {
                    mask *= RP;
                    dw = mirrorDirection(rayInWorldSpace, n);

                    pointHitInWorldSpace = x + nl * 0.01f; // scene size dependent
                }
                else // transmission ray
                {
                    mask *= TP;
                    dw = tdir;
                    pointHitInWorldSpace = x + nl * 0.001f; // epsilon must be small to avoid artefacts
                }
            }
            break;
        }
        }

        // set up origin and direction of next path segment
        originInWorldSpace = pointHitInWorldSpace;
        rayInWorldSpace = dw;
    }

    return accucolor;
}

// Reinterprets four colour bytes as one float so that a packed colour can be
// stored in the third component of a Vector3Df.
union Colour  // 4 bytes = 4 chars = 1 float
{
    float c;
    uchar4 components;
};

// the core path tracing kernel,
647 | // running in parallel for all pixels 648 | __global__ void CoreLoopPathTracingKernel(Vector3Df* output, Vector3Df* accumbuffer, Triangle* pTriangles, Camera* cudaRendercam, 649 | int* cudaBVHindexesOrTrilists, float* cudaBVHlimits, float* cudaTriangleIntersectionData, 650 | int* cudaTriIdxList, unsigned int framenumber, unsigned int hashedframenumber) 651 | { 652 | // assign a CUDA thread to every pixel by using the threadIndex 653 | unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; 654 | unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; 655 | 656 | // global threadId, see richiesams blogspot 657 | int threadId = (blockIdx.x + blockIdx.y * gridDim.x) * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x; 658 | 659 | // create random number generator and initialise with hashed frame number, see RichieSams blogspot 660 | curandState randState; // state of the random number generator, to prevent repetition 661 | curand_init(hashedframenumber + threadId, 0, 0, &randState); 662 | 663 | Vector3Df finalcol; // final pixel colour 664 | Vector3Df rendercampos = Vector3Df(cudaRendercam->position.x, cudaRendercam->position.y, cudaRendercam->position.z); 665 | 666 | int i = (height - y - 1)*width + x; // pixel index in buffer 667 | int pixelx = x; // pixel x-coordinate on screen 668 | int pixely = height - y - 1; // pixel y-coordintate on screen 669 | 670 | finalcol = Vector3Df(0.0f, 0.0f, 0.0f); // reset colour to zero for every pixel 671 | 672 | for (int s = 0; s < samps; s++){ 673 | 674 | // compute primary ray direction 675 | // use camera view of current frame (transformed on CPU side) to create local orthonormal basis 676 | Vector3Df rendercamview = Vector3Df(cudaRendercam->view.x, cudaRendercam->view.y, cudaRendercam->view.z); rendercamview.normalize(); // view is already supposed to be normalized, but normalize it explicitly just in case. 
677 | Vector3Df rendercamup = Vector3Df(cudaRendercam->up.x, cudaRendercam->up.y, cudaRendercam->up.z); rendercamup.normalize(); 678 | Vector3Df horizontalAxis = cross(rendercamview, rendercamup); horizontalAxis.normalize(); // Important to normalize! 679 | Vector3Df verticalAxis = cross(horizontalAxis, rendercamview); verticalAxis.normalize(); // verticalAxis is normalized by default, but normalize it explicitly just for good measure. 680 | 681 | Vector3Df middle = rendercampos + rendercamview; 682 | Vector3Df horizontal = horizontalAxis * tanf(cudaRendercam->fov.x * 0.5 * (M_PI / 180)); // Now treating FOV as the full FOV, not half, so I multiplied it by 0.5. I also normzlized A and B, so there's no need to divide by the length of A or B anymore. Also normalized view and removed lengthOfView. Also removed the cast to float. 683 | Vector3Df vertical = verticalAxis * tanf(-cudaRendercam->fov.y * 0.5 * (M_PI / 180)); // Now treating FOV as the full FOV, not half, so I multiplied it by 0.5. I also normzlized A and B, so there's no need to divide by the length of A or B anymore. Also normalized view and removed lengthOfView. Also removed the cast to float. 684 | 685 | // anti-aliasing 686 | // calculate center of current pixel and add random number in X and Y dimension 687 | // based on https://github.com/peterkutz/GPUPathTracer 688 | float jitterValueX = curand_uniform(&randState) - 0.5; 689 | float jitterValueY = curand_uniform(&randState) - 0.5; 690 | float sx = (jitterValueX + pixelx) / (cudaRendercam->resolution.x - 1); 691 | float sy = (jitterValueY + pixely) / (cudaRendercam->resolution.y - 1); 692 | 693 | // compute pixel on screen 694 | Vector3Df pointOnPlaneOneUnitAwayFromEye = middle + ( horizontal * ((2 * sx) - 1)) + ( vertical * ((2 * sy) - 1)); 695 | Vector3Df pointOnImagePlane = rendercampos + ((pointOnPlaneOneUnitAwayFromEye - rendercampos) * cudaRendercam->focalDistance); // Important for depth of field! 
696 | 697 | // calculation of depth of field / camera aperture 698 | // based on https://github.com/peterkutz/GPUPathTracer 699 | 700 | Vector3Df aperturePoint; 701 | 702 | if (cudaRendercam->apertureRadius > 0.00001) { // the small number is an epsilon value. 703 | 704 | // generate random numbers for sampling a point on the aperture 705 | float random1 = curand_uniform(&randState); 706 | float random2 = curand_uniform(&randState); 707 | 708 | // randomly pick a point on the circular aperture 709 | float angle = TWO_PI * random1; 710 | float distance = cudaRendercam->apertureRadius * sqrtf(random2); 711 | float apertureX = cos(angle) * distance; 712 | float apertureY = sin(angle) * distance; 713 | 714 | aperturePoint = rendercampos + (horizontalAxis * apertureX) + (verticalAxis * apertureY); 715 | } 716 | else { // zero aperture 717 | aperturePoint = rendercampos; 718 | } 719 | 720 | // calculate ray direction of next ray in path 721 | Vector3Df apertureToImagePlane = pointOnImagePlane - aperturePoint; 722 | apertureToImagePlane.normalize(); // ray direction, needs to be normalised 723 | Vector3Df rayInWorldSpace = apertureToImagePlane; 724 | // in theory, this should not be required 725 | rayInWorldSpace.normalize(); 726 | 727 | // origin of next ray in path 728 | Vector3Df originInWorldSpace = aperturePoint; 729 | 730 | finalcol += path_trace(&randState, originInWorldSpace, rayInWorldSpace, -1, pTriangles, 731 | cudaBVHindexesOrTrilists, cudaBVHlimits, cudaTriangleIntersectionData, cudaTriIdxList) * (1.0f/samps); 732 | } 733 | 734 | // add pixel colour to accumulation buffer (accumulates all samples) 735 | accumbuffer[i] += finalcol; 736 | // averaged colour: divide colour by the number of calculated frames so far 737 | Vector3Df tempcol = accumbuffer[i] / framenumber; 738 | 739 | Colour fcolour; 740 | Vector3Df colour = Vector3Df(clamp(tempcol.x, 0.0f, 1.0f), clamp(tempcol.y, 0.0f, 1.0f), clamp(tempcol.z, 0.0f, 1.0f)); 741 | // convert from 96-bit to 24-bit 
colour + perform gamma correction
    fcolour.components = make_uchar4((unsigned char)(powf(colour.x, 1 / 2.2f) * 255), (unsigned char)(powf(colour.y, 1 / 2.2f) * 255), (unsigned char)(powf(colour.z, 1 / 2.2f) * 255), 1);
    // store pixel coordinates and pixelcolour in OpenGL readable outputbuffer
    output[i] = Vector3Df(x, y, fcolour.c);

}

bool g_bFirstTime = true;

// Logs a failed CUDA runtime call on stderr and lets the caller carry on.
// Rendering stays best-effort (a failed frame is not fatal), but the failure
// is no longer silent.
//   status - value returned by the CUDA runtime call
//   what   - short label of the call, used in the message
static void reportCudaError(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    }
}

// the gateway to CUDA, called from C++ (in void disp() in main.cpp)
//
// On the first call the scene buffers are bound to the file-level texture
// references; on every call one pass of CoreLoopPathTracingKernel is launched.
//
//   dptr, accumulatebuffer        - device buffers forwarded to the kernel as its
//                                   first two arguments
//   cudaTriangles                 - device triangle array, forwarded to the kernel
//   cudaBVHindexesOrTrilists      - device BVH node data, g_pCFBVH_No * sizeof(uint4) bytes are bound
//   cudaBVHlimits                 - device BVH bounds, g_pCFBVH_No * 6 floats are bound
//   cudaTriangleIntersectionData  - device triangle data, g_trianglesNo * 20 floats are bound
//   cudaTriIdxList                - device triangle index list, g_triIndexListNo entries are bound
//   framenumber, hashedframes     - forwarded to the kernel unchanged
//   cudaRendercam                 - device camera, forwarded to the kernel
//
// Launch layout: 16x16 thread blocks on a (width/16) x (height/16) grid, no
// dynamic shared memory, default stream. The grid size uses truncating
// division, so it covers the image only when width and height are multiples of
// 16 (true for the 1280x720 defined in cuda_pathtracer.h).
//
// Errors from the runtime are reported on stderr; the function itself still
// returns nothing.
void cudarender(Vector3Df* dptr, Vector3Df* accumulatebuffer, Triangle* cudaTriangles, int* cudaBVHindexesOrTrilists,
    float* cudaBVHlimits, float* cudaTriangleIntersectionData, int* cudaTriIdxList,
    unsigned framenumber, unsigned hashedframes, Camera* cudaRendercam){

    if (g_bFirstTime) {
        // if this is the first time cudarender() is called,
        // bind the scene data to CUDA textures!
        g_bFirstTime = false;

        // the counters are declared `unsigned`, so print them with %u
        printf("g_triIndexListNo: %u\n", g_triIndexListNo);
        printf("g_pCFBVH_No: %u\n", g_pCFBVH_No);
        printf("g_verticesNo: %u\n", g_verticesNo);
        printf("g_trianglesNo: %u\n", g_trianglesNo);

        // NOTE(review): the template arguments of cudaCreateChannelDesc were not
        // visible in the reviewed copy of this file. uint1 and uint4 follow from
        // the sizeof() used for the byte counts below; float2 and float4 are
        // assumptions - confirm all four against the element types of the
        // texture references they are bound to.
        cudaChannelFormatDesc channel1desc = cudaCreateChannelDesc<uint1>();
        reportCudaError(
            cudaBindTexture(NULL, &g_triIdxListTexture, cudaTriIdxList, &channel1desc, g_triIndexListNo * sizeof(uint1)),
            "cudaBindTexture(g_triIdxListTexture)");

        cudaChannelFormatDesc channel2desc = cudaCreateChannelDesc<float2>();
        reportCudaError(
            cudaBindTexture(NULL, &g_pCFBVHlimitsTexture, cudaBVHlimits, &channel2desc, g_pCFBVH_No * 6 * sizeof(float)),
            "cudaBindTexture(g_pCFBVHlimitsTexture)");

        cudaChannelFormatDesc channel3desc = cudaCreateChannelDesc<uint4>();
        reportCudaError(
            cudaBindTexture(NULL, &g_pCFBVHindexesOrTrilistsTexture, cudaBVHindexesOrTrilists, &channel3desc,
                g_pCFBVH_No * sizeof(uint4)),
            "cudaBindTexture(g_pCFBVHindexesOrTrilistsTexture)");

        //cudaChannelFormatDesc channel4desc = cudaCreateChannelDesc();
        //cudaBindTexture(NULL, &g_verticesTexture, cudaPtrVertices, &channel4desc, g_verticesNo * 8 * sizeof(float));

        cudaChannelFormatDesc channel5desc = cudaCreateChannelDesc<float4>();
        reportCudaError(
            cudaBindTexture(NULL, &g_trianglesTexture, cudaTriangleIntersectionData, &channel5desc, g_trianglesNo * 20 * sizeof(float)),
            "cudaBindTexture(g_trianglesTexture)");
    }

    dim3 block(16, 16, 1);   // dim3 CUDA specific syntax, block and grid are required to schedule CUDA threads over streaming multiprocessors
    dim3 grid(width / block.x, height / block.y, 1);

    // NOTE(review): the launch configuration was not visible in the reviewed
    // copy of this file; <<<grid, block>>> is assumed. The 1D alternative that
    // the original comment mentioned (ceil(width * height / 256) blocks of 256
    // threads) was only ever computed into otherwise unused locals, which have
    // been removed. Confirm against the kernel's index computation.
    CoreLoopPathTracingKernel<<<grid, block>>>(dptr, accumulatebuffer, cudaTriangles, cudaRendercam, cudaBVHindexesOrTrilists,
        cudaBVHlimits, cudaTriangleIntersectionData, cudaTriIdxList, framenumber, hashedframes);

    // a kernel launch returns no status: launch-configuration errors are only
    // visible through cudaGetLastError()
    reportCudaError(cudaGetLastError(), "CoreLoopPathTracingKernel launch");

}

--------------------------------------------------------------------------------
/cuda_pathtracer.h:
--------------------------------------------------------------------------------
/*
 * CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
 * BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras,
 * http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html
 * Interactive camera with depth of field based on CUDA path tracer code
 * by Peter Kutz and Yining Karl Li, https://github.com/peterkutz/GPUPathTracer
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program; if not, write to the Free Software 20 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 | */ 22 | #ifndef __CUDA_PATHTRACER_H_ 23 | #define __CUDA_PATHTRACER_H_ 24 | 25 | #include "linear_algebra.h" 26 | #include "geometry.h" 27 | #include "bvh.h" 28 | #include "camera.h" 29 | #include 30 | 31 | #define BVH_STACK_SIZE 32 32 | #define width 1280 // screenwidth 33 | #define height 720 // screenheight 34 | 35 | #define DBG_PUTS(level, msg) \ 36 | do { if (level <= 1) { puts(msg); fflush(stdout); }} while (0) 37 | 38 | // global variables 39 | extern unsigned g_verticesNo; 40 | extern Vertex* g_vertices; 41 | extern unsigned g_trianglesNo; 42 | extern Triangle* g_triangles; 43 | extern BVHNode* g_pSceneBVH; 44 | extern unsigned g_triIndexListNo; 45 | extern int* g_triIndexList; 46 | extern unsigned g_pCFBVH_No; 47 | extern CacheFriendlyBVHNode* g_pCFBVH; 48 | 49 | // The gateway to CUDA, called from C++ (src/main.cpp) 50 | 51 | void cudarender(Vector3Df* dptr, Vector3Df* accumulatebuffer, Triangle* cudaTriangles, int* cudaBVHindexesOrTrilists, 52 | float* cudaBVHlimits, float* cudaTriangleIntersectionData, int* cudaTriIdxList, unsigned framenumber, unsigned hashedframes, Camera* cudaRendercam); 53 | 54 | 55 | struct Clock { 56 | unsigned firstValue; 57 | Clock() { reset(); } 58 | void reset() { firstValue = clock(); } 59 | unsigned readMS() { return (clock() - firstValue) / (CLOCKS_PER_SEC / 1000); } 60 | }; 61 | 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /cutil_math.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | /* 13 | This file implements common mathematical operations on vector types 14 | (float3, float4 etc.) since these are not provided as standard by CUDA. 15 | 16 | The syntax is modelled on the Cg standard library. 17 | 18 | This is part of the CUTIL library and is not supported by NVIDIA. 19 | 20 | Thanks to Linh Hah for additions and fixes. 21 | */ 22 | 23 | #ifndef CUTIL_MATH_H 24 | #define CUTIL_MATH_H 25 | 26 | #include 27 | 28 | typedef unsigned int uint; 29 | typedef unsigned short ushort; 30 | 31 | #ifndef __CUDACC__ 32 | #include 33 | 34 | //////////////////////////////////////////////////////////////////////////////// 35 | // host implementations of CUDA functions 36 | //////////////////////////////////////////////////////////////////////////////// 37 | 38 | inline float fminf(float a, float b) 39 | { 40 | return a < b ? a : b; 41 | } 42 | 43 | inline float fmaxf(float a, float b) 44 | { 45 | return a > b ? a : b; 46 | } 47 | 48 | inline int max(int a, int b) 49 | { 50 | return a > b ? a : b; 51 | } 52 | 53 | inline int min(int a, int b) 54 | { 55 | return a < b ? 
a : b; 56 | } 57 | 58 | inline float rsqrtf(float x) 59 | { 60 | return 1.0f / sqrtf(x); 61 | } 62 | #endif 63 | 64 | //////////////////////////////////////////////////////////////////////////////// 65 | // constructors 66 | //////////////////////////////////////////////////////////////////////////////// 67 | 68 | inline __host__ __device__ float2 make_float2(float s) 69 | { 70 | return make_float2(s, s); 71 | } 72 | inline __host__ __device__ float2 make_float2(float3 a) 73 | { 74 | return make_float2(a.x, a.y); 75 | } 76 | inline __host__ __device__ float2 make_float2(int2 a) 77 | { 78 | return make_float2(float(a.x), float(a.y)); 79 | } 80 | inline __host__ __device__ float2 make_float2(uint2 a) 81 | { 82 | return make_float2(float(a.x), float(a.y)); 83 | } 84 | 85 | inline __host__ __device__ int2 make_int2(int s) 86 | { 87 | return make_int2(s, s); 88 | } 89 | inline __host__ __device__ int2 make_int2(int3 a) 90 | { 91 | return make_int2(a.x, a.y); 92 | } 93 | inline __host__ __device__ int2 make_int2(uint2 a) 94 | { 95 | return make_int2(int(a.x), int(a.y)); 96 | } 97 | inline __host__ __device__ int2 make_int2(float2 a) 98 | { 99 | return make_int2(int(a.x), int(a.y)); 100 | } 101 | 102 | inline __host__ __device__ uint2 make_uint2(uint s) 103 | { 104 | return make_uint2(s, s); 105 | } 106 | inline __host__ __device__ uint2 make_uint2(uint3 a) 107 | { 108 | return make_uint2(a.x, a.y); 109 | } 110 | inline __host__ __device__ uint2 make_uint2(int2 a) 111 | { 112 | return make_uint2(uint(a.x), uint(a.y)); 113 | } 114 | 115 | inline __host__ __device__ float3 make_float3(float s) 116 | { 117 | return make_float3(s, s, s); 118 | } 119 | inline __host__ __device__ float3 make_float3(float2 a) 120 | { 121 | return make_float3(a.x, a.y, 0.0f); 122 | } 123 | inline __host__ __device__ float3 make_float3(float2 a, float s) 124 | { 125 | return make_float3(a.x, a.y, s); 126 | } 127 | inline __host__ __device__ float3 make_float3(float4 a) 128 | { 129 | return 
make_float3(a.x, a.y, a.z); 130 | } 131 | inline __host__ __device__ float3 make_float3(int3 a) 132 | { 133 | return make_float3(float(a.x), float(a.y), float(a.z)); 134 | } 135 | inline __host__ __device__ float3 make_float3(uint3 a) 136 | { 137 | return make_float3(float(a.x), float(a.y), float(a.z)); 138 | } 139 | 140 | inline __host__ __device__ int3 make_int3(int s) 141 | { 142 | return make_int3(s, s, s); 143 | } 144 | inline __host__ __device__ int3 make_int3(int2 a) 145 | { 146 | return make_int3(a.x, a.y, 0); 147 | } 148 | inline __host__ __device__ int3 make_int3(int2 a, int s) 149 | { 150 | return make_int3(a.x, a.y, s); 151 | } 152 | inline __host__ __device__ int3 make_int3(uint3 a) 153 | { 154 | return make_int3(int(a.x), int(a.y), int(a.z)); 155 | } 156 | inline __host__ __device__ int3 make_int3(float3 a) 157 | { 158 | return make_int3(int(a.x), int(a.y), int(a.z)); 159 | } 160 | 161 | inline __host__ __device__ uint3 make_uint3(uint s) 162 | { 163 | return make_uint3(s, s, s); 164 | } 165 | inline __host__ __device__ uint3 make_uint3(uint2 a) 166 | { 167 | return make_uint3(a.x, a.y, 0); 168 | } 169 | inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) 170 | { 171 | return make_uint3(a.x, a.y, s); 172 | } 173 | inline __host__ __device__ uint3 make_uint3(uint4 a) 174 | { 175 | return make_uint3(a.x, a.y, a.z); 176 | } 177 | inline __host__ __device__ uint3 make_uint3(int3 a) 178 | { 179 | return make_uint3(uint(a.x), uint(a.y), uint(a.z)); 180 | } 181 | 182 | inline __host__ __device__ float4 make_float4(float s) 183 | { 184 | return make_float4(s, s, s, s); 185 | } 186 | inline __host__ __device__ float4 make_float4(float3 a) 187 | { 188 | return make_float4(a.x, a.y, a.z, 0.0f); 189 | } 190 | inline __host__ __device__ float4 make_float4(float3 a, float w) 191 | { 192 | return make_float4(a.x, a.y, a.z, w); 193 | } 194 | inline __host__ __device__ float4 make_float4(int4 a) 195 | { 196 | return make_float4(float(a.x), float(a.y), 
float(a.z), float(a.w));
}
inline __host__ __device__ float4 make_float4(uint4 a)
{
    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
}

// custom function vec4.xyz
//inline __host__ __device__ float3 fxyz(float4 a)
//{
//    return make_float3(float(a.x), float(a.y), float(a.z));
//}

inline __host__ __device__ int4 make_int4(int s)
{
    return make_int4(s, s, s, s);
}
inline __host__ __device__ int4 make_int4(int3 a)
{
    return make_int4(a.x, a.y, a.z, 0);
}
inline __host__ __device__ int4 make_int4(int3 a, int w)
{
    return make_int4(a.x, a.y, a.z, w);
}
inline __host__ __device__ int4 make_int4(uint4 a)
{
    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
}
inline __host__ __device__ int4 make_int4(float4 a)
{
    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
}


inline __host__ __device__ uint4 make_uint4(uint s)
{
    return make_uint4(s, s, s, s);
}
inline __host__ __device__ uint4 make_uint4(uint3 a)
{
    return make_uint4(a.x, a.y, a.z, 0);
}
inline __host__ __device__ uint4 make_uint4(uint3 a, uint w)
{
    return make_uint4(a.x, a.y, a.z, w);
}
inline __host__ __device__ uint4 make_uint4(int4 a)
{
    return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w));
}

////////////////////////////////////////////////////////////////////////////////
// negate
////////////////////////////////////////////////////////////////////////////////

// Component-wise unary minus.
// The operand is taken by value, like every other operator in this header.
// A non-const reference parameter cannot bind to a temporary or to a const
// object, which made expressions such as -(a + b) ill-formed.

inline __host__ __device__ float2 operator-(float2 a)
{
    return make_float2(-a.x, -a.y);
}
inline __host__ __device__ int2 operator-(int2 a)
{
    return make_int2(-a.x, -a.y);
}
inline __host__ __device__ float3 operator-(float3 a)
{
    return make_float3(-a.x, -a.y, -a.z);
}
inline __host__ __device__ int3 operator-(int3 a)
{
    return make_int3(-a.x, -a.y, -a.z);
}
inline __host__ __device__ float4 operator-(float4 a)
{
    return make_float4(-a.x, -a.y, -a.z, -a.w);
}
inline __host__ __device__ int4 operator-(int4 a)
{
    return make_int4(-a.x, -a.y, -a.z, -a.w);
}

////////////////////////////////////////////////////////////////////////////////
// addition
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
    return make_float2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(float2 &a, float2 b)
{
    a.x += b.x; a.y += b.y;
}
inline __host__ __device__ float2 operator+(float2 a, float b)
{
    return make_float2(a.x + b, a.y + b);
}
inline __host__ __device__ float2 operator+(float b, float2 a)
{
    return make_float2(a.x + b, a.y + b);
}
inline __host__ __device__ void operator+=(float2 &a, float b)
{
    a.x += b; a.y += b;
}

inline __host__ __device__ int2 operator+(int2 a, int2 b)
{
    return make_int2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(int2 &a, int2 b)
{
    a.x += b.x; a.y += b.y;
}
inline __host__ __device__ int2 operator+(int2 a, int b)
{
    return make_int2(a.x + b, a.y + b);
}
inline __host__ __device__ int2 operator+(int b, int2 a)
{
    return make_int2(a.x + b, a.y + b);
}
inline __host__ __device__ void operator+=(int2 &a, int b)
{
    a.x += b; a.y += b;
}

inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
{
    return make_uint2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void
operator+=(uint2 &a, uint2 b) 328 | { 329 | a.x += b.x; a.y += b.y; 330 | } 331 | inline __host__ __device__ uint2 operator+(uint2 a, uint b) 332 | { 333 | return make_uint2(a.x + b, a.y + b); 334 | } 335 | inline __host__ __device__ uint2 operator+(uint b, uint2 a) 336 | { 337 | return make_uint2(a.x + b, a.y + b); 338 | } 339 | inline __host__ __device__ void operator+=(uint2 &a, uint b) 340 | { 341 | a.x += b; a.y += b; 342 | } 343 | 344 | 345 | inline __host__ __device__ float3 operator+(float3 a, float3 b) 346 | { 347 | return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); 348 | } 349 | inline __host__ __device__ void operator+=(float3 &a, float3 b) 350 | { 351 | a.x += b.x; a.y += b.y; a.z += b.z; 352 | } 353 | inline __host__ __device__ float3 operator+(float3 a, float b) 354 | { 355 | return make_float3(a.x + b, a.y + b, a.z + b); 356 | } 357 | inline __host__ __device__ void operator+=(float3 &a, float b) 358 | { 359 | a.x += b; a.y += b; a.z += b; 360 | } 361 | 362 | inline __host__ __device__ int3 operator+(int3 a, int3 b) 363 | { 364 | return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); 365 | } 366 | inline __host__ __device__ void operator+=(int3 &a, int3 b) 367 | { 368 | a.x += b.x; a.y += b.y; a.z += b.z; 369 | } 370 | inline __host__ __device__ int3 operator+(int3 a, int b) 371 | { 372 | return make_int3(a.x + b, a.y + b, a.z + b); 373 | } 374 | inline __host__ __device__ void operator+=(int3 &a, int b) 375 | { 376 | a.x += b; a.y += b; a.z += b; 377 | } 378 | 379 | inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) 380 | { 381 | return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); 382 | } 383 | inline __host__ __device__ void operator+=(uint3 &a, uint3 b) 384 | { 385 | a.x += b.x; a.y += b.y; a.z += b.z; 386 | } 387 | inline __host__ __device__ uint3 operator+(uint3 a, uint b) 388 | { 389 | return make_uint3(a.x + b, a.y + b, a.z + b); 390 | } 391 | inline __host__ __device__ void operator+=(uint3 &a, uint b) 392 | { 393 | a.x += b; a.y += 
b; a.z += b; 394 | } 395 | 396 | inline __host__ __device__ int3 operator+(int b, int3 a) 397 | { 398 | return make_int3(a.x + b, a.y + b, a.z + b); 399 | } 400 | inline __host__ __device__ uint3 operator+(uint b, uint3 a) 401 | { 402 | return make_uint3(a.x + b, a.y + b, a.z + b); 403 | } 404 | inline __host__ __device__ float3 operator+(float b, float3 a) 405 | { 406 | return make_float3(a.x + b, a.y + b, a.z + b); 407 | } 408 | 409 | inline __host__ __device__ float4 operator+(float4 a, float4 b) 410 | { 411 | return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 412 | } 413 | inline __host__ __device__ void operator+=(float4 &a, float4 b) 414 | { 415 | a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; 416 | } 417 | inline __host__ __device__ float4 operator+(float4 a, float b) 418 | { 419 | return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); 420 | } 421 | inline __host__ __device__ float4 operator+(float b, float4 a) 422 | { 423 | return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); 424 | } 425 | inline __host__ __device__ void operator+=(float4 &a, float b) 426 | { 427 | a.x += b; a.y += b; a.z += b; a.w += b; 428 | } 429 | 430 | inline __host__ __device__ int4 operator+(int4 a, int4 b) 431 | { 432 | return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 433 | } 434 | inline __host__ __device__ void operator+=(int4 &a, int4 b) 435 | { 436 | a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; 437 | } 438 | inline __host__ __device__ int4 operator+(int4 a, int b) 439 | { 440 | return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); 441 | } 442 | inline __host__ __device__ int4 operator+(int b, int4 a) 443 | { 444 | return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); 445 | } 446 | inline __host__ __device__ void operator+=(int4 &a, int b) 447 | { 448 | a.x += b; a.y += b; a.z += b; a.w += b; 449 | } 450 | 451 | inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) 452 | { 453 | return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + 
b.w); 454 | } 455 | inline __host__ __device__ void operator+=(uint4 &a, uint4 b) 456 | { 457 | a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; 458 | } 459 | inline __host__ __device__ uint4 operator+(uint4 a, uint b) 460 | { 461 | return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); 462 | } 463 | inline __host__ __device__ uint4 operator+(uint b, uint4 a) 464 | { 465 | return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); 466 | } 467 | inline __host__ __device__ void operator+=(uint4 &a, uint b) 468 | { 469 | a.x += b; a.y += b; a.z += b; a.w += b; 470 | } 471 | 472 | //////////////////////////////////////////////////////////////////////////////// 473 | // subtract 474 | //////////////////////////////////////////////////////////////////////////////// 475 | 476 | inline __host__ __device__ float2 operator-(float2 a, float2 b) 477 | { 478 | return make_float2(a.x - b.x, a.y - b.y); 479 | } 480 | inline __host__ __device__ void operator-=(float2 &a, float2 b) 481 | { 482 | a.x -= b.x; a.y -= b.y; 483 | } 484 | inline __host__ __device__ float2 operator-(float2 a, float b) 485 | { 486 | return make_float2(a.x - b, a.y - b); 487 | } 488 | inline __host__ __device__ float2 operator-(float b, float2 a) 489 | { 490 | return make_float2(b - a.x, b - a.y); 491 | } 492 | inline __host__ __device__ void operator-=(float2 &a, float b) 493 | { 494 | a.x -= b; a.y -= b; 495 | } 496 | 497 | inline __host__ __device__ int2 operator-(int2 a, int2 b) 498 | { 499 | return make_int2(a.x - b.x, a.y - b.y); 500 | } 501 | inline __host__ __device__ void operator-=(int2 &a, int2 b) 502 | { 503 | a.x -= b.x; a.y -= b.y; 504 | } 505 | inline __host__ __device__ int2 operator-(int2 a, int b) 506 | { 507 | return make_int2(a.x - b, a.y - b); 508 | } 509 | inline __host__ __device__ int2 operator-(int b, int2 a) 510 | { 511 | return make_int2(b - a.x, b - a.y); 512 | } 513 | inline __host__ __device__ void operator-=(int2 &a, int b) 514 | { 515 | a.x -= b; a.y -= b; 516 | } 517 | 518 | 
inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) 519 | { 520 | return make_uint2(a.x - b.x, a.y - b.y); 521 | } 522 | inline __host__ __device__ void operator-=(uint2 &a, uint2 b) 523 | { 524 | a.x -= b.x; a.y -= b.y; 525 | } 526 | inline __host__ __device__ uint2 operator-(uint2 a, uint b) 527 | { 528 | return make_uint2(a.x - b, a.y - b); 529 | } 530 | inline __host__ __device__ uint2 operator-(uint b, uint2 a) 531 | { 532 | return make_uint2(b - a.x, b - a.y); 533 | } 534 | inline __host__ __device__ void operator-=(uint2 &a, uint b) 535 | { 536 | a.x -= b; a.y -= b; 537 | } 538 | 539 | inline __host__ __device__ float3 operator-(float3 a, float3 b) 540 | { 541 | return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); 542 | } 543 | inline __host__ __device__ void operator-=(float3 &a, float3 b) 544 | { 545 | a.x -= b.x; a.y -= b.y; a.z -= b.z; 546 | } 547 | inline __host__ __device__ float3 operator-(float3 a, float b) 548 | { 549 | return make_float3(a.x - b, a.y - b, a.z - b); 550 | } 551 | inline __host__ __device__ float3 operator-(float b, float3 a) 552 | { 553 | return make_float3(b - a.x, b - a.y, b - a.z); 554 | } 555 | inline __host__ __device__ void operator-=(float3 &a, float b) 556 | { 557 | a.x -= b; a.y -= b; a.z -= b; 558 | } 559 | 560 | inline __host__ __device__ int3 operator-(int3 a, int3 b) 561 | { 562 | return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); 563 | } 564 | inline __host__ __device__ void operator-=(int3 &a, int3 b) 565 | { 566 | a.x -= b.x; a.y -= b.y; a.z -= b.z; 567 | } 568 | inline __host__ __device__ int3 operator-(int3 a, int b) 569 | { 570 | return make_int3(a.x - b, a.y - b, a.z - b); 571 | } 572 | inline __host__ __device__ int3 operator-(int b, int3 a) 573 | { 574 | return make_int3(b - a.x, b - a.y, b - a.z); 575 | } 576 | inline __host__ __device__ void operator-=(int3 &a, int b) 577 | { 578 | a.x -= b; a.y -= b; a.z -= b; 579 | } 580 | 581 | inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) 582 | { 
583 | return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); 584 | } 585 | inline __host__ __device__ void operator-=(uint3 &a, uint3 b) 586 | { 587 | a.x -= b.x; a.y -= b.y; a.z -= b.z; 588 | } 589 | inline __host__ __device__ uint3 operator-(uint3 a, uint b) 590 | { 591 | return make_uint3(a.x - b, a.y - b, a.z - b); 592 | } 593 | inline __host__ __device__ uint3 operator-(uint b, uint3 a) 594 | { 595 | return make_uint3(b - a.x, b - a.y, b - a.z); 596 | } 597 | inline __host__ __device__ void operator-=(uint3 &a, uint b) 598 | { 599 | a.x -= b; a.y -= b; a.z -= b; 600 | } 601 | 602 | inline __host__ __device__ float4 operator-(float4 a, float4 b) 603 | { 604 | return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 605 | } 606 | inline __host__ __device__ void operator-=(float4 &a, float4 b) 607 | { 608 | a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; 609 | } 610 | inline __host__ __device__ float4 operator-(float4 a, float b) 611 | { 612 | return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); 613 | } 614 | inline __host__ __device__ void operator-=(float4 &a, float b) 615 | { 616 | a.x -= b; a.y -= b; a.z -= b; a.w -= b; 617 | } 618 | 619 | inline __host__ __device__ int4 operator-(int4 a, int4 b) 620 | { 621 | return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 622 | } 623 | inline __host__ __device__ void operator-=(int4 &a, int4 b) 624 | { 625 | a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; 626 | } 627 | inline __host__ __device__ int4 operator-(int4 a, int b) 628 | { 629 | return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); 630 | } 631 | inline __host__ __device__ int4 operator-(int b, int4 a) 632 | { 633 | return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); 634 | } 635 | inline __host__ __device__ void operator-=(int4 &a, int b) 636 | { 637 | a.x -= b; a.y -= b; a.z -= b; a.w -= b; 638 | } 639 | 640 | inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) 641 | { 642 | return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - 
b.w); 643 | } 644 | inline __host__ __device__ void operator-=(uint4 &a, uint4 b) 645 | { 646 | a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; 647 | } 648 | inline __host__ __device__ uint4 operator-(uint4 a, uint b) 649 | { 650 | return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); 651 | } 652 | inline __host__ __device__ uint4 operator-(uint b, uint4 a) 653 | { 654 | return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); 655 | } 656 | inline __host__ __device__ void operator-=(uint4 &a, uint b) 657 | { 658 | a.x -= b; a.y -= b; a.z -= b; a.w -= b; 659 | } 660 | 661 | //////////////////////////////////////////////////////////////////////////////// 662 | // multiply 663 | //////////////////////////////////////////////////////////////////////////////// 664 | 665 | inline __host__ __device__ float2 operator*(float2 a, float2 b) 666 | { 667 | return make_float2(a.x * b.x, a.y * b.y); 668 | } 669 | inline __host__ __device__ void operator*=(float2 &a, float2 b) 670 | { 671 | a.x *= b.x; a.y *= b.y; 672 | } 673 | inline __host__ __device__ float2 operator*(float2 a, float b) 674 | { 675 | return make_float2(a.x * b, a.y * b); 676 | } 677 | inline __host__ __device__ float2 operator*(float b, float2 a) 678 | { 679 | return make_float2(b * a.x, b * a.y); 680 | } 681 | inline __host__ __device__ void operator*=(float2 &a, float b) 682 | { 683 | a.x *= b; a.y *= b; 684 | } 685 | 686 | inline __host__ __device__ int2 operator*(int2 a, int2 b) 687 | { 688 | return make_int2(a.x * b.x, a.y * b.y); 689 | } 690 | inline __host__ __device__ void operator*=(int2 &a, int2 b) 691 | { 692 | a.x *= b.x; a.y *= b.y; 693 | } 694 | inline __host__ __device__ int2 operator*(int2 a, int b) 695 | { 696 | return make_int2(a.x * b, a.y * b); 697 | } 698 | inline __host__ __device__ int2 operator*(int b, int2 a) 699 | { 700 | return make_int2(b * a.x, b * a.y); 701 | } 702 | inline __host__ __device__ void operator*=(int2 &a, int b) 703 | { 704 | a.x *= b; a.y *= b; 705 | } 706 | 707 | 
inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) 708 | { 709 | return make_uint2(a.x * b.x, a.y * b.y); 710 | } 711 | inline __host__ __device__ void operator*=(uint2 &a, uint2 b) 712 | { 713 | a.x *= b.x; a.y *= b.y; 714 | } 715 | inline __host__ __device__ uint2 operator*(uint2 a, uint b) 716 | { 717 | return make_uint2(a.x * b, a.y * b); 718 | } 719 | inline __host__ __device__ uint2 operator*(uint b, uint2 a) 720 | { 721 | return make_uint2(b * a.x, b * a.y); 722 | } 723 | inline __host__ __device__ void operator*=(uint2 &a, uint b) 724 | { 725 | a.x *= b; a.y *= b; 726 | } 727 | 728 | inline __host__ __device__ float3 operator*(float3 a, float3 b) 729 | { 730 | return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); 731 | } 732 | inline __host__ __device__ void operator*=(float3 &a, float3 b) 733 | { 734 | a.x *= b.x; a.y *= b.y; a.z *= b.z; 735 | } 736 | inline __host__ __device__ float3 operator*(float3 a, float b) 737 | { 738 | return make_float3(a.x * b, a.y * b, a.z * b); 739 | } 740 | inline __host__ __device__ float3 operator*(float b, float3 a) 741 | { 742 | return make_float3(b * a.x, b * a.y, b * a.z); 743 | } 744 | inline __host__ __device__ void operator*=(float3 &a, float b) 745 | { 746 | a.x *= b; a.y *= b; a.z *= b; 747 | } 748 | 749 | inline __host__ __device__ int3 operator*(int3 a, int3 b) 750 | { 751 | return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); 752 | } 753 | inline __host__ __device__ void operator*=(int3 &a, int3 b) 754 | { 755 | a.x *= b.x; a.y *= b.y; a.z *= b.z; 756 | } 757 | inline __host__ __device__ int3 operator*(int3 a, int b) 758 | { 759 | return make_int3(a.x * b, a.y * b, a.z * b); 760 | } 761 | inline __host__ __device__ int3 operator*(int b, int3 a) 762 | { 763 | return make_int3(b * a.x, b * a.y, b * a.z); 764 | } 765 | inline __host__ __device__ void operator*=(int3 &a, int b) 766 | { 767 | a.x *= b; a.y *= b; a.z *= b; 768 | } 769 | 770 | inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) 771 | { 
return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
}

// uint3: in-place product and products with a scalar on either side.
inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
}
inline __host__ __device__ uint3 operator*(uint3 a, uint b)
{
    return make_uint3(a.x * b,
                      a.y * b,
                      a.z * b);
}
inline __host__ __device__ uint3 operator*(uint b, uint3 a)
{
    return make_uint3(b * a.x,
                      b * a.y,
                      b * a.z);
}
inline __host__ __device__ void operator*=(uint3 &a, uint b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
}

// float4: component-wise product, plus products with a scalar on either side.
inline __host__ __device__ float4 operator*(float4 a, float4 b)
{
    return make_float4(a.x * b.x,
                       a.y * b.y,
                       a.z * b.z,
                       a.w * b.w);
}
inline __host__ __device__ void operator*=(float4 &a, float4 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    a.w *= b.w;
}
inline __host__ __device__ float4 operator*(float4 a, float b)
{
    return make_float4(a.x * b,
                       a.y * b,
                       a.z * b,
                       a.w * b);
}
inline __host__ __device__ float4 operator*(float b, float4 a)
{
    return make_float4(b * a.x,
                       b * a.y,
                       b * a.z,
                       b * a.w);
}
inline __host__ __device__ void operator*=(float4 &a, float b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    a.w *= b;
}

// int4: component-wise product, plus products with a scalar on either side.
inline __host__ __device__ int4 operator*(int4 a, int4 b)
{
    return make_int4(a.x * b.x,
                     a.y * b.y,
                     a.z * b.z,
                     a.w * b.w);
}
inline __host__ __device__ void operator*=(int4 &a, int4 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    a.w *= b.w;
}
inline __host__ __device__ int4 operator*(int4 a, int b)
{
    return make_int4(a.x * b,
                     a.y * b,
                     a.z * b,
                     a.w * b);
}
inline __host__ __device__ int4 operator*(int b, int4 a)
{
    return make_int4(b * a.x,
                     b * a.y,
                     b * a.z,
                     b * a.w);
}
inline __host__ __device__ void operator*=(int4 &a, int b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    a.w *= b;
}

// uint4: component-wise product, plus products with a scalar on either side.
inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
{
    return make_uint4(a.x * b.x,
                      a.y * b.y,
                      a.z * b.z,
                      a.w * b.w);
}
inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    a.w *= b.w;
}
inline __host__ __device__ uint4 operator*(uint4 a, uint b)
{
    return make_uint4(a.x * b,
                      a.y * b,
                      a.z * b,
                      a.w * b);
}
inline __host__ __device__ uint4 operator*(uint b, uint4 a)
{
    return make_uint4(b * a.x,
                      b * a.y,
                      b * a.z,
                      b * a.w);
}
inline __host__ __device__ void operator*=(uint4 &a, uint b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    a.w *= b;
}

////////////////////////////////////////////////////////////////////////////////
// divide
// - component-wise quotient; the (float, vector) overloads divide the scalar
//   by each component
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 operator/(float2 a, float2 b)
{
    return make_float2(a.x / b.x,
                       a.y / b.y);
}
inline __host__ __device__ void operator/=(float2 &a, float2 b)
{
    a.x /= b.x;
    a.y /= b.y;
}
inline __host__ __device__ float2 operator/(float2 a, float b)
{
    return make_float2(a.x / b,
                       a.y / b);
}
inline __host__ __device__ void operator/=(float2 &a, float b)
{
    a.x /= b;
    a.y /= b;
}
inline __host__ __device__ float2 operator/(float b, float2 a)
{
    return make_float2(b / a.x,
                       b / a.y);
}

inline __host__ __device__ float3 operator/(float3 a, float3 b)
{
    return make_float3(a.x / b.x,
                       a.y / b.y,
                       a.z / b.z);
}
inline __host__ __device__ void operator/=(float3 &a, float3 b)
{
    a.x /= b.x;
    a.y /= b.y;
    a.z /= b.z;
}
inline __host__ __device__ float3 operator/(float3 a, float b)
{
    return make_float3(a.x / b,
                       a.y / b,
                       a.z / b);
}
inline __host__ __device__ void operator/=(float3 &a, float b)
{
    a.x /= b;
    a.y /= b;
    a.z /= b;
}
inline __host__ __device__ float3 operator/(float b, float3 a)
{
    return make_float3(b / a.x,
                       b / a.y,
                       b / a.z);
}

inline __host__ __device__ float4 operator/(float4 a, float4 b)
{
    return make_float4(a.x / b.x,
                       a.y / b.y,
                       a.z / b.z,
                       a.w / b.w);
}
inline __host__ __device__ void operator/=(float4 &a, float4 b)
{
    a.x /= b.x;
    a.y /= b.y;
    a.z /= b.z;
    a.w /= b.w;
}
inline __host__ __device__ float4 operator/(float4 a, float b)
{
    return make_float4(a.x / b,
                       a.y / b,
                       a.z / b,
                       a.w / b);
}
inline __host__ __device__ void operator/=(float4 &a, float b)
{
    a.x /= b;
    a.y /= b;
    a.z /= b;
    a.w /= b;
}
inline __host__ __device__ float4 operator/(float b, float4 a)
{
    return make_float4(b / a.x,
                       b / a.y,
                       b / a.z,
                       b / a.w);
}

////////////////////////////////////////////////////////////////////////////////
// min
// - component-wise minimum of two vectors
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 fminf(float2 a, float2 b)
{
    return make_float2(fminf(a.x, b.x),
                       fminf(a.y, b.y));
}
inline __host__ __device__ float3 fminf(float3 a, float3 b)
{
    return make_float3(fminf(a.x, b.x),
                       fminf(a.y, b.y),
                       fminf(a.z, b.z));
}
inline __host__ __device__ float4 fminf(float4 a, float4 b)
{
    return make_float4(fminf(a.x, b.x),
                       fminf(a.y, b.y),
                       fminf(a.z, b.z),
                       fminf(a.w, b.w));
}

inline __host__ __device__ int2 min(int2 a, int2 b)
{
    return make_int2(min(a.x, b.x),
                     min(a.y, b.y));
}
inline __host__ __device__ int3 min(int3 a, int3 b)
{
    return make_int3(min(a.x, b.x),
                     min(a.y, b.y),
                     min(a.z, b.z));
}
inline __host__ __device__ int4 min(int4 a, int4 b)
{
    return make_int4(min(a.x, b.x),
                     min(a.y, b.y),
                     min(a.z, b.z),
                     min(a.w, b.w));
}

inline __host__ __device__ uint2 min(uint2 a, uint2 b)
{
    return make_uint2(min(a.x, b.x),
                      min(a.y, b.y));
}
inline __host__ __device__ uint3 min(uint3 a, uint3 b)
{
    return make_uint3(min(a.x, b.x),
                      min(a.y, b.y),
                      min(a.z, b.z));
}
inline __host__ __device__ uint4 min(uint4 a, uint4 b)
{
    return make_uint4(min(a.x, b.x),
                      min(a.y, b.y),
                      min(a.z, b.z),
                      min(a.w, b.w));
}

////////////////////////////////////////////////////////////////////////////////
// max
// - component-wise maximum of two vectors
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 fmaxf(float2 a, float2 b)
{
    return make_float2(fmaxf(a.x, b.x),
                       fmaxf(a.y, b.y));
}
inline __host__ __device__ float3 fmaxf(float3 a, float3 b)
{
    return make_float3(fmaxf(a.x, b.x),
                       fmaxf(a.y, b.y),
                       fmaxf(a.z, b.z));
}
inline __host__ __device__ float4 fmaxf(float4 a, float4 b)
{
    return make_float4(fmaxf(a.x, b.x),
                       fmaxf(a.y, b.y),
                       fmaxf(a.z, b.z),
                       fmaxf(a.w, b.w));
}

inline __host__ __device__ int2 max(int2 a, int2 b)
{
    return make_int2(max(a.x, b.x),
                     max(a.y, b.y));
}
inline __host__ __device__ int3 max(int3 a, int3 b)
{
    return make_int3(max(a.x, b.x),
                     max(a.y, b.y),
                     max(a.z, b.z));
}
inline __host__ __device__ int4 max(int4 a, int4 b)
{
    return make_int4(max(a.x, b.x),
                     max(a.y, b.y),
                     max(a.z, b.z),
                     max(a.w, b.w));
}

inline __host__ __device__ uint2 max(uint2 a, uint2 b)
{
    return make_uint2(max(a.x, b.x),
                      max(a.y, b.y));
}
inline __host__ __device__ uint3 max(uint3 a, uint3 b)
{
    return make_uint3(max(a.x, b.x),
                      max(a.y, b.y),
                      max(a.z, b.z));
}
inline __host__ __device__ uint4 max(uint4 a, uint4 b)
{
    return make_uint4(max(a.x, b.x),
                      max(a.y, b.y),
                      max(a.z, b.z),
                      max(a.w, b.w));
}

////////////////////////////////////////////////////////////////////////////////
// lerp
// - linear interpolation between a and b, based on value t in [0, 1] range
// - t itself is not clamped: the result is always a + t*(b - a)
////////////////////////////////////////////////////////////////////////////////

inline __device__ __host__ float lerp(float a, float b, float t)
{
    return a + t * (b - a);
}
inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
{
    return a + t * (b - a);
}
inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
{
    return a + t * (b - a);
}
inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
{
    return a + t * (b - a);
}

////////////////////////////////////////////////////////////////////////////////
// clamp
// - clamp the value v to be in the range [a, b]
// - the upper bound b is applied first, then the lower bound a
////////////////////////////////////////////////////////////////////////////////

inline __device__ __host__ float clamp(float f, float a, float b)
{
    const float capped = fminf(f, b);
    return fmaxf(a, capped);
}
inline __device__ __host__ int clamp(int f, int a, int b)
{
    const int capped = min(f, b);
    return max(a, capped);
}
inline __device__ __host__ uint clamp(uint f, uint a, uint b)
{
    const uint capped = min(f, b);
    return max(a, capped);
}

// Vector overloads: either one scalar range shared by all components, or a
// per-component range given as vectors.
inline __device__ __host__ float2 clamp(float2 v, float a, float b)
{
    return make_float2(clamp(v.x, a, b),
                       clamp(v.y, a, b));
}
inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
{
    return make_float2(clamp(v.x, a.x, b.x),
                       clamp(v.y, a.y, b.y));
}
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
    return make_float3(clamp(v.x, a, b),
                       clamp(v.y, a, b),
                       clamp(v.z, a, b));
}
inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
    return make_float3(clamp(v.x, a.x, b.x),
                       clamp(v.y, a.y, b.y),
                       clamp(v.z, a.z, b.z));
}
inline __device__ __host__ float4 clamp(float4 v, float a, float b)
{
    return make_float4(clamp(v.x, a, b),
                       clamp(v.y, a, b),
                       clamp(v.z, a, b),
                       clamp(v.w, a, b));
}
inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
{
    return make_float4(clamp(v.x, a.x, b.x),
                       clamp(v.y, a.y, b.y),
                       clamp(v.z, a.z, b.z),
                       clamp(v.w, a.w, b.w));
}

inline __device__ __host__ int2 clamp(int2 v, int a, int b)
{
    return make_int2(clamp(v.x, a, b),
                     clamp(v.y, a, b));
}
inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
{
    return make_int2(clamp(v.x, a.x, b.x),
                     clamp(v.y, a.y, b.y));
}
inline __device__ __host__ int3 clamp(int3 v, int a, int b)
{
    return make_int3(clamp(v.x, a, b),
                     clamp(v.y, a, b),
                     clamp(v.z, a, b));
}
inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
{
    return make_int3(clamp(v.x, a.x, b.x),
                     clamp(v.y, a.y, b.y),
                     clamp(v.z, a.z, b.z));
}
inline __device__ __host__ int4 clamp(int4 v, int a, int b)
{
    return make_int4(clamp(v.x, a, b),
                     clamp(v.y, a, b),
                     clamp(v.z, a, b),
                     clamp(v.w, a, b));
}
inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
{
    return make_int4(clamp(v.x, a.x, b.x),
                     clamp(v.y, a.y, b.y),
                     clamp(v.z, a.z, b.z),
                     clamp(v.w, a.w, b.w));
}

inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
{
    return make_uint2(clamp(v.x, a, b),
                      clamp(v.y, a, b));
}
inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
{
    return make_uint2(clamp(v.x, a.x, b.x),
                      clamp(v.y, a.y, b.y));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
{
    return make_uint3(clamp(v.x, a, b),
                      clamp(v.y, a, b),
                      clamp(v.z, a, b));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
{
    return make_uint3(clamp(v.x, a.x, b.x),
                      clamp(v.y, a.y, b.y),
                      clamp(v.z, a.z, b.z));
}
inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
{
    return make_uint4(clamp(v.x, a, b),
                      clamp(v.y, a, b),
                      clamp(v.z, a, b),
                      clamp(v.w, a, b));
}
inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
{
    return make_uint4(clamp(v.x, a.x, b.x),
                      clamp(v.y, a.y, b.y),
                      clamp(v.z, a.z, b.z),
                      clamp(v.w, a.w, b.w));
}

////////////////////////////////////////////////////////////////////////////////
// dot product
// - sum of the component-wise products, accumulated left to right
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float dot(float2 a, float2 b)
{
    return a.x * b.x
         + a.y * b.y;
}
inline __host__ __device__ float dot(float3 a, float3 b)
{
    return a.x * b.x
         + a.y * b.y
         + a.z * b.z;
}
inline __host__ __device__ float dot(float4 a, float4 b)
{
    return a.x * b.x
         + a.y * b.y
         + a.z * b.z
         + a.w * b.w;
}

inline __host__ __device__ int dot(int2 a, int2 b)
{
    return a.x * b.x
         + a.y * b.y;
}
inline __host__ __device__ int dot(int3 a, int3 b)
{
    return a.x * b.x
         + a.y * b.y
         + a.z * b.z;
}
inline __host__ __device__ int dot(int4 a, int4 b)
{
    return a.x * b.x
         + a.y * b.y
         + a.z * b.z
         + a.w * b.w;
}

inline __host__ __device__ uint dot(uint2 a, uint2 b)
{
    return a.x * b.x
         + a.y * b.y;
}
inline __host__ __device__ uint dot(uint3 a, uint3 b)
{
    return a.x * b.x
         + a.y * b.y
         + a.z * b.z;
}
inline __host__ __device__ uint dot(uint4 a, uint4 b)
{
    return a.x * b.x
         + a.y * b.y
         + a.z * b.z
         + a.w * b.w;
}

////////////////////////////////////////////////////////////////////////////////
// length
// - Euclidean norm: square root of dot(v, v)
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float length(float2 v)
{
    const float lenSq = dot(v, v);
    return sqrtf(lenSq);
}
inline __host__ __device__ float length(float3 v)
{
    const float lenSq = dot(v, v);
    return sqrtf(lenSq);
}
inline __host__ __device__ float length(float4 v)
{
    const float lenSq = dot(v, v);
    return sqrtf(lenSq);
}

////////////////////////////////////////////////////////////////////////////////
// normalize
// - scales v by rsqrtf(dot(v, v)); a zero-length input is not special-cased
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 normalize(float2 v)
{
    return v * rsqrtf(dot(v, v));
}
inline __host__ __device__ float3 normalize(float3 v)
{
    return v * rsqrtf(dot(v, v));
}
inline __host__ __device__ float4 normalize(float4 v)
{
    return v * rsqrtf(dot(v, v));
}

////////////////////////////////////////////////////////////////////////////////
// floor
// - floorf applied to every component
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 floorf(float2 v)
{
    return make_float2(floorf(v.x),
                       floorf(v.y));
}
inline __host__ __device__ float3 floorf(float3 v)
{
    return make_float3(floorf(v.x),
                       floorf(v.y),
                       floorf(v.z));
}
inline __host__ __device__ float4 floorf(float4 v)
{
    return make_float4(floorf(v.x),
                       floorf(v.y),
                       floorf(v.z),
                       floorf(v.w));
}

////////////////////////////////////////////////////////////////////////////////
// frac - returns the fractional
// portion of a scalar or each vector component
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float fracf(float v)
{
    const float whole = floorf(v);
    return v - whole;
}
inline __host__ __device__ float2 fracf(float2 v)
{
    return make_float2(fracf(v.x),
                       fracf(v.y));
}
inline __host__ __device__ float3 fracf(float3 v)
{
    return make_float3(fracf(v.x),
                       fracf(v.y),
                       fracf(v.z));
}
inline __host__ __device__ float4 fracf(float4 v)
{
    return make_float4(fracf(v.x),
                       fracf(v.y),
                       fracf(v.z),
                       fracf(v.w));
}

////////////////////////////////////////////////////////////////////////////////
// fmod
// - fmodf applied component by component
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 fmodf(float2 a, float2 b)
{
    return make_float2(fmodf(a.x, b.x),
                       fmodf(a.y, b.y));
}
inline __host__ __device__ float3 fmodf(float3 a, float3 b)
{
    return make_float3(fmodf(a.x, b.x),
                       fmodf(a.y, b.y),
                       fmodf(a.z, b.z));
}
inline __host__ __device__ float4 fmodf(float4 a, float4 b)
{
    return make_float4(fmodf(a.x, b.x),
                       fmodf(a.y, b.y),
                       fmodf(a.z, b.z),
                       fmodf(a.w, b.w));
}

////////////////////////////////////////////////////////////////////////////////
// absolute value
// - fabs / abs applied component by component
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float2 fabs(float2 v)
{
    return make_float2(fabs(v.x),
                       fabs(v.y));
}
inline __host__ __device__ float3 fabs(float3 v)
{
    return make_float3(fabs(v.x),
                       fabs(v.y),
                       fabs(v.z));
}
inline __host__ __device__ float4 fabs(float4 v)
{
    return make_float4(fabs(v.x),
                       fabs(v.y),
                       fabs(v.z),
                       fabs(v.w));
}

inline __host__ __device__ int2 abs(int2 v)
{
    return make_int2(abs(v.x),
                     abs(v.y));
}
inline __host__ __device__ int3 abs(int3 v)
{
    return make_int3(abs(v.x),
                     abs(v.y),
                     abs(v.z));
}
inline __host__ __device__ int4 abs(int4 v)
{
    return make_int4(abs(v.x),
                     abs(v.y),
                     abs(v.z),
                     abs(v.w));
}

////////////////////////////////////////////////////////////////////////////////
// reflect
// - returns reflection of incident ray I around surface normal N
// - N should be normalized, reflected vector's length is equal to length of I
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float3 reflect(float3 i, float3 n)
{
    // projection of the incident direction onto the normal
    const float nDotI = dot(n, i);
    return i - 2.0f * n * nDotI;
}

////////////////////////////////////////////////////////////////////////////////
// cross product
////////////////////////////////////////////////////////////////////////////////

inline __host__ __device__ float3 cross(float3 a, float3 b)
{
    return make_float3(a.y*b.z - a.z*b.y,
                       a.z*b.x - a.x*b.z,
                       a.x*b.y - a.y*b.x);
}

////////////////////////////////////////////////////////////////////////////////
// smoothstep
// - returns 0 if x < a
// - returns 1 if x > b
// - otherwise returns smooth interpolation between 0 and 1 based on x
////////////////////////////////////////////////////////////////////////////////

inline __device__ __host__ float smoothstep(float a, float b, float x)
{
    // normalised position of x inside [a, b], limited to [0, 1]
    const float t = clamp((x - a) / (b - a), 0.0f, 1.0f);
    return t * t * (3.0f - (2.0f * t));
}
inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x)
{
    const float2 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
    return t * t * (make_float2(3.0f) - (make_float2(2.0f) * t));
}

inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x)
{
    // normalised position of x inside [a, b] per component, limited to [0, 1]
    const float3 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
    return t * t * (make_float3(3.0f) - (make_float3(2.0f) * t));
}
inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x)
{
    const float4 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
    return t * t * (make_float4(3.0f) - (make_float4(2.0f) * t));
}

#endif

--------------------------------------------------------------------------------
/dragonDOF2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/straaljager/GPU-path-tracing-with-CUDA-tutorial-3/0163f7080e40d19647b748a0e578843427441e85/dragonDOF2.png

--------------------------------------------------------------------------------
/dragonDOF3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/straaljager/GPU-path-tracing-with-CUDA-tutorial-3/0163f7080e40d19647b748a0e578843427441e85/dragonDOF3.png

--------------------------------------------------------------------------------
/dragonDOF4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/straaljager/GPU-path-tracing-with-CUDA-tutorial-3/0163f7080e40d19647b748a0e578843427441e85/dragonDOF4.png

--------------------------------------------------------------------------------
/geometry.h:
--------------------------------------------------------------------------------
/*
 * CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
 * BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras,
 * http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#ifndef __GEOMETRY_H_
#define __GEOMETRY_H_

#include "linear_algebra.h"

// A mesh vertex: the position is the Vector3Df base, extended with a normal
// and an ambient-occlusion coefficient.
struct Vertex : public Vector3Df
{
    // per-vertex normal
    Vector3Df _normal;
    // per-vertex ambient occlusion (pre-calculated in e.g. MeshLab)
    float _ambientOcclusionCoeff;

    // (x, y, z) is the position, (nx, ny, nz) the normal; amb defaults to 60.
    Vertex(float x, float y, float z,
           float nx, float ny, float nz,
           float amb = 60.f)
        : Vector3Df(x, y, z)
        , _normal(nx, ny, nz)
        , _ambientOcclusionCoeff(amb)
    {
        // assert |nx,ny,nz| = 1
    }
};

// A triangle referencing three entries of the vertex array, plus per-triangle
// data cached for intersection tests.
struct Triangle {
    // positions of the three corners in the vertices array
    unsigned _idx1, _idx2, _idx3;
    // RGB colour
    Vector3Df _colorf;
    // centre point
    Vector3Df _center;
    // face normal
    Vector3Df _normal;
    // when set, back-face culling is ignored for this triangle
    bool _twoSided;
    // pre-computed ray tracing intersection cache
    float _d, _d1, _d2, _d3;
    Vector3Df _e1, _e2, _e3;
    // axis-aligned bounding box corners
    Vector3Df _bottom, _top;
};

#endif

--------------------------------------------------------------------------------
/golddragon3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/straaljager/GPU-path-tracing-with-CUDA-tutorial-3/0163f7080e40d19647b748a0e578843427441e85/golddragon3.png

--------------------------------------------------------------------------------
/golddragon4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/straaljager/GPU-path-tracing-with-CUDA-tutorial-3/0163f7080e40d19647b748a0e578843427441e85/golddragon4.png

--------------------------------------------------------------------------------
/linear_algebra.h:
--------------------------------------------------------------------------------
/*
 * CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
 * BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras,
 * http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#ifndef __LINEAR_ALGEBRA_H_
#define __LINEAR_ALGEBRA_H_

#include <cuda_runtime.h> // for __host__ __device__ (and float4)
#include <math.h>         // for sqrtf

// Three-component float vector usable from both host and device code.
// The components are reachable by name (x, y, z) or by index through _v.
struct Vector3Df
{
    union {
        struct { float x, y, z; };
        float _v[3];
    };

    // Component constructor; every component defaults to 0.
    __host__ __device__ Vector3Df(float _x = 0, float _y = 0, float _z = 0) : x(_x), y(_y), z(_z) {}
    __host__ __device__ Vector3Df(const Vector3Df& v) : x(v.x), y(v.y), z(v.z) {}
    // Takes x, y, z from a float4; w is dropped.
    __host__ __device__ Vector3Df(const float4& v) : x(v.x), y(v.y), z(v.z) {}

    // Euclidean length. const so it can be called through const references.
    inline __host__ __device__ float length() const { return sqrtf(x*x + y*y + z*z); }
    // sometimes we don't need the sqrt, we are just comparing one length with another
    inline __host__ __device__ float lengthsq() const { return x*x + y*y + z*z; }
    // Scales this vector to unit length in place. A zero-length vector is not
    // special-cased: the division by a zero norm is performed as written.
    inline __host__ __device__ void normalize()
    {
        float norm = sqrtf(x*x + y*y + z*z);
        x /= norm;
        y /= norm;
        z /= norm;
    }

    // Compound assignment. The Vector3Df overload of *= is component-wise.
    inline __host__ __device__ Vector3Df& operator+=(const Vector3Df& v){ x += v.x; y += v.y; z += v.z; return *this; }
    inline __host__ __device__ Vector3Df& operator-=(const Vector3Df& v){ x -= v.x; y -= v.y; z -= v.z; return *this; }
    inline __host__ __device__ Vector3Df& operator*=(const float& a){ x *= a; y *= a; z *= a; return *this; }
    inline __host__ __device__ Vector3Df& operator*=(const Vector3Df& v){ x *= v.x; y *= v.y; z *= v.z; return *this; }
    inline __host__ __device__ Vector3Df& operator/=(const float& a){ x /= a; y /= a; z /= a; return *this; }

    // Value-returning arithmetic. The Vector3Df overload of * is component-wise.
    inline __host__ __device__ Vector3Df operator*(float a) const{ return Vector3Df(x*a, y*a, z*a); }
    inline __host__ __device__ Vector3Df operator/(float a) const{ return Vector3Df(x/a, y/a, z/a); }
    inline __host__ __device__ Vector3Df operator*(const Vector3Df& v) const{ return Vector3Df(x * v.x, y * v.y, z * v.z); }
    inline __host__ __device__ Vector3Df operator+(const Vector3Df& v) const{ return Vector3Df(x + v.x, y + v.y, z + v.z); }
    inline __host__ __device__ Vector3Df operator-(const Vector3Df& v) const{ return Vector3Df(x - v.x, y - v.y, z - v.z); }

    // True when any component differs (exact float comparison).
    // const so that const vectors can be compared.
    inline __host__ __device__ bool operator!=(const Vector3Df& v) const { return x != v.x || y != v.y || z != v.z; }
};


// Component-wise minimum / maximum of two vectors.
inline __host__ __device__ Vector3Df min3(const Vector3Df& v1, const Vector3Df& v2){ return Vector3Df(v1.x < v2.x ? v1.x : v2.x, v1.y < v2.y ? v1.y : v2.y, v1.z < v2.z ? v1.z : v2.z); }
inline __host__ __device__ Vector3Df max3(const Vector3Df& v1, const Vector3Df& v2){ return Vector3Df(v1.x > v2.x ? v1.x : v2.x, v1.y > v2.y ? v1.y : v2.y, v1.z > v2.z ? v1.z : v2.z); }
// Cross product v1 x v2.
inline __host__ __device__ Vector3Df cross(const Vector3Df& v1, const Vector3Df& v2){ return Vector3Df(v1.y*v2.z - v1.z*v2.y, v1.z*v2.x - v1.x*v2.z, v1.x*v2.y - v1.y*v2.x); }
// Dot products. The float4 overloads use only x, y, z of the float4.
inline __host__ __device__ float dot(const Vector3Df& v1, const Vector3Df& v2){ return v1.x*v2.x + v1.y*v2.y + v1.z*v2.z; }
inline __host__ __device__ float dot(const Vector3Df& v1, const float4& v2){ return v1.x*v2.x + v1.y*v2.y + v1.z*v2.z; }
inline __host__ __device__ float dot(const float4& v1, const Vector3Df& v2){ return v1.x*v2.x + v1.y*v2.y + v1.z*v2.z; }
// Squared distance and distance between two points.
inline __host__ __device__ float distancesq(const Vector3Df& v1, const Vector3Df& v2){ return (v1.x - v2.x)*(v1.x - v2.x) + (v1.y - v2.y)*(v1.y - v2.y) + (v1.z - v2.z)*(v1.z - v2.z); }
inline __host__ __device__ float distance(const Vector3Df& v1, const Vector3Df& v2){ return sqrtf((v1.x - v2.x)*(v1.x - v2.x) + (v1.y - v2.y)*(v1.y - v2.y) + (v1.z - v2.z)*(v1.z - v2.z)); }

#endif

--------------------------------------------------------------------------------
/loader.cpp:
--------------------------------------------------------------------------------
/*
 * CUDA based triangle mesh path tracer using BVH
acceleration by Sam lapere, 2016 3 | * BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras, 4 | * http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 | */ 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | #include 36 | 37 | #include "linear_algebra.h" 38 | #include "geometry.h" 39 | #include "cuda_pathtracer.h" 40 | 41 | using std::string; 42 | 43 | unsigned g_verticesNo = 0; 44 | unsigned g_trianglesNo = 0; 45 | Vertex* g_vertices = NULL; 46 | Triangle* g_triangles = NULL; 47 | 48 | 49 | namespace enums { 50 | enum ColorComponent { 51 | Red = 0, 52 | Green = 1, 53 | Blue = 2 54 | }; 55 | } 56 | 57 | using namespace enums; 58 | 59 | // Rescale input objects to have this size... 60 | const float MaxCoordAfterRescale = 1.2f; 61 | 62 | // if some file cannot be found, panic and exit 63 | void panic(const char *fmt, ...) 
64 | { 65 | static char message[131072]; 66 | va_list ap; 67 | 68 | va_start(ap, fmt); 69 | vsnprintf(message, sizeof message, fmt, ap); 70 | printf(message); fflush(stdout); 71 | va_end(ap); 72 | 73 | exit(1); 74 | } 75 | 76 | void fix_normals(void) 77 | { 78 | for (unsigned j = 0; j> word1; 127 | str >> word1; 128 | str >> totalVertices; 129 | g_vertices = (Vertex *)malloc(totalVertices*sizeof(Vertex)); 130 | g_verticesNo = totalVertices; 131 | pCurrentVertex = g_vertices; 132 | } 133 | else if (line.substr(0, 12) == "element face") { 134 | std::istringstream str(line); 135 | string word1; 136 | str >> word1; 137 | str >> word1; 138 | str >> totalTriangles; 139 | g_triangles = (Triangle *)malloc(totalTriangles*sizeof(Triangle)); 140 | g_trianglesNo = totalTriangles; 141 | pCurrentTriangle = g_triangles; 142 | } 143 | else if (line.substr(0, 10) == "end_header") 144 | inside = true; 145 | } 146 | else { 147 | if (totalVertices) { 148 | 149 | totalVertices--; 150 | float x, y, z; 151 | 152 | std::istringstream str_in(line); 153 | str_in >> x >> y >> z; 154 | 155 | pCurrentVertex->x = x; 156 | pCurrentVertex->y = y; 157 | pCurrentVertex->z = z; 158 | pCurrentVertex->_normal.x = 0.f; 159 | pCurrentVertex->_normal.y = 0.f; 160 | pCurrentVertex->_normal.z = 0.f; 161 | pCurrentVertex->_ambientOcclusionCoeff = 60; // fixed, but obsolete in path tracer 162 | pCurrentVertex++; 163 | } 164 | 165 | else if (totalTriangles) { 166 | 167 | totalTriangles--; 168 | unsigned dummy; 169 | float r, g, b; 170 | unsigned idx1, idx2, idx3; // vertex index 171 | std::istringstream str2(line); 172 | if (str2 >> dummy >> idx1 >> idx2 >> idx3) 173 | { 174 | // set rgb colour to white 175 | r = 255; g = 255; b = 255; 176 | 177 | pCurrentTriangle->_idx1 = idx1; 178 | pCurrentTriangle->_idx2 = idx2; 179 | pCurrentTriangle->_idx3 = idx3; 180 | pCurrentTriangle->_colorf.x = r; 181 | pCurrentTriangle->_colorf.y = g; 182 | pCurrentTriangle->_colorf.z = b; 183 | pCurrentTriangle->_twoSided = 
false; 184 | pCurrentTriangle->_normal = Vector3Df(0, 0, 0); 185 | pCurrentTriangle->_bottom = Vector3Df(FLT_MAX, FLT_MAX, FLT_MAX); 186 | pCurrentTriangle->_top = Vector3Df(-FLT_MAX, -FLT_MAX, -FLT_MAX); 187 | Vertex *vertexA = &g_vertices[idx1]; 188 | Vertex *vertexB = &g_vertices[idx2]; 189 | Vertex *vertexC = &g_vertices[idx3]; 190 | pCurrentTriangle->_center = Vector3Df( 191 | (vertexA->x + vertexB->x + vertexC->x) / 3.0f, 192 | (vertexA->y + vertexB->y + vertexC->y) / 3.0f, 193 | (vertexA->z + vertexB->z + vertexC->z) / 3.0f); 194 | pCurrentTriangle++; 195 | } 196 | } 197 | } 198 | } 199 | 200 | fix_normals(); 201 | } 202 | 203 | else 204 | panic("Unknown extension (only .ply accepted)"); 205 | } 206 | else 207 | panic("No extension in filename (only .ply accepted)"); 208 | 209 | std::cout << "Vertices: " << g_verticesNo << std::endl; 210 | std::cout << "Triangles: " << g_trianglesNo << std::endl; 211 | 212 | // Center scene at world's center 213 | 214 | Vector3Df minp(FLT_MAX, FLT_MAX, FLT_MAX); 215 | Vector3Df maxp(-FLT_MAX, -FLT_MAX, -FLT_MAX); 216 | 217 | // calculate bounds of scene bounding box 218 | // loop over all triangles in scene, grow minp and maxp 219 | for (unsigned i = 0; i triangle._normal.length()) triangle._normal = alt1; // higher precision when triangle has sharp angles 288 | 289 | Vector3Df alt2 = cross(vc3, vc1); 290 | if (alt2.length() > triangle._normal.length()) triangle._normal = alt2; 291 | 292 | 293 | triangle._normal.normalize(); 294 | 295 | // precompute dot product between normal and first triangle vertex 296 | triangle._d = dot(triangle._normal, g_vertices[triangle._idx1]); 297 | 298 | // edge planes 299 | triangle._e1 = cross(triangle._normal, vc1); 300 | triangle._e1.normalize(); 301 | triangle._d1 = dot(triangle._e1, g_vertices[triangle._idx1]); 302 | triangle._e2 = cross(triangle._normal, vc2); 303 | triangle._e2.normalize(); 304 | triangle._d2 = dot(triangle._e2, g_vertices[triangle._idx2]); 305 | triangle._e3 = 
cross(triangle._normal, vc3); 306 | triangle._e3.normalize(); 307 | triangle._d3 = dot(triangle._e3, g_vertices[triangle._idx3]); 308 | } 309 | 310 | return MaxCoordAfterRescale; 311 | } 312 | -------------------------------------------------------------------------------- /loader.h: -------------------------------------------------------------------------------- 1 | #ifndef __LOADER_H_ 2 | #define __LOADER_H_ 3 | 4 | void panic(const char *fmt, ...); 5 | float load_object(const char *filename); 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016 3 | * BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras, 4 | * http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html 5 | * Interactive camera with depth of field based on CUDA path tracer code 6 | * by Peter Kutz and Yining Karl Li, https://github.com/peterkutz/GPUPathTracer 7 | * 8 | * This program is free software; you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation; either version 2 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 
17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program; if not, write to the Free Software 20 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 | */ 22 | #include 23 | #include 24 | #include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\extras\CUPTI\include\GL\glew.h" 25 | #include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\extras\CUPTI\include\GL\glut.h" 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include "cuda_pathtracer.h" 31 | #include "loader.h" 32 | #include "camera.h" 33 | 34 | #ifndef M_PI 35 | #define M_PI 3.14156265 36 | #endif 37 | 38 | using namespace std; 39 | 40 | unsigned int framenumber = 0; 41 | GLuint vbo; 42 | void *d_vbo_buffer = NULL; 43 | 44 | // CUDA arrays 45 | Vertex* cudaVertices2 = NULL; 46 | Triangle* cudaTriangles2 = NULL; 47 | Camera* cudaRendercam2 = NULL; 48 | float *cudaTriangleIntersectionData2 = NULL; 49 | int* cudaTriIdxList2 = NULL; 50 | float *cudaBVHlimits2 = NULL; 51 | int *cudaBVHindexesOrTrilists2 = NULL; 52 | 53 | bool buffer_reset = false; 54 | 55 | void Timer(int obsolete) { 56 | 57 | glutPostRedisplay(); 58 | glutTimerFunc(10, Timer, 0); 59 | } 60 | 61 | __device__ float timer = 0.0f; 62 | 63 | // image buffer storing accumulated pixel samples 64 | Vector3Df* accumulatebuffer; 65 | // final output buffer storing averaged pixel samples 66 | Vector3Df* finaloutputbuffer; 67 | 68 | // mouse controls 69 | int mouse_old_x, mouse_old_y; 70 | int mouse_buttons = 0; 71 | float rotate_x = 0.0, rotate_y = 0.0; 72 | float translate_z = -30.0; 73 | 74 | // TODO: Delete stuff at some point!!! 
75 | InteractiveCamera* interactiveCamera = NULL; 76 | Camera* hostRendercam = NULL; 77 | Clock watch; 78 | 79 | float scalefactor = 1.2f; 80 | 81 | // this hash function calculates a new random number generator seed for each frame, based on framenumber 82 | unsigned int WangHash(unsigned int a) { 83 | a = (a ^ 61) ^ (a >> 16); 84 | a = a + (a << 3); 85 | a = a ^ (a >> 4); 86 | a = a * 0x27d4eb2d; 87 | a = a ^ (a >> 15); 88 | return a; 89 | } 90 | 91 | // initialise camera on the CPU 92 | void initCamera() 93 | { 94 | delete interactiveCamera; 95 | interactiveCamera = new InteractiveCamera(); 96 | 97 | interactiveCamera->setResolution(width, height); 98 | interactiveCamera->setFOVX(45); 99 | } 100 | 101 | // create OpenGL vertex buffer object for CUDA to store calculated pixels 102 | void createVBO(GLuint* vbo) 103 | { 104 | //Create vertex buffer object 105 | glGenBuffers(1, vbo); 106 | glBindBuffer(GL_ARRAY_BUFFER, *vbo); 107 | 108 | //Initialize VBO 109 | unsigned int size = width * height * sizeof(Vector3Df); 110 | glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); 111 | 112 | glBindBuffer(GL_ARRAY_BUFFER, 0); 113 | 114 | //Register VBO with CUDA 115 | cudaGLRegisterBufferObject(*vbo); 116 | } 117 | 118 | // display function called by glutMainLoop(), gets executed every frame 119 | void disp(void) 120 | { 121 | // if camera has moved, reset the accumulation buffer 122 | if (buffer_reset){ cudaMemset(accumulatebuffer, 1, width * height * sizeof(Vector3Df)); framenumber = 0; } 123 | 124 | buffer_reset = false; 125 | framenumber++; 126 | 127 | // build a new camera for each frame on the CPU 128 | interactiveCamera->buildRenderCamera(hostRendercam); 129 | 130 | // copy the CPU camera to a GPU camera 131 | cudaMemcpy(cudaRendercam2, hostRendercam, sizeof(Camera), cudaMemcpyHostToDevice); 132 | 133 | cudaThreadSynchronize(); 134 | 135 | // maps a buffer object for acces by CUDA 136 | cudaGLMapBufferObject((void**)&finaloutputbuffer, vbo); 137 | 138 | //clear 
all pixels: 139 | glClear(GL_COLOR_BUFFER_BIT); 140 | 141 | // calculate a new seed for the random number generator, based on the framenumber 142 | unsigned int hashedframes = WangHash(framenumber); 143 | 144 | // gateway from host to CUDA, passes all data needed to render frame (triangles, BVH tree, camera) to CUDA for execution 145 | cudarender(finaloutputbuffer, accumulatebuffer, cudaTriangles2, cudaBVHindexesOrTrilists2, cudaBVHlimits2, cudaTriangleIntersectionData2, 146 | cudaTriIdxList2, framenumber, hashedframes, cudaRendercam2); 147 | 148 | cudaThreadSynchronize(); 149 | cudaGLUnmapBufferObject(vbo); 150 | //glFlush(); 151 | glBindBuffer(GL_ARRAY_BUFFER, vbo); 152 | glVertexPointer(2, GL_FLOAT, 12, 0); 153 | glColorPointer(4, GL_UNSIGNED_BYTE, 12, (GLvoid*)8); 154 | 155 | glEnableClientState(GL_VERTEX_ARRAY); 156 | glEnableClientState(GL_COLOR_ARRAY); 157 | glDrawArrays(GL_POINTS, 0, width * height); 158 | glDisableClientState(GL_VERTEX_ARRAY); 159 | 160 | glutSwapBuffers(); 161 | //glutPostRedisplay(); 162 | } 163 | 164 | // keyboard interaction 165 | void keyboard(unsigned char key, int /*x*/, int /*y*/) 166 | { 167 | switch (key) { 168 | 169 | case(27) : exit(0); 170 | case(' ') : initCamera(); buffer_reset = true; break; 171 | case('a') : interactiveCamera->strafe(-0.05f); buffer_reset = true; break; 172 | case('d') : interactiveCamera->strafe(0.05f); buffer_reset = true; break; 173 | case('r') : interactiveCamera->changeAltitude(0.05f); buffer_reset = true; break; 174 | case('f') : interactiveCamera->changeAltitude(-0.05f); buffer_reset = true; break; 175 | case('w') : interactiveCamera->goForward(0.05f); buffer_reset = true; break; 176 | case('s') : interactiveCamera->goForward(-0.05f); buffer_reset = true; break; 177 | case('g') : interactiveCamera->changeApertureDiameter(0.1); buffer_reset = true; break; 178 | case('h') : interactiveCamera->changeApertureDiameter(-0.1); buffer_reset = true; break; 179 | case('t') : 
interactiveCamera->changeFocalDistance(0.1); buffer_reset = true; break; 180 | case('y') : interactiveCamera->changeFocalDistance(-0.1); buffer_reset = true; break; 181 | } 182 | } 183 | 184 | void specialkeys(int key, int, int){ 185 | 186 | switch (key) { 187 | 188 | case GLUT_KEY_LEFT: interactiveCamera->changeYaw(0.02f); buffer_reset = true; break; 189 | case GLUT_KEY_RIGHT: interactiveCamera->changeYaw(-0.02f); buffer_reset = true; break; 190 | case GLUT_KEY_UP: interactiveCamera->changePitch(0.02f); buffer_reset = true; break; 191 | case GLUT_KEY_DOWN: interactiveCamera->changePitch(-0.02f); buffer_reset = true; break; 192 | 193 | } 194 | } 195 | 196 | // mouse event handlers 197 | 198 | int lastX = 0, lastY = 0; 199 | int theButtonState = 0; 200 | int theModifierState = 0; 201 | 202 | // camera mouse controls in X and Y direction 203 | void motion(int x, int y) 204 | { 205 | int deltaX = lastX - x; 206 | int deltaY = lastY - y; 207 | 208 | if (deltaX != 0 || deltaY != 0) { 209 | 210 | if (theButtonState == GLUT_LEFT_BUTTON) // Rotate 211 | { 212 | interactiveCamera->changeYaw(deltaX * 0.01); 213 | interactiveCamera->changePitch(-deltaY * 0.01); 214 | } 215 | else if (theButtonState == GLUT_MIDDLE_BUTTON) // Zoom 216 | { 217 | interactiveCamera->changeAltitude(-deltaY * 0.01); 218 | } 219 | 220 | if (theButtonState == GLUT_RIGHT_BUTTON) // camera move 221 | { 222 | interactiveCamera->changeRadius(-deltaY * 0.01); 223 | } 224 | 225 | lastX = x; 226 | lastY = y; 227 | buffer_reset = true; 228 | glutPostRedisplay(); 229 | 230 | } 231 | } 232 | 233 | void mouse(int button, int state, int x, int y) 234 | { 235 | theButtonState = button; 236 | theModifierState = glutGetModifiers(); 237 | lastX = x; 238 | lastY = y; 239 | 240 | motion(x, y); 241 | } 242 | 243 | // initialises scene data, builds BVH 244 | void prepCUDAscene(){ 245 | 246 | // specify scene filename 247 | //const char* scenefile = "data/teapot.ply"; // teapot.ply, big_atc.ply 248 | //const char* 
scenefile = "data/bunny.obj"; 249 | //const char* scenefile = "data/bun_zipper_res2.ply"; // teapot.ply, big_atc.ply 250 | //const char* scenefile = "data/bun_zipper.ply"; // teapot.ply, big_atc.ply 251 | const char* scenefile = "data/dragon_vrip_res4.ply"; // teapot.ply, big_atc.ply 252 | //const char* scenefile = "data/dragon_vrip.ply"; // teapot.ply, big_atc.ply 253 | //const char* scenefile = "data/happy_vrip.ply"; // teapot.ply, big_atc.ply 254 | 255 | // load scene 256 | float maxi = load_object(scenefile); 257 | 258 | // build the BVH 259 | UpdateBoundingVolumeHierarchy(scenefile); 260 | 261 | // now, allocate the CUDA side of the data (in CUDA global memory, 262 | // in preparation for the textures that will store them...) 263 | 264 | // store vertices in a GPU friendly format using float4 265 | float* pVerticesData = (float*)malloc(g_verticesNo * 8 * sizeof(float)); 266 | for (unsigned f = 0; fbuildRenderCamera(hostRendercam); 384 | 385 | // initialise all data needed to start rendering (BVH data, triangles, vertices) 386 | prepCUDAscene(); 387 | 388 | // allocate GPU memory for accumulation buffer 389 | cudaMalloc(&accumulatebuffer, width * height * sizeof(Vector3Df)); 390 | // allocate GPU memory for interactive camera 391 | cudaMalloc((void**)&cudaRendercam2, sizeof(Camera)); 392 | 393 | // init glut: 394 | glutInit(&argc, argv); 395 | // specify the display mode to be RGB and single buffering: 396 | glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB); 397 | // specify the initial window position: 398 | glutInitWindowPosition(100, 100); 399 | // specify the initial window size: 400 | glutInitWindowSize(width, height); 401 | // create the window and set title: 402 | glutCreateWindow("Basic triangle mesh path tracer in CUDA"); 403 | 404 | // init opengl: 405 | glClearColor(0.0, 0.0, 0.0, 0.0); 406 | glMatrixMode(GL_PROJECTION); 407 | gluOrtho2D(0.0, width, 0.0, height); 408 | fprintf(stderr, "OpenGL initialized \n"); 409 | 410 | // register callback function to 
display graphics: 411 | glutDisplayFunc(disp); 412 | 413 | // functions for user interaction 414 | glutKeyboardFunc(keyboard); 415 | glutSpecialFunc(specialkeys); 416 | glutMouseFunc(mouse); 417 | glutMotionFunc(motion); 418 | 419 | glewInit(); 420 | if (!glewIsSupported("GL_VERSION_2_0 ")) { 421 | fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing."); 422 | fflush(stderr); 423 | exit(0); 424 | } 425 | fprintf(stderr, "glew initialized \n"); 426 | // call Timer(): 427 | Timer(0); 428 | createVBO(&vbo); 429 | fprintf(stderr, "VBO created \n"); 430 | // enter the main loop and process events: 431 | fprintf(stderr, "Entering glutMainLoop... \n"); 432 | glutMainLoop(); 433 | 434 | 435 | printf("CUDA initialised.\nStart rendering...\n"); 436 | 437 | // free CUDA memory 438 | cudaFree(finaloutputbuffer); 439 | cudaFree(accumulatebuffer); 440 | cudaFree(cudaBVHindexesOrTrilists2); 441 | cudaFree(cudaBVHlimits2); 442 | cudaFree(cudaTriIdxList2); 443 | cudaFree(cudaRendercam2); 444 | cudaFree(cudaTriangles2); 445 | cudaFree(cudaTriangleIntersectionData2); 446 | cudaFree(cudaVertices2); 447 | 448 | system("PAUSE"); 449 | } 450 | --------------------------------------------------------------------------------