├── README.md
├── bvh.cpp
├── bvh.h
├── camera.cpp
├── camera.h
├── cuda_pathtracer.cu
├── cuda_pathtracer.h
├── cutil_math.h
├── dragonDOF2.png
├── dragonDOF3.png
├── dragonDOF4.png
├── geometry.h
├── golddragon3.png
├── golddragon4.png
├── linear_algebra.h
├── loader.cpp
├── loader.h
└── main.cpp


/README.md:
--------------------------------------------------------------------------------
 1 | GPU path tracing tutorial 3 
 2 | Implementing a BVH acceleration structure on the GPU
 3 | by Sam Lapere, 2016
 4 | 
 5 | More info and screenshots on 
 6 | 
 7 | http://raytracey.blogspot.co.nz/2016/01/gpu-path-tracing-tutorial-3-take-your.html
 8 | 
 9 | BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras
10 | (http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html)
11 | 
12 | Interactive camera with depth of field and plastic (coat) material based on CUDA path tracer code
13 | by Peter Kutz and Yining Karl Li (https://github.com/peterkutz/GPUPathTracer)
14 | 
15 | Phong metal code based on "Realistic Ray Tracing" by Peter Shirley
16 | 
17 | Features:
18 | - Fast interactive GPU path tracer
19 | - progressive rendering
20 | - support for diffuse, specular (mirror), refractive, acrylic/coat and metal Phong materials
21 | - support for spheres and triangle meshes
22 | - BVH acceleration structure built with SAH (Surface Area Heuristic) and binning
23 | - interactive camera with mouse and keyboard controls
24 | - anti-aliasing
25 | - depth-of-field
26 | 
27 | 
28 | Instructions for compiling with Visual Studio 2013/2015:
29 | 
30 | - install the CUDA 6.5/7/7.5 toolkit and choose integration with Visual Studio
31 | - open VS2013/2015 (Express or any other version such as the free Community version)
32 | - click New Project...
33 | - select Visual C++, then General, then Empty Project
34 | - right click on the project, select Build Dependencies > Build Customizations
35 | - select the CUDA 6.5 (or 7 or 7.5) checkbox, click OK
36 | - in the project explorer window, right click on Source Files, select Add, C++ file, then change the name from "Source.cpp" to "cuda_pathtracer.cu"
37 | - in the project explorer window, right click on the newly created cuda_pathtracer.cu file, select CUDA C++
38 | - paste the code from cuda_pathtracer.cu in the file
39 | - add the other .h (header) and .cpp files to the project
40 | - right click on the project name, select Properties
41 | - under Linker > Input > Additional Dependencies, add "cudart.lib" and "glew32.lib" (glew32.lib should be automatically found when the CUDA toolkit is installed, if not, you can manually add the path to Linker > General > Additional Library Directories, the path is something like "%NVSDKCOMPUTE_ROOT%\C\common\lib")
42 | - disable SAFESEH by selecting NO in Linker > Advanced > Image Has Safe Exception Handlers
43 | - select Build > Rebuild Solution
44 | - run the program (at the moment there is no CUDA error checking, but so far everything has worked fine even when running the program for prolonged periods)
45 | 
46 | Screenshots produced with this code:
47 | 
48 | ![Image description](https://github.com/straaljager/GPU-path-tracing-tutorial-3/blob/master/dragonDOF2.png)
49 | 
50 | ![Image description](https://github.com/straaljager/GPU-path-tracing-tutorial-3/blob/master/dragonDOF3.png)
51 | 
52 | ![Image description](https://github.com/straaljager/GPU-path-tracing-tutorial-3/blob/master/dragonDOF4.png)
53 | 
54 | ![Image description](https://github.com/straaljager/GPU-path-tracing-tutorial-3/blob/master/golddragon3.png)
55 | 
56 | ![Image description](https://github.com/straaljager/GPU-path-tracing-tutorial-3/blob/master/golddragon4.png)
57 | 
58 | 


--------------------------------------------------------------------------------
/bvh.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | *  CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
  3 | *  BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras,
  4 | *  http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html
  5 | *
  6 | *  This program is free software; you can redistribute it and/or modify
  7 | *  it under the terms of the GNU General Public License as published by
  8 | *  the Free Software Foundation; either version 2 of the License, or
  9 | *  (at your option) any later version.
 10 | *
 11 | *  This program is distributed in the hope that it will be useful,
 12 | *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | *  GNU General Public License for more details.
 15 | *
 16 | *  You should have received a copy of the GNU General Public License
 17 | *  along with this program; if not, write to the Free Software
 18 | *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 19 | */
 20 | #include <algorithm>
 21 | #include <vector>
 22 | #include <cfloat>
 23 | #include <string>
 24 | #include <assert.h>
 25 | #include <stdio.h>
 26 | #include <ctime>
 27 | 
 28 | #include "bvh.h"
 29 | #include "geometry.h"
 30 | #include "cuda_pathtracer.h"
 31 | 
 32 | using namespace std;
 33 | 
 34 | // report progress during BVH construction
 35 | #define PROGRESS_REPORT
 36 | #ifdef PROGRESS_REPORT
 37 | #define REPORT(x) x
 38 | #define REPORTPRM(x) x,
 39 | #else
 40 | #define REPORT(x)
 41 | #define REPORTPRM(x)
 42 | #endif
 43 | 
 44 | unsigned g_reportCounter = 0;
 45 | 
 46 | // The BVH
 47 | BVHNode* g_pSceneBVH = NULL;
 48 | 
 49 | // the cache-friendly version of the BVH, to be stored in a file
 50 | unsigned g_triIndexListNo = 0;
 51 | int* g_triIndexList = NULL;
 52 | unsigned g_pCFBVH_No = 0;
 53 | CacheFriendlyBVHNode* g_pCFBVH = NULL;
 54 | 
 55 | 
 56 | //////////////////////////////////////////////////
 57 | //  First, the "pure" implementation of the BVH
 58 | //////////////////////////////////////////////////
 59 | 
 60 | // Work item for creation of BVH:
 61 | struct BBoxTmp {
 62 | 	// Bottom point (ie minx,miny,minz)
 63 | 	Vector3Df _bottom;
 64 | 	// Top point (ie maxx,maxy,maxz)
 65 | 	Vector3Df _top;
 66 | 	// Center point, ie 0.5*(top-bottom)
 67 | 	Vector3Df _center; // = bbox centroid
 68 | 	// Triangle
 69 | 	const Triangle *_pTri;  // triangle list
 70 | 	BBoxTmp()
 71 | 		:
 72 | 		_bottom(FLT_MAX, FLT_MAX, FLT_MAX),
 73 | 		_top(-FLT_MAX, -FLT_MAX, -FLT_MAX),
 74 | 		_pTri(NULL)
 75 | 	{}
 76 | };
 77 | 
 78 | // BVH CONSTRUCTION
 79 | // This builds the BVH, finding optimal split planes for each depth
 80 | // uses binning: divide the work bounding box into a number of equally sized "bins" along one of the axes
 81 | // choose axis and splitting plane resulting in least cost (determined by surface area heuristic or SAH)
 82 | // SAH (surface area heuristic): the larger the surface area of a bounding box, the costlier it is to raytrace
 83 | // find the bbox with the minimum surface area
 84 | //
 85 | // I strongly recommend reading Ingo Wald's 2007 paper "On fast SAH based BVH construction",  
 86 | // http://www.sci.utah.edu/~wald/Publications/2007/ParallelBVHBuild/fastbuild.pdf, to understand the code below
 87 | 
 88 | 
 89 | typedef std::vector<BBoxTmp> BBoxEntries;  // vector of triangle bounding boxes needed during BVH construction
 90 | 
 91 | // recursive building of BVH nodes
 92 | // work is the working list (std::vector<>) of triangle bounding boxes 
 93 | 
 94 | BVHNode *Recurse(BBoxEntries& work, REPORTPRM(float pct = 0.) int depth = 0)
 95 | {
 96 | 
 97 | 	REPORT(float pctSpan = 11. / pow(3.f, depth);)
 98 | 
 99 | 	// terminate recursion case: 
100 | 	// if work set has less then 4 elements (triangle bounding boxes), create a leaf node 
101 | 	// and create a list of the triangles contained in the node
102 | 		
103 | 	if (work.size() < 4) {
104 | 			
105 | 		BVHLeaf *leaf = new BVHLeaf;
106 | 		for (BBoxEntries::iterator it = work.begin(); it != work.end(); it++)
107 | 			leaf->_triangles.push_back(it->_pTri);
108 | 		return leaf;
109 | 		}
110 | 
111 | 	// else, work size > 4, divide  node further into smaller nodes
112 | 	// start by finding the working list's bounding box (top and bottom)
113 | 
114 | 	Vector3Df bottom(FLT_MAX, FLT_MAX, FLT_MAX);
115 | 	Vector3Df top(-FLT_MAX, -FLT_MAX, -FLT_MAX);
116 | 
117 | 	// loop over all bboxes in current working list, expanding/growing the working list bbox
118 | 	for (unsigned i = 0; i < work.size(); i++) {  // meer dan 4 bboxen in work
119 | 		BBoxTmp& v = work[i];   
120 | 		bottom = min3(bottom, v._bottom);
121 | 		top = max3(top, v._top);
122 | 	}
123 | 
124 | 	// SAH, surface area heuristic calculation
125 | 	// find surface area of bounding box by multiplying the dimensions of the working list's bounding box
126 | 	float side1 = top.x - bottom.x;  // length bbox along X-axis
127 | 	float side2 = top.y - bottom.y;  // length bbox along Y-axis
128 | 	float side3 = top.z - bottom.z;  // length bbox along Z-axis
129 | 
130 | 	// the current bbox has a cost of (number of triangles) * surfaceArea of C = N * SA
131 | 	float minCost = work.size() * (side1*side2 + side2*side3 + side3*side1);
132 | 
133 | 	float bestSplit = FLT_MAX; // best split along axis, will indicate no split with better cost found (below)
134 | 
135 | 	int bestAxis = -1;
136 | 
137 | 	// Try all 3 axises X, Y, Z
138 | 	for (int j = 0; j < 3; j++) {  // 0 = X, 1 = Y, 2 = Z axis
139 | 
140 | 		int axis = j;
141 | 
142 | 		// we will try dividing the triangles based on the current axis,
143 | 		// and we will try split values from "start" to "stop", one "step" at a time.
144 | 		float start, stop, step;
145 | 
146 | 		// X-axis
147 | 		if (axis == 0) {
148 | 			start = bottom.x;
149 | 			stop = top.x;
150 | 		}
151 | 		// Y-axis
152 | 		else if (axis == 1) {
153 | 			start = bottom.y;
154 | 			stop = top.y;
155 | 		}
156 | 		// Z-axis
157 | 		else {
158 | 			start = bottom.z;
159 | 			stop = top.z;
160 | 		}
161 | 
162 | 		// In that axis, do the bounding boxes in the work queue "span" across, (meaning distributed over a reasonable distance)?
163 | 		// Or are they all already "packed" on the axis? Meaning that they are too close to each other
164 | 		if (fabsf(stop - start)<1e-4)
165 | 			// BBox side along this axis too short, we must move to a different axis!
166 | 			continue; // go to next axis
167 | 
168 | 		// Binning: Try splitting at a uniform sampling (at equidistantly spaced planes) that gets smaller the deeper we go:
169 | 		// size of "sampling grid": 1024 (depth 0), 512 (depth 1), etc
170 | 		// each bin has size "step"
171 | 		step = (stop - start) / (1024. / (depth + 1.));
172 | 
173 | #ifdef PROGRESS_REPORT
174 | 		// Progress report variables...
175 | 		float pctStart = pct + j*pctSpan;  // j is axis
176 | 		float pctStep = pctSpan / ((stop - start - 2 * step) / step);
177 | #endif
178 | 
179 | 		// for each bin (equally spaced bins of size "step"):
180 | 		for (float testSplit = start + step; testSplit < stop - step; testSplit += step) {
181 | 
182 | #ifdef PROGRESS_REPORT
183 | 			if ((1023 & g_reportCounter++) == 0) {
184 | 				std::printf("\b\b\b%02d%%", int(pctStart)); fflush(stdout);
185 | 			}
186 | 			pctStart += pctStep;
187 | #endif
188 | 
189 | 			// Create left and right bounding box
190 | 			Vector3Df lbottom(FLT_MAX, FLT_MAX, FLT_MAX);
191 | 			Vector3Df ltop(-FLT_MAX, -FLT_MAX, -FLT_MAX);
192 | 
193 | 			Vector3Df rbottom(FLT_MAX, FLT_MAX, FLT_MAX);
194 | 			Vector3Df rtop(-FLT_MAX, -FLT_MAX, -FLT_MAX);
195 | 
196 | 			// The number of triangles in the left and right bboxes (needed to calculate SAH cost function)
197 | 			int countLeft = 0, countRight = 0;
198 | 
199 | 			// For each test split (or bin), allocate triangles in remaining work list based on their bbox centers
200 | 			// this is a fast O(N) pass, no triangle sorting needed (yet)
201 | 			for (unsigned i = 0; i<work.size(); i++) {
202 | 
203 | 				BBoxTmp& v = work[i];
204 | 
205 | 				// compute bbox center
206 | 				float value;
207 | 				if (axis == 0) value = v._center.x;       // X-axis
208 | 				else if (axis == 1) value = v._center.y;  // Y-axis
209 | 				else value = v._center.z;			   // Z-axis
210 | 
211 | 				if (value < testSplit) { 
212 | 					// if center is smaller then testSplit value, put triangle in Left bbox
213 | 					lbottom = min3(lbottom, v._bottom);
214 | 					ltop = max3(ltop, v._top);
215 | 					countLeft++;
216 | 				}
217 | 				else {
218 | 					// else put triangle in right bbox
219 | 					rbottom = min3(rbottom, v._bottom);
220 | 					rtop = max3(rtop, v._top);
221 | 					countRight++;
222 | 				}
223 | 			}
224 | 
225 | 			// Now use the Surface Area Heuristic to see if this split has a better "cost"
226 | 
227 | 			// First, check for stupid partitionings, ie bins with 0 or 1 triangles make no sense
228 | 			if (countLeft <= 1 || countRight <= 1) continue;
229 | 
230 | 			// It's a real partitioning, calculate the surface areas
231 | 			float lside1 = ltop.x - lbottom.x;
232 | 			float lside2 = ltop.y - lbottom.y;
233 | 			float lside3 = ltop.z - lbottom.z;
234 | 
235 | 			float rside1 = rtop.x - rbottom.x;
236 | 			float rside2 = rtop.y - rbottom.y;
237 | 			float rside3 = rtop.z - rbottom.z;
238 | 
239 | 			// calculate SurfaceArea of Left and Right BBox
240 | 			float surfaceLeft = lside1*lside2 + lside2*lside3 + lside3*lside1;
241 | 			float surfaceRight = rside1*rside2 + rside2*rside3 + rside3*rside1;
242 | 
243 | 			// calculate total cost by multiplying left and right bbox by number of triangles in each
244 | 			float totalCost = surfaceLeft*countLeft + surfaceRight*countRight;
245 | 
246 | 			// keep track of cheapest split found so far
247 | 			if (totalCost < minCost) {
248 | 				minCost = totalCost;
249 | 				bestSplit = testSplit;
250 | 				bestAxis = axis;
251 | 			}
252 | 		} // end of loop over all bins
253 | 	} // end of loop over all axises
254 | 	
255 | 	// at the end of this loop (which runs for every "bin" or "sample location"), 
256 | 	// we should have the best splitting plane, best splitting axis and bboxes with minimal traversal cost
257 | 
258 | 	// If we found no split to improve the cost, create a BVH leaf
259 | 
260 | 	if (bestAxis == -1) {
261 | 
262 | 		BVHLeaf *leaf = new BVHLeaf;
263 | 		for (BBoxEntries::iterator it = work.begin(); it != work.end(); it++)
264 | 			leaf->_triangles.push_back(it->_pTri); // put triangles of working list in leaf's triangle list
265 | 		return leaf;
266 | 	}
267 | 
268 | 	// Otherwise, create BVH inner node with L and R child nodes, split with the optimal value we found above
269 | 
270 | 	BBoxEntries left;
271 | 	BBoxEntries right;  // BBoxEntries is a vector/list of BBoxTmp 
272 | 	Vector3Df lbottom(FLT_MAX, FLT_MAX, FLT_MAX);
273 | 	Vector3Df ltop(-FLT_MAX, -FLT_MAX, -FLT_MAX);
274 | 	Vector3Df rbottom(FLT_MAX, FLT_MAX, FLT_MAX);
275 | 	Vector3Df rtop(-FLT_MAX, -FLT_MAX, -FLT_MAX);
276 | 	
277 | 	// distribute the triangles in the left or right child nodes
278 | 	// for each triangle in the work set
279 | 	for (int i = 0; i < (int)work.size(); i++) {
280 | 
281 | 		// create temporary bbox for triangle
282 | 		BBoxTmp& v = work[i];
283 | 
284 | 		// compute bbox center 
285 | 		float value;
286 | 		if (bestAxis == 0) value = v._center.x;
287 | 		else if (bestAxis == 1) value = v._center.y;
288 | 		else value = v._center.z;
289 | 
290 | 		if (value < bestSplit) { // add temporary bbox v from work list to left BBoxentries list, 
291 | 			// becomes new working list of triangles in next step
292 | 
293 | 			left.push_back(v);
294 | 			lbottom = min3(lbottom, v._bottom);
295 | 			ltop = max3(ltop, v._top);
296 | 		}
297 | 		else {
298 | 
299 | 			// Add triangle bbox v from working list to right BBoxentries, 
300 | 			// becomes new working list of triangles in next step  
301 | 			right.push_back(v);
302 | 			rbottom = min3(rbottom, v._bottom);
303 | 			rtop = max3(rtop, v._top);
304 | 		}
305 | 	} // end loop for each triangle in working set
306 | 
307 | 	// create inner node
308 | 	BVHInner *inner = new BVHInner;
309 | 
310 | #ifdef PROGRESS_REPORT
311 | 	if ((1023 & g_reportCounter++) == 0) {
312 | 		std::printf("\b\b\b%2d%%", int(pct + 3.f*pctSpan)); // Update progress indicator
313 | 		fflush(stdout);
314 | 	}
315 | #endif
316 | 	// recursively build the left child
317 | 	inner->_left = Recurse(left, REPORTPRM(pct + 3.f*pctSpan) depth + 1);
318 | 	inner->_left->_bottom = lbottom;
319 | 	inner->_left->_top = ltop;
320 | 
321 | #ifdef PROGRESS_REPORT
322 | 	if ((1023 & g_reportCounter++) == 0) {
323 | 		std::printf("\b\b\b%2d%%", int(pct + 6.f*pctSpan)); // Update progress indicator
324 | 		fflush(stdout);
325 | 	}
326 | #endif
327 | 	// recursively build the right child
328 | 	inner->_right = Recurse(right, REPORTPRM(pct + 6.f*pctSpan) depth + 1);
329 | 	inner->_right->_bottom = rbottom;
330 | 	inner->_right->_top = rtop;
331 | 
332 | 	return inner;
333 | }  // end of Recurse() function, returns the rootnode (when all recursion calls have finished)
334 | 
335 | BVHNode *CreateBVH()
336 | {
337 | 	/* Summary:
338 | 	1. Create work BBox
339 | 	2. Create BBox for every triangle and compute bounds
340 | 	3. Expand bounds work BBox to fit all triangle bboxes
341 | 	4. Compute triangle bbox centre and add triangle to working list
342 | 	5. Build BVH tree with Recurse()
343 | 	6. Return root node
344 | 	*/
345 | 
346 | 	std::vector<BBoxTmp> work;
347 | 	Vector3Df bottom(FLT_MAX, FLT_MAX, FLT_MAX);
348 | 	Vector3Df top(-FLT_MAX, -FLT_MAX, -FLT_MAX);
349 | 
350 | 	puts("Gathering bounding box info from all triangles...");
351 | 	// for each triangle
352 | 	for (unsigned j = 0; j < g_trianglesNo; j++) {
353 | 
354 | 		const Triangle& triangle = g_triangles[j];
355 | 
356 | 		// create a new temporary bbox per triangle 
357 | 		BBoxTmp b;
358 | 		b._pTri = &triangle;  
359 | 
360 | 		// loop over triangle vertices and pick smallest vertex for bottom of triangle bbox
361 | 		b._bottom = min3(b._bottom, g_vertices[triangle._idx1]);  // index of vertex
362 | 		b._bottom = min3(b._bottom, g_vertices[triangle._idx2]);
363 | 		b._bottom = min3(b._bottom, g_vertices[triangle._idx3]);
364 | 
365 | 		// loop over triangle vertices and pick largest vertex for top of triangle bbox
366 | 		b._top = max3(b._top, g_vertices[triangle._idx1]);
367 | 		b._top = max3(b._top, g_vertices[triangle._idx2]);
368 | 		b._top = max3(b._top, g_vertices[triangle._idx3]);
369 | 
370 | 		// expand working list bbox by largest and smallest triangle bbox bounds
371 | 		bottom = min3(bottom, b._bottom);
372 | 		top = max3(top, b._top);
373 | 
374 | 		// compute triangle bbox center: (bbox top + bbox bottom) * 0.5
375 | 		b._center = (b._top + b._bottom) * 0.5f;
376 | 
377 | 		// add triangle bbox to working list
378 | 		work.push_back(b);
379 | 	}
380 | 
381 | 	// ...and pass it to the recursive function that creates the SAH AABB BVH
382 | 	// (Surface Area Heuristic, Axis-Aligned Bounding Boxes, Bounding Volume Hierarchy)
383 | 	
384 | 	std::printf("Creating Bounding Volume Hierarchy data...    "); fflush(stdout);
385 | 	BVHNode* root = Recurse(work); // builds BVH and returns root node
386 | 	printf("\b\b\b100%%\n");
387 | 
388 | 	root->_bottom = bottom; // bottom is bottom of bbox bounding all triangles in the scene
389 | 	root->_top = top;
390 | 
391 | 	return root;
392 | }
393 | 
394 | // the following functions are required to create the cache-friendly BVH
395 | 
396 | // recursively count bboxes
397 | int CountBoxes(BVHNode *root)
398 | {
399 | 	if (!root->IsLeaf()) {
400 | 		BVHInner *p = dynamic_cast<BVHInner*>(root);
401 | 		return 1 + CountBoxes(p->_left) + CountBoxes(p->_right);
402 | 	}
403 | 	else
404 | 		return 1;
405 | }
406 | 
407 | // recursively count triangles
408 | unsigned CountTriangles(BVHNode *root)
409 | {
410 | 	if (!root->IsLeaf()) {
411 | 		BVHInner *p = dynamic_cast<BVHInner*>(root);
412 | 		return CountTriangles(p->_left) + CountTriangles(p->_right);
413 | 	}
414 | 	else {
415 | 		BVHLeaf *p = dynamic_cast<BVHLeaf*>(root);
416 | 		return (unsigned)p->_triangles.size();
417 | 	}
418 | }
419 | 
420 | // recursively count depth
421 | void CountDepth(BVHNode *root, int depth, int& maxDepth)
422 | {
423 | 	if (maxDepth<depth)
424 | 		maxDepth = depth;
425 | 	if (!root->IsLeaf()) {
426 | 		BVHInner *p = dynamic_cast<BVHInner*>(root);
427 | 		CountDepth(p->_left, depth + 1, maxDepth);
428 | 		CountDepth(p->_right, depth + 1, maxDepth);
429 | 	}
430 | }
431 | 
432 | // Writes in the g_pCFBVH and g_triIndexListNo arrays,
433 | // creating a cache-friendly version of the BVH
434 | void PopulateCacheFriendlyBVH(
435 | 	const Triangle *pFirstTriangle,
436 | 	BVHNode *root,
437 | 	unsigned& idxBoxes,
438 | 	unsigned &idxTriList)
439 | {
440 | 	unsigned currIdxBoxes = idxBoxes;
441 | 	g_pCFBVH[currIdxBoxes]._bottom = root->_bottom;
442 | 	g_pCFBVH[currIdxBoxes]._top = root->_top;
443 | 
444 | 	//DEPTH FIRST APPROACH (left first until complete)
445 | 	if (!root->IsLeaf()) { // inner node
446 | 		BVHInner *p = dynamic_cast<BVHInner*>(root);
447 | 		// recursively populate left and right
448 | 		int idxLeft = ++idxBoxes;
449 | 		PopulateCacheFriendlyBVH(pFirstTriangle, p->_left, idxBoxes, idxTriList);
450 | 		int idxRight = ++idxBoxes;
451 | 		PopulateCacheFriendlyBVH(pFirstTriangle, p->_right, idxBoxes, idxTriList);
452 | 		g_pCFBVH[currIdxBoxes].u.inner._idxLeft = idxLeft;
453 | 		g_pCFBVH[currIdxBoxes].u.inner._idxRight = idxRight;
454 | 	}
455 | 
456 | 	else { // leaf
457 | 		BVHLeaf *p = dynamic_cast<BVHLeaf*>(root);
458 | 		unsigned count = (unsigned)p->_triangles.size();
459 | 		g_pCFBVH[currIdxBoxes].u.leaf._count = 0x80000000 | count;  // highest bit set indicates a leaf node (inner node if highest bit is 0)
460 | 		g_pCFBVH[currIdxBoxes].u.leaf._startIndexInTriIndexList = idxTriList;
461 | 
462 | 		for (std::list<const Triangle*>::iterator it = p->_triangles.begin(); it != p->_triangles.end(); it++)
463 | 		{
464 | 			g_triIndexList[idxTriList++] = *it - pFirstTriangle;
465 | 		}
466 | 	}
467 | }
468 | 
469 | void CreateCFBVH()
470 | {
471 | 	if (!g_pSceneBVH) {
472 | 		puts("Internal bug in CreateCFBVH, please report it..."); fflush(stdout);
473 | 		exit(1);
474 | 	}
475 | 
476 | 	unsigned idxTriList = 0;
477 | 	unsigned idxBoxes = 0;
478 | 
479 | 	g_triIndexListNo = CountTriangles(g_pSceneBVH);
480 | 	g_triIndexList = new int[g_triIndexListNo];
481 | 
482 | 	g_pCFBVH_No = CountBoxes(g_pSceneBVH);
483 | 	g_pCFBVH = new CacheFriendlyBVHNode[g_pCFBVH_No]; // array
484 | 
485 | 	PopulateCacheFriendlyBVH(&g_triangles[0], g_pSceneBVH, idxBoxes, idxTriList);
486 | 
487 | 	if ((idxBoxes != g_pCFBVH_No - 1) || (idxTriList != g_triIndexListNo)) {
488 | 		puts("Internal bug in CreateCFBVH, please report it..."); fflush(stdout);
489 | 		exit(1);
490 | 	}
491 | 
492 | 	int maxDepth = 0;
493 | 	CountDepth(g_pSceneBVH, 0, maxDepth);
494 | 	if (maxDepth >= BVH_STACK_SIZE) {
495 | 		printf("Max depth of BVH was %d\n", maxDepth);
496 | 		puts("Recompile with BVH_STACK_SIZE set to more than that..."); fflush(stdout);
497 | 		exit(1);
498 | 	}
499 | }
500 | 
501 | // The gateway - creates the "pure" BVH, and then copies the results in the cache-friendly one
502 | void UpdateBoundingVolumeHierarchy(const char *filename)
503 | {
504 | 	if (!g_pSceneBVH) {
505 | 		std::string BVHcacheFilename(filename);
506 | 		BVHcacheFilename += ".bvh";
507 | 		FILE *fp = fopen(BVHcacheFilename.c_str(), "rb");
508 | 		if (!fp) {
509 | 			// No cached BVH data - we need to calculate them
510 | 			Clock me;
511 | 			g_pSceneBVH = CreateBVH();
512 | 			printf("Building the BVH%s took %.2f seconds\n",
513 | #ifdef SIMD_SSE
514 | 				" with SSE", // SIMD SSE building has been removed for the tutorial
515 | #else
516 | 				"",
517 | #endif
518 | 				me.readMS() / 1000.);
519 | 
520 | 			// Now that the BVH has been created, copy its data into a more cache-friendly format
521 | 			// (CacheFriendlyBVHNode occupies exactly 32 bytes, i.e. a cache-line)
522 | 			CreateCFBVH();
523 | 
524 | 			// Now store the results, if possible...
525 | 			fp = fopen(BVHcacheFilename.c_str(), "wb");
526 | 			if (!fp) return;
527 | 			if (1 != fwrite(&g_pCFBVH_No, sizeof(unsigned), 1, fp)) return;
528 | 			if (1 != fwrite(&g_triIndexListNo, sizeof(unsigned), 1, fp)) return;
529 | 			if (g_pCFBVH_No != fwrite(g_pCFBVH, sizeof(CacheFriendlyBVHNode), g_pCFBVH_No, fp)) return;
530 | 			if (g_triIndexListNo != fwrite(g_triIndexList, sizeof(int), g_triIndexListNo, fp)) return;
531 | 			fclose(fp);
532 | 		}
533 | 		else { // BVH has been built already and stored in a file, read the file
534 | 			puts("Cache exists, reading the pre-calculated BVH data...");
535 | 			if (1 != fread(&g_pCFBVH_No, sizeof(unsigned), 1, fp)) return;
536 | 			if (1 != fread(&g_triIndexListNo, sizeof(unsigned), 1, fp)) return;
537 | 			g_pCFBVH = new CacheFriendlyBVHNode[g_pCFBVH_No];
538 | 			g_triIndexList = new int[g_triIndexListNo];
539 | 			if (g_pCFBVH_No != fread(g_pCFBVH, sizeof(CacheFriendlyBVHNode), g_pCFBVH_No, fp)) return;
540 | 			if (g_triIndexListNo != fread(g_triIndexList, sizeof(int), g_triIndexListNo, fp)) return;
541 | 			fclose(fp);
542 | 		}
543 | 	}
544 | }
545 | 


--------------------------------------------------------------------------------
/bvh.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | *  CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
 3 | *  BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras,
 4 | *  http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html
 5 | *
 6 | *  This program is free software; you can redistribute it and/or modify
 7 | *  it under the terms of the GNU General Public License as published by
 8 | *  the Free Software Foundation; either version 2 of the License, or
 9 | *  (at your option) any later version.
10 | *
11 | *  This program is distributed in the hope that it will be useful,
12 | *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | *  GNU General Public License for more details.
15 | *
16 | *  You should have received a copy of the GNU General Public License
17 | *  along with this program; if not, write to the Free Software
18 | *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19 | */
20 | #ifndef __BVH_H_
21 | #define __BVH_H_
22 | 
23 | #include <list>
24 | #include "linear_algebra.h"
25 | #include "geometry.h"
26 | 
27 | 
28 | // The nice version of the BVH - a shallow hierarchy of inner and leaf nodes
29 | struct BVHNode {
30 | 	Vector3Df _bottom;
31 | 	Vector3Df _top;
32 | 	virtual bool IsLeaf() = 0; // pure virtual
33 | };
34 | 
35 | struct BVHInner : BVHNode {
36 | 	BVHNode *_left;
37 | 	BVHNode *_right;
38 | 	virtual bool IsLeaf() { return false; }
39 | };
40 | 
41 | struct BVHLeaf : BVHNode {
42 | 	std::list<const Triangle*> _triangles;
43 | 	virtual bool IsLeaf() { return true; }
44 | };
45 | 
46 | struct CacheFriendlyBVHNode {
47 | 	// bounding box
48 | 	Vector3Df _bottom;
49 | 	Vector3Df _top;
50 | 
51 | 	// parameters for leafnodes and innernodes occupy same space (union) to save memory
52 | 	// top bit discriminates between leafnode and innernode
53 | 	// no pointers, but indices (int): faster
54 | 
55 | 	union {
56 | 		// inner node - stores indexes to array of CacheFriendlyBVHNode
57 | 		struct {
58 | 			unsigned _idxLeft;
59 | 			unsigned _idxRight;
60 | 		} inner;
61 | 		// leaf node: stores triangle count and starting index in triangle list
62 | 		struct {
63 | 			unsigned _count; // Top-most bit set, leafnode if set, innernode otherwise
64 | 			unsigned _startIndexInTriIndexList;
65 | 		} leaf;
66 | 	} u;
67 | };
68 | 
69 | // The ugly, cache-friendly form of the BVH: 32 bytes
70 | void CreateCFBVH(); // CacheFriendlyBVH
71 | 
72 | // The single-point entrance to the BVH - call only this
73 | void UpdateBoundingVolumeHierarchy(const char *filename);
74 | 
75 | #endif
76 | 


--------------------------------------------------------------------------------
/camera.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | *  CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
  3 | *  Interactive camera with depth of field based on CUDA path tracer code
  4 | *  by Peter Kutz and Yining Karl Li, https://github.com/peterkutz/GPUPathTracer
  5 | *
  6 | *  This program is free software; you can redistribute it and/or modify
  7 | *  it under the terms of the GNU General Public License as published by
  8 | *  the Free Software Foundation; either version 2 of the License, or
  9 | *  (at your option) any later version.
 10 | *
 11 | *  This program is distributed in the hope that it will be useful,
 12 | *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | *  GNU General Public License for more details.
 15 | *
 16 | *  You should have received a copy of the GNU General Public License
 17 | *  along with this program; if not, write to the Free Software
 18 | *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 19 | */
 20 | #include "camera.h"
 21 | 
 22 | InteractiveCamera::InteractiveCamera()
 23 | {
 24 | 	centerPosition = Vector3Df(0, 0, 0);
 25 | 	yaw = 0;
 26 | 	pitch = 0.3;
 27 | 	radius = 4;
 28 | 	apertureRadius = 0.04; // 0.04
 29 | 	focalDistance = 4.0f;
 30 | 
 31 | 	resolution = make_float2(512, 512);  // width, height
 32 | 	fov = make_float2(40, 40);
 33 | }
 34 | 
 35 | InteractiveCamera::~InteractiveCamera() {}
 36 | 
 37 | void InteractiveCamera::changeYaw(float m){
 38 | 	yaw += m;
 39 | 	fixYaw();
 40 | }
 41 | 
 42 | void InteractiveCamera::changePitch(float m){
 43 | 	pitch += m;
 44 | 	fixPitch();
 45 | }
 46 | 
 47 | void InteractiveCamera::changeRadius(float m){
 48 | 	radius += radius * m; // Change proportional to current radius. Assuming radius isn't allowed to go to zero.
 49 | 	fixRadius();
 50 | }
 51 | 
 52 | void InteractiveCamera::changeAltitude(float m){
 53 | 	centerPosition.y += m;
 54 | 	//fixCenterPosition();
 55 | }
 56 | 
 57 | void InteractiveCamera::goForward(float m){
 58 | 	centerPosition += viewDirection * m;
 59 | }
 60 | 
 61 | void InteractiveCamera::strafe(float m){
 62 | 	Vector3Df strafeAxis = cross(viewDirection, Vector3Df(0, 1, 0));
 63 | 	strafeAxis.normalize();
 64 | 	centerPosition += strafeAxis * m;
 65 | }
 66 | 
 67 | void InteractiveCamera::rotateRight(float m){
 68 | 	float yaw2 = yaw;
 69 | 	yaw2 += m;
 70 | 	float pitch2 = pitch;
 71 | 	float xDirection = sin(yaw2) * cos(pitch2);
 72 | 	float yDirection = sin(pitch2);
 73 | 	float zDirection = cos(yaw2) * cos(pitch2);
 74 | 	Vector3Df directionToCamera = Vector3Df(xDirection, yDirection, zDirection);
 75 | 	viewDirection = directionToCamera * (-1.0);
 76 | }
 77 | 
 78 | void InteractiveCamera::changeApertureDiameter(float m){
 79 | 	apertureRadius += (apertureRadius + 0.01) * m; // Change proportional to current apertureRadius.
 80 | 	fixApertureRadius();
 81 | }
 82 | 
 83 | 
 84 | void InteractiveCamera::changeFocalDistance(float m){
 85 | 	focalDistance += m;
 86 | 	fixFocalDistance();
 87 | }
 88 | 
 89 | 
 90 | void InteractiveCamera::setResolution(float x, float y){
 91 | 	resolution = make_float2(x, y);
 92 | 	setFOVX(fov.x);
 93 | }
 94 | 
 95 | float radiansToDegrees(float radians) {
 96 | 	float degrees = radians * 180.0 / M_PI;
 97 | 	return degrees;
 98 | }
 99 | 
// Converts an angle from degrees to radians.
float degreesToRadians(float degrees)
{
	return degrees / 180.0 * M_PI;
}
104 | 
105 | void InteractiveCamera::setFOVX(float fovx){
106 | 	fov.x = fovx;
107 | 	fov.y = radiansToDegrees(atan(tan(degreesToRadians(fovx) * 0.5) * (resolution.y / resolution.x)) * 2.0);
108 | 	// resolution float division
109 | }
110 | 
111 | void InteractiveCamera::buildRenderCamera(Camera* renderCamera){
112 | 	float xDirection = sin(yaw) * cos(pitch);
113 | 	float yDirection = sin(pitch);
114 | 	float zDirection = cos(yaw) * cos(pitch);
115 | 	Vector3Df directionToCamera = Vector3Df(xDirection, yDirection, zDirection);
116 | 	viewDirection = directionToCamera * (-1.0);
117 | 	Vector3Df eyePosition = centerPosition +directionToCamera * radius;
118 | 	//Vector3Df eyePosition = centerPosition; // rotate camera from stationary viewpoint
119 | 	
120 | 
121 | 	renderCamera->position = eyePosition;
122 | 	renderCamera->view = viewDirection;
123 | 	renderCamera->up = Vector3Df(0, 1, 0);
124 | 	renderCamera->resolution = make_float2(resolution.x, resolution.y);
125 | 	renderCamera->fov = make_float2(fov.x, fov.y);
126 | 	renderCamera->apertureRadius = apertureRadius;
127 | 	renderCamera->focalDistance = focalDistance;
128 | }
129 | 
// Floored modulo: x - y * floor(x / y).
// The result takes the sign of y (e.g. mod(-1, 4) == 3).
float mod(float x, float y) {
	float wholeTurns = floorf(x / y);
	return x - y * wholeTurns;
}
133 | 
134 | void InteractiveCamera::fixYaw() {
135 | 	yaw = mod(yaw, 2 * M_PI); // Normalize the yaw.
136 | }
137 | 
// Clamps n to [low, high]: the upper bound is applied first, then the lower one.
float clamp2(float n, float low, float high) {
	return fmaxf(fminf(n, high), low);
}
143 | 
144 | void InteractiveCamera::fixPitch() {
145 | 	float padding = 0.05;
146 | 	pitch = clamp2(pitch, -PI_OVER_TWO + padding, PI_OVER_TWO - padding); // Limit the pitch.
147 | }
148 | 
149 | void InteractiveCamera::fixRadius() {
150 | 	float minRadius = 0.2;
151 | 	float maxRadius = 100.0;
152 | 	radius = clamp2(radius, minRadius, maxRadius);
153 | }
154 | 
155 | void InteractiveCamera::fixApertureRadius() {
156 | 	float minApertureRadius = 0.0;
157 | 	float maxApertureRadius = 25.0;
158 | 	apertureRadius = clamp2(apertureRadius, minApertureRadius, maxApertureRadius);
159 | }
160 | 
161 | void InteractiveCamera::fixFocalDistance() {
162 | float minFocalDist = 0.2;
163 | float maxFocalDist = 100.0;
164 | focalDistance = clamp2(focalDistance, minFocalDist, maxFocalDist);
165 | }
166 | 


--------------------------------------------------------------------------------
/camera.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | *  CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
 3 | *  Interactive camera with depth of field based on CUDA path tracer code
 4 | *  by Peter Kutz and Yining Karl Li, https://github.com/peterkutz/GPUPathTracer
 5 | *
 6 | *  This program is free software; you can redistribute it and/or modify
 7 | *  it under the terms of the GNU General Public License as published by
 8 | *  the Free Software Foundation; either version 2 of the License, or
 9 | *  (at your option) any later version.
10 | *
11 | *  This program is distributed in the hope that it will be useful,
12 | *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | *  GNU General Public License for more details.
15 | *
16 | *  You should have received a copy of the GNU General Public License
17 | *  along with this program; if not, write to the Free Software
18 | *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19 | */
20 | #ifndef __CAMERA_H__
21 | #define __CAMERA_H__
22 | 
23 | #include "geometry.h"
24 | #include "linear_algebra.h"
25 | #include <cuda_runtime.h>
26 | 
// Pi and pi/2 used by the camera's angle maths (yaw wrapping, pitch limits and
// degree/radian conversion). The previous M_PI value, 3.14156265, had two digits
// transposed.
#define M_PI 3.14159265358979323846
#define PI_OVER_TWO 1.5707963267948966192313216916397514420985
29 | 
// Camera struct, used to store interactive camera data, copied to the GPU and used by CUDA for each frame.
// It is filled by InteractiveCamera::buildRenderCamera().
struct Camera {
	float2 resolution;     // image resolution (x, y)
	Vector3Df position;    // eye position
	Vector3Df view;        // viewing direction
	Vector3Df up;          // up vector; buildRenderCamera always sets (0, 1, 0)
	float2 fov;            // horizontal (x) and vertical (y) field of view, in degrees
	float apertureRadius;  // lens aperture radius (depth of field)
	float focalDistance;   // focal distance (depth of field)
};
40 | 
// class for interactive camera object, updated on the CPU for each frame and copied into Camera struct
// (see buildRenderCamera). The camera orbits centerPosition at distance `radius`,
// with its direction given by yaw and pitch.
class InteractiveCamera
{
private:

	Vector3Df centerPosition;  // point the camera orbits around
	Vector3Df viewDirection;   // direction the camera looks in; refreshed by buildRenderCamera()
	float yaw;                 // radians; fixYaw() wraps it with mod(yaw, 2 * M_PI)
	float pitch;               // radians; fixPitch() keeps it just inside +/- PI_OVER_TWO
	float radius;              // distance from centerPosition to the eye
	float apertureRadius;      // lens aperture radius
	float focalDistance;       // focal distance

	// range-limiting helpers, called by the change*() methods after a value was modified
	void fixYaw();
	void fixPitch();
	void fixRadius();
	void fixApertureRadius();
	void fixFocalDistance();

public:
	InteractiveCamera();
	virtual ~InteractiveCamera();
   	void changeYaw(float m);                 // NOTE(review): defined outside this excerpt; presumably adds m to yaw
	void changePitch(float m);               // NOTE(review): defined outside this excerpt; presumably adds m to pitch
	void changeRadius(float m);              // scales radius by (1 + m), then clamps it
	void changeAltitude(float m);            // adds m to centerPosition.y
	void changeFocalDistance(float m);       // adds m to focalDistance, then clamps it
	void strafe(float m);                    // moves the centre sideways by m
	void goForward(float m);                 // moves the centre along viewDirection by m
	void rotateRight(float m);               // recomputes viewDirection for yaw + m (yaw itself is not changed)
	void changeApertureDiameter(float m);    // changes apertureRadius proportionally to m, then clamps it
	void setResolution(float x, float y);    // stores the resolution and re-derives fov.y
	void setFOVX(float fovx);                // sets fov.x (degrees) and derives fov.y from the aspect ratio

	// copies the current state into *renderCamera
	void buildRenderCamera(Camera* renderCamera);

	float2 resolution;  // image resolution (x, y)
	float2 fov;         // field of view in degrees: x horizontal, y vertical
};
80 | 
81 | 
82 | 
83 | #endif
84 | 


--------------------------------------------------------------------------------
/cuda_pathtracer.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 | *  CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
  3 | *  BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras, 
  4 | *  http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html 
  5 | *  Interactive camera with depth of field based on CUDA path tracer code 
  6 | *  by Peter Kutz and Yining Karl Li, https://github.com/peterkutz/GPUPathTracer
  7 | *
  8 | *  This program is free software; you can redistribute it and/or modify
  9 | *  it under the terms of the GNU General Public License as published by
 10 | *  the Free Software Foundation; either version 2 of the License, or
 11 | *  (at your option) any later version.
 12 | *
 13 | *  This program is distributed in the hope that it will be useful,
 14 | *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 15 | *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 16 | *  GNU General Public License for more details.
 17 | *
 18 | *  You should have received a copy of the GNU General Public License
 19 | *  along with this program; if not, write to the Free Software
 20 | *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 21 | */
 22 |  
 23 | #include <iostream>
 24 | #include <fstream>
 25 | #include <string>
 26 | #include <vector>
 27 | #include <cuda.h>
 28 | #include <math_functions.h>
 29 | #include <vector_types.h>
 30 | #include <vector_functions.h>
 31 | #include "device_launch_parameters.h"
 32 | #include "cutil_math.h"
 33 | #include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\extras\CUPTI\include\GL\glew.h"
 34 | #include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\extras\CUPTI\include\GL\glut.h"
 35 | #include <cuda_runtime.h>
 36 | #include <cuda_gl_interop.h>
 37 | #include <curand.h>
 38 | #include <curand_kernel.h>
 39 | 
 40 | #include "cuda_pathtracer.h"
 41 | 
#define M_PI 3.1415926535897932384626422832795028841971f   // pi as a float literal
#define TWO_PI 6.2831853071795864769252867665590057683943f  // 2 * pi as a float literal
#define NUDGE_FACTOR     1e-3f  // epsilon: triangle hits with a ray parameter at or below this are ignored in BVH_IntersectTriangles
#define samps  1 // samples -- NOTE(review): presumably samples per pixel per frame; its use is outside this excerpt
#define BVH_STACK_SIZE 32   // capacity of the per-ray BVH traversal stack in BVH_IntersectTriangles
#define SCREEN_DIST (height*2)   // expands to an expression on a variable named `height`, which must be in scope where the macro is used
 48 | 
// NOTE(review): these three are not referenced in this part of the file; meanings inferred from the names only
int texturewidth = 0;
int textureheight = 0;
int total_number_of_triangles;

// device-side counter; not referenced in this part of the file
__device__ int depth = 0;


// Textures for vertices, triangles and BVH data
// (see CudaRender() below, as well as main() to see the data setup process)
// one triangle index per entry; leaf nodes reference a contiguous range of entries
texture<uint1, 1, cudaReadModeElementType> g_triIdxListTexture;
// three float2 (low, high) pairs per BVH node, at 3 * node + {0, 1, 2} for the x, y and z axes
texture<float2, 1, cudaReadModeElementType> g_pCFBVHlimitsTexture;
// one uint4 per BVH node. Highest bit of .x set: leaf, low 31 bits of .x = triangle count, .w = start index
// in the triangle index list. Highest bit clear: inner node, .y = right child, .z = left child
texture<uint4, 1, cudaReadModeElementType> g_pCFBVHindexesOrTrilistsTexture;
// five float4 per triangle, at 5 * tri + {0..4}: center, normal (plane offset in .w), and three edge planes (offset in .w)
texture<float4, 1, cudaReadModeElementType> g_trianglesTexture;

// NOTE(review): presumably device-side copies of the scene data; they are allocated and filled outside this excerpt
Vertex* cudaVertices;
float* cudaTriangleIntersectionData;
int* cudaTriIdxList = NULL;
float* cudaBVHlimits = NULL;
int* cudaBVHindexesOrTrilists = NULL;
Triangle* cudaTriangles = NULL;
Camera* cudaRendercam = NULL;
 70 | 
 71 | 
 72 | struct Ray {
 73 | 	float3 orig;	// ray origin
 74 | 	float3 dir;		// ray direction	
 75 | 	__device__ Ray(float3 o_, float3 d_) : orig(o_), dir(d_) {}
 76 | };
 77 | 
 78 | enum Refl_t { DIFF, METAL, SPEC, REFR, COAT };  // material types
 79 | 
 80 | struct Sphere {
 81 | 
 82 | 	float rad;				// radius 
 83 | 	float3 pos, emi, col;	// position, emission, color 
 84 | 	Refl_t refl;			// reflection type (DIFFuse, SPECular, REFRactive)
 85 | 
 86 | 	__device__ float intersect(const Ray &r) const { // returns distance, 0 if nohit 
 87 | 
 88 | 		// Ray/sphere intersection
 89 | 		// Quadratic formula required to solve ax^2 + bx + c = 0 
 90 | 		// Solution x = (-b +- sqrt(b*b - 4ac)) / 2a
 91 | 		// Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0 
 92 | 
 93 | 		float3 op = pos - r.orig;  // 
 94 | 		float t, epsilon = 0.01f;
 95 | 		float b = dot(op, r.dir);
 96 | 		float disc = b*b - dot(op, op) + rad*rad; // discriminant
 97 | 		if (disc<0) return 0; else disc = sqrtf(disc);
 98 | 		return (t = b - disc)>epsilon ? t : ((t = b + disc)>epsilon ? t : 0);
 99 | 	}
100 | 
101 | };
102 | 
// Hard-coded sphere scene, stored in device memory.
// Each entry is { radius, { position }, { emission }, { colour }, material }, following the field order of Sphere.
__device__ Sphere spheres[] = {

	// sun
	{ 1.6, { 0.0f, 2.8, 0 }, { 6, 4, 2 }, { 0.f, 0.f, 0.f }, DIFF },  // 37, 34, 30  X: left/right, Y: up/down
	//{ 1600, { 3000.0f, 10, 6000 }, { 17, 14, 10 }, { 0.f, 0.f, 0.f }, DIFF },

	// horizon sun2
	//	{ 1560, { 3500.0f, 0, 7000 }, { 50, 25, 2.5 }, { 0.f, 0.f, 0.f }, DIFF },  //  150, 75, 7.5

	// sky
	//{ 10000, { 50.0f, 40.8f, -1060 }, { 0.1, 0.3, 0.55 }, { 0.175f, 0.175f, 0.25f }, DIFF }, // 0.0003, 0.01, 0.15, or brighter: 0.2, 0.3, 0.6
	{ 10000, { 50.0f, 40.8f, -1060 }, { 0.51, 0.51, 0.51 }, { 0.175f, 0.175f, 0.25f }, DIFF },

	// ground
	{ 100000, { 0.0f, -100001.1, 0 }, { 0, 0, 0 }, { 0.5f, 0.0f, 0.0f }, COAT },
	{ 100000, { 0.0f, -100001.2, 0 }, { 0, 0, 0 }, { 0.3f, 0.3f, 0.3f }, DIFF }, // double shell to prevent light leaking

	// horizon brightener
	{ 110000, { 50.0f, -110048.5, 0 }, { 3.6, 2.0, 0.2 }, { 0.f, 0.f, 0.f }, DIFF },
	// mountains
	//{ 4e4, { 50.0f, -4e4 - 30, -3000 }, { 0, 0, 0 }, { 0.2f, 0.2f, 0.2f }, DIFF },
	// white mirror
	{ 1.1, { 1.6, 0, 1.0 }, { 0, 0.0, 0 }, { 0.9f, .9f, 0.9f }, SPEC }
	// Glass
	//{ 0.3, { 0.0f, -0.4, 4 }, { .0, 0., .0 }, { 0.9f, 0.9f, 0.9f }, DIFF },
	// Glass2
	//{ 22, { 87.0f, 22, 24 }, { 0, 0, 0 }, { 0.9f, 0.9f, 0.9f }, SPEC },
};
131 | 
132 | 
// Packs a colour into one int for assignment in the OpenGL VBO buffer (BGR order):
// p.x ends up in bits 0-7, p.y in bits 8-15 and p.z in bits 16-23.
// Each component is truncated to unsigned; no clamping or masking is done here.
__device__ int getColor(Vector3Df& p)
{
	unsigned lowByte = (unsigned)p.x;
	unsigned midByte = (unsigned)p.y;
	unsigned highByte = (unsigned)p.z;

	return (highByte << 16) | (midByte << 8) | lowByte;
}
138 | 
// Helper function, that checks whether a ray intersects a bounding box (BVH node).
//
// Parameters:
//   originInWorldSpace - ray origin
//   rayInWorldSpace    - ray direction
//   boxIdx             - index of the BVH node; its extents are fetched from g_pCFBVHlimitsTexture as
//                        three float2 (low, high) pairs at 3 * boxIdx + {0, 1, 2} for the x, y and z axes
// Returns true when the ray overlaps the box on all three axes and the box is not behind the ray.
__device__ bool RayIntersectsBox(const Vector3Df& originInWorldSpace, const Vector3Df& rayInWorldSpace, int boxIdx)
{
	// set Tnear = - infinity, Tfar = infinity
	//
	// For each pair of planes P associated with X, Y, and Z do:
	//     (example using X planes)
	//     if direction Xd = 0 then the ray is parallel to the X planes, so
	//         if origin Xo is not between the slabs ( Xo < Xl or Xo > Xh) then
	//             return false
	//     else, if the ray is not parallel to the plane then
	//     begin
	//         compute the intersection distance of the planes
	//         T1 = (Xl - Xo) / Xd
	//         T2 = (Xh - Xo) / Xd
	//         If T1 > T2 swap (T1, T2) /* since T1 intersection with near plane */
	//         If T1 > Tnear set Tnear = T1 /* want largest Tnear */
	//         If T2 < Tfar set Tfar = T2 /* want smallest Tfar */
	//         If Tnear > Tfar box is missed so
	//             return false
	//         If Tfar < 0 box is behind ray
	//             return false
	//     end
	// end of for loop

	float Tnear = -FLT_MAX;
	float Tfar = FLT_MAX;

	float2 limits;

// box intersection routine for one axis; `c` names the vector component (x, y or z).
// The component is selected with plain member access: pasting it with `.##c` does not
// form a valid preprocessing token and is rejected by standards-conforming preprocessors.
#define CHECK_NEAR_AND_FAR_INTERSECTION(c) \
	if (rayInWorldSpace.c == 0.f) { \
		if (originInWorldSpace.c < limits.x) return false; \
		if (originInWorldSpace.c > limits.y) return false; \
	} else { \
		float T1 = (limits.x - originInWorldSpace.c) / rayInWorldSpace.c; \
		float T2 = (limits.y - originInWorldSpace.c) / rayInWorldSpace.c; \
		if (T1 > T2) { float tmp = T1; T1 = T2; T2 = tmp; } \
		if (T1 > Tnear) Tnear = T1; \
		if (T2 < Tfar)  Tfar = T2; \
		if (Tnear > Tfar) return false; \
		if (Tfar < 0.f) return false; \
	}

	limits = tex1Dfetch(g_pCFBVHlimitsTexture, 3 * boxIdx); // box.bottom._x/top._x placed in limits.x/limits.y
	//limits = make_float2(cudaBVHlimits[6 * boxIdx + 0], cudaBVHlimits[6 * boxIdx + 1]);
	CHECK_NEAR_AND_FAR_INTERSECTION(x)
	limits = tex1Dfetch(g_pCFBVHlimitsTexture, 3 * boxIdx + 1); // box.bottom._y/top._y placed in limits.x/limits.y
	//limits = make_float2(cudaBVHlimits[6 * boxIdx + 2], cudaBVHlimits[6 * boxIdx + 3]);
	CHECK_NEAR_AND_FAR_INTERSECTION(y)
	limits = tex1Dfetch(g_pCFBVHlimitsTexture, 3 * boxIdx + 2); // box.bottom._z/top._z placed in limits.x/limits.y
	//limits = make_float2(cudaBVHlimits[6 * boxIdx + 4], cudaBVHlimits[6 * boxIdx + 5]);
	CHECK_NEAR_AND_FAR_INTERSECTION(z)

	// If the box survived all above tests, the ray enters it at Tnear and leaves it at Tfar.
	return true;
}
198 | 
199 | 
200 | //////////////////////////////////////////
201 | //	BVH intersection routine	//
202 | //	using CUDA texture memory	//
203 | //////////////////////////////////////////
204 | 
// There are 3 forms of the BVH: a "pure" BVH, a cache-friendly BVH (taking up less memory space than the pure BVH)
// and a "textured" BVH which stores its data in CUDA texture memory (which is cached). The last one gives the
// best performance and is used here.
208 | 
// Finds the closest triangle hit by a ray by traversing the BVH stored in texture memory.
//
// Parameters:
//   origin, ray           - ray origin and direction
//   avoidSelf             - triangle index to skip (the triangle the ray starts on), to avoid
//                           self-reflections/refractions
//   pBestTriIdx           - out: index of the closest triangle hit, or -1 when none was hit
//   pointHitInWorldSpace  - out: hit position (written only when a triangle is hit)
//   kAB, kBC, kCA         - out: the three edge-plane values of the hit point (written only on a hit)
//   hitdist               - out: distance from origin to the hit (written only on a hit)
//   cudaBVHindexesOrTrilists, cudaBVHlimits, cudaTriangleIntersectionData, cudaTriIdxList, boxnormal
//                         - not used by this texture-based version; the data is read from the
//                           g_*Texture references instead
// Returns true when a triangle was hit. Returns false when nothing was hit, and also when the
// traversal stack would overflow (outputs may then hold a hit found before the overflow).
__device__ bool BVH_IntersectTriangles(
	int* cudaBVHindexesOrTrilists, const Vector3Df& origin, const Vector3Df& ray, unsigned avoidSelf,
	int& pBestTriIdx, Vector3Df& pointHitInWorldSpace, float& kAB, float& kBC, float& kCA, float& hitdist,
	float* cudaBVHlimits, float* cudaTriangleIntersectionData, int* cudaTriIdxList, Vector3Df& boxnormal)
{
	// in the loop below, maintain the closest triangle and the point where we hit it:
	pBestTriIdx = -1;

	// squared distance to the closest hit so far, start from infinity
	float bestTriDist = FLT_MAX;

	// create a stack for each ray
	// the stack is just a fixed size array of indices to BVH nodes
	int stack[BVH_STACK_SIZE];

	int stackIdx = 0;
	stack[stackIdx++] = 0; // start at the root node

	// while the stack is not empty
	while (stackIdx) {

		// pop a BVH node (or AABB, Axis Aligned Bounding Box) from the stack
		int boxIdx = stack[--stackIdx];

		// fetch the data (indices to childnodes or index in triangle list + trianglecount) associated with this node
		uint4 data = tex1Dfetch(g_pCFBVHindexesOrTrilistsTexture, boxIdx);

		// original, "pure" BVH form...
		//if (!pCurrent->IsLeaf()) {

		// cache-friendly BVH form...
		//if (!(cudaBVHindexesOrTrilists[4 * boxIdx + 0] & 0x80000000)) { // INNER NODE

		// texture memory BVH form...

		// determine if BVH node is an inner node or a leaf node by checking the highest bit (bitwise AND operation)
		// inner node if highest bit is 0, leaf node if 1

		if (!(data.x & 0x80000000)) {   // INNER NODE

			// if ray intersects inner node, push indices of left and right child nodes on the stack
			if (RayIntersectsBox(origin, ray, boxIdx)) {

				// return if the stack size would be exceeded; this must be checked BEFORE
				// pushing, otherwise the two writes below can land past the end of stack[]
				if (stackIdx + 2 > BVH_STACK_SIZE)
				{
					return false;
				}

				//stack[stackIdx++] = pCurrent->u.inner._idxRight;
				//stack[stackIdx++] = cudaBVHindexesOrTrilists[4 * boxIdx + 1];
				stack[stackIdx++] = data.y; // right child node index

				//stack[stackIdx++] = pCurrent->u.inner._idxLeft;
				//stack[stackIdx++] = cudaBVHindexesOrTrilists[4 * boxIdx + 2];
				stack[stackIdx++] = data.z; // left child node index
			}
		}
		else { // LEAF NODE

			// original, "pure" BVH form...
			// BVHLeaf *p = dynamic_cast<BVHLeaf*>(pCurrent);
			// for(std::list<const Triangle*>::iterator it=p->_triangles.begin();
			//    it != p->_triangles.end();
			//    it++)

			// cache-friendly BVH form...
			// for(unsigned i=pCurrent->u.leaf._startIndexInTriIndexList;
			//    i<pCurrent->u.leaf._startIndexInTriIndexList + (pCurrent->u.leaf._count & 0x7fffffff);

			// texture memory BVH form...
			// for (unsigned i = cudaBVHindexesOrTrilists[4 * boxIdx + 3]; i< cudaBVHindexesOrTrilists[4 * boxIdx + 3] + (cudaBVHindexesOrTrilists[4 * boxIdx + 0] & 0x7fffffff); i++) {

			// loop over every triangle in the leaf node
			// data.w is start index in triangle list
			// data.x stores number of triangles in leafnode (the bitwise AND operation extracts the triangle number)
			const unsigned firstEntry = data.w;
			const unsigned endEntry = data.w + (data.x & 0x7fffffff);

			for (unsigned i = firstEntry; i < endEntry; i++) {

				// original, "pure" BVH form...
				//const Triangle& triangle = *(*it);

				// cache-friendly BVH form...
				//const Triangle& triangle = pTriangles[cudaTriIdxList[i]];

				// texture memory BVH form...
				// fetch the index of the current triangle
				int idx = tex1Dfetch(g_triIdxListTexture, i).x;
				//int idx = cudaTriIdxList[i];

				// check if triangle is the same as the one intersected by previous ray
				// to avoid self-reflections/refractions
				if (avoidSelf == (unsigned)idx)
					continue;

				// fetch triangle center and normal from texture memory
				float4 center = tex1Dfetch(g_trianglesTexture, 5 * idx);
				float4 normal = tex1Dfetch(g_trianglesTexture, 5 * idx + 1);

				// use the pre-computed triangle intersection data: normal, d, e1/d1, e2/d2, e3/d3
				float k = dot(normal, ray);
				if (k == 0.0f)
					continue; // this triangle is parallel to the ray, ignore it.

				// ray parameter of the intersection with the triangle's plane
				float s = (normal.w - dot(normal, origin)) / k;
				if (s <= NUDGE_FACTOR)  // behind the origin, or closer than epsilon
					continue;
				Vector3Df hit = ray * s;
				hit += origin;

				// ray triangle intersection
				// Is the intersection of the ray with the triangle's plane INSIDE the triangle?

				float4 ee1 = tex1Dfetch(g_trianglesTexture, 5 * idx + 2);
				//float4 ee1 = make_float4(cudaTriangleIntersectionData[20 * idx + 8], cudaTriangleIntersectionData[20 * idx + 9], cudaTriangleIntersectionData[20 * idx + 10], cudaTriangleIntersectionData[20 * idx + 11]);
				float kt1 = dot(ee1, hit) - ee1.w;
				if (kt1<0.0f) continue;

				float4 ee2 = tex1Dfetch(g_trianglesTexture, 5 * idx + 3);
				//float4 ee2 = make_float4(cudaTriangleIntersectionData[20 * idx + 12], cudaTriangleIntersectionData[20 * idx + 13], cudaTriangleIntersectionData[20 * idx + 14], cudaTriangleIntersectionData[20 * idx + 15]);
				float kt2 = dot(ee2, hit) - ee2.w;
				if (kt2<0.0f) continue;

				float4 ee3 = tex1Dfetch(g_trianglesTexture, 5 * idx + 4);
				//float4 ee3 = make_float4(cudaTriangleIntersectionData[20 * idx + 16], cudaTriangleIntersectionData[20 * idx + 17], cudaTriangleIntersectionData[20 * idx + 18], cudaTriangleIntersectionData[20 * idx + 19]);
				float kt3 = dot(ee3, hit) - ee3.w;
				if (kt3<0.0f) continue;

				// ray intersects triangle, "hit" is the world space coordinate of the intersection.
				{
					// is this intersection closer than all the others?
					float hitZ = distancesq(origin, hit);
					if (hitZ < bestTriDist) {

						// maintain the closest hit
						bestTriDist = hitZ;
						hitdist = sqrtf(bestTriDist);
						pBestTriIdx = idx;
						pointHitInWorldSpace = hit;

						// store the edge-plane values of the hit point (for texturing, not used for now)
						kAB = kt1;
						kBC = kt2;
						kCA = kt3;
					}
				}
			}
		}
	}

	return pBestTriIdx != -1;
}
368 | 
369 | //////////////////////
370 | // PATH TRACING
371 | //////////////////////
372 | 
// Traces one light path through the scene and returns the colour gathered along it.
//
// Each loop iteration handles one path segment: the ray is intersected with the triangle BVH and
// with the spheres[] array, the emission of the closest surface (weighted by the colour mask) is
// added to the result, and a new ray is generated according to the surface material.
//
// Parameters:
//   randstate           - cuRAND state used for all random decisions
//   originInWorldSpace  - origin of the first ray (by value; reused as the origin of every next segment)
//   rayInWorldSpace     - direction of the first ray (by value; overwritten for every next segment)
//   avoidSelf           - index of a triangle to ignore on the first segment (-1 for none)
//   pTriangles          - triangle array, indexed with the triangle index reported by the BVH
//   cudaBVHindexesOrTrilists, cudaBVHlimits, cudaTriangleIntersectionData, cudaTriIdxList
//                       - forwarded unchanged to BVH_IntersectTriangles
// Returns the accumulated colour: the sum of mask * emission over all surfaces hit.
__device__ Vector3Df path_trace(curandState *randstate, Vector3Df originInWorldSpace, Vector3Df rayInWorldSpace, int avoidSelf,
	Triangle *pTriangles, int* cudaBVHindexesOrTrilists, float* cudaBVHlimits, float* cudaTriangleIntersectionData, int* cudaTriIdxList)
{

	// colour mask
	Vector3Df mask = Vector3Df(1.0f, 1.0f, 1.0f);
	// accumulated colour
	Vector3Df accucolor = Vector3Df(0.0f, 0.0f, 0.0f);

	for (int bounces = 0; bounces < 5; bounces++){  // iterate over at most 5 path segments (instead of recursion in CPU code)

		int sphere_id = -1;
		int triangle_id = -1;
		int pBestTriIdx = -1;
		int geomtype = -1; // -1: nothing hit, 1: sphere, 2: triangle
		const Triangle *pBestTri = NULL;
		Vector3Df pointHitInWorldSpace;
		float kAB = 0.f, kBC = 0.f, kCA = 0.f; // distances from the 3 edges of the triangle (from where we hit it), to be used for texturing

		float d = 1e20;
		float scene_t = 1e20;
		float hitdistance = 1e20;
		Vector3Df f = Vector3Df(0, 0, 0);
		Vector3Df emit = Vector3Df(0, 0, 0);
		Vector3Df x; // intersection point
		Vector3Df n; // normal
		Vector3Df nl; // oriented normal
		Vector3Df boxnormal = Vector3Df(0, 0, 0);
		Vector3Df dw; // ray direction of next path segment
		Refl_t refltype;

		float3 rayorig = make_float3(originInWorldSpace.x, originInWorldSpace.y, originInWorldSpace.z);
		float3 raydir = make_float3(rayInWorldSpace.x, rayInWorldSpace.y, rayInWorldSpace.z);

		// intersect all triangles in the scene stored in BVH
		BVH_IntersectTriangles(
			cudaBVHindexesOrTrilists, originInWorldSpace, rayInWorldSpace, avoidSelf,
			pBestTriIdx, pointHitInWorldSpace, kAB, kBC, kCA, hitdistance, cudaBVHlimits,
			cudaTriangleIntersectionData, cudaTriIdxList, boxnormal);

		// intersect all spheres in the scene
		const int numspheres = sizeof(spheres) / sizeof(Sphere);
		for (int i = numspheres; i--;){  // for all spheres in scene
			// keep track of distance from origin to closest intersection point
			if ((d = spheres[i].intersect(Ray(rayorig, raydir))) && d < scene_t){ scene_t = d; sphere_id = i; geomtype = 1; }
		}

		if (hitdistance < scene_t && hitdistance > 0.002) // EPSILON
		{
			scene_t = hitdistance;
			triangle_id = pBestTriIdx;
			geomtype = 2;
		}

		// Avoid self-shadow or self-reflection on the next segment, but only for the triangle this
		// segment actually ends on. If a sphere was closer, the triangle found by the BVH lies
		// somewhere else and must stay visible to the next ray.
		avoidSelf = (geomtype == 2) ? triangle_id : -1;

		// nothing hit: the path ends here, keep whatever was accumulated on earlier segments
		if (geomtype == -1) break;

		// SPHERES:
		if (geomtype == 1){

			Sphere &sphere = spheres[sphere_id]; // hit object with closest intersection
			x = originInWorldSpace + rayInWorldSpace * scene_t;  // intersection point on object
			n = Vector3Df(x.x - sphere.pos.x, x.y - sphere.pos.y, x.z - sphere.pos.z);		// normal
			n.normalize();
			nl = dot(n, rayInWorldSpace) < 0 ? n : n * -1; // correctly oriented normal
			f = Vector3Df(sphere.col.x, sphere.col.y, sphere.col.z);   // object colour
			refltype = sphere.refl;
			emit = Vector3Df(sphere.emi.x, sphere.emi.y, sphere.emi.z);  // object emission
			accucolor += (mask * emit);
		}

		// TRIANGLES:
		if (geomtype == 2){

			pBestTri = &pTriangles[triangle_id];

			x = pointHitInWorldSpace;  // intersection point
			n = pBestTri->_normal;  // normal
			//n = Vector3Df(0,0,1);
			n.normalize();
			nl = dot(n, rayInWorldSpace) < 0 ? n : n * -1;  // correctly oriented normal

			Vector3Df colour = Vector3Df(0.9f, 0.3f, 0.0f); // hardcoded triangle colour
			//Vector3Df colour = pBestTri->_colorf;
			refltype = COAT;
			f = colour;
			emit = Vector3Df(0, 0, 0);  // object emission
			accucolor += (mask * emit);
		}

		// basic material system, all parameters are hard-coded (such as phong exponent, index of refraction)

		// diffuse material, based on smallpt by Kevin Beason
		if (refltype == DIFF){

			// pick two random numbers
			float phi = 2 * M_PI * curand_uniform(randstate);
			float r2 = curand_uniform(randstate);
			float r2s = sqrtf(r2);

			// compute orthonormal coordinate frame uvw with hitpoint as origin
			Vector3Df w = nl; w.normalize();
			Vector3Df u = cross((fabs(w.x) > .1 ? Vector3Df(0, 1, 0) : Vector3Df(1, 0, 0)), w); u.normalize();
			Vector3Df v = cross(w, u);

			// compute cosine weighted random ray direction on hemisphere
			dw = u*cosf(phi)*r2s + v*sinf(phi)*r2s + w*sqrtf(1 - r2);
			dw.normalize();

			// offset origin next path segment to prevent self intersection
			pointHitInWorldSpace = x + w * 0.01;  // scene size dependent

			// multiply mask with colour of object
			mask *= f;
		}

		// Phong metal material from "Realistic Ray Tracing", P. Shirley
		if (refltype == METAL){

			// compute random perturbation of ideal reflection vector
			// the higher the phong exponent, the closer the perturbed vector is to the ideal reflection direction
			float phi = 2 * M_PI * curand_uniform(randstate);
			float r2 = curand_uniform(randstate);
			float phongexponent = 20;
			float cosTheta = powf(1 - r2, 1.0f / (phongexponent + 1));
			float sinTheta = sqrtf(1 - cosTheta * cosTheta);

			// create orthonormal basis uvw around reflection vector with hitpoint as origin
			// w is ray direction for ideal reflection
			Vector3Df w = rayInWorldSpace - n * 2.0f * dot(n, rayInWorldSpace); w.normalize();
			Vector3Df u = cross((fabs(w.x) > .1 ? Vector3Df(0, 1, 0) : Vector3Df(1, 0, 0)), w); u.normalize();
			Vector3Df v = cross(w, u); // v is normalised by default

			// compute random ray direction in the Phong lobe around w
			dw = u * cosf(phi) * sinTheta + v * sinf(phi) * sinTheta + w * cosTheta;
			dw.normalize();

			// offset origin next path segment to prevent self intersection
			pointHitInWorldSpace = x + w * 0.01;  // scene size dependent

			// multiply mask with colour of object
			mask *= f;
		}

		// specular material (perfect mirror)
		if (refltype == SPEC){

			// compute reflected ray direction (law of reflection)
			dw = rayInWorldSpace - n * 2.0f * dot(n, rayInWorldSpace);

			// offset origin next path segment to prevent self intersection
			pointHitInWorldSpace = x + nl * 0.01;   // scene size dependent

			// multiply mask with colour of object
			mask *= f;
		}

		// COAT material based on https://github.com/peterkutz/GPUPathTracer
		// randomly select diffuse or specular reflection
		// looks okay-ish but inaccurate (no Fresnel calculation yet)
		if (refltype == COAT){

			float rouletteRandomFloat = curand_uniform(randstate);
			float threshold = 0.05f;
			Vector3Df specularColor = Vector3Df(1,1,1);  // hard-coded
			bool reflectFromSurface = (rouletteRandomFloat < threshold); //computeFresnel(make_Vector3Df(n.x, n.y, n.z), incident, incidentIOR, transmittedIOR, reflectionDirection, transmissionDirection).reflectionCoefficient);

			if (reflectFromSurface) { // calculate perfectly specular reflection

				// Ray reflected from the surface. Trace a ray in the reflection direction.
				// TODO: Use Russian roulette instead of simple multipliers! (Selecting between diffuse sample and no sample (absorption) in this case.)

				mask *= specularColor;
				dw = rayInWorldSpace - n * 2.0f * dot(n, rayInWorldSpace);

				// offset origin next path segment to prevent self intersection
				pointHitInWorldSpace = x + nl * 0.01; // scene size dependent
			}

			else {  // calculate perfectly diffuse reflection

				float r1 = 2 * M_PI * curand_uniform(randstate);
				float r2 = curand_uniform(randstate);
				float r2s = sqrtf(r2);

				// compute orthonormal coordinate frame uvw with hitpoint as origin
				Vector3Df w = nl; w.normalize();
				Vector3Df u = cross((fabs(w.x) > .1 ? Vector3Df(0, 1, 0) : Vector3Df(1, 0, 0)), w); u.normalize();
				Vector3Df v = cross(w, u);

				// compute cosine weighted random ray direction on hemisphere
				dw = u*cosf(r1)*r2s + v*sinf(r1)*r2s + w*sqrtf(1 - r2);
				dw.normalize();

				// offset origin next path segment to prevent self intersection
				pointHitInWorldSpace = x + nl * 0.01;  // scene size dependent

				// multiply mask with colour of object
				mask *= f;
				//mask *= make_Vector3Df(0.15f, 0.15f, 0.15f);  // gold metal
			}
		} // end COAT

		// perfectly refractive material (glass, water)
		if (refltype == REFR){

			bool into = dot(n, nl) > 0; // is ray entering or leaving refractive material?
			float nc = 1.0f;  // Index of Refraction air
			float nt = 1.5f;  // Index of Refraction glass/water
			float nnt = into ? nc / nt : nt / nc;  // IOR ratio of refractive materials
			float ddn = dot(rayInWorldSpace, nl);
			float cos2t = 1.0f - nnt*nnt * (1.f - ddn*ddn);

			if (cos2t < 0.0f) // total internal reflection
			{
				dw = rayInWorldSpace;
				dw -= n * 2.0f * dot(n, rayInWorldSpace);

				// offset origin next path segment to prevent self intersection
				pointHitInWorldSpace = x + nl * 0.01; // scene size dependent
			}
			else // cos2t > 0
			{
				// compute direction of transmission ray
				Vector3Df tdir = rayInWorldSpace * nnt;
				tdir -= n * ((into ? 1 : -1) * (ddn*nnt + sqrtf(cos2t)));
				tdir.normalize();

				// Schlick's approximation of the Fresnel reflectance.
				// R0 = ((nt - nc) / (nt + nc))^2; the denominator needs its own parentheses,
				// otherwise the expression collapses to (nt - nc)^2.
				float R0 = (nt - nc)*(nt - nc) / ((nt + nc)*(nt + nc));
				float c = 1.f - (into ? -ddn : dot(tdir, n));
				float Re = R0 + (1.f - R0) * c * c * c * c * c;
				float Tr = 1 - Re; // Transmission
				float P = .25f + .5f * Re; // probability of following the reflection ray
				float RP = Re / P;
				float TP = Tr / (1.f - P);

				// randomly choose reflection or transmission ray; the choice must be made with
				// probability P because the weights RP and TP are normalised by P and (1 - P)
				if (curand_uniform(randstate) < P) // reflection ray
				{
					mask *= RP;
					dw = rayInWorldSpace;
					dw -= n * 2.0f * dot(n, rayInWorldSpace);

					pointHitInWorldSpace = x + nl * 0.01; // scene size dependent
				}
				else // transmission ray
				{
					mask *= TP;
					dw = tdir; //r = Ray(x, tdir);
					pointHitInWorldSpace = x + nl * 0.001f; // epsilon must be small to avoid artefacts
				}
			}
		}

		// set up origin and direction of next path segment
		originInWorldSpace = pointHitInWorldSpace;
		rayInWorldSpace = dw;
	}

	return Vector3Df(accucolor.x, accucolor.y, accucolor.z);
}
639 | 
640 | union Colour  // overlays one float on four bytes (4 bytes = 4 chars = 1 float) so a packed colour fits in a float slot
641 | {
642 | 	float c;  // the same four bytes read back as a single float
643 | 	uchar4 components;  // the four individual byte channels
644 | };
645 | 
646 | // The core path tracing kernel: every CUDA thread renders exactly one pixel.
647 | // Launch with a 2D grid of 2D blocks covering width x height; threads that fall outside the image exit at once.
648 | //   output            - per pixel (x, y, packed colour) triple, stored for the OpenGL side
649 | //   accumbuffer       - per pixel running sum of the colours of all frames rendered so far
650 | //   framenumber       - divisor that turns the running sum into the displayed average
651 | //   hashedframenumber - per-frame value mixed into every thread's RNG seed
652 | // pTriangles and the remaining cuda* pointers are passed on unchanged to path_trace().
653 | __global__ void CoreLoopPathTracingKernel(Vector3Df* output, Vector3Df* accumbuffer, Triangle* pTriangles, Camera* cudaRendercam,
654 | 	int* cudaBVHindexesOrTrilists, float* cudaBVHlimits, float* cudaTriangleIntersectionData,
655 | 	int* cudaTriIdxList, unsigned int framenumber, unsigned int hashedframenumber)
656 | {
657 | 	// assign a CUDA thread to every pixel by using the threadIndex
658 | 	unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
659 | 	unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
660 | 
661 | 	// a grid that overshoots the image must neither read nor write outside the pixel buffers
662 | 	if (x >= width || y >= height) return;
663 | 
664 | 	// global threadId, see richiesams blogspot
665 | 	int threadId = (blockIdx.x + blockIdx.y * gridDim.x) * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
666 | 
667 | 	// create random number generator and initialise with hashed frame number, see RichieSams blogspot
668 | 	curandState randState; // state of the random number generator, to prevent repetition
669 | 	curand_init(hashedframenumber + threadId, 0, 0, &randState);
670 | 
671 | 	int pixelx = x; // pixel x-coordinate on screen
672 | 	int pixely = height - y - 1; // pixel y-coordinate on screen (row order is flipped)
673 | 	int i = pixely*width + pixelx; // pixel index in buffer
674 | 
675 | 	// The camera is the same for every sample of this frame, so everything derived from it alone is
676 | 	// computed once here instead of once per sample: a local orthonormal basis built from the camera
677 | 	// view of the current frame (transformed on CPU side) and the image plane one unit away from the eye.
678 | 	Vector3Df rendercampos = Vector3Df(cudaRendercam->position.x, cudaRendercam->position.y, cudaRendercam->position.z);
679 | 	Vector3Df rendercamview = Vector3Df(cudaRendercam->view.x, cudaRendercam->view.y, cudaRendercam->view.z); rendercamview.normalize(); // view is already supposed to be normalized, but normalize it explicitly just in case.
680 | 	Vector3Df rendercamup = Vector3Df(cudaRendercam->up.x, cudaRendercam->up.y, cudaRendercam->up.z); rendercamup.normalize();
681 | 	Vector3Df horizontalAxis = cross(rendercamview, rendercamup); horizontalAxis.normalize(); // Important to normalize!
682 | 	Vector3Df verticalAxis = cross(horizontalAxis, rendercamview); verticalAxis.normalize(); // unit length already, normalized again just for good measure.
683 | 
684 | 	// fov is treated as the full field of view in degrees, hence the factor 0.5 before converting to radians;
685 | 	// the factor is kept in single precision so that no double arithmetic ends up in the kernel
686 | 	const float halfFovToRadians = 0.5f * ((float)M_PI / 180.0f);
687 | 	Vector3Df middle = rendercampos + rendercamview;
688 | 	Vector3Df horizontal = horizontalAxis * tanf(cudaRendercam->fov.x * halfFovToRadians);
689 | 	Vector3Df vertical = verticalAxis * tanf(-cudaRendercam->fov.y * halfFovToRadians);
690 | 
691 | 	Vector3Df finalcol = Vector3Df(0.0f, 0.0f, 0.0f); // colour gathered by this frame's samples
692 | 
693 | 	for (int s = 0; s < samps; s++){
694 | 		// anti-aliasing
695 | 		// calculate center of current pixel and add random number in X and Y dimension
696 | 		// based on https://github.com/peterkutz/GPUPathTracer 
697 | 		float jitterValueX = curand_uniform(&randState) - 0.5f;
698 | 		float jitterValueY = curand_uniform(&randState) - 0.5f;
699 | 		float sx = (jitterValueX + pixelx) / (cudaRendercam->resolution.x - 1);
700 | 		float sy = (jitterValueY + pixely) / (cudaRendercam->resolution.y - 1);
701 | 
702 | 		// compute pixel on screen
703 | 		Vector3Df pointOnPlaneOneUnitAwayFromEye = middle + ( horizontal * ((2 * sx) - 1)) + ( vertical * ((2 * sy) - 1));
704 | 		Vector3Df pointOnImagePlane = rendercampos + ((pointOnPlaneOneUnitAwayFromEye - rendercampos) * cudaRendercam->focalDistance); // Important for depth of field!
705 | 
706 | 		// calculation of depth of field / camera aperture 
707 | 		// based on https://github.com/peterkutz/GPUPathTracer 
708 | 		Vector3Df aperturePoint = rendercampos; // zero aperture: every ray starts at the eye
709 | 
710 | 		if (cudaRendercam->apertureRadius > 0.00001f) { // the small number is an epsilon value.
711 | 			// generate random numbers for sampling a point on the aperture
712 | 			float random1 = curand_uniform(&randState);
713 | 			float random2 = curand_uniform(&randState);
714 | 
715 | 			// randomly pick a point on the circular aperture
716 | 			float angle = TWO_PI * random1;
717 | 			float distance = cudaRendercam->apertureRadius * sqrtf(random2);
718 | 			float apertureX = cosf(angle) * distance;
719 | 			float apertureY = sinf(angle) * distance;
720 | 
721 | 			aperturePoint = rendercampos + (horizontalAxis * apertureX) + (verticalAxis * apertureY);
722 | 		}
723 | 
724 | 		// direction of the primary ray: from the aperture point to the point on the focal plane
725 | 		Vector3Df rayInWorldSpace = pointOnImagePlane - aperturePoint;
726 | 		rayInWorldSpace.normalize(); // ray direction, needs to be normalised
727 | 
728 | 		// origin of the primary ray
729 | 		Vector3Df originInWorldSpace = aperturePoint;
730 | 
731 | 		finalcol += path_trace(&randState, originInWorldSpace, rayInWorldSpace, -1, pTriangles, 
732 | 			cudaBVHindexesOrTrilists, cudaBVHlimits, cudaTriangleIntersectionData, cudaTriIdxList) * (1.0f/samps);
733 | 	}
734 | 
735 | 	// add pixel colour to accumulation buffer (accumulates all samples) 
736 | 	accumbuffer[i] += finalcol;
737 | 	// averaged colour: divide colour by the number of calculated frames so far
738 | 	Vector3Df tempcol = accumbuffer[i] / framenumber;
739 | 
740 | 	Colour fcolour;
741 | 	Vector3Df colour = Vector3Df(clamp(tempcol.x, 0.0f, 1.0f), clamp(tempcol.y, 0.0f, 1.0f), clamp(tempcol.z, 0.0f, 1.0f));
742 | 	// convert from 96-bit to 24-bit colour + perform gamma correction
743 | 	fcolour.components = make_uchar4((unsigned char)(powf(colour.x, 1 / 2.2f) * 255), (unsigned char)(powf(colour.y, 1 / 2.2f) * 255), (unsigned char)(powf(colour.z, 1 / 2.2f) * 255), 1);
744 | 	// store pixel coordinates and pixelcolour in OpenGL readable outputbuffer
745 | 	output[i] = Vector3Df(x, y, fcolour.c);
746 | }
747 | 
748 | bool g_bFirstTime = true; // true until cudarender() has bound the scene data to textures
749 | 
750 | // the gateway to CUDA, called from C++ (in void disp() in main.cpp): binds the scene data to textures on the
751 | // first call, then launches one pass of the path tracing kernel. CUDA errors are reported on stderr.
752 | void cudarender(Vector3Df* dptr, Vector3Df* accumulatebuffer, Triangle* cudaTriangles, int* cudaBVHindexesOrTrilists,
753 | 	float* cudaBVHlimits, float* cudaTriangleIntersectionData, int* cudaTriIdxList, 
754 | 	unsigned framenumber, unsigned hashedframes, Camera* cudaRendercam){
755 | 
756 | 	if (g_bFirstTime) {
757 | 		g_bFirstTime = false;
758 | 
759 | 		// the scene counters are unsigned, hence %u
760 | 		printf("g_triIndexListNo: %u\n", g_triIndexListNo);
761 | 		printf("g_pCFBVH_No: %u\n", g_pCFBVH_No);
762 | 		printf("g_verticesNo: %u\n", g_verticesNo);
763 | 		printf("g_trianglesNo: %u\n", g_trianglesNo);
764 | 
765 | 		cudaError_t bindStatus; // a failed bind would otherwise only show up later as a broken image
766 | 
767 | 		cudaChannelFormatDesc channel1desc = cudaCreateChannelDesc<uint1>();
768 | 		bindStatus = cudaBindTexture(NULL, &g_triIdxListTexture, cudaTriIdxList, &channel1desc, g_triIndexListNo * sizeof(uint1));
769 | 		if (bindStatus != cudaSuccess) fprintf(stderr, "cudaBindTexture(g_triIdxListTexture) failed: %s\n", cudaGetErrorString(bindStatus));
770 | 
771 | 		cudaChannelFormatDesc channel2desc = cudaCreateChannelDesc<float2>();
772 | 		bindStatus = cudaBindTexture(NULL, &g_pCFBVHlimitsTexture, cudaBVHlimits, &channel2desc, g_pCFBVH_No * 6 * sizeof(float));
773 | 		if (bindStatus != cudaSuccess) fprintf(stderr, "cudaBindTexture(g_pCFBVHlimitsTexture) failed: %s\n", cudaGetErrorString(bindStatus));
774 | 
775 | 		cudaChannelFormatDesc channel3desc = cudaCreateChannelDesc<uint4>();
776 | 		bindStatus = cudaBindTexture(NULL, &g_pCFBVHindexesOrTrilistsTexture, cudaBVHindexesOrTrilists, &channel3desc, g_pCFBVH_No * sizeof(uint4));
777 | 		if (bindStatus != cudaSuccess) fprintf(stderr, "cudaBindTexture(g_pCFBVHindexesOrTrilistsTexture) failed: %s\n", cudaGetErrorString(bindStatus));
778 | 
779 | 		cudaChannelFormatDesc channel5desc = cudaCreateChannelDesc<float4>();
780 | 		bindStatus = cudaBindTexture(NULL, &g_trianglesTexture, cudaTriangleIntersectionData, &channel5desc, g_trianglesNo * 20 * sizeof(float));
781 | 		if (bindStatus != cudaSuccess) fprintf(stderr, "cudaBindTexture(g_trianglesTexture) failed: %s\n", cudaGetErrorString(bindStatus));
782 | 	}
783 | 
784 | 	dim3 block(16, 16, 1);   // dim3 CUDA specific syntax, block and grid are required to schedule CUDA threads over streaming multiprocessors
785 | 	dim3 grid(width / block.x, height / block.y, 1); // integer division: covers the whole image only while width and height are multiples of the block size
786 | 
787 | 	CoreLoopPathTracingKernel<<<grid, block>>>(dptr, accumulatebuffer, cudaTriangles, cudaRendercam, cudaBVHindexesOrTrilists,
788 | 		cudaBVHlimits, cudaTriangleIntersectionData, cudaTriIdxList, framenumber, hashedframes);
789 | 
790 | 	// a kernel launch returns no status itself; a rejected launch is only visible through cudaGetLastError()
791 | 	cudaError_t launchStatus = cudaGetLastError();
792 | 	if (launchStatus != cudaSuccess) fprintf(stderr, "CoreLoopPathTracingKernel launch failed: %s\n", cudaGetErrorString(launchStatus));
793 | }
794 | 


--------------------------------------------------------------------------------
/cuda_pathtracer.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | *  CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
 3 | *  BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras,
 4 | *  http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html
 5 | *  Interactive camera with depth of field based on CUDA path tracer code
 6 | *  by Peter Kutz and Yining Karl Li, https://github.com/peterkutz/GPUPathTracer
 7 | *
 8 | *  This program is free software; you can redistribute it and/or modify
 9 | *  it under the terms of the GNU General Public License as published by
10 | *  the Free Software Foundation; either version 2 of the License, or
11 | *  (at your option) any later version.
12 | *
13 | *  This program is distributed in the hope that it will be useful,
14 | *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 | *  GNU General Public License for more details.
17 | *
18 | *  You should have received a copy of the GNU General Public License
19 | *  along with this program; if not, write to the Free Software
20 | *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21 | */
22 | #ifndef __CUDA_PATHTRACER_H_
23 | #define __CUDA_PATHTRACER_H_
24 | 
25 | #include "linear_algebra.h"
26 | #include "geometry.h"
27 | #include "bvh.h"
28 | #include "camera.h"
29 | #include <ctime>
30 |  
31 | #define BVH_STACK_SIZE 32  // NOTE(review): presumably the capacity of the BVH traversal stack - confirm against the .cu file
32 | #define width 1280	// screenwidth in pixels; cudarender() also derives the launch grid from it
33 | #define height 720 // screenheight in pixels; cudarender() also derives the launch grid from it
34 | // Prints msg (plus a newline) to stdout and flushes it when level is at most 1; level is parenthesised so that any expression can be passed.
35 | #define DBG_PUTS(level, msg) \
36 |     do { if ((level) <= 1) { puts(msg); fflush(stdout); }} while (0)
37 | 
38 | // global variables (only declared here; the definitions live in another translation unit)
39 | extern unsigned g_verticesNo;  // printed by cudarender() on its first call
40 | extern Vertex* g_vertices;  // NOTE(review): presumably the array that g_verticesNo counts - confirm
41 | extern unsigned g_trianglesNo;  // cudarender() binds g_trianglesNo * 20 floats of triangle intersection data
42 | extern Triangle* g_triangles;  // NOTE(review): presumably the array that g_trianglesNo counts - confirm
43 | extern BVHNode* g_pSceneBVH;  // NOTE(review): presumably the root of the scene BVH - confirm
44 | extern unsigned g_triIndexListNo;  // cudarender() binds g_triIndexListNo entries of the triangle index list
45 | extern int* g_triIndexList;  // NOTE(review): presumably the list that g_triIndexListNo counts - confirm
46 | extern unsigned g_pCFBVH_No;  // cudarender() binds g_pCFBVH_No * 6 floats of BVH limits and g_pCFBVH_No uint4 entries
47 | extern CacheFriendlyBVHNode* g_pCFBVH;  // NOTE(review): presumably the array that g_pCFBVH_No counts - confirm
48 | 
49 | // The gateway to CUDA, called from C++ (src/main.cpp): on its first call it binds the scene buffers to CUDA
50 | // textures, and on every call it launches CoreLoopPathTracingKernel once with the given buffers and camera.
51 | void cudarender(Vector3Df* dptr, Vector3Df* accumulatebuffer, Triangle* cudaTriangles, int* cudaBVHindexesOrTrilists,
52 | 	float* cudaBVHlimits, float* cudaTriangleIntersectionData, int* cudaTriIdxList, unsigned framenumber, unsigned hashedframes, Camera* cudaRendercam); 
53 | // Minimal stopwatch built on clock(); firstValue is the clock() reading taken by the last reset(),
54 | // truncated to unsigned.
55 | struct Clock {
56 | 	unsigned firstValue;
57 | 	Clock() { reset(); }
58 | 	void reset() { firstValue = static_cast<unsigned>(clock()); }
59 | 	// Milliseconds since the last reset(). Both readings are truncated to unsigned before subtracting, so the difference stays correct
60 | 	// when clock_t is wider than unsigned; multiplying before dividing means a CLOCKS_PER_SEC below 1000 cannot give a zero divisor.
61 | 	unsigned readMS() const { const unsigned ticks = static_cast<unsigned>(clock()) - firstValue; return static_cast<unsigned>(static_cast<unsigned long long>(ticks) * 1000 / CLOCKS_PER_SEC); }
62 | };
63 | #endif
64 | 


--------------------------------------------------------------------------------
/cutil_math.h:
--------------------------------------------------------------------------------
   1 | /*
   2 | * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
   3 | *
   4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
   5 | * with this source code for terms and conditions that govern your use of
   6 | * this software. Any use, reproduction, disclosure, or distribution of
   7 | * this software and related documentation outside the terms of the EULA
   8 | * is strictly prohibited.
   9 | *
  10 | */
  11 | 
  12 | /*
  13 | This file implements common mathematical operations on vector types
  14 | (float3, float4 etc.) since these are not provided as standard by CUDA.
  15 | 
  16 | The syntax is modelled on the Cg standard library.
  17 | 
  18 | This is part of the CUTIL library and is not supported by NVIDIA.
  19 | 
  20 | Thanks to Linh Hah for additions and fixes.
  21 | */
  22 | 
  23 | #ifndef CUTIL_MATH_H
  24 | #define CUTIL_MATH_H
  25 | 
  26 | #include <cuda_runtime.h>
  27 | 
  28 | typedef unsigned int uint;
  29 | typedef unsigned short ushort;
  30 | 
  31 | #ifndef __CUDACC__
  32 | #include <math.h>
  33 | 
  34 | ////////////////////////////////////////////////////////////////////////////////
  35 | // plain C++ stand-ins for CUDA functions, compiled only when __CUDACC__ is not defined
  36 | ////////////////////////////////////////////////////////////////////////////////
  37 | 
  38 | inline float fminf(float a, float b) {
  39 | 	if (a < b) return a;
  40 | 	return b;
  41 | }
  42 | 
  43 | inline float fmaxf(float a, float b) {
  44 | 	if (a > b) return a;
  45 | 	return b;
  46 | }
  47 | 
  48 | inline int max(int a, int b) {
  49 | 	if (a > b) return a;
  50 | 	return b;
  51 | }
  52 | 
  53 | inline int min(int a, int b) {
  54 | 	if (a < b) return a;
  55 | 	return b;
  56 | }
  57 | 
  58 | inline float rsqrtf(float x) {
  59 | 	const float root = sqrtf(x); // reciprocal square root = 1 / sqrt(x)
  60 | 	return 1.0f / root;
  61 | }
  62 | #endif
  63 | 
  64 | ////////////////////////////////////////////////////////////////////////////////
  65 | // constructors: build a vector from one scalar or convert it from another vector type
  66 | ////////////////////////////////////////////////////////////////////////////////
  67 | 
  68 | inline __host__ __device__ float2 make_float2(float s) {
  69 | 	float2 v; v.x = s; v.y = s; // the scalar is copied into both components
  70 | 	return v;
  71 | }
  72 | inline __host__ __device__ float2 make_float2(float3 a) {
  73 | 	float2 v; v.x = a.x; v.y = a.y; // z is dropped
  74 | 	return v;
  75 | }
  76 | inline __host__ __device__ float2 make_float2(int2 a) {
  77 | 	float2 v; v.x = static_cast<float>(a.x); v.y = static_cast<float>(a.y);
  78 | 	return v;
  79 | }
  80 | inline __host__ __device__ float2 make_float2(uint2 a) {
  81 | 	float2 v; v.x = static_cast<float>(a.x); v.y = static_cast<float>(a.y);
  82 | 	return v;
  83 | }
  84 | // int2 from a scalar, an int3 (z dropped), a uint2 or a float2 (each component cast to int)
  85 | inline __host__ __device__ int2 make_int2(int s) {
  86 | 	int2 v; v.x = s; v.y = s;
  87 | 	return v;
  88 | }
  89 | inline __host__ __device__ int2 make_int2(int3 a) {
  90 | 	int2 v; v.x = a.x; v.y = a.y;
  91 | 	return v;
  92 | }
  93 | inline __host__ __device__ int2 make_int2(uint2 a) {
  94 | 	int2 v; v.x = static_cast<int>(a.x); v.y = static_cast<int>(a.y);
  95 | 	return v;
  96 | }
  97 | inline __host__ __device__ int2 make_int2(float2 a) {
  98 | 	int2 v; v.x = static_cast<int>(a.x); v.y = static_cast<int>(a.y);
  99 | 	return v;
 100 | }
 101 | // uint2 from a scalar, a uint3 (z dropped) or an int2 (each component cast to uint)
 102 | inline __host__ __device__ uint2 make_uint2(uint s) {
 103 | 	uint2 v; v.x = s; v.y = s;
 104 | 	return v;
 105 | }
 106 | inline __host__ __device__ uint2 make_uint2(uint3 a) {
 107 | 	uint2 v; v.x = a.x; v.y = a.y;
 108 | 	return v;
 109 | }
 110 | inline __host__ __device__ uint2 make_uint2(int2 a) {
 111 | 	uint2 v; v.x = static_cast<uint>(a.x); v.y = static_cast<uint>(a.y);
 112 | 	return v;
 113 | }
 114 | // float3 from a scalar, a float2 (z is 0 or given), a float4 (w dropped) or an integer vector
 115 | inline __host__ __device__ float3 make_float3(float s) {
 116 | 	float3 v; v.x = s; v.y = s; v.z = s;
 117 | 	return v;
 118 | }
 119 | inline __host__ __device__ float3 make_float3(float2 a) {
 120 | 	float3 v; v.x = a.x; v.y = a.y; v.z = 0.0f;
 121 | 	return v;
 122 | }
 123 | inline __host__ __device__ float3 make_float3(float2 a, float s) {
 124 | 	float3 v; v.x = a.x; v.y = a.y; v.z = s;
 125 | 	return v;
 126 | }
 127 | inline __host__ __device__ float3 make_float3(float4 a) {
 128 | 	float3 v; v.x = a.x; v.y = a.y; v.z = a.z;
 129 | 	return v;
 130 | }
 131 | inline __host__ __device__ float3 make_float3(int3 a) {
 132 | 	float3 v; v.x = static_cast<float>(a.x); v.y = static_cast<float>(a.y); v.z = static_cast<float>(a.z);
 133 | 	return v;
 134 | }
 135 | inline __host__ __device__ float3 make_float3(uint3 a) {
 136 | 	float3 v; v.x = static_cast<float>(a.x); v.y = static_cast<float>(a.y); v.z = static_cast<float>(a.z);
 137 | 	return v;
 138 | }
 139 | // int3 from a scalar, an int2 (z is 0 or given), a uint3 or a float3 (each component cast to int)
 140 | inline __host__ __device__ int3 make_int3(int s) {
 141 | 	int3 v; v.x = s; v.y = s; v.z = s;
 142 | 	return v;
 143 | }
 144 | inline __host__ __device__ int3 make_int3(int2 a) {
 145 | 	int3 v; v.x = a.x; v.y = a.y; v.z = 0;
 146 | 	return v;
 147 | }
 148 | inline __host__ __device__ int3 make_int3(int2 a, int s) {
 149 | 	int3 v; v.x = a.x; v.y = a.y; v.z = s;
 150 | 	return v;
 151 | }
 152 | inline __host__ __device__ int3 make_int3(uint3 a) {
 153 | 	int3 v; v.x = static_cast<int>(a.x); v.y = static_cast<int>(a.y); v.z = static_cast<int>(a.z);
 154 | 	return v;
 155 | }
 156 | inline __host__ __device__ int3 make_int3(float3 a) {
 157 | 	int3 v; v.x = static_cast<int>(a.x); v.y = static_cast<int>(a.y); v.z = static_cast<int>(a.z);
 158 | 	return v;
 159 | }
 160 | // uint3 from a scalar, a uint2 (z is 0 or given), a uint4 (w dropped) or an int3 (each component cast to uint)
 161 | inline __host__ __device__ uint3 make_uint3(uint s) {
 162 | 	uint3 v; v.x = s; v.y = s; v.z = s;
 163 | 	return v;
 164 | }
 165 | inline __host__ __device__ uint3 make_uint3(uint2 a) {
 166 | 	uint3 v; v.x = a.x; v.y = a.y; v.z = 0;
 167 | 	return v;
 168 | }
 169 | inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) {
 170 | 	uint3 v; v.x = a.x; v.y = a.y; v.z = s;
 171 | 	return v;
 172 | }
 173 | inline __host__ __device__ uint3 make_uint3(uint4 a) {
 174 | 	uint3 v; v.x = a.x; v.y = a.y; v.z = a.z;
 175 | 	return v;
 176 | }
 177 | inline __host__ __device__ uint3 make_uint3(int3 a) {
 178 | 	uint3 v; v.x = static_cast<uint>(a.x); v.y = static_cast<uint>(a.y); v.z = static_cast<uint>(a.z);
 179 | 	return v;
 180 | }
 181 | // float4 from a scalar, a float3 (w is 0 or given) or an integer vector (each component cast to float)
 182 | inline __host__ __device__ float4 make_float4(float s) {
 183 | 	float4 v; v.x = s; v.y = s; v.z = s; v.w = s;
 184 | 	return v;
 185 | }
 186 | inline __host__ __device__ float4 make_float4(float3 a) {
 187 | 	float4 v; v.x = a.x; v.y = a.y; v.z = a.z; v.w = 0.0f;
 188 | 	return v;
 189 | }
 190 | inline __host__ __device__ float4 make_float4(float3 a, float w) {
 191 | 	float4 v; v.x = a.x; v.y = a.y; v.z = a.z; v.w = w;
 192 | 	return v;
 193 | }
 194 | inline __host__ __device__ float4 make_float4(int4 a) {
 195 | 	float4 v; v.x = static_cast<float>(a.x); v.y = static_cast<float>(a.y); v.z = static_cast<float>(a.z); v.w = static_cast<float>(a.w);
 196 | 	return v;
 197 | }
 198 | inline __host__ __device__ float4 make_float4(uint4 a) {
 199 | 	float4 v; v.x = static_cast<float>(a.x); v.y = static_cast<float>(a.y); v.z = static_cast<float>(a.z); v.w = static_cast<float>(a.w);
 200 | 	return v;
 201 | }
 202 | 
 203 | // custom function vec4.xyz
 204 | //inline __host__ __device__ float3 fxyz(float4 a)
 205 | //{
 206 | //	return make_float3(float(a.x), float(a.y), float(a.z));
 207 | //}
 208 | // int4 from a scalar, an int3 (w is 0 or given), a uint4 or a float4 (each component cast to int)
 209 | inline __host__ __device__ int4 make_int4(int s) {
 210 | 	int4 v; v.x = s; v.y = s; v.z = s; v.w = s;
 211 | 	return v;
 212 | }
 213 | inline __host__ __device__ int4 make_int4(int3 a) {
 214 | 	int4 v; v.x = a.x; v.y = a.y; v.z = a.z; v.w = 0;
 215 | 	return v;
 216 | }
 217 | inline __host__ __device__ int4 make_int4(int3 a, int w) {
 218 | 	int4 v; v.x = a.x; v.y = a.y; v.z = a.z; v.w = w;
 219 | 	return v;
 220 | }
 221 | inline __host__ __device__ int4 make_int4(uint4 a) {
 222 | 	int4 v; v.x = static_cast<int>(a.x); v.y = static_cast<int>(a.y); v.z = static_cast<int>(a.z); v.w = static_cast<int>(a.w);
 223 | 	return v;
 224 | }
 225 | inline __host__ __device__ int4 make_int4(float4 a) {
 226 | 	int4 v; v.x = static_cast<int>(a.x); v.y = static_cast<int>(a.y); v.z = static_cast<int>(a.z); v.w = static_cast<int>(a.w);
 227 | 	return v;
 228 | }
 229 | 
 230 | // uint4 from a scalar, a uint3 (w is 0 or given) or an int4 (each component cast to uint)
 231 | inline __host__ __device__ uint4 make_uint4(uint s) {
 232 | 	uint4 v; v.x = s; v.y = s; v.z = s; v.w = s;
 233 | 	return v;
 234 | }
 235 | inline __host__ __device__ uint4 make_uint4(uint3 a) {
 236 | 	uint4 v; v.x = a.x; v.y = a.y; v.z = a.z; v.w = 0;
 237 | 	return v;
 238 | }
 239 | inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) {
 240 | 	uint4 v; v.x = a.x; v.y = a.y; v.z = a.z; v.w = w;
 241 | 	return v;
 242 | }
 243 | inline __host__ __device__ uint4 make_uint4(int4 a) {
 244 | 	uint4 v; v.x = static_cast<uint>(a.x); v.y = static_cast<uint>(a.y); v.z = static_cast<uint>(a.z); v.w = static_cast<uint>(a.w);
 245 | 	return v;
 246 | }
 247 | 
 248 | ////////////////////////////////////////////////////////////////////////////////
 249 | // negate (unary minus): returns a new vector with every component negated; the operand is a const reference so that const values and temporaries can be negated too
 250 | ////////////////////////////////////////////////////////////////////////////////
 251 | 
 252 | inline __host__ __device__ float2 operator-(const float2 &a)
 253 | {
 254 | 	return make_float2(-a.x, -a.y);
 255 | }
 256 | inline __host__ __device__ int2 operator-(const int2 &a)
 257 | {
 258 | 	return make_int2(-a.x, -a.y);
 259 | }
 260 | inline __host__ __device__ float3 operator-(const float3 &a)
 261 | {
 262 | 	return make_float3(-a.x, -a.y, -a.z);
 263 | }
 264 | inline __host__ __device__ int3 operator-(const int3 &a)
 265 | {
 266 | 	return make_int3(-a.x, -a.y, -a.z);
 267 | }
 268 | inline __host__ __device__ float4 operator-(const float4 &a)
 269 | {
 270 | 	return make_float4(-a.x, -a.y, -a.z, -a.w);
 271 | }
 272 | inline __host__ __device__ int4 operator-(const int4 &a)
 273 | {
 274 | 	return make_int4(-a.x, -a.y, -a.z, -a.w);
 275 | }
 276 | 
 277 | ////////////////////////////////////////////////////////////////////////////////
 278 | // addition: component-wise; a scalar operand is added to every component
 279 | ////////////////////////////////////////////////////////////////////////////////
 280 | 
 281 | inline __host__ __device__ float2 operator+(float2 a, float2 b) {
 282 | 	a.x += b.x; a.y += b.y; // a is a by-value copy, so it doubles as the result
 283 | 	return a;
 284 | }
 285 | inline __host__ __device__ void operator+=(float2 &a, float2 b) {
 286 | 	const float2 sum = make_float2(a.x + b.x, a.y + b.y);
 287 | 	a = sum;
 288 | }
 289 | inline __host__ __device__ float2 operator+(float2 a, float b) {
 290 | 	a.x += b; a.y += b;
 291 | 	return a;
 292 | }
 293 | inline __host__ __device__ float2 operator+(float b, float2 a) {
 294 | 	a.x += b; a.y += b;
 295 | 	return a;
 296 | }
 297 | inline __host__ __device__ void operator+=(float2 &a, float b) {
 298 | 	const float2 sum = make_float2(a.x + b, a.y + b);
 299 | 	a = sum;
 300 | }
 301 | // int2 addition (vector + vector, vector + scalar, scalar + vector, and the in-place forms)
 302 | inline __host__ __device__ int2 operator+(int2 a, int2 b) {
 303 | 	a.x += b.x; a.y += b.y;
 304 | 	return a;
 305 | }
 306 | inline __host__ __device__ void operator+=(int2 &a, int2 b) {
 307 | 	const int2 sum = make_int2(a.x + b.x, a.y + b.y);
 308 | 	a = sum;
 309 | }
 310 | inline __host__ __device__ int2 operator+(int2 a, int b) {
 311 | 	a.x += b; a.y += b;
 312 | 	return a;
 313 | }
 314 | inline __host__ __device__ int2 operator+(int b, int2 a) {
 315 | 	a.x += b; a.y += b;
 316 | 	return a;
 317 | }
 318 | inline __host__ __device__ void operator+=(int2 &a, int b) {
 319 | 	const int2 sum = make_int2(a.x + b, a.y + b);
 320 | 	a = sum;
 321 | }
 322 | // uint2 addition (vector + vector, vector + scalar, scalar + vector, and the in-place forms)
 323 | inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) {
 324 | 	a.x += b.x; a.y += b.y;
 325 | 	return a;
 326 | }
 327 | inline __host__ __device__ void operator+=(uint2 &a, uint2 b) {
 328 | 	const uint2 sum = make_uint2(a.x + b.x, a.y + b.y);
 329 | 	a = sum;
 330 | }
 331 | inline __host__ __device__ uint2 operator+(uint2 a, uint b) {
 332 | 	a.x += b; a.y += b;
 333 | 	return a;
 334 | }
 335 | inline __host__ __device__ uint2 operator+(uint b, uint2 a) {
 336 | 	a.x += b; a.y += b;
 337 | 	return a;
 338 | }
 339 | inline __host__ __device__ void operator+=(uint2 &a, uint b) {
 340 | 	const uint2 sum = make_uint2(a.x + b, a.y + b);
 341 | 	a = sum;
 342 | }
 343 | 
 344 | // float3 addition (vector + vector, vector + scalar, and the in-place forms)
 345 | inline __host__ __device__ float3 operator+(float3 a, float3 b) {
 346 | 	a.x += b.x; a.y += b.y; a.z += b.z;
 347 | 	return a;
 348 | }
 349 | inline __host__ __device__ void operator+=(float3 &a, float3 b) {
 350 | 	const float3 sum = make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
 351 | 	a = sum;
 352 | }
 353 | inline __host__ __device__ float3 operator+(float3 a, float b) {
 354 | 	a.x += b; a.y += b; a.z += b;
 355 | 	return a;
 356 | }
 357 | inline __host__ __device__ void operator+=(float3 &a, float b) {
 358 | 	const float3 sum = make_float3(a.x + b, a.y + b, a.z + b);
 359 | 	a = sum;
 360 | }
 361 | // int3 addition (vector + vector, vector + scalar, and the in-place forms)
 362 | inline __host__ __device__ int3 operator+(int3 a, int3 b) {
 363 | 	a.x += b.x; a.y += b.y; a.z += b.z;
 364 | 	return a;
 365 | }
 366 | inline __host__ __device__ void operator+=(int3 &a, int3 b) {
 367 | 	const int3 sum = make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
 368 | 	a = sum;
 369 | }
 370 | inline __host__ __device__ int3 operator+(int3 a, int b) {
 371 | 	a.x += b; a.y += b; a.z += b;
 372 | 	return a;
 373 | }
 374 | inline __host__ __device__ void operator+=(int3 &a, int b) {
 375 | 	const int3 sum = make_int3(a.x + b, a.y + b, a.z + b);
 376 | 	a = sum;
 377 | }
 378 | // uint3 addition (vector + vector, vector + scalar, and the in-place forms)
 379 | inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) {
 380 | 	a.x += b.x; a.y += b.y; a.z += b.z;
 381 | 	return a;
 382 | }
 383 | inline __host__ __device__ void operator+=(uint3 &a, uint3 b) {
 384 | 	const uint3 sum = make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
 385 | 	a = sum;
 386 | }
 387 | inline __host__ __device__ uint3 operator+(uint3 a, uint b) {
 388 | 	a.x += b; a.y += b; a.z += b;
 389 | 	return a;
 390 | }
 391 | inline __host__ __device__ void operator+=(uint3 &a, uint b) {
 392 | 	const uint3 sum = make_uint3(a.x + b, a.y + b, a.z + b);
 393 | 	a = sum;
 394 | }
 395 | // scalar + 3-component vector: the scalar is added to every component
 396 | inline __host__ __device__ int3 operator+(int b, int3 a) {
 397 | 	a.x += b; a.y += b; a.z += b;
 398 | 	return a;
 399 | }
 400 | inline __host__ __device__ uint3 operator+(uint b, uint3 a) {
 401 | 	a.x += b; a.y += b; a.z += b;
 402 | 	return a;
 403 | }
 404 | inline __host__ __device__ float3 operator+(float b, float3 a) {
 405 | 	a.x += b; a.y += b; a.z += b;
 406 | 	return a;
 407 | }
 408 | // float4 addition (vector + vector, vector + scalar, scalar + vector, and the in-place forms)
 409 | inline __host__ __device__ float4 operator+(float4 a, float4 b) {
 410 | 	a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
 411 | 	return a;
 412 | }
 413 | inline __host__ __device__ void operator+=(float4 &a, float4 b) {
 414 | 	const float4 sum = make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
 415 | 	a = sum;
 416 | }
 417 | inline __host__ __device__ float4 operator+(float4 a, float b) {
 418 | 	a.x += b; a.y += b; a.z += b; a.w += b;
 419 | 	return a;
 420 | }
 421 | inline __host__ __device__ float4 operator+(float b, float4 a) {
 422 | 	a.x += b; a.y += b; a.z += b; a.w += b;
 423 | 	return a;
 424 | }
 425 | inline __host__ __device__ void operator+=(float4 &a, float b) {
 426 | 	const float4 sum = make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
 427 | 	a = sum;
 428 | }
 429 | // int4 addition (vector + vector, vector + scalar, scalar + vector, and the in-place forms)
 430 | inline __host__ __device__ int4 operator+(int4 a, int4 b) {
 431 | 	a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
 432 | 	return a;
 433 | }
 434 | inline __host__ __device__ void operator+=(int4 &a, int4 b) {
 435 | 	const int4 sum = make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
 436 | 	a = sum;
 437 | }
 438 | inline __host__ __device__ int4 operator+(int4 a, int b) {
 439 | 	a.x += b; a.y += b; a.z += b; a.w += b;
 440 | 	return a;
 441 | }
 442 | inline __host__ __device__ int4 operator+(int b, int4 a) {
 443 | 	a.x += b; a.y += b; a.z += b; a.w += b;
 444 | 	return a;
 445 | }
 446 | inline __host__ __device__ void operator+=(int4 &a, int b) {
 447 | 	const int4 sum = make_int4(a.x + b, a.y + b, a.z + b, a.w + b);
 448 | 	a = sum;
 449 | }
 450 | // uint4 addition (vector + vector, vector + scalar, scalar + vector, and the in-place forms)
 451 | inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) {
 452 | 	a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
 453 | 	return a;
 454 | }
 455 | inline __host__ __device__ void operator+=(uint4 &a, uint4 b) {
 456 | 	const uint4 sum = make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
 457 | 	a = sum;
 458 | }
 459 | inline __host__ __device__ uint4 operator+(uint4 a, uint b) {
 460 | 	a.x += b; a.y += b; a.z += b; a.w += b;
 461 | 	return a;
 462 | }
 463 | inline __host__ __device__ uint4 operator+(uint b, uint4 a) {
 464 | 	a.x += b; a.y += b; a.z += b; a.w += b;
 465 | 	return a;
 466 | }
 467 | inline __host__ __device__ void operator+=(uint4 &a, uint b) {
 468 | 	const uint4 sum = make_uint4(a.x + b, a.y + b, a.z + b, a.w + b);
 469 | 	a = sum;
 470 | }
 471 | 
 472 | ////////////////////////////////////////////////////////////////////////////////
 473 | // subtract: component-wise; with a scalar operand the scalar takes part in every component
 474 | ////////////////////////////////////////////////////////////////////////////////
 475 | 
 476 | inline __host__ __device__ float2 operator-(float2 a, float2 b) {
 477 | 	a.x -= b.x; a.y -= b.y; // a is a by-value copy, so it doubles as the result
 478 | 	return a;
 479 | }
 480 | inline __host__ __device__ void operator-=(float2 &a, float2 b) {
 481 | 	const float2 diff = make_float2(a.x - b.x, a.y - b.y);
 482 | 	a = diff;
 483 | }
 484 | inline __host__ __device__ float2 operator-(float2 a, float b) {
 485 | 	a.x -= b; a.y -= b;
 486 | 	return a;
 487 | }
 488 | inline __host__ __device__ float2 operator-(float b, float2 a) {
 489 | 	a.x = b - a.x; a.y = b - a.y; // scalar minus each component
 490 | 	return a;
 491 | }
 492 | inline __host__ __device__ void operator-=(float2 &a, float b) {
 493 | 	const float2 diff = make_float2(a.x - b, a.y - b);
 494 | 	a = diff;
 495 | }
 496 | // int2 subtraction (vector - vector, vector - scalar, scalar - vector, and the in-place forms)
 497 | inline __host__ __device__ int2 operator-(int2 a, int2 b) {
 498 | 	a.x -= b.x; a.y -= b.y;
 499 | 	return a;
 500 | }
 501 | inline __host__ __device__ void operator-=(int2 &a, int2 b) {
 502 | 	const int2 diff = make_int2(a.x - b.x, a.y - b.y);
 503 | 	a = diff;
 504 | }
 505 | inline __host__ __device__ int2 operator-(int2 a, int b) {
 506 | 	a.x -= b; a.y -= b;
 507 | 	return a;
 508 | }
 509 | inline __host__ __device__ int2 operator-(int b, int2 a) {
 510 | 	a.x = b - a.x; a.y = b - a.y;
 511 | 	return a;
 512 | }
 513 | inline __host__ __device__ void operator-=(int2 &a, int b) {
 514 | 	const int2 diff = make_int2(a.x - b, a.y - b);
 515 | 	a = diff;
 516 | }
 517 | // uint2 subtraction (vector - vector, vector - scalar, scalar - vector, and the in-place forms)
 518 | inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) {
 519 | 	a.x -= b.x; a.y -= b.y;
 520 | 	return a;
 521 | }
 522 | inline __host__ __device__ void operator-=(uint2 &a, uint2 b) {
 523 | 	const uint2 diff = make_uint2(a.x - b.x, a.y - b.y);
 524 | 	a = diff;
 525 | }
 526 | inline __host__ __device__ uint2 operator-(uint2 a, uint b) {
 527 | 	a.x -= b; a.y -= b;
 528 | 	return a;
 529 | }
 530 | inline __host__ __device__ uint2 operator-(uint b, uint2 a) {
 531 | 	a.x = b - a.x; a.y = b - a.y;
 532 | 	return a;
 533 | }
 534 | inline __host__ __device__ void operator-=(uint2 &a, uint b) {
 535 | 	const uint2 diff = make_uint2(a.x - b, a.y - b);
 536 | 	a = diff;
 537 | }
 538 | // float3 subtraction (vector - vector, vector - scalar, scalar - vector, and the in-place forms)
 539 | inline __host__ __device__ float3 operator-(float3 a, float3 b) {
 540 | 	a.x -= b.x; a.y -= b.y; a.z -= b.z;
 541 | 	return a;
 542 | }
 543 | inline __host__ __device__ void operator-=(float3 &a, float3 b) {
 544 | 	const float3 diff = make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
 545 | 	a = diff;
 546 | }
 547 | inline __host__ __device__ float3 operator-(float3 a, float b) {
 548 | 	a.x -= b; a.y -= b; a.z -= b;
 549 | 	return a;
 550 | }
 551 | inline __host__ __device__ float3 operator-(float b, float3 a) {
 552 | 	a.x = b - a.x; a.y = b - a.y; a.z = b - a.z;
 553 | 	return a;
 554 | }
 555 | inline __host__ __device__ void operator-=(float3 &a, float b) {
 556 | 	const float3 diff = make_float3(a.x - b, a.y - b, a.z - b);
 557 | 	a = diff;
 558 | }
 559 | // int3 subtraction (vector - vector, vector - scalar, scalar - vector, and the in-place forms)
 560 | inline __host__ __device__ int3 operator-(int3 a, int3 b) {
 561 | 	a.x -= b.x; a.y -= b.y; a.z -= b.z;
 562 | 	return a;
 563 | }
 564 | inline __host__ __device__ void operator-=(int3 &a, int3 b) {
 565 | 	const int3 diff = make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
 566 | 	a = diff;
 567 | }
 568 | inline __host__ __device__ int3 operator-(int3 a, int b) {
 569 | 	a.x -= b; a.y -= b; a.z -= b;
 570 | 	return a;
 571 | }
 572 | inline __host__ __device__ int3 operator-(int b, int3 a) {
 573 | 	a.x = b - a.x; a.y = b - a.y; a.z = b - a.z;
 574 | 	return a;
 575 | }
 576 | inline __host__ __device__ void operator-=(int3 &a, int b) {
 577 | 	const int3 diff = make_int3(a.x - b, a.y - b, a.z - b);
 578 | 	a = diff;
 579 | }
 580 | // uint3 subtraction (vector - vector, vector - scalar, scalar - vector, and the in-place forms)
 581 | inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) {
 582 | 	a.x -= b.x; a.y -= b.y; a.z -= b.z;
 583 | 	return a;
 584 | }
 585 | inline __host__ __device__ void operator-=(uint3 &a, uint3 b) {
 586 | 	const uint3 diff = make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
 587 | 	a = diff;
 588 | }
 589 | inline __host__ __device__ uint3 operator-(uint3 a, uint b) {
 590 | 	a.x -= b; a.y -= b; a.z -= b;
 591 | 	return a;
 592 | }
 593 | inline __host__ __device__ uint3 operator-(uint b, uint3 a) {
 594 | 	a.x = b - a.x; a.y = b - a.y; a.z = b - a.z;
 595 | 	return a;
 596 | }
 597 | inline __host__ __device__ void operator-=(uint3 &a, uint b) {
 598 | 	const uint3 diff = make_uint3(a.x - b, a.y - b, a.z - b);
 599 | 	a = diff;
 600 | }
 601 | 
 602 | inline __host__ __device__ float4 operator-(float4 a, float4 b)
 603 | {
 604 | 	return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
 605 | }
 606 | inline __host__ __device__ void operator-=(float4 &a, float4 b)
 607 | {
 608 | 	a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
 609 | }
 610 | inline __host__ __device__ float4 operator-(float4 a, float b)
 611 | {
 612 | 	return make_float4(a.x - b, a.y - b, a.z - b, a.w - b);
 613 | }
 614 | inline __host__ __device__ void operator-=(float4 &a, float b)
 615 | {
 616 | 	a.x -= b; a.y -= b; a.z -= b; a.w -= b;
 617 | }
 618 | 
 619 | inline __host__ __device__ int4 operator-(int4 a, int4 b)
 620 | {
 621 | 	return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
 622 | }
 623 | inline __host__ __device__ void operator-=(int4 &a, int4 b)
 624 | {
 625 | 	a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
 626 | }
 627 | inline __host__ __device__ int4 operator-(int4 a, int b)
 628 | {
 629 | 	return make_int4(a.x - b, a.y - b, a.z - b, a.w - b);
 630 | }
 631 | inline __host__ __device__ int4 operator-(int b, int4 a)
 632 | {
 633 | 	return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
 634 | }
 635 | inline __host__ __device__ void operator-=(int4 &a, int b)
 636 | {
 637 | 	a.x -= b; a.y -= b; a.z -= b; a.w -= b;
 638 | }
 639 | 
 640 | inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
 641 | {
 642 | 	return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
 643 | }
 644 | inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
 645 | {
 646 | 	a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
 647 | }
 648 | inline __host__ __device__ uint4 operator-(uint4 a, uint b)
 649 | {
 650 | 	return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b);
 651 | }
 652 | inline __host__ __device__ uint4 operator-(uint b, uint4 a)
 653 | {
 654 | 	return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
 655 | }
 656 | inline __host__ __device__ void operator-=(uint4 &a, uint b)
 657 | {
 658 | 	a.x -= b; a.y -= b; a.z -= b; a.w -= b;
 659 | }
 660 | 
 661 | ////////////////////////////////////////////////////////////////////////////////
 662 | // multiply
 663 | ////////////////////////////////////////////////////////////////////////////////
 664 | 
 665 | inline __host__ __device__ float2 operator*(float2 a, float2 b)
 666 | {
 667 | 	return make_float2(a.x * b.x, a.y * b.y);
 668 | }
 669 | inline __host__ __device__ void operator*=(float2 &a, float2 b)
 670 | {
 671 | 	a.x *= b.x; a.y *= b.y;
 672 | }
 673 | inline __host__ __device__ float2 operator*(float2 a, float b)
 674 | {
 675 | 	return make_float2(a.x * b, a.y * b);
 676 | }
 677 | inline __host__ __device__ float2 operator*(float b, float2 a)
 678 | {
 679 | 	return make_float2(b * a.x, b * a.y);
 680 | }
 681 | inline __host__ __device__ void operator*=(float2 &a, float b)
 682 | {
 683 | 	a.x *= b; a.y *= b;
 684 | }
 685 | 
 686 | inline __host__ __device__ int2 operator*(int2 a, int2 b)
 687 | {
 688 | 	return make_int2(a.x * b.x, a.y * b.y);
 689 | }
 690 | inline __host__ __device__ void operator*=(int2 &a, int2 b)
 691 | {
 692 | 	a.x *= b.x; a.y *= b.y;
 693 | }
 694 | inline __host__ __device__ int2 operator*(int2 a, int b)
 695 | {
 696 | 	return make_int2(a.x * b, a.y * b);
 697 | }
 698 | inline __host__ __device__ int2 operator*(int b, int2 a)
 699 | {
 700 | 	return make_int2(b * a.x, b * a.y);
 701 | }
 702 | inline __host__ __device__ void operator*=(int2 &a, int b)
 703 | {
 704 | 	a.x *= b; a.y *= b;
 705 | }
 706 | 
 707 | inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
 708 | {
 709 | 	return make_uint2(a.x * b.x, a.y * b.y);
 710 | }
 711 | inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
 712 | {
 713 | 	a.x *= b.x; a.y *= b.y;
 714 | }
 715 | inline __host__ __device__ uint2 operator*(uint2 a, uint b)
 716 | {
 717 | 	return make_uint2(a.x * b, a.y * b);
 718 | }
 719 | inline __host__ __device__ uint2 operator*(uint b, uint2 a)
 720 | {
 721 | 	return make_uint2(b * a.x, b * a.y);
 722 | }
 723 | inline __host__ __device__ void operator*=(uint2 &a, uint b)
 724 | {
 725 | 	a.x *= b; a.y *= b;
 726 | }
 727 | 
 728 | inline __host__ __device__ float3 operator*(float3 a, float3 b)
 729 | {
 730 | 	return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
 731 | }
 732 | inline __host__ __device__ void operator*=(float3 &a, float3 b)
 733 | {
 734 | 	a.x *= b.x; a.y *= b.y; a.z *= b.z;
 735 | }
 736 | inline __host__ __device__ float3 operator*(float3 a, float b)
 737 | {
 738 | 	return make_float3(a.x * b, a.y * b, a.z * b);
 739 | }
 740 | inline __host__ __device__ float3 operator*(float b, float3 a)
 741 | {
 742 | 	return make_float3(b * a.x, b * a.y, b * a.z);
 743 | }
 744 | inline __host__ __device__ void operator*=(float3 &a, float b)
 745 | {
 746 | 	a.x *= b; a.y *= b; a.z *= b;
 747 | }
 748 | 
 749 | inline __host__ __device__ int3 operator*(int3 a, int3 b)
 750 | {
 751 | 	return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
 752 | }
 753 | inline __host__ __device__ void operator*=(int3 &a, int3 b)
 754 | {
 755 | 	a.x *= b.x; a.y *= b.y; a.z *= b.z;
 756 | }
 757 | inline __host__ __device__ int3 operator*(int3 a, int b)
 758 | {
 759 | 	return make_int3(a.x * b, a.y * b, a.z * b);
 760 | }
 761 | inline __host__ __device__ int3 operator*(int b, int3 a)
 762 | {
 763 | 	return make_int3(b * a.x, b * a.y, b * a.z);
 764 | }
 765 | inline __host__ __device__ void operator*=(int3 &a, int b)
 766 | {
 767 | 	a.x *= b; a.y *= b; a.z *= b;
 768 | }
 769 | 
 770 | inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
 771 | {
 772 | 	return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
 773 | }
 774 | inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
 775 | {
 776 | 	a.x *= b.x; a.y *= b.y; a.z *= b.z;
 777 | }
 778 | inline __host__ __device__ uint3 operator*(uint3 a, uint b)
 779 | {
 780 | 	return make_uint3(a.x * b, a.y * b, a.z * b);
 781 | }
 782 | inline __host__ __device__ uint3 operator*(uint b, uint3 a)
 783 | {
 784 | 	return make_uint3(b * a.x, b * a.y, b * a.z);
 785 | }
 786 | inline __host__ __device__ void operator*=(uint3 &a, uint b)
 787 | {
 788 | 	a.x *= b; a.y *= b; a.z *= b;
 789 | }
 790 | 
 791 | inline __host__ __device__ float4 operator*(float4 a, float4 b)
 792 | {
 793 | 	return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
 794 | }
 795 | inline __host__ __device__ void operator*=(float4 &a, float4 b)
 796 | {
 797 | 	a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
 798 | }
 799 | inline __host__ __device__ float4 operator*(float4 a, float b)
 800 | {
 801 | 	return make_float4(a.x * b, a.y * b, a.z * b, a.w * b);
 802 | }
 803 | inline __host__ __device__ float4 operator*(float b, float4 a)
 804 | {
 805 | 	return make_float4(b * a.x, b * a.y, b * a.z, b * a.w);
 806 | }
 807 | inline __host__ __device__ void operator*=(float4 &a, float b)
 808 | {
 809 | 	a.x *= b; a.y *= b; a.z *= b; a.w *= b;
 810 | }
 811 | 
 812 | inline __host__ __device__ int4 operator*(int4 a, int4 b)
 813 | {
 814 | 	return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
 815 | }
 816 | inline __host__ __device__ void operator*=(int4 &a, int4 b)
 817 | {
 818 | 	a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
 819 | }
 820 | inline __host__ __device__ int4 operator*(int4 a, int b)
 821 | {
 822 | 	return make_int4(a.x * b, a.y * b, a.z * b, a.w * b);
 823 | }
 824 | inline __host__ __device__ int4 operator*(int b, int4 a)
 825 | {
 826 | 	return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
 827 | }
 828 | inline __host__ __device__ void operator*=(int4 &a, int b)
 829 | {
 830 | 	a.x *= b; a.y *= b; a.z *= b; a.w *= b;
 831 | }
 832 | 
 833 | inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
 834 | {
 835 | 	return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
 836 | }
 837 | inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
 838 | {
 839 | 	a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
 840 | }
 841 | inline __host__ __device__ uint4 operator*(uint4 a, uint b)
 842 | {
 843 | 	return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b);
 844 | }
 845 | inline __host__ __device__ uint4 operator*(uint b, uint4 a)
 846 | {
 847 | 	return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
 848 | }
 849 | inline __host__ __device__ void operator*=(uint4 &a, uint b)
 850 | {
 851 | 	a.x *= b; a.y *= b; a.z *= b; a.w *= b;
 852 | }
 853 | 
 854 | ////////////////////////////////////////////////////////////////////////////////
 855 | // divide
 856 | ////////////////////////////////////////////////////////////////////////////////
 857 | 
 858 | inline __host__ __device__ float2 operator/(float2 a, float2 b)
 859 | {
 860 | 	return make_float2(a.x / b.x, a.y / b.y);
 861 | }
 862 | inline __host__ __device__ void operator/=(float2 &a, float2 b)
 863 | {
 864 | 	a.x /= b.x; a.y /= b.y;
 865 | }
 866 | inline __host__ __device__ float2 operator/(float2 a, float b)
 867 | {
 868 | 	return make_float2(a.x / b, a.y / b);
 869 | }
 870 | inline __host__ __device__ void operator/=(float2 &a, float b)
 871 | {
 872 | 	a.x /= b; a.y /= b;
 873 | }
 874 | inline __host__ __device__ float2 operator/(float b, float2 a)
 875 | {
 876 | 	return make_float2(b / a.x, b / a.y);
 877 | }
 878 | 
 879 | inline __host__ __device__ float3 operator/(float3 a, float3 b)
 880 | {
 881 | 	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
 882 | }
 883 | inline __host__ __device__ void operator/=(float3 &a, float3 b)
 884 | {
 885 | 	a.x /= b.x; a.y /= b.y; a.z /= b.z;
 886 | }
 887 | inline __host__ __device__ float3 operator/(float3 a, float b)
 888 | {
 889 | 	return make_float3(a.x / b, a.y / b, a.z / b);
 890 | }
 891 | inline __host__ __device__ void operator/=(float3 &a, float b)
 892 | {
 893 | 	a.x /= b; a.y /= b; a.z /= b;
 894 | }
 895 | inline __host__ __device__ float3 operator/(float b, float3 a)
 896 | {
 897 | 	return make_float3(b / a.x, b / a.y, b / a.z);
 898 | }
 899 | 
 900 | inline __host__ __device__ float4 operator/(float4 a, float4 b)
 901 | {
 902 | 	return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
 903 | }
 904 | inline __host__ __device__ void operator/=(float4 &a, float4 b)
 905 | {
 906 | 	a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
 907 | }
 908 | inline __host__ __device__ float4 operator/(float4 a, float b)
 909 | {
 910 | 	return make_float4(a.x / b, a.y / b, a.z / b, a.w / b);
 911 | }
 912 | inline __host__ __device__ void operator/=(float4 &a, float b)
 913 | {
 914 | 	a.x /= b; a.y /= b; a.z /= b; a.w /= b;
 915 | }
 916 | inline __host__ __device__ float4 operator/(float b, float4 a){
 917 | 	return make_float4(b / a.x, b / a.y, b / a.z, b / a.w);
 918 | }
 919 | 
 920 | ////////////////////////////////////////////////////////////////////////////////
 921 | // min
 922 | ////////////////////////////////////////////////////////////////////////////////
 923 | 
 924 | inline  __host__ __device__ float2 fminf(float2 a, float2 b)
 925 | {
 926 | 	return make_float2(fminf(a.x, b.x), fminf(a.y, b.y));
 927 | }
 928 | inline __host__ __device__ float3 fminf(float3 a, float3 b)
 929 | {
 930 | 	return make_float3(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z));
 931 | }
 932 | inline  __host__ __device__ float4 fminf(float4 a, float4 b)
 933 | {
 934 | 	return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
 935 | }
 936 | 
 937 | inline __host__ __device__ int2 min(int2 a, int2 b)
 938 | {
 939 | 	return make_int2(min(a.x, b.x), min(a.y, b.y));
 940 | }
 941 | inline __host__ __device__ int3 min(int3 a, int3 b)
 942 | {
 943 | 	return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
 944 | }
 945 | inline __host__ __device__ int4 min(int4 a, int4 b)
 946 | {
 947 | 	return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
 948 | }
 949 | 
 950 | inline __host__ __device__ uint2 min(uint2 a, uint2 b)
 951 | {
 952 | 	return make_uint2(min(a.x, b.x), min(a.y, b.y));
 953 | }
 954 | inline __host__ __device__ uint3 min(uint3 a, uint3 b)
 955 | {
 956 | 	return make_uint3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
 957 | }
 958 | inline __host__ __device__ uint4 min(uint4 a, uint4 b)
 959 | {
 960 | 	return make_uint4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
 961 | }
 962 | 
 963 | ////////////////////////////////////////////////////////////////////////////////
 964 | // max
 965 | ////////////////////////////////////////////////////////////////////////////////
 966 | 
 967 | inline __host__ __device__ float2 fmaxf(float2 a, float2 b)
 968 | {
 969 | 	return make_float2(fmaxf(a.x, b.x), fmaxf(a.y, b.y));
 970 | }
 971 | inline __host__ __device__ float3 fmaxf(float3 a, float3 b)
 972 | {
 973 | 	return make_float3(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z));
 974 | }
 975 | inline __host__ __device__ float4 fmaxf(float4 a, float4 b)
 976 | {
 977 | 	return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
 978 | }
 979 | 
 980 | inline __host__ __device__ int2 max(int2 a, int2 b)
 981 | {
 982 | 	return make_int2(max(a.x, b.x), max(a.y, b.y));
 983 | }
 984 | inline __host__ __device__ int3 max(int3 a, int3 b)
 985 | {
 986 | 	return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
 987 | }
 988 | inline __host__ __device__ int4 max(int4 a, int4 b)
 989 | {
 990 | 	return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
 991 | }
 992 | 
 993 | inline __host__ __device__ uint2 max(uint2 a, uint2 b)
 994 | {
 995 | 	return make_uint2(max(a.x, b.x), max(a.y, b.y));
 996 | }
 997 | inline __host__ __device__ uint3 max(uint3 a, uint3 b)
 998 | {
 999 | 	return make_uint3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
1000 | }
1001 | inline __host__ __device__ uint4 max(uint4 a, uint4 b)
1002 | {
1003 | 	return make_uint4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
1004 | }
1005 | 
1006 | ////////////////////////////////////////////////////////////////////////////////
1007 | // lerp
1008 | // - linear interpolation between a and b, based on value t in [0, 1] range
1009 | ////////////////////////////////////////////////////////////////////////////////
1010 | 
1011 | inline __device__ __host__ float lerp(float a, float b, float t)
1012 | {
1013 | 	return a + t*(b - a);
1014 | }
1015 | inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
1016 | {
1017 | 	return a + t*(b - a);
1018 | }
1019 | inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
1020 | {
1021 | 	return a + t*(b - a);
1022 | }
1023 | inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
1024 | {
1025 | 	return a + t*(b - a);
1026 | }
1027 | 
1028 | ////////////////////////////////////////////////////////////////////////////////
1029 | // clamp
1030 | // - clamp the value v to be in the range [a, b]
1031 | ////////////////////////////////////////////////////////////////////////////////
1032 | 
1033 | inline __device__ __host__ float clamp(float f, float a, float b)
1034 | {
1035 | 	return fmaxf(a, fminf(f, b));
1036 | }
1037 | inline __device__ __host__ int clamp(int f, int a, int b)
1038 | {
1039 | 	return max(a, min(f, b));
1040 | }
1041 | inline __device__ __host__ uint clamp(uint f, uint a, uint b)
1042 | {
1043 | 	return max(a, min(f, b));
1044 | }
1045 | 
1046 | inline __device__ __host__ float2 clamp(float2 v, float a, float b)
1047 | {
1048 | 	return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
1049 | }
1050 | inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
1051 | {
1052 | 	return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
1053 | }
1054 | inline __device__ __host__ float3 clamp(float3 v, float a, float b)
1055 | {
1056 | 	return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
1057 | }
1058 | inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
1059 | {
1060 | 	return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
1061 | }
1062 | inline __device__ __host__ float4 clamp(float4 v, float a, float b)
1063 | {
1064 | 	return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
1065 | }
1066 | inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
1067 | {
1068 | 	return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
1069 | }
1070 | 
1071 | inline __device__ __host__ int2 clamp(int2 v, int a, int b)
1072 | {
1073 | 	return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
1074 | }
1075 | inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
1076 | {
1077 | 	return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
1078 | }
1079 | inline __device__ __host__ int3 clamp(int3 v, int a, int b)
1080 | {
1081 | 	return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
1082 | }
1083 | inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
1084 | {
1085 | 	return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
1086 | }
1087 | inline __device__ __host__ int4 clamp(int4 v, int a, int b)
1088 | {
1089 | 	return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
1090 | }
1091 | inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
1092 | {
1093 | 	return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
1094 | }
1095 | 
1096 | inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
1097 | {
1098 | 	return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
1099 | }
1100 | inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
1101 | {
1102 | 	return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
1103 | }
1104 | inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
1105 | {
1106 | 	return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
1107 | }
1108 | inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
1109 | {
1110 | 	return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
1111 | }
1112 | inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
1113 | {
1114 | 	return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
1115 | }
1116 | inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
1117 | {
1118 | 	return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
1119 | }
1120 | 
1121 | ////////////////////////////////////////////////////////////////////////////////
1122 | // dot product
1123 | ////////////////////////////////////////////////////////////////////////////////
1124 | 
1125 | inline __host__ __device__ float dot(float2 a, float2 b)
1126 | {
1127 | 	return a.x * b.x + a.y * b.y;
1128 | }
1129 | inline __host__ __device__ float dot(float3 a, float3 b)
1130 | {
1131 | 	return a.x * b.x + a.y * b.y + a.z * b.z;
1132 | }
1133 | inline __host__ __device__ float dot(float4 a, float4 b)
1134 | {
1135 | 	return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
1136 | }
1137 | 
1138 | inline __host__ __device__ int dot(int2 a, int2 b)
1139 | {
1140 | 	return a.x * b.x + a.y * b.y;
1141 | }
1142 | inline __host__ __device__ int dot(int3 a, int3 b)
1143 | {
1144 | 	return a.x * b.x + a.y * b.y + a.z * b.z;
1145 | }
1146 | inline __host__ __device__ int dot(int4 a, int4 b)
1147 | {
1148 | 	return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
1149 | }
1150 | 
1151 | inline __host__ __device__ uint dot(uint2 a, uint2 b)
1152 | {
1153 | 	return a.x * b.x + a.y * b.y;
1154 | }
1155 | inline __host__ __device__ uint dot(uint3 a, uint3 b)
1156 | {
1157 | 	return a.x * b.x + a.y * b.y + a.z * b.z;
1158 | }
1159 | inline __host__ __device__ uint dot(uint4 a, uint4 b)
1160 | {
1161 | 	return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
1162 | }
1163 | 
1164 | ////////////////////////////////////////////////////////////////////////////////
1165 | // length
1166 | ////////////////////////////////////////////////////////////////////////////////
1167 | 
1168 | inline __host__ __device__ float length(float2 v)
1169 | {
1170 | 	return sqrtf(dot(v, v));
1171 | }
1172 | inline __host__ __device__ float length(float3 v)
1173 | {
1174 | 	return sqrtf(dot(v, v));
1175 | }
1176 | inline __host__ __device__ float length(float4 v)
1177 | {
1178 | 	return sqrtf(dot(v, v));
1179 | }
1180 | 
1181 | ////////////////////////////////////////////////////////////////////////////////
1182 | // normalize
1183 | ////////////////////////////////////////////////////////////////////////////////
1184 | 
1185 | inline __host__ __device__ float2 normalize(float2 v)
1186 | {
1187 | 	float invLen = rsqrtf(dot(v, v));
1188 | 	return v * invLen;
1189 | }
1190 | inline __host__ __device__ float3 normalize(float3 v)
1191 | {
1192 | 	float invLen = rsqrtf(dot(v, v));
1193 | 	return v * invLen;
1194 | }
1195 | inline __host__ __device__ float4 normalize(float4 v)
1196 | {
1197 | 	float invLen = rsqrtf(dot(v, v));
1198 | 	return v * invLen;
1199 | }
1200 | 
1201 | ////////////////////////////////////////////////////////////////////////////////
1202 | // floor
1203 | ////////////////////////////////////////////////////////////////////////////////
1204 | 
1205 | inline __host__ __device__ float2 floorf(float2 v)
1206 | {
1207 | 	return make_float2(floorf(v.x), floorf(v.y));
1208 | }
1209 | inline __host__ __device__ float3 floorf(float3 v)
1210 | {
1211 | 	return make_float3(floorf(v.x), floorf(v.y), floorf(v.z));
1212 | }
1213 | inline __host__ __device__ float4 floorf(float4 v)
1214 | {
1215 | 	return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
1216 | }
1217 | 
1218 | ////////////////////////////////////////////////////////////////////////////////
1219 | // frac - returns the fractional portion of a scalar or each vector component
1220 | ////////////////////////////////////////////////////////////////////////////////
1221 | 
1222 | inline __host__ __device__ float fracf(float v)
1223 | {
1224 | 	return v - floorf(v);
1225 | }
1226 | inline __host__ __device__ float2 fracf(float2 v)
1227 | {
1228 | 	return make_float2(fracf(v.x), fracf(v.y));
1229 | }
1230 | inline __host__ __device__ float3 fracf(float3 v)
1231 | {
1232 | 	return make_float3(fracf(v.x), fracf(v.y), fracf(v.z));
1233 | }
1234 | inline __host__ __device__ float4 fracf(float4 v)
1235 | {
1236 | 	return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
1237 | }
1238 | 
1239 | ////////////////////////////////////////////////////////////////////////////////
1240 | // fmod
1241 | ////////////////////////////////////////////////////////////////////////////////
1242 | 
1243 | inline __host__ __device__ float2 fmodf(float2 a, float2 b)
1244 | {
1245 | 	return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y));
1246 | }
1247 | inline __host__ __device__ float3 fmodf(float3 a, float3 b)
1248 | {
1249 | 	return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
1250 | }
1251 | inline __host__ __device__ float4 fmodf(float4 a, float4 b)
1252 | {
1253 | 	return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w));
1254 | }
1255 | 
1256 | ////////////////////////////////////////////////////////////////////////////////
1257 | // absolute value
1258 | ////////////////////////////////////////////////////////////////////////////////
1259 | 
1260 | inline __host__ __device__ float2 fabs(float2 v)
1261 | {
1262 | 	return make_float2(fabs(v.x), fabs(v.y));
1263 | }
1264 | inline __host__ __device__ float3 fabs(float3 v)
1265 | {
1266 | 	return make_float3(fabs(v.x), fabs(v.y), fabs(v.z));
1267 | }
1268 | inline __host__ __device__ float4 fabs(float4 v)
1269 | {
1270 | 	return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
1271 | }
1272 | 
1273 | inline __host__ __device__ int2 abs(int2 v)
1274 | {
1275 | 	return make_int2(abs(v.x), abs(v.y));
1276 | }
1277 | inline __host__ __device__ int3 abs(int3 v)
1278 | {
1279 | 	return make_int3(abs(v.x), abs(v.y), abs(v.z));
1280 | }
1281 | inline __host__ __device__ int4 abs(int4 v)
1282 | {
1283 | 	return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
1284 | }
1285 | 
1286 | ////////////////////////////////////////////////////////////////////////////////
1287 | // reflect
1288 | // - returns reflection of incident ray I around surface normal N
1289 | // - N should be normalized, reflected vector's length is equal to length of I
1290 | ////////////////////////////////////////////////////////////////////////////////
1291 | 
1292 | inline __host__ __device__ float3 reflect(float3 i, float3 n)
1293 | {
1294 | 	return i - 2.0f * n * dot(n, i);
1295 | }
1296 | 
1297 | ////////////////////////////////////////////////////////////////////////////////
1298 | // cross product
1299 | ////////////////////////////////////////////////////////////////////////////////
1300 | 
1301 | inline __host__ __device__ float3 cross(float3 a, float3 b)
1302 | {
1303 | 	return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
1304 | }
1305 | 
1306 | ////////////////////////////////////////////////////////////////////////////////
1307 | // smoothstep
1308 | // - returns 0 if x < a
1309 | // - returns 1 if x > b
1310 | // - otherwise returns smooth interpolation between 0 and 1 based on x
1311 | ////////////////////////////////////////////////////////////////////////////////
1312 | 
1313 | inline __device__ __host__ float smoothstep(float a, float b, float x)
1314 | {
1315 | 	float y = clamp((x - a) / (b - a), 0.0f, 1.0f);
1316 | 	return (y*y*(3.0f - (2.0f*y)));
1317 | }
1318 | inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x)
1319 | {
1320 | 	float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
1321 | 	return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y)));
1322 | }
1323 | inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x)
1324 | {
1325 | 	float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
1326 | 	return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y)));
1327 | }
1328 | inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x)
1329 | {
1330 | 	float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
1331 | 	return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y)));
1332 | }
1333 | 
1334 | #endif
1335 | 


--------------------------------------------------------------------------------
/dragonDOF2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/straaljager/GPU-path-tracing-with-CUDA-tutorial-3/0163f7080e40d19647b748a0e578843427441e85/dragonDOF2.png


--------------------------------------------------------------------------------
/dragonDOF3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/straaljager/GPU-path-tracing-with-CUDA-tutorial-3/0163f7080e40d19647b748a0e578843427441e85/dragonDOF3.png


--------------------------------------------------------------------------------
/dragonDOF4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/straaljager/GPU-path-tracing-with-CUDA-tutorial-3/0163f7080e40d19647b748a0e578843427441e85/dragonDOF4.png


--------------------------------------------------------------------------------
/geometry.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | *  CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
 3 | *  BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras,
 4 | *  http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html
 5 | *
 6 | *  This program is free software; you can redistribute it and/or modify
 7 | *  it under the terms of the GNU General Public License as published by
 8 | *  the Free Software Foundation; either version 2 of the License, or
 9 | *  (at your option) any later version.
10 | *
11 | *  This program is distributed in the hope that it will be useful,
12 | *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | *  GNU General Public License for more details.
15 | *
16 | *  You should have received a copy of the GNU General Public License
17 | *  along with this program; if not, write to the Free Software
18 | *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19 | */
20 | #ifndef __GEOMETRY_H_
21 | #define __GEOMETRY_H_
22 | 
23 | #include "linear_algebra.h"
24 | 
25 | struct Vertex : public Vector3Df
26 | {
27 | 	// normal vector of this vertex
28 | 	Vector3Df _normal;
29 | 	// ambient occlusion of this vertex (pre-calculated in e.g. MeshLab)
30 | 	float _ambientOcclusionCoeff;
31 | 
32 | 	Vertex(float x, float y, float z, float nx, float ny, float nz, float amb = 60.f)
33 | 		:
34 | 		Vector3Df(x, y, z), _normal(Vector3Df(nx, ny, nz)), _ambientOcclusionCoeff(amb)
35 | 	{
36 | 		// assert |nx,ny,nz| = 1
37 | 	}
38 | };
39 | 
// Plain data record for one mesh triangle: its three vertex indices plus
// per-triangle values (color, center, normal, intersection cache, bounds).
// The struct declares no constructor, so nothing here initializes the fields.
struct Triangle {
	// indexes in vertices array
	unsigned _idx1;
	unsigned _idx2;
	unsigned _idx3;
	// RGB Color Vector3Df 
	Vector3Df _colorf;
	// Center point
	Vector3Df _center;
	// triangle normal
	Vector3Df _normal;
	// ignore back-face culling flag
	bool _twoSided;
	// Raytracing intersection pre-computed cache:
	// NOTE(review): the exact meaning of _d.._d3 and _e1.._e3 is defined by
	// the code that fills them, which is not visible here — confirm there.
	float _d, _d1, _d2, _d3;
	Vector3Df _e1, _e2, _e3;
	// bounding box
	// presumably _bottom is the minimum corner and _top the maximum — TODO confirm
	Vector3Df _bottom;
	Vector3Df _top;
};
60 | 
61 | #endif 
62 | 


--------------------------------------------------------------------------------
/golddragon3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/straaljager/GPU-path-tracing-with-CUDA-tutorial-3/0163f7080e40d19647b748a0e578843427441e85/golddragon3.png


--------------------------------------------------------------------------------
/golddragon4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/straaljager/GPU-path-tracing-with-CUDA-tutorial-3/0163f7080e40d19647b748a0e578843427441e85/golddragon4.png


--------------------------------------------------------------------------------
/linear_algebra.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | *  CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
 3 | *  BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras,
 4 | *  http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html
 5 | *
 6 | *  This program is free software; you can redistribute it and/or modify
 7 | *  it under the terms of the GNU General Public License as published by
 8 | *  the Free Software Foundation; either version 2 of the License, or
 9 | *  (at your option) any later version.
10 | *
11 | *  This program is distributed in the hope that it will be useful,
12 | *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | *  GNU General Public License for more details.
15 | *
16 | *  You should have received a copy of the GNU General Public License
17 | *  along with this program; if not, write to the Free Software
18 | *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19 | */
20 | #ifndef __LINEAR_ALGEBRA_H_
21 | #define __LINEAR_ALGEBRA_H_
22 | 
23 | #include <cuda_runtime.h> // for __host__  __device__
24 | #include <math.h>
25 | 
26 | struct Vector3Df
27 | {
28 | 	union {
29 | 		struct { float x, y, z; };
30 | 		float _v[3];
31 | 	};
32 | 
33 | 	__host__ __device__ Vector3Df(float _x = 0, float _y = 0, float _z = 0) : x(_x), y(_y), z(_z) {}
34 | 	__host__ __device__ Vector3Df(const Vector3Df& v) : x(v.x), y(v.y), z(v.z) {}
35 | 	__host__ __device__ Vector3Df(const float4& v) : x(v.x), y(v.y), z(v.z) {}
36 | 	inline __host__ __device__ float length(){ return sqrtf(x*x + y*y + z*z); }
37 | 	// sometimes we dont need the sqrt, we are just comparing one length with another
38 | 	inline __host__ __device__ float lengthsq(){ return x*x + y*y + z*z; }
39 | 	inline __host__ __device__ void normalize(){ float norm = sqrtf(x*x + y*y + z*z); x /= norm; y /= norm; z /= norm; }
40 | 	inline __host__ __device__ Vector3Df& operator+=(const Vector3Df& v){ x += v.x; y += v.y; z += v.z; return *this; }
41 | 	inline __host__ __device__ Vector3Df& operator-=(const Vector3Df& v){ x -= v.x; y -= v.y; z -= v.z; return *this; }
42 | 	inline __host__ __device__ Vector3Df& operator*=(const float& a){ x *= a; y *= a; z *= a; return *this; }
43 | 	inline __host__ __device__ Vector3Df& operator*=(const Vector3Df& v){ x *= v.x; y *= v.y; z *= v.z; return *this; }
44 | 	inline __host__ __device__ Vector3Df operator*(float a) const{ return Vector3Df(x*a, y*a, z*a); }
45 | 	inline __host__ __device__ Vector3Df operator/(float a) const{ return Vector3Df(x/a, y/a, z/a); }
46 | 	inline __host__ __device__ Vector3Df operator*(const Vector3Df& v) const{ return Vector3Df(x * v.x, y * v.y, z * v.z); }
47 | 	inline __host__ __device__ Vector3Df operator+(const Vector3Df& v) const{ return Vector3Df(x + v.x, y + v.y, z + v.z); }
48 | 	inline __host__ __device__ Vector3Df operator-(const Vector3Df& v) const{ return Vector3Df(x - v.x, y - v.y, z - v.z); }
49 | 	inline __host__ __device__ Vector3Df& operator/=(const float& a){ x /= a; y /= a; z /= a; return *this; }
50 | 	inline __host__ __device__ bool operator!=(const Vector3Df& v){ return x != v.x || y != v.y || z != v.z; }
51 | };
52 | 
53 | 
54 | inline __host__ __device__ Vector3Df min3(const Vector3Df& v1, const Vector3Df& v2){ return Vector3Df(v1.x < v2.x ? v1.x : v2.x, v1.y < v2.y ? v1.y : v2.y, v1.z < v2.z ? v1.z : v2.z); }
55 | inline __host__ __device__ Vector3Df max3(const Vector3Df& v1, const Vector3Df& v2){ return Vector3Df(v1.x > v2.x ? v1.x : v2.x, v1.y > v2.y ? v1.y : v2.y, v1.z > v2.z ? v1.z : v2.z); }
56 | inline __host__ __device__ Vector3Df cross(const Vector3Df& v1, const Vector3Df& v2){ return Vector3Df(v1.y*v2.z - v1.z*v2.y, v1.z*v2.x - v1.x*v2.z, v1.x*v2.y - v1.y*v2.x); }
57 | inline __host__ __device__ float dot(const Vector3Df& v1, const Vector3Df& v2){ return v1.x*v2.x + v1.y*v2.y + v1.z*v2.z; }
58 | inline __host__ __device__ float dot(const Vector3Df& v1, const float4& v2){ return v1.x*v2.x + v1.y*v2.y + v1.z*v2.z; }
59 | inline __host__ __device__ float dot(const float4& v1, const Vector3Df& v2){ return v1.x*v2.x + v1.y*v2.y + v1.z*v2.z; }
60 | inline __host__ __device__ float distancesq(const Vector3Df& v1, const Vector3Df& v2){ return (v1.x - v2.x)*(v1.x - v2.x) + (v1.y - v2.y)*(v1.y - v2.y) + (v1.z - v2.z)*(v1.z - v2.z); }
61 | inline __host__ __device__ float distance(const Vector3Df& v1, const Vector3Df& v2){ return sqrtf((v1.x - v2.x)*(v1.x - v2.x) + (v1.y - v2.y)*(v1.y - v2.y) + (v1.z - v2.z)*(v1.z - v2.z)); }
62 | 
63 | #endif
64 | 


--------------------------------------------------------------------------------
/loader.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | *  CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
  3 | *  BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras,
  4 | *  http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html
  5 | *
  6 | *  This program is free software; you can redistribute it and/or modify
  7 | *  it under the terms of the GNU General Public License as published by
  8 | *  the Free Software Foundation; either version 2 of the License, or
  9 | *  (at your option) any later version.
 10 | *
 11 | *  This program is distributed in the hope that it will be useful,
 12 | *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | *  GNU General Public License for more details.
 15 | *
 16 | *  You should have received a copy of the GNU General Public License
 17 | *  along with this program; if not, write to the Free Software
 18 | *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 19 | */
 20 | #include <stdio.h>
 21 | #include <stdlib.h>
 22 | #include <stdarg.h>
 23 | 
 24 | #include <cstdio>
 25 | #include <cstdlib>
 26 | #include <iostream>
 27 | #include <sstream>
 28 | #include <fstream>
 29 | #include <algorithm>
 30 | #include <vector>
 31 | #include <map>
 32 | #include <cfloat>
 33 | 
 34 | #include <string.h>
 35 | #include <assert.h>
 36 | 
 37 | #include "linear_algebra.h"
 38 | #include "geometry.h"
 39 | #include "cuda_pathtracer.h"
 40 | 
 41 | using std::string;
 42 | 
// Mesh data filled in by load_object(): the arrays are allocated with malloc()
// while the PLY header is parsed, the counters hold their element counts.
unsigned g_verticesNo = 0;   // number of entries in g_vertices
unsigned g_trianglesNo = 0;  // number of entries in g_triangles
Vertex* g_vertices = NULL;   // vertex array (position + normal per vertex)
Triangle* g_triangles = NULL;  // triangle array (vertex indices + precomputed data)


// Named indices for the three colour channels.
// NOTE(review): not referenced anywhere in the visible part of this file.
namespace enums {
	enum ColorComponent {
		Red = 0,
		Green = 1,
		Blue = 2
	};
}

using namespace enums;

// Rescale input objects to have this size...
// (load_object() scales the mesh so that its largest absolute coordinate equals this value)
const float MaxCoordAfterRescale = 1.2f;
 61 | 
 62 | // if some file cannot be found, panic and exit
 63 | void panic(const char *fmt, ...)
 64 | {
 65 | 	static char message[131072];
 66 | 	va_list ap;
 67 | 
 68 | 	va_start(ap, fmt);
 69 | 	vsnprintf(message, sizeof message, fmt, ap);
 70 | 	printf(message); fflush(stdout);
 71 | 	va_end(ap);
 72 | 
 73 | 	exit(1);
 74 | }
 75 | 
 76 | void fix_normals(void)
 77 | {
 78 | 	for (unsigned j = 0; j<g_trianglesNo; j++) {
 79 | 		Vector3Df worldPointA = g_vertices[g_triangles[j]._idx1];
 80 | 		Vector3Df worldPointB = g_vertices[g_triangles[j]._idx2];
 81 | 		Vector3Df worldPointC = g_vertices[g_triangles[j]._idx3];
 82 | 		Vector3Df AB = worldPointB;
 83 | 		AB -= worldPointA;
 84 | 		Vector3Df AC = worldPointC;
 85 | 		AC -= worldPointA;
 86 | 		Vector3Df cr = cross(AB, AC);
 87 | 		cr.normalize();
 88 | 		g_triangles[j]._normal = cr;
 89 | 		g_vertices[g_triangles[j]._idx1]._normal += cr;
 90 | 		g_vertices[g_triangles[j]._idx2]._normal += cr;
 91 | 		g_vertices[g_triangles[j]._idx3]._normal += cr;
 92 | 	}
 93 | 	for (unsigned j = 0; j<g_trianglesNo; j++) {
 94 | 		g_vertices[g_triangles[j]._idx1]._normal.normalize();
 95 | 		g_vertices[g_triangles[j]._idx2]._normal.normalize();
 96 | 		g_vertices[g_triangles[j]._idx3]._normal.normalize();
 97 | 	}
 98 | }
 99 | 
100 | float load_object(const char *filename)
101 | {
102 | 	std::cout << "Loading object..." << std::endl;
103 | 	const char *edot = strrchr(filename, '.');
104 | 	if (edot) {
105 | 		edot++;
106 | 		
107 | 		if (!strcmp(edot, "PLY") || !strcmp(edot, "ply")) {
108 | 			// Only shadevis generated objects, not full blown parser!
109 | 			std::ifstream file(filename, std::ios::in);
110 | 			if (!file) {
111 | 				panic((string("Missing ") + string(filename)).c_str());
112 | 			}
113 | 
114 | 			Vertex *pCurrentVertex = NULL;
115 | 			Triangle *pCurrentTriangle = NULL;
116 | 
117 | 			string line;
118 | 			unsigned totalVertices, totalTriangles, lineNo = 0;
119 | 			bool inside = false;
120 | 			while (getline(file, line)) {
121 | 				lineNo++;
122 | 				if (!inside) {
123 | 					if (line.substr(0, 14) == "element vertex") {
124 | 						std::istringstream str(line);
125 | 						string word1;
126 | 						str >> word1;
127 | 						str >> word1;
128 | 						str >> totalVertices;
129 | 						g_vertices = (Vertex *)malloc(totalVertices*sizeof(Vertex));
130 | 						g_verticesNo = totalVertices;
131 | 						pCurrentVertex = g_vertices;
132 | 					}
133 | 					else if (line.substr(0, 12) == "element face") {
134 | 						std::istringstream str(line);
135 | 						string word1;
136 | 						str >> word1;
137 | 						str >> word1;
138 | 						str >> totalTriangles;
139 | 						g_triangles = (Triangle *)malloc(totalTriangles*sizeof(Triangle));
140 | 						g_trianglesNo = totalTriangles;
141 | 						pCurrentTriangle = g_triangles;
142 | 					}
143 | 					else if (line.substr(0, 10) == "end_header")
144 | 						inside = true;
145 | 				}
146 | 				else {
147 | 					if (totalVertices) {
148 | 
149 | 						totalVertices--;
150 | 						float x, y, z;
151 | 
152 | 						std::istringstream str_in(line);
153 | 						str_in >> x >> y >> z;
154 | 
155 | 						pCurrentVertex->x = x;
156 | 						pCurrentVertex->y = y;
157 | 						pCurrentVertex->z = z;
158 | 						pCurrentVertex->_normal.x = 0.f;
159 | 						pCurrentVertex->_normal.y = 0.f;
160 | 						pCurrentVertex->_normal.z = 0.f;
161 | 						pCurrentVertex->_ambientOcclusionCoeff = 60;  // fixed, but obsolete in path tracer
162 | 						pCurrentVertex++;
163 | 					}
164 | 
165 | 					else if (totalTriangles) {
166 | 
167 | 						totalTriangles--;
168 | 						unsigned dummy;
169 | 						float r, g, b;
170 | 						unsigned idx1, idx2, idx3; // vertex index
171 | 						std::istringstream str2(line);
172 | 						if (str2 >> dummy >> idx1 >> idx2 >> idx3)
173 | 						{
174 | 						    // set rgb colour to white
175 | 							r = 255; g = 255; b = 255;
176 | 	
177 | 							pCurrentTriangle->_idx1 = idx1;
178 | 							pCurrentTriangle->_idx2 = idx2;
179 | 							pCurrentTriangle->_idx3 = idx3;
180 | 							pCurrentTriangle->_colorf.x = r;
181 | 							pCurrentTriangle->_colorf.y = g;
182 | 							pCurrentTriangle->_colorf.z = b;
183 | 							pCurrentTriangle->_twoSided = false;
184 | 							pCurrentTriangle->_normal = Vector3Df(0, 0, 0);
185 | 							pCurrentTriangle->_bottom = Vector3Df(FLT_MAX, FLT_MAX, FLT_MAX);
186 | 							pCurrentTriangle->_top = Vector3Df(-FLT_MAX, -FLT_MAX, -FLT_MAX);
187 | 							Vertex *vertexA = &g_vertices[idx1];
188 | 							Vertex *vertexB = &g_vertices[idx2];
189 | 							Vertex *vertexC = &g_vertices[idx3];
190 | 							pCurrentTriangle->_center = Vector3Df(
191 | 								(vertexA->x + vertexB->x + vertexC->x) / 3.0f,
192 | 								(vertexA->y + vertexB->y + vertexC->y) / 3.0f,
193 | 								(vertexA->z + vertexB->z + vertexC->z) / 3.0f);
194 | 							pCurrentTriangle++;
195 | 						}
196 | 					}
197 | 				}
198 | 			}
199 | 			
200 | 			fix_normals();
201 | 		}
202 | 
203 | 		else
204 | 			panic("Unknown extension (only .ply accepted)");
205 | 	}
206 | 	else
207 | 		panic("No extension in filename (only .ply accepted)");
208 | 
209 | 	std::cout << "Vertices:  " << g_verticesNo << std::endl;
210 | 	std::cout << "Triangles: " << g_trianglesNo << std::endl;
211 | 
212 | 	// Center scene at world's center
213 | 
214 | 	Vector3Df minp(FLT_MAX, FLT_MAX, FLT_MAX);
215 | 	Vector3Df maxp(-FLT_MAX, -FLT_MAX, -FLT_MAX);
216 | 
217 | 	// calculate bounds of scene bounding box 
218 | 	// loop over all triangles in scene, grow minp and maxp
219 | 	for (unsigned i = 0; i<g_trianglesNo; i++) {
220 | 
221 | 		minp = min3(minp, g_vertices[g_triangles[i]._idx1]);
222 | 		minp = min3(minp, g_vertices[g_triangles[i]._idx2]);
223 | 		minp = min3(minp, g_vertices[g_triangles[i]._idx3]);
224 | 
225 | 		maxp = max3(maxp, g_vertices[g_triangles[i]._idx1]);
226 | 		maxp = max3(maxp, g_vertices[g_triangles[i]._idx2]);
227 | 		maxp = max3(maxp, g_vertices[g_triangles[i]._idx3]);
228 | 	}
229 | 
230 | 	// scene bounding box center before scaling and translating
231 | 	Vector3Df origCenter = Vector3Df(
232 | 		(maxp.x + minp.x) * 0.5,
233 | 		(maxp.y + minp.y) * 0.5,
234 | 		(maxp.z + minp.z) * 0.5);
235 | 
236 | 	minp -= origCenter;
237 | 	maxp -= origCenter;
238 | 
239 | 	// Scale scene so max(abs x,y,z coordinates) = MaxCoordAfterRescale
240 | 
241 | 	float maxi = 0;
242 | 	maxi = std::max(maxi, (float)fabs(minp.x));
243 | 	maxi = std::max(maxi, (float)fabs(minp.y));
244 | 	maxi = std::max(maxi, (float)fabs(minp.z));
245 | 	maxi = std::max(maxi, (float)fabs(maxp.x));
246 | 	maxi = std::max(maxi, (float)fabs(maxp.y));
247 | 	maxi = std::max(maxi, (float)fabs(maxp.z));
248 | 
249 | 	std::cout << "Centering and scaling vertices..." << std::endl;
250 | 	for (unsigned i = 0; i<g_verticesNo; i++) {
251 | 		g_vertices[i] -= origCenter;
252 | 		g_vertices[i] *= (MaxCoordAfterRescale / maxi);
253 | 	}
254 | 	std::cout << "Centering and scaling triangles..." << std::endl;
255 | 	for (unsigned i = 0; i<g_trianglesNo; i++) {
256 | 		g_triangles[i]._center -= origCenter;
257 | 		g_triangles[i]._center *= (MaxCoordAfterRescale / maxi);
258 | 	}
259 | 	std::cout << "Updating triangle bounding boxes (used by BVH)..." << std::endl;
260 | 	for (unsigned i = 0; i<g_trianglesNo; i++) {
261 | 
262 | 		g_triangles[i]._bottom = min3(g_triangles[i]._bottom, g_vertices[g_triangles[i]._idx1]);
263 | 		g_triangles[i]._bottom = min3(g_triangles[i]._bottom, g_vertices[g_triangles[i]._idx2]);
264 | 		g_triangles[i]._bottom = min3(g_triangles[i]._bottom, g_vertices[g_triangles[i]._idx3]);
265 | 		g_triangles[i]._top = max3(g_triangles[i]._top, g_vertices[g_triangles[i]._idx1]);
266 | 		g_triangles[i]._top = max3(g_triangles[i]._top, g_vertices[g_triangles[i]._idx2]);
267 | 		g_triangles[i]._top = max3(g_triangles[i]._top, g_vertices[g_triangles[i]._idx3]);
268 | 	}
269 | 	
270 | 	std::cout << "Pre-computing triangle intersection data (used by raytracer)..." << std::endl;
271 | 	
272 | 	for (unsigned i = 0; i<g_trianglesNo; i++) {
273 | 
274 | 		Triangle& triangle = g_triangles[i];
275 | 
276 | 		// Algorithm for triangle intersection is taken from Roman Kuchkuda's paper.
277 | 		// precompute edge vectors
278 | 		Vector3Df vc1 = g_vertices[triangle._idx2] - g_vertices[triangle._idx1];
279 | 		Vector3Df vc2 = g_vertices[triangle._idx3] - g_vertices[triangle._idx2];
280 | 		Vector3Df vc3 = g_vertices[triangle._idx1] - g_vertices[triangle._idx3];
281 | 
282 | 		// plane of triangle, cross product of edge vectors vc1 and vc2
283 | 		triangle._normal = cross(vc1, vc2);  
284 | 
285 | 		// choose longest alternative normal for maximum precision
286 | 		Vector3Df alt1 = cross(vc2, vc3);
287 | 		if (alt1.length() > triangle._normal.length()) triangle._normal = alt1; // higher precision when triangle has sharp angles
288 | 
289 | 		Vector3Df alt2 = cross(vc3, vc1);
290 | 		if (alt2.length() > triangle._normal.length()) triangle._normal = alt2;
291 | 
292 | 
293 | 		triangle._normal.normalize();
294 | 
295 | 		// precompute dot product between normal and first triangle vertex
296 | 		triangle._d = dot(triangle._normal, g_vertices[triangle._idx1]); 
297 | 
298 | 		// edge planes
299 | 		triangle._e1 = cross(triangle._normal, vc1);
300 | 		triangle._e1.normalize();
301 | 		triangle._d1 = dot(triangle._e1, g_vertices[triangle._idx1]);
302 | 		triangle._e2 = cross(triangle._normal, vc2);
303 | 		triangle._e2.normalize();
304 | 		triangle._d2 = dot(triangle._e2, g_vertices[triangle._idx2]);
305 | 		triangle._e3 = cross(triangle._normal, vc3);
306 | 		triangle._e3.normalize();
307 | 		triangle._d3 = dot(triangle._e3, g_vertices[triangle._idx3]);
308 | 	}
309 | 	
310 | 	return MaxCoordAfterRescale;
311 | }
312 | 


--------------------------------------------------------------------------------
/loader.h:
--------------------------------------------------------------------------------
1 | #ifndef __LOADER_H_
2 | #define __LOADER_H_
3 | 
// Prints a printf-style formatted message to stdout and terminates the program with exit code 1.
void panic(const char *fmt, ...);
// Loads the .ply mesh `filename` into the global vertex/triangle arrays, centres and
// rescales it, and returns the largest absolute coordinate after rescaling.
float load_object(const char *filename);
6 | 
7 | #endif
8 | 


--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | *  CUDA based triangle mesh path tracer using BVH acceleration by Sam lapere, 2016
  3 | *  BVH implementation based on real-time CUDA ray tracer by Thanassis Tsiodras,
  4 | *  http://users.softlab.ntua.gr/~ttsiod/cudarenderer-BVH.html
  5 | *  Interactive camera with depth of field based on CUDA path tracer code
  6 | *  by Peter Kutz and Yining Karl Li, https://github.com/peterkutz/GPUPathTracer
  7 | *
  8 | *  This program is free software; you can redistribute it and/or modify
  9 | *  it under the terms of the GNU General Public License as published by
 10 | *  the Free Software Foundation; either version 2 of the License, or
 11 | *  (at your option) any later version.
 12 | *
 13 | *  This program is distributed in the hope that it will be useful,
 14 | *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 15 | *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 16 | *  GNU General Public License for more details.
 17 | *
 18 | *  You should have received a copy of the GNU General Public License
 19 | *  along with this program; if not, write to the Free Software
 20 | *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 21 | */
 22 | #include <cuda.h>
 23 | #include <cuda_runtime.h>
 24 | #include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\extras\CUPTI\include\GL\glew.h"
 25 | #include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\extras\CUPTI\include\GL\glut.h"
 26 | #include <cuda_gl_interop.h>
 27 | #include <sstream>
 28 | #include <iostream>
 29 | #include <math.h>
 30 | #include "cuda_pathtracer.h"
 31 | #include "loader.h"
 32 | #include "camera.h"
 33 | 
 34 | #ifndef M_PI
 35 | #define M_PI 3.14156265
 36 | #endif
 37 | 
 38 | using namespace std;
 39 | 
// number of frames accumulated since the last buffer reset (incremented in disp())
unsigned int framenumber = 0;
// OpenGL vertex buffer object that receives the rendered pixels (created in createVBO())
GLuint vbo;
// NOTE(review): not referenced in the visible part of this file
void *d_vbo_buffer = NULL;

// CUDA arrays
// device-side copies of the scene data, allocated and filled in prepCUDAscene()
Vertex* cudaVertices2 = NULL;                  // holds 8 floats per vertex despite the pointer type
Triangle* cudaTriangles2 = NULL;               // copy of g_triangles
Camera* cudaRendercam2 = NULL;                 // allocated in main(), refreshed every frame in disp()
float *cudaTriangleIntersectionData2 = NULL;   // 20 floats per triangle
int* cudaTriIdxList2 = NULL;                   // copy of g_triIndexList
float *cudaBVHlimits2 = NULL;                  // 6 floats per BVH node
int *cudaBVHindexesOrTrilists2 = NULL;         // 4 ints per BVH node

// set by the input handlers; makes disp() restart the accumulation on the next frame
bool buffer_reset = false;
 54 | 
 55 | void Timer(int obsolete) {
 56 | 
 57 | 	glutPostRedisplay();
 58 | 	glutTimerFunc(10, Timer, 0);
 59 | }
 60 | 
// NOTE(review): a __device__ variable inside a .cpp translation unit; not referenced in the visible part of this file
__device__ float timer = 0.0f;

// image buffer storing accumulated pixel samples
// (device memory, allocated with cudaMalloc in main())
Vector3Df* accumulatebuffer;
// final output buffer storing averaged pixel samples
// (obtained every frame by mapping the VBO in disp())
Vector3Df* finaloutputbuffer;

// mouse controls
// NOTE(review): none of these five variables is referenced in the visible part of this file
int mouse_old_x, mouse_old_y;
int mouse_buttons = 0;
float rotate_x = 0.0, rotate_y = 0.0;
float translate_z = -30.0;

// TODO: Delete stuff at some point!!!
// camera driven by the keyboard/mouse handlers, (re)created in initCamera()
InteractiveCamera* interactiveCamera = NULL;
// host-side render camera, rebuilt from interactiveCamera every frame in disp()
Camera* hostRendercam = NULL;  
// NOTE(review): not referenced in the visible part of this file
Clock watch;

// NOTE(review): not referenced in the visible part of this file
float scalefactor = 1.2f;
 80 | 
 81 | // this hash function calculates a new random number generator seed for each frame, based on framenumber  
 82 | unsigned int WangHash(unsigned int a) {
 83 | 	a = (a ^ 61) ^ (a >> 16);
 84 | 	a = a + (a << 3);
 85 | 	a = a ^ (a >> 4);
 86 | 	a = a * 0x27d4eb2d;
 87 | 	a = a ^ (a >> 15);
 88 | 	return a;
 89 | }
 90 | 
 91 | // initialise camera on the CPU
 92 | void initCamera()
 93 | {
 94 | 	delete interactiveCamera;
 95 | 	interactiveCamera = new InteractiveCamera();
 96 | 
 97 | 	interactiveCamera->setResolution(width, height);
 98 | 	interactiveCamera->setFOVX(45);
 99 | }
100 | 
101 | // create OpenGL vertex buffer object for CUDA to store calculated pixels
102 | void createVBO(GLuint* vbo)
103 | {
104 | 	//Create vertex buffer object
105 | 	glGenBuffers(1, vbo);
106 | 	glBindBuffer(GL_ARRAY_BUFFER, *vbo);
107 | 
108 | 	//Initialize VBO
109 | 	unsigned int size = width * height * sizeof(Vector3Df);
110 | 	glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
111 | 
112 | 	glBindBuffer(GL_ARRAY_BUFFER, 0);
113 | 
114 | 	//Register VBO with CUDA
115 | 	cudaGLRegisterBufferObject(*vbo);
116 | }
117 | 
118 | // display function called by glutMainLoop(), gets executed every frame 
119 | void disp(void)   
120 | {
121 | 	// if camera has moved, reset the accumulation buffer
122 | 	if (buffer_reset){ cudaMemset(accumulatebuffer, 1, width * height * sizeof(Vector3Df)); framenumber = 0; }
123 | 
124 | 	buffer_reset = false;
125 | 	framenumber++;
126 | 
127 | 	// build a new camera for each frame on the CPU
128 | 	interactiveCamera->buildRenderCamera(hostRendercam); 
129 | 
130 | 	// copy the CPU camera to a GPU camera
131 | 	cudaMemcpy(cudaRendercam2, hostRendercam, sizeof(Camera), cudaMemcpyHostToDevice);
132 | 
133 | 	cudaThreadSynchronize();
134 | 
135 | 	// maps a buffer object for acces by CUDA
136 | 	cudaGLMapBufferObject((void**)&finaloutputbuffer, vbo);									
137 | 
138 | 	//clear all pixels:
139 | 	glClear(GL_COLOR_BUFFER_BIT);
140 | 
141 | 	// calculate a new seed for the random number generator, based on the framenumber
142 | 	unsigned int hashedframes = WangHash(framenumber);
143 | 
144 | 	// gateway from host to CUDA, passes all data needed to render frame (triangles, BVH tree, camera) to CUDA for execution
145 | 	cudarender(finaloutputbuffer, accumulatebuffer, cudaTriangles2, cudaBVHindexesOrTrilists2, cudaBVHlimits2, cudaTriangleIntersectionData2, 
146 | 					cudaTriIdxList2, framenumber, hashedframes, cudaRendercam2); 
147 | 
148 | 	cudaThreadSynchronize();
149 | 	cudaGLUnmapBufferObject(vbo);
150 | 	//glFlush();
151 | 	glBindBuffer(GL_ARRAY_BUFFER, vbo);
152 | 	glVertexPointer(2, GL_FLOAT, 12, 0);
153 | 	glColorPointer(4, GL_UNSIGNED_BYTE, 12, (GLvoid*)8);
154 | 
155 | 	glEnableClientState(GL_VERTEX_ARRAY);
156 | 	glEnableClientState(GL_COLOR_ARRAY);
157 | 	glDrawArrays(GL_POINTS, 0, width * height);
158 | 	glDisableClientState(GL_VERTEX_ARRAY);
159 | 
160 | 	glutSwapBuffers();
161 | 	//glutPostRedisplay();
162 | }
163 | 
164 | // keyboard interaction
165 | void keyboard(unsigned char key, int /*x*/, int /*y*/)
166 | {
167 | 	switch (key) {
168 | 	
169 | 	case(27) : exit(0);
170 | 	case(' ') : initCamera(); buffer_reset = true; break;
171 | 	case('a') : interactiveCamera->strafe(-0.05f); buffer_reset = true; break;
172 | 	case('d') : interactiveCamera->strafe(0.05f); buffer_reset = true; break;
173 | 	case('r') : interactiveCamera->changeAltitude(0.05f); buffer_reset = true; break;
174 | 	case('f') : interactiveCamera->changeAltitude(-0.05f); buffer_reset = true; break;
175 | 	case('w') : interactiveCamera->goForward(0.05f); buffer_reset = true; break;
176 | 	case('s') : interactiveCamera->goForward(-0.05f); buffer_reset = true; break;
177 | 	case('g') : interactiveCamera->changeApertureDiameter(0.1); buffer_reset = true; break;
178 | 	case('h') : interactiveCamera->changeApertureDiameter(-0.1); buffer_reset = true; break;
179 | 	case('t') : interactiveCamera->changeFocalDistance(0.1); buffer_reset = true; break;
180 | 	case('y') : interactiveCamera->changeFocalDistance(-0.1); buffer_reset = true; break;
181 | 	}
182 | }
183 | 
184 | void specialkeys(int key, int, int){
185 | 
186 | 	switch (key) {
187 | 
188 | 	case GLUT_KEY_LEFT: interactiveCamera->changeYaw(0.02f); buffer_reset = true; break;
189 | 	case GLUT_KEY_RIGHT: interactiveCamera->changeYaw(-0.02f); buffer_reset = true; break;
190 | 	case GLUT_KEY_UP: interactiveCamera->changePitch(0.02f); buffer_reset = true; break;
191 | 	case GLUT_KEY_DOWN: interactiveCamera->changePitch(-0.02f); buffer_reset = true; break;
192 | 
193 | 	}
194 | }
195 | 
196 | // mouse event handlers
197 | 
// mouse position recorded by the last mouse()/motion() call, used to compute the movement deltas
int lastX = 0, lastY = 0;
// button passed to the most recent mouse() call
int theButtonState = 0;
// result of glutGetModifiers() at the most recent mouse() call
int theModifierState = 0;
201 | 
202 | // camera mouse controls in X and Y direction
203 | void motion(int x, int y)
204 | {
205 | 	int deltaX = lastX - x;
206 | 	int deltaY = lastY - y;
207 | 
208 | 	if (deltaX != 0 || deltaY != 0) {
209 | 
210 | 		if (theButtonState == GLUT_LEFT_BUTTON)  // Rotate
211 | 		{
212 | 			interactiveCamera->changeYaw(deltaX * 0.01);
213 | 			interactiveCamera->changePitch(-deltaY * 0.01);
214 | 		}
215 | 		else if (theButtonState == GLUT_MIDDLE_BUTTON) // Zoom
216 | 		{
217 | 			interactiveCamera->changeAltitude(-deltaY * 0.01);
218 | 		}
219 | 
220 | 		if (theButtonState == GLUT_RIGHT_BUTTON) // camera move
221 | 		{
222 | 			interactiveCamera->changeRadius(-deltaY * 0.01);
223 | 		}
224 | 
225 | 		lastX = x;
226 | 		lastY = y;
227 | 		buffer_reset = true;
228 | 		glutPostRedisplay(); 
229 | 
230 | 	}
231 | }
232 | 
233 | void mouse(int button, int state, int x, int y)
234 | {
235 | 	theButtonState = button;
236 | 	theModifierState = glutGetModifiers();
237 | 	lastX = x;
238 | 	lastY = y;
239 | 
240 | 	motion(x, y);
241 | }
242 | 
243 | // initialises scene data, builds BVH
244 | void prepCUDAscene(){
245 | 
246 | 	// specify scene filename 
247 | 	//const char* scenefile = "data/teapot.ply";  // teapot.ply, big_atc.ply
248 | 	//const char* scenefile = "data/bunny.obj";
249 | 	//const char* scenefile = "data/bun_zipper_res2.ply";  // teapot.ply, big_atc.ply
250 | 	//const char* scenefile = "data/bun_zipper.ply";  // teapot.ply, big_atc.ply
251 | 	const char* scenefile = "data/dragon_vrip_res4.ply";  // teapot.ply, big_atc.ply
252 | 	//const char* scenefile = "data/dragon_vrip.ply";  // teapot.ply, big_atc.ply
253 | 	//const char* scenefile = "data/happy_vrip.ply";  // teapot.ply, big_atc.ply
254 | 
255 | 	// load scene
256 | 	float maxi = load_object(scenefile);
257 | 
258 | 	// build the BVH
259 | 	UpdateBoundingVolumeHierarchy(scenefile);
260 | 
261 | 	// now, allocate the CUDA side of the data (in CUDA global memory,
262 | 	// in preparation for the textures that will store them...)
263 | 
264 | 	// store vertices in a GPU friendly format using float4
265 | 	float* pVerticesData = (float*)malloc(g_verticesNo * 8 * sizeof(float));
266 | 	for (unsigned f = 0; f<g_verticesNo; f++) {
267 | 		
268 | 		// first float4 stores vertex xyz position and precomputed ambient occlusion
269 | 		pVerticesData[f * 8 + 0] = g_vertices[f].x;
270 | 		pVerticesData[f * 8 + 1] = g_vertices[f].y;
271 | 		pVerticesData[f * 8 + 2] = g_vertices[f].z;
272 | 		pVerticesData[f * 8 + 3] = g_vertices[f]._ambientOcclusionCoeff;
273 | 		// second float4 stores vertex normal xyz
274 | 		pVerticesData[f * 8 + 4] = g_vertices[f]._normal.x;
275 | 		pVerticesData[f * 8 + 5] = g_vertices[f]._normal.y;
276 | 		pVerticesData[f * 8 + 6] = g_vertices[f]._normal.z;
277 | 		pVerticesData[f * 8 + 7] = 0.f;
278 | 	}
279 | 
280 | 	// copy vertex data to CUDA global memory
281 | 	cudaMalloc((void**)&cudaVertices2, g_verticesNo * 8 * sizeof(float));
282 | 	cudaMemcpy(cudaVertices2, pVerticesData, g_verticesNo * 8 * sizeof(float), cudaMemcpyHostToDevice);
283 | 
284 | 	// store precomputed triangle intersection data in a GPU friendly format using float4
285 | 	float *pTrianglesIntersectionData = (float *)malloc(g_trianglesNo * 20 * sizeof(float));
286 | 
287 | 	for (unsigned e = 0; e<g_trianglesNo; e++) {
288 | 		// Texture-wise:
289 | 		//
290 | 		// first float4, triangle center + two-sided bool
291 | 		pTrianglesIntersectionData[20 * e + 0] = g_triangles[e]._center.x;
292 | 		pTrianglesIntersectionData[20 * e + 1] = g_triangles[e]._center.y;
293 | 		pTrianglesIntersectionData[20 * e + 2] = g_triangles[e]._center.z;
294 | 		pTrianglesIntersectionData[20 * e + 3] = g_triangles[e]._twoSided ? 1.0f : 0.0f;
295 | 		// second float4, normal
296 | 		pTrianglesIntersectionData[20 * e + 4] = g_triangles[e]._normal.x;
297 | 		pTrianglesIntersectionData[20 * e + 5] = g_triangles[e]._normal.y;
298 | 		pTrianglesIntersectionData[20 * e + 6] = g_triangles[e]._normal.z;
299 | 		pTrianglesIntersectionData[20 * e + 7] = g_triangles[e]._d;
300 | 		// third float4, precomputed plane normal of triangle edge 1
301 | 		pTrianglesIntersectionData[20 * e + 8] = g_triangles[e]._e1.x;  
302 | 		pTrianglesIntersectionData[20 * e + 9] = g_triangles[e]._e1.y;
303 | 		pTrianglesIntersectionData[20 * e + 10] = g_triangles[e]._e1.z;
304 | 		pTrianglesIntersectionData[20 * e + 11] = g_triangles[e]._d1;
305 | 		// fourth float4, precomputed plane normal of triangle edge 2
306 | 		pTrianglesIntersectionData[20 * e + 12] = g_triangles[e]._e2.x; 
307 | 		pTrianglesIntersectionData[20 * e + 13] = g_triangles[e]._e2.y;
308 | 		pTrianglesIntersectionData[20 * e + 14] = g_triangles[e]._e2.z;
309 | 		pTrianglesIntersectionData[20 * e + 15] = g_triangles[e]._d2;
310 | 		// fifth float4, precomputed plane normal of triangle edge 3
311 | 		pTrianglesIntersectionData[20 * e + 16] = g_triangles[e]._e3.x;
312 | 		pTrianglesIntersectionData[20 * e + 17] = g_triangles[e]._e3.y;
313 | 		pTrianglesIntersectionData[20 * e + 18] = g_triangles[e]._e3.z;
314 | 		pTrianglesIntersectionData[20 * e + 19] = g_triangles[e]._d3;
315 | 	}
316 | 
317 | 	// copy precomputed triangle intersection data to CUDA global memory
318 | 	cudaMalloc((void**)&cudaTriangleIntersectionData2, g_trianglesNo * 20 * sizeof(float));
319 | 	cudaMemcpy(cudaTriangleIntersectionData2, pTrianglesIntersectionData, g_trianglesNo * 20 * sizeof(float), cudaMemcpyHostToDevice);
320 | 
321 | 	// copy triangle data to CUDA global memory
322 | 	cudaMalloc((void**)&cudaTriangles2, g_trianglesNo*sizeof(Triangle));
323 | 	cudaMemcpy(cudaTriangles2, g_triangles, g_trianglesNo*sizeof(Triangle), cudaMemcpyHostToDevice);
324 | 
325 | 	// Allocate CUDA-side data (global memory for corresponding textures) for Bounding Volume Hierarchy data
326 | 	// See BVH.h for the data we are storing (from CacheFriendlyBVHNode)
327 | 
328 | 	// Leaf nodes triangle lists (indices to global triangle list)
329 | 	// copy triangle indices to CUDA global memory
330 | 	cudaMalloc((void**)&cudaTriIdxList2, g_triIndexListNo*sizeof(int));
331 | 	cudaMemcpy(cudaTriIdxList2, g_triIndexList, g_triIndexListNo*sizeof(int), cudaMemcpyHostToDevice);
332 | 
333 | 	// Bounding box limits need bottom._x, top._x, bottom._y, top._y, bottom._z, top._z...
334 | 	// store BVH bounding box limits in a GPU friendly format using float2
335 | 	float *pLimits = (float *)malloc(g_pCFBVH_No * 6 * sizeof(float));
336 | 
337 | 	for (unsigned h = 0; h<g_pCFBVH_No; h++) {
338 | 		// Texture-wise:
339 | 		// First float2
340 | 		pLimits[6 * h + 0] = g_pCFBVH[h]._bottom.x;
341 | 		pLimits[6 * h + 1] = g_pCFBVH[h]._top.x;
342 | 		// Second float2
343 | 		pLimits[6 * h + 2] = g_pCFBVH[h]._bottom.y;
344 | 		pLimits[6 * h + 3] = g_pCFBVH[h]._top.y;
345 | 		// Third float2
346 | 		pLimits[6 * h + 4] = g_pCFBVH[h]._bottom.z;
347 | 		pLimits[6 * h + 5] = g_pCFBVH[h]._top.z;
348 | 	}
349 | 	
350 | 	// copy BVH limits to CUDA global memory
351 | 	cudaMalloc((void**)&cudaBVHlimits2, g_pCFBVH_No * 6 * sizeof(float));
352 | 	cudaMemcpy(cudaBVHlimits2, pLimits, g_pCFBVH_No * 6 * sizeof(float), cudaMemcpyHostToDevice);
353 | 
354 | 	// ..and finally, from CacheFriendlyBVHNode, the 4 integer values:
355 | 	// store BVH node attributes (triangle count, startindex, left and right child indices) in a GPU friendly format using uint4
356 | 	int *pIndexesOrTrilists = (int *)malloc(g_pCFBVH_No * 4 * sizeof(unsigned));
357 | 
358 | 	for (unsigned g = 0; g<g_pCFBVH_No; g++) {
359 | 		// Texture-wise:
360 | 		// A single uint4
361 | 		pIndexesOrTrilists[4 * g + 0] = g_pCFBVH[g].u.leaf._count;  // number of triangles stored in this node if leaf node
362 | 		pIndexesOrTrilists[4 * g + 1] = g_pCFBVH[g].u.inner._idxRight; // index to right child if inner node
363 | 		pIndexesOrTrilists[4 * g + 2] = g_pCFBVH[g].u.inner._idxLeft;  // index to left node if inner node
364 | 		pIndexesOrTrilists[4 * g + 3] = g_pCFBVH[g].u.leaf._startIndexInTriIndexList; // start index in list of triangle indices if leaf node
365 | 		// union
366 | 
367 | 	}
368 | 
369 | 	// copy BVH node attributes to CUDA global memory
370 | 	cudaMalloc((void**)&cudaBVHindexesOrTrilists2, g_pCFBVH_No * 4 * sizeof(unsigned));
371 | 	cudaMemcpy(cudaBVHindexesOrTrilists2, pIndexesOrTrilists, g_pCFBVH_No * 4 * sizeof(unsigned), cudaMemcpyHostToDevice);
372 | 	
373 | 	// Initialisation Done!
374 | 	std::cout << "Rendering data initialised and copied to CUDA global memory\n";
375 | }
376 | 
377 | int main(int argc, char** argv){
378 | 
379 | 	// initialise an interactive camera on the CPU side
380 | 	initCamera();
381 | 	// create a CPU camera
382 | 	hostRendercam = new Camera();
383 | 	interactiveCamera->buildRenderCamera(hostRendercam);
384 | 
385 | 	// initialise all data needed to start rendering (BVH data, triangles, vertices)
386 | 	prepCUDAscene();
387 | 	
388 | 	// allocate GPU memory for accumulation buffer
389 | 	cudaMalloc(&accumulatebuffer, width * height * sizeof(Vector3Df));
390 | 	// allocate GPU memory for interactive camera
391 | 	cudaMalloc((void**)&cudaRendercam2, sizeof(Camera));
392 | 
393 | 	// init glut:
394 | 	glutInit(&argc, argv);
395 | 	// specify the display mode to be RGB and single buffering:
396 | 	glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
397 | 	// specify the initial window position:
398 | 	glutInitWindowPosition(100, 100);
399 | 	// specify the initial window size:
400 | 	glutInitWindowSize(width, height);
401 | 	// create the window and set title:
402 | 	glutCreateWindow("Basic triangle mesh path tracer in CUDA");
403 | 	
404 | 	// init opengl:
405 | 	glClearColor(0.0, 0.0, 0.0, 0.0);
406 | 	glMatrixMode(GL_PROJECTION);
407 | 	gluOrtho2D(0.0, width, 0.0, height);
408 | 	fprintf(stderr, "OpenGL initialized \n");
409 | 
410 | 	// register callback function to display graphics:
411 | 	glutDisplayFunc(disp);
412 | 	
413 | 	// functions for user interaction
414 | 	glutKeyboardFunc(keyboard);
415 | 	glutSpecialFunc(specialkeys);
416 | 	glutMouseFunc(mouse);
417 | 	glutMotionFunc(motion);
418 | 
419 | 	glewInit();
420 | 	if (!glewIsSupported("GL_VERSION_2_0 ")) {
421 | 		fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing.");
422 | 		fflush(stderr);
423 | 		exit(0);
424 | 	}
425 | 	fprintf(stderr, "glew initialized  \n");
426 | 	// call Timer():
427 | 	Timer(0);
428 | 	createVBO(&vbo);
429 | 	fprintf(stderr, "VBO created  \n");
430 | 	// enter the main loop and process events:
431 | 	fprintf(stderr, "Entering glutMainLoop...  \n");
432 | 	glutMainLoop();
433 | 
434 | 
435 | 	printf("CUDA initialised.\nStart rendering...\n");
436 | 
437 | 	// free CUDA memory
438 | 	cudaFree(finaloutputbuffer);  
439 | 	cudaFree(accumulatebuffer);
440 | 	cudaFree(cudaBVHindexesOrTrilists2);
441 | 	cudaFree(cudaBVHlimits2);
442 | 	cudaFree(cudaTriIdxList2);
443 | 	cudaFree(cudaRendercam2);
444 | 	cudaFree(cudaTriangles2);
445 | 	cudaFree(cudaTriangleIntersectionData2);
446 | 	cudaFree(cudaVertices2);
447 | 
448 | 	system("PAUSE");
449 | }
450 | 


--------------------------------------------------------------------------------