├── .gitignore
├── README.md
├── docs
    └── prisacariu_reid_tr2310_09.pdf
└── source
    ├── fastHOG
        ├── Files
        │   ├── Images
        │   │   └── testImage.bmp
        │   └── SVM
        │   │   └── head_W24x24_C4x4_N2x2_G4x4_HeadSize16x16.alt
        ├── HOG
        │   ├── HOGConvolution.cu
        │   ├── HOGConvolution.h
        │   ├── HOGConvolution.linkinfo
        │   ├── HOGDefines.h
        │   ├── HOGEngine.cpp
        │   ├── HOGEngine.h
        │   ├── HOGEngineDevice.cu
        │   ├── HOGEngineDevice.h
        │   ├── HOGEngineDevice.linkinfo
        │   ├── HOGHistogram.cu
        │   ├── HOGHistogram.h
        │   ├── HOGHistogram.linkinfo
        │   ├── HOGImage.cpp
        │   ├── HOGImage.h
        │   ├── HOGNMS.cpp
        │   ├── HOGNMS.h
        │   ├── HOGPadding.cu
        │   ├── HOGPadding.h
        │   ├── HOGPadding.linkinfo
        │   ├── HOGPoint3.h
        │   ├── HOGResult.h
        │   ├── HOGSVMSlider.cu
        │   ├── HOGSVMSlider.h
        │   ├── HOGSVMSlider.linkinfo
        │   ├── HOGScale.cu
        │   ├── HOGScale.h
        │   ├── HOGScale.linkinfo
        │   ├── HOGUtils.cu
        │   ├── HOGUtils.h
        │   ├── HOGUtils.linkinfo
        │   └── cutil.h
        ├── HOGConvolution.linkinfo
        ├── HOGEngineDevice.linkinfo
        ├── HOGHistogram.linkinfo
        ├── HOGPadding.linkinfo
        ├── HOGSVMSlider.linkinfo
        ├── HOGScale.linkinfo
        ├── HOGUtils.linkinfo
        ├── Makefile
        ├── Others
        │   └── persondetectorwt.tcc
        ├── Utils
        │   ├── ImageWindow.cpp
        │   ├── ImageWindow.h
        │   └── Timer.h
        ├── bin
        │   └── release
        │   │   └── fastHOG
        ├── common.mk
        ├── fastHOG.cpp
        └── fastHOG.vcproj
    ├── fastHOGLib.sln
    └── fastHOGLib.suo


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files
 2 | *.slo
 3 | *.lo
 4 | *.o
 5 | 
 6 | # Compiled Dynamic libraries
 7 | *.so
 8 | *.dylib
 9 | 
10 | # Compiled Static libraries
11 | *.lai
12 | *.la
13 | *.a
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | FastHOG
 2 | =======
 3 | 
 4 | The original **FastHOG** source files can be obtained [here](http://www.robots.ox.ac.uk/~lav/Papers/prisacariu_reid_tr2310_09/prisacariu_reid_tr2310_09.html).
 5 | These source files do not compile under any recent version of CUDA on Ubuntu (or any Linux distribution).
 6 | 
 7 | These source files were fixed to compile with CUDA 5.5 on Ubuntu 12.04.
 8 | 
 9 | Steps to compile and use this version of FastHOG:
10 | 
 11 | 1. Install CUDA 5.5 or a more recent version.
12 | 2. Install `libxinerama-dev` and `libfreeimage-dev`.
13 | 3. Build and install the 2.0 branch of FLTK. Instructions to do this can be found [here](http://choorucode.com/2014/01/22/how-to-build-and-install-fltk-2-0/).
14 | 4. `cd source/fastHOG` and build using `make`.
15 | 5. Run the sample FastHOG program using `bin/release/fastHOG`. (Note that it has to be run from this directory, else it fails.) A picture of pedestrians is displayed. Click anywhere on it to detect the people.
16 | 


--------------------------------------------------------------------------------
/docs/prisacariu_reid_tr2310_09.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashwin/fasthog/377dd96b870ae0ef22c93b5b36950f92214dd399/docs/prisacariu_reid_tr2310_09.pdf


--------------------------------------------------------------------------------
/source/fastHOG/Files/Images/testImage.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashwin/fasthog/377dd96b870ae0ef22c93b5b36950f92214dd399/source/fastHOG/Files/Images/testImage.bmp


--------------------------------------------------------------------------------
/source/fastHOG/Files/SVM/head_W24x24_C4x4_N2x2_G4x4_HeadSize16x16.alt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashwin/fasthog/377dd96b870ae0ef22c93b5b36950f92214dd399/source/fastHOG/Files/SVM/head_W24x24_C4x4_N2x2_G4x4_HeadSize16x16.alt


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGConvolution.cu:
--------------------------------------------------------------------------------
  1 | #include "HOGConvolution.h"
  2 | #include "HOGUtils.h"
  3 | #include "cutil.h"
  4 | 
// Launch configurations shared by the gradient kernels; all four are filled in
// by SetConvolutionSize().
dim3 blockGridRows;
dim3 blockGridColumns;
dim3 threadBlockRows;
dim3 threadBlockColumns;

// 1-D derivative filter: radius 1, i.e. 3 taps.
#define convKernelRadius 1
#define convKernelWidth (2 * convKernelRadius + 1)
// Filter taps in constant memory (uploaded by InitConvolution()).
__device__ __constant__ float d_Kernel[convKernelWidth];
// Host copy of the taps; malloc'ed in InitConvolution(), freed in CloseConvolution().
float *h_Kernel;

// Row pass: output pixels handled per thread block.
#define convRowTileWidth 128
// Row pass: the first loading thread starts this many pixels left of the tile;
// threads left of the real halo do not load anything.
#define convKernelRadiusAligned 16

// Column pass: tile footprint in pixels.
#define convColumnTileWidth 16
#define convColumnTileHeight 48

// Device buffers holding the row-pass output (only the one matching the
// grayscale/colour mode is allocated by InitConvolution()).
float4 *convBuffer4;
float1 *convBuffer1;

// Image size the launch configuration was last computed for.
int convWidth;
int convHeight;
// Size of the filter in bytes.
const int convKernelSize = convKernelWidth * sizeof(float);

// Mode chosen in InitConvolution(); selects which buffer CloseConvolution() frees.
bool convUseGrayscale;
 29 | 
 30 | template<int i> __device__ float1 convolutionRow(float1 *data) {
 31 | 	float1 val = data[convKernelRadius-i];
 32 | 	val.x *= d_Kernel[i];
 33 | 	val.x += convolutionRow<i-1>(data).x;
 34 | 	return val;
 35 | }
 36 | template<> __device__ float1 convolutionRow<-1>(float1 *data){float1 zero; zero.x = 0; return zero;}
 37 | template<int i> __device__ float1 convolutionColumn(float1 *data) {
 38 | 	float1 val = data[(convKernelRadius-i)*convColumnTileWidth];
 39 | 	val.x *= d_Kernel[i];
 40 | 	val.x += convolutionColumn<i-1>(data).x;
 41 | 	return val;
 42 | }
 43 | template<> __device__ float1 convolutionColumn<-1>(float1 *data){float1 zero; zero.x = 0; return zero;}
 44 | 
 45 | template<int i> __device__ float4 convolutionRow(float4 *data) {
 46 | 	float4 val = data[convKernelRadius-i];
 47 | 	val.x *= d_Kernel[i]; val.y *= d_Kernel[i];
 48 | 	val.z *= d_Kernel[i]; val.w *= d_Kernel[i];
 49 | 	float4 val2 = convolutionRow<i-1>(data);
 50 | 	val.x += val2.x; val.y += val2.y;
 51 | 	val.z += val2.z; val.w += val2.w;
 52 | 	return val;
 53 | }
 54 | template<> __device__ float4 convolutionRow<-1>(float4 *data) {
 55 | 	float4 zero; zero.x = 0; zero.y = 0; zero.z = 0; zero.w = 0;
 56 | 	return zero;
 57 | }
 58 | template<int i> __device__ float4 convolutionColumn(float4 *data) {
 59 | 	float4 val = data[(convKernelRadius-i)*convColumnTileWidth];
 60 | 	val.x *= d_Kernel[i]; val.y *= d_Kernel[i];
 61 | 	val.z *= d_Kernel[i]; val.w *= d_Kernel[i];
 62 | 	float4 val2 = convolutionColumn<i-1>(data);
 63 | 	val.x += val2.x; val.y += val2.y;
 64 | 	val.z += val2.z; val.w += val2.w;
 65 | 	return val;
 66 | }
 67 | template<> __device__ float4 convolutionColumn<-1>(float4 *data) {
 68 | 	float4 zero; zero.x = 0; zero.y = 0; zero.z = 0; zero.w = 0;
 69 | 	return zero;
 70 | }
 71 | 
 72 | __global__ void convolutionRowGPU1(float1 *d_Result, float1 *d_Data, int dataW, int dataH)
 73 | {
 74 | 	float1 zero; zero.x = 0;
 75 | 
 76 | 	const int rowStart = IMUL(blockIdx.y, dataW);
 77 | 
 78 | 	__shared__ float1 data[convKernelRadius + convRowTileWidth + convKernelRadius];
 79 | 
 80 | 	const int tileStart = IMUL(blockIdx.x, convRowTileWidth);
 81 | 	const int tileEnd = tileStart + convRowTileWidth - 1;
 82 | 	const int apronStart = tileStart - convKernelRadius;
 83 | 	const int apronEnd = tileEnd + convKernelRadius;
 84 | 
 85 | 	const int tileEndClamped = min(tileEnd, dataW - 1);
 86 | 	const int apronStartClamped = max(apronStart, 0);
 87 | 	const int apronEndClamped = min(apronEnd, dataW - 1);
 88 | 
 89 | 	const int apronStartAligned = tileStart - convKernelRadiusAligned;
 90 | 
 91 | 	const int loadPos = apronStartAligned + threadIdx.x;
 92 | 
 93 | 	if(loadPos >= apronStart)
 94 | 	{
 95 | 		const int smemPos = loadPos - apronStart;
 96 | 		data[smemPos] = ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ? d_Data[rowStart + loadPos] : zero;
 97 | 	}
 98 | 
 99 | 	__syncthreads();
100 | 	const int writePos = tileStart + threadIdx.x;
101 | 
102 | 	if(writePos <= tileEndClamped)
103 | 	{
104 | 		const int smemPos = writePos - apronStart;
105 | 		float1 sum = convolutionRow<2 * convKernelRadius>(data + smemPos);
106 | 		d_Result[rowStart + writePos] = sum;
107 | 	}
108 | }
// Horizontal filter pass for four-channel images.
//
// Launch layout (see SetConvolutionSize): blockIdx.y selects the image row,
// blockIdx.x selects a tile of convRowTileWidth output pixels, and the block is
// 1-D with convKernelRadiusAligned + convRowTileWidth + convKernelRadius threads.
// Pixels outside [0, dataW) are treated as zero. dataH is not used.
__global__ void convolutionRowGPU4(float4 *d_Result, float4 *d_Data, int dataW, int dataH)
{
	// Tile interior plus a convKernelRadius-wide halo on both sides.
	__shared__ float4 data[convKernelRadius + convRowTileWidth + convKernelRadius];

	const int rowOffset = IMUL(blockIdx.y, dataW);

	const int tileFirst = IMUL(blockIdx.x, convRowTileWidth);
	const int tileLast = tileFirst + convRowTileWidth - 1;
	const int haloFirst = tileFirst - convKernelRadius;
	const int haloLast = tileLast + convKernelRadius;

	// Portions of the tile / halo that actually lie inside the image row.
	const int tileLastInImage = min(tileLast, dataW - 1);
	const int haloFirstInImage = max(haloFirst, 0);
	const int haloLastInImage = min(haloLast, dataW - 1);

	// Loading starts convKernelRadiusAligned pixels left of the tile; threads
	// that fall left of the real halo simply skip the load.
	const int srcX = (tileFirst - convKernelRadiusAligned) + threadIdx.x;
	if (srcX >= haloFirst)
	{
		float4 loaded;
		loaded.x = 0; loaded.y = 0; loaded.z = 0; loaded.w = 0;
		if ((srcX >= haloFirstInImage) && (srcX <= haloLastInImage))
			loaded = d_Data[rowOffset + srcX];
		data[srcX - haloFirst] = loaded;
	}

	// Every thread reaches the barrier, whether or not it loaded a pixel.
	__syncthreads();

	const int dstX = tileFirst + threadIdx.x;
	if (dstX <= tileLastInImage)
	{
		float4 *center = data + (dstX - haloFirst);
		d_Result[rowOffset + dstX] = convolutionRow<2 * convKernelRadius>(center);
	}
}
// Vertical filter pass for single-channel images, fused with the conversion to
// gradient magnitude / orientation.
//
//   d_Result   : per pixel, x = gradient magnitude, y = atan2f(vertical, horizontal)
//                converted to degrees
//   d_Data     : input image (filtered vertically here)
//   d_DataRow  : output of the row pass for the same image
//   dataW/H    : image size in pixels
//   smemStride : shared-memory step between two rows handled by the same thread
//                (convColumnTileWidth * blockDim.y)
//   gmemStride : global-memory step for the same (dataW * blockDim.y)
//
// Launch layout (see SetConvolutionSize): blocks of convColumnTileWidth x N threads,
// one block per convColumnTileWidth x convColumnTileHeight tile. Rows outside
// [0, dataH) are treated as zero.
//
// The grid is rounded up in x, so when dataW is not a multiple of
// convColumnTileWidth the last block column contains threads whose column lies
// past the image. Those threads must neither read nor write global memory:
// their linear index would alias the start of the next row (racing with the
// thread that legitimately owns that pixel) or run past the buffer on the last row.
__global__ void convolutionColumnGPU1to2 ( float2 *d_Result, float1 *d_Data, float1 *d_DataRow, int dataW, int dataH, int smemStride, int gmemStride)
{
	float1 rowValue;
	float1 zero; zero.x = 0;
	float2 result;

	const int columnStart = IMUL(blockIdx.x, convColumnTileWidth) + threadIdx.x;
	const bool columnInImage = columnStart < dataW;

	__shared__ float1 data[convColumnTileWidth * (convKernelRadius + convColumnTileHeight + convKernelRadius)];

	const int tileStart = IMUL(blockIdx.y, convColumnTileHeight);
	const int tileEnd = tileStart + convColumnTileHeight - 1;
	const int apronStart = tileStart - convKernelRadius;
	const int apronEnd = tileEnd   + convKernelRadius;

	const int tileEndClamped = min(tileEnd, dataH - 1);
	const int apronStartClamped = max(apronStart, 0);
	const int apronEndClamped = min(apronEnd, dataH - 1);

	int smemPos = IMUL(threadIdx.y, convColumnTileWidth) + threadIdx.x;
	int gmemPos = IMUL(apronStart + threadIdx.y, dataW) + columnStart;

	// Stage the tile plus its vertical halo; out-of-image pixels become zero.
	for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y)
	{
		data[smemPos] = (columnInImage && (y >= apronStartClamped) && (y <= apronEndClamped)) ?  d_Data[gmemPos] : zero;
		smemPos += smemStride;
		gmemPos += gmemStride;
	}

	// All threads (including out-of-image ones) must reach the barrier.
	__syncthreads();

	// Nothing to produce for columns past the right image edge.
	if (!columnInImage)
		return;

	smemPos = IMUL(threadIdx.y + convKernelRadius, convColumnTileWidth) + threadIdx.x;
	gmemPos = IMUL(tileStart + threadIdx.y , dataW) + columnStart;

	for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y)
	{
		float1 sum = convolutionColumn<2 * convKernelRadius>(data + smemPos);
		rowValue = d_DataRow[gmemPos];

		result.x = sqrtf(sum.x * sum.x + rowValue.x * rowValue.x);
		result.y = atan2f(sum.x, rowValue.x) * RADTODEG;

		d_Result[gmemPos] = result;
		smemPos += smemStride;
		gmemPos += gmemStride;
	}
}
193 | 
// Vertical filter pass for four-channel images, fused with the conversion to
// gradient magnitude / orientation.
//
// For every pixel the x, y and z channels are examined (w is ignored) and the
// channel with the largest gradient magnitude wins. The result holds that
// magnitude in .x and, in .y, the orientation in whole degrees folded into
// [0, 180).
//
//   d_Data     : input image (filtered vertically here)
//   d_DataRow  : output of the row pass for the same image
//   dataW/H    : image size in pixels
//   smemStride : shared-memory step between two rows handled by the same thread
//                (convColumnTileWidth * blockDim.y)
//   gmemStride : global-memory step for the same (dataW * blockDim.y)
//
// Launch layout (see SetConvolutionSize): blocks of convColumnTileWidth x N threads,
// one block per convColumnTileWidth x convColumnTileHeight tile. Rows outside
// [0, dataH) are treated as zero.
//
// The grid is rounded up in x, so when dataW is not a multiple of
// convColumnTileWidth the last block column contains threads whose column lies
// past the image. Those threads must neither read nor write global memory:
// their linear index would alias the start of the next row (racing with the
// thread that legitimately owns that pixel) or run past the buffer on the last row.
__global__ void convolutionColumnGPU4to2 ( float2 *d_Result, float4 *d_Data, float4 *d_DataRow, int dataW, int dataH, int smemStride, int gmemStride)
{
	float3 mag1, mag2, mag3;
	float3 max34, magMax;
	float2 result;
	float4 rowValue;
	float4 zero; zero.x = 0; zero.y = 0; zero.z = 0; zero.w = 0;

	const int columnStart = IMUL(blockIdx.x, convColumnTileWidth) + threadIdx.x;
	const bool columnInImage = columnStart < dataW;

	__shared__ float4 data[convColumnTileWidth * (convKernelRadius + convColumnTileHeight + convKernelRadius)];

	const int tileStart = IMUL(blockIdx.y, convColumnTileHeight);
	const int tileEnd = tileStart + convColumnTileHeight - 1;
	const int apronStart = tileStart - convKernelRadius;
	const int apronEnd = tileEnd   + convKernelRadius;

	const int tileEndClamped = min(tileEnd, dataH - 1);
	const int apronStartClamped = max(apronStart, 0);
	const int apronEndClamped = min(apronEnd, dataH - 1);

	int smemPos = IMUL(threadIdx.y, convColumnTileWidth) + threadIdx.x;
	int gmemPos = IMUL(apronStart + threadIdx.y, dataW) + columnStart;

	// Stage the tile plus its vertical halo; out-of-image pixels become zero.
	for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y)
	{
		data[smemPos] = (columnInImage && (y >= apronStartClamped) && (y <= apronEndClamped)) ?  d_Data[gmemPos] : zero;
		smemPos += smemStride;
		gmemPos += gmemStride;
	}

	// All threads (including out-of-image ones) must reach the barrier.
	__syncthreads();

	// Nothing to produce for columns past the right image edge.
	if (!columnInImage)
		return;

	smemPos = IMUL(threadIdx.y + convKernelRadius, convColumnTileWidth) + threadIdx.x;
	gmemPos = IMUL(tileStart + threadIdx.y , dataW) + columnStart;

	for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y)
	{
		float4 sum = convolutionColumn<2 * convKernelRadius>(data + smemPos);
		rowValue = d_DataRow[gmemPos];

		// Per channel: (magnitude, vertical derivative, horizontal derivative).
		mag1.x = sqrtf(sum.x * sum.x + rowValue.x * rowValue.x); mag1.y = sum.x; mag1.z = rowValue.x;
		mag2.x = sqrtf(sum.y * sum.y + rowValue.y * rowValue.y); mag2.y = sum.y; mag2.z = rowValue.y;
		mag3.x = sqrtf(sum.z * sum.z + rowValue.z * rowValue.z); mag3.y = sum.z; mag3.z = rowValue.z;

		// Keep the channel with the strongest gradient.
		max34 = (mag2.x > mag3.x) ? mag2 : mag3;
		magMax = (mag1.x > max34.x) ? mag1 : max34;

		result.x = magMax.x;
		result.y = atan2f(magMax.y, magMax.z);
		result.y = result.y * 180 / PI + 180;
		result.y = int(result.y) % 180; //TODO-> if semicerc

		d_Result[gmemPos] = result;
		smemPos += smemStride;
		gmemPos += gmemStride;
	}
}
// One-time setup of the gradient stage.
//
// Uploads the 3-tap derivative filter [1, 0, -1] to constant memory and
// allocates the device buffer that receives the row-pass output:
// convBuffer1 when useGrayscale is true, convBuffer4 otherwise.
// width/height give the size in pixels of that buffer.
// Terminates the program if the host-side filter copy cannot be allocated.
__host__ void InitConvolution(int width, int height, bool useGrayscale)
{
	convUseGrayscale = useGrayscale;

	h_Kernel = (float *)malloc(convKernelSize);
	if (h_Kernel == NULL)
	{
		// Writing the taps below would dereference a null pointer.
		fprintf(stderr, "InitConvolution: could not allocate %d bytes for the filter taps\n", convKernelSize);
		exit(EXIT_FAILURE);
	}
	h_Kernel[0] = 1.0f; h_Kernel[1] = 0;  h_Kernel[2] = -1.0f;

	cutilSafeCall( cudaMemcpyToSymbol(d_Kernel, h_Kernel, convKernelSize) );

	if (useGrayscale)
		cutilSafeCall(cudaMalloc((void**) &convBuffer1, sizeof(float1) * width * height));
	else
		cutilSafeCall(cudaMalloc((void**) &convBuffer4, sizeof(float4) * width * height));
}
267 | 
// Records the size of the image about to be processed and derives the launch
// configuration of both filter passes from it.
__host__ void SetConvolutionSize(int width, int height)
{
	convWidth = width;
	convHeight = height;

	// Row pass: one block per (tile of convRowTileWidth pixels, image row).
	blockGridRows = dim3(1, 1, 1);
	blockGridRows.x = iDivUp(convWidth, convRowTileWidth);
	blockGridRows.y = convHeight;

	// Column pass: one block per convColumnTileWidth x convColumnTileHeight tile.
	blockGridColumns = dim3(1, 1, 1);
	blockGridColumns.x = iDivUp(convWidth, convColumnTileWidth);
	blockGridColumns.y = iDivUp(convHeight, convColumnTileHeight);

	// Row pass: 1-D block covering the aligned left padding, the tile and the right halo.
	threadBlockRows = dim3(1, 1, 1);
	threadBlockRows.x = convKernelRadiusAligned + convRowTileWidth + convKernelRadius;

	// Column pass: one thread per tile column, 8 threads striding down the rows.
	threadBlockColumns = dim3(1, 1, 1);
	threadBlockColumns.x = convColumnTileWidth;
	threadBlockColumns.y = 8;
}
// Releases what InitConvolution() allocated: the device buffer matching the
// mode chosen at init time, and the host copy of the filter taps.
__host__ void CloseConvolution()
{
	void *rowPassBuffer = convUseGrayscale ? (void *)convBuffer1 : (void *)convBuffer4;
	cutilSafeCall(cudaFree(rowPassBuffer));

	free(h_Kernel);
}
// Computes gradient magnitude/orientation for a single-channel device image.
//
// Runs the row pass into convBuffer1, then the column pass, which combines both
// derivatives and writes one float2 per pixel to outputImage. Uses the size and
// launch configuration last set by SetConvolutionSize().
// Both pointers must refer to device memory.
__host__ void ComputeColorGradients1to2(float1* inputImage, float2* outputImage)
{
	convolutionRowGPU1<<<blockGridRows, threadBlockRows>>>(convBuffer1, inputImage, convWidth, convHeight);
	// A kernel launch returns nothing; a bad configuration is only visible here.
	cutilSafeCall(cudaGetLastError());

	convolutionColumnGPU1to2<<<blockGridColumns, threadBlockColumns>>>(outputImage, inputImage, convBuffer1, convWidth, convHeight,
		convColumnTileWidth * threadBlockColumns.y, convWidth * threadBlockColumns.y);
	cutilSafeCall(cudaGetLastError());
}
293 | 
// Computes gradient magnitude/orientation for a four-channel device image.
//
// Runs the row pass into convBuffer4, then the column pass, which picks the
// strongest channel per pixel and writes one float2 per pixel to outputImage.
// Uses the size and launch configuration last set by SetConvolutionSize().
// Both pointers must refer to device memory.
__host__ void ComputeColorGradients4to2(float4* inputImage, float2* outputImage)
{
	convolutionRowGPU4<<<blockGridRows, threadBlockRows>>>(convBuffer4, inputImage, convWidth, convHeight);
	// A kernel launch returns nothing; a bad configuration is only visible here.
	cutilSafeCall(cudaGetLastError());

	convolutionColumnGPU4to2<<<blockGridColumns, threadBlockColumns>>>(outputImage, inputImage, convBuffer4, convWidth, convHeight,
		convColumnTileWidth * threadBlockColumns.y, convWidth * threadBlockColumns.y);
	cutilSafeCall(cudaGetLastError());
}
300 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGConvolution.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HOG_CONVOLUTION__
 2 | #define __HOG_CONVOLUTION__
 3 | 
 4 | #include <stdlib.h>
 5 | #include <stdio.h>
 6 | #include <math.h>
 7 | 
 8 | #ifdef _WIN32
 9 | #  define WINDOWS_LEAN_AND_MEAN
10 | #  include <windows.h>
11 | #endif
12 | 
13 | #include <cuda_gl_interop.h>
14 | #include <cuda.h>
15 | 
16 | #include "HOGDefines.h"
17 | 
// Lifetime management of the gradient stage.
__host__ void InitConvolution(int width, int height, bool useGrayscale);
__host__ void SetConvolutionSize(int width, int height);
__host__ void CloseConvolution();

// Gradient computation entry points: single-channel and four-channel input,
// both producing one float2 (magnitude, orientation) per pixel.
__host__ void ComputeColorGradients1to2(float1* inputImage, float2* outputImage);
__host__ void ComputeColorGradients4to2(float4* inputImage, float2* outputImage);

// Row (horizontal) filter kernels.
__global__ void convolutionRowGPU1(float1 *d_Result, float1 *d_Data, int dataW, int dataH);
__global__ void convolutionRowGPU4(float4 *d_Result, float4 *d_Data, int dataW, int dataH);

// Column (vertical) filter kernels fused with the magnitude/orientation step.
// Both write float2 results; the signatures must match the definitions in
// HOGConvolution.cu, otherwise a separate, never-defined overload is declared.
__global__ void convolutionColumnGPU1to2 ( float2 *d_Result, float1 *d_Data, float1 *d_DataRow, int dataW, int dataH, int smemStride, int gmemStride);
__global__ void convolutionColumnGPU4to2 ( float2 *d_Result, float4 *d_Data, float4 *d_DataRow, int dataW, int dataH, int smemStride, int gmemStride);
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGConvolution.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z24convolutionColumnGPU1to2P6float2P6float1S2_iiii,_Z18convolutionRowGPU1P6float1S0_ii,_Z18convolutionRowGPU4P6float4S0_ii,_Z24convolutionColumnGPU4to2P6float2P6float4S2_iiii


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGDefines.h:
--------------------------------------------------------------------------------
  1 | #ifndef __HOG_DEFINES__
  2 | #define __HOG_DEFINES__
  3 | 
// Feature switch; the code that tests it is outside this header
// (presumably selects hand-unrolled loop variants - TODO confirm).
#define UNROLL_LOOPS

// MSVC only: link against the CUDA driver/runtime and cutil libraries.
// NOTE(review): the install paths are hard-coded.
#ifdef _WIN32
	#pragma comment( lib, "C:\\CUDA\\lib\\cuda.lib" )
	#pragma comment( lib, "C:\\CUDA\\lib\\cudart.lib" )
	#pragma comment( lib, "C:\\CUDA\\SDK\\common\\lib\\cutil32.lib" )
#endif

// Overridable element types. Each macro can be pre-defined by the build to
// substitute a different type.
#ifndef CUDA_PIXEL
#define CUDA_PIXEL unsigned char
#endif

#ifndef CUDA_FLOAT
#define CUDA_FLOAT float
#endif

#ifndef CUDA_DT_PIXEL
#define CUDA_DT_PIXEL float
#endif

#ifndef CUDA_DT_PIXEL_INT
#define CUDA_DT_PIXEL_INT int
#endif

// Overridable launch-geometry constants; their users are outside this header.
#ifndef THREAD_SIZE_W
#define THREAD_SIZE_W 16
#endif

#ifndef THREAD_SIZE_H
#define THREAD_SIZE_H 16
#endif

#ifndef BLOCK_SIZE_H
#define BLOCK_SIZE_H 16
#endif

#ifndef BLOCK_SIZE_W
#define BLOCK_SIZE_W 16
#endif

// Overridable limits on the HOG descriptor layout
// (presumably used to size fixed device-side arrays - TODO confirm).
#ifndef MAX_HISTOGRAM_NO_BINS
#define MAX_HISTOGRAM_NO_BINS 9
#endif

#ifndef MAX_CELL_SIZE_Y
#define MAX_CELL_SIZE_Y 8
#endif

#ifndef MAX_CELL_SIZE_X
#define MAX_CELL_SIZE_X 8
#endif

#ifndef MAX_BLOCK_SIZE_X
#define MAX_BLOCK_SIZE_X 2
#endif

#ifndef MAX_BLOCK_SIZE_Y
#define MAX_BLOCK_SIZE_Y 2
#endif

#ifndef MAX_BLOCKS_PER_WINDOW_X
#define MAX_BLOCKS_PER_WINDOW_X 7
#endif

#ifndef MAX_BLOCKS_PER_WINDOW_Y
#define MAX_BLOCKS_PER_WINDOW_Y 15
#endif
 71 | 
 72 | #ifndef EXECUTYIN512THREADS
 73 | #define EXECUTYIN512THREADS(counter, startPoint, func, params) \
 74 | 	startPoint = 0;\
 75 | 	if (counter / 512 > 0) \
 76 | 	{ \
 77 | 		while (counter / 512 > 0) \
 78 | 		{ \
 79 | 		func<<<1, 512>>> ## params; \
 80 | 			startPoint += 512; \
 81 | 			counter -= 512; \
 82 | 		} \
 83 | 		if (counter != 0) \
 84 | 		func<<<1, counter>>> ## params; \
 85 | 	} \
 86 | 	else \
 87 | 	func<<<1, counter>>> ## params;
 88 | #endif
 89 | 
// Number of threads per warp assumed by the kernels.
#ifndef WARP_SIZE
#define WARP_SIZE 32
#endif

// Upper bound on the number of blocks per grid dimension used when sizing launches.
// NOTE(review): confirm against the grid-size limit of the targeted devices.
#ifndef MAX_BLOCKS_PER_DIM
#define MAX_BLOCKS_PER_DIM	65536
#endif

// Integer multiply via the CUDA __mul24 intrinsic, which only uses the low
// 24 bits of each operand; operands must stay within that range.
#ifndef IMUL
#define IMUL(a, b) __mul24(a, b)
#endif

// Angle constants. These literals have no `f` suffix, so they are doubles and
// promote the expressions they appear in to double precision.
#ifndef PI
#define PI 3.1415926535897932384626433832795
#endif

#ifndef DEGTORAD
#define DEGTORAD 0.017453292519943295769236907684886
#endif

#ifndef RADTODEG
#define RADTODEG 57.2957795
#endif
113 | 
114 | #endif
115 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGEngine.cpp:
--------------------------------------------------------------------------------
  1 | #include "HOGEngine.h"
  2 | #include "HOGNMS.h"
  3 | 
  4 | #include "HOGDefines.h"
  5 | 
  6 | #include <stdlib.h>
  7 | #include <math.h>
  8 | #include <string.h>
  9 | #include <cstdio>
 10 | 
 11 | using namespace HOG;
 12 | 
 13 | HOGEngine* HOGEngine::instance;
 14 | 
// C-linkage entry points implemented outside this file
// (presumably in HOGEngineDevice.cu - TODO confirm).

// Sets up the pipeline for images of the given size with the given
// HOG layout and linear SVM model.
extern "C" void InitHOG(int width, int height, int avSizeX, int avSizeY,
								 int marginX, int marginY, int cellSizeX, int cellSizeY,
								 int blockSizeX, int blockSizeY, int windowSizeX, int windowSizeY,
								 int noOfHistogramBins, float wtscale, float svmBias, float* svmWeights,
								 int svmWeightsCount, bool useGrayscale);

// Tears the pipeline down again.
extern "C" void CloseHOG();

// Starts processing of hostImage restricted to the region [minx, maxx) x [miny, maxy)
// and the scale range [minScale, maxScale]; EndHOGProcessing returns the score buffer.
// NOTE(review): whether the region bounds are inclusive is not visible here.
extern "C" void BeginHOGProcessing(unsigned char* hostImage, int minx, int miny, int maxx, int maxy, float minScale, float maxScale);
extern "C" float* EndHOGProcessing();

// Copies one of the intermediate images (selected by imageType) into hostImage.
extern "C" void GetProcessedImage(unsigned char* hostImage, int imageType);
// Reports the geometry the last run actually used (scales, padding, cell/block/window counts).
extern "C" void GetHOGParameters(float *cStartScale, float *cEndScale, float *cScaleRatio, int *cScaleCount,
										   int *cPaddingSizeX, int *cPaddingSizeY, int *cPaddedWidth, int *cPaddedHeight,
										   int *cNoOfCellsX, int *cNoOfCellsY, int *cNoOfBlocksX, int *cNoOfBlocksY,
										   int *cNumberOfWindowsX, int *cNumberOfWindowsY,
										   int *cNumberOfBlockPerWindowX, int *cNumberOfBlockPerWindowY);
 32 | 
 33 | int HOGEngine::iDivUpF(int a, float b) { return (a % int(b) != 0) ? int(a / b + 1) : int(a / b);}
 34 | 
 35 | void HOGEngine::InitializeHOG(int iw, int ih, std::string fileName)
 36 | {
 37 | 	this->imageWidth = iw;
 38 | 	this->imageHeight = ih;
 39 | 
 40 | 	this->avSizeX = 0;
 41 | 	this->avSizeY = 0;
 42 | 	this->marginX = 0;
 43 | 	this->marginY = 0;
 44 | 
 45 | 	this->hCellSizeX = 4; // 8
 46 | 	this->hCellSizeY = 4; // 8
 47 | 	this->hBlockSizeX = 2;
 48 | 	this->hBlockSizeY = 2;
 49 | 	this->hWindowSizeX = 24; //64
 50 | 	this->hWindowSizeY = 24; //128
 51 | 	this->hNoOfHistogramBins = 9;
 52 | 
 53 | 	this->wtScale = 2.0f;
 54 | 
 55 | 	this->useGrayscale = false;
 56 | 
 57 | 	this->readSVMFromFile(fileName);
 58 | 
 59 | 	this->formattedResultsAvailable = false;
 60 | 
 61 | 	nmsProcessor = new HOGNMS();
 62 | 
 63 | 	InitHOG(iw, ih, avSizeX, avSizeY, marginX, marginY, hCellSizeX, hCellSizeY, hBlockSizeX, hBlockSizeY,
 64 | 		hWindowSizeX, hWindowSizeY, hNoOfHistogramBins, wtScale, svmBias, svmWeights, svmWeightsCount, useGrayscale);
 65 | }
 66 | 
 67 | void HOGEngine::InitializeHOG(int iw, int ih, float svmBias, float* svmWeights, int svmWeightsCount)
 68 | {
 69 | 	this->imageWidth = iw;
 70 | 	this->imageHeight = ih;
 71 | 
 72 | 	this->avSizeX = 48; //48
 73 | 	this->avSizeY = 96; //96
 74 | 	this->marginX = 4; // 4
 75 | 	this->marginY = 4; // 4
 76 | 
 77 | 	this->hCellSizeX = 8;
 78 | 	this->hCellSizeY = 8;
 79 | 	this->hBlockSizeX = 2;
 80 | 	this->hBlockSizeY = 2;
 81 | 	this->hWindowSizeX = 64;
 82 | 	this->hWindowSizeY = 128;
 83 | 	this->hNoOfHistogramBins = 9;
 84 | 
 85 | 	this->svmWeightsCount = svmWeightsCount;
 86 | 	this->svmBias = svmBias;
 87 | 	this->svmWeights = svmWeights;
 88 | 
 89 | 	this->wtScale = 2.0f;
 90 | 
 91 | 	this->useGrayscale = false;
 92 | 
 93 | 	this->formattedResultsAvailable = false;
 94 | 
 95 | 	nmsProcessor = new HOGNMS();
 96 | 
 97 | 	InitHOG(iw, ih, avSizeX, avSizeY, marginX, marginY, hCellSizeX, hCellSizeY, hBlockSizeX, hBlockSizeY,
 98 | 		hWindowSizeX, hWindowSizeY, hNoOfHistogramBins, wtScale, svmBias, svmWeights, svmWeightsCount, useGrayscale);
 99 | }
100 | 
101 | 
102 | void HOGEngine::readSVMFromFile(std::string modelfile)
103 | {
104 | 	double linearbias_, *linearwt_;
105 | 
106 |     FILE *modelfl;
107 | #ifdef _WIN32
108 |     if ((fopen_s (&modelfl, modelfile.c_str(), "rb")) != 0)
109 |     { printf("File not found!\n"); exit(1); }
110 | #else
111 |     if ((modelfl = fopen (modelfile.c_str(), "rb")) == NULL)
112 |     { printf("File not found!\n"); exit(1); }
113 | #endif
114 |     char version_buffer[10];
115 |     if (!fread (&version_buffer,sizeof(char),10,modelfl))
116 |     { printf("Wrong file version!\n"); exit(1); }
117 | 
118 |     if(strcmp(version_buffer,"V6.01")) {
119 |     	printf("Wrong file version!\n"); exit(1);
120 |     }
121 |     /* read version number */
122 |     int version = 0;
123 |     if (!fread (&version,sizeof(int),1,modelfl))
124 |     { printf("Wrong file version!\n"); exit(1); }
125 |     if (version < 200)
126 |     { printf("Wrong file version!\n"); exit(1); }
127 | 
128 |     long long kernel_type;
129 |     fread(&(kernel_type),sizeof(long long),1,modelfl);
130 | 
131 |     {// ignore these
132 |         long long poly_degree;
133 |         fread(&(poly_degree),sizeof(long long),1,modelfl);
134 | 
135 |         double rbf_gamma;
136 |         fread(&(rbf_gamma),sizeof(double),1,modelfl);
137 | 
138 |         double  coef_lin;
139 |         fread(&(coef_lin),sizeof(double),1,modelfl);
140 |         double coef_const;
141 |         fread(&(coef_const),sizeof(double),1,modelfl);
142 | 
143 |         long long l;
144 |         fread(&l,sizeof(long long),1,modelfl);
145 |         char* custom = new char[(unsigned int)l];
146 |         fread(custom,sizeof(char),(size_t)l,modelfl);
147 |         delete[] custom;
148 |     }
149 | 
150 |     long long totwords;
151 |     fread(&(totwords),sizeof(long long),1,modelfl);
152 | 
153 |     {// ignore these
154 |         long long totdoc;
155 |         fread(&(totdoc),sizeof(long long),1,modelfl);
156 | 
157 |         long long sv_num;
158 |         fread(&(sv_num), sizeof(long long),1,modelfl);
159 |     }
160 | 
161 |     fread(&linearbias_, sizeof(double),1,modelfl);
162 | 
163 |     if(kernel_type == 0) { /* linear kernel */
164 |         /* save linear wts also */
165 |         linearwt_ = new double[(unsigned int)totwords+1];
166 | 		svmWeightsCount = (int) totwords;
167 |         fread(linearwt_, sizeof(double),(size_t)totwords+1,modelfl);
168 |     } else {
169 |         exit(1);
170 |     }
171 | 
172 | 	svmWeights = new float[svmWeightsCount+1];
173 | 	for (int i=0; i<svmWeightsCount; i++)
174 | 		svmWeights[i] = (float) linearwt_[i];
175 | 
176 | 	svmBias = (float)linearbias_;
177 | 
178 | 	fclose(modelfl);
179 | 
180 | 	delete linearwt_;
181 | }
182 | 
183 | void HOGEngine::FinalizeHOG()
184 | {
185 | 	delete nmsProcessor;
186 | 
187 | 	CloseHOG();
188 | }
189 | 
190 | void HOGEngine::BeginProcess(HOGImage* hostImage,
191 | 		int _minx, int _miny, int _maxx, int _maxy, float minScale, float maxScale)
192 | {
193 | 	minX = _minx, minY = _miny, maxX = _maxx, maxY = _maxy;
194 | 
195 | 	if (minY == -1 && minY == -1 && maxX == -1 && maxY == -1)
196 | 	{
197 | 		minX = 0;
198 | 		minY = 0;
199 | 		maxX = imageWidth;
200 | 		maxY = imageHeight;
201 | 	}
202 | 
203 | 	BeginHOGProcessing(hostImage->pixels, minX, minY, maxX, maxY, minScale, maxScale);
204 | }
205 | 
206 | void HOGEngine::EndProcess()
207 | {
208 | 	cppResult = EndHOGProcessing();
209 | 
210 | 	GetHOGParameters(&startScale, &endScale, &scaleRatio, &scaleCount,
211 | 		&hPaddingSizeX, &hPaddingSizeY, &hPaddedWidth, &hPaddedHeight,
212 | 		&hNoOfCellsX, &hNoOfCellsY, &hNoOfBlocksX, &hNoOfBlocksY, &hNumberOfWindowsX,
213 | 		&hNumberOfWindowsY, &hNumberOfBlockPerWindowX, &hNumberOfBlockPerWindowY);
214 | 
215 | 	ComputeFormattedResults();
216 | 
217 | 	nmsResults = nmsProcessor->ComputeNMSResults(formattedResults, formattedResultsCount, &nmsResultsAvailable, &nmsResultsCount,
218 | 		hWindowSizeX, hWindowSizeY);
219 | }
220 | 
221 | void HOGEngine::GetImage(HOGImage *imageCUDA, ImageType imageType)
222 | {
223 | 	switch (imageType)
224 | 	{
225 | 	case IMAGE_RESIZED:
226 | 		GetProcessedImage(imageCUDA->pixels, 0);
227 | 		break;
228 | 	case IMAGE_COLOR_GRADIENTS:
229 | 		GetProcessedImage(imageCUDA->pixels, 1);
230 | 		break;
231 | 	case IMAGE_GRADIENT_ORIENTATIONS:
232 | 		GetProcessedImage(imageCUDA->pixels, 2);
233 | 		break;
234 | 	case IMAGE_PADDED:
235 | 		GetProcessedImage(imageCUDA->pixels, 3);
236 | 		break;
237 | 	case IMAGE_ROI:
238 | 		GetProcessedImage(imageCUDA->pixels, 4);
239 | 		break;
240 | 	}
241 | }
242 | 
243 | void HOGEngine::SaveResultsToDisk(char* fileName)
244 | {
245 | 	FILE* f; 
246 | #ifdef _WIN32
247 | 	fopen_s(&f, fileName, "w+");
248 | #else
249 | 	f = fopen(fileName, "w+");
250 | #endif
251 | 	fprintf(f, "%d\n", formattedResultsCount);
252 | 	for (int i=0; i<formattedResultsCount; i++)
253 | 	{
254 | 		fprintf(f, "%f %f %d %d %d %d %d %d\n",
255 | 			formattedResults[i].scale, formattedResults[i].score,
256 | 			formattedResults[i].width, formattedResults[i].height,
257 | 			formattedResults[i].x, formattedResults[i].y,
258 | 			formattedResults[i].origX, formattedResults[i].origY);
259 | 	}
260 | 	fclose(f);
261 | }
262 | 
263 | void HOGEngine::ComputeFormattedResults()
264 | {
265 | 	int i, j, k, resultId;
266 | 	int leftoverX, leftoverY, currentWidth, currentHeight, rNumberOfWindowsX, rNumberOfWindowsY;
267 | 
268 | 	resultId = 0;
269 | 	formattedResultsCount = 0;
270 | 
271 | 	float* currentScaleWOffset;
272 | 	float currentScale = startScale;
273 | 
274 | 	for (i=0; i<scaleCount; i++)
275 | 	{
276 | 		currentScaleWOffset = cppResult + i * hNumberOfWindowsX * hNumberOfWindowsY;
277 | 
278 | 		for (j = 0; j < hNumberOfWindowsY; j++)
279 | 		{
280 | 			for (k = 0; k < hNumberOfWindowsX; k++)
281 | 			{
282 | 				float score = currentScaleWOffset[k + j * hNumberOfWindowsX];
283 | 				if (score > 0)
284 | 					formattedResultsCount++;
285 | 			}
286 | 		}
287 | 	}
288 | 
289 | 	if (formattedResultsAvailable) delete formattedResults;
290 | 	formattedResults = new HOGResult[formattedResultsCount];
291 | 
292 | 	for (i=0; i<scaleCount; i++)
293 | 	{
294 | 		currentScaleWOffset = cppResult + i * hNumberOfWindowsX * hNumberOfWindowsY;
295 | 
296 | 		for (j=0; j<hNumberOfWindowsY; j++)
297 | 		{
298 | 			for (k=0; k<hNumberOfWindowsX; k++)
299 | 			{
300 | 				float score = currentScaleWOffset[k + j * hNumberOfWindowsX];
301 | 				if (score > 0)
302 | 				{
303 | 					HOGResult hogResult;
304 | 
305 | 					currentWidth = iDivUpF(hPaddedWidth, currentScale);
306 | 					currentHeight = iDivUpF(hPaddedHeight, currentScale);
307 | 
308 | 					rNumberOfWindowsX = (currentWidth - hWindowSizeX) / hCellSizeX + 1;
309 | 					rNumberOfWindowsY = (currentHeight - hWindowSizeY) / hCellSizeY + 1;
310 | 
311 | 					leftoverX = (currentWidth - hWindowSizeX - hCellSizeX * (rNumberOfWindowsX - 1)) / 2;
312 | 					leftoverY = (currentHeight - hWindowSizeY - hCellSizeY * (rNumberOfWindowsY - 1)) / 2;
313 | 
314 | 					hogResult.origX = k * hCellSizeX + leftoverX;
315 | 					hogResult.origY = j * hCellSizeY + leftoverY;
316 | 
317 | 					hogResult.width = (int)floorf((float)hWindowSizeX * currentScale);
318 | 					hogResult.height = (int)floorf((float)hWindowSizeY * currentScale);
319 | 
320 | 					hogResult.x = (int)ceilf(currentScale * (hogResult.origX + hWindowSizeX / 2) - (float) hWindowSizeX * currentScale / 2) - hPaddingSizeX + minX;
321 | 					hogResult.y = (int)ceilf(currentScale * (hogResult.origY + hWindowSizeY / 2) - (float) hWindowSizeY * currentScale / 2) - hPaddingSizeY + minY;
322 | 
323 | 					hogResult.scale = currentScale;
324 | 					hogResult.score = score;
325 | 
326 | 					formattedResults[resultId] = hogResult;
327 | 					resultId++;
328 | 				}
329 | 			}
330 | 		}
331 | 
332 | 		currentScale = currentScale * scaleRatio;
333 | 	}
334 | }
335 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGEngine.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HOG_ENGINE__
 2 | #define __HOG_ENGINE__
 3 | 
 4 | #include "HOGResult.h"
 5 | #include "HOGNMS.h"
 6 | #include "HOGImage.h"
 7 | 
 8 | #include <string>
 9 | 
10 | using namespace std;
11 | 
12 | namespace HOG
13 | {
14 | 	class HOGEngine
15 | 	{
16 | 	private:
17 | 		static HOGEngine* instance;
18 | 
19 | 		int iDivUpF(int a, float b);
20 | 
21 | 		HOGNMS* nmsProcessor;
22 | 		void readSVMFromFile(std::string fileName);
23 | 
24 | 	public:
25 | 		int imageWidth, imageHeight;
26 | 
27 | 		int avSizeX, avSizeY, marginX, marginY;
28 | 
29 | 		int scaleCount;
30 | 		int hCellSizeX, hCellSizeY;
31 | 		int hBlockSizeX, hBlockSizeY;
32 | 		int hWindowSizeX, hWindowSizeY;
33 | 		int hNoOfHistogramBins;
34 | 		int hPaddedWidth, hPaddedHeight;
35 | 		int hPaddingSizeX, hPaddingSizeY;
36 | 
37 | 		int minX, minY, maxX, maxY;
38 | 
39 | 		float wtScale;
40 | 
41 | 		float startScale, endScale, scaleRatio;
42 | 
43 | 		int svmWeightsCount;
44 | 		float svmBias, *svmWeights;
45 | 
46 | 		int hNoOfCellsX, hNoOfCellsY;
47 | 		int hNoOfBlocksX, hNoOfBlocksY;
48 | 		int hNumberOfWindowsX, hNumberOfWindowsY;
49 | 		int hNumberOfBlockPerWindowX, hNumberOfBlockPerWindowY;
50 | 
51 | 		bool useGrayscale;
52 | 
53 | 		float* cppResult;
54 | 
55 | 		HOGResult* formattedResults;
56 | 		HOGResult* nmsResults;
57 | 
58 | 		bool formattedResultsAvailable;
59 | 		int formattedResultsCount;
60 | 
61 | 		bool nmsResultsAvailable;
62 | 		int nmsResultsCount;
63 | 
64 | 		enum ImageType
65 | 		{
66 | 			IMAGE_RESIZED,
67 | 			IMAGE_COLOR_GRADIENTS,
68 | 			IMAGE_GRADIENT_ORIENTATIONS,
69 | 			IMAGE_PADDED,
70 | 			IMAGE_ROI
71 | 		};
72 | 
73 | 		static HOGEngine* Instance(void) {
74 | 			if (instance == NULL) instance = new HOGEngine();
75 | 			return instance;
76 | 		}
77 | 
78 | 		void InitializeHOG(int iw, int ih, float svmBias, float* svmWeights, int svmWeightsCount);
79 | 		void InitializeHOG(int iw, int ih, std::string fileName);
80 | 
81 | 		void FinalizeHOG();
82 | 
83 | 		void BeginProcess(HOGImage* hostImage, int _minx = -1, int _miny = -1, int _maxx = -1, int _maxy = -1,
84 | 			float minScale = -1.0f, float maxScale = -1.0f);
85 | 		void EndProcess();
86 | 		void GetImage(HOGImage *imageCUDA, ImageType imageType);
87 | 
88 | 		void ComputeFormattedResults();
89 | 
90 | 		void SaveResultsToDisk(char* fileName);
91 | 
92 | 		HOGEngine(void) { }
93 | 		~HOGEngine(void) { }
94 | 	};
95 | }
96 | 
97 | #endif
98 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGEngineDevice.cu:
--------------------------------------------------------------------------------
  1 | #include "HOGEngineDevice.h"
  2 | #include "HOGUtils.h"
  3 | #include "HOGConvolution.h"
  4 | #include "HOGHistogram.h"
  5 | #include "HOGSVMSlider.h"
  6 | #include "HOGScale.h"
  7 | #include "HOGPadding.h"
  8 | #include "cutil.h"
  9 | 
 10 | int hWidth, hHeight;
 11 | int hWidthROI, hHeightROI;
 12 | int hPaddedWidth, hPaddedHeight;
 13 | int rPaddedWidth, rPaddedHeight;
 14 | 
 15 | int minX, minY, maxX, maxY;
 16 | 
 17 | int hNoHistogramBins, rNoHistogramBins;
 18 | 
 19 | int hPaddingSizeX, hPaddingSizeY;
 20 | int hCellSizeX, hCellSizeY, hBlockSizeX, hBlockSizeY, hWindowSizeX, hWindowSizeY;
 21 | int hNoOfCellsX, hNoOfCellsY, hNoOfBlocksX, hNoOfBlocksY;
 22 | int rNoOfCellsX, rNoOfCellsY, rNoOfBlocksX, rNoOfBlocksY;
 23 | 
 24 | int hNumberOfBlockPerWindowX, hNumberOfBlockPerWindowY;
 25 | int hNumberOfWindowsX, hNumberOfWindowsY;
 26 | int rNumberOfWindowsX, rNumberOfWindowsY;
 27 | 
 28 | float4 *paddedRegisteredImage;
 29 | 
 30 | float1 *resizedPaddedImageF1;
 31 | float4 *resizedPaddedImageF4;
 32 | 
 33 | float2 *colorGradientsF2;
 34 | 
 35 | float1 *blockHistograms;
 36 | float1 *cellHistograms;
 37 | 
 38 | float1 *svmScores;
 39 | 
 40 | bool hUseGrayscale;
 41 | 
 42 | uchar1* outputTest1;
 43 | uchar4* outputTest4;
 44 | 
 45 | float* hResult;
 46 | 
 47 | float scaleRatio;
 48 | float startScale;
 49 | float endScale;
 50 | int scaleCount;
 51 | 
 52 | int avSizeX, avSizeY, marginX, marginY;
 53 | 
 54 | extern uchar4* paddedRegisteredImageU4;
 55 | 
 56 | __host__ void InitHOG(int width, int height,
 57 | 					  int _avSizeX, int _avSizeY,
 58 | 					  int _marginX, int _marginY,
 59 | 					  int cellSizeX, int cellSizeY,
 60 | 					  int blockSizeX, int blockSizeY,
 61 | 					  int windowSizeX, int windowSizeY,
 62 | 					  int noOfHistogramBins, float wtscale,
 63 | 					  float svmBias, float* svmWeights, int svmWeightsCount,
 64 | 					  bool useGrayscale)
 65 | {
 66 | 	cudaSetDevice( cutGetMaxGflopsDeviceId() );
 67 | 
 68 | 	int i;
 69 | 	int toaddxx = 0, toaddxy = 0, toaddyx = 0, toaddyy = 0;
 70 | 
 71 | 	hWidth = width; hHeight = height;
 72 | 	avSizeX = _avSizeX; avSizeY = _avSizeY; marginX = _marginX; marginY = _marginY;
 73 | 
 74 | 	if (avSizeX) { toaddxx = hWidth * marginX / avSizeX; toaddxy = hHeight * marginY / avSizeX; }
 75 | 	if (avSizeY) { toaddyx = hWidth * marginX / avSizeY; toaddyy = hHeight * marginY / avSizeY; }
 76 | 
 77 | 	hPaddingSizeX = max(toaddxx, toaddyx); hPaddingSizeY = max(toaddxy, toaddyy);
 78 | 
 79 | 	hPaddedWidth = hWidth + hPaddingSizeX*2;
 80 | 	hPaddedHeight = hHeight + hPaddingSizeY*2;
 81 | 
 82 | 	hUseGrayscale = useGrayscale;
 83 | 
 84 | 	hNoHistogramBins = noOfHistogramBins;
 85 | 	hCellSizeX = cellSizeX; hCellSizeY = cellSizeY; hBlockSizeX = blockSizeX; hBlockSizeY = blockSizeY;
 86 | 	hWindowSizeX = windowSizeX; hWindowSizeY = windowSizeY;
 87 | 
 88 | 	hNoOfCellsX = hPaddedWidth / cellSizeX;
 89 | 	hNoOfCellsY = hPaddedHeight / cellSizeY;
 90 | 
 91 | 	hNoOfBlocksX = hNoOfCellsX - blockSizeX + 1;
 92 | 	hNoOfBlocksY = hNoOfCellsY - blockSizeY + 1;
 93 | 
 94 | 	hNumberOfBlockPerWindowX = (windowSizeX - cellSizeX * blockSizeX) / cellSizeX + 1;
 95 | 	hNumberOfBlockPerWindowY = (windowSizeY - cellSizeY * blockSizeY) / cellSizeY + 1;
 96 | 
 97 | 	hNumberOfWindowsX = 0;
 98 | 	for (i=0; i<hNumberOfBlockPerWindowX; i++) hNumberOfWindowsX += (hNoOfBlocksX-i)/hNumberOfBlockPerWindowX;
 99 | 
100 | 	hNumberOfWindowsY = 0;
101 | 	for (i=0; i<hNumberOfBlockPerWindowY; i++) hNumberOfWindowsY += (hNoOfBlocksY-i)/hNumberOfBlockPerWindowY;
102 | 
103 | 	scaleRatio = 1.05f;
104 | 	startScale = 1.0f;
105 | 	endScale = min(hPaddedWidth / (float) hWindowSizeX, hPaddedHeight / (float) hWindowSizeY);
106 | 	scaleCount = (int)floor(logf(endScale/startScale)/logf(scaleRatio)) + 1;
107 | 
108 | 	cutilSafeCall(cudaMalloc((void**) &paddedRegisteredImage, sizeof(float4) * hPaddedWidth * hPaddedHeight));
109 | 
110 | 	if (useGrayscale)
111 | 		cutilSafeCall(cudaMalloc((void**) &resizedPaddedImageF1, sizeof(float1) * hPaddedWidth * hPaddedHeight));
112 | 	else
113 | 		cutilSafeCall(cudaMalloc((void**) &resizedPaddedImageF4, sizeof(float4) * hPaddedWidth * hPaddedHeight));
114 | 
115 | 	cutilSafeCall(cudaMalloc((void**) &colorGradientsF2, sizeof(float2) * hPaddedWidth * hPaddedHeight));
116 | 	cutilSafeCall(cudaMalloc((void**) &blockHistograms, sizeof(float1) * hNoOfBlocksX * hNoOfBlocksY * cellSizeX * cellSizeY * hNoHistogramBins));
117 | 	cutilSafeCall(cudaMalloc((void**) &cellHistograms, sizeof(float1) * hNoOfCellsX * hNoOfCellsY * hNoHistogramBins));
118 | 
119 | 	cutilSafeCall(cudaMalloc((void**) &svmScores, sizeof(float1) * hNumberOfWindowsX * hNumberOfWindowsY * scaleCount));
120 | 
121 | 	InitConvolution(hPaddedWidth, hPaddedHeight, useGrayscale);
122 | 	InitHistograms(cellSizeX, cellSizeY, blockSizeX, blockSizeY, noOfHistogramBins, wtscale);
123 | 	InitSVM(svmBias, svmWeights, svmWeightsCount);
124 | 	InitScale(hPaddedWidth, hPaddedHeight);
125 | 	InitPadding(hPaddedWidth, hPaddedHeight);
126 | 
127 | 	rPaddedWidth = hPaddedWidth;
128 | 	rPaddedHeight = hPaddedHeight;
129 | 
130 | 	if (useGrayscale)
131 | 		cutilSafeCall(cudaMalloc((void**) &outputTest1, sizeof(uchar1) * hPaddedWidth * hPaddedHeight));
132 | 	else
133 | 		cutilSafeCall(cudaMalloc((void**) &outputTest4, sizeof(uchar4) * hPaddedWidth * hPaddedHeight));
134 | 
135 | 	cutilSafeCall(cudaMallocHost((void**)&hResult, sizeof(float) * hNumberOfWindowsX * hNumberOfWindowsY * scaleCount));
136 | }
137 | 
// Releases everything InitHOG() allocated, shuts down the sub-modules and
// finally tears down the CUDA thread context.
__host__ void CloseHOG()
{
	cutilSafeCall(cudaFree(paddedRegisteredImage));

	// Only the buffer matching the colour mode was allocated.
	if (!hUseGrayscale)
	{
		cutilSafeCall(cudaFree(resizedPaddedImageF4));
	}
	else
	{
		cutilSafeCall(cudaFree(resizedPaddedImageF1));
	}

	cutilSafeCall(cudaFree(colorGradientsF2));
	cutilSafeCall(cudaFree(blockHistograms));
	cutilSafeCall(cudaFree(cellHistograms));

	cutilSafeCall(cudaFree(svmScores));

	// Sub-module teardown.
	CloseConvolution();
	CloseHistogram();
	CloseSVM();
	CloseScale();
	ClosePadding();

	// Same colour-mode distinction for the debug output buffer.
	if (!hUseGrayscale)
	{
		cutilSafeCall(cudaFree(outputTest4));
	}
	else
	{
		cutilSafeCall(cudaFree(outputTest1));
	}

	// hResult is pinned host memory (cudaMallocHost).
	cutilSafeCall(cudaFreeHost(hResult));

	cudaThreadExit();
}
168 | 
// Runs the detection pipeline on one image, for every scale of the pyramid.
//
// hostImage           - host image, passed to PadHostImage() as uchar4 pixels
// minx/miny/maxx/maxy - region of interest, forwarded to PadHostImage()
// minScale, maxScale  - first / last scale; a negative value selects the
//                       default (1.0 resp. the largest scale at which the
//                       window still fits in the padded image)
//
// For each scale the image is downscaled, gradients and block histograms are
// computed and normalised, and the linear SVM is evaluated into svmScores.
// The scores are fetched afterwards with EndHOGProcessing().
//
// The number of scales is limited to the capacity that InitHOG() allocated
// for svmScores / hResult (which was sized for the default pyramid) and is
// never negative. Previously a caller-supplied scale range could yield more
// scales than were allocated, or a negative count that EndHOGProcessing()
// turned into an invalid copy size.
//
// NOTE(review): a minScale below 1.0 presumably makes the "downscaled" image
// larger than the padded buffers - confirm against DownscaleImage().
__host__ void BeginHOGProcessing(unsigned char* hostImage, int minx, int miny, int maxx, int maxy, float minScale, float maxScale)
{
	int i;
	minX = minx; minY = miny; maxX = maxx; maxY = maxy;
	PadHostImage((uchar4*)hostImage, paddedRegisteredImage, minX, minY, maxX, maxY);

	rPaddedWidth = hPaddedWidth; rPaddedHeight = hPaddedHeight;

	// Largest useful scale: the window must still fit in the padded image.
	const float defaultEndScale = min(hPaddedWidth / (float) hWindowSizeX, hPaddedHeight / (float) hWindowSizeY);

	scaleRatio = 1.05f;
	startScale = (minScale < 0.0f) ? 1.0f : minScale;
	endScale = (maxScale < 0.0f) ? defaultEndScale : maxScale;

	// Capacity of svmScores / hResult: same expression InitHOG() used when
	// allocating (start scale 1.0, ratio 1.05, default end scale).
	int allocatedScaleCount = (int)floor(logf(defaultEndScale / 1.0f) / logf(1.05f)) + 1;
	if (allocatedScaleCount < 0) allocatedScaleCount = 0;

	// An empty or invalid scale range processes nothing.
	if (startScale > 0.0f && endScale >= startScale)
		scaleCount = (int)floor(logf(endScale/startScale)/logf(scaleRatio)) + 1;
	else
		scaleCount = 0;

	// Never write past the buffers allocated in InitHOG().
	if (scaleCount > allocatedScaleCount) scaleCount = allocatedScaleCount;

	float currentScale = startScale;

	ResetSVMScores(svmScores);

	for (i=0; i<scaleCount; i++)
	{
		DownscaleImage(0, scaleCount, i, currentScale, hUseGrayscale, paddedRegisteredImage, resizedPaddedImageF1, resizedPaddedImageF4);

		SetConvolutionSize(rPaddedWidth, rPaddedHeight);

		if(hUseGrayscale) ComputeColorGradients1to2(resizedPaddedImageF1, colorGradientsF2);
		else ComputeColorGradients4to2(resizedPaddedImageF4, colorGradientsF2);

		ComputeBlockHistogramsWithGauss(colorGradientsF2, blockHistograms, hNoHistogramBins,
			hCellSizeX, hCellSizeY, hBlockSizeX, hBlockSizeY, hWindowSizeX, hWindowSizeY,  rPaddedWidth, rPaddedHeight);

		NormalizeBlockHistograms(blockHistograms, hNoHistogramBins, hCellSizeX, hCellSizeY, hBlockSizeX, hBlockSizeY, rPaddedWidth, rPaddedHeight);

		// Scores of scale i are written into plane i of svmScores.
		LinearSVMEvaluation(svmScores, blockHistograms, hNoHistogramBins, hWindowSizeX, hWindowSizeY, hCellSizeX, hCellSizeY,
			hBlockSizeX, hBlockSizeY, rNoOfBlocksX, rNoOfBlocksY, i, rPaddedWidth, rPaddedHeight);

		currentScale *= scaleRatio;
	}
}
205 | 
// Waits for the device to finish, copies all SVM scores into the pinned host
// buffer hResult and returns it. The buffer holds
// scaleCount * hNumberOfWindowsX * hNumberOfWindowsY floats and remains owned
// by this module (it is released in CloseHOG()).
__host__ float* EndHOGProcessing()
{
	cudaThreadSynchronize();

	const size_t scoreBytes = sizeof(float) * scaleCount * hNumberOfWindowsX * hNumberOfWindowsY;
	cutilSafeCall(cudaMemcpy(hResult, svmScores, scoreBytes, cudaMemcpyDeviceToHost));

	return hResult;
}
212 | 
// Copies one of the intermediate images of the pipeline to host memory.
//
// hostImage - destination buffer of uchar4 pixels; a NULL pointer is ignored
// imageType - 0: resized image of the last processed scale
//             1: colour gradients (Float2ToUchar4 mode 0)
//             2: colour gradients (Float2ToUchar4 mode 1)
//             3: the whole padded image
//             4: the region of interest, written at (minX, minY) of an image
//                with a row length of hWidth pixels
//             any other value: nothing is copied
//
// Types 0-2 are converted into outputTest4 first. In grayscale mode InitHOG()
// allocates neither outputTest4 nor resizedPaddedImageF4, so these types are
// reported as unavailable instead of using unallocated device pointers.
// Unknown types used to fall through and copy whatever outputTest4 held.
__host__ void GetProcessedImage(unsigned char* hostImage, int imageType)
{
	if (hostImage == NULL) return;

	switch (imageType)
	{
	case 0:
	case 1:
	case 2:
		if (hUseGrayscale)
		{
			fprintf(stderr, "GetProcessedImage: image type %d is not available in grayscale mode\n", imageType);
			return;
		}
		if (imageType == 0)
			Float4ToUchar4(resizedPaddedImageF4, outputTest4, rPaddedWidth, rPaddedHeight);
		else
			Float2ToUchar4(colorGradientsF2, outputTest4, rPaddedWidth, rPaddedHeight, imageType - 1);
		break;
	case 3:
		cutilSafeCall(cudaMemcpy(hostImage, paddedRegisteredImageU4, sizeof(uchar4) * hPaddedWidth * hPaddedHeight, cudaMemcpyDeviceToHost));
		return;
	case 4:
		// Skip the padding on the device side, place the ROI at its offset on the host side.
		cutilSafeCall(cudaMemcpy2D(((uchar4*)hostImage) + minX + minY * hWidth, hWidth * sizeof(uchar4),
			paddedRegisteredImageU4 + hPaddingSizeX + hPaddingSizeY * hPaddedWidth, hPaddedWidth * sizeof(uchar4),
			hWidthROI * sizeof(uchar4), hHeightROI, cudaMemcpyDeviceToHost));
		return;
	default:
		return;
	}

	// Types 0-2: the converted image has the size of the current scale but is
	// written into a host image whose rows are hPaddedWidth pixels long.
	cutilSafeCall(cudaMemcpy2D(hostImage, hPaddedWidth * sizeof(uchar4), outputTest4, rPaddedWidth * sizeof(uchar4),
		rPaddedWidth * sizeof(uchar4), rPaddedHeight, cudaMemcpyDeviceToHost));
}
240 | 
// Reports the current detector parameters to the caller. Every argument is
// an output pointer that receives the value of the file-level variable of
// the matching name; all pointers must be valid.
__host__ void GetHOGParameters(float *cStartScale, float *cEndScale, float *cScaleRatio, int *cScaleCount,
							   int *cPaddingSizeX, int *cPaddingSizeY, int *cPaddedWidth, int *cPaddedHeight,
							   int *cNoOfCellsX, int *cNoOfCellsY, int *cNoOfBlocksX, int *cNoOfBlocksY,
							   int *cNumberOfWindowsX, int *cNumberOfWindowsY,
							   int *cNumberOfBlockPerWindowX, int *cNumberOfBlockPerWindowY)
{
	// Scale pyramid.
	*cStartScale = startScale;   *cEndScale = endScale;
	*cScaleRatio = scaleRatio;   *cScaleCount = scaleCount;

	// Padding and padded image size.
	*cPaddingSizeX = hPaddingSizeX;   *cPaddingSizeY = hPaddingSizeY;
	*cPaddedWidth = hPaddedWidth;     *cPaddedHeight = hPaddedHeight;

	// Cell and block counts.
	*cNoOfCellsX = hNoOfCellsX;     *cNoOfCellsY = hNoOfCellsY;
	*cNoOfBlocksX = hNoOfBlocksX;   *cNoOfBlocksY = hNoOfBlocksY;

	// Window counts.
	*cNumberOfWindowsX = hNumberOfWindowsX;
	*cNumberOfWindowsY = hNumberOfWindowsY;
	*cNumberOfBlockPerWindowX = hNumberOfBlockPerWindowX;
	*cNumberOfBlockPerWindowY = hNumberOfBlockPerWindowY;
}
264 | 
// State used by CUDAImageRescale() / DownscaleImage2():
// source image as a CUDA array, allocated and freed per CUDAImageRescale() call
cudaArray* imageArray2 = 0;
// 2D float4 texture sampled by resizeFastBicubic3
texture<float4, 2, cudaReadModeElementType> tex2;
// channel format of imageArray2, set in CUDAImageRescale()
cudaChannelFormatDesc channelDescDownscale2;
268 | 
// Writes one resized pixel per thread.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; threads
// outside width x height do nothing.
//
// outputFloat           - destination, width * height float4 values, row-major
// paddedRegisteredImage - source image; read directly only when scale == 1.0f,
//                         in which case it is indexed with the output width
// width, height         - size of the output image
// scale                 - output coordinate * scale gives the sampling
//                         position in texture tex2
__global__ void resizeFastBicubic3(float4 *outputFloat, float4* paddedRegisteredImage, int width, int height, float scale)
{
	const int col = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	const int row = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

	// Threads of the grid tail have no pixel to produce.
	if (col >= width || row >= height)
		return;

	const int outIndex = __umul24(row, width) + col;

	float4 pixel;
	if (scale != 1.0f)
		pixel = tex2D(tex2, col * scale, row * scale);
	else
		pixel = paddedRegisteredImage[col + row * width];   // no resampling needed

	outputFloat[outIndex] = pixel;
}
290 | 
// Resizes a device image by `scale` using texture tex2.
//
// scale                  - resize factor; the output size is iDivUpF(size, scale)
// paddedRegisteredImage  - source image on the device, width * height float4
// resizedPaddedImageF4   - destination on the device; width * height float4
//                          values are cleared before the kernel runs
// width, height          - source size
// rPaddedWidth/Height    - receive the size of the resized image
//
// imageArray2 and channelDescDownscale2 must have been set up by the caller.
__host__ void DownscaleImage2(float scale, float4* paddedRegisteredImage,
							  float4* resizedPaddedImageF4, int width, int height,
							  int &rPaddedWidth, int &rPaddedHeight)
{
	dim3 threadsPerBlock = dim3(THREAD_SIZE_W, THREAD_SIZE_H);

	// Size of the resized image.
	rPaddedWidth = iDivUpF(width, scale);
	rPaddedHeight = iDivUpF(height, scale);

	// Enough blocks to cover every output pixel.
	dim3 blocksPerGrid = dim3(iDivUp(rPaddedWidth, threadsPerBlock.x), iDivUp(rPaddedHeight, threadsPerBlock.y));

	// Stage the source in the CUDA array and expose it through the texture.
	cutilSafeCall(cudaMemcpyToArray(imageArray2, 0, 0, paddedRegisteredImage,
		sizeof(float4) * width * height, cudaMemcpyDeviceToDevice));
	cutilSafeCall(cudaBindTextureToArray(tex2, imageArray2, channelDescDownscale2));

	// Clear the destination, then resize into it.
	cutilSafeCall(cudaMemset(resizedPaddedImageF4, 0, width * height * sizeof(float4)));
	resizeFastBicubic3<<<blocksPerGrid, threadsPerBlock>>>(
		(float4*)resizedPaddedImageF4, (float4*)paddedRegisteredImage,
		rPaddedWidth, rPaddedHeight, scale);

	cutilSafeCall(cudaUnbindTexture(tex2));
}
312 | 
// Rescales a host image on the GPU and returns the result in a newly
// malloc()ed buffer.
//
// src             - source pixels, stored column by column
//                   (pixel (i, j) is src[j + i * height])
// width, height   - source size
// rWidth, rHeight - receive the size of the rescaled image (set to 0 when the
//                   arguments are rejected)
// scale           - resize factor, forwarded to DownscaleImage2()
//
// Returns rWidth * rHeight float3 values in the same column-by-column layout;
// the caller releases the buffer with free(). Returns NULL when src is NULL,
// a dimension is not positive, or the result buffer cannot be allocated.
//
// Changes over the previous version: every CUDA call is checked with
// cutilSafeCall like the rest of this file, the unused .w lane of the staging
// buffer is zeroed instead of uploading uninitialised pinned memory, and the
// malloc() result is checked before it is written.
//
// NOTE(review): all temporary buffers hold width * height pixels, so this
// presumably requires the rescaled image not to be larger than the source
// (scale >= 1) - confirm against iDivUpF().
__host__ float3* CUDAImageRescale(float3* src, int width, int height, int &rWidth, int &rHeight, float scale)
{
	int i, j, offsetC, offsetL;

	float4* srcH = NULL; float4* srcD = NULL;
	float4* dstD = NULL; float4* dstH = NULL;
	float3 val3; float4 val4;

	if (src == NULL || width <= 0 || height <= 0)
	{
		rWidth = 0;
		rHeight = 0;
		return NULL;
	}

	channelDescDownscale2 = cudaCreateChannelDesc<float4>();
	tex2.filterMode = cudaFilterModeLinear; tex2.normalized = false;

	cutilSafeCall(cudaMalloc((void**)&srcD, sizeof(float4) * width * height));
	cutilSafeCall(cudaMalloc((void**)&dstD, sizeof(float4) * width * height));
	cutilSafeCall(cudaMallocHost((void**)&srcH, sizeof(float4) * width * height));
	cutilSafeCall(cudaMallocHost((void**)&dstH, sizeof(float4) * width * height));
	cutilSafeCall(cudaMallocArray(&imageArray2, &channelDescDownscale2, width, height) );

	// Column-by-column float3 -> row-by-row float4 staging buffer.
	for (i=0; i<width; i++)
	{
		for (j=0; j<height; j++)
		{
			offsetC = j + i * height;
			offsetL = j * width + i;

			val3 = src[offsetC];

			srcH[offsetL].x = val3.x;
			srcH[offsetL].y = val3.y;
			srcH[offsetL].z = val3.z;
			srcH[offsetL].w = 0.0f;   // unused lane, keep it defined
		}
	}
	cutilSafeCall(cudaMemcpy(srcD, srcH, sizeof(float4) * width * height, cudaMemcpyHostToDevice));

	DownscaleImage2(scale, srcD, dstD, width, height, rWidth, rHeight);

	cutilSafeCall(cudaMemcpy(dstH, dstD, sizeof(float4) * rWidth * rHeight, cudaMemcpyDeviceToHost));

	// Row-by-row float4 -> column-by-column float3 result.
	float3* dst = (float3*) malloc (rWidth * rHeight * sizeof(float3));
	if (dst != NULL)
	{
		for (i=0; i<rWidth; i++)
		{
			for (j=0; j<rHeight; j++)
			{
				offsetC = j + i * rHeight;
				offsetL = j * rWidth + i;

				val4 = dstH[offsetL];

				dst[offsetC].x = val4.x;
				dst[offsetC].y = val4.y;
				dst[offsetC].z = val4.z;
			}
		}
	}

	// Temporary resources are released on both the success and the failure path.
	cutilSafeCall(cudaFreeArray(imageArray2));
	cutilSafeCall(cudaFree(srcD));
	cutilSafeCall(cudaFree(dstD));
	cutilSafeCall(cudaFreeHost(srcH));
	cutilSafeCall(cudaFreeHost(dstH));

	return dst;
}
374 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGEngineDevice.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CUDA_HOG__
 2 | #define __CUDA_HOG__
 3 | 
 4 | #include <stdlib.h>
 5 | #include <stdio.h>
 6 | #include <math.h>
 7 | 
 8 | #ifdef _WIN32
 9 | #  define WINDOWS_LEAN_AND_MEAN
10 | #  include <windows.h>
11 | #endif
12 | 
13 | #include <cuda_gl_interop.h>
14 | #include <cuda.h>
15 | 
16 | #include "HOGDefines.h"
17 | 
18 | extern "C" __host__ void InitHOG(int width, int height,
19 | 								 int avSizeX, int avSizeY,
20 | 								 int marginX, int marginY,
21 | 								 int cellSizeX, int cellSizeY,
22 | 								 int blockSizeX, int blockSizeY,
23 | 								 int windowSizeX, int windowSizeY,
24 | 								 int noOfHistogramBins, float wtscale,
25 | 								 float svmBias, float* svmWeights, int svmWeightsCount,
26 | 								 bool useGrayscale);
27 | 
28 | extern "C" __host__ void CloseHOG();
29 | 
30 | extern "C" __host__ void BeginHOGProcessing(unsigned char* hostImage, int minx, int miny, int maxx, int maxy, float minScale, float maxScale);
31 | extern "C" __host__ float* EndHOGProcessing();
32 | 
33 | extern "C"  __host__ void GetHOGParameters(float *cStartScale, float *cEndScale, float *cScaleRatio, int *cScaleCount,
34 | 										   int *cPaddingSizeX, int *cPaddingSizeY, int *cPaddedWidth, int *cPaddedHeight,
35 | 										   int *cNoOfCellsX, int *cNoOfCellsY, int *cNoOfBlocksX, int *cNoOfBlocksY,
36 | 										   int *cNumberOfWindowsX, int *cNumberOfWindowsY,
37 | 										   int *cNumberOfBlockPerWindowX, int *cNumberOfBlockPerWindowY);
38 | 
39 | extern "C" __host__ void GetProcessedImage(unsigned char* hostImage, int imageType);
40 | 
41 | extern "C" __host__ float3* CUDAImageRescale(float3* src, int width, int height, int &rWidth, int &rHeight, float scale);
42 | 
43 | __host__ void InitCUDAHOG(int cellSizeX, int cellSizeY,
44 | 						  int blockSizeX, int blockSizeY,
45 | 						  int windowSizeX, int windowSizeY,
46 | 						  int noOfHistogramBins, float wtscale,
47 | 						  float svmBias, float* svmWeights, int svmWeightsCount,
48 | 						  bool useGrayscale);
49 | __host__ void CloseCUDAHOG();
50 | 
51 | #endif
52 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGEngineDevice.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z18resizeFastBicubic3P6float4S0_iif


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGHistogram.cu:
--------------------------------------------------------------------------------
  1 | #include "HOGHistogram.h"
  2 | #include "HOGUtils.h"
  3 | #include "cutil.h"
  4 | 
  5 | __device__ __constant__ float cenBound[3], halfBin[3], bandWidth[3], oneHalf = 0.5f;
  6 | __device__ __constant__ int tvbin[3];
  7 | 
  8 | texture<float, 1, cudaReadModeElementType> texGauss;
  9 | cudaArray* gaussArray;
 10 | cudaChannelFormatDesc channelDescGauss;
 11 | 
 12 | extern __shared__ float allShared[];
 13 | 
 14 | extern int rNoHistogramBins, rNoOfCellsX, rNoOfCellsY, rNoOfBlocksX, rNoOfBlocksY, rNumberOfWindowsX, rNumberOfWindowsY;
 15 | 
 16 | // wt scale == scale for weighting function span
 17 | __host__ void InitHistograms(int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY, int noHistogramBins, float wtscale)
 18 | {
 19 | 	int i, j;
 20 | 
 21 | 	float var2x = cellSizeX * blockSizeX / (2 * wtscale);
 22 | 	float var2y = cellSizeY * blockSizeY / (2 * wtscale);
 23 | 	var2x *= var2x * 2; var2y *= var2y * 2;
 24 | 
 25 | 	float centerX = cellSizeX * blockSizeX / 2.0f;
 26 | 	float centerY = cellSizeY * blockSizeY / 2.0f;
 27 | 
 28 | 	float* weights = (float*)malloc(cellSizeX * blockSizeX * cellSizeY * blockSizeY * sizeof(float));
 29 | 
 30 | 	for (i=0; i<cellSizeX * blockSizeX; i++)
 31 | 	{
 32 | 		for (j=0; j<cellSizeY * blockSizeY; j++)
 33 | 		{
 34 | 			float tx = i - centerX;
 35 | 			float ty = j - centerY;
 36 | 
 37 | 			tx *= tx / var2x;
 38 | 			ty *= ty / var2y;
 39 | 
 40 | 			weights[i + j * cellSizeX * blockSizeX] = exp(-(tx + ty));
 41 | 		}
 42 | 	}
 43 | 
 44 | 	channelDescGauss = cudaCreateChannelDesc<float>();
 45 | 
 46 | 	cutilSafeCall(cudaMallocArray(&gaussArray, &channelDescGauss, cellSizeX * blockSizeX * cellSizeY * blockSizeY, 1) );
 47 | 	cutilSafeCall(cudaMemcpyToArray(gaussArray, 0, 0, weights, sizeof(float) * cellSizeX * blockSizeX * cellSizeY * blockSizeY, cudaMemcpyHostToDevice));
 48 | 
 49 | 	int h_tvbin[3];
 50 | 	float h_cenBound[3], h_halfBin[3], h_bandWidth[3];
 51 | 	h_cenBound[0] = cellSizeX * blockSizeX / 2.0f;
 52 | 	h_cenBound[1] = cellSizeY * blockSizeY / 2.0f;
 53 | 	h_cenBound[2] = 180 / 2.0f; //TODO -> can be 360
 54 | 
 55 | 	h_halfBin[0] = blockSizeX / 2.0f;
 56 | 	h_halfBin[1] = blockSizeY / 2.0f;
 57 | 	h_halfBin[2] = noHistogramBins / 2.0f;
 58 | 
 59 | 	h_bandWidth[0] = (float) cellSizeX; h_bandWidth[0] = 1.0f / h_bandWidth[0];
 60 | 	h_bandWidth[1] = (float) cellSizeY; h_bandWidth[1] = 1.0f / h_bandWidth[1];
 61 | 	h_bandWidth[2] = 180.0f / (float) noHistogramBins; h_bandWidth[2] = 1.0f / h_bandWidth[2]; //TODO -> can be 360
 62 | 
 63 | 	h_tvbin[0] = blockSizeX; h_tvbin[1] = blockSizeY; h_tvbin[2] = noHistogramBins;
 64 | 
 65 | 	cutilSafeCall(cudaMemcpyToSymbol(cenBound, h_cenBound, 3 * sizeof(float), 0, cudaMemcpyHostToDevice));
 66 | 	cutilSafeCall(cudaMemcpyToSymbol(halfBin, h_halfBin, 3 * sizeof(float), 0, cudaMemcpyHostToDevice));
 67 | 	cutilSafeCall(cudaMemcpyToSymbol(bandWidth, h_bandWidth, 3 * sizeof(float), 0, cudaMemcpyHostToDevice));
 68 | 	cutilSafeCall(cudaMemcpyToSymbol(tvbin, h_tvbin, 3 * sizeof(int), 0, cudaMemcpyHostToDevice));
 69 | }
 70 | 
 71 | __host__ void CloseHistogram()
 72 | {
 73 | }
 74 | 
 75 | __global__ void computeBlockHistogramsWithGauss(float2* inputImage, float1* blockHistograms, int noHistogramBins,
 76 | 												int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY,
 77 | 												int leftoverX, int leftoverY, int width, int height)
 78 | {
 79 | 	int i;
 80 | 	float2 localValue;
 81 | 	float* shLocalHistograms = (float*)allShared;
 82 | 
 83 | 	int cellIdx = threadIdx.y;
 84 | 	int cellIdy = threadIdx.z;
 85 | 	int columnId = threadIdx.x;
 86 | 
 87 | 	int smemReadPos = __mul24(cellIdx, noHistogramBins) + __mul24(cellIdy, blockSizeX) * noHistogramBins;
 88 | 	int gmemWritePos = __mul24(threadIdx.y, noHistogramBins) + __mul24(threadIdx.z, gridDim.x) * __mul24(blockDim.y, noHistogramBins) +
 89 | 		__mul24(blockIdx.x, noHistogramBins) * blockDim.y + __mul24(blockIdx.y, gridDim.x) * __mul24(blockDim.y, noHistogramBins) * blockDim.z;
 90 | 
 91 | 	int gmemReadStride = width;
 92 | 
 93 | 	int gmemReadPos = leftoverX + __mul24(leftoverY, gmemReadStride) +
 94 | 		(__mul24(blockIdx.x, cellSizeX) + __mul24(blockIdx.y, cellSizeY) * gmemReadStride)
 95 | 		+ (columnId + __mul24(cellIdx, cellSizeX) + __mul24(cellIdy, cellSizeY) * gmemReadStride);
 96 | 
 97 | 	int histogramSize = __mul24(noHistogramBins, blockSizeX) * blockSizeY;
 98 | 	int smemLocalHistogramPos = (columnId + __mul24(cellIdx, cellSizeX)) * histogramSize + __mul24(cellIdy, histogramSize) * __mul24(blockSizeX, cellSizeX);
 99 | 
100 | 	int cmemReadPos = columnId + __mul24(cellIdx, cellSizeX) + __mul24(cellIdy, cellSizeY) * __mul24(cellSizeX, blockSizeX);
101 | 
102 | 	float atx, aty;
103 | 	float pIx, pIy, pIz;
104 | 
105 | 	int fIx, fIy, fIz;
106 | 	int cIx, cIy, cIz;
107 | 	float dx, dy, dz;
108 | 	float cx, cy, cz;
109 | 
110 | 	bool lowervalidx, lowervalidy;
111 | 	bool uppervalidx, uppervalidy;
112 | 	bool canWrite;
113 | 
114 | 	int offset;
115 | 
116 | 	for (i=0; i<histogramSize; i++) shLocalHistograms[smemLocalHistogramPos + i] = 0;
117 | 
118 | #ifdef UNROLL_LOOPS
119 | 	int halfSizeYm1 = cellSizeY / 2 - 1;
120 | #endif
121 | 
122 | 	//if (blockIdx.x == 5 && blockIdx.y == 4)
123 | 	//{
124 | 	//	int asasa;
125 | 	//	asasa = 0;
126 | 	//	asasa++;
127 | 	//}
128 | 
129 | 	for (i=0; i<cellSizeY; i++)
130 | 	{
131 | 		localValue = inputImage[gmemReadPos + i * gmemReadStride];
132 | 		localValue.x *= tex1D(texGauss, cmemReadPos + i * cellSizeX * blockSizeX);
133 | 
134 | 		atx = cellIdx * cellSizeX + columnId + 0.5;
135 | 		aty = cellIdy * cellSizeY + i + 0.5;
136 | 
137 | 		pIx = halfBin[0] - oneHalf + (atx - cenBound[0]) * bandWidth[0];
138 | 		pIy = halfBin[1] - oneHalf + (aty - cenBound[1]) * bandWidth[1];
139 | 		pIz = halfBin[2] - oneHalf + (localValue.y - cenBound[2]) * bandWidth[2];
140 | 
141 | 		fIx = floorf(pIx); fIy = floorf(pIy); fIz = floorf(pIz);
142 | 		cIx = fIx + 1; cIy = fIy + 1; cIz = fIz + 1; //eq ceilf(pI.)
143 | 
144 | 		dx = pIx - fIx; dy = pIy - fIy; dz = pIz - fIz;
145 | 		cx = 1 - dx; cy = 1 - dy; cz = 1 - dz;
146 | 
147 | 		cIz %= tvbin[2];
148 | 		fIz %= tvbin[2];
149 | 		if (fIz < 0) fIz += tvbin[2];
150 | 		if (cIz < 0) cIz += tvbin[2];
151 | 
152 | #ifdef UNROLL_LOOPS
153 | 		if ((i & halfSizeYm1) == 0)
154 | #endif
155 | 		{
156 | 			uppervalidx = !(cIx >= tvbin[0] - oneHalf || cIx < -oneHalf);
157 | 			uppervalidy = !(cIy >= tvbin[1] - oneHalf || cIy < -oneHalf);
158 | 			lowervalidx = !(fIx < -oneHalf || fIx >= tvbin[0] - oneHalf);
159 | 			lowervalidy = !(fIy < -oneHalf || fIy >= tvbin[1] - oneHalf);
160 | 		}
161 | 
162 | 		canWrite = (lowervalidx) && (lowervalidy);
163 | 		if (canWrite)
164 | 		{
165 | 			offset = smemLocalHistogramPos + (fIx + fIy * blockSizeY) * noHistogramBins;
166 | 			shLocalHistograms[offset + fIz] += localValue.x * cx * cy * cz;
167 | 			shLocalHistograms[offset + cIz] += localValue.x * cx * cy * dz;
168 | 		}
169 | 
170 | 		canWrite = (lowervalidx) && (uppervalidy);
171 | 		if (canWrite)
172 | 		{
173 | 			offset = smemLocalHistogramPos + (fIx + cIy * blockSizeY) * noHistogramBins;
174 | 			shLocalHistograms[offset + fIz] += localValue.x * cx * dy * cz;
175 | 			shLocalHistograms[offset + cIz] += localValue.x * cx * dy * dz;
176 | 		}
177 | 
178 | 		canWrite = (uppervalidx) && (lowervalidy);
179 | 		if (canWrite)
180 | 		{
181 | 			offset = smemLocalHistogramPos + (cIx + fIy * blockSizeY) * noHistogramBins;
182 | 			shLocalHistograms[offset + fIz] += localValue.x * dx * cy * cz;
183 | 			shLocalHistograms[offset + cIz] += localValue.x * dx * cy * dz;
184 | 		}
185 | 
186 | 		canWrite = (uppervalidx) && (uppervalidy);
187 | 		if (canWrite)
188 | 		{
189 | 			offset = smemLocalHistogramPos + (cIx + cIy * blockSizeY) * noHistogramBins;
190 | 			shLocalHistograms[offset + fIz] += localValue.x * dx * dy * cz;
191 | 			shLocalHistograms[offset + cIz] += localValue.x * dx * dy * dz;
192 | 		}
193 | 	}
194 | 
195 | 	__syncthreads();
196 | 
197 | 	//TODO -> aligned block size * cell size
198 | 	int smemTargetHistogramPos;
199 | 	for(unsigned int s = blockSizeY >> 1; s>0; s>>=1)
200 | 	{
201 | 		if (cellIdy < s && (cellIdy + s) < blockSizeY)
202 | 		{
203 | 			smemTargetHistogramPos = (columnId + __mul24(cellIdx, cellSizeX)) * histogramSize + __mul24((cellIdy + s), histogramSize) * __mul24(blockSizeX, cellSizeX);
204 | 
205 | #ifdef UNROLL_LOOPS
206 | 			shLocalHistograms[smemLocalHistogramPos + 0] += shLocalHistograms[smemTargetHistogramPos + 0];
207 | 			shLocalHistograms[smemLocalHistogramPos + 1] += shLocalHistograms[smemTargetHistogramPos + 1];
208 | 			shLocalHistograms[smemLocalHistogramPos + 2] += shLocalHistograms[smemTargetHistogramPos + 2];
209 | 			shLocalHistograms[smemLocalHistogramPos + 3] += shLocalHistograms[smemTargetHistogramPos + 3];
210 | 			shLocalHistograms[smemLocalHistogramPos + 4] += shLocalHistograms[smemTargetHistogramPos + 4];
211 | 			shLocalHistograms[smemLocalHistogramPos + 5] += shLocalHistograms[smemTargetHistogramPos + 5];
212 | 			shLocalHistograms[smemLocalHistogramPos + 6] += shLocalHistograms[smemTargetHistogramPos + 6];
213 | 			shLocalHistograms[smemLocalHistogramPos + 7] += shLocalHistograms[smemTargetHistogramPos + 7];
214 | 			shLocalHistograms[smemLocalHistogramPos + 8] += shLocalHistograms[smemTargetHistogramPos + 8];
215 | 			shLocalHistograms[smemLocalHistogramPos + 9] += shLocalHistograms[smemTargetHistogramPos + 9];
216 | 			shLocalHistograms[smemLocalHistogramPos + 10] += shLocalHistograms[smemTargetHistogramPos + 10];
217 | 			shLocalHistograms[smemLocalHistogramPos + 11] += shLocalHistograms[smemTargetHistogramPos + 11];
218 | 			shLocalHistograms[smemLocalHistogramPos + 12] += shLocalHistograms[smemTargetHistogramPos + 12];
219 | 			shLocalHistograms[smemLocalHistogramPos + 13] += shLocalHistograms[smemTargetHistogramPos + 13];
220 | 			shLocalHistograms[smemLocalHistogramPos + 14] += shLocalHistograms[smemTargetHistogramPos + 14];
221 | 			shLocalHistograms[smemLocalHistogramPos + 15] += shLocalHistograms[smemTargetHistogramPos + 15];
222 | 			shLocalHistograms[smemLocalHistogramPos + 16] += shLocalHistograms[smemTargetHistogramPos + 16];
223 | 			shLocalHistograms[smemLocalHistogramPos + 17] += shLocalHistograms[smemTargetHistogramPos + 17];
224 | 			shLocalHistograms[smemLocalHistogramPos + 18] += shLocalHistograms[smemTargetHistogramPos + 18];
225 | 			shLocalHistograms[smemLocalHistogramPos + 19] += shLocalHistograms[smemTargetHistogramPos + 19];
226 | 			shLocalHistograms[smemLocalHistogramPos + 20] += shLocalHistograms[smemTargetHistogramPos + 20];
227 | 			shLocalHistograms[smemLocalHistogramPos + 21] += shLocalHistograms[smemTargetHistogramPos + 21];
228 | 			shLocalHistograms[smemLocalHistogramPos + 22] += shLocalHistograms[smemTargetHistogramPos + 22];
229 | 			shLocalHistograms[smemLocalHistogramPos + 23] += shLocalHistograms[smemTargetHistogramPos + 23];
230 | 			shLocalHistograms[smemLocalHistogramPos + 24] += shLocalHistograms[smemTargetHistogramPos + 24];
231 | 			shLocalHistograms[smemLocalHistogramPos + 25] += shLocalHistograms[smemTargetHistogramPos + 25];
232 | 			shLocalHistograms[smemLocalHistogramPos + 26] += shLocalHistograms[smemTargetHistogramPos + 26];
233 | 			shLocalHistograms[smemLocalHistogramPos + 27] += shLocalHistograms[smemTargetHistogramPos + 27];
234 | 			shLocalHistograms[smemLocalHistogramPos + 28] += shLocalHistograms[smemTargetHistogramPos + 28];
235 | 			shLocalHistograms[smemLocalHistogramPos + 29] += shLocalHistograms[smemTargetHistogramPos + 29];
236 | 			shLocalHistograms[smemLocalHistogramPos + 30] += shLocalHistograms[smemTargetHistogramPos + 30];
237 | 			shLocalHistograms[smemLocalHistogramPos + 31] += shLocalHistograms[smemTargetHistogramPos + 31];
238 | 			shLocalHistograms[smemLocalHistogramPos + 32] += shLocalHistograms[smemTargetHistogramPos + 32];
239 | 			shLocalHistograms[smemLocalHistogramPos + 33] += shLocalHistograms[smemTargetHistogramPos + 33];
240 | 			shLocalHistograms[smemLocalHistogramPos + 34] += shLocalHistograms[smemTargetHistogramPos + 34];
241 | 			shLocalHistograms[smemLocalHistogramPos + 35] += shLocalHistograms[smemTargetHistogramPos + 35];
242 | #else
243 | 			for (i=0; i<histogramSize; i++)
244 | 				shLocalHistograms[smemLocalHistogramPos + i] += shLocalHistograms[smemTargetHistogramPos + i];
245 | #endif
246 | 		}
247 | 
248 | 		__syncthreads();
249 | 	}
250 | 
251 | 	for(unsigned int s = blockSizeX >> 1; s>0; s>>=1)
252 | 	{
253 | 		if (cellIdx < s && (cellIdx + s) < blockSizeX)
254 | 		{
255 | 			smemTargetHistogramPos = (columnId + __mul24((cellIdx + s), cellSizeX)) * histogramSize + __mul24(cellIdy, histogramSize) * __mul24(blockSizeX, cellSizeX);
256 | 
257 | #ifdef UNROLL_LOOPS
258 | 			shLocalHistograms[smemLocalHistogramPos + 0] += shLocalHistograms[smemTargetHistogramPos + 0];
259 | 			shLocalHistograms[smemLocalHistogramPos + 1] += shLocalHistograms[smemTargetHistogramPos + 1];
260 | 			shLocalHistograms[smemLocalHistogramPos + 2] += shLocalHistograms[smemTargetHistogramPos + 2];
261 | 			shLocalHistograms[smemLocalHistogramPos + 3] += shLocalHistograms[smemTargetHistogramPos + 3];
262 | 			shLocalHistograms[smemLocalHistogramPos + 4] += shLocalHistograms[smemTargetHistogramPos + 4];
263 | 			shLocalHistograms[smemLocalHistogramPos + 5] += shLocalHistograms[smemTargetHistogramPos + 5];
264 | 			shLocalHistograms[smemLocalHistogramPos + 6] += shLocalHistograms[smemTargetHistogramPos + 6];
265 | 			shLocalHistograms[smemLocalHistogramPos + 7] += shLocalHistograms[smemTargetHistogramPos + 7];
266 | 			shLocalHistograms[smemLocalHistogramPos + 8] += shLocalHistograms[smemTargetHistogramPos + 8];
267 | 			shLocalHistograms[smemLocalHistogramPos + 9] += shLocalHistograms[smemTargetHistogramPos + 9];
268 | 			shLocalHistograms[smemLocalHistogramPos + 10] += shLocalHistograms[smemTargetHistogramPos + 10];
269 | 			shLocalHistograms[smemLocalHistogramPos + 11] += shLocalHistograms[smemTargetHistogramPos + 11];
270 | 			shLocalHistograms[smemLocalHistogramPos + 12] += shLocalHistograms[smemTargetHistogramPos + 12];
271 | 			shLocalHistograms[smemLocalHistogramPos + 13] += shLocalHistograms[smemTargetHistogramPos + 13];
272 | 			shLocalHistograms[smemLocalHistogramPos + 14] += shLocalHistograms[smemTargetHistogramPos + 14];
273 | 			shLocalHistograms[smemLocalHistogramPos + 15] += shLocalHistograms[smemTargetHistogramPos + 15];
274 | 			shLocalHistograms[smemLocalHistogramPos + 16] += shLocalHistograms[smemTargetHistogramPos + 16];
275 | 			shLocalHistograms[smemLocalHistogramPos + 17] += shLocalHistograms[smemTargetHistogramPos + 17];
276 | 			shLocalHistograms[smemLocalHistogramPos + 18] += shLocalHistograms[smemTargetHistogramPos + 18];
277 | 			shLocalHistograms[smemLocalHistogramPos + 19] += shLocalHistograms[smemTargetHistogramPos + 19];
278 | 			shLocalHistograms[smemLocalHistogramPos + 20] += shLocalHistograms[smemTargetHistogramPos + 20];
279 | 			shLocalHistograms[smemLocalHistogramPos + 21] += shLocalHistograms[smemTargetHistogramPos + 21];
280 | 			shLocalHistograms[smemLocalHistogramPos + 22] += shLocalHistograms[smemTargetHistogramPos + 22];
281 | 			shLocalHistograms[smemLocalHistogramPos + 23] += shLocalHistograms[smemTargetHistogramPos + 23];
282 | 			shLocalHistograms[smemLocalHistogramPos + 24] += shLocalHistograms[smemTargetHistogramPos + 24];
283 | 			shLocalHistograms[smemLocalHistogramPos + 25] += shLocalHistograms[smemTargetHistogramPos + 25];
284 | 			shLocalHistograms[smemLocalHistogramPos + 26] += shLocalHistograms[smemTargetHistogramPos + 26];
285 | 			shLocalHistograms[smemLocalHistogramPos + 27] += shLocalHistograms[smemTargetHistogramPos + 27];
286 | 			shLocalHistograms[smemLocalHistogramPos + 28] += shLocalHistograms[smemTargetHistogramPos + 28];
287 | 			shLocalHistograms[smemLocalHistogramPos + 29] += shLocalHistograms[smemTargetHistogramPos + 29];
288 | 			shLocalHistograms[smemLocalHistogramPos + 30] += shLocalHistograms[smemTargetHistogramPos + 30];
289 | 			shLocalHistograms[smemLocalHistogramPos + 31] += shLocalHistograms[smemTargetHistogramPos + 31];
290 | 			shLocalHistograms[smemLocalHistogramPos + 32] += shLocalHistograms[smemTargetHistogramPos + 32];
291 | 			shLocalHistograms[smemLocalHistogramPos + 33] += shLocalHistograms[smemTargetHistogramPos + 33];
292 | 			shLocalHistograms[smemLocalHistogramPos + 34] += shLocalHistograms[smemTargetHistogramPos + 34];
293 | 			shLocalHistograms[smemLocalHistogramPos + 35] += shLocalHistograms[smemTargetHistogramPos + 35];
294 | #else
295 | 			for (i=0; i<histogramSize; i++)
296 | 				shLocalHistograms[smemLocalHistogramPos + i] += shLocalHistograms[smemTargetHistogramPos + i];
297 | #endif
298 | 		}
299 | 
300 | 		__syncthreads();
301 | 	}
302 | 
303 | 	for(unsigned int s = cellSizeX >> 1; s>0; s>>=1)
304 | 	{
305 | 		if (columnId < s && (columnId + s) < cellSizeX)
306 | 		{
307 | 			smemTargetHistogramPos = (columnId + s + __mul24(cellIdx, cellSizeX)) * histogramSize + __mul24(cellIdy, histogramSize) * __mul24(blockSizeX, cellSizeX);
308 | 
309 | #ifdef UNROLL_LOOPS
310 | 			shLocalHistograms[smemLocalHistogramPos + 0] += shLocalHistograms[smemTargetHistogramPos + 0];
311 | 			shLocalHistograms[smemLocalHistogramPos + 1] += shLocalHistograms[smemTargetHistogramPos + 1];
312 | 			shLocalHistograms[smemLocalHistogramPos + 2] += shLocalHistograms[smemTargetHistogramPos + 2];
313 | 			shLocalHistograms[smemLocalHistogramPos + 3] += shLocalHistograms[smemTargetHistogramPos + 3];
314 | 			shLocalHistograms[smemLocalHistogramPos + 4] += shLocalHistograms[smemTargetHistogramPos + 4];
315 | 			shLocalHistograms[smemLocalHistogramPos + 5] += shLocalHistograms[smemTargetHistogramPos + 5];
316 | 			shLocalHistograms[smemLocalHistogramPos + 6] += shLocalHistograms[smemTargetHistogramPos + 6];
317 | 			shLocalHistograms[smemLocalHistogramPos + 7] += shLocalHistograms[smemTargetHistogramPos + 7];
318 | 			shLocalHistograms[smemLocalHistogramPos + 8] += shLocalHistograms[smemTargetHistogramPos + 8];
319 | 			shLocalHistograms[smemLocalHistogramPos + 9] += shLocalHistograms[smemTargetHistogramPos + 9];
320 | 			shLocalHistograms[smemLocalHistogramPos + 10] += shLocalHistograms[smemTargetHistogramPos + 10];
321 | 			shLocalHistograms[smemLocalHistogramPos + 11] += shLocalHistograms[smemTargetHistogramPos + 11];
322 | 			shLocalHistograms[smemLocalHistogramPos + 12] += shLocalHistograms[smemTargetHistogramPos + 12];
323 | 			shLocalHistograms[smemLocalHistogramPos + 13] += shLocalHistograms[smemTargetHistogramPos + 13];
324 | 			shLocalHistograms[smemLocalHistogramPos + 14] += shLocalHistograms[smemTargetHistogramPos + 14];
325 | 			shLocalHistograms[smemLocalHistogramPos + 15] += shLocalHistograms[smemTargetHistogramPos + 15];
326 | 			shLocalHistograms[smemLocalHistogramPos + 16] += shLocalHistograms[smemTargetHistogramPos + 16];
327 | 			shLocalHistograms[smemLocalHistogramPos + 17] += shLocalHistograms[smemTargetHistogramPos + 17];
328 | 			shLocalHistograms[smemLocalHistogramPos + 18] += shLocalHistograms[smemTargetHistogramPos + 18];
329 | 			shLocalHistograms[smemLocalHistogramPos + 19] += shLocalHistograms[smemTargetHistogramPos + 19];
330 | 			shLocalHistograms[smemLocalHistogramPos + 20] += shLocalHistograms[smemTargetHistogramPos + 20];
331 | 			shLocalHistograms[smemLocalHistogramPos + 21] += shLocalHistograms[smemTargetHistogramPos + 21];
332 | 			shLocalHistograms[smemLocalHistogramPos + 22] += shLocalHistograms[smemTargetHistogramPos + 22];
333 | 			shLocalHistograms[smemLocalHistogramPos + 23] += shLocalHistograms[smemTargetHistogramPos + 23];
334 | 			shLocalHistograms[smemLocalHistogramPos + 24] += shLocalHistograms[smemTargetHistogramPos + 24];
335 | 			shLocalHistograms[smemLocalHistogramPos + 25] += shLocalHistograms[smemTargetHistogramPos + 25];
336 | 			shLocalHistograms[smemLocalHistogramPos + 26] += shLocalHistograms[smemTargetHistogramPos + 26];
337 | 			shLocalHistograms[smemLocalHistogramPos + 27] += shLocalHistograms[smemTargetHistogramPos + 27];
338 | 			shLocalHistograms[smemLocalHistogramPos + 28] += shLocalHistograms[smemTargetHistogramPos + 28];
339 | 			shLocalHistograms[smemLocalHistogramPos + 29] += shLocalHistograms[smemTargetHistogramPos + 29];
340 | 			shLocalHistograms[smemLocalHistogramPos + 30] += shLocalHistograms[smemTargetHistogramPos + 30];
341 | 			shLocalHistograms[smemLocalHistogramPos + 31] += shLocalHistograms[smemTargetHistogramPos + 31];
342 | 			shLocalHistograms[smemLocalHistogramPos + 32] += shLocalHistograms[smemTargetHistogramPos + 32];
343 | 			shLocalHistograms[smemLocalHistogramPos + 33] += shLocalHistograms[smemTargetHistogramPos + 33];
344 | 			shLocalHistograms[smemLocalHistogramPos + 34] += shLocalHistograms[smemTargetHistogramPos + 34];
345 | 			shLocalHistograms[smemLocalHistogramPos + 35] += shLocalHistograms[smemTargetHistogramPos + 35];
346 | #else
347 | 			for (i=0; i<histogramSize; i++)
348 | 				shLocalHistograms[smemLocalHistogramPos + i] += shLocalHistograms[smemTargetHistogramPos + i];
349 | #endif
350 | 		}
351 | 
352 | 		__syncthreads();
353 | 	}
354 | 
355 | 	if (columnId == 0)
356 | 	{
357 | 		//write result to gmem
358 | #ifdef UNROLL_LOOPS
359 | 		blockHistograms[gmemWritePos + 0].x = shLocalHistograms[smemReadPos + 0];
360 | 		blockHistograms[gmemWritePos + 1].x = shLocalHistograms[smemReadPos + 1];
361 | 		blockHistograms[gmemWritePos + 2].x = shLocalHistograms[smemReadPos + 2];
362 | 		blockHistograms[gmemWritePos + 3].x = shLocalHistograms[smemReadPos + 3];
363 | 		blockHistograms[gmemWritePos + 4].x = shLocalHistograms[smemReadPos + 4];
364 | 		blockHistograms[gmemWritePos + 5].x = shLocalHistograms[smemReadPos + 5];
365 | 		blockHistograms[gmemWritePos + 6].x = shLocalHistograms[smemReadPos + 6];
366 | 		blockHistograms[gmemWritePos + 7].x = shLocalHistograms[smemReadPos + 7];
367 | 		blockHistograms[gmemWritePos + 8].x = shLocalHistograms[smemReadPos + 8];
368 | #else
369 | 		for (i=0; i<noHistogramBins; i++)
370 | 			blockHistograms[gmemWritePos + i].x = shLocalHistograms[smemReadPos + i];
371 | #endif
372 | 	}
373 | 
374 | 	if (blockIdx.x == 10 && blockIdx.y == 8)
375 | 	{
376 | 		int asasa;
377 | 		asasa = 0;
378 | 		asasa++;
379 | 	}
380 | }
381 | 
/**
 * Host-side launcher for the computeBlockHistogramsWithGauss kernel.
 *
 * Side effects: refreshes the file-level counters rNoOfCellsX/Y, rNoOfBlocksX/Y and
 * rNumberOfWindowsX/Y, and keeps gaussArray bound to the texGauss texture reference
 * while the kernel is launched.
 *
 * Launch layout: grid (rNoOfBlocksX, rNoOfBlocksY), threads (cellSizeX, blockSizeX, blockSizeY).
 * Dynamic shared memory: noHistogramBins * blockSizeX * blockSizeY floats for each of the
 * cellSizeX * blockSizeX * blockSizeY threads of a CUDA block.
 *
 * When the image cannot hold a single HOG block the counters are still updated but
 * nothing is launched (the grid would have a non-positive dimension).
 *
 * @param inputImage      device buffer forwarded unchanged to the kernel
 * @param blockHistograms device buffer forwarded to the kernel, which writes the .x fields
 * @param noHistogramBins number of orientation bins per cell histogram
 * @param cellSizeX       cell width in pixels (also the x dimension of the thread block)
 * @param cellSizeY       cell height in pixels
 * @param blockSizeX      HOG block width in cells
 * @param blockSizeY      HOG block height in cells
 * @param windowSizeX     detection window width in pixels
 * @param windowSizeY     detection window height in pixels
 * @param width           image width in pixels
 * @param height          image height in pixels
 */
__host__ void ComputeBlockHistogramsWithGauss(float2* inputImage, float1* blockHistograms, int noHistogramBins,
											  int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY,
											  int windowSizeX, int windowSizeY,
											  int width, int height)
{
	rNoOfCellsX = width / cellSizeX;
	rNoOfCellsY = height / cellSizeY;

	rNoOfBlocksX = rNoOfCellsX - blockSizeX + 1;
	rNoOfBlocksY = rNoOfCellsY - blockSizeY + 1;

	rNumberOfWindowsX = (width - windowSizeX) / cellSizeX + 1;
	rNumberOfWindowsY = (height - windowSizeY) / cellSizeY + 1;

	// Half of the pixels left over once the windows (stepped by one cell) are laid out.
	const int leftoverX = (width - windowSizeX - cellSizeX * (rNumberOfWindowsX - 1)) / 2;
	const int leftoverY = (height - windowSizeY - cellSizeY * (rNumberOfWindowsY - 1)) / 2;

	// A non-positive grid dimension is an invalid launch configuration; skip the launch
	// instead of letting it fail silently.
	if (rNoOfBlocksX < 1 || rNoOfBlocksY < 1)
		return;

	const dim3 hThreadSize(cellSizeX, blockSizeX, blockSizeY);
	const dim3 hBlockSize(rNoOfBlocksX, rNoOfBlocksY);

	// One block-sized histogram per thread of the CUDA block.
	const size_t floatsPerHistogram = (size_t)noHistogramBins * blockSizeX * blockSizeY;
	const size_t threadsPerBlock = (size_t)cellSizeX * blockSizeX * blockSizeY;
	const size_t sharedBytes = floatsPerHistogram * threadsPerBlock * sizeof(float);

	cutilSafeCall(cudaBindTextureToArray(texGauss, gaussArray, channelDescGauss));

	computeBlockHistogramsWithGauss<<<hBlockSize, hThreadSize, sharedBytes>>>
		(inputImage, blockHistograms, noHistogramBins, cellSizeX, cellSizeY, blockSizeX, blockSizeY, leftoverX, leftoverY, width, height);

	// Kernel launches return nothing; configuration errors only surface through cudaGetLastError().
	cutilSafeCall(cudaGetLastError());

	cutilSafeCall(cudaUnbindTexture(texGauss));
}
414 | 
/**
 * Host-side launcher for the normalizeBlockHistograms kernel.
 *
 * Side effects: refreshes the file-level counters rNoOfCellsX/Y and rNoOfBlocksX/Y.
 *
 * Launch layout: grid (rNoOfBlocksX, rNoOfBlocksY), threads (noHistogramBins, blockSizeX, blockSizeY),
 * with one float of dynamic shared memory per thread.
 *
 * When the image cannot hold a single HOG block the counters are still updated but
 * nothing is launched (the grid would have a non-positive dimension).
 *
 * @param blockHistograms device buffer normalised in place by the kernel
 * @param noHistogramBins number of orientation bins per cell histogram
 * @param cellSizeX       cell width in pixels
 * @param cellSizeY       cell height in pixels
 * @param blockSizeX      HOG block width in cells
 * @param blockSizeY      HOG block height in cells
 * @param width           image width in pixels
 * @param height          image height in pixels
 */
__host__ void NormalizeBlockHistograms(float1* blockHistograms, int noHistogramBins,
									   int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY,
									   int width, int height)
{
	rNoOfCellsX = width / cellSizeX;
	rNoOfCellsY = height / cellSizeY;

	rNoOfBlocksX = rNoOfCellsX - blockSizeX + 1;
	rNoOfBlocksY = rNoOfCellsY - blockSizeY + 1;

	// A non-positive grid dimension is an invalid launch configuration; skip the launch
	// instead of letting it fail silently.
	if (rNoOfBlocksX < 1 || rNoOfBlocksY < 1)
		return;

	const dim3 hThreadSize(noHistogramBins, blockSizeX, blockSizeY);
	const dim3 hBlockSize(rNoOfBlocksX, rNoOfBlocksY);

	// The kernel halves these values to obtain the starting stride of each reduction.
	const int alignedBlockDimX = iClosestPowerOfTwo(noHistogramBins);
	const int alignedBlockDimY = iClosestPowerOfTwo(blockSizeX);
	const int alignedBlockDimZ = iClosestPowerOfTwo(blockSizeY);

	const size_t sharedBytes = (size_t)noHistogramBins * blockSizeX * blockSizeY * sizeof(float);

	normalizeBlockHistograms<<<hBlockSize, hThreadSize, sharedBytes>>>
		(blockHistograms, noHistogramBins,
		rNoOfBlocksX, rNoOfBlocksY, blockSizeX, blockSizeY,
		alignedBlockDimX, alignedBlockDimY, alignedBlockDimZ,
		noHistogramBins * rNoOfCellsX, rNoOfCellsY);

	// Kernel launches return nothing; configuration errors only surface through cudaGetLastError().
	cutilSafeCall(cudaGetLastError());
}
441 | 
/**
 * Sums the per-thread values held in shared memory so that shValues[0] ends up with the
 * total over the whole CUDA block.
 *
 * The z, then y, then x thread dimension is folded in turn; each fold starts at half of
 * the matching aligned dimension and skips partners that lie outside blockDim, so
 * dimensions that are not powers of two are handled. Every thread of the block must call
 * this function (it contains __syncthreads()), and all threads are synchronised on return.
 *
 * @param shValues         shared buffer with one float per thread
 * @param ownPos           index of the calling thread's own slot in shValues
 * @param noHistogramBins  stride between consecutive threadIdx.y slots
 * @param alignedBlockDimX starting span for the x fold (halved before first use)
 * @param alignedBlockDimY starting span for the y fold
 * @param alignedBlockDimZ starting span for the z fold
 */
static __device__ void hogSumSharedValues(float* shValues, int ownPos, int noHistogramBins,
										  int alignedBlockDimX, int alignedBlockDimY, int alignedBlockDimZ)
{
	int partnerPos;

	for (unsigned int s = alignedBlockDimZ >> 1; s > 0; s >>= 1)
	{
		if (threadIdx.z < s && (threadIdx.z + s) < blockDim.z)
		{
			partnerPos = __mul24(threadIdx.y, noHistogramBins) + __mul24((threadIdx.z + s), blockDim.x) * blockDim.y + threadIdx.x;
			shValues[ownPos] += shValues[partnerPos];
		}

		__syncthreads();
	}

	for (unsigned int s = alignedBlockDimY >> 1; s > 0; s >>= 1)
	{
		if (threadIdx.y < s && (threadIdx.y + s) < blockDim.y)
		{
			partnerPos = __mul24((threadIdx.y + s), noHistogramBins) + __mul24(threadIdx.z, blockDim.x) * blockDim.y + threadIdx.x;
			shValues[ownPos] += shValues[partnerPos];
		}

		__syncthreads();
	}

	for (unsigned int s = alignedBlockDimX >> 1; s > 0; s >>= 1)
	{
		if (threadIdx.x < s && (threadIdx.x + s) < blockDim.x)
		{
			partnerPos = __mul24(threadIdx.y, noHistogramBins) + __mul24(threadIdx.z, blockDim.x) * blockDim.y + (threadIdx.x + s);
			shValues[ownPos] += shValues[partnerPos];
		}

		__syncthreads();
	}
}

/**
 * Normalises the histogram values of one HOG block in place; each thread owns one value.
 *
 * Expected launch (see NormalizeBlockHistograms): grid of one CUDA block per HOG block,
 * blockDim = (noHistogramBins, blockSizeX, blockSizeY), and
 * blockDim.x * blockDim.y * blockDim.z floats of dynamic shared memory.
 *
 * Steps:
 *   1. divide by sqrt(sum of squares) + noHistogramBins * blockSizeX * blockSizeY,
 *   2. clamp the result to at most 0.2,
 *   3. divide by sqrt(sum of squares of the clamped values) + 0.01.
 * NOTE(review): this looks like the normalise / clip / renormalise ("L2-Hys") scheme of
 * the HOG literature, with 0.2 as the clipping threshold - confirm.
 *
 * The value is read from a position built from (threadIdx.y, threadIdx.z) and written to
 * the position with those two indices swapped.
 * NOTE(review): presumably a deliberate reordering for the later stages - confirm.
 *
 * rNoOfHOGBlocksX, rNoOfHOGBlocksY, width and height are currently unused.
 */
__global__ void normalizeBlockHistograms(float1 *blockHistograms, int noHistogramBins,
										 int rNoOfHOGBlocksX, int rNoOfHOGBlocksY,
										 int blockSizeX, int blockSizeY,
										 int alignedBlockDimX, int alignedBlockDimY, int alignedBlockDimZ,
										 int width, int height)
{
	float* shLocalHistogram = (float*)allShared;

	const float eps2 = 0.01f;

	// Slot of this thread in shared memory.
	const int smemLocalHistogramPos = __mul24(threadIdx.y, noHistogramBins) + __mul24(threadIdx.z, blockDim.x) * blockDim.y + threadIdx.x;

	// Global read position, and the write position with threadIdx.y / threadIdx.z exchanged.
	const int gmemPosBlock = __mul24(threadIdx.y, noHistogramBins) + __mul24(threadIdx.z, gridDim.x) * __mul24(blockDim.y, blockDim.x) +
		threadIdx.x + __mul24(blockIdx.x, noHistogramBins) * blockDim.y + __mul24(blockIdx.y, gridDim.x) * __mul24(blockDim.y, blockDim.x) * blockDim.z;
	const int gmemWritePosBlock = __mul24(threadIdx.z, noHistogramBins) + __mul24(threadIdx.y, gridDim.x) * __mul24(blockDim.y, blockDim.x) +
		threadIdx.x + __mul24(blockIdx.x, noHistogramBins) * blockDim.y + __mul24(blockIdx.y, gridDim.x) * __mul24(blockDim.y, blockDim.x) * blockDim.z;

	float localValue = blockHistograms[gmemPosBlock].x;

	// First pass: sum of squares of the raw values.
	shLocalHistogram[smemLocalHistogramPos] = localValue * localValue;
	__syncthreads();

	hogSumSharedValues(shLocalHistogram, smemLocalHistogramPos, noHistogramBins,
		alignedBlockDimX, alignedBlockDimY, alignedBlockDimZ);

	const float norm1 = sqrtf(shLocalHistogram[0]) + __mul24(noHistogramBins, blockSizeX) * blockSizeY;
	localValue /= norm1;

	// Clamp large values before the second normalisation.
	localValue = fminf(0.2f, localValue);

	// Every thread must have read shLocalHistogram[0] before the buffer is reused.
	__syncthreads();

	// Second pass: sum of squares of the clamped values.
	shLocalHistogram[smemLocalHistogramPos] = localValue * localValue;
	__syncthreads();

	hogSumSharedValues(shLocalHistogram, smemLocalHistogramPos, noHistogramBins,
		alignedBlockDimX, alignedBlockDimY, alignedBlockDimZ);

	const float norm2 = sqrtf(shLocalHistogram[0]) + eps2;
	localValue /= norm2;

	blockHistograms[gmemWritePosBlock].x = localValue;
}
572 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGHistogram.h:
--------------------------------------------------------------------------------
// Declarations for the HOG histogram stage: host-side launchers and the CUDA kernels they start.
#ifndef __HOG_HISTOGRAM__
#define __HOG_HISTOGRAM__

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#ifdef _WIN32
// NOTE(review): windows.h does not look at WINDOWS_LEAN_AND_MEAN; the macro it honours is
// WIN32_LEAN_AND_MEAN, so this define currently has no effect. Left as is because trimming
// windows.h could affect other translation units - confirm before changing.
#  define WINDOWS_LEAN_AND_MEAN
#  include <windows.h>
#endif

#include <cuda_gl_interop.h>
#include <cuda.h>

#include "HOGDefines.h"

// Presumably set up / release the resources of the histogram stage - see the definitions
// in HOGHistogram.cu.
__host__ void InitHistograms(int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY, int noHistogramBins, float wtscale);
__host__ void CloseHistogram();

// Launches computeBlockHistogramsWithGauss with one CUDA block per HOG block and
// (cellSizeX, blockSizeX, blockSizeY) threads; sizes are in pixels for cells, windows and
// the image, and in cells for blocks.
__host__ void ComputeBlockHistogramsWithGauss(float2* inputImage, float1* blockHistograms, int noHistogramBins,
											  int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY,
											  int windowSizeX, int windowSizeY,
											  int width, int height);
// Launches normalizeBlockHistograms with one CUDA block per HOG block and
// (noHistogramBins, blockSizeX, blockSizeY) threads; blockHistograms is modified in place.
__host__ void NormalizeBlockHistograms(float1* blockHistograms, int noHistogramBins,
									  int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY,
									  int width, int height);

// Kernel: accumulates per-thread histograms in shared memory, reduces them and writes the
// result to the .x fields of blockHistograms.
__global__ void computeBlockHistogramsWithGauss(float2* inputImage, float1* blockHistograms, int noHistogramBins,
												int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY,
												int leftoverX, int leftoverY, int width, int height);

// Kernel: normalises the values of each HOG block in two passes (normalise, clamp to 0.2,
// normalise again). alignedBlockDimX/Y/Z give the starting spans of its shared-memory
// reductions.
__global__ void normalizeBlockHistograms(float1 *blockHistograms, int noHistogramBins,
										int rNoOfHOGBlocksX, int rNoOfHOGBlocksY,
										int blockSizeX, int blockSizeY,
										int alignedBlockDimX, int alignedBlockDimY, int alignedBlockDimZ,
										int width, int height);

#endif
40 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGHistogram.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,allShared,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z31computeBlockHistogramsWithGaussP6float2P6float1iiiiiiiii,_Z24normalizeBlockHistogramsP6float1iiiiiiiiii


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGImage.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * HOGImage.cpp
 3 |  *
 4 |  *  Created on: May 14, 2009
 5 |  *      Author: viprad
 6 |  */
 7 | 
 8 | #include "HOGImage.h"
 9 | 
10 | #include <stdlib.h>
11 | #include <string.h>
12 | 
13 | #include <stdio.h>
14 | 
15 | #include <FreeImage.h>
16 | 
17 | using namespace HOG;
18 | 
19 | HOGImage::HOGImage(int width, int height)
20 | {
21 | 	this->width = width;
22 | 	this->height = height;
23 | 
24 | 	isLoaded = false;
25 | 	this->pixels = (unsigned char*) malloc(sizeof(unsigned char) * 4 * width * height);
26 | 	memset(this->pixels, 0, sizeof(unsigned char) * 4 * width * height);
27 | }
28 | 
29 | HOGImage::HOGImage(int width, int height, unsigned char* pixels)
30 | {
31 | 	this->width = width;
32 | 	this->height = height;
33 | 
34 | 	this->pixels = (unsigned char*) malloc(sizeof(unsigned char) * 4 * width * height);
35 | 	memcpy(this->pixels, pixels, sizeof(unsigned char) * 4 * width * height);
36 | 
37 | 	isLoaded = true;
38 | }
39 | 
40 | HOGImage::HOGImage(char* fileName)
41 | {
42 | 	bool bLoaded = false;
43 | 	int bpp;
44 | 	FIBITMAP *bmp = 0;
45 | 	FREE_IMAGE_FORMAT fif = FIF_UNKNOWN;
46 | 	fif = FreeImage_GetFileType(fileName);
47 | 	if (fif == FIF_UNKNOWN)
48 | 	{
49 | 		fif = FreeImage_GetFIFFromFilename(fileName);
50 | 	}
51 | 
52 | 	if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsReading(fif))
53 | 	{
54 | 		bmp = FreeImage_Load(fif, fileName, 0);
55 | 		bLoaded = true;
56 | 		if (bmp == NULL)
57 | 			bLoaded = false;
58 | 	}
59 | 
60 | 	if (bLoaded)
61 | 	{
62 | 		width = FreeImage_GetWidth(bmp);
63 | 		height = FreeImage_GetHeight(bmp);
64 | 
65 | 		bpp = FreeImage_GetBPP(bmp);
66 | 		switch (bpp)
67 | 		{
68 | 		case 32:
69 | 			break;
70 | 		default:
71 | 			FIBITMAP *bmpTemp = FreeImage_ConvertTo32Bits(bmp);
72 | 			if (bmp != NULL) FreeImage_Unload(bmp);
73 | 			bmp = bmpTemp;
74 | 			bpp = FreeImage_GetBPP(bmp);
75 | 			break;
76 | 		}
77 | 
78 | 		this->pixels = (unsigned char*) malloc(sizeof(unsigned char) * 4 * width * height);
79 | 		FreeImage_ConvertToRawBits(this->pixels, bmp, width * 4, bpp, FI_RGBA_RED_MASK, FI_RGBA_GREEN_MASK, FI_RGBA_BLUE_MASK, true);
80 | 
81 | 		isLoaded = true;
82 | 	}
83 | 	else
84 | 		isLoaded = false;
85 | }
86 | 
87 | HOGImage::~HOGImage()
88 | {
89 | 	free(pixels);
90 | }
91 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGImage.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | * HOGImage.h
 3 | *
 4 | *  Created on: May 14, 2009
 5 | *      Author: viprad
 6 | */
 7 | 
#ifndef __HOGIMAGE_H__
#define __HOGIMAGE_H__

namespace HOG
{
	// Image held as a raw buffer of 4 bytes per pixel that the object owns.
	// NOTE(review): no copy constructor or assignment operator is declared, so copying an
	// instance would make two objects free the same buffer in their destructors.
	class HOGImage
	{
	public:
		// True once pixel data has been loaded or copied in; false for a blank image or
		// a failed file load.
		bool isLoaded;

		// Dimensions in pixels.
		int width, height;
		// Pixel data, must be uchar4 (4 bytes per pixel, width * height pixels);
		// allocated with malloc and released by the destructor.
		unsigned char* pixels;

		// Loads the image from a file via FreeImage, converting it to 32 bits per pixel.
		HOGImage(char* fileName);
		// Creates a zero-filled image of the given size.
		HOGImage(int width, int height);
		// Creates an image holding its own copy of the given 4-bytes-per-pixel buffer.
		HOGImage(int width, int height, unsigned char *pixels);

		// Frees the pixel buffer.
		virtual ~HOGImage();
	};
}

#endif /* __HOGIMAGE_H__ */
31 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGNMS.cpp:
--------------------------------------------------------------------------------
  1 | #include "HOGNMS.h"
  2 | 
  3 | #include <math.h>
  4 | 
  5 | using namespace HOG;
  6 | 
  7 | HOGNMS::HOGNMS()
  8 | {
  9 | 	center = 0.0f; scale = 1.0f;
 10 | 	nonmaxSigma[0] = 8.0f; nonmaxSigma[1] = 16.0f; nonmaxSigma[2] = 1.3f;
 11 | 	maxIterations = 100;
 12 | 	modeEpsilon = (float)1e-5;
 13 | 	epsFinalDist = 1.0f;
 14 | 
 15 | 	nsigma[0] = nonmaxSigma[0]; nsigma[1] = nonmaxSigma[1]; nsigma[2] = logf(nonmaxSigma[2]);
 16 | 
 17 | 	isAllocated = false;
 18 | }
 19 | 
 20 | HOGNMS::~HOGNMS()
 21 | {
 22 | 	if (isAllocated)
 23 | 	{
 24 | 		delete tomode;
 25 | 		delete wt;
 26 | 		delete ms;
 27 | 		delete at;
 28 | 		delete nmsResults;
 29 | 		delete nmsToMode;
 30 | 	}
 31 | }
 32 | 
 33 | void HOGNMS::nvalue(HOGPoint3* ms, HOGPoint3* at, float* wt, int length)
 34 | {
 35 | 	int i, j;
 36 | 	float dotxmr, w;
 37 | 	HOGPoint3 x, r, ns, numer, denum;
 38 | 
 39 | 	for (i=0; i<length; i++)
 40 | 	{
 41 | 		numer.x = 0; numer.y = 0; numer.z = 0;
 42 | 		denum.x = 0; denum.y = 0; denum.z = 0;
 43 | 
 44 | 		for (j=0; j<length; j++)
 45 | 		{
 46 | 			ns.x = nsigma[0] * expf(at[j].z); ns.y =  nsigma[1] * expf(at[j].z); ns.z = nsigma[2];
 47 | 			x.x = at[j].x / ns.x; x.y = at[j].y / ns.y; x.z = at[j].z / ns.z;
 48 | 			r.x = at[i].x / ns.x; r.y = at[i].y / ns.y; r.z = at[i].z / ns.z;
 49 | 
 50 | 			dotxmr = (x.x - r.x) * (x.x - r.x) + (x.y - r.y) * (x.y - r.y) + (x.z - r.z) * (x.z - r.z);
 51 | 			w = wt[j] * expf(-dotxmr/2.0f)/sqrtf(ns.x * ns.y * ns.z);
 52 | 
 53 | 			numer.x += w * x.x; numer.y += w * x.y; numer.z += w * x.z;
 54 | 			denum.x += w / ns.x; denum.y += w / ns.y; denum.z += w / ns.z;
 55 | 		}
 56 | 
 57 | 		ms[i].x = numer.x / denum.x; ms[i].y = numer.y / denum.y; ms[i].z = numer.z / denum.z;
 58 | 	}
 59 | }
 60 | 
 61 | void HOGNMS::nvalue(HOGPoint3 *ms, HOGPoint3* msnext, HOGPoint3* at, float* wt, int length)
 62 | {
 63 | 	int j;
 64 | 	float dotxmr, w;
 65 | 	HOGPoint3 x, r, ns, numer, denum, toReturn;
 66 | 
 67 | 	for (j=0; j<length; j++)
 68 | 	{
 69 | 		ns.x = nsigma[0] * expf(at[j].z); ns.y =  nsigma[1] * expf(at[j].z); ns.z = nsigma[2];
 70 | 		x.x = at[j].x / ns.x; x.y = at[j].y / ns.y; x.z = at[j].z / ns.z;
 71 | 		r.x = ms->x / ns.x; r.y = ms->y / ns.y; r.z = ms->z / ns.z;
 72 | 
 73 | 		dotxmr = (x.x - r.x) * (x.x - r.x) + (x.y - r.y) * (x.y - r.y) + (x.z - r.z) * (x.z - r.z);
 74 | 		w = wt[j] * expf(-dotxmr/2.0f)/sqrtf(ns.x * ns.y * ns.z);
 75 | 
 76 | 		numer.x += w * x.x; numer.y += w * x.y; numer.z += w * x.z;
 77 | 		denum.x += w / ns.x; denum.y += w / ns.y; denum.z += w / ns.z;
 78 | 	}
 79 | 
 80 | 	msnext->x = numer.x / denum.x; msnext->y = numer.y / denum.y; msnext->z = numer.z / denum.z;
 81 | }
 82 | 
 83 | void HOGNMS::fvalue(HOGPoint3* modes, HOGResult* results, int lengthModes, HOGPoint3* at, float* wt, int length)
 84 | {
 85 | 	int i, j;
 86 | 	float no, dotxx;
 87 | 	HOGPoint3 x, ns;
 88 | 	for (i=0; i<lengthModes; i++)
 89 | 	{
 90 | 		no = 0;
 91 | 		for (j=0; j<length; j++)
 92 | 		{
 93 | 			ns.x = nsigma[0] * expf(at[j].z); ns.y =  nsigma[1] * expf(at[j].z); ns.z = nsigma[2];
 94 | 			x.x = (at[j].x - modes[i].x) / ns.x;
 95 | 			x.y = (at[j].y - modes[i].y) / ns.y;
 96 | 			x.z = (at[j].z - modes[i].z) / ns.z;
 97 | 
 98 | 			dotxx = x.x * x.x + x.y * x.y + x.z * x.z;
 99 | 
100 | 			no += wt[j] * expf(-dotxx/2)/sqrtf(ns.x * ns.y * ns.z);
101 | 		}
102 | 		results[i].score = no;
103 | 	}
104 | }
105 | 
106 | float HOGNMS::distqt(HOGPoint3 *p1, HOGPoint3 *p2)
107 | {
108 | 	HOGPoint3 ns, b;
109 | 	ns.x = nsigma[0] * expf(p2->z); ns.y = nsigma[1] * expf(p2->z); ns.z = nsigma[2];
110 | 	b.x = p2->x - p1->x; b.y = p2->y - p1->y; b.z = p2->z - p1->z;
111 | 	b.x /= ns.x; b.y /= ns.y; b.z /= ns.z;
112 | 	return b.x * b.x + b.y * b.y + b.z * b.z;
113 | }
114 | 
115 | void HOGNMS::shiftToMode(HOGPoint3* ms, HOGPoint3* at, float* wt, HOGPoint3 *tomode, int length)
116 | {
117 | 	int i, count;
118 | 	HOGPoint3 ii,II;
119 | 	for (i=0; i<length; i++)
120 | 	{
121 | 		II = ms[i];;
122 | 		count = 0;
123 | 
124 | 		do
125 | 		{
126 | 			ii = II;
127 | 			nvalue(&ii, &II, at, wt, length);
128 | 			++count;
129 | 		} while ( count < maxIterations && distqt(&ii,&II) > modeEpsilon );
130 | 
131 | 		tomode[i].x = II.x; tomode[i].y = II.y; tomode[i].z = II.z;
132 | 	}
133 | }
134 | 
135 | HOGResult* HOGNMS::ComputeNMSResults(HOGResult* formattedResults, int formattedResultsCount, bool *nmsResultsAvailable, int *nmsResultsCount,
136 | 									 int hWindowSizeX, int hWindowSizeY)
137 | {
138 | 	if (!isAllocated)
139 | 	{
140 | 		wt = new float[hWindowSizeX * hWindowSizeX];
141 | 		at = new HOGPoint3[hWindowSizeX * hWindowSizeX];
142 | 		ms = new HOGPoint3[hWindowSizeX * hWindowSizeX];
143 | 		tomode = new HOGPoint3[hWindowSizeX * hWindowSizeX];
144 | 		nmsToMode = new HOGPoint3[hWindowSizeX * hWindowSizeX];
145 | 		nmsResults = new HOGResult[hWindowSizeX * hWindowSizeX];
146 | 		isAllocated = true;
147 | 	}
148 | 
149 | 	int i, j;
150 | 	float cenx, ceny, nmsOK;
151 | 
152 | 	*nmsResultsCount = 0;
153 | 	nmsResultsAvailable = false;
154 | 
155 | 	for (i=0; i<formattedResultsCount; i++)
156 | 	{
157 | 		wt[i] = this->sigmoid(formattedResults[i].score);
158 | 		cenx = formattedResults[i].x + formattedResults[i].width / 2.0f;
159 | 		ceny = formattedResults[i].y + formattedResults[i].height / 2.0f;
160 | 		at[i] = HOGPoint3(cenx, ceny, logf(formattedResults[i].scale));
161 | 	}
162 | 
163 | 	nvalue(ms, at, wt, formattedResultsCount);
164 | 	shiftToMode(ms, at, wt, tomode, formattedResultsCount);
165 | 
166 | 	for (i=0; i<formattedResultsCount; i++)
167 | 	{
168 | 		nmsOK = true;
169 | 		for (j=0; j<*nmsResultsCount; j++)
170 | 		{
171 | 			if (distqt(&nmsToMode[j], &tomode[i]) < epsFinalDist)
172 | 			{
173 | 				nmsOK = false;
174 | 				break;
175 | 			}
176 | 		}
177 | 
178 | 		if (nmsOK)
179 | 		{
180 | 			nmsResults[*nmsResultsCount].scale = expf(tomode[i].z);
181 | 
182 | 			nmsResults[*nmsResultsCount].width = (int)floorf((float)hWindowSizeX * nmsResults[*nmsResultsCount].scale);
183 | 			nmsResults[*nmsResultsCount].height = (int)floorf((float)hWindowSizeY * nmsResults[*nmsResultsCount].scale);
184 | 
185 | 			nmsResults[*nmsResultsCount].x = (int)ceilf(tomode[i].x - (float) hWindowSizeX * nmsResults[*nmsResultsCount].scale / 2);
186 | 			nmsResults[*nmsResultsCount].y = (int)ceilf(tomode[i].y - (float) hWindowSizeY * nmsResults[*nmsResultsCount].scale / 2);
187 | 	
188 | 			nmsToMode[*nmsResultsCount] = tomode[i];
189 | 
190 | 			(*nmsResultsCount)++;
191 | 		}
192 | 	}
193 | 
194 | 	fvalue(nmsToMode, nmsResults, *nmsResultsCount, at, wt, formattedResultsCount);
195 | 
196 | 	return nmsResults;
197 | }
198 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGNMS.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HOG_NMS__
 2 | #define __HOG_NMS__
 3 | 
 4 | #include "HOGPoint3.h"
 5 | #include "HOGResult.h"
 6 | 
 7 | namespace HOG
 8 | {
 9 | 	class HOGNMS
10 | 	{
11 | 	private:
12 | 		HOGPoint3 *at, *ms, *tomode, *nmsToMode;
13 | 
14 | 		HOGResult *nmsResults;
15 | 
16 | 		float* wt;
17 | 
18 | 		float center, scale;
19 | 		float nonmaxSigma[3];
20 | 		float nsigma[3];
21 | 		float modeEpsilon;
22 | 		float epsFinalDist;
23 | 
24 | 		int maxIterations;
25 | 
26 | 		bool isAllocated;
27 | 
28 | 		float sigmoid(float score) { return (score > center) ? scale * (score - center) : 0.0f; }
29 | 		void nvalue(HOGPoint3* ms, HOGPoint3* at, float* wt, int length);
30 | 		void nvalue(HOGPoint3* ms, HOGPoint3* msnext, HOGPoint3* at, float* wt, int length);
31 | 		void fvalue(HOGPoint3* modes, HOGResult* results, int lengthModes, HOGPoint3* at, float* wt, int length);
32 | 		void shiftToMode(HOGPoint3* ms, HOGPoint3* at, float* wt, HOGPoint3 *tomode, int length);
33 | 		float distqt(HOGPoint3 *p1, HOGPoint3 *p2);
34 | 
35 | 	public:
36 | 		HOGResult* ComputeNMSResults(HOGResult* formattedResults, int formattedResultsCount, bool *nmsResultsAvailable, int *nmsResultsCount,
37 | 			int hWindowSizeX, int hWindowSizeY);
38 | 
39 | 		HOGNMS();
40 | 		~HOGNMS(void);
41 | 	};
42 | }
43 | #endif
44 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGPadding.cu:
--------------------------------------------------------------------------------
 1 | #include "HOGPadding.h"
 2 | #include "HOGUtils.h"
 3 | #include "cutil.h"
 4 | 
 5 | extern int hWidthROI, hHeightROI;
 6 | extern int hPaddedWidth, hPaddedHeight;
 7 | extern int hWidth, hHeight;
 8 | extern int hPaddingSizeX, hPaddingSizeY;
 9 | 
10 | extern int avSizeX, avSizeY, marginX, marginY;
11 | 
12 | uchar4* paddedRegisteredImageU4;
13 | 
// Allocates the device-side uchar4 staging image used by PadHostImage.
__host__ void InitPadding(int hPaddedWidth, int hPaddedHeight)
{
	const size_t stagingBytes = sizeof(uchar4) * hPaddedWidth * hPaddedHeight;
	cutilSafeCall(cudaMalloc((void**) &paddedRegisteredImageU4, stagingBytes));
}
18 | 
// Releases the device staging image allocated by InitPadding.
// The pointer is cleared afterwards so a repeated call frees a null pointer
// (a no-op for cudaFree) instead of freeing the same allocation twice.
__host__ void ClosePadding()
{
	cutilSafeCall(cudaFree(paddedRegisteredImageU4));
	paddedRegisteredImageU4 = 0;
}
23 | 
// Copies the region [minx,maxx) x [miny,maxy) of a host image into the centre of a
// zero-filled device image and converts the result to float4.
// Updates the globals hWidthROI/hHeightROI, hPaddingSizeX/Y and hPaddedWidth/Height.
//   registeredImage       : host uchar4 image with a row length of hWidth pixels
//   paddedRegisteredImage : device float4 output, hPaddedWidth x hPaddedHeight
__host__ void PadHostImage(uchar4* registeredImage, float4 *paddedRegisteredImage,
		int minx, int miny, int maxx, int maxy)
{
	hWidthROI = maxx - minx;
	hHeightROI = maxy - miny;

	// Candidate paddings derived from each average-size value; a zero value contributes nothing.
	int padXFromAvX = 0, padYFromAvX = 0;
	int padXFromAvY = 0, padYFromAvY = 0;

	if (avSizeX)
	{
		padXFromAvX = hWidthROI * marginX / avSizeX;
		padYFromAvX = hHeightROI * marginY / avSizeX;
	}
	if (avSizeY)
	{
		padXFromAvY = hWidthROI * marginX / avSizeY;
		padYFromAvY = hHeightROI * marginY / avSizeY;
	}

	hPaddingSizeX = max(padXFromAvX, padXFromAvY);
	hPaddingSizeY = max(padYFromAvX, padYFromAvY);

	// The padding is added on both sides of each axis.
	hPaddedWidth = hWidthROI + hPaddingSizeX*2;
	hPaddedHeight = hHeightROI + hPaddingSizeY*2;

	cutilSafeCall(cudaMemset(paddedRegisteredImageU4, 0, sizeof(uchar4) * hPaddedWidth * hPaddedHeight));

	// Strided host-to-device copy of the ROI rows into the interior of the padded image.
	uchar4* deviceInterior = paddedRegisteredImageU4 + hPaddingSizeX + hPaddingSizeY * hPaddedWidth;
	uchar4* hostRegion = registeredImage + minx + miny * hWidth;

	cutilSafeCall(cudaMemcpy2D(deviceInterior,
			hPaddedWidth * sizeof(uchar4), hostRegion,
			hWidth * sizeof(uchar4), hWidthROI * sizeof(uchar4),
			hHeightROI, cudaMemcpyHostToDevice));

	Uchar4ToFloat4(paddedRegisteredImageU4, paddedRegisteredImage, hPaddedWidth, hPaddedHeight);
}
49 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGPadding.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HOG_PADDING__
 2 | #define __HOG_PADDING__
 3 | 
 4 | #include <stdlib.h>
 5 | #include <stdio.h>
 6 | #include <math.h>
 7 | 
 8 | #ifdef _WIN32
 9 | #  define WINDOWS_LEAN_AND_MEAN
10 | #  include <windows.h>
11 | #endif
12 | 
13 | #include <cuda_gl_interop.h>
14 | #include <cuda.h>
15 | #include "HOGDefines.h"
16 | 
17 | __host__ void InitPadding(int hPaddedWidth, int hPaddedHeight);
18 | __host__ void ClosePadding();
19 | 
20 | __host__ void PadHostImage(uchar4* registeredImage, float4 *paddedRegisteredImage,
21 | 		int minx, int miny, int maxx, int maxy);
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGPadding.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export __dummy_entry__


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGPoint3.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HOG_VECTOR_3D__
 2 | #define __HOG_VECTOR_3D__
 3 | 
 4 | namespace HOG
 5 | {
 6 | 	class HOGPoint3
 7 | 	{
 8 | 	public:
 9 | 		float x,y,z;
10 | 
11 | 		HOGPoint3(float x, float y, float z) { this->x = x; this->y = y; this->z = z; }
12 | 		HOGPoint3() { this->x = 0; this->y = 0; this->z = 0; }
13 | 	};
14 | }
15 | 
16 | #endif
17 | 
18 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGResult.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HOG_RESUL__
 2 | #define __HOG_RESUL__
 3 | 
 4 | namespace HOG
 5 | {
 6 | 	class HOGResult
 7 | 	{
 8 | 	public:
 9 | 		float score;
10 | 		float scale;
11 | 
12 | 		int width, height;
13 | 		int origX, origY;
14 | 		int x, y;
15 | 
16 | 		HOGResult()
17 | 		{
18 | 			width = 0;
19 | 			height = 0;
20 | 			origX = 0;
21 | 			origY = 0;
22 | 			x = 0;
23 | 			y = 0;
24 | 		}
25 | 	};
26 | }
27 | 
28 | #endif
29 | 
30 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGSVMSlider.cu:
--------------------------------------------------------------------------------
  1 | #include "HOGSVMSlider.h"
  2 | #include "HOGUtils.h"
  3 | #include "cutil.h"
  4 | 
  5 | texture<float, 1, cudaReadModeElementType> texSVM;
  6 | cudaArray *svmArray = 0;
  7 | 
  8 | cudaChannelFormatDesc channelDescSVM;
  9 | 
 10 | extern int scaleCount;
 11 | extern int hNumberOfWindowsX, hNumberOfWindowsY;
 12 | extern int hNumberOfBlockPerWindowX, hNumberOfBlockPerWindowY;
 13 | extern int rNumberOfWindowsX, rNumberOfWindowsY;
 14 | 
 15 | extern __shared__ float1 allSharedF1[];
 16 | 
 17 | float svmBias;
 18 | 
 19 | __host__ void InitSVM(float _svmBias, float* svmWeights, int svmWeightsCount)
 20 | {
 21 | 	channelDescSVM = cudaCreateChannelDesc<float>();
 22 | 	cutilSafeCall(cudaMallocArray(&svmArray, &channelDescSVM, svmWeightsCount, 1));
 23 | 	cutilSafeCall(cudaMemcpyToArray(svmArray, 0, 0, svmWeights, svmWeightsCount * sizeof(float), cudaMemcpyHostToDevice));
 24 | 	svmBias = _svmBias;
 25 | }
 26 | 
 27 | __host__ void CloseSVM()
 28 | {
 29 | 	cutilSafeCall(cudaFreeArray(svmArray));
 30 | }
 31 | 
 32 | __global__ void linearSVMEvaluation(float1* svmScores, float svmBias,
 33 | 									float1* blockHistograms, int noHistogramBins,
 34 | 									int windowSizeX, int windowSizeY, int hogBlockCountX, int hogBlockCountY,
 35 | 									int cellSizeX, int cellSizeY,
 36 | 									int numberOfBlockPerWindowX, int numberOfBlockPerWindowY,
 37 | 									int blockSizeX, int blockSizeY,
 38 | 									int alignedBlockDimX,
 39 | 									int scaleId, int scaleCount,
 40 | 									int hNumberOfWindowsX, int hNumberOfWindowsY,
 41 | 									int width, int height)
 42 | {
 43 | 	int i;
 44 | 	int texPos;
 45 | 	float1 localValue;
 46 | 	float texValue;
 47 | 
 48 | 	float1* smem = (float1*) allSharedF1;
 49 | 
 50 | 	int gmemPosWindow, gmemPosInWindow, gmemPosInWindowDown, smemLocalPos, smemTargetPos;
 51 | 	int gmemStride = hogBlockCountX * noHistogramBins * blockSizeX;
 52 | 
 53 | 	gmemPosWindow = blockIdx.x * noHistogramBins * blockSizeX + blockIdx.y * blockSizeY * gmemStride;
 54 | 	gmemPosInWindow = gmemPosWindow + threadIdx.x;
 55 | 	smemLocalPos = threadIdx.x;
 56 | 
 57 | 	int val1 = (blockSizeY * blockSizeX * noHistogramBins) * numberOfBlockPerWindowY;
 58 | 	int val2 = blockSizeX * noHistogramBins;
 59 | 	localValue.x = 0;
 60 | 
 61 | 	if (blockIdx.x == 10 && blockIdx.y == 8)
 62 | 	{
 63 | 		int asasasa;
 64 | 		asasasa = 0;
 65 | 		asasasa++;
 66 | 	}
 67 | 
 68 | 	for (i = 0; i<blockSizeY * numberOfBlockPerWindowY; i++)
 69 | 	{
 70 | 		gmemPosInWindowDown = gmemPosInWindow + i * gmemStride;
 71 | 		texPos = threadIdx.x % val2 + i * val2 + threadIdx.x / val2 * val1;
 72 | 		texValue =  tex1D(texSVM, texPos);
 73 | 		localValue.x += blockHistograms[gmemPosInWindowDown].x * texValue;
 74 | 	}
 75 | 
 76 | 	smem[smemLocalPos] = localValue;
 77 | 
 78 | 	__syncthreads();
 79 | 
 80 | 	for(unsigned int s = alignedBlockDimX >> 1; s>0; s>>=1)
 81 | 	{
 82 | 		if (threadIdx.x < s && (threadIdx.x + s) < blockDim.x)
 83 | 		{
 84 | 			smemTargetPos = threadIdx.x + s;
 85 | 			smem[smemLocalPos].x += smem[smemTargetPos].x;
 86 | 		}
 87 | 
 88 | 		__syncthreads();
 89 | 	}
 90 | 
 91 | 	if (threadIdx.x == 0)
 92 | 	{
 93 | 		smem[smemLocalPos].x -= svmBias;
 94 | 		svmScores[blockIdx.x + blockIdx.y * hNumberOfWindowsX + scaleId * hNumberOfWindowsX * hNumberOfWindowsY] = smem[smemLocalPos];
 95 | 	}
 96 | 
 97 | 	if (blockIdx.x == 10 && blockIdx.y == 8)
 98 | 	{
 99 | 		int asasasa;
100 | 		asasasa = 0;
101 | 		asasasa++;
102 | 	}
103 | }
104 | 
// Zero-fills the device score buffer for all scales and all window positions.
__host__ void ResetSVMScores(float1* svmScores)
{
	const size_t scoreBytes = sizeof(float) * scaleCount * hNumberOfWindowsX * hNumberOfWindowsY;
	cutilSafeCall(cudaMemset(svmScores, 0, scoreBytes));
}
109 | 
// Host wrapper: evaluates the SVM on every window position of one scale.
// Updates the globals rNumberOfWindowsX/Y, binds the weight array to texSVM for
// the duration of the launch and reports launch failures through cutilSafeCall.
__host__ void LinearSVMEvaluation(float1* svmScores, float1* blockHistograms, int noHistogramBins,
								  int windowSizeX, int windowSizeY,
								  int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY,
								  int hogBlockCountX, int hogBlockCountY,
								  int scaleId, int width, int height)
{
	rNumberOfWindowsX = (width-windowSizeX)/cellSizeX + 1;
	rNumberOfWindowsY = (height-windowSizeY)/cellSizeY + 1;

	// One thread per histogram value in a row of the window; one block per window.
	const int threadsPerWindow = noHistogramBins * blockSizeX * hNumberOfBlockPerWindowX;

	dim3 threadCount = dim3(threadsPerWindow);
	dim3 blockCount = dim3(rNumberOfWindowsX, rNumberOfWindowsY);

	int alignedBlockDimX = iClosestPowerOfTwo(threadsPerWindow);

	cutilSafeCall(cudaBindTextureToArray(texSVM, svmArray, channelDescSVM));

	linearSVMEvaluation<<<blockCount, threadCount, threadsPerWindow * sizeof(float1)>>>
		(svmScores, svmBias, blockHistograms, noHistogramBins,
		windowSizeX, windowSizeY, hogBlockCountX, hogBlockCountY, cellSizeX, cellSizeY,
		hNumberOfBlockPerWindowX, hNumberOfBlockPerWindowY,
		blockSizeX, blockSizeY, alignedBlockDimX, scaleId, scaleCount,
		hNumberOfWindowsX, hNumberOfWindowsY, width, height);

	// A kernel launch returns no status; a bad configuration only shows up here.
	cutilSafeCall(cudaGetLastError());

	cutilSafeCall(cudaUnbindTexture(texSVM));
}
135 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGSVMSlider.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HOG_SVM_SLIDER__
 2 | #define __HOG_SVM_SLIDER__
 3 | 
 4 | #include <stdlib.h>
 5 | #include <stdio.h>
 6 | #include <math.h>
 7 | 
 8 | #ifdef _WIN32
 9 | #  define WINDOWS_LEAN_AND_MEAN
10 | #  include <windows.h>
11 | #endif
12 | 
13 | #include <cuda_gl_interop.h>
14 | #include <cuda.h>
15 | 
16 | #include "HOGDefines.h"
17 | 
18 | __host__ void InitSVM(float svmBias, float* svmWeights, int svmWeightsCount);
19 | __host__ void CloseSVM();
20 | 
21 | __global__ void linearSVMEvaluation(float1* svmScores, float svmBias,
22 | 									float1* blockHistograms, int noHistogramBins,
23 | 									int windowSizeX, int windowSizeY, int hogBlockCountX, int hogBlockCountY,
24 | 									int cellSizeX, int cellSizeY,
25 | 									int numberOfBlockPerWindowX, int numberOfBlockPerWindowY,
26 | 									int blockSizeX, int blockSizeY,
27 | 									int alignedBlockDimX,
28 | 									int scaleId, int scaleCount,
29 | 									int hNumberOfWindowsX, int hNumberOfWindowsY,
30 | 									int width, int height);
31 | 
32 | __host__ void ResetSVMScores(float1* svmScores);
33 | __host__ void LinearSVMEvaluation(float1* svmScores, float1* blockHistograms, int noHistogramBins,
34 | 								  int windowSizeX, int windowSizeY,
35 | 								  int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY,
36 | 								  int hogBlockCountX, int hogBlockCountY,
37 | 								  int scaleId, int width, int height);
38 | 
39 | #endif
40 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGSVMSlider.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,allSharedF1,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z19linearSVMEvaluationP6float1fS0_iiiiiiiiiiiiiiiiii


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGScale.cu:
--------------------------------------------------------------------------------
  1 | #include "HOGScale.h"
  2 | #include "HOGUtils.h"
  3 | #include "cutil.h"
  4 | 
  5 | extern int rPaddedHeight;
  6 | extern int rPaddedWidth;
  7 | extern int hPaddedHeight;
  8 | extern int hPaddedWidth;
  9 | cudaArray *imageArray = 0;
 10 | texture<float4, 2, cudaReadModeElementType> tex;
 11 | cudaChannelFormatDesc channelDescDownscale;
 12 | 
 13 | bool isAlocated;
 14 | 
 15 | // w0, w1, w2, and w3 are the four cubic B-spline basis functions
 16 | __device__ float w0(float a) { return (1.0f/6.0f)*(a*(a*(-a + 3.0f) - 3.0f) + 1.0f); }
 17 | __device__ float w1(float a) { return (1.0f/6.0f)*(a*a*(3.0f*a - 6.0f) + 4.0f); }
 18 | __device__ float w2(float a) { return (1.0f/6.0f)*(a*(a*(-3.0f*a + 3.0f) + 3.0f) + 1.0f); }
 19 | __device__ float w3(float a) { return (1.0f/6.0f)*(a*a*a); }
 20 | 
 21 | // g0 and g1 are the two amplitude functions
 22 | __device__ float g0(float a) { return w0(a) + w1(a); }
 23 | __device__ float g1(float a) { return w2(a) + w3(a); }
 24 | 
 25 | // h0 and h1 are the two offset functions
 26 | __device__ float h0(float a) { return -1.0f + w1(a) / (w0(a) + w1(a)) + 0.5f; }
 27 | __device__ float h1(float a) { return 1.0f + w3(a) / (w2(a) + w3(a)) + 0.5f; }
 28 | 
 29 | __host__ void InitScale(int hPaddedWidth, int hPaddedHeight)
 30 | {
 31 | 	channelDescDownscale = cudaCreateChannelDesc<float4>();
 32 | 	tex.filterMode = cudaFilterModeLinear;
 33 | 	tex.normalized = false;
 34 | 	isAlocated = false;
 35 | }
 36 | 
// Shutdown hook of the scaling stage; currently performs no work.
// NOTE(review): the release of imageArray below is disabled, so the array is only
// freed by DownscaleImage when it processes endScaleId - confirm this is intended.
__host__ void CloseScale()
{
	//if (isAlocated) cutilSafeCall(cudaFreeArray(imageArray));
}
 41 | 
 42 | __host__ void DownscaleImage(int startScaleId, int endScaleId, int scaleId, float scale, 
 43 | 							 bool useGrayscale, float4* paddedRegisteredImage,
 44 | 							 float1* resizedPaddedImageF1, float4* resizedPaddedImageF4)
 45 | {
 46 | 	dim3 hThreadSize, hBlockSize;
 47 | 
 48 | 	hThreadSize = dim3(THREAD_SIZE_W, THREAD_SIZE_H);
 49 | 
 50 | 	rPaddedWidth = iDivUpF(hPaddedWidth, scale);
 51 | 	rPaddedHeight = iDivUpF(hPaddedHeight, scale);
 52 | 
 53 | 	hBlockSize = dim3(iDivUp(rPaddedWidth, hThreadSize.x), iDivUp(rPaddedHeight, hThreadSize.y));
 54 | 
 55 | 	if (scaleId == startScaleId)
 56 | 	{
 57 | 		if (isAlocated)
 58 | 			cutilSafeCall(cudaFreeArray(imageArray));
 59 | 		cutilSafeCall(cudaMallocArray(&imageArray, &channelDescDownscale, hPaddedWidth, hPaddedHeight) );
 60 | 		cutilSafeCall(cudaMemcpyToArray(imageArray, 0, 0, paddedRegisteredImage, sizeof(float4) * hPaddedWidth * hPaddedHeight, cudaMemcpyDeviceToDevice));
 61 | 		isAlocated = true;
 62 | 	}
 63 | 
 64 | 	cutilSafeCall(cudaBindTextureToArray(tex, imageArray, channelDescDownscale));
 65 | 
 66 | 	if (useGrayscale)
 67 | 	{
 68 | 		cutilSafeCall(cudaMemset(resizedPaddedImageF1, 0, hPaddedWidth * hPaddedHeight * sizeof(float1)));
 69 | 		resizeFastBicubic1<<<hBlockSize, hThreadSize>>>(resizedPaddedImageF1, paddedRegisteredImage, rPaddedWidth, rPaddedHeight, scale);
 70 | 	}
 71 | 	else
 72 | 	{
 73 | 		cutilSafeCall(cudaMemset(resizedPaddedImageF4, 0, hPaddedWidth * hPaddedHeight * sizeof(float4)));
 74 | 		resizeFastBicubic4<<<hBlockSize, hThreadSize>>>(resizedPaddedImageF4, paddedRegisteredImage, rPaddedWidth, rPaddedHeight, scale);
 75 | 	}
 76 | 
 77 | 	cutilSafeCall(cudaUnbindTexture(tex));
 78 | 
 79 | 	if (scaleId == endScaleId)
 80 | 	{
 81 | 		cutilSafeCall(cudaFreeArray(imageArray));
 82 | 		isAlocated = false;
 83 | 	}
 84 | }
 85 | 
 86 | __device__ float4 tex2DFastBicubic(const texture<float4, 2, cudaReadModeElementType> texref, float x, float y)
 87 | {
 88 | 	float4 r;
 89 | 	float4 val0, val1, val2, val3;
 90 | 
 91 | 	x -= 0.5f;
 92 | 	y -= 0.5f;
 93 | 	float px = floor(x);
 94 | 	float py = floor(y);
 95 | 	float fx = x - px;
 96 | 	float fy = y - py;
 97 | 
 98 | 	float g0x = g0(fx);
 99 | 	float g1x = g1(fx);
100 | 	float h0x = h0(fx);
101 | 	float h1x = h1(fx);
102 | 	float h0y = h0(fy);
103 | 	float h1y = h1(fy);
104 | 
105 | 	val0 = tex2D(texref, px + h0x, py + h0y);
106 | 	val1 = tex2D(texref, px + h1x, py + h0y);
107 | 	val2 = tex2D(texref, px + h0x, py + h1y);
108 | 	val3 = tex2D(texref, px + h1x, py + h1y);
109 | 
110 | 	r.x = (g0(fy) * (g0x * val0.x + g1x * val1.x) + g1(fy) * (g0x * val2.x + g1x * val3.x));
111 | 	r.y = (g0(fy) * (g0x * val0.y + g1x * val1.y) + g1(fy) * (g0x * val2.y + g1x * val3.y));
112 | 	r.z = (g0(fy) * (g0x * val0.z + g1x * val1.z) + g1(fy) * (g0x * val2.z + g1x * val3.z));
113 | 	r.w = (g0(fy) * (g0x * val0.w + g1x * val1.w) + g1(fy) * (g0x * val2.w + g1x * val3.w));
114 | 
115 | 	return r;
116 | }
117 | 
// Writes the resized colour image: one thread per output pixel of a width x height image
// (2D launch). For scale == 1 the pixel is read straight from paddedRegisteredImage,
// otherwise it is fetched from the bound texture at (x*scale, y*scale). The x/y/z
// channels are stored as their square roots and w is set to 0.
__global__ void resizeFastBicubic4(float4 *outputFloat, float4* paddedRegisteredImage, int width, int height, float scale)
{
	const int x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	const int y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

	// Threads of partially filled edge blocks have nothing to write.
	if (!(x < width && y < height))
		return;

	const int outIndex = __umul24(y, width) + x;

	float4 pixel;
	if (scale == 1.0f)
	{
		pixel = paddedRegisteredImage[x + y * width];
	}
	else
	{
		const float u = x*scale;
		const float v = y*scale;
		pixel = tex2D(tex, u, v);
	}

	pixel.x = sqrtf(pixel.x);
	pixel.y = sqrtf(pixel.y);
	pixel.z = sqrtf(pixel.z);
	pixel.w = 0;

	outputFloat[outIndex] = pixel;
}
146 | 
// Writes the resized grayscale image: one thread per output pixel of a width x height
// image (2D launch). For scale == 1 the pixel is read straight from paddedRegisteredImage,
// otherwise it is fetched from the bound texture at (x*scale, y*scale). The output is the
// square root of the weighted sum of the x/y/z channels.
__global__ void resizeFastBicubic1(float1 *outputFloat, float4* paddedRegisteredImage, int width, int height, float scale)
{
	const int x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	const int y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

	// Threads of partially filled edge blocks have nothing to write.
	if (!(x < width && y < height))
		return;

	const int outIndex = __umul24(y, width) + x;

	float4 pixel;
	if (scale == 1.0f)
	{
		pixel = paddedRegisteredImage[x + y * width];
	}
	else
	{
		const float u = x*scale;
		const float v = y*scale;
		pixel = tex2D(tex, u, v);
	}
	pixel.w = 0;

	outputFloat[outIndex].x = sqrtf(0.2989f * pixel.x + 0.5870f * pixel.y + 0.1140f * pixel.z);
}
174 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGScale.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HOG_SCALE__
 2 | #define __HOG_SCALE__
 3 | 
 4 | #include <stdlib.h>
 5 | #include <stdio.h>
 6 | #include <math.h>
 7 | 
 8 | #ifdef _WIN32
 9 | #  define WINDOWS_LEAN_AND_MEAN
10 | #  include <windows.h>
11 | #endif
12 | 
13 | #include <cuda_gl_interop.h>
14 | #include <cuda.h>
15 | 
16 | #include "HOGDefines.h"
17 | 
18 | __host__ void InitScale(int hPaddedWidth, int hPaddedHeight);
19 | __host__ void CloseScale();
20 | 
21 | __host__ void DownscaleImage(int startScaleId, int endScaleId, int scaleId, float scale, 
22 | 							 bool useGrayscale, float4* paddedRegisteredImage,
23 | 							 float1* resizedPaddedImageF1, float4* resizedPaddedImageF4);
24 | 
25 | __global__ void resizeFastBicubic1(float1 *outputFloat, float4* paddedRegisteredImage, int width, int height, float scale);
26 | __global__ void resizeFastBicubic4(float4 *outputFloat, float4* paddedRegisteredImage, int width, int height, float scale);
27 | 
28 | //__device__ float4 tex2DFastBicubic(const texture<uchar4, 2, cudaReadModeElementType> texref, float x, float y, float scale);
29 | //
30 | //__device__ float w0(float a);
31 | //__device__ float w1(float a);
32 | //__device__ float w2(float a);
33 | //__device__ float w3(float a);
34 | //
35 | //__device__ float g0(float a);
36 | //__device__ float g1(float a);
37 | //
38 | //__device__ float h0(float a);
39 | //__device__ float h1(float a);
40 | 
41 | #endif
42 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGScale.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z18resizeFastBicubic1P6float1P6float4iif,_Z18resizeFastBicubic4P6float4S0_iif


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGUtils.cu:
--------------------------------------------------------------------------------
  1 | #include "HOGUtils.h"
  2 | 
  3 | //Round a / b to nearest higher integer value
  4 | __host__ int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
  5 | 
  6 | //Round a / b to nearest lower integer value
  7 | __host__ int iDivDown(int a, int b) { return a / b; }
  8 | 
  9 | //Align a to nearest higher multiple of b
 10 | __host__ int iAlignUp(int a, int b) { return (a % b != 0) ?  (a - a % b + b) : a; }
 11 | 
 12 | //Align a to nearest lower multiple of b
 13 | __host__ int iAlignDown(int a, int b)  {return a - a % b; }
 14 | 
 15 | //Round a / b to nearest higher integer value
 16 | __host__ int iDivUpF(int a, float b) { return (a % int(b) != 0) ? int(a / b + 1) : int(a / b);}
 17 | 
 18 | __host__ int iClosestPowerOfTwo(int x) { x--; x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; x++; return x; }
 19 | 
 20 | __host__ void Uchar4ToFloat4(uchar4 *inputImage, float4 *outputImage, int width, int height)
 21 | {
 22 | 	dim3 threads_in_block(16,16);
 23 | 	dim3 blocks(iDivUp(width,16), iDivUp(height,16));
 24 | 	uchar4tofloat4<<<blocks, threads_in_block>>>(inputImage, outputImage, width, height);
 25 | }
 26 | __host__ void Float4ToUchar4(float4 *inputImage, uchar4 *outputImage, int width, int height)
 27 | {
 28 | 	dim3 threads_in_block(16,16);
 29 | 	dim3 blocks(iDivUp(width,16), iDivUp(height,16));
 30 | 	float4toUchar4<<<blocks, threads_in_block>>>(inputImage, outputImage, width, height);
 31 | }
 32 | __host__ void Float2ToUchar4(float2 *inputImage, uchar4 *outputImage, int width, int height, int index)
 33 | {
 34 | 	dim3 threads_in_block(16,16);
 35 | 	dim3 blocks(iDivUp(width,16), iDivUp(height,16));
 36 | 	float2toUchar4<<<blocks, threads_in_block>>>(inputImage, outputImage, width, height, index);
 37 | }
 38 | __host__ void Float2ToUchar1(float2 *inputImage, uchar1 *outputImage, int width, int height, int index)
 39 | {
 40 | 	dim3 threads_in_block(16,16);
 41 | 	dim3 blocks(iDivUp(width,16), iDivUp(height,16));
 42 | 	float2toUchar1<<<blocks, threads_in_block>>>(inputImage, outputImage, width, height, index);
 43 | }
 44 | __host__ void Float1ToUchar4(float1 *inputImage, uchar4 *outputImage, int width, int height)
 45 | {
 46 | 	dim3 threads_in_block(16,16);
 47 | 	dim3 blocks(iDivUp(width,16), iDivUp(height,16));
 48 | 	float1toUchar4<<<blocks, threads_in_block>>>(inputImage, outputImage, width, height);
 49 | }
 50 | __host__ void Float1ToUchar1(float1 *inputImage, uchar1 *outputImage, int width, int height)
 51 | {
 52 | 	dim3 threads_in_block(16,16);
 53 | 	dim3 blocks(iDivUp(width,16), iDivUp(height,16));
 54 | 	float1toUchar1<<<blocks, threads_in_block>>>(inputImage, outputImage, width, height);
 55 | }
 56 | __global__ void float4toUchar4(float4 *inputImage, uchar4 *outputImage, int width, int height)
 57 | {
 58 | 	int offsetBlock = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y * width;
 59 | 	int offset = offsetBlock + threadIdx.x + threadIdx.y * width;
 60 | 
 61 | 	float4 pixelf = inputImage[offset];
 62 | 	uchar4 pixel;
 63 | 	pixel.x = (unsigned char) pixelf.x; pixel.y = (unsigned char) pixelf.y;
 64 | 	pixel.z = (unsigned char) pixelf.z; pixel.w = (unsigned char) pixelf.w;
 65 | 
 66 | 	outputImage[offset] = pixel;
 67 | }
 68 | __global__ void float2toUchar4(float2 *inputImage, uchar4 *outputImage, int width, int height, int index)
 69 | {
 70 | 	int offsetBlock = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y * width;
 71 | 	int offset = offsetBlock + threadIdx.x + threadIdx.y * width;
 72 | 
 73 | 	float2 pixelf = inputImage[offset];
 74 | 	float pixelfIndexed = (index == 0) ? pixelf.x : pixelf.y;
 75 | 
 76 | 	uchar4 pixel;
 77 | 	pixel.x = (unsigned char) abs(pixelfIndexed); pixel.y = (unsigned char) abs(pixelfIndexed);
 78 | 	pixel.z = (unsigned char) abs(pixelfIndexed); pixel.w = (unsigned char) abs(pixelfIndexed);
 79 | 	outputImage[offset] = pixel;
 80 | }
 81 | __global__ void float2toUchar1(float2 *inputImage, uchar1 *outputImage, int width, int height, int index)
 82 | {
 83 | 	int offsetBlock = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y * width;
 84 | 	int offset = offsetBlock + threadIdx.x + threadIdx.y * width;
 85 | 
 86 | 	float2 pixelf = inputImage[offset];
 87 | 	float pixelfIndexed = (index == 0) ? pixelf.x : pixelf.y;
 88 | 
 89 | 	uchar1 pixel;
 90 | 	pixel.x = (unsigned char) pixelfIndexed;
 91 | 
 92 | 	outputImage[offset] = pixel;
 93 | }
 94 | __global__ void float1toUchar4(float1 *inputImage, uchar4 *outputImage, int width, int height)
 95 | {
 96 | 	int offsetBlock = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y * width;
 97 | 	int offset = offsetBlock + threadIdx.x + threadIdx.y * width;
 98 | 
 99 | 	float1 pixelf = inputImage[offset];
100 | 	uchar4 pixel;
101 | 	pixel.x = (unsigned char) pixelf.x; pixel.y = (unsigned char) pixelf.x;
102 | 	pixel.z = (unsigned char) pixelf.x; pixel.w = (unsigned char) pixelf.x;
103 | 
104 | 	outputImage[offset] = pixel;
105 | }
// Casts a float1 image to uchar1; one thread per pixel of a width x height image
// (2D launch).
__global__ void float1toUchar1(float1 *inputImage, uchar1 *outputImage, int width, int height)
{
	int offsetX = blockIdx.x * blockDim.x + threadIdx.x;
	int offsetY = blockIdx.y * blockDim.y + threadIdx.y;

	// The host wrapper rounds the grid up to whole blocks, so threads past the image
	// edge must not touch memory.
	if (offsetX >= width || offsetY >= height)
		return;

	int offset = offsetX + offsetY * width;

	float1 pixelf = inputImage[offset];
	uchar1 pixel;
	pixel.x = (unsigned char) pixelf.x;

	outputImage[offset] = pixel;
}
117 | 
// Widens a uchar4 image to float4, channel by channel; one thread per pixel of a
// width x height image (2D launch). Threads outside the image do nothing.
__global__ void uchar4tofloat4(uchar4 *inputImage, float4 *outputImage, int width, int height)
{
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	if (!(col < width && row < height))
		return;

	// Same linear index as block origin plus in-block position.
	const int blockOrigin = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y * width;
	const int linear = blockOrigin + threadIdx.x + threadIdx.y * width;

	const uchar4 src = inputImage[linear];

	float4 dst;
	dst.x = src.x;
	dst.y = src.y;
	dst.z = src.z;
	dst.w = src.w;

	outputImage[linear] = dst;
}
136 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGUtils.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HOG_UTILS__
 2 | #define __HOG_UTILS__
 3 | 
 4 | #include <stdlib.h>
 5 | #include <stdio.h>
 6 | #include <math.h>
 7 | 
 8 | #ifdef _WIN32
 9 | #  define WINDOWS_LEAN_AND_MEAN
10 | #  include <windows.h>
11 | #endif
12 | 
13 | #include <cuda_gl_interop.h>
14 | #include <cuda.h>
15 | #include "HOGDefines.h"
16 | 
// Integer helpers, callable from host code only.
// NOTE(review): the names suggest rounded-up / rounded-down division and
// alignment of `a` with respect to `b` -- confirm against the definitions,
// which are not part of this header.
__host__ int iDivUp(int a, int b);
__host__ int iDivDown(int a, int b);
__host__ int iAlignUp(int a, int b);
__host__ int iAlignDown(int a, int b);

__host__ int iDivUpF(int a, float b);
__host__ int iClosestPowerOfTwo(int x);

// Host-side entry points for the float -> uchar image conversions.
// NOTE(review): presumably each one launches the identically named
// (lower-case) kernel declared below -- confirm in the implementation file.
__host__ void Float4ToUchar4(float4 *inputImage, uchar4 *outputImage, int width, int height);
__host__ void Float2ToUchar4(float2 *inputImage, uchar4 *outputImage, int width, int height, int index);
__host__ void Float2ToUchar1(float2 *inputImage, uchar1 *outputImage, int width, int height, int index);
__host__ void Float1ToUchar4(float1 *inputImage, uchar4 *outputImage, int width, int height);
__host__ void Float1ToUchar1(float1 *inputImage, uchar1 *outputImage, int width, int height);

// Device kernels for the float -> uchar image conversions.
// NOTE(review): `index` on the float2 variants presumably selects which of the
// two components is converted -- confirm against the kernel bodies.
__global__ void float4toUchar4(float4 *inputImage, uchar4 *outputImage, int width, int height);
__global__ void float2toUchar4(float2 *inputImage, uchar4 *outputImage, int width, int height, int index);
__global__ void float2toUchar1(float2 *inputImage, uchar1 *outputImage, int width, int height, int index);
__global__ void float1toUchar4(float1 *inputImage, uchar4 *outputImage, int width, int height);
__global__ void float1toUchar1(float1 *inputImage, uchar1 *outputImage, int width, int height);

// uchar4 -> float4 conversion: host entry point and the per-pixel kernel.
__host__ void Uchar4ToFloat4(uchar4 *inputImage, float4 *outputImage, int width, int height);
__global__ void uchar4tofloat4(uchar4 *inputImage, float4 *outputImage, int width, int height);
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOG/HOGUtils.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z14float1toUchar1P6float1P6uchar1ii,_Z14uchar4tofloat4P6uchar4P6float4ii,_Z14float2toUchar1P6float2P6uchar1iii,_Z14float4toUchar4P6float4P6uchar4ii,_Z14float2toUchar4P6float2P6uchar4iii,_Z14float1toUchar4P6float1P6uchar4ii


--------------------------------------------------------------------------------
/source/fastHOG/HOG/cutil.h:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
  2 | // These functions and macros were copied over from the old cutil header files,
  3 | // so that FastHOG could be compiled.
  4 | // The cutil files were picked from the GPU Computing SDK that shipped with
  5 | // the old CUDA 3.2 SDK.
  6 | //-----------------------------------------------------------------------------
  7 | 
  8 | #pragma once
  9 | 
 10 | // Give a little more for Windows : the console window often disapears before we can read the message
 11 | #ifdef _WIN32
 12 | # if 1//ndef UNICODE
 13 | #  ifdef _DEBUG // Do this only in debug mode...
 14 | 	inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
 15 | 	{
 16 | 		size_t fmt2_sz	= 2048;
 17 | 		char *fmt2		= (char*)malloc(fmt2_sz);
 18 | 		va_list  vlist;
 19 | 		va_start(vlist, fmt);
 20 | 		while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0) // means there wasn't anough room
 21 | 		{
 22 | 			fmt2_sz *= 2;
 23 | 			if(fmt2) free(fmt2);
 24 | 			fmt2 = (char*)malloc(fmt2_sz);
 25 | 		}
 26 | 		OutputDebugStringA(fmt2);
 27 | 		fprintf(file, fmt2);
 28 | 		free(fmt2);
 29 | 	}
 30 | #	define FPRINTF(a) VSPrintf a
 31 | #  else //debug
 32 | #	define FPRINTF(a) fprintf a
 33 | // For other than Win32
 34 | #  endif //debug
 35 | # else //unicode
 36 | // Unicode case... let's give-up for now and keep basic printf
 37 | #	define FPRINTF(a) fprintf a
 38 | # endif //unicode
 39 | #else //win32
 40 | #	define FPRINTF(a) fprintf a
 41 | #endif //win32
 42 | 
 43 | #define cutilSafeCall(err)           __cudaSafeCall      (err, __FILE__, __LINE__)
 44 | 
 45 | inline void __cudaSafeCall( cudaError err, const char *file, const int line )
 46 | {
 47 |     if( cudaSuccess != err) {
 48 | 		FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error : %s.\n",
 49 |                 file, line, cudaGetErrorString( err) ));
 50 |         exit(-1);
 51 |     }
 52 | }
 53 | 
 54 | #define MIN(a,b) ((a < b) ? a : b)
 55 | #define MAX(a,b) ((a > b) ? a : b)
 56 | 
 57 | // Beginning of GPU Architecture definitions
 58 | inline int _ConvertSMVer2Cores(int major, int minor)
 59 | {
 60 | 	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
 61 | 	typedef struct {
 62 | 		int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
 63 | 		int Cores;
 64 | 	} sSMtoCores;
 65 | 
 66 | 	sSMtoCores nGpuArchCoresPerSM[] = 
 67 | 	{ { 0x10,  8 },
 68 | 	  { 0x11,  8 },
 69 | 	  { 0x12,  8 },
 70 | 	  { 0x13,  8 },
 71 | 	  { 0x20, 32 },
 72 | 	  { 0x21, 48 },
 73 | 	  {   -1, -1 } 
 74 | 	};
 75 | 
 76 | 	int index = 0;
 77 | 	while (nGpuArchCoresPerSM[index].SM != -1) {
 78 | 		if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
 79 | 			return nGpuArchCoresPerSM[index].Cores;
 80 | 		}
 81 | 		index++;
 82 | 	}
 83 | 	printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
 84 | 	return -1;
 85 | }
 86 | // end of GPU Architecture definitions
 87 | 
 88 | // This function returns the best GPU (with maximum GFLOPS)
 89 | inline int cutGetMaxGflopsDeviceId()
 90 | {
 91 | 	int current_device   = 0, sm_per_multiproc = 0;
 92 | 	int max_compute_perf = 0, max_perf_device  = 0;
 93 | 	int device_count     = 0, best_SM_arch     = 0;
 94 | 	cudaDeviceProp deviceProp;
 95 | 
 96 | 	cudaGetDeviceCount( &device_count );
 97 | 	// Find the best major SM Architecture GPU device
 98 | 	while ( current_device < device_count ) {
 99 | 		cudaGetDeviceProperties( &deviceProp, current_device );
100 | 		if (deviceProp.major > 0 && deviceProp.major < 9999) {
101 | 			best_SM_arch = MAX(best_SM_arch, deviceProp.major);
102 | 		}
103 | 		current_device++;
104 | 	}
105 | 
106 |     // Find the best CUDA capable GPU device
107 | 	current_device = 0;
108 | 	while( current_device < device_count ) {
109 | 		cudaGetDeviceProperties( &deviceProp, current_device );
110 | 		if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
111 | 		    sm_per_multiproc = 1;
112 | 		} else {
113 | 			sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
114 | 		}
115 | 
116 | 		int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
117 | 		if( compute_perf  > max_compute_perf ) {
118 |             // If we find GPU with SM major > 2, search only these
119 | 			if ( best_SM_arch > 2 ) {
120 | 				// If our device==dest_SM_arch, choose this, or else pass
121 | 				if (deviceProp.major == best_SM_arch) {	
122 | 					max_compute_perf  = compute_perf;
123 | 					max_perf_device   = current_device;
124 | 				}
125 | 			} else {
126 | 				max_compute_perf  = compute_perf;
127 | 				max_perf_device   = current_device;
128 | 			}
129 | 		}
130 | 		++current_device;
131 | 	}
132 | 	return max_perf_device;
133 | }
134 | 


--------------------------------------------------------------------------------
/source/fastHOG/HOGConvolution.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z24convolutionColumnGPU1to2P6float2P6float1S2_iiii,_Z18convolutionRowGPU1P6float1S0_ii,_Z18convolutionRowGPU4P6float4S0_ii,_Z24convolutionColumnGPU4to2P6float2P6float4S2_iiii


--------------------------------------------------------------------------------
/source/fastHOG/HOGEngineDevice.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z18resizeFastBicubic3P6float4S0_iif


--------------------------------------------------------------------------------
/source/fastHOG/HOGHistogram.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,allShared,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z31computeBlockHistogramsWithGaussP6float2P6float1iiiiiiiii,_Z24normalizeBlockHistogramsP6float1iiiiiiiiii


--------------------------------------------------------------------------------
/source/fastHOG/HOGPadding.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export __dummy_entry__


--------------------------------------------------------------------------------
/source/fastHOG/HOGSVMSlider.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,allSharedF1,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z19linearSVMEvaluationP6float1fS0_iiiiiiiiiiiiiiiiii


--------------------------------------------------------------------------------
/source/fastHOG/HOGScale.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z18resizeFastBicubic1P6float1P6float4iif,_Z18resizeFastBicubic4P6float4S0_iif


--------------------------------------------------------------------------------
/source/fastHOG/HOGUtils.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z14float1toUchar1P6float1P6uchar1ii,_Z14uchar4tofloat4P6uchar4P6float4ii,_Z14float2toUchar1P6float2P6uchar1iii,_Z14float4toUchar4P6float4P6uchar4ii,_Z14float2toUchar4P6float2P6uchar4iii,_Z14float1toUchar4P6float1P6uchar4ii


--------------------------------------------------------------------------------
/source/fastHOG/Makefile:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | #
 3 | # Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
 4 | #
 5 | # NOTICE TO USER:   
 6 | #
 7 | # This source code is subject to NVIDIA ownership rights under U.S. and 
 8 | # international Copyright laws.  
 9 | #
10 | # NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 
11 | # CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 
12 | # IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 
13 | # REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 
14 | # MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.   
15 | # IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 
16 | # OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 
17 | # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 
18 | # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 
19 | # OR PERFORMANCE OF THIS SOURCE CODE.  
20 | #
21 | # U.S. Government End Users.  This source code is a "commercial item" as 
22 | # that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of 
23 | # "commercial computer software" and "commercial computer software 
24 | # documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 
25 | # and is provided to the U.S. Government only as a commercial end item.  
26 | # Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 
27 | # 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 
28 | # source code with only those rights set forth herein.
29 | #
30 | ################################################################################
31 | #
32 | # Build script for project
33 | #
34 | ################################################################################
35 | 
# Name of the binary to build
EXECUTABLE := fastHOG

# Top-level C/C++ source files (compiled with gcc / c++)
CCFILES := fastHOG.cpp

# HOG UTILS
CCUTILS := ImageWindow.cpp

# CC HOG
CCHOG := HOGImage.cpp HOGEngine.cpp HOGNMS.cpp

# CUDA HOG
CUFILES := \
	HOGEngineDevice.cu \
	HOGConvolution.cu \
	HOGHistogram.cu \
	HOGPadding.cu \
	HOGScale.cu \
	HOGSVMSlider.cu \
	HOGUtils.cu

################################################################################
# Rules and targets

include common.mk
62 | 


--------------------------------------------------------------------------------
/source/fastHOG/Utils/ImageWindow.cpp:
--------------------------------------------------------------------------------
 1 | #include "ImageWindow.h"
 2 | 
 3 | #include <stdio.h>
 4 | #include <fltk/draw.h>
 5 | ImageWindow::ImageWindow(int width, int height, char* title) :
 6 | 	fltk::Window(width, height, title)
 7 | {
 8 | 	this->width = width;
 9 | 	this->height = height;
10 | 
11 | 	this->begin();
12 | 	imageWidget = new ImageWidget(0, 0, width, height);
13 | 	this->end();
14 | 
15 | 	doStuff = 0;
16 | }
17 | 
18 | ImageWindow::ImageWindow(HOGImage* image, char* title) :
19 | 	fltk::Window(image->width, image->height, title)
20 | {
21 | 	this->width = image->width;
22 | 	this->height = image->height;
23 | 
24 | 	this->begin();
25 | 	imageWidget = new ImageWidget(0, 0, image->width, image->height, image->pixels);
26 | 	this->end();
27 | 
28 | 	doStuff = 0;
29 | }
30 | 
// Destructor with an empty body; the image widget is deleted in Close().
ImageWindow::~ImageWindow(void) { }
32 | 
33 | void ImageWindow::show(int x, int y)
34 | {
35 | 	if (x == -1 || y == 1)
36 | 		fltk::Window::show();
37 | 	else
38 | 	{
39 | 		this->position(x, y);
40 | 		fltk::Window::show();
41 | 	}
42 | }
43 | 
44 | void ImageWindow::setImage(HOGImage* image)
45 | {
46 | 	this->begin();
47 | 	imageWidget->setImage((unsigned char*) image->pixels);
48 | 	imageWidget->draw();
49 | 	this->end();
50 | }
51 | 
52 | int ImageWindow::handle(int eventId)
53 | {
54 | 	int ret = 0;
55 | 	switch (eventId)
56 | 	{
57 | 	case fltk::MOVE:
58 | 		ret = 1;
59 | 
60 | 		break;
61 | 	case fltk::PUSH:
62 | 
63 | 		imageWidget->rects.clear();
64 | 
65 | 		if (doStuff != 0)
66 | 			doStuff();
67 | 
68 | 		break;
69 | 	}
70 | 
71 | 	return ret;
72 | }
73 | 
// Forwards the rectangle (x, y, w, h) to the image widget, which stores it
// in its rectangle list and requests a redraw.
void ImageWindow::drawRect(int x, int y, int w, int h)
{
	imageWidget->drawRect(x, y, w, h);
}
78 | 
79 | void ImageWindow::Close()
80 | {
81 | 	delete imageWidget;
82 | 
83 | 	this->destroy();
84 | }
85 | 


--------------------------------------------------------------------------------
/source/fastHOG/Utils/ImageWindow.h:
--------------------------------------------------------------------------------
 1 | #ifndef __IMAGE_WINDOW_H__
 2 | #define __IMAGE_WINDOW_H__
 3 | 
 4 | #include <fltk/Window.h>
 5 | #include <fltk/draw.h>
 6 | #include <fltk/Rectangle.h>
 7 | #include <fltk/Widget.h>
 8 | #include <fltk/events.h>
 9 | 
10 | #include "../HOG/HOGImage.h"
11 | 
12 | #include <vector>
13 | #include <cstddef>
14 | 
15 | using namespace HOG;
16 | 
17 | class ImageWidget: public fltk::Widget
18 | {
19 | 	struct rect
20 | 	{
21 | 		int x, y, w, h;
22 | 		rect(int _x, int _y, int _w, int _h) { x = _x; y = _y; w = _w; h = _h; }
23 | 	};
24 | 
25 | public:
26 | 	std::vector<rect> rects;
27 | 
28 | 	unsigned char* pixels;
29 | 	fltk::Rectangle* rectangle;
30 | 
31 | 	ImageWidget(int x, int y, int w, int h) :
32 | 		fltk::Widget(x, y, w, h)
33 | 	{
34 | 		rectangle = new fltk::Rectangle(0, 0, w, h);
35 | 		this->box(fltk::BORDER_BOX);
36 | 		this->buttonbox(fltk::FLAT_BOX);
37 | 	}
38 | 
39 | 	ImageWidget(int x, int y, int w, int h, unsigned char* pixels) :
40 | 		fltk::Widget(x, y, w, h)
41 | 	{
42 | 		this->pixels = pixels;
43 | 		rectangle = new fltk::Rectangle(0, 0, w, h);
44 | 		this->box(fltk::BORDER_BOX);
45 | 		this->buttonbox(fltk::FLAT_BOX);
46 | 	}
47 | 
48 | 	void draw()
49 | 	{
50 | 		fltk::drawimage((unsigned char*) pixels, fltk::RGB32, *rectangle);
51 | 		fltk::setcolor(fltk::RED);
52 | 		for (size_t i=0; i<rects.size(); i++)
53 | 			fltk::strokerect(rects[i].x, rects[i].y, rects[i].w, rects[i].h);
54 | 		this->redraw();
55 | 	}
56 | 
57 | 	void setImage(unsigned char* pixelsNew)
58 | 	{
59 | 		this->pixels = pixelsNew;
60 | 	}
61 | 
62 | 	void drawRect(int x, int y, int w, int h)
63 | 	{
64 | 		rects.push_back(rect(x,y,w,h));
65 | 		this->redraw();
66 | 	}
67 | };
68 | 
// FLTK window that shows a HOGImage through an ImageWidget and can invoke a
// user-supplied callback when the mouse is pressed inside it.
class ImageWindow: public fltk::Window
{
	// NOTE(review): not referenced by the member functions in
	// ImageWindow.cpp -- confirm whether it is still needed.
	bool colorImage;

	// Window size as passed to the fltk::Window constructor
	int width, height;

	// Child widget that paints the image and the rectangles
	ImageWidget* imageWidget;
	// NOTE(review): not referenced by the member functions in
	// ImageWindow.cpp -- confirm whether it is still needed.
	fltk::Window *otherWindow;

public:

	// Optional callback invoked on fltk::PUSH events; the constructors set it to 0
	void (*doStuff)();

	// Creates an empty window of the given size
	ImageWindow(int width, int height, char* title);
	// Creates a window sized to, and showing, the given image
	ImageWindow(HOGImage* image, char* title);

	// Switches the displayed pixel buffer to that of `image`
	void setImage(HOGImage* image);

	// Shows the window; x and y default to -1
	void show(int x = -1, int y = -1);
	// Adds a rectangle to be outlined over the image
	void drawRect(int x, int y, int w, int h);

	// Event handler
	int handle(int);

	// Deletes the image widget and destroys the window
	void Close();

	~ImageWindow(void);
};
96 | 
97 | #endif
98 | 


--------------------------------------------------------------------------------
/source/fastHOG/Utils/Timer.h:
--------------------------------------------------------------------------------
  1 | #ifndef Timer_H
  2 | #define Timer_H
  3 | 
  4 | #include <ctime>
  5 | #include <iostream>
  6 | #include <iomanip>
  7 | #include <string>
  8 | 
// Stopwatch-style timer that accumulates time over start/stop cycles and
// can print the total to std::cout or to any ostream.
class Timer
{
	// Streams the accumulated time; needs access to the private state
	friend std::ostream& operator<<(std::ostream& os, Timer& t);

private:
	bool running;        // true between start()/restart() and stop()
	clock_t start_clock; // clock() value captured at the last (re)start
	time_t start_time;   // time(0) value captured at the last (re)start
	double acc_time;     // seconds accumulated by stop(); reset by restart()

	// Seconds elapsed since the last (re)start
	double elapsed_time();

public:
	// 'running' is initially false.  A Timer needs to be explicitly started
	// using 'start' or 'restart'
	Timer() :
		running(false), start_clock(0), start_time(0), acc_time(0)
	{
	}

	// Each of the following prints `msg` to std::cout first when it is non-null.
	void start(const char* msg = 0);    // start unless already running
	void restart(const char* msg = 0);  // reset the accumulated time and start
	void stop(const char* msg = 0);     // stop and add the elapsed time to the total
	void check(const char* msg = 0);    // print the current total
	void check(const char* msg, int msg_count); // print the current total, tagged with msg_count

}; // class Timer
 36 | 
 37 | //===========================================================================
 38 | // Return the total time that the Timer has been in the "running"
 39 | // state since it was first "started" or last "restarted".  For
 40 | // "short" time periods (less than an hour), the actual cpu time
 41 | // used is reported instead of the elapsed time.
 42 | 
 43 | inline double Timer::elapsed_time()
 44 | {
 45 | 	time_t acc_sec = time(0) - start_time;
 46 | 	if (acc_sec < 3600)
 47 | 		return (clock() - start_clock) / (1.0 * CLOCKS_PER_SEC);
 48 | 	else
 49 | 		return (1.0 * acc_sec);
 50 | 
 51 | } // Timer::elapsed_time
 52 | 
 53 | //===========================================================================
 54 | // Start a Timer.  If it is already running, let it continue running.
 55 | // Print an optional message.
 56 | 
 57 | inline void Timer::start(const char* msg)
 58 | {
 59 | 	// Print an optional message, something like "Starting Timer t";
 60 | 	if (msg)
 61 | 		std::cout << msg << std::endl;
 62 | 
 63 | 	// Return immediately if the Timer is already running
 64 | 	if (running)
 65 | 		return;
 66 | 
 67 | 	// Set Timer status to running and set the start time
 68 | 	running = true;
 69 | 	start_clock = clock();
 70 | 	start_time = time(0);
 71 | 
 72 | } // Timer::start
 73 | 
 74 | //===========================================================================
 75 | // Turn the Timer off and start it again from 0.  Print an optional message.
 76 | 
 77 | inline void Timer::restart(const char* msg)
 78 | {
 79 | 	// Print an optional message, something like "Restarting Timer t";
 80 | 	if (msg)
 81 | 		std::cout << msg << std::endl;
 82 | 
 83 | 	// Set Timer status to running, reset accumulated time, and set start time
 84 | 	running = true;
 85 | 	acc_time = 0;
 86 | 	start_clock = clock();
 87 | 	start_time = time(0);
 88 | 
 89 | } // Timer::restart
 90 | 
 91 | //===========================================================================
 92 | // Stop the Timer and print an optional message.
 93 | 
 94 | inline void Timer::stop(const char* msg)
 95 | {
 96 | 	// Print an optional message, something like "Stopping Timer t";
 97 | 	if (msg)
 98 | 		std::cout << msg << std::endl;
 99 | 
100 | 	// Compute accumulated running time and set Timer status to not running
101 | 	if (running)
102 | 		acc_time += elapsed_time();
103 | 	running = false;
104 | 
105 | } // Timer::stop
106 | 
107 | //===========================================================================
108 | // Print out an optional message followed by the current Timer timing.
109 | 
110 | inline void Timer::check(const char* msg)
111 | {
112 | 	std::string s;
113 | 	// Print an optional message, something like "Checking Timer t";
114 | 	if (msg)
115 | 		std::cout << msg << " : ";
116 | 
117 | 	std::cout << "Time [" << std::setiosflags(std::ios::fixed)
118 | 			<< std::setprecision(3) << acc_time
119 | 			+ (running ? elapsed_time() : 0) << "] seconds\n";
120 | } // Timer::check
121 | 
122 | inline void Timer::check(const char* msg, int msg_count)
123 | {
124 | 	std::string s;
125 | 	// Print an optional message, something like "Checking Timer t";
126 | 	if (msg)
127 | 		std::cout << msg << ":";
128 | 
129 | 	std::cout << msg_count << ": " << "Time ["
130 | 			<< std::setiosflags(std::ios::fixed) << std::setprecision(3)
131 | 			<< acc_time + (running ? elapsed_time() : 0) << "] seconds\n";
132 | } // Timer::check
133 | 
134 | //===========================================================================
135 | // Allow Timers to be printed to ostreams using the syntax 'os << t'
136 | // for an ostream 'os' and a Timer 't'.  For example, "cout << t" will
137 | // print out the total amount of time 't' has been "running".
138 | 
139 | inline std::ostream& operator<<(std::ostream& os, Timer& t)
140 | {
141 | 	os << std::setprecision(3) << std::setiosflags(std::ios::fixed)
142 | 			<< t.acc_time + (t.running ? t.elapsed_time() : 0);
143 | 	return os;
144 | }
145 | 
146 | //===========================================================================
147 | 
148 | #endif // Timer_H
149 | 


--------------------------------------------------------------------------------
/source/fastHOG/bin/release/fastHOG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashwin/fasthog/377dd96b870ae0ef22c93b5b36950f92214dd399/source/fastHOG/bin/release/fastHOG


--------------------------------------------------------------------------------
/source/fastHOG/common.mk:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | #
  3 | # Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
  4 | #
  5 | # NOTICE TO USER:   
  6 | #
  7 | # This source code is subject to NVIDIA ownership rights under U.S. and 
  8 | # international Copyright laws.  
  9 | #
 10 | # NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 
 11 | # CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 
 12 | # IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 
 13 | # REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 
 14 | # MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.   
 15 | # IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 
 16 | # OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 
 17 | # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 
 18 | # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 
 19 | # OR PERFORMANCE OF THIS SOURCE CODE.  
 20 | #
 21 | # U.S. Government End Users.  This source code is a "commercial item" as 
 22 | # that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of 
 23 | # "commercial computer software" and "commercial computer software 
 24 | # documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 
 25 | # and is provided to the U.S. Government only as a commercial end item.  
 26 | # Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 
 27 | # 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 
 28 | # source code with only those rights set forth herein.
 29 | #
 30 | ################################################################################
 31 | #
 32 | # Common build script
 33 | #
 34 | ################################################################################
 35 | 
 36 | .SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin
 37 | 
 38 | # Add new SM Versions here as devices with new Compute Capability are released
 39 | SM_VERSIONS := sm_10 sm_11 sm_12 sm_13
 40 | 
 41 | CUDA_INSTALL_PATH ?= /usr/local/cuda
 42 | 
 43 | ifdef cuda-install
 44 | 	CUDA_INSTALL_PATH := $(cuda-install)
 45 | endif
 46 | 
 47 | # detect OS
 48 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
 49 | OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
 50 | # 'linux' is output for Linux system, 'darwin' for OS X
 51 | DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
 52 | 
 53 | # Basic directory setup for SDK
 54 | # (override directories only if they are not already defined)
 55 | SRCDIR      ?= 
 56 | SRCDIRUTILS ?= Utils
 57 | SRCDIRHOG   ?= HOG
 58 | ROOTDIR     ?= $(CUDA_INSTALL_PATH)
 59 | ROOTBINDIR  ?= bin
 60 | BINDIR      ?= $(ROOTBINDIR)
 61 | ROOTOBJDIR  ?= obj
 62 | LIBDIR      := $(ROOTDIR)/lib64
 63 | COMMONDIR   := $(ROOTDIR)/common
 64 | 
 65 | # Compilers
 66 | NVCC       := $(CUDA_INSTALL_PATH)/bin/nvcc 
 67 | CXX        := g++
 68 | CC         := gcc
 69 | LINK       := g++ -fPIC
 70 | 
 71 | # Includes
 72 | INCLUDES  += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc
 73 | 
 74 | # architecture flag for cubin build
 75 | CUBIN_ARCH_FLAG := -m32
 76 | 
 77 | # Warning flags
 78 | CXXWARN_FLAGS := \
 79 | 	-W -Wall \
 80 | 	-Wimplicit \
 81 | 	-Wswitch \
 82 | 	-Wformat \
 83 | 	-Wchar-subscripts \
 84 | 	-Wparentheses \
 85 | 	-Wmultichar \
 86 | 	-Wtrigraphs \
 87 | 	-Wpointer-arith \
 88 | 	-Wcast-align \
 89 | 	-Wreturn-type \
 90 | 	-Wno-unused-function \
 91 | 	$(SPACE)
 92 | 
 93 | CWARN_FLAGS := $(CXXWARN_FLAGS) \
 94 | 	-Wstrict-prototypes \
 95 | 	-Wmissing-prototypes \
 96 | 	-Wmissing-declarations \
 97 | 	-Wnested-externs \
 98 | 	-Wmain \
 99 | 
100 | # Compiler-specific flags
101 | NVCCFLAGS := 
102 | CXXFLAGS  := $(CXXWARN_FLAGS)
103 | CFLAGS    := $(CWARN_FLAGS)
104 | 
105 | # Common flags
106 | COMMONFLAGS += $(INCLUDES) -DUNIX
107 | 
108 | # Debug/release configuration
109 | ifeq ($(dbg),1)
110 | 	COMMONFLAGS += -g
111 | 	NVCCFLAGS   += -D_DEBUG
112 | 	BINSUBDIR   := debug
113 | 	LIBSUFFIX   := D
114 | else 
115 | 	COMMONFLAGS += -O3 
116 | 	BINSUBDIR   := release
117 | 	LIBSUFFIX   :=
118 | 	NVCCFLAGS   += --compiler-options -fno-strict-aliasing
119 | 	CXXFLAGS    += -fno-strict-aliasing
120 | 	CFLAGS      += -fno-strict-aliasing
121 | endif
122 | 
123 | # append optional arch/SM version flags (such as -arch sm_11)
124 | #NVCCFLAGS += $(SMVERSIONFLAGS)
125 | 
126 | # architecture flag for cubin build
127 | CUBIN_ARCH_FLAG := -m32
128 | 
129 | # detect if 32 bit or 64 bit system
130 | HP_64 =	$(shell uname -m | grep 64)
131 | 
132 | # OpenGL is used or not (if it is used, then it is necessary to include GLEW)
133 | ifeq ($(USEGLLIB),1)
134 | 
135 | 	ifneq ($(DARWIN),)
136 | 		OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a
137 | 	else
138 | 		OPENGLLIB := -lGL -lGLU -lX11 -lXi -lXmu
139 | 
140 | 		ifeq "$(strip $(HP_64))" ""
141 | 			OPENGLLIB += -lGLEW -L/usr/X11R6/lib
142 | 		else
143 | 			OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
144 | 		endif
145 | 	endif
146 | 
147 | 	CUBIN_ARCH_FLAG := -m64
148 | endif
149 | 
150 | ifeq ($(USEGLUT),1)
151 | 	ifneq ($(DARWIN),)
152 | 		OPENGLLIB += -framework GLUT
153 | 	else
154 | 		OPENGLLIB += -lglut
155 | 	endif
156 | endif
157 | 
158 | ifeq ($(USEPARAMGL),1)
159 | 	PARAMGLLIB := -lparamgl$(LIBSUFFIX)
160 | endif
161 | 
162 | ifeq ($(USERENDERCHECKGL),1)
163 | 	RENDERCHECKGLLIB := -lrendercheckgl$(LIBSUFFIX)
164 | endif
165 | 
166 | ifeq ($(USECUDPP), 1)
167 | 	ifeq "$(strip $(HP_64))" ""
168 | 		CUDPPLIB := -lcudpp
169 | 	else
170 | 		CUDPPLIB := -lcudpp64
171 | 	endif
172 | 
173 | 	CUDPPLIB := $(CUDPPLIB)$(LIBSUFFIX)
174 | 
175 | 	ifeq ($(emu), 1)
176 | 		CUDPPLIB := $(CUDPPLIB)_emu
177 | 	endif
178 | endif
179 | 
180 | # Libs
181 | LIB       := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) 
182 | 
183 | # If dynamically linking to CUDA and CUDART, we exclude the libraries from the LIB
184 | ifeq ($(USECUDADYNLIB),1)
185 |      LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} 
186 | else
187 | # static linking, we will statically link against CUDA and CUDART
188 |   ifeq ($(USEDRVAPI),1)
189 |      LIB += -lcuda   ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} 
190 |   else
191 |      LIB += -lcudart ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
192 |   endif
193 | endif
194 | 
195 | ifeq ($(USECUFFT),1)
196 |   ifeq ($(emu),1)
197 |     LIB += -lcufftemu
198 |   else
199 |     LIB += -lcufft
200 |   endif
201 | endif
202 | 
203 | ifeq ($(USECUBLAS),1)
204 |   ifeq ($(emu),1)
205 |     LIB += -lcublasemu
206 |   else
207 |     LIB += -lcublas
208 |   endif
209 | endif
210 | 
211 | # Lib/exe configuration
212 | ifneq ($(STATIC_LIB),)
213 | 	TARGETDIR := $(LIBDIR)
214 | 	TARGET   := $(subst .a,$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB))
215 | 	LINKLINE  = ar rucv $(TARGET) $(OBJS) 
216 | else
217 | 	# Device emulation configuration
218 | 	ifeq ($(emu), 1)
219 | 		NVCCFLAGS   += -deviceemu
220 | 		CUDACCFLAGS += 
221 | 		BINSUBDIR   := emu$(BINSUBDIR)
222 | 		# consistency, makes developing easier
223 | 		CXXFLAGS		+= -D__DEVICE_EMULATION__
224 | 		CFLAGS			+= -D__DEVICE_EMULATION__
225 | 	endif
226 | 	TARGETDIR := $(BINDIR)/$(BINSUBDIR)
227 | 	TARGET    := $(TARGETDIR)/$(EXECUTABLE)
228 | 	#fltk
229 | 	LIB += -lfltk2 -lXft -lfltk2_images -lXext -lXinerama -lXi
230 | 	#boost thread for interface
231 | 	LIB += -lboost_thread
232 | 	#read images in HOGImage from file
233 | 	LIB += -lfreeimage
234 | 	LIB += -lboost_system
235 | 	LINKLINE  = $(LINK) -o $(TARGET) $(OBJS) $(LIB)
236 | endif
237 | 
238 | # check if verbose 
239 | ifeq ($(verbose), 1)
240 | 	VERBOSE := 
241 | else
242 | 	VERBOSE := @
243 | endif
244 | 
245 | ################################################################################
246 | # Check for input flags and set compiler flags appropriately
247 | ################################################################################
248 | ifeq ($(fastmath), 1)
249 | 	NVCCFLAGS += -use_fast_math
250 | endif
251 | 
252 | ifeq ($(keep), 1)
253 | 	NVCCFLAGS += -keep
254 | 	NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx
255 | endif
256 | 
257 | ifdef maxregisters
258 | 	NVCCFLAGS += -maxrregcount $(maxregisters)
259 | endif
260 | 
261 | # Add cudacc flags
262 | NVCCFLAGS += $(CUDACCFLAGS)
263 | 
264 | # workaround for mac os x cuda 1.1 compiler issues
265 | ifneq ($(DARWIN),)
266 | 	NVCCFLAGS += --host-compilation=C
267 | endif
268 | 
269 | # Add common flags
270 | NVCCFLAGS += $(COMMONFLAGS)
271 | CXXFLAGS  += $(COMMONFLAGS)
272 | CFLAGS    += $(COMMONFLAGS)
273 | 
274 | ifeq ($(nvcc_warn_verbose),1)
275 | 	NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS)) 
276 | 	NVCCFLAGS += --compiler-options -fno-strict-aliasing
277 | endif
278 | 
279 | ################################################################################
280 | # Set up object files
281 | ################################################################################
282 | OBJDIR := $(ROOTOBJDIR)/$(BINSUBDIR)
283 | OBJS  +=  $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(notdir $(CCFILES)))
284 | 
285 | OBJDIRUTILS := $(ROOTOBJDIR)/$(BINSUBDIR)/$(SRCDIRUTILS)
286 | OBJS +=  $(patsubst %.cpp,$(OBJDIRUTILS)/%.cpp.o,$(notdir $(CCUTILS)))
287 | 
288 | OBJDIRHOG := $(ROOTOBJDIR)/$(BINSUBDIR)/$(SRCDIRHOG)
289 | OBJS +=  $(patsubst %.cu,$(OBJDIRHOG)/%.cu.o,$(notdir $(CUFILES)))
290 | OBJS +=  $(patsubst %.cpp,$(OBJDIRHOG)/%.cpp.o,$(notdir $(CCHOG)))
291 | 
292 | ################################################################################
293 | # Set up cubin files
294 | ################################################################################
295 | CUBINDIR := $(SRCDIR)data
296 | CUBINS +=  $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES)))
297 | 
298 | ################################################################################
299 | # Rules
300 | ################################################################################
301 | $(OBJDIR)/%.c.o : $(SRCDIR)%.c $(C_DEPS)
302 | 	$(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $<
303 | 
304 | $(OBJDIRUTILS)/%.cpp.o : $(SRCDIRUTILS)%.cpp $(C_DEPS)
305 | 	$(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<
306 | 
307 | $(OBJDIR)/%.cpp.o : $(SRCDIR)%.cpp $(C_DEPS)
308 | 	$(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<
309 | 
310 | $(OBJDIRHOG)/%.cpp.o : $(SRCDIRHOG)%.cpp $(C_DEPS)
311 | 	$(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<
312 | 	
313 | $(OBJDIR)/%.cu.o : $(SRCDIR)%.cu $(CU_DEPS)
314 | 	$(VERBOSE)$(NVCC) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $<
315 | 
316 | $(OBJDIRHOG)/%.cu.o : $(SRCDIRHOG)%.cu $(CU_DEPS)
317 | 	$(VERBOSE)$(NVCC) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $<
318 | 	
319 | $(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory
320 | 	$(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $<
321 | 
322 | #
323 | # The following definition is a template that gets instantiated for each SM
324 | # version (sm_10, sm_13, etc.) stored in SM_VERSIONS.  It does 2 things:
325 | # 1. It adds to OBJS a .cu_sm_XX.o for each .cu file it finds in CUFILES_sm_XX.
326 | # 2. It generates a rule for building .cu_sm_XX.o files from the corresponding 
327 | #    .cu file.
328 | #
329 | # The intended use for this is to allow Makefiles that use common.mk to compile
330 | # files to different Compute Capability targets (aka SM arch version).  To do
331 | # so, in the Makefile, list files for each SM arch separately, like so:
332 | #
333 | # CUFILES_sm_10 := mycudakernel_sm10.cu app.cu
334 | # CUFILES_sm_12 := anothercudakernel_sm12.cu
335 | #
336 | define SMVERSION_template
337 | OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_$(1))))
338 | $(OBJDIR)/%.cu_$(1).o : $(SRCDIR)%.cu $(CU_DEPS)
339 | 	$(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS) -arch $(1)
340 | endef
341 | 
342 | # This line invokes the above template for each arch version stored in
343 | # SM_VERSIONS.  The call function invokes the template, and the eval
344 | # function interprets it as make commands.
345 | $(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver))))
346 | 
347 | $(TARGET): makedirectories $(OBJS) $(CUBINS) Makefile
348 | 	$(VERBOSE)$(LINKLINE)
349 | 	
350 | cubindirectory:
351 | 	$(VERBOSE)mkdir -p $(CUBINDIR)
352 | 
353 | makedirectories:
354 | 	$(VERBOSE)mkdir -p $(LIBDIR)
355 | 	$(VERBOSE)mkdir -p $(OBJDIR)
356 | 	$(VERBOSE)mkdir -p $(OBJDIRUTILS)	
357 | 	$(VERBOSE)mkdir -p $(OBJDIRHOG)		
358 | 	$(VERBOSE)mkdir -p $(TARGETDIR)
359 | 
360 | # Housekeeping targets. None of them produces a file of the same name, so
361 | # declare them phony: they then run even if a file called "clean" exists.
362 | .PHONY : tidy clean clobber
363 | 
364 | # tidy: delete editor leftovers, i.e. every path under . containing '#' or '~'.
365 | # It takes no prerequisites; a stray "-lboost_thread" used to sit here and
366 | # made make search for that library before tidy/clean/clobber could run.
367 | tidy :
368 | 	$(VERBOSE)find . | egrep "#" | xargs rm -f
369 | 	$(VERBOSE)find . | egrep "\~" | xargs rm -f
370 | 
371 | # clean: tidy, then remove the objects, the cubins, the linked target and the
372 | # nvcc intermediates listed in NVCC_KEEP_CLEAN (set when keep=1).
373 | clean : tidy
374 | 	$(VERBOSE)rm -f $(OBJS)
375 | 	$(VERBOSE)rm -f $(CUBINS)
376 | 	$(VERBOSE)rm -f $(TARGET)
377 | 	$(VERBOSE)rm -f $(NVCC_KEEP_CLEAN)
378 | 
379 | # clobber: clean, then remove the whole object directory tree.
380 | clobber : clean
381 | 	$(VERBOSE)rm -rf $(ROOTOBJDIR)
382 | 


--------------------------------------------------------------------------------
/source/fastHOG/fastHOG.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * fastHOG.cpp
 3 |  *
 4 |  *  Created on: May 14, 2009
 5 |  *      Author: viprad
 6 |  */
 7 | 
 8 | #include <stdio.h>
 9 | 
10 | #include <boost/thread/thread.hpp>
11 | #include <fltk/run.h>
12 | 
13 | #include "HOG/HOGEngine.h"
14 | #include "HOG/HOGImage.h"
15 | 
16 | #include "Utils/ImageWindow.h"
17 | #include "Utils/Timer.h"
18 | 
19 | #include "Others/persondetectorwt.tcc"
20 | 
21 | using namespace HOG;
22 | 
23 | ImageWindow* fastHOGWindow;
24 | HOGImage* image;
25 | HOGImage* imageCUDA;
26 | 
27 | // Runs one detection pass on the global `image` and shows the outcome in
28 | // `fastHOGWindow`; main() installs this function as the window's doStuff hook.
29 | void doStuffHere()
30 | {
31 | 	// Initialize with the PERSON_* classifier constants; the commented-out call passes an SVM model file path instead.
32 | 	HOGEngine::Instance()->InitializeHOG(image->width, image->height, PERSON_LINEAR_BIAS,
33 | 			PERSON_WEIGHT_VEC, PERSON_WEIGHT_VEC_LENGTH);
34 | 	//HOGEngine::Instance()->InitializeHOG(image->width, image->height,
35 | 	//		"Files//SVM//head_W24x24_C4x4_N2x2_G4x4_HeadSize16x16.alt");
36 | 
37 | 	// Time the BeginProcess/EndProcess pair and report how many results came back.
38 | 	Timer stopwatch;
39 | 	stopwatch.restart();
40 | 	HOGEngine::Instance()->BeginProcess(image);
41 | 	HOGEngine::Instance()->EndProcess();
42 | 	stopwatch.stop();
43 | 	stopwatch.check("Processing time");
44 | 	printf("Found %d positive results.\n", HOGEngine::Instance()->formattedResultsCount);
45 | 
46 | 	// Ask the engine for its IMAGE_ROI image (written into imageCUDA) and display it.
47 | 	HOGEngine::Instance()->GetImage(imageCUDA, HOGEngine::IMAGE_ROI);
48 | 	fastHOGWindow->setImage(imageCUDA);
49 | 
50 | 	// Print every nmsResults entry and outline its rectangle in the window.
51 | 	int idx = 0;
52 | 	while (idx < HOGEngine::Instance()->nmsResultsCount)
53 | 	{
54 | 		printf("%1.5f %1.5f %4d %4d %4d %4d %4d %4d\n",
55 | 			HOGEngine::Instance()->nmsResults[idx].scale, HOGEngine::Instance()->nmsResults[idx].score,
56 | 			HOGEngine::Instance()->nmsResults[idx].origX, HOGEngine::Instance()->nmsResults[idx].origY,
57 | 			HOGEngine::Instance()->nmsResults[idx].x, HOGEngine::Instance()->nmsResults[idx].y,
58 | 			HOGEngine::Instance()->nmsResults[idx].width, HOGEngine::Instance()->nmsResults[idx].height);
59 | 		fastHOGWindow->drawRect(
60 | 			HOGEngine::Instance()->nmsResults[idx].x, HOGEngine::Instance()->nmsResults[idx].y,
61 | 			HOGEngine::Instance()->nmsResults[idx].width, HOGEngine::Instance()->nmsResults[idx].height);
62 | 		++idx;
63 | 	}
64 | 	printf("Drawn %d positive results.\n", HOGEngine::Instance()->nmsResultsCount);
65 | 	HOGEngine::Instance()->FinalizeHOG();
66 | }
67 | 
68 | // Program entry point: loads the test image, opens the "fastHOG" window with
69 | // doStuffHere installed as its doStuff hook, and hands control to fltk::run().
70 | int main(void)
71 | {
72 | 	// The input image, plus a second image of the same width/height that doStuffHere passes to GetImage.
73 | 	image = new HOGImage("Files//Images//testImage.bmp");
74 | 	imageCUDA = new HOGImage(image->width, image->height);
75 | 
76 | 	// Build the window on the input image and register the hook before showing it.
77 | 	ImageWindow* window = new ImageWindow(image, "fastHOG");
78 | 	fastHOGWindow = window;
79 | 	window->doStuff = &doStuffHere;
80 | 	window->show();
81 | 
82 | 	// Presumably blocks until the FLTK event loop finishes - confirm against the FLTK 2 docs.
83 | 	fltk::run();
84 | 
85 | 	// NOTE(review): fastHOGWindow is never deleted here - confirm whether that is intentional.
86 | 	delete image;
87 | 	delete imageCUDA;
88 | 
89 | 	return 0;
90 | }
91 | 


--------------------------------------------------------------------------------
/source/fastHOG/fastHOG.vcproj:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="Windows-1252"?>
  2 | <VisualStudioProject
  3 | 	ProjectType="Visual C++"
  4 | 	Version="9.00"
  5 | 	Name="fastHOG"
  6 | 	ProjectGUID="{98951235-E3D7-48E9-BA01-C7291E55FDEF}"
  7 | 	RootNamespace="fastHog"
  8 | 	Keyword="ManagedCProj"
  9 | 	TargetFrameworkVersion="196613"
 10 | 	>
 11 | 	<Platforms>
 12 | 		<Platform
 13 | 			Name="Win32"
 14 | 		/>
 15 | 	</Platforms>
 16 | 	<ToolFiles>
 17 | 		<ToolFile
 18 | 			RelativePath="..\..\..\CUDA\SDK\common\Cuda.rules"
 19 | 		/>
 20 | 	</ToolFiles>
 21 | 	<Configurations>
 22 | 		<Configuration
 23 | 			Name="Debug|Win32"
 24 | 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
 25 | 			IntermediateDirectory="$(ConfigurationName)"
 26 | 			ConfigurationType="1"
 27 | 			CharacterSet="1"
 28 | 			ManagedExtensions="0"
 29 | 			>
 30 | 			<Tool
 31 | 				Name="VCPreBuildEventTool"
 32 | 			/>
 33 | 			<Tool
 34 | 				Name="VCCustomBuildTool"
 35 | 			/>
 36 | 			<Tool
 37 | 				Name="CUDA Build Rule"
 38 | 			/>
 39 | 			<Tool
 40 | 				Name="VCXMLDataGeneratorTool"
 41 | 			/>
 42 | 			<Tool
 43 | 				Name="VCWebServiceProxyGeneratorTool"
 44 | 			/>
 45 | 			<Tool
 46 | 				Name="VCMIDLTool"
 47 | 			/>
 48 | 			<Tool
 49 | 				Name="VCCLCompilerTool"
 50 | 				Optimization="0"
 51 | 				AdditionalIncludeDirectories="C:\SDK\FLTK2\include\;C:\SDK\boost\include\;C:\SDK\FreeImage\include"
 52 | 				PreprocessorDefinitions="WIN32;_DEBUG"
 53 | 				BasicRuntimeChecks="3"
 54 | 				RuntimeLibrary="3"
 55 | 				OpenMP="true"
 56 | 				WarningLevel="3"
 57 | 				DebugInformationFormat="4"
 58 | 			/>
 59 | 			<Tool
 60 | 				Name="VCManagedResourceCompilerTool"
 61 | 			/>
 62 | 			<Tool
 63 | 				Name="VCResourceCompilerTool"
 64 | 			/>
 65 | 			<Tool
 66 | 				Name="VCPreLinkEventTool"
 67 | 			/>
 68 | 			<Tool
 69 | 				Name="VCLinkerTool"
 70 | 				AdditionalOptions=" /NODEFAULTLIB:LIBCMTD.lib /NODEFAULTLIB:LIBCMT.lib  /nodefaultlib:&quot;libcpmt.lib&quot;"
 71 | 				AdditionalDependencies="comctl32.lib fltk2d.lib wsock32.lib freeimage.lib"
 72 | 				AdditionalLibraryDirectories="C:\SDK\boost\lib;C:\SDK\FLTK2\lib;C:\SDK\FreeImage\lib"
 73 | 				GenerateDebugInformation="true"
 74 | 				OptimizeReferences="1"
 75 | 				EnableCOMDATFolding="1"
 76 | 				TargetMachine="1"
 77 | 			/>
 78 | 			<Tool
 79 | 				Name="VCALinkTool"
 80 | 			/>
 81 | 			<Tool
 82 | 				Name="VCManifestTool"
 83 | 			/>
 84 | 			<Tool
 85 | 				Name="VCXDCMakeTool"
 86 | 			/>
 87 | 			<Tool
 88 | 				Name="VCBscMakeTool"
 89 | 			/>
 90 | 			<Tool
 91 | 				Name="VCFxCopTool"
 92 | 			/>
 93 | 			<Tool
 94 | 				Name="VCAppVerifierTool"
 95 | 			/>
 96 | 			<Tool
 97 | 				Name="VCPostBuildEventTool"
 98 | 			/>
 99 | 		</Configuration>
100 | 		<Configuration
101 | 			Name="Release|Win32"
102 | 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
103 | 			IntermediateDirectory="$(ConfigurationName)"
104 | 			ConfigurationType="1"
105 | 			CharacterSet="1"
106 | 			ManagedExtensions="0"
107 | 			WholeProgramOptimization="1"
108 | 			>
109 | 			<Tool
110 | 				Name="VCPreBuildEventTool"
111 | 			/>
112 | 			<Tool
113 | 				Name="VCCustomBuildTool"
114 | 			/>
115 | 			<Tool
116 | 				Name="CUDA Build Rule"
117 | 			/>
118 | 			<Tool
119 | 				Name="VCXMLDataGeneratorTool"
120 | 			/>
121 | 			<Tool
122 | 				Name="VCWebServiceProxyGeneratorTool"
123 | 			/>
124 | 			<Tool
125 | 				Name="VCMIDLTool"
126 | 			/>
127 | 			<Tool
128 | 				Name="VCCLCompilerTool"
129 | 				AdditionalIncludeDirectories="C:\SDK\FLTK2\include\;C:\SDK\boost\include\;C:\SDK\FreeImage\include"
130 | 				PreprocessorDefinitions="WIN32;NDEBUG"
131 | 				RuntimeLibrary="3"
132 | 				WarningLevel="3"
133 | 				DebugInformationFormat="3"
134 | 			/>
135 | 			<Tool
136 | 				Name="VCManagedResourceCompilerTool"
137 | 			/>
138 | 			<Tool
139 | 				Name="VCResourceCompilerTool"
140 | 			/>
141 | 			<Tool
142 | 				Name="VCPreLinkEventTool"
143 | 			/>
144 | 			<Tool
145 | 				Name="VCLinkerTool"
146 | 				AdditionalOptions=" /NODEFAULTLIB:LIBCMTD.lib /NODEFAULTLIB:LIBCMT.lib  /nodefaultlib:&quot;libcpmt.lib&quot;"
147 | 				AdditionalDependencies="fltk2.lib comctl32.lib wsock32.lib freeimage.lib"
148 | 				AdditionalLibraryDirectories="C:\SDK\boost\lib;C:\SDK\FLTK2\lib;C:\SDK\FreeImage\lib"
149 | 				GenerateDebugInformation="true"
150 | 				TargetMachine="1"
151 | 			/>
152 | 			<Tool
153 | 				Name="VCALinkTool"
154 | 			/>
155 | 			<Tool
156 | 				Name="VCManifestTool"
157 | 			/>
158 | 			<Tool
159 | 				Name="VCXDCMakeTool"
160 | 			/>
161 | 			<Tool
162 | 				Name="VCBscMakeTool"
163 | 			/>
164 | 			<Tool
165 | 				Name="VCFxCopTool"
166 | 			/>
167 | 			<Tool
168 | 				Name="VCAppVerifierTool"
169 | 			/>
170 | 			<Tool
171 | 				Name="VCPostBuildEventTool"
172 | 			/>
173 | 		</Configuration>
174 | 		<Configuration
175 | 			Name="EmuDebug|Win32"
176 | 			ConfigurationType="1"
177 | 			ManagedExtensions="1"
178 | 			>
179 | 			<Tool
180 | 				Name="VCPreBuildEventTool"
181 | 			/>
182 | 			<Tool
183 | 				Name="VCCustomBuildTool"
184 | 			/>
185 | 			<Tool
186 | 				Name="CUDA Build Rule"
187 | 			/>
188 | 			<Tool
189 | 				Name="VCXMLDataGeneratorTool"
190 | 			/>
191 | 			<Tool
192 | 				Name="VCWebServiceProxyGeneratorTool"
193 | 			/>
194 | 			<Tool
195 | 				Name="VCMIDLTool"
196 | 			/>
197 | 			<Tool
198 | 				Name="VCCLCompilerTool"
199 | 			/>
200 | 			<Tool
201 | 				Name="VCManagedResourceCompilerTool"
202 | 			/>
203 | 			<Tool
204 | 				Name="VCResourceCompilerTool"
205 | 			/>
206 | 			<Tool
207 | 				Name="VCPreLinkEventTool"
208 | 			/>
209 | 			<Tool
210 | 				Name="VCLinkerTool"
211 | 			/>
212 | 			<Tool
213 | 				Name="VCALinkTool"
214 | 			/>
215 | 			<Tool
216 | 				Name="VCManifestTool"
217 | 			/>
218 | 			<Tool
219 | 				Name="VCXDCMakeTool"
220 | 			/>
221 | 			<Tool
222 | 				Name="VCBscMakeTool"
223 | 			/>
224 | 			<Tool
225 | 				Name="VCFxCopTool"
226 | 			/>
227 | 			<Tool
228 | 				Name="VCAppVerifierTool"
229 | 			/>
230 | 			<Tool
231 | 				Name="VCPostBuildEventTool"
232 | 			/>
233 | 		</Configuration>
234 | 		<Configuration
235 | 			Name="EmuRelease|Win32"
236 | 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
237 | 			IntermediateDirectory="$(ConfigurationName)"
238 | 			ConfigurationType="1"
239 | 			CharacterSet="1"
240 | 			ManagedExtensions="0"
241 | 			>
242 | 			<Tool
243 | 				Name="VCPreBuildEventTool"
244 | 			/>
245 | 			<Tool
246 | 				Name="VCCustomBuildTool"
247 | 			/>
248 | 			<Tool
249 | 				Name="CUDA Build Rule"
250 | 			/>
251 | 			<Tool
252 | 				Name="VCXMLDataGeneratorTool"
253 | 			/>
254 | 			<Tool
255 | 				Name="VCWebServiceProxyGeneratorTool"
256 | 			/>
257 | 			<Tool
258 | 				Name="VCMIDLTool"
259 | 			/>
260 | 			<Tool
261 | 				Name="VCCLCompilerTool"
262 | 				Optimization="0"
263 | 				AdditionalIncludeDirectories="C:\SDK\FLTK2\include\;C:\SDK\boost\include\"
264 | 				PreprocessorDefinitions="WIN32;_DEBUG"
265 | 				RuntimeLibrary="3"
266 | 				OpenMP="true"
267 | 				WarningLevel="3"
268 | 				DebugInformationFormat="3"
269 | 			/>
270 | 			<Tool
271 | 				Name="VCManagedResourceCompilerTool"
272 | 			/>
273 | 			<Tool
274 | 				Name="VCResourceCompilerTool"
275 | 			/>
276 | 			<Tool
277 | 				Name="VCPreLinkEventTool"
278 | 			/>
279 | 			<Tool
280 | 				Name="VCLinkerTool"
281 | 				AdditionalOptions=" /NODEFAULTLIB:LIBCMTD.lib /NODEFAULTLIB:LIBCMT.lib  /nodefaultlib:&quot;libcpmt.lib&quot;"
282 | 				AdditionalDependencies="comctl32.lib fltk2d.lib wsock32.lib"
283 | 				AdditionalLibraryDirectories="C:\SDK\boost\lib;C:\SDK\FLTK2\lib"
284 | 				GenerateDebugInformation="true"
285 | 				AssemblyDebug="1"
286 | 				TargetMachine="1"
287 | 			/>
288 | 			<Tool
289 | 				Name="VCALinkTool"
290 | 			/>
291 | 			<Tool
292 | 				Name="VCManifestTool"
293 | 			/>
294 | 			<Tool
295 | 				Name="VCXDCMakeTool"
296 | 			/>
297 | 			<Tool
298 | 				Name="VCBscMakeTool"
299 | 			/>
300 | 			<Tool
301 | 				Name="VCFxCopTool"
302 | 			/>
303 | 			<Tool
304 | 				Name="VCAppVerifierTool"
305 | 			/>
306 | 			<Tool
307 | 				Name="VCPostBuildEventTool"
308 | 			/>
309 | 		</Configuration>
310 | 		<Configuration
311 | 			Name="CUDAEmuDebug|Win32"
312 | 			ConfigurationType="1"
313 | 			ManagedExtensions="1"
314 | 			>
315 | 			<Tool
316 | 				Name="VCPreBuildEventTool"
317 | 			/>
318 | 			<Tool
319 | 				Name="VCCustomBuildTool"
320 | 			/>
321 | 			<Tool
322 | 				Name="CUDA Build Rule"
323 | 			/>
324 | 			<Tool
325 | 				Name="VCXMLDataGeneratorTool"
326 | 			/>
327 | 			<Tool
328 | 				Name="VCWebServiceProxyGeneratorTool"
329 | 			/>
330 | 			<Tool
331 | 				Name="VCMIDLTool"
332 | 			/>
333 | 			<Tool
334 | 				Name="VCCLCompilerTool"
335 | 			/>
336 | 			<Tool
337 | 				Name="VCManagedResourceCompilerTool"
338 | 			/>
339 | 			<Tool
340 | 				Name="VCResourceCompilerTool"
341 | 			/>
342 | 			<Tool
343 | 				Name="VCPreLinkEventTool"
344 | 			/>
345 | 			<Tool
346 | 				Name="VCLinkerTool"
347 | 			/>
348 | 			<Tool
349 | 				Name="VCALinkTool"
350 | 			/>
351 | 			<Tool
352 | 				Name="VCManifestTool"
353 | 			/>
354 | 			<Tool
355 | 				Name="VCXDCMakeTool"
356 | 			/>
357 | 			<Tool
358 | 				Name="VCBscMakeTool"
359 | 			/>
360 | 			<Tool
361 | 				Name="VCFxCopTool"
362 | 			/>
363 | 			<Tool
364 | 				Name="VCAppVerifierTool"
365 | 			/>
366 | 			<Tool
367 | 				Name="VCPostBuildEventTool"
368 | 			/>
369 | 		</Configuration>
370 | 	</Configurations>
371 | 	<References>
372 | 	</References>
373 | 	<Files>
374 | 		<Filter
375 | 			Name="Source Files"
376 | 			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
377 | 			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
378 | 			>
379 | 			<File
380 | 				RelativePath=".\fastHOG.cpp"
381 | 				>
382 | 			</File>
383 | 		</Filter>
384 | 		<Filter
385 | 			Name="Notes"
386 | 			>
387 | 			<File
388 | 				RelativePath=".\Notes\CUDANotes.txt"
389 | 				>
390 | 			</File>
391 | 		</Filter>
392 | 		<Filter
393 | 			Name="Others"
394 | 			>
395 | 			<File
396 | 				RelativePath=".\Others\persondetectorwt.tcc"
397 | 				>
398 | 			</File>
399 | 		</Filter>
400 | 		<Filter
401 | 			Name="Utils"
402 | 			>
403 | 			<File
404 | 				RelativePath=".\Utils\FreeImage.h"
405 | 				>
406 | 			</File>
407 | 			<File
408 | 				RelativePath=".\Utils\ImageWindow.cpp"
409 | 				>
410 | 			</File>
411 | 			<File
412 | 				RelativePath=".\Utils\ImageWindow.h"
413 | 				>
414 | 			</File>
415 | 			<File
416 | 				RelativePath=".\Utils\Timer.h"
417 | 				>
418 | 			</File>
419 | 		</Filter>
420 | 		<Filter
421 | 			Name="HOG"
422 | 			>
423 | 			<Filter
424 | 				Name="Header Files"
425 | 				>
426 | 				<Filter
427 | 					Name="CPU"
428 | 					>
429 | 					<File
430 | 						RelativePath=".\HOG\HOGEngine.h"
431 | 						>
432 | 					</File>
433 | 					<File
434 | 						RelativePath=".\HOG\HOGImage.h"
435 | 						>
436 | 					</File>
437 | 					<File
438 | 						RelativePath=".\HOG\HOGNMS.h"
439 | 						>
440 | 					</File>
441 | 					<File
442 | 						RelativePath=".\HOG\HOGPoint3.h"
443 | 						>
444 | 					</File>
445 | 					<File
446 | 						RelativePath=".\HOG\HOGResult.h"
447 | 						>
448 | 					</File>
449 | 				</Filter>
450 | 				<Filter
451 | 					Name="GPU"
452 | 					>
453 | 					<File
454 | 						RelativePath=".\HOG\HOGConvolution.h"
455 | 						>
456 | 					</File>
457 | 					<File
458 | 						RelativePath=".\HOG\HOGDefines.h"
459 | 						>
460 | 					</File>
461 | 					<File
462 | 						RelativePath=".\HOG\HOGEngineDevice.h"
463 | 						>
464 | 					</File>
465 | 					<File
466 | 						RelativePath=".\HOG\HOGHistogram.h"
467 | 						>
468 | 					</File>
469 | 					<File
470 | 						RelativePath=".\HOG\HOGPadding.h"
471 | 						>
472 | 					</File>
473 | 					<File
474 | 						RelativePath=".\HOG\HOGScale.h"
475 | 						>
476 | 					</File>
477 | 					<File
478 | 						RelativePath=".\HOG\HOGSVMSlider.h"
479 | 						>
480 | 					</File>
481 | 					<File
482 | 						RelativePath=".\HOG\HOGUtils.h"
483 | 						>
484 | 					</File>
485 | 				</Filter>
486 | 			</Filter>
487 | 			<Filter
488 | 				Name="Source Files"
489 | 				>
490 | 				<Filter
491 | 					Name="GPU"
492 | 					>
493 | 					<File
494 | 						RelativePath=".\HOG\HOGConvolution.cu"
495 | 						>
496 | 					</File>
497 | 					<File
498 | 						RelativePath=".\HOG\HOGEngineDevice.cu"
499 | 						>
500 | 					</File>
501 | 					<File
502 | 						RelativePath=".\HOG\HOGHistogram.cu"
503 | 						>
504 | 					</File>
505 | 					<File
506 | 						RelativePath=".\HOG\HOGPadding.cu"
507 | 						>
508 | 					</File>
509 | 					<File
510 | 						RelativePath=".\HOG\HOGScale.cu"
511 | 						>
512 | 					</File>
513 | 					<File
514 | 						RelativePath=".\HOG\HOGSVMSlider.cu"
515 | 						>
516 | 					</File>
517 | 					<File
518 | 						RelativePath=".\HOG\HOGUtils.cu"
519 | 						>
520 | 					</File>
521 | 				</Filter>
522 | 				<Filter
523 | 					Name="CPU"
524 | 					>
525 | 					<File
526 | 						RelativePath=".\HOG\HOGEngine.cpp"
527 | 						>
528 | 					</File>
529 | 					<File
530 | 						RelativePath=".\HOG\HOGImage.cpp"
531 | 						>
532 | 					</File>
533 | 					<File
534 | 						RelativePath=".\HOG\HOGNMS.cpp"
535 | 						>
536 | 					</File>
537 | 				</Filter>
538 | 			</Filter>
539 | 		</Filter>
540 | 	</Files>
541 | 	<Globals>
542 | 	</Globals>
543 | </VisualStudioProject>
544 | 


--------------------------------------------------------------------------------
/source/fastHOGLib.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 10.00
 3 | # Visual Studio 2008
 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "fastHOG", "fastHOG\fastHOG.vcproj", "{98951235-E3D7-48E9-BA01-C7291E55FDEF}"
 5 | EndProject
 6 | Global
 7 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 8 | 		CUDAEmuDebug|Win32 = CUDAEmuDebug|Win32
 9 | 		Debug|Win32 = Debug|Win32
10 | 		Release|Win32 = Release|Win32
11 | 	EndGlobalSection
12 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
13 | 		{98951235-E3D7-48E9-BA01-C7291E55FDEF}.CUDAEmuDebug|Win32.ActiveCfg = Debug|Win32
14 | 		{98951235-E3D7-48E9-BA01-C7291E55FDEF}.CUDAEmuDebug|Win32.Build.0 = Debug|Win32
15 | 		{98951235-E3D7-48E9-BA01-C7291E55FDEF}.Debug|Win32.ActiveCfg = Debug|Win32
16 | 		{98951235-E3D7-48E9-BA01-C7291E55FDEF}.Debug|Win32.Build.0 = Debug|Win32
17 | 		{98951235-E3D7-48E9-BA01-C7291E55FDEF}.Release|Win32.ActiveCfg = Release|Win32
18 | 		{98951235-E3D7-48E9-BA01-C7291E55FDEF}.Release|Win32.Build.0 = Release|Win32
19 | 	EndGlobalSection
20 | 	GlobalSection(SolutionProperties) = preSolution
21 | 		HideSolutionNode = FALSE
22 | 	EndGlobalSection
23 | EndGlobal
24 | 


--------------------------------------------------------------------------------
/source/fastHOGLib.suo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashwin/fasthog/377dd96b870ae0ef22c93b5b36950f92214dd399/source/fastHOGLib.suo


--------------------------------------------------------------------------------