├── .gitignore ├── README.md ├── docs └── prisacariu_reid_tr2310_09.pdf └── source ├── fastHOG ├── Files │ ├── Images │ │ └── testImage.bmp │ └── SVM │ │ └── head_W24x24_C4x4_N2x2_G4x4_HeadSize16x16.alt ├── HOG │ ├── HOGConvolution.cu │ ├── HOGConvolution.h │ ├── HOGConvolution.linkinfo │ ├── HOGDefines.h │ ├── HOGEngine.cpp │ ├── HOGEngine.h │ ├── HOGEngineDevice.cu │ ├── HOGEngineDevice.h │ ├── HOGEngineDevice.linkinfo │ ├── HOGHistogram.cu │ ├── HOGHistogram.h │ ├── HOGHistogram.linkinfo │ ├── HOGImage.cpp │ ├── HOGImage.h │ ├── HOGNMS.cpp │ ├── HOGNMS.h │ ├── HOGPadding.cu │ ├── HOGPadding.h │ ├── HOGPadding.linkinfo │ ├── HOGPoint3.h │ ├── HOGResult.h │ ├── HOGSVMSlider.cu │ ├── HOGSVMSlider.h │ ├── HOGSVMSlider.linkinfo │ ├── HOGScale.cu │ ├── HOGScale.h │ ├── HOGScale.linkinfo │ ├── HOGUtils.cu │ ├── HOGUtils.h │ ├── HOGUtils.linkinfo │ └── cutil.h ├── HOGConvolution.linkinfo ├── HOGEngineDevice.linkinfo ├── HOGHistogram.linkinfo ├── HOGPadding.linkinfo ├── HOGSVMSlider.linkinfo ├── HOGScale.linkinfo ├── HOGUtils.linkinfo ├── Makefile ├── Others │ └── persondetectorwt.tcc ├── Utils │ ├── ImageWindow.cpp │ ├── ImageWindow.h │ └── Timer.h ├── bin │ └── release │ │ └── fastHOG ├── common.mk ├── fastHOG.cpp └── fastHOG.vcproj ├── fastHOGLib.sln └── fastHOGLib.suo /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | 6 | # Compiled Dynamic libraries 7 | *.so 8 | *.dylib 9 | 10 | # Compiled Static libraries 11 | *.lai 12 | *.la 13 | *.a 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | FastHOG 2 | ======= 3 | 4 | The original **FastHOG** source files can be obtained [here](http://www.robots.ox.ac.uk/~lav/Papers/prisacariu_reid_tr2310_09/prisacariu_reid_tr2310_09.html). 
5 | The original source files do not compile under any recent version of CUDA on Ubuntu (or any other Linux distribution). 6 | 7 | The source files in this repository were fixed to compile with CUDA 5.5 on Ubuntu 12.04. 8 | 9 | Steps to compile and use this version of FastHOG: 10 | 11 | 1. Install CUDA 5.5 or a more recent version. 12 | 2. Install `libxinerama-dev` and `libfreeimage-dev`. 13 | 3. Build and install the 2.0 branch of FLTK. Instructions to do this can be found [here](http://choorucode.com/2014/01/22/how-to-build-and-install-fltk-2-0/). 14 | 4. `cd source/fastHOG` and build using `make`. 15 | 5. Run the sample FastHOG program using `bin/release/fastHOG`. (Note that it must be run from the `source/fastHOG` directory; otherwise it fails.) A picture of pedestrians is displayed. Click anywhere on it to detect the people. 16 | -------------------------------------------------------------------------------- /docs/prisacariu_reid_tr2310_09.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashwin/fasthog/377dd96b870ae0ef22c93b5b36950f92214dd399/docs/prisacariu_reid_tr2310_09.pdf -------------------------------------------------------------------------------- /source/fastHOG/Files/Images/testImage.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashwin/fasthog/377dd96b870ae0ef22c93b5b36950f92214dd399/source/fastHOG/Files/Images/testImage.bmp -------------------------------------------------------------------------------- /source/fastHOG/Files/SVM/head_W24x24_C4x4_N2x2_G4x4_HeadSize16x16.alt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashwin/fasthog/377dd96b870ae0ef22c93b5b36950f92214dd399/source/fastHOG/Files/SVM/head_W24x24_C4x4_N2x2_G4x4_HeadSize16x16.alt -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGConvolution.cu:
-------------------------------------------------------------------------------- 1 | #include "HOGConvolution.h" 2 | #include "HOGUtils.h" 3 | #include "cutil.h" 4 | 5 | dim3 blockGridRows; 6 | dim3 blockGridColumns; 7 | dim3 threadBlockRows; 8 | dim3 threadBlockColumns; 9 | 10 | #define convKernelRadius 1 11 | #define convKernelWidth (2 * convKernelRadius + 1) 12 | __device__ __constant__ float d_Kernel[convKernelWidth]; 13 | float *h_Kernel; 14 | 15 | #define convRowTileWidth 128 16 | #define convKernelRadiusAligned 16 17 | 18 | #define convColumnTileWidth 16 19 | #define convColumnTileHeight 48 20 | 21 | float4 *convBuffer4; 22 | float1 *convBuffer1; 23 | 24 | int convWidth; 25 | int convHeight; 26 | const int convKernelSize = convKernelWidth * sizeof(float); 27 | 28 | bool convUseGrayscale; 29 | // convolutionRow<i> / convolutionColumn<i>: compile-time recursion that accumulates kernel taps i, i-1, ..., 0; the <-1> specialisations end the recursion by returning zero. 30 | template<int i> __device__ float1 convolutionRow(float1 *data) { 31 | float1 val = data[convKernelRadius-i]; 32 | val.x *= d_Kernel[i]; 33 | val.x += convolutionRow<i-1>(data).x; 34 | return val; 35 | } 36 | template<> __device__ float1 convolutionRow<-1>(float1 *data){float1 zero; zero.x = 0; return zero;} 37 | template<int i> __device__ float1 convolutionColumn(float1 *data) { 38 | float1 val = data[(convKernelRadius-i)*convColumnTileWidth]; 39 | val.x *= d_Kernel[i]; 40 | val.x += convolutionColumn<i-1>(data).x; 41 | return val; 42 | } 43 | template<> __device__ float1 convolutionColumn<-1>(float1 *data){float1 zero; zero.x = 0; return zero;} 44 | // float4 overloads: same recursion, applied to each of the four components. 45 | template<int i> __device__ float4 convolutionRow(float4 *data) { 46 | float4 val = data[convKernelRadius-i]; 47 | val.x *= d_Kernel[i]; val.y *= d_Kernel[i]; 48 | val.z *= d_Kernel[i]; val.w *= d_Kernel[i]; 49 | float4 val2 = convolutionRow<i-1>(data); 50 | val.x += val2.x; val.y += val2.y; 51 | val.z += val2.z; val.w += val2.w; 52 | return val; 53 | } 54 | template<> __device__ float4 convolutionRow<-1>(float4 *data) { 55 | float4 zero; zero.x = 0; zero.y = 0; zero.z = 0; zero.w = 0; 56 | return zero; 57 | } 58 | template<int i> __device__ float4
convolutionColumn(float4 *data) { 59 | float4 val = data[(convKernelRadius-i)*convColumnTileWidth]; 60 | val.x *= d_Kernel[i]; val.y *= d_Kernel[i]; 61 | val.z *= d_Kernel[i]; val.w *= d_Kernel[i]; 62 | float4 val2 = convolutionColumn<i-1>(data); 63 | val.x += val2.x; val.y += val2.y; 64 | val.z += val2.z; val.w += val2.w; 65 | return val; 66 | } 67 | template<> __device__ float4 convolutionColumn<-1>(float4 *data) { 68 | float4 zero; zero.x = 0; zero.y = 0; zero.z = 0; zero.w = 0; 69 | return zero; 70 | } 71 | 72 | __global__ void convolutionRowGPU1(float1 *d_Result, float1 *d_Data, int dataW, int dataH) 73 | { 74 | float1 zero; zero.x = 0; 75 | 76 | const int rowStart = IMUL(blockIdx.y, dataW); 77 | 78 | __shared__ float1 data[convKernelRadius + convRowTileWidth + convKernelRadius]; 79 | 80 | const int tileStart = IMUL(blockIdx.x, convRowTileWidth); 81 | const int tileEnd = tileStart + convRowTileWidth - 1; 82 | const int apronStart = tileStart - convKernelRadius; 83 | const int apronEnd = tileEnd + convKernelRadius; 84 | 85 | const int tileEndClamped = min(tileEnd, dataW - 1); 86 | const int apronStartClamped = max(apronStart, 0); 87 | const int apronEndClamped = min(apronEnd, dataW - 1); 88 | 89 | const int apronStartAligned = tileStart - convKernelRadiusAligned; 90 | 91 | const int loadPos = apronStartAligned + threadIdx.x; 92 | 93 | if(loadPos >= apronStart) 94 | { 95 | const int smemPos = loadPos - apronStart; 96 | data[smemPos] = ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ?
d_Data[rowStart + loadPos] : zero; 97 | } 98 | 99 | __syncthreads(); 100 | const int writePos = tileStart + threadIdx.x; 101 | 102 | if(writePos <= tileEndClamped) 103 | { 104 | const int smemPos = writePos - apronStart; 105 | float1 sum = convolutionRow<2 * convKernelRadius>(data + smemPos); 106 | d_Result[rowStart + writePos] = sum; 107 | } 108 | } 109 | __global__ void convolutionRowGPU4(float4 *d_Result, float4 *d_Data, int dataW, int dataH) 110 | { 111 | float4 zero; zero.x = 0; zero.y = 0; zero.z = 0; zero.w = 0; 112 | 113 | const int rowStart = IMUL(blockIdx.y, dataW); 114 | 115 | __shared__ float4 data[convKernelRadius + convRowTileWidth + convKernelRadius]; 116 | 117 | const int tileStart = IMUL(blockIdx.x, convRowTileWidth); 118 | const int tileEnd = tileStart + convRowTileWidth - 1; 119 | const int apronStart = tileStart - convKernelRadius; 120 | const int apronEnd = tileEnd + convKernelRadius; 121 | 122 | const int tileEndClamped = min(tileEnd, dataW - 1); 123 | const int apronStartClamped = max(apronStart, 0); 124 | const int apronEndClamped = min(apronEnd, dataW - 1); 125 | 126 | const int apronStartAligned = tileStart - convKernelRadiusAligned; 127 | 128 | const int loadPos = apronStartAligned + threadIdx.x; 129 | 130 | if(loadPos >= apronStart) 131 | { 132 | const int smemPos = loadPos - apronStart; 133 | data[smemPos] = ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ? 
d_Data[rowStart + loadPos] : zero; 134 | } 135 | 136 | __syncthreads(); 137 | const int writePos = tileStart + threadIdx.x; 138 | 139 | if(writePos <= tileEndClamped) 140 | { 141 | const int smemPos = writePos - apronStart; 142 | float4 sum = convolutionRow<2 * convKernelRadius>(data + smemPos); 143 | d_Result[rowStart + writePos] = sum; 144 | } 145 | } 146 | __global__ void convolutionColumnGPU1to2 ( float2 *d_Result, float1 *d_Data, float1 *d_DataRow, int dataW, int dataH, int smemStride, int gmemStride) 147 | { 148 | float1 rowValue; 149 | float1 zero; zero.x = 0; 150 | float2 result; 151 | 152 | const int columnStart = IMUL(blockIdx.x, convColumnTileWidth) + threadIdx.x; 153 | 154 | __shared__ float1 data[convColumnTileWidth * (convKernelRadius + convColumnTileHeight + convKernelRadius)]; 155 | 156 | const int tileStart = IMUL(blockIdx.y, convColumnTileHeight); 157 | const int tileEnd = tileStart + convColumnTileHeight - 1; 158 | const int apronStart = tileStart - convKernelRadius; 159 | const int apronEnd = tileEnd + convKernelRadius; 160 | 161 | const int tileEndClamped = min(tileEnd, dataH - 1); 162 | const int apronStartClamped = max(apronStart, 0); 163 | const int apronEndClamped = min(apronEnd, dataH - 1); 164 | 165 | int smemPos = IMUL(threadIdx.y, convColumnTileWidth) + threadIdx.x; 166 | int gmemPos = IMUL(apronStart + threadIdx.y, dataW) + columnStart; 167 | 168 | for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y) 169 | { 170 | data[smemPos] = ((y >= apronStartClamped) && (y <= apronEndClamped)) ? 
d_Data[gmemPos] : zero; 171 | smemPos += smemStride; 172 | gmemPos += gmemStride; 173 | } 174 | 175 | __syncthreads(); 176 | 177 | smemPos = IMUL(threadIdx.y + convKernelRadius, convColumnTileWidth) + threadIdx.x; 178 | gmemPos = IMUL(tileStart + threadIdx.y , dataW) + columnStart; 179 | 180 | for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y) 181 | { 182 | float1 sum = convolutionColumn<2 * convKernelRadius>(data + smemPos); 183 | rowValue = d_DataRow[gmemPos]; 184 | 185 | result.x = sqrtf(sum.x * sum.x + rowValue.x * rowValue.x); 186 | result.y = atan2f(sum.x, rowValue.x) * RADTODEG; 187 | 188 | d_Result[gmemPos] = result; 189 | smemPos += smemStride; 190 | gmemPos += gmemStride; 191 | } 192 | } 193 | 194 | __global__ void convolutionColumnGPU4to2 ( float2 *d_Result, float4 *d_Data, float4 *d_DataRow, int dataW, int dataH, int smemStride, int gmemStride) 195 | { 196 | //float3 max12, mag4; 197 | float3 mag1, mag2, mag3; 198 | float3 max34, magMax; 199 | float2 result; 200 | float4 rowValue; 201 | float4 zero; zero.x = 0; zero.y = 0; zero.z = 0; zero.w = 0; 202 | 203 | const int columnStart = IMUL(blockIdx.x, convColumnTileWidth) + threadIdx.x; 204 | 205 | __shared__ float4 data[convColumnTileWidth * (convKernelRadius + convColumnTileHeight + convKernelRadius)]; 206 | 207 | const int tileStart = IMUL(blockIdx.y, convColumnTileHeight); 208 | const int tileEnd = tileStart + convColumnTileHeight - 1; 209 | const int apronStart = tileStart - convKernelRadius; 210 | const int apronEnd = tileEnd + convKernelRadius; 211 | 212 | const int tileEndClamped = min(tileEnd, dataH - 1); 213 | const int apronStartClamped = max(apronStart, 0); 214 | const int apronEndClamped = min(apronEnd, dataH - 1); 215 | 216 | int smemPos = IMUL(threadIdx.y, convColumnTileWidth) + threadIdx.x; 217 | int gmemPos = IMUL(apronStart + threadIdx.y, dataW) + columnStart; 218 | 219 | for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y) 220 | { 221 | 
data[smemPos] = ((y >= apronStartClamped) && (y <= apronEndClamped)) ? d_Data[gmemPos] : zero; 222 | smemPos += smemStride; 223 | gmemPos += gmemStride; 224 | } 225 | 226 | __syncthreads(); 227 | 228 | smemPos = IMUL(threadIdx.y + convKernelRadius, convColumnTileWidth) + threadIdx.x; 229 | gmemPos = IMUL(tileStart + threadIdx.y , dataW) + columnStart; 230 | 231 | for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y) 232 | { 233 | float4 sum = convolutionColumn<2 * convKernelRadius>(data + smemPos); 234 | rowValue = d_DataRow[gmemPos]; 235 | 236 | mag1.x = sqrtf(sum.x * sum.x + rowValue.x * rowValue.x); mag1.y = sum.x; mag1.z = rowValue.x; 237 | mag2.x = sqrtf(sum.y * sum.y + rowValue.y * rowValue.y); mag2.y = sum.y; mag2.z = rowValue.y; 238 | mag3.x = sqrtf(sum.z * sum.z + rowValue.z * rowValue.z); mag3.y = sum.z; mag3.z = rowValue.z; 239 | 240 | max34 = (mag2.x > mag3.x) ? mag2 : mag3; 241 | magMax = (mag1.x > max34.x) ? mag1 : max34; 242 | 243 | result.x = magMax.x; 244 | result.y = atan2f(magMax.y, magMax.z); 245 | result.y = result.y * 180 / PI + 180; 246 | result.y = int(result.y) % 180; //TODO-> if semicerc 247 | 248 | d_Result[gmemPos] = result; 249 | smemPos += smemStride; 250 | gmemPos += gmemStride; 251 | } 252 | } 253 | __host__ void InitConvolution(int width, int height, bool useGrayscale) 254 | { 255 | convUseGrayscale = useGrayscale; 256 | 257 | h_Kernel = (float *)malloc(convKernelSize); 258 | h_Kernel[0] = 1.0f; h_Kernel[1] = 0; h_Kernel[2] = -1.0f; 259 | 260 | cutilSafeCall( cudaMemcpyToSymbol(d_Kernel, h_Kernel, convKernelSize) ); 261 | 262 | if (useGrayscale) 263 | cutilSafeCall(cudaMalloc((void**) &convBuffer1, sizeof(float1) * width * height)); 264 | else 265 | cutilSafeCall(cudaMalloc((void**) &convBuffer4, sizeof(float4) * width * height)); 266 | } 267 | 268 | __host__ void SetConvolutionSize(int width, int height) 269 | { 270 | convWidth = width; 271 | convHeight = height; 272 | 273 | blockGridRows = 
dim3(iDivUp(convWidth, convRowTileWidth), convHeight); 274 | blockGridColumns = dim3(iDivUp(convWidth, convColumnTileWidth), iDivUp(convHeight, convColumnTileHeight)); 275 | threadBlockRows = dim3(convKernelRadiusAligned + convRowTileWidth + convKernelRadius); 276 | threadBlockColumns = dim3(convColumnTileWidth, 8); 277 | } 278 | __host__ void CloseConvolution() 279 | { 280 | if (convUseGrayscale) 281 | cutilSafeCall(cudaFree(convBuffer1)); 282 | else 283 | cutilSafeCall(cudaFree(convBuffer4)); 284 | 285 | free(h_Kernel); 286 | } 287 | __host__ void ComputeColorGradients1to2(float1* inputImage, float2* outputImage) 288 | { 289 | convolutionRowGPU1<<<blockGridRows, threadBlockRows>>>(convBuffer1, inputImage, convWidth, convHeight); 290 | convolutionColumnGPU1to2<<<blockGridColumns, threadBlockColumns>>>(outputImage, inputImage, convBuffer1, convWidth, convHeight, 291 | convColumnTileWidth * threadBlockColumns.y, convWidth * threadBlockColumns.y); 292 | } 293 | // Both entry points launch the row pass into the conv buffer, then the column pass; launch shapes are the dim3 globals set by SetConvolutionSize(). 294 | __host__ void ComputeColorGradients4to2(float4* inputImage, float2* outputImage) 295 | { 296 | convolutionRowGPU4<<<blockGridRows, threadBlockRows>>>(convBuffer4, inputImage, convWidth, convHeight); 297 | convolutionColumnGPU4to2<<<blockGridColumns, threadBlockColumns>>>(outputImage, inputImage, convBuffer4, convWidth, convHeight, 298 | convColumnTileWidth * threadBlockColumns.y, convWidth * threadBlockColumns.y); 299 | } 300 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGConvolution.h: -------------------------------------------------------------------------------- 1 | #ifndef __HOG_CONVOLUTION__ 2 | #define __HOG_CONVOLUTION__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef _WIN32 9 | # define WINDOWS_LEAN_AND_MEAN 10 | # include 11 | #endif 12 | 13 | #include 14 | #include 15 | 16 | #include "HOGDefines.h" 17 | 18 | __host__ void InitConvolution(int width, int height, bool useGrayscale); 19 | __host__ void SetConvolutionSize(int width, int height); 20 | __host__ void CloseConvolution(); 21 | 22 | __host__ void ComputeColorGradients1to2(float1* inputImage, float2*
outputImage); 23 | __host__ void ComputeColorGradients4to2(float4* inputImage, float2* outputImage); 24 | 25 | __global__ void convolutionRowGPU1(float1 *d_Result, float1 *d_Data, int dataW, int dataH); 26 | __global__ void convolutionRowGPU4(float4 *d_Result, float4 *d_Data, int dataW, int dataH); 27 | 28 | __global__ void convolutionColumnGPU1to2 ( float1 *d_Result, float1 *d_Data, float1 *d_DataRow, int dataW, int dataH, int smemStride, int gmemStride); 29 | __global__ void convolutionColumnGPU4to2 ( float2 *d_Result, float4 *d_Data, float4 *d_DataRow, int dataW, int dataH, int smemStride, int gmemStride); 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGConvolution.linkinfo: -------------------------------------------------------------------------------- 1 | --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z24convolutionColumnGPU1to2P6float2P6float1S2_iiii,_Z18convolutionRowGPU1P6float1S0_ii,_Z18convolutionRowGPU4P6float4S0_ii,_Z24convolutionColumnGPU4to2P6float2P6float4S2_iiii -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGDefines.h: -------------------------------------------------------------------------------- 1 | #ifndef __HOG_DEFINES__ 2 | #define __HOG_DEFINES__ 3 | 4 | #define UNROLL_LOOPS 5 | 6 | #ifdef _WIN32 7 | #pragma comment( lib, "C:\\CUDA\\lib\\cuda.lib" ) 8 | #pragma comment( lib, "C:\\CUDA\\lib\\cudart.lib" ) 9 | #pragma comment( lib, "C:\\CUDA\\SDK\\common\\lib\\cutil32.lib" ) 10 | #endif 11 | 12 | #ifndef CUDA_PIXEL 13 | #define CUDA_PIXEL unsigned char 14 | #endif 15 | 16 | #ifndef CUDA_FLOAT 17 | #define CUDA_FLOAT float 18 | #endif 19 | 20 | #ifndef CUDA_DT_PIXEL 21 | #define CUDA_DT_PIXEL float 22 | #endif 23 | 24 | #ifndef CUDA_DT_PIXEL_INT 25 | #define CUDA_DT_PIXEL_INT int 26 | #endif 27 | 28 | #ifndef THREAD_SIZE_W 29 | #define 
THREAD_SIZE_W 16 30 | #endif 31 | 32 | #ifndef THREAD_SIZE_H 33 | #define THREAD_SIZE_H 16 34 | #endif 35 | 36 | #ifndef BLOCK_SIZE_H 37 | #define BLOCK_SIZE_H 16 38 | #endif 39 | 40 | #ifndef BLOCK_SIZE_W 41 | #define BLOCK_SIZE_W 16 42 | #endif 43 | 44 | #ifndef MAX_HISTOGRAM_NO_BINS 45 | #define MAX_HISTOGRAM_NO_BINS 9 46 | #endif 47 | 48 | #ifndef MAX_CELL_SIZE_Y 49 | #define MAX_CELL_SIZE_Y 8 50 | #endif 51 | 52 | #ifndef MAX_CELL_SIZE_X 53 | #define MAX_CELL_SIZE_X 8 54 | #endif 55 | 56 | #ifndef MAX_BLOCK_SIZE_X 57 | #define MAX_BLOCK_SIZE_X 2 58 | #endif 59 | 60 | #ifndef MAX_BLOCK_SIZE_Y 61 | #define MAX_BLOCK_SIZE_Y 2 62 | #endif 63 | 64 | #ifndef MAX_BLOCKS_PER_WINDOW_X 65 | #define MAX_BLOCKS_PER_WINDOW_X 7 66 | #endif 67 | 68 | #ifndef MAX_BLOCKS_PER_WINDOW_Y 69 | #define MAX_BLOCKS_PER_WINDOW_Y 15 70 | #endif 71 | 72 | #ifndef EXECUTYIN512THREADS 73 | #define EXECUTYIN512THREADS(counter, startPoint, func, params) \ 74 | startPoint = 0;\ 75 | if (counter / 512 > 0) \ 76 | { \ 77 | while (counter / 512 > 0) \ 78 | { \ 79 | func<<<1, 512>>> ## params; \ 80 | startPoint += 512; \ 81 | counter -= 512; \ 82 | } \ 83 | if (counter != 0) \ 84 | func<<<1, counter>>> ## params; \ 85 | } \ 86 | else \ 87 | func<<<1, counter>>> ## params; 88 | #endif 89 | 90 | #ifndef WARP_SIZE 91 | #define WARP_SIZE 32 92 | #endif 93 | 94 | #ifndef MAX_BLOCKS_PER_DIM 95 | #define MAX_BLOCKS_PER_DIM 65536 96 | #endif 97 | 98 | #ifndef IMUL 99 | #define IMUL(a, b) __mul24(a, b) 100 | #endif 101 | 102 | #ifndef PI 103 | #define PI 3.1415926535897932384626433832795 104 | #endif 105 | 106 | #ifndef DEGTORAD 107 | #define DEGTORAD 0.017453292519943295769236907684886 108 | #endif 109 | 110 | #ifndef RADTODEG 111 | #define RADTODEG 57.2957795 112 | #endif 113 | 114 | #endif 115 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGEngine.cpp: -------------------------------------------------------------------------------- 1 | #include 
"HOGEngine.h" 2 | #include "HOGNMS.h" 3 | 4 | #include "HOGDefines.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace HOG; 12 | 13 | HOGEngine* HOGEngine::instance; 14 | 15 | extern "C" void InitHOG(int width, int height, int avSizeX, int avSizeY, 16 | int marginX, int marginY, int cellSizeX, int cellSizeY, 17 | int blockSizeX, int blockSizeY, int windowSizeX, int windowSizeY, 18 | int noOfHistogramBins, float wtscale, float svmBias, float* svmWeights, 19 | int svmWeightsCount, bool useGrayscale); 20 | 21 | extern "C" void CloseHOG(); 22 | 23 | extern "C" void BeginHOGProcessing(unsigned char* hostImage, int minx, int miny, int maxx, int maxy, float minScale, float maxScale); 24 | extern "C" float* EndHOGProcessing(); 25 | 26 | extern "C" void GetProcessedImage(unsigned char* hostImage, int imageType); 27 | extern "C" void GetHOGParameters(float *cStartScale, float *cEndScale, float *cScaleRatio, int *cScaleCount, 28 | int *cPaddingSizeX, int *cPaddingSizeY, int *cPaddedWidth, int *cPaddedHeight, 29 | int *cNoOfCellsX, int *cNoOfCellsY, int *cNoOfBlocksX, int *cNoOfBlocksY, 30 | int *cNumberOfWindowsX, int *cNumberOfWindowsY, 31 | int *cNumberOfBlockPerWindowX, int *cNumberOfBlockPerWindowY); 32 | 33 | int HOGEngine::iDivUpF(int a, float b) { return (a % int(b) != 0) ? 
int(a / b + 1) : int(a / b);} 34 | 35 | void HOGEngine::InitializeHOG(int iw, int ih, std::string fileName) 36 | { 37 | this->imageWidth = iw; 38 | this->imageHeight = ih; 39 | 40 | this->avSizeX = 0; 41 | this->avSizeY = 0; 42 | this->marginX = 0; 43 | this->marginY = 0; 44 | 45 | this->hCellSizeX = 4; // 8 46 | this->hCellSizeY = 4; // 8 47 | this->hBlockSizeX = 2; 48 | this->hBlockSizeY = 2; 49 | this->hWindowSizeX = 24; //64 50 | this->hWindowSizeY = 24; //128 51 | this->hNoOfHistogramBins = 9; 52 | 53 | this->wtScale = 2.0f; 54 | 55 | this->useGrayscale = false; 56 | 57 | this->readSVMFromFile(fileName); 58 | 59 | this->formattedResultsAvailable = false; 60 | 61 | nmsProcessor = new HOGNMS(); 62 | 63 | InitHOG(iw, ih, avSizeX, avSizeY, marginX, marginY, hCellSizeX, hCellSizeY, hBlockSizeX, hBlockSizeY, 64 | hWindowSizeX, hWindowSizeY, hNoOfHistogramBins, wtScale, svmBias, svmWeights, svmWeightsCount, useGrayscale); 65 | } 66 | 67 | void HOGEngine::InitializeHOG(int iw, int ih, float svmBias, float* svmWeights, int svmWeightsCount) 68 | { 69 | this->imageWidth = iw; 70 | this->imageHeight = ih; 71 | 72 | this->avSizeX = 48; //48 73 | this->avSizeY = 96; //96 74 | this->marginX = 4; // 4 75 | this->marginY = 4; // 4 76 | 77 | this->hCellSizeX = 8; 78 | this->hCellSizeY = 8; 79 | this->hBlockSizeX = 2; 80 | this->hBlockSizeY = 2; 81 | this->hWindowSizeX = 64; 82 | this->hWindowSizeY = 128; 83 | this->hNoOfHistogramBins = 9; 84 | 85 | this->svmWeightsCount = svmWeightsCount; 86 | this->svmBias = svmBias; 87 | this->svmWeights = svmWeights; 88 | 89 | this->wtScale = 2.0f; 90 | 91 | this->useGrayscale = false; 92 | 93 | this->formattedResultsAvailable = false; 94 | 95 | nmsProcessor = new HOGNMS(); 96 | 97 | InitHOG(iw, ih, avSizeX, avSizeY, marginX, marginY, hCellSizeX, hCellSizeY, hBlockSizeX, hBlockSizeY, 98 | hWindowSizeX, hWindowSizeY, hNoOfHistogramBins, wtScale, svmBias, svmWeights, svmWeightsCount, useGrayscale); 99 | } 100 | 101 | 102 | void 
HOGEngine::readSVMFromFile(std::string modelfile) 103 | { 104 | double linearbias_, *linearwt_; 105 | 106 | FILE *modelfl; 107 | #ifdef _WIN32 108 | if ((fopen_s (&modelfl, modelfile.c_str(), "rb")) != 0) 109 | { printf("File not found!\n"); exit(1); } 110 | #else 111 | if ((modelfl = fopen (modelfile.c_str(), "rb")) == NULL) 112 | { printf("File not found!\n"); exit(1); } 113 | #endif 114 | char version_buffer[10]; 115 | if (!fread (&version_buffer,sizeof(char),10,modelfl)) 116 | { printf("Wrong file version!\n"); exit(1); } 117 | 118 | if(strcmp(version_buffer,"V6.01")) { 119 | printf("Wrong file version!\n"); exit(1); 120 | } 121 | /* read version number */ 122 | int version = 0; 123 | if (!fread (&version,sizeof(int),1,modelfl)) 124 | { printf("Wrong file version!\n"); exit(1); } 125 | if (version < 200) 126 | { printf("Wrong file version!\n"); exit(1); } 127 | 128 | long long kernel_type; 129 | fread(&(kernel_type),sizeof(long long),1,modelfl); 130 | 131 | {// ignore these 132 | long long poly_degree; 133 | fread(&(poly_degree),sizeof(long long),1,modelfl); 134 | 135 | double rbf_gamma; 136 | fread(&(rbf_gamma),sizeof(double),1,modelfl); 137 | 138 | double coef_lin; 139 | fread(&(coef_lin),sizeof(double),1,modelfl); 140 | double coef_const; 141 | fread(&(coef_const),sizeof(double),1,modelfl); 142 | 143 | long long l; 144 | fread(&l,sizeof(long long),1,modelfl); 145 | char* custom = new char[(unsigned int)l]; 146 | fread(custom,sizeof(char),(size_t)l,modelfl); 147 | delete[] custom; 148 | } 149 | 150 | long long totwords; 151 | fread(&(totwords),sizeof(long long),1,modelfl); 152 | 153 | {// ignore these 154 | long long totdoc; 155 | fread(&(totdoc),sizeof(long long),1,modelfl); 156 | 157 | long long sv_num; 158 | fread(&(sv_num), sizeof(long long),1,modelfl); 159 | } 160 | 161 | fread(&linearbias_, sizeof(double),1,modelfl); 162 | 163 | if(kernel_type == 0) { /* linear kernel */ 164 | /* save linear wts also */ 165 | linearwt_ = new double[(unsigned 
int)totwords+1]; 166 | svmWeightsCount = (int) totwords; 167 | fread(linearwt_, sizeof(double),(size_t)totwords+1,modelfl); 168 | } else { 169 | exit(1); 170 | } 171 | 172 | svmWeights = new float[svmWeightsCount+1]; 173 | for (int i=0; ipixels, minX, minY, maxX, maxY, minScale, maxScale); 204 | } 205 | 206 | void HOGEngine::EndProcess() 207 | { 208 | cppResult = EndHOGProcessing(); 209 | 210 | GetHOGParameters(&startScale, &endScale, &scaleRatio, &scaleCount, 211 | &hPaddingSizeX, &hPaddingSizeY, &hPaddedWidth, &hPaddedHeight, 212 | &hNoOfCellsX, &hNoOfCellsY, &hNoOfBlocksX, &hNoOfBlocksY, &hNumberOfWindowsX, 213 | &hNumberOfWindowsY, &hNumberOfBlockPerWindowX, &hNumberOfBlockPerWindowY); 214 | 215 | ComputeFormattedResults(); 216 | 217 | nmsResults = nmsProcessor->ComputeNMSResults(formattedResults, formattedResultsCount, &nmsResultsAvailable, &nmsResultsCount, 218 | hWindowSizeX, hWindowSizeY); 219 | } 220 | 221 | void HOGEngine::GetImage(HOGImage *imageCUDA, ImageType imageType) 222 | { 223 | switch (imageType) 224 | { 225 | case IMAGE_RESIZED: 226 | GetProcessedImage(imageCUDA->pixels, 0); 227 | break; 228 | case IMAGE_COLOR_GRADIENTS: 229 | GetProcessedImage(imageCUDA->pixels, 1); 230 | break; 231 | case IMAGE_GRADIENT_ORIENTATIONS: 232 | GetProcessedImage(imageCUDA->pixels, 2); 233 | break; 234 | case IMAGE_PADDED: 235 | GetProcessedImage(imageCUDA->pixels, 3); 236 | break; 237 | case IMAGE_ROI: 238 | GetProcessedImage(imageCUDA->pixels, 4); 239 | break; 240 | } 241 | } 242 | 243 | void HOGEngine::SaveResultsToDisk(char* fileName) 244 | { 245 | FILE* f; 246 | #ifdef _WIN32 247 | fopen_s(&f, fileName, "w+"); 248 | #else 249 | f = fopen(fileName, "w+"); 250 | #endif 251 | fprintf(f, "%d\n", formattedResultsCount); 252 | for (int i=0; i 0) 284 | formattedResultsCount++; 285 | } 286 | } 287 | } 288 | // formattedResults is allocated with new[] on the line after next, so a previous buffer must be released with delete[]. 289 | if (formattedResultsAvailable) delete[] formattedResults; 290 | formattedResults = new HOGResult[formattedResultsCount]; 291 | 292 | for (i=0; i 0) 302 | { 303
| HOGResult hogResult; 304 | 305 | currentWidth = iDivUpF(hPaddedWidth, currentScale); 306 | currentHeight = iDivUpF(hPaddedHeight, currentScale); 307 | 308 | rNumberOfWindowsX = (currentWidth - hWindowSizeX) / hCellSizeX + 1; 309 | rNumberOfWindowsY = (currentHeight - hWindowSizeY) / hCellSizeY + 1; 310 | 311 | leftoverX = (currentWidth - hWindowSizeX - hCellSizeX * (rNumberOfWindowsX - 1)) / 2; 312 | leftoverY = (currentHeight - hWindowSizeY - hCellSizeY * (rNumberOfWindowsY - 1)) / 2; 313 | 314 | hogResult.origX = k * hCellSizeX + leftoverX; 315 | hogResult.origY = j * hCellSizeY + leftoverY; 316 | 317 | hogResult.width = (int)floorf((float)hWindowSizeX * currentScale); 318 | hogResult.height = (int)floorf((float)hWindowSizeY * currentScale); 319 | 320 | hogResult.x = (int)ceilf(currentScale * (hogResult.origX + hWindowSizeX / 2) - (float) hWindowSizeX * currentScale / 2) - hPaddingSizeX + minX; 321 | hogResult.y = (int)ceilf(currentScale * (hogResult.origY + hWindowSizeY / 2) - (float) hWindowSizeY * currentScale / 2) - hPaddingSizeY + minY; 322 | 323 | hogResult.scale = currentScale; 324 | hogResult.score = score; 325 | 326 | formattedResults[resultId] = hogResult; 327 | resultId++; 328 | } 329 | } 330 | } 331 | 332 | currentScale = currentScale * scaleRatio; 333 | } 334 | } 335 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGEngine.h: -------------------------------------------------------------------------------- 1 | #ifndef __HOG_ENGINE__ 2 | #define __HOG_ENGINE__ 3 | 4 | #include "HOGResult.h" 5 | #include "HOGNMS.h" 6 | #include "HOGImage.h" 7 | 8 | #include 9 | 10 | using namespace std; 11 | 12 | namespace HOG 13 | { 14 | class HOGEngine 15 | { 16 | private: 17 | static HOGEngine* instance; 18 | 19 | int iDivUpF(int a, float b); 20 | 21 | HOGNMS* nmsProcessor; 22 | void readSVMFromFile(std::string fileName); 23 | 24 | public: 25 | int imageWidth, imageHeight; 26 | 27 | int avSizeX, avSizeY, 
marginX, marginY; 28 | 29 | int scaleCount; 30 | int hCellSizeX, hCellSizeY; 31 | int hBlockSizeX, hBlockSizeY; 32 | int hWindowSizeX, hWindowSizeY; 33 | int hNoOfHistogramBins; 34 | int hPaddedWidth, hPaddedHeight; 35 | int hPaddingSizeX, hPaddingSizeY; 36 | 37 | int minX, minY, maxX, maxY; 38 | 39 | float wtScale; 40 | 41 | float startScale, endScale, scaleRatio; 42 | 43 | int svmWeightsCount; 44 | float svmBias, *svmWeights; 45 | 46 | int hNoOfCellsX, hNoOfCellsY; 47 | int hNoOfBlocksX, hNoOfBlocksY; 48 | int hNumberOfWindowsX, hNumberOfWindowsY; 49 | int hNumberOfBlockPerWindowX, hNumberOfBlockPerWindowY; 50 | 51 | bool useGrayscale; 52 | 53 | float* cppResult; 54 | 55 | HOGResult* formattedResults; 56 | HOGResult* nmsResults; 57 | 58 | bool formattedResultsAvailable; 59 | int formattedResultsCount; 60 | 61 | bool nmsResultsAvailable; 62 | int nmsResultsCount; 63 | 64 | enum ImageType 65 | { 66 | IMAGE_RESIZED, 67 | IMAGE_COLOR_GRADIENTS, 68 | IMAGE_GRADIENT_ORIENTATIONS, 69 | IMAGE_PADDED, 70 | IMAGE_ROI 71 | }; 72 | 73 | static HOGEngine* Instance(void) { 74 | if (instance == NULL) instance = new HOGEngine(); 75 | return instance; 76 | } 77 | 78 | void InitializeHOG(int iw, int ih, float svmBias, float* svmWeights, int svmWeightsCount); 79 | void InitializeHOG(int iw, int ih, std::string fileName); 80 | 81 | void FinalizeHOG(); 82 | 83 | void BeginProcess(HOGImage* hostImage, int _minx = -1, int _miny = -1, int _maxx = -1, int _maxy = -1, 84 | float minScale = -1.0f, float maxScale = -1.0f); 85 | void EndProcess(); 86 | void GetImage(HOGImage *imageCUDA, ImageType imageType); 87 | 88 | void ComputeFormattedResults(); 89 | 90 | void SaveResultsToDisk(char* fileName); 91 | 92 | HOGEngine(void) { } 93 | ~HOGEngine(void) { } 94 | }; 95 | } 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGEngineDevice.cu: -------------------------------------------------------------------------------- 1 | 
#include "HOGEngineDevice.h" 2 | #include "HOGUtils.h" 3 | #include "HOGConvolution.h" 4 | #include "HOGHistogram.h" 5 | #include "HOGSVMSlider.h" 6 | #include "HOGScale.h" 7 | #include "HOGPadding.h" 8 | #include "cutil.h" 9 | 10 | int hWidth, hHeight; 11 | int hWidthROI, hHeightROI; 12 | int hPaddedWidth, hPaddedHeight; 13 | int rPaddedWidth, rPaddedHeight; 14 | 15 | int minX, minY, maxX, maxY; 16 | 17 | int hNoHistogramBins, rNoHistogramBins; 18 | 19 | int hPaddingSizeX, hPaddingSizeY; 20 | int hCellSizeX, hCellSizeY, hBlockSizeX, hBlockSizeY, hWindowSizeX, hWindowSizeY; 21 | int hNoOfCellsX, hNoOfCellsY, hNoOfBlocksX, hNoOfBlocksY; 22 | int rNoOfCellsX, rNoOfCellsY, rNoOfBlocksX, rNoOfBlocksY; 23 | 24 | int hNumberOfBlockPerWindowX, hNumberOfBlockPerWindowY; 25 | int hNumberOfWindowsX, hNumberOfWindowsY; 26 | int rNumberOfWindowsX, rNumberOfWindowsY; 27 | 28 | float4 *paddedRegisteredImage; 29 | 30 | float1 *resizedPaddedImageF1; 31 | float4 *resizedPaddedImageF4; 32 | 33 | float2 *colorGradientsF2; 34 | 35 | float1 *blockHistograms; 36 | float1 *cellHistograms; 37 | 38 | float1 *svmScores; 39 | 40 | bool hUseGrayscale; 41 | 42 | uchar1* outputTest1; 43 | uchar4* outputTest4; 44 | 45 | float* hResult; 46 | 47 | float scaleRatio; 48 | float startScale; 49 | float endScale; 50 | int scaleCount; 51 | 52 | int avSizeX, avSizeY, marginX, marginY; 53 | 54 | extern uchar4* paddedRegisteredImageU4; 55 | 56 | __host__ void InitHOG(int width, int height, 57 | int _avSizeX, int _avSizeY, 58 | int _marginX, int _marginY, 59 | int cellSizeX, int cellSizeY, 60 | int blockSizeX, int blockSizeY, 61 | int windowSizeX, int windowSizeY, 62 | int noOfHistogramBins, float wtscale, 63 | float svmBias, float* svmWeights, int svmWeightsCount, 64 | bool useGrayscale) 65 | { 66 | cudaSetDevice( cutGetMaxGflopsDeviceId() ); 67 | 68 | int i; 69 | int toaddxx = 0, toaddxy = 0, toaddyx = 0, toaddyy = 0; 70 | 71 | hWidth = width; hHeight = height; 72 | avSizeX = _avSizeX; avSizeY = _avSizeY; 
// Resampling kernel: one thread produces one output pixel of the rescaled image.
//
//   outputFloat            destination buffer, indexed as row * width + col
//   paddedRegisteredImage  source buffer, read directly only when scale == 1.0f
//   width, height          extent of the OUTPUT image; threads outside it do nothing
//   scale                  factor applied to the output coordinates to obtain the
//                          coordinates fetched from the tex2 texture reference
//
// For any scale other than exactly 1.0f the value comes from tex2D(tex2, ...), so
// the interpolation performed is whatever filter mode the host configured on tex2.
__global__ void resizeFastBicubic3(float4 *outputFloat, float4* paddedRegisteredImage, int width, int height, float scale)
{
	const int col = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
	const int row = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

	// The grid is rounded up to whole thread blocks; discard the overhang.
	if (col >= width || row >= height)
		return;

	const int outIndex = __umul24(row, width) + col;

	// Identity scale bypasses the texture unit and copies the source pixel as is.
	outputFloat[outIndex] = (scale == 1.0f)
		? paddedRegisteredImage[col + row * width]
		: tex2D(tex2, col * scale, row * scale);
}
hBlockSize; 296 | 297 | hThreadSize = dim3(THREAD_SIZE_W, THREAD_SIZE_H); 298 | 299 | rPaddedWidth = iDivUpF(width, scale); 300 | rPaddedHeight = iDivUpF(height, scale); 301 | 302 | hBlockSize = dim3(iDivUp(rPaddedWidth, hThreadSize.x), iDivUp(rPaddedHeight, hThreadSize.y)); 303 | 304 | cutilSafeCall(cudaMemcpyToArray(imageArray2, 0, 0, paddedRegisteredImage, sizeof(float4) * width * height, cudaMemcpyDeviceToDevice)); 305 | cutilSafeCall(cudaBindTextureToArray(tex2, imageArray2, channelDescDownscale2)); 306 | 307 | cutilSafeCall(cudaMemset(resizedPaddedImageF4, 0, width * height * sizeof(float4))); 308 | resizeFastBicubic3<<>>((float4*)resizedPaddedImageF4, (float4*)paddedRegisteredImage, rPaddedWidth, rPaddedHeight, scale); 309 | 310 | cutilSafeCall(cudaUnbindTexture(tex2)); 311 | } 312 | 313 | __host__ float3* CUDAImageRescale(float3* src, int width, int height, int &rWidth, int &rHeight, float scale) 314 | { 315 | int i, j, offsetC, offsetL; 316 | 317 | float4* srcH; float4* srcD; 318 | float4* dstD; float4* dstH; 319 | float3 val3; float4 val4; 320 | 321 | channelDescDownscale2 = cudaCreateChannelDesc(); 322 | tex2.filterMode = cudaFilterModeLinear; tex2.normalized = false; 323 | 324 | cudaMalloc((void**)&srcD, sizeof(float4) * width * height); 325 | cudaMalloc((void**)&dstD, sizeof(float4) * width * height); 326 | cudaMallocHost((void**)&srcH, sizeof(float4) * width * height); 327 | cudaMallocHost((void**)&dstH, sizeof(float4) * width * height); 328 | cutilSafeCall(cudaMallocArray(&imageArray2, &channelDescDownscale2, width, height) ); 329 | 330 | for (i=0; i 5 | #include 6 | #include 7 | 8 | #ifdef _WIN32 9 | # define WINDOWS_LEAN_AND_MEAN 10 | # include 11 | #endif 12 | 13 | #include 14 | #include 15 | 16 | #include "HOGDefines.h" 17 | 18 | extern "C" __host__ void InitHOG(int width, int height, 19 | int avSizeX, int avSizeY, 20 | int marginX, int marginY, 21 | int cellSizeX, int cellSizeY, 22 | int blockSizeX, int blockSizeY, 23 | int windowSizeX, int 
windowSizeY, 24 | int noOfHistogramBins, float wtscale, 25 | float svmBias, float* svmWeights, int svmWeightsCount, 26 | bool useGrayscale); 27 | 28 | extern "C" __host__ void CloseHOG(); 29 | 30 | extern "C" __host__ void BeginHOGProcessing(unsigned char* hostImage, int minx, int miny, int maxx, int maxy, float minScale, float maxScale); 31 | extern "C" __host__ float* EndHOGProcessing(); 32 | 33 | extern "C" __host__ void GetHOGParameters(float *cStartScale, float *cEndScale, float *cScaleRatio, int *cScaleCount, 34 | int *cPaddingSizeX, int *cPaddingSizeY, int *cPaddedWidth, int *cPaddedHeight, 35 | int *cNoOfCellsX, int *cNoOfCellsY, int *cNoOfBlocksX, int *cNoOfBlocksY, 36 | int *cNumberOfWindowsX, int *cNumberOfWindowsY, 37 | int *cNumberOfBlockPerWindowX, int *cNumberOfBlockPerWindowY); 38 | 39 | extern "C" __host__ void GetProcessedImage(unsigned char* hostImage, int imageType); 40 | 41 | extern "C" __host__ float3* CUDAImageRescale(float3* src, int width, int height, int &rWidth, int &rHeight, float scale); 42 | 43 | __host__ void InitCUDAHOG(int cellSizeX, int cellSizeY, 44 | int blockSizeX, int blockSizeY, 45 | int windowSizeX, int windowSizeY, 46 | int noOfHistogramBins, float wtscale, 47 | float svmBias, float* svmWeights, int svmWeightsCount, 48 | bool useGrayscale); 49 | __host__ void CloseCUDAHOG(); 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGEngineDevice.linkinfo: -------------------------------------------------------------------------------- 1 | --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z18resizeFastBicubic3P6float4S0_iif -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGHistogram.cu: -------------------------------------------------------------------------------- 1 | #include "HOGHistogram.h" 2 | #include "HOGUtils.h" 3 | #include 
"cutil.h" 4 | 5 | __device__ __constant__ float cenBound[3], halfBin[3], bandWidth[3], oneHalf = 0.5f; 6 | __device__ __constant__ int tvbin[3]; 7 | 8 | texture texGauss; 9 | cudaArray* gaussArray; 10 | cudaChannelFormatDesc channelDescGauss; 11 | 12 | extern __shared__ float allShared[]; 13 | 14 | extern int rNoHistogramBins, rNoOfCellsX, rNoOfCellsY, rNoOfBlocksX, rNoOfBlocksY, rNumberOfWindowsX, rNumberOfWindowsY; 15 | 16 | // wt scale == scale for weighting function span 17 | __host__ void InitHistograms(int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY, int noHistogramBins, float wtscale) 18 | { 19 | int i, j; 20 | 21 | float var2x = cellSizeX * blockSizeX / (2 * wtscale); 22 | float var2y = cellSizeY * blockSizeY / (2 * wtscale); 23 | var2x *= var2x * 2; var2y *= var2y * 2; 24 | 25 | float centerX = cellSizeX * blockSizeX / 2.0f; 26 | float centerY = cellSizeY * blockSizeY / 2.0f; 27 | 28 | float* weights = (float*)malloc(cellSizeX * blockSizeX * cellSizeY * blockSizeY * sizeof(float)); 29 | 30 | for (i=0; i(); 45 | 46 | cutilSafeCall(cudaMallocArray(&gaussArray, &channelDescGauss, cellSizeX * blockSizeX * cellSizeY * blockSizeY, 1) ); 47 | cutilSafeCall(cudaMemcpyToArray(gaussArray, 0, 0, weights, sizeof(float) * cellSizeX * blockSizeX * cellSizeY * blockSizeY, cudaMemcpyHostToDevice)); 48 | 49 | int h_tvbin[3]; 50 | float h_cenBound[3], h_halfBin[3], h_bandWidth[3]; 51 | h_cenBound[0] = cellSizeX * blockSizeX / 2.0f; 52 | h_cenBound[1] = cellSizeY * blockSizeY / 2.0f; 53 | h_cenBound[2] = 180 / 2.0f; //TODO -> can be 360 54 | 55 | h_halfBin[0] = blockSizeX / 2.0f; 56 | h_halfBin[1] = blockSizeY / 2.0f; 57 | h_halfBin[2] = noHistogramBins / 2.0f; 58 | 59 | h_bandWidth[0] = (float) cellSizeX; h_bandWidth[0] = 1.0f / h_bandWidth[0]; 60 | h_bandWidth[1] = (float) cellSizeY; h_bandWidth[1] = 1.0f / h_bandWidth[1]; 61 | h_bandWidth[2] = 180.0f / (float) noHistogramBins; h_bandWidth[2] = 1.0f / h_bandWidth[2]; //TODO -> can be 360 62 | 63 | h_tvbin[0] = 
// Releases the device resources owned by the histogram stage.
//
// InitHistograms() allocates gaussArray with cudaMallocArray(); this is the
// matching release. The pointer is reset afterwards so that calling
// CloseHistogram() twice, or without a prior InitHistograms(), is harmless.
// NOTE(review): assumes no other code path frees gaussArray - confirm.
__host__ void CloseHistogram()
{
	if (gaussArray != NULL)
	{
		cutilSafeCall(cudaFreeArray(gaussArray));
		gaussArray = NULL;
	}
}
+ __mul24(cellIdy, cellSizeY) * __mul24(cellSizeX, blockSizeX); 101 | 102 | float atx, aty; 103 | float pIx, pIy, pIz; 104 | 105 | int fIx, fIy, fIz; 106 | int cIx, cIy, cIz; 107 | float dx, dy, dz; 108 | float cx, cy, cz; 109 | 110 | bool lowervalidx, lowervalidy; 111 | bool uppervalidx, uppervalidy; 112 | bool canWrite; 113 | 114 | int offset; 115 | 116 | for (i=0; i= tvbin[0] - oneHalf || cIx < -oneHalf); 157 | uppervalidy = !(cIy >= tvbin[1] - oneHalf || cIy < -oneHalf); 158 | lowervalidx = !(fIx < -oneHalf || fIx >= tvbin[0] - oneHalf); 159 | lowervalidy = !(fIy < -oneHalf || fIy >= tvbin[1] - oneHalf); 160 | } 161 | 162 | canWrite = (lowervalidx) && (lowervalidy); 163 | if (canWrite) 164 | { 165 | offset = smemLocalHistogramPos + (fIx + fIy * blockSizeY) * noHistogramBins; 166 | shLocalHistograms[offset + fIz] += localValue.x * cx * cy * cz; 167 | shLocalHistograms[offset + cIz] += localValue.x * cx * cy * dz; 168 | } 169 | 170 | canWrite = (lowervalidx) && (uppervalidy); 171 | if (canWrite) 172 | { 173 | offset = smemLocalHistogramPos + (fIx + cIy * blockSizeY) * noHistogramBins; 174 | shLocalHistograms[offset + fIz] += localValue.x * cx * dy * cz; 175 | shLocalHistograms[offset + cIz] += localValue.x * cx * dy * dz; 176 | } 177 | 178 | canWrite = (uppervalidx) && (lowervalidy); 179 | if (canWrite) 180 | { 181 | offset = smemLocalHistogramPos + (cIx + fIy * blockSizeY) * noHistogramBins; 182 | shLocalHistograms[offset + fIz] += localValue.x * dx * cy * cz; 183 | shLocalHistograms[offset + cIz] += localValue.x * dx * cy * dz; 184 | } 185 | 186 | canWrite = (uppervalidx) && (uppervalidy); 187 | if (canWrite) 188 | { 189 | offset = smemLocalHistogramPos + (cIx + cIy * blockSizeY) * noHistogramBins; 190 | shLocalHistograms[offset + fIz] += localValue.x * dx * dy * cz; 191 | shLocalHistograms[offset + cIz] += localValue.x * dx * dy * dz; 192 | } 193 | } 194 | 195 | __syncthreads(); 196 | 197 | //TODO -> aligned block size * cell size 198 | int 
smemTargetHistogramPos; 199 | for(unsigned int s = blockSizeY >> 1; s>0; s>>=1) 200 | { 201 | if (cellIdy < s && (cellIdy + s) < blockSizeY) 202 | { 203 | smemTargetHistogramPos = (columnId + __mul24(cellIdx, cellSizeX)) * histogramSize + __mul24((cellIdy + s), histogramSize) * __mul24(blockSizeX, cellSizeX); 204 | 205 | #ifdef UNROLL_LOOPS 206 | shLocalHistograms[smemLocalHistogramPos + 0] += shLocalHistograms[smemTargetHistogramPos + 0]; 207 | shLocalHistograms[smemLocalHistogramPos + 1] += shLocalHistograms[smemTargetHistogramPos + 1]; 208 | shLocalHistograms[smemLocalHistogramPos + 2] += shLocalHistograms[smemTargetHistogramPos + 2]; 209 | shLocalHistograms[smemLocalHistogramPos + 3] += shLocalHistograms[smemTargetHistogramPos + 3]; 210 | shLocalHistograms[smemLocalHistogramPos + 4] += shLocalHistograms[smemTargetHistogramPos + 4]; 211 | shLocalHistograms[smemLocalHistogramPos + 5] += shLocalHistograms[smemTargetHistogramPos + 5]; 212 | shLocalHistograms[smemLocalHistogramPos + 6] += shLocalHistograms[smemTargetHistogramPos + 6]; 213 | shLocalHistograms[smemLocalHistogramPos + 7] += shLocalHistograms[smemTargetHistogramPos + 7]; 214 | shLocalHistograms[smemLocalHistogramPos + 8] += shLocalHistograms[smemTargetHistogramPos + 8]; 215 | shLocalHistograms[smemLocalHistogramPos + 9] += shLocalHistograms[smemTargetHistogramPos + 9]; 216 | shLocalHistograms[smemLocalHistogramPos + 10] += shLocalHistograms[smemTargetHistogramPos + 10]; 217 | shLocalHistograms[smemLocalHistogramPos + 11] += shLocalHistograms[smemTargetHistogramPos + 11]; 218 | shLocalHistograms[smemLocalHistogramPos + 12] += shLocalHistograms[smemTargetHistogramPos + 12]; 219 | shLocalHistograms[smemLocalHistogramPos + 13] += shLocalHistograms[smemTargetHistogramPos + 13]; 220 | shLocalHistograms[smemLocalHistogramPos + 14] += shLocalHistograms[smemTargetHistogramPos + 14]; 221 | shLocalHistograms[smemLocalHistogramPos + 15] += shLocalHistograms[smemTargetHistogramPos + 15]; 222 | 
shLocalHistograms[smemLocalHistogramPos + 16] += shLocalHistograms[smemTargetHistogramPos + 16]; 223 | shLocalHistograms[smemLocalHistogramPos + 17] += shLocalHistograms[smemTargetHistogramPos + 17]; 224 | shLocalHistograms[smemLocalHistogramPos + 18] += shLocalHistograms[smemTargetHistogramPos + 18]; 225 | shLocalHistograms[smemLocalHistogramPos + 19] += shLocalHistograms[smemTargetHistogramPos + 19]; 226 | shLocalHistograms[smemLocalHistogramPos + 20] += shLocalHistograms[smemTargetHistogramPos + 20]; 227 | shLocalHistograms[smemLocalHistogramPos + 21] += shLocalHistograms[smemTargetHistogramPos + 21]; 228 | shLocalHistograms[smemLocalHistogramPos + 22] += shLocalHistograms[smemTargetHistogramPos + 22]; 229 | shLocalHistograms[smemLocalHistogramPos + 23] += shLocalHistograms[smemTargetHistogramPos + 23]; 230 | shLocalHistograms[smemLocalHistogramPos + 24] += shLocalHistograms[smemTargetHistogramPos + 24]; 231 | shLocalHistograms[smemLocalHistogramPos + 25] += shLocalHistograms[smemTargetHistogramPos + 25]; 232 | shLocalHistograms[smemLocalHistogramPos + 26] += shLocalHistograms[smemTargetHistogramPos + 26]; 233 | shLocalHistograms[smemLocalHistogramPos + 27] += shLocalHistograms[smemTargetHistogramPos + 27]; 234 | shLocalHistograms[smemLocalHistogramPos + 28] += shLocalHistograms[smemTargetHistogramPos + 28]; 235 | shLocalHistograms[smemLocalHistogramPos + 29] += shLocalHistograms[smemTargetHistogramPos + 29]; 236 | shLocalHistograms[smemLocalHistogramPos + 30] += shLocalHistograms[smemTargetHistogramPos + 30]; 237 | shLocalHistograms[smemLocalHistogramPos + 31] += shLocalHistograms[smemTargetHistogramPos + 31]; 238 | shLocalHistograms[smemLocalHistogramPos + 32] += shLocalHistograms[smemTargetHistogramPos + 32]; 239 | shLocalHistograms[smemLocalHistogramPos + 33] += shLocalHistograms[smemTargetHistogramPos + 33]; 240 | shLocalHistograms[smemLocalHistogramPos + 34] += shLocalHistograms[smemTargetHistogramPos + 34]; 241 | shLocalHistograms[smemLocalHistogramPos + 
35] += shLocalHistograms[smemTargetHistogramPos + 35]; 242 | #else 243 | for (i=0; i> 1; s>0; s>>=1) 252 | { 253 | if (cellIdx < s && (cellIdx + s) < blockSizeX) 254 | { 255 | smemTargetHistogramPos = (columnId + __mul24((cellIdx + s), cellSizeX)) * histogramSize + __mul24(cellIdy, histogramSize) * __mul24(blockSizeX, cellSizeX); 256 | 257 | #ifdef UNROLL_LOOPS 258 | shLocalHistograms[smemLocalHistogramPos + 0] += shLocalHistograms[smemTargetHistogramPos + 0]; 259 | shLocalHistograms[smemLocalHistogramPos + 1] += shLocalHistograms[smemTargetHistogramPos + 1]; 260 | shLocalHistograms[smemLocalHistogramPos + 2] += shLocalHistograms[smemTargetHistogramPos + 2]; 261 | shLocalHistograms[smemLocalHistogramPos + 3] += shLocalHistograms[smemTargetHistogramPos + 3]; 262 | shLocalHistograms[smemLocalHistogramPos + 4] += shLocalHistograms[smemTargetHistogramPos + 4]; 263 | shLocalHistograms[smemLocalHistogramPos + 5] += shLocalHistograms[smemTargetHistogramPos + 5]; 264 | shLocalHistograms[smemLocalHistogramPos + 6] += shLocalHistograms[smemTargetHistogramPos + 6]; 265 | shLocalHistograms[smemLocalHistogramPos + 7] += shLocalHistograms[smemTargetHistogramPos + 7]; 266 | shLocalHistograms[smemLocalHistogramPos + 8] += shLocalHistograms[smemTargetHistogramPos + 8]; 267 | shLocalHistograms[smemLocalHistogramPos + 9] += shLocalHistograms[smemTargetHistogramPos + 9]; 268 | shLocalHistograms[smemLocalHistogramPos + 10] += shLocalHistograms[smemTargetHistogramPos + 10]; 269 | shLocalHistograms[smemLocalHistogramPos + 11] += shLocalHistograms[smemTargetHistogramPos + 11]; 270 | shLocalHistograms[smemLocalHistogramPos + 12] += shLocalHistograms[smemTargetHistogramPos + 12]; 271 | shLocalHistograms[smemLocalHistogramPos + 13] += shLocalHistograms[smemTargetHistogramPos + 13]; 272 | shLocalHistograms[smemLocalHistogramPos + 14] += shLocalHistograms[smemTargetHistogramPos + 14]; 273 | shLocalHistograms[smemLocalHistogramPos + 15] += shLocalHistograms[smemTargetHistogramPos + 15]; 274 | 
shLocalHistograms[smemLocalHistogramPos + 16] += shLocalHistograms[smemTargetHistogramPos + 16]; 275 | shLocalHistograms[smemLocalHistogramPos + 17] += shLocalHistograms[smemTargetHistogramPos + 17]; 276 | shLocalHistograms[smemLocalHistogramPos + 18] += shLocalHistograms[smemTargetHistogramPos + 18]; 277 | shLocalHistograms[smemLocalHistogramPos + 19] += shLocalHistograms[smemTargetHistogramPos + 19]; 278 | shLocalHistograms[smemLocalHistogramPos + 20] += shLocalHistograms[smemTargetHistogramPos + 20]; 279 | shLocalHistograms[smemLocalHistogramPos + 21] += shLocalHistograms[smemTargetHistogramPos + 21]; 280 | shLocalHistograms[smemLocalHistogramPos + 22] += shLocalHistograms[smemTargetHistogramPos + 22]; 281 | shLocalHistograms[smemLocalHistogramPos + 23] += shLocalHistograms[smemTargetHistogramPos + 23]; 282 | shLocalHistograms[smemLocalHistogramPos + 24] += shLocalHistograms[smemTargetHistogramPos + 24]; 283 | shLocalHistograms[smemLocalHistogramPos + 25] += shLocalHistograms[smemTargetHistogramPos + 25]; 284 | shLocalHistograms[smemLocalHistogramPos + 26] += shLocalHistograms[smemTargetHistogramPos + 26]; 285 | shLocalHistograms[smemLocalHistogramPos + 27] += shLocalHistograms[smemTargetHistogramPos + 27]; 286 | shLocalHistograms[smemLocalHistogramPos + 28] += shLocalHistograms[smemTargetHistogramPos + 28]; 287 | shLocalHistograms[smemLocalHistogramPos + 29] += shLocalHistograms[smemTargetHistogramPos + 29]; 288 | shLocalHistograms[smemLocalHistogramPos + 30] += shLocalHistograms[smemTargetHistogramPos + 30]; 289 | shLocalHistograms[smemLocalHistogramPos + 31] += shLocalHistograms[smemTargetHistogramPos + 31]; 290 | shLocalHistograms[smemLocalHistogramPos + 32] += shLocalHistograms[smemTargetHistogramPos + 32]; 291 | shLocalHistograms[smemLocalHistogramPos + 33] += shLocalHistograms[smemTargetHistogramPos + 33]; 292 | shLocalHistograms[smemLocalHistogramPos + 34] += shLocalHistograms[smemTargetHistogramPos + 34]; 293 | shLocalHistograms[smemLocalHistogramPos + 
35] += shLocalHistograms[smemTargetHistogramPos + 35]; 294 | #else 295 | for (i=0; i> 1; s>0; s>>=1) 304 | { 305 | if (columnId < s && (columnId + s) < cellSizeX) 306 | { 307 | smemTargetHistogramPos = (columnId + s + __mul24(cellIdx, cellSizeX)) * histogramSize + __mul24(cellIdy, histogramSize) * __mul24(blockSizeX, cellSizeX); 308 | 309 | #ifdef UNROLL_LOOPS 310 | shLocalHistograms[smemLocalHistogramPos + 0] += shLocalHistograms[smemTargetHistogramPos + 0]; 311 | shLocalHistograms[smemLocalHistogramPos + 1] += shLocalHistograms[smemTargetHistogramPos + 1]; 312 | shLocalHistograms[smemLocalHistogramPos + 2] += shLocalHistograms[smemTargetHistogramPos + 2]; 313 | shLocalHistograms[smemLocalHistogramPos + 3] += shLocalHistograms[smemTargetHistogramPos + 3]; 314 | shLocalHistograms[smemLocalHistogramPos + 4] += shLocalHistograms[smemTargetHistogramPos + 4]; 315 | shLocalHistograms[smemLocalHistogramPos + 5] += shLocalHistograms[smemTargetHistogramPos + 5]; 316 | shLocalHistograms[smemLocalHistogramPos + 6] += shLocalHistograms[smemTargetHistogramPos + 6]; 317 | shLocalHistograms[smemLocalHistogramPos + 7] += shLocalHistograms[smemTargetHistogramPos + 7]; 318 | shLocalHistograms[smemLocalHistogramPos + 8] += shLocalHistograms[smemTargetHistogramPos + 8]; 319 | shLocalHistograms[smemLocalHistogramPos + 9] += shLocalHistograms[smemTargetHistogramPos + 9]; 320 | shLocalHistograms[smemLocalHistogramPos + 10] += shLocalHistograms[smemTargetHistogramPos + 10]; 321 | shLocalHistograms[smemLocalHistogramPos + 11] += shLocalHistograms[smemTargetHistogramPos + 11]; 322 | shLocalHistograms[smemLocalHistogramPos + 12] += shLocalHistograms[smemTargetHistogramPos + 12]; 323 | shLocalHistograms[smemLocalHistogramPos + 13] += shLocalHistograms[smemTargetHistogramPos + 13]; 324 | shLocalHistograms[smemLocalHistogramPos + 14] += shLocalHistograms[smemTargetHistogramPos + 14]; 325 | shLocalHistograms[smemLocalHistogramPos + 15] += shLocalHistograms[smemTargetHistogramPos + 15]; 326 | 
shLocalHistograms[smemLocalHistogramPos + 16] += shLocalHistograms[smemTargetHistogramPos + 16]; 327 | shLocalHistograms[smemLocalHistogramPos + 17] += shLocalHistograms[smemTargetHistogramPos + 17]; 328 | shLocalHistograms[smemLocalHistogramPos + 18] += shLocalHistograms[smemTargetHistogramPos + 18]; 329 | shLocalHistograms[smemLocalHistogramPos + 19] += shLocalHistograms[smemTargetHistogramPos + 19]; 330 | shLocalHistograms[smemLocalHistogramPos + 20] += shLocalHistograms[smemTargetHistogramPos + 20]; 331 | shLocalHistograms[smemLocalHistogramPos + 21] += shLocalHistograms[smemTargetHistogramPos + 21]; 332 | shLocalHistograms[smemLocalHistogramPos + 22] += shLocalHistograms[smemTargetHistogramPos + 22]; 333 | shLocalHistograms[smemLocalHistogramPos + 23] += shLocalHistograms[smemTargetHistogramPos + 23]; 334 | shLocalHistograms[smemLocalHistogramPos + 24] += shLocalHistograms[smemTargetHistogramPos + 24]; 335 | shLocalHistograms[smemLocalHistogramPos + 25] += shLocalHistograms[smemTargetHistogramPos + 25]; 336 | shLocalHistograms[smemLocalHistogramPos + 26] += shLocalHistograms[smemTargetHistogramPos + 26]; 337 | shLocalHistograms[smemLocalHistogramPos + 27] += shLocalHistograms[smemTargetHistogramPos + 27]; 338 | shLocalHistograms[smemLocalHistogramPos + 28] += shLocalHistograms[smemTargetHistogramPos + 28]; 339 | shLocalHistograms[smemLocalHistogramPos + 29] += shLocalHistograms[smemTargetHistogramPos + 29]; 340 | shLocalHistograms[smemLocalHistogramPos + 30] += shLocalHistograms[smemTargetHistogramPos + 30]; 341 | shLocalHistograms[smemLocalHistogramPos + 31] += shLocalHistograms[smemTargetHistogramPos + 31]; 342 | shLocalHistograms[smemLocalHistogramPos + 32] += shLocalHistograms[smemTargetHistogramPos + 32]; 343 | shLocalHistograms[smemLocalHistogramPos + 33] += shLocalHistograms[smemTargetHistogramPos + 33]; 344 | shLocalHistograms[smemLocalHistogramPos + 34] += shLocalHistograms[smemTargetHistogramPos + 34]; 345 | shLocalHistograms[smemLocalHistogramPos + 
// Sums the values every thread of the CUDA block has stored in shared memory.
//
// The reduction is done one thread-block dimension at a time (z, then y, then x),
// halving the stride at each step. The alignedBlockDim* arguments give the starting
// stride (the caller passes power-of-two roundings of the block dimensions); the
// "(threadIdx + s) < blockDim" test skips partners that do not exist when the real
// dimension is smaller than the aligned one.
//
// On return the total is in shLocalHistogram[0]. Must be called by every thread of
// the block because it contains __syncthreads().
static __device__ void sumSharedBlockHistogram(float* shLocalHistogram, int smemLocalHistogramPos, int noHistogramBins,
											   int alignedBlockDimX, int alignedBlockDimY, int alignedBlockDimZ)
{
	int smemTargetHistogramPos;

	for (unsigned int s = alignedBlockDimZ >> 1; s > 0; s >>= 1)
	{
		if (threadIdx.z < s && (threadIdx.z + s) < blockDim.z)
		{
			smemTargetHistogramPos = __mul24(threadIdx.y, noHistogramBins) + __mul24((threadIdx.z + s), blockDim.x) * blockDim.y + threadIdx.x;
			shLocalHistogram[smemLocalHistogramPos] += shLocalHistogram[smemTargetHistogramPos];
		}

		__syncthreads();
	}

	for (unsigned int s = alignedBlockDimY >> 1; s > 0; s >>= 1)
	{
		if (threadIdx.y < s && (threadIdx.y + s) < blockDim.y)
		{
			smemTargetHistogramPos = __mul24((threadIdx.y + s), noHistogramBins) + __mul24(threadIdx.z, blockDim.x) * blockDim.y + threadIdx.x;
			shLocalHistogram[smemLocalHistogramPos] += shLocalHistogram[smemTargetHistogramPos];
		}

		__syncthreads();
	}

	for (unsigned int s = alignedBlockDimX >> 1; s > 0; s >>= 1)
	{
		if (threadIdx.x < s && (threadIdx.x + s) < blockDim.x)
		{
			smemTargetHistogramPos = __mul24(threadIdx.y, noHistogramBins) + __mul24(threadIdx.z, blockDim.x) * blockDim.y + (threadIdx.x + s);
			shLocalHistogram[smemLocalHistogramPos] += shLocalHistogram[smemTargetHistogramPos];
		}

		__syncthreads();
	}
}

// Normalises every HOG block histogram in place.
//
// One CUDA block handles one HOG block; one thread handles one histogram value
// (threadIdx.x = bin, threadIdx.y / threadIdx.z = cell inside the HOG block).
// Per HOG block the kernel
//   1. divides each value by  sqrt(sum of squares) + noHistogramBins * blockSizeX * blockSizeY,
//   2. clips the result at 0.2,
//   3. divides again by       sqrt(sum of squares of the clipped values) + 0.01.
// This is the normalise / clip / renormalise pattern of the L2-Hys block
// normalisation used for HOG descriptors; 0.2 is its clipping threshold.
//
// The result is written back with the roles of threadIdx.y and threadIdx.z swapped
// relative to the position the value was read from.
//
// Uses the dynamically sized shared array allShared; the launch must provide at
// least one float per thread. rNoOfHOGBlocksX/Y, width and height are not used by
// the kernel body and are kept only for interface compatibility.
__global__ void normalizeBlockHistograms(float1 *blockHistograms, int noHistogramBins,
										 int rNoOfHOGBlocksX, int rNoOfHOGBlocksY,
										 int blockSizeX, int blockSizeY,
										 int alignedBlockDimX, int alignedBlockDimY, int alignedBlockDimZ,
										 int width, int height)
{
	int smemLocalHistogramPos, gmemPosBlock, gmemWritePosBlock;

	float* shLocalHistogram = (float*)allShared;

	float localValue, norm1, norm2; float eps2 = 0.01f;

	smemLocalHistogramPos = __mul24(threadIdx.y, noHistogramBins) + __mul24(threadIdx.z, blockDim.x) * blockDim.y + threadIdx.x;
	gmemPosBlock = __mul24(threadIdx.y, noHistogramBins) + __mul24(threadIdx.z, gridDim.x) * __mul24(blockDim.y, blockDim.x) +
		threadIdx.x + __mul24(blockIdx.x, noHistogramBins) * blockDim.y + __mul24(blockIdx.y, gridDim.x) * __mul24(blockDim.y, blockDim.x) * blockDim.z;
	gmemWritePosBlock = __mul24(threadIdx.z, noHistogramBins) + __mul24(threadIdx.y, gridDim.x) * __mul24(blockDim.y, blockDim.x) +
		threadIdx.x + __mul24(blockIdx.x, noHistogramBins) * blockDim.y + __mul24(blockIdx.y, gridDim.x) * __mul24(blockDim.y, blockDim.x) * blockDim.z;

	// First pass: sum of squares of the raw values.
	localValue = blockHistograms[gmemPosBlock].x;
	shLocalHistogram[smemLocalHistogramPos] = localValue * localValue;

	__syncthreads();

	sumSharedBlockHistogram(shLocalHistogram, smemLocalHistogramPos, noHistogramBins,
		alignedBlockDimX, alignedBlockDimY, alignedBlockDimZ);

	norm1 = sqrtf(shLocalHistogram[0]) + __mul24(noHistogramBins, blockSizeX) * blockSizeY;
	localValue /= norm1;

	// Clip large values before renormalising (L2-Hys threshold).
	localValue = fminf(0.2f, localValue);

	// Every thread must have read shLocalHistogram[0] before it is overwritten below.
	__syncthreads();

	// Second pass: sum of squares of the clipped values.
	shLocalHistogram[smemLocalHistogramPos] = localValue * localValue;

	__syncthreads();

	sumSharedBlockHistogram(shLocalHistogram, smemLocalHistogramPos, noHistogramBins,
		alignedBlockDimX, alignedBlockDimY, alignedBlockDimZ);

	norm2 = sqrtf(shLocalHistogram[0]) + eps2;
	localValue /= norm2;

	blockHistograms[gmemWritePosBlock].x = localValue;
}
%laneid,%ctaid,allShared,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z31computeBlockHistogramsWithGaussP6float2P6float1iiiiiiiii,_Z24normalizeBlockHistogramsP6float1iiiiiiiiii -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGImage.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * HOGImage.cpp 3 | * 4 | * Created on: May 14, 2009 5 | * Author: viprad 6 | */ 7 | 8 | #include "HOGImage.h" 9 | 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | using namespace HOG; 18 | 19 | HOGImage::HOGImage(int width, int height) 20 | { 21 | this->width = width; 22 | this->height = height; 23 | 24 | isLoaded = false; 25 | this->pixels = (unsigned char*) malloc(sizeof(unsigned char) * 4 * width * height); 26 | memset(this->pixels, 0, sizeof(unsigned char) * 4 * width * height); 27 | } 28 | 29 | HOGImage::HOGImage(int width, int height, unsigned char* pixels) 30 | { 31 | this->width = width; 32 | this->height = height; 33 | 34 | this->pixels = (unsigned char*) malloc(sizeof(unsigned char) * 4 * width * height); 35 | memcpy(this->pixels, pixels, sizeof(unsigned char) * 4 * width * height); 36 | 37 | isLoaded = true; 38 | } 39 | 40 | HOGImage::HOGImage(char* fileName) 41 | { 42 | bool bLoaded = false; 43 | int bpp; 44 | FIBITMAP *bmp = 0; 45 | FREE_IMAGE_FORMAT fif = FIF_UNKNOWN; 46 | fif = FreeImage_GetFileType(fileName); 47 | if (fif == FIF_UNKNOWN) 48 | { 49 | fif = FreeImage_GetFIFFromFilename(fileName); 50 | } 51 | 52 | if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsReading(fif)) 53 | { 54 | bmp = FreeImage_Load(fif, fileName, 0); 55 | bLoaded = true; 56 | if (bmp == NULL) 57 | bLoaded = false; 58 | } 59 | 60 | if (bLoaded) 61 | { 62 | width = FreeImage_GetWidth(bmp); 63 | height = FreeImage_GetHeight(bmp); 64 | 65 | bpp = FreeImage_GetBPP(bmp); 66 | switch (bpp) 67 | { 68 | case 32: 69 | break; 70 | 
default: 71 | FIBITMAP *bmpTemp = FreeImage_ConvertTo32Bits(bmp); 72 | if (bmp != NULL) FreeImage_Unload(bmp); 73 | bmp = bmpTemp; 74 | bpp = FreeImage_GetBPP(bmp); 75 | break; 76 | } 77 | 78 | this->pixels = (unsigned char*) malloc(sizeof(unsigned char) * 4 * width * height); 79 | FreeImage_ConvertToRawBits(this->pixels, bmp, width * 4, bpp, FI_RGBA_RED_MASK, FI_RGBA_GREEN_MASK, FI_RGBA_BLUE_MASK, true); 80 | 81 | isLoaded = true; 82 | } 83 | else 84 | isLoaded = false; 85 | } 86 | 87 | HOGImage::~HOGImage() 88 | { 89 | free(pixels); 90 | } 91 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGImage.h: -------------------------------------------------------------------------------- 1 | /* 2 | * HOGImage.h 3 | * 4 | * Created on: May 14, 2009 5 | * Author: viprad 6 | */ 7 | 8 | #ifndef __HOGIMAGE_H__ 9 | #define __HOGIMAGE_H__ 10 | 11 | namespace HOG 12 | { 13 | class HOGImage 14 | { 15 | public: 16 | //must me uchar4 17 | bool isLoaded; 18 | 19 | int width, height; 20 | unsigned char* pixels; 21 | 22 | HOGImage(char* fileName); 23 | HOGImage(int width, int height); 24 | HOGImage(int width, int height, unsigned char *pixels); 25 | 26 | virtual ~HOGImage(); 27 | }; 28 | } 29 | 30 | #endif /* HOGIMAGE_H_ */ 31 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGNMS.cpp: -------------------------------------------------------------------------------- 1 | #include "HOGNMS.h" 2 | 3 | #include 4 | 5 | using namespace HOG; 6 | 7 | HOGNMS::HOGNMS() 8 | { 9 | center = 0.0f; scale = 1.0f; 10 | nonmaxSigma[0] = 8.0f; nonmaxSigma[1] = 16.0f; nonmaxSigma[2] = 1.3f; 11 | maxIterations = 100; 12 | modeEpsilon = (float)1e-5; 13 | epsFinalDist = 1.0f; 14 | 15 | nsigma[0] = nonmaxSigma[0]; nsigma[1] = nonmaxSigma[1]; nsigma[2] = logf(nonmaxSigma[2]); 16 | 17 | isAllocated = false; 18 | } 19 | 20 | HOGNMS::~HOGNMS() 21 | { 22 | if (isAllocated) 23 | { 24 | delete tomode; 25 | 
delete wt; 26 | delete ms; 27 | delete at; 28 | delete nmsResults; 29 | delete nmsToMode; 30 | } 31 | } 32 | 33 | void HOGNMS::nvalue(HOGPoint3* ms, HOGPoint3* at, float* wt, int length) 34 | { 35 | int i, j; 36 | float dotxmr, w; 37 | HOGPoint3 x, r, ns, numer, denum; 38 | 39 | for (i=0; ix / ns.x; r.y = ms->y / ns.y; r.z = ms->z / ns.z; 72 | 73 | dotxmr = (x.x - r.x) * (x.x - r.x) + (x.y - r.y) * (x.y - r.y) + (x.z - r.z) * (x.z - r.z); 74 | w = wt[j] * expf(-dotxmr/2.0f)/sqrtf(ns.x * ns.y * ns.z); 75 | 76 | numer.x += w * x.x; numer.y += w * x.y; numer.z += w * x.z; 77 | denum.x += w / ns.x; denum.y += w / ns.y; denum.z += w / ns.z; 78 | } 79 | 80 | msnext->x = numer.x / denum.x; msnext->y = numer.y / denum.y; msnext->z = numer.z / denum.z; 81 | } 82 | 83 | void HOGNMS::fvalue(HOGPoint3* modes, HOGResult* results, int lengthModes, HOGPoint3* at, float* wt, int length) 84 | { 85 | int i, j; 86 | float no, dotxx; 87 | HOGPoint3 x, ns; 88 | for (i=0; iz); ns.y = nsigma[1] * expf(p2->z); ns.z = nsigma[2]; 110 | b.x = p2->x - p1->x; b.y = p2->y - p1->y; b.z = p2->z - p1->z; 111 | b.x /= ns.x; b.y /= ns.y; b.z /= ns.z; 112 | return b.x * b.x + b.y * b.y + b.z * b.z; 113 | } 114 | 115 | void HOGNMS::shiftToMode(HOGPoint3* ms, HOGPoint3* at, float* wt, HOGPoint3 *tomode, int length) 116 | { 117 | int i, count; 118 | HOGPoint3 ii,II; 119 | for (i=0; i modeEpsilon ); 130 | 131 | tomode[i].x = II.x; tomode[i].y = II.y; tomode[i].z = II.z; 132 | } 133 | } 134 | 135 | HOGResult* HOGNMS::ComputeNMSResults(HOGResult* formattedResults, int formattedResultsCount, bool *nmsResultsAvailable, int *nmsResultsCount, 136 | int hWindowSizeX, int hWindowSizeY) 137 | { 138 | if (!isAllocated) 139 | { 140 | wt = new float[hWindowSizeX * hWindowSizeX]; 141 | at = new HOGPoint3[hWindowSizeX * hWindowSizeX]; 142 | ms = new HOGPoint3[hWindowSizeX * hWindowSizeX]; 143 | tomode = new HOGPoint3[hWindowSizeX * hWindowSizeX]; 144 | nmsToMode = new HOGPoint3[hWindowSizeX * hWindowSizeX]; 145 | 
nmsResults = new HOGResult[hWindowSizeX * hWindowSizeX]; 146 | isAllocated = true; 147 | } 148 | 149 | int i, j; 150 | float cenx, ceny, nmsOK; 151 | 152 | *nmsResultsCount = 0; 153 | nmsResultsAvailable = false; 154 | 155 | for (i=0; isigmoid(formattedResults[i].score); 158 | cenx = formattedResults[i].x + formattedResults[i].width / 2.0f; 159 | ceny = formattedResults[i].y + formattedResults[i].height / 2.0f; 160 | at[i] = HOGPoint3(cenx, ceny, logf(formattedResults[i].scale)); 161 | } 162 | 163 | nvalue(ms, at, wt, formattedResultsCount); 164 | shiftToMode(ms, at, wt, tomode, formattedResultsCount); 165 | 166 | for (i=0; i center) ? scale * (score - center) : 0.0f; } 29 | void nvalue(HOGPoint3* ms, HOGPoint3* at, float* wt, int length); 30 | void nvalue(HOGPoint3* ms, HOGPoint3* msnext, HOGPoint3* at, float* wt, int length); 31 | void fvalue(HOGPoint3* modes, HOGResult* results, int lengthModes, HOGPoint3* at, float* wt, int length); 32 | void shiftToMode(HOGPoint3* ms, HOGPoint3* at, float* wt, HOGPoint3 *tomode, int length); 33 | float distqt(HOGPoint3 *p1, HOGPoint3 *p2); 34 | 35 | public: 36 | HOGResult* ComputeNMSResults(HOGResult* formattedResults, int formattedResultsCount, bool *nmsResultsAvailable, int *nmsResultsCount, 37 | int hWindowSizeX, int hWindowSizeY); 38 | 39 | HOGNMS(); 40 | ~HOGNMS(void); 41 | }; 42 | } 43 | #endif 44 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGPadding.cu: -------------------------------------------------------------------------------- 1 | #include "HOGPadding.h" 2 | #include "HOGUtils.h" 3 | #include "cutil.h" 4 | 5 | extern int hWidthROI, hHeightROI; 6 | extern int hPaddedWidth, hPaddedHeight; 7 | extern int hWidth, hHeight; 8 | extern int hPaddingSizeX, hPaddingSizeY; 9 | 10 | extern int avSizeX, avSizeY, marginX, marginY; 11 | 12 | uchar4* paddedRegisteredImageU4; 13 | 14 | __host__ void InitPadding(int hPaddedWidth, int hPaddedHeight) 15 | { 16 | 
cutilSafeCall(cudaMalloc((void**) &paddedRegisteredImageU4, sizeof(uchar4) * hPaddedWidth * hPaddedHeight)); 17 | } 18 | 19 | __host__ void ClosePadding() 20 | { 21 | cutilSafeCall(cudaFree(paddedRegisteredImageU4)); 22 | } 23 | 24 | __host__ void PadHostImage(uchar4* registeredImage, float4 *paddedRegisteredImage, 25 | int minx, int miny, int maxx, int maxy) 26 | { 27 | hWidthROI = maxx - minx; 28 | hHeightROI = maxy - miny; 29 | 30 | int toaddxx = 0, toaddxy = 0, toaddyx = 0, toaddyy = 0; 31 | 32 | if (avSizeX) { toaddxx = hWidthROI * marginX / avSizeX; toaddxy = hHeightROI * marginY / avSizeX; } 33 | if (avSizeY) { toaddyx = hWidthROI * marginX / avSizeY; toaddyy = hHeightROI * marginY / avSizeY; } 34 | 35 | hPaddingSizeX = max(toaddxx, toaddyx); hPaddingSizeY = max(toaddxy, toaddyy); 36 | 37 | hPaddedWidth = hWidthROI + hPaddingSizeX*2; 38 | hPaddedHeight = hHeightROI + hPaddingSizeY*2; 39 | 40 | cutilSafeCall(cudaMemset(paddedRegisteredImageU4, 0, sizeof(uchar4) * hPaddedWidth * hPaddedHeight)); 41 | 42 | cutilSafeCall(cudaMemcpy2D(paddedRegisteredImageU4 + hPaddingSizeX + hPaddingSizeY * hPaddedWidth, 43 | hPaddedWidth * sizeof(uchar4), registeredImage + minx + miny * hWidth, 44 | hWidth * sizeof(uchar4), hWidthROI * sizeof(uchar4), 45 | hHeightROI, cudaMemcpyHostToDevice)); 46 | 47 | Uchar4ToFloat4(paddedRegisteredImageU4, paddedRegisteredImage, hPaddedWidth, hPaddedHeight); 48 | } 49 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGPadding.h: -------------------------------------------------------------------------------- 1 | #ifndef __HOG_PADDING__ 2 | #define __HOG_PADDING__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef _WIN32 9 | # define WINDOWS_LEAN_AND_MEAN 10 | # include 11 | #endif 12 | 13 | #include 14 | #include 15 | #include "HOGDefines.h" 16 | 17 | __host__ void InitPadding(int hPaddedWidth, int hPaddedHeight); 18 | __host__ void ClosePadding(); 19 | 20 | __host__ void 
namespace HOG
{
    // One detection record: a classifier score, the scale it was found at and
    // the window rectangle, stored both as x/y and as origX/origY.
    class HOGResult
    {
    public:
        float score;
        float scale;

        int width, height;
        int origX, origY;
        int x, y;

        // Value-initialises every member. The original constructor left
        // `score` and `scale` indeterminate, so reading a default-constructed
        // result was undefined behaviour.
        // `scale` defaults to 1.0f (identity) rather than 0 so that code taking
        // its logarithm does not produce -inf for an unfilled record.
        HOGResult()
            : score(0.0f), scale(1.0f),
              width(0), height(0),
              origX(0), origY(0),
              x(0), y(0)
        {
        }
    };
}
11 | extern int hNumberOfWindowsX, hNumberOfWindowsY; 12 | extern int hNumberOfBlockPerWindowX, hNumberOfBlockPerWindowY; 13 | extern int rNumberOfWindowsX, rNumberOfWindowsY; 14 | 15 | extern __shared__ float1 allSharedF1[]; 16 | 17 | float svmBias; 18 | 19 | __host__ void InitSVM(float _svmBias, float* svmWeights, int svmWeightsCount) 20 | { 21 | channelDescSVM = cudaCreateChannelDesc(); 22 | cutilSafeCall(cudaMallocArray(&svmArray, &channelDescSVM, svmWeightsCount, 1)); 23 | cutilSafeCall(cudaMemcpyToArray(svmArray, 0, 0, svmWeights, svmWeightsCount * sizeof(float), cudaMemcpyHostToDevice)); 24 | svmBias = _svmBias; 25 | } 26 | 27 | __host__ void CloseSVM() 28 | { 29 | cutilSafeCall(cudaFreeArray(svmArray)); 30 | } 31 | 32 | __global__ void linearSVMEvaluation(float1* svmScores, float svmBias, 33 | float1* blockHistograms, int noHistogramBins, 34 | int windowSizeX, int windowSizeY, int hogBlockCountX, int hogBlockCountY, 35 | int cellSizeX, int cellSizeY, 36 | int numberOfBlockPerWindowX, int numberOfBlockPerWindowY, 37 | int blockSizeX, int blockSizeY, 38 | int alignedBlockDimX, 39 | int scaleId, int scaleCount, 40 | int hNumberOfWindowsX, int hNumberOfWindowsY, 41 | int width, int height) 42 | { 43 | int i; 44 | int texPos; 45 | float1 localValue; 46 | float texValue; 47 | 48 | float1* smem = (float1*) allSharedF1; 49 | 50 | int gmemPosWindow, gmemPosInWindow, gmemPosInWindowDown, smemLocalPos, smemTargetPos; 51 | int gmemStride = hogBlockCountX * noHistogramBins * blockSizeX; 52 | 53 | gmemPosWindow = blockIdx.x * noHistogramBins * blockSizeX + blockIdx.y * blockSizeY * gmemStride; 54 | gmemPosInWindow = gmemPosWindow + threadIdx.x; 55 | smemLocalPos = threadIdx.x; 56 | 57 | int val1 = (blockSizeY * blockSizeX * noHistogramBins) * numberOfBlockPerWindowY; 58 | int val2 = blockSizeX * noHistogramBins; 59 | localValue.x = 0; 60 | 61 | if (blockIdx.x == 10 && blockIdx.y == 8) 62 | { 63 | int asasasa; 64 | asasasa = 0; 65 | asasasa++; 66 | } 67 | 68 | for (i = 0; 
i> 1; s>0; s>>=1) 81 | { 82 | if (threadIdx.x < s && (threadIdx.x + s) < blockDim.x) 83 | { 84 | smemTargetPos = threadIdx.x + s; 85 | smem[smemLocalPos].x += smem[smemTargetPos].x; 86 | } 87 | 88 | __syncthreads(); 89 | } 90 | 91 | if (threadIdx.x == 0) 92 | { 93 | smem[smemLocalPos].x -= svmBias; 94 | svmScores[blockIdx.x + blockIdx.y * hNumberOfWindowsX + scaleId * hNumberOfWindowsX * hNumberOfWindowsY] = smem[smemLocalPos]; 95 | } 96 | 97 | if (blockIdx.x == 10 && blockIdx.y == 8) 98 | { 99 | int asasasa; 100 | asasasa = 0; 101 | asasasa++; 102 | } 103 | } 104 | 105 | __host__ void ResetSVMScores(float1* svmScores) 106 | { 107 | cutilSafeCall(cudaMemset(svmScores, 0, sizeof(float) * scaleCount * hNumberOfWindowsX * hNumberOfWindowsY)); 108 | } 109 | 110 | __host__ void LinearSVMEvaluation(float1* svmScores, float1* blockHistograms, int noHistogramBins, 111 | int windowSizeX, int windowSizeY, 112 | int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY, 113 | int hogBlockCountX, int hogBlockCountY, 114 | int scaleId, int width, int height) 115 | { 116 | rNumberOfWindowsX = (width-windowSizeX)/cellSizeX + 1; 117 | rNumberOfWindowsY = (height-windowSizeY)/cellSizeY + 1; 118 | 119 | dim3 threadCount = dim3(noHistogramBins * blockSizeX * hNumberOfBlockPerWindowX); 120 | dim3 blockCount = dim3(rNumberOfWindowsX, rNumberOfWindowsY); 121 | 122 | int alignedBlockDimX = iClosestPowerOfTwo(noHistogramBins * blockSizeX * hNumberOfBlockPerWindowX); 123 | 124 | cutilSafeCall(cudaBindTextureToArray(texSVM, svmArray, channelDescSVM)); 125 | 126 | linearSVMEvaluation<<>> 127 | (svmScores, svmBias, blockHistograms, noHistogramBins, 128 | windowSizeX, windowSizeY, hogBlockCountX, hogBlockCountY, cellSizeX, cellSizeY, 129 | hNumberOfBlockPerWindowX, hNumberOfBlockPerWindowY, 130 | blockSizeX, blockSizeY, alignedBlockDimX, scaleId, scaleCount, 131 | hNumberOfWindowsX, hNumberOfWindowsY, width, height); 132 | 133 | cutilSafeCall(cudaUnbindTexture(texSVM)); 134 | } 135 | 
-------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGSVMSlider.h: -------------------------------------------------------------------------------- 1 | #ifndef __HOG_SVM_SLIDER__ 2 | #define __HOG_SVM_SLIDER__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef _WIN32 9 | # define WINDOWS_LEAN_AND_MEAN 10 | # include 11 | #endif 12 | 13 | #include 14 | #include 15 | 16 | #include "HOGDefines.h" 17 | 18 | __host__ void InitSVM(float svmBias, float* svmWeights, int svmWeightsCount); 19 | __host__ void CloseSVM(); 20 | 21 | __global__ void linearSVMEvaluation(float1* svmScores, float svmBias, 22 | float1* blockHistograms, int noHistogramBins, 23 | int windowSizeX, int windowSizeY, int hogBlockCountX, int hogBlockCountY, 24 | int cellSizeX, int cellSizeY, 25 | int numberOfBlockPerWindowX, int numberOfBlockPerWindowY, 26 | int blockSizeX, int blockSizeY, 27 | int alignedBlockDimX, 28 | int scaleId, int scaleCount, 29 | int hNumberOfWindowsX, int hNumberOfWindowsY, 30 | int width, int height); 31 | 32 | __host__ void ResetSVMScores(float1* svmScores); 33 | __host__ void LinearSVMEvaluation(float1* svmScores, float1* blockHistograms, int noHistogramBins, 34 | int windowSizeX, int windowSizeY, 35 | int cellSizeX, int cellSizeY, int blockSizeX, int blockSizeY, 36 | int hogBlockCountX, int hogBlockCountY, 37 | int scaleId, int width, int height); 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGSVMSlider.linkinfo: -------------------------------------------------------------------------------- 1 | --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,allSharedF1,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z19linearSVMEvaluationP6float1fS0_iiiiiiiiiiiiiiiiii -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGScale.cu: 
-------------------------------------------------------------------------------- 1 | #include "HOGScale.h" 2 | #include "HOGUtils.h" 3 | #include "cutil.h" 4 | 5 | extern int rPaddedHeight; 6 | extern int rPaddedWidth; 7 | extern int hPaddedHeight; 8 | extern int hPaddedWidth; 9 | cudaArray *imageArray = 0; 10 | texture tex; 11 | cudaChannelFormatDesc channelDescDownscale; 12 | 13 | bool isAlocated; 14 | 15 | // w0, w1, w2, and w3 are the four cubic B-spline basis functions 16 | __device__ float w0(float a) { return (1.0f/6.0f)*(a*(a*(-a + 3.0f) - 3.0f) + 1.0f); } 17 | __device__ float w1(float a) { return (1.0f/6.0f)*(a*a*(3.0f*a - 6.0f) + 4.0f); } 18 | __device__ float w2(float a) { return (1.0f/6.0f)*(a*(a*(-3.0f*a + 3.0f) + 3.0f) + 1.0f); } 19 | __device__ float w3(float a) { return (1.0f/6.0f)*(a*a*a); } 20 | 21 | // g0 and g1 are the two amplitude functions 22 | __device__ float g0(float a) { return w0(a) + w1(a); } 23 | __device__ float g1(float a) { return w2(a) + w3(a); } 24 | 25 | // h0 and h1 are the two offset functions 26 | __device__ float h0(float a) { return -1.0f + w1(a) / (w0(a) + w1(a)) + 0.5f; } 27 | __device__ float h1(float a) { return 1.0f + w3(a) / (w2(a) + w3(a)) + 0.5f; } 28 | 29 | __host__ void InitScale(int hPaddedWidth, int hPaddedHeight) 30 | { 31 | channelDescDownscale = cudaCreateChannelDesc(); 32 | tex.filterMode = cudaFilterModeLinear; 33 | tex.normalized = false; 34 | isAlocated = false; 35 | } 36 | 37 | __host__ void CloseScale() 38 | { 39 | //if (isAlocated) cutilSafeCall(cudaFreeArray(imageArray)); 40 | } 41 | 42 | __host__ void DownscaleImage(int startScaleId, int endScaleId, int scaleId, float scale, 43 | bool useGrayscale, float4* paddedRegisteredImage, 44 | float1* resizedPaddedImageF1, float4* resizedPaddedImageF4) 45 | { 46 | dim3 hThreadSize, hBlockSize; 47 | 48 | hThreadSize = dim3(THREAD_SIZE_W, THREAD_SIZE_H); 49 | 50 | rPaddedWidth = iDivUpF(hPaddedWidth, scale); 51 | rPaddedHeight = iDivUpF(hPaddedHeight, scale); 52 
| 53 | hBlockSize = dim3(iDivUp(rPaddedWidth, hThreadSize.x), iDivUp(rPaddedHeight, hThreadSize.y)); 54 | 55 | if (scaleId == startScaleId) 56 | { 57 | if (isAlocated) 58 | cutilSafeCall(cudaFreeArray(imageArray)); 59 | cutilSafeCall(cudaMallocArray(&imageArray, &channelDescDownscale, hPaddedWidth, hPaddedHeight) ); 60 | cutilSafeCall(cudaMemcpyToArray(imageArray, 0, 0, paddedRegisteredImage, sizeof(float4) * hPaddedWidth * hPaddedHeight, cudaMemcpyDeviceToDevice)); 61 | isAlocated = true; 62 | } 63 | 64 | cutilSafeCall(cudaBindTextureToArray(tex, imageArray, channelDescDownscale)); 65 | 66 | if (useGrayscale) 67 | { 68 | cutilSafeCall(cudaMemset(resizedPaddedImageF1, 0, hPaddedWidth * hPaddedHeight * sizeof(float1))); 69 | resizeFastBicubic1<<>>(resizedPaddedImageF1, paddedRegisteredImage, rPaddedWidth, rPaddedHeight, scale); 70 | } 71 | else 72 | { 73 | cutilSafeCall(cudaMemset(resizedPaddedImageF4, 0, hPaddedWidth * hPaddedHeight * sizeof(float4))); 74 | resizeFastBicubic4<<>>(resizedPaddedImageF4, paddedRegisteredImage, rPaddedWidth, rPaddedHeight, scale); 75 | } 76 | 77 | cutilSafeCall(cudaUnbindTexture(tex)); 78 | 79 | if (scaleId == endScaleId) 80 | { 81 | cutilSafeCall(cudaFreeArray(imageArray)); 82 | isAlocated = false; 83 | } 84 | } 85 | 86 | __device__ float4 tex2DFastBicubic(const texture texref, float x, float y) 87 | { 88 | float4 r; 89 | float4 val0, val1, val2, val3; 90 | 91 | x -= 0.5f; 92 | y -= 0.5f; 93 | float px = floor(x); 94 | float py = floor(y); 95 | float fx = x - px; 96 | float fy = y - py; 97 | 98 | float g0x = g0(fx); 99 | float g1x = g1(fx); 100 | float h0x = h0(fx); 101 | float h1x = h1(fx); 102 | float h0y = h0(fy); 103 | float h1y = h1(fy); 104 | 105 | val0 = tex2D(texref, px + h0x, py + h0y); 106 | val1 = tex2D(texref, px + h1x, py + h0y); 107 | val2 = tex2D(texref, px + h0x, py + h1y); 108 | val3 = tex2D(texref, px + h1x, py + h1y); 109 | 110 | r.x = (g0(fy) * (g0x * val0.x + g1x * val1.x) + g1(fy) * (g0x * val2.x + g1x * 
val3.x)); 111 | r.y = (g0(fy) * (g0x * val0.y + g1x * val1.y) + g1(fy) * (g0x * val2.y + g1x * val3.y)); 112 | r.z = (g0(fy) * (g0x * val0.z + g1x * val1.z) + g1(fy) * (g0x * val2.z + g1x * val3.z)); 113 | r.w = (g0(fy) * (g0x * val0.w + g1x * val1.w) + g1(fy) * (g0x * val2.w + g1x * val3.w)); 114 | 115 | return r; 116 | } 117 | 118 | __global__ void resizeFastBicubic4(float4 *outputFloat, float4* paddedRegisteredImage, int width, int height, float scale) 119 | { 120 | int x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; 121 | int y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; 122 | int i = __umul24(y, width) + x; 123 | 124 | float u = x*scale; 125 | float v = y*scale; 126 | 127 | if (x < width && y < height) 128 | { 129 | float4 cF; 130 | 131 | if (scale == 1.0f) 132 | { 133 | cF = paddedRegisteredImage[x + y * width]; 134 | cF.w = 0; 135 | } 136 | else 137 | { 138 | cF = tex2D(tex, u, v); 139 | cF.w = 0; 140 | } 141 | 142 | cF.x = sqrtf(cF.x); cF.y = sqrtf(cF.y); cF.z = sqrtf(cF.z); cF.w = 0; 143 | outputFloat[i] = cF; 144 | } 145 | } 146 | 147 | __global__ void resizeFastBicubic1(float1 *outputFloat, float4* paddedRegisteredImage, int width, int height, float scale) 148 | { 149 | int x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; 150 | int y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; 151 | int i = __umul24(y, width) + x; 152 | 153 | float u = x*scale; 154 | float v = y*scale; 155 | 156 | if (x < width && y < height) 157 | { 158 | float4 cF; 159 | 160 | if (scale == 1.0f) 161 | { 162 | cF = paddedRegisteredImage[x + y * width]; 163 | cF.w = 0; 164 | } 165 | else 166 | { 167 | cF = tex2D(tex, u, v); 168 | cF.w = 0; 169 | } 170 | 171 | outputFloat[i].x = sqrtf(0.2989f * cF.x + 0.5870f * cF.y + 0.1140f * cF.z); 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGScale.h: -------------------------------------------------------------------------------- 1 | #ifndef 
__HOG_SCALE__ 2 | #define __HOG_SCALE__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef _WIN32 9 | # define WINDOWS_LEAN_AND_MEAN 10 | # include 11 | #endif 12 | 13 | #include 14 | #include 15 | 16 | #include "HOGDefines.h" 17 | 18 | __host__ void InitScale(int hPaddedWidth, int hPaddedHeight); 19 | __host__ void CloseScale(); 20 | 21 | __host__ void DownscaleImage(int startScaleId, int endScaleId, int scaleId, float scale, 22 | bool useGrayscale, float4* paddedRegisteredImage, 23 | float1* resizedPaddedImageF1, float4* resizedPaddedImageF4); 24 | 25 | __global__ void resizeFastBicubic1(float1 *outputFloat, float4* paddedRegisteredImage, int width, int height, float scale); 26 | __global__ void resizeFastBicubic4(float4 *outputFloat, float4* paddedRegisteredImage, int width, int height, float scale); 27 | 28 | //__device__ float4 tex2DFastBicubic(const texture texref, float x, float y, float scale); 29 | // 30 | //__device__ float w0(float a); 31 | //__device__ float w1(float a); 32 | //__device__ float w2(float a); 33 | //__device__ float w3(float a); 34 | // 35 | //__device__ float g0(float a); 36 | //__device__ float g1(float a); 37 | // 38 | //__device__ float h0(float a); 39 | //__device__ float h1(float a); 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGScale.linkinfo: -------------------------------------------------------------------------------- 1 | --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z18resizeFastBicubic1P6float1P6float4iif,_Z18resizeFastBicubic4P6float4S0_iif -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGUtils.cu: -------------------------------------------------------------------------------- 1 | #include "HOGUtils.h" 2 | 3 | //Round a / b to nearest higher integer value 4 | __host__ int iDivUp(int a, int b) { return (a % b != 
0) ? (a / b + 1) : (a / b); } 5 | 6 | //Round a / b to nearest lower integer value 7 | __host__ int iDivDown(int a, int b) { return a / b; } 8 | 9 | //Align a to nearest higher multiple of b 10 | __host__ int iAlignUp(int a, int b) { return (a % b != 0) ? (a - a % b + b) : a; } 11 | 12 | //Align a to nearest lower multiple of b 13 | __host__ int iAlignDown(int a, int b) {return a - a % b; } 14 | 15 | //Round a / b to nearest higher integer value 16 | __host__ int iDivUpF(int a, float b) { return (a % int(b) != 0) ? int(a / b + 1) : int(a / b);} 17 | 18 | __host__ int iClosestPowerOfTwo(int x) { x--; x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; x++; return x; } 19 | 20 | __host__ void Uchar4ToFloat4(uchar4 *inputImage, float4 *outputImage, int width, int height) 21 | { 22 | dim3 threads_in_block(16,16); 23 | dim3 blocks(iDivUp(width,16), iDivUp(height,16)); 24 | uchar4tofloat4<<>>(inputImage, outputImage, width, height); 25 | } 26 | __host__ void Float4ToUchar4(float4 *inputImage, uchar4 *outputImage, int width, int height) 27 | { 28 | dim3 threads_in_block(16,16); 29 | dim3 blocks(iDivUp(width,16), iDivUp(height,16)); 30 | float4toUchar4<<>>(inputImage, outputImage, width, height); 31 | } 32 | __host__ void Float2ToUchar4(float2 *inputImage, uchar4 *outputImage, int width, int height, int index) 33 | { 34 | dim3 threads_in_block(16,16); 35 | dim3 blocks(iDivUp(width,16), iDivUp(height,16)); 36 | float2toUchar4<<>>(inputImage, outputImage, width, height, index); 37 | } 38 | __host__ void Float2ToUchar1(float2 *inputImage, uchar1 *outputImage, int width, int height, int index) 39 | { 40 | dim3 threads_in_block(16,16); 41 | dim3 blocks(iDivUp(width,16), iDivUp(height,16)); 42 | float2toUchar1<<>>(inputImage, outputImage, width, height, index); 43 | } 44 | __host__ void Float1ToUchar4(float1 *inputImage, uchar4 *outputImage, int width, int height) 45 | { 46 | dim3 threads_in_block(16,16); 47 | dim3 blocks(iDivUp(width,16), iDivUp(height,16)); 48 | 
float1toUchar4<<>>(inputImage, outputImage, width, height); 49 | } 50 | __host__ void Float1ToUchar1(float1 *inputImage, uchar1 *outputImage, int width, int height) 51 | { 52 | dim3 threads_in_block(16,16); 53 | dim3 blocks(iDivUp(width,16), iDivUp(height,16)); 54 | float1toUchar1<<>>(inputImage, outputImage, width, height); 55 | } 56 | __global__ void float4toUchar4(float4 *inputImage, uchar4 *outputImage, int width, int height) 57 | { 58 | int offsetBlock = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y * width; 59 | int offset = offsetBlock + threadIdx.x + threadIdx.y * width; 60 | 61 | float4 pixelf = inputImage[offset]; 62 | uchar4 pixel; 63 | pixel.x = (unsigned char) pixelf.x; pixel.y = (unsigned char) pixelf.y; 64 | pixel.z = (unsigned char) pixelf.z; pixel.w = (unsigned char) pixelf.w; 65 | 66 | outputImage[offset] = pixel; 67 | } 68 | __global__ void float2toUchar4(float2 *inputImage, uchar4 *outputImage, int width, int height, int index) 69 | { 70 | int offsetBlock = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y * width; 71 | int offset = offsetBlock + threadIdx.x + threadIdx.y * width; 72 | 73 | float2 pixelf = inputImage[offset]; 74 | float pixelfIndexed = (index == 0) ? pixelf.x : pixelf.y; 75 | 76 | uchar4 pixel; 77 | pixel.x = (unsigned char) abs(pixelfIndexed); pixel.y = (unsigned char) abs(pixelfIndexed); 78 | pixel.z = (unsigned char) abs(pixelfIndexed); pixel.w = (unsigned char) abs(pixelfIndexed); 79 | outputImage[offset] = pixel; 80 | } 81 | __global__ void float2toUchar1(float2 *inputImage, uchar1 *outputImage, int width, int height, int index) 82 | { 83 | int offsetBlock = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y * width; 84 | int offset = offsetBlock + threadIdx.x + threadIdx.y * width; 85 | 86 | float2 pixelf = inputImage[offset]; 87 | float pixelfIndexed = (index == 0) ? 
pixelf.x : pixelf.y; 88 | 89 | uchar1 pixel; 90 | pixel.x = (unsigned char) pixelfIndexed; 91 | 92 | outputImage[offset] = pixel; 93 | } 94 | __global__ void float1toUchar4(float1 *inputImage, uchar4 *outputImage, int width, int height) 95 | { 96 | int offsetBlock = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y * width; 97 | int offset = offsetBlock + threadIdx.x + threadIdx.y * width; 98 | 99 | float1 pixelf = inputImage[offset]; 100 | uchar4 pixel; 101 | pixel.x = (unsigned char) pixelf.x; pixel.y = (unsigned char) pixelf.x; 102 | pixel.z = (unsigned char) pixelf.x; pixel.w = (unsigned char) pixelf.x; 103 | 104 | outputImage[offset] = pixel; 105 | } 106 | __global__ void float1toUchar1(float1 *inputImage, uchar1 *outputImage, int width, int height) 107 | { 108 | int offsetBlock = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y * width; 109 | int offset = offsetBlock + threadIdx.x + threadIdx.y * width; 110 | 111 | float1 pixelf = inputImage[offset]; 112 | uchar1 pixel; 113 | pixel.x = (unsigned char) pixelf.x; 114 | 115 | outputImage[offset] = pixel; 116 | } 117 | 118 | __global__ void uchar4tofloat4(uchar4 *inputImage, float4 *outputImage, int width, int height) 119 | { 120 | int offsetX = blockIdx.x * blockDim.x + threadIdx.x; 121 | int offsetY = blockIdx.y * blockDim.y + threadIdx.y; 122 | 123 | if (offsetX < width && offsetY < height) 124 | { 125 | int offsetBlock = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y * width; 126 | int offset = offsetBlock + threadIdx.x + threadIdx.y * width; 127 | 128 | uchar4 pixel = inputImage[offset]; 129 | float4 pixelf; 130 | pixelf.x = pixel.x; pixelf.y = pixel.y; 131 | pixelf.z = pixel.z; pixelf.w = pixel.w; 132 | 133 | outputImage[offset] = pixelf; 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef __HOG_UTILS__ 2 | #define __HOG_UTILS__ 3 
| 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef _WIN32 9 | # define WINDOWS_LEAN_AND_MEAN 10 | # include 11 | #endif 12 | 13 | #include 14 | #include 15 | #include "HOGDefines.h" 16 | 17 | __host__ int iDivUp(int a, int b); 18 | __host__ int iDivDown(int a, int b); 19 | __host__ int iAlignUp(int a, int b); 20 | __host__ int iAlignDown(int a, int b); 21 | 22 | __host__ int iDivUpF(int a, float b); 23 | __host__ int iClosestPowerOfTwo(int x); 24 | 25 | __host__ void Float4ToUchar4(float4 *inputImage, uchar4 *outputImage, int width, int height); 26 | __host__ void Float2ToUchar4(float2 *inputImage, uchar4 *outputImage, int width, int height, int index); 27 | __host__ void Float2ToUchar1(float2 *inputImage, uchar1 *outputImage, int width, int height, int index); 28 | __host__ void Float1ToUchar4(float1 *inputImage, uchar4 *outputImage, int width, int height); 29 | __host__ void Float1ToUchar1(float1 *inputImage, uchar1 *outputImage, int width, int height); 30 | 31 | __global__ void float4toUchar4(float4 *inputImage, uchar4 *outputImage, int width, int height); 32 | __global__ void float2toUchar4(float2 *inputImage, uchar4 *outputImage, int width, int height, int index); 33 | __global__ void float2toUchar1(float2 *inputImage, uchar1 *outputImage, int width, int height, int index); 34 | __global__ void float1toUchar4(float1 *inputImage, uchar4 *outputImage, int width, int height); 35 | __global__ void float1toUchar1(float1 *inputImage, uchar1 *outputImage, int width, int height); 36 | 37 | __host__ void Uchar4ToFloat4(uchar4 *inputImage, float4 *outputImage, int width, int height); 38 | __global__ void uchar4tofloat4(uchar4 *inputImage, float4 *outputImage, int width, int height); 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /source/fastHOG/HOG/HOGUtils.linkinfo: -------------------------------------------------------------------------------- 1 | --import 
%laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z14float1toUchar1P6float1P6uchar1ii,_Z14uchar4tofloat4P6uchar4P6float4ii,_Z14float2toUchar1P6float2P6uchar1iii,_Z14float4toUchar4P6float4P6uchar4ii,_Z14float2toUchar4P6float2P6uchar4iii,_Z14float1toUchar4P6float1P6uchar4ii -------------------------------------------------------------------------------- /source/fastHOG/HOG/cutil.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // These functions and macros were copied over from the old cutil header files, 3 | // so that FastHOG could be compiled. 4 | // The cutil files were picked from the GPU Computing SDK that shipped with 5 | // the old CUDA 3.2 SDK. 6 | //----------------------------------------------------------------------------- 7 | 8 | #pragma once 9 | 10 | // Give a little more for Windows : the console window often disapears before we can read the message 11 | #ifdef _WIN32 12 | # if 1//ndef UNICODE 13 | # ifdef _DEBUG // Do this only in debug mode... 14 | inline void VSPrintf(FILE *file, LPCSTR fmt, ...) 15 | { 16 | size_t fmt2_sz = 2048; 17 | char *fmt2 = (char*)malloc(fmt2_sz); 18 | va_list vlist; 19 | va_start(vlist, fmt); 20 | while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0) // means there wasn't anough room 21 | { 22 | fmt2_sz *= 2; 23 | if(fmt2) free(fmt2); 24 | fmt2 = (char*)malloc(fmt2_sz); 25 | } 26 | OutputDebugStringA(fmt2); 27 | fprintf(file, fmt2); 28 | free(fmt2); 29 | } 30 | # define FPRINTF(a) VSPrintf a 31 | # else //debug 32 | # define FPRINTF(a) fprintf a 33 | // For other than Win32 34 | # endif //debug 35 | # else //unicode 36 | // Unicode case... 
let's give-up for now and keep basic printf 37 | # define FPRINTF(a) fprintf a 38 | # endif //unicode 39 | #else //win32 40 | # define FPRINTF(a) fprintf a 41 | #endif //win32 42 | 43 | #define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__) 44 | 45 | inline void __cudaSafeCall( cudaError err, const char *file, const int line ) 46 | { 47 | if( cudaSuccess != err) { 48 | FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error : %s.\n", 49 | file, line, cudaGetErrorString( err) )); 50 | exit(-1); 51 | } 52 | } 53 | 54 | #define MIN(a,b) ((a < b) ? a : b) 55 | #define MAX(a,b) ((a > b) ? a : b) 56 | 57 | // Beginning of GPU Architecture definitions 58 | inline int _ConvertSMVer2Cores(int major, int minor) 59 | { 60 | // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM 61 | typedef struct { 62 | int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version 63 | int Cores; 64 | } sSMtoCores; 65 | 66 | sSMtoCores nGpuArchCoresPerSM[] = 67 | { { 0x10, 8 }, 68 | { 0x11, 8 }, 69 | { 0x12, 8 }, 70 | { 0x13, 8 }, 71 | { 0x20, 32 }, 72 | { 0x21, 48 }, 73 | { -1, -1 } 74 | }; 75 | 76 | int index = 0; 77 | while (nGpuArchCoresPerSM[index].SM != -1) { 78 | if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) { 79 | return nGpuArchCoresPerSM[index].Cores; 80 | } 81 | index++; 82 | } 83 | printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor); 84 | return -1; 85 | } 86 | // end of GPU Architecture definitions 87 | 88 | // This function returns the best GPU (with maximum GFLOPS) 89 | inline int cutGetMaxGflopsDeviceId() 90 | { 91 | int current_device = 0, sm_per_multiproc = 0; 92 | int max_compute_perf = 0, max_perf_device = 0; 93 | int device_count = 0, best_SM_arch = 0; 94 | cudaDeviceProp deviceProp; 95 | 96 | cudaGetDeviceCount( &device_count ); 97 | // Find the best major SM Architecture GPU device 98 | while ( current_device < device_count ) { 99 | cudaGetDeviceProperties( 
&deviceProp, current_device ); 100 | if (deviceProp.major > 0 && deviceProp.major < 9999) { 101 | best_SM_arch = MAX(best_SM_arch, deviceProp.major); 102 | } 103 | current_device++; 104 | } 105 | 106 | // Find the best CUDA capable GPU device 107 | current_device = 0; 108 | while( current_device < device_count ) { 109 | cudaGetDeviceProperties( &deviceProp, current_device ); 110 | if (deviceProp.major == 9999 && deviceProp.minor == 9999) { 111 | sm_per_multiproc = 1; 112 | } else { 113 | sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); 114 | } 115 | 116 | int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; 117 | if( compute_perf > max_compute_perf ) { 118 | // If we find GPU with SM major > 2, search only these 119 | if ( best_SM_arch > 2 ) { 120 | // If our device==dest_SM_arch, choose this, or else pass 121 | if (deviceProp.major == best_SM_arch) { 122 | max_compute_perf = compute_perf; 123 | max_perf_device = current_device; 124 | } 125 | } else { 126 | max_compute_perf = compute_perf; 127 | max_perf_device = current_device; 128 | } 129 | } 130 | ++current_device; 131 | } 132 | return max_perf_device; 133 | } 134 | -------------------------------------------------------------------------------- /source/fastHOG/HOGConvolution.linkinfo: -------------------------------------------------------------------------------- 1 | --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z24convolutionColumnGPU1to2P6float2P6float1S2_iiii,_Z18convolutionRowGPU1P6float1S0_ii,_Z18convolutionRowGPU4P6float4S0_ii,_Z24convolutionColumnGPU4to2P6float2P6float4S2_iiii -------------------------------------------------------------------------------- /source/fastHOG/HOGEngineDevice.linkinfo: -------------------------------------------------------------------------------- 1 | --import 
%laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z18resizeFastBicubic3P6float4S0_iif -------------------------------------------------------------------------------- /source/fastHOG/HOGHistogram.linkinfo: -------------------------------------------------------------------------------- 1 | --import %laneid,%ctaid,allShared,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z31computeBlockHistogramsWithGaussP6float2P6float1iiiiiiiii,_Z24normalizeBlockHistogramsP6float1iiiiiiiiii -------------------------------------------------------------------------------- /source/fastHOG/HOGPadding.linkinfo: -------------------------------------------------------------------------------- 1 | --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export __dummy_entry__ -------------------------------------------------------------------------------- /source/fastHOG/HOGSVMSlider.linkinfo: -------------------------------------------------------------------------------- 1 | --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,allSharedF1,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z19linearSVMEvaluationP6float1fS0_iiiiiiiiiiiiiiiiii -------------------------------------------------------------------------------- /source/fastHOG/HOGScale.linkinfo: -------------------------------------------------------------------------------- 1 | --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z18resizeFastBicubic1P6float1P6float4iif,_Z18resizeFastBicubic4P6float4S0_iif -------------------------------------------------------------------------------- /source/fastHOG/HOGUtils.linkinfo: -------------------------------------------------------------------------------- 1 | --import 
%laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export _Z14float1toUchar1P6float1P6uchar1ii,_Z14uchar4tofloat4P6uchar4P6float4ii,_Z14float2toUchar1P6float2P6uchar1iii,_Z14float4toUchar4P6float4P6uchar4ii,_Z14float2toUchar4P6float2P6uchar4iii,_Z14float1toUchar4P6float1P6uchar4ii -------------------------------------------------------------------------------- /source/fastHOG/Makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright 1993-2006 NVIDIA Corporation. All rights reserved. 4 | # 5 | # NOTICE TO USER: 6 | # 7 | # This source code is subject to NVIDIA ownership rights under U.S. and 8 | # international Copyright laws. 9 | # 10 | # NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 11 | # CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 12 | # IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 13 | # REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 14 | # MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 15 | # IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 16 | # OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 17 | # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 18 | # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 19 | # OR PERFORMANCE OF THIS SOURCE CODE. 20 | # 21 | # U.S. Government End Users. This source code is a "commercial item" as 22 | # that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 23 | # "commercial computer software" and "commercial computer software 24 | # documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 25 | # and is provided to the U.S. Government only as a commercial end item. 26 | # Consistent with 48 C.F.R.12.212 and 48 C.F.R. 
227.7202-1 through 27 | # 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 28 | # source code with only those rights set forth herein. 29 | # 30 | ################################################################################ 31 | # 32 | # Build script for project 33 | # 34 | ################################################################################ 35 | 36 | # Add source files here 37 | EXECUTABLE := fastHOG 38 | # C/C++ source files (compiled with gcc / c++) 39 | CCFILES := \ 40 | fastHOG.cpp \ 41 | # HOG UTILS 42 | CCUTILS := \ 43 | ImageWindow.cpp \ 44 | # CC HOG 45 | CCHOG := \ 46 | HOGImage.cpp \ 47 | HOGEngine.cpp \ 48 | HOGNMS.cpp \ 49 | # CUDA HOG 50 | CUFILES := \ 51 | HOGEngineDevice.cu \ 52 | HOGConvolution.cu \ 53 | HOGHistogram.cu \ 54 | HOGPadding.cu \ 55 | HOGScale.cu \ 56 | HOGSVMSlider.cu \ 57 | HOGUtils.cu \ 58 | ################################################################################ 59 | # Rules and targets 60 | 61 | include common.mk 62 | -------------------------------------------------------------------------------- /source/fastHOG/Utils/ImageWindow.cpp: -------------------------------------------------------------------------------- 1 | #include "ImageWindow.h" 2 | 3 | #include 4 | #include 5 | ImageWindow::ImageWindow(int width, int height, char* title) : 6 | fltk::Window(width, height, title) 7 | { 8 | this->width = width; 9 | this->height = height; 10 | 11 | this->begin(); 12 | imageWidget = new ImageWidget(0, 0, width, height); 13 | this->end(); 14 | 15 | doStuff = 0; 16 | } 17 | 18 | ImageWindow::ImageWindow(HOGImage* image, char* title) : 19 | fltk::Window(image->width, image->height, title) 20 | { 21 | this->width = image->width; 22 | this->height = image->height; 23 | 24 | this->begin(); 25 | imageWidget = new ImageWidget(0, 0, image->width, image->height, image->pixels); 26 | this->end(); 27 | 28 | doStuff = 0; 29 | } 30 | 31 | ImageWindow::~ImageWindow(void) { } 32 | 33 | void ImageWindow::show(int 
x, int y) 34 | { 35 | if (x == -1 || y == 1) 36 | fltk::Window::show(); 37 | else 38 | { 39 | this->position(x, y); 40 | fltk::Window::show(); 41 | } 42 | } 43 | 44 | void ImageWindow::setImage(HOGImage* image) 45 | { 46 | this->begin(); 47 | imageWidget->setImage((unsigned char*) image->pixels); 48 | imageWidget->draw(); 49 | this->end(); 50 | } 51 | 52 | int ImageWindow::handle(int eventId) 53 | { 54 | int ret = 0; 55 | switch (eventId) 56 | { 57 | case fltk::MOVE: 58 | ret = 1; 59 | 60 | break; 61 | case fltk::PUSH: 62 | 63 | imageWidget->rects.clear(); 64 | 65 | if (doStuff != 0) 66 | doStuff(); 67 | 68 | break; 69 | } 70 | 71 | return ret; 72 | } 73 | 74 | void ImageWindow::drawRect(int x, int y, int w, int h) 75 | { 76 | imageWidget->drawRect(x, y, w, h); 77 | } 78 | 79 | void ImageWindow::Close() 80 | { 81 | delete imageWidget; 82 | 83 | this->destroy(); 84 | } 85 | -------------------------------------------------------------------------------- /source/fastHOG/Utils/ImageWindow.h: -------------------------------------------------------------------------------- 1 | #ifndef __IMAGE_WINDOW_H__ 2 | #define __IMAGE_WINDOW_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "../HOG/HOGImage.h" 11 | 12 | #include 13 | #include 14 | 15 | using namespace HOG; 16 | 17 | class ImageWidget: public fltk::Widget 18 | { 19 | struct rect 20 | { 21 | int x, y, w, h; 22 | rect(int _x, int _y, int _w, int _h) { x = _x; y = _y; w = _w; h = _h; } 23 | }; 24 | 25 | public: 26 | std::vector rects; 27 | 28 | unsigned char* pixels; 29 | fltk::Rectangle* rectangle; 30 | 31 | ImageWidget(int x, int y, int w, int h) : 32 | fltk::Widget(x, y, w, h) 33 | { 34 | rectangle = new fltk::Rectangle(0, 0, w, h); 35 | this->box(fltk::BORDER_BOX); 36 | this->buttonbox(fltk::FLAT_BOX); 37 | } 38 | 39 | ImageWidget(int x, int y, int w, int h, unsigned char* pixels) : 40 | fltk::Widget(x, y, w, h) 41 | { 42 | this->pixels = pixels; 43 | rectangle = new 
// Stopwatch that accumulates the time spent in the "running" state over any
// number of start/stop cycles and can print the total.
class Timer
{
	friend std::ostream& operator<<(std::ostream& os, Timer& t);

private:
	bool running;          // true between start()/restart() and stop()
	clock_t start_clock;   // clock() value captured when the current run began
	time_t start_time;     // time(0) value captured when the current run began
	double acc_time;       // seconds accumulated by completed runs

	double elapsed_time();

public:
	// A new Timer is idle; it has to be started explicitly with
	// 'start' or 'restart'.
	Timer() :
		running(false), start_clock(0), start_time(0), acc_time(0)
	{
	}

	void start(const char* msg = 0);
	void restart(const char* msg = 0);
	void stop(const char* msg = 0);
	void check(const char* msg = 0);
	void check(const char* msg, int msg_count);

}; // class Timer

//===========================================================================
// Seconds since the current run began. Runs shorter than an hour are
// reported from clock() (CPU time); longer runs fall back to the
// whole-second difference of time(0).

inline double Timer::elapsed_time()
{
	const time_t wall_seconds = time(0) - start_time;
	if (!(wall_seconds < 3600))
		return (1.0 * wall_seconds);

	return (clock() - start_clock) / (1.0 * CLOCKS_PER_SEC);

} // Timer::elapsed_time

//===========================================================================
// Begin a run unless one is already in progress. The optional message is
// printed either way.

inline void Timer::start(const char* msg)
{
	if (msg)
		std::cout << msg << std::endl;

	if (!running)
	{
		running = true;
		start_clock = clock();
		start_time = time(0);
	}

} // Timer::start

//===========================================================================
// Discard everything accumulated so far and begin a fresh run. Prints the
// optional message first.

inline void Timer::restart(const char* msg)
{
	if (msg)
		std::cout << msg << std::endl;

	acc_time = 0;
	running = true;
	start_clock = clock();
	start_time = time(0);

} // Timer::restart

//===========================================================================
// End the current run, adding its duration to the accumulated total. Prints
// the optional message first.

inline void Timer::stop(const char* msg)
{
	if (msg)
		std::cout << msg << std::endl;

	if (running)
	{
		acc_time += elapsed_time();
		running = false;
	}
	else
	{
		running = false;
	}

} // Timer::stop

//===========================================================================
// Print "<msg> : Time [<seconds>] seconds" to std::cout, with the seconds in
// fixed notation and three decimals. The stream keeps those format settings.

inline void Timer::check(const char* msg)
{
	if (msg)
		std::cout << msg << " : ";

	std::cout << "Time [";
	std::cout.setf(std::ios::fixed);
	std::cout.precision(3);
	std::cout << acc_time + (running ? elapsed_time() : 0) << "] seconds\n";
} // Timer::check

// Same as above, with a counter printed between the message and the time:
// "<msg>:<msg_count>: Time [<seconds>] seconds".
inline void Timer::check(const char* msg, int msg_count)
{
	if (msg)
		std::cout << msg << ":";

	std::cout << msg_count << ": " << "Time [";
	std::cout.setf(std::ios::fixed);
	std::cout.precision(3);
	std::cout << acc_time + (running ? elapsed_time() : 0) << "] seconds\n";
} // Timer::check

//===========================================================================
// Stream insertion: 'os << t' writes the total running time of 't' in
// seconds, fixed notation, three decimals. The stream keeps those format
// settings.

inline std::ostream& operator<<(std::ostream& os, Timer& t)
{
	os.precision(3);
	os.setf(std::ios::fixed);
	os << t.acc_time + (t.running ? t.elapsed_time() : 0);
	return os;
}
15 | # IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 16 | # OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 17 | # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 18 | # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 19 | # OR PERFORMANCE OF THIS SOURCE CODE. 20 | # 21 | # U.S. Government End Users. This source code is a "commercial item" as 22 | # that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 23 | # "commercial computer software" and "commercial computer software 24 | # documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 25 | # and is provided to the U.S. Government only as a commercial end item. 26 | # Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 27 | # 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 28 | # source code with only those rights set forth herein. 29 | # 30 | ################################################################################ 31 | # 32 | # Common build script 33 | # 34 | ################################################################################ 35 | 36 | .SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin 37 | 38 | # Add new SM Versions here as devices with new Compute Capability are released 39 | SM_VERSIONS := sm_10 sm_11 sm_12 sm_13 40 | 41 | CUDA_INSTALL_PATH ?= /usr/local/cuda 42 | 43 | ifdef cuda-install 44 | CUDA_INSTALL_PATH := $(cuda-install) 45 | endif 46 | 47 | # detect OS 48 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) 49 | OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:]) 50 | # 'linux' is output for Linux system, 'darwin' for OS X 51 | DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) 52 | 53 | # Basic directory setup for SDK 54 | # (override directories only if they are not already defined) 55 | SRCDIR ?= 56 | SRCDIRUTILS ?= Utils 57 | SRCDIRHOG ?= HOG 58 | ROOTDIR ?= 
$(CUDA_INSTALL_PATH) 59 | ROOTBINDIR ?= bin 60 | BINDIR ?= $(ROOTBINDIR) 61 | ROOTOBJDIR ?= obj 62 | LIBDIR := $(ROOTDIR)/lib64 63 | COMMONDIR := $(ROOTDIR)/common 64 | 65 | # Compilers 66 | NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc 67 | CXX := g++ 68 | CC := gcc 69 | LINK := g++ -fPIC 70 | 71 | # Includes 72 | INCLUDES += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc 73 | 74 | # architecture flag for cubin build 75 | CUBIN_ARCH_FLAG := -m32 76 | 77 | # Warning flags 78 | CXXWARN_FLAGS := \ 79 | -W -Wall \ 80 | -Wimplicit \ 81 | -Wswitch \ 82 | -Wformat \ 83 | -Wchar-subscripts \ 84 | -Wparentheses \ 85 | -Wmultichar \ 86 | -Wtrigraphs \ 87 | -Wpointer-arith \ 88 | -Wcast-align \ 89 | -Wreturn-type \ 90 | -Wno-unused-function \ 91 | $(SPACE) 92 | 93 | CWARN_FLAGS := $(CXXWARN_FLAGS) \ 94 | -Wstrict-prototypes \ 95 | -Wmissing-prototypes \ 96 | -Wmissing-declarations \ 97 | -Wnested-externs \ 98 | -Wmain \ 99 | 100 | # Compiler-specific flags 101 | NVCCFLAGS := 102 | CXXFLAGS := $(CXXWARN_FLAGS) 103 | CFLAGS := $(CWARN_FLAGS) 104 | 105 | # Common flags 106 | COMMONFLAGS += $(INCLUDES) -DUNIX 107 | 108 | # Debug/release configuration 109 | ifeq ($(dbg),1) 110 | COMMONFLAGS += -g 111 | NVCCFLAGS += -D_DEBUG 112 | BINSUBDIR := debug 113 | LIBSUFFIX := D 114 | else 115 | COMMONFLAGS += -O3 116 | BINSUBDIR := release 117 | LIBSUFFIX := 118 | NVCCFLAGS += --compiler-options -fno-strict-aliasing 119 | CXXFLAGS += -fno-strict-aliasing 120 | CFLAGS += -fno-strict-aliasing 121 | endif 122 | 123 | # append optional arch/SM version flags (such as -arch sm_11) 124 | #NVCCFLAGS += $(SMVERSIONFLAGS) 125 | 126 | # architecture flag for cubin build 127 | CUBIN_ARCH_FLAG := -m32 128 | 129 | # detect if 32 bit or 64 bit system 130 | HP_64 = $(shell uname -m | grep 64) 131 | 132 | # OpenGL is used or not (if it is used, then it is necessary to include GLEW) 133 | ifeq ($(USEGLLIB),1) 134 | 135 | ifneq ($(DARWIN),) 136 | OPENGLLIB := 
-L/System/Library/Frameworks/OpenGL.framework/Libraries -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a 137 | else 138 | OPENGLLIB := -lGL -lGLU -lX11 -lXi -lXmu 139 | 140 | ifeq "$(strip $(HP_64))" "" 141 | OPENGLLIB += -lGLEW -L/usr/X11R6/lib 142 | else 143 | OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64 144 | endif 145 | endif 146 | 147 | CUBIN_ARCH_FLAG := -m64 148 | endif 149 | 150 | ifeq ($(USEGLUT),1) 151 | ifneq ($(DARWIN),) 152 | OPENGLLIB += -framework GLUT 153 | else 154 | OPENGLLIB += -lglut 155 | endif 156 | endif 157 | 158 | ifeq ($(USEPARAMGL),1) 159 | PARAMGLLIB := -lparamgl$(LIBSUFFIX) 160 | endif 161 | 162 | ifeq ($(USERENDERCHECKGL),1) 163 | RENDERCHECKGLLIB := -lrendercheckgl$(LIBSUFFIX) 164 | endif 165 | 166 | ifeq ($(USECUDPP), 1) 167 | ifeq "$(strip $(HP_64))" "" 168 | CUDPPLIB := -lcudpp 169 | else 170 | CUDPPLIB := -lcudpp64 171 | endif 172 | 173 | CUDPPLIB := $(CUDPPLIB)$(LIBSUFFIX) 174 | 175 | ifeq ($(emu), 1) 176 | CUDPPLIB := $(CUDPPLIB)_emu 177 | endif 178 | endif 179 | 180 | # Libs 181 | LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) 182 | 183 | # If dynamically linking to CUDA and CUDART, we exclude the libraries from the LIB 184 | ifeq ($(USECUDADYNLIB),1) 185 | LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} 186 | else 187 | # static linking, we will statically link against CUDA and CUDART 188 | ifeq ($(USEDRVAPI),1) 189 | LIB += -lcuda ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} 190 | else 191 | LIB += -lcudart ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} 192 | endif 193 | endif 194 | 195 | ifeq ($(USECUFFT),1) 196 | ifeq ($(emu),1) 197 | LIB += -lcufftemu 198 | else 199 | LIB += -lcufft 200 | endif 201 | endif 202 | 203 | ifeq ($(USECUBLAS),1) 204 | ifeq ($(emu),1) 205 | LIB += -lcublasemu 206 | else 207 | LIB += -lcublas 208 | endif 209 | endif 210 | 211 | # Lib/exe configuration 212 | ifneq ($(STATIC_LIB),) 213 | TARGETDIR := 
$(LIBDIR) 214 | TARGET := $(subst .a,$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB)) 215 | LINKLINE = ar rucv $(TARGET) $(OBJS) 216 | else 217 | # Device emulation configuration 218 | ifeq ($(emu), 1) 219 | NVCCFLAGS += -deviceemu 220 | CUDACCFLAGS += 221 | BINSUBDIR := emu$(BINSUBDIR) 222 | # consistency, makes developing easier 223 | CXXFLAGS += -D__DEVICE_EMULATION__ 224 | CFLAGS += -D__DEVICE_EMULATION__ 225 | endif 226 | TARGETDIR := $(BINDIR)/$(BINSUBDIR) 227 | TARGET := $(TARGETDIR)/$(EXECUTABLE) 228 | #fltk 229 | LIB += -lfltk2 -lXft -lfltk2_images -lXext -lXinerama -lXi 230 | #boost thread for interface 231 | LIB += -lboost_thread 232 | #read images in HOGImage from file 233 | LIB += -lfreeimage 234 | LIB += -lboost_system 235 | LINKLINE = $(LINK) -o $(TARGET) $(OBJS) $(LIB) 236 | endif 237 | 238 | # check if verbose 239 | ifeq ($(verbose), 1) 240 | VERBOSE := 241 | else 242 | VERBOSE := @ 243 | endif 244 | 245 | ################################################################################ 246 | # Check for input flags and set compiler flags appropriately 247 | ################################################################################ 248 | ifeq ($(fastmath), 1) 249 | NVCCFLAGS += -use_fast_math 250 | endif 251 | 252 | ifeq ($(keep), 1) 253 | NVCCFLAGS += -keep 254 | NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx 255 | endif 256 | 257 | ifdef maxregisters 258 | NVCCFLAGS += -maxrregcount $(maxregisters) 259 | endif 260 | 261 | # Add cudacc flags 262 | NVCCFLAGS += $(CUDACCFLAGS) 263 | 264 | # workaround for mac os x cuda 1.1 compiler issues 265 | ifneq ($(DARWIN),) 266 | NVCCFLAGS += --host-compilation=C 267 | endif 268 | 269 | # Add common flags 270 | NVCCFLAGS += $(COMMONFLAGS) 271 | CXXFLAGS += $(COMMONFLAGS) 272 | CFLAGS += $(COMMONFLAGS) 273 | 274 | ifeq ($(nvcc_warn_verbose),1) 275 | NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS)) 276 | NVCCFLAGS += --compiler-options -fno-strict-aliasing 277 | endif 278 | 279 | 
################################################################################ 280 | # Set up object files 281 | ################################################################################ 282 | OBJDIR := $(ROOTOBJDIR)/$(BINSUBDIR) 283 | OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(notdir $(CCFILES))) 284 | 285 | OBJDIRUTILS := $(ROOTOBJDIR)/$(BINSUBDIR)/$(SRCDIRUTILS) 286 | OBJS += $(patsubst %.cpp,$(OBJDIRUTILS)/%.cpp.o,$(notdir $(CCUTILS))) 287 | 288 | OBJDIRHOG := $(ROOTOBJDIR)/$(BINSUBDIR)/$(SRCDIRHOG) 289 | OBJS += $(patsubst %.cu,$(OBJDIRHOG)/%.cu.o,$(notdir $(CUFILES))) 290 | OBJS += $(patsubst %.cpp,$(OBJDIRHOG)/%.cpp.o,$(notdir $(CCHOG))) 291 | 292 | ################################################################################ 293 | # Set up cubin files 294 | ################################################################################ 295 | CUBINDIR := $(SRCDIR)data 296 | CUBINS += $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES))) 297 | 298 | ################################################################################ 299 | # Rules 300 | ################################################################################ 301 | $(OBJDIR)/%.c.o : $(SRCDIR)%.c $(C_DEPS) 302 | $(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $< 303 | 304 | $(OBJDIRUTILS)/%.cpp.o : $(SRCDIRUTILS)%.cpp $(C_DEPS) 305 | $(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $< 306 | 307 | $(OBJDIR)/%.cpp.o : $(SRCDIR)%.cpp $(C_DEPS) 308 | $(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $< 309 | 310 | $(OBJDIRHOG)/%.cpp.o : $(SRCDIRHOG)%.cpp $(C_DEPS) 311 | $(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $< 312 | 313 | $(OBJDIR)/%.cu.o : $(SRCDIR)%.cu $(CU_DEPS) 314 | $(VERBOSE)$(NVCC) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $< 315 | 316 | $(OBJDIRHOG)/%.cu.o : $(SRCDIRHOG)%.cu $(CU_DEPS) 317 | $(VERBOSE)$(NVCC) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $< 318 | 319 | $(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory 320 | $(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $< 
#
# The following definition is a template that gets instantiated for each SM
# version (sm_10, sm_13, etc.) stored in SM_VERSIONS. It does 2 things:
# 1. It adds to OBJS a .cu_sm_XX.o for each .cu file it finds in CUFILES_sm_XX.
# 2. It generates a rule for building .cu_sm_XX.o files from the corresponding
#    .cu file.
#
# The intended use for this is to allow Makefiles that use common.mk to compile
# files to different Compute Capability targets (aka SM arch version). To do
# so, in the Makefile, list files for each SM arch separately, like so:
#
# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu
# CUFILES_sm_12 := anothercudakernel_sm12.cu
#
# Inside the template, $(1) is the SM version passed by $(call ...); $$@ and
# $$< are doubled so that they survive the $(eval ...) expansion and reach the
# generated rule as the automatic variables $@ and $<.
define SMVERSION_template
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_$(1))))
$(OBJDIR)/%.cu_$(1).o : $(SRCDIR)%.cu $(CU_DEPS)
	$(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS) -arch $(1)
endef

# This line invokes the above template for each arch version stored in
# SM_VERSIONS. The call function invokes the template, and the eval
# function interprets it as make commands.
$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver))))

# None of these targets produces a file with its own name.
.PHONY: cubindirectory makedirectories tidy clean clobber

# NOTE(review): makedirectories is an ordinary prerequisite and never exists as
# a file, so $(TARGET) is relinked on every run. It is kept first in the list
# because the object rules have no directory prerequisite of their own.
$(TARGET): makedirectories $(OBJS) $(CUBINS) Makefile
	$(VERBOSE)$(LINKLINE)

# Creates the output directory for .cubin files.
cubindirectory:
	$(VERBOSE)mkdir -p $(CUBINDIR)

# Creates every output directory used by the build.
makedirectories:
	$(VERBOSE)mkdir -p $(LIBDIR)
	$(VERBOSE)mkdir -p $(OBJDIR)
	$(VERBOSE)mkdir -p $(OBJDIRUTILS)
	$(VERBOSE)mkdir -p $(OBJDIRHOG)
	$(VERBOSE)mkdir -p $(TARGETDIR)

# Removes every path under the current directory whose name contains "#" or "~".
# (No prerequisites: a linker flag such as -lboost_thread does not belong here;
# make would try to resolve it as a library and abort if it were not found.)
tidy :
	$(VERBOSE)find . | egrep "#" | xargs rm -f
	$(VERBOSE)find . | egrep "\~" | xargs rm -f

# Removes the build products: objects, cubins, the target and NVCC_KEEP_CLEAN.
clean : tidy
	$(VERBOSE)rm -f $(OBJS)
	$(VERBOSE)rm -f $(CUBINS)
	$(VERBOSE)rm -f $(TARGET)
	$(VERBOSE)rm -f $(NVCC_KEEP_CLEAN)

# Like clean, and additionally deletes the whole object tree.
clobber : clean
	$(VERBOSE)rm -rf $(ROOTOBJDIR)

--------------------------------------------------------------------------------
/source/fastHOG/fastHOG.cpp:
--------------------------------------------------------------------------------
/*
 * fastHog.cpp
 *
 *  Created on: May 14, 2009
 *      Author: viprad
 */

// NOTE(review): this copy of the file had three system #include lines whose
// <...> operands were stripped. <stdio.h> (printf) and <fltk/run.h>
// (fltk::run) are the two headers the code below visibly needs; the third
// original header is unknown -- confirm against the upstream source.
#include <stdio.h>

#include <fltk/run.h>

#include "HOG/HOGEngine.h"
#include "HOG/HOGImage.h"

#include "Utils/ImageWindow.h"
#include "Utils/Timer.h"

#include "Others/persondetectorwt.tcc"

using namespace HOG;

// Shared between main() and the doStuffHere() callback.
ImageWindow* fastHOGWindow;	// window that displays the image and the detections
HOGImage* image;		// input image loaded from disk in main()
HOGImage* imageCUDA;		// same-sized buffer filled by HOGEngine::GetImage()

/*
 * Runs one complete detection pass over the global `image` and displays it.
 *
 * Installed in main() as fastHOGWindow->doStuff. Every call initialises the
 * HOG engine for the image dimensions with the PERSON_* model constants
 * (presumably defined in Others/persondetectorwt.tcc -- confirm), processes
 * the image, prints and draws each entry of nmsResults, and finalises the
 * engine again.
 */
void doStuffHere()
{
	HOGEngine::Instance()->InitializeHOG(image->width, image->height,
			PERSON_LINEAR_BIAS, PERSON_WEIGHT_VEC, PERSON_WEIGHT_VEC_LENGTH);

	// Alternative initialisation from an SVM model file (disabled):
	//HOGEngine::Instance()->InitializeHOG(image->width, image->height,
	//		"Files//SVM//head_W24x24_C4x4_N2x2_G4x4_HeadSize16x16.alt");

	// Time the BeginProcess/EndProcess pair only.
	Timer t;
	t.restart();
	HOGEngine::Instance()->BeginProcess(image);
	HOGEngine::Instance()->EndProcess();
	t.stop(); t.check("Processing time");

	printf("Found %d positive results.\n", HOGEngine::Instance()->formattedResultsCount);

	// Fetch the engine's IMAGE_ROI view into imageCUDA and show it.
	HOGEngine::Instance()->GetImage(imageCUDA, HOGEngine::IMAGE_ROI);
	fastHOGWindow->setImage(imageCUDA);

	// One output line and one rectangle per entry of nmsResults.
	// Columns: scale, score, origX, origY, x, y, width, height.
	for (int i = 0; i < HOGEngine::Instance()->nmsResultsCount; i++)
	{
		printf("%1.5f %1.5f %4d %4d %4d %4d %4d %4d\n",
				HOGEngine::Instance()->nmsResults[i].scale,
				HOGEngine::Instance()->nmsResults[i].score,
				HOGEngine::Instance()->nmsResults[i].origX,
				HOGEngine::Instance()->nmsResults[i].origY,
				HOGEngine::Instance()->nmsResults[i].x,
				HOGEngine::Instance()->nmsResults[i].y,
				HOGEngine::Instance()->nmsResults[i].width,
				HOGEngine::Instance()->nmsResults[i].height);
		fastHOGWindow->drawRect(HOGEngine::Instance()->nmsResults[i].x,
				HOGEngine::Instance()->nmsResults[i].y,
				HOGEngine::Instance()->nmsResults[i].width,
				HOGEngine::Instance()->nmsResults[i].height);
	}

	printf("Drawn %d positive results.\n", HOGEngine::Instance()->nmsResultsCount);

	HOGEngine::Instance()->FinalizeHOG();
}

/*
 * Loads the test image, opens the display window with doStuffHere() as its
 * doStuff callback, and runs the FLTK event loop until it returns.
 *
 * The image path is relative, so the program must be started from the
 * directory that contains Files/.
 */
int main(void)
{
	image = new HOGImage("Files//Images//testImage.bmp");
	imageCUDA = new HOGImage(image->width,image->height);

	fastHOGWindow = new ImageWindow(image, "fastHOG");
	fastHOGWindow->doStuff = &doStuffHere;
	fastHOGWindow->show();

	fltk::run();

	delete image;
	delete imageCUDA;

	return 0;
}

--------------------------------------------------------------------------------
/source/fastHOG/fastHOG.vcproj:
--------------------------------------------------------------------------------
1 | 2 | 11 | 12 | 15 | 16 | 17 | 20 | 21 | 22 | 30 | 33 | 36 | 39 | 42 | 45 | 48 | 59 | 62 | 65 | 68 | 78 | 81 | 84 | 87 | 90 | 93 | 96 | 99 | 100 | 109 | 112 | 115 | 118 | 121 | 124 | 127 | 135 | 138 | 141 | 144 | 152 | 155 | 158 | 161 | 164 | 167 | 170 | 173 | 174 | 179 | 182 | 185 | 188 | 191 | 194 | 197 | 200 | 203 | 206 | 209 | 212 | 215 | 218 | 221 | 224 | 227 | 230 | 233 | 234 | 242 | 245 | 248 | 251 | 254 | 257 | 260 | 270 | 273 | 276 | 279 | 288 | 291 | 294 | 297 | 300 | 303 | 306 | 309 | 310 | 315 | 318 | 321 | 324 | 327 | 330 | 333 | 336 | 339 | 342 | 345 | 348 | 351 | 354 | 357 | 360 | 363 | 366 | 369 | 370 | 371 | 372 | 373 | 374 | 379 | 382 | 383 | 384 | 387 | 390 | 391 | 392 | 395 | 398 | 399 | 400 | 403 | 406 | 407 | 410 | 411 | 414 | 415 | 418 | 419 | 420 | 423 | 426 | 429 | 432 | 433 | 436 | 437 | 440 | 441 | 444 | 445 | 448 | 449 | 450 | 453 | 456 | 457 | 460 | 461 | 464 | 465 | 468 | 469 | 472 | 473 | 476 | 477 | 480 | 481 |
484 | 485 | 486 | 487 | 490 | 493 | 496 | 497 | 500 | 501 | 504 | 505 | 508 | 509 | 512 | 513 | 516 | 517 | 520 | 521 | 522 | 525 | 528 | 529 | 532 | 533 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | -------------------------------------------------------------------------------- /source/fastHOGLib.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 10.00 3 | # Visual Studio 2008 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "fastHOG", "fastHOG\fastHOG.vcproj", "{98951235-E3D7-48E9-BA01-C7291E55FDEF}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | CUDAEmuDebug|Win32 = CUDAEmuDebug|Win32 9 | Debug|Win32 = Debug|Win32 10 | Release|Win32 = Release|Win32 11 | EndGlobalSection 12 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 13 | {98951235-E3D7-48E9-BA01-C7291E55FDEF}.CUDAEmuDebug|Win32.ActiveCfg = Debug|Win32 14 | {98951235-E3D7-48E9-BA01-C7291E55FDEF}.CUDAEmuDebug|Win32.Build.0 = Debug|Win32 15 | {98951235-E3D7-48E9-BA01-C7291E55FDEF}.Debug|Win32.ActiveCfg = Debug|Win32 16 | {98951235-E3D7-48E9-BA01-C7291E55FDEF}.Debug|Win32.Build.0 = Debug|Win32 17 | {98951235-E3D7-48E9-BA01-C7291E55FDEF}.Release|Win32.ActiveCfg = Release|Win32 18 | {98951235-E3D7-48E9-BA01-C7291E55FDEF}.Release|Win32.Build.0 = Release|Win32 19 | EndGlobalSection 20 | GlobalSection(SolutionProperties) = preSolution 21 | HideSolutionNode = FALSE 22 | EndGlobalSection 23 | EndGlobal 24 | -------------------------------------------------------------------------------- /source/fastHOGLib.suo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashwin/fasthog/377dd96b870ae0ef22c93b5b36950f92214dd399/source/fastHOGLib.suo --------------------------------------------------------------------------------