├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── kMeansCuda.cu ├── kMeansCuda.h ├── main.cpp └── nbproject ├── Makefile-Debug.mk ├── Makefile-Release.mk ├── Makefile-impl.mk ├── Makefile-variables.mk ├── Package-Debug.bash ├── Package-Release.bash ├── configurations.xml ├── private ├── Makefile-variables.mk ├── configurations.xml ├── launcher.properties └── private.xml └── project.xml /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | 6 | # Compiled Dynamic libraries 7 | *.so 8 | *.dylib 9 | 10 | # Compiled Static libraries 11 | *.lai 12 | *.la 13 | *.a 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2005 Wei-keng Liao 4 | Copyright (c) 2011 Serban Giuroiu 5 | Copyright (c) 2013 PHAM Hoai Vu 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of 8 | this software and associated documentation files (the "Software"), to deal in 9 | the Software without restriction, including without limitation the rights to 10 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 11 | the Software, and to permit persons to whom the Software is furnished to do so, 12 | subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 19 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 20 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 21 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # There exist several targets which are by default empty and which can be 3 | # used for execution of your targets. These targets are usually executed 4 | # before and after some main targets. They are: 5 | # 6 | # .build-pre: called before 'build' target 7 | # .build-post: called after 'build' target 8 | # .clean-pre: called before 'clean' target 9 | # .clean-post: called after 'clean' target 10 | # .clobber-pre: called before 'clobber' target 11 | # .clobber-post: called after 'clobber' target 12 | # .all-pre: called before 'all' target 13 | # .all-post: called after 'all' target 14 | # .help-pre: called before 'help' target 15 | # .help-post: called after 'help' target 16 | # 17 | # Targets beginning with '.' are not intended to be called on their own. 18 | # 19 | # Main targets can be executed directly, and they are: 20 | # 21 | # build build a specific configuration 22 | # clean remove built files from a configuration 23 | # clobber remove all built files 24 | # all build all configurations 25 | # help print help mesage 26 | # 27 | # Targets .build-impl, .clean-impl, .clobber-impl, .all-impl, and 28 | # .help-impl are implemented in nbproject/makefile-impl.mk. 29 | # 30 | # Available make variables: 31 | # 32 | # CND_BASEDIR base directory for relative paths 33 | # CND_DISTDIR default top distribution directory (build artifacts) 34 | # CND_BUILDDIR default top build directory (object files, ...) 
35 | # CONF name of current configuration 36 | # CND_PLATFORM_${CONF} platform name (current configuration) 37 | # CND_ARTIFACT_DIR_${CONF} directory of build artifact (current configuration) 38 | # CND_ARTIFACT_NAME_${CONF} name of build artifact (current configuration) 39 | # CND_ARTIFACT_PATH_${CONF} path to build artifact (current configuration) 40 | # CND_PACKAGE_DIR_${CONF} directory of package (current configuration) 41 | # CND_PACKAGE_NAME_${CONF} name of package (current configuration) 42 | # CND_PACKAGE_PATH_${CONF} path to package (current configuration) 43 | # 44 | # NOCDDL 45 | 46 | 47 | # Environment 48 | MKDIR=mkdir 49 | CP=cp 50 | CCADMIN=CCadmin 51 | 52 | 53 | # build 54 | build: .build-post 55 | 56 | .build-pre: 57 | # Add your pre 'build' code here... 58 | 59 | .build-post: .build-impl 60 | # Add your post 'build' code here... 61 | 62 | 63 | # clean 64 | clean: .clean-post 65 | 66 | .clean-pre: 67 | # Add your pre 'clean' code here... 68 | 69 | .clean-post: .clean-impl 70 | # Add your post 'clean' code here... 71 | 72 | 73 | # clobber 74 | clobber: .clobber-post 75 | 76 | .clobber-pre: 77 | # Add your pre 'clobber' code here... 78 | 79 | .clobber-post: .clobber-impl 80 | # Add your post 'clobber' code here... 81 | 82 | 83 | # all 84 | all: .all-post 85 | 86 | .all-pre: 87 | # Add your pre 'all' code here... 88 | 89 | .all-post: .all-impl 90 | # Add your post 'all' code here... 91 | 92 | 93 | # build tests 94 | build-tests: .build-tests-post 95 | 96 | .build-tests-pre: 97 | # Add your pre 'build-tests' code here... 98 | 99 | .build-tests-post: .build-tests-impl 100 | # Add your post 'build-tests' code here... 101 | 102 | 103 | # run tests 104 | test: .test-post 105 | 106 | .test-pre: build-tests 107 | # Add your pre 'test' code here... 108 | 109 | .test-post: .test-impl 110 | # Add your post 'test' code here... 111 | 112 | 113 | # help 114 | help: .help-post 115 | 116 | .help-pre: 117 | # Add your pre 'help' code here... 
118 | 119 | .help-post: .help-impl 120 | # Add your post 'help' code here... 121 | 122 | 123 | 124 | # include project implementation makefile 125 | include nbproject/Makefile-impl.mk 126 | 127 | # include project make variables 128 | include nbproject/Makefile-variables.mk 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | kmeans_cuda 2 | =========== 3 | 4 | CUDA implementation of k-means 5 | 6 | The original version of k-means in CUDA was made available by Serban Giuroiu at https://github.com/serban/kmeans. 7 | 8 | However Serban used pointer of pointers to represent a 2D matrix, which might not be very convenient in some cases. Moreover in my application, I have the data matrix on the device memory already, and the matrix is stored in column major order (to be used in CUBLAS and other CUDA libraries). Therefore I made some changes to Serban's implementation, concretely: 9 | 10 | 1. The function now works with column major matrix stored in device memory, and the result is also stored in device memory. This reduces the overhead caused by transposing the matrix in Serban's code, and makes it easier to integrate k-means in other applications. 11 | 2. A simple CUDA kernel is added for updating the cluster centroids after each iteration. This reduces the overhead caused by multiple memory transfers at each iteration. However I was lazy and this kernel (called `update_cluster`) has not been well optimized. 12 | 3. The `membership` array can be set to `NULL` when calling the function if you don't want to have it in the results. 13 | 4. Added a parameter for the maximum number of k-means iterations. 14 | 15 | With the new kernel, the program seems to be faster. I already included a simple test case and benchmark in `main.cpp`, you can compile and run it yourself. Serban's original version is also included. 
16 | -------------------------------------------------------------------------------- /kMeansCuda.cu: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 2 | /* File: cuda_kmeans.cu (CUDA version) */ 3 | /* Description: Implementation of simple k-means clustering algorithm */ 4 | /* This program takes an array of N data objects, each with */ 5 | /* M coordinates and performs a k-means clustering given a */ 6 | /* user-provided value of the number of clusters (K). The */ 7 | /* clustering results are saved in 2 arrays: */ 8 | /* 1. a returned array of size [K][N] indicating the center */ 9 | /* coordinates of K clusters */ 10 | /* 2. membership[N] stores the cluster center ids, each */ 11 | /* corresponding to the cluster a data object is assigned */ 12 | /* */ 13 | /* Author: Wei-keng Liao */ 14 | /* ECE Department, Northwestern University */ 15 | /* email: wkliao@ece.northwestern.edu */ 16 | /* Copyright, 2005, Wei-keng Liao */ 17 | /* */ 18 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 19 | 20 | // Copyright (c) 2005 Wei-keng Liao 21 | // Copyright (c) 2011 Serban Giuroiu 22 | // Copyright (c) 2013 Vu Pham 23 | // 24 | // Permission is hereby granted, free of charge, to any person obtaining a copy 25 | // of this software and associated documentation files (the "Software"), to deal 26 | // in the Software without restriction, including without limitation the rights 27 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 28 | // copies of the Software, and to permit persons to whom the Software is 29 | // furnished to do so, subject to the following conditions: 30 | // 31 | // The above copyright notice and this permission notice shall be included in 32 | // all copies or substantial portions of the Software. 
33 | // 34 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 35 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 36 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 37 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 38 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 39 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 40 | // THE SOFTWARE. 41 | 42 | // ----------------------------------------------------------------------------- 43 | 44 | #include 45 | #include 46 | #include 47 | 48 | #include "kMeansCuda.h" 49 | 50 | namespace cuda 51 | { 52 | 53 | void get_kernel_config_given_ratios(int sz1, int sz2, dim3& szGrid, dim3& szBlock 54 | , int& rowPerThread, int& colPerThread 55 | , int nThreadXRatio, int nThreadYRatio) 56 | { 57 | szBlock.x = std::min(sz1, nThreadXRatio); 58 | szBlock.y = std::min(sz2, nThreadYRatio); 59 | szBlock.z = 1; 60 | szGrid.x = szGrid.y = szGrid.z = 1; 61 | colPerThread = rowPerThread = 1; 62 | 63 | if (sz1 > nThreadXRatio || sz2 > nThreadYRatio) 64 | { 65 | int ratio = sz1/nThreadXRatio, k; 66 | for (k = 1; (1 << k) <= ratio; ++k) 67 | { 68 | rowPerThread = (2 << (k/2)); 69 | } 70 | //rowPerThread = 2 << (int)(std::log(std::sqrt((float)sz1/nThreadX))/std::log((float)2)); 71 | szGrid.x = (sz1 + szBlock.x*rowPerThread - 1) / (szBlock.x*rowPerThread); 72 | 73 | ratio = sz2/nThreadYRatio; 74 | for (k = 1; (1 << k) <= ratio; ++k) 75 | { 76 | colPerThread = (2 << (k/2)); 77 | } 78 | //colPerThread = 2 << (int)(std::log(std::sqrt((float)sz2/nThreadY))/std::log((float)2)); 79 | szGrid.y = (sz2 + szBlock.y*colPerThread - 1) / (szBlock.y*colPerThread); 80 | } 81 | assert(szGrid.x*szBlock.x*rowPerThread >= sz1); 82 | assert(szGrid.y*szBlock.y*colPerThread >= sz2); 83 | } 84 | 85 | void get_kernel_config(int sz1, int sz2, dim3& szGrid, dim3& szBlock 86 | , int& 
rowPerThread, int& colPerThread) 87 | { 88 | // CUDA 2.x: maximum 1024 threads/block. CUDA < 2.x: 512 threads/block 89 | 90 | int nThreadX, nThreadY; 91 | if (sz1 / sz2 >= 2) 92 | { 93 | nThreadX = 64; nThreadY = 16; 94 | } 95 | else if (sz2 / sz1 >= 2) 96 | { 97 | nThreadX = 16; nThreadY = 64; 98 | } 99 | else 100 | { 101 | nThreadX = nThreadY = 32; 102 | } 103 | get_kernel_config_given_ratios(sz1, sz2, szGrid, szBlock 104 | , rowPerThread, colPerThread, nThreadX, nThreadY); 105 | } 106 | 107 | /******************************************************************************/ 108 | 109 | 110 | static inline int nextPowerOfTwo(int n) { 111 | n--; 112 | 113 | n = n >> 1 | n; 114 | n = n >> 2 | n; 115 | n = n >> 4 | n; 116 | n = n >> 8 | n; 117 | n = n >> 16 | n; 118 | // n = n >> 32 | n; // For 64-bit ints 119 | 120 | return ++n; 121 | } 122 | 123 | /*----< euclid_dist_2() >----------------------------------------------------*/ 124 | /* square of Euclid distance between two multi-dimensional points */ 125 | __host__ __device__ inline static 126 | float euclid_dist_2(int numCoords, 127 | int numObjs, 128 | int numClusters, 129 | float *objects, // [numCoords][numObjs] 130 | float *clusters, // [numCoords][numClusters] 131 | int objectId, 132 | int clusterId) 133 | { 134 | int i; 135 | float ans=0.0; 136 | 137 | for (i = 0; i < numCoords; i++) { 138 | ans += (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]) * 139 | (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]); 140 | } 141 | 142 | return(ans); 143 | } 144 | 145 | /*----< find_nearest_cluster() >---------------------------------------------*/ 146 | __global__ static 147 | void find_nearest_cluster(int numCoords, 148 | int numObjs, 149 | int numClusters, 150 | float *objects, // [numCoords][numObjs] 151 | float *deviceClusters, // [numCoords][numClusters] 152 | int *membership, // [numObjs] 153 | int *intermediates) 154 | { 155 | extern __shared__ char sharedMemory[]; 
156 | 157 | // The type chosen for membershipChanged must be large enough to support 158 | // reductions! There are blockDim.x elements, one for each thread in the 159 | // block. See numThreadsPerClusterBlock in cuda_kmeans(). 160 | unsigned char *membershipChanged = (unsigned char *)sharedMemory; 161 | #if BLOCK_SHARED_MEM_OPTIMIZATION 162 | float *clusters = (float *)(sharedMemory + blockDim.x); 163 | #else 164 | float *clusters = deviceClusters; 165 | #endif 166 | 167 | membershipChanged[threadIdx.x] = 0; 168 | 169 | #if BLOCK_SHARED_MEM_OPTIMIZATION 170 | // BEWARE: We can overrun our shared memory here if there are too many 171 | // clusters or too many coordinates! For reference, a Tesla C1060 has 16 172 | // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of 173 | // shared memory per block. 174 | for (int i = threadIdx.x; i < numClusters; i += blockDim.x) { 175 | for (int j = 0; j < numCoords; j++) { 176 | clusters[numClusters * j + i] = deviceClusters[numClusters * j + i]; 177 | } 178 | } 179 | __syncthreads(); 180 | #endif 181 | 182 | int objectId = blockDim.x * blockIdx.x + threadIdx.x; 183 | 184 | if (objectId < numObjs) { 185 | int index, i; 186 | float dist, min_dist; 187 | 188 | /* find the cluster id that has min distance to object */ 189 | index = 0; 190 | min_dist = euclid_dist_2(numCoords, numObjs, numClusters, 191 | objects, clusters, objectId, 0); 192 | 193 | for (i=1; i 0; s >>= 1) { 214 | if (threadIdx.x < s) { 215 | membershipChanged[threadIdx.x] += 216 | membershipChanged[threadIdx.x + s]; 217 | } 218 | __syncthreads(); 219 | } 220 | 221 | if (threadIdx.x == 0) { 222 | intermediates[blockIdx.x] = membershipChanged[0]; 223 | } 224 | } 225 | } 226 | 227 | __global__ static 228 | void compute_delta(int *deviceIntermediates, 229 | int numIntermediates, // The actual number of intermediates 230 | int numIntermediates2) // The next power of two 231 | { 232 | // The number of elements in this array should be equal to 233 | // 
numIntermediates2, the number of threads launched. It *must* be a power 234 | // of two! 235 | extern __shared__ unsigned int intermediates[]; 236 | 237 | // Copy global intermediate values into shared memory. 238 | intermediates[threadIdx.x] = 239 | (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0; 240 | 241 | __syncthreads(); 242 | 243 | // numIntermediates2 *must* be a power of two! 244 | for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1) { 245 | if (threadIdx.x < s) { 246 | intermediates[threadIdx.x] += intermediates[threadIdx.x + s]; 247 | } 248 | __syncthreads(); 249 | } 250 | 251 | if (threadIdx.x == 0) { 252 | deviceIntermediates[0] = intermediates[0]; 253 | } 254 | } 255 | 256 | #define malloc2D(name, xDim, yDim, type) do { \ 257 | name = (type **)malloc(xDim * sizeof(type *)); \ 258 | assert(name != NULL); \ 259 | name[0] = (type *)malloc(xDim * yDim * sizeof(type)); \ 260 | assert(name[0] != NULL); \ 261 | for (size_t i = 1; i < xDim; i++) \ 262 | name[i] = name[i-1] + yDim; \ 263 | } while (0) 264 | 265 | 266 | /*----< cuda_kmeans() >-------------------------------------------------------*/ 267 | // 268 | // ---------------------------------------- 269 | // DATA LAYOUT 270 | // 271 | // objects [numObjs][numCoords] 272 | // clusters [numClusters][numCoords] 273 | // dimObjects [numCoords][numObjs] 274 | // dimClusters [numCoords][numClusters] 275 | // newClusters [numCoords][numClusters] 276 | // deviceObjects [numCoords][numObjs] 277 | // deviceClusters [numCoords][numClusters] 278 | // ---------------------------------------- 279 | // 280 | /* return an array of cluster centers of size [numClusters][numCoords] */ 281 | float** kMeansHost(float **objects, /* in: [numObjs][numCoords] */ 282 | int numCoords, /* no. features */ 283 | int numObjs, /* no. objects */ 284 | int numClusters, /* no. 
clusters */ 285 | float threshold, /* % objects change membership */ 286 | int *membership, /* out: [numObjs] */ 287 | int *loop_iterations) 288 | { 289 | int i, j, index, loop=0; 290 | int *newClusterSize; /* [numClusters]: no. objects assigned in each 291 | new cluster */ 292 | float delta; /* % of objects change their clusters */ 293 | float **dimObjects; 294 | float **clusters; /* out: [numClusters][numCoords] */ 295 | float **dimClusters; 296 | float **newClusters; /* [numCoords][numClusters] */ 297 | 298 | float *deviceObjects; 299 | float *deviceClusters; 300 | int *deviceMembership; 301 | int *deviceIntermediates; 302 | 303 | // Copy objects given in [numObjs][numCoords] layout to new 304 | // [numCoords][numObjs] layout 305 | malloc2D(dimObjects, numCoords, numObjs, float); 306 | for (i = 0; i < numCoords; i++) { 307 | for (j = 0; j < numObjs; j++) { 308 | dimObjects[i][j] = objects[j][i]; 309 | } 310 | } 311 | 312 | /* pick first numClusters elements of objects[] as initial cluster centers*/ 313 | malloc2D(dimClusters, numCoords, numClusters, float); 314 | for (i = 0; i < numCoords; i++) { 315 | for (j = 0; j < numClusters; j++) { 316 | dimClusters[i][j] = dimObjects[i][j]; 317 | } 318 | } 319 | 320 | /* initialize membership[] */ 321 | for (i=0; i deviceProp.sharedMemPerBlock) { 348 | err("WARNING: Your CUDA hardware has insufficient block shared memory. " 349 | "You need to recompile with BLOCK_SHARED_MEM_OPTIMIZATION=0. 
" 350 | "See the README for details.\n"); 351 | } 352 | #else 353 | const unsigned int clusterBlockSharedDataSize = 354 | numThreadsPerClusterBlock * sizeof(unsigned char); 355 | #endif 356 | 357 | const unsigned int numReductionThreads = 358 | nextPowerOfTwo(numClusterBlocks); 359 | const unsigned int reductionBlockSharedDataSize = 360 | numReductionThreads * sizeof(unsigned int); 361 | 362 | CHECK_CUDA(cudaMalloc(&deviceObjects, numObjs*numCoords*sizeof(float))); 363 | CHECK_CUDA(cudaMalloc(&deviceClusters, numClusters*numCoords*sizeof(float))); 364 | CHECK_CUDA(cudaMalloc(&deviceMembership, numObjs*sizeof(int))); 365 | CHECK_CUDA(cudaMalloc(&deviceIntermediates, numReductionThreads*sizeof(unsigned int))); 366 | 367 | CHECK_CUDA(cudaMemcpy(deviceObjects, dimObjects[0], 368 | numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice)); 369 | CHECK_CUDA(cudaMemcpy(deviceMembership, membership, 370 | numObjs*sizeof(int), cudaMemcpyHostToDevice)); 371 | 372 | do { 373 | CHECK_CUDA(cudaMemcpy(deviceClusters, dimClusters[0], 374 | numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice)); 375 | 376 | find_nearest_cluster 377 | <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>> 378 | (numCoords, numObjs, numClusters, 379 | deviceObjects, deviceClusters, deviceMembership, deviceIntermediates); 380 | 381 | cudaDeviceSynchronize(); 382 | CHECK_CUDA(cudaGetLastError()); 383 | 384 | compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>> 385 | (deviceIntermediates, numClusterBlocks, numReductionThreads); 386 | 387 | cudaDeviceSynchronize(); 388 | CHECK_CUDA(cudaGetLastError()); 389 | 390 | int d; 391 | CHECK_CUDA(cudaMemcpy(&d, deviceIntermediates, 392 | sizeof(int), cudaMemcpyDeviceToHost)); 393 | delta = (float)d; 394 | 395 | CHECK_CUDA(cudaMemcpy(membership, deviceMembership, 396 | numObjs*sizeof(int), cudaMemcpyDeviceToHost)); 397 | 398 | for (i=0; i 0) 414 | dimClusters[j][i] = newClusters[j][i] / newClusterSize[i]; 415 
| newClusters[j][i] = 0.0; /* set back to 0 */ 416 | } 417 | newClusterSize[i] = 0; /* set back to 0 */ 418 | } 419 | 420 | delta /= numObjs; 421 | } while (delta > threshold && loop++ < 500); 422 | 423 | *loop_iterations = loop + 1; 424 | 425 | /* allocate a 2D space for returning variable clusters[] (coordinates 426 | of cluster centers) */ 427 | malloc2D(clusters, numClusters, numCoords, float); 428 | for (i = 0; i < numClusters; i++) { 429 | for (j = 0; j < numCoords; j++) { 430 | clusters[i][j] = dimClusters[j][i]; 431 | } 432 | } 433 | 434 | CHECK_CUDA(cudaFree(deviceObjects)); 435 | CHECK_CUDA(cudaFree(deviceClusters)); 436 | CHECK_CUDA(cudaFree(deviceMembership)); 437 | CHECK_CUDA(cudaFree(deviceIntermediates)); 438 | 439 | free(dimObjects[0]); 440 | free(dimObjects); 441 | free(dimClusters[0]); 442 | free(dimClusters); 443 | free(newClusters[0]); 444 | free(newClusters); 445 | free(newClusterSize); 446 | 447 | return clusters; 448 | } 449 | 450 | /******************************************************************************/ 451 | 452 | __global__ static 453 | void update_cluster(const float* objects, const int* membership, float* clusters 454 | , const int nCoords, const int nObjs, const int nClusters 455 | , const int rowPerThread, const int colPerThread) 456 | { 457 | for (int cIdx = 0; cIdx < colPerThread; ++cIdx) 458 | { 459 | int c = cIdx * gridDim.y * blockDim.y + blockIdx.y * blockDim.y + threadIdx.y; 460 | if (c >= nClusters) 461 | break; 462 | 463 | for (int rIdx = 0; rIdx < rowPerThread; ++rIdx) 464 | { 465 | int r = rIdx * gridDim.x * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; 466 | if (r >= nCoords) 467 | break; 468 | 469 | float sumVal(0); 470 | int clusterCount(0); 471 | for (int i = 0; i < nObjs; ++i) 472 | { 473 | if (membership[i] == c) 474 | { 475 | sumVal += objects[r*nObjs + i]; 476 | clusterCount++; 477 | } 478 | } 479 | if (clusterCount > 0) 480 | clusters[nClusters*r+c] = sumVal / clusterCount; 481 | } 482 | } 483 | } 484 
| 485 | __global__ static 486 | void copy_rows(const float* src, const int sz1, const int sz2 487 | , const int copiedRows, float* dest 488 | , const int rowPerThread, const int colPerThread) 489 | { 490 | for (int rIdx = 0; rIdx < rowPerThread; ++rIdx) 491 | { 492 | int r = rIdx * gridDim.x * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; 493 | if (r >= copiedRows) 494 | break; 495 | 496 | for (int cIdx = 0; cIdx < colPerThread; ++cIdx) 497 | { 498 | int c = cIdx * gridDim.y * blockDim.y + blockIdx.y * blockDim.y + threadIdx.y; 499 | if (c >= sz2) 500 | break; 501 | dest[c*copiedRows+r] = src[c*sz1+r]; 502 | } 503 | } 504 | } 505 | 506 | int kMeans(float *deviceObjects, /* in: [numObjs][numCoords] */ 507 | int numCoords, /* no. features */ 508 | int numObjs, /* no. objects */ 509 | int numClusters, /* no. clusters */ 510 | float threshold, /* % objects change membership */ 511 | int maxLoop, /* maximum number of loops */ 512 | int *membership, /* out: [numObjs] */ 513 | float *deviceClusters) 514 | { 515 | int loop(0); 516 | float delta; /* % of objects change their clusters */ 517 | int *deviceMembership; 518 | int *deviceIntermediates; 519 | 520 | CHECK_PARAM(deviceClusters, "deviceClusters cannot be NULL"); 521 | 522 | // To support reduction, numThreadsPerClusterBlock *must* be a power of 523 | // two, and it *must* be no larger than the number of bits that will 524 | // fit into an unsigned char, the type used to keep track of membership 525 | // changes in the kernel. 
526 | const unsigned int numThreadsPerClusterBlock = 128; 527 | const unsigned int numClusterBlocks = 528 | (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock; 529 | #if BLOCK_SHARED_MEM_OPTIMIZATION 530 | const unsigned int clusterBlockSharedDataSize = 531 | numThreadsPerClusterBlock * sizeof(unsigned char) + 532 | numClusters * numCoords * sizeof(float); 533 | 534 | cudaDeviceProp deviceProp; 535 | int deviceNum; 536 | cudaGetDevice(&deviceNum); 537 | cudaGetDeviceProperties(&deviceProp, deviceNum); 538 | 539 | if (clusterBlockSharedDataSize > deviceProp.sharedMemPerBlock) { 540 | err("WARNING: Your CUDA hardware has insufficient block shared memory. " 541 | "You need to recompile with BLOCK_SHARED_MEM_OPTIMIZATION=0. " 542 | "See the README for details.\n"); 543 | } 544 | #else 545 | const unsigned int clusterBlockSharedDataSize = 546 | numThreadsPerClusterBlock * sizeof(unsigned char); 547 | #endif 548 | 549 | const unsigned int numReductionThreads = nextPowerOfTwo(numClusterBlocks); 550 | const unsigned int reductionBlockSharedDataSize = numReductionThreads * sizeof(unsigned int); 551 | 552 | CHECK_CUDA(cudaMalloc(&deviceMembership, numObjs*sizeof(int))); 553 | CHECK_CUDA(cudaMalloc(&deviceIntermediates, numReductionThreads*sizeof(unsigned int))); 554 | 555 | // initialize membership[] 556 | if (membership) 557 | { 558 | for (int i=0; i>>(deviceObjects, numObjs, numCoords 580 | , numClusters, deviceClusters, rowPerThread, colPerThread); 581 | 582 | do 583 | { 584 | find_nearest_cluster 585 | <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>> 586 | (numCoords, numObjs, numClusters, 587 | deviceObjects, deviceClusters, deviceMembership, deviceIntermediates); 588 | 589 | //cudaDeviceSynchronize(); 590 | //CHECK_CUDA(cudaGetLastError()); 591 | 592 | compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>> 593 | (deviceIntermediates, numClusterBlocks, numReductionThreads); 594 | 595 | 
//cudaDeviceSynchronize(); 596 | //CHECK_CUDA(cudaGetLastError()); 597 | 598 | get_kernel_config(numCoords, numClusters, szGrid, szBlock, rowPerThread, colPerThread); 599 | 600 | update_cluster <<< szGrid, szBlock >>> (deviceObjects, deviceMembership 601 | , deviceClusters, numCoords, numObjs, numClusters, rowPerThread, colPerThread); 602 | 603 | cudaDeviceSynchronize(); 604 | CHECK_CUDA(cudaGetLastError()); 605 | 606 | // inefficient memory transfer 607 | int d; 608 | CHECK_CUDA(cudaMemcpy(&d, deviceIntermediates, 609 | sizeof(int), cudaMemcpyDeviceToHost)); 610 | delta = (float)d/numObjs; 611 | } 612 | while (delta > threshold && loop++ < maxLoop); 613 | 614 | if (membership) 615 | { 616 | CHECK_CUDA(cudaMemcpy(membership, deviceMembership, 617 | numObjs*sizeof(int), cudaMemcpyDeviceToHost)); 618 | } 619 | CHECK_CUDA(cudaFree(deviceMembership)); 620 | CHECK_CUDA(cudaFree(deviceIntermediates)); 621 | 622 | return (loop + 1); 623 | } 624 | 625 | } -------------------------------------------------------------------------------- /kMeansCuda.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: kMeansCuda.h 3 | * Author: hvpham 4 | * 5 | * Created on December 22, 2013, 12:27 AM 6 | */ 7 | 8 | #ifndef KMEANSCUDA_H 9 | #define KMEANSCUDA_H 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | namespace cuda 16 | { 17 | 18 | inline void checkCudaError(cudaError_t err 19 | , char const * file, unsigned int line) 20 | { 21 | if (err != cudaSuccess) 22 | { 23 | std::stringstream ss; 24 | ss << "CUDA error " << err << " at " << file << ":" << line; 25 | throw std::runtime_error(ss.str()); 26 | } 27 | } 28 | 29 | inline void check(bool bTrue, const char* msg 30 | , char const * file, unsigned int line) 31 | { 32 | if (!bTrue) 33 | { 34 | std::stringstream ss; 35 | ss << "Error: \"" << msg << "\" at " << file << ":" << line; 36 | throw std::runtime_error(ss.str()); 37 | } 38 | } 39 | 40 | #define CHECK_PARAM(x, msg) 
cuda::check((x), (msg), __FILE__, __LINE__) 41 | #define CHECK_CUDA(cudaError) cuda::checkCudaError((cudaError), __FILE__, __LINE__) 42 | 43 | // device memory, column-majored 44 | int kMeans(float *deviceObjects, /* in: [numObjs][numCoords] */ 45 | int numCoords, /* no. features */ 46 | int numObjs, /* no. objects */ 47 | int numClusters, /* no. clusters */ 48 | float threshold, /* % objects change membership */ 49 | int maxLoop, /* maximum number of loops */ 50 | int *membership, /* out: [numObjs] */ 51 | float *deviceClusters); 52 | 53 | // original version: host memory, row-majored 54 | float** kMeansHost(float **objects, /* in: [numObjs][numCoords] */ 55 | int numCoords, /* no. features */ 56 | int numObjs, /* no. objects */ 57 | int numClusters, /* no. clusters */ 58 | float threshold, /* % objects change membership */ 59 | int *membership, /* out: [numObjs] */ 60 | int *loop_iterations); 61 | } 62 | 63 | #endif /* KMEANSCUDA_H */ 64 | 65 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * File: main.cpp 3 | * Author: hvpham 4 | * 5 | * Created on December 22, 2013, 12:27 AM 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "kMeansCuda.h" 14 | 15 | float* createDataColMajored(int sz1, int sz2, bool cudaMalloc) 16 | { 17 | // I use malloc() here just to make it coherent with createDataRowMajored() 18 | // you should use the C++ way... 
19 | float* arr; 20 | if (cudaMalloc) 21 | CHECK_CUDA(cudaMallocHost(&arr, sz1*sz2*sizeof(float), cudaHostAllocDefault)); 22 | else 23 | arr = (float*)malloc(sz1*sz2*sizeof(float)); 24 | for (int i = 0; i < sz1; ++i) 25 | for (int j = 0; j < sz2; ++j) 26 | { 27 | arr[sz1*j + i] = i*100 + j; 28 | } 29 | return arr; 30 | } 31 | 32 | float** createDataRowMajored(int sz1, int sz2) 33 | { 34 | float** ret = (float**)malloc(sz1*sizeof(float*)); 35 | ret[0] = (float*)malloc(sz1*sz2*sizeof(float)); 36 | for (int i = 1; i < sz1; ++i) 37 | { 38 | ret[i] = ret[i-1] + sz2; 39 | } 40 | for (int i = 0; i < sz1; ++i) 41 | for (int j = 0; j < sz2; ++j) 42 | { 43 | ret[i][j] = i*100 + j; 44 | } 45 | return ret; 46 | } 47 | 48 | float* callkMeans1(float* hostData, int nObjs, int nDim, int nClusters, int*& membership) 49 | { 50 | float* devData, *devClusters, *hostClusters; 51 | CHECK_CUDA(cudaMalloc(&devData, nObjs*nDim*sizeof(float))); 52 | CHECK_CUDA(cudaMemcpy(devData, hostData, nObjs*nDim*sizeof(float), cudaMemcpyHostToDevice)); 53 | CHECK_CUDA(cudaMalloc(&devClusters, nClusters*nDim*sizeof(float))); 54 | if (membership) 55 | membership = new int[nObjs]; 56 | 57 | cuda::kMeans(devData, nDim, nObjs, nClusters, 0, 500, membership, devClusters); 58 | hostClusters = new float[nClusters*nDim*sizeof(float)]; 59 | 60 | CHECK_CUDA(cudaDeviceSynchronize()); 61 | CHECK_CUDA(cudaGetLastError()); 62 | CHECK_CUDA(cudaMemcpy(hostClusters, devClusters, nClusters*nDim*sizeof(float), cudaMemcpyDeviceToHost)); 63 | CHECK_CUDA(cudaFree(devData)); 64 | CHECK_CUDA(cudaFree(devClusters)); 65 | 66 | return hostClusters; 67 | } 68 | 69 | float** callkMeans2(float** hostData, int nObjs, int nDim, int nClusters, int*& membership) 70 | { 71 | int loops; 72 | membership = new int[nObjs]; 73 | return cuda::kMeansHost(hostData, nDim, nObjs, nClusters, 0, membership, &loops); 74 | } 75 | 76 | void checkCorrectness() 77 | { 78 | const int sz1 = 1024, sz2 = 1024, nClusters = 10; 79 | float* dataCm = 
createDataColMajored(sz1, sz2, false); 80 | float** dataRm = createDataRowMajored(sz1, sz2); 81 | int* membership1, *membership2; 82 | float *clusters1, **clusters2, *clusters3; 83 | 84 | clusters1 = callkMeans1(dataCm, sz1, sz2, nClusters, membership1); 85 | clusters2 = callkMeans2(dataRm, sz1, sz2, nClusters, membership2); 86 | 87 | for (int i = 0; i < sz1; ++i) 88 | { 89 | CHECK_PARAM(membership1[i] == membership2[i], "membership"); 90 | //if(membership1[i] != membership2[i]) 91 | // std::cout << "Membership " << i << " " << membership1[i] << " " << membership2[i] << std::endl; 92 | } 93 | 94 | for (int i = 0; i < nClusters; ++i) 95 | for (int j = 0; j < sz2; ++j) 96 | { 97 | CHECK_PARAM(std::abs(clusters1[nClusters*j + i] - clusters2[i][j]) <= 1E-2, "clusters"); 98 | //if(std::abs(clusters1[nClusters*j + i] - clusters2[i][j]) > 1E-2) 99 | // std::cout << "Clusters " << i << " " << j << " " << clusters1[nClusters*j + i] 100 | // << " " << clusters2[i][j] 101 | // << " " << clusters1[nClusters*j + i] - clusters2[i][j] << std::endl; 102 | } 103 | 104 | // membership = NULL is also fine 105 | int* dummyMembership = NULL; 106 | clusters3 = callkMeans1(dataCm, sz1, sz2, nClusters, dummyMembership); 107 | for (int i = 0; i < nClusters; ++i) 108 | for (int j = 0; j < sz2; ++j) 109 | { 110 | CHECK_PARAM(std::abs(clusters3[nClusters*j + i] - clusters2[i][j]) <= 1E-2, "clusters"); 111 | } 112 | 113 | delete[] membership1; 114 | delete[] membership2; 115 | delete[] clusters1; 116 | delete[] clusters3; 117 | free(clusters2[0]); 118 | free(clusters2); 119 | free(dataCm); 120 | free(dataRm[0]); 121 | free(dataRm); 122 | } 123 | 124 | void benchMark() 125 | { 126 | const int sz1 = 1024, sz2 = 1024, nClusters = 10; 127 | float* dataCm = createDataColMajored(sz1, sz2, true); 128 | float** dataRm = createDataRowMajored(sz1, sz2); 129 | int* membership1, *membership2; 130 | float *clusters1, **clusters2; 131 | const int TIMES = 100; 132 | 133 | { 134 | clock_t begin = clock(); 135 
| for (int i = 0; i < TIMES; ++i) 136 | clusters1 = callkMeans1(dataCm, sz1, sz2, nClusters, membership1); /* NOTE(review): each iteration overwrites clusters1/membership1 with what the delete[] calls below imply are freshly allocated buffers, yet only the final pair is released after the loop — the first TIMES-1 results appear to leak; confirm callkMeans1's allocation contract before relying on these timings. */ 137 | double elapsed_secs = double(clock() - begin) / CLOCKS_PER_SEC; 138 | std::cout << "callkMeans1: " << elapsed_secs << " secs" << std::endl; 139 | } 140 | 141 | { 142 | clock_t begin = clock(); 143 | for (int i = 0; i < TIMES; ++i) 144 | clusters2 = callkMeans2(dataRm, sz1, sz2, nClusters, membership2); /* NOTE(review): same apparent per-iteration leak as the callkMeans1 loop above. */ 145 | double elapsed_secs = double(clock() - begin) / CLOCKS_PER_SEC; 146 | std::cout << "callkMeans2: " << elapsed_secs << " secs" << std::endl; 147 | } 148 | 149 | delete[] membership1; 150 | delete[] membership2; 151 | delete[] clusters1; 152 | free(clusters2[0]); 153 | free(clusters2); 154 | CHECK_CUDA(cudaFreeHost(dataCm)); /* released via cudaFreeHost, so dataCm is presumably pinned host memory — unlike the plain free() used for the row-majored buffers. */ 155 | free(dataRm[0]); 156 | free(dataRm); 157 | } 158 | 159 | /* Entry point: first validates that both k-means call paths agree (checkCorrectness), then times them (benchMark); the commented figures below record one observed run. */ int main(int argc, char** argv) 160 | { 161 | checkCorrectness(); 162 | benchMark(); 163 | // callkMeans1: 116.61 secs 164 | // callkMeans2: 143.17 secs 165 | 166 | return 0; 167 | } 168 | 169 | -------------------------------------------------------------------------------- /nbproject/Makefile-Debug.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Generated Makefile - do not edit! 3 | # 4 | # Edit the Makefile in the project folder instead (../Makefile). Each target 5 | # has a -pre and a -post target defined where you can add customized code. 6 | # 7 | # This makefile implements configuration specific macros and targets. 
8 | 9 | 10 | # Environment 11 | MKDIR=mkdir 12 | CP=cp 13 | GREP=grep 14 | NM=nm 15 | CCADMIN=CCadmin 16 | RANLIB=ranlib 17 | CC=gcc 18 | CCC=nvcc 19 | CXX=nvcc 20 | FC=gfortran 21 | AS=as 22 | 23 | # Macros 24 | CND_PLATFORM=CUDA-Linux-x86 25 | CND_DLIB_EXT=so 26 | CND_CONF=Debug 27 | CND_DISTDIR=dist 28 | CND_BUILDDIR=build 29 | 30 | # Include project Makefile 31 | include Makefile 32 | 33 | # Object Directory 34 | OBJECTDIR=${CND_BUILDDIR}/${CND_CONF}/${CND_PLATFORM} 35 | 36 | # Object Files 37 | OBJECTFILES= \ 38 | ${OBJECTDIR}/kMeansCuda.o \ 39 | ${OBJECTDIR}/main.o 40 | 41 | 42 | # C Compiler Flags 43 | CFLAGS= 44 | 45 | # CC Compiler Flags 46 | CCFLAGS= 47 | CXXFLAGS= 48 | 49 | # Fortran Compiler Flags 50 | FFLAGS= 51 | 52 | # Assembler Flags 53 | ASFLAGS= 54 | 55 | # Link Libraries and Options 56 | LDLIBSOPTIONS= 57 | 58 | # Build Targets 59 | .build-conf: ${BUILD_SUBPROJECTS} 60 | "${MAKE}" -f nbproject/Makefile-${CND_CONF}.mk ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/kmeans_cuda 61 | 62 | ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/kmeans_cuda: ${OBJECTFILES} 63 | ${MKDIR} -p ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM} 64 | ${LINK.cc} -o ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/kmeans_cuda ${OBJECTFILES} ${LDLIBSOPTIONS} 65 | 66 | ${OBJECTDIR}/kMeansCuda.o: kMeansCuda.cu 67 | ${MKDIR} -p ${OBJECTDIR} 68 | $(COMPILE.cc) -g -o ${OBJECTDIR}/kMeansCuda.o kMeansCuda.cu 69 | 70 | ${OBJECTDIR}/main.o: main.cpp 71 | ${MKDIR} -p ${OBJECTDIR} 72 | $(COMPILE.cc) -g -o ${OBJECTDIR}/main.o main.cpp 73 | 74 | # Subprojects 75 | .build-subprojects: 76 | 77 | # Clean Targets 78 | .clean-conf: ${CLEAN_SUBPROJECTS} 79 | ${RM} -r ${CND_BUILDDIR}/${CND_CONF} 80 | ${RM} ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/kmeans_cuda 81 | 82 | # Subprojects 83 | .clean-subprojects: 84 | -------------------------------------------------------------------------------- /nbproject/Makefile-Release.mk: -------------------------------------------------------------------------------- 1 | # 2 
| # Generated Makefile - do not edit! 3 | # 4 | # Edit the Makefile in the project folder instead (../Makefile). Each target 5 | # has a -pre and a -post target defined where you can add customized code. 6 | # 7 | # This makefile implements configuration specific macros and targets. 8 | 9 | 10 | # Environment 11 | MKDIR=mkdir 12 | CP=cp 13 | GREP=grep 14 | NM=nm 15 | CCADMIN=CCadmin 16 | RANLIB=ranlib 17 | CC=gcc 18 | CCC=nvcc 19 | CXX=nvcc 20 | FC=gfortran 21 | AS=as 22 | 23 | # Macros 24 | CND_PLATFORM=CUDA-Linux-x86 25 | CND_DLIB_EXT=so 26 | CND_CONF=Release 27 | CND_DISTDIR=dist 28 | CND_BUILDDIR=build 29 | 30 | # Include project Makefile 31 | include Makefile 32 | 33 | # Object Directory 34 | OBJECTDIR=${CND_BUILDDIR}/${CND_CONF}/${CND_PLATFORM} 35 | 36 | # Object Files 37 | OBJECTFILES= \ 38 | ${OBJECTDIR}/kMeansCuda.o \ 39 | ${OBJECTDIR}/main.o 40 | 41 | 42 | # C Compiler Flags 43 | CFLAGS= 44 | 45 | # CC Compiler Flags 46 | CCFLAGS= 47 | CXXFLAGS= 48 | 49 | # Fortran Compiler Flags 50 | FFLAGS= 51 | 52 | # Assembler Flags 53 | ASFLAGS= 54 | 55 | # Link Libraries and Options 56 | LDLIBSOPTIONS= 57 | 58 | # Build Targets 59 | .build-conf: ${BUILD_SUBPROJECTS} 60 | "${MAKE}" -f nbproject/Makefile-${CND_CONF}.mk ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/kmeans_cuda 61 | 62 | ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/kmeans_cuda: ${OBJECTFILES} 63 | ${MKDIR} -p ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM} 64 | ${LINK.cc} -o ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/kmeans_cuda ${OBJECTFILES} ${LDLIBSOPTIONS} 65 | 66 | ${OBJECTDIR}/kMeansCuda.o: kMeansCuda.cu 67 | ${MKDIR} -p ${OBJECTDIR} 68 | $(COMPILE.cc) -O2 -o ${OBJECTDIR}/kMeansCuda.o kMeansCuda.cu 69 | 70 | ${OBJECTDIR}/main.o: main.cpp 71 | ${MKDIR} -p ${OBJECTDIR} 72 | $(COMPILE.cc) -O2 -o ${OBJECTDIR}/main.o main.cpp 73 | 74 | # Subprojects 75 | .build-subprojects: 76 | 77 | # Clean Targets 78 | .clean-conf: ${CLEAN_SUBPROJECTS} 79 | ${RM} -r ${CND_BUILDDIR}/${CND_CONF} 80 | ${RM} 
${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/kmeans_cuda 81 | 82 | # Subprojects 83 | .clean-subprojects: 84 | -------------------------------------------------------------------------------- /nbproject/Makefile-impl.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Generated Makefile - do not edit! 3 | # 4 | # Edit the Makefile in the project folder instead (../Makefile). Each target 5 | # has a pre- and a post- target defined where you can add customization code. 6 | # 7 | # This makefile implements macros and targets common to all configurations. 8 | # 9 | # NOCDDL 10 | 11 | 12 | # Building and Cleaning subprojects are done by default, but can be controlled with the SUB 13 | # macro. If SUB=no, subprojects will not be built or cleaned. The following macro 14 | # statements set BUILD_SUB-CONF and CLEAN_SUB-CONF to .build-reqprojects-conf 15 | # and .clean-reqprojects-conf unless SUB has the value 'no' 16 | SUB_no=NO 17 | SUBPROJECTS=${SUB_${SUB}} 18 | BUILD_SUBPROJECTS_=.build-subprojects 19 | BUILD_SUBPROJECTS_NO= 20 | BUILD_SUBPROJECTS=${BUILD_SUBPROJECTS_${SUBPROJECTS}} 21 | CLEAN_SUBPROJECTS_=.clean-subprojects 22 | CLEAN_SUBPROJECTS_NO= 23 | CLEAN_SUBPROJECTS=${CLEAN_SUBPROJECTS_${SUBPROJECTS}} 24 | 25 | 26 | # Project Name 27 | PROJECTNAME=kmeans_cuda 28 | 29 | # Active Configuration 30 | DEFAULTCONF=Debug 31 | CONF=${DEFAULTCONF} 32 | 33 | # All Configurations 34 | ALLCONFS=Debug Release 35 | 36 | 37 | # build 38 | .build-impl: .build-pre .validate-impl .depcheck-impl 39 | @#echo "=> Running $@... Configuration=$(CONF)" 40 | "${MAKE}" -f nbproject/Makefile-${CONF}.mk QMAKE=${QMAKE} SUBPROJECTS=${SUBPROJECTS} .build-conf 41 | 42 | 43 | # clean 44 | .clean-impl: .clean-pre .validate-impl .depcheck-impl 45 | @#echo "=> Running $@... 
Configuration=$(CONF)" 46 | "${MAKE}" -f nbproject/Makefile-${CONF}.mk QMAKE=${QMAKE} SUBPROJECTS=${SUBPROJECTS} .clean-conf 47 | 48 | 49 | # clobber 50 | .clobber-impl: .clobber-pre .depcheck-impl 51 | @#echo "=> Running $@..." 52 | for CONF in ${ALLCONFS}; \ 53 | do \ 54 | "${MAKE}" -f nbproject/Makefile-$${CONF}.mk QMAKE=${QMAKE} SUBPROJECTS=${SUBPROJECTS} .clean-conf; \ 55 | done 56 | 57 | # all 58 | .all-impl: .all-pre .depcheck-impl 59 | @#echo "=> Running $@..." 60 | for CONF in ${ALLCONFS}; \ 61 | do \ 62 | "${MAKE}" -f nbproject/Makefile-$${CONF}.mk QMAKE=${QMAKE} SUBPROJECTS=${SUBPROJECTS} .build-conf; \ 63 | done 64 | 65 | # build tests 66 | .build-tests-impl: .build-impl .build-tests-pre 67 | @#echo "=> Running $@... Configuration=$(CONF)" 68 | "${MAKE}" -f nbproject/Makefile-${CONF}.mk SUBPROJECTS=${SUBPROJECTS} .build-tests-conf 69 | 70 | # run tests 71 | .test-impl: .build-tests-impl .test-pre 72 | @#echo "=> Running $@... Configuration=$(CONF)" 73 | "${MAKE}" -f nbproject/Makefile-${CONF}.mk SUBPROJECTS=${SUBPROJECTS} .test-conf 74 | 75 | # dependency checking support 76 | .depcheck-impl: 77 | @echo "# This code depends on make tool being used" >.dep.inc 78 | @if [ -n "${MAKE_VERSION}" ]; then \ 79 | echo "DEPFILES=\$$(wildcard \$$(addsuffix .d, \$${OBJECTFILES}))" >>.dep.inc; \ 80 | echo "ifneq (\$${DEPFILES},)" >>.dep.inc; \ 81 | echo "include \$${DEPFILES}" >>.dep.inc; \ 82 | echo "endif" >>.dep.inc; \ 83 | else \ 84 | echo ".KEEP_STATE:" >>.dep.inc; \ 85 | echo ".KEEP_STATE_FILE:.make.state.\$${CONF}" >>.dep.inc; \ 86 | fi 87 | 88 | # configuration validation 89 | .validate-impl: 90 | @if [ ! -f nbproject/Makefile-${CONF}.mk ]; \ 91 | then \ 92 | echo ""; \ 93 | echo "Error: can not find the makefile for configuration '${CONF}' in project ${PROJECTNAME}"; \ 94 | echo "See 'make help' for details."; \ 95 | echo "Current directory: " `pwd`; \ 96 | echo ""; \ 97 | fi 98 | @if [ ! 
-f nbproject/Makefile-${CONF}.mk ]; \ 99 | then \ 100 | exit 1; \ 101 | fi 102 | 103 | 104 | # help 105 | .help-impl: .help-pre 106 | @echo "This makefile supports the following configurations:" 107 | @echo " ${ALLCONFS}" 108 | @echo "" 109 | @echo "and the following targets:" 110 | @echo " build (default target)" 111 | @echo " clean" 112 | @echo " clobber" 113 | @echo " all" 114 | @echo " help" 115 | @echo "" 116 | @echo "Makefile Usage:" 117 | @echo " make [CONF=] [SUB=no] build" 118 | @echo " make [CONF=] [SUB=no] clean" 119 | @echo " make [SUB=no] clobber" 120 | @echo " make [SUB=no] all" 121 | @echo " make help" 122 | @echo "" 123 | @echo "Target 'build' will build a specific configuration and, unless 'SUB=no'," 124 | @echo " also build subprojects." 125 | @echo "Target 'clean' will clean a specific configuration and, unless 'SUB=no'," 126 | @echo " also clean subprojects." 127 | @echo "Target 'clobber' will remove all built files from all configurations and," 128 | @echo " unless 'SUB=no', also from subprojects." 129 | @echo "Target 'all' will build all configurations and, unless 'SUB=no'," 130 | @echo " also build subprojects." 131 | @echo "Target 'help' prints this message." 132 | @echo "" 133 | 134 | -------------------------------------------------------------------------------- /nbproject/Makefile-variables.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Generated - do not edit! 
3 | # 4 | # NOCDDL 5 | # 6 | CND_BASEDIR=`pwd` 7 | CND_BUILDDIR=build 8 | CND_DISTDIR=dist 9 | # Debug configuration 10 | CND_PLATFORM_Debug=CUDA-Linux-x86 11 | CND_ARTIFACT_DIR_Debug=dist/Debug/CUDA-Linux-x86 12 | CND_ARTIFACT_NAME_Debug=kmeans_cuda 13 | CND_ARTIFACT_PATH_Debug=dist/Debug/CUDA-Linux-x86/kmeans_cuda 14 | CND_PACKAGE_DIR_Debug=dist/Debug/CUDA-Linux-x86/package 15 | CND_PACKAGE_NAME_Debug=kmeanscuda.tar 16 | CND_PACKAGE_PATH_Debug=dist/Debug/CUDA-Linux-x86/package/kmeanscuda.tar 17 | # Release configuration 18 | CND_PLATFORM_Release=CUDA-Linux-x86 19 | CND_ARTIFACT_DIR_Release=dist/Release/CUDA-Linux-x86 20 | CND_ARTIFACT_NAME_Release=kmeans_cuda 21 | CND_ARTIFACT_PATH_Release=dist/Release/CUDA-Linux-x86/kmeans_cuda 22 | CND_PACKAGE_DIR_Release=dist/Release/CUDA-Linux-x86/package 23 | CND_PACKAGE_NAME_Release=kmeanscuda.tar 24 | CND_PACKAGE_PATH_Release=dist/Release/CUDA-Linux-x86/package/kmeanscuda.tar 25 | # 26 | # include compiler specific variables 27 | # 28 | # dmake command 29 | ROOT:sh = test -f nbproject/private/Makefile-variables.mk || \ 30 | (mkdir -p nbproject/private && touch nbproject/private/Makefile-variables.mk) 31 | # 32 | # gmake command 33 | .PHONY: $(shell test -f nbproject/private/Makefile-variables.mk || (mkdir -p nbproject/private && touch nbproject/private/Makefile-variables.mk)) 34 | # 35 | include nbproject/private/Makefile-variables.mk 36 | -------------------------------------------------------------------------------- /nbproject/Package-Debug.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | # 4 | # Generated - do not edit! 
5 | # 6 | 7 | # Macros 8 | TOP=`pwd` 9 | CND_PLATFORM=CUDA-Linux-x86 10 | CND_CONF=Debug 11 | CND_DISTDIR=dist 12 | CND_BUILDDIR=build 13 | CND_DLIB_EXT=so 14 | NBTMPDIR=${CND_BUILDDIR}/${CND_CONF}/${CND_PLATFORM}/tmp-packaging 15 | TMPDIRNAME=tmp-packaging 16 | OUTPUT_PATH=${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/kmeans_cuda 17 | OUTPUT_BASENAME=kmeans_cuda 18 | PACKAGE_TOP_DIR=kmeanscuda/ 19 | 20 | # Functions 21 | function checkReturnCode 22 | { 23 | rc=$? 24 | if [ $rc != 0 ] 25 | then 26 | exit $rc 27 | fi 28 | } 29 | function makeDirectory 30 | # $1 directory path 31 | # $2 permission (optional) 32 | { 33 | mkdir -p "$1" 34 | checkReturnCode 35 | if [ "$2" != "" ] 36 | then 37 | chmod $2 "$1" 38 | checkReturnCode 39 | fi 40 | } 41 | function copyFileToTmpDir 42 | # $1 from-file path 43 | # $2 to-file path 44 | # $3 permission 45 | { 46 | cp "$1" "$2" 47 | checkReturnCode 48 | if [ "$3" != "" ] 49 | then 50 | chmod $3 "$2" 51 | checkReturnCode 52 | fi 53 | } 54 | 55 | # Setup 56 | cd "${TOP}" 57 | mkdir -p ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/package 58 | rm -rf ${NBTMPDIR} 59 | mkdir -p ${NBTMPDIR} 60 | 61 | # Copy files and create directories and links 62 | cd "${TOP}" 63 | makeDirectory "${NBTMPDIR}/kmeanscuda/bin" 64 | copyFileToTmpDir "${OUTPUT_PATH}" "${NBTMPDIR}/${PACKAGE_TOP_DIR}bin/${OUTPUT_BASENAME}" 0755 65 | 66 | 67 | # Generate tar file 68 | cd "${TOP}" 69 | rm -f ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/package/kmeanscuda.tar 70 | cd ${NBTMPDIR} 71 | tar -vcf ../../../../${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/package/kmeanscuda.tar * 72 | checkReturnCode 73 | 74 | # Cleanup 75 | cd "${TOP}" 76 | rm -rf ${NBTMPDIR} 77 | -------------------------------------------------------------------------------- /nbproject/Package-Release.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | # 4 | # Generated - do not edit! 
5 | # 6 | 7 | # Macros 8 | TOP=`pwd` 9 | CND_PLATFORM=CUDA-Linux-x86 10 | CND_CONF=Release 11 | CND_DISTDIR=dist 12 | CND_BUILDDIR=build 13 | CND_DLIB_EXT=so 14 | NBTMPDIR=${CND_BUILDDIR}/${CND_CONF}/${CND_PLATFORM}/tmp-packaging 15 | TMPDIRNAME=tmp-packaging 16 | OUTPUT_PATH=${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/kmeans_cuda 17 | OUTPUT_BASENAME=kmeans_cuda 18 | PACKAGE_TOP_DIR=kmeanscuda/ 19 | 20 | # Functions 21 | function checkReturnCode 22 | { 23 | rc=$? 24 | if [ $rc != 0 ] 25 | then 26 | exit $rc 27 | fi 28 | } 29 | function makeDirectory 30 | # $1 directory path 31 | # $2 permission (optional) 32 | { 33 | mkdir -p "$1" 34 | checkReturnCode 35 | if [ "$2" != "" ] 36 | then 37 | chmod $2 "$1" 38 | checkReturnCode 39 | fi 40 | } 41 | function copyFileToTmpDir 42 | # $1 from-file path 43 | # $2 to-file path 44 | # $3 permission 45 | { 46 | cp "$1" "$2" 47 | checkReturnCode 48 | if [ "$3" != "" ] 49 | then 50 | chmod $3 "$2" 51 | checkReturnCode 52 | fi 53 | } 54 | 55 | # Setup 56 | cd "${TOP}" 57 | mkdir -p ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/package 58 | rm -rf ${NBTMPDIR} 59 | mkdir -p ${NBTMPDIR} 60 | 61 | # Copy files and create directories and links 62 | cd "${TOP}" 63 | makeDirectory "${NBTMPDIR}/kmeanscuda/bin" 64 | copyFileToTmpDir "${OUTPUT_PATH}" "${NBTMPDIR}/${PACKAGE_TOP_DIR}bin/${OUTPUT_BASENAME}" 0755 65 | 66 | 67 | # Generate tar file 68 | cd "${TOP}" 69 | rm -f ${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/package/kmeanscuda.tar 70 | cd ${NBTMPDIR} 71 | tar -vcf ../../../../${CND_DISTDIR}/${CND_CONF}/${CND_PLATFORM}/package/kmeanscuda.tar * 72 | checkReturnCode 73 | 74 | # Cleanup 75 | cd "${TOP}" 76 | rm -rf ${NBTMPDIR} 77 | -------------------------------------------------------------------------------- /nbproject/configurations.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | kMeansCuda.h 8 | 9 | 12 | 13 | 16 | kMeansCuda.cu 17 | main.cpp 18 | 19 | 23 | 24 | 28 | Makefile 29 | 
30 | 31 | Makefile 32 | 33 | 34 | 35 | CUDA|GNU 36 | false 37 | false 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | CUDA|GNU 51 | false 52 | false 53 | 54 | 55 | 56 | 5 57 | 58 | 59 | 5 60 | 61 | 62 | 5 63 | 64 | 65 | 5 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /nbproject/private/Makefile-variables.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Generated - do not edit! 3 | # 4 | # NOCDDL 5 | # 6 | # Debug configuration 7 | # Release configuration 8 | -------------------------------------------------------------------------------- /nbproject/private/configurations.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Makefile 4 | 5 | 6 | 7 | localhost 8 | 2 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | gdb 24 | 25 | 26 | 27 | "${OUTPUT_PATH}" 28 | 29 | "${OUTPUT_PATH}" 30 | 31 | true 32 | 0 33 | 0 34 | 35 | 36 | 37 | 38 | 39 | 40 | localhost 41 | 2 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | gdb 57 | 58 | 59 | 60 | "${OUTPUT_PATH}" 61 | 62 | "${OUTPUT_PATH}" 63 | 64 | true 65 | 0 66 | 0 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /nbproject/private/launcher.properties: -------------------------------------------------------------------------------- 1 | # Launchers File syntax: 2 | # 3 | # [Must-have property line] 4 | # launcher1.runCommand= 5 | # [Optional extra properties] 6 | # launcher1.displayName= 7 | # launcher1.buildCommand= 8 | # launcher1.runDir= 9 | # launcher1.symbolFiles= 10 | # launcher1.env.= 11 | # (If this value is quoted with ` it is handled as a native command which execution result will become the value) 12 | # [Common launcher properties] 13 | # common.runDir= 14 | # (This value is overwritten by a launcher 
specific runDir value if the latter exists) 15 | # common.env.= 16 | # (Environment variables from common launcher are merged with launcher specific variables) 17 | # common.symbolFiles= 18 | # (This value is overwritten by a launcher specific symbolFiles value if the latter exists) 19 | # 20 | # In runDir, symbolFiles and env fields you can use these macros: 21 | # ${PROJECT_DIR} - project directory absolute path 22 | # ${OUTPUT_PATH} - linker output path (relative to project directory path) 23 | # ${OUTPUT_BASENAME}- linker output filename 24 | # ${TESTDIR} - test files directory (relative to project directory path) 25 | # ${OBJECTDIR} - object files directory (relative to project directory path) 26 | # ${CND_DISTDIR} - distribution directory (relative to project directory path) 27 | # ${CND_BUILDDIR} - build directory (relative to project directory path) 28 | # ${CND_PLATFORM} - platform name 29 | # ${CND_CONF} - configuration name 30 | # ${CND_DLIB_EXT} - dynamic library extension 31 | # 32 | # All the project launchers must be listed in the file! 33 | # 34 | # launcher1.runCommand=... 35 | # launcher2.runCommand=... 36 | # ... 37 | # common.runDir=... 
38 | # common.env.KEY=VALUE 39 | 40 | # launcher1.runCommand= -------------------------------------------------------------------------------- /nbproject/private/private.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 1 5 | 0 6 | 7 | 8 | 9 | 10 | file:/home/hvpham/code/kmeans_cuda/kMeansCuda.cu 11 | file:/home/hvpham/code/kmeans_cuda/main.cpp 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /nbproject/project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | org.netbeans.modules.cnd.makeproject 4 | 5 | 6 | kMeansCuda 7 | 8 | cpp,cu 9 | h 10 | UTF-8 11 | 12 | 13 | 14 | 15 | Debug 16 | 1 17 | 18 | 19 | Release 20 | 1 21 | 22 | 23 | 24 | false 25 | 26 | 27 | 28 | 29 | --------------------------------------------------------------------------------