├── LICENSE
├── README
├── README.md
├── centroids.cu
├── centroids.h
├── contributors.txt
├── kmeans.cu
├── kmeans.h
├── labels.cu
├── labels.h
├── orig
│   ├── Makefile
│   └── test.cpp
├── test.cu
├── timer.cu
├── timer.h
└── util.h

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and
distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright
owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities
that control, are controlled by, or are under common control with that entity.
For the purposes of this definition, "control" means (i) the power, direct or
indirect, to cause the direction or management of such entity, whether by
contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising
permissions granted by this License.

"Source" form shall mean the preferred form for making modifications, including
but not limited to software source code, documentation source, and configuration
files.

"Object" form shall mean any form resulting from mechanical transformation or
translation of a Source form, including but not limited to compiled object code,
generated documentation, and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or Object form, made
available under the License, as indicated by a copyright notice that is included
in or attached to the work (an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object form, that
is based on (or derived from) the Work and for which the editorial revisions,
annotations, elaborations, or other modifications represent, as a whole, an
original work of authorship. For the purposes of this License, Derivative Works
shall not include works that remain separable from, or merely link (or bind by
name) to the interfaces of, the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including the original version
of the Work and any modifications or additions to that Work or Derivative Works
thereof, that is intentionally submitted to Licensor for inclusion in the Work
by the copyright owner or by an individual or Legal Entity authorized to submit
on behalf of the copyright owner. For the purposes of this definition,
"submitted" means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems, and
issue tracking systems that are managed by, or on behalf of, the Licensor for
the purpose of discussing and improving the Work, but excluding communication
that is conspicuously marked or otherwise designated in writing by the copyright
owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
of whom a Contribution has been received by Licensor and subsequently
incorporated within the Work.

2. Grant of Copyright License.

Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the Work and such
Derivative Works in Source or Object form.

3. Grant of Patent License.

Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to make, have
made, use, offer to sell, sell, import, and otherwise transfer the Work, where
such license applies only to those patent claims licensable by such Contributor
that are necessarily infringed by their Contribution(s) alone or by combination
of their Contribution(s) with the Work to which such Contribution(s) was
submitted. If You institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work or a
Contribution incorporated within the Work constitutes direct or contributory
patent infringement, then any patent licenses granted to You under this License
for that Work shall terminate as of the date such litigation is filed.

4. Redistribution.

You may reproduce and distribute copies of the Work or Derivative Works thereof
in any medium, with or without modifications, and in Source or Object form,
provided that You meet the following conditions:

You must give any other recipients of the Work or Derivative Works a copy of
this License; and
You must cause any modified files to carry prominent notices stating that You
changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute,
all copyright, patent, trademark, and attribution notices from the Source form
of the Work, excluding those notices that do not pertain to any part of the
Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any
Derivative Works that You distribute must include a readable copy of the
attribution notices contained within such NOTICE file, excluding those notices
that do not pertain to any part of the Derivative Works, in at least one of the
following places: within a NOTICE text file distributed as part of the
Derivative Works; within the Source form or documentation, if provided along
with the Derivative Works; or, within a display generated by the Derivative
Works, if and wherever such third-party notices normally appear. The contents of
the NOTICE file are for informational purposes only and do not modify the
License. You may add Your own attribution notices within Derivative Works that
You distribute, alongside or as an addendum to the NOTICE text from the Work,
provided that such additional attribution notices cannot be construed as
modifying the License.
You may add Your own copyright statement to Your modifications and may provide
additional or different license terms and conditions for use, reproduction, or
distribution of Your modifications, or for any such Derivative Works as a whole,
provided Your use, reproduction, and distribution of the Work otherwise complies
with the conditions stated in this License.

5. Submission of Contributions.

Unless You explicitly state otherwise, any Contribution intentionally submitted
for inclusion in the Work by You to the Licensor shall be under the terms and
conditions of this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify the terms of
any separate license agreement you may have executed with Licensor regarding
such Contributions.

6. Trademarks.

This License does not grant permission to use the trade names, trademarks,
service marks, or product names of the Licensor, except as required for
reasonable and customary use in describing the origin of the Work and
reproducing the content of the NOTICE file.

7. Disclaimer of Warranty.

Unless required by applicable law or agreed to in writing, Licensor provides the
Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
including, without limitation, any warranties or conditions of TITLE,
NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
solely responsible for determining the appropriateness of using or
redistributing the Work and assume any risks associated with Your exercise of
permissions under this License.

8. Limitation of Liability.

In no event and under no legal theory, whether in tort (including negligence),
contract, or otherwise, unless required by applicable law (such as deliberate
and grossly negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special, incidental,
or consequential damages of any character arising as a result of this License or
out of the use or inability to use the Work (including but not limited to
damages for loss of goodwill, work stoppage, computer failure or malfunction, or
any and all other commercial damages or losses), even if such Contributor has
been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability.

While redistributing the Work or Derivative Works thereof, You may choose to
offer, and charge a fee for, acceptance of support, warranty, indemnity, or
other liability obligations and/or rights consistent with this License. However,
in accepting such obligations, You may act only on Your own behalf and on Your
sole responsibility, not on behalf of any other Contributor, and only if You
agree to indemnify, defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason of your
accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work

To apply the Apache License to your work, attach the following boilerplate
notice, with the fields enclosed by brackets "[]" replaced with your own
identifying information. (Don't include the brackets!) The text should be
enclosed in the appropriate comment syntax for the file format. We also
recommend that a file or class name and description of purpose be included on
the same "printed page" as the copyright notice for easier identification within
third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
** Building kmeans for GPU **

Prerequisites:
CUB (https://github.com/nvlabs/CUB)
CUDA

Either modify the Makefile to specify the path to CUB, or specify CUB_HOME on
your make invocation:

%> make CUB_HOME=/home-2/lbarnes/CUB/cub-1.0.2

By default, the build enables some additional data exchange between CPU and GPU
in order to print progress updates with each iteration. This reduces performance
a bit. Disable these updates by setting USERFLAGS to "":

%> make CUB_HOME=/home-2/lbarnes/CUB/cub-1.0.2 USERFLAGS=""

** Running **

The code is set to generate random data and cluster a set of 5M points of
dimension 50. By default it iterates 100 times. Adjust these parameters in
test.cu. It will use all the GPUs it finds. Use CUDA_VISIBLE_DEVICES to limit
the number of GPUs.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
kmeans
======

A simple kmeans clustering implementation for double precision data,
written for CUDA GPUs.

There are two ideas here:

1. The relabel step of kmeans relies on computing distances between
all n points (x) and all k centroids (y). This code refactors the distance
computation using the identity ||x-y||^2 = x.x + y.y - 2x.y; this
refactorization moves the x.x computation outside the kmeans loop and
uses GEMM to compute the x.y terms, which runs at near-peak throughput.
(A sketch of this identity in code follows this list.)
2. The computation of new centroids can be tricky because the labels
change every iteration. This code shows how to sort to group all points with
the same label, transforming the centroid accumulation into
simple additions, minimizing atomic memory operations. For many
practical problem sizes, sorting reduces the centroid computation to less
than 20% of the overall runtime of the algorithm.
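
To make the first idea concrete, here is a minimal, self-contained host-side
sketch (plain C++, not part of this library) showing that expanding
||x-y||^2 as x.x + y.y - 2x.y reproduces the direct distance while isolating
the x.x term, which never changes across iterations:

    #include <cstdio>

    int main() {
        const int d = 3;
        double x[] = {1.0, 2.0, 3.0};   // a data point
        double y[] = {0.5, 1.5, 2.5};   // a centroid
        double direct = 0, xx = 0, yy = 0, xy = 0;
        for (int i = 0; i < d; i++) {
            direct += (x[i] - y[i]) * (x[i] - y[i]);
            xx += x[i] * x[i];  // computed once, outside the kmeans loop
            yy += y[i] * y[i];  // recomputed per iteration, but only k of these
            xy += x[i] * y[i];  // the n*k*d bulk of the work -> one GEMM
        }
        // Both print 0.750000: ||x-y||^2 == x.x + y.y - 2*x.y
        printf("%f %f\n", direct, xx + yy - 2 * xy);
        return 0;
    }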

The CUDA code here is purposefully non-optimized - this code is not
meant to be the fastest possible kmeans implementation, but rather to
show how using libraries like thrust and BLAS can provide reasonable
performance with high programmer productivity.

Multi-GPU version
=================
This version has been updated to use multiple GPUs attached to the same machine.
You do not need to specify the number of GPUs; the program will detect and use
them all.

Prerequisites
=============
* CUDA toolkit 4.2
* CUB 1.0.2 https://github.com/NVLabs/cub

Build
=====
To build, edit the Makefile to specify CUB_HOME, the location of your CUB files.
Then call make.

Run
===
A simple test case is run when you invoke the executable 'test'.

For demonstration, test will generate and solve 3 test cases of different
sizes. At the prompt, specify 't' for a tiny test case, 'm' for a slightly
bigger test case, and 'h' for a huge test case: 5 million points, with 50
dimensions and 100 clusters, for 100 iterations.
--------------------------------------------------------------------------------
/centroids.cu:
--------------------------------------------------------------------------------
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/reduce.h>

#include "labels.h"

__device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull =
        (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                                             __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
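
//Note on the atomicAdd above: the CUDA toolkits this code targets provide no
//built-in atomicAdd for double, so it is emulated with an atomicCAS loop over
//the 64-bit bit pattern - the standard workaround from the CUDA C Programming
//Guide. Compute capability 6.0+ devices do provide a native double atomicAdd,
//and this definition would then collide with the built-in; builds for such
//GPUs would guard the definition, e.g. (untested sketch, not in the original
//source):
//
//  #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
//  __device__ double atomicAdd(double* address, double val) { /* as above */ }
//  #endif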

namespace kmeans {
namespace detail {

__device__ __forceinline__ void update_centroid(int label, int dimension,
                                                int d,
                                                double accumulator, double* centroids,
                                                int count, int* counts) {
    int index = label * d + dimension;
    double* target = centroids + index;
    atomicAdd(target, accumulator);
    if (dimension == 0) {
        atomicAdd(counts + label, count);
    }
}

__global__ void calculate_centroids(int n, int d, int k,
                                    double* data,
                                    int* ordered_labels,
                                    int* ordered_indices,
                                    double* centroids,
                                    int* counts) {
    int in_flight = blockDim.y * gridDim.y;
    int labels_per_row = (n - 1) / in_flight + 1;
    for(int dimension = threadIdx.x; dimension < d; dimension += blockDim.x) {
        double accumulator = 0;
        int count = 0;
        int global_id = threadIdx.y + blockIdx.y * blockDim.y;
        int start = global_id * labels_per_row;
        int end = (global_id + 1) * labels_per_row;
        end = (end > n) ? n : end;
        int prior_label;
        if (start < n) {
            prior_label = ordered_labels[start];

            for(int label_number = start; label_number < end; label_number++) {
                int label = ordered_labels[label_number];
                if (label != prior_label) {
                    update_centroid(prior_label, dimension,
                                    d,
                                    accumulator, centroids,
                                    count, counts);
                    accumulator = 0;
                    count = 0;
                }

                double value = data[dimension + ordered_indices[label_number] * d];
                accumulator += value;
                prior_label = label;
                count++;
            }
            update_centroid(prior_label, dimension,
                            d,
                            accumulator, centroids,
                            count, counts);
        }
    }
}

__global__ void scale_centroids(int d, int k, int* counts, double* centroids) {
    int global_id_x = threadIdx.x + blockIdx.x * blockDim.x;
    int global_id_y = threadIdx.y + blockIdx.y * blockDim.y;
    if ((global_id_x < d) && (global_id_y < k)) {
        int count = counts[global_id_y];
        //To avoid introducing divide by zero errors
        //If a centroid has no weight, we'll do no normalization
        //This will keep its coordinates defined.
        if (count < 1) {
            count = 1;
        }
        double scale = 1.0/double(count);
        centroids[global_id_x + d * global_id_y] *= scale;
    }
}

void find_centroids(int n, int d, int k,
                    thrust::device_vector<double>& data,
                    thrust::device_vector<int>& labels,
                    thrust::device_vector<double>& centroids,
                    thrust::device_vector<int>& range,
                    thrust::device_vector<int>& indices,
                    thrust::device_vector<int>& counts) {
    int dev_num;
    cudaGetDevice(&dev_num);
    detail::memcpy(indices, range);
    //Bring all labels with the same value together
#if 0
    thrust::sort_by_key(labels.begin(),
                        labels.end(),
                        indices.begin());
#else
    mycub::sort_by_key_int(labels, indices);
#endif

    //Initialize centroids to all zeros
    detail::memzero(centroids);

    //Initialize counts to all zeros
    detail::memzero(counts);

    //Calculate centroids
    int n_threads_x = 64;
    int n_threads_y = 16;
    //XXX Number of blocks here is hard coded at 30
    //This should be taken care of more thoughtfully.
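    //A more portable alternative (a sketch, not in the original source) is to
    //derive the Y-dimension block count from the device instead of fixing 30,
    //which was presumably tuned for one particular GPU's SM count:
    //
    //  cudaDeviceProp props;
    //  cudaGetDeviceProperties(&props, dev_num);
    //  int n_blocks_y = 2 * props.multiProcessorCount; //e.g. two blocks per SM
    //  //then launch with dim3(1, n_blocks_y) below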
    detail::calculate_centroids<<<dim3(1, 30),
        dim3(n_threads_x, n_threads_y),
        0, cuda_stream[dev_num]>>>
        (n, d, k,
         thrust::raw_pointer_cast(data.data()),
         thrust::raw_pointer_cast(labels.data()),
         thrust::raw_pointer_cast(indices.data()),
         thrust::raw_pointer_cast(centroids.data()),
         thrust::raw_pointer_cast(counts.data()));

    //Scale centroids
    detail::scale_centroids<<<dim3((d-1)/32+1, (k-1)/32+1),
        dim3(32, 32),
        0, cuda_stream[dev_num]>>>
        (d, k,
         thrust::raw_pointer_cast(counts.data()),
         thrust::raw_pointer_cast(centroids.data()));
}

}
}
--------------------------------------------------------------------------------
/centroids.h:
--------------------------------------------------------------------------------
#include <thrust/device_vector.h>

namespace kmeans {
namespace detail {
void find_centroids(int n, int d, int k,
                    thrust::device_vector<double>& data,
                    thrust::device_vector<int>& labels,
                    thrust::device_vector<double>& centroids,
                    thrust::device_vector<int>& range,
                    thrust::device_vector<int>& indices,
                    thrust::device_vector<int>& counts);

}
}
--------------------------------------------------------------------------------
/contributors.txt:
--------------------------------------------------------------------------------
Levi Barnes
--------------------------------------------------------------------------------
/kmeans.cu:
--------------------------------------------------------------------------------
#include "kmeans.h"
#include "util.h"
#include <thrust/iterator/counting_iterator.h>
#include <iostream>

namespace kmeans {

int kmeans(int iterations,
           int n, int d, int k,
           thrust::device_vector<double>** data,
           thrust::device_vector<int>** labels,
           thrust::device_vector<double>** centroids,
           thrust::device_vector<double>** distances,
           int n_gpu,
           bool init_from_labels,
           double threshold) {
    thrust::device_vector<double> *data_dots[16];
    thrust::device_vector<double> *centroid_dots[16];
    thrust::device_vector<double> *pairwise_distances[16];
    thrust::device_vector<int> *labels_copy[16];
    thrust::device_vector<int> *range[16];
    thrust::device_vector<int> *indices[16];
    thrust::device_vector<int> *counts[16];

    thrust::host_vector<double> h_centroids(k * d);
    thrust::host_vector<double> h_centroids_tmp(k * d);
    int h_changes[16], *d_changes[16];
    double h_distance_sum[16], *d_distance_sum[16];

    for (int q = 0; q < n_gpu; q++) {
        cudaSetDevice(q);
        cudaMalloc(&d_changes[q], sizeof(int));
        cudaMalloc(&d_distance_sum[q], sizeof(double));
        detail::labels_init();
        data_dots[q] = new thrust::device_vector<double>(n/n_gpu);
        centroid_dots[q] = new thrust::device_vector<double>(n/n_gpu);
        pairwise_distances[q] = new thrust::device_vector<double>(n/n_gpu * k);
        labels_copy[q] = new thrust::device_vector<int>(n/n_gpu);
        range[q] = new thrust::device_vector<int>(n/n_gpu);
        counts[q] = new thrust::device_vector<int>(k);
        indices[q] = new thrust::device_vector<int>(n/n_gpu);
        //Create and save "range" for initializing labels
        thrust::copy(thrust::counting_iterator<int>(0),
                     thrust::counting_iterator<int>(n/n_gpu),
                     (*range[q]).begin());

        detail::make_self_dots(n/n_gpu, d, *data[q], *data_dots[q]);
        if (init_from_labels) {
            detail::find_centroids(n/n_gpu, d, k, *data[q], *labels[q], *centroids[q], *range[q], *indices[q], *counts[q]);
        }
    }

    double prior_distance_sum = 0;
    int i = 0;
    for(; i < iterations; i++) {
        //Average the centroids from each device
        if (n_gpu > 1) {
            for (int p = 0; p < k * d; p++) h_centroids[p] = 0.0;
            for (int q = 0; q < n_gpu; q++) {
                cudaSetDevice(q);
                detail::memcpy(h_centroids_tmp, *centroids[q]);
                detail::streamsync(q);
                for (int p = 0; p < k * d; p++) h_centroids[p] += h_centroids_tmp[p];
            }
            for (int p = 0; p < k * d; p++) h_centroids[p] /= n_gpu;
            //Copy the averaged centroids to each device
            for (int q = 0; q < n_gpu; q++) {
                cudaSetDevice(q);
                detail::memcpy(*centroids[q], h_centroids);
            }
        }
        for (int q = 0; q < n_gpu; q++) {
            //TODO compute total distance
            cudaSetDevice(q);

            detail::calculate_distances(n/n_gpu, d, k,
                                        *data[q], *centroids[q], *data_dots[q],
                                        *centroid_dots[q], *pairwise_distances[q]);

            detail::relabel(n/n_gpu, k, *pairwise_distances[q], *labels[q], *distances[q], d_changes[q]);
            //TODO remove one memcpy
            detail::memcpy(*labels_copy[q], *labels[q]);
            detail::find_centroids(n/n_gpu, d, k, *data[q], *labels[q], *centroids[q], *range[q], *indices[q], *counts[q]);
            detail::memcpy(*labels[q], *labels_copy[q]);
            //double d_distance_sum[q] = thrust::reduce(distances[q].begin(), distances[q].end())
            mycub::sum_reduce(*distances[q], d_distance_sum[q]);
        }
#if __VERBOSE
        double distance_sum = 0.0;
        for (int q = 0; q < n_gpu; q++) {
            cudaMemcpyAsync(h_changes+q, d_changes[q], sizeof(int), cudaMemcpyDeviceToHost, cuda_stream[q]);
            cudaMemcpyAsync(h_distance_sum+q, d_distance_sum[q], sizeof(double), cudaMemcpyDeviceToHost, cuda_stream[q]);
            detail::streamsync(q);
            std::cout << "Device " << q << ": Iteration " << i << " produced " << h_changes[q]
                      << " changes and the total_distance is " << h_distance_sum[q] << std::endl;
            distance_sum += h_distance_sum[q];
        }
        if (i > 0) {
            double delta = distance_sum / prior_distance_sum;
            if (delta > 1 - threshold) {
                std::cout << "Threshold triggered. Terminating iterations early." << std::endl;
                return i + 1;
            }
        }
        prior_distance_sum = distance_sum;
#endif
    }
    for (int q = 0; q < n_gpu; q++) {
        cudaSetDevice(q);
        cudaFree(d_changes[q]);
        cudaFree(d_distance_sum[q]);
        detail::labels_close();
        delete(pairwise_distances[q]);
        delete(data_dots[q]);
        delete(centroid_dots[q]);
        delete(labels_copy[q]);
        delete(range[q]);
        delete(indices[q]);
        delete(counts[q]);
    }
    return i;
}

}
--------------------------------------------------------------------------------
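One subtlety in kmeans.cu above: the comparison against threshold lives inside
the #if __VERBOSE block, so early termination only happens in builds where
__VERBOSE is defined to a nonzero value; a plain build always runs the full
iteration count. The USERFLAGS knob mentioned in the README is presumably where
that define comes from, e.g.:

%> make CUB_HOME=/path/to/cub USERFLAGS="-D__VERBOSE=1"
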
/kmeans.h:
--------------------------------------------------------------------------------
#pragma once
#include <thrust/device_vector.h>
#include "centroids.h"
#include "labels.h"

namespace kmeans {

//! kmeans clusters data into k groups
/*!

  \param iterations How many iterations to run
  \param n Number of data points
  \param d Number of dimensions
  \param k Number of clusters
  \param data Data points, in row-major order. This vector must have
  size n * d, and since it's in row-major order, data point x occupies
  positions [x * d, (x + 1) * d) in the vector. The vector is passed
  by reference since it is shared with the caller and not copied.
  \param labels Cluster labels. This vector has size n.
  The vector is passed by reference since it is shared with the caller
  and not copied.
  \param centroids Centroid locations, in row-major order. This
  vector must have size k * d, and since it's in row-major order,
  centroid x occupies positions [x * d, (x + 1) * d) in the
  vector. The vector is passed by reference since it is shared
  with the caller and not copied.
  \param distances Distances from points to centroids. This vector has
  size n. It is passed by reference since it is shared with the caller
  and not copied.
  \param init_from_labels If true, the labels need to be initialized
  before calling kmeans. If false, the centroids need to be
  initialized before calling kmeans. Defaults to true, which means
  the labels must be initialized.
  \param threshold This controls early termination of the kmeans
  iterations. If the ratio of the sum of distances from points to
  centroids from this iteration to the previous iteration changes by
  less than the threshold, then the iterations are
  terminated. Defaults to 0.000001
  \return The number of iterations actually performed.
*/

int kmeans(int iterations,
           int n, int d, int k,
           thrust::device_vector<double>** data,
           thrust::device_vector<int>** labels,
           thrust::device_vector<double>** centroids,
           thrust::device_vector<double>** distances,
           int n_gpu=1,
           bool init_from_labels=true,
           double threshold=0.000001
           );

}
--------------------------------------------------------------------------------
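For reference, a minimal single-GPU call of this API looks like the sketch
below (modeled on tiny_test in test.cu; the my_* names are illustrative, not
part of the library). The function takes arrays of per-device vector pointers;
with n_gpu == 1 each array holds a single entry:

    thrust::device_vector<double> my_data(n * d);       // filled by the caller
    thrust::device_vector<int>    my_labels(n);         // initial guesses in 0..k-1
    thrust::device_vector<double> my_centroids(k * d);
    thrust::device_vector<double> my_distances(n);

    thrust::device_vector<double>* data[1]      = { &my_data };
    thrust::device_vector<int>*    labels[1]    = { &my_labels };
    thrust::device_vector<double>* centroids[1] = { &my_centroids };
    thrust::device_vector<double>* distances[1] = { &my_distances };

    int iters = kmeans::kmeans(100, n, d, k, data, labels,
                               centroids, distances, 1 /*n_gpu*/);
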
/labels.cu:
--------------------------------------------------------------------------------
#include "labels.h"
#include <cfloat>
#include <cublas_v2.h>
#include <iostream>

cudaStream_t cuda_stream[16];
namespace kmeans {
namespace detail {

cublasHandle_t cublas_handle[16];

void labels_init() {
    cublasStatus_t stat;
    cudaError_t err;
    int dev_num;
    cudaGetDevice(&dev_num);
    stat = cublasCreate(&detail::cublas_handle[dev_num]);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        std::cout << "CUBLAS initialization failed" << std::endl;
        exit(1);
    }
    err = cudaStreamCreate(&cuda_stream[dev_num]);
    if (err != cudaSuccess) {
        std::cout << "Stream creation failed" << std::endl;
        exit(1);
    }
    cublasSetStream(cublas_handle[dev_num], cuda_stream[dev_num]);
    mycub::cub_init();
}
void labels_close() {
    int dev_num;
    cudaGetDevice(&dev_num);
    cublasDestroy(cublas_handle[dev_num]);
    cudaStreamDestroy(cuda_stream[dev_num]);
    mycub::cub_close();
}

void streamsync(int dev_num) {
    cudaStreamSynchronize(cuda_stream[dev_num]);
}

//n: number of points
//d: dimensionality of points
//data: points, laid out in row-major order (n rows, d cols)
//dots: result vector (n rows)
//NOTE:
//Memory accesses in this function are uncoalesced!!
//This is because data is in row major order.
//However, in k-means, it's called outside the optimization loop
//on the large data array, and inside the optimization loop it's
//called only on a small array, so it doesn't really matter.
//If this becomes a performance limiter, transpose the data somewhere.
__global__ void self_dots(int n, int d, double* data, double* dots) {
    double accumulator = 0;
    int global_id = blockDim.x * blockIdx.x + threadIdx.x;

    if (global_id < n) {
        for (int i = 0; i < d; i++) {
            double value = data[i + global_id * d];
            accumulator += value * value;
        }
        dots[global_id] = accumulator;
    }
}

void make_self_dots(int n, int d, thrust::device_vector<double>& data,
                    thrust::device_vector<double>& dots) {
    int dev_num;
    cudaGetDevice(&dev_num);
    self_dots<<<(n-1)/256+1, 256, 0, cuda_stream[dev_num]>>>(
        n, d, thrust::raw_pointer_cast(data.data()),
        thrust::raw_pointer_cast(dots.data()));
}

__global__ void all_dots(int n, int k, double* data_dots, double* centroid_dots, double* dots) {
    __shared__ double local_data_dots[32];
    __shared__ double local_centroid_dots[32];

    int data_index = threadIdx.x + blockIdx.x * blockDim.x;
    if ((data_index < n) && (threadIdx.y == 0)) {
        local_data_dots[threadIdx.x] = data_dots[data_index];
    }

    int centroid_index = threadIdx.x + blockIdx.y * blockDim.y;
    if ((centroid_index < k) && (threadIdx.y == 1)) {
        local_centroid_dots[threadIdx.x] = centroid_dots[centroid_index];
    }

    __syncthreads();

    centroid_index = threadIdx.y + blockIdx.y * blockDim.y;
    if ((data_index < n) && (centroid_index < k)) {
        dots[data_index + centroid_index * n] = local_data_dots[threadIdx.x] +
                                                local_centroid_dots[threadIdx.y];
    }
}

void make_all_dots(int n, int k, thrust::device_vector<double>& data_dots,
                   thrust::device_vector<double>& centroid_dots,
                   thrust::device_vector<double>& dots) {
    int dev_num;
    cudaGetDevice(&dev_num);
    all_dots<<<
        dim3((n-1)/32+1,
             (k-1)/32+1),
        dim3(32, 32), 0,
        cuda_stream[dev_num]>>>(n, k, thrust::raw_pointer_cast(data_dots.data()),
                                thrust::raw_pointer_cast(centroid_dots.data()),
                                thrust::raw_pointer_cast(dots.data()));
};

void calculate_distances(int n, int d, int k,
                         thrust::device_vector<double>& data,
                         thrust::device_vector<double>& centroids,
                         thrust::device_vector<double>& data_dots,
                         thrust::device_vector<double>& centroid_dots,
                         thrust::device_vector<double>& pairwise_distances) {
    detail::make_self_dots(k, d, centroids, centroid_dots);
    detail::make_all_dots(n, k, data_dots, centroid_dots, pairwise_distances);
    //||x-y||^2 = ||x||^2 + ||y||^2 - 2 x . y
    //pairwise_distances has ||x||^2 + ||y||^2, so beta = 1
    //The dgemm calculates x.y for all x and y, so alpha = -2.0
    double alpha = -2.0;
    double beta = 1.0;
    //If the data were in standard column major order, we'd do a
    //centroids * data ^ T
    //But the data is in row major order, so we have to permute
    //the arguments a little
    int dev_num;
    cudaGetDevice(&dev_num);
    cublasStatus_t stat =
        cublasDgemm(detail::cublas_handle[dev_num],
                    CUBLAS_OP_T, CUBLAS_OP_N,
                    n, k, d, &alpha,
                    thrust::raw_pointer_cast(data.data()),
                    d, //Has to be n or d
                    thrust::raw_pointer_cast(centroids.data()),
                    d, //Has to be k or d
                    &beta,
                    thrust::raw_pointer_cast(pairwise_distances.data()),
                    n); //Has to be n or k

    if (stat != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Invalid Dgemm" << std::endl;
        exit(1);
    }
}

__global__ void make_new_labels(int n, int k, double* pairwise_distances,
                                int* labels, int* changes,
                                double* distances) {
    double min_distance = DBL_MAX;
    int min_idx = -1;
    int global_id = threadIdx.x + blockIdx.x * blockDim.x;
    if (global_id < n) {
        int old_label = labels[global_id];
        for(int c = 0; c < k; c++) {
            double distance = pairwise_distances[c * n + global_id];
            if (distance < min_distance) {
                min_distance = distance;
                min_idx = c;
            }
        }
        labels[global_id] = min_idx;
        distances[global_id] = min_distance;
        if (old_label != min_idx) {
            atomicAdd(changes, 1);
        }
    }
}

void relabel(int n, int k,
             thrust::device_vector<double>& pairwise_distances,
             thrust::device_vector<int>& labels,
             thrust::device_vector<double>& distances,
             int *d_changes) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaMemsetAsync(d_changes, 0, sizeof(int), cuda_stream[dev_num]);
    make_new_labels<<<(n-1)/256+1, 256, 0, cuda_stream[dev_num]>>>(
        n, k,
        thrust::raw_pointer_cast(pairwise_distances.data()),
        thrust::raw_pointer_cast(labels.data()),
        d_changes,
        thrust::raw_pointer_cast(distances.data()));
}

}
}
namespace mycub {
void *d_key_alt_buf[16];
unsigned int key_alt_buf_bytes[16];
void *d_value_alt_buf[16];
unsigned int value_alt_buf_bytes[16];
void *d_temp_storage[16];
size_t temp_storage_bytes[16];
void *d_temp_storage2[16];
size_t temp_storage_bytes2[16];
bool cub_initted;
void cub_init() {
    std::cout << "CUB init" << std::endl;
    for (int q = 0; q < 16; q++) {
        d_key_alt_buf[q] = NULL;
        key_alt_buf_bytes[q] = 0;
        d_value_alt_buf[q] = NULL;
        value_alt_buf_bytes[q] = 0;
        d_temp_storage[q] = NULL;
        temp_storage_bytes[q] = 0;
        d_temp_storage2[q] = NULL;
        temp_storage_bytes2[q] = 0;
    }
    cub_initted = true;
}
void cub_close() {
    for (int q = 0; q < 16; q++) {
        if (d_key_alt_buf[q]) cudaFree(d_key_alt_buf[q]);
        if (d_value_alt_buf[q]) cudaFree(d_value_alt_buf[q]);
        if (d_temp_storage[q]) cudaFree(d_temp_storage[q]);
        if (d_temp_storage2[q]) cudaFree(d_temp_storage2[q]);
        d_temp_storage[q] = NULL;
        d_temp_storage2[q] = NULL;
    }
    cub_initted = false;
}
void sort_by_key_int(thrust::device_vector<int>& keys, thrust::device_vector<int>& values) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaStream_t this_stream = cuda_stream[dev_num];
    int SIZE = keys.size();
    //int *d_key_alt_buf, *d_value_alt_buf;
    if (key_alt_buf_bytes[dev_num] < sizeof(int)*SIZE) {
        if (d_key_alt_buf[dev_num]) cudaFree(d_key_alt_buf[dev_num]);
        cudaMalloc(&d_key_alt_buf[dev_num], sizeof(int)*SIZE);
        key_alt_buf_bytes[dev_num] = sizeof(int)*SIZE;
    }
    if (value_alt_buf_bytes[dev_num] < sizeof(int)*SIZE) {
        if (d_value_alt_buf[dev_num]) cudaFree(d_value_alt_buf[dev_num]);
        cudaMalloc(&d_value_alt_buf[dev_num], sizeof(int)*SIZE);
        value_alt_buf_bytes[dev_num] = sizeof(int)*SIZE;
    }
    cub::DoubleBuffer<int> d_keys(thrust::raw_pointer_cast(keys.data()), (int*)d_key_alt_buf[dev_num]);
    cub::DoubleBuffer<int> d_values(thrust::raw_pointer_cast(values.data()), (int*)d_value_alt_buf[dev_num]);

    // Determine temporary device storage requirements for sorting operation
    if (!d_temp_storage[dev_num]) {
        cub::DeviceRadixSort::SortPairs(d_temp_storage[dev_num], temp_storage_bytes[dev_num], d_keys,
                                        d_values, SIZE, 0, sizeof(int)*8, this_stream);
        // Allocate temporary storage for sorting operation
        cudaMalloc(&d_temp_storage[dev_num], temp_storage_bytes[dev_num]);
    }
    // Run sorting operation
    cub::DeviceRadixSort::SortPairs(d_temp_storage[dev_num], temp_storage_bytes[dev_num], d_keys,
                                    d_values, SIZE, 0, sizeof(int)*8, this_stream);
    // Sorted keys and values are referenced by d_keys.Current() and d_values.Current()
}
}
--------------------------------------------------------------------------------
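A caveat on sort_by_key_int above: cub::DeviceRadixSort::SortPairs is free to
leave its output in either half of each DoubleBuffer, yet the callers read the
original thrust vectors afterwards. That works when the radix sort performs an
even number of passes (the selector flips back to the original buffer), but the
CUB API does not guarantee it. A defensive version (a sketch, not in the
original source) would copy back explicitly before returning:

    if (d_keys.Current() != thrust::raw_pointer_cast(keys.data())) {
        cudaMemcpyAsync(thrust::raw_pointer_cast(keys.data()), d_keys.Current(),
                        sizeof(int)*SIZE, cudaMemcpyDeviceToDevice, this_stream);
        cudaMemcpyAsync(thrust::raw_pointer_cast(values.data()), d_values.Current(),
                        sizeof(int)*SIZE, cudaMemcpyDeviceToDevice, this_stream);
    }
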
/labels.h:
--------------------------------------------------------------------------------
#pragma once
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <cub/cub.cuh>

extern cudaStream_t cuda_stream[16];

template<int I>
extern __global__ void debugMark() {};

namespace kmeans {
namespace detail {

void labels_init();
void labels_close();

template<typename T>
void memcpy(thrust::host_vector<T, std::allocator<T> > &H,
            thrust::device_vector<T, thrust::device_malloc_allocator<T> > &D) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaMemcpyAsync(thrust::raw_pointer_cast(H.data()),
                    thrust::raw_pointer_cast(D.data()),
                    sizeof(T) * D.size(), cudaMemcpyDeviceToHost, cuda_stream[dev_num]);
}

template<typename T>
void memcpy(thrust::device_vector<T, thrust::device_malloc_allocator<T> > &D,
            thrust::host_vector<T, std::allocator<T> > &H) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaMemcpyAsync(thrust::raw_pointer_cast(D.data()),
                    thrust::raw_pointer_cast(H.data()),
                    sizeof(T) * H.size(), cudaMemcpyHostToDevice, cuda_stream[dev_num]);
}
template<typename T>
void memcpy(thrust::device_vector<T, thrust::device_malloc_allocator<T> > &Do,
            thrust::device_vector<T, thrust::device_malloc_allocator<T> > &Di) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaMemcpyAsync(thrust::raw_pointer_cast(Do.data()),
                    thrust::raw_pointer_cast(Di.data()),
                    sizeof(T) * Di.size(), cudaMemcpyDeviceToDevice, cuda_stream[dev_num]);
}
template<typename T>
void memzero(thrust::device_vector<T, thrust::device_malloc_allocator<T> >& D) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaMemsetAsync(thrust::raw_pointer_cast(D.data()), 0, sizeof(T)*D.size(), cuda_stream[dev_num]);
}
void streamsync(int dev_num);

void make_self_dots(int n, int d,
                    thrust::device_vector<double>& data,
                    thrust::device_vector<double>& dots);

void make_all_dots(int n, int k, thrust::device_vector<double>& data_dots,
                   thrust::device_vector<double>& centroid_dots,
                   thrust::device_vector<double>& dots);

void calculate_distances(int n, int d, int k,
                         thrust::device_vector<double>& data,
                         thrust::device_vector<double>& centroids,
                         thrust::device_vector<double>& data_dots,
                         thrust::device_vector<double>& centroid_dots,
                         thrust::device_vector<double>& pairwise_distances);

void relabel(int n, int k,
             thrust::device_vector<double>& pairwise_distances,
             thrust::device_vector<int>& labels,
             thrust::device_vector<double>& distances,
             int *d_changes);

}
}
namespace mycub {

extern void *d_key_alt_buf[16];
extern unsigned int key_alt_buf_bytes[16];
extern void *d_value_alt_buf[16];
extern unsigned int value_alt_buf_bytes[16];
extern void *d_temp_storage[16];
extern size_t temp_storage_bytes[16];
extern void *d_temp_storage2[16];
extern size_t temp_storage_bytes2[16];
extern bool cub_initted;

void sort_by_key_int(thrust::device_vector<int>& keys, thrust::device_vector<int>& values);

template<typename T, typename U>
void sort_by_key(thrust::device_vector<T>& keys, thrust::device_vector<U>& values) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaStream_t this_stream = cuda_stream[dev_num];
    int SIZE = keys.size();
    if (key_alt_buf_bytes[dev_num] < sizeof(T)*SIZE) {
        if (d_key_alt_buf[dev_num]) cudaFree(d_key_alt_buf[dev_num]);
        cudaMalloc(&d_key_alt_buf[dev_num], sizeof(T)*SIZE);
        key_alt_buf_bytes[dev_num] = sizeof(T)*SIZE;
        std::cout << "Malloc key_alt_buf" << std::endl;
    }
    if (value_alt_buf_bytes[dev_num] < sizeof(U)*SIZE) {
        if (d_value_alt_buf[dev_num]) cudaFree(d_value_alt_buf[dev_num]);
        cudaMalloc(&d_value_alt_buf[dev_num], sizeof(U)*SIZE);
        value_alt_buf_bytes[dev_num] = sizeof(U)*SIZE;
        std::cout << "Malloc value_alt_buf" << std::endl;
    }
    cub::DoubleBuffer<T> d_keys(thrust::raw_pointer_cast(keys.data()), (T*)d_key_alt_buf[dev_num]);
    cub::DoubleBuffer<U> d_values(thrust::raw_pointer_cast(values.data()), (U*)d_value_alt_buf[dev_num]);
    cudaError_t err;

    // Determine temporary device storage requirements for sorting operation
    //if (temp_storage_bytes[dev_num] == 0) {
    void *d_temp;
    size_t temp_bytes;
    err = cub::DeviceRadixSort::SortPairs(d_temp_storage[dev_num], temp_bytes, d_keys,
                                          d_values, SIZE, 0, sizeof(T)*8, this_stream);
    // Allocate temporary storage for sorting operation
    cudaMalloc(&d_temp, temp_bytes);
    d_temp_storage[dev_num] = d_temp;
    temp_storage_bytes[dev_num] = temp_bytes;
    std::cout << "Malloc temp_storage. " << temp_storage_bytes[dev_num] << " bytes" << std::endl;
    std::cout << "d_temp_storage[" << dev_num << "] = " << d_temp_storage[dev_num] << std::endl;
    if (err) {
        std::cout << "Error " << err << " in SortPairs 1" << std::endl;
        std::cout << cudaGetErrorString(err) << std::endl;
    }
    //}
    // Run sorting operation
    err = cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys,
                                          d_values, SIZE, 0, sizeof(T)*8, this_stream);
    if (err) std::cout << "Error in SortPairs 2" << std::endl;
    //cub::DeviceRadixSort::SortPairs(d_temp_storage[dev_num], temp_storage_bytes[dev_num], d_keys,
    //                                d_values, SIZE, 0, sizeof(T)*8, this_stream);
}
template<typename T>
void sum_reduce(thrust::device_vector<T>& values, T* sum) {
    int dev_num;
    cudaGetDevice(&dev_num);
    if (!d_temp_storage2[dev_num]) {
        cub::DeviceReduce::Sum(d_temp_storage2[dev_num], temp_storage_bytes2[dev_num],
                               thrust::raw_pointer_cast(values.data()),
                               sum, values.size(), cuda_stream[dev_num]);
        // Allocate temporary storage for the reduction
        cudaMalloc(&d_temp_storage2[dev_num], temp_storage_bytes2[dev_num]);
    }
    cub::DeviceReduce::Sum(d_temp_storage2[dev_num], temp_storage_bytes2[dev_num],
                           thrust::raw_pointer_cast(values.data()),
                           sum, values.size(), cuda_stream[dev_num]);
}
void cub_init();
void cub_close();
}
--------------------------------------------------------------------------------
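Note that the detail::memcpy and detail::memzero helpers above are
asynchronous: they enqueue the transfer on the current device's stream and
return immediately, so the host must call detail::streamsync before reading the
destination. kmeans.cu follows this pattern when averaging centroids; in
isolation it looks like this (a sketch; h_buf is an illustrative name):

    thrust::host_vector<double> h_buf(k * d);
    kmeans::detail::memcpy(h_buf, *centroids[q]); // async copy on device q's stream
    kmeans::detail::streamsync(q);                // wait for it to finish
    double first = h_buf[0];                      // now safe to read on the host
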
" << temp_storage_bytes[dev_num] << " bytes" << std::endl; 122 | std::cout << "d_temp_storage[" << dev_num << "] = " << d_temp_storage[dev_num] << std::endl; 123 | if (err) { 124 | std::cout <<"Error " << err << " in SortPairs 1" << std::endl; 125 | std::cout << cudaGetErrorString(err) << std::endl; 126 | } 127 | //} 128 | // Run sorting operation 129 | err = cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys, 130 | d_values, SIZE, 0, sizeof(T)*8, this_stream); 131 | if (err) std::cout <<"Error in SortPairs 2" << std::endl; 132 | //cub::DeviceRadixSort::SortPairs(d_temp_storage[dev_num], temp_storage_bytes[dev_num], d_keys, 133 | // d_values, SIZE, 0, sizeof(T)*8, this_stream); 134 | 135 | } 136 | template 137 | void sum_reduce(thrust::device_vector& values, T* sum) { 138 | int dev_num; 139 | cudaGetDevice(&dev_num); 140 | if (!d_temp_storage2[dev_num]) { 141 | cub::DeviceReduce::Sum(d_temp_storage2[dev_num], temp_storage_bytes2[dev_num], thrust::raw_pointer_cast(values.data()), 142 | sum, values.size(), cuda_stream[dev_num]); 143 | // Allocate temporary storage for sorting operation 144 | cudaMalloc(&d_temp_storage2[dev_num], temp_storage_bytes2[dev_num]); 145 | } 146 | cub::DeviceReduce::Sum(d_temp_storage2[dev_num], temp_storage_bytes2[dev_num], thrust::raw_pointer_cast(values.data()), 147 | sum, values.size(), cuda_stream[dev_num]); 148 | } 149 | void cub_init(); 150 | void cub_close(); 151 | } 152 | -------------------------------------------------------------------------------- /orig/Makefile: -------------------------------------------------------------------------------- 1 | #ifeq ($(BLAS), INTEL_MKL) 2 | #CC = icpc 3 | #LIBS = -L/shared/apps/rhel-6.2/intel/parallel-studio-xe-2013/composer_xe_2013_sp1/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lpthread -lm 4 | #endif 5 | #ifeq ($(BLAS), OPENBLAS) 6 | CC = g++ 7 | INCL = -I/usr/include/openblas/ 8 | LIBS = -lopenblas -L/shared/apps/rhel-6.2/libs/openblas-0.2.8/sandybridge/lib 9 | #endif 10 | 11 | 12 | test: test.cpp 13 | $(CC) -g -o test test.cpp $(INCL) $(LIBS) 14 | gemm_test: gemm_test.cpp 15 | $(CC) -g -o gemm_test gemm_test.cpp $(LIBS) 16 | example-1: example-1.cpp 17 | $(CC) -g -o example-1 example-1.cpp $(LIBS) 18 | -------------------------------------------------------------------------------- /orig/test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "stdlib.h" 4 | #include "string.h" //memset 5 | //#include "mkl.h" 6 | #include "cblas.h" 7 | 8 | //extern "C" void dgemm_( char *, char *, int *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int * ); 9 | 10 | void random_data(std::vector& array, int n) { 11 | for (int i=0; i& array, int n, int max) { 15 | for (int i=0; i array_in, int n, int dim, 19 | std::vector& dots) { 20 | for (int pt = 0; pt array_in, int n, int dim, 30 | std::vector labels_in, 31 | std::vector& centroids, int n_cluster) { 32 | std::vector members(n_cluster); //Number of points in each cluster 33 | memset(&members[0], 0, n_cluster*sizeof(int)); 34 | memset(¢roids[0], 0, n_cluster*dim*sizeof(double)); 35 | //Add all vectors in the cluster 36 | for(int pt=0; pt data_in, 54 | std::vector data_dots_in, 55 | int n, int dim, std::vector centroids_in, 56 | std::vector centroid_dots, int n_cluster, 57 | std::vector& pairwise_distances) { 58 | self_dot(centroids_in, n_cluster, dim, centroid_dots); 59 | for (int nn=0; nn data_in, int n, 73 | std::vector pairwise_distances_in, 74 
            int n_cluster, std::vector<int>& labels) {
    int changes = 0;
    for (int nn = 0; nn < n; nn++) {
        double min_distance = 1e20;
        int min_idx = -1;
        for (int cc = 0; cc < n_cluster; cc++) {
            double distance = pairwise_distances_in[nn*n_cluster+cc];
            if (distance < min_distance) {
                min_distance = distance;
                min_idx = cc;
            }
        }
        if (labels[nn] != min_idx) {
            changes++;
            labels[nn] = min_idx;
        }
    }
    return changes;
}

int main(int argc, char** argv) {
    //Default problem sizes; override them on the command line
    int n = 10000;
    int d = 50;
    int n_cluster = 100;
    int iterations = 50;
    if (argc > 1) {
        if (0 == strcmp(argv[1], "--help")) {
            std::cout << "Usage: test <n points> <dimension> "
                         "<n clusters> <iterations>" << std::endl;
            return 0;
        }
        else n = atoi(argv[1]);
    }
    if (argc > 2) d = atoi(argv[2]);
    if (argc > 3) n_cluster = atoi(argv[3]);
    if (argc > 4) iterations = atoi(argv[4]);

    std::cout << "Generating random data" << std::endl;
    std::cout << n << " points of dimension " << d << std::endl;
    std::cout << n_cluster << " clusters" << std::endl;

    std::vector<double> data(n*d);              //input data
    std::vector<double> centroids(n_cluster*d); //centroids for each cluster
    std::vector<int> labels(n);                 //cluster labels for each point
    std::vector<double> distances(n);           //distance of each point from its centroid

    random_data(data, n*d);

    std::vector<double> data_dots(n);
    std::vector<double> centroid_dots(n_cluster);
    std::vector<double> pairwise_distances(n_cluster * n);
    std::vector<int> labels_copy(n);

    self_dot(data, n, d, data_dots);

    //Let the first n_cluster points be the centroids of the clusters
    memcpy(&centroids[0], &data[0], sizeof(double)*n_cluster*d);

    for(int i = 0; i < iterations; i++) {
        compute_distances(data, data_dots, n, d, centroids,
                          centroid_dots, n_cluster, pairwise_distances);
        int changes = relabel(data, n, pairwise_distances, n_cluster, labels);
        find_centroids(data, n, d, labels, centroids, n_cluster);
        std::cout << "Iteration " << i << " produced " << changes
                  << " changes" << std::endl;
    }
    return 0;
}
--------------------------------------------------------------------------------
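The CPU reference in orig/ is driven entirely from the command line; for
example (sizes illustrative):

%> make test
%> ./test 10000 50 100 10
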
/test.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include "kmeans.h"
#include "timer.h"
#include "util.h"
#include <cstdlib>
#include "cuda.h"

#include <thrust/host_vector.h>

template<typename T>
void fill_array(T& array, int m, int n) {
    for(int i = 0; i < m; i++) {
        for(int j = 0; j < n; j++) {
            array[i * n + j] = (i % 2)*3 + j;
        }
    }
}

void random_data(thrust::device_vector<double>& array, int m, int n) {
    thrust::host_vector<double> host_array(m*n);
    for(int i = 0; i < m * n; i++) {
        host_array[i] = (double)rand()/(double)RAND_MAX;
    }
    array = host_array;
}

void random_labels(thrust::device_vector<int>& labels, int n, int k) {
    thrust::host_vector<int> host_labels(n);
    for(int i = 0; i < n; i++) {
        host_labels[i] = rand() % k;
    }
    labels = host_labels;
}

void tiny_test() {
    int iterations = 1;
    int n = 5;
    int d = 3;
    int k = 2;

    thrust::device_vector<double> *data[1];
    thrust::device_vector<int> *labels[1];
    thrust::device_vector<double> *centroids[1];
    thrust::device_vector<double> *distances[1];
    data[0] = new thrust::device_vector<double>(n * d);
    labels[0] = new thrust::device_vector<int>(n);
    centroids[0] = new thrust::device_vector<double>(k * d);
    distances[0] = new thrust::device_vector<double>(n);

    fill_array(*data[0], n, d);
    std::cout << "Data: " << std::endl;
    print_array(*data[0], n, d);

    (*labels[0])[0] = 0;
    (*labels[0])[1] = 0;
    (*labels[0])[2] = 0;
    (*labels[0])[3] = 1;
    (*labels[0])[4] = 1;

    std::cout << "Labels: " << std::endl;
    print_array(*labels[0], n, 1);

    int i = kmeans::kmeans(iterations, n, d, k, data, labels, centroids, distances, 1);

    std::cout << "Labels: " << std::endl;
    print_array(*labels[0], n, 1);

    std::cout << "Centroids:" << std::endl;
    print_array(*centroids[0], k, d);

    std::cout << "Distances:" << std::endl;
    print_array(*distances[0], n, 1);
    delete(data[0]);
    delete(labels[0]);
    delete(centroids[0]);
    delete(distances[0]);
}

void more_tiny_test() {
    double dataset[] = {
        0.5, 0.5,
        1.5, 0.5,
        1.5, 1.5,
        0.5, 1.5,
        1.1, 1.2,
        0.5, 15.5,
        1.5, 15.5,
        1.5, 16.5,
        0.5, 16.5,
        1.2, 16.1,
        15.5, 15.5,
        16.5, 15.5,
        16.5, 16.5,
        15.5, 16.5,
        15.6, 16.2,
        15.5, 0.5,
        16.5, 0.5,
        16.5, 1.5,
        15.5, 1.5,
        15.7, 1.6};
    double centers[] = {
        0.5, 0.5,
        1.5, 0.5,
        1.5, 1.5,
        0.5, 1.5};

    int iterations = 3;
    int n = 20;
    int d = 2;
    int k = 4;

    thrust::device_vector<double> *data[1];
    thrust::device_vector<int> *labels[1];
    thrust::device_vector<double> *centroids[1];
    thrust::device_vector<double> *distances[1];
    data[0] = new thrust::device_vector<double>(dataset, dataset+n*d);
    labels[0] = new thrust::device_vector<int>(n);
    centroids[0] = new thrust::device_vector<double>(centers, centers+k*d);
    distances[0] = new thrust::device_vector<double>(n);

    kmeans::kmeans(iterations, n, d, k, data, labels, centroids, distances, 1, false);

    std::cout << "Labels: " << std::endl;
    print_array(*labels[0], n, 1);

    std::cout << "Centroids:" << std::endl;
    print_array(*centroids[0], k, d);

    delete(data[0]);
    delete(labels[0]);
    delete(centroids[0]);
    delete(distances[0]);
}

int main() {
    std::cout << "Input a character to choose a test:" << std::endl;
    std::cout << "Tiny test: t" << std::endl;
    std::cout << "More tiny test: m" << std::endl;
    std::cout << "Huge test: h" << std::endl;
    char c;
    std::cin >> c;
    switch (c) {
    case 't':
        tiny_test();
        exit(0);
    case 'm':
        more_tiny_test();
        exit(0);
    case 'h':
        break;
    default:
        std::cout << "Choice not understood, running huge test" << std::endl;
    }
    int iterations = 100;
    int n = 5e6;
    int d = 50;
    int k = 100;

    int n_gpu;

    cudaGetDeviceCount(&n_gpu);

    //n_gpu = 1;
    std::cout << n_gpu << " gpus." << std::endl;

    thrust::device_vector<double> *data[16];
    thrust::device_vector<int> *labels[16];
    thrust::device_vector<double> *centroids[16];
    thrust::device_vector<double> *distances[16];
    for (int q = 0; q < n_gpu; q++) {
        cudaSetDevice(q);
        data[q] = new thrust::device_vector<double>(n/n_gpu*d);
        labels[q] = new thrust::device_vector<int>(n/n_gpu);
        centroids[q] = new thrust::device_vector<double>(k * d);
        distances[q] = new thrust::device_vector<double>(n/n_gpu);
    }

    std::cout << "Generating random data" << std::endl;
    std::cout << "Number of points: " << n << std::endl;
    std::cout << "Number of dimensions: " << d << std::endl;
    std::cout << "Number of clusters: " << k << std::endl;
    std::cout << "Number of iterations: " << iterations << std::endl;

    for (int q = 0; q < n_gpu; q++) {
        cudaSetDevice(q);
        random_data(*data[q], n/n_gpu, d);
        random_labels(*labels[q], n/n_gpu, k);
    }
    kmeans::timer t;
    t.start();
    kmeans::kmeans(iterations, n, d, k, data, labels, centroids, distances, n_gpu);
    float time = t.stop();
    std::cout << " Time: " << time/1000.0 << " s" << std::endl;

    for (int q = 0; q < n_gpu; q++) {
        cudaSetDevice(q);
        delete(data[q]);
        delete(labels[q]);
        delete(centroids[q]);
        delete(distances[q]);
    }
}
--------------------------------------------------------------------------------
/timer.cu:
--------------------------------------------------------------------------------
#include "timer.h"

namespace kmeans {

timer::timer() {
    cudaEventCreate(&m_start);
    cudaEventCreate(&m_stop);
}

timer::~timer() {
    cudaEventDestroy(m_start);
    cudaEventDestroy(m_stop);
}

void timer::start() {
    cudaEventRecord(m_start, 0);
}

float timer::stop() {
    float time;
    cudaEventRecord(m_stop, 0);
    cudaEventSynchronize(m_stop);
    cudaEventElapsedTime(&time, m_start, m_stop);
    return time;
}

}
--------------------------------------------------------------------------------
/timer.h:
--------------------------------------------------------------------------------
#pragma once
namespace kmeans {

struct timer {
    timer();
    ~timer();
    void start();
    float stop();
private:
    cudaEvent_t m_start, m_stop;
};

}
--------------------------------------------------------------------------------
/util.h:
--------------------------------------------------------------------------------
#pragma once
#include <iostream>

template<typename T>
void print_array(T& array, int m, int n) {
    for(int i = 0; i < m; i++) {
        for(int j = 0; j < n; j++) {
            typename T::value_type value = array[i * n + j];
            std::cout << value << " ";
        }
        std::cout << std::endl;
    }
}
--------------------------------------------------------------------------------