├── LICENSE
├── README
├── README.md
├── centroids.cu
├── centroids.h
├── contributors.txt
├── kmeans.cu
├── kmeans.h
├── labels.cu
├── labels.h
├── orig
│   ├── Makefile
│   └── test.cpp
├── test.cu
├── timer.cu
├── timer.h
└── util.h

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and
distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright
owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities
that control, are controlled by, or are under common control with that entity.
For the purposes of this definition, "control" means (i) the power, direct or
indirect, to cause the direction or management of such entity, whether by
contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising
permissions granted by this License.

"Source" form shall mean the preferred form for making modifications, including
but not limited to software source code, documentation source, and configuration
files.

"Object" form shall mean any form resulting from mechanical transformation or
translation of a Source form, including but not limited to compiled object code,
generated documentation, and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or Object form, made
available under the License, as indicated by a copyright notice that is included
in or attached to the work (an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object form, that
is based on (or derived from) the Work and for which the editorial revisions,
annotations, elaborations, or other modifications represent, as a whole, an
original work of authorship. For the purposes of this License, Derivative Works
shall not include works that remain separable from, or merely link (or bind by
name) to the interfaces of, the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including the original version
of the Work and any modifications or additions to that Work or Derivative Works
thereof, that is intentionally submitted to Licensor for inclusion in the Work
by the copyright owner or by an individual or Legal Entity authorized to submit
on behalf of the copyright owner. For the purposes of this definition,
"submitted" means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems, and
issue tracking systems that are managed by, or on behalf of, the Licensor for
the purpose of discussing and improving the Work, but excluding communication
that is conspicuously marked or otherwise designated in writing by the copyright
owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
of whom a Contribution has been received by Licensor and subsequently
incorporated within the Work.

2. Grant of Copyright License.

Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the Work and such
Derivative Works in Source or Object form.

3. Grant of Patent License.

Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to make, have
made, use, offer to sell, sell, import, and otherwise transfer the Work, where
such license applies only to those patent claims licensable by such Contributor
that are necessarily infringed by their Contribution(s) alone or by combination
of their Contribution(s) with the Work to which such Contribution(s) was
submitted. If You institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work or a
Contribution incorporated within the Work constitutes direct or contributory
patent infringement, then any patent licenses granted to You under this License
for that Work shall terminate as of the date such litigation is filed.

4. Redistribution.

You may reproduce and distribute copies of the Work or Derivative Works thereof
in any medium, with or without modifications, and in Source or Object form,
provided that You meet the following conditions:

You must give any other recipients of the Work or Derivative Works a copy of
this License; and
You must cause any modified files to carry prominent notices stating that You
changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute,
all copyright, patent, trademark, and attribution notices from the Source form
of the Work, excluding those notices that do not pertain to any part of the
Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any
Derivative Works that You distribute must include a readable copy of the
attribution notices contained within such NOTICE file, excluding those notices
that do not pertain to any part of the Derivative Works, in at least one of the
following places: within a NOTICE text file distributed as part of the
Derivative Works; within the Source form or documentation, if provided along
with the Derivative Works; or, within a display generated by the Derivative
Works, if and wherever such third-party notices normally appear. The contents of
the NOTICE file are for informational purposes only and do not modify the
License. You may add Your own attribution notices within Derivative Works that
You distribute, alongside or as an addendum to the NOTICE text from the Work,
provided that such additional attribution notices cannot be construed as
modifying the License.
You may add Your own copyright statement to Your modifications and may provide
additional or different license terms and conditions for use, reproduction, or
distribution of Your modifications, or for any such Derivative Works as a whole,
provided Your use, reproduction, and distribution of the Work otherwise complies
with the conditions stated in this License.

5. Submission of Contributions.

Unless You explicitly state otherwise, any Contribution intentionally submitted
for inclusion in the Work by You to the Licensor shall be under the terms and
conditions of this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify the terms of
any separate license agreement you may have executed with Licensor regarding
such Contributions.

6. Trademarks.

This License does not grant permission to use the trade names, trademarks,
service marks, or product names of the Licensor, except as required for
reasonable and customary use in describing the origin of the Work and
reproducing the content of the NOTICE file.

7. Disclaimer of Warranty.

Unless required by applicable law or agreed to in writing, Licensor provides the
Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
including, without limitation, any warranties or conditions of TITLE,
NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
solely responsible for determining the appropriateness of using or
redistributing the Work and assume any risks associated with Your exercise of
permissions under this License.

8. Limitation of Liability.

In no event and under no legal theory, whether in tort (including negligence),
contract, or otherwise, unless required by applicable law (such as deliberate
and grossly negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special, incidental,
or consequential damages of any character arising as a result of this License or
out of the use or inability to use the Work (including but not limited to
damages for loss of goodwill, work stoppage, computer failure or malfunction, or
any and all other commercial damages or losses), even if such Contributor has
been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability.

While redistributing the Work or Derivative Works thereof, You may choose to
offer, and charge a fee for, acceptance of support, warranty, indemnity, or
other liability obligations and/or rights consistent with this License. However,
in accepting such obligations, You may act only on Your own behalf and on Your
sole responsibility, not on behalf of any other Contributor, and only if You
agree to indemnify, defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason of your
accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work

To apply the Apache License to your work, attach the following boilerplate
notice, with the fields enclosed by brackets "[]" replaced with your own
identifying information. (Don't include the brackets!) The text should be
enclosed in the appropriate comment syntax for the file format. We also
recommend that a file or class name and description of purpose be included on
the same "printed page" as the copyright notice for easier identification within
third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
** Building kmeans for GPU **

Prerequisites:
CUB (https://github.com/nvlabs/CUB)
CUDA

Either modify the Makefile to specify the path to CUB, or specify CUB_HOME on
your make invocation:

%> make CUB_HOME=/home-2/lbarnes/CUB/cub-1.0.2

By default, the build enables some additional data exchange between CPU and GPU
in order to print progress updates with each iteration. This reduces performance
a bit. Disable these updates by setting USERFLAGS to "":

%> make CUB_HOME=/home-2/lbarnes/CUB/cub-1.0.2 USERFLAGS=""

** Running **

The code is set to generate random data and cluster a set of 5M points of
dimension 50. By default it iterates 100 times. Adjust these parameters in
test.cu. It will use all the GPUs it finds. Use CUDA_VISIBLE_DEVICES to limit
the number of GPUs.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
kmeans
======

A simple kmeans clustering implementation for double precision data,
written for CUDA GPUs.

There are two ideas here:

1. The relabel step of kmeans relies on computing distances between
all n points (x) and all k centroids (y). This code refactors the distance
computation using the identity ||x-y||^2 = x.x + y.y - 2x.y; this
refactorization moves the x.x computation outside the kmeans loop and
uses GEMM to compute the x.y terms, which runs at near-peak throughput.
(A sketch of this identity in code follows this list.)
2. The computation of new centroids can be tricky because the labels
change every iteration. This code shows how to sort to group all points with
the same label, transforming the centroid accumulation into
simple additions, minimizing atomic memory operations. For many
practical problem sizes, sorting reduces the centroid computation to less
than 20% of the overall runtime of the algorithm.
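
To make the first idea concrete, here is a minimal, self-contained host-side
sketch (plain C++, not part of this library) showing that expanding
||x-y||^2 as x.x + y.y - 2x.y reproduces the direct distance while isolating
the x.x term, which never changes across iterations:

    #include <cstdio>

    int main() {
        const int d = 3;
        double x[] = {1.0, 2.0, 3.0};   // a data point
        double y[] = {0.5, 1.5, 2.5};   // a centroid
        double direct = 0, xx = 0, yy = 0, xy = 0;
        for (int i = 0; i < d; i++) {
            direct += (x[i] - y[i]) * (x[i] - y[i]);
            xx += x[i] * x[i];  // computed once, outside the kmeans loop
            yy += y[i] * y[i];  // recomputed per iteration, but only k of these
            xy += x[i] * y[i];  // the n*k*d bulk of the work -> one GEMM
        }
        // Both print 0.750000: ||x-y||^2 == x.x + y.y - 2*x.y
        printf("%f %f\n", direct, xx + yy - 2 * xy);
        return 0;
    }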

The CUDA code here is purposefully non-optimized - this code is not
meant to be the fastest possible kmeans implementation, but rather to
show how using libraries like thrust and BLAS can provide reasonable
performance with high programmer productivity.

Multi-GPU version
=================
This version has been updated to use multiple GPUs attached to the same machine.
You do not need to specify the number of GPUs; the program will detect and use
them all.

Prerequisites
=============
* CUDA toolkit 4.2
* CUB 1.0.2 https://github.com/NVLabs/cub

Build
=====
To build, edit the Makefile to specify CUB_HOME, the location of your CUB files.
Then call make.

Run
===
A simple test case is run when you invoke the executable 'test'.

For demonstration, test will generate and solve 3 test cases of different
sizes. At the prompt, specify 't' for a tiny test case, 'm' for a slightly
bigger test case, and 'h' for a huge test case: 5 million points, with 50
dimensions and 100 clusters, for 100 iterations.
--------------------------------------------------------------------------------
/centroids.cu:
--------------------------------------------------------------------------------
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/reduce.h>

#include "labels.h"

__device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull =
        (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                                             __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
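
//Note on the atomicAdd above: the CUDA toolkits this code targets provide no
//built-in atomicAdd for double, so it is emulated with an atomicCAS loop over
//the 64-bit bit pattern - the standard workaround from the CUDA C Programming
//Guide. Compute capability 6.0+ devices do provide a native double atomicAdd,
//and this definition would then collide with the built-in; builds for such
//GPUs would guard the definition, e.g. (untested sketch, not in the original
//source):
//
//  #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
//  __device__ double atomicAdd(double* address, double val) { /* as above */ }
//  #endif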

namespace kmeans {
namespace detail {

__device__ __forceinline__ void update_centroid(int label, int dimension,
                                                int d,
                                                double accumulator, double* centroids,
                                                int count, int* counts) {
    int index = label * d + dimension;
    double* target = centroids + index;
    atomicAdd(target, accumulator);
    if (dimension == 0) {
        atomicAdd(counts + label, count);
    }
}

__global__ void calculate_centroids(int n, int d, int k,
                                    double* data,
                                    int* ordered_labels,
                                    int* ordered_indices,
                                    double* centroids,
                                    int* counts) {
    int in_flight = blockDim.y * gridDim.y;
    int labels_per_row = (n - 1) / in_flight + 1;
    for(int dimension = threadIdx.x; dimension < d; dimension += blockDim.x) {
        double accumulator = 0;
        int count = 0;
        int global_id = threadIdx.y + blockIdx.y * blockDim.y;
        int start = global_id * labels_per_row;
        int end = (global_id + 1) * labels_per_row;
        end = (end > n) ? n : end;
        int prior_label;
        if (start < n) {
            prior_label = ordered_labels[start];

            for(int label_number = start; label_number < end; label_number++) {
                int label = ordered_labels[label_number];
                if (label != prior_label) {
                    update_centroid(prior_label, dimension,
                                    d,
                                    accumulator, centroids,
                                    count, counts);
                    accumulator = 0;
                    count = 0;
                }

                double value = data[dimension + ordered_indices[label_number] * d];
                accumulator += value;
                prior_label = label;
                count++;
            }
            update_centroid(prior_label, dimension,
                            d,
                            accumulator, centroids,
                            count, counts);
        }
    }
}

__global__ void scale_centroids(int d, int k, int* counts, double* centroids) {
    int global_id_x = threadIdx.x + blockIdx.x * blockDim.x;
    int global_id_y = threadIdx.y + blockIdx.y * blockDim.y;
    if ((global_id_x < d) && (global_id_y < k)) {
        int count = counts[global_id_y];
        //To avoid introducing divide by zero errors
        //If a centroid has no weight, we'll do no normalization
        //This will keep its coordinates defined.
        if (count < 1) {
            count = 1;
        }
        double scale = 1.0/double(count);
        centroids[global_id_x + d * global_id_y] *= scale;
    }
}

void find_centroids(int n, int d, int k,
                    thrust::device_vector<double>& data,
                    thrust::device_vector<int>& labels,
                    thrust::device_vector<double>& centroids,
                    thrust::device_vector<int>& range,
                    thrust::device_vector<int>& indices,
                    thrust::device_vector<int>& counts) {
    int dev_num;
    cudaGetDevice(&dev_num);
    detail::memcpy(indices, range);
    //Bring all labels with the same value together
#if 0
    thrust::sort_by_key(labels.begin(),
                        labels.end(),
                        indices.begin());
#else
    mycub::sort_by_key_int(labels, indices);
#endif

    //Initialize centroids to all zeros
    detail::memzero(centroids);

    //Initialize counts to all zeros
    detail::memzero(counts);

    //Calculate centroids
    int n_threads_x = 64;
    int n_threads_y = 16;
    //XXX Number of blocks here is hard coded at 30
    //This should be taken care of more thoughtfully.
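    //A more portable alternative (a sketch, not in the original source) is to
    //derive the Y-dimension block count from the device instead of fixing 30,
    //which was presumably tuned for one particular GPU's SM count:
    //
    //  cudaDeviceProp props;
    //  cudaGetDeviceProperties(&props, dev_num);
    //  int n_blocks_y = 2 * props.multiProcessorCount; //e.g. two blocks per SM
    //  //then launch with dim3(1, n_blocks_y) below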
    detail::calculate_centroids<<<dim3(1, 30),
        dim3(n_threads_x, n_threads_y),
        0, cuda_stream[dev_num]>>>
        (n, d, k,
         thrust::raw_pointer_cast(data.data()),
         thrust::raw_pointer_cast(labels.data()),
         thrust::raw_pointer_cast(indices.data()),
         thrust::raw_pointer_cast(centroids.data()),
         thrust::raw_pointer_cast(counts.data()));

    //Scale centroids
    detail::scale_centroids<<<dim3((d-1)/32+1, (k-1)/32+1),
        dim3(32, 32),
        0, cuda_stream[dev_num]>>>
        (d, k,
         thrust::raw_pointer_cast(counts.data()),
         thrust::raw_pointer_cast(centroids.data()));
}

}
}
--------------------------------------------------------------------------------
/centroids.h:
--------------------------------------------------------------------------------
#include <thrust/device_vector.h>

namespace kmeans {
namespace detail {
void find_centroids(int n, int d, int k,
                    thrust::device_vector<double>& data,
                    thrust::device_vector<int>& labels,
                    thrust::device_vector<double>& centroids,
                    thrust::device_vector<int>& range,
                    thrust::device_vector<int>& indices,
                    thrust::device_vector<int>& counts);

}
}
--------------------------------------------------------------------------------
/contributors.txt:
--------------------------------------------------------------------------------
Levi Barnes
--------------------------------------------------------------------------------
/kmeans.cu:
--------------------------------------------------------------------------------
#include "kmeans.h"
#include "util.h"
#include <thrust/iterator/counting_iterator.h>
#include <iostream>

namespace kmeans {

int kmeans(int iterations,
           int n, int d, int k,
           thrust::device_vector<double>** data,
           thrust::device_vector<int>** labels,
           thrust::device_vector<double>** centroids,
           thrust::device_vector<double>** distances,
           int n_gpu,
           bool init_from_labels,
           double threshold) {
    thrust::device_vector<double> *data_dots[16];
    thrust::device_vector<double> *centroid_dots[16];
    thrust::device_vector<double> *pairwise_distances[16];
    thrust::device_vector<int> *labels_copy[16];
    thrust::device_vector<int> *range[16];
    thrust::device_vector<int> *indices[16];
    thrust::device_vector<int> *counts[16];

    thrust::host_vector<double> h_centroids(k * d);
    thrust::host_vector<double> h_centroids_tmp(k * d);
    int h_changes[16], *d_changes[16];
    double h_distance_sum[16], *d_distance_sum[16];

    for (int q = 0; q < n_gpu; q++) {
        cudaSetDevice(q);
        cudaMalloc(&d_changes[q], sizeof(int));
        cudaMalloc(&d_distance_sum[q], sizeof(double));
        detail::labels_init();
        data_dots[q] = new thrust::device_vector<double>(n/n_gpu);
        centroid_dots[q] = new thrust::device_vector<double>(n/n_gpu);
        pairwise_distances[q] = new thrust::device_vector<double>(n/n_gpu * k);
        labels_copy[q] = new thrust::device_vector<int>(n/n_gpu);
        range[q] = new thrust::device_vector<int>(n/n_gpu);
        counts[q] = new thrust::device_vector<int>(k);
        indices[q] = new thrust::device_vector<int>(n/n_gpu);
        //Create and save "range" for initializing labels
        thrust::copy(thrust::counting_iterator<int>(0),
                     thrust::counting_iterator<int>(n/n_gpu),
                     (*range[q]).begin());

        detail::make_self_dots(n/n_gpu, d, *data[q], *data_dots[q]);
        if (init_from_labels) {
            detail::find_centroids(n/n_gpu, d, k, *data[q], *labels[q], *centroids[q], *range[q], *indices[q], *counts[q]);
        }
    }

    double prior_distance_sum = 0;
    int i = 0;
    for(; i < iterations; i++) {
        //Average the centroids from each device
        if (n_gpu > 1) {
            for (int p = 0; p < k * d; p++) h_centroids[p] = 0.0;
            for (int q = 0; q < n_gpu; q++) {
                cudaSetDevice(q);
                detail::memcpy(h_centroids_tmp, *centroids[q]);
                detail::streamsync(q);
                for (int p = 0; p < k * d; p++) h_centroids[p] += h_centroids_tmp[p];
            }
            for (int p = 0; p < k * d; p++) h_centroids[p] /= n_gpu;
            //Copy the averaged centroids to each device
            for (int q = 0; q < n_gpu; q++) {
                cudaSetDevice(q);
                detail::memcpy(*centroids[q], h_centroids);
            }
        }
        for (int q = 0; q < n_gpu; q++) {
            //TODO compute total distance
            cudaSetDevice(q);

            detail::calculate_distances(n/n_gpu, d, k,
                                        *data[q], *centroids[q], *data_dots[q],
                                        *centroid_dots[q], *pairwise_distances[q]);

            detail::relabel(n/n_gpu, k, *pairwise_distances[q], *labels[q], *distances[q], d_changes[q]);
            //TODO remove one memcpy
            detail::memcpy(*labels_copy[q], *labels[q]);
            detail::find_centroids(n/n_gpu, d, k, *data[q], *labels[q], *centroids[q], *range[q], *indices[q], *counts[q]);
            detail::memcpy(*labels[q], *labels_copy[q]);
            //double d_distance_sum[q] = thrust::reduce(distances[q].begin(), distances[q].end())
            mycub::sum_reduce(*distances[q], d_distance_sum[q]);
        }
#if __VERBOSE
        double distance_sum = 0.0;
        for (int q = 0; q < n_gpu; q++) {
            cudaMemcpyAsync(h_changes+q, d_changes[q], sizeof(int), cudaMemcpyDeviceToHost, cuda_stream[q]);
            cudaMemcpyAsync(h_distance_sum+q, d_distance_sum[q], sizeof(double), cudaMemcpyDeviceToHost, cuda_stream[q]);
            detail::streamsync(q);
            std::cout << "Device " << q << ": Iteration " << i << " produced " << h_changes[q]
                      << " changes and the total_distance is " << h_distance_sum[q] << std::endl;
            distance_sum += h_distance_sum[q];
        }
        if (i > 0) {
            double delta = distance_sum / prior_distance_sum;
            if (delta > 1 - threshold) {
                std::cout << "Threshold triggered. Terminating iterations early." << std::endl;
                return i + 1;
            }
        }
        prior_distance_sum = distance_sum;
#endif
    }
    for (int q = 0; q < n_gpu; q++) {
        cudaSetDevice(q);
        cudaFree(d_changes[q]);
        cudaFree(d_distance_sum[q]);
        detail::labels_close();
        delete(pairwise_distances[q]);
        delete(data_dots[q]);
        delete(centroid_dots[q]);
        delete(labels_copy[q]);
        delete(range[q]);
        delete(indices[q]);
        delete(counts[q]);
    }
    return i;
}

}
--------------------------------------------------------------------------------
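One subtlety in kmeans.cu above: the comparison against threshold lives inside
the #if __VERBOSE block, so early termination only happens in builds where
__VERBOSE is defined to a nonzero value; a plain build always runs the full
iteration count. The USERFLAGS knob mentioned in the README is presumably where
that define comes from, e.g.:

%> make CUB_HOME=/path/to/cub USERFLAGS="-D__VERBOSE=1"
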
/kmeans.h:
--------------------------------------------------------------------------------
#pragma once
#include <thrust/device_vector.h>
#include "centroids.h"
#include "labels.h"

namespace kmeans {

//! kmeans clusters data into k groups
/*!

  \param iterations How many iterations to run
  \param n Number of data points
  \param d Number of dimensions
  \param k Number of clusters
  \param data Data points, in row-major order. This vector must have
  size n * d, and since it's in row-major order, data point x occupies
  positions [x * d, (x + 1) * d) in the vector. The vector is passed
  by reference since it is shared with the caller and not copied.
  \param labels Cluster labels. This vector has size n.
  The vector is passed by reference since it is shared with the caller
  and not copied.
  \param centroids Centroid locations, in row-major order. This
  vector must have size k * d, and since it's in row-major order,
  centroid x occupies positions [x * d, (x + 1) * d) in the
  vector. The vector is passed by reference since it is shared
  with the caller and not copied.
  \param distances Distances from points to centroids. This vector has
  size n. It is passed by reference since it is shared with the caller
  and not copied.
  \param init_from_labels If true, the labels need to be initialized
  before calling kmeans. If false, the centroids need to be
  initialized before calling kmeans. Defaults to true, which means
  the labels must be initialized.
  \param threshold This controls early termination of the kmeans
  iterations. If the ratio of the sum of distances from points to
  centroids from this iteration to the previous iteration changes by
  less than the threshold, then the iterations are
  terminated. Defaults to 0.000001
  \return The number of iterations actually performed.
*/

int kmeans(int iterations,
           int n, int d, int k,
           thrust::device_vector<double>** data,
           thrust::device_vector<int>** labels,
           thrust::device_vector<double>** centroids,
           thrust::device_vector<double>** distances,
           int n_gpu=1,
           bool init_from_labels=true,
           double threshold=0.000001
           );

}
--------------------------------------------------------------------------------
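For reference, a minimal single-GPU call of this API looks like the sketch
below (modeled on tiny_test in test.cu; the my_* names are illustrative, not
part of the library). The function takes arrays of per-device vector pointers;
with n_gpu == 1 each array holds a single entry:

    thrust::device_vector<double> my_data(n * d);       // filled by the caller
    thrust::device_vector<int>    my_labels(n);         // initial guesses in 0..k-1
    thrust::device_vector<double> my_centroids(k * d);
    thrust::device_vector<double> my_distances(n);

    thrust::device_vector<double>* data[1]      = { &my_data };
    thrust::device_vector<int>*    labels[1]    = { &my_labels };
    thrust::device_vector<double>* centroids[1] = { &my_centroids };
    thrust::device_vector<double>* distances[1] = { &my_distances };

    int iters = kmeans::kmeans(100, n, d, k, data, labels,
                               centroids, distances, 1 /*n_gpu*/);
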
/labels.cu:
--------------------------------------------------------------------------------
#include "labels.h"
#include <cfloat>
#include <cublas_v2.h>
#include <iostream>

cudaStream_t cuda_stream[16];
namespace kmeans {
namespace detail {

cublasHandle_t cublas_handle[16];

void labels_init() {
    cublasStatus_t stat;
    cudaError_t err;
    int dev_num;
    cudaGetDevice(&dev_num);
    stat = cublasCreate(&detail::cublas_handle[dev_num]);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        std::cout << "CUBLAS initialization failed" << std::endl;
        exit(1);
    }
    err = cudaStreamCreate(&cuda_stream[dev_num]);
    if (err != cudaSuccess) {
        std::cout << "Stream creation failed" << std::endl;
        exit(1);
    }
    cublasSetStream(cublas_handle[dev_num], cuda_stream[dev_num]);
    mycub::cub_init();
}
void labels_close() {
    int dev_num;
    cudaGetDevice(&dev_num);
    cublasDestroy(cublas_handle[dev_num]);
    cudaStreamDestroy(cuda_stream[dev_num]);
    mycub::cub_close();
}

void streamsync(int dev_num) {
    cudaStreamSynchronize(cuda_stream[dev_num]);
}

//n: number of points
//d: dimensionality of points
//data: points, laid out in row-major order (n rows, d cols)
//dots: result vector (n rows)
//NOTE:
//Memory accesses in this function are uncoalesced!!
//This is because data is in row major order.
//However, in k-means, it's called outside the optimization loop
//on the large data array, and inside the optimization loop it's
//called only on a small array, so it doesn't really matter.
//If this becomes a performance limiter, transpose the data somewhere.
__global__ void self_dots(int n, int d, double* data, double* dots) {
    double accumulator = 0;
    int global_id = blockDim.x * blockIdx.x + threadIdx.x;

    if (global_id < n) {
        for (int i = 0; i < d; i++) {
            double value = data[i + global_id * d];
            accumulator += value * value;
        }
        dots[global_id] = accumulator;
    }
}

void make_self_dots(int n, int d, thrust::device_vector<double>& data,
                    thrust::device_vector<double>& dots) {
    int dev_num;
    cudaGetDevice(&dev_num);
    self_dots<<<(n-1)/256+1, 256, 0, cuda_stream[dev_num]>>>(
        n, d, thrust::raw_pointer_cast(data.data()),
        thrust::raw_pointer_cast(dots.data()));
}

__global__ void all_dots(int n, int k, double* data_dots, double* centroid_dots, double* dots) {
    __shared__ double local_data_dots[32];
    __shared__ double local_centroid_dots[32];

    int data_index = threadIdx.x + blockIdx.x * blockDim.x;
    if ((data_index < n) && (threadIdx.y == 0)) {
        local_data_dots[threadIdx.x] = data_dots[data_index];
    }

    int centroid_index = threadIdx.x + blockIdx.y * blockDim.y;
    if ((centroid_index < k) && (threadIdx.y == 1)) {
        local_centroid_dots[threadIdx.x] = centroid_dots[centroid_index];
    }

    __syncthreads();

    centroid_index = threadIdx.y + blockIdx.y * blockDim.y;
    if ((data_index < n) && (centroid_index < k)) {
        dots[data_index + centroid_index * n] = local_data_dots[threadIdx.x] +
                                                local_centroid_dots[threadIdx.y];
    }
}

void make_all_dots(int n, int k, thrust::device_vector<double>& data_dots,
                   thrust::device_vector<double>& centroid_dots,
                   thrust::device_vector<double>& dots) {
    int dev_num;
    cudaGetDevice(&dev_num);
    all_dots<<<
        dim3((n-1)/32+1,
             (k-1)/32+1),
        dim3(32, 32), 0,
        cuda_stream[dev_num]>>>(n, k, thrust::raw_pointer_cast(data_dots.data()),
                                thrust::raw_pointer_cast(centroid_dots.data()),
                                thrust::raw_pointer_cast(dots.data()));
};

void calculate_distances(int n, int d, int k,
                         thrust::device_vector<double>& data,
                         thrust::device_vector<double>& centroids,
                         thrust::device_vector<double>& data_dots,
                         thrust::device_vector<double>& centroid_dots,
                         thrust::device_vector<double>& pairwise_distances) {
    detail::make_self_dots(k, d, centroids, centroid_dots);
    detail::make_all_dots(n, k, data_dots, centroid_dots, pairwise_distances);
    //||x-y||^2 = ||x||^2 + ||y||^2 - 2 x . y
    //pairwise_distances has ||x||^2 + ||y||^2, so beta = 1
    //The dgemm calculates x.y for all x and y, so alpha = -2.0
    double alpha = -2.0;
    double beta = 1.0;
    //If the data were in standard column major order, we'd do a
    //centroids * data ^ T
    //But the data is in row major order, so we have to permute
    //the arguments a little
    int dev_num;
    cudaGetDevice(&dev_num);
    cublasStatus_t stat =
        cublasDgemm(detail::cublas_handle[dev_num],
                    CUBLAS_OP_T, CUBLAS_OP_N,
                    n, k, d, &alpha,
                    thrust::raw_pointer_cast(data.data()),
                    d, //Has to be n or d
                    thrust::raw_pointer_cast(centroids.data()),
                    d, //Has to be k or d
                    &beta,
                    thrust::raw_pointer_cast(pairwise_distances.data()),
                    n); //Has to be n or k

    if (stat != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Invalid Dgemm" << std::endl;
        exit(1);
    }
}

__global__ void make_new_labels(int n, int k, double* pairwise_distances,
                                int* labels, int* changes,
                                double* distances) {
    double min_distance = DBL_MAX;
    int min_idx = -1;
    int global_id = threadIdx.x + blockIdx.x * blockDim.x;
    if (global_id < n) {
        int old_label = labels[global_id];
        for(int c = 0; c < k; c++) {
            double distance = pairwise_distances[c * n + global_id];
            if (distance < min_distance) {
                min_distance = distance;
                min_idx = c;
            }
        }
        labels[global_id] = min_idx;
        distances[global_id] = min_distance;
        if (old_label != min_idx) {
            atomicAdd(changes, 1);
        }
    }
}

void relabel(int n, int k,
             thrust::device_vector<double>& pairwise_distances,
             thrust::device_vector<int>& labels,
             thrust::device_vector<double>& distances,
             int *d_changes) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaMemsetAsync(d_changes, 0, sizeof(int), cuda_stream[dev_num]);
    make_new_labels<<<(n-1)/256+1, 256, 0, cuda_stream[dev_num]>>>(
        n, k,
        thrust::raw_pointer_cast(pairwise_distances.data()),
        thrust::raw_pointer_cast(labels.data()),
        d_changes,
        thrust::raw_pointer_cast(distances.data()));
}

}
}
namespace mycub {
void *d_key_alt_buf[16];
unsigned int key_alt_buf_bytes[16];
void *d_value_alt_buf[16];
unsigned int value_alt_buf_bytes[16];
void *d_temp_storage[16];
size_t temp_storage_bytes[16];
void *d_temp_storage2[16];
size_t temp_storage_bytes2[16];
bool cub_initted;
void cub_init() {
    std::cout << "CUB init" << std::endl;
    for (int q = 0; q < 16; q++) {
        d_key_alt_buf[q] = NULL;
        key_alt_buf_bytes[q] = 0;
        d_value_alt_buf[q] = NULL;
        value_alt_buf_bytes[q] = 0;
        d_temp_storage[q] = NULL;
        temp_storage_bytes[q] = 0;
        d_temp_storage2[q] = NULL;
        temp_storage_bytes2[q] = 0;
    }
    cub_initted = true;
}
void cub_close() {
    for (int q = 0; q < 16; q++) {
        if (d_key_alt_buf[q]) cudaFree(d_key_alt_buf[q]);
        if (d_value_alt_buf[q]) cudaFree(d_value_alt_buf[q]);
        if (d_temp_storage[q]) cudaFree(d_temp_storage[q]);
        if (d_temp_storage2[q]) cudaFree(d_temp_storage2[q]);
        d_temp_storage[q] = NULL;
        d_temp_storage2[q] = NULL;
    }
    cub_initted = false;
}
void sort_by_key_int(thrust::device_vector<int>& keys, thrust::device_vector<int>& values) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaStream_t this_stream = cuda_stream[dev_num];
    int SIZE = keys.size();
    //int *d_key_alt_buf, *d_value_alt_buf;
    if (key_alt_buf_bytes[dev_num] < sizeof(int)*SIZE) {
        if (d_key_alt_buf[dev_num]) cudaFree(d_key_alt_buf[dev_num]);
        cudaMalloc(&d_key_alt_buf[dev_num], sizeof(int)*SIZE);
        key_alt_buf_bytes[dev_num] = sizeof(int)*SIZE;
    }
    if (value_alt_buf_bytes[dev_num] < sizeof(int)*SIZE) {
        if (d_value_alt_buf[dev_num]) cudaFree(d_value_alt_buf[dev_num]);
        cudaMalloc(&d_value_alt_buf[dev_num], sizeof(int)*SIZE);
        value_alt_buf_bytes[dev_num] = sizeof(int)*SIZE;
    }
    cub::DoubleBuffer<int> d_keys(thrust::raw_pointer_cast(keys.data()), (int*)d_key_alt_buf[dev_num]);
    cub::DoubleBuffer<int> d_values(thrust::raw_pointer_cast(values.data()), (int*)d_value_alt_buf[dev_num]);

    // Determine temporary device storage requirements for sorting operation
    if (!d_temp_storage[dev_num]) {
        cub::DeviceRadixSort::SortPairs(d_temp_storage[dev_num], temp_storage_bytes[dev_num], d_keys,
                                        d_values, SIZE, 0, sizeof(int)*8, this_stream);
        // Allocate temporary storage for sorting operation
        cudaMalloc(&d_temp_storage[dev_num], temp_storage_bytes[dev_num]);
    }
    // Run sorting operation
    cub::DeviceRadixSort::SortPairs(d_temp_storage[dev_num], temp_storage_bytes[dev_num], d_keys,
                                    d_values, SIZE, 0, sizeof(int)*8, this_stream);
    // Sorted keys and values are referenced by d_keys.Current() and d_values.Current()
}
}
--------------------------------------------------------------------------------
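A caveat on sort_by_key_int above: cub::DeviceRadixSort::SortPairs is free to
leave its output in either half of each DoubleBuffer, yet the callers read the
original thrust vectors afterwards. That works when the radix sort performs an
even number of passes (the selector flips back to the original buffer), but the
CUB API does not guarantee it. A defensive version (a sketch, not in the
original source) would copy back explicitly before returning:

    if (d_keys.Current() != thrust::raw_pointer_cast(keys.data())) {
        cudaMemcpyAsync(thrust::raw_pointer_cast(keys.data()), d_keys.Current(),
                        sizeof(int)*SIZE, cudaMemcpyDeviceToDevice, this_stream);
        cudaMemcpyAsync(thrust::raw_pointer_cast(values.data()), d_values.Current(),
                        sizeof(int)*SIZE, cudaMemcpyDeviceToDevice, this_stream);
    }
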
/labels.h:
--------------------------------------------------------------------------------
#pragma once
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <cub/cub.cuh>

extern cudaStream_t cuda_stream[16];

template<int I>
extern __global__ void debugMark() {};

namespace kmeans {
namespace detail {

void labels_init();
void labels_close();

template<typename T>
void memcpy(thrust::host_vector<T, std::allocator<T> > &H,
            thrust::device_vector<T, thrust::device_malloc_allocator<T> > &D) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaMemcpyAsync(thrust::raw_pointer_cast(H.data()),
                    thrust::raw_pointer_cast(D.data()),
                    sizeof(T) * D.size(), cudaMemcpyDeviceToHost, cuda_stream[dev_num]);
}

template<typename T>
void memcpy(thrust::device_vector<T, thrust::device_malloc_allocator<T> > &D,
            thrust::host_vector<T, std::allocator<T> > &H) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaMemcpyAsync(thrust::raw_pointer_cast(D.data()),
                    thrust::raw_pointer_cast(H.data()),
                    sizeof(T) * H.size(), cudaMemcpyHostToDevice, cuda_stream[dev_num]);
}
template<typename T>
void memcpy(thrust::device_vector<T, thrust::device_malloc_allocator<T> > &Do,
            thrust::device_vector<T, thrust::device_malloc_allocator<T> > &Di) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaMemcpyAsync(thrust::raw_pointer_cast(Do.data()),
                    thrust::raw_pointer_cast(Di.data()),
                    sizeof(T) * Di.size(), cudaMemcpyDeviceToDevice, cuda_stream[dev_num]);
}
template<typename T>
void memzero(thrust::device_vector<T, thrust::device_malloc_allocator<T> >& D) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaMemsetAsync(thrust::raw_pointer_cast(D.data()), 0, sizeof(T)*D.size(), cuda_stream[dev_num]);
}
void streamsync(int dev_num);

void make_self_dots(int n, int d,
                    thrust::device_vector<double>& data,
                    thrust::device_vector<double>& dots);

void make_all_dots(int n, int k, thrust::device_vector<double>& data_dots,
                   thrust::device_vector<double>& centroid_dots,
                   thrust::device_vector<double>& dots);

void calculate_distances(int n, int d, int k,
                         thrust::device_vector<double>& data,
                         thrust::device_vector<double>& centroids,
                         thrust::device_vector<double>& data_dots,
                         thrust::device_vector<double>& centroid_dots,
                         thrust::device_vector<double>& pairwise_distances);

void relabel(int n, int k,
             thrust::device_vector<double>& pairwise_distances,
             thrust::device_vector<int>& labels,
             thrust::device_vector<double>& distances,
             int *d_changes);

}
}
namespace mycub {

extern void *d_key_alt_buf[16];
extern unsigned int key_alt_buf_bytes[16];
extern void *d_value_alt_buf[16];
extern unsigned int value_alt_buf_bytes[16];
extern void *d_temp_storage[16];
extern size_t temp_storage_bytes[16];
extern void *d_temp_storage2[16];
extern size_t temp_storage_bytes2[16];
extern bool cub_initted;

void sort_by_key_int(thrust::device_vector<int>& keys, thrust::device_vector<int>& values);

template<typename T, typename U>
void sort_by_key(thrust::device_vector<T>& keys, thrust::device_vector<U>& values) {
    int dev_num;
    cudaGetDevice(&dev_num);
    cudaStream_t this_stream = cuda_stream[dev_num];
    int SIZE = keys.size();
    if (key_alt_buf_bytes[dev_num] < sizeof(T)*SIZE) {
        if (d_key_alt_buf[dev_num]) cudaFree(d_key_alt_buf[dev_num]);
        cudaMalloc(&d_key_alt_buf[dev_num], sizeof(T)*SIZE);
        key_alt_buf_bytes[dev_num] = sizeof(T)*SIZE;
        std::cout << "Malloc key_alt_buf" << std::endl;
    }
    if (value_alt_buf_bytes[dev_num] < sizeof(U)*SIZE) {
        if (d_value_alt_buf[dev_num]) cudaFree(d_value_alt_buf[dev_num]);
        cudaMalloc(&d_value_alt_buf[dev_num], sizeof(U)*SIZE);
        value_alt_buf_bytes[dev_num] = sizeof(U)*SIZE;
        std::cout << "Malloc value_alt_buf" << std::endl;
    }
    cub::DoubleBuffer<T> d_keys(thrust::raw_pointer_cast(keys.data()), (T*)d_key_alt_buf[dev_num]);
    cub::DoubleBuffer<U> d_values(thrust::raw_pointer_cast(values.data()), (U*)d_value_alt_buf[dev_num]);
    cudaError_t err;

    // Determine temporary device storage requirements for sorting operation
    //if (temp_storage_bytes[dev_num] == 0) {
    void *d_temp;
    size_t temp_bytes;
    err = cub::DeviceRadixSort::SortPairs(d_temp_storage[dev_num], temp_bytes, d_keys,
                                          d_values, SIZE, 0, sizeof(T)*8, this_stream);
    // Allocate temporary storage for sorting operation
    cudaMalloc(&d_temp, temp_bytes);
    d_temp_storage[dev_num] = d_temp;
    temp_storage_bytes[dev_num] = temp_bytes;
    std::cout << "Malloc temp_storage. " << temp_storage_bytes[dev_num] << " bytes" << std::endl;
    std::cout << "d_temp_storage[" << dev_num << "] = " << d_temp_storage[dev_num] << std::endl;
    if (err) {
        std::cout << "Error " << err << " in SortPairs 1" << std::endl;
        std::cout << cudaGetErrorString(err) << std::endl;
    }
    //}
    // Run sorting operation
    err = cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys,
                                          d_values, SIZE, 0, sizeof(T)*8, this_stream);
    if (err) std::cout << "Error in SortPairs 2" << std::endl;
    //cub::DeviceRadixSort::SortPairs(d_temp_storage[dev_num], temp_storage_bytes[dev_num], d_keys,
    //                                d_values, SIZE, 0, sizeof(T)*8, this_stream);
}
template<typename T>
void sum_reduce(thrust::device_vector<T>& values, T* sum) {
    int dev_num;
    cudaGetDevice(&dev_num);
    if (!d_temp_storage2[dev_num]) {
        cub::DeviceReduce::Sum(d_temp_storage2[dev_num], temp_storage_bytes2[dev_num],
                               thrust::raw_pointer_cast(values.data()),
                               sum, values.size(), cuda_stream[dev_num]);
        // Allocate temporary storage for the reduction
        cudaMalloc(&d_temp_storage2[dev_num], temp_storage_bytes2[dev_num]);
    }
    cub::DeviceReduce::Sum(d_temp_storage2[dev_num], temp_storage_bytes2[dev_num],
                           thrust::raw_pointer_cast(values.data()),
                           sum, values.size(), cuda_stream[dev_num]);
}
void cub_init();
void cub_close();
}
--------------------------------------------------------------------------------
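Note that the detail::memcpy and detail::memzero helpers above are
asynchronous: they enqueue the transfer on the current device's stream and
return immediately, so the host must call detail::streamsync before reading the
destination. kmeans.cu follows this pattern when averaging centroids; in
isolation it looks like this (a sketch; h_buf is an illustrative name):

    thrust::host_vector<double> h_buf(k * d);
    kmeans::detail::memcpy(h_buf, *centroids[q]); // async copy on device q's stream
    kmeans::detail::streamsync(q);                // wait for it to finish
    double first = h_buf[0];                      // now safe to read on the host
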
" << temp_storage_bytes[dev_num] << " bytes" << std::endl; 122 | std::cout << "d_temp_storage[" << dev_num << "] = " << d_temp_storage[dev_num] << std::endl; 123 | if (err) { 124 | std::cout <<"Error " << err << " in SortPairs 1" << std::endl; 125 | std::cout << cudaGetErrorString(err) << std::endl; 126 | } 127 | //} 128 | // Run sorting operation 129 | err = cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys, 130 | d_values, SIZE, 0, sizeof(T)*8, this_stream); 131 | if (err) std::cout <<"Error in SortPairs 2" << std::endl; 132 | //cub::DeviceRadixSort::SortPairs(d_temp_storage[dev_num], temp_storage_bytes[dev_num], d_keys, 133 | // d_values, SIZE, 0, sizeof(T)*8, this_stream); 134 | 135 | } 136 | template 137 | void sum_reduce(thrust::device_vector& values, T* sum) { 138 | int dev_num; 139 | cudaGetDevice(&dev_num); 140 | if (!d_temp_storage2[dev_num]) { 141 | cub::DeviceReduce::Sum(d_temp_storage2[dev_num], temp_storage_bytes2[dev_num], thrust::raw_pointer_cast(values.data()), 142 | sum, values.size(), cuda_stream[dev_num]); 143 | // Allocate temporary storage for sorting operation 144 | cudaMalloc(&d_temp_storage2[dev_num], temp_storage_bytes2[dev_num]); 145 | } 146 | cub::DeviceReduce::Sum(d_temp_storage2[dev_num], temp_storage_bytes2[dev_num], thrust::raw_pointer_cast(values.data()), 147 | sum, values.size(), cuda_stream[dev_num]); 148 | } 149 | void cub_init(); 150 | void cub_close(); 151 | } 152 | -------------------------------------------------------------------------------- /orig/Makefile: -------------------------------------------------------------------------------- 1 | #ifeq ($(BLAS), INTEL_MKL) 2 | #CC = icpc 3 | #LIBS = -L/shared/apps/rhel-6.2/intel/parallel-studio-xe-2013/composer_xe_2013_sp1/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lpthread -lm 4 | #endif 5 | #ifeq ($(BLAS), OPENBLAS) 6 | CC = g++ 7 | INCL = -I/usr/include/openblas/ 8 | LIBS = -lopenblas -L/shared/apps/rhel-6.2/libs/openblas-0.2.8/sandybridge/lib 9 | #endif 10 | 11 | 12 | test: test.cpp 13 | $(CC) -g -o test test.cpp $(INCL) $(LIBS) 14 | gemm_test: gemm_test.cpp 15 | $(CC) -g -o gemm_test gemm_test.cpp $(LIBS) 16 | example-1: example-1.cpp 17 | $(CC) -g -o example-1 example-1.cpp $(LIBS) 18 | -------------------------------------------------------------------------------- /orig/test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "stdlib.h" 4 | #include "string.h" //memset 5 | //#include "mkl.h" 6 | #include "cblas.h" 7 | 8 | //extern "C" void dgemm_( char *, char *, int *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int * ); 9 | 10 | void random_data(std::vector& array, int n) { 11 | for (int i=0; i& array, int n, int max) { 15 | for (int i=0; i array_in, int n, int dim, 19 | std::vector& dots) { 20 | for (int pt = 0; pt array_in, int n, int dim, 30 | std::vector labels_in, 31 | std::vector& centroids, int n_cluster) { 32 | std::vector members(n_cluster); //Number of points in each cluster 33 | memset(&members[0], 0, n_cluster*sizeof(int)); 34 | memset(¢roids[0], 0, n_cluster*dim*sizeof(double)); 35 | //Add all vectors in the cluster 36 | for(int pt=0; pt data_in, 54 | std::vector data_dots_in, 55 | int n, int dim, std::vector centroids_in, 56 | std::vector centroid_dots, int n_cluster, 57 | std::vector& pairwise_distances) { 58 | self_dot(centroids_in, n_cluster, dim, centroid_dots); 59 | for (int nn=0; nn data_in, int n, 73 | std::vector pairwise_distances_in, 74 
            int n_cluster, std::vector<int>& labels) {
    int changes = 0;
    for (int nn = 0; nn < n; nn++) {
        double min_distance = 1e20;
        int min_idx = -1;
        for (int cc = 0; cc < n_cluster; cc++) {
            double distance = pairwise_distances_in[nn*n_cluster+cc];
            if (distance < min_distance) {
                min_distance = distance;
                min_idx = cc;
            }
        }
        if (labels[nn] != min_idx) {
            changes++;
            labels[nn] = min_idx;
        }
    }
    return changes;
}

int main(int argc, char** argv) {
    //Default problem sizes; override them on the command line
    int n = 10000;
    int d = 50;
    int n_cluster = 100;
    int iterations = 50;
    if (argc > 1) {
        if (0 == strcmp(argv[1], "--help")) {
            std::cout << "Usage: test <n points> <dimension> "
                         "<n clusters> <iterations>" << std::endl;
            return 0;
        }
        else n = atoi(argv[1]);
    }
    if (argc > 2) d = atoi(argv[2]);
    if (argc > 3) n_cluster = atoi(argv[3]);
    if (argc > 4) iterations = atoi(argv[4]);

    std::cout << "Generating random data" << std::endl;
    std::cout << n << " points of dimension " << d << std::endl;
    std::cout << n_cluster << " clusters" << std::endl;

    std::vector<double> data(n*d);              //input data
    std::vector<double> centroids(n_cluster*d); //centroids for each cluster
    std::vector<int> labels(n);                 //cluster labels for each point
    std::vector<double> distances(n);           //distance of each point from its centroid

    random_data(data, n*d);

    std::vector<double> data_dots(n);
    std::vector<double> centroid_dots(n_cluster);
    std::vector<double> pairwise_distances(n_cluster * n);
    std::vector<int> labels_copy(n);

    self_dot(data, n, d, data_dots);

    //Let the first n_cluster points be the centroids of the clusters
    memcpy(&centroids[0], &data[0], sizeof(double)*n_cluster*d);

    for(int i = 0; i < iterations; i++) {
        compute_distances(data, data_dots, n, d, centroids,
                          centroid_dots, n_cluster, pairwise_distances);
        int changes = relabel(data, n, pairwise_distances, n_cluster, labels);
        find_centroids(data, n, d, labels, centroids, n_cluster);
        std::cout << "Iteration " << i << " produced " << changes
                  << " changes" << std::endl;
    }
    return 0;
}
--------------------------------------------------------------------------------
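The CPU reference in orig/ is driven entirely from the command line; for
example (sizes illustrative):

%> make test
%> ./test 10000 50 100 10
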
/test.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include "kmeans.h"
#include "timer.h"
#include "util.h"
#include <cstdlib>
#include "cuda.h"

#include <thrust/host_vector.h>

template<typename T>
void fill_array(T& array, int m, int n) {
    for(int i = 0; i < m; i++) {
        for(int j = 0; j < n; j++) {
            array[i * n + j] = (i % 2)*3 + j;
        }
    }
}

void random_data(thrust::device_vector<double>& array, int m, int n) {
    thrust::host_vector<double> host_array(m*n);
    for(int i = 0; i < m * n; i++) {
        host_array[i] = (double)rand()/(double)RAND_MAX;
    }
    array = host_array;
}

void random_labels(thrust::device_vector<int>& labels, int n, int k) {
    thrust::host_vector<int> host_labels(n);
    for(int i = 0; i < n; i++) {
        host_labels[i] = rand() % k;
    }
    labels = host_labels;
}

void tiny_test() {
    int iterations = 1;
    int n = 5;
    int d = 3;
    int k = 2;

    thrust::device_vector<double> *data[1];
    thrust::device_vector<int> *labels[1];
    thrust::device_vector<double> *centroids[1];
    thrust::device_vector<double> *distances[1];
    data[0] = new thrust::device_vector<double>(n * d);
    labels[0] = new thrust::device_vector<int>(n);
    centroids[0] = new thrust::device_vector<double>(k * d);
    distances[0] = new thrust::device_vector<double>(n);

    fill_array(*data[0], n, d);
    std::cout << "Data: " << std::endl;
    print_array(*data[0], n, d);

    (*labels[0])[0] = 0;
    (*labels[0])[1] = 0;
    (*labels[0])[2] = 0;
    (*labels[0])[3] = 1;
    (*labels[0])[4] = 1;

    std::cout << "Labels: " << std::endl;
    print_array(*labels[0], n, 1);

    int i = kmeans::kmeans(iterations, n, d, k, data, labels, centroids, distances, 1);

    std::cout << "Labels: " << std::endl;
    print_array(*labels[0], n, 1);

    std::cout << "Centroids:" << std::endl;
    print_array(*centroids[0], k, d);

    std::cout << "Distances:" << std::endl;
    print_array(*distances[0], n, 1);
    delete(data[0]);
    delete(labels[0]);
    delete(centroids[0]);
    delete(distances[0]);
}

void more_tiny_test() {
    double dataset[] = {
        0.5, 0.5,
        1.5, 0.5,
        1.5, 1.5,
        0.5, 1.5,
        1.1, 1.2,
        0.5, 15.5,
        1.5, 15.5,
        1.5, 16.5,
        0.5, 16.5,
        1.2, 16.1,
        15.5, 15.5,
        16.5, 15.5,
        16.5, 16.5,
        15.5, 16.5,
        15.6, 16.2,
        15.5, 0.5,
        16.5, 0.5,
        16.5, 1.5,
        15.5, 1.5,
        15.7, 1.6};
    double centers[] = {
        0.5, 0.5,
        1.5, 0.5,
        1.5, 1.5,
        0.5, 1.5};

    int iterations = 3;
    int n = 20;
    int d = 2;
    int k = 4;

    thrust::device_vector<double> *data[1];
    thrust::device_vector<int> *labels[1];
    thrust::device_vector<double> *centroids[1];
    thrust::device_vector<double> *distances[1];
    data[0] = new thrust::device_vector<double>(dataset, dataset+n*d);
    labels[0] = new thrust::device_vector<int>(n);
    centroids[0] = new thrust::device_vector<double>(centers, centers+k*d);
    distances[0] = new thrust::device_vector<double>(n);

    kmeans::kmeans(iterations, n, d, k, data, labels, centroids, distances, 1, false);

    std::cout << "Labels: " << std::endl;
    print_array(*labels[0], n, 1);

    std::cout << "Centroids:" << std::endl;
    print_array(*centroids[0], k, d);

    delete(data[0]);
    delete(labels[0]);
    delete(centroids[0]);
    delete(distances[0]);
}

int main() {
    std::cout << "Input a character to choose a test:" << std::endl;
    std::cout << "Tiny test: t" << std::endl;
    std::cout << "More tiny test: m" << std::endl;
    std::cout << "Huge test: h" << std::endl;
    char c;
    std::cin >> c;
    switch (c) {
    case 't':
        tiny_test();
        exit(0);
    case 'm':
        more_tiny_test();
        exit(0);
    case 'h':
        break;
    default:
        std::cout << "Choice not understood, running huge test" << std::endl;
    }
    int iterations = 100;
    int n = 5e6;
    int d = 50;
    int k = 100;

    int n_gpu;

    cudaGetDeviceCount(&n_gpu);

    //n_gpu = 1;
    std::cout << n_gpu << " gpus." << std::endl;

    thrust::device_vector<double> *data[16];
    thrust::device_vector<int> *labels[16];
    thrust::device_vector<double> *centroids[16];
    thrust::device_vector<double> *distances[16];
    for (int q = 0; q < n_gpu; q++) {
        cudaSetDevice(q);
        data[q] = new thrust::device_vector<double>(n/n_gpu*d);
        labels[q] = new thrust::device_vector<int>(n/n_gpu);
        centroids[q] = new thrust::device_vector<double>(k * d);
        distances[q] = new thrust::device_vector<double>(n/n_gpu);
    }

    std::cout << "Generating random data" << std::endl;
    std::cout << "Number of points: " << n << std::endl;
    std::cout << "Number of dimensions: " << d << std::endl;
    std::cout << "Number of clusters: " << k << std::endl;
    std::cout << "Number of iterations: " << iterations << std::endl;

    for (int q = 0; q < n_gpu; q++) {
        cudaSetDevice(q);
        random_data(*data[q], n/n_gpu, d);
        random_labels(*labels[q], n/n_gpu, k);
    }
    kmeans::timer t;
    t.start();
    kmeans::kmeans(iterations, n, d, k, data, labels, centroids, distances, n_gpu);
    float time = t.stop();
    std::cout << " Time: " << time/1000.0 << " s" << std::endl;

    for (int q = 0; q < n_gpu; q++) {
        cudaSetDevice(q);
        delete(data[q]);
        delete(labels[q]);
        delete(centroids[q]);
        delete(distances[q]);
    }
}
--------------------------------------------------------------------------------
/timer.cu:
--------------------------------------------------------------------------------
#include "timer.h"

namespace kmeans {

timer::timer() {
    cudaEventCreate(&m_start);
    cudaEventCreate(&m_stop);
}

timer::~timer() {
    cudaEventDestroy(m_start);
    cudaEventDestroy(m_stop);
}

void timer::start() {
    cudaEventRecord(m_start, 0);
}

float timer::stop() {
    float time;
    cudaEventRecord(m_stop, 0);
    cudaEventSynchronize(m_stop);
    cudaEventElapsedTime(&time, m_start, m_stop);
    return time;
}

}
--------------------------------------------------------------------------------
/timer.h:
--------------------------------------------------------------------------------
#pragma once
namespace kmeans {

struct timer {
    timer();
    ~timer();
    void start();
    float stop();
private:
    cudaEvent_t m_start, m_stop;
};

}
--------------------------------------------------------------------------------
/util.h:
--------------------------------------------------------------------------------
#pragma once
#include <iostream>

template<typename T>
void print_array(T& array, int m, int n) {
    for(int i = 0; i < m; i++) {
        for(int j = 0; j < n; j++) {
            typename T::value_type value = array[i * n + j];
            std::cout << value << " ";
        }
        std::cout << std::endl;
    }
}
--------------------------------------------------------------------------------