├── paper.pdf ├── timer.h ├── Benchmark └── Matlab │ ├── write_clustering_result.m │ ├── spectral_clustering.m │ └── read_unweighted_graph.m ├── timer.cu ├── Makefile ├── labels.cu ├── kmeans.h ├── Makefile_example.inc ├── README.md ├── centroids.h ├── labels.h ├── spectral_clustering.cu └── LICENSE /paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuj-umd/fastsc/HEAD/paper.pdf -------------------------------------------------------------------------------- /timer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | namespace kmeans { 3 | 4 | struct timer { 5 | timer(); 6 | ~timer(); 7 | void start(); 8 | float stop(); 9 | private: 10 | cudaEvent_t m_start, m_stop; 11 | }; 12 | 13 | 14 | } 15 | -------------------------------------------------------------------------------- /Benchmark/Matlab/write_clustering_result.m: -------------------------------------------------------------------------------- 1 | function write_clustering_result(labels, output_filename) 2 | fileID = fopen(output_filename, 'w'); 3 | fprintf(fileID, 'Node ID Label\n'); 4 | for i = 1:size(labels, 1) 5 | fprintf(fileID, '%d %d \n', i, labels(i)); 6 | end 7 | fclose(fileID); 8 | 9 | 10 | 11 | end 12 | 13 | -------------------------------------------------------------------------------- /timer.cu: -------------------------------------------------------------------------------- 1 | #include "timer.h" 2 | 3 | namespace kmeans { 4 | 5 | timer::timer() { 6 | cudaEventCreate(&m_start); 7 | cudaEventCreate(&m_stop); 8 | } 9 | 10 | timer::~timer() { 11 | cudaEventDestroy(m_start); 12 | cudaEventDestroy(m_stop); 13 | } 14 | 15 | void timer::start() { 16 | cudaEventRecord(m_start, 0); 17 | } 18 | 19 | float timer::stop() { 20 | float time; 21 | cudaEventRecord(m_stop, 0); 22 | cudaEventSynchronize(m_stop); 23 | cudaEventElapsedTime(&time, m_start, m_stop); 24 | return time; 25 | } 
26 | 27 | } 28 | 29 | -------------------------------------------------------------------------------- /Benchmark/Matlab/spectral_clustering.m: -------------------------------------------------------------------------------- 1 | function spectral_clustering(input_filename, n, k, output_filename) 2 | S = read_unweighted_graph(input_filename); 3 | % 0-based index to 1-based index 4 | S = S + 1; 5 | val = ones(size(S, 1), 1); 6 | S = [S val]; 7 | G = spconvert(S); 8 | 9 | % Compute I - L. 10 | network_sum = sum(G, 2); 11 | D_inv = diag(1./(sqrt(network_sum))); 12 | L = D_inv*G*D_inv; 13 | L = (L + L') / 2; 14 | [V,D] = eigs(L, k, 'LM'); 15 | labels = kmeans(V, k); 16 | write_clustering_result(labels, output_filename); 17 | end -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CUDA_CPP = nvcc 2 | CUDA_ARCH ?= sm_35 3 | include ../arpackpp/Makefile.inc 4 | CUDA_FLAGS = -arch=$(CUDA_ARCH) -Xptxas -v 5 | CUDA_LIBS = -lcublas -lcusparse 6 | 7 | spectral_clustering: spectral_clustering.cu timer.o labels.o kmeans.h centroids.h 8 | $(CUDA_CPP) $(CPP_FLAGS) $(CUDA_FLAGS) -o spectral_clustering spectral_clustering.cu timer.o labels.o $(ALL_LIBS) $(CUDA_LIBS) 9 | 10 | labels.o: labels.cu labels.h 11 | $(CUDA_CPP) $(CPP_FLAGS) $(CUDA_FLAGS) -c -o labels.o labels.cu 12 | 13 | timer.o: timer.cu timer.h 14 | $(CUDA_CPP) $(CPP_FLAGS) $(CUDA_FLAGS) -c -o timer.o timer.cu 15 | -------------------------------------------------------------------------------- /labels.cu: -------------------------------------------------------------------------------- 1 | #include "labels.h" 2 | 3 | namespace kmeans { 4 | namespace detail { 5 | 6 | struct cublas_state { 7 | cublasHandle_t cublas_handle; 8 | cublas_state() { 9 | cublasStatus_t stat; 10 | stat = cublasCreate(&cublas_handle); 11 | if (stat != CUBLAS_STATUS_SUCCESS) { 12 | std::cout << "CUBLAS initialization failed" << 
std::endl; 13 | exit(1); 14 | } 15 | } 16 | ~cublas_state() { 17 | cublasStatus_t stat; 18 | stat = cublasDestroy(cublas_handle); 19 | if (stat != CUBLAS_STATUS_SUCCESS) { 20 | std::cout << "CUBLAS destruction failed" << std::endl; 21 | exit(1); 22 | } 23 | } 24 | }; 25 | 26 | 27 | cublas_state state; 28 | 29 | void gemm(cublasOperation_t transa, cublasOperation_t transb, 30 | int m, int n, int k, const float *alpha, 31 | const float *A, int lda, const float *B, int ldb, 32 | const float *beta, 33 | float *C, int ldc) { 34 | cublasStatus_t status = cublasSgemm(state.cublas_handle, transa, transb, 35 | m, n, k, alpha, 36 | A, lda, B, ldb, 37 | beta, 38 | C, ldc); 39 | if (status != CUBLAS_STATUS_SUCCESS) { 40 | std::cout << "Invalid Sgemm" << std::endl; 41 | exit(1); 42 | } 43 | } 44 | 45 | void gemm(cublasOperation_t transa, cublasOperation_t transb, 46 | int m, int n, int k, const double *alpha, 47 | const double *A, int lda, const double *B, int ldb, 48 | const double *beta, 49 | double *C, int ldc) { 50 | cublasStatus_t status = cublasDgemm(state.cublas_handle, transa, transb, 51 | m, n, k, alpha, 52 | A, lda, B, ldb, 53 | beta, 54 | C, ldc); 55 | if (status != CUBLAS_STATUS_SUCCESS) { 56 | std::cout << "Invalid Dgemm" << std::endl; 57 | exit(1); 58 | } 59 | } 60 | 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /Benchmark/Matlab/read_unweighted_graph.m: -------------------------------------------------------------------------------- 1 | function S = read_unweighted_graph(filename, startRow, endRow) 2 | %IMPORTFILE Import numeric data from a text file as a matrix. 3 | % FACEBOOKG = IMPORTFILE(FILENAME) Reads data from text file FILENAME for 4 | % the default selection. 5 | % 6 | % FACEBOOKG = IMPORTFILE(FILENAME, STARTROW, ENDROW) Reads data from rows 7 | % STARTROW through ENDROW of text file FILENAME. 
8 | % 9 | % Example: 10 | % facebookG = importfile('facebookG.txt', 1, 176468); 11 | % 12 | % See also TEXTSCAN. 13 | 14 | % Auto-generated by MATLAB on 2017/01/07 22:45:51 15 | 16 | %% Initialize variables. 17 | delimiter = ' '; 18 | if nargin<=2 19 | startRow = 1; 20 | endRow = inf; 21 | end 22 | 23 | %% Format string for each line of text: 24 | % column1: double (%f) 25 | % column2: double (%f) 26 | % For more information, see the TEXTSCAN documentation. 27 | formatSpec = '%f%f%[^\n\r]'; 28 | 29 | %% Open the text file. 30 | fileID = fopen(filename,'r'); 31 | 32 | %% Read columns of data according to format string. 33 | % This call is based on the structure of the file used to generate this 34 | % code. If an error occurs for a different file, try regenerating the code 35 | % from the Import Tool. 36 | dataArray = textscan(fileID, formatSpec, endRow(1)-startRow(1)+1, 'Delimiter', delimiter, 'MultipleDelimsAsOne', true, 'EmptyValue' ,NaN,'HeaderLines', startRow(1)-1, 'ReturnOnError', false); 37 | for block=2:length(startRow) 38 | frewind(fileID); 39 | dataArrayBlock = textscan(fileID, formatSpec, endRow(block)-startRow(block)+1, 'Delimiter', delimiter, 'MultipleDelimsAsOne', true, 'EmptyValue' ,NaN,'HeaderLines', startRow(block)-1, 'ReturnOnError', false); 40 | for col=1:length(dataArray) 41 | dataArray{col} = [dataArray{col};dataArrayBlock{col}]; 42 | end 43 | end 44 | 45 | %% Close the text file. 46 | fclose(fileID); 47 | 48 | %% Post processing for unimportable data. 49 | % No unimportable data rules were applied during the import, so no post 50 | % processing code is included. To generate code which works for 51 | % unimportable data, select unimportable cells in a file and regenerate the 52 | % script. 
53 | 54 | %% Create output variable 55 | S = [dataArray{1:end-1}]; 56 | 57 | -------------------------------------------------------------------------------- /kmeans.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "centroids.h" 4 | #include "labels.h" 5 | #include 6 | 7 | namespace kmeans { 8 | 9 | 10 | //! kmeans clusters data into k groups 11 | /*! 12 | 13 | \param iterations How many iterations to run 14 | \param n Number of data points 15 | \param d Number of dimensions 16 | \param k Number of clusters 17 | \param data Data points, in row-major order. This vector must have 18 | size n * d, and since it's in row-major order, data point x occupies 19 | positions [x * d, (x + 1) * d) in the vector. The vector is passed 20 | by reference since it is shared with the caller and not copied. 21 | \param labels Cluster labels. This vector has size n. 22 | The vector is passed by reference since it is shared with the caller 23 | and not copied. 24 | \param centroids Centroid locations, in row-major order. This 25 | vector must have size k * d, and since it's in row-major order, 26 | centroid x occupies positions [x * d, (x + 1) * d) in the 27 | vector. The vector is passed by reference since it is shared 28 | with the caller and not copied. 29 | \param distances Distances from points to centroids. This vector has 30 | size n. It is passed by reference since it is shared with the caller 31 | and not copied. 32 | \param init_from_labels If true, the labels need to be initialized 33 | before calling kmeans. If false, the centroids need to be 34 | initialized before calling kmeans. Defaults to true, which means 35 | the labels must be initialized. 36 | \param threshold This controls early termination of the kmeans 37 | iterations. 
If the ratio of the sum of distances from points to 38 | centroids from this iteration to the previous iteration changes by 39 | less than the threshold, then the iterations are 40 | terminated. Iterations also stop once the previous sum of distances is zero. Defaults to 0.000001 41 | \return The number of iterations actually performed. 42 | */ 43 | 44 | template 45 | int kmeans(int iterations, 46 | int n, int d, int k, 47 | thrust::device_vector& data, 48 | thrust::device_vector& labels, 49 | thrust::device_vector& centroids, 50 | thrust::device_vector& distances, 51 | bool init_from_labels=true, 52 | double threshold=0.000001) { 53 | thrust::device_vector data_dots(n); 54 | thrust::device_vector centroid_dots(k); //one squared norm per centroid 55 | thrust::device_vector pairwise_distances(n * k); 56 | 57 | detail::make_self_dots(n, d, data, data_dots); 58 | 59 | if (init_from_labels) { 60 | detail::find_centroids(n, d, k, data, labels, centroids); 61 | } 62 | T prior_distance_sum = 0; 63 | int i = 0; 64 | for(; i < iterations; i++) { 65 | detail::calculate_distances(n, d, k, 66 | data, centroids, data_dots, 67 | centroid_dots, pairwise_distances); 68 | 69 | int changes = detail::relabel(n, k, pairwise_distances, labels, distances); 70 | 71 | 72 | detail::find_centroids(n, d, k, data, labels, centroids); 73 | T distance_sum = thrust::reduce(distances.begin(), distances.end()); 74 | std::cout << "Iteration " << i << " produced " << changes 75 | << " changes, and total distance is " << distance_sum << std::endl; 76 | 77 | if (i > 0) { 78 | T delta = distance_sum / prior_distance_sum; 79 | if (prior_distance_sum == 0 || delta > 1 - threshold) { //a zero prior sum makes delta NaN; nothing is left to improve 80 | std::cout << "Threshold triggered, terminating iterations early" << std::endl; 81 | return i + 1; 82 | } 83 | } 84 | prior_distance_sum = distance_sum; 85 | } 86 | return i; 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /Makefile_example.inc: -------------------------------------------------------------------------------- 1 | # ARPACK++ v1.2 2/18/2000 2 | # c++ interface to ARPACK 
code. 3 | # This file contains some definitions used to compile arpack++ examples 4 | # with the g++ compiler under linux. 5 | 6 | 7 | # Defining the machine. 8 | 9 | PLAT = linux 10 | 11 | # Defining the compiler. 12 | 13 | CPP = g++ 14 | 15 | # Defining ARPACK++ directories. 16 | # ARPACKPP_INC is the directory that contains all arpack++ header files. 17 | # SUPERLU_DIR and UMFPACK_DIR must be set to ARPACKPP_INC. 18 | 19 | ############################################################################# 20 | # Change directory path here 21 | #ARPACKPP_DIR = $(HOME)/arpack++ 22 | #ARPACKPP_DIR = ../../.. 23 | ARPACKPP_DIR = $(HOME)/arpackpp 24 | ARPACKPP_INC = $(ARPACKPP_DIR)/include 25 | #SUPERLU_DIR = $(ARPACKPP_INC) 26 | SUPERLU_DIR = $(ARPACKPP_DIR)/external/SuperLU 27 | UMFPACK_DIR = $(ARPACKPP_INC) 28 | 29 | 30 | # Defining ARPACK, LAPACK, UMFPACK, SUPERLU, BLAS and FORTRAN libraries. 31 | # See the arpack++ manual or the README file for directions on how to 32 | # obtain arpack, umfpack and SuperLU packages. 33 | # UMFPACK_LIB and SUPERLU_LIB must be declared only if umfpack and superlu 34 | # are going to be used. Some BLAS and LAPACK fortran routines are 35 | # distributed along with arpack fortran code, but the user should verify 36 | # if optimized versions of these libraries are available before installing 37 | # arpack. The fortran libraries described below are those required to link 38 | # fortran and c++ code using gnu g++ and f77 compiler under linux. 39 | # Other libraries should be defined if the user intends to compile 40 | # arpack++ on another environment. 
41 | 42 | ############################################################################# 43 | # Change library path here 44 | ARPACK_LIB = $(ARPACKPP_DIR)/external/libarpack.a 45 | LAPACK_LIB = 46 | SUPERLU_LIB = $(ARPACKPP_DIR)/external/libsuperlu.a 47 | BLAS_LIB = $(ARPACKPP_DIR)/external/libopenblas.a 48 | FORTRAN_LIBS = -lgfortran 49 | 50 | # SuiteSparse contains the UMFPACK and CHOLMOD code. To link examples against 51 | # these, set the SUITESPARSE_DIR to point to your installation: 52 | #SUITESPARSE_DIR = $(ARPACKPP_DIR)/../SuiteSparse 53 | 54 | SUITESPARSE_DIR = $(ARPACKPP_DIR)/external/SuiteSparse 55 | ############################################################################# 56 | 57 | UMFPACK_LIB = $(SUITESPARSE_DIR)/UMFPACK/Lib/libumfpack.a \ 58 | $(SUITESPARSE_DIR)/CHOLMOD/Lib/libcholmod.a \ 59 | $(SUITESPARSE_DIR)/COLAMD/Lib/libcolamd.a \ 60 | $(SUITESPARSE_DIR)/CCOLAMD/Lib/libccolamd.a \ 61 | $(SUITESPARSE_DIR)/metis-4.0/libmetis.a \ 62 | $(SUITESPARSE_DIR)/CAMD/Lib/libcamd.a \ 63 | $(SUITESPARSE_DIR)/AMD/Lib/libamd.a \ 64 | $(SUITESPARSE_DIR)/SuiteSparse_config/libsuitesparseconfig.a 65 | 66 | CHOLMOD_LIB = $(SUITESPARSE_DIR)/CHOLMOD/Lib/libcholmod.a \ 67 | $(SUITESPARSE_DIR)/COLAMD/Lib/libcolamd.a \ 68 | $(SUITESPARSE_DIR)/CCOLAMD/Lib/libccolamd.a \ 69 | $(SUITESPARSE_DIR)/metis-4.0/libmetis.a \ 70 | $(SUITESPARSE_DIR)/CAMD/Lib/libcamd.a \ 71 | $(SUITESPARSE_DIR)/AMD/Lib/libamd.a \ 72 | 73 | # For cholmod need additional headers: 74 | CHOLMOD_INC = -I$(SUITESPARSE_DIR)/CHOLMOD/Include -I$(SUITESPARSE_DIR)/SuiteSparse_config 75 | 76 | ############################################################################# 77 | # Change CPP configuration. 78 | # Defining g++ flags and directories. 
79 | 80 | # CPP_WARNINGS = -fpermissive 81 | CPP_WARNINGS = -Wall 82 | CPP_DEBUG = -g 83 | CPP_OPTIM = -O 84 | 85 | # If nvcc is used, no pthread 86 | #CPP_LIBS = -pthread 87 | CPP_LIBS = 88 | CPP_INC = 89 | ############################################################################# 90 | 91 | CPP_FLAGS = $(CPP_DEBUG) -D$(PLAT) -I$(ARPACKPP_INC) -I$(CPP_INC) \ 92 | $(CPP_WARNINGS) 93 | 94 | # Putting all libraries together. 95 | 96 | ALL_LIBS = $(CPP_LIBS) $(ARPACK_LIB) \ 97 | $(BLAS_LIB) $(LAPACK_LIB) $(FORTRAN_LIBS) 98 | 99 | # defining paths. 100 | 101 | vpath %.h $(ARPACK_INC) 102 | 103 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Fast Implementation of Spectral Clustering on GPU-CPU Platforms 2 | 3 | ## Introduction 4 | ## 5 | 6 | This software package provides a fast implementation of spectral clustering on GPU and CPU platforms. 7 | 8 | This work is published on IPDPS 2016 workshop titled as "A high performance implementation of spectral clustering on CPU-GPU platforms" authored by Yu Jin and Joseph F. JaJa. 9 | 10 | If you use the software in your applications, please cite the paper as 11 | 12 | 13 | @inproceedings{jin2016, 14 | title={A high performance implementation of spectral clustering on cpu-gpu platforms}, 15 | author={Jin, Yu and JaJa, Joseph F}, 16 | booktitle={Parallel and Distributed Processing Symposium Workshops, 2016 IEEE International}, 17 | pages={825--834}, 18 | year={2016}, 19 | organization={IEEE} 20 | } 21 | 22 | 23 | 24 | Spectral clustering is one of the most popular clustering algorithms for finding structural communities in graphs. However, the running time of the algorithm is usually quite long as it involves very expensive numerical operations, i.e. finding the smallest few eigenvectors of a real symmetric matrix. 
25 | 26 | In this package, we provide a fast implementation of the spectral clustering algorithm which is significantly faster than using other CPU-based software packages such as Matlab and Python. As far as we know, our implementation is also the fastest implementation available in the open source community. 27 | 28 | The implementation contains three parts: 29 | 30 | - Normalize the edge weights by the inverse of the corresponding node degrees. 31 | - Compute the first k eigenvectors of the normalized Laplacian matrix using the arpackpp package and CUDA libraries. 32 | - Apply the k-means algorithm to the rows of the k eigenvectors. The implementation of the k-means algorithm was originally developed by Bryan Catanzaro at https://github.com/bryancatanzaro/kmeans 33 | 34 | Each part can be easily divided for individual functional usage. 35 | 36 | If you have trouble working with the software package, please contact Yu Jin (yuj AT umd.edu). 37 | 38 | 39 | ## Installation 40 | ### CUDA Environment Setup 41 | CUDA libraries, such as CUSPARSE, CUBLAS and Thrust are pre-installed. 42 | 43 | ### arpackpp installation 44 | Check out ARPACK++ package from https://github.com/yuj-umd/arpackpp 45 | 46 | ``` 47 | $ git clone https://github.com/yuj-umd/arpackpp.git 48 | $ cd arpackpp 49 | ``` 50 | 51 | Install the libraries 52 | 53 | ``` 54 | $ ./install-openblas.sh 55 | $ ./install-arpack-ng.sh 56 | $ ./install-superlu.sh 57 | $ ./install-suitesparse.sh 58 | 59 | ``` 60 | Change the directory and library path in Makefile.inc, as instructed in Makefile_example.inc. 61 | 62 | ### fastsc installation 63 | Check out the code from https://github.com/yuj-umd/fastsc 64 | 65 | ``` 66 | $ git clone https://github.com/yuj-umd/fastsc.git 67 | $ cd fastsc 68 | ``` 69 | Modify the arpack library path and cuda architecture in Makefile. 
70 | 71 | Compile and run the program 72 | ``` 73 | $ make 74 | $ ./spectral_clustering input_file_name n k output_file_name 75 | ``` 76 | 77 | ## Usage 78 | The program format is 79 | ``` 80 | $ ./spectral_clustering input_file_name n k output_file_name 81 | ``` 82 | 83 | The input file contains the graph information represented as edge list. By default, the program supports unweighted graphs where each row contains two node indices. It is easy to adapt the code for weighted graphs and other graph representations. 84 | 85 | n is the total number of nodes and k is the desired number of clusters. The graph nodes are indexed from 0 to n-1 and there are NO isolated nodes. 86 | 87 | output file will contain the node ID and the corresponding label. 88 | 89 | Two input examples are contained in Dataset folder. 90 | 91 | ## Benchmarks 92 | The Benchmark folder contains Matlab code with the same function. Our implementation is faster than the naive Matlab implementation especially for large-scale problems. 93 | 94 | 95 | ## Reference 96 | Jin, Yu, and Joseph F. JaJa. "A high performance implementation of spectral clustering on cpu-gpu platforms." Parallel and Distributed Processing Symposium Workshops, 2016 IEEE International. IEEE, 2016. 
97 | -------------------------------------------------------------------------------- /centroids.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | __device__ double atomicAdd2(double* address, double val) 8 | { 9 | unsigned long long int* address_as_ull = 10 | (unsigned long long int*)address; 11 | unsigned long long int old = *address_as_ull, assumed; 12 | do { 13 | assumed = old; 14 | old = atomicCAS(address_as_ull, assumed, 15 | __double_as_longlong(val + 16 | __longlong_as_double(assumed))); 17 | } while (assumed != old); 18 | return __longlong_as_double(old); 19 | } 20 | 21 | namespace kmeans { 22 | namespace detail { 23 | 24 | template 25 | __device__ __forceinline__ 26 | void update_centroid(int label, int dimension, 27 | int d, 28 | T accumulator, T* centroids, 29 | int count, int* counts) { 30 | int index = label * d + dimension; 31 | T* target = centroids + index; 32 | atomicAdd(target, accumulator); 33 | if (dimension == 0) { 34 | atomicAdd(counts + label, count); 35 | } 36 | } 37 | 38 | template 39 | __global__ void calculate_centroids(int n, int d, int k, 40 | T* data, 41 | int* ordered_labels, 42 | int* ordered_indices, 43 | T* centroids, 44 | int* counts) { 45 | int in_flight = blockDim.y * gridDim.y; 46 | int labels_per_row = (n - 1) / in_flight + 1; 47 | for(int dimension = threadIdx.x; dimension < d; dimension += blockDim.x) { 48 | T accumulator = 0; 49 | int count = 0; 50 | int global_id = threadIdx.y + blockIdx.y * blockDim.y; 51 | int start = global_id * labels_per_row; 52 | int end = (global_id + 1) * labels_per_row; 53 | end = (end > n) ? 
n : end; 54 | int prior_label; 55 | if (start < n) { 56 | prior_label = ordered_labels[start]; 57 | 58 | for(int label_number = start; label_number < end; label_number++) { 59 | int label = ordered_labels[label_number]; 60 | if (label != prior_label) { 61 | update_centroid(prior_label, dimension, 62 | d, 63 | accumulator, centroids, 64 | count, counts); 65 | accumulator = 0; 66 | count = 0; 67 | } 68 | 69 | T value = data[dimension + ordered_indices[label_number] * d]; 70 | accumulator += value; 71 | prior_label = label; 72 | count++; 73 | } 74 | update_centroid(prior_label, dimension, 75 | d, 76 | accumulator, centroids, 77 | count, counts); 78 | } 79 | } 80 | } 81 | 82 | template 83 | __global__ void scale_centroids(int d, int k, int* counts, T* centroids) { 84 | int global_id_x = threadIdx.x + blockIdx.x * blockDim.x; 85 | int global_id_y = threadIdx.y + blockIdx.y * blockDim.y; 86 | if ((global_id_x < d) && (global_id_y < k)) { 87 | int count = counts[global_id_y]; 88 | //To avoid introducing divide by zero errors 89 | //If a centroid has no weight, we'll do no normalization 90 | //This will keep its coordinates defined. 
91 | if (count < 1) { 92 | count = 1; 93 | } 94 | double scale = 1.0/double(count); 95 | centroids[global_id_x + d * global_id_y] *= scale; 96 | } 97 | } 98 | 99 | template 100 | void find_centroids(int n, int d, int k, 101 | thrust::device_vector& data, 102 | //Labels are taken by value because 103 | //they get destroyed in sort_by_key 104 | //So we need to make a copy of them 105 | thrust::device_vector labels, 106 | thrust::device_vector& centroids) { 107 | thrust::device_vector indices(n); 108 | thrust::device_vector counts(k); 109 | thrust::copy(thrust::counting_iterator(0), 110 | thrust::counting_iterator(n), 111 | indices.begin()); 112 | //Bring all labels with the same value together 113 | thrust::sort_by_key(labels.begin(), 114 | labels.end(), 115 | indices.begin()); 116 | 117 | //Initialize centroids to all zeros 118 | thrust::fill(centroids.begin(), 119 | centroids.end(), 120 | 0); 121 | 122 | //Calculate centroids 123 | int n_threads_x = 64; 124 | int n_threads_y = 16; 125 | //XXX Number of blocks here is hard coded at 30 126 | //This should be taken care of more thoughtfully. 
127 | detail::calculate_centroids<<>> 128 | (n, d, k, 129 | thrust::raw_pointer_cast(data.data()), 130 | thrust::raw_pointer_cast(labels.data()), 131 | thrust::raw_pointer_cast(indices.data()), 132 | thrust::raw_pointer_cast(centroids.data()), 133 | thrust::raw_pointer_cast(counts.data())); 134 | 135 | //Scale centroids 136 | detail::scale_centroids<<>> 137 | (d, k, 138 | thrust::raw_pointer_cast(counts.data()), 139 | thrust::raw_pointer_cast(centroids.data())); 140 | } 141 | 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /labels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | namespace kmeans { 7 | namespace detail { 8 | 9 | //n: number of points 10 | //d: dimensionality of points 11 | //data: points, laid out in row-major order (n rows, d cols) 12 | //dots: result vector (n rows) 13 | // NOTE: 14 | //Memory accesses in this function are uncoalesced!! 15 | //This is because data is in row major order 16 | //However, in k-means, it's called outside the optimization loop 17 | //on the large data array, and inside the optimization loop it's 18 | //called only on a small array, so it doesn't really matter. 
19 | //If this becomes a performance limiter, transpose the data somewhere 20 | template 21 | __global__ void self_dots(int n, int d, T* data, T* dots) { 22 | T accumulator = 0; 23 | int global_id = blockDim.x * blockIdx.x + threadIdx.x; 24 | 25 | if (global_id < n) { 26 | for (int i = 0; i < d; i++) { 27 | T value = data[i + global_id * d]; 28 | accumulator += value * value; 29 | } 30 | dots[global_id] = accumulator; 31 | } 32 | } 33 | 34 | 35 | template 36 | void make_self_dots(int n, int d, 37 | thrust::device_vector& data, 38 | thrust::device_vector& dots) { 39 | self_dots<<<(n-1)/256+1, 256>>>(n, d, thrust::raw_pointer_cast(data.data()), 40 | thrust::raw_pointer_cast(dots.data())); 41 | } 42 | 43 | template 44 | __global__ void all_dots(int n, int k, T* data_dots, T* centroid_dots, T* dots) { 45 | __shared__ T local_data_dots[32]; 46 | __shared__ T local_centroid_dots[32]; 47 | 48 | int data_index = threadIdx.x + blockIdx.x * blockDim.x; 49 | if ((data_index < n) && (threadIdx.y == 0)) { 50 | local_data_dots[threadIdx.x] = data_dots[data_index]; 51 | } 52 | 53 | int centroid_index = threadIdx.x + blockIdx.y * blockDim.y; 54 | if ((centroid_index < k) && (threadIdx.y == 1)) { 55 | local_centroid_dots[threadIdx.x] = centroid_dots[centroid_index]; 56 | } 57 | 58 | __syncthreads(); 59 | 60 | centroid_index = threadIdx.y + blockIdx.y * blockDim.y; 61 | if ((data_index < n) && (centroid_index < k)) { 62 | dots[data_index + centroid_index * n] = local_data_dots[threadIdx.x] + 63 | local_centroid_dots[threadIdx.y]; 64 | } 65 | } 66 | 67 | 68 | template 69 | void make_all_dots(int n, int k, thrust::device_vector& data_dots, 70 | thrust::device_vector& centroid_dots, 71 | thrust::device_vector& dots) { 72 | all_dots<<< 73 | dim3((n-1)/32+1, 74 | (k-1)/32+1), 75 | dim3(32, 32)>>>(n, k, thrust::raw_pointer_cast(data_dots.data()), 76 | thrust::raw_pointer_cast(centroid_dots.data()), 77 | thrust::raw_pointer_cast(dots.data())); 78 | } 79 | 80 | void gemm(cublasOperation_t 
transa, 81 | cublasOperation_t transb, 82 | int m, int n, int k, 83 | const float *alpha, 84 | const float *A, int lda, 85 | const float *B, int ldb, 86 | const float *beta, 87 | float *C, int ldc); 88 | 89 | void gemm(cublasOperation_t transa, 90 | cublasOperation_t transb, 91 | int m, int n, int k, 92 | const double *alpha, 93 | const double *A, int lda, 94 | const double *B, int ldb, 95 | const double *beta, 96 | double *C, int ldc); 97 | 98 | template 99 | void calculate_distances(int n, int d, int k, 100 | thrust::device_vector& data, 101 | thrust::device_vector& centroids, 102 | thrust::device_vector& data_dots, 103 | thrust::device_vector& centroid_dots, 104 | thrust::device_vector& pairwise_distances) { 105 | detail::make_self_dots(k, d, centroids, centroid_dots); 106 | detail::make_all_dots(n, k, data_dots, centroid_dots, pairwise_distances); 107 | //||x-y||^2 = ||x||^2 + ||y||^2 - 2 x . y 108 | //pairwise_distances has ||x||^2 + ||y||^2, so beta = 1 109 | //The dgemm calculates x.y for all x and y, so alpha = -2.0 110 | T alpha = -2.0; 111 | T beta = 1.0; 112 | //If the data were in standard column major order, we'd do a 113 | //centroids * data ^ T 114 | //But the data is in row major order, so we have to permute 115 | //the arguments a little 116 | gemm(CUBLAS_OP_T, CUBLAS_OP_N, 117 | n, k, d, &alpha, 118 | thrust::raw_pointer_cast(data.data()), 119 | d,//Has to be n or d 120 | thrust::raw_pointer_cast(centroids.data()), 121 | d,//Has to be k or d 122 | &beta, 123 | thrust::raw_pointer_cast(pairwise_distances.data()), 124 | n); //Has to be n or k 125 | } 126 | 127 | template 128 | __global__ void make_new_labels(int n, int k, T* pairwise_distances, 129 | int* labels, int* changes, 130 | T* distances) { 131 | T min_distance = DBL_MAX; 132 | T min_idx = -1; 133 | int global_id = threadIdx.x + blockIdx.x * blockDim.x; 134 | if (global_id < n) { 135 | int old_label = labels[global_id]; 136 | for(int c = 0; c < k; c++) { 137 | T distance = 
pairwise_distances[c * n + global_id]; 138 | if (distance < min_distance) { 139 | min_distance = distance; 140 | min_idx = c; 141 | } 142 | } 143 | labels[global_id] = min_idx; 144 | distances[global_id] = sqrt(min_distance > T(0) ? min_distance : T(0)); //clamp: roundoff in ||x||^2 + ||y||^2 - 2 x . y can dip below zero and sqrt would give NaN 145 | if (old_label != min_idx) { 146 | atomicAdd(changes, 1); 147 | } 148 | } 149 | } 150 | 151 | 152 | template 153 | int relabel(int n, int k, 154 | thrust::device_vector& pairwise_distances, 155 | thrust::device_vector& labels, 156 | thrust::device_vector& distances) { 157 | thrust::device_vector changes(1); 158 | changes[0] = 0; 159 | make_new_labels<<<(n-1)/256+1,256>>>( 160 | n, k, 161 | thrust::raw_pointer_cast(pairwise_distances.data()), 162 | thrust::raw_pointer_cast(labels.data()), 163 | thrust::raw_pointer_cast(changes.data()), 164 | thrust::raw_pointer_cast(distances.data())); 165 | return changes[0]; 166 | } 167 | 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /spectral_clustering.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "rsymsol.h" 3 | #include "arrssym.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "cusparse.h" 11 | #include "cuda_runtime.h" 12 | #include 13 | #include "timer.h" 14 | #include "kmeans.h" 15 | 16 | using namespace std; 17 | 18 | int CUDA_MULT(float *x, float *y, cusparseHandle_t& handle, cusparseStatus_t& status, cusparseMatDescr_t& descr, int n, int nnz, thrust::device_vector& csrRowPtr, thrust::device_vector& cooColIndex, thrust::device_vector& cooVal, thrust::device_vector& tmpx, thrust::device_vector& tmpy){ 19 | float fone = 1.0; 20 | float fzero = 0.0; 21 | cudaMemcpy(thrust::raw_pointer_cast(tmpx.data()), x, n*sizeof(float), cudaMemcpyHostToDevice); 22 | status = cusparseScsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 23 | n, n, nnz, &fone, 24 | descr, 25 | thrust::raw_pointer_cast(cooVal.data()), 26 | 
thrust::raw_pointer_cast(csrRowPtr.data()) , thrust::raw_pointer_cast(cooColIndex.data()), 27 | thrust::raw_pointer_cast(tmpx.data()), &fzero, 28 | thrust::raw_pointer_cast(tmpy.data())); 29 | if (status != CUSPARSE_STATUS_SUCCESS) { 30 | printf("cusparseScsrmv Failed"); 31 | return 1; 32 | } 33 | cudaMemcpy(y, thrust::raw_pointer_cast(tmpy.data()), n*sizeof(float), cudaMemcpyDeviceToHost); 34 | return 0; 35 | 36 | } 37 | 38 | 39 | void random_labels(thrust::device_vector& labels, int n, int k) { 40 | thrust::host_vector host_labels(n); 41 | for(int i = 0; i < n; i++) { 42 | host_labels[i] = rand() % k; 43 | } 44 | labels = host_labels; 45 | } 46 | 47 | void regular_labels(thrust::device_vector& labels, int n, int k) { 48 | // Initialize by assigning nodes that are close in indexing order with the same label: n/k consecutive nodes per label, with the last label (k-1) taking the remainder. 49 | thrust::host_vector host_labels(n); 50 | int l = n/k; 51 | int count = 0; 52 | int cur = 0; 53 | for(int i = 0; i < n; i++) { 54 | host_labels[i] = cur; 55 | count++; 56 | if(count >= l && cur < k - 1) { // advance after exactly l nodes, but never past the last label 57 | cur++; 58 | count = 0; 59 | } 60 | } 61 | labels = host_labels; 62 | } 63 | 64 | int main(int argc, char* argv[]) { 65 | if(argc < 5) { 66 | cout<<"Not enough input arguments!"<. 76 | // For undirected graphs, both and need to be included in the file. 77 | // Nodes are indexed from 0 to n-1 with no isolated nodes. 78 | ifstream infile(argv[1]); 79 | if(!infile) { 80 | cout<<"wrong input file"< row(nnz), col(nnz); 91 | 92 | // Initialize the degree 93 | thrust::host_vector degree(n, 0.0); 94 | 95 | // For unweighted graphs, edge weights are initilized to 1.0. Otherwise, revise the code to the specific graph representation. 96 | thrust::host_vector val(nnz, 1.0); 97 | infile.close(); 98 | infile.open(argv[1]); 99 | cout<<"Start loading data..."<>row[i]>>col[i]; 102 | if (row[i] >= n || col[i] >= n) { 103 | cout<<"Index exceed the dimension. 
Please check the right number of nodes"<>row[i]>>col[i]>>val[i]; 108 | degree[row[i]] = degree[row[i]] + val[i]; 109 | } 110 | infile.close(); 111 | cout<<"Loading data completed!"< degree_sqrt(n); 122 | 123 | // Normlize the edge weight of by 1.0/sqrt(degree[i] * degree[j]) 124 | for(int i = 0; i < n; ++i) { 125 | degree_sqrt[i] = sqrt(degree[i]); 126 | } 127 | 128 | for(int i = 0; i < nnz; ++i) { 129 | val[i] = val[i] / (degree_sqrt[col[i]] * degree_sqrt[row[i]]); 130 | } 131 | 132 | cout<<"Computing normalized Graph Laplacian completed"< cooRowIndex = row; 135 | thrust::device_vector cooColIndex = col; 136 | thrust::device_vector cooVal = val; 137 | cusparseStatus_t status; 138 | cusparseHandle_t handle=0; 139 | cusparseMatDescr_t descr=0; 140 | status= cusparseCreate(&handle); 141 | status= cusparseCreateMatDescr(&descr); 142 | if (status != CUSPARSE_STATUS_SUCCESS) { 143 | printf("Matrix descriptor initialization failed"); 144 | return 1; 145 | } 146 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); 147 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); 148 | thrust::device_vector csrRowPtr(n+1); 149 | 150 | status= cusparseXcoo2csr(handle,thrust::raw_pointer_cast(cooRowIndex.data()),nnz,n, 151 | thrust::raw_pointer_cast(csrRowPtr.data()),CUSPARSE_INDEX_BASE_ZERO); 152 | if (status != CUSPARSE_STATUS_SUCCESS) { 153 | printf("Conversion from COO to CSR format failed"); 154 | return 1; 155 | } 156 | thrust::device_vector tmpx(n); 157 | thrust::device_vector tmpy(n); 158 | ARrcSymStdEig prob(n, k, "LM"); 159 | while (!prob.ArnoldiBasisFound()) { 160 | prob.TakeStep(); 161 | if ((prob.GetIdo() == 1)||(prob.GetIdo() == -1)) { 162 | CUDA_MULT(prob.GetVector(), prob.PutVector(), handle, status, descr, n, nnz, csrRowPtr, cooColIndex, cooVal, tmpx, tmpy); 163 | } 164 | } 165 | 166 | // Finding eigenvalues and eigenvectors. 167 | prob.FindEigenvectors(); 168 | // Printing eigenvalue solution. 
169 | // Solution(prob); 170 | 171 | cout<<"Completed computing the first smallest k eigenvectors!"< eigenvectors_h(n*k); 177 | for (int i = 0; i < n; ++i) { 178 | for (int j = 0; j < k; ++j) { 179 | eigenvectors_h[i*k + j] = prob.Eigenvector(j, i); 180 | } 181 | } 182 | 183 | //Apply K-means algorithm on the eigenvectors 184 | int iterations = 100; 185 | // The dimension of each point is equal to the number of desired clusters. 186 | int d = k; 187 | thrust::device_vector eigenvectors_d = eigenvectors_h; 188 | thrust::device_vector labels(n); 189 | thrust::device_vector centroids(k * d); 190 | thrust::device_vector distances(n); 191 | // Randomly initialize the labels. (You can also try the regular_labels) 192 | random_labels(labels, n, k); 193 | kmeans::kmeans(iterations, n, d, k, eigenvectors_d, labels, centroids, distances); 194 | cout<<"Completed kmeans clustering algorithm on the k eigenvectors!"<