├── common ├── simpleTime.cuh ├── globals.hpp ├── parse_graph.hpp ├── simpleTime.cu ├── initial_graph.hpp ├── cuda_error_check.cuh ├── user_specified_global_configurations.h ├── user_specified_structures.h ├── cuda_utilities.cuh ├── user_specified_pre_and_post_processing_functions.hpp ├── parse_graph.cpp └── user_specified_device_functions.cuh ├── sfBuild.sh ├── .gitignore ├── csr_src ├── vwc_process.cuh ├── csr_format.cuh ├── csr_format.cu └── vwc_process.cu ├── cusha_src ├── cusha_format.cuh ├── find_block_size.cuh ├── cusha_process.cuh ├── find_block_size.cu ├── cusha_process.cu └── cusha_format.cu ├── README.md ├── LICENSE └── entry_point.cu /common/simpleTime.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLETIMER_H 2 | #define SIMPLETIMER_H 3 | 4 | void setTime(); 5 | double getTime(); 6 | 7 | #endif // SIMPLETIMER_H 8 | -------------------------------------------------------------------------------- /sfBuild.sh: -------------------------------------------------------------------------------- 1 | nvcc entry_point.cu common/simpleTime.cu csr_src/csr_format.cu csr_src/vwc_process.cu cusha_src/cusha_format.cu cusha_src/cusha_process.cu cusha_src/find_block_size.cu common/parse_graph.cpp -O3 -arch=sm_35 -o cusha 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Libraries 8 | *.lib 9 | *.a 10 | 11 | # Shared objects (inc. 
Windows DLLs) 12 | *.dll 13 | *.so 14 | *.so.* 15 | *.dylib 16 | 17 | # Executables 18 | *.exe 19 | *.out 20 | *.app 21 | *.i*86 22 | *.x86_64 23 | *.hex 24 | -------------------------------------------------------------------------------- /common/globals.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLOBALS_HPP 2 | #define GLOBALS_HPP 3 | 4 | #include // for timing 5 | #include // for timing 6 | 7 | enum GraphProcessingMethod { 8 | UNSPECIFIED = 0, 9 | CW, // Concatenated Windows (CW) method 10 | GS, // G-Shards method 11 | VWC // Virtual Warp-Centric method 12 | }; 13 | 14 | #endif // GLOBALS_HPP 15 | -------------------------------------------------------------------------------- /common/parse_graph.hpp: -------------------------------------------------------------------------------- 1 | #ifndef PARSE_GRAPH_HPP 2 | #define PARSE_GRAPH_HPP 3 | 4 | #include 5 | 6 | #include "initial_graph.hpp" 7 | 8 | namespace parse_graph { 9 | uint parse( 10 | std::ifstream& inFile, 11 | std::vector& initGraph, 12 | const long long arbparam, 13 | const bool nondirected ); 14 | } 15 | 16 | #endif // PARSE_GRAPH_HPP 17 | -------------------------------------------------------------------------------- /common/simpleTime.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | timeval StartingTime; 5 | 6 | void setTime(){ 7 | gettimeofday( &StartingTime, NULL ); 8 | } 9 | 10 | double getTime(){ 11 | timeval PausingTime, ElapsedTime; 12 | gettimeofday( &PausingTime, NULL ); 13 | timersub(&PausingTime, &StartingTime, &ElapsedTime); 14 | return ElapsedTime.tv_sec*1000.0+ElapsedTime.tv_usec/1000.0; // Returning in milliseconds. 
15 | } 16 | -------------------------------------------------------------------------------- /csr_src/vwc_process.cuh: -------------------------------------------------------------------------------- 1 | #ifndef VWC_PROCESS_CUH 2 | #define VWC_PROCESS_CUH 3 | 4 | #include "../common/user_specified_structures.h" 5 | 6 | void vwc_process( 7 | int vwSize, 8 | uint gridDimen, 9 | const uint nVertices, 10 | const uint* vertexIndices, 11 | const uint* edgesIndices, 12 | Vertex* VertexValue, 13 | Edge* EdgeValue, 14 | Vertex_static* VertexValueStatic, 15 | int* finished ); 16 | 17 | 18 | #endif // VWC_PROCESS_CUH 19 | -------------------------------------------------------------------------------- /csr_src/csr_format.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CSR_FORMAT_CUH 2 | #define CSR_FORMAT_CUH 3 | 4 | #include 5 | #include 6 | 7 | #include "../common/initial_graph.hpp" 8 | #include "../common/globals.hpp" 9 | 10 | 11 | namespace csr_format{ 12 | void process( 13 | const int vwsize_or_threads, 14 | std::vector* initGraph, 15 | const uint nEdges, 16 | std::ofstream& outputFile, 17 | bool EdgesOnHost = false ); 18 | } 19 | 20 | #endif // CSR_FORMAT_CUH 21 | -------------------------------------------------------------------------------- /cusha_src/cusha_format.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUSHA_FORMAT_CUH 2 | #define CUSHA_FORMAT_CUH 3 | 4 | #include 5 | #include 6 | 7 | #include "../common/initial_graph.hpp" 8 | #include "../common/globals.hpp" 9 | 10 | 11 | namespace cusha_format{ 12 | void process( 13 | const GraphProcessingMethod procesingMethod, 14 | const int bsize, 15 | std::vector* initGraph, 16 | const uint nEdges, 17 | std::ofstream& outputFile, 18 | bool EdgesOnHost = false ); 19 | } 20 | 21 | #endif // CUSHA_FORMAT_CUH 22 | -------------------------------------------------------------------------------- /common/initial_graph.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef INITIAL_GRAPH_HPP 2 | #define INITIAL_GRAPH_HPP 3 | 4 | #include 5 | 6 | #include "user_specified_structures.h" 7 | 8 | class neighbor { 9 | public: 10 | Edge edgeValue; 11 | unsigned int srcIndex; 12 | }; 13 | 14 | class initial_vertex { 15 | public: 16 | Vertex vertexValue; 17 | Vertex_static VertexValueStatic; 18 | std::vector nbrs; 19 | initial_vertex(): 20 | nbrs(0) 21 | {} 22 | Vertex& get_vertex_ref() { 23 | return vertexValue; 24 | } 25 | }; 26 | 27 | #endif // INITIAL_GRAPH_HPP 28 | -------------------------------------------------------------------------------- /common/cuda_error_check.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUDAERRCHCK_CUH 2 | #define CUDAERRCHCK_CUH 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | //Error checking mechanism 9 | #define CUDAErrorCheck(err) { CUDAAssert((err), __FILE__, __LINE__); } 10 | inline void CUDAAssert( cudaError_t err, const char *file, int line ) 11 | { 12 | if ( err != cudaSuccess ) 13 | { 14 | std::ostringstream errStream; 15 | errStream << "CUDAAssert: " << cudaGetErrorString(err) << " " << file << " " << line << "\n"; 16 | throw std::runtime_error( errStream.str() ); 17 | } 18 | } 19 | 20 | #endif // CUDAERRCHCK_CUH 21 | -------------------------------------------------------------------------------- /cusha_src/find_block_size.cuh: -------------------------------------------------------------------------------- 1 | #ifndef FIND_BLOCK_SIZE_CUH 2 | #define FIND_BLOCK_SIZE_CUH 3 | 4 | struct blockSize_N_pair{ 5 | uint blockSize; 6 | uint N; // The maximum number of vertices inside a shard. 7 | }; 8 | 9 | // This function does NOT guarantee the best block size. But it tries to come up with the best. 10 | // Be aware block sizes rather than what this function chooses might end up showing better performance. 
11 | // Any suggestions to improve this function will be appreciated. 12 | blockSize_N_pair find_proper_block_size( 13 | const int suggestedBlockSize, 14 | const uint nEdges, 15 | const uint nVertices ); 16 | 17 | 18 | #endif // FIND_BLOCK_SIZE_CUH 19 | -------------------------------------------------------------------------------- /cusha_src/cusha_process.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUSHA_PROCESS_CUH 2 | #define CUSHA_PROCESS_CUH 3 | 4 | #include "../common/globals.hpp" 5 | #include "../common/user_specified_structures.h" 6 | 7 | 8 | void cusha_process( 9 | const GraphProcessingMethod procesingMethod, 10 | const uint blockSize, 11 | const uint N, 12 | const uint nShards, 13 | const uint nVertices, 14 | Vertex* vertexValue, 15 | const uint* concatenatedWindowsSizesScan, 16 | const uint* windowSizesScansVertical, 17 | const uint* shardSizesScans, 18 | int* finished, 19 | Vertex* srcValue, 20 | const uint* dstIndex, 21 | Edge* edgeValue, 22 | Vertex_static* vertexValueStatic, 23 | const uint* srcIndex, 24 | const uint* mapper ); 25 | 26 | #endif // CUSHA_PROCESS_CUH 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #![alt text](http://farkhor.github.io/CuSha/images/CuSha-logo-small.png "CuSha") 2 | 3 | CuSha is a CUDA-based vertex-centric graph processing framework that uses G-Shards and Concatenated Windows (CW) representations to store graphs inside the GPU global memory. G-Shards and CW consume more space compared to Compressed Sparse Row (CSR) format but on the other hand provide better performance due to GPU-friendly representations. For completeness, provided package also includes Virtual Warp-Centric (VWC) processing method for GPU that uses CSR representation. 
4 | 5 | [ [Paper](http://www.cs.ucr.edu/~fkhor001/index.html#publications) ] -- [ [Slides](http://www.cs.ucr.edu/~fkhor001/CuSha/CuSha_Slides.pptx) ] -- [ [Requirements and Usage](http://farkhor.github.io/CuSha/) ] 6 | 7 | 8 | #####Acknowledgements##### 9 | This work is supported by National Science Foundation grants CCF-1157377 and CCF-0905509 to UC Riverside. 10 | -------------------------------------------------------------------------------- /common/user_specified_global_configurations.h: -------------------------------------------------------------------------------- 1 | #ifndef USER_SPECIFIED_GLOBAL_CONFIGURATIONS_H 2 | #define USER_SPECIFIED_GLOBAL_CONFIGURATIONS_H 3 | /////////////////////////////////////////////// 4 | 5 | #define VWC_COMPILE_TIME_DEFINED_BLOCK_SIZE 256 6 | 7 | /********************************************* 8 | * Sample algorithm templates. Uncomment whichever (only the one) you want to use. 9 | *********************************************/ 10 | 11 | //#define BFS 12 | #define SSSP 13 | //#define PR 14 | 15 | /********************************************* 16 | * User's Compile-time constant definitions. 17 | *********************************************/ 18 | 19 | #define BFS_INF 1073741824 20 | 21 | #define SSSP_INF 1073741824 22 | 23 | #define PR_INITIAL_VALUE 0.0f 24 | #define PR_DAMPING_FACTOR 0.85f 25 | #define PR_TOLERANCE 0.005f 26 | 27 | /////////////////////////////////////////////// 28 | #endif // USER_SPECIFIED_GLOBAL_CONFIGURATIONS_H 29 | -------------------------------------------------------------------------------- /common/user_specified_structures.h: -------------------------------------------------------------------------------- 1 | #ifndef USER_SPECIFIED_STRUCTURES_H 2 | #define USER_SPECIFIED_STRUCTURES_H 3 | 4 | #include "user_specified_global_configurations.h" 5 | 6 | /************************************** 7 | * STRUCTURES 8 | **************************************/ 9 | 10 | 11 | // Vertex structure. 
12 | struct Vertex{ 13 | 14 | #ifdef BFS 15 | unsigned int distance; 16 | #endif 17 | 18 | #ifdef SSSP 19 | unsigned int distance; 20 | #endif 21 | 22 | #ifdef PR 23 | float rank; 24 | #endif 25 | 26 | }; 27 | 28 | // Vertex_static structure. Those properties of the vertex that remain constant during processing should be declared here. 29 | typedef struct Vertex_static{ 30 | 31 | #ifdef PR 32 | unsigned int NbrsNum; 33 | #endif 34 | 35 | }Vertex_static; 36 | 37 | // Edge structure. 38 | struct Edge{ 39 | 40 | #ifdef SSSP 41 | unsigned int weight; 42 | #endif 43 | 44 | }; 45 | 46 | 47 | 48 | #endif // USER_SPECIFIED_STRUCTURES_H 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Farzad Khorasani 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /cusha_src/find_block_size.cu: -------------------------------------------------------------------------------- 1 | #include <cmath> 2 | #include <stdexcept> 3 | 4 | #include "find_block_size.cuh" 5 | #include "../common/user_specified_structures.h" 6 | #include "../common/cuda_error_check.cuh" 7 | 8 | // This function does NOT guarantee the best block size. But it tries to come up with the best. 9 | // Be aware that block sizes other than what this function chooses might end up showing better performance. 10 | // Any suggestions to improve this function will be appreciated. 11 | blockSize_N_pair find_proper_block_size( 12 | const int suggestedBlockSize, 13 | const uint nEdges, 14 | const uint nVertices ) { 15 | 16 | // Getting current device properties to properly select block size and N. 17 | int currentDevice; 18 | CUDAErrorCheck( cudaGetDevice( &currentDevice ) ); 19 | cudaDeviceProp deviceProp; 20 | CUDAErrorCheck ( cudaGetDeviceProperties( &deviceProp, currentDevice ) ); 21 | int maxVerticesPerSM = deviceProp.sharedMemPerBlock / sizeof(Vertex); 22 | 23 | int MaxBlockPerSM; // Maximum number of resident blocks per multiprocessor. Not queryable by the CUDA runtime. 24 | // Note: __CUDA_ARCH__ is only defined during device-side compilation; in this host-only function the 25 | // original "#if __CUDA_ARCH__" selection evaluated the macro as 0 and always picked 8 regardless of GPU. 26 | // Select based on the compute capability queried from the runtime instead. 27 | if( deviceProp.major < 3 ) 28 | MaxBlockPerSM = 8; 29 | else if( deviceProp.major < 5 ) 30 | MaxBlockPerSM = 16; 31 | else 32 | MaxBlockPerSM = 32; 33 | 34 | // If suggested block size is 0 (user hasn't entered anything), we ignore it. 35 | blockSize_N_pair BS_N = { 0, 0 }; // Zero-initialized: the search loop below may assign nothing, so the result must not be left indeterminate. 36 | if( suggestedBlockSize == 0 ) { 37 | int approximated_N = (int)std::sqrt((deviceProp.warpSize * std::pow(nVertices,2))/nEdges); // Please refer to paper for explanation. 38 | //fprintf( stdout, "Approximated N: %d\n", approximated_N); 39 | for( int b_per_SM = 2; b_per_SM<=MaxBlockPerSM; ++b_per_SM ) { 40 | blockSize_N_pair temp_pair; 41 | temp_pair.blockSize = deviceProp.maxThreadsPerMultiProcessor/b_per_SM; 42 | if ( deviceProp.maxThreadsPerMultiProcessor % (temp_pair.blockSize * b_per_SM) != 0 ) 43 | continue; 44 | if( temp_pair.blockSize > deviceProp.maxThreadsPerBlock) 45 | continue; 46 | temp_pair.N = maxVerticesPerSM / b_per_SM; 47 | if( temp_pair.N > approximated_N ) 48 | BS_N = temp_pair; 49 | } 50 | 51 | } 52 | else { 53 | // The behavior is undefined if user-specified block size is not a power of two. Usual block sizes are 1024, 512, 256, and 128. 54 | if( suggestedBlockSize > deviceProp.maxThreadsPerBlock ) 55 | throw std::runtime_error( "Specified block size is invalid." ); 56 | BS_N.blockSize = suggestedBlockSize; 57 | BS_N.N = ( maxVerticesPerSM * suggestedBlockSize ) / deviceProp.maxThreadsPerMultiProcessor; 58 | } 59 | 60 | return BS_N; 61 | } 62 | -------------------------------------------------------------------------------- /common/cuda_utilities.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_UTILITIES_CUH 2 | #define CUDA_UTILITIES_CUH 3 | 4 | #include <stdexcept> 5 | 6 | #include "cuda_error_check.cuh" 7 | 8 | 9 | template <typename T> 10 | class host_pinned_buffer{ 11 | private: 12 | T* ptr; 13 | size_t nElems; 14 | void construct(size_t n){ 15 | CUDAErrorCheck( cudaHostAlloc( (void**)&ptr, n*sizeof(T), cudaHostAllocPortable ) ); 16 | nElems = n; 17 | } 18 | public: 19 | host_pinned_buffer(){ 20 | nElems = 0; 21 | ptr = NULL; 22 | } 23 | host_pinned_buffer(size_t n){ 24 | construct(n); 25 | } 26 | ~host_pinned_buffer(){ 27 | if( nElems!=0 ) 28 | CUDAErrorCheck( cudaFreeHost( ptr ) ); 29 | } 30 | void alloc(size_t n){ 31 | if( nElems==0 ) 32 | construct(n); 33 | } 34 | void free(){ 35 | if( nElems!=0 ) { 36 | nElems = 0; 37 | CUDAErrorCheck( cudaFreeHost( ptr ) ); 38 | } 39 | }
40 | T& at(size_t index){ 41 | if( index >= nElems ) 42 | throw std::runtime_error( "The referred element does not exist in the buffer." ); 43 | return ptr[index]; 44 | } 45 | T& operator[](size_t index){ 46 | return this->at(index); 47 | } 48 | T* get_ptr(){ 49 | return ptr; 50 | } 51 | size_t size(){ 52 | return nElems; 53 | } 54 | size_t sizeInBytes(){ 55 | return nElems*sizeof(T); 56 | } 57 | }; 58 | 59 | template 60 | class device_buffer{ 61 | private: 62 | T* ptr; 63 | size_t nElems; 64 | void construct(size_t n) { 65 | CUDAErrorCheck( cudaMalloc( (void**)&ptr, n*sizeof(T) ) ); 66 | nElems = n; 67 | } 68 | public: 69 | device_buffer(): 70 | nElems(0), ptr(NULL) 71 | {} 72 | device_buffer(size_t n){ 73 | construct(n); 74 | } 75 | ~device_buffer(){ 76 | if( nElems!=0 ) 77 | CUDAErrorCheck( cudaFree( ptr ) ); 78 | } 79 | void alloc(size_t n){ 80 | if( nElems==0 ) 81 | construct(n); 82 | } 83 | void free(){ 84 | if( nElems!=0 ) { 85 | nElems = 0; 86 | CUDAErrorCheck( cudaFree( ptr ) ); 87 | } 88 | } 89 | T* get_ptr(){ 90 | return ptr; 91 | } 92 | size_t size(){ 93 | return nElems; 94 | } 95 | size_t sizeInBytes(){ 96 | return nElems*sizeof(T); 97 | } 98 | device_buffer& operator=( host_pinned_buffer& srcHostBuffer ) 99 | { 100 | if( nElems == 0 ) { 101 | construct( srcHostBuffer.size() ); 102 | CUDAErrorCheck( cudaMemcpyAsync( ptr, srcHostBuffer.get_ptr(), srcHostBuffer.sizeInBytes(), cudaMemcpyHostToDevice ) ); 103 | } 104 | else { 105 | size_t copySize = ( srcHostBuffer.sizeInBytes() < this->sizeInBytes() ) ? 
srcHostBuffer.sizeInBytes() : this->sizeInBytes(); 106 | CUDAErrorCheck( cudaMemcpyAsync( ptr, srcHostBuffer.get_ptr(), copySize, cudaMemcpyHostToDevice ) ); 107 | } 108 | return *this; 109 | } 110 | }; 111 | 112 | 113 | 114 | #endif // CUDA_UTILITIES_CUH 115 | -------------------------------------------------------------------------------- /common/user_specified_pre_and_post_processing_functions.hpp: -------------------------------------------------------------------------------- 1 | #ifndef USER_SPECIFIED_PRE_AND_POST_PROCESSING_FUNCTIONS_HPP 2 | #define USER_SPECIFIED_PRE_AND_POST_PROCESSING_FUNCTIONS_HPP 3 | 4 | #include 5 | #include 6 | 7 | #include "user_specified_structures.h" 8 | #include "user_specified_global_configurations.h" 9 | 10 | /************************************** 11 | * INITIALIZATION FUNCTION 12 | **************************************/ 13 | 14 | // The function that helps to initialize vertex/edge contents before processing the graph. 15 | // It will be called for every line of graph input file. 16 | // Items are separated by space or tab. 17 | inline void completeEntry( 18 | unsigned int argcount, // The number of additional items in the line 19 | char** argvector, // char* pointer for which dereferencing its elements provides us with the additional items in the line in form of char*. 20 | const int src_vertex_index, // Source vertex index. 21 | const int dst_vertex_index, // Destination vertex index. 22 | Edge* edge_address, // Pointer to the current edge corresponding to the current line. 23 | Vertex& src_vertex_ref, // Pointer to the source vertex. 24 | Vertex_static* src_vertex_static_address, // Pointer to the source Vertex_static. 25 | Vertex& dst_vertex_ref, // Pointer to the destination vertex. 26 | Vertex_static* dst_vertex_static_address, // Pointer to the destination Vertex_static. 27 | const long long arbparam = 0 // Arbitrary integer input in the console. 
28 | ) { 29 | 30 | #ifdef BFS 31 | src_vertex_ref.distance = ( src_vertex_index != arbparam ) ? BFS_INF : 0; 32 | dst_vertex_ref.distance = ( dst_vertex_index != arbparam ) ? BFS_INF : 0; 33 | #endif 34 | 35 | #ifdef SSSP 36 | src_vertex_ref.distance = ( src_vertex_index != arbparam ) ? SSSP_INF : 0; 37 | dst_vertex_ref.distance = ( dst_vertex_index != arbparam ) ? SSSP_INF : 0; 38 | edge_address->weight = ( argcount > 0 ) ? atoi(argvector[0]) : 0; 39 | #endif 40 | 41 | #ifdef PR 42 | src_vertex_ref.rank = PR_INITIAL_VALUE; 43 | dst_vertex_ref.rank = PR_INITIAL_VALUE; 44 | src_vertex_static_address->NbrsNum = ( argcount > 0 ) ? atoi( argvector[0] ) : 0; 45 | #endif 46 | 47 | } 48 | 49 | 50 | /************************************** 51 | * OUTPUT FORMATTING FUNCTION 52 | **************************************/ 53 | 54 | // Below function outputs the resulted vertex content. 55 | // It will be performed at the end of processing for each and every vertex. 56 | inline void print_vertex_output( 57 | const uint vertexIndex, 58 | const Vertex resultVertex, 59 | std::ofstream& outFile 60 | ) { 61 | 62 | #ifdef BFS 63 | outFile << vertexIndex << ":\t" << resultVertex.distance << "\n"; 64 | #endif 65 | 66 | #ifdef SSSP 67 | outFile << vertexIndex << ":\t" << resultVertex.distance << "\n"; 68 | #endif 69 | 70 | #ifdef PR 71 | outFile << vertexIndex << ":\t" << resultVertex.rank << "\n"; 72 | #endif 73 | 74 | } 75 | 76 | 77 | #endif // USER_SPECIFIED_PRE_AND_POST_PROCESSING_FUNCTIONS_HPP 78 | -------------------------------------------------------------------------------- /common/parse_graph.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "parse_graph.hpp" 8 | #include "user_specified_pre_and_post_processing_functions.hpp" 9 | 10 | uint parse_graph::parse( 11 | std::ifstream& inFile, 12 | std::vector& initGraph, 13 | const long long arbparam, 14 | const bool 
nondirected ) { 15 | 16 | const bool firstColumnSourceIndex = true; 17 | 18 | std::string line; 19 | char delim[3] = " \t"; //In most benchmarks, the delimiter is usually the space character or the tab character. 20 | char* pch; 21 | uint nEdges = 0; 22 | 23 | unsigned int Additionalargc=0; 24 | char* Additionalargv[ 61 ]; 25 | 26 | // Read the input graph line-by-line. 27 | while( std::getline( inFile, line ) ) { 28 | if( line[0] < '0' || line[0] > '9' ) // Skipping any line blank or starting with a character rather than a number. 29 | continue; 30 | char cstrLine[256]; 31 | std::strncpy( cstrLine, line.c_str(), sizeof(cstrLine) - 1 ); cstrLine[ sizeof(cstrLine) - 1 ] = '\0'; // Bounded copy: std::strcpy overflowed cstrLine for input lines of 256+ characters. 32 | uint firstIndex, secondIndex; 33 | 34 | pch = strtok(cstrLine, delim); 35 | if( pch != NULL ) 36 | firstIndex = atoi( pch ); 37 | else 38 | continue; 39 | pch = strtok( NULL, delim ); 40 | if( pch != NULL ) 41 | secondIndex = atoi( pch ); 42 | else 43 | continue; 44 | 45 | uint theMax = std::max( firstIndex, secondIndex ); 46 | uint srcVertexIndex = firstColumnSourceIndex ? firstIndex : secondIndex; 47 | uint dstVertexIndex = firstColumnSourceIndex ?
secondIndex : firstIndex; 48 | if( initGraph.size() <= theMax ) 49 | initGraph.resize(theMax+1); 50 | { 51 | neighbor nbrToAdd; 52 | nbrToAdd.srcIndex = srcVertexIndex; 53 | 54 | Additionalargc=0; 55 | Additionalargv[ Additionalargc ] = strtok( NULL, delim ); 56 | while( Additionalargv[ Additionalargc ] != NULL ){ 57 | Additionalargc++; 58 | Additionalargv[ Additionalargc ] = strtok( NULL, delim ); 59 | } 60 | completeEntry( Additionalargc, 61 | Additionalargv, 62 | srcVertexIndex, 63 | dstVertexIndex, 64 | &(nbrToAdd.edgeValue), 65 | (initGraph.at(srcVertexIndex).vertexValue), 66 | &(initGraph.at(srcVertexIndex).VertexValueStatic), 67 | (initGraph.at(dstVertexIndex).vertexValue), 68 | &(initGraph.at(dstVertexIndex).VertexValueStatic), 69 | arbparam ); 70 | 71 | initGraph.at(dstVertexIndex).nbrs.push_back( nbrToAdd ); 72 | nEdges++; 73 | } 74 | if( nondirected ) { 75 | 76 | uint tmp = srcVertexIndex; 77 | srcVertexIndex = dstVertexIndex; 78 | dstVertexIndex = tmp; 79 | 80 | neighbor nbrToAdd; 81 | nbrToAdd.srcIndex = srcVertexIndex; 82 | 83 | Additionalargc=0; 84 | Additionalargv[ Additionalargc ] = strtok( NULL, delim ); 85 | while( Additionalargv[ Additionalargc ] != NULL ){ 86 | Additionalargc++; 87 | Additionalargv[ Additionalargc ] = strtok( NULL, delim ); 88 | } 89 | completeEntry( Additionalargc, 90 | Additionalargv, 91 | srcVertexIndex, 92 | dstVertexIndex, 93 | &(nbrToAdd.edgeValue), 94 | (initGraph.at(srcVertexIndex).vertexValue), 95 | &(initGraph.at(srcVertexIndex).VertexValueStatic), 96 | (initGraph.at(dstVertexIndex).vertexValue), 97 | &(initGraph.at(dstVertexIndex).VertexValueStatic), 98 | arbparam ); 99 | 100 | initGraph.at(dstVertexIndex).nbrs.push_back( nbrToAdd ); 101 | nEdges++; 102 | } 103 | } 104 | 105 | return nEdges; 106 | 107 | } 108 | -------------------------------------------------------------------------------- /csr_src/csr_format.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 
#include 3 | #include 4 | 5 | #include "../common/simpleTime.cuh" 6 | #include "csr_format.cuh" 7 | #include "../common/cuda_error_check.cuh" 8 | #include "../common/cuda_utilities.cuh" 9 | #include "vwc_process.cuh" 10 | #include "../common/user_specified_structures.h" 11 | #include "../common/user_specified_pre_and_post_processing_functions.hpp" 12 | 13 | 14 | void csr_format::process( 15 | const int vwsize_or_threads, 16 | std::vector* initGraph, 17 | const uint nEdges, 18 | std::ofstream& outputFile, 19 | bool EdgesOnHost ) { 20 | 21 | const uint nVertices = initGraph->size(); 22 | 23 | // Variables collecting timing info. 24 | float H2D_copy_time = 0, processing_time = 0, D2H_copy_time = 0; 25 | 26 | // Allocate host buffers. 27 | host_pinned_buffer vertexValue( nVertices ); 28 | host_pinned_buffer edgesIndices( nVertices + 1 ); 29 | edgesIndices.at(0) = 0; 30 | host_pinned_buffer vertexIndices( nEdges ); 31 | host_pinned_buffer EdgeValue; 32 | if( sizeof(Edge) > 1 ) EdgeValue.alloc( nEdges ); 33 | host_pinned_buffer VertexValueStatic; 34 | if( sizeof(Vertex_static) > 1 ) VertexValueStatic.alloc( nVertices ); 35 | 36 | // Put vertices into host buffer CSR form. 37 | for( uint vIdx = 0; vIdx < nVertices; ++vIdx ) { 38 | initial_vertex& vvv = initGraph->at(vIdx); 39 | vertexValue[ vIdx ] = vvv.vertexValue; 40 | if( sizeof(Vertex_static) > 1 ) VertexValueStatic[ vIdx ] = vvv.VertexValueStatic; 41 | uint nNbrs = vvv.nbrs.size(); 42 | uint edgeIdxOffset = edgesIndices[ vIdx ]; 43 | for( uint nbrIdx = 0; nbrIdx < nNbrs; ++nbrIdx ) { 44 | neighbor& nbr = vvv.nbrs.at( nbrIdx ); 45 | vertexIndices[ edgeIdxOffset + nbrIdx ] = nbr.srcIndex; 46 | if( sizeof(Edge) > 1 ) EdgeValue[ edgeIdxOffset + nbrIdx ] = nbr.edgeValue; 47 | } 48 | edgesIndices[ vIdx + 1 ] = edgeIdxOffset + nNbrs; 49 | } 50 | 51 | // Define device buffers. 
52 | device_buffer dev_vertexValue; 53 | device_buffer dev_edgesIndices; 54 | device_buffer dev_vertexIndices; 55 | device_buffer dev_EdgeValue; 56 | device_buffer dev_VertexValueStatic; 57 | device_buffer devFinished; 58 | 59 | uint vwcGridDimen = 0; 60 | 61 | vwcGridDimen = std::ceil( static_cast( nVertices ) / ( VWC_COMPILE_TIME_DEFINED_BLOCK_SIZE / vwsize_or_threads ) ); 62 | 63 | // Allocate device buffers. 64 | dev_vertexValue.alloc( nVertices ); 65 | dev_edgesIndices.alloc( nVertices + 1 ); 66 | if( !EdgesOnHost ) dev_vertexIndices.alloc( nEdges ); 67 | if( !EdgesOnHost ) if( sizeof(Edge) > 1 ) dev_EdgeValue.alloc( nEdges ); 68 | if( sizeof(Vertex_static) > 1 ) dev_VertexValueStatic.alloc( nVertices ); 69 | devFinished.alloc( 1 ); 70 | 71 | // Copy data to device buffers. 72 | setTime(); 73 | dev_vertexValue = vertexValue; 74 | dev_edgesIndices = edgesIndices; 75 | if( !EdgesOnHost ) dev_vertexIndices = vertexIndices; 76 | if( !EdgesOnHost ) if( sizeof(Edge) > 1 ) dev_EdgeValue = EdgeValue; 77 | if( sizeof(Vertex_static) > 1 ) dev_VertexValueStatic = VertexValueStatic; 78 | CUDAErrorCheck( cudaDeviceSynchronize() ); 79 | H2D_copy_time = getTime(); 80 | std::cout << "Copying data to device took " << H2D_copy_time << " (ms)." << std::endl; 81 | 82 | int finished; 83 | 84 | // Iteratively process the graph. 85 | unsigned int IterationCounter = 0; 86 | setTime(); 87 | do { 88 | finished = 0; 89 | 90 | CUDAErrorCheck( cudaMemcpyAsync( devFinished.get_ptr(), &finished, sizeof(int), cudaMemcpyHostToDevice ) ); 91 | vwc_process( 92 | vwsize_or_threads, 93 | vwcGridDimen, 94 | nVertices, 95 | ( !EdgesOnHost ) ? dev_vertexIndices.get_ptr() : vertexIndices.get_ptr(), 96 | dev_edgesIndices.get_ptr(), 97 | dev_vertexValue.get_ptr(), 98 | ( !EdgesOnHost ) ? 
dev_EdgeValue.get_ptr() : EdgeValue.get_ptr(), 99 | dev_VertexValueStatic.get_ptr(), 100 | devFinished.get_ptr() ); 101 | CUDAErrorCheck( cudaPeekAtLastError() ); 102 | CUDAErrorCheck( cudaMemcpyAsync( &finished, devFinished.get_ptr(), sizeof(int), cudaMemcpyDeviceToHost ) ); 103 | CUDAErrorCheck( cudaDeviceSynchronize() ); 104 | 105 | ++IterationCounter; 106 | } while( finished == 1 ); 107 | processing_time = getTime(); 108 | std::cout << "Processing finished in " << processing_time << " (ms).\n"; 109 | std::cout << "Performed " << IterationCounter << " iterations in total.\n"; 110 | 111 | // Copy resulted vertex values back from the device to the host. 112 | setTime(); 113 | CUDAErrorCheck( cudaMemcpy( vertexValue.get_ptr(), dev_vertexValue.get_ptr(), vertexValue.sizeInBytes(), cudaMemcpyDeviceToHost ) ); 114 | D2H_copy_time = getTime(); 115 | std::cout << "Copying final vertex values back to the host took " << D2H_copy_time << " (ms).\n"; 116 | 117 | //std::cout << "Total Execution time was " << H2D_copy_time + processing_time + D2H_copy_time << " (ms)." << std::endl; 118 | //std::cout << IterationCounter <<"\t"<< H2D_copy_time <<"\t"<< processing_time <<"\t"<< D2H_copy_time << "\n"; 119 | 120 | // Print the output vertex values to the file. 121 | for( uint vvv = 0; vvv < nVertices; ++vvv ) 122 | print_vertex_output( 123 | vvv, 124 | vertexValue[ vvv ], 125 | outputFile ); 126 | 127 | } 128 | -------------------------------------------------------------------------------- /cusha_src/cusha_process.cu: -------------------------------------------------------------------------------- 1 | #include "cusha_process.cuh" 2 | #include "../common/user_specified_device_functions.cuh" 3 | 4 | template 5 | __global__ void CuSha_GPU_kernel( 6 | const uint nShards, 7 | const uint N, // Maximum number of vertices assigned to a shard. 
8 | const uint* SrcIndex, 9 | const uint* DestIndex, 10 | Vertex* SrcValue, 11 | Vertex* VertexValues, 12 | const Edge* EdgeValue, 13 | const Vertex_static* SrcValue_static, 14 | int* finishedProcessing, 15 | const uint* shardSizesScan, 16 | const uint* concatenatedWindowsSizesScan, // For CW. 17 | const uint* windowSizesScansVertical, // For GS. 18 | const uint* Mapper = NULL ) { // Used only when processing method is CW. 19 | 20 | extern __shared__ Vertex localVertices[]; 21 | 22 | // Shard index is determined by blockIdx.x. 23 | uint shardOffset = blockIdx.x * N; 24 | uint shardStartingAddress = shardSizesScan[ blockIdx.x ]; 25 | uint shardEndingAddress = shardSizesScan[ blockIdx.x + 1 ]; 26 | Vertex* shardVertexValues = VertexValues + shardOffset; 27 | 28 | /* 1st stage */ 29 | // Initialize block vertices residing in shared memory. 30 | for( uint vertexID = threadIdx.x; 31 | vertexID < N; 32 | vertexID += blockDim.x ) { 33 | init_compute_CuSha( localVertices + vertexID, shardVertexValues + vertexID ); 34 | } 35 | 36 | /* 2nd stage */ 37 | // Consecutive entries of shard are processed by consecutive threads. 38 | __syncthreads(); 39 | for( uint EntryAddress = shardStartingAddress + threadIdx.x; 40 | EntryAddress < shardEndingAddress; 41 | EntryAddress += blockDim.x ) { 42 | 43 | compute_CuSha( SrcValue[ EntryAddress ], 44 | SrcValue_static + EntryAddress, 45 | EdgeValue + EntryAddress, 46 | localVertices + ( DestIndex[ EntryAddress ] - shardOffset ) ); 47 | 48 | } 49 | 50 | 51 | /* 3rd stage */ 52 | // Check if any update has happened. 
53 | __syncthreads(); 54 | int flag = false; 55 | for( uint vertexID = threadIdx.x; 56 | vertexID < N; 57 | vertexID += blockDim.x ) { 58 | 59 | if( update_condition_CuSha( localVertices + vertexID, shardVertexValues + vertexID ) ) { 60 | flag = true; 61 | shardVertexValues[ vertexID ] = localVertices[ vertexID ]; 62 | } 63 | 64 | } 65 | 66 | 67 | /* 4th stage */ 68 | // If any vertex has been updated during processing, update shard's corresponding windows. 69 | if( __syncthreads_or( flag ) ) { // Requires (CC>=2.0). 70 | 71 | if( procesingMethod == CW ) { 72 | 73 | uint shardCWStartingAddress = concatenatedWindowsSizesScan[ blockIdx.x ]; 74 | uint shardCWEndingAddress = concatenatedWindowsSizesScan[ blockIdx.x + 1 ]; 75 | for( uint EntryAddress = shardCWStartingAddress + threadIdx.x; 76 | EntryAddress < shardCWEndingAddress; 77 | EntryAddress += blockDim.x ) { 78 | 79 | SrcValue[ Mapper[ EntryAddress ] ] = localVertices[ SrcIndex[ EntryAddress ] - shardOffset ]; 80 | 81 | } 82 | 83 | } 84 | else { //GS 85 | 86 | // Each warp in the block updates one specific window in one specific shard. 87 | for( uint targetShardIndex = threadIdx.x / warpSize; // threadIdx.x >> 5 88 | targetShardIndex < nShards; 89 | targetShardIndex += ( blockDim.x / warpSize ) ) { // blockDim.x >> 5 90 | 91 | uint targetWindowStartingAddress = windowSizesScansVertical[ targetShardIndex * nShards + blockIdx.x ]; 92 | uint targetWindowEndingAddress = windowSizesScansVertical[ targetShardIndex * nShards + blockIdx.x + 1 ]; 93 | // Threads of warp update window entries in parallel. 94 | for( uint windowEntry = targetWindowStartingAddress + ( threadIdx.x & ( warpSize - 1 ) ); 95 | windowEntry < targetWindowEndingAddress; 96 | windowEntry += warpSize ) { 97 | 98 | SrcValue[ windowEntry ] = localVertices[ SrcIndex[ windowEntry ] - shardOffset ]; 99 | 100 | } 101 | 102 | } 103 | 104 | } 105 | 106 | // Signal the host to launch a new kernel. 
107 | if( threadIdx.x == 0 ) 108 | (*finishedProcessing) = 1; 109 | 110 | } 111 | 112 | } 113 | 114 | void cusha_process( 115 | const GraphProcessingMethod procesingMethod, 116 | const uint blockSize, 117 | const uint N, 118 | const uint nShards, 119 | const uint nVertices, 120 | Vertex* vertexValue, 121 | const uint* concatenatedWindowsSizesScan, 122 | const uint* windowSizesScansVertical, 123 | const uint* shardSizesScans, 124 | int* finished, 125 | Vertex* srcValue, 126 | const uint* dstIndex, 127 | Edge* edgeValue, 128 | Vertex_static* vertexValueStatic, 129 | const uint* srcIndex, 130 | const uint* mapper ) { 131 | 132 | 133 | if( procesingMethod == CW ) { 134 | CuSha_GPU_kernel < CW > 135 | <<< nShards, blockSize, ( N * sizeof(Vertex) ) >>> ( 136 | nShards, 137 | N, 138 | srcIndex, 139 | dstIndex, 140 | srcValue, 141 | vertexValue, 142 | edgeValue, 143 | vertexValueStatic, 144 | finished, 145 | shardSizesScans, 146 | concatenatedWindowsSizesScan, 147 | windowSizesScansVertical, 148 | mapper ); 149 | } 150 | else { // Processing method is GS. 151 | CuSha_GPU_kernel < GS > 152 | <<< nShards, blockSize, ( N * sizeof(Vertex) ) >>> ( 153 | nShards, 154 | N, 155 | srcIndex, 156 | dstIndex, 157 | srcValue, 158 | vertexValue, 159 | edgeValue, 160 | vertexValueStatic, 161 | finished, 162 | shardSizesScans, 163 | concatenatedWindowsSizesScan, 164 | windowSizesScansVertical ); 165 | } 166 | 167 | 168 | } 169 | -------------------------------------------------------------------------------- /entry_point.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "common/globals.hpp" 9 | #include "common/cuda_error_check.cuh" 10 | #include "common/initial_graph.hpp" 11 | #include "common/parse_graph.hpp" 12 | #include "cusha_src/cusha_format.cuh" 13 | #include "csr_src/csr_format.cuh" 14 | 15 | 16 | // Open files safely. 
// Opens the given stream on file_name; throws std::runtime_error on failure.
// FIX(review): the extraction of this dump lost the angle-bracket template
// parameter list; it is restored here as a single type parameter, which matches
// the explicit instantiations below (openFileToAccess< std::ifstream >(...) and
// openFileToAccess< std::ofstream >(...)).
template <typename T_file>
void openFileToAccess( T_file& input_file, std::string file_name ) {
	input_file.open( file_name.c_str() );
	if( !input_file )
		throw std::runtime_error( "Failed to open specified file: " + file_name + "\n" );
}


// Execution entry point.
// Parses command-line switches, loads the graph, and dispatches to the selected
// back end: shard-based CuSha (CW or GS) or CSR-based Virtual Warp-Centric (VWC).
int main( int argc, char** argv )
{

	std::string usage =
	"\tRequired command line arguments:\n\
	-Input file: E.g., --input in.txt\n\
	-Processing method: CW, GS, VWC. E.g., --method CW\n\
	Additional arguments:\n\
	-Output file (default: out.txt). E.g., --output myout.txt.\n\
	-Is the input graph directed (default:yes). To make it undirected: --undirected\n\
	-Device ID (default: 0). E.g., --device 1\n\
	-GPU kernels Block size for CW and GS (default: chosen based on analysis). E.g., --bsize 512.\n\
	-Virtual warp size for VWC (default: 32). E.g., --vwsize 8.\n\
	-User's arbitrary parameter (default: 0). E.g., --arbparam 17.\n";

	try {

		// Defaults; overridden by the command-line switches parsed below.
		GraphProcessingMethod procesingMethod = UNSPECIFIED;
		std::ifstream inputFile;
		std::ofstream outputFile;
		int selectedDevice = 0;
		int bsize = 0;           // 0 lets the CuSha back end pick a block size from its own analysis.
		int vwsize = 32;         // Virtual warp width for VWC.
		int threads = 1;         // Passed to csr_format::process when the method is not VWC.
		long long arbparam = 0;  // Arbitrary user parameter forwarded to graph parsing.
		bool nonDirectedGraph = false; // By default, the graph is directed.

		/********************************
		 * GETTING INPUT PARAMETERS.
		 ********************************/

		for( int iii = 1; iii < argc; ++iii )
			if ( !strcmp(argv[iii], "--method") && iii != argc-1 ) {
				if ( !strcmp(argv[iii+1], "CW") )
					procesingMethod = CW;
				if ( !strcmp(argv[iii+1], "GS") )
					procesingMethod = GS;
				if ( !strcmp(argv[iii+1], "VWC") )
					procesingMethod = VWC;
			}
			else if( !strcmp( argv[iii], "--input" ) && iii != argc-1 /*is not the last one*/)
				openFileToAccess< std::ifstream >( inputFile, std::string( argv[iii+1] ) );
			else if( !strcmp( argv[iii], "--output" ) && iii != argc-1 /*is not the last one*/)
				openFileToAccess< std::ofstream >( outputFile, std::string( argv[iii+1] ) );
			else if( !strcmp( argv[iii], "--device" ) && iii != argc-1 /*is not the last one*/)
				selectedDevice = std::atoi( argv[iii+1] );
			else if( !strcmp( argv[iii], "--bsize" ) && iii != argc-1 /*is not the last one*/)
				bsize = std::atoi( argv[iii+1] );
			else if( !strcmp( argv[iii], "--vwsize" ) && iii != argc-1 /*is not the last one*/)
				vwsize = std::atoi( argv[iii+1] );
			else if( !strcmp( argv[iii], "--arbparam" ) && iii != argc-1 /*is not the last one*/)
				arbparam = std::atoll( argv[iii+1] );
			else if( !strcmp(argv[iii], "--undirected"))
				nonDirectedGraph = true;

		if( !inputFile.is_open() || procesingMethod == UNSPECIFIED ) {
			std::cerr << "Usage: " << usage;
			throw std::runtime_error( "\nAn initialization error happened.\nExiting." );
		}
		if( !outputFile.is_open() )
			openFileToAccess< std::ofstream >( outputFile, "out.txt" );
		CUDAErrorCheck( cudaSetDevice( selectedDevice ) );
		std::cout << "Device with ID " << selectedDevice << " is selected to process the graph.\n";
		if( procesingMethod == VWC ) {
			// Clamp unsupported virtual warp sizes back to the default of 32.
			if( vwsize != 2 && vwsize != 4 && vwsize != 8 && vwsize != 16 && vwsize != 32 )
				vwsize = 32;
			std::cout << "Virtual-Warp Centric method will be employed to process the graph with virtual warp size " << vwsize << ".\n";
		}


		/********************************
		 * Read the input graph file.
		 ********************************/

		std::cout << "Collecting the input graph ...\n";
		// FIX(review): element type restored from the extraction-eaten original;
		// parse_graph::parse (parse_graph.hpp) fills a vector of initial_vertex.
		std::vector<initial_vertex> parsedGraph( 0 );
		uint nEdges = parse_graph::parse(
				inputFile,        // Input file.
				parsedGraph,      // The parsed graph.
				arbparam,         // Arbitrary user-provided parameter.
				nonDirectedGraph );
		std::cout << "Input graph collected with " << parsedGraph.size() << " vertices and " << nEdges << " edges.\n";


		/********************************
		 * Process the graph.
		 ********************************/

		if( procesingMethod == GS || procesingMethod == CW ) {
			cusha_format::process(
					procesingMethod,
					bsize,
					&parsedGraph,
					nEdges,
					outputFile );
		}
		else {
			csr_format::process(
					( ( procesingMethod == VWC ) ? vwsize : threads ),
					&parsedGraph,
					nEdges,
					outputFile );
		}


		/********************************
		 * It's done here.
		 ********************************/

		CUDAErrorCheck( cudaDeviceReset() );
		std::cout << "Done.\n";
		return( EXIT_SUCCESS );

	}
	catch( const std::exception& strException ) {
		std::cerr << strException.what() << "\n";
		return( EXIT_FAILURE );
	}
	catch(...) {
		std::cerr << "An exception has occurred." << std::endl;
		return( EXIT_FAILURE );
	}

}
--------------------------------------------------------------------------------
/csr_src/vwc_process.cu:
--------------------------------------------------------------------------------

#include "vwc_process.cuh"
#include "../common/user_specified_device_functions.cuh"

// Virtual Warp-Centric (VWC) manner of processing graph using Compressed Sparse Row (CSR) representation format.
// One virtual warp of VWSize threads cooperates per vertex; VWMask = log2(VWSize)
// (see the <32,5>, <16,4>, ... instantiations in vwc_process below).
template < uint VWSize, uint VWMask >
__global__ void VWC_CSR_GPU_kernel(
		const uint num_of_vertices,
		const uint* edges_indices,
		const uint* vertices_indices,
		Vertex* VertexValue,
		Edge* EdgeValue,
		Vertex_static* VertexValue_static,
		int* dev_finished ) {

	__shared__ Vertex final_vertex_values[ VWC_COMPILE_TIME_DEFINED_BLOCK_SIZE >> VWMask ];
	__shared__ Vertex thread_outcome[ VWC_COMPILE_TIME_DEFINED_BLOCK_SIZE ];
	volatile __shared__ uint edges_starting_address[ VWC_COMPILE_TIME_DEFINED_BLOCK_SIZE >> VWMask ];
	volatile __shared__ uint ngbrs_size[ VWC_COMPILE_TIME_DEFINED_BLOCK_SIZE >> VWMask ];
	Vertex previous_vertex_value;

	// You might gain some performance if you limit maximum number of registers per thread with -maxrregcount flag. For example, specifying 32 for the Kepler architecture.
	const uint warp_in_block_offset = threadIdx.x >> VWMask;
	const uint VLane_id = threadIdx.x & (VWSize-1);
	const uint t_id = threadIdx.x + blockIdx.x * blockDim.x;
	const uint VW_id = t_id >> VWMask;
	if( VW_id >= num_of_vertices )
		return;

	previous_vertex_value = VertexValue[ VW_id ];
	// Only one virtual lane in the virtual warp does vertex initialization.
32 | if( VLane_id == 0 ) { 33 | edges_starting_address[ warp_in_block_offset ] = vertices_indices[ VW_id ]; 34 | ngbrs_size[ warp_in_block_offset ] = vertices_indices[ VW_id + 1 ] - edges_starting_address[ warp_in_block_offset ] ; 35 | init_compute( final_vertex_values + warp_in_block_offset, &previous_vertex_value ); 36 | } 37 | 38 | for( uint index = VLane_id; index < ngbrs_size[ warp_in_block_offset ]; index += VWSize ) { 39 | 40 | uint target_edge = edges_starting_address[ warp_in_block_offset ] + index; 41 | uint target_vertex = edges_indices[ target_edge ]; 42 | compute_local( 43 | VertexValue[target_vertex], 44 | VertexValue_static + target_vertex, 45 | EdgeValue + target_edge, 46 | thread_outcome + threadIdx.x, 47 | final_vertex_values + warp_in_block_offset ); 48 | 49 | // Parallel Reduction. Totally unrolled. 50 | if( VWSize == 32 ) 51 | if( VLane_id < 16 ) 52 | if( (index + 16) < ngbrs_size[ warp_in_block_offset ]) 53 | compute_reduce( thread_outcome + threadIdx.x, thread_outcome + threadIdx.x + 16 ); 54 | if( VWSize >= 16 ) 55 | if( VLane_id < 8 ) 56 | if( (index + 8) < ngbrs_size[ warp_in_block_offset ]) 57 | compute_reduce( thread_outcome + threadIdx.x, thread_outcome + threadIdx.x + 8 ); 58 | if( VWSize >= 8 ) 59 | if( VLane_id < 4 ) 60 | if( (index + 4) < ngbrs_size[ warp_in_block_offset ]) 61 | compute_reduce( thread_outcome + threadIdx.x, thread_outcome + threadIdx.x + 4 ); 62 | if( VWSize >= 4 ) 63 | if( VLane_id < 2 ) 64 | if( (index + 2) < ngbrs_size[ warp_in_block_offset ]) 65 | compute_reduce( thread_outcome + threadIdx.x, thread_outcome + threadIdx.x + 2 ); 66 | if( VWSize >= 2 ) 67 | if( VLane_id < 1 ) { 68 | if( (index + 1) < ngbrs_size[ warp_in_block_offset ]) 69 | compute_reduce( thread_outcome + threadIdx.x, thread_outcome + threadIdx.x + 1 ); 70 | compute_reduce( final_vertex_values + warp_in_block_offset, thread_outcome + threadIdx.x ); // Virtual lane 0 saves the final value of current iteration. 
71 | } 72 | 73 | } 74 | 75 | if( VLane_id == 0 ) 76 | if( update_condition ( final_vertex_values + warp_in_block_offset, &previous_vertex_value ) ) { 77 | (*dev_finished) = 1; 78 | VertexValue[ VW_id ] = final_vertex_values[ warp_in_block_offset ]; 79 | } 80 | 81 | } 82 | 83 | void vwc_process( 84 | int vwSize, 85 | uint gridDimen, 86 | const uint nVertices, 87 | const uint* vertexIndices, 88 | const uint* edgesIndices, 89 | Vertex* VertexValue, 90 | Edge* EdgeValue, 91 | Vertex_static* VertexValueStatic, 92 | int* finished ) { 93 | 94 | switch( vwSize ) { 95 | case(32): 96 | VWC_CSR_GPU_kernel< 32, 5 > 97 | <<< gridDimen, VWC_COMPILE_TIME_DEFINED_BLOCK_SIZE >>> ( 98 | nVertices, 99 | vertexIndices, 100 | edgesIndices, 101 | VertexValue, 102 | EdgeValue, 103 | VertexValueStatic, 104 | finished ); 105 | break; 106 | case(16): 107 | VWC_CSR_GPU_kernel< 16, 4 > 108 | <<< gridDimen, VWC_COMPILE_TIME_DEFINED_BLOCK_SIZE >>> ( 109 | nVertices, 110 | vertexIndices, 111 | edgesIndices, 112 | VertexValue, 113 | EdgeValue, 114 | VertexValueStatic, 115 | finished ); 116 | break; 117 | case(8): 118 | VWC_CSR_GPU_kernel< 8, 3 > 119 | <<< gridDimen, VWC_COMPILE_TIME_DEFINED_BLOCK_SIZE >>> ( 120 | nVertices, 121 | vertexIndices, 122 | edgesIndices, 123 | VertexValue, 124 | EdgeValue, 125 | VertexValueStatic, 126 | finished ); 127 | break; 128 | case(4): 129 | VWC_CSR_GPU_kernel< 4, 2 > 130 | <<< gridDimen, VWC_COMPILE_TIME_DEFINED_BLOCK_SIZE >>> ( 131 | nVertices, 132 | vertexIndices, 133 | edgesIndices, 134 | VertexValue, 135 | EdgeValue, 136 | VertexValueStatic, 137 | finished ); 138 | break; 139 | case(2): 140 | VWC_CSR_GPU_kernel< 2, 1 > 141 | <<< gridDimen, VWC_COMPILE_TIME_DEFINED_BLOCK_SIZE >>> ( 142 | nVertices, 143 | vertexIndices, 144 | edgesIndices, 145 | VertexValue, 146 | EdgeValue, 147 | VertexValueStatic, 148 | finished ); 149 | break; 150 | 151 | } 152 | 153 | } 154 | -------------------------------------------------------------------------------- 
/common/user_specified_device_functions.cuh: --------------------------------------------------------------------------------
#ifndef USER_SPECIFIED_DEVICE_FUNCTIONS_CUH
#define USER_SPECIFIED_DEVICE_FUNCTIONS_CUH

#include "user_specified_global_configurations.h"
#include "user_specified_structures.h"


/*************************************************
 * PROCESSING FUNCTIONS FOR SHARD-BASED METHODS.
 *************************************************/

// Per-iteration initialization of one vertex's shared-memory copy.
// local_V: the vertex slot in shared memory; V: the same vertex in global memory.
// Executed for each and every vertex of the shard.
inline __device__ void init_compute_CuSha(
		Vertex* local_V,
		Vertex* V ) {

#if defined(BFS) || defined(SSSP)
	// Traversal algorithms resume from the best distance found so far.
	local_V->distance = V->distance;
#endif

#ifdef PR
	// PageRank re-accumulates incoming rank from scratch each iteration.
	local_V->rank = 0;
#endif

}

// Executed once per shard edge. Multiple threads may target the same destination
// vertex slot in shared memory at once, hence the atomics.
// SrcV_static and E are only dereferenceable when the corresponding structure is
// actually defined for the chosen algorithm.
inline __device__ void compute_CuSha(
		Vertex SrcV,                        // Source vertex value.
		const Vertex_static* SrcV_static,   // Source Vertex_static in global memory.
		const Edge* E,                      // Edge content for the entry.
		Vertex* local_V ) {                 // Destination vertex's shared-memory slot.

#ifdef BFS
	if( SrcV.distance != BFS_INF )  // Just to prevent possible unpredicted overflows.
		atomicMin( &(local_V->distance), SrcV.distance + 1 );
#endif

#ifdef SSSP
	if( SrcV.distance != SSSP_INF ) // Just to prevent possible unpredicted overflows.
		atomicMin( &(local_V->distance), SrcV.distance + E->weight );
#endif

#ifdef PR
	unsigned int nNbrs = SrcV_static->NbrsNum;
	if( nNbrs != 0 )
		atomicAdd( &(local_V->rank), SrcV.rank / nNbrs );
#endif

}

// Decides, per shard vertex, whether the freshly computed value must replace the
// previous one; a true result also makes the caller signal the host to iterate again.
inline __device__ bool update_condition_CuSha(
		Vertex* local_V,   // Newly calculated vertex content.
		Vertex* V ) {      // Vertex content at the end of the previous iteration.

#if defined(BFS) || defined(SSSP)
	return ( local_V->distance < V->distance );
#endif

#ifdef PR
	// Side effect: finalizes this iteration's rank with the damping factor
	// before comparing against the previous value.
	local_V->rank = (1-PR_DAMPING_FACTOR) + local_V->rank*PR_DAMPING_FACTOR;
	return ( fabs( local_V->rank - V->rank ) > PR_TOLERANCE );
#endif

}



/*************************************************
 * PROCESSING FUNCTIONS FOR CSR-BASED METHODS.
 *************************************************/

// Per-iteration initialization of the virtual warp's shared-memory reduction slot.
// local_V: slot in shared memory; V: the previous version of the vertex.
inline __host__ __device__ void init_compute(
		volatile Vertex* local_V,
		Vertex* V ) {

#if defined(BFS) || defined(SSSP)
	local_V->distance = V->distance;
#endif

#ifdef PR
	local_V->rank = 0;
#endif

}

// Each thread derives a partial result from one edge and stores it in its own
// shared-memory slot. Executed for each and every edge.
// refV is the virtual warp's shared reduction slot; none of the current
// algorithms read or write it here.
inline __host__ __device__ void compute_local(
		Vertex SrcV,                        // Source vertex in global memory.
		const Vertex_static* SrcV_static,   // Source Vertex_static in global memory.
		const Edge* E,                      // Edge in global memory.
		volatile Vertex* thread_V_in_shared,// This thread's private result slot.
		Vertex* refV ) {

#ifdef BFS
	thread_V_in_shared->distance = SrcV.distance + 1;
#endif

#ifdef SSSP
	thread_V_in_shared->distance = SrcV.distance + E->weight;
#endif

#ifdef PR
	unsigned int nNbrs = SrcV_static->NbrsNum;
	thread_V_in_shared->rank = ( nNbrs != 0 ) ? ( SrcV.rank / nNbrs ) : 0;
#endif

}

// Pairwise reduction step between two partial results; called for every pair of
// neighbors of a vertex during the tree reduction.
inline __host__ __device__ void compute_reduce(
		volatile Vertex* thread_V_in_shared,
		Vertex* next_thread_V_in_shared ) {

#if defined(BFS) || defined(SSSP)
	// Keep the smaller distance of the pair.
	if( next_thread_V_in_shared->distance < thread_V_in_shared->distance )
		thread_V_in_shared->distance = next_thread_V_in_shared->distance;
#endif

#ifdef PR
	thread_V_in_shared->rank += next_thread_V_in_shared->rank;
#endif

}

// Below function signals the caller (and consequently the host) if the vertex content should be replaced with the newly calculated value.
// This function is performed by one virtual lane in the virtual warp.
inline __host__ __device__ bool update_condition ( volatile Vertex* computed_V,
		Vertex* previous_V ) {

#ifdef BFS
	return ( computed_V->distance < previous_V->distance );
#endif

#ifdef SSSP
	return ( computed_V->distance < previous_V->distance );
#endif

#ifdef PR
	// Side effect: applies the damping factor before comparing with the previous rank.
	computed_V->rank = (1-PR_DAMPING_FACTOR) + computed_V->rank * PR_DAMPING_FACTOR; // Or you can replace this expression by fused multiply-add.
	return ( fabs( computed_V->rank - previous_V->rank) > PR_TOLERANCE );
#endif

}


#endif // USER_SPECIFIED_DEVICE_FUNCTIONS_CUH
--------------------------------------------------------------------------------
/cusha_src/cusha_format.cu:
--------------------------------------------------------------------------------
// NOTE(review): the first three #include targets were lost to angle-bracket
// stripping in this dump; <vector>, <iostream> and <cmath> are inferred from the
// std::vector, std::cout and std::ceil usage below -- confirm against the original.
#include <vector>
#include <iostream>
#include <cmath>

#include "cusha_format.cuh"
#include "../common/simpleTime.cuh"
#include "../common/cuda_utilities.cuh"
#include "../common/cuda_error_check.cuh"
#include "../common/user_specified_structures.h"
#include "../common/user_specified_pre_and_post_processing_functions.hpp"
#include "find_block_size.cuh"
#include "cusha_process.cuh"   // FIX(review): duplicate include of this header removed.
#include "../common/globals.hpp"

// One shard entry: an edge value plus the indices of its two endpoints.
struct shard_entry{
	Edge edgeVal;
	uint srcIdx;
	uint dstIdx;
};

// Builds the shard/window representation of the parsed graph, ships it to the
// device, iterates the CuSha kernel until no vertex changes, and writes the
// resulting vertex values to outputFile. procesingMethod selects CW
// (Concatenated Windows) or GS (G-Shards).
void cusha_format::process(
		const GraphProcessingMethod procesingMethod,
		const int bsize,
		// FIX(review): element type restored from usage below
		// (initial_vertex& vvv = initGraph->at(vIdx)).
		std::vector<initial_vertex>* initGraph,
		const uint nEdges,
		std::ofstream& outputFile,
		bool EdgesOnHost ) {

	const uint nVerticesInitially = initGraph->size();

	// Variables collecting timing info.
	float H2D_copy_time = 0, processing_time = 0, D2H_copy_time = 0;

	// Less possible bank conflict when the vertex is big.
	// FIX(review): the original guard was `#if __CUDA_ARCH__ >= 300`, but
	// __CUDA_ARCH__ is only defined while compiling device code, so in this host
	// function the guarded call was dead code. Check the device's compute
	// capability at run time instead.
	if ( sizeof(Vertex) > 4 ) {
		int currentDevice = 0, ccMajor = 0;
		CUDAErrorCheck( cudaGetDevice( &currentDevice ) );
		CUDAErrorCheck( cudaDeviceGetAttribute( &ccMajor, cudaDevAttrComputeCapabilityMajor, currentDevice ) );
		if( ccMajor >= 3 )
			CUDAErrorCheck( cudaDeviceSetSharedMemConfig( cudaSharedMemBankSizeEightByte ) );
	}

	// Estimate the proper block size.
	const blockSize_N_pair bsizeNPair = find_proper_block_size( bsize, nEdges, nVerticesInitially );
	const uint nShards = std::ceil( (double)nVerticesInitially / bsizeNPair.N );
	const uint nVertices = nShards * bsizeNPair.N;   // Padded so every shard owns exactly N vertices.
	std::cout << "Block size would be " << bsizeNPair.blockSize << ".\n";
	std::cout << "The graph is divided into " << nShards << " shards.\n";
	std::cout << ( ( procesingMethod == GS ) ? "G-Shards" : "Concatenated Windows" ) << " will be the processing method.\n";

	// Allocate host buffers.
	// FIX(review): template arguments below restored from extraction-eaten
	// originals, inferred from how each container is used in this function.
	host_pinned_buffer<Vertex> vertexValue( nVertices );
	std::vector<Vertex_static> tmpVertexValueStatic;
	if( sizeof(Vertex_static) > 1 ) tmpVertexValueStatic.resize( nVertices );
	std::vector< std::vector<shard_entry> > graphWindows( nShards * nShards, std::vector<shard_entry>( 0 ) );

	// Collecting graph data into shard form.
	for( uint vIdx = 0; vIdx < nVerticesInitially; ++vIdx ) {
		initial_vertex& vvv = initGraph->at(vIdx);
		vertexValue[ vIdx ] = vvv.vertexValue;
		if( sizeof(Vertex_static) > 1 ) tmpVertexValueStatic[ vIdx ] = vvv.VertexValueStatic;
		uint nNbrs = vvv.nbrs.size();
		for( uint nbrIdx = 0; nbrIdx < nNbrs; ++nbrIdx ) {
			neighbor& nbr = vvv.nbrs.at( nbrIdx );
			shard_entry tmpShardEntry;
			tmpShardEntry.dstIdx = vIdx;
			tmpShardEntry.srcIdx = nbr.srcIndex;
			if( sizeof(Edge) > 1 ) tmpShardEntry.edgeVal = nbr.edgeValue;
			// 64-bit intermediate avoids overflow of idx * nShards on large graphs.
			// NOTE(review): the static_cast's type argument was lost in extraction;
			// unsigned long long is assumed -- any 64-bit integer type yields the
			// same quotient here.
			uint belongingShardIdx = ( static_cast<unsigned long long>( tmpShardEntry.dstIdx ) * nShards ) / nVertices;
			uint belongingWindowIdx = ( static_cast<unsigned long long>( tmpShardEntry.srcIdx ) * nShards ) / nVertices;
			graphWindows.at( belongingShardIdx * nShards + belongingWindowIdx ).push_back( tmpShardEntry );
		}
	}
	initGraph->clear();
	// no need to sort inside a window.
74 | 75 | // Define and allocate host buffers. 76 | host_pinned_buffer SrcValue( nEdges ); 77 | host_pinned_buffer DstIndex( nEdges ); 78 | host_pinned_buffer EdgeValues; 79 | if( sizeof(Edge) > 1 ) EdgeValues.alloc( nEdges ); 80 | host_pinned_buffer VertexValuesStatic; 81 | if( sizeof(Vertex_static) > 1 ) VertexValuesStatic.alloc( nEdges ); 82 | host_pinned_buffer SrcIndex( nEdges ); 83 | host_pinned_buffer Mapper; 84 | if( procesingMethod == CW ) Mapper.alloc( nEdges ); 85 | host_pinned_buffer windowSizesScansVertical( nShards * nShards + 1 ); 86 | windowSizesScansVertical.at( 0 ) = 0; 87 | host_pinned_buffer shardSizesScans( nShards + 1 ); 88 | shardSizesScans.at( 0 ) = 0; 89 | host_pinned_buffer concatenatedWindowsSizesScan( nShards + 1 ); 90 | concatenatedWindowsSizesScan.at( 0 ) = 0; 91 | 92 | // Put collected shard-based graph data into host pinned buffers. 93 | uint movingIdx = 0; 94 | uint winMovingIdx = 0; 95 | for( uint shardIdx = 0; shardIdx < nShards; ++shardIdx ) { 96 | for( uint winIdx = 0; winIdx < nShards; ++winIdx ) { 97 | std::vector& window = graphWindows.at( shardIdx * nShards + winIdx ); 98 | for( uint entryIdx = 0; entryIdx < window.size(); ++entryIdx ) { 99 | SrcValue[ movingIdx ] = vertexValue[ window.at( entryIdx ).srcIdx ]; 100 | DstIndex[ movingIdx ] = window.at( entryIdx ).dstIdx; 101 | if( sizeof(Edge) > 1 ) EdgeValues[ movingIdx ] = window.at( entryIdx ).edgeVal; 102 | if( sizeof(Vertex_static) > 1 ) VertexValuesStatic[ movingIdx ] = tmpVertexValueStatic[ window.at( entryIdx ).srcIdx ]; 103 | if( procesingMethod == GS ) SrcIndex[ movingIdx ] = window.at( entryIdx ).srcIdx; 104 | ++movingIdx; 105 | } 106 | windowSizesScansVertical[ winMovingIdx + 1 ] = windowSizesScansVertical[ winMovingIdx ] + window.size(); 107 | ++winMovingIdx; 108 | } 109 | shardSizesScans[ shardIdx + 1 ] = movingIdx; 110 | } 111 | tmpVertexValueStatic.clear(); 112 | movingIdx = 0; 113 | for( uint winIdx = 0; winIdx < nShards; ++winIdx ) { 114 | for( uint shardIdx 
= 0; shardIdx < nShards; ++shardIdx ) { 115 | std::vector& window = graphWindows.at( shardIdx * nShards + winIdx ); 116 | uint inWinMovingIdx = 0; 117 | for( uint entryIdx = 0; entryIdx < window.size(); ++entryIdx ) { 118 | if( procesingMethod == CW ) { 119 | SrcIndex[ movingIdx ] = window.at( entryIdx ).srcIdx; 120 | Mapper[ movingIdx ] = windowSizesScansVertical[ shardIdx * nShards + winIdx ] + inWinMovingIdx; 121 | } 122 | ++inWinMovingIdx; 123 | ++movingIdx; 124 | } 125 | } 126 | concatenatedWindowsSizesScan[ winIdx + 1 ] = movingIdx; 127 | } 128 | graphWindows.clear(); 129 | 130 | // Define and allocate device buffers. 131 | device_buffer dev_vertexValue( nVertices ); 132 | device_buffer dev_SrcValue; 133 | device_buffer dev_DstIndex; 134 | device_buffer dev_EdgeValues; 135 | device_buffer dev_VertexValuesStatic; 136 | device_buffer dev_SrcIndex; 137 | device_buffer dev_Mapper; 138 | device_buffer dev_concatenatedWindowsSizesScan; 139 | if( procesingMethod == CW ) dev_concatenatedWindowsSizesScan.alloc( nShards + 1 ); 140 | device_buffer dev_windowSizesScansVertical; 141 | if( procesingMethod == GS ) dev_windowSizesScansVertical.alloc( nShards * nShards + 1 ); 142 | device_buffer dev_shardSizesScans( nShards + 1 ); 143 | device_buffer dev_Finished( 1 ); 144 | if( !EdgesOnHost ) { 145 | dev_SrcValue.alloc( nEdges ); 146 | dev_DstIndex.alloc( nEdges ); 147 | if( sizeof(Edge) > 1 ) dev_EdgeValues.alloc( nEdges ); 148 | if( sizeof(Vertex_static) > 1 ) dev_VertexValuesStatic.alloc( nEdges ); 149 | dev_SrcIndex.alloc( nEdges ); 150 | if( procesingMethod == CW ) dev_Mapper.alloc( nEdges ); 151 | } 152 | 153 | // Copy data to device buffers. 
154 | setTime(); 155 | dev_vertexValue = vertexValue; 156 | if( procesingMethod == CW ) dev_concatenatedWindowsSizesScan = concatenatedWindowsSizesScan; 157 | if( procesingMethod == GS ) dev_windowSizesScansVertical = windowSizesScansVertical; 158 | dev_shardSizesScans = shardSizesScans; 159 | if( !EdgesOnHost ) { 160 | dev_SrcValue = SrcValue; 161 | dev_DstIndex = DstIndex; 162 | if( sizeof(Edge) > 1 ) dev_EdgeValues = EdgeValues; 163 | if( sizeof(Vertex_static) > 1 ) dev_VertexValuesStatic = VertexValuesStatic; 164 | dev_SrcIndex = SrcIndex; 165 | if( procesingMethod == CW ) dev_Mapper = Mapper; 166 | } 167 | CUDAErrorCheck( cudaDeviceSynchronize() ); 168 | H2D_copy_time = getTime(); 169 | std::cout << "Copying data to device took " << H2D_copy_time << " (ms).\n"; 170 | 171 | // Iteratively process the graph. 172 | int finished; 173 | unsigned int IterationCounter = 0; 174 | setTime(); 175 | do { 176 | finished = 0; 177 | 178 | CUDAErrorCheck( cudaMemcpyAsync( dev_Finished.get_ptr(), &finished, sizeof(int), cudaMemcpyHostToDevice ) ); 179 | 180 | cusha_process( 181 | procesingMethod, 182 | bsizeNPair.blockSize, 183 | bsizeNPair.N, 184 | nShards, 185 | nVertices, 186 | dev_vertexValue.get_ptr(), 187 | dev_concatenatedWindowsSizesScan.get_ptr(), 188 | dev_windowSizesScansVertical.get_ptr(), 189 | dev_shardSizesScans.get_ptr(), 190 | dev_Finished.get_ptr(), 191 | ( !EdgesOnHost ) ? dev_SrcValue.get_ptr() : SrcValue.get_ptr(), 192 | ( !EdgesOnHost ) ? dev_DstIndex.get_ptr() : DstIndex.get_ptr(), 193 | ( !EdgesOnHost ) ? dev_EdgeValues.get_ptr() : EdgeValues.get_ptr(), 194 | ( !EdgesOnHost ) ? dev_VertexValuesStatic.get_ptr() : VertexValuesStatic.get_ptr(), 195 | ( !EdgesOnHost ) ? dev_SrcIndex.get_ptr() : SrcIndex.get_ptr(), 196 | ( !EdgesOnHost ) ? 
dev_Mapper.get_ptr() : Mapper.get_ptr() ); 197 | 198 | CUDAErrorCheck( cudaPeekAtLastError() ); 199 | CUDAErrorCheck( cudaMemcpyAsync( &finished, dev_Finished.get_ptr(), sizeof(int), cudaMemcpyDeviceToHost ) ); 200 | CUDAErrorCheck( cudaDeviceSynchronize() ); 201 | 202 | ++IterationCounter; 203 | } while( finished == 1 ); 204 | processing_time = getTime(); 205 | std::cout << "Processing finished in " << processing_time << " (ms).\n"; 206 | std::cout << "Performed " << IterationCounter << " iterations in total.\n"; 207 | 208 | // Copy resulted vertex values back from the device to the host. 209 | setTime(); 210 | CUDAErrorCheck( cudaMemcpy( vertexValue.get_ptr(), dev_vertexValue.get_ptr(), nVerticesInitially * sizeof(Vertex), cudaMemcpyDeviceToHost ) ); 211 | D2H_copy_time = getTime(); 212 | std::cout << "Copying final vertex values back to the host took " << D2H_copy_time << " (ms).\n"; 213 | 214 | //std::cout << "Total Execution time was " << H2D_copy_time + processing_time + D2H_copy_time << " (ms).\n"; 215 | //std::cout << IterationCounter <<"\t"<< H2D_copy_time <<"\t"<< processing_time <<"\t"<< D2H_copy_time << "\n"; 216 | 217 | // Print the output vertex values to the file. 218 | for( uint vvv = 0; vvv < nVerticesInitially; ++vvv ) 219 | print_vertex_output( 220 | vvv, 221 | vertexValue[ vvv ], 222 | outputFile ); 223 | 224 | } 225 | --------------------------------------------------------------------------------