├── contributors.txt ├── csrcolor.pdf ├── src ├── GM │ ├── run │ ├── Makefile │ ├── README │ ├── tree.h │ ├── graphColoring.h │ └── tree.cpp ├── cusp │ ├── Makefile │ ├── README │ ├── timer.h │ └── vertex_coloring.cu ├── serial │ ├── Makefile │ ├── README │ ├── runall │ ├── graph_io.h │ └── greedy.cpp ├── csrcolor │ ├── runall │ ├── Makefile │ ├── README │ └── csrcolor.cu ├── common.mk ├── topo │ ├── Makefile │ ├── runall │ ├── README │ ├── main.cu │ ├── kernel.h │ └── graph_io.h ├── omp │ ├── Makefile │ ├── runall │ ├── common.h │ ├── kernel2.h │ ├── kernel1.h │ ├── worklist.h │ ├── main.cc │ └── graph.h └── data │ ├── runall │ ├── README │ ├── variants.h │ ├── Makefile │ ├── kernel_ldb.h │ ├── kernel_fusion.h │ ├── kernel_bitset.h │ ├── kernel_tc.h │ ├── kernel_pq.h │ ├── kernel_base.h │ ├── kernel_ldg.h │ ├── kernel_comb.h │ └── main.cu ├── .gitignore ├── include ├── lonestargpu.h ├── header.h ├── cutil_subset.h ├── list.h ├── kernelconfig.h ├── worklistc.h ├── common.h ├── sharedptr.h ├── util.h ├── gbar.cuh └── component.h ├── Makefile ├── LICENSE └── README /contributors.txt: -------------------------------------------------------------------------------- 1 | Xuhao Chen 2 | Pingfan Li 3 | -------------------------------------------------------------------------------- /csrcolor.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenxuhao/csrcolor/HEAD/csrcolor.pdf -------------------------------------------------------------------------------- /src/GM/run: -------------------------------------------------------------------------------- 1 | ./gc 0 0 0 0 0 0 0 14 128 data/hood.mtx y n 2 | #./gc 0 0 0 0 0 0 0 14 128 data/pwtk.mtx y n 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.DS_Store 3 | *.log 4 | *.out 5 | *.o 6 | *.d 7 | *.swp 8 | color.txt 9 | 
*/color.txt 10 | bin/* 11 | input/* 12 | -------------------------------------------------------------------------------- /src/cusp/Makefile: -------------------------------------------------------------------------------- 1 | all: vc 2 | vc: 3 | nvcc -w -O3 -I./ -I~/cusplibrary-0.5.1 vertex_coloring.cpp -o vc 4 | cp $@ $(BIN) 5 | 6 | clean: 7 | rm vc 8 | -------------------------------------------------------------------------------- /src/serial/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | all: greedy 3 | greedy: greedy.cpp 4 | g++ -w -O3 greedy.cpp -o greedy 5 | mv $@ $(BIN) 6 | 7 | clean: 8 | rm greedy 9 | -------------------------------------------------------------------------------- /src/csrcolor/runall: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR=../input 4 | APP=./csrcolor 5 | 6 | for input in `ls $DIR` 7 | do 8 | echo $APP $DIR/$input 9 | $APP $DIR/$input 10 | done 11 | -------------------------------------------------------------------------------- /src/serial/README: -------------------------------------------------------------------------------- 1 | program read mtx or gr graph and store its information in CSR format 2 | 3 | and write colors of all vertices to "color.txt" 4 | 5 | usage: 6 | ./greedy 7 | 8 | -------------------------------------------------------------------------------- /src/csrcolor/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | EXE=csrcolor 3 | all: csrcolor.cu 4 | $(NVCC) $(NVFLAGS) $(INCLUDES) -lcusparse csrcolor.cu -o $(EXE) 5 | mv $(EXE) $(BIN) 6 | 7 | clean: 8 | rm csrcolor 9 | -------------------------------------------------------------------------------- /src/GM/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | all: gc 3 | gc: graphColoring.cu 
graphDriver.cpp tree.cpp 4 | $(NVCC) $(NVFLAGS) graphDriver.cpp graphColoring.cu -o gc 5 | mv $@ $(BIN) 6 | 7 | clean: 8 | rm gc 9 | -------------------------------------------------------------------------------- /src/serial/runall: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #filename=rmat1.gr 4 | DIR=../input 5 | APP=./greedy 6 | 7 | for input in `ls $DIR` 8 | do 9 | echo $APP $DIR/$input 10 | $APP $DIR/$input 11 | #echo $DIR/$input 12 | done 13 | -------------------------------------------------------------------------------- /include/lonestargpu.h: -------------------------------------------------------------------------------- 1 | #ifndef LSG_LONESTARGPU 2 | #define LSG_LONESTARGPU 3 | 4 | #include "common.h" 5 | #include "graph.h" 6 | #include "kernelconfig.h" 7 | #include "list.h" 8 | #include "component.h" 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /src/common.mk: -------------------------------------------------------------------------------- 1 | GCC=gcc 2 | GXX=g++ 3 | NVCC=nvcc 4 | COMPUTECAPABILITY=sm_35 5 | #NVFLAGS=-g -arch=$(COMPUTECAPABILITY) #-Xptxas -v 6 | NVFLAGS=-w -O3 -arch=$(COMPUTECAPABILITY) #-Xptxas -v 7 | INCLUDES=-I../../include 8 | BIN=../../bin/ 9 | -------------------------------------------------------------------------------- /src/topo/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | INCLUDES += -I../../cub-1.1.1 3 | EXTRA := $(NVFLAGS) $(INCLUDES) -DITERATIONS=10 4 | SRC=main.cu 5 | 6 | all: topo_base 7 | topo_base: 8 | $(NVCC) $(EXTRA) $(SRC) -o topo_base 9 | mv $@ $(BIN) 10 | 11 | clean: 12 | rm topo_naive topo_ldg 13 | -------------------------------------------------------------------------------- /src/topo/runall: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR=../input 4 | 5 | 
for input in `ls $DIR` 6 | do 7 | for APP in './topodriven_naive' './topodriven_ldg' 8 | do 9 | for ((blksize=32; blksize<=1024; blksize*=2)) 10 | do 11 | echo $APP $blksize $DIR/$input 12 | $APP $blksize $DIR/$input 13 | done 14 | done 15 | done 16 | -------------------------------------------------------------------------------- /src/csrcolor/README: -------------------------------------------------------------------------------- 1 | program read mtx or gr graph and store its information in CSR format 2 | 3 | and write colors of all vertices to "color.txt" 4 | 5 | usage: 6 | ./csrcolor 7 | 8 | Reference: 9 | 10 | NVIDIA cuSPARSE Reorderings Reference 11 | http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-reorderings-reference 12 | 13 | -------------------------------------------------------------------------------- /src/omp/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXX_FLAGS=-w -O3 3 | EXTRA=-DITERATIONS=10 4 | SRC=main.cc 5 | 6 | all:color-omp color-serial 7 | 8 | color-omp: 9 | $(CXX) $(CXX_FLAGS) $(EXTRA) -fopenmp -DENABLE_OPENMP $(SRC) -o $@ 10 | mv $@ ../../bin 11 | 12 | color-serial: 13 | $(CXX) $(CXX_FLAGS) $(EXTRA) $(SRC) -o $@ 14 | 15 | clean: 16 | rm color-omp color-serial 17 | -------------------------------------------------------------------------------- /src/data/runall: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR=../input 4 | 5 | for input in `ls $DIR` 6 | do 7 | for APP in './datadriven_naive-32' './datadriven_naive-64' './datadriven_naive-128' './datadriven_naive-256' './datadriven_naive-512' './datadriven_naive-1024' './datadriven_ldg-32' './datadriven_ldg-64' './datadriven_ldg-128' './datadriven_ldg-256' './datadriven_ldg-512' './datadriven_ldg-1024' 8 | do 9 | echo $APP $DIR/$input 10 | $APP $DIR/$input 11 | done 12 | done 13 | -------------------------------------------------------------------------------- 
/src/omp/runall: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #filename=rmat1.gr 4 | DIR=/home/lpf/workspace/lonestargpu-2.0/apps/coloring/input 5 | APP=./color_greedy-omp 6 | 7 | SERIAL=./color_greedy-s 8 | 9 | #NUM_THREADS=16 10 | 11 | for input in `ls $DIR` 12 | do 13 | # echo $SERIAL 1 $DIR/$input 14 | # $SERIAL 1 $DIR/$input 15 | 16 | for ((NUM_THREADS=2; NUM_THREADS<=24; NUM_THREADS+=2)) 17 | do 18 | # echo $APP $NUM_THREADS $DIR/$input 19 | $APP $NUM_THREADS $DIR/$input 20 | done 21 | done 22 | -------------------------------------------------------------------------------- /src/cusp/README: -------------------------------------------------------------------------------- 1 | Place a symlink to the top-level cusp directory in current dir. Assuming 2 | the top-level cusp directory is $CUSPDIR: 3 | 4 | $ ln -s $CUSPDIR 5 | 6 | program read mtx graph with the cusplibrary 7 | 8 | and write colors of all vertices to "color.txt" 9 | 10 | usage: 11 | ./vc 12 | 13 | Reference: 14 | 15 | S. Dalton, N. Bell, L. Olson, and M. Garland, “Cusp: 16 | Generic parallel algorithms for sparse matrix and graph 17 | computations,” 2014, version 0.5.0. 18 | http://cusplibrary.github.io/ -------------------------------------------------------------------------------- /src/topo/README: -------------------------------------------------------------------------------- 1 | topology implementation of parallel graph coloring on GPGPUs using FirstFit strategy 2 | 3 | program read mtx or gr graph and store its information in CSR format 4 | 5 | and write colors of all vertices to "color.txt" 6 | 7 | program has two variants. 
8 | 9 | topodriven_naive: naive implementation without any optimization 10 | 11 | topodriven_ldg: use __ldg to read C and R array from read_only cache 12 | 13 | 14 | usage: 15 | ./topodriven_naive 16 | ./topodriven_ldg 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TOPLEVEL := . 2 | APPS := csrcolor serial topo data GM 3 | INPUT_URL := http:// 4 | INPUT := csrcolor-inputs.tar.bz2 5 | 6 | .PHONY: all clean 7 | 8 | all: $(APPS) 9 | 10 | $(APPS): 11 | make -C src/$@ 12 | 13 | #include common.mk 14 | 15 | inputs: 16 | @echo "Downloading inputs ..." 17 | @wget $(INPUT_URL) -O $(INPUT) 18 | @echo "Uncompressing inputs ..." 19 | @tar xvf $(INPUT) 20 | @rm $(INPUT) 21 | @echo "Inputs available at $(TOPLEVEL)/inputs/" 22 | 23 | clean: 24 | for APP in $(APPS); do make -C apps/$$APP clean; done 25 | 26 | -------------------------------------------------------------------------------- /src/data/README: -------------------------------------------------------------------------------- 1 | datadriven implementation of parallel graph coloring on GPGPUs using FirstFit strategy 2 | 3 | program read mtx or gr graph and store its information in CSR format 4 | 5 | and write colors of all vertices to "color.txt" 6 | 7 | program has two variants. 8 | 9 | datadriven_naive: use worklist to improve work efficiency, reduce atomic operation 10 | 11 | using block scan 12 | 13 | datadriven_ldg: based on datadriven_naive, use __ldg to read C and R array 14 | 15 | from read_only cache 16 | 17 | 18 | usage: 19 | ./datadriven_naive 20 | ./datadriven_ldg 21 | -------------------------------------------------------------------------------- /src/GM/README: -------------------------------------------------------------------------------- 1 | Graph Coloring (GCO) partitions the vertices of a graph such that 2 | no two adjacent vertices share the same color. 
3 | 4 | 3-step graph coloring framework: 5 | 1) Graph partitioning which partitions graph into subgraphs and 6 | identifies boundary vertices, 7 | 2) graph coloring & conflicts detection which colors the graph using 8 | the specified heuristic, e.g. FF, and identifies color conflicts, and 9 | 3) sequential conflicts resolution which goes back to CPU and resolves 10 | the conflicts. 11 | 12 | 13 | usage: 14 | ./gc 0 0 0 0 0 0 0 n 15 | 16 | see run for details 17 | 18 | 19 | note: 20 | 21 | program can only read mtx graph 22 | 23 | 24 | References: 25 | 26 | A. V. P. Grosset, P. Zhu, S. Liu, S. Venkatasubramanian, and M. Hall, 27 | “Evaluating graph coloring on gpus,” in Proceedings of the 16th ACM 28 | Symposium on Principles and Practice of Parallel Programming, 2011, 29 | pp. 297–298. 30 | -------------------------------------------------------------------------------- /include/header.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | 19 | #define MYINFINITY 100000000 20 | #define SWAP(x, y) {tmp = x; x = y; y = tmp;} 21 | 22 | typedef unsigned foru; 23 | 24 | double rtclock() 25 | { 26 | struct timezone Tzp; 27 | struct timeval Tp; 28 | int stat; 29 | stat = gettimeofday (&Tp, &Tzp); 30 | if (stat != 0) printf("Error return from gettimeofday: %d",stat); 31 | return(Tp.tv_sec + Tp.tv_usec*1.0e-6); 32 | } 33 | 34 | void CudaTest(char *msg) 35 | { 36 | cudaError_t e; 37 | 38 | cudaThreadSynchronize(); 39 | if (cudaSuccess != (e = cudaGetLastError())) { 40 | fprintf(stderr, "%s: %d\n", msg, e); 41 | fprintf(stderr, "%s\n", cudaGetErrorString(e)); 42 | exit(-1); 43 | } 44 | } 45 | 46 | 47 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Xuhao Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/topo/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | using namespace std; 10 | #ifndef ITERATIONS 11 | #define ITERATIONS 10 12 | #endif 13 | #ifndef BLKSIZE 14 | #define BLKSIZE 32 15 | #endif 16 | #include "kernel.h" 17 | #include "graph_io.h" 18 | 19 | int main(int argc, char *argv[]) { 20 | if (argc != 3) { 21 | printf("Usage: %s \n", argv[0]); 22 | exit(1); 23 | } 24 | int blksize = atoi(argv[1]); 25 | int m, nnz, *csrRowPtr = NULL, *csrColInd = NULL; 26 | if (strstr(argv[2], ".mtx")) 27 | mtx2csr(argv[2], m, nnz, csrRowPtr, csrColInd); 28 | if (strstr(argv[2], ".gr")) 29 | gr2csr(argv[2], m, nnz, csrRowPtr, csrColInd); 30 | int *coloring = (int *)calloc(m, sizeof(int)); 31 | color(m, nnz, csrRowPtr, csrColInd, coloring, blksize); 32 | write_solution("color.txt", coloring, m); 33 | int correct = 1; 34 | verify(m, nnz, csrRowPtr, csrColInd, coloring, &correct); 35 | if (correct) 36 | printf("correct.\n"); 37 | else 38 | printf("incorrect.\n"); 39 | return 0; 40 | } 41 | -------------------------------------------------------------------------------- /src/data/variants.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define DATA_BASE 0 // the baseline data-driven version 4 | #define DATA_LDG 1 // using __ldg() intrinsic 5 | #define DATA_BITSET 2 // bitset for forbiddenColors 6 | #define DATA_COARSE 3 // thread coarsening 7 | #define DATA_FUSION 4 // kernel fusion 8 | #define DATA_WLC 5 // worklistc from lonestargpu 9 | #define DATA_LDB 6 // load balancing using merrill's scheme 10 | #define DATA_PQ 7 11 | #define DATA_BEST 8 12 | #define DATA_COMB1 9 13 | 14 | #ifndef VARIANT 15 | #error "VARIANT not defined." 
16 | #endif 17 | 18 | #if VARIANT==DATA_BASE 19 | #include "kernel_base.h" 20 | #elif VARIANT==DATA_LDG 21 | #include "kernel_ldg.h" 22 | #elif VARIANT==DATA_BITSET 23 | #include "kernel_bitset.h" 24 | #elif VARIANT==DATA_COARSE 25 | #include "kernel_tc.h" 26 | #elif VARIANT==DATA_FUSION 27 | #include "kernel_fusion.h" 28 | #elif VARIANT==DATA_WLC 29 | #include "kernel_wlc.h" 30 | #elif VARIANT==DATA_LDB 31 | #include "kernel_ldb.h" 32 | #elif VARIANT==DATA_PQ 33 | #include "kernel_pq.h" 34 | #elif VARIANT==DATA_BEST 35 | #include "kernel_best.h" 36 | #elif VARIANT==DATA_COMB1 37 | #include "kernel_comb.h" 38 | #else 39 | #error "Unknown variant" 40 | #endif 41 | -------------------------------------------------------------------------------- /src/GM/tree.h: -------------------------------------------------------------------------------- 1 | 2 | class node{ 3 | private: 4 | int key, saturation, degree, color; 5 | node *left, *right; 6 | 7 | public: 8 | node(); 9 | node(int index, int sat, int deg); 10 | node(int index, int sat, int deg, int col, node *L, node *R); 11 | 12 | 13 | int getKey(); 14 | int getSaturation(); 15 | int getDegree(); 16 | int getColor(); 17 | node* getLeft(); 18 | node* getRight(); 19 | 20 | void setKey(int index); 21 | void setSaturation(int sat); 22 | void setDegree(int deg); 23 | void setColor(int c); 24 | void setLeft(node *L); 25 | void setRight(node *R); 26 | 27 | void setKSD(int index, int saturation, int degree); 28 | 29 | void displayNode(); 30 | 31 | ~node(); 32 | }; 33 | 34 | 35 | // Tree sorted by saturation and then by degree 36 | class tree{ 37 | private: 38 | node *top; 39 | 40 | public: 41 | tree(); 42 | 43 | void insert(node *x); 44 | node* remove(int index, int saturation, int degree); 45 | 46 | node* findNode(int index, int saturation, int degree); 47 | void findBiggest(int &index, int &saturation, int &degree); 48 | 49 | void displayTreeRML(node *current); 50 | void displayTreeLMR(node *current); 51 | void displayTreeMLR(node 
*current); 52 | 53 | node* getTop(); 54 | 55 | ~tree(); 56 | }; 57 | 58 | -------------------------------------------------------------------------------- /src/GM/graphColoring.h: -------------------------------------------------------------------------------- 1 | #ifndef _GRAPHCOLORING_H_ 2 | #define _GRAPHCOLORING_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | using namespace std; 12 | 13 | 14 | // Should be at least equal to maxDegree of graph + 1 15 | // if doing that generates an error like: too much local memory, then use commented line 16 | // marked OPTION2 instead of OPTION1 in function color & saturation in graphColoring.cu 17 | //const int TEMP_COLOR_LENGTH = 256;//128; //256;//1024; 18 | const int TEMP_COLOR_LENGTH = 1000;//128; //256;//1024; 19 | 20 | const int CONFLICT_BLOCK_SIZE = 256; 21 | 22 | const int MAXGPUITERATIONS = 50; 23 | 24 | 25 | #ifdef __cplusplus 26 | #define CHECK_EXT extern "C" 27 | #else 28 | #define CHECK_EXT 29 | #endif 30 | 31 | 32 | CHECK_EXT float cudaGraphColoring(int *adjacentList, int *boundaryList, int *graphColors, int *degreeList, 33 | int *conflict, int boundarySize, int maxDegree, int graphSize, int & passes, 34 | int subsizeBoundary, int _gridSize, int _blockSize, int *startPartitionList, 35 | int *endPartitionList, int *randomList, int numRand, int useSDO, int *numOut); 36 | 37 | 38 | #endif // _GRAPHCOLORING_H_ 39 | 40 | -------------------------------------------------------------------------------- /include/cutil_subset.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | # define CUDA_SAFE_CALL_NO_SYNC( call) { \ 4 | cudaError err = call; \ 5 | if( cudaSuccess != err) { \ 6 | fprintf(stderr, "error %d: Cuda error in file '%s' in line %i : %s.\n", \ 7 | err, __FILE__, __LINE__, cudaGetErrorString( err) ); \ 8 | exit(EXIT_FAILURE); \ 9 | } } 10 | 11 | # define CUDA_SAFE_CALL( call) 
CUDA_SAFE_CALL_NO_SYNC(call); \ 12 | 13 | # define CUDA_SAFE_THREAD_SYNC( ) { \ 14 | cudaError err = CUT_DEVICE_SYNCHRONIZE(); \ 15 | if ( cudaSuccess != err) { \ 16 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ 17 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 18 | } } 19 | 20 | // from http://forums.nvidia.com/index.php?showtopic=186669 21 | static __device__ unsigned get_smid(void) { 22 | unsigned ret; 23 | asm("mov.u32 %0, %smid;" : "=r"(ret) ); 24 | return ret; 25 | } 26 | -------------------------------------------------------------------------------- /src/data/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | INCLUDES += -I../../cub-1.1.1 3 | B40_DIR=./back40computing-read-only 4 | B40C_INC=-I$(B40_DIR) -I$(B40_DIR)/test 5 | EXTRA := $(NVFLAGS) $(CFLAGS) $(INCLUDES) -DITERATIONS=10 -DBLKSIZE=128 6 | SRC=main.cu 7 | 8 | all: data_base data_bitset data_wlc data_ldb data_fusion data_tc data_ldg data_pq data_best 9 | 10 | data_base: 11 | $(NVCC) $(EXTRA) $(SRC) -DVARIANT=0 -o $@ 12 | mv $@ $(BIN) 13 | 14 | data_ldg: 15 | $(NVCC) $(EXTRA) $(SRC) -DVARIANT=1 -o $@ 16 | mv $@ $(BIN) 17 | 18 | data_bitset: 19 | $(NVCC) $(EXTRA) $(SRC) -DVARIANT=2 -o $@ 20 | mv $@ $(BIN) 21 | 22 | data_ldb: 23 | $(NVCC) $(EXTRA) $(B40C_INC) $(SRC) -DVARIANT=6 -o $@ 24 | mv $@ $(BIN) 25 | 26 | data_wlc: 27 | $(NVCC) $(EXTRA) $(SRC) -DVARIANT=5 -o $@ 28 | mv $@ $(BIN) 29 | 30 | data_fusion: 31 | $(NVCC) $(EXTRA) $(SRC) -DVARIANT=4 -o $@ 32 | mv $@ $(BIN) 33 | 34 | data_tc: 35 | $(NVCC) $(EXTRA) $(SRC) -DVARIANT=3 -o $@ 36 | mv $@ $(BIN) 37 | 38 | data_pq: 39 | $(NVCC) $(EXTRA) $(SRC) -DVARIANT=7 -o $@ 40 | mv $@ $(BIN) 41 | 42 | data_best: 43 | $(NVCC) $(EXTRA) $(SRC) -DVARIANT=8 -o $@ 44 | mv $@ $(BIN) 45 | 46 | data_comb1: 47 | $(NVCC) $(EXTRA) $(SRC) -DVARIANT=9 -o $@ 48 | mv $@ $(BIN) 49 | 50 | data_comb2: 51 | $(NVCC) $(EXTRA) $(SRC) -DVARIANT=9 -o $@ 52 | mv $@ $(BIN) 53 | 54 | clean: 
55 | rm data_base data_ldg data_bitset data_ldb data_fusion data_tc data_pq 56 | 57 | -------------------------------------------------------------------------------- /src/cusp/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2009 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | // A simple timer class 20 | 21 | #include 22 | 23 | class timer 24 | { 25 | cudaEvent_t start; 26 | cudaEvent_t end; 27 | 28 | public: 29 | timer() 30 | { 31 | cudaEventCreate(&start); 32 | cudaEventCreate(&end); 33 | cudaEventRecord(start,0); 34 | } 35 | 36 | ~timer() 37 | { 38 | cudaEventDestroy(start); 39 | cudaEventDestroy(end); 40 | } 41 | 42 | float milliseconds_elapsed() 43 | { 44 | float elapsed_time; 45 | cudaEventRecord(end, 0); 46 | cudaEventSynchronize(end); 47 | cudaEventElapsedTime(&elapsed_time, start, end); 48 | return elapsed_time; 49 | } 50 | float seconds_elapsed() 51 | { 52 | return milliseconds_elapsed() / 1000.0; 53 | } 54 | }; 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Copyright 2016 Xuhao Chen, National University of Defense Technology 2 | This is the code for sequential graph coloring on CPU and CUDA code for parallel graph coloring on GPGPUs. 
3 | 4 | Variants: 5 | csrcolor: graph coloring routine of NVIDIA cusparse 6 | 3-step-GM: parallel graph coloring implemented on GPGPUs by Grosset et al. 7 | sequential: sequential graph coloring using FirstFit strategy 8 | datadriven: datadriven implementation of parallel graph coloring using FirstFit strategy 9 | topodriven: topodriven implementation of parallel graph coloring using FirstFit strategy 10 | 11 | 12 | Requirements: 13 | compute capability 3.5 and higher 14 | Kepler or later GPU hardware 15 | CUB v1.1.1 16 | 17 | The instructions below assume CSRCOLOR_CODE has been installed in $CSRCOLOR_CODE_DIR. 18 | 19 | Each variant directory under $CSRCOLOR_CODE_DIR/$VARIANT contains a README that 20 | explains what $VARIANT does, how to run it, details of implementations 21 | and other useful info. 22 | 23 | 24 | INSTALLATION 25 | 26 | You will need to download and install CUB from here: 27 | 28 | http://nvlabs.github.io/cub/ 29 | 30 | Place a symlink to the top-level CUB directory in $CSRCOLOR_CODE_DIR. Assuming 31 | the top-level CUB directory is $CUBDIR: 32 | 33 | $ cd $CSRCOLOR_CODE_DIR 34 | $ ln -s $CUBDIR 35 | 36 | 37 | BUILDING 38 | 39 | Assuming you're in $CSRCOLOR_CODE_DIR: 40 | $ make # compiles all variants 41 | 42 | 43 | RUNNING 44 | 45 | Each variant directory under $CSRCOLOR_CODE_DIR contains a simple `run' script that 46 | runs the application with all recommended inputs. 47 | 48 | 49 | Authors: 50 | Xuhao Chen 51 | Pingfan Li 52 | 53 | Citations: 54 | Pingfan Li et al., High Performance Parallel Graph Coloring on GPGPUs, IPDPSW, 2016 55 | Xuhao Chen et al., Efficient and High-quality Sparse Graph Coloring on the GPU, Tech. Rep. 
NUDT-CS-2016-003, 2016 56 | -------------------------------------------------------------------------------- /src/omp/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | #include 4 | #ifdef ENABLE_OPENMP 5 | #include 6 | #endif 7 | //#define GCC_EXTENSION 8 | #define OPENMP_3_1 9 | 10 | double rtclock() { 11 | struct timezone Tzp; 12 | struct timeval Tp; 13 | int stat; 14 | stat = gettimeofday (&Tp, &Tzp); 15 | if (stat != 0) printf("Error return from gettimeofday: %d",stat); 16 | return(Tp.tv_sec + Tp.tv_usec*1.0e-6); 17 | } 18 | 19 | template 20 | inline T my_fetch_add(T *ptr, T val) { 21 | #ifdef ENABLE_OPENMP 22 | #ifdef GCC_EXTENSION 23 | return __sync_fetch_and_add(ptr,val); 24 | #endif 25 | #ifdef OPENMP_3_1 26 | T old; 27 | #pragma omp atomic capture 28 | {old = *ptr; *ptr += val;} 29 | return old; 30 | #endif 31 | #else 32 | T old; old = *ptr; *ptr += val; 33 | return old; 34 | #endif 35 | } 36 | 37 | template 38 | inline T my_fetch_sub(T *ptr, T val) { 39 | #ifdef ENABLE_OPENMP 40 | #ifdef GCC_EXTENSION 41 | return __sync_fetch_and_sub(ptr,val); 42 | #endif 43 | #ifdef OPENMP_3_1 44 | T old; 45 | #pragma omp atomic capture 46 | {old = *ptr; *ptr -= val;} 47 | return old; 48 | #endif 49 | #else 50 | T old; old = *ptr; *ptr -= val; 51 | return old; 52 | #endif 53 | } 54 | ; 55 | 56 | template 57 | inline T my_compare_swap(T *ptr, T old_val, T new_val) { 58 | #ifdef ENABLE_OPENMP 59 | #ifdef GCC_EXTENSION 60 | return __sync_val_compare_and_swap(ptr,old_val,new_val); 61 | #endif 62 | #ifdef OPENMP_3_1 63 | T old = *ptr; 64 | #pragma omp critical 65 | { 66 | if(*ptr == old_val) { 67 | *ptr = new_val; 68 | } 69 | } 70 | return old; 71 | #endif 72 | #else 73 | T old = *ptr; 74 | if(*ptr == old_val) *ptr = new_val; 75 | return old; 76 | #endif 77 | } 78 | ; 79 | 80 | template 81 | inline T atomicMin(T *ptr, T val) { 82 | T old = *ptr; 83 | #ifdef ENABLE_OPENMP 84 | #pragma omp 
critical 85 | #endif 86 | {if(val < *ptr) *ptr = val;} 87 | return old; 88 | } 89 | ; 90 | 91 | void __syncthreads() { 92 | #ifdef ENABLE_OPENMP 93 | #ifdef GCC_EXTENSION 94 | //#pragma omp barrier 95 | //__sync_synchronize(); 96 | #endif 97 | #ifdef OPENMP_3_1 98 | #pragma omp barrier 99 | #endif 100 | #else 101 | #endif 102 | } 103 | #endif 104 | -------------------------------------------------------------------------------- /include/list.h: -------------------------------------------------------------------------------- 1 | #ifndef LSG_LIST 2 | #define LSG_LIST 3 | 4 | typedef struct List { 5 | __device__ List(unsigned size); 6 | __device__ void init(unsigned *mem, unsigned size, unsigned cap); 7 | __device__ void push(unsigned item); 8 | __device__ unsigned *toArray(); 9 | __device__ void clear(); 10 | __device__ unsigned size(); 11 | __device__ void uniq(unsigned *mark, unsigned maxelement); 12 | 13 | unsigned *array; 14 | unsigned nitems; 15 | unsigned capacity; 16 | } List; 17 | 18 | __device__ List::List(unsigned size) { 19 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 20 | capacity = 0; 21 | array = NULL; 22 | nitems = 0; 23 | 24 | if (size) { 25 | array = (unsigned *)malloc(size * sizeof(unsigned)); 26 | if (array == NULL) { 27 | printf("%s(%d): thread %d: Error: malloc of %d unsigned returned no memory.\n", __FILE__, __LINE__, id, size); 28 | } else { 29 | capacity = size; 30 | } 31 | } 32 | } 33 | __device__ void List::init(unsigned *mem, unsigned size, unsigned cap) { 34 | array = mem; 35 | nitems = size; 36 | capacity = cap; 37 | } 38 | __device__ void List::push(unsigned item) { 39 | if (array && nitems < capacity) { 40 | array[nitems++] = item; 41 | } else { 42 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 43 | printf("%s(%d): thread %d: Error: buffer overflow, capacity=%d.\n", __FILE__, __LINE__, id, capacity); 44 | } 45 | } 46 | __device__ unsigned *List::toArray() { 47 | return array; 48 | } 49 | __device__ void List::clear() { 
50 | if (array) free(array); 51 | nitems = 0; 52 | capacity = 0; 53 | } 54 | __device__ unsigned List::size() { 55 | return nitems; 56 | } 57 | __device__ void List::uniq(unsigned *mark, unsigned maxelement) { 58 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 59 | unsigned mysize = size(); 60 | if (mysize == 0) return; 61 | 62 | unsigned *newarray = (unsigned *)malloc(mysize * sizeof(unsigned)); 63 | if (newarray == NULL) { 64 | printf("%s(%d): thread %d: Error: malloc of %d unsigned returned no memory.\n", __FILE__, __LINE__, id, mysize); 65 | return; 66 | } 67 | unsigned *insertptr = newarray; 68 | 69 | for (unsigned ii = 0; ii < mysize; ++ii) { 70 | unsigned element = array[ii]; 71 | if (element < maxelement && mark[element] == id) { // this thread didn't succeed in marking this element. 72 | *insertptr++ = element; 73 | } 74 | } 75 | clear(); 76 | init(newarray, insertptr - newarray, mysize); 77 | } 78 | #endif 79 | -------------------------------------------------------------------------------- /src/cusp/vertex_coloring.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "timer.h" 9 | 10 | template 11 | void coloring(const MatrixType& G) 12 | { 13 | typedef typename MatrixType::index_type IndexType; 14 | typedef cusp::csr_matrix GraphType; 15 | 16 | GraphType G_csr(G); 17 | cusp::array1d colors(G.num_rows, 0); 18 | 19 | timer t; 20 | size_t max_color = cusp::graph::vertex_coloring(G_csr, colors); 21 | std::cout << "Coloring time : " << t.milliseconds_elapsed() << " (ms)." 
/*
 * KernelConfig: holds the launch geometry (number of blocks, threads per block)
 * for a problem of a given size, together with the properties of the CUDA device
 * selected at construction time.
 */
typedef struct KernelConfig {
	unsigned device;		// CUDA device index, taken from the constructor argument
	unsigned problemsize;		// number of work items to cover; init() resets it to 0
	unsigned nblocks, blocksize;	// grid size and threads per block
	cudaDeviceProp dp;		// filled in by init() through cudaGetDeviceProperties()

	KernelConfig(unsigned ldevice = 0);	// stores the device index, then calls init()
	void init();				// queries the device and applies the default block size
	unsigned setProblemSize(unsigned size);	// stores size and returns the result of calculate()
	unsigned setNumberOfBlocks(unsigned lnblocks);		// overrides nblocks; returns the new value
	unsigned setNumberOfBlockThreads(unsigned lblocksize);	// overrides blocksize; returns the new value
	unsigned setMaxThreadsPerBlock();	// sets blocksize to getMaxThreadsPerBlock()
	unsigned getNumberOfBlocks();
	unsigned getNumberOfBlockThreads();
	unsigned getNumberOfTotalThreads();	// nblocks * blocksize

	unsigned calculate();			// nblocks = ceil(problemsize / blocksize); returns 1 if blocksize == 0, else 0
	unsigned getMaxThreadsPerBlock();	// returns dp.maxThreadsDim[0]
	unsigned getMaxBlocks();		// returns dp.maxGridSize[0]
	unsigned getMaxSharedMemoryPerBlock();	// returns dp.sharedMemPerBlock
	unsigned getNumberOfSMs();		// returns dp.multiProcessorCount
	bool coversProblem(unsigned size = 0);	// true if nblocks * blocksize >= size (size 0 means problemsize)
	unsigned getProblemSize();
} KernelConfig;
/*
 * Worklist: an int queue stored in device memory with a host-side mirror.
 *
 * dwl    - device buffer holding the items
 * wl     - host buffer of the same capacity, refreshed by update_cpu()
 * dnsize - device copy of the capacity
 * length - host copy of the capacity
 * dindex - device counter: number of items pushed so far
 */
struct Worklist {
	int *dwl, *wl;
	int length, *dnsize;
	int *dindex;

	/* Allocates host and device storage for `nsize` items and zeroes the counter. */
	Worklist(size_t nsize) {
		// The device-side capacity is an int. Narrow explicitly so the 4-byte copy
		// below sends the value itself rather than the first four bytes of a
		// size_t (which is only correct on little-endian hosts).
		int capacity = (int) nsize;

		wl = (int *) calloc(nsize, sizeof(int));
		if (wl == NULL && nsize != 0) {
			fprintf(stderr, "Worklist: host allocation of %d items failed.\n", capacity);
			exit(1);
		}
		CUDA_SAFE_CALL(cudaMalloc(&dwl, nsize * sizeof(int)));
		CUDA_SAFE_CALL(cudaMalloc(&dnsize, 1 * sizeof(int)));
		CUDA_SAFE_CALL(cudaMalloc(&dindex, 1 * sizeof(int)));
		CUDA_SAFE_CALL(cudaMemcpy(dnsize, &capacity, 1 * sizeof(int), cudaMemcpyHostToDevice));
		CUDA_SAFE_CALL(cudaMemcpy((void *) dindex, &zero, 1 * sizeof(zero), cudaMemcpyHostToDevice));
		CUDA_SAFE_CALL(cudaMemcpy(&length, dnsize, 1 * sizeof(int), cudaMemcpyDeviceToHost));
	}

	// The buffers are intentionally not released here.
	// NOTE(review): presumably because Worklist objects are copied by value (e.g. as
	// kernel arguments) and freeing in every copy's destructor would release memory
	// still in use - confirm before enabling the cudaFree below.
	~Worklist() {/*CUDA_SAFE_CALL(cudaFree(dwl));*/}

	/* Item count that is safe to copy: the device counter clamped to [0, length]. */
	int copyable_items() {
		// push() increments the counter even when the list is full and pop() can
		// drive it below zero, so the raw value may lie outside the buffer.
		int count = nitems();
		if (count < 0) return 0;
		return count > length ? length : count;
	}

	/* Refreshes the host mirror `wl` from the device buffer. */
	void update_cpu() {
		int nsize = copyable_items();
		CUDA_SAFE_CALL(cudaMemcpy(wl, dwl, nsize * sizeof(int), cudaMemcpyDeviceToHost));
	}

	/* Copies the items to the host and prints them as "index value" pairs. */
	void display_items() {
		int nsize = copyable_items();
		CUDA_SAFE_CALL(cudaMemcpy(wl, dwl, nsize * sizeof(int), cudaMemcpyDeviceToHost));
		printf("WL: ");
		for(int i = 0; i < nsize; i++)
			printf("%d %d, ", i, wl[i]);
		printf("\n");
	}

	/* Empties the list by zeroing the device counter; the buffers are kept. */
	void reset() {
		CUDA_SAFE_CALL(cudaMemcpy((void *) dindex, &zero, 1 * sizeof(zero), cudaMemcpyHostToDevice));
	}

	/* Returns the raw device counter (may exceed the capacity after an overflow). */
	int nitems() {
		int index;
		CUDA_SAFE_CALL(cudaMemcpy(&index, (void *) dindex, 1 * sizeof(index), cudaMemcpyDeviceToHost));
		return index;
	}

	/* Appends `item`; returns 1 on success, 0 when the list is full. */
	__device__ int push(int item) {
		int lindex = atomicAdd((int *) dindex, 1);
		if(lindex >= *dnsize)
			return 0;
		dwl[lindex] = item;
		return 1;
	}

	/* Removes the last item into `item`; returns 1 on success, 0 when empty. */
	__device__ int pop(int &item) {
		int lindex = atomicSub((int *) dindex, 1);
		if(lindex <= 0) {
			// Plain (non-atomic) store that undoes the underflow.
			// NOTE(review): this can race with concurrent push()/pop() calls.
			*dindex = 0;
			return 0;
		}
		item = dwl[lindex - 1];
		return 1;
	}
};
typename T::TempStorage temp_storage; 74 | __shared__ int queue_index; 75 | int total_items = 0; 76 | int thread_data = nitem; 77 | T(temp_storage).ExclusiveSum(thread_data, thread_data, total_items); 78 | __syncthreads(); 79 | if(threadIdx.x == 0) { 80 | queue_index = atomicAdd((int *) dindex, total_items); 81 | } 82 | __syncthreads(); 83 | if(nitem == 1) { 84 | if(queue_index + thread_data >= *dnsize) { 85 | printf("GPU: exceeded length: %d %d %d %d %d\n", queue_index, thread_data, *dnsize, total_items, *dindex); 86 | return 0; 87 | } 88 | //cub::ThreadStore(dwl + queue_index + thread_data, item); 89 | dwl[queue_index + thread_data] = item; 90 | } 91 | __syncthreads(); 92 | return total_items; 93 | } 94 | 95 | template 96 | __device__ __forceinline__ 97 | int push_nitems(int n_items, int *items, int threads_per_block) { 98 | __shared__ typename T::TempStorage temp_storage; 99 | __shared__ int queue_index; 100 | int total_items; 101 | int thread_data = n_items; 102 | T(temp_storage).ExclusiveSum(thread_data, thread_data, total_items); 103 | if(threadIdx.x == 0) { 104 | queue_index = atomicAdd((int *) dindex, total_items); 105 | //printf("queueindex: %d %d %d %d %d\n", blockIdx.x, threadIdx.x, queue_index, thread_data + n_items, total_items); 106 | } 107 | __syncthreads(); 108 | for(int i = 0; i < n_items; i++) { 109 | //printf("pushing %d to %d\n", items[i], queue_index + thread_data + i); 110 | if(queue_index + thread_data + i >= *dnsize) { 111 | printf("GPU: exceeded length: %d %d %d %d\n", queue_index, thread_data, i, *dnsize); 112 | return 0; 113 | } 114 | dwl[queue_index + thread_data + i] = items[i]; 115 | } 116 | return total_items; 117 | } 118 | 119 | __device__ int pop_id(int id, int &item) { 120 | if(id < *dindex) { 121 | //item = cub::ThreadLoad(dwl + id); 122 | item = dwl[id]; 123 | return 1; 124 | } 125 | return 0; 126 | } 127 | }; 128 | -------------------------------------------------------------------------------- /include/common.h: 
/*
 * Returns the wall-clock time reported by gettimeofday() as seconds in a double,
 * with the microsecond part folded in as a fraction. A failing gettimeofday() is
 * reported on stdout and the (then unspecified) time value is still returned.
 */
double rtclock()
{
	struct timeval now;
	struct timezone zone;

	const int rc = gettimeofday(&now, &zone);
	if (rc != 0) {
		printf("Error return from gettimeofday: %d", rc);
	}

	const double seconds = now.tv_sec + now.tv_usec * 1.0e-6;
	return seconds;
}
// from CUDA SDK.
/*
 * Maps a compute capability (major.minor) to the number of CUDA cores per
 * streaming multiprocessor. Returns -1, after printing a diagnostic, when the
 * capability is not in the table.
 */
inline int ConvertSMVer2Cores(int major, int minor)
{
	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
	typedef struct {
		int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
		int Cores;
	} sSMtoCores;

	// Built once; values follow the table in the CUDA samples' helper_cuda.h.
	static const sSMtoCores nGpuArchCoresPerSM[] =
	{	{ 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
		{ 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
		{ 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
		{ 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
		{ 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
		{ 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
		{ 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
		{ 0x32, 192 }, // Kepler Generation (SM 3.2)
		{ 0x35, 192 }, // Kepler Generation (SM 3.5) GK110 class
		{ 0x37, 192 }, // Kepler Generation (SM 3.7)
		{ 0x50, 128 }, // Maxwell Generation (SM 5.0)
		{ 0x52, 128 }, // Maxwell Generation (SM 5.2)
		{ 0x53, 128 }, // Maxwell Generation (SM 5.3)
		{ 0x60,  64 }, // Pascal Generation (SM 6.0)
		{ 0x61, 128 }, // Pascal Generation (SM 6.1)
		{ 0x62, 128 }, // Pascal Generation (SM 6.2)
		{ 0x70,  64 }, // Volta Generation (SM 7.0)
		{ 0x72,  64 }, // Xavier Generation (SM 7.2)
		{ 0x75,  64 }, // Turing Generation (SM 7.5)
		{ 0x80,  64 }, // Ampere Generation (SM 8.0)
		{ 0x86, 128 }, // Ampere Generation (SM 8.6)
		{ 0x87, 128 }, // Ampere Generation (SM 8.7)
		{ 0x89, 128 }, // Ada Generation (SM 8.9)
		{ 0x90, 128 }, // Hopper Generation (SM 9.0)
		{   -1,  -1 }  // end-of-table sentinel
	};

	const int wanted = (major << 4) + minor;
	for (int index = 0; nGpuArchCoresPerSM[index].SM != -1; index++) {
		if (nGpuArchCoresPerSM[index].SM == wanted) {
			return nGpuArchCoresPerSM[index].Cores;
		}
	}
	printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
	return -1;
}
/*
 * Turns the highest color index found in coloring[0..n) into a color count.
 *
 * On entry *ncolors holds a lower bound for the highest index (the caller in this
 * file passes 0). On return it holds max(*ncolors, largest entry) + 1, because the
 * colors assigned by FirstFit in this file start at 0. An empty range leaves
 * *ncolors unchanged.
 */
void findMax(int *coloring, int n, int *ncolors) {
	if (n <= 0)
		return;
	for (int i = 0; i < n; i++) {
		if (coloring[i] > *ncolors)
			*ncolors = coloring[i];
	}
	// Parenthesised on purpose: "*ncolors ++" advances the pointer and leaves the count alone.
	(*ncolors)++;
}
__syncthreads(); 109 | wlsz = outwlptr->getSize(); 110 | tmp = inwlptr; inwlptr = outwlptr; outwlptr = tmp; 111 | outwlptr->clear(); 112 | } 113 | #endif 114 | endtime = rtclock(); 115 | findMax(coloring, m, &colors[i]); 116 | runtime[i] = (1000.0f * (endtime - starttime)); 117 | } 118 | double total_time = 0.0; 119 | int total_colors = 0; 120 | int total_iterations = 0; 121 | for (int i = 0; i < ITERATIONS; i++) { 122 | total_time += runtime[i]; 123 | total_colors += colors[i]; 124 | total_iterations += iteration[i]; 125 | printf("[%d %.2f %d] ", colors[i], runtime[i], iteration[i]); 126 | } 127 | double avg_time = (double)total_time / ITERATIONS; 128 | double avg_colors = (double)total_colors / ITERATIONS; 129 | double avg_iterations = (double)total_iterations / ITERATIONS; 130 | printf("\navg_time %f ms avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations); 131 | } 132 | -------------------------------------------------------------------------------- /include/sharedptr.h: -------------------------------------------------------------------------------- 1 | /* 2 | @file 3 | @section License 4 | TODO 5 | 6 | @section description 7 | 8 | Convenience class for shared CPU/GPU allocations. 9 | 10 | Based on the X10 Runtime ideas described in Pai et al. in PACT 2012. 
/*
 * Shared<T>: an array of `nmemb` elements that can be viewed from the CPU
 * (slot 0) and from a GPU (slots >= 1). Each slot has its own lazily allocated
 * buffer; owner[] records which slots currently hold up-to-date data, and the
 * *_rd_ptr / *_wr_ptr accessors copy from an owning slot when the requested slot
 * is stale.
 */
template <typename T>
class Shared {
	T **ptrs;		// per-slot buffer, NULL until first use
	bool *owner;		// owner[i]: slot i holds current data
	bool *isCPU;		// isCPU[i]: slot i is host memory
	int max_devices;	// number of slots (0 until the tables are created)
	size_t nmemb;		// element count of every buffer

	/* Creates the slot tables: two slots, slot 0 on the CPU, every slot an owner. */
	void init_tables() {
		max_devices = 2;
		ptrs = (T **) calloc(max_devices, sizeof(T *));
		owner = (bool *) calloc(max_devices, sizeof(bool));
		isCPU = (bool *) calloc(max_devices, sizeof(bool));
		assert(ptrs != NULL && owner != NULL && isCPU != NULL);

		isCPU[0] = true;

		for(int i = 0; i < max_devices; i++)
			owner[i] = true;
	}

public:

	/* Creates an unsized object; alloc() must be called before any accessor. */
	Shared() {
		nmemb = 0;
		// Keep the object in a defined state so that free() on an object that was
		// never given a size is a no-op instead of reading garbage.
		ptrs = NULL;
		owner = NULL;
		isCPU = NULL;
		max_devices = 0;
	}

	/* Creates an object for `nmemb` elements; buffers are allocated on first access. */
	Shared(size_t nmemb) {
		this->nmemb = nmemb;
		init_tables();
	}

	/* Gives a default-constructed object its size; may only be used while nmemb is 0. */
	void alloc(size_t nmemb) {
		assert(this->nmemb == 0);

		this->nmemb = nmemb;
		init_tables();
	}

	/* Releases the buffers of all slots; the slot tables stay usable afterwards. */
	void free()
	{
		for(int i = 0; i < max_devices; i++)
			free_device(i);
	}

	/* Releases the buffer of one slot. Returns false only when cudaFree fails. */
	bool free_device(int device = 0)
	{
		assert(device >= 0 && device < max_devices);

		if(!ptrs[device])
			return true;

		if(isCPU[device])
			::free(ptrs[device]);
		else if(cudaFree(ptrs[device]) != cudaSuccess)
			return false;

		// Reset for host buffers as well: otherwise a second free() releases the
		// same block again and cpu_rd_ptr() hands out freed memory.
		ptrs[device] = NULL;
		return true;
	}

	/* Stores the first owning slot in `o`; returns false when no slot is an owner. */
	bool find_owner(int &o)
	{
		int i;
		for(i = 0; i < max_devices; i++)
			if(owner[i]) {
				o = i;
				break;
			}

		return i < max_devices;
	}

	/* Host pointer for reading: refreshes slot 0 from an owner when it is stale. */
	T *cpu_rd_ptr()
	{
		if(ptrs[0] == NULL)
			ptrs[0] = (T *) calloc(nmemb, sizeof(T));

		if(!owner[0])
		{
			int o;
			if(find_owner(o))
				copy(o, 0);

			owner[0] = true;
		}

		return ptrs[0];
	}

	/*
	 * Host pointer for writing: like cpu_rd_ptr(), but makes slot 0 the only
	 * owner. With overwrite == true the refresh copy is skipped.
	 */
	T *cpu_wr_ptr(bool overwrite = false)
	{
		if(ptrs[0] == NULL)
			ptrs[0] = (T *) calloc(nmemb, sizeof(T));

		if(!owner[0])
		{
			if(!overwrite)
			{
				int o;
				if(find_owner(o))
					copy(o, 0);
			}

			owner[0] = true;
		}

		for(int i = 1; i < max_devices; i++)
			owner[i] = false;

		return ptrs[0];
	}

	/* Device pointer for reading: refreshes the slot from an owner when it is stale. */
	T *gpu_rd_ptr(int device = 1) /* device >= 1 */
	{
		assert(device >= 1 && device < max_devices);

		if(ptrs[device] == NULL)
			CUDA_SAFE_CALL(cudaMalloc(&ptrs[device], nmemb * sizeof(T)));

		if(!owner[device])
		{
			int o;
			if(find_owner(o))
				copy(o, device);

			owner[device] = true;
		}

		return ptrs[device];
	}

	/*
	 * Device pointer for writing: like gpu_rd_ptr(), but makes `device` the only
	 * owner. With overwrite == true the refresh copy is skipped.
	 */
	T *gpu_wr_ptr(bool overwrite = false, int device = 1)
	{
		assert(device >= 1 && device < max_devices);

		if(ptrs[device] == NULL)
			CUDA_SAFE_CALL(cudaMalloc(&ptrs[device], nmemb * sizeof(T)));

		if(!owner[device])
		{
			if(!overwrite)
			{
				int o;
				if(find_owner(o))
					copy(o, device);
			}

			owner[device] = true;
		}

		for(int i = 0; i < max_devices; i++)
			if(i != device)
				owner[i] = false;

		return ptrs[device];
	}

	/* Copies all nmemb elements from slot `src` to slot `dst`; no-op when src has no buffer. */
	void copy(int src, int dst)
	{
		if(!ptrs[src])
			return;

		assert(ptrs[dst]);

		if(isCPU[dst] && !isCPU[src]) {
			CUDA_SAFE_CALL(cudaMemcpy(ptrs[dst], ptrs[src], nmemb * sizeof(T), cudaMemcpyDeviceToHost));
		} else if (!isCPU[dst] && !isCPU[src]) {
			CUDA_SAFE_CALL(cudaMemcpy(ptrs[dst], ptrs[src], nmemb * sizeof(T), cudaMemcpyDeviceToDevice));
		} else if (!isCPU[dst] && isCPU[src]) {
			CUDA_SAFE_CALL(cudaMemcpy(ptrs[dst], ptrs[src], nmemb * sizeof(T), cudaMemcpyHostToDevice));
		} else
			abort(); // cpu-to-cpu not implemented

	}
};
-------------------------------------------------------------------------------- /src/omp/kernel1.h: -------------------------------------------------------------------------------- 1 | /* 2 | #include "worklist.h" 3 | 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | */ 9 | 10 | #define MAXCOLOR 128 11 | 12 | void FirstFit(int m, int nnz, int *csrRowPtr, int *csrColInd, Worklist &inwl, int *coloring) 13 | { 14 | unsigned start, end; 15 | int ii; 16 | 17 | start = inwl.start; 18 | end = inwl.end; 19 | 20 | 21 | #ifdef ENABLE_OPENMP 22 | #pragma omp parallel for 23 | #endif 24 | for (ii = start; ii < end; ii++) { 25 | int j, node, neighbors, neighbor_j; 26 | 27 | node = inwl.getItem(ii); 28 | int neighboroffset = csrRowPtr[node]; 29 | neighbors = csrRowPtr[node + 1] - neighboroffset; 30 | 31 | unsigned v[MAXCOLOR / 32]; 32 | v[0] = 0xfffffffe; 33 | for (j = 1; j < MAXCOLOR / 32; j++) 34 | v[j] = 0xffffffff; 35 | 36 | for (j = 0; j < neighbors; j++) { 37 | neighbor_j = csrColInd[neighboroffset + j]; 38 | int color_j = coloring[neighbor_j]; 39 | if (color_j) 40 | v[color_j / 32] &= ~(1 << (color_j % 32)); 41 | } 42 | 43 | int c = 32; 44 | for (int i = 0; i < MAXCOLOR / 32; i++) { 45 | if (v[i] != 0) { 46 | v[i] &= -(signed)v[i]; 47 | if (v[i]) c--; 48 | if (v[i] & 0x0000ffff) c -= 16; 49 | if (v[i] & 0x00ff00ff) c -= 8; 50 | if (v[i] & 0x0f0f0f0f) c -= 4; 51 | if (v[i] & 0x33333333) c -= 2; 52 | if (v[i] & 0x55555555) c -= 1; 53 | break; 54 | } 55 | else 56 | c += 32; 57 | } 58 | coloring[node] = c; 59 | } 60 | } 61 | 62 | void conflictDetect(int m, int nnz, int *csrRowPtr, int *csrColInd, Worklist &inwl, Worklist &outwl, int *coloring) 63 | { 64 | unsigned start, end; 65 | int ii; 66 | //inwl.myItems(start, end); 67 | start = inwl.start; 68 | end = inwl.end; 69 | //printf("inwl=%d, outwl=%d, start=%d, end=%d\n", inwl.getSize(), outwl.getSize(), start, end); 70 | 71 | #ifdef ENABLE_OPENMP 72 | #pragma omp parallel for 73 | #endif 74 | for (ii = start; ii < 
/*
 * Raises *ncolors to the largest value in coloring[0..n) when that value exceeds
 * it; otherwise *ncolors is left as it was. Nothing is read or written when n <= 0.
 */
void findMax(int *coloring, int n, int *ncolors) {
	int idx = 0;
	while (idx < n) {
		const int candidate = coloring[idx];
		if (!(candidate <= *ncolors)) {
			*ncolors = candidate;
		}
		++idx;
	}
}
__syncthreads(); 143 | //printf("ok\n"); 144 | //conflictDetect(graph, *inwlptr, *outwlptr, coloring); 145 | conflictDetect(m, nnz, csrRowPtr, csrColInd, *inwlptr, *outwlptr, coloring); 146 | __syncthreads(); 147 | //printf("ok\n"); 148 | 149 | //printf("iteration %d:inwl=%d, outwl=%d\n", iteration, wlsz, outwlptr->getSize()); 150 | wlsz = outwlptr->getSize(); 151 | 152 | tmp = inwlptr; inwlptr = outwlptr; outwlptr = tmp; 153 | outwlptr->clear(); 154 | } 155 | #endif 156 | endtime = rtclock(); 157 | 158 | //verify<<<(nnodes - 1) / 1024 + 1, 1024>>>(graph, coloring, correct); 159 | //CUDA_SAFE_CALL(cudaDeviceSynchronize()); 160 | //if (*correct) { 161 | //findMax<<<(nnodes - 1) / 1024 + 1, 1024>>>(coloring, nnodes, ncolors); 162 | findMax(coloring, m, ncolors); 163 | //CUDA_SAFE_CALL(cudaDeviceSynchronize()); 164 | //} 165 | 166 | runtime = (1000.0f * (endtime - starttime)); 167 | printf("runtime=%f\tcolors=%d\t", runtime, *ncolors); 168 | } 169 | -------------------------------------------------------------------------------- /src/data/kernel_ldb.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | #include 4 | #include "gbar.cuh" 5 | #include "cuda_launch_config.hpp" 6 | #include "cutil_subset.h" 7 | #include "common.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | //#include 19 | #include 20 | #define INIT_VAL -1 21 | using namespace b40c; 22 | using namespace graph; 23 | 24 | void verify_color(unsigned *dist, int m, int *csrRowPtr, int *csrColInd, unsigned *nerr) { 25 | for (int nn = 0; nn < m; nn ++) { 26 | int neighbor_offset = csrRowPtr[nn]; 27 | int neighbor_size = csrRowPtr[nn + 1] - neighbor_offset; 28 | for (unsigned ii = 0; ii < neighbor_size; ++ii) { 29 | int v = csrColInd[neighbor_offset + ii]; 30 | unsigned wt = 1; 
/*
 * Writes the m values of h_dist to the text file `fname` as "node:value" lines
 * between a header and a closing bracket.
 *
 * Takes ownership of h_dist: the array is released with free() before returning,
 * also when the file cannot be opened. An open failure is reported on stderr.
 */
void write_solution(const char *fname, int m, unsigned *h_dist) {
	assert(h_dist != NULL);
	printf("Writing solution to %s\n", fname);

	FILE *f = fopen(fname, "w");
	if (f == NULL) {
		// Passing a NULL stream to fprintf is undefined; report and bail out.
		fprintf(stderr, "Error: cannot open %s for writing.\n", fname);
		free(h_dist);
		return;
	}

	fprintf(f, "Computed solution (source dist): [");
	for(int node = 0; node < m; node++) {
		fprintf(f, "%d:%u\n ", node, h_dist[node]);
	}
	fprintf(f, "]");

	// Close the stream so the data is flushed and the handle is not leaked.
	fclose(f);
	free(h_dist);
}
csr_problem; 77 | if (csr_problem.FromHostProblem(false, csr_graph.nodes, csr_graph.edges, csr_graph.column_indices, csr_graph.row_offsets, 1)) exit(1); 78 | cudaError_t retval = cudaSuccess; 79 | double runtime[ITERATIONS]; 80 | int colors[ITERATIONS]; 81 | double starttime, endtime; 82 | for (int i = 0; i < ITERATIONS; i++) { 83 | starttime = rtclock(); 84 | //if (retval = csr_problem.Reset(hybrid.GetFrontierType(), 1.3)) 85 | if (retval = csr_problem.Reset(two_phase.GetFrontierType(), 1.3)) 86 | return; 87 | //if (retval = hybrid.EnactSearch(csr_problem, 0)) { 88 | if (retval = two_phase.EnactIterativeSearch(csr_problem, 0)) { 89 | if (retval && (retval != cudaErrorInvalidDeviceFunction)) { 90 | exit(1); 91 | } 92 | } 93 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 94 | endtime = rtclock(); 95 | runtime[i] = 1000.0f * (endtime - starttime); 96 | //colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum()); 97 | } 98 | unsigned *h_dist; 99 | h_dist = (unsigned *) malloc(m * sizeof(unsigned)); 100 | assert(h_dist != NULL); 101 | if (csr_problem.ExtractResults((int *) h_dist)) exit(1); 102 | for(int i = 0; i < m; i++) 103 | if((signed) h_dist[i] == -1) 104 | h_dist[i] = 1000000000; 105 | printf("Done!\n"); 106 | unsigned nerr = 0; 107 | printf("verifying.\n"); 108 | verify_color(h_dist, m, csrRowPtr, csrColInd, &nerr); 109 | printf("\tno of errors = %d.\n", nerr); 110 | write_solution("color-output.txt", m, h_dist); 111 | exit(0); 112 | /* 113 | cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost); 114 | *ncolors = colors[ITERATIONS - 1]; 115 | double totaltime = 0.0; 116 | int totalcolors = 0; 117 | for (int i = 0; i < ITERATIONS; i++) { 118 | totaltime += runtime[i]; 119 | totalcolors += colors[i]; 120 | printf("[%d %f] ", colors[i], runtime[i]); 121 | } 122 | double avgtime = (double)totaltime / ITERATIONS; 123 | double avgcolors = (double)totalcolors / ITERATIONS; 124 | printf("\navgtime=%f ms, avgcolors = 
%f\n", avgtime, avgcolors); 125 | */ 126 | } 127 | -------------------------------------------------------------------------------- /include/util.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | #define FORMATSTR "%d %d %d" 11 | 12 | unsigned allocOnHost(Graph &gg) { 13 | gg.destination = (unsigned int *)malloc((gg.nedges+1) * sizeof(unsigned int)); // first entry acts as null. 14 | gg.weight = (foru *)malloc((gg.nedges+1) * sizeof(foru)); // first entry acts as null. 15 | gg.psrc = (unsigned int *)calloc(gg.nnodes+1, sizeof(unsigned int)); // init to null. 16 | gg.psrc[gg.nnodes] = gg.nedges; // last entry points to end of edges, to avoid thread divergence in drelax. 17 | gg.noutgoing = (unsigned int *)calloc(gg.nnodes, sizeof(unsigned int)); // init to 0. 18 | gg.srcsrc = (unsigned int *)malloc(gg.nnodes * sizeof(unsigned int)); 19 | 20 | return 0; 21 | } 22 | void progressPrint(unsigned maxii, unsigned ii) { 23 | const unsigned nsteps = 10; 24 | unsigned ineachstep = (maxii / nsteps); 25 | if (ii % ineachstep == 0) { 26 | printf("\t%3d%%\r", ii*100/maxii + 1); 27 | fflush(stdout); 28 | } 29 | } 30 | unsigned readFromEdges(char file[], Graph &gg) { 31 | std::ifstream cfile; 32 | cfile.open(file); 33 | 34 | std::string str; 35 | getline(cfile, str); 36 | sscanf(str.c_str(), "%d %d", &gg.nnodes, &gg.nedges); 37 | 38 | printf("file %s: nnodes=%d, nedges=%d.\n", file, gg.nnodes, gg.nedges); 39 | allocOnHost(gg); 40 | for (unsigned ii = 0; ii < gg.nnodes; ++ii) { 41 | gg.srcsrc[ii] = ii; 42 | } 43 | 44 | 45 | unsigned int prevnode = 0; 46 | unsigned int tempsrcnode; 47 | unsigned int ncurroutgoing = 0; 48 | unsigned unweightedgraph = 0; 49 | for (unsigned ii = 0; ii < gg.nedges; ++ii) { 50 | getline(cfile, str); 51 | if (unweightedgraph) { 52 | sscanf(str.c_str(), "%d %d", &tempsrcnode, &gg.destination[ii+1]); 53 | gg.weight[ii+1] 
= 0; 54 | } else { 55 | sscanf(str.c_str(), FORMATSTR, &tempsrcnode, &gg.destination[ii+1], &gg.weight[ii+1]); 56 | } 57 | if (prevnode == tempsrcnode) { 58 | if (ii == 0) { 59 | gg.psrc[tempsrcnode] = ii + 1; 60 | } 61 | ++ncurroutgoing; 62 | } else { 63 | gg.psrc[tempsrcnode] = ii + 1; 64 | if (ncurroutgoing) { 65 | gg.noutgoing[prevnode] = ncurroutgoing; 66 | } 67 | prevnode = tempsrcnode; 68 | ncurroutgoing = 1; // not 0. 69 | } 70 | 71 | progressPrint(gg.nedges, ii); 72 | } 73 | gg.noutgoing[prevnode] = ncurroutgoing; // last entries. 74 | 75 | printf("\n"); 76 | cfile.close(); 77 | return 0; 78 | } 79 | 80 | unsigned readFromGR(char file[], Graph &gg) { 81 | std::ifstream cfile; 82 | cfile.open(file); 83 | 84 | // copied from GaloisCpp/trunk/src/FileGraph.h 85 | int masterFD = open(file, O_RDONLY); 86 | if (masterFD == -1) { 87 | printf("FileGraph::structureFromFile: unable to open %s.\n", file); 88 | return 1; 89 | } 90 | 91 | struct stat buf; 92 | int f = fstat(masterFD, &buf); 93 | if (f == -1) { 94 | printf("FileGraph::structureFromFile: unable to stat %s.\n", file); 95 | abort(); 96 | } 97 | size_t masterLength = buf.st_size; 98 | 99 | int _MAP_BASE = MAP_PRIVATE; 100 | //#ifdef MAP_POPULATE 101 | // _MAP_BASE |= MAP_POPULATE; 102 | //#endif 103 | 104 | void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); 105 | if (m == MAP_FAILED) { 106 | m = 0; 107 | printf("FileGraph::structureFromFile: mmap failed.\n"); 108 | abort(); 109 | } 110 | 111 | //parse file 112 | uint64_t* fptr = (uint64_t*)m; 113 | __attribute__((unused)) uint64_t version = le64toh(*fptr++); 114 | assert(version == 1); 115 | uint64_t sizeEdgeTy = le64toh(*fptr++); 116 | uint64_t numNodes = le64toh(*fptr++); 117 | uint64_t numEdges = le64toh(*fptr++); 118 | uint64_t *outIdx = fptr; 119 | fptr += numNodes; 120 | uint32_t *fptr32 = (uint32_t*)fptr; 121 | uint32_t *outs = fptr32; 122 | fptr32 += numEdges; 123 | if (numEdges % 2) fptr32 += 1; 124 | foru *edgeData = (foru 
*)fptr32; 125 | 126 | // cuda. 127 | gg.nnodes = numNodes; 128 | gg.nedges = numEdges; 129 | 130 | printf("file %s: nnodes=%d, nedges=%d.\n", file, gg.nnodes, gg.nedges); 131 | allocOnHost(gg); 132 | 133 | for (unsigned ii = 0; ii < gg.nnodes; ++ii) { 134 | // fill unsigned *noutgoing, *nincoming, *srcsrc, *psrc, *destination; unsigned *weight; 135 | gg.srcsrc[ii] = ii; 136 | if (ii > 0) { 137 | gg.psrc[ii] = le64toh(outIdx[ii - 1]) + 1; 138 | gg.noutgoing[ii] = le64toh(outIdx[ii]) - le64toh(outIdx[ii - 1]); 139 | } else { 140 | gg.psrc[0] = 1; 141 | gg.noutgoing[0] = le64toh(outIdx[0]); 142 | } 143 | for (unsigned jj = 0; jj < gg.noutgoing[ii]; ++jj) { 144 | unsigned edgeindex = gg.psrc[ii] + jj; 145 | unsigned dst = le32toh(outs[edgeindex - 1]); 146 | if (dst >= gg.nnodes) printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, edgeindex); 147 | gg.destination[edgeindex] = dst; 148 | gg.weight[edgeindex] = edgeData[edgeindex - 1]; // Weighted. 149 | //gg.weight[edgeindex] = 1; // Unweighted like wikipedia. 150 | 151 | } 152 | progressPrint(gg.nnodes, ii); 153 | } 154 | printf("\n"); 155 | 156 | cfile.close(); // probably galois doesn't close its file due to mmap. 
157 | return 0; 158 | } 159 | unsigned readInput(char file[], Graph &gg) { 160 | if (strstr(file, ".edges") || strstr(file, ".undirected")) { 161 | return readFromEdges(file, gg); 162 | } else if (strstr(file, ".gr")) { 163 | return readFromGR(file, gg); 164 | } 165 | return 0; 166 | } 167 | 168 | -------------------------------------------------------------------------------- /src/topo/kernel.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | #include "cuda_launch_config.hpp" 4 | #include "cutil_subset.h" 5 | #include "common.h" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #define MAXCOLOR 128 12 | 13 | __global__ void initialize(int *coloring, bool *colored, int m) { 14 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 15 | if (id < m) { 16 | coloring[id] = MAXCOLOR; 17 | colored[id] = false; 18 | } 19 | } 20 | 21 | __global__ void firstFit(int m, int *csrRowPtr, int *csrColInd, int *coloring, bool *changed) { 22 | int id = blockIdx.x * blockDim.x + threadIdx.x; 23 | bool forbiddenColors[MAXCOLOR+1]; 24 | if (coloring[id] == MAXCOLOR) { 25 | for (int i = 0; i < MAXCOLOR; i++) 26 | forbiddenColors[i] = false; 27 | int row_begin = csrRowPtr[id]; 28 | int row_end = csrRowPtr[id + 1]; 29 | for (int offset = row_begin; offset < row_end; offset ++) { 30 | int neighbor = csrColInd[offset]; 31 | int color = coloring[neighbor]; 32 | forbiddenColors[color] = true; 33 | } 34 | int vertex_color; 35 | for (vertex_color = 0; vertex_color < MAXCOLOR; vertex_color++) { 36 | if (!forbiddenColors[vertex_color]) { 37 | coloring[id] = vertex_color; 38 | break; 39 | } 40 | } 41 | assert(vertex_color < MAXCOLOR); 42 | *changed = true; 43 | } 44 | } 45 | 46 | __global__ void conflictResolve(int m, int *csrRowPtr, int *csrColInd, int *coloring, bool *colored) { 47 | int id = blockIdx.x * blockDim.x + threadIdx.x; 
48 | if (!colored[id]) { 49 | int row_begin = csrRowPtr[id]; 50 | int row_end = csrRowPtr[id + 1]; 51 | int offset; 52 | for (offset = row_begin; offset < row_end; offset ++) { 53 | int neighbor = csrColInd[offset]; 54 | if (coloring[id] == coloring[neighbor] && id < neighbor) { 55 | coloring[id] = MAXCOLOR; 56 | break; 57 | } 58 | } 59 | if(offset == row_end) 60 | colored[id] = true; 61 | } 62 | } 63 | 64 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int blksz) { 65 | double starttime, endtime, t1, t2; 66 | double runtime[ITERATIONS]; 67 | int colors[ITERATIONS]; 68 | int iterations[ITERATIONS]; 69 | double avgtime, avgcolors; 70 | int *d_csrRowPtr, *d_csrColInd, *d_coloring; 71 | bool *changed, hchanged; 72 | bool *d_colored; 73 | 74 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int))); 75 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int))); 76 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int))); 77 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_colored, m * sizeof(int))); 78 | CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice)); 79 | CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice)); 80 | 81 | int device = 0; 82 | int deviceCount = 0; 83 | CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount)); 84 | cudaDeviceProp deviceProp; 85 | cudaGetDeviceProperties(&deviceProp, device); 86 | int nSM = deviceProp.multiProcessorCount; 87 | fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 88 | deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 89 | const size_t max_blocks_1 = maximum_residency(firstFit, blksz, 0); 90 | const size_t max_blocks_2 = maximum_residency(conflictResolve, blksz, 0); 91 | printf("max_blocks_1=%d, max_blocks_2=%d\n", max_blocks_1, max_blocks_2); 92 | 93 | 
for (int i = 0; i < ITERATIONS; i++) { 94 | CUDA_SAFE_CALL(cudaMalloc((void **)&changed, sizeof(bool))); 95 | initialize <<<((m - 1) / blksz + 1), blksz>>> (d_coloring, d_colored, m); 96 | iterations[i] = 0; 97 | starttime = rtclock(); 98 | do { 99 | iterations[i] ++; 100 | hchanged = false; 101 | CUDA_SAFE_CALL(cudaMemcpy(changed, &hchanged, sizeof(hchanged), cudaMemcpyHostToDevice)); 102 | int nblocks = (m - 1) / blksz + 1; 103 | firstFit<<>>(m, d_csrRowPtr, d_csrColInd, d_coloring, changed); 104 | conflictResolve<<>>(m, d_csrRowPtr, d_csrColInd, d_coloring, d_colored); 105 | CUDA_SAFE_CALL(cudaMemcpy(&hchanged, changed, sizeof(hchanged), cudaMemcpyDeviceToHost)); 106 | //left = (int)thrust::count(thrust::device, conflicted, conflicted + m, 1); 107 | } while (hchanged); 108 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 109 | endtime = rtclock(); 110 | runtime[i] = 1000.0f * (endtime - starttime); 111 | colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum()) + 1; 112 | } 113 | cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost); 114 | double total_time = 0.0; 115 | int total_colors = 0; 116 | int total_iterations = 0; 117 | for (int i = 0; i < ITERATIONS; i++) { 118 | total_time += runtime[i]; 119 | total_colors += colors[i]; 120 | total_iterations += iterations[i]; 121 | printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]); 122 | } 123 | double avg_time = (double)total_time / ITERATIONS; 124 | double avg_colors = (double)total_colors / ITERATIONS; 125 | double avg_iterations = (double)total_iterations / ITERATIONS; 126 | printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations); 127 | CUDA_SAFE_CALL(cudaFree(d_csrRowPtr)); 128 | CUDA_SAFE_CALL(cudaFree(d_csrColInd)); 129 | CUDA_SAFE_CALL(cudaFree(d_coloring)); 130 | } 131 | -------------------------------------------------------------------------------- /include/gbar.cuh: 
-------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Software Global Barrier 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | //#include 36 | #include "cutil_subset.h" 37 | 38 | /** 39 | * Manages device storage needed for implementing a global software barrier 40 | * between CTAs in a single grid 41 | */ 42 | class GlobalBarrier 43 | { 44 | public: 45 | 46 | typedef unsigned int SyncFlag; 47 | 48 | protected : 49 | 50 | 51 | // Counters in global device memory 52 | SyncFlag* d_sync; 53 | 54 | /** 55 | * Simple wrapper for returning a CG-loaded SyncFlag at the specified pointer 56 | */ 57 | __device__ __forceinline__ SyncFlag LoadCG(SyncFlag* d_ptr) const 58 | { 59 | SyncFlag retval; 60 | //retval = cub::ThreadLoad(d_ptr); 61 | retval = d_ptr[0]; 62 | return retval; 63 | } 64 | 65 | public: 66 | 67 | /** 68 | * Constructor 69 | */ 70 | GlobalBarrier() : d_sync(NULL) {} 71 | 72 | 73 | /** 74 | * Synchronize 75 | */ 76 | __device__ __forceinline__ void Sync() const 77 | { 78 | volatile SyncFlag *d_vol_sync = d_sync; 79 | 80 | // Threadfence and syncthreads to make sure global writes are visible before 81 | // thread-0 reports in with its sync counter 82 | __threadfence(); 83 | __syncthreads(); 84 | 85 | if (blockIdx.x == 0) { 86 | 87 | // Report in ourselves 88 | if (threadIdx.x == 0) { 89 | d_vol_sync[blockIdx.x] = 1; 90 | } 91 | 92 | __syncthreads(); 93 | 94 | // Wait for everyone else to report in 95 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) { 96 | while (LoadCG(d_sync + peer_block) == 0) { 97 | __threadfence_block(); 98 | } 99 | } 100 | 101 | __syncthreads(); 102 | 103 | // Let everyone know it's safe to read their prefix sums 104 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) { 
105 | d_vol_sync[peer_block] = 0; 106 | } 107 | 108 | } else { 109 | 110 | if (threadIdx.x == 0) { 111 | // Report in 112 | d_vol_sync[blockIdx.x] = 1; 113 | 114 | // Wait for acknowledgement 115 | while (LoadCG(d_sync + blockIdx.x) == 1) { 116 | __threadfence_block(); 117 | } 118 | } 119 | 120 | __syncthreads(); 121 | } 122 | } 123 | }; 124 | 125 | 126 | /** 127 | * Version of global barrier with storage lifetime management. 128 | * 129 | * We can use this in host enactors, and pass the base GlobalBarrier 130 | * as parameters to kernels. 131 | */ 132 | class GlobalBarrierLifetime : public GlobalBarrier 133 | { 134 | protected: 135 | 136 | // Number of bytes backed by d_sync 137 | size_t sync_bytes; 138 | 139 | public: 140 | 141 | /** 142 | * Constructor 143 | */ 144 | GlobalBarrierLifetime() : GlobalBarrier(), sync_bytes(0) {} 145 | 146 | 147 | /** 148 | * Deallocates and resets the progress counters 149 | */ 150 | cudaError_t HostReset() 151 | { 152 | cudaError_t retval = cudaSuccess; 153 | if (d_sync) { 154 | CUDA_SAFE_CALL(cudaFree(d_sync)); 155 | d_sync = NULL; 156 | } 157 | sync_bytes = 0; 158 | return retval; 159 | } 160 | 161 | 162 | /** 163 | * Destructor 164 | */ 165 | virtual ~GlobalBarrierLifetime() 166 | { 167 | HostReset(); 168 | } 169 | 170 | 171 | /** 172 | * Sets up the progress counters for the next kernel launch (lazily 173 | * allocating and initializing them if necessary) 174 | */ 175 | cudaError_t Setup(int sweep_grid_size) 176 | { 177 | cudaError_t retval = cudaSuccess; 178 | do { 179 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); 180 | if (new_sync_bytes > sync_bytes) { 181 | 182 | if (d_sync) { 183 | CUDA_SAFE_CALL(cudaFree(d_sync)); 184 | retval = cudaSuccess; 185 | } 186 | 187 | sync_bytes = new_sync_bytes; 188 | 189 | CUDA_SAFE_CALL(cudaMalloc((void**) &d_sync, sync_bytes)); 190 | retval = cudaSuccess; 191 | 192 | // Initialize to zero 193 | CUDA_SAFE_CALL(cudaMemset(d_sync, 0, sweep_grid_size * sizeof(SyncFlag))); 194 | 
195 | } 196 | } while (0); 197 | 198 | return retval; 199 | } 200 | }; 201 | -------------------------------------------------------------------------------- /include/component.h: -------------------------------------------------------------------------------- 1 | struct ComponentSpace { 2 | ComponentSpace(unsigned nelements); 3 | 4 | __device__ unsigned numberOfElements(); 5 | __device__ unsigned numberOfComponents(); 6 | __device__ bool isBoss(unsigned element); 7 | __device__ unsigned find(unsigned lelement, bool compresspath = true); 8 | __device__ bool unify(unsigned one, unsigned two); 9 | __device__ void print1x1(); 10 | __host__ void print(); 11 | __host__ void copy(ComponentSpace &two); 12 | void dump_to_file(const char *F); 13 | void allocate(); 14 | void init(); 15 | unsigned numberOfComponentsHost(); 16 | 17 | unsigned nelements; 18 | unsigned *ncomponents, // number of components. 19 | *complen, // lengths of components. 20 | *ele2comp; // components of elements. 21 | }; 22 | ComponentSpace::ComponentSpace(unsigned nelements) { 23 | this->nelements = nelements; 24 | 25 | allocate(); 26 | init(); 27 | } 28 | 29 | void ComponentSpace::dump_to_file(const char *F) 30 | { 31 | static FILE *f; 32 | static unsigned *mem; 33 | 34 | if(!f) 35 | { 36 | f = fopen(F, "w"); 37 | mem = (unsigned *) calloc(nelements, sizeof(unsigned)); 38 | } 39 | 40 | assert(cudaMemcpy(mem, ele2comp, nelements * sizeof(unsigned), cudaMemcpyDeviceToHost) == cudaSuccess); 41 | 42 | int i; 43 | for(i = 0; i < nelements; i++) 44 | { 45 | fprintf(f, "%d %d\n", i, mem[i]); 46 | } 47 | fprintf(f, "\n"); 48 | } 49 | 50 | void ComponentSpace::copy(ComponentSpace &two) 51 | { 52 | assert(cudaMemcpy(two.ncomponents, ncomponents, sizeof(unsigned), cudaMemcpyDeviceToDevice) == 0); 53 | assert(cudaMemcpy(two.ele2comp, ele2comp, sizeof(unsigned) * nelements, cudaMemcpyDeviceToDevice) == 0); 54 | assert(cudaMemcpy(two.complen, complen, sizeof(unsigned) * nelements, cudaMemcpyDeviceToDevice) == 0); 
55 | } 56 | __device__ void ComponentSpace::print1x1() { 57 | printf("\t\t-----------------\n"); 58 | for (unsigned ii = 0; ii < nelements; ++ii) { 59 | printf("\t\t%d -> %d\n", ii, ele2comp[ii]); 60 | } 61 | printf("\t\t-----------------\n"); 62 | } 63 | __global__ void print1x1(ComponentSpace cs) { 64 | cs.print1x1(); 65 | } 66 | __host__ void ComponentSpace::print() { 67 | ::print1x1<<<1,1>>>(*this); 68 | CudaTest("cs.print1x1 failed"); 69 | } 70 | __device__ unsigned ComponentSpace::numberOfElements() { 71 | return nelements; 72 | } 73 | __device__ unsigned ComponentSpace::numberOfComponents() { 74 | return *ncomponents; 75 | } 76 | unsigned ComponentSpace::numberOfComponentsHost() { 77 | unsigned hncomponents = 0; 78 | cudaMemcpy(&hncomponents, ncomponents, sizeof(unsigned), cudaMemcpyDeviceToHost); 79 | return hncomponents; 80 | } 81 | void ComponentSpace::allocate() { 82 | if (cudaMalloc((void **)&ncomponents, 1 * sizeof(unsigned)) != cudaSuccess) 83 | CudaTest("allocating ncomponents failed"); 84 | if (cudaMalloc((void **)&complen, nelements * sizeof(unsigned)) != cudaSuccess) 85 | CudaTest("allocating complen failed"); 86 | if (cudaMalloc((void **)&ele2comp, nelements * sizeof(unsigned)) != cudaSuccess) 87 | CudaTest("allocating ele2comp failed"); 88 | } 89 | __global__ void dinitcs(unsigned nelements, unsigned *complen, unsigned *ele2comp) { 90 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 91 | if (id < nelements) { 92 | //elements[id] = id; 93 | complen[id] = 1; 94 | ele2comp[id] = id; 95 | } 96 | } 97 | void ComponentSpace::init() { 98 | // init the elements. 99 | unsigned blocksize = MAXBLOCKSIZE; //// 100 | unsigned nblocks = (nelements + blocksize - 1) / blocksize; 101 | dinitcs<<>>(nelements, complen, ele2comp); 102 | CudaTest("dinitcs failed"); 103 | 104 | // init number of components. 
105 | cudaMemcpy(ncomponents, &nelements, sizeof(unsigned), cudaMemcpyHostToDevice); 106 | } 107 | __device__ bool ComponentSpace::isBoss(unsigned element) { 108 | return atomicCAS(&ele2comp[element],element,element) == element; 109 | } 110 | __device__ unsigned ComponentSpace::find(unsigned lelement, bool compresspath/*= true*/) { 111 | // do we need to worry about concurrency in this function? 112 | // for other finds, no synchronization necessary as the data-structure is a tree. 113 | // for other unifys, synchornization is not required considering that unify is going to affect only bosses, while find is going to affect only non-bosses. 114 | unsigned element = lelement; 115 | while (isBoss(element) == false) { 116 | element = ele2comp[element]; 117 | } 118 | if (compresspath) ele2comp[lelement] = element; // path compression. 119 | return element; 120 | } 121 | __device__ bool ComponentSpace::unify(unsigned one, unsigned two) { 122 | // if the client makes sure that one component is going to get unified as a source with another destination only once, then synchronization is unnecessary. 123 | // while this is true for MST, due to load-balancing in if-block below, a node may be source multiple times. 124 | // if a component is source in one thread and destination is another, then it is okay for MST. 125 | do { 126 | if(!isBoss(one)) return false; 127 | if(!isBoss(two)) return false; 128 | 129 | unsigned onecomp = one; 130 | unsigned twocomp = two; 131 | //unsigned onecomp = find(one, false); 132 | //unsigned twocomp = find(two, false); 133 | 134 | if (onecomp == twocomp) return false; // "duplicate" edges due to symmetry 135 | 136 | unsigned boss = twocomp; 137 | unsigned subordinate = onecomp; 138 | //if (complen[onecomp] > complen[twocomp]) { // one is larger, make it the representative: can create cycles. 139 | if (boss < subordinate) { // break cycles by id. 140 | boss = onecomp; 141 | subordinate = twocomp; 142 | } 143 | // merge subordinate into the boss. 
144 | //ele2comp[subordinate] = boss; 145 | 146 | unsigned oldboss = atomicCAS(&ele2comp[subordinate], subordinate, boss); 147 | if (oldboss != subordinate) { // someone else updated the boss. 148 | // we need not restore the ele2comp[subordinate], as union-find ensures correctness and complen of subordinate doesn't matter. 149 | one = oldboss; 150 | two = boss; 151 | return false; 152 | } else { 153 | dprintf("\t\tunifying %d -> %d (%d)\n", subordinate, boss); 154 | atomicAdd(&complen[boss], complen[subordinate]); 155 | //complen[boss] += complen[subordinate]; 156 | // complen[subordinate] doesn't matter now, since find() will find its boss. 157 | 158 | // a component has reduced. 159 | unsigned ncomp = atomicSub(ncomponents, 1); 160 | //atomicDec(ncomponents, nelements); 161 | dprintf("\t%d: ncomponents = %d\n", threadIdx.x, ncomp); 162 | return true; 163 | } 164 | } while (true); 165 | } 166 | -------------------------------------------------------------------------------- /src/data/kernel_fusion.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen 3 | // Data-driven version with Kernel Fusion technique 4 | #include 5 | #include "gbar.cuh" 6 | #include "cuda_launch_config.hpp" 7 | #include "cutil_subset.h" 8 | #include "common.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "worklistc.h" 16 | #define MAXCOLOR 128 17 | 18 | typedef cub::BlockScan BlockScan; 19 | 20 | __global__ void initialize(int *coloring, int m) { 21 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 22 | if (id < m) { 23 | coloring[id] = MAXCOLOR; 24 | } 25 | } 26 | 27 | __device__ void firstFit(int m, int *csrRowPtr, int *csrColInd, Worklist2 &inwl, Worklist2 &outwl, int *coloring) { 28 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 29 | bool forbiddenColors[MAXCOLOR+1]; 30 | int total_inputs = (*inwl.dindex + 
gridDim.x * blockDim.x - 1)/(gridDim.x * blockDim.x); 31 | for (int id = tid; total_inputs > 0; id += blockDim.x * gridDim.x, total_inputs--) { 32 | //int perthread = (*inwl.dindex - 1) / (gridDim.x * blockDim.x) + 1; 33 | //int start = tid * perthread; 34 | //int end = start + perthread; 35 | //for (int id = start; id < end; id ++) { 36 | int vertex; 37 | if (inwl.pop_id(id, vertex)) { 38 | for (int j = 0; j < MAXCOLOR; j++) 39 | forbiddenColors[j] = false; 40 | int row_begin = csrRowPtr[vertex]; 41 | int row_end = csrRowPtr[vertex + 1]; 42 | for (int offset = row_begin; offset < row_end; offset ++) { 43 | int neighbor = csrColInd[offset]; 44 | int color = coloring[neighbor]; 45 | forbiddenColors[color] = true; 46 | } 47 | int vertex_color; 48 | for (vertex_color = 0; vertex_color < MAXCOLOR; vertex_color ++) { 49 | if (!forbiddenColors[vertex_color]) { 50 | coloring[vertex] = vertex_color; 51 | break; 52 | } 53 | } 54 | assert(vertex_color < MAXCOLOR); 55 | } 56 | } 57 | } 58 | 59 | __device__ void conflictResolve(int m, int *csrRowPtr, int *csrColInd, Worklist2 &inwl, Worklist2 &outwl, int *coloring) { 60 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 61 | int total_inputs = (*inwl.dindex + gridDim.x * blockDim.x - 1)/(gridDim.x * blockDim.x); 62 | for (int id = tid; total_inputs > 0; id += blockDim.x * gridDim.x, total_inputs--) { 63 | //int perthread = (*inwl.dindex - 1) / (gridDim.x * blockDim.x) + 1; 64 | //int start = tid * perthread; 65 | //int end = start + perthread; 66 | //for (int id = start; id < end; id ++) { 67 | int vertex; 68 | int conflicted = 0; 69 | if (inwl.pop_id(id, vertex)) { 70 | int row_begin = csrRowPtr[vertex]; 71 | int row_end = csrRowPtr[vertex + 1]; 72 | for (int offset = row_begin; offset < row_end; offset ++) { 73 | int neighbor = csrColInd[offset]; 74 | if (coloring[vertex] == coloring[neighbor] && vertex < neighbor) { 75 | conflicted = 1; 76 | coloring[vertex] = MAXCOLOR; 77 | break; 78 | } 79 | } 80 | } 81 | 
outwl.push_1item(conflicted, vertex, BLKSIZE); 82 | } 83 | } 84 | 85 | __global__ void color_kernel(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, Worklist2 outwl, int *coloring, GlobalBarrier gb) { 86 | Worklist2 *in; 87 | Worklist2 *out; 88 | Worklist2 *tmp; 89 | in = &inwl; out = &outwl; 90 | while (*in->dindex > 0) { 91 | firstFit(m, csrRowPtr, csrColInd, *in, *out, coloring); 92 | gb.Sync(); 93 | conflictResolve(m, csrRowPtr, csrColInd, *in, *out, coloring); 94 | gb.Sync(); 95 | tmp = in; 96 | in = out; 97 | out = tmp; 98 | *out->dindex = 0; 99 | } 100 | } 101 | 102 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) { 103 | double starttime, endtime; 104 | double runtime[ITERATIONS]; 105 | int colors[ITERATIONS]; 106 | int *d_csrRowPtr, *d_csrColInd, *d_coloring; 107 | printf("Graph coloring data-driven Kernel Fusion version\n"); 108 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int))); 109 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int))); 110 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int))); 111 | CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice)); 112 | CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice)); 113 | 114 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 115 | int device = 0; 116 | int deviceCount = 0; 117 | CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount)); 118 | cudaDeviceProp deviceProp; 119 | cudaGetDeviceProperties(&deviceProp, device); 120 | int nSM = deviceProp.multiProcessorCount; 121 | fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 122 | deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 123 | //int nSM = num_SMs; 124 | const size_t max_blocks = maximum_residency(color_kernel, BLKSIZE, 0); 125 | //printf("nSM=%d, 
block_size=%d, max_blocks=%d\n", nSM, BLKSIZE, max_blocks); 126 | GlobalBarrierLifetime gb; 127 | gb.Setup(nSM * max_blocks); 128 | for (int i = 0; i < ITERATIONS; i++) { 129 | Worklist2 inwl(m), outwl(m); 130 | CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice)); 131 | initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m); 132 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 133 | 134 | starttime = rtclock(); 135 | thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m); 136 | color_kernel<<>>(m, d_csrRowPtr, d_csrColInd, inwl, outwl, d_coloring, gb); 137 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 138 | endtime = rtclock(); 139 | 140 | runtime[i] = 1000.0f * (endtime - starttime); 141 | colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum()) + 1; 142 | } 143 | CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost)); 144 | double total_time = 0.0; 145 | int total_colors = 0; 146 | for (int i = 0; i < ITERATIONS; i++) { 147 | total_time += runtime[i]; 148 | total_colors += colors[i]; 149 | printf("[%d %.2f %d] ", colors[i], runtime[i]); 150 | } 151 | double avg_time = (double)total_time / ITERATIONS; 152 | double avg_colors = (double)total_colors / ITERATIONS; 153 | printf("\navg_time %f ms, avg_colors %.2f\n", avg_time, avg_colors); 154 | CUDA_SAFE_CALL(cudaFree(d_csrRowPtr)); 155 | CUDA_SAFE_CALL(cudaFree(d_csrColInd)); 156 | CUDA_SAFE_CALL(cudaFree(d_coloring)); 157 | } 158 | -------------------------------------------------------------------------------- /src/data/kernel_bitset.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | #include 4 | #include "gbar.cuh" 5 | #include "cuda_launch_config.hpp" 6 | #include "cutil_subset.h" 7 | #include "common.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | 
#include 13 | #include 14 | #include "worklistc.h" 15 | #define SCRATCHSIZE BLKSIZE 16 | #define MAXCOLOR 128 // assume graph can be colored with less than 128 colors 17 | typedef cub::BlockScan BlockScan; 18 | 19 | __global__ void initialize(int *coloring, int m) { 20 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 21 | if (id < m) { 22 | coloring[id] = MAXCOLOR; 23 | } 24 | } 25 | 26 | __device__ __forceinline__ void assignColor(unsigned int *forbiddenColors, int *coloring, int node) { 27 | int i; 28 | /* 29 | int c = 32; 30 | for (i = 0; i < MAXCOLOR/32; i++) { 31 | if (forbiddenColors[i] != 0) { 32 | forbiddenColors[i] &= -(signed)forbiddenColors[i]; 33 | if (forbiddenColors[i]) c--; 34 | if (forbiddenColors[i] & 0x0000ffff) c -= 16; 35 | if (forbiddenColors[i] & 0x00ff00ff) c -= 8; 36 | if (forbiddenColors[i] & 0x0f0f0f0f) c -= 4; 37 | if (forbiddenColors[i] & 0x33333333) c -= 2; 38 | if (forbiddenColors[i] & 0x55555555) c -= 1; 39 | coloring[node] = c; 40 | break; 41 | } 42 | else 43 | c += 32; 44 | } 45 | //*/ 46 | ///* 47 | for (i = 0; i < MAXCOLOR/32; i++) { 48 | int pos = __ffs(forbiddenColors[i]); 49 | if(pos) { 50 | coloring[node] = i * 32 + pos - 1; 51 | break; 52 | } 53 | } 54 | assert(i < MAXCOLOR/32); 55 | //*/ 56 | } 57 | 58 | __global__ void firstFit(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, int *coloring) { 59 | int id = blockIdx.x * blockDim.x + threadIdx.x; 60 | unsigned forbiddenColors[MAXCOLOR/32+1]; 61 | int vertex; 62 | if (inwl.pop_id(id, vertex)) { 63 | int row_begin = csrRowPtr[vertex]; 64 | int row_end = csrRowPtr[vertex + 1]; 65 | for (int j = 0; j < MAXCOLOR/32; j++) 66 | forbiddenColors[j] = 0xffffffff; 67 | for (int offset = row_begin; offset < row_end; offset ++) { 68 | int neighbor = csrColInd[offset]; 69 | int color = coloring[neighbor]; 70 | forbiddenColors[color / 32] &= ~(1 << (color % 32)); 71 | } 72 | assignColor(forbiddenColors, coloring, vertex); 73 | } 74 | } 75 | 76 | __global__ void 
conflictResolve(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, Worklist2 outwl, int *coloring) { 77 | int id = blockIdx.x * blockDim.x + threadIdx.x; 78 | int vertex; 79 | int conflicted = 0; 80 | if (inwl.pop_id(id, vertex)) { 81 | int row_begin = csrRowPtr[vertex]; 82 | int row_end = csrRowPtr[vertex + 1]; 83 | for (int offset = row_begin; offset < row_end; offset ++) { 84 | int neighbor = csrColInd[offset]; 85 | if (coloring[vertex] == coloring[neighbor] && vertex < neighbor) { 86 | conflicted = 1; 87 | coloring[vertex] = MAXCOLOR; 88 | break; 89 | } 90 | } 91 | } 92 | outwl.push_1item(conflicted, vertex, BLKSIZE); 93 | } 94 | 95 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) { 96 | double starttime, endtime; 97 | double runtime[ITERATIONS]; 98 | int colors[ITERATIONS]; 99 | int iterations[ITERATIONS]; 100 | int *d_csrRowPtr, *d_csrColInd, *d_coloring; 101 | printf("Graph coloring data-driven Bitset version\n"); 102 | 103 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int))); 104 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int))); 105 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int))); 106 | CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice)); 107 | CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice)); 108 | cudaDeviceSynchronize(); 109 | int device = 0; 110 | int deviceCount = 0; 111 | CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount)); 112 | cudaDeviceProp deviceProp; 113 | cudaGetDeviceProperties(&deviceProp, device); 114 | int nSM = deviceProp.multiProcessorCount; 115 | fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 116 | deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 117 | const size_t max_blocks_1 = maximum_residency(firstFit, 
BLKSIZE, 0); 118 | const size_t max_blocks_2 = maximum_residency(conflictResolve, BLKSIZE, 0); 119 | printf("max_blocks_1=%d, max_blocks_2=%d\n", max_blocks_1, max_blocks_2); 120 | 121 | for (int i = 0; i < ITERATIONS; i++) { 122 | Worklist2 inwl(m), outwl(m); 123 | Worklist2 *inwlptr = &inwl, *outwlptr = &outwl; 124 | CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice)); 125 | initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m); 126 | int nitems = m; 127 | iterations[i] = 0; 128 | 129 | starttime = rtclock(); 130 | thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m); 131 | int iteration = 0; 132 | while (nitems > 0) { 133 | iterations[i] ++; 134 | int nblocks = (nitems - 1) / BLKSIZE + 1; 135 | firstFit<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_coloring); 136 | conflictResolve<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_coloring); 137 | nitems = outwlptr->nitems(); 138 | Worklist2 * tmp = inwlptr; 139 | inwlptr = outwlptr; 140 | outwlptr = tmp; 141 | outwlptr->reset(); 142 | } 143 | cudaDeviceSynchronize(); 144 | endtime = rtclock(); 145 | runtime[i] = 1000.0f * (endtime - starttime); 146 | colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum()) + 1; 147 | } 148 | CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost)); 149 | double total_time = 0.0; 150 | int total_colors = 0; 151 | int total_iterations = 0; 152 | for (int i = 0; i < ITERATIONS; i++) { 153 | total_time += runtime[i]; 154 | total_colors += colors[i]; 155 | total_iterations += iterations[i]; 156 | printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]); 157 | } 158 | double avg_time = (double)total_time / ITERATIONS; 159 | double avg_colors = (double)total_colors / ITERATIONS; 160 | double avg_iterations = (double)total_iterations / ITERATIONS; 161 | printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations); 162 | 
CUDA_SAFE_CALL(cudaFree(d_csrRowPtr)); 163 | CUDA_SAFE_CALL(cudaFree(d_csrColInd)); 164 | CUDA_SAFE_CALL(cudaFree(d_coloring)); 165 | } 166 | -------------------------------------------------------------------------------- /src/data/kernel_tc.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen 3 | // Data-driven version with Thread Coarsening technique 4 | #include 5 | #include "gbar.cuh" 6 | #include "cuda_launch_config.hpp" 7 | #include "cutil_subset.h" 8 | #include "common.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "worklistc.h" 16 | #define MAXCOLOR 128 17 | typedef cub::BlockScan BlockScan; 18 | 19 | __global__ void initialize(int *coloring, int m) { 20 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 21 | if (id < m) { 22 | coloring[id] = MAXCOLOR; 23 | } 24 | } 25 | 26 | __global__ void FirstFit(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, int *coloring) { 27 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 28 | bool forbiddenColors[MAXCOLOR+1]; 29 | int id = tid; 30 | //int total_inputs = (*inwl.dindex + gridDim.x * blockDim.x - 1)/(gridDim.x * blockDim.x); 31 | //for (int id = tid; total_inputs > 0; id += blockDim.x * gridDim.x, total_inputs--) { 32 | //int perthread = (*inwl.dindex - 1) / (gridDim.x * blockDim.x) + 1; 33 | //int start = tid * perthread; 34 | //int end = start + perthread; 35 | //for (int id = start; id < end; id ++) { 36 | int vertex; 37 | if (inwl.pop_id(id, vertex)) { 38 | for (int j = 0; j < MAXCOLOR; j++) 39 | forbiddenColors[j] = false; 40 | int row_begin = csrRowPtr[vertex]; 41 | int row_end = csrRowPtr[vertex + 1]; 42 | for (int offset = row_begin; offset < row_end; offset ++) { 43 | int neighbor = csrColInd[offset]; 44 | int color = coloring[neighbor]; 45 | forbiddenColors[color] = true; 46 | } 47 | int vertex_color; 48 | for 
(vertex_color = 0; vertex_color < MAXCOLOR; vertex_color ++) { 49 | if (!forbiddenColors[vertex_color]) { 50 | coloring[vertex] = vertex_color; 51 | break; 52 | } 53 | } 54 | assert(vertex_color < MAXCOLOR); 55 | } 56 | //} 57 | } 58 | 59 | __global__ void conflictDetect(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, Worklist2 outwl, int *coloring) { 60 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 61 | //int id = tid; 62 | int total_inputs = (*inwl.dindex + gridDim.x * blockDim.x - 1)/(gridDim.x * blockDim.x); 63 | for (int id = tid; total_inputs > 0; id += blockDim.x * gridDim.x, total_inputs--) { 64 | //int perthread = (*inwl.dindex - 1) / (gridDim.x * blockDim.x) + 1; 65 | //int start = tid * perthread; 66 | //int end = start + perthread; 67 | //for (int id = start; id < end; id ++) { 68 | int vertex; 69 | int conflicted = 0; 70 | if (inwl.pop_id(id, vertex)) { 71 | int row_begin = csrRowPtr[vertex]; 72 | int row_end = csrRowPtr[vertex + 1]; 73 | for (int offset = row_begin; offset < row_end; offset ++) { 74 | int neighbor = csrColInd[offset]; 75 | if (coloring[vertex] == coloring[neighbor] && vertex < neighbor) { 76 | conflicted = 1; 77 | coloring[vertex] = MAXCOLOR; 78 | break; 79 | } 80 | } 81 | } 82 | outwl.push_1item(conflicted, vertex, BLKSIZE); 83 | } 84 | } 85 | 86 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) { 87 | double starttime, endtime; 88 | double runtime[ITERATIONS]; 89 | int colors[ITERATIONS]; 90 | int iterations[ITERATIONS]; 91 | int *d_csrRowPtr, *d_csrColInd, *d_coloring; 92 | printf("Graph coloring data-driven Thread Coarsening version\n"); 93 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int))); 94 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int))); 95 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int))); 96 | CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice)); 97 | 
CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice)); 98 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 99 | int device = 0; 100 | int deviceCount = 0; 101 | CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount)); 102 | cudaDeviceProp deviceProp; 103 | cudaGetDeviceProperties(&deviceProp, device); 104 | int nSM = deviceProp.multiProcessorCount; 105 | fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 106 | deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 107 | 108 | const size_t max_blocks_1 = maximum_residency(FirstFit, BLKSIZE, 0); 109 | const size_t max_blocks_2 = maximum_residency(conflictDetect, BLKSIZE, 0); 110 | printf("max_blocks_1=%d, max_blocks_2=%d\n", max_blocks_1, max_blocks_2); 111 | 112 | for (int i = 0; i < ITERATIONS; i++) { 113 | Worklist2 inwl(m), outwl(m); 114 | Worklist2 *inwlptr = &inwl, *outwlptr = &outwl; 115 | CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice)); 116 | initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m); 117 | int nitems = m; 118 | iterations[i] = 0; 119 | 120 | starttime = rtclock(); 121 | thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m); 122 | while (nitems > 0) { 123 | iterations[i] ++; 124 | //printf("nitems=%d\n", nitems); 125 | int nblocks_1 = nSM * max_blocks_1; 126 | int nblocks_2 = nSM * max_blocks_2; 127 | int nblocks = (nitems - 1) / BLKSIZE + 1; 128 | if(nblocks < nblocks_1) nblocks_1 = nblocks; 129 | if(nblocks < nblocks_2) nblocks_2 = nblocks; 130 | FirstFit<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_coloring); 131 | conflictDetect<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_coloring); 132 | nitems = outwlptr->nitems(); 133 | Worklist2 * tmp = inwlptr; 134 | inwlptr = outwlptr; 135 | outwlptr = tmp; 136 | outwlptr->reset(); 137 | } 138 | cudaDeviceSynchronize(); 139 | endtime = 
rtclock(); 140 | runtime[i] = 1000.0f * (endtime - starttime); 141 | colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum()) + 1; 142 | } 143 | CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost)); 144 | double total_time = 0.0; 145 | int total_colors = 0; 146 | int total_iterations = 0; 147 | for (int i = 0; i < ITERATIONS; i++) { 148 | total_time += runtime[i]; 149 | total_colors += colors[i]; 150 | total_iterations += iterations[i]; 151 | printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]); 152 | } 153 | double avg_time = (double)total_time / ITERATIONS; 154 | double avg_colors = (double)total_colors / ITERATIONS; 155 | double avg_iterations = (double)total_iterations / ITERATIONS; 156 | printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations); 157 | CUDA_SAFE_CALL(cudaFree(d_csrRowPtr)); 158 | CUDA_SAFE_CALL(cudaFree(d_csrColInd)); 159 | CUDA_SAFE_CALL(cudaFree(d_coloring)); 160 | } 161 | -------------------------------------------------------------------------------- /src/data/kernel_pq.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | #include 4 | #include "gbar.cuh" 5 | #include "cuda_launch_config.hpp" 6 | #include "cutil_subset.h" 7 | #include "common.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "worklistc.h" 15 | #define MAXCOLOR 128 16 | typedef cub::BlockScan BlockScan; 17 | 18 | __global__ void initialize(int *coloring, int m) { 19 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 20 | if (id < m) { 21 | coloring[id] = MAXCOLOR; 22 | } 23 | } 24 | 25 | __global__ void FirstFit(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, int *degree, int *coloring) { 26 | int id = blockIdx.x * blockDim.x + threadIdx.x; 27 | bool 
forbiddenColors[MAXCOLOR + 1]; 28 | int vertex; 29 | if (inwl.pop_id(id, vertex)) { 30 | int row_begin = csrRowPtr[vertex]; 31 | int row_end = csrRowPtr[vertex + 1]; 32 | for (int j = 0; j < MAXCOLOR; j++) 33 | forbiddenColors[j] = false; 34 | for (int offset = row_begin; offset < row_end; offset ++) { 35 | int neighbor = csrColInd[offset]; 36 | int color = coloring[neighbor]; 37 | forbiddenColors[color] = true; 38 | } 39 | int vertex_color; 40 | for (vertex_color = 0; vertex_color < MAXCOLOR; vertex_color++) { 41 | if (!forbiddenColors[vertex_color]) { 42 | coloring[vertex] = vertex_color; 43 | break; 44 | } 45 | } 46 | assert(vertex_color < MAXCOLOR); 47 | } 48 | } 49 | 50 | __global__ void conflictResolve(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, Worklist2 outwl, int * degree, int *coloring) { 51 | int id = blockIdx.x * blockDim.x + threadIdx.x; 52 | int conflicted = 0; 53 | int vertex; 54 | if (inwl.pop_id(id, vertex)) { 55 | int row_begin = csrRowPtr[vertex]; 56 | int row_end = csrRowPtr[vertex + 1]; 57 | for (int offset = row_begin; offset < row_end; offset ++) { 58 | int neighbor = csrColInd[offset]; 59 | if (coloring[vertex] == coloring[neighbor]) { 60 | bool is_victim; 61 | if(degree[vertex] == degree[neighbor]) 62 | is_victim = (vertex < neighbor) ? true : false; 63 | else is_victim = (degree[vertex] < degree[neighbor]) ? 
true : false; 64 | if(is_victim) { 65 | conflicted = 1; 66 | coloring[vertex] = MAXCOLOR; 67 | break; 68 | } 69 | } 70 | } 71 | } 72 | outwl.push_1item(conflicted, vertex, BLKSIZE); // push to outwl if conflicted 73 | } 74 | 75 | __global__ void gatherKey(int *degree, int *key, Worklist2 wl, int n) { 76 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 77 | int vertex; 78 | if (id < n) { 79 | wl.pop_id(id, vertex); 80 | key[id] = degree[vertex]; 81 | } 82 | } 83 | 84 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) { 85 | double starttime, endtime; 86 | double runtime[ITERATIONS]; 87 | int colors[ITERATIONS]; 88 | int iterations[ITERATIONS]; 89 | int *d_csrRowPtr, *d_csrColInd, *d_coloring, *d_degree, *d_key; 90 | printf("Graph coloring data-driven Priority Queue version\n"); 91 | int *degree = (int *)malloc(m * sizeof(int)); 92 | for(int i = 0; i < m; i ++) { 93 | degree[i] = csrRowPtr[i + 1] - csrRowPtr[i]; 94 | } 95 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int))); 96 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int))); 97 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_degree, m * sizeof(int))); 98 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_key, m * sizeof(int))); 99 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int))); 100 | CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice)); 101 | CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice)); 102 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 103 | int device = 0; 104 | int deviceCount = 0; 105 | CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount)); 106 | cudaDeviceProp deviceProp; 107 | cudaGetDeviceProperties(&deviceProp, device); 108 | int nSM = deviceProp.multiProcessorCount; 109 | fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 110 | deviceCount, device, deviceProp.name, deviceProp.major, 
deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 111 | 112 | for (int i = 0; i < ITERATIONS; i++) { 113 | Worklist2 inwl(m), outwl(m); 114 | Worklist2 *inwlptr = &inwl, *outwlptr = &outwl; 115 | CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice)); 116 | CUDA_SAFE_CALL(cudaMemcpy(d_degree, degree, m * sizeof(int), cudaMemcpyHostToDevice)); 117 | CUDA_SAFE_CALL(cudaMemcpy(d_key, degree, m * sizeof(int), cudaMemcpyHostToDevice)); 118 | initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m); 119 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 120 | iterations[i] = 0; 121 | 122 | starttime = rtclock(); 123 | int nitems = m; 124 | thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m); 125 | //thrust::sort_by_key(thrust::device, d_key, d_key + m, inwl.dwl, thrust::greater()); 126 | while (nitems > 0) { 127 | iterations[i] ++; 128 | int nblocks = (nitems - 1) / BLKSIZE + 1; 129 | FirstFit<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_degree, d_coloring); 130 | conflictResolve<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_degree, d_coloring); 131 | nitems = outwlptr->nitems(); 132 | //gatherKey<<<((nitems - 1) / BLKSIZE + 1), BLKSIZE>>>(d_degree, d_key, *outwlptr, nitems); 133 | //thrust::sort_by_key(thrust::device, d_key, d_key + nitems, inwl.dwl, thrust::greater()); 134 | Worklist2 * tmp = inwlptr; 135 | inwlptr = outwlptr; 136 | outwlptr = tmp; 137 | outwlptr->reset(); 138 | } 139 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 140 | endtime = rtclock(); 141 | runtime[i] = 1000.0f * (endtime - starttime); 142 | colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum()) + 1; 143 | } 144 | CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost)); 145 | double total_time = 0.0; 146 | int total_colors = 0; 147 | int total_iterations = 0; 148 | for (int i = 0; i < ITERATIONS; i++) { 149 | total_time += runtime[i]; 150 | total_colors += 
colors[i]; 151 | total_iterations += iterations[i]; 152 | printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]); 153 | } 154 | double avg_time = (double)total_time / ITERATIONS; 155 | double avg_colors = (double)total_colors / ITERATIONS; 156 | double avg_iterations = (double)total_iterations / ITERATIONS; 157 | printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations); 158 | CUDA_SAFE_CALL(cudaFree(d_csrRowPtr)); 159 | CUDA_SAFE_CALL(cudaFree(d_csrColInd)); 160 | CUDA_SAFE_CALL(cudaFree(d_coloring)); 161 | } 162 | -------------------------------------------------------------------------------- /src/serial/graph_io.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | 4 | // transfer R-MAT generated gr graph to CSR format 5 | void gr2csr(char *gr, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 6 | printf("Reading RMAT (.gr) input file %s\n", gr); 7 | std::ifstream cfile; 8 | cfile.open(gr); 9 | std::string str; 10 | getline(cfile, str); 11 | char c; 12 | sscanf(str.c_str(), "%c", &c); 13 | while (c == 'c') { 14 | getline(cfile, str); 15 | sscanf(str.c_str(), "%c", &c); 16 | } 17 | char sp[3]; 18 | sscanf(str.c_str(), "%c %s %d %d", &c, sp, &m, &nnz); 19 | printf("num_vertices %d num_edges %d\n", m, nnz); 20 | //printf("%c %s %d %d\n", c, sp, m, nnz); 21 | vector > svector; 22 | set s; 23 | for (int i = 0; i < m; i++) 24 | svector.push_back(s); 25 | int dst, src; 26 | for (int i = 0; i < nnz; i++) { 27 | getline(cfile, str); 28 | sscanf(str.c_str(), "%c %d %d", &c, &src, &dst); 29 | 30 | if (c != 'a') 31 | printf("line %d\n", __LINE__); 32 | dst--; 33 | src--; 34 | svector[src].insert(dst); 35 | svector[dst].insert(src); 36 | } 37 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 38 | int count = 0; 39 | for (int i = 0; i < m; i++) { 40 | csrRowPtr[i] = count; 41 | 
// transfer *.graph file to CSR format
//   graph     : path of the input file; the first line holds "<vertices> <edges>",
//               line i + 1 lists the 1-based neighbours of vertex i
//   m, nnz    : receive the vertex count and the number of stored CSR entries
//   csrRowPtr : receives a malloc'ed array of m + 1 row offsets
//   csrColInd : receives a malloc'ed array of nnz sorted, 0-based neighbour ids
// Every edge is stored in both directions. The process exits with an error
// message when the file cannot be read or contains an invalid vertex id.
void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
	printf("Reading .graph input file %s\n", graph);
	std::ifstream cfile;
	cfile.open(graph);
	if (!cfile.is_open()) {
		fprintf(stderr, "cannot open input file %s\n", graph);
		exit(1);
	}
	std::string str;
	getline(cfile, str);
	if (sscanf(str.c_str(), "%d %d", &m, &nnz) != 2 || m < 0) {
		fprintf(stderr, "malformed header in %s\n", graph);
		exit(1);
	}
	printf("num_vertices %d num_edges %d\n", m, nnz);
	// One ordered neighbour set per vertex removes duplicate edges.
	std::vector<std::set<int> > svector(m);
	for (int i = 0; i < m; i++) {
		getline(cfile, str);
		std::istringstream istr(str);
		int dst;
		while (istr >> dst) {
			dst --; // file ids are 1-based
			if (dst < 0 || dst >= m) {
				fprintf(stderr, "vertex id %d out of range on adjacency line %d of %s\n", dst + 1, i + 1, graph);
				exit(1);
			}
			svector[i].insert(dst);
			svector[dst].insert(i);
		}
	}
	cfile.close();
	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
	if (csrRowPtr == NULL) {
		fprintf(stderr, "out of memory allocating csrRowPtr\n");
		exit(1);
	}
	int count = 0;
	for (int i = 0; i < m; i++) {
		csrRowPtr[i] = count;
		count += svector[i].size();
	}
	csrRowPtr[m] = count;
	if (count != nnz) {
		printf("This graph is not symmetric\n");
		nnz = count;
	}
	double variance = 0.0;
	int maxdeg = 0;
	int mindeg = m;
	double avgdeg = (m > 0) ? (double)nnz / m : 0.0;
	for (int i = 0; i < m; i++) {
		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
		if (deg_i > maxdeg)
			maxdeg = deg_i;
		if (deg_i < mindeg)
			mindeg = deg_i;
		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
	}
	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
	csrColInd = (int *)malloc(count * sizeof(int));
	if (csrColInd == NULL && count > 0) {
		fprintf(stderr, "out of memory allocating csrColInd\n");
		exit(1);
	}
	int index = 0;
	for (int i = 0; i < m; i++) {
		for (std::set<int>::iterator site = svector[i].begin(); site != svector[i].end(); ++site)
			csrColInd[index++] = *site;
	}
}
// store color of all vertex
// Writes the n entries of coloring to fname, one value per line. When the file
// cannot be opened an error is reported and nothing is written.
void write_solution(char *fname, int *coloring, int n) {
	FILE *fp = fopen(fname, "w");
	if (fp == NULL) {
		fprintf(stderr, "cannot open %s for writing\n", fname);
		return;
	}
	for (int i = 0; i < n; i++) {
		//fprintf(fp, "%d:%d\n", i, coloring[i]);
		fprintf(fp, "%d\n", coloring[i]);
	}
	fclose(fp);
}

// check if correctly coloured
// Sets *correct to 0 when two distinct adjacent vertices share a colour.
// *correct is never set to a non-zero value here, so the caller has to
// initialise it before the call. Self loops are ignored; nnz is not used.
void verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) {
	(void)nnz;
	for (int i = 0; i < m; i++) {
		for (int offset = csrRowPtr[i]; offset < csrRowPtr[i + 1]; offset++) {
			int neighbor_j = csrColInd[offset];
			if (neighbor_j != i && coloring[i] == coloring[neighbor_j]) {
				*correct = 0;
				//printf("coloring[%d] = coloring[%d] = %d\n", i, neighbor_j, coloring[i]);
				// One conflict settles the result, no need to scan the rest.
				return;
			}
		}
	}
}
// transfer *.graph file to CSR format
//   graph     : path of the input file; the first line holds "<vertices> <edges>",
//               line i + 1 lists the 1-based neighbours of vertex i
//   m, nnz    : receive the vertex count and the number of stored CSR entries
//   csrRowPtr : receives a malloc'ed array of m + 1 row offsets
//   csrColInd : receives a malloc'ed array of nnz sorted, 0-based neighbour ids
// Every edge is stored in both directions. The process exits with an error
// message when the file cannot be read or contains an invalid vertex id.
void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
	printf("Reading .graph input file %s\n", graph);
	std::ifstream cfile;
	cfile.open(graph);
	if (!cfile.is_open()) {
		fprintf(stderr, "cannot open input file %s\n", graph);
		exit(1);
	}
	std::string str;
	getline(cfile, str);
	if (sscanf(str.c_str(), "%d %d", &m, &nnz) != 2 || m < 0) {
		fprintf(stderr, "malformed header in %s\n", graph);
		exit(1);
	}
	printf("num_vertices %d num_edges %d\n", m, nnz);
	// One ordered neighbour set per vertex removes duplicate edges.
	std::vector<std::set<int> > svector(m);
	for (int i = 0; i < m; i++) {
		getline(cfile, str);
		std::istringstream istr(str);
		int dst;
		while (istr >> dst) {
			dst --; // file ids are 1-based
			if (dst < 0 || dst >= m) {
				fprintf(stderr, "vertex id %d out of range on adjacency line %d of %s\n", dst + 1, i + 1, graph);
				exit(1);
			}
			svector[i].insert(dst);
			svector[dst].insert(i);
		}
	}
	cfile.close();
	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
	if (csrRowPtr == NULL) {
		fprintf(stderr, "out of memory allocating csrRowPtr\n");
		exit(1);
	}
	int count = 0;
	for (int i = 0; i < m; i++) {
		csrRowPtr[i] = count;
		count += svector[i].size();
	}
	csrRowPtr[m] = count;
	if (count != nnz) {
		printf("This graph is not symmetric\n");
		nnz = count;
	}
	double variance = 0.0;
	int maxdeg = 0;
	int mindeg = m;
	double avgdeg = (m > 0) ? (double)nnz / m : 0.0;
	for (int i = 0; i < m; i++) {
		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
		if (deg_i > maxdeg)
			maxdeg = deg_i;
		if (deg_i < mindeg)
			mindeg = deg_i;
		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
	}
	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
	csrColInd = (int *)malloc(count * sizeof(int));
	if (csrColInd == NULL && count > 0) {
		fprintf(stderr, "out of memory allocating csrColInd\n");
		exit(1);
	}
	int index = 0;
	for (int i = 0; i < m; i++) {
		for (std::set<int>::iterator site = svector[i].begin(); site != svector[i].end(); ++site)
			csrColInd[index++] = *site;
	}
}
char c; 143 | sscanf(str.c_str(), "%c", &c); 144 | while (c == '%') { 145 | getline(cfile, str); 146 | sscanf(str.c_str(), "%c", &c); 147 | } 148 | int n; 149 | sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz); 150 | if (m != n) { 151 | printf("error!\n"); 152 | exit(0); 153 | } 154 | printf("num_vertices %d num_edges %d\n", m, nnz); 155 | vector > svector; 156 | set s; 157 | for (int i = 0; i < m; i++) 158 | svector.push_back(s); 159 | int dst, src; 160 | for (int i = 0; i < nnz; i++) { 161 | getline(cfile, str); 162 | sscanf(str.c_str(), "%d %d", &dst, &src); 163 | dst--; 164 | src--; 165 | svector[src].insert(dst); 166 | svector[dst].insert(src); 167 | } 168 | cfile.close(); 169 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 170 | int count = 0; 171 | for (int i = 0; i < m; i++) { 172 | csrRowPtr[i] = count; 173 | count += svector[i].size(); 174 | } 175 | csrRowPtr[m] = count; 176 | if (count != nnz) { 177 | printf("This graph is not symmetric\n"); 178 | nnz = count; 179 | } 180 | double avgdeg; 181 | double variance = 0.0; 182 | int maxdeg = 0; 183 | int mindeg = m; 184 | avgdeg = (double)nnz / m; 185 | for (int i = 0; i < m; i++) { 186 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 187 | if (deg_i > maxdeg) 188 | maxdeg = deg_i; 189 | if (deg_i < mindeg) 190 | mindeg = deg_i; 191 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 192 | } 193 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 194 | csrColInd = (int *)malloc(count * sizeof(int)); 195 | set::iterator site; 196 | for (int i = 0, index = 0; i < m; i++) { 197 | site = svector[i].begin(); 198 | while (site != svector[i].end()) { 199 | csrColInd[index++] = *site; 200 | site++; 201 | } 202 | } 203 | } 204 | 205 | // store color of all vertex 206 | void write_solution(char *fname, int *coloring, int n) { 207 | int i; 208 | FILE *fp; 209 | fp = fopen(fname, "w"); 210 | for (i = 0; i < n; i++) { 211 | //fprintf(fp, "%d:%d\n", i, coloring[i]); 212 | 
fprintf(fp, "%d\n", coloring[i]); 213 | } 214 | fclose(fp); 215 | } 216 | 217 | // check if correctly coloured 218 | void verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) { 219 | int i, offset, neighbor_j; 220 | for (i = 0; i < m; i++) { 221 | for (offset = csrRowPtr[i]; offset < csrRowPtr[i + 1]; offset++) { 222 | neighbor_j = csrColInd[offset]; 223 | if (coloring[i] == coloring[neighbor_j] && neighbor_j != i) { 224 | *correct = 0; 225 | //printf("coloring[%d] = coloring[%d] = %d\n", i, neighbor_j, coloring[i]); 226 | break; 227 | } 228 | } 229 | } 230 | } 231 | -------------------------------------------------------------------------------- /src/omp/worklist.h: -------------------------------------------------------------------------------- 1 | /* 2 | * use atomicInc to automatically wrap around. 3 | */ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "common.h" 13 | #define MINCAPACITY 65535 14 | #define MAXOVERFLOWS 1 15 | 16 | typedef struct Worklist { 17 | unsigned pushRange(unsigned *start, unsigned nitems); 18 | unsigned push(unsigned work); 19 | unsigned popRange(unsigned *start, unsigned nitems); 20 | unsigned pop(unsigned &work); 21 | void clear(); 22 | void myItems(unsigned &start, unsigned &end); 23 | unsigned getItem(unsigned at); 24 | unsigned getItemWithin(unsigned at, unsigned hsize); 25 | unsigned count(); 26 | 27 | void init(); 28 | void init(unsigned initialcapacity); 29 | void setSize(unsigned hsize); 30 | unsigned getSize(); 31 | void setCapacity(unsigned hcapacity); 32 | unsigned getCapacity(); 33 | void setInitialSize(unsigned hsize); 34 | unsigned calculateSize(unsigned hstart, unsigned hend); 35 | void copyOldToNew(unsigned *olditems, unsigned *newitems, unsigned oldsize, unsigned oldcapacity); 36 | void append(Worklist wl); 37 | 38 | Worklist(); 39 | ~Worklist(); 40 | unsigned ensureSpace(unsigned space); 41 | unsigned *alloc(unsigned 
allocsize); 42 | unsigned realloc(unsigned space); 43 | unsigned dealloc(); 44 | unsigned freeSize(); 45 | unsigned *items; 46 | unsigned start, end; 47 | unsigned capacity; 48 | unsigned noverflows; 49 | } Worklist; 50 | 51 | Worklist::Worklist() { 52 | init(); 53 | } 54 | 55 | void Worklist::init() { 56 | init(0); 57 | } 58 | 59 | void Worklist::init(unsigned initialcapacity) { 60 | setCapacity(initialcapacity); 61 | setInitialSize(0); 62 | items = NULL; 63 | if (initialcapacity) items = alloc(initialcapacity); 64 | noverflows = 0; 65 | } 66 | 67 | unsigned *Worklist::alloc(unsigned allocsize) { 68 | unsigned *ptr = NULL; 69 | if(allocsize > 0) 70 | ptr = (unsigned *)malloc(allocsize * sizeof(unsigned)); 71 | if(ptr == NULL) 72 | printf("%s(%d): Allocating %d failed.\n", __FILE__, __LINE__, allocsize); 73 | return ptr; 74 | } 75 | 76 | unsigned Worklist::getCapacity() { 77 | return capacity; 78 | } 79 | 80 | unsigned Worklist::calculateSize(unsigned hstart, unsigned hend) { 81 | if (hend >= hstart) { 82 | return hend - hstart; 83 | } 84 | // circular queue. 85 | unsigned cap = getCapacity(); 86 | return hend + (cap - hstart + 1); 87 | } 88 | 89 | unsigned Worklist::getSize() { 90 | return calculateSize(start, end); 91 | } 92 | 93 | void Worklist::setCapacity(unsigned cap) { 94 | capacity = cap; 95 | } 96 | 97 | void Worklist::setInitialSize(unsigned size) { 98 | start = 0; 99 | end = 0; 100 | } 101 | 102 | void Worklist::setSize(unsigned size) { 103 | unsigned cap = getCapacity(); 104 | if (size > cap) { 105 | printf("%s(%d): buffer overflow, setting size=%d, when capacity=%d.\n", __FILE__, __LINE__, size, cap); 106 | return; 107 | } 108 | if (start + size < cap) { 109 | end = start + size; 110 | } else { 111 | size -= cap - start; 112 | end = size; 113 | } 114 | } 115 | 116 | void Worklist::copyOldToNew(unsigned *olditems, unsigned *newitems, unsigned oldsize, unsigned oldcapacity) { 117 | if (start < end) { // no wrap-around. 
118 | memcpy(newitems, olditems + start, oldsize * sizeof(unsigned)); 119 | } else { 120 | memcpy(newitems, olditems + start, (oldcapacity - start) * sizeof(unsigned)); 121 | memcpy(newitems + (oldcapacity - start), olditems, end * sizeof(unsigned)); 122 | } 123 | } 124 | 125 | unsigned Worklist::realloc(unsigned space) { 126 | unsigned cap = getCapacity(); 127 | unsigned newcapacity = (space > MINCAPACITY ? space : MINCAPACITY); 128 | if (cap == 0) { 129 | setCapacity(newcapacity); 130 | items = alloc(newcapacity); 131 | if (items == NULL) { 132 | return 1; 133 | } 134 | //printf("\tworklist capacity set to %d.\n", getCapacity()); 135 | } else { 136 | unsigned *itemsrealloc = alloc(newcapacity); 137 | if (itemsrealloc == NULL) { 138 | return 1; 139 | } 140 | unsigned oldsize = getSize(); 141 | copyOldToNew(items, itemsrealloc, oldsize, cap); 142 | dealloc(); 143 | items = itemsrealloc; 144 | setCapacity(newcapacity); 145 | start = 0; 146 | end = oldsize; 147 | printf("\tworklist capacity reset to %d.\n", getCapacity()); 148 | } 149 | return 0; 150 | } 151 | 152 | unsigned Worklist::freeSize() { 153 | return getCapacity() - getSize(); 154 | } 155 | 156 | unsigned Worklist::ensureSpace(unsigned space) { 157 | if (freeSize() >= space) { 158 | return 0; 159 | } 160 | realloc(space); 161 | return 1; 162 | } 163 | 164 | unsigned Worklist::dealloc() { 165 | free(items); 166 | setInitialSize(0); 167 | return 0; 168 | } 169 | 170 | Worklist::~Worklist() { 171 | } 172 | 173 | unsigned Worklist::pushRange(unsigned *copyfrom, unsigned nitems) { 174 | if (copyfrom == NULL || nitems == 0) return 0; 175 | 176 | unsigned lcap = capacity; 177 | unsigned offset = my_fetch_add(&end, nitems); 178 | if (offset >= lcap) { // overflow. 
179 | my_fetch_sub(&end, nitems); 180 | return 1; 181 | } 182 | for (unsigned ii = 0; ii < nitems; ++ii) { 183 | items[(offset + ii) % lcap] = copyfrom[ii]; 184 | } 185 | return 0; 186 | } 187 | 188 | unsigned Worklist::push(unsigned work) { 189 | return pushRange(&work, 1); 190 | } 191 | 192 | unsigned Worklist::popRange(unsigned *copyto, unsigned nitems) { 193 | unsigned currsize = count(); 194 | if (currsize < nitems) { 195 | nitems = currsize; 196 | } 197 | unsigned offset = 0; 198 | unsigned lcap = capacity; 199 | if (nitems) { 200 | if (start + nitems < lcap) { 201 | offset = my_fetch_add(&start, nitems); 202 | } else { 203 | offset = my_fetch_add(&start, start + nitems - lcap); 204 | } 205 | } 206 | // copy nitems starting from offset. 207 | for (unsigned ii = 0; ii < nitems; ++ii) { 208 | copyto[ii] = items[(offset + ii) % lcap]; 209 | } 210 | return nitems; 211 | } 212 | 213 | unsigned Worklist::pop(unsigned &work) { 214 | return popRange(&work, 1); 215 | } 216 | 217 | void Worklist::clear() { 218 | setSize(0); 219 | } 220 | 221 | unsigned Worklist::getItem(unsigned at) { 222 | unsigned size = count(); 223 | return getItemWithin(at, size); 224 | } 225 | 226 | unsigned Worklist::getItemWithin(unsigned at, unsigned size) { 227 | if (at < size) { 228 | return items[at]; 229 | } 230 | return -1; 231 | } 232 | 233 | unsigned Worklist::count() { 234 | if (end >= start) { 235 | return end - start; 236 | } else { 237 | return end + (capacity - start + 1); 238 | } 239 | } 240 | 241 | #define SWAPDEV(a, b) { unsigned tmp = a; a = b; b = tmp; } 242 | void printWorklist(Worklist wl) { 243 | printf("\t"); 244 | for (unsigned ii = wl.start; ii < wl.end; ++ii) { 245 | printf("%d,", wl.getItem(ii)); 246 | } 247 | printf("\n"); 248 | } 249 | 250 | void Worklist::append(Worklist wl) { 251 | unsigned size = getSize(); 252 | for (unsigned ii = 0; ii < wl.count(); ++ii) { 253 | items[size + ii] = wl.items[ii]; 254 | } 255 | end += wl.getSize(); 256 | } 257 | 258 | 
-------------------------------------------------------------------------------- /src/data/kernel_base.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | #include 4 | #include "gbar.cuh" 5 | #include "cuda_launch_config.hpp" 6 | #include "cutil_subset.h" 7 | #include "common.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "worklistc.h" 15 | #define TIMING 16 | #define SCRATCHSIZE BLKSIZE 17 | #define MAXCOLOR 128 // assume graph can be colored with less than 128 colors 18 | typedef cub::BlockScan BlockScan; 19 | 20 | __global__ void initialize(int *coloring, int m) { 21 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 22 | if (id < m) { 23 | coloring[id] = MAXCOLOR; 24 | } 25 | } 26 | 27 | __global__ void firstFit(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, int *coloring) { 28 | int id = blockIdx.x * blockDim.x + threadIdx.x; 29 | bool forbiddenColors[MAXCOLOR+1]; 30 | int vertex; 31 | // get vertex from worklist according to thread id 32 | if (inwl.pop_id(id, vertex)) { 33 | int row_begin = csrRowPtr[vertex]; 34 | int row_end = csrRowPtr[vertex + 1]; 35 | for (int j = 0; j < MAXCOLOR; j++) 36 | forbiddenColors[j] = false; 37 | // traverse all neighbors of current vertex 38 | for (int offset = row_begin; offset < row_end; offset ++) { 39 | int neighbor = csrColInd[offset]; 40 | int color = coloring[neighbor]; 41 | //int color = cub::ThreadLoad(coloring + neighbor); 42 | if(color != MAXCOLOR) 43 | forbiddenColors[color] = true; // mask the color 44 | } 45 | // assign the smallest unforbidden color to vertex 46 | int vertex_color; 47 | for (vertex_color = 0; vertex_color < MAXCOLOR; vertex_color++) { 48 | if (!forbiddenColors[vertex_color]) { 49 | coloring[vertex] = vertex_color; 50 | break; 51 | } 52 | } 53 | assert(vertex_color < MAXCOLOR); 54 | } 55 | } 56 
| 57 | __global__ void conflictResolve(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, Worklist2 outwl, int *coloring) { 58 | int id = blockIdx.x * blockDim.x + threadIdx.x; 59 | int conflicted = 0; // assume vertex not conflicted 60 | int vertex; 61 | if (inwl.pop_id(id, vertex)) { 62 | int row_begin = csrRowPtr[vertex]; 63 | int row_end = csrRowPtr[vertex + 1]; 64 | for (int offset = row_begin; offset < row_end; offset ++) { 65 | int neighbor = csrColInd[offset]; 66 | // if at least one neighbor was assigned the same color as vertex, 67 | // and its vertex number is bigger than vertex, 68 | // then vertex is regarded as conflicting 69 | if (coloring[vertex] == coloring[neighbor] && vertex < neighbor) { 70 | conflicted = 1; 71 | coloring[vertex] = MAXCOLOR; // reset color 72 | break; 73 | } 74 | } 75 | } 76 | outwl.push_1item(conflicted, vertex, BLKSIZE); // push to outwl if conflicted 77 | } 78 | 79 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) { 80 | double starttime, endtime; 81 | double runtime[ITERATIONS]; 82 | int colors[ITERATIONS]; 83 | int iterations[ITERATIONS]; 84 | int *d_csrRowPtr, *d_csrColInd, *d_coloring; 85 | printf("Graph coloring data-driven Base version\n"); 86 | 87 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int))); 88 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int))); 89 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int))); 90 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 91 | #ifdef TIMING 92 | double t1 = rtclock(); 93 | #endif 94 | CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice)); 95 | CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice)); 96 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 97 | #ifdef TIMING 98 | double t2 = rtclock(); 99 | printf("Time of init:%f\n", 1000.0f * (t2 - t1)); 100 | #endif 101 | int device = 0; 102 | int deviceCount = 0; 
103 | CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount)); 104 | cudaDeviceProp deviceProp; 105 | cudaGetDeviceProperties(&deviceProp, device); 106 | int nSM = deviceProp.multiProcessorCount; 107 | fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 108 | deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 109 | const size_t max_blocks_1 = maximum_residency(firstFit, BLKSIZE, 0); 110 | const size_t max_blocks_2 = maximum_residency(conflictResolve, BLKSIZE, 0); 111 | printf("max_blocks_1=%d, max_blocks_2=%d\n", max_blocks_1, max_blocks_2); 112 | 113 | for (int i = 0; i < ITERATIONS; i++) { 114 | Worklist2 inwl(m), outwl(m); 115 | Worklist2 *inwlptr = &inwl, *outwlptr = &outwl; 116 | CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice)); 117 | initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m); 118 | iterations[i] = 0; 119 | 120 | starttime = rtclock(); 121 | int nitems = m; 122 | thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m); 123 | while (nitems > 0) { 124 | iterations[i] ++; 125 | int nblocks = (nitems - 1) / BLKSIZE + 1; 126 | //printf("nitems=%d, nblocks=%d\n", nitems, nblocks); 127 | firstFit<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_coloring); 128 | conflictResolve<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_coloring); 129 | nitems = outwlptr->nitems(); 130 | // swap inwlptr and outwlptr 131 | Worklist2 * tmp = inwlptr; 132 | inwlptr = outwlptr; 133 | outwlptr = tmp; 134 | outwlptr->reset(); // clear outwl 135 | } 136 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 137 | endtime = rtclock(); 138 | //printf("iteration=%d\n", iterations[i]); 139 | runtime[i] = 1000.0f * (endtime - starttime); 140 | colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum()) + 1; 141 | } 142 | #ifdef TIMING 143 | double t3, t4; 144 | t3 = rtclock(); 145 | 
#endif 146 | CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost)); 147 | #ifdef TIMING 148 | t4 = rtclock(); 149 | printf("Time of copy back:%f\n", 1000.0f * (t4 - t3)); 150 | #endif 151 | double total_time = 0.0; 152 | int total_colors = 0; 153 | int total_iterations = 0; 154 | for (int i = 0; i < ITERATIONS; i++) { 155 | total_time += runtime[i]; 156 | total_colors += colors[i]; 157 | total_iterations += iterations[i]; 158 | printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]); 159 | } 160 | double avg_time = (double)total_time / ITERATIONS; 161 | double avg_colors = (double)total_colors / ITERATIONS; 162 | double avg_iterations = (double)total_iterations / ITERATIONS; 163 | printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations); 164 | CUDA_SAFE_CALL(cudaFree(d_csrRowPtr)); 165 | CUDA_SAFE_CALL(cudaFree(d_csrColInd)); 166 | CUDA_SAFE_CALL(cudaFree(d_coloring)); 167 | } 168 | -------------------------------------------------------------------------------- /src/data/kernel_ldg.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | #include 4 | #include "gbar.cuh" 5 | #include "cuda_launch_config.hpp" 6 | #include "cutil_subset.h" 7 | #include "common.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "worklistc.h" 15 | #define SCRATCHSIZE BLKSIZE 16 | #define MAXCOLOR 128 // assume graph can be colored with less than 128 colors 17 | //#define TEXTURE 18 | 19 | typedef cub::BlockScan BlockScan; 20 | #ifdef TEXTURE 21 | texture rowPtr; 22 | texture colInd; 23 | #endif 24 | 25 | __global__ void initialize(int *coloring, int m) { 26 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 27 | if (id < m) { 28 | coloring[id] = MAXCOLOR; 29 | } 30 | } 31 | 32 | #ifdef TEXTURE 33 | __global__ 
void FirstFit(int m, Worklist2 inwl, int *coloring) { 34 | #else 35 | __global__ void FirstFit(int m, const int * __restrict__ csrRowPtr, const int * __restrict__ csrColInd, Worklist2 inwl, int *coloring) { 36 | #endif 37 | int id = blockIdx.x * blockDim.x + threadIdx.x; 38 | bool forbiddenColors[MAXCOLOR+1]; 39 | int vertex; 40 | if (inwl.pop_id(id, vertex)) { 41 | #ifdef TEXTURE 42 | int row_begin = tex1Dfetch(rowPtr, vertex); 43 | int row_end = tex1Dfetch(rowPtr, vertex + 1); 44 | #else 45 | int row_begin = __ldg(csrRowPtr + vertex); 46 | int row_end = __ldg(csrRowPtr + vertex + 1); 47 | #endif 48 | for (int i = 0; i < MAXCOLOR; i ++) 49 | forbiddenColors[i] = false; 50 | for (int offset = row_begin; offset < row_end; offset ++) { 51 | #ifdef TEXTURE 52 | int neighbor = tex1Dfetch(colInd, offset); 53 | #else 54 | int neighbor = __ldg(csrColInd + offset); 55 | #endif 56 | int color = coloring[neighbor]; 57 | forbiddenColors[color] = true; 58 | } 59 | int vertex_color; 60 | for (vertex_color = 0; vertex_color < MAXCOLOR; vertex_color++) { 61 | if (!forbiddenColors[vertex_color]) { 62 | coloring[vertex] = vertex_color; 63 | break; 64 | } 65 | } 66 | assert(vertex_color < MAXCOLOR); 67 | } 68 | } 69 | 70 | #ifdef TEXTURE 71 | __global__ void conflictResolve(int m, Worklist2 inwl, Worklist2 outwl, int *coloring) { 72 | #else 73 | __global__ void conflictResolve(int m, const int * __restrict__ csrRowPtr, const int * __restrict__ csrColInd, Worklist2 inwl, Worklist2 outwl, int *coloring) { 74 | #endif 75 | int id = blockIdx.x * blockDim.x + threadIdx.x; 76 | int conflicted = 0; 77 | int vertex; 78 | if (inwl.pop_id(id, vertex)) { 79 | #ifdef TEXTURE 80 | int row_begin = tex1Dfetch(rowPtr, vertex); 81 | int row_end= tex1Dfetch(rowPtr, vertex + 1); 82 | #else 83 | int row_begin = __ldg(csrRowPtr + vertex); 84 | int row_end= __ldg(csrRowPtr + vertex + 1); 85 | #endif 86 | for (int offset = row_begin; offset < row_end; offset ++) { 87 | #ifdef TEXTURE 88 | int neighbor = 
tex1Dfetch(colInd, offset); 89 | #else 90 | int neighbor = __ldg(csrColInd + offset); 91 | #endif 92 | if (coloring[vertex] == coloring[neighbor] && vertex < neighbor) { 93 | conflicted = 1; 94 | coloring[vertex] = MAXCOLOR; 95 | break; 96 | } 97 | } 98 | } 99 | outwl.push_1item(conflicted, vertex, BLKSIZE); 100 | } 101 | 102 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) { 103 | double starttime, endtime; 104 | double runtime[ITERATIONS]; 105 | int colors[ITERATIONS]; 106 | int iterations[ITERATIONS]; 107 | int *d_csrRowPtr, *d_csrColInd, *d_coloring; 108 | printf("Graph coloring data-driven LDG version\n"); 109 | 110 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int))); 111 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int))); 112 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int))); 113 | CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice)); 114 | CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice)); 115 | #ifdef TEXTURE 116 | CUDA_SAFE_CALL(cudaBindTexture(0, rowPtr, csrRowPtr, (m + 1) * sizeof(int))); 117 | CUDA_SAFE_CALL(cudaBindTexture(0, colInd, csrColInd, (nnz + 1) * sizeof(int))); 118 | #endif 119 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 120 | int device = 0; 121 | int deviceCount = 0; 122 | CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount)); 123 | cudaDeviceProp deviceProp; 124 | cudaGetDeviceProperties(&deviceProp, device); 125 | int nSM = deviceProp.multiProcessorCount; 126 | fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 127 | deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 128 | 129 | for (int i = 0; i < ITERATIONS; i++) { 130 | Worklist2 inwl(m), outwl(m); 131 | Worklist2 *inwlptr = &inwl, *outwlptr = &outwl; 132 | 
CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice)); 133 | initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m); 134 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 135 | iterations[i] = 0; 136 | 137 | starttime = rtclock(); 138 | int nitems = m; 139 | thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m); 140 | int iteration = 0; 141 | while (nitems > 0) { 142 | iterations[i] ++; 143 | int nblocks = (nitems - 1) / BLKSIZE + 1; 144 | #ifdef TEXTURE 145 | FirstFit<<>>(m, *inwlptr, d_coloring); 146 | conflictResolve<<>>(m, *inwlptr, *outwlptr, d_coloring); 147 | #else 148 | FirstFit<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_coloring); 149 | conflictResolve<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_coloring); 150 | #endif 151 | nitems = outwlptr->nitems(); 152 | Worklist2 * tmp = inwlptr; 153 | inwlptr = outwlptr; 154 | outwlptr = tmp; 155 | outwlptr->reset(); 156 | } 157 | cudaDeviceSynchronize(); 158 | endtime = rtclock(); 159 | runtime[i] = 1000.0f * (endtime - starttime); 160 | colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum()) + 1; 161 | } 162 | CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost)); 163 | double total_time = 0.0; 164 | int total_colors = 0; 165 | int total_iterations = 0; 166 | for (int i = 0; i < ITERATIONS; i++) { 167 | total_time += runtime[i]; 168 | total_colors += colors[i]; 169 | total_iterations += iterations[i]; 170 | printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]); 171 | } 172 | double avg_time = (double)total_time / ITERATIONS; 173 | double avg_colors = (double)total_colors / ITERATIONS; 174 | double avg_iterations = (double)total_iterations / ITERATIONS; 175 | printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations); 176 | CUDA_SAFE_CALL(cudaFree(d_csrRowPtr)); 177 | CUDA_SAFE_CALL(cudaFree(d_csrColInd)); 178 | 
CUDA_SAFE_CALL(cudaFree(d_coloring)); 179 | } 180 | -------------------------------------------------------------------------------- /src/data/kernel_comb.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | #include 4 | #include "gbar.cuh" 5 | #include "cuda_launch_config.hpp" 6 | #include "cutil_subset.h" 7 | #include "common.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "worklistc.h" 15 | #define SCRATCHSIZE BLKSIZE 16 | #define MAXCOLOR 128 17 | typedef cub::BlockScan BlockScan; 18 | 19 | __global__ void initialize(int *coloring, int m) { 20 | unsigned id = blockIdx.x * blockDim.x + threadIdx.x; 21 | if (id < m) { 22 | coloring[id] = MAXCOLOR; 23 | } 24 | } 25 | 26 | __device__ __forceinline__ void assignColor(unsigned *forbiddenColors, int *coloring, int vertex) { 27 | int vertex_color; 28 | for (vertex_color = 0; vertex_color < MAXCOLOR/32; vertex_color++) { 29 | int pos = __ffs(forbiddenColors[vertex_color]); 30 | if(pos) { 31 | coloring[vertex] = vertex_color * 32 + pos - 1; 32 | break; 33 | } 34 | } 35 | assert(vertex_color < MAXCOLOR); 36 | } 37 | 38 | __global__ void firstFit(int m, const int* __restrict__ csrRowPtr, const int* __restrict__ csrColInd, Worklist2 inwl, int *coloring) { 39 | //__global__ void firstFit(int m, int* csrRowPtr, int* csrColInd, Worklist2 inwl, int *coloring) { 40 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 41 | unsigned forbiddenColors[MAXCOLOR/32+1]; 42 | int id = tid; 43 | //int total_inputs = (*inwl.dindex + gridDim.x * blockDim.x - 1)/(gridDim.x * blockDim.x); 44 | //for (int id = tid; total_inputs > 0; id += blockDim.x * gridDim.x, total_inputs--) { 45 | int vertex; 46 | if (inwl.pop_id(id, vertex)) { 47 | //int row_begin = csrRowPtr[vertex]; 48 | //int row_end = csrRowPtr[vertex + 1]; 49 | int row_begin = __ldg(csrRowPtr + 
vertex); 50 | int row_end= __ldg(csrRowPtr + vertex + 1); 51 | for (int j = 0; j < MAXCOLOR/32; j++) 52 | forbiddenColors[j] = 0xffffffff; 53 | for (int offset = row_begin; offset < row_end; offset ++) { 54 | //int neighbor = csrColInd[offset]; 55 | int neighbor = __ldg(csrColInd + offset); 56 | int color = coloring[neighbor]; 57 | forbiddenColors[color / 32] &= ~(1 << (color % 32)); 58 | } 59 | assignColor(forbiddenColors, coloring, vertex); 60 | } 61 | //} 62 | } 63 | 64 | __device__ __forceinline__ void conflictDetect1(int src, int dst, int *coloring, bool &is_conflict) { 65 | int color_s = coloring[src]; 66 | int color_d = coloring[dst]; 67 | if (color_s == color_d && src < dst) { 68 | is_conflict = 1; 69 | coloring[src] = MAXCOLOR; 70 | } 71 | } 72 | 73 | __device__ __forceinline__ bool conflictDetect2(int src, int dst, int *coloring, int *degree, bool &is_conflict) { 74 | if (coloring[src] == coloring[dst]) { 75 | bool is_victim; 76 | if (degree[src] == degree[dst]) 77 | is_victim = (src < dst) ? true : false; 78 | else is_victim = (degree[src] < degree[dst]) ? 
true : false; 79 | if (is_victim) { 80 | is_conflict = 1; 81 | coloring[src] = MAXCOLOR; 82 | } 83 | } 84 | } 85 | 86 | __global__ void conflictResolve(int m, const int* __restrict__ csrRowPtr, const int* __restrict__ csrColInd, Worklist2 inwl, Worklist2 outwl, int * degree, int *coloring) { 87 | //__global__ void conflictResolve(int m, int* csrRowPtr, int* csrColInd, Worklist2 inwl, Worklist2 outwl, int * degree, int *coloring) { 88 | int id = blockIdx.x * blockDim.x + threadIdx.x; 89 | bool is_conflict = 0; 90 | int vertex; 91 | if (inwl.pop_id(id, vertex)) { 92 | //int row_begin = csrRowPtr[vertex]; 93 | //int row_end = csrRowPtr[vertex + 1]; 94 | int row_begin = __ldg(csrRowPtr + vertex); 95 | int row_end= __ldg(csrRowPtr + vertex + 1); 96 | for (int offset = row_begin; offset < row_end; offset ++) { 97 | //int neighbor = csrColInd[offset]; 98 | int neighbor = __ldg(csrColInd + offset); 99 | //conflictDetect1(vertex, neighbor, coloring, is_conflict); 100 | conflictDetect2(vertex, neighbor, coloring, degree, is_conflict); 101 | if(is_conflict) break; 102 | } 103 | } 104 | outwl.push_1item((int)is_conflict, vertex, BLKSIZE); 105 | } 106 | 107 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) { 108 | double starttime, endtime; 109 | double runtime[ITERATIONS]; 110 | int colors[ITERATIONS]; 111 | int iterations[ITERATIONS]; 112 | int *d_csrRowPtr, *d_csrColInd, *d_coloring, *d_degree; 113 | printf("Graph coloring data-driven Combination A version\n"); 114 | int *degree = (int *)malloc(m * sizeof(int)); 115 | for(int i = 0; i < m; i ++) { 116 | degree[i] = csrRowPtr[i + 1] - csrRowPtr[i]; 117 | } 118 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int))); 119 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int))); 120 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_degree, m * sizeof(int))); 121 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int))); 122 | 
CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice)); 123 | CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice)); 124 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 125 | int device = 0; 126 | int deviceCount = 0; 127 | CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount)); 128 | cudaDeviceProp deviceProp; 129 | cudaGetDeviceProperties(&deviceProp, 0); 130 | int nSM = deviceProp.multiProcessorCount; 131 | //int nSM = num_SMs; 132 | fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 133 | deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 134 | 135 | const size_t max_blocks = maximum_residency(firstFit, BLKSIZE, 0); 136 | printf("max_blocks=%d\n", max_blocks); 137 | 138 | for (int i = 0; i < ITERATIONS; i++) { 139 | Worklist2 inwl(m), outwl(m); 140 | Worklist2 *inwlptr = &inwl, *outwlptr = &outwl; 141 | CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice)); 142 | CUDA_SAFE_CALL(cudaMemcpy(d_degree, degree, m * sizeof(int), cudaMemcpyHostToDevice)); 143 | initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m); 144 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 145 | iterations[i] = 0; 146 | 147 | starttime = rtclock(); 148 | int nitems = m; 149 | thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m); 150 | while (nitems > 0) { 151 | iterations[i] ++; 152 | //printf("in_nitems[%d]=%d\n", iteration, nitems); 153 | int nblocks = (nitems - 1) / BLKSIZE + 1; 154 | int nblocks_1 = nSM * max_blocks; 155 | if(nblocks < nblocks_1) nblocks_1 = nblocks; 156 | firstFit<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_coloring); 157 | conflictResolve<<>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_degree, d_coloring); 158 | nitems = outwlptr->nitems(); 159 | Worklist2 * tmp = inwlptr; 160 | inwlptr = outwlptr; 161 | outwlptr = tmp; 
162 | outwlptr->reset(); 163 | } 164 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 165 | endtime = rtclock(); 166 | runtime[i] = 1000.0f * (endtime - starttime); 167 | colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum()) + 1; 168 | } 169 | CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost)); 170 | double total_time = 0.0; 171 | int total_colors = 0; 172 | int total_iterations = 0; 173 | for (int i = 0; i < ITERATIONS; i++) { 174 | total_time += runtime[i]; 175 | total_colors += colors[i]; 176 | total_iterations += iterations[i]; 177 | printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]); 178 | } 179 | double avg_time = (double)total_time / ITERATIONS; 180 | double avg_colors = (double)total_colors / ITERATIONS; 181 | double avg_iterations = (double)total_iterations / ITERATIONS; 182 | printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations); 183 | CUDA_SAFE_CALL(cudaFree(d_csrRowPtr)); 184 | CUDA_SAFE_CALL(cudaFree(d_csrColInd)); 185 | CUDA_SAFE_CALL(cudaFree(d_coloring)); 186 | } 187 | -------------------------------------------------------------------------------- /src/data/main.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "lonestargpu.h" 12 | #include "variants.h" 13 | using namespace std; 14 | 15 | #ifndef ITERATIONS 16 | #define ITERATIONS 1 17 | #endif 18 | #ifndef BLKSIZE 19 | #define BLKSIZE 128 20 | #endif 21 | 22 | // transfer R-MAT generated gr graph to CSR format 23 | void gr2csr(char *gr, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 24 | printf("Reading RMAT (.gr) input file %s\n", gr); 25 | std::ifstream cfile; 26 | cfile.open(gr); 27 | std::string str; 
28 | getline(cfile, str); 29 | char c; 30 | sscanf(str.c_str(), "%c", &c); 31 | while (c == 'c') { 32 | getline(cfile, str); 33 | sscanf(str.c_str(), "%c", &c); 34 | } 35 | char sp[3]; 36 | sscanf(str.c_str(), "%c %s %d %d", &c, sp, &m, &nnz); 37 | printf("num_vertices %d num_edges %d\n", m, nnz); 38 | //printf("%c %s %d %d\n", c, sp, m, nnz); 39 | vector > svector; 40 | set s; 41 | for (int i = 0; i < m; i++) 42 | svector.push_back(s); 43 | int dst, src; 44 | for (int i = 0; i < nnz; i++) { 45 | getline(cfile, str); 46 | sscanf(str.c_str(), "%c %d %d", &c, &src, &dst); 47 | 48 | if (c != 'a') 49 | printf("line %d\n", __LINE__); 50 | dst--; 51 | src--; 52 | svector[src].insert(dst); 53 | svector[dst].insert(src); 54 | } 55 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 56 | int count = 0; 57 | for (int i = 0; i < m; i++) { 58 | csrRowPtr[i] = count; 59 | count += svector[i].size(); 60 | } 61 | csrRowPtr[m] = count; 62 | if (count != nnz) { 63 | printf("This graph is not symmetric\n"); 64 | nnz = count; 65 | } 66 | double avgdeg; 67 | double variance = 0.0; 68 | int maxdeg = 0; 69 | int mindeg = m; 70 | avgdeg = (double)nnz / m; 71 | for (int i = 0; i < m; i++) { 72 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 73 | if (deg_i > maxdeg) 74 | maxdeg = deg_i; 75 | if (deg_i < mindeg) 76 | mindeg = deg_i; 77 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 78 | } 79 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 80 | csrColInd = (int *)malloc(count * sizeof(int)); 81 | set::iterator site; 82 | for (int i = 0, index = 0; i < m; i++) { 83 | site = svector[i].begin(); 84 | while (site != svector[i].end()) { 85 | csrColInd[index++] = *site; 86 | site++; 87 | } 88 | } 89 | } 90 | 91 | // transfer *.graph file to CSR format 92 | void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 93 | printf("Reading .graph input file %s\n", graph); 94 | std::ifstream cfile; 95 | cfile.open(graph); 96 | 
std::string str; 97 | getline(cfile, str); 98 | sscanf(str.c_str(), "%d %d", &m, &nnz); 99 | printf("num_vertices %d num_edges %d\n", m, nnz); 100 | vector > svector; 101 | set s; 102 | for (int i = 0; i < m; i++) 103 | svector.push_back(s); 104 | int dst; 105 | for (int i = 0; i < m; i++) { 106 | getline(cfile, str); 107 | istringstream istr; 108 | istr.str(str); 109 | while(istr>>dst) { 110 | dst --; 111 | svector[i].insert(dst); 112 | svector[dst].insert(i); 113 | } 114 | istr.clear(); 115 | } 116 | cfile.close(); 117 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 118 | int count = 0; 119 | for (int i = 0; i < m; i++) { 120 | csrRowPtr[i] = count; 121 | count += svector[i].size(); 122 | } 123 | csrRowPtr[m] = count; 124 | if (count != nnz) { 125 | printf("This graph is not symmetric\n"); 126 | nnz = count; 127 | } 128 | double avgdeg; 129 | double variance = 0.0; 130 | int maxdeg = 0; 131 | int mindeg = m; 132 | avgdeg = (double)nnz / m; 133 | for (int i = 0; i < m; i++) { 134 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 135 | if (deg_i > maxdeg) 136 | maxdeg = deg_i; 137 | if (deg_i < mindeg) 138 | mindeg = deg_i; 139 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 140 | } 141 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 142 | csrColInd = (int *)malloc(count * sizeof(int)); 143 | set::iterator site; 144 | for (int i = 0, index = 0; i < m; i++) { 145 | site = svector[i].begin(); 146 | while (site != svector[i].end()) { 147 | csrColInd[index++] = *site; 148 | site++; 149 | } 150 | } 151 | } 152 | 153 | // transfer mtx graph to CSR format 154 | void mtx2csr(char *mtx, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 155 | printf("Reading (.mtx) input file %s\n", mtx); 156 | std::ifstream cfile; 157 | cfile.open(mtx); 158 | std::string str; 159 | getline(cfile, str); 160 | char c; 161 | sscanf(str.c_str(), "%c", &c); 162 | while (c == '%') { 163 | getline(cfile, str); 164 | sscanf(str.c_str(), 
"%c", &c); 165 | } 166 | int n; 167 | sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz); 168 | if (m != n) { 169 | printf("error!\n"); 170 | exit(0); 171 | } 172 | printf("num_vertices %d num_edges %d\n", m, nnz); 173 | vector > svector; 174 | set s; 175 | for (int i = 0; i < m; i++) 176 | svector.push_back(s); 177 | int dst, src; 178 | for (int i = 0; i < nnz; i++) { 179 | getline(cfile, str); 180 | sscanf(str.c_str(), "%d %d", &dst, &src); 181 | dst--; 182 | src--; 183 | svector[src].insert(dst); 184 | svector[dst].insert(src); 185 | } 186 | cfile.close(); 187 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 188 | int count = 0; 189 | for (int i = 0; i < m; i++) { 190 | csrRowPtr[i] = count; 191 | count += svector[i].size(); 192 | } 193 | csrRowPtr[m] = count; 194 | if (count != nnz) { 195 | printf("This graph is not symmetric\n"); 196 | nnz = count; 197 | } 198 | double avgdeg; 199 | double variance = 0.0; 200 | int maxdeg = 0; 201 | int mindeg = m; 202 | avgdeg = (double)nnz / m; 203 | for (int i = 0; i < m; i++) { 204 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 205 | if (deg_i > maxdeg) 206 | maxdeg = deg_i; 207 | if (deg_i < mindeg) 208 | mindeg = deg_i; 209 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 210 | } 211 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 212 | csrColInd = (int *)malloc(count * sizeof(int)); 213 | set::iterator site; 214 | for (int i = 0, index = 0; i < m; i++) { 215 | site = svector[i].begin(); 216 | while (site != svector[i].end()) { 217 | csrColInd[index++] = *site; 218 | site++; 219 | } 220 | } 221 | } 222 | 223 | // store colour of all vertex 224 | void write_solution(char *fname, int *coloring, int n) { 225 | int i; 226 | FILE *fp; 227 | fp = fopen(fname, "w"); 228 | for (i = 0; i < n; i++) { 229 | //fprintf(fp, "%d:%d\n", i, coloring[i]); 230 | fprintf(fp, "%d\n", coloring[i]); 231 | } 232 | fclose(fp); 233 | } 234 | 235 | // check if correctly coloured 236 | void 
verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) { 237 | int i, offset, neighbor_j; 238 | for (i = 0; i < m; i++) { 239 | for (offset = csrRowPtr[i]; offset < csrRowPtr[i + 1]; offset++) { 240 | neighbor_j = csrColInd[offset]; 241 | if (coloring[i] == coloring[neighbor_j] && neighbor_j != i) { 242 | *correct = 0; 243 | //printf("coloring[%d] = coloring[%d] = %d\n", i, neighbor_j, coloring[i]); 244 | break; 245 | } 246 | } 247 | } 248 | } 249 | 250 | int main(int argc, char *argv[]) { 251 | if (argc < 2) { 252 | printf("Usage: %s \n", argv[0]); 253 | exit(1); 254 | } 255 | int m, nnz, *csrRowPtr = NULL, *csrColInd = NULL; 256 | // read graph 257 | if (strstr(argv[1], ".mtx")) 258 | mtx2csr(argv[1], m, nnz, csrRowPtr, csrColInd); 259 | else if (strstr(argv[1], ".graph")) 260 | graph2csr(argv[1], m, nnz, csrRowPtr, csrColInd); 261 | else if (strstr(argv[1], ".gr")) 262 | gr2csr(argv[1], m, nnz, csrRowPtr, csrColInd); 263 | else { printf("Unrecognizable input file format\n"); exit(0); } 264 | int *coloring = (int *)calloc(m, sizeof(int)); 265 | int correct = 1; 266 | int num_SMs; 267 | if (argc > 2) { 268 | num_SMs = atoi(argv[2]); 269 | printf("block_size=%d, num_SMs=%d\n", BLKSIZE, num_SMs); 270 | } 271 | #if VARIANT==DATA_LDB 272 | color_ldb(m, nnz, csrRowPtr, csrColInd, coloring, num_SMs); 273 | #else 274 | color(m, nnz, csrRowPtr, csrColInd, coloring, num_SMs); 275 | #endif 276 | write_solution("color.txt", coloring, m); 277 | verify(m, nnz, csrRowPtr, csrColInd, coloring, &correct); 278 | if (correct) 279 | printf("correct.\n"); 280 | else 281 | printf("incorrect.\n"); 282 | return 0; 283 | } 284 | -------------------------------------------------------------------------------- /src/GM/tree.cpp: -------------------------------------------------------------------------------- 1 | #include "tree.h" 2 | #include 3 | 4 | using namespace std; 5 | 6 | 7 | /* 8 | int main(){ 9 | tree graph; 10 | node *temp; 11 | 12 | 13 | node *nodes = 
new node[7]; 14 | int degList[7] = {4,2,4,5,10,4,7}; 15 | 16 | for (int i=0; i<7; i++){ 17 | nodes[i].setKSD(i, 0, degList[i]); 18 | graph.insert(&nodes[i]); 19 | } 20 | 21 | 22 | cout << "RML" << endl; 23 | graph.displayTreeRML(graph.getTop()); 24 | 25 | 26 | 27 | temp = graph.remove(1,0,2); 28 | cout << endl << "Deleted: "; temp->displayNode(); 29 | 30 | cout << "RML" << endl; 31 | graph.displayTreeRML(graph.getTop()); 32 | 33 | 34 | cout << endl << "Deleted: "; temp = graph.remove(6,0,7); 35 | temp->displayNode(); 36 | 37 | cout << "RML" << endl; 38 | graph.displayTreeRML(graph.getTop()); 39 | 40 | 41 | return 0; 42 | } 43 | */ 44 | 45 | 46 | 47 | node::node(){ 48 | key = saturation = degree = color = -1; 49 | 50 | left = NULL; 51 | right = NULL; 52 | } 53 | 54 | node::node(int index, int sat, int deg){ 55 | key = index; 56 | saturation = sat; 57 | degree = deg; 58 | 59 | color = -1; 60 | 61 | left = NULL; 62 | right = NULL; 63 | } 64 | 65 | node::node(int index, int sat, int deg, int col, node *L, node *R){ 66 | key = index; 67 | saturation = sat; 68 | degree = deg; 69 | color = col; 70 | left = L; 71 | right = R; 72 | } 73 | 74 | 75 | int node::getKey(){ 76 | return key; 77 | } 78 | 79 | int node::getSaturation(){ 80 | return saturation; 81 | } 82 | 83 | int node::getDegree(){ 84 | return degree; 85 | } 86 | 87 | int node::getColor(){ 88 | return color; 89 | } 90 | 91 | node* node::getLeft(){ 92 | return left; 93 | } 94 | 95 | node* node::getRight(){ 96 | return right; 97 | } 98 | 99 | 100 | void node::setKey(int index){ 101 | key = index; 102 | } 103 | 104 | void node::setSaturation(int sat){ 105 | saturation = sat; 106 | } 107 | 108 | void node::setDegree(int deg){ 109 | degree = deg; 110 | } 111 | 112 | void node::setKSD(int index, int sat, int deg){ 113 | key = index; 114 | saturation = sat; 115 | degree = deg; 116 | color = -1; 117 | left = NULL; 118 | right = NULL; 119 | } 120 | 121 | void node::setColor(int c){ 122 | color = c; 123 | } 124 | 125 | void 
node::setLeft(node *L){ 126 | left = L; 127 | } 128 | 129 | void node::setRight(node *R){ 130 | right = R; 131 | } 132 | 133 | 134 | void node::displayNode(){ 135 | cout << key << " : Sat: " << saturation << " , Deg: " << degree << endl; 136 | } 137 | 138 | 139 | node::~node(){ 140 | left = NULL; 141 | right = NULL; 142 | } 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | tree::tree(){ 152 | top = NULL; 153 | } 154 | 155 | void tree::insert(node *x){ 156 | node *current, *previous; 157 | bool left; 158 | 159 | if (top == NULL) 160 | top = x; 161 | else 162 | { 163 | current = top; 164 | 165 | // Check to see where to insert 166 | while (current != NULL){ 167 | previous = current; 168 | 169 | if (current->getSaturation() < x->getSaturation()){ 170 | current = current->getRight(); 171 | left = false; 172 | } 173 | else 174 | if (current->getSaturation() > x->getSaturation()){ 175 | current = current->getLeft(); 176 | left = true; 177 | } 178 | else 179 | if (current->getDegree() < x->getDegree()){ 180 | current = current->getRight(); 181 | left = false; 182 | } 183 | else 184 | if (current->getDegree() >= x->getDegree()){ 185 | current = current->getLeft(); 186 | left = true; 187 | } 188 | } 189 | 190 | // Insert item 191 | if (left == true) 192 | previous->setLeft(x); 193 | else 194 | previous->setRight(x); 195 | } 196 | } 197 | 198 | 199 | node* tree::findNode(int index, int saturation, int degree){ 200 | node *current, *previous; 201 | 202 | current = top; 203 | 204 | while ((current != NULL) && (current->getKey() != index)){ 205 | previous = current; 206 | 207 | if (current->getSaturation() < saturation) 208 | current = current->getRight(); 209 | else 210 | if (current->getSaturation() > saturation) 211 | current = current->getLeft(); 212 | else 213 | if (current->getDegree() < degree) 214 | current = current->getRight(); 215 | else 216 | if (current->getDegree() >= degree) 217 | current = current->getLeft(); 218 | } 219 | 220 | return current; 221 | } 222 | 
223 | node* tree::remove(int index, int saturation, int degree){ 224 | node *current, *previous, *parent, *nodeToDel; 225 | bool left, parentLeft; 226 | parent = previous = current = top; 227 | //node blank; 228 | 229 | if (top == NULL){ 230 | cout << "Tree is empty!!!" << endl; 231 | } 232 | else{ 233 | // step1: find the node 234 | while ((current != NULL) && (current->getKey() != index)){ 235 | previous = current; 236 | 237 | if (current->getSaturation() < saturation){ 238 | current = current->getRight(); 239 | left = false; 240 | } 241 | else 242 | if (current->getSaturation() > saturation){ 243 | current = current->getLeft(); 244 | left = true; 245 | } 246 | else 247 | if (current->getDegree() < degree){ 248 | current = current->getRight(); 249 | left = false; 250 | } 251 | else 252 | if (current->getDegree() >= degree){ 253 | current = current->getLeft(); 254 | left = true; 255 | } 256 | } 257 | 258 | 259 | // Not found!!! 260 | if (current == NULL){ 261 | cout << "Node not found!!!" << endl; 262 | return NULL; 263 | } 264 | 265 | 266 | 267 | // Replace 268 | parent = previous; 269 | parentLeft = left; 270 | nodeToDel = current; 271 | 272 | // blank.setKey(nodeToDel->getKey()); 273 | // blank.setSaturation(nodeToDel->getSaturation()); 274 | // blank.setDegree(nodeToDel->getDegree()); 275 | 276 | 277 | // Option 1: A leaf; replace by nothing!!! 
278 | if ((current->getLeft() == NULL) && (current->getRight() == NULL)){ 279 | if (parentLeft == true) 280 | parent->setLeft(NULL); 281 | else 282 | parent->setRight(NULL); 283 | 284 | if (top == nodeToDel) 285 | top = NULL; 286 | 287 | return nodeToDel; 288 | } 289 | 290 | //Option 2: Node had only 1 child 291 | if ((current->getLeft() == NULL) || (current->getRight() == NULL)){ 292 | if (current->getLeft() == NULL) 293 | current = current->getRight(); 294 | else 295 | current = current->getLeft(); 296 | 297 | if (top == nodeToDel) 298 | top = current; 299 | else 300 | if (parentLeft == true) 301 | parent->setLeft(current); 302 | else 303 | parent->setRight(current); 304 | 305 | return nodeToDel; 306 | } 307 | 308 | 309 | 310 | //Option 3: Node had 2 Children - the painful one: replace by node slightly biggest (normally the rightmost of the left node) 311 | previous = current; 312 | current = current->getLeft(); 313 | 314 | if (current->getRight() == NULL){ 315 | if (top == nodeToDel) 316 | top = current; 317 | else 318 | if (parentLeft == true) 319 | parent->setLeft(current); 320 | else 321 | parent->setRight(current); 322 | 323 | current->setRight(nodeToDel->getRight()); 324 | } 325 | else{ 326 | while (current->getRight() != NULL){ 327 | previous = current; 328 | current = current->getRight(); 329 | } 330 | 331 | 332 | if (current->getLeft() == NULL) // replaced node is a leaf 333 | previous->setRight(NULL); 334 | else 335 | previous->setRight(current->getLeft()); // node has left children 336 | 337 | 338 | current->setLeft(nodeToDel->getLeft()); 339 | current->setRight(nodeToDel->getRight()); 340 | 341 | if (top == nodeToDel) 342 | top = current; 343 | else 344 | if (parentLeft == true) 345 | parent->setLeft(current); 346 | else 347 | parent->setRight(current); 348 | } 349 | 350 | return nodeToDel; 351 | } 352 | } 353 | 354 | 355 | void tree::findBiggest(int &index, int &saturation, int °ree){ 356 | node *current, temp; 357 | 358 | current = top; 359 | 360 | 
while (current->getRight() != NULL){ 361 | current = current->getRight(); 362 | } 363 | 364 | index = current->getKey(); 365 | saturation = current->getSaturation(); 366 | degree = current->getDegree(); 367 | } 368 | 369 | 370 | 371 | void tree::displayTreeRML(node *current){ 372 | 373 | if (current != NULL) 374 | { 375 | displayTreeRML(current->getRight()); 376 | current->displayNode(); 377 | displayTreeRML(current->getLeft()); 378 | } 379 | } 380 | 381 | void tree::displayTreeLMR(node *current){ 382 | 383 | if (current != NULL) 384 | { 385 | displayTreeLMR(current->getLeft()); 386 | current->displayNode(); 387 | displayTreeLMR(current->getRight()); 388 | } 389 | } 390 | 391 | void tree::displayTreeMLR(node *current){ 392 | 393 | if (current != NULL) 394 | { 395 | current->displayNode(); 396 | displayTreeMLR(current->getLeft()); 397 | displayTreeMLR(current->getRight()); 398 | } 399 | } 400 | 401 | 402 | node* tree::getTop(){ 403 | return top; 404 | } 405 | 406 | tree::~tree(){ 407 | top = NULL; 408 | } 409 | 410 | -------------------------------------------------------------------------------- /src/serial/greedy.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | using namespace std; 16 | #include "graph_io.h" 17 | /* 18 | // transfer *.graph file to CSR format 19 | void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 20 | printf("Reading .graph input file %s\n", graph); 21 | std::ifstream cfile; 22 | cfile.open(graph); 23 | std::string str; 24 | getline(cfile, str); 25 | sscanf(str.c_str(), "%d %d", &m, &nnz); 26 | printf("num_vertices %d num_edges %d\n", m, nnz); 27 | vector > svector; 28 | set s; 29 | for (int i = 0; i < m; i++) 
30 | svector.push_back(s); 31 | int dst; 32 | for (int i = 0; i < m; i++) { 33 | getline(cfile, str); 34 | istringstream istr; 35 | istr.str(str); 36 | while(istr>>dst) { 37 | dst --; 38 | svector[i].insert(dst); 39 | svector[dst].insert(i); 40 | } 41 | istr.clear(); 42 | } 43 | cfile.close(); 44 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 45 | int count = 0; 46 | for (int i = 0; i < m; i++) { 47 | csrRowPtr[i] = count; 48 | count += svector[i].size(); 49 | } 50 | csrRowPtr[m] = count; 51 | if (count != nnz) { 52 | printf("The graph is not symmetric\n"); 53 | nnz = count; 54 | } 55 | double avgdeg; 56 | double variance = 0.0; 57 | int maxdeg = 0; 58 | int mindeg = m; 59 | avgdeg = (double)nnz / m; 60 | for (int i = 0; i < m; i++) { 61 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 62 | if (deg_i > maxdeg) 63 | maxdeg = deg_i; 64 | if (deg_i < mindeg) 65 | mindeg = deg_i; 66 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 67 | } 68 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 69 | csrColInd = (int *)malloc(count * sizeof(int)); 70 | set::iterator site; 71 | for (int i = 0, index = 0; i < m; i++) { 72 | site = svector[i].begin(); 73 | while (site != svector[i].end()) { 74 | csrColInd[index++] = *site; 75 | site++; 76 | } 77 | } 78 | } 79 | 80 | void gr2csr(char *gr, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 81 | printf("Reading RMAT (.gr) input file %s\n", gr); 82 | std::ifstream cfile; 83 | cfile.open(gr); 84 | std::string str; 85 | getline(cfile, str); 86 | char c; 87 | sscanf(str.c_str(), "%c", &c); 88 | while (c == 'c') { 89 | getline(cfile, str); 90 | sscanf(str.c_str(), "%c", &c); 91 | } 92 | char sp[3]; 93 | sscanf(str.c_str(), "%c %s %d %d", &c, sp, &m, &nnz); 94 | printf("num_vertices %d num_edges %d\n", m, nnz); 95 | //printf("%c %s %d %d\n", c, sp, m, nnz); 96 | vector > svector; 97 | set s; 98 | for (int i = 0; i < m; i++) 99 | svector.push_back(s); 100 | int dst, src; 101 | for 
(int i = 0; i < nnz; i++) { 102 | getline(cfile, str); 103 | sscanf(str.c_str(), "%c %d %d", &c, &src, &dst); 104 | if (c != 'a') 105 | printf("line %d\n", __LINE__); 106 | dst--; 107 | src--; 108 | svector[src].insert(dst); 109 | svector[dst].insert(src); 110 | } 111 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 112 | int count = 0; 113 | for (int i = 0; i < m; i++) { 114 | csrRowPtr[i] = count; 115 | count += svector[i].size(); 116 | } 117 | csrRowPtr[m] = count; 118 | if (count != nnz) { 119 | printf("The graph is not symmetric\n"); 120 | nnz = count; 121 | } 122 | double avgdeg; 123 | double variance = 0.0; 124 | int maxdeg = 0; 125 | int mindeg = m; 126 | avgdeg = (double)nnz / m; 127 | for (int i = 0; i < m; i++) { 128 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 129 | if (deg_i > maxdeg) 130 | maxdeg = deg_i; 131 | if (deg_i < mindeg) 132 | mindeg = deg_i; 133 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 134 | } 135 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 136 | csrColInd = (int *)malloc(count * sizeof(int)); 137 | set::iterator site; 138 | for (int i = 0, index = 0; i < m; i++) { 139 | site = svector[i].begin(); 140 | while (site != svector[i].end()) { 141 | csrColInd[index++] = *site; 142 | site++; 143 | } 144 | } 145 | } 146 | 147 | void mtx2csr(char *mtx, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 148 | printf("Reading .mtx input file %s\n", mtx); 149 | std::ifstream cfile; 150 | cfile.open(mtx); 151 | std::string str; 152 | getline(cfile, str); 153 | char c; 154 | sscanf(str.c_str(), "%c", &c); 155 | while (c == '%') { 156 | getline(cfile, str); 157 | sscanf(str.c_str(), "%c", &c); 158 | } 159 | int n; 160 | sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz); 161 | if (m != n) { 162 | printf("error!\n"); 163 | exit(0); 164 | } 165 | printf("num_vertices %d num_edges %d\n", m, nnz); 166 | vector > svector; 167 | set s; 168 | for (int i = 0; i < m; i++) 169 | 
svector.push_back(s); 170 | int dst, src; 171 | for (int i = 0; i < nnz; i++) { 172 | getline(cfile, str); 173 | sscanf(str.c_str(), "%d %d", &dst, &src); 174 | dst--; 175 | src--; 176 | svector[src].insert(dst); 177 | svector[dst].insert(src); 178 | } 179 | cfile.close(); 180 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 181 | int count = 0; 182 | for (int i = 0; i < m; i++) { 183 | csrRowPtr[i] = count; 184 | count += svector[i].size(); 185 | } 186 | csrRowPtr[m] = count; 187 | if (count != nnz) { 188 | printf("The graph is not symmetric\n"); 189 | nnz = count; 190 | } 191 | double avgdeg; 192 | double variance = 0.0; 193 | int maxdeg = 0; 194 | int mindeg = m; 195 | avgdeg = (double)nnz / m; 196 | for (int i = 0; i < m; i++) { 197 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 198 | if (deg_i > maxdeg) 199 | maxdeg = deg_i; 200 | if (deg_i < mindeg) 201 | mindeg = deg_i; 202 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 203 | } 204 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 205 | csrColInd = (int *)malloc(count * sizeof(int)); 206 | set::iterator site; 207 | for (int i = 0, index = 0; i < m; i++) { 208 | site = svector[i].begin(); 209 | while (site != svector[i].end()) { 210 | csrColInd[index++] = *site; 211 | site++; 212 | } 213 | } 214 | } 215 | 216 | void write_solution(char *fname, int *coloring, int n) { 217 | int i; 218 | FILE *fp; 219 | fp = fopen(fname, "w"); 220 | for (i = 0; i < n; i++) { 221 | fprintf(fp, "%d\n", coloring[i]); 222 | } 223 | fclose(fp); 224 | } 225 | 226 | void verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) { 227 | int i, offset, neighbor_j; 228 | for (i = 0; i < m; i++) { 229 | for (offset = csrRowPtr[i]; offset < csrRowPtr[i + 1]; offset++) { 230 | neighbor_j = csrColInd[offset]; 231 | if (coloring[i] == coloring[neighbor_j] && neighbor_j != i) { 232 | *correct = 0; 233 | printf("coloring[%d] = coloring[%d] = %d\n", i, neighbor_j, 
coloring[i]); 234 | return; 235 | } 236 | } 237 | } 238 | } 239 | //*/ 240 | 241 | double rtclock() { 242 | struct timezone Tzp; 243 | struct timeval Tp; 244 | int stat; 245 | stat = gettimeofday (&Tp, &Tzp); 246 | if (stat != 0) printf("Error return from gettimeofday: %d",stat); 247 | return(Tp.tv_sec + Tp.tv_usec*1.0e-6); 248 | } 249 | 250 | #define MAXCOLOR 128 251 | void FirstFit(int m, int nnz, int *csrRowPtr, int *csrColInd, int *ncolors, int *coloring) { 252 | int max_color = 1; 253 | int vertex; 254 | int forbiddenColors[MAXCOLOR+1]; 255 | for (int i = 0; i < MAXCOLOR; i ++) 256 | forbiddenColors[i] = -1; 257 | for (vertex = 0; vertex < m; vertex++) { 258 | int row_begin = csrRowPtr[vertex]; 259 | int row_end = csrRowPtr[vertex + 1]; 260 | for (int offset = row_begin; offset < row_end; offset++) { 261 | int neighbor = csrColInd[offset]; 262 | forbiddenColors[coloring[neighbor]] = vertex; 263 | } 264 | int vertex_color = 1; 265 | while (vertex_color < max_color && forbiddenColors[vertex_color] == vertex) 266 | vertex_color++; 267 | if (vertex_color == max_color) 268 | max_color++; 269 | assert(vertex_color < MAXCOLOR); 270 | coloring[vertex] = vertex_color; 271 | } 272 | *ncolors = max_color - 1; 273 | } 274 | 275 | int main(int argc, char *argv[]) { 276 | int iteration = 0; 277 | int m, nnz, *csrRowPtr = NULL, *csrColInd = NULL; 278 | if (strstr(argv[1], ".mtx")) 279 | mtx2csr(argv[1], m, nnz, csrRowPtr, csrColInd); 280 | else if (strstr(argv[1], ".graph")) 281 | graph2csr(argv[1], m, nnz, csrRowPtr, csrColInd); 282 | else if (strstr(argv[1], ".gr")) 283 | gr2csr(argv[1], m, nnz, csrRowPtr, csrColInd); 284 | else 285 | { printf("Unrecognizable input file format\n"); exit(0); } 286 | int ncolors, *coloring, correct; 287 | ncolors = 0; 288 | coloring = (int *)calloc(m, sizeof(int)); 289 | correct = 1; 290 | double starttime, endtime; 291 | double runtime[10]; 292 | int colors[10]; 293 | for (int i = 0; i < 10; i++) { 294 | memset(coloring, 0, m * 
sizeof(int)); 295 | starttime = rtclock(); 296 | FirstFit(m, nnz, csrRowPtr, csrColInd, &ncolors, coloring); 297 | endtime = rtclock(); 298 | runtime[i] = (1000.0f) * (endtime - starttime); 299 | colors[i] = ncolors; 300 | } 301 | double total_time = 0; 302 | int total_colors = 0; 303 | double avg_time; 304 | double avg_colors; 305 | for (int i = 0; i < 10; i++) { 306 | printf("[%.2f %d] ", runtime[i], colors[i]); 307 | total_time += runtime[i]; 308 | total_colors += colors[i]; 309 | } 310 | printf("\navg_time %f ms, avg_colors %.2f\n", total_time / 10, (double)total_colors / 10); 311 | write_solution("color.txt", coloring, m); 312 | verify(m, nnz, csrRowPtr, csrColInd, coloring, &correct); 313 | if (correct) 314 | printf("correct.\n"); 315 | else 316 | printf("incorrect.\n"); 317 | return 0; 318 | } 319 | -------------------------------------------------------------------------------- /src/omp/main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "common.h" 17 | #include "worklist.h" 18 | typedef unsigned foru; 19 | #include "graph.h" 20 | int num_omp_threads; 21 | using namespace std; 22 | //#include "kernel1.h" 23 | #include "kernel2.h" 24 | 25 | void gr2csr(char *gr, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 26 | printf("Reading RMAT (.gr) input file %s\n", gr); 27 | std::ifstream cfile; 28 | cfile.open(gr); 29 | std::string str; 30 | getline(cfile, str); 31 | char c; 32 | sscanf(str.c_str(), "%c", &c); 33 | while (c == 'c') { 34 | getline(cfile, str); 35 | sscanf(str.c_str(), "%c", &c); 36 | } 37 | char sp[3]; 38 | sscanf(str.c_str(), "%c %s %d %d", &c, sp, &m, &nnz); 39 | printf("num_vertices %d num_edges %d\n", m, nnz); 
40 | vector > svector; 41 | set s; 42 | for (int i = 0; i < m; i++) 43 | svector.push_back(s); 44 | int dst, src; 45 | for (int i = 0; i < nnz; i++) { 46 | getline(cfile, str); 47 | sscanf(str.c_str(), "%c %d %d", &c, &src, &dst); 48 | 49 | if (c != 'a') 50 | printf("line %d\n", __LINE__); 51 | dst--; 52 | src--; 53 | svector[src].insert(dst); 54 | svector[dst].insert(src); 55 | } 56 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 57 | int count = 0; 58 | for (int i = 0; i < m; i++) { 59 | csrRowPtr[i] = count; 60 | count += svector[i].size(); 61 | } 62 | csrRowPtr[m] = count; 63 | nnz = count; 64 | double avgdeg; 65 | double variance = 0.0; 66 | int maxdeg = 0; 67 | int mindeg = m; 68 | avgdeg = (double)nnz / m; 69 | for (int i = 0; i < m; i++) { 70 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 71 | if (deg_i > maxdeg) 72 | maxdeg = deg_i; 73 | if (deg_i < mindeg) 74 | mindeg = deg_i; 75 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 76 | } 77 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 78 | csrColInd = (int *)malloc(count * sizeof(int)); 79 | set::iterator site; 80 | for (int i = 0, index = 0; i < m; i++) { 81 | site = svector[i].begin(); 82 | while (site != svector[i].end()) { 83 | csrColInd[index++] = *site; 84 | site++; 85 | } 86 | } 87 | } 88 | 89 | 90 | // transfer *.graph file to CSR format 91 | void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 92 | printf("Reading .graph input file %s\n", graph); 93 | std::ifstream cfile; 94 | cfile.open(graph); 95 | std::string str; 96 | getline(cfile, str); 97 | sscanf(str.c_str(), "%d %d", &m, &nnz); 98 | printf("num_vertices %d num_edges %d\n", m, nnz); 99 | vector > svector; 100 | set s; 101 | for (int i = 0; i < m; i++) 102 | svector.push_back(s); 103 | int dst; 104 | for (int i = 0; i < m; i++) { 105 | getline(cfile, str); 106 | istringstream istr; 107 | istr.str(str); 108 | while(istr>>dst) { 109 | dst --; 110 | 
svector[i].insert(dst); 111 | svector[dst].insert(i); 112 | } 113 | istr.clear(); 114 | } 115 | cfile.close(); 116 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 117 | int count = 0; 118 | for (int i = 0; i < m; i++) { 119 | csrRowPtr[i] = count; 120 | count += svector[i].size(); 121 | } 122 | csrRowPtr[m] = count; 123 | if (count != nnz) { 124 | printf("The graph is not symmetric\n"); 125 | nnz = count; 126 | } 127 | double avgdeg; 128 | double variance = 0.0; 129 | int maxdeg = 0; 130 | int mindeg = m; 131 | avgdeg = (double)nnz / m; 132 | for (int i = 0; i < m; i++) { 133 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 134 | if (deg_i > maxdeg) 135 | maxdeg = deg_i; 136 | if (deg_i < mindeg) 137 | mindeg = deg_i; 138 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 139 | } 140 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 141 | csrColInd = (int *)malloc(count * sizeof(int)); 142 | set::iterator site; 143 | for (int i = 0, index = 0; i < m; i++) { 144 | site = svector[i].begin(); 145 | while (site != svector[i].end()) { 146 | csrColInd[index++] = *site; 147 | site++; 148 | } 149 | } 150 | } 151 | 152 | void mtx2csr(char *mtx, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 153 | printf("Reading (.mtx) input file %s\n", mtx); 154 | std::ifstream cfile; 155 | cfile.open(mtx); 156 | std::string str; 157 | getline(cfile, str); 158 | char c; 159 | sscanf(str.c_str(), "%c", &c); 160 | while (c == '%') { 161 | getline(cfile, str); 162 | sscanf(str.c_str(), "%c", &c); 163 | } 164 | int n; 165 | sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz); 166 | if (m != n) { 167 | printf("error!\n"); 168 | exit(0); 169 | } 170 | printf("num_vertices %d num_edges %d\n", m, nnz); 171 | vector > svector; 172 | set s; 173 | for (int i = 0; i < m; i++) 174 | svector.push_back(s); 175 | int dst, src; 176 | for (int i = 0; i < nnz; i++) { 177 | getline(cfile, str); 178 | sscanf(str.c_str(), "%d %d", &dst, &src); 179 | dst--; 180 
| src--; 181 | svector[src].insert(dst); 182 | svector[dst].insert(src); 183 | } 184 | cfile.close(); 185 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 186 | int count = 0; 187 | for (int i = 0; i < m; i++) { 188 | csrRowPtr[i] = count; 189 | count += svector[i].size(); 190 | } 191 | csrRowPtr[m] = count; 192 | if (count != nnz) { 193 | printf("This graph is not symmetric\n"); 194 | nnz = count; 195 | } 196 | double avgdeg; 197 | double variance = 0.0; 198 | int maxdeg = 0; 199 | int mindeg = m; 200 | avgdeg = (double)nnz / m; 201 | for (int i = 0; i < m; i++) { 202 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 203 | if (deg_i > maxdeg) 204 | maxdeg = deg_i; 205 | if (deg_i < mindeg) 206 | mindeg = deg_i; 207 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 208 | } 209 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 210 | csrColInd = (int *)malloc(count * sizeof(int)); 211 | set::iterator site; 212 | for (int i = 0, index = 0; i < m; i++) { 213 | site = svector[i].begin(); 214 | while (site != svector[i].end()) { 215 | csrColInd[index++] = *site; 216 | site++; 217 | } 218 | } 219 | } 220 | 221 | void verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) { 222 | int i, offset, neighbor_j; 223 | for (i = 0; i < m; i++) { 224 | for (offset = csrRowPtr[i]; offset < csrRowPtr[i + 1]; offset++) { 225 | neighbor_j = csrColInd[offset]; 226 | if (coloring[i] == coloring[neighbor_j] && neighbor_j != i) { 227 | *correct = 0; 228 | printf("coloring[%d] = coloring[%d] = %d\n", i, neighbor_j, coloring[i]); 229 | break; 230 | } 231 | } 232 | } 233 | } 234 | 235 | void write_solution(char *fname, int nnodes, int *coloring) { 236 | int i; 237 | FILE *fp = fopen(fname, "w"); 238 | for (i = 0; i < nnodes; i++) { 239 | fprintf(fp, "%d\n", coloring[i]); 240 | } 241 | fclose(fp); 242 | } 243 | 244 | void mtx2edges(char *mtx, char *edges) { 245 | std::ifstream cfile; 246 | cfile.open(mtx); 247 | 
std::string str; 248 | getline(cfile, str); 249 | char c; 250 | sscanf(str.c_str(), "%c", &c); 251 | while (c == '%') { 252 | getline(cfile, str); 253 | sscanf(str.c_str(), "%c", &c); 254 | } 255 | int m, n, nnz; 256 | sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz); 257 | if (m != n) { 258 | printf("error!\n"); 259 | exit(0); 260 | } 261 | vector > svector; 262 | set s; 263 | for (int i = 0; i < m; i++) 264 | svector.push_back(s); 265 | 266 | FILE *fp = fopen(edges, "w"); 267 | 268 | int dst, src; 269 | for (int i = 0; i < nnz; i++) { 270 | getline(cfile, str); 271 | sscanf(str.c_str(), "%d %d", &dst, &src); 272 | dst--; 273 | src--; 274 | svector[src].insert(dst); 275 | svector[dst].insert(src); 276 | } 277 | cfile.close(); 278 | int count = 0; 279 | for (int i = 0; i < m; i++) { 280 | count += svector[i].size(); 281 | } 282 | fprintf(fp, "%d %d\n", m, count); 283 | set::iterator site; 284 | for (int i = 0; i < m; i++) { 285 | site = svector[i].begin(); 286 | while (site != svector[i].end()) { 287 | fprintf(fp, "%d %d\n", i, *site); 288 | site++; 289 | } 290 | } 291 | fclose(fp); 292 | } 293 | 294 | void verify(Graph &graph, int *coloring, int *correct) { 295 | int nnodes = graph.nnodes; 296 | int i, j, neighbors, neighbor_j; 297 | for (i = 0; i < nnodes; i++) { 298 | neighbors = graph.noutgoing[i]; 299 | for (j = 0; j < neighbors; j++) { 300 | neighbor_j = graph.edgessrcdst[graph.psrc[i] + j]; 301 | if (coloring[i] == coloring[neighbor_j] && neighbor_j != i) { 302 | *correct = 0; 303 | printf("colors[%d] = colors[%d] = %d\n", i, neighbor_j, coloring[i]); 304 | break; 305 | } 306 | } 307 | } 308 | } 309 | 310 | int main(int argc, char *argv[]) { 311 | if (argc != 3) { 312 | printf("Usage: %s \n", argv[0]); 313 | exit(1); 314 | } 315 | num_omp_threads = atoi(argv[1]); 316 | #ifdef ENABLE_OPENMP 317 | omp_set_num_threads(num_omp_threads); 318 | printf("OpenMP graph coloring by Xuhao Chen, num_omp_threads=%d\n", num_omp_threads); 319 | #endif 320 | int m, nnz, 
*csrRowPtr = NULL, *csrColInd = NULL; 321 | if (strstr(argv[2], ".mtx")) 322 | mtx2csr(argv[2], m, nnz, csrRowPtr, csrColInd); 323 | else if (strstr(argv[2], ".graph")) 324 | graph2csr(argv[2], m, nnz, csrRowPtr, csrColInd); 325 | else if (strstr(argv[2], ".gr")) 326 | gr2csr(argv[2], m, nnz, csrRowPtr, csrColInd); 327 | else 328 | { printf("Unrecognizable input file format\n"); exit(0); } 329 | int *coloring, correct; 330 | coloring = (int *)calloc(m, sizeof(int)); 331 | correct = 1; 332 | color(m, nnz, csrRowPtr, csrColInd, coloring); 333 | verify(m, nnz, csrRowPtr, csrColInd, coloring, &correct); 334 | if (correct) 335 | printf("correct\n"); 336 | else 337 | printf("incorrect\n"); 338 | write_solution("coloring.txt", m, coloring); 339 | return 0; 340 | } 341 | -------------------------------------------------------------------------------- /src/omp/graph.h: -------------------------------------------------------------------------------- 1 | #ifndef LSG_GRAPH 2 | #define LSG_GRAPH 3 | 4 | #define MYINFINITY 1000000000 5 | #define DISTANCETHRESHOLD 150 6 | #define THRESHOLDDEGREE 10 7 | 8 | typedef struct Graph { 9 | enum {NotAllocated, AllocatedOnHost, AllocatedOnDevice} memory; 10 | 11 | unsigned read(char file[]); 12 | long unsigned cudaCopy(struct Graph ©graph); 13 | unsigned optimize(); 14 | unsigned printStats(); 15 | void print(); 16 | 17 | Graph(); 18 | ~Graph(); 19 | unsigned init(); 20 | unsigned allocOnHost(); 21 | unsigned allocOnDevice(); 22 | unsigned dealloc(); 23 | unsigned deallocOnHost(); 24 | unsigned deallocOnDevice(); 25 | unsigned optimizeone(); 26 | unsigned optimizetwo(); 27 | void allocLevels(); 28 | void freeLevels(); 29 | void progressPrint(unsigned maxii, unsigned ii); 30 | unsigned readFromEdges(char file[]); 31 | unsigned readFromGR(char file[]); 32 | unsigned getOutDegree(unsigned src); 33 | unsigned getDestination(unsigned src, unsigned nthedge); 34 | unsigned getFirstEdge(unsigned src); 35 | foru getWeight(unsigned src, unsigned 
nthedge); 36 | 37 | unsigned nnodes, nedges; 38 | unsigned *noutgoing, *nincoming, *srcsrc, *psrc, *edgessrcdst; 39 | foru *edgessrcwt; 40 | unsigned *levels; 41 | unsigned source; 42 | 43 | unsigned *maxOutDegree, *maxInDegree; 44 | unsigned diameter; 45 | bool foundStats; 46 | 47 | } Graph; 48 | 49 | unsigned Graph::init() { 50 | noutgoing = nincoming = srcsrc = psrc = edgessrcdst = NULL; 51 | edgessrcwt = NULL; 52 | source = 0; 53 | nnodes = nedges = 0; 54 | memory = NotAllocated; 55 | 56 | maxOutDegree = maxInDegree = NULL; 57 | diameter = 0; 58 | foundStats = false; 59 | 60 | return 0; 61 | } 62 | 63 | unsigned Graph::allocOnHost() { 64 | edgessrcdst = (unsigned int *)malloc((nedges+1) * sizeof(unsigned int)); // first entry acts as null. 65 | edgessrcwt = (foru *)malloc((nedges+1) * sizeof(foru)); // first entry acts as null. 66 | psrc = (unsigned int *)calloc(nnodes+1, sizeof(unsigned int)); // init to null. 67 | psrc[nnodes] = nedges; // last entry points to end of edges, to avoid thread divergence in drelax. 68 | noutgoing = (unsigned int *)calloc(nnodes, sizeof(unsigned int)); // init to 0. 69 | nincoming = (unsigned int *)calloc(nnodes, sizeof(unsigned int)); // init to 0. 
70 | srcsrc = (unsigned int *)malloc(nnodes * sizeof(unsigned int)); 71 | 72 | maxOutDegree = (unsigned *)malloc(sizeof(unsigned)); 73 | maxInDegree = (unsigned *)malloc(sizeof(unsigned)); 74 | *maxOutDegree = 0; 75 | *maxInDegree = 0; 76 | 77 | memory = AllocatedOnHost; 78 | return 0; 79 | } 80 | 81 | unsigned Graph::deallocOnHost() { 82 | free(noutgoing); 83 | free(nincoming); 84 | free(srcsrc); 85 | free(psrc); 86 | free(edgessrcdst); 87 | free(edgessrcwt); 88 | 89 | free(maxOutDegree); 90 | free(maxInDegree); 91 | return 0; 92 | } 93 | 94 | unsigned Graph::dealloc() { 95 | switch (memory) { 96 | case AllocatedOnHost: 97 | printf("dealloc on host.\n"); 98 | deallocOnHost(); 99 | break; 100 | case AllocatedOnDevice: 101 | printf("dealloc on device.\n"); 102 | // deallocOnDevice(); 103 | break; 104 | } 105 | return 0; 106 | } 107 | 108 | Graph::Graph() { 109 | init(); 110 | } 111 | 112 | Graph::~Graph() { 113 | } 114 | 115 | //TODO: make optimizations use the graph api. 116 | unsigned Graph::optimizeone() { 117 | unsigned int nvv = nnodes; // no of vertices to be optimized. 118 | unsigned int insertindex = 1; // because ii starts with 0. 119 | 120 | for (unsigned ii = 0; ii < nvv; ++ii) { 121 | unsigned src = srcsrc[ii]; 122 | unsigned dstindex = psrc[src]; 123 | unsigned degree = noutgoing[src]; 124 | if (degree && srcsrc[edgessrcdst[dstindex]] > src + DISTANCETHRESHOLD) { 125 | unsigned int nee = degree; 126 | for (unsigned ee = 0; ee < nee; ++ee) { 127 | unsigned dst = edgessrcdst[dstindex + ee]; 128 | unsigned dstentry = srcsrc[dst]; 129 | // swap insertindex and dst. 
130 | unsigned temp = psrc[insertindex]; 131 | psrc[insertindex] = psrc[dstentry]; 132 | psrc[dstentry] = temp; 133 | 134 | temp = srcsrc[ii]; 135 | srcsrc[ii] = srcsrc[dst]; 136 | srcsrc[dst] = temp; 137 | 138 | if (++insertindex >= nnodes) { 139 | break; 140 | } 141 | } 142 | if (insertindex >= nnodes) { 143 | break; 144 | } 145 | } 146 | } 147 | return 0; 148 | } 149 | 150 | unsigned Graph::optimizetwo() { 151 | // load balance. 152 | unsigned int nvv = nnodes / 2; 153 | bool firsthalfsmaller = true; 154 | unsigned int temp; 155 | 156 | for (unsigned ii = 0; ii < nvv; ++ii) { 157 | unsigned one = ii; 158 | unsigned two = nvv + ii; 159 | unsigned degreeone = noutgoing[one]; 160 | unsigned degreetwo = noutgoing[two]; 161 | 162 | if (degreeone > degreetwo && degreeone - degreetwo > THRESHOLDDEGREE && !firsthalfsmaller || degreetwo > degreeone && degreetwo - degreeone > THRESHOLDDEGREE && firsthalfsmaller) { 163 | temp = srcsrc[one]; 164 | srcsrc[one] = srcsrc[two]; 165 | srcsrc[two] = temp; 166 | 167 | temp = psrc[one]; 168 | psrc[one] = psrc[two]; 169 | psrc[two] = temp; 170 | firsthalfsmaller = !firsthalfsmaller; 171 | } 172 | } 173 | return 0; 174 | } 175 | 176 | unsigned Graph::optimize() { 177 | optimizeone(); 178 | optimizetwo(); 179 | return 0; 180 | } 181 | 182 | void Graph::progressPrint(unsigned maxii, unsigned ii) { 183 | const unsigned nsteps = 10; 184 | unsigned ineachstep = (maxii / nsteps); 185 | if(ineachstep == 0) ineachstep = 1; 186 | /*if (ii == maxii) { 187 | printf("\t100%%\n"); 188 | } else*/ if (ii % ineachstep == 0) { 189 | printf("\t%3d%%\r", ii*100/maxii + 1); 190 | fflush(stdout); 191 | } 192 | } 193 | 194 | unsigned Graph::readFromEdges(char file[]) { 195 | std::ifstream cfile; 196 | cfile.open(file); 197 | 198 | std::string str; 199 | getline(cfile, str); 200 | sscanf(str.c_str(), "%d %d", &nnodes, &nedges); 201 | 202 | allocOnHost(); 203 | for (unsigned ii = 0; ii < nnodes; ++ii) { 204 | srcsrc[ii] = ii; 205 | } 206 | 207 | 208 | 
unsigned int prevnode = 0; 209 | unsigned int tempsrcnode; 210 | unsigned int ncurroutgoing = 0; 211 | for (unsigned ii = 0; ii < nedges; ++ii) { 212 | getline(cfile, str); 213 | sscanf(str.c_str(), "%d %d %d", &tempsrcnode, &edgessrcdst[ii+1], &edgessrcwt[ii+1]); 214 | if (prevnode == tempsrcnode) { 215 | if (ii == 0) { 216 | psrc[tempsrcnode] = ii + 1; 217 | } 218 | ++ncurroutgoing; 219 | } else { 220 | psrc[tempsrcnode] = ii + 1; 221 | if (ncurroutgoing) { 222 | noutgoing[prevnode] = ncurroutgoing; 223 | } 224 | prevnode = tempsrcnode; 225 | ncurroutgoing = 1; // not 0. 226 | } 227 | ++nincoming[edgessrcdst[ii+1]]; 228 | 229 | progressPrint(nedges, ii); 230 | } 231 | noutgoing[prevnode] = ncurroutgoing; // last entries. 232 | 233 | cfile.close(); 234 | return 0; 235 | } 236 | 237 | unsigned Graph::readFromGR(char file[]) { 238 | std::ifstream cfile; 239 | cfile.open(file); 240 | 241 | // copied from GaloisCpp/trunk/src/FileGraph.h 242 | int masterFD = open(file, O_RDONLY); 243 | if (masterFD == -1) { 244 | printf("FileGraph::structureFromFile: unable to open %s.\n", file); 245 | return 1; 246 | } 247 | 248 | struct stat buf; 249 | int f = fstat(masterFD, &buf); 250 | if (f == -1) { 251 | printf("FileGraph::structureFromFile: unable to stat %s.\n", file); 252 | abort(); 253 | } 254 | size_t masterLength = buf.st_size; 255 | 256 | int _MAP_BASE = MAP_PRIVATE; 257 | //#ifdef MAP_POPULATE 258 | // _MAP_BASE |= MAP_POPULATE; 259 | //#endif 260 | 261 | void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); 262 | if (m == MAP_FAILED) { 263 | m = 0; 264 | printf("FileGraph::structureFromFile: mmap failed.\n"); 265 | abort(); 266 | } 267 | 268 | double starttime, endtime; 269 | starttime = rtclock(); 270 | 271 | //parse file 272 | uint64_t* fptr = (uint64_t*)m; 273 | __attribute__((unused)) uint64_t version = le64toh(*fptr++); 274 | assert(version == 1); 275 | uint64_t sizeEdgeTy = le64toh(*fptr++); 276 | uint64_t numNodes = le64toh(*fptr++); 277 | uint64_t 
numEdges = le64toh(*fptr++); 278 | uint64_t *outIdx = fptr; 279 | fptr += numNodes; 280 | uint32_t *fptr32 = (uint32_t*)fptr; 281 | uint32_t *outs = fptr32; 282 | fptr32 += numEdges; 283 | if (numEdges % 2) fptr32 += 1; 284 | unsigned *edgeData = (unsigned *)fptr32; 285 | 286 | 287 | // cuda. 288 | nnodes = numNodes; 289 | nedges = numEdges; 290 | 291 | printf("nnodes=%d, nedges=%d.\n", nnodes, nedges); 292 | allocOnHost(); 293 | 294 | for (unsigned ii = 0; ii < nnodes; ++ii) { 295 | // fill unsigned *noutgoing, *nincoming, *srcsrc, *psrc, *edgessrcdst; foru *edgessrcwt; 296 | srcsrc[ii] = ii; 297 | if (ii > 0) { 298 | psrc[ii] = le64toh(outIdx[ii - 1]) + 1; 299 | noutgoing[ii] = le64toh(outIdx[ii]) - le64toh(outIdx[ii - 1]); 300 | } else { 301 | psrc[0] = 1; 302 | noutgoing[0] = le64toh(outIdx[0]); 303 | } 304 | for (unsigned jj = 0; jj < noutgoing[ii]; ++jj) { 305 | unsigned edgeindex = psrc[ii] + jj; 306 | unsigned dst = le32toh(outs[edgeindex - 1]); 307 | if (dst >= nnodes) printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, edgeindex); 308 | edgessrcdst[edgeindex] = dst; 309 | edgessrcwt[edgeindex] = edgeData[edgeindex - 1]; 310 | 311 | ++nincoming[dst]; 312 | //if (ii == 194 || ii == 352) { 313 | // printf("edge %d: %d->%d, wt=%d.\n", edgeindex, ii, dst, edgessrcwt[edgeindex]); 314 | //} 315 | } 316 | progressPrint(nnodes, ii); 317 | } 318 | 319 | cfile.close(); // probably galois doesn't close its file due to mmap. 
320 | 321 | endtime = rtclock(); 322 | 323 | printf("read %lld bytes in %0.2f ms (%0.2f MB/s)\n", masterLength, 1000 * (endtime - starttime), (masterLength / 1048576) / (endtime - starttime)); 324 | 325 | return 0; 326 | } 327 | unsigned Graph::read(char file[]) { 328 | if (strstr(file, ".edges")) { 329 | return readFromEdges(file); 330 | } else if (strstr(file, ".gr")) { 331 | return readFromGR(file); 332 | } 333 | return 0; 334 | } 335 | 336 | unsigned Graph::getOutDegree(unsigned src) { 337 | if (src < nnodes) { 338 | return noutgoing[src]; 339 | } 340 | return 0; 341 | } 342 | 343 | unsigned Graph::getDestination(unsigned src, unsigned nthedge) { 344 | if (src < nnodes && nthedge < getOutDegree(src)) { 345 | unsigned edge = getFirstEdge(src) + nthedge; 346 | if (edge && edge < nedges + 1) { 347 | return edgessrcdst[edge]; 348 | } 349 | return nnodes; 350 | } 351 | if (src < nnodes) { 352 | printf("Error: %s(%d): node %d: edge %d out of bounds %d.\n", __FILE__, __LINE__, src, nthedge, getOutDegree(src)); 353 | } else { 354 | printf("Error: %s(%d): node %d out of bounds %d.\n", __FILE__, __LINE__, src, nnodes); 355 | } 356 | return nnodes; 357 | } 358 | 359 | foru Graph::getWeight(unsigned src, unsigned nthedge) { 360 | if (src < nnodes && nthedge < getOutDegree(src)) { 361 | unsigned edge = getFirstEdge(src) + nthedge; 362 | if (edge && edge < nedges + 1) { 363 | return edgessrcwt[edge]; 364 | } 365 | return MYINFINITY; 366 | } 367 | if (src < nnodes) { 368 | printf("Error: %s(%d): node %d: edge %d out of bounds %d.\n", __FILE__, __LINE__, src, nthedge, getOutDegree(src)); 369 | } else { 370 | printf("Error: %s(%d): node %d out of bounds %d.\n", __FILE__, __LINE__, src, nnodes); 371 | } 372 | return MYINFINITY; 373 | } 374 | 375 | unsigned Graph::getFirstEdge(unsigned src) { 376 | if (src < nnodes) { 377 | unsigned srcnout = getOutDegree(src); 378 | if (srcnout > 0 && srcsrc[src] < nnodes) { 379 | return psrc[srcsrc[src]]; 380 | } 381 | printf("Error: %s(%d): 
edge %d out of bounds %d.\n", __FILE__, __LINE__, 0, srcnout); 382 | return 0; 383 | } 384 | printf("Error: %s(%d): node %d out of bounds %d.\n", __FILE__, __LINE__, src, nnodes); 385 | return 0; 386 | } 387 | 388 | #endif 389 | -------------------------------------------------------------------------------- /src/csrcolor/csrcolor.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2016, National University of Defense Technology 2 | // Authors: Xuhao Chen and Pingfan Li 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "cusparse.h" 10 | #include "cuda.h" 11 | #include 12 | #include 13 | #include 14 | #include "cutil_subset.h" 15 | #include "common.h" 16 | #include 17 | #include 18 | using namespace std; 19 | 20 | void gr2csr(char *gr, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 21 | printf("Reading RMAT (.gr) input file %s\n", gr); 22 | std::ifstream cfile; 23 | cfile.open(gr); 24 | std::string str; 25 | getline(cfile, str); 26 | char c; 27 | sscanf(str.c_str(), "%c", &c); 28 | while (c == 'c') { 29 | getline(cfile, str); 30 | sscanf(str.c_str(), "%c", &c); 31 | } 32 | char sp[3]; 33 | sscanf(str.c_str(), "%c %s %d %d", &c, sp, &m, &nnz); 34 | //printf("%c %s %d %d\n", c, sp, m, nnz); 35 | printf("num_vertices %d num_edges %d\n", m, nnz); 36 | vector > svector; 37 | set s; 38 | for (int i = 0; i < m; i++) 39 | svector.push_back(s); 40 | int dst, src; 41 | for (int i = 0; i < nnz; i++) { 42 | getline(cfile, str); 43 | sscanf(str.c_str(), "%c %d %d", &c, &src, &dst); 44 | if (c != 'a') 45 | printf("line %d\n", __LINE__); 46 | dst--; 47 | src--; 48 | svector[src].insert(dst); 49 | svector[dst].insert(src); 50 | } 51 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 52 | int count = 0; 53 | for (int i = 0; i < m; i++) { 54 | csrRowPtr[i] = count; 55 | count += svector[i].size(); 56 | } 57 | csrRowPtr[m] = count; 58 | if (count != nnz) { 59 | printf("The graph is not 
symmetric\n"); 60 | nnz = count; 61 | } 62 | double avgdeg; 63 | double variance = 0.0; 64 | int maxdeg = 0; 65 | int mindeg = m; 66 | avgdeg = (double)nnz / m; 67 | for (int i = 0; i < m; i++) { 68 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 69 | if (deg_i > maxdeg) 70 | maxdeg = deg_i; 71 | if (deg_i < mindeg) 72 | mindeg = deg_i; 73 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 74 | } 75 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 76 | csrColInd = (int *)malloc(count * sizeof(int)); 77 | set::iterator site; 78 | for (int i = 0, index = 0; i < m; i++) { 79 | site = svector[i].begin(); 80 | while (site != svector[i].end()) { 81 | csrColInd[index++] = *site; 82 | site++; 83 | } 84 | } 85 | } 86 | 87 | // transfer *.graph file to CSR format 88 | void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 89 | printf("Reading .graph input file %s\n", graph); 90 | std::ifstream cfile; 91 | cfile.open(graph); 92 | std::string str; 93 | getline(cfile, str); 94 | sscanf(str.c_str(), "%d %d", &m, &nnz); 95 | printf("num_vertices %d num_edges %d\n", m, nnz); 96 | vector > svector; 97 | set s; 98 | for (int i = 0; i < m; i++) 99 | svector.push_back(s); 100 | int dst; 101 | for (int i = 0; i < m; i++) { 102 | getline(cfile, str); 103 | istringstream istr; 104 | istr.str(str); 105 | while(istr>>dst) { 106 | dst --; 107 | svector[i].insert(dst); 108 | svector[dst].insert(i); 109 | } 110 | istr.clear(); 111 | } 112 | cfile.close(); 113 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 114 | int count = 0; 115 | for (int i = 0; i < m; i++) { 116 | csrRowPtr[i] = count; 117 | count += svector[i].size(); 118 | } 119 | csrRowPtr[m] = count; 120 | if (count != nnz) { 121 | printf("The graph is not symmetric\n"); 122 | nnz = count; 123 | } 124 | double avgdeg; 125 | double variance = 0.0; 126 | int maxdeg = 0; 127 | int mindeg = m; 128 | avgdeg = (double)nnz / m; 129 | for (int i = 0; i < m; i++) { 
130 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 131 | if (deg_i > maxdeg) 132 | maxdeg = deg_i; 133 | if (deg_i < mindeg) 134 | mindeg = deg_i; 135 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 136 | } 137 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 138 | csrColInd = (int *)malloc(count * sizeof(int)); 139 | set::iterator site; 140 | for (int i = 0, index = 0; i < m; i++) { 141 | site = svector[i].begin(); 142 | while (site != svector[i].end()) { 143 | csrColInd[index++] = *site; 144 | site++; 145 | } 146 | } 147 | } 148 | 149 | void write_solution(char *fname, int m, int *coloring) { 150 | FILE *fp = fopen(fname, "w"); 151 | int i; 152 | for (i = 0; i < m; i++) { 153 | fprintf(fp, "%d\n", coloring[i]); 154 | } 155 | fclose(fp); 156 | } 157 | 158 | void verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) { 159 | int i, j, neighbors, start, neighbor_j; 160 | for (i = 0; i < m; i++) { 161 | start = csrRowPtr[i]; 162 | neighbors = csrRowPtr[i + 1] - start; 163 | for (j = 0; j < neighbors; j++) { 164 | neighbor_j = csrColInd[start + j]; 165 | if (coloring[i] == coloring[neighbor_j] && i != neighbor_j) { 166 | *correct = 0; 167 | printf("coloring[%d] = coloring[%d] = %d\n", i, neighbor_j, coloring[i]); 168 | } 169 | break; 170 | } 171 | } 172 | } 173 | 174 | void mtx2csr(char *mtx, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) { 175 | printf("Reading .mtx input file %s\n", mtx); 176 | std::ifstream cfile; 177 | cfile.open(mtx); 178 | std::string str; 179 | getline(cfile, str); 180 | char c; 181 | sscanf(str.c_str(), "%c", &c); 182 | while (c == '%') { 183 | getline(cfile, str); 184 | sscanf(str.c_str(), "%c", &c); 185 | } 186 | int n; 187 | sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz); 188 | if (m != n) { 189 | printf("error!\n"); 190 | exit(0); 191 | } 192 | printf("num_vertices %d, num_edges %d\n", m, nnz); 193 | vector > svector; 194 | set s; 195 | for (int i = 0; i 
< m; i++) 196 | svector.push_back(s); 197 | int dst, src; 198 | for (int i = 0; i < nnz; i++) { 199 | getline(cfile, str); 200 | sscanf(str.c_str(), "%d %d", &dst, &src); 201 | 202 | dst--; 203 | src--; 204 | 205 | svector[src].insert(dst); 206 | svector[dst].insert(src); 207 | } 208 | cfile.close(); 209 | csrRowPtr = (int *)malloc((m + 1) * sizeof(int)); 210 | int count = 0; 211 | for (int i = 0; i < m; i++) { 212 | csrRowPtr[i] = count; 213 | count += svector[i].size(); 214 | } 215 | csrRowPtr[m] = count; 216 | if (count != nnz) { 217 | printf("The graph is not symmetric\n"); 218 | nnz = count; 219 | } 220 | double avgdeg; 221 | double variance = 0.0; 222 | int maxdeg = 0; 223 | int mindeg = m; 224 | avgdeg = (double)nnz / m; 225 | for (int i = 0; i < m; i++) { 226 | int deg_i = csrRowPtr[i + 1] - csrRowPtr[i]; 227 | if (deg_i > maxdeg) 228 | maxdeg = deg_i; 229 | if (deg_i < mindeg) 230 | mindeg = deg_i; 231 | variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m; 232 | } 233 | printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance); 234 | csrColInd = (int *)malloc(count * sizeof(int)); 235 | set::iterator site; 236 | for (int i = 0, index = 0; i < m; i++) { 237 | site = svector[i].begin(); 238 | while (site != svector[i].end()) { 239 | csrColInd[index++] = *site; 240 | site++; 241 | } 242 | } 243 | } 244 | 245 | 246 | int main(int argc, char *argv[]) { 247 | if (argc != 2) { 248 | printf("Usage: %s \n", argv[0]); 249 | exit(1); 250 | } 251 | int m, nnz, *csrRowPtr = NULL, *csrColInd = NULL; 252 | if (strstr(argv[1], ".mtx")) 253 | mtx2csr(argv[1], m, nnz, csrRowPtr, csrColInd); 254 | else if (strstr(argv[1], ".graph")) 255 | graph2csr(argv[1], m, nnz, csrRowPtr, csrColInd); 256 | else if (strstr(argv[1], ".gr")) 257 | gr2csr(argv[1], m, nnz, csrRowPtr, csrColInd); 258 | else 259 | { printf("Unrecognizable input file format\n"); exit(0); } 260 | if (csrRowPtr == NULL) 261 | printf("csrRowPtr is NULL\n"); 262 | if (csrColInd == 
NULL) 263 | printf("csrColInd is NULL\n"); 264 | double t1, t2, t3, t4, t5, t6; 265 | int *d_csrRowPtr, *d_csrColInd; 266 | float *d_csrVal; 267 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int))); 268 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int))); 269 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrVal, nnz * sizeof(float))); 270 | int ncolors = 0, *coloring; 271 | int *d_coloring, *d_reordering; 272 | float fraction = 1.0; 273 | coloring = (int *)calloc(m, sizeof(int)); 274 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int))); 275 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_reordering, m * sizeof(int))); 276 | CUDA_SAFE_CALL(cudaMemset(d_reordering, 0, m * sizeof(int))); 277 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 278 | t1 = rtclock(); 279 | CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice)); 280 | CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice)); 281 | cudaDeviceSynchronize(); 282 | t2 = rtclock(); 283 | //printf("time of init:%f ms\n", 1000.0f * (t2 - t1)); 284 | 285 | int device = 0; 286 | int deviceCount = 0; 287 | CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount)); 288 | cudaDeviceProp deviceProp; 289 | CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, device)); 290 | int nSM = deviceProp.multiProcessorCount; 291 | fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 292 | deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 293 | 294 | cusparseStatus_t status; 295 | cusparseHandle_t handle; 296 | status = cusparseCreate(&handle); 297 | if (status != CUSPARSE_STATUS_SUCCESS) { 298 | printf("error!"); 299 | exit(1); 300 | } 301 | cusparseMatDescr_t descr; 302 | status = cusparseCreateMatDescr(&descr); 303 | if (status != CUSPARSE_STATUS_SUCCESS) { 304 | printf("error!"); 305 | exit(1); 
306 | } 307 | cusparseColorInfo_t info; 308 | status = cusparseCreateColorInfo(&info); 309 | if (status != CUSPARSE_STATUS_SUCCESS) { 310 | printf("error!"); 311 | exit(1); 312 | } 313 | double runtime[10]; 314 | int colors[10]; 315 | for (int i = 0; i < 10; i++) { 316 | t5 = rtclock(); 317 | status = cusparseScsrcolor(handle, m, nnz, descr, d_csrVal, d_csrRowPtr, d_csrColInd, &fraction, &ncolors, d_coloring, d_reordering, info); 318 | t6 = rtclock(); 319 | runtime[i] = 1000.0f * (t6 - t5); 320 | colors[i] = 1 + thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum()); 321 | } 322 | double total_time = 0; 323 | int total_colors = 0; 324 | double avg_time; 325 | double avg_colors; 326 | for (int i = 0; i < 10; i++) { 327 | printf("[%.2f %d] ", runtime[i], colors[i]); 328 | total_time += runtime[i]; 329 | total_colors += colors[i]; 330 | } 331 | printf("\navg_time %f ms, avg_colors %.2f\n", total_time / 10, (double)total_colors / 10); 332 | switch (status) { 333 | case CUSPARSE_STATUS_SUCCESS: 334 | //printf("success\n"); 335 | break; 336 | case CUSPARSE_STATUS_NOT_INITIALIZED: 337 | printf("not initialed\n"); 338 | case CUSPARSE_STATUS_ALLOC_FAILED: 339 | printf("alloc failed\n"); 340 | break; 341 | case CUSPARSE_STATUS_INVALID_VALUE: 342 | printf("invalid value\n"); 343 | break; 344 | case CUSPARSE_STATUS_ARCH_MISMATCH: 345 | printf("mismatch\n"); 346 | break; 347 | case CUSPARSE_STATUS_INTERNAL_ERROR: 348 | printf("internal error\n"); 349 | break; 350 | case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: 351 | printf("not supported\n"); 352 | break; 353 | default: 354 | printf("unknown error\n"); 355 | break; 356 | }; 357 | t3 = rtclock(); 358 | CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost)); 359 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 360 | t4 = rtclock(); 361 | //printf("time of copy back:%f ms\n", 1000.0f * (t4 - t3)); 362 | write_solution("color.txt", m, coloring); 363 | int correct = 1; 364 | 
verify(m, nnz, csrRowPtr, csrColInd, coloring, &correct); 365 | if (correct) 366 | printf("correct.\n"); 367 | else 368 | printf("incorrect.\n"); 369 | return 0; 370 | } 371 | --------------------------------------------------------------------------------