├── contributors.txt
├── csrcolor.pdf
├── src
    ├── GM
    │   ├── run
    │   ├── Makefile
    │   ├── README
    │   ├── tree.h
    │   ├── graphColoring.h
    │   └── tree.cpp
    ├── cusp
    │   ├── Makefile
    │   ├── README
    │   ├── timer.h
    │   └── vertex_coloring.cu
    ├── serial
    │   ├── Makefile
    │   ├── README
    │   ├── runall
    │   ├── graph_io.h
    │   └── greedy.cpp
    ├── csrcolor
    │   ├── runall
    │   ├── Makefile
    │   ├── README
    │   └── csrcolor.cu
    ├── common.mk
    ├── topo
    │   ├── Makefile
    │   ├── runall
    │   ├── README
    │   ├── main.cu
    │   ├── kernel.h
    │   └── graph_io.h
    ├── omp
    │   ├── Makefile
    │   ├── runall
    │   ├── common.h
    │   ├── kernel2.h
    │   ├── kernel1.h
    │   ├── worklist.h
    │   ├── main.cc
    │   └── graph.h
    └── data
    │   ├── runall
    │   ├── README
    │   ├── variants.h
    │   ├── Makefile
    │   ├── kernel_ldb.h
    │   ├── kernel_fusion.h
    │   ├── kernel_bitset.h
    │   ├── kernel_tc.h
    │   ├── kernel_pq.h
    │   ├── kernel_base.h
    │   ├── kernel_ldg.h
    │   ├── kernel_comb.h
    │   └── main.cu
├── .gitignore
├── include
    ├── lonestargpu.h
    ├── header.h
    ├── cutil_subset.h
    ├── list.h
    ├── kernelconfig.h
    ├── worklistc.h
    ├── common.h
    ├── sharedptr.h
    ├── util.h
    ├── gbar.cuh
    └── component.h
├── Makefile
├── LICENSE
└── README


/contributors.txt:
--------------------------------------------------------------------------------
1 | Xuhao Chen
2 | Pingfan Li
3 | 


--------------------------------------------------------------------------------
/csrcolor.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenxuhao/csrcolor/HEAD/csrcolor.pdf


--------------------------------------------------------------------------------
/src/GM/run:
--------------------------------------------------------------------------------
1 | ./gc 0 0 0 0 0 0 0 14 128 data/hood.mtx y n
2 | #./gc 0 0 0 0 0 0 0 14 128 data/pwtk.mtx y n
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *~
 2 | *.DS_Store
 3 | *.log
 4 | *.out
 5 | *.o
 6 | *.d
 7 | *.swp
 8 | color.txt
 9 | */color.txt
10 | bin/*
11 | input/*
12 | 


--------------------------------------------------------------------------------
/src/cusp/Makefile:
--------------------------------------------------------------------------------
1 | # common.mk supplies NVCC and BIN; without it the install step below has no destination.
2 | include ../common.mk
3 | all: vc
4 | vc: vertex_coloring.cu
5 | 	$(NVCC) -w -O3 -I./ -I$(HOME)/cusplibrary-0.5.1 vertex_coloring.cu -o vc
6 | 	cp $@ $(BIN)
7 | clean:
8 | 	rm -f vc


--------------------------------------------------------------------------------
/src/serial/Makefile:
--------------------------------------------------------------------------------
1 | include ../common.mk
2 | all: greedy
3 | greedy:	greedy.cpp
4 | 	$(GXX) -w -O3 greedy.cpp -o greedy
5 | 	mv $@ $(BIN)
6 | # The build step moves the binary into $(BIN), so clean has to remove it from there.
7 | clean:
8 | 	rm -f greedy $(BIN)greedy
9 | 


--------------------------------------------------------------------------------
/src/csrcolor/runall:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Run csrcolor on every graph file found in $DIR.
 3 | DIR=../input
 4 | APP=./csrcolor
 5 | 
 6 | # Glob instead of parsing `ls` so file names containing spaces survive word splitting.
 7 | for input in "$DIR"/*
 8 | do
 9 | 	[ -e "$input" ] || continue	# empty directory: the pattern is left unexpanded
10 | 	echo "$APP" "$input"
11 | 	"$APP" "$input"
12 | done


--------------------------------------------------------------------------------
/src/serial/README:
--------------------------------------------------------------------------------
1 | program read mtx or gr graph and store its information in CSR format
2 | 
3 | and write colors of all vertices to "color.txt"
4 | 
5 | usage:
6 | 	./greedy <graph>
7 | 
8 | 


--------------------------------------------------------------------------------
/src/csrcolor/Makefile:
--------------------------------------------------------------------------------
1 | include ../common.mk
2 | EXE=csrcolor
3 | all: csrcolor.cu
4 | 	$(NVCC) $(NVFLAGS) $(INCLUDES) csrcolor.cu -o $(EXE) -lcusparse
5 | 	mv $(EXE) $(BIN)
6 | # The executable lives in $(BIN) after the build, so remove it from there as well.
7 | clean:
8 | 	rm -f $(EXE) $(BIN)$(EXE)
9 | 


--------------------------------------------------------------------------------
/src/GM/Makefile:
--------------------------------------------------------------------------------
1 | include ../common.mk
2 | all: gc
3 | gc:	graphColoring.cu graphDriver.cpp tree.cpp
4 | 	$(NVCC) $(NVFLAGS) graphDriver.cpp graphColoring.cu -o gc
5 | 	mv $@ $(BIN)
6 | # gc is moved into $(BIN) by the rule above; clean must look there, and -f tolerates a missing file.
7 | clean:
8 | 	rm -f gc $(BIN)gc
9 | 


--------------------------------------------------------------------------------
/src/serial/runall:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Run the serial greedy coloring on every graph file found in $DIR.
 3 | #filename=rmat1.gr
 4 | DIR=../input
 5 | APP=./greedy
 6 | # Glob instead of parsing `ls` so file names containing spaces survive word splitting.
 7 | for input in "$DIR"/*
 8 | do
 9 | 	[ -e "$input" ] || continue	# empty directory: the pattern is left unexpanded
10 | 	echo "$APP" "$input"
11 | 	"$APP" "$input"
12 | done
13 | 


--------------------------------------------------------------------------------
/include/lonestargpu.h:
--------------------------------------------------------------------------------
 1 | #ifndef LSG_LONESTARGPU
 2 | #define LSG_LONESTARGPU
 3 | 
 4 | #include "common.h"
 5 | #include "graph.h"
 6 | #include "kernelconfig.h"
 7 | #include "list.h"
 8 | #include "component.h"
 9 | 
10 | #endif
11 | 


--------------------------------------------------------------------------------
/src/common.mk:
--------------------------------------------------------------------------------
1 | GCC=gcc
2 | GXX=g++
3 | NVCC=nvcc
4 | COMPUTECAPABILITY=sm_35
5 | #NVFLAGS=-g -arch=$(COMPUTECAPABILITY) #-Xptxas -v
6 | NVFLAGS=-w -O3 -arch=$(COMPUTECAPABILITY) #-Xptxas -v
7 | INCLUDES=-I../../include
8 | BIN=../../bin/
9 | 


--------------------------------------------------------------------------------
/src/topo/Makefile:
--------------------------------------------------------------------------------
 1 | include ../common.mk
 2 | INCLUDES += -I../../cub-1.1.1
 3 | EXTRA := $(NVFLAGS) $(INCLUDES) -DITERATIONS=10
 4 | SRC=main.cu
 5 | 
 6 | all: topo_base
 7 | topo_base: $(SRC) kernel.h graph_io.h
 8 | 	$(NVCC) $(EXTRA) $(SRC) -o topo_base
 9 | 	mv $@ $(BIN)
10 | # Only topo_base is built here, and it ends up in $(BIN); remove that, not stale names.
11 | clean:
12 | 	rm -f topo_base $(BIN)topo_base
13 | 


--------------------------------------------------------------------------------
/src/topo/runall:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Sweep both topology-driven binaries over block sizes 32..1024 for each graph in $DIR.
 3 | DIR=../input
 4 | 
 5 | for input in "$DIR"/*	# glob, not `ls`, so names with spaces stay intact
 6 | do
 7 | 	[ -e "$input" ] || continue	# empty directory: the pattern is left unexpanded
 8 | 	for APP in './topodriven_naive' './topodriven_ldg'
 9 | 	do
10 | 		for ((blksize=32; blksize<=1024; blksize*=2))
11 | 		do
12 | 			echo $APP $blksize "$input"
13 | 			$APP $blksize "$input"
14 | 		done
15 | 	done
16 | done


--------------------------------------------------------------------------------
/src/csrcolor/README:
--------------------------------------------------------------------------------
 1 | program read mtx or gr graph and store its information in CSR format
 2 | 
 3 | and write colors of all vertices to "color.txt"
 4 | 
 5 | usage:
 6 | 	./csrcolor <graph>
 7 | 	
 8 | Reference:
 9 | 
10 | NVIDIA cuSPARSE Reorderings Reference
11 | http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-reorderings-reference
12 | 
13 | 


--------------------------------------------------------------------------------
/src/omp/Makefile:
--------------------------------------------------------------------------------
 1 | CXX=g++ 
 2 | CXX_FLAGS=-w -O3
 3 | EXTRA=-DITERATIONS=10
 4 | SRC=main.cc
 5 | 
 6 | all:color-omp color-serial
 7 | 
 8 | color-omp:
 9 | 	$(CXX) $(CXX_FLAGS) $(EXTRA) -fopenmp -DENABLE_OPENMP $(SRC) -o $@
10 | 	mv $@ ../../bin/
11 | 
12 | color-serial:
13 | 	$(CXX) $(CXX_FLAGS) $(EXTRA) $(SRC) -o $@
14 | # color-omp is moved to ../../bin/ by its rule; color-serial stays in this directory.
15 | clean:
16 | 	rm -f color-serial color-omp ../../bin/color-omp
17 | 


--------------------------------------------------------------------------------
/src/data/runall:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Run every datadriven_<variant>-<blocksize> binary on each graph in $DIR.
 3 | DIR=../input
 4 | 
 5 | for input in "$DIR"/*	# glob, not `ls`, so names with spaces stay intact
 6 | do
 7 | 	[ -e "$input" ] || continue	# empty directory: the pattern is left unexpanded
 8 | 	# Same order as before: all naive block sizes first, then all ldg block sizes.
 9 | 	for variant in naive ldg; do
10 | 		for size in 32 64 128 256 512 1024; do
11 | 			APP=./datadriven_$variant-$size
12 | 			echo $APP "$input"
13 | 			$APP "$input"
14 | 		done
15 | 	done
16 | done


--------------------------------------------------------------------------------
/src/omp/runall:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Thread-scaling sweep: run the OpenMP coloring with 2..24 threads on every graph in $DIR.
 3 | #filename=rmat1.gr
 4 | # The defaults are the original author's setup; override DIR/APP/SERIAL from the environment.
 5 | DIR=${DIR:-/home/lpf/workspace/lonestargpu-2.0/apps/coloring/input}
 6 | APP=${APP:-./color_greedy-omp}
 7 | SERIAL=${SERIAL:-./color_greedy-s}
 8 | 
 9 | #NUM_THREADS=16
10 | 
11 | for input in "$DIR"/*	# glob, not `ls`, so names with spaces stay intact
12 | do
13 | 	[ -e "$input" ] || continue	# empty directory: the pattern is left unexpanded
14 | #	echo $SERIAL 1 $input
15 | #	$SERIAL 1 $input
16 | 	for ((NUM_THREADS=2; NUM_THREADS<=24; NUM_THREADS+=2))
17 | 	do
18 | #		echo $APP $NUM_THREADS $input
19 | 		"$APP" $NUM_THREADS "$input"
20 | 	done
21 | done
22 | 


--------------------------------------------------------------------------------
/src/cusp/README:
--------------------------------------------------------------------------------
 1 | Place a symlink to the top-level cusp directory in the current directory. Assuming
 2 | the top-level cusp directory is $CUSPDIR:
 3 | 
 4 | $ ln -s $CUSPDIR
 5 | 
 6 | program read mtx graph with the cusplibrary
 7 | 
 8 | and write colors of all vertices to "color.txt"
 9 | 
10 | usage:
11 | 	./vc <graph>
12 | 	
13 | Reference:
14 | 
15 | S. Dalton, N. Bell, L. Olson, and M. Garland, “Cusp:
16 | Generic parallel algorithms for sparse matrix and graph
17 | computations,” 2014, version 0.5.0.
18 | http://cusplibrary.github.io/


--------------------------------------------------------------------------------
/src/topo/README:
--------------------------------------------------------------------------------
 1 | topology-driven implementation of parallel graph coloring on GPGPUs using the FirstFit strategy
 2 | 
 3 | program read mtx or gr graph and store its information in CSR format
 4 | 
 5 | and write colors of all vertices to "color.txt"
 6 | 
 7 | program has two variants.
 8 | 
 9 | topodriven_naive:	naive implementation without any optimization
10 | 
11 | topodriven_ldg:		use __ldg to read C and R array from read_only cache
12 | 
13 | 
14 | usage:
15 | 	./topodriven_naive <blocksize> <graph>
16 | 	./topodriven_ldg <blocksize> <graph>
17 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | TOPLEVEL := .
 2 | APPS := csrcolor serial topo data GM
 3 | INPUT_URL := http://
 4 | INPUT := csrcolor-inputs.tar.bz2
 5 | 
 6 | .PHONY: all clean inputs $(APPS)
 7 | 
 8 | all: $(APPS)
 9 | 
10 | $(APPS):
11 | 	$(MAKE) -C src/$@
12 | 
13 | #include common.mk
14 | 
15 | inputs:
16 | 	@echo "Downloading inputs ..."
17 | 	@wget $(INPUT_URL) -O $(INPUT)
18 | 	@echo "Uncompressing inputs ..."
19 | 	@tar xvf $(INPUT)
20 | 	@rm $(INPUT)
21 | 	@echo "Inputs available at $(TOPLEVEL)/inputs/"
22 | # The applications are built from src/<app> (see the $(APPS) rule), so clean descends there too.
23 | clean:
24 | 	for APP in $(APPS); do $(MAKE) -C src/$$APP clean; done
25 | 
26 | 


--------------------------------------------------------------------------------
/src/data/README:
--------------------------------------------------------------------------------
 1 | data-driven implementation of parallel graph coloring on GPGPUs using the FirstFit strategy
 2 | 
 3 | program read mtx or gr graph and store its information in CSR format
 4 | 
 5 | and write colors of all vertices to "color.txt"
 6 | 
 7 | program has two variants.
 8 | 
 9 | datadriven_naive:	use a worklist to improve work efficiency, reduce atomic operations
10 | 
11 | using block scan
12 | 
13 | datadriven_ldg:		based on datadriven_naive, use __ldg to read C and R array
14 | 
15 | from read_only cache
16 | 
17 | 
18 | usage:
19 | 	./datadriven_naive <graph>
20 | 	./datadriven_ldg <graph>
21 | 


--------------------------------------------------------------------------------
/src/GM/README:
--------------------------------------------------------------------------------
 1 | Graph Coloring (GCO) partitions the vertices of a graph such that
 2 | no two adjacent vertices share the same color.  
 3 | 
 4 | 3-step graph coloring framework: 
 5 | 1) Graph partitioning which partitions graph into subgraphs and 
 6 | identifies boundary vertices, 
 7 | 2) graph coloring & conflicts detection which colors the graph using 
 8 | the specified heuristic, e.g. FF, and identifies color conflicts, and
 9 | 3) sequential conflicts resolution which goes back to CPU and resolves 
10 | the conflicts.
11 | 
12 | 
13 | usage:
14 | 	./gc 0 0 0 0 0 0 0 <nblocks> <blocksize> <graph> <weighted> n
15 | 	
16 | 	see run for details
17 | 	
18 | 
19 | note:
20 | 
21 | program can only read mtx graph
22 | 
23 | 
24 | References:
25 | 
26 | A. V. P. Grosset, P. Zhu, S. Liu, S. Venkatasubramanian, and M. Hall,
27 | “Evaluating graph coloring on gpus,” in Proceedings of the 16th ACM
28 | Symposium on Principles and Practice of Parallel Programming, 2011,
29 | pp. 297–298.
30 | 


--------------------------------------------------------------------------------
/include/header.h:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <cuda.h>
 3 | #include <time.h>
 4 | #include <fstream>
 5 | #include <string>
 6 | #include <iostream>
 7 | 
 8 | #include <unistd.h>
 9 | #include <cassert>
10 | #include <inttypes.h>
11 | #include <unistd.h>
12 | #include <stdio.h>
13 | #include <time.h>
14 | #include <sys/time.h>
15 | #include <stdlib.h>
16 | #include <stdarg.h>
17 | 
18 | 
19 | #define MYINFINITY	100000000
20 | #define SWAP(x, y)	{tmp = x; x = y; y = tmp;}
21 | 
22 | typedef unsigned foru;
23 | // Wall-clock time in seconds from gettimeofday(); inline so several translation units may include this header.
24 | inline double rtclock()
25 | {
26 |     struct timeval Tp = {0, 0};   // zeroed so a failed call yields 0.0 rather than garbage
27 |     // The timezone argument of gettimeofday() is obsolete; POSIX says to pass NULL.
28 |     int stat = gettimeofday (&Tp, NULL);
29 |     if (stat != 0) printf("Error return from gettimeofday: %d",stat);
30 |     // Whole seconds plus the microsecond part, combined into one double.
31 |     return(Tp.tv_sec + Tp.tv_usec*1.0e-6);
32 | }
33 | // Synchronize the device; if any CUDA error is pending, print msg plus the error text and exit(-1).
34 | inline void CudaTest(const char *msg)
35 | {
36 |   // cudaDeviceSynchronize() supersedes the deprecated cudaThreadSynchronize().
37 |   cudaError_t e = cudaDeviceSynchronize();
38 |   if (cudaSuccess == e) e = cudaGetLastError();  // also pick up errors recorded by earlier calls
39 |   if (cudaSuccess != e) {
40 |     fprintf(stderr, "%s: %d\n", msg, e);
41 |     fprintf(stderr, "%s\n", cudaGetErrorString(e));
42 |     exit(-1);
43 |   }
44 | }
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Xuhao Chen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/topo/main.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <iostream>
 4 | #include <fstream>
 5 | #include <sstream>
 6 | #include <string>
 7 | #include <vector>
 8 | #include <set>
 9 | using namespace std;
10 | #ifndef	ITERATIONS
11 | #define	ITERATIONS 10
12 | #endif
13 | #ifndef	BLKSIZE
14 | #define	BLKSIZE	32
15 | #endif
16 | #include "kernel.h"
17 | #include "graph_io.h"
18 | 
19 | // Driver: parse <BLKSIZE> <graph>, load the graph into CSR arrays, color it,
20 | // write the colors to color.txt and print whether verify() accepted them.
21 | int main(int argc, char *argv[]) {
22 | 	if (argc != 3) {
23 | 		printf("Usage: %s <BLKSIZE> <graph>\n", argv[0]);
24 | 		exit(1);
25 | 	}
26 | 	int blksize = atoi(argv[1]);
27 | 	if (blksize <= 0) {	// atoi() returns 0 for non-numeric input
28 | 		printf("Error: BLKSIZE must be a positive integer (got \"%s\")\n", argv[1]);
29 | 		exit(1);
30 | 	}
31 | 	int m = 0, nnz = 0, *csrRowPtr = NULL, *csrColInd = NULL;
32 | 	if (strstr(argv[2], ".mtx"))
33 | 		mtx2csr(argv[2], m, nnz, csrRowPtr, csrColInd);
34 | 	else if (strstr(argv[2], ".gr"))
35 | 		gr2csr(argv[2], m, nnz, csrRowPtr, csrColInd);
36 | 	else {
37 | 		// No loader ran, so m and the CSR pointers hold nothing usable.
38 | 		printf("Error: unsupported graph format (expected .mtx or .gr): %s\n", argv[2]);
39 | 		exit(1);
40 | 	}
41 | 	int *coloring = (int *)calloc(m, sizeof(int));
42 | 	if (coloring == NULL && m > 0) {
43 | 		printf("Error: cannot allocate the color array for %d vertices\n", m);
44 | 		exit(1);
45 | 	}
46 | 	color(m, nnz, csrRowPtr, csrColInd, coloring, blksize);
47 | 	write_solution("color.txt", coloring, m);
48 | 	int correct = 1;
49 | 	verify(m, nnz, csrRowPtr, csrColInd, coloring, &correct);
50 | 	if (correct)
51 | 		printf("correct.\n");
52 | 	else
53 | 		printf("incorrect.\n");
54 | 	// The CSR arrays are allocated inside the loaders (not visible here), so only coloring is released.
55 | 	free(coloring);
56 | 	return 0;
57 | }


--------------------------------------------------------------------------------
/src/data/variants.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #define DATA_BASE 0 // the baseline data-driven version
 4 | #define DATA_LDG 1 // using __ldg() intrinsic
 5 | #define DATA_BITSET 2 // bitset for forbiddenColors
 6 | #define DATA_COARSE 3 // thread coarsening
 7 | #define DATA_FUSION 4 // kernel fusion
 8 | #define DATA_WLC 5  // worklistc from lonestargpu
 9 | #define DATA_LDB 6  // load balancing using merrill's scheme
10 | #define DATA_PQ 7
11 | #define DATA_BEST 8
12 | #define DATA_COMB1 9
13 | 
14 | #ifndef VARIANT
15 | #error "VARIANT not defined."
16 | #endif
17 | 
18 | #if VARIANT==DATA_BASE
19 | #include "kernel_base.h"
20 | #elif VARIANT==DATA_LDG
21 | #include "kernel_ldg.h"
22 | #elif VARIANT==DATA_BITSET
23 | #include "kernel_bitset.h"
24 | #elif VARIANT==DATA_COARSE
25 | #include "kernel_tc.h"
26 | #elif VARIANT==DATA_FUSION
27 | #include "kernel_fusion.h"
28 | #elif VARIANT==DATA_WLC
29 | #include "kernel_wlc.h"
30 | #elif VARIANT==DATA_LDB
31 | #include "kernel_ldb.h"
32 | #elif VARIANT==DATA_PQ
33 | #include "kernel_pq.h"
34 | #elif VARIANT==DATA_BEST
35 | #include "kernel_best.h"
36 | #elif VARIANT==DATA_COMB1
37 | #include "kernel_comb.h"
38 | #else 
39 | #error "Unknown variant"
40 | #endif
41 | 


--------------------------------------------------------------------------------
/src/GM/tree.h:
--------------------------------------------------------------------------------
 1 | 
 2 | class node{
 3 | private:
 4 | 	int key, saturation, degree, color;
 5 | 	node *left, *right;
 6 | 	
 7 | public:
 8 | 	node();
 9 | 	node(int index, int sat, int deg);
10 | 	node(int index, int sat, int deg, int col, node *L, node *R);
11 | 	
12 | 	
13 | 	int getKey();
14 | 	int getSaturation();
15 | 	int getDegree();
16 | 	int getColor();
17 | 	node* getLeft();
18 | 	node* getRight();
19 | 	
20 | 	void setKey(int index);
21 | 	void setSaturation(int sat);
22 | 	void setDegree(int deg);
23 | 	void setColor(int c);
24 | 	void setLeft(node *L);
25 | 	void setRight(node *R);
26 | 	
27 | 	void setKSD(int index, int saturation, int degree);
28 | 	
29 | 	void displayNode();
30 | 	
31 | 	~node();
32 | };
33 | 
34 | 
35 | // Tree sorted by saturation and then  by degree
36 | class tree{
37 | private:
38 | 	node *top;	// presumably the root node handed out by getTop() - confirm in tree.cpp
39 | 	// Only declarations live here; the member definitions are not part of this header.
40 | public:
41 | 	tree();
42 | 	// insert() takes a caller-built node; remove() identifies one by (index, saturation, degree).
43 | 	void insert(node *x);
44 | 	node* remove(int index, int saturation, int degree);
45 | 	// Lookups; findBiggest() reports its result through the three reference parameters.
46 | 	node* findNode(int index, int saturation, int degree);
47 | 	void findBiggest(int &index, int &saturation, int &degree);
48 | 	// Three print traversals; the suffix presumably encodes the visiting order (R/M/L) - TODO confirm.
49 | 	void displayTreeRML(node *current);
50 | 	void displayTreeLMR(node *current);
51 | 	void displayTreeMLR(node *current);
52 | 	
53 | 	node* getTop();
54 | 	
55 | 	~tree();
56 | };
57 | 
58 | 


--------------------------------------------------------------------------------
/src/GM/graphColoring.h:
--------------------------------------------------------------------------------
 1 | #ifndef _GRAPHCOLORING_H_ 
 2 | #define _GRAPHCOLORING_H_ 
 3 | 
 4 | #include <stdlib.h> 
 5 | #include <stdio.h> 
 6 | #include <string.h> 
 7 | #include <math.h> 
 8 | #include <cuda_runtime_api.h> 
 9 | #include <cuda.h> 
10 | #include <iostream>
11 | using namespace std;
12 | 
13 | 
14 | // Should be at least equal to maxDegree of graph + 1
15 | //    if doing that generates an error like "too much local memory", then use the commented line 
16 | //    marked OPTION2 instead of OPTION1 in the functions color & saturation in graphColoring.cu
17 | //const int TEMP_COLOR_LENGTH = 256;//128; //256;//1024;		
18 | const int TEMP_COLOR_LENGTH = 1000;//128; //256;//1024;		
19 | 
20 | const int CONFLICT_BLOCK_SIZE = 256;
21 | 
22 | const int MAXGPUITERATIONS = 50;
23 | 
24 | 
25 | #ifdef __cplusplus 
26 | 	#define CHECK_EXT extern "C" 
27 | #else 
28 | 	#define CHECK_EXT 
29 | #endif 
30 | 
31 | // GPU coloring entry point with C linkage; `passes` is written through a reference. The float result is presumably a timing - TODO confirm in graphColoring.cu.
32 | CHECK_EXT float cudaGraphColoring(int *adjacentList, int *boundaryList, int *graphColors, int *degreeList, 
33 | 				int *conflict, int boundarySize, int maxDegree, int graphSize, int & passes, 
34 | 				int subsizeBoundary, int _gridSize, int _blockSize, int *startPartitionList, 
35 | 				int *endPartitionList, int *randomList, int numRand, int useSDO, int *numOut);
36 | 
37 | 
38 | #endif // _GRAPHCOLORING_H_ 
39 | 
40 | 


--------------------------------------------------------------------------------
/include/cutil_subset.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | // Evaluate a CUDA runtime call; on failure print the error with file/line and exit.
 3 | #  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
 4 |     cudaError err = (call);                                                  \
 5 |     if( cudaSuccess != err) {                                                \
 6 |         fprintf(stderr, "error %d: Cuda error in file '%s' in line %i : %s.\n",        \
 7 |                 err, __FILE__, __LINE__, cudaGetErrorString( err) );              \
 8 |         exit(EXIT_FAILURE);                                                  \
 9 |     } }
10 | // Same check under the cutil name; the trailing ';' is kept for existing call sites.
11 | #  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);
12 | // Synchronize the device and print (without exiting) any error that surfaces.
13 | #  define CUDA_SAFE_THREAD_SYNC( ) {                                         \
14 |     cudaError err = cudaDeviceSynchronize();                                 \
15 |     if ( cudaSuccess != err) {                                               \
16 |         fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
17 |                 __FILE__, __LINE__, cudaGetErrorString( err) );              \
18 |     } }
19 | 
20 | // from http://forums.nvidia.com/index.php?showtopic=186669 - reads the PTX special register %smid
21 | static __device__ unsigned get_smid(void) {
22 |      unsigned ret;
23 |      asm volatile("mov.u32 %0, %%smid;" : "=r"(ret) );  // "%%" is a literal '%'; volatile keeps the read from being elided or moved
24 |      return ret;
25 | }
26 | 


--------------------------------------------------------------------------------
/src/data/Makefile:
--------------------------------------------------------------------------------
 1 | include ../common.mk
 2 | INCLUDES += -I../../cub-1.1.1
 3 | B40_DIR=./back40computing-read-only
 4 | B40C_INC=-I$(B40_DIR) -I$(B40_DIR)/test
 5 | EXTRA := $(NVFLAGS) $(CFLAGS) $(INCLUDES) -DITERATIONS=10 -DBLKSIZE=128
 6 | SRC=main.cu
 7 | 
 8 | all: data_base data_bitset data_wlc data_ldb data_fusion data_tc data_ldg data_pq data_best
 9 | 
10 | data_base:
11 | 	$(NVCC) $(EXTRA) $(SRC) -DVARIANT=0 -o $@
12 | 	mv $@ $(BIN)
13 | 
14 | data_ldg:
15 | 	$(NVCC) $(EXTRA) $(SRC) -DVARIANT=1 -o $@
16 | 	mv $@ $(BIN)
17 | 
18 | data_bitset:
19 | 	$(NVCC) $(EXTRA) $(SRC) -DVARIANT=2 -o $@
20 | 	mv $@ $(BIN)
21 | 
22 | data_ldb:
23 | 	$(NVCC) $(EXTRA) $(B40C_INC) $(SRC) -DVARIANT=6 -o $@
24 | 	mv $@ $(BIN)
25 | 
26 | data_wlc:
27 | 	$(NVCC) $(EXTRA) $(SRC) -DVARIANT=5 -o $@
28 | 	mv $@ $(BIN)
29 | 
30 | data_fusion:
31 | 	$(NVCC) $(EXTRA) $(SRC) -DVARIANT=4 -o $@
32 | 	mv $@ $(BIN)
33 | 
34 | data_tc:
35 | 	$(NVCC) $(EXTRA) $(SRC) -DVARIANT=3 -o $@
36 | 	mv $@ $(BIN)
37 | 
38 | data_pq:
39 | 	$(NVCC) $(EXTRA) $(SRC) -DVARIANT=7 -o $@
40 | 	mv $@ $(BIN)
41 | 
42 | data_best:
43 | 	$(NVCC) $(EXTRA) $(SRC) -DVARIANT=8 -o $@
44 | 	mv $@ $(BIN)
45 | 
46 | data_comb1:
47 | 	$(NVCC) $(EXTRA) $(SRC) -DVARIANT=9 -o $@
48 | 	mv $@ $(BIN)
49 | 
50 | data_comb2:
51 | 	$(NVCC) $(EXTRA) $(SRC) -DVARIANT=9 -o $@
52 | 	mv $@ $(BIN)
53 | # Every target above is moved into $(BIN) after linking, so that is where clean removes them.
54 | clean:
55 | 	rm -f $(addprefix $(BIN),data_base data_ldg data_bitset data_ldb data_wlc data_fusion data_tc data_pq data_best data_comb1 data_comb2)
56 | 
57 | 


--------------------------------------------------------------------------------
/src/cusp/timer.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright 2008-2009 NVIDIA Corporation
 3 |  *
 4 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 5 |  *  you may not use this file except in compliance with the License.
 6 |  *  You may obtain a copy of the License at
 7 |  *
 8 |  *      http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  *  Unless required by applicable law or agreed to in writing, software
11 |  *  distributed under the License is distributed on an "AS IS" BASIS,
12 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  *  See the License for the specific language governing permissions and
14 |  *  limitations under the License.
15 |  */
16 | 
17 | #pragma once
18 | 
19 | // A simple timer class
20 | 
21 | #include <cuda.h>
22 | 
23 | // Stopwatch built on a pair of CUDA events; timing starts when the object is constructed.
24 | class timer
25 | {
26 |     cudaEvent_t start;
27 |     cudaEvent_t end;
28 | 
29 |     // Both events are destroyed in ~timer(), so copying is disabled
30 |     // (declared, never defined) to rule out destroying them twice.
31 |     timer(const timer &);
32 |     timer &operator=(const timer &);
33 | 
34 | public:
35 |     timer()
36 |     {
37 |         cudaEventCreate(&start);
38 |         cudaEventCreate(&end);
39 |         cudaEventRecord(start,0);
40 |     }
41 | 
42 |     ~timer()
43 |     {
44 |         cudaEventDestroy(start);
45 |         cudaEventDestroy(end);
46 |     }
47 | 
48 |     // Milliseconds between construction and this call; waits for the `end` event to complete.
49 |     float milliseconds_elapsed()
50 |     {
51 |         float elapsed_time = 0.0f;   // stays 0 if the CUDA calls below fail
52 |         cudaEventRecord(end, 0);
53 |         cudaEventSynchronize(end);
54 |         cudaEventElapsedTime(&elapsed_time, start, end);
55 |         return elapsed_time;
56 |     }
57 |     float seconds_elapsed() { return milliseconds_elapsed() / 1000.0; }
58 | };


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | Copyright 2016 Xuhao Chen, National University of Defense Technology
 2 | This is the code for sequential graph coloring on CPU and CUDA code for parallel graph coloring on GPGPUs.
 3 | 
 4 | Variants:
 5 | csrcolor:	graph coloring routine of NVIDIA cusparse		
 6 | 3-step-GM:	parallel graph coloring implemented on GPGPUs by Grosset et al.
 7 | sequential:	sequential graph coloring using FirstFit strategy
 8 | datadriven:	datadriven implementation of parallel graph coloring using FirstFit strategy
 9 | topodriven:	topodriven implementation of parallel graph coloring using FirstFit strategy
10 | 
11 | 
12 | Requirements:
13 |         compute capability 3.5 and higher
14 |         Kepler or later GPU hardware
15 |         CUB v1.1.1
16 | 
17 | The instructions below assume CSRCOLOR_CODE has been installed in $CSRCOLOR_CODE_DIR.
18 | 
19 | Each variant directory under $CSRCOLOR_CODE_DIR/$VARIANT contains a README that
20 | explains what $VARIANT does, how to run it, details of implementations
21 | and other useful info.
22 | 
23 | 
24 | INSTALLATION
25 | 
26 | You will need to download and install CUB from here:
27 | 
28 | http://nvlabs.github.io/cub/
29 | 
30 | Place a symlink to the top-level CUB directory in $CSRCOLOR_CODE_DIR. Assuming
31 | the top-level CUB directory is $CUBDIR:
32 | 
33 | $ cd $CSRCOLOR_CODE_DIR
34 | $ ln -s $CUBDIR
35 | 
36 | 
37 | BUILDING
38 | 
39 | Assuming you're in $CSRCOLOR_CODE_DIR:
40 | $ make # compiles all variants
41 | 
42 | 
43 | RUNNING
44 | 
45 | Most variant directories under $CSRCOLOR_CODE_DIR/src contain a simple `runall' script (`run' for GM) that
46 | runs the application with all recommended inputs.
47 | 
48 | 
49 | Authors: 
50 | Xuhao Chen <cxh.nudt@gmail.com>
51 | Pingfan Li <li_pingfan@163.com>
52 | 
53 | Citations:
54 | Pingfan Li et al., High Performance Parallel Graph Coloring on GPGPUs, IPDPSW, 2016
55 | Xuhao Chen et al., Efficient and High-quality Sparse Graph Coloring on the GPU, Tech. Rep. NUDT-CS-2016-003, 2016
56 | 


--------------------------------------------------------------------------------
/src/omp/common.h:
--------------------------------------------------------------------------------
  1 | #ifndef COMMON_H
  2 | #define COMMON_H
  3 | #include <sys/time.h>
  4 | #ifdef ENABLE_OPENMP
  5 | #include <omp.h>
  6 | #endif
  7 | //#define GCC_EXTENSION
  8 | #define OPENMP_3_1
  9 | // Wall-clock time in seconds from gettimeofday(); inline so several translation units may include this header.
 10 | inline double rtclock() {
 11 | 	struct timeval Tp = {0, 0};	// zeroed so a failed call yields 0.0 rather than garbage
 12 | 	// The timezone argument of gettimeofday() is obsolete; POSIX says to pass a null pointer.
 13 | 	int stat = gettimeofday (&Tp, 0);
 14 | 	if (stat != 0) printf("Error return from gettimeofday: %d",stat);
 15 | 	// Whole seconds plus the microsecond part, combined into one double.
 16 | 	return(Tp.tv_sec + Tp.tv_usec*1.0e-6);
 17 | }
 18 | 
 19 | template <class T>
 20 | inline T my_fetch_add(T *ptr, T val) {
 21 | #ifdef ENABLE_OPENMP
 22 | #ifdef GCC_EXTENSION
 23 | 	return __sync_fetch_and_add(ptr,val);
 24 | #endif
 25 | #ifdef OPENMP_3_1
 26 | 	T old;
 27 | 	#pragma omp atomic capture
 28 | 	{old = *ptr; *ptr += val;}
 29 | 	return old;
 30 | #endif
 31 | #else
 32 | 	T old; old = *ptr; *ptr += val;
 33 | 	return old;
 34 | #endif
 35 | }
 36 | 
 37 | template <class T>
 38 | inline T my_fetch_sub(T *ptr, T val) {
 39 | #ifdef ENABLE_OPENMP
 40 | #ifdef GCC_EXTENSION
 41 | 	return __sync_fetch_and_sub(ptr,val);
 42 | #endif
 43 | #ifdef OPENMP_3_1
 44 | 	T old;
 45 | 	#pragma omp atomic capture
 46 | 	{old = *ptr; *ptr -= val;}
 47 | 	return old;
 48 | #endif
 49 | #else
 50 | 	T old; old = *ptr; *ptr -= val;
 51 | 	return old;
 52 | #endif
 53 | }
 54 | ;
 55 | 
 56 | template <class T>
 57 | inline T my_compare_swap(T *ptr, T old_val, T new_val) {
 58 | #ifdef ENABLE_OPENMP
 59 | #ifdef GCC_EXTENSION
 60 | 	return __sync_val_compare_and_swap(ptr,old_val,new_val);
 61 | #endif
 62 | #ifdef OPENMP_3_1
 63 | 	T old = *ptr;
 64 | 	#pragma omp critical
 65 | 	{
 66 | 	if(*ptr == old_val) {
 67 | 		*ptr = new_val;
 68 | 	}
 69 | 	}
 70 | 	return old;
 71 | #endif
 72 | #else
 73 | 	T old = *ptr;
 74 | 	if(*ptr == old_val) *ptr = new_val;
 75 | 	return old;
 76 | #endif
 77 | }
 78 | ;
 79 | // Lowers *ptr to val when val is smaller; returns the value *ptr held before the call.
 80 | template <class T>
 81 | inline T atomicMin(T *ptr, T val) {
 82 | 	T old;
 83 | 	// old is read inside the guarded block so it matches the value that was compared.
 84 | #ifdef ENABLE_OPENMP
 85 | 	#pragma omp critical
 86 | #endif
 87 | 	{old = *ptr; if(val < old) *ptr = val;}
 88 | 	return old;
 89 | }
 90 | // CPU stand-in for the CUDA block barrier.
 91 | // With ENABLE_OPENMP and OPENMP_3_1 it issues an OpenMP barrier; in every other
 92 | // configuration (serial build, or GCC_EXTENSION without OPENMP_3_1) it does nothing,
 93 | // exactly as before.
 94 | // inline: the definition sits in a header and may be seen by several translation units.
 95 | inline void __syncthreads() {
 96 | #if defined(ENABLE_OPENMP) && defined(OPENMP_3_1)
 97 | 	#pragma omp barrier
 98 | #endif
 99 | }
100 | 
101 | 
102 | 
103 | #endif
104 | 


--------------------------------------------------------------------------------
/include/list.h:
--------------------------------------------------------------------------------
 1 | #ifndef LSG_LIST
 2 | #define LSG_LIST
 3 | // Array-backed list of unsigned values with a fixed capacity; every member function is __device__.
 4 | typedef struct List {
 5 | 	__device__ List(unsigned size);	// starts empty; allocates `size` slots with malloc when size != 0
 6 | 	__device__ void init(unsigned *mem, unsigned size, unsigned cap);	// adopts caller-provided storage
 7 | 	__device__ void push(unsigned item);	// appends, or prints an error when there is no free slot
 8 | 	__device__ unsigned *toArray();	// returns the backing array itself, not a copy
 9 | 	__device__ void clear();	// frees the backing array and resets the counters
10 | 	__device__ unsigned size();	// number of stored items
11 | 	__device__ void uniq(unsigned *mark, unsigned maxelement);	// keeps only the elements e with mark[e] equal to the caller's global thread id
12 | 
13 | 	unsigned *array;	// backing storage; set to NULL by the constructor until an allocation succeeds
14 | 	unsigned nitems;	// number of elements currently stored
15 | 	unsigned capacity;	// number of slots available in `array`
16 | } List;
17 | // Builds an empty list and, when size is non-zero, allocates `size` slots with device-side malloc.
18 | __device__ List::List(unsigned size) {
19 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
20 | 	capacity = 0;
21 | 	array = NULL;
22 | 	nitems = 0;
23 | 	// If the allocation fails the list stays empty with capacity 0 and an error is printed.
24 | 	if (size) {
25 | 		array = (unsigned *)malloc(size * sizeof(unsigned));
26 | 		if (array == NULL) {
27 | 			printf("%s(%d): thread %u: Error: malloc of %u unsigned returned no memory.\n", __FILE__, __LINE__, id, size);	// %u: id and size are unsigned
28 | 		} else {
29 | 			capacity = size;
30 | 		}
31 | 	}
32 | }
33 | __device__ void List::init(unsigned *mem, unsigned size, unsigned cap) {	// Points the list at an existing buffer; nothing is allocated or freed here.
34 | 	array = mem;
35 | 	nitems = size;	// number of items already present in mem
36 | 	capacity = cap;
37 | }
38 | __device__ void List::push(unsigned item) {	// Appends item, or reports an overflow when there is no storage or no free slot.
39 | 	unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
40 | 	bool has_room = (array != NULL) && (nitems < capacity);
41 | 	if (has_room) { array[nitems++] = item; return; }
42 | 	// The item is dropped; only the condition is reported.
43 | 	printf("%s(%d): thread %d: Error: buffer overflow, capacity=%d.\n", __FILE__, __LINE__, tid, capacity);
44 | 	return;
45 | }
46 | __device__ unsigned *List::toArray() {	// Returns the backing pointer itself, not a copy.
47 | 	return array;
48 | }
49 | __device__ void List::clear() {	// Frees the backing array and resets the list to empty.
50 | 	if (array) free(array);
51 | 	array = NULL; nitems = 0;	// drop the dangling pointer so a second clear() cannot free it again
52 | 	capacity = 0;
53 | }
54 | __device__ unsigned List::size() {	// Number of items currently stored.
55 | 	return nitems;
56 | }
57 | __device__ void List::uniq(unsigned *mark, unsigned maxelement) {	// Rebuilds the list keeping only elements e < maxelement whose mark[e] equals this thread's global id.
58 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
59 | 	unsigned mysize = size();
60 | 	if (mysize == 0) return;
61 | 
62 | 	unsigned *newarray = (unsigned *)malloc(mysize * sizeof(unsigned));
63 | 	if (newarray == NULL) {
64 | 		printf("%s(%d): thread %d: Error: malloc of %d unsigned returned no memory.\n", __FILE__, __LINE__, id, mysize);
65 | 		return;	// the list is left untouched on allocation failure
66 | 	}
67 | 	unsigned *insertptr = newarray;
68 | 
69 | 	for (unsigned ii = 0; ii < mysize; ++ii) {
70 | 		unsigned element = array[ii];
71 | 		if (element < maxelement && mark[element] == id) {	// kept: mark[] holds this thread's id for the element (NOTE(review): the previous comment said the opposite - confirm the intent)
72 | 			*insertptr++ = element;
73 | 		}
74 | 	}
75 | 	clear();	// frees the old array
76 | 	init(newarray, insertptr - newarray, mysize);	// adopt the filtered copy; capacity stays at the old item count
77 | }
78 | #endif
79 | 


--------------------------------------------------------------------------------
/src/cusp/vertex_coloring.cu:
--------------------------------------------------------------------------------
 1 | #include <cusp/csr_matrix.h>
 2 | #include <cusp/print.h>
 3 | 
 4 | #include <cusp/gallery/poisson.h>
 5 | #include <cusp/graph/vertex_coloring.h>
 6 | #include <cusp/io/matrix_market.h>
 7 | 
 8 | #include "timer.h"
 9 | 
10 | template<typename MemorySpace, typename MatrixType>
11 | void coloring(const MatrixType& G)	// Colors G with cusp::graph::vertex_coloring, prints the elapsed time and the color count, then prints a per-color tally.
12 | {
13 |     typedef typename MatrixType::index_type IndexType;
14 |     typedef cusp::csr_matrix<IndexType,IndexType,MemorySpace> GraphType;
15 | 
16 |     GraphType G_csr(G);	// CSR copy of G in the requested memory space
17 |     cusp::array1d<IndexType,MemorySpace> colors(G.num_rows, 0);
18 | 
19 |     timer t;	// constructed right before the call that is timed
20 |     size_t max_color = cusp::graph::vertex_coloring(G_csr, colors);
21 |     std::cout << "Coloring time    : " << t.milliseconds_elapsed() << " (ms)." << std::endl;
22 |     std::cout << "Number of colors : " << max_color << std::endl;
23 | 
24 |     if(max_color > 0)
25 |     {
26 |       cusp::array1d<IndexType,MemorySpace> color_counts(max_color);
27 |       thrust::sort(colors.begin(), colors.end());	// sorts colors in place so equal colors are adjacent
28 |       thrust::reduce_by_key(colors.begin(),	// sums a constant 1 per entry of each run of equal colors; the keys themselves are discarded
29 |                           colors.end(),
30 |                           thrust::constant_iterator<int>(1),
31 |                           thrust::make_discard_iterator(),
32 |                           color_counts.begin());
33 |       cusp::print(color_counts);
34 |     }
35 | }
36 | 
37 | int main(int argc, char*argv[])
38 | {
39 |     srand(time(NULL));
40 |     // No argument: color a generated poisson5pt example. One argument: color the Matrix Market file it names.
41 |     typedef int   IndexType;
42 |     typedef float ValueType;
43 |     typedef cusp::host_memory MemorySpace;
44 | 
45 |     cusp::csr_matrix<IndexType, ValueType, MemorySpace> A;
46 |     size_t size = 512;
47 | 
48 |     if (argc == 1)
49 |     {
50 |         // no input file was specified, generate an example
51 |         std::cout << "Generated matrix (poisson5pt) ";
52 |         cusp::gallery::poisson5pt(A, size, size);
53 |     }
54 |     else if (argc == 2)
55 |     {
56 |         // an input file was specified, read it from disk
57 |         cusp::io::read_matrix_market_file(A, argv[1]);
58 |         std::cout << "Read matrix (" << argv[1] << ") ";
59 |     }
60 |     else { std::cerr << "Usage: " << argv[0] << " [matrix_market_file]" << std::endl; return EXIT_FAILURE; }	// extra arguments used to fall through with an empty matrix
61 |     std::cout << "with shape ("  << A.num_rows << "," << A.num_cols << ") and "
62 |               << A.num_entries << " entries" << "\n\n";
63 | 
64 |     //std::cout << " Device ";
65 |     //coloring<cusp::device_memory>(A);
66 | 
67 |     std::cout << " Host ";
68 |     coloring<cusp::host_memory>(A);
69 | 
70 |     return EXIT_SUCCESS;
71 | }
72 | 
73 | 


--------------------------------------------------------------------------------
/include/kernelconfig.h:
--------------------------------------------------------------------------------
  1 | #ifndef LSG_KERNELCONFIG
  2 | #define LSG_KERNELCONFIG
  3 | 
  4 | typedef struct KernelConfig {	// Holds a 1-D launch configuration (blocks x threads) together with the properties of one CUDA device.
  5 | 	unsigned device;	// device ordinal passed to cudaGetDeviceProperties
  6 | 	unsigned problemsize;	// number of work items the launch should cover
  7 | 	unsigned nblocks, blocksize;	// blocks in the grid, threads per block
  8 | 	cudaDeviceProp dp;	// filled in by init()
  9 | 
 10 | 	KernelConfig(unsigned ldevice = 0);
 11 | 	void	 init();
 12 | 	unsigned setProblemSize(unsigned size);	// also recomputes nblocks through calculate()
 13 | 	unsigned setNumberOfBlocks(unsigned lnblocks);
 14 | 	unsigned setNumberOfBlockThreads(unsigned lblocksize);
 15 | 	unsigned setMaxThreadsPerBlock();
 16 | 	unsigned getNumberOfBlocks();
 17 | 	unsigned getNumberOfBlockThreads();
 18 | 	unsigned getNumberOfTotalThreads();	// nblocks * blocksize
 19 | 
 20 | 	unsigned calculate();	// returns 0 on success, 1 when blocksize is 0
 21 | 	unsigned getMaxThreadsPerBlock();
 22 | 	unsigned getMaxBlocks();
 23 | 	unsigned getMaxSharedMemoryPerBlock();
 24 | 	unsigned getNumberOfSMs();
 25 | 	bool	 coversProblem(unsigned size = 0);	// size 0 means "use problemsize"
 26 | 	unsigned getProblemSize();
 27 | } KernelConfig;
 28 | 
 29 | KernelConfig::KernelConfig(unsigned ldevice/* = 0*/) {	// Records the device ordinal, then queries it through init().
 30 | 	device = ldevice;
 31 | 	init();
 32 | }
 33 | void KernelConfig::init() {	// Resets the launch parameters, queries the device and defaults blocksize to the device maximum.
 34 | 	int deviceCount = 0;
 35 | 	// Give the sizing fields defined values first: the early return below used to leave them uninitialized.
 36 | 	problemsize = 0; nblocks = 0; blocksize = 0;
 37 | 	if (cudaSuccess != cudaGetDeviceCount(&deviceCount)) CudaTest("cudaGetDeviceCount failed");
 38 | 	if (deviceCount == 0) {
 39 |         	fprintf(stderr, "No CUDA capable devices found.");
 40 | 		return;
 41 | 	} 
 42 | 
 43 | 	// The property query used to go unchecked; CudaTest reports a pending runtime error and exits.
 44 | 	if (cudaSuccess != cudaGetDeviceProperties(&dp, device)) CudaTest("cudaGetDeviceProperties failed");
 45 | 	fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", deviceCount, device, dp.name, dp.major, dp.minor, getNumberOfSMs(), ConvertSMVer2Cores(dp.major, dp.minor));
 46 | 	// Default block size: the device maximum.
 47 | 	setMaxThreadsPerBlock();	// default.
 48 | }
 49 | unsigned KernelConfig::getMaxThreadsPerBlock() {	// Per-block thread limit of the device.
 50 | 	return dp.maxThreadsPerBlock;	// was maxThreadsDim[0], which is the x-dimension limit rather than the per-block total
 51 | }
 52 | unsigned KernelConfig::getMaxBlocks() {	// Grid limit in the x dimension.
 53 | 	return dp.maxGridSize[0];
 54 | }
 55 | unsigned KernelConfig::getMaxSharedMemoryPerBlock() {	// Value of dp.sharedMemPerBlock, narrowed to unsigned.
 56 | 	return dp.sharedMemPerBlock;
 57 | }
 58 | unsigned KernelConfig::getNumberOfSMs() {	// Multiprocessor count reported for the device.
 59 | 	return dp.multiProcessorCount;
 60 | }
 61 | 
 62 | unsigned KernelConfig::setProblemSize(unsigned size) {	// Stores the problem size and recomputes nblocks; returns calculate()'s status (0 ok, 1 when blocksize is 0).
 63 | 	problemsize = size;
 64 | 	return calculate();
 65 | }
 66 | unsigned KernelConfig::getProblemSize() {	// Last problem size that was set.
 67 | 	return problemsize;
 68 | }
 69 | unsigned KernelConfig::getNumberOfBlocks() {	// Blocks in the grid.
 70 | 	return nblocks;
 71 | }
 72 | unsigned KernelConfig::getNumberOfBlockThreads() {	// Threads per block.
 73 | 	return blocksize;
 74 | }
 75 | unsigned KernelConfig::getNumberOfTotalThreads() {	// Product of blocks and threads per block, in unsigned arithmetic.
 76 | 	return nblocks * blocksize;
 77 | }
 78 | unsigned KernelConfig::calculate() {	// Sets nblocks to ceil(problemsize / blocksize); returns 0 on success, 1 when blocksize is 0.
 79 | 	if (blocksize != 0) {
 80 | 		nblocks = (problemsize + blocksize - 1) / blocksize;
 81 | 		return 0;
 82 | 	}
 83 | 	fprintf(stderr, "blocksize = 0.\n");
 84 | 	return 1;
 85 | }
 86 | unsigned KernelConfig::setNumberOfBlocks(unsigned lnblocks) {	// Overrides nblocks directly; returns the new value.
 87 | 	nblocks = lnblocks;
 88 | 	return nblocks;
 89 | }
 90 | unsigned KernelConfig::setNumberOfBlockThreads(unsigned lblocksize) {	// Sets blocksize; nblocks is not recomputed here.
 91 | 	blocksize = lblocksize;
 92 | 	return blocksize;
 93 | }
 94 | unsigned KernelConfig::setMaxThreadsPerBlock() {	// Sets blocksize to getMaxThreadsPerBlock().
 95 | 	return setNumberOfBlockThreads(getMaxThreadsPerBlock());
 96 | }
 97 | bool KernelConfig::coversProblem(unsigned size/* = 0*/) {	// True when nblocks * blocksize threads cover `size` items; size 0 selects the stored problem size.
 98 | 	const unsigned items = (size == 0) ? problemsize : size;
 99 | 	const unsigned threads = nblocks * blocksize;
100 | 	bool covered = (items <= threads);
101 | 	return covered;
102 | }
103 | #endif
104 | 


--------------------------------------------------------------------------------
/include/worklistc.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include "cutil_subset.h"
  3 | static int zero = 0;
  4 | 
  5 | struct Worklist {	// Handle for a worklist of ints stored on the device; push/pop run in device code, the rest on the host.
  6 | 	int *dwl, *wl;	// device storage and its host mirror
  7 | 	int length, *dnsize;	// capacity on the host / capacity stored on the device
  8 | 	int *dindex;	// device counter: number of items pushed so far
  9 | 
 10 | 	Worklist(size_t nsize) {
 11 | 		length = (int) nsize;	// the capacity is now copied from an int; the old code copied the first sizeof(int) bytes of the size_t
 12 | 		wl = (int *) calloc(nsize, sizeof(int));
 13 | 		CUDA_SAFE_CALL(cudaMalloc(&dwl, nsize * sizeof(int)));
 14 | 		CUDA_SAFE_CALL(cudaMalloc(&dnsize, 1 * sizeof(int)));
 15 | 		CUDA_SAFE_CALL(cudaMalloc(&dindex, 1 * sizeof(int)));
 16 | 		CUDA_SAFE_CALL(cudaMemcpy(dnsize, &length, 1 * sizeof(int), cudaMemcpyHostToDevice));
 17 | 		CUDA_SAFE_CALL(cudaMemcpy((void *) dindex, &zero, 1 * sizeof(zero), cudaMemcpyHostToDevice));
 18 | 	}
 19 | 
 20 | 	~Worklist() {/*CUDA_SAFE_CALL(cudaFree(dwl));*/}	// NOTE(review): nothing is released; presumably because copies of this struct share the same buffers - confirm
 21 | 	int valid_items() { int n = nitems(); return n < 0 ? 0 : (n > length ? length : n); }	// item count clamped to [0, length]: push() keeps incrementing dindex past the capacity
 22 | 	void update_cpu() {
 23 | 		int nsize = valid_items();	// never copy more than wl/dwl can hold
 24 | 		CUDA_SAFE_CALL(cudaMemcpy(wl, dwl, nsize  * sizeof(int), cudaMemcpyDeviceToHost));
 25 | 	}
 26 | 
 27 | 	void display_items() {
 28 | 		int nsize = valid_items();
 29 | 		CUDA_SAFE_CALL(cudaMemcpy(wl, dwl, nsize  * sizeof(int), cudaMemcpyDeviceToHost));
 30 | 		printf("WL: ");
 31 | 		for(int i = 0; i < nsize; i++)
 32 | 			printf("%d %d, ", i, wl[i]);
 33 | 		printf("\n");
 34 | 		return;
 35 | 	}
 36 | 
 37 | 	void reset() {	// empties the list by zeroing the device counter; stored items are not cleared
 38 | 		CUDA_SAFE_CALL(cudaMemcpy((void *) dindex, &zero, 1 * sizeof(zero), cudaMemcpyHostToDevice));
 39 | 	}
 40 | 
 41 | 	int nitems() {	// raw device counter; may exceed length after overflowing pushes
 42 | 		int index;
 43 | 		//printf("dindex=%p &index=%p\n", dindex, &index);
 44 | 		CUDA_SAFE_CALL(cudaMemcpy(&index, (void *) dindex, 1 * sizeof(index), cudaMemcpyDeviceToHost));
 45 | 		return index;
 46 | 	}
 47 | 
 48 | 	__device__ int push(int item) {	// returns 1 when stored, 0 when the reserved slot is beyond the capacity
 49 | 		int lindex = atomicAdd((int *) dindex, 1);
 50 | 		if(lindex >= *dnsize)
 51 | 			return 0;
 52 | 		dwl[lindex] = item;
 53 | 		return 1;
 54 | 	}
 55 | 
 56 | 	__device__ int pop(int &item) {	// returns 1 and sets item when an element was taken, 0 when the list was empty
 57 | 		int lindex = atomicSub((int *) dindex, 1);
 58 | 		if(lindex <= 0) {
 59 | 			*dindex = 0;	// NOTE(review): plain store next to atomic updates - confirm this cannot race with a concurrent push
 60 | 			return 0;
 61 | 		}
 62 | 		item = dwl[lindex - 1];
 63 | 		return 1;
 64 | 	}
 65 | };
 66 | 
 67 | struct Worklist2: public Worklist {	// Worklist with block-aggregated pushes; T must provide TempStorage and ExclusiveSum (presumably a cub::BlockScan instantiation - confirm).
 68 | 	Worklist2(int nsize) : Worklist(nsize) {}
 69 | 
 70 | 	template <typename T> __device__ __forceinline__
 71 | 		int push_1item(int nitem, int item, int threads_per_block) {	// must be called by every thread of the block; nitem is 0 or 1
 72 | 			assert(nitem == 0 || nitem == 1);
 73 | 			__shared__ typename T::TempStorage temp_storage;
 74 | 			__shared__ int queue_index;
 75 | 			int total_items = 0;
 76 | 			int thread_data = nitem;
 77 | 			T(temp_storage).ExclusiveSum(thread_data, thread_data, total_items);	// thread_data becomes this thread's offset within the block
 78 | 			__syncthreads();
 79 | 			if(threadIdx.x == 0) {	
 80 | 				queue_index = atomicAdd((int *) dindex, total_items);	// one reservation for the whole block
 81 | 			}
 82 | 			__syncthreads();
 83 | 			int pushed = total_items;	// becomes 0 for a thread whose slot lies beyond the capacity
 84 | 			if(nitem == 1) {
 85 | 				if(queue_index + thread_data >= *dnsize) {
 86 | 					printf("GPU: exceeded length: %d %d %d %d %d\n", queue_index, thread_data, *dnsize, total_items, *dindex);
 87 | 					pushed = 0;
 88 | 				} else dwl[queue_index + thread_data] = item;
 89 | 			}
 90 | 			// Every thread must reach this barrier; the old early return on overflow skipped it.
 91 | 			__syncthreads();
 92 | 			return pushed;
 93 | 		}
 94 | 
 95 | 	template <typename T>
 96 | 		__device__ __forceinline__
 97 | 		int push_nitems(int n_items, int *items, int threads_per_block) {	// each thread appends its n_items entries from items; returns the block total, or 0 for a thread that ran out of space
 98 | 			__shared__ typename T::TempStorage temp_storage;
 99 | 			__shared__ int queue_index;
100 | 			int total_items;
101 | 			int thread_data = n_items;
102 | 			T(temp_storage).ExclusiveSum(thread_data, thread_data, total_items);
103 | 			if(threadIdx.x == 0) {	
104 | 				queue_index = atomicAdd((int *) dindex, total_items);
105 | 				//printf("queueindex: %d %d %d %d %d\n", blockIdx.x, threadIdx.x, queue_index, thread_data + n_items, total_items);
106 | 			}
107 | 			__syncthreads();
108 | 			for(int i = 0; i < n_items; i++) {
109 | 				//printf("pushing %d to %d\n", items[i], queue_index + thread_data + i);
110 | 				if(queue_index + thread_data + i >= *dnsize) {
111 | 					printf("GPU: exceeded length: %d %d %d %d\n", queue_index, thread_data, i, *dnsize);
112 | 					return 0;	// no barrier follows in this function, so leaving early is safe here
113 | 				}
114 | 				dwl[queue_index + thread_data + i] = items[i];
115 | 			}
116 | 			return total_items;
117 | 		}
118 | 
119 | 	__device__ int pop_id(int id, int &item) {	// reads slot id without removing it; returns 1 when id is below the current item count
120 | 		if(id < *dindex) {
121 | 			//item = cub::ThreadLoad<cub::LOAD_CG>(dwl + id);
122 | 			item = dwl[id];
123 | 			return 1;
124 | 		}
125 | 		return 0;
126 | 	}
127 | };
128 | 


--------------------------------------------------------------------------------
/include/common.h:
--------------------------------------------------------------------------------
  1 | #ifndef LSG_COMMON
  2 | #define LSG_COMMON
  3 | 
  4 | #include <stdio.h>
  5 | #include <cuda.h>
  6 | #include <time.h>
  7 | #include <fstream>
  8 | #include <string>
  9 | #include <iostream>
 10 | #include <limits>
 11 | #include <string.h>
 12 | 
 13 | #include <unistd.h>
 14 | #include <cassert>
 15 | #include <inttypes.h>
 16 | #include <unistd.h>
 17 | #include <stdio.h>
 18 | #include <time.h>
 19 | #include <sys/time.h>
 20 | #include <stdlib.h>
 21 | #include <stdarg.h>
 22 | #include <sys/mman.h>
 23 | #include <sys/stat.h>
 24 | #include <sys/types.h>
 25 | #include <fcntl.h>
 26 | #include <unistd.h>
 27 | #include <cassert>
 28 | #include <inttypes.h>
 29 | 
 30 | #define MAXNBLOCKS	(4*NBLOCKS)
 31 | #define BLOCKSIZE	256
 32 | #define MAXBLOCKSIZE	1024
 33 | #define MAXSHARED	(48*1024)
 34 | #define MAXSHAREDUINT	(MAXSHARED / 4)
 35 | #define SHAREDPERTHREAD	(MAXSHAREDUINT / MAXBLOCKSIZE)
 36 | 
 37 | // For MAC and FreeBSD: by Rashid Kaleem.
 38 | #ifdef __APPLE__ 
 39 | #include <libkern/OSByteOrder.h>
 40 | #  define le64toh(x) OSSwapLittleToHostInt64(x)
 41 | #  define le32toh(x) OSSwapLittleToHostInt32(x)
 42 | #elif __FreeBSD__ 
 43 | #  include <sys/endian.h>
 44 | #elif __linux__ 
 45 | #  include <endian.h>
 46 | #  ifndef le64toh
 47 | #    if __BYTE_ORDER == __LITTLE_ENDIAN
 48 | #      define le64toh(x) (x)
 49 | #      define le32toh(x) (x)
 50 | #    else
 51 | #      define le64toh(x) __bswap_64 (x)
 52 | #      define le32toh(x) __bswap_32 (x)	// the big-endian fallback used to define only the 64-bit macro
 53 | #    endif
 54 | #  endif
 55 | #endif
 56 | #ifndef LSGDEBUG
 57 | #define LSGDEBUG 0
 58 | #endif 
 59 | 
 60 | #define dprintf	if (debug) printf
 61 | unsigned const debug = LSGDEBUG;
 62 | 
 63 | typedef unsigned foru;
 64 | //typedef float foru;
 65 | 
 66 | double rtclock()	// Current time in seconds, including the microsecond fraction, as reported by gettimeofday.
 67 | {
 68 |     struct timeval now;
 69 |     struct timezone zone;
 70 |     const int rc = gettimeofday(&now, &zone);
 71 |     if (rc != 0)
 72 |         printf("Error return from gettimeofday: %d",rc);
 73 |     return (now.tv_sec + now.tv_usec*1.0e-6);
 74 | }
 75 | 
 76 | 
 77 | __device__ 
 78 | void global_sync(unsigned goalVal, volatile unsigned *Arrayin, volatile unsigned *Arrayout) {	// Spin-based barrier across blocks: each block posts goalVal in Arrayin, block 0 collects them and releases every block through Arrayout.
 79 | 	// thread ID in a block
 80 | 	unsigned tid_in_blk = threadIdx.x * blockDim.y + threadIdx.y;
 81 | 	unsigned nBlockNum = gridDim.x * gridDim.y;
 82 | 	unsigned bid = blockIdx.x * gridDim.y + blockIdx.y;
 83 | 	// only thread 0 is used for synchronization
 84 | 	if (tid_in_blk == 0) {
 85 | 		Arrayin[bid] = goalVal;	// announce that this block has arrived
 86 | 		__threadfence();
 87 | 	}
 88 | 	if (bid == 0) {
 89 | 		if (tid_in_blk < nBlockNum) {	// thread t of block 0 waits for block t, so only the first blockDim-many blocks are watched
 90 | 			while (Arrayin[tid_in_blk] != goalVal){
 91 | 				//Do nothing here
 92 | 			}
 93 | 		}
 94 | 		__syncthreads();	// all watched blocks have arrived once block 0 passes this point
 95 | 		if (tid_in_blk < nBlockNum) {
 96 | 			Arrayout[tid_in_blk] = goalVal;	// release block t
 97 | 			__threadfence();
 98 | 		}
 99 | 	}
100 | 	if (tid_in_blk == 0) {
101 | 		while (Arrayout[bid] != goalVal) {	// wait for the release written by block 0
102 | 			//Do nothing here
103 | 		}
104 | 	}
105 | 	__syncthreads();	// hold the rest of the block until its thread 0 has been released
106 | }
107 | 
108 | static unsigned CudaTest(const char *msg)	// Waits for the device, then prints msg with the pending CUDA error and exits; returns 0 when no error is pending. const char*: callers pass string literals.
109 | {
110 |   cudaError_t e;
111 | 
112 |   cudaDeviceSynchronize();	// replaces the deprecated cudaThreadSynchronize()
113 |   if (cudaSuccess != (e = cudaGetLastError())) {
114 |     fprintf(stderr, "%s: %d\n", msg, e);
115 |     fprintf(stderr, "%s\n", cudaGetErrorString(e));
116 |     exit(-1);
117 |     //return 1;
118 |   }
119 |   return 0;
120 | }
121 | // from CUDA SDK.
122 | inline int ConvertSMVer2Cores(int major, int minor)	// Cores per SM for compute capability major.minor; prints a message and returns -1 when the version is not in the table.
123 | {
124 |         // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
125 |         typedef struct {
126 |                 int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
127 |                 int Cores;
128 |         } sSMtoCores;
129 | 
130 |         sSMtoCores nGpuArchCoresPerSM[] =
131 |         { { 0x10,  8 }, { 0x11,  8 }, { 0x12,  8 }, { 0x13,  8 }, // Tesla generation (SM 1.x)
132 |           { 0x20, 32 }, { 0x21, 48 },                             // Fermi generation (SM 2.x)
133 |           { 0x30, 192}, { 0x32, 192}, { 0x35, 192}, { 0x37, 192}, // Kepler generation (SM 3.x)
134 |           { 0x50, 128}, { 0x52, 128}, { 0x53, 128},               // Maxwell generation (SM 5.x)
135 |           { 0x60,  64}, { 0x61, 128}, { 0x62, 128},               // Pascal generation (SM 6.x)
136 |           { 0x70,  64}, { 0x72,  64}, { 0x75,  64},               // Volta and Turing (SM 7.x)
137 |           { 0x80,  64}, { 0x86, 128}, { 0x87, 128},               // Ampere generation (SM 8.0 - 8.7)
138 |           { 0x89, 128}, { 0x90, 128},                             // Ada (SM 8.9) and Hopper (SM 9.0)
139 |           {   -1, -1 }
140 |         };
141 | 
142 |         int index = 0;
143 |         while (nGpuArchCoresPerSM[index].SM != -1) {
144 |                 if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
145 |                         return nGpuArchCoresPerSM[index].Cores;
146 |                 }
147 |                 index++;
148 |         }
149 |         printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
150 |         return -1;
151 | }
152 | 
153 | 
154 | #endif
155 | 


--------------------------------------------------------------------------------
/src/omp/kernel2.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu>
  3 | #define	MAXCOLOR 128 // available colors: 0 ~ (MAXCOLOR - 1)
  4 | // Speculative first-fit pass: each processed vertex takes the smallest color in [0, MAXCOLOR) that none of its CSR neighbors currently holds.
  5 | void FirstFit(int m, int nnz, int *csrRowPtr, int *csrColInd, Worklist &inwl, int *coloring) {	
  6 | 	unsigned start = inwl.start;
  7 | 	unsigned end = inwl.end;
  8 | 	// Scratch table: entry c holds the id of the last vertex that saw color c on a neighbor.
  9 | 	// It is initialized to m + 1; vertex ids are presumably in [0, m), so that matches no vertex.
 10 | #ifndef ENABLE_OPENMP
 11 | 	int *forbiddenColors = (int *) malloc(m * sizeof(int));
 12 | 	for(int i = 0; i < m; i ++) forbiddenColors[i] = m + 1;
 13 | #else
 14 | 	int **forbiddenColors = (int **) malloc(num_omp_threads*sizeof(int*));	// one table per OpenMP thread
 15 | 	for (int i = 0; i < num_omp_threads; i++) {
 16 | 		forbiddenColors[i] = (int *) malloc((MAXCOLOR+1)*sizeof(int));
 17 | 		for(int j = 0; j <= MAXCOLOR; j++) forbiddenColors[i][j] = m + 1;
 18 | 	}
 19 | 	#pragma omp parallel for
 20 | #endif
 21 | 	for (int i = start; i < end; i++) {
 22 | #ifdef ENABLE_OPENMP
 23 | 		int *forbidden = forbiddenColors[omp_get_thread_num()];
 24 | 		int vertex = inwl.getItem(i);
 25 | #else
 26 | 		int *forbidden = forbiddenColors;
 27 | 		int vertex = i;	// the serial build uses the loop index as the vertex id instead of reading the worklist
 28 | #endif
 29 | 		int row_begin = csrRowPtr[vertex];
 30 | 		int row_end = csrRowPtr[vertex + 1];
 31 | 		for (int offset = row_begin; offset < row_end; offset++) {
 32 | 			int neighbor = csrColInd[offset];
 33 | 			forbidden[coloring[neighbor]] = vertex;	// forbid this color
 34 | 		}
 35 | 		// Smallest color that was not stamped with this vertex's id.
 36 | 		int vertex_color = 0;
 37 | 		while (vertex_color < MAXCOLOR && forbidden[vertex_color] == vertex)
 38 | 			vertex_color++;
 39 | 		assert(vertex_color < MAXCOLOR);
 40 | 		coloring[vertex] = vertex_color;
 41 | 	}
 42 | 	// Release the scratch tables; they were never freed before, so every call leaked them.
 43 | #ifdef ENABLE_OPENMP
 44 | 	for (int i = 0; i < num_omp_threads; i++) free(forbiddenColors[i]);
 45 | #endif
 46 | 	free(forbiddenColors);
 47 | }
 48 | 
 49 | void conflictDetect(int m, int nnz, int *csrRowPtr, int *csrColInd, Worklist &inwl, Worklist &outwl, int *coloring) {
 50 | 	// Pushes onto outwl every vertex of inwl that shares its color with a higher-numbered neighbor (one push per vertex).
 51 | 	const unsigned first = inwl.start, last = inwl.end;
 52 | #ifdef ENABLE_OPENMP	
 53 | 	#pragma omp parallel for
 54 | #endif
 55 | 	for (int idx = first; idx < last; idx++) {
 56 | 		const int vertex = inwl.getItem(idx);
 57 | 		const int row_begin = csrRowPtr[vertex];
 58 | 		const int row_end = csrRowPtr[vertex + 1];
 59 | 		for (int edge = row_begin; edge < row_end; edge++) {
 60 | 			const int neighbor = csrColInd[edge];
 61 | 			const bool clash = (coloring[vertex] == coloring[neighbor]) && (vertex < neighbor);
 62 | 			if (!clash) continue;
 63 | 			outwl.push(vertex);
 64 | 			break;
 65 | 		}
 66 | 	}
 67 | }
 68 | 
 69 | void findMax(int *coloring, int n, int *ncolors) {	// Raises *ncolors to the largest color in coloring[0..n), then adds one so the 0-based maximum becomes a color count.
 70 | 	int i;
 71 | 	for (i = 0; i < n; i++) {
 72 | 		if (coloring[i] > *ncolors)
 73 | 			*ncolors = coloring[i];
 74 | 	}
 75 | 	(*ncolors)++;	// was `*ncolors ++`, which advanced the local pointer and left the value unchanged
 76 | }
 77 | 
 78 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring) {	// Runs the coloring ITERATIONS times and prints per-run [colors time iterations] plus the averages.
 79 | 	Worklist inwl, outwl, *inwlptr, *outwlptr, *tmp;
 80 | 	double starttime, endtime;
 81 | 	double runtime[ITERATIONS];
 82 | 	int colors[ITERATIONS];
 83 | 	int iteration[ITERATIONS];
 84 | 	for(int i = 0; i < ITERATIONS; i ++) {
 85 | 		colors[i] = 0;
 86 | 		iteration[i] = 0;
 87 | 	}
 88 | 	inwl.ensureSpace(m);
 89 | 	outwl.ensureSpace(m);
 90 | 	for (int i = 0; i < ITERATIONS; i++) {
 91 | 		inwlptr = &inwl;
 92 | 		outwlptr = &outwl;
 93 | 		starttime = rtclock();
 94 | 		unsigned *range = (unsigned *)malloc(m * sizeof(unsigned));	// NOTE(review): never freed in this function - a leak per run unless pushRange takes ownership; confirm
 95 | 		for (unsigned j = 0; j < m; j++)
 96 | 			range[j] = j;
 97 | 		inwl.pushRange(range, m);	// seed the worklist with every vertex id
 98 | 		unsigned wlsz = inwl.getSize();
 99 | #ifdef ENABLE_OPENMP
100 | 		while (wlsz) {	// OpenMP build only: repeat until conflictDetect reports no conflicting vertex
101 | 			++iteration[i];
102 | 			//printf("iteration=%d, %d vertices to process\n", iteration, wlsz);
103 | #endif
104 | 			FirstFit(m, nnz, csrRowPtr, csrColInd, *inwlptr, coloring);	// the serial build runs this exactly once per run
105 | #ifdef ENABLE_OPENMP
106 | 			__syncthreads();
107 | 			conflictDetect(m, nnz, csrRowPtr, csrColInd, *inwlptr, *outwlptr, coloring);
108 | 			__syncthreads();
109 | 			wlsz = outwlptr->getSize();
110 | 			tmp = inwlptr; inwlptr = outwlptr; outwlptr = tmp;	// conflicting vertices become the next input
111 | 			outwlptr->clear();
112 | 		}
113 | #endif
114 | 		endtime = rtclock();
115 | 		findMax(coloring, m, &colors[i]);
116 | 		runtime[i] = (1000.0f * (endtime - starttime));	// rtclock() is in seconds; stored in milliseconds
117 | 	}
118 | 	double total_time = 0.0;
119 | 	int total_colors = 0;
120 | 	int total_iterations = 0;
121 | 	for (int i = 0; i < ITERATIONS; i++) {
122 | 		total_time += runtime[i];
123 | 		total_colors += colors[i];
124 | 		total_iterations += iteration[i];
125 | 		printf("[%d %.2f %d] ", colors[i], runtime[i], iteration[i]);
126 | 	}   
127 | 	double avg_time = (double)total_time / ITERATIONS;
128 | 	double avg_colors = (double)total_colors / ITERATIONS;
129 | 	double avg_iterations = (double)total_iterations / ITERATIONS;
130 | 	printf("\navg_time %f ms avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations);
131 | }
132 | 


--------------------------------------------------------------------------------
/include/sharedptr.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   @file
  3 |   @section License
  4 |   TODO
  5 | 
  6 |   @section description
  7 | 
  8 |   Convenience class for shared CPU/GPU allocations.
  9 | 
 10 |   Based on the X10 Runtime ideas described in Pai et al. in PACT 2012.
 11 | 
 12 |   Also see NVIDIA Hemi's array.h at <https://github.com/harrism/hemi>
 13 | 
 14 |   @author Sreepathi Pai  <sreepai@ices.utexas.edu>
 15 | */
 16 | 
 17 | #pragma once
 18 | #include <cstdlib>
 19 | #include <cstdio>
 20 | #include <cuda.h>
 21 | #include <assert.h>
 22 | #include "cutil_subset.h"
 23 | 
 24 | template<typename T>
 25 | class Shared {	// Array of nmemb elements of T with lazily allocated host/device copies; slot 0 is the host copy, slots >= 1 are device copies.
 26 |   T **ptrs;          // one buffer per slot, NULL until first requested
 27 |   bool *owner;       // owner[i]: slot i is treated as holding current data
 28 |   bool *isCPU;       // isCPU[i]: slot i is host memory
 29 |   int max_devices;   // number of slots
 30 |   size_t nmemb;      // element count
 31 | 
 32 | public:
 33 | 
 34 |   Shared() {
 35 |     nmemb = 0; max_devices = 0; ptrs = NULL; owner = NULL; isCPU = NULL;  // defined empty state: free() on an unallocated object is now a no-op
 36 |   }
 37 | 
 38 |   Shared(size_t nmemb) {
 39 |     this->nmemb = nmemb;
 40 |     max_devices = 2;
 41 |     ptrs = (T **) calloc(max_devices, sizeof(T *));
 42 |     owner = (bool *) calloc(max_devices, sizeof(bool));
 43 |     isCPU = (bool *) calloc(max_devices, sizeof(bool));
 44 | 
 45 |     isCPU[0] = true;
 46 | 
 47 |     for(int i = 0; i < max_devices; i++)
 48 |       owner[i] = true;
 49 |   }
 50 | 
 51 |   void alloc(size_t nmemb) {  // deferred setup for a default-constructed object; same state as Shared(nmemb)
 52 |     assert(this->nmemb == 0);
 53 | 
 54 |     this->nmemb = nmemb;
 55 | 
 56 |     max_devices = 2;
 57 |     ptrs = (T **) calloc(max_devices, sizeof(T *));
 58 |     owner = (bool *) calloc(max_devices, sizeof(bool));
 59 |     isCPU = (bool *) calloc(max_devices, sizeof(bool));
 60 | 
 61 |     isCPU[0] = true;
 62 | 
 63 |     for(int i = 0; i < max_devices; i++)
 64 |       owner[i] = true;   
 65 |   }
 66 | 
 67 |   void free()  // releases every slot's buffer; the bookkeeping arrays are kept
 68 |   {
 69 |     for(int i = 0; i < max_devices; i++)
 70 |       free_device(i);
 71 |   }
 72 | 
 73 |   bool free_device(int device = 0)  // releases one slot; returns false only when cudaFree fails
 74 |   {
 75 |     assert(device < max_devices);
 76 | 
 77 |     if(!ptrs[device])
 78 |       return true;
 79 | 
 80 |     if(isCPU[device])
 81 |       ::free(ptrs[device]);
 82 |     else
 83 |       {
 84 | 	if(cudaFree(ptrs[device]) != cudaSuccess)
 85 | 	  return false;
 86 |       }
 87 | 
 88 |     // Clear the slot for host memory too: it used to stay dangling, so a second free() released it twice.
 89 |     ptrs[device] = NULL;
 90 |     return true;
 91 |   }
 92 | 
 93 |   bool find_owner(int &o)  // sets o to the first slot flagged as owner; returns false when none is flagged
 94 |   {
 95 |     int i;
 96 |     for(i = 0; i < max_devices; i++)
 97 |       if(owner[i]) {
 98 | 	o = i;
 99 | 	break;
100 |       }
101 |     
102 |     return i < max_devices;    
103 |   }
104 | 
105 |   
106 |   T *cpu_rd_ptr()  // host pointer for reading; allocates on first use and copies from an owner slot if the host copy is stale
107 |   {
108 |     if(ptrs[0] == NULL)
109 |       ptrs[0] = (T *) calloc(nmemb, sizeof(T));
110 | 
111 |     if(!owner[0])
112 |       {
113 | 	int o;
114 | 	if(find_owner(o))
115 | 	  copy(o, 0);
116 | 
117 | 	owner[0] = true;
118 |       }
119 | 
120 |     return ptrs[0];
121 |   }
122 | 
123 |   T *cpu_wr_ptr(bool overwrite = false)  // host pointer for writing; overwrite=true skips the refresh copy; all other slots lose ownership
124 |   {
125 |     if(ptrs[0] == NULL)
126 |       ptrs[0] = (T *) calloc(nmemb, sizeof(T));
127 | 
128 |     if(!owner[0])
129 |       {
130 | 	if(!overwrite)
131 | 	  {
132 | 	    int o;
133 | 	    if(find_owner(o))
134 | 	      copy(o, 0);
135 | 	  }
136 | 
137 | 	owner[0] = true;
138 |       }
139 | 
140 |     for(int i = 1; i < max_devices; i++)
141 |       owner[i] = false;
142 | 
143 |     return ptrs[0];
144 |   }
145 | 
146 |   T *gpu_rd_ptr(int device = 1) /* device >= 1 */
147 |   {
148 |     assert(device >= 1);
149 | 
150 |     if(ptrs[device] == NULL)
151 |       CUDA_SAFE_CALL(cudaMalloc(&ptrs[device], nmemb * sizeof(T)));
152 | 
153 |     if(!owner[device])
154 |       {
155 | 	int o;
156 | 	if(find_owner(o))
157 | 	  copy(o, device);
158 | 
159 | 	owner[device] = true;
160 |       }
161 | 
162 |     return ptrs[device];
163 |   }
164 | 
165 |   T *gpu_wr_ptr(bool overwrite = false, int device = 1)  // device pointer for writing; every other slot loses ownership
166 |   {
167 |     assert(device >= 1);
168 | 
169 |     if(ptrs[device] == NULL)
170 |       CUDA_SAFE_CALL(cudaMalloc(&ptrs[device], nmemb * sizeof(T)));
171 | 
172 |     if(!owner[device])
173 |       {
174 | 	if(!overwrite)
175 | 	  {
176 | 	    int o;
177 | 	    if(find_owner(o))
178 | 	      copy(o, device);
179 | 	  }
180 | 
181 | 	owner[device] = true;
182 |       }
183 | 
184 |     for(int i = 0; i < max_devices; i++)
185 |       if(i != device)
186 | 	owner[i] = false;
187 | 
188 |     return ptrs[device];
189 |   }
190 | 
191 |   void copy(int src, int dst)  // copies nmemb elements between slots; does nothing when the source slot has no buffer
192 |   {
193 |     if(!ptrs[src])
194 |       return;
195 | 
196 |     assert(ptrs[dst]);
197 | 
198 |     if(isCPU[dst] && !isCPU[src]) {
199 |       CUDA_SAFE_CALL(cudaMemcpy(ptrs[dst], ptrs[src], nmemb * sizeof(T), cudaMemcpyDeviceToHost));
200 |     } else if (!isCPU[dst] && !isCPU[src]) {
201 |       CUDA_SAFE_CALL(cudaMemcpy(ptrs[dst], ptrs[src], nmemb * sizeof(T), cudaMemcpyDeviceToDevice)); 
202 |     } else if (!isCPU[dst] && isCPU[src]) {
203 |       CUDA_SAFE_CALL(cudaMemcpy(ptrs[dst], ptrs[src], nmemb * sizeof(T), cudaMemcpyHostToDevice)); 
204 |     } else
205 |       abort(); // cpu-to-cpu not implemented
206 |   
207 |   }
208 | };
209 |   
210 | 
211 | 


--------------------------------------------------------------------------------
/src/omp/kernel1.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | #include "worklist.h"
  3 | 
  4 | #include <vector>
  5 | #include <set>
  6 | 
  7 | using namespace std;
  8 | */
  9 | 
 10 | #define	MAXCOLOR	128
 11 | 
 12 | void FirstFit(int m, int nnz, int *csrRowPtr, int *csrColInd, Worklist &inwl, int *coloring)
 13 | {	
 14 | 	unsigned start, end;
 15 | 	int ii;
 16 | 	// First-fit pass over the worklist using a MAXCOLOR-bit mask of still-available colors per node.
 17 | 	start = inwl.start;
 18 | 	end = inwl.end;
 19 | 	// Color 0 is never handed out (bit 0 starts cleared), and neighbors holding color 0 are ignored below.
 20 | 
 21 | 	#ifdef ENABLE_OPENMP	
 22 | 	#pragma omp parallel for
 23 | 	#endif
 24 | 	for (ii = start; ii < end; ii++) {
 25 | 		int  j, node, neighbors, neighbor_j;
 26 | 
 27 | 		node = inwl.getItem(ii);
 28 | 		int neighboroffset = csrRowPtr[node];
 29 | 		neighbors = csrRowPtr[node + 1] - neighboroffset;
 30 | 	
 31 | 		unsigned v[MAXCOLOR / 32];	// bit b of word w set => color 32*w + b is still available
 32 | 		v[0] = 0xfffffffe;
 33 | 		for (j = 1; j < MAXCOLOR / 32; j++)
 34 | 			v[j] = 0xffffffff;	
 35 | 
 36 | 		for (j = 0; j < neighbors; j++) {
 37 | 			neighbor_j = csrColInd[neighboroffset + j];
 38 | 			int color_j = coloring[neighbor_j];
 39 | 			if (color_j > 0 && color_j < MAXCOLOR)	// colors outside the mask would index past v[]
 40 | 				v[color_j / 32] &= ~(1u << (color_j % 32));	// 1u: shifting a signed 1 into bit 31 is undefined
 41 | 		}
 42 | 		// Index of the lowest set bit across the words; c ends at MAXCOLOR + 32 when no bit is set.
 43 | 		int c = 32;
 44 |                 for (int i = 0; i < MAXCOLOR / 32; i++) {
 45 |                         if (v[i] != 0) {
 46 |                                 v[i] &= (~v[i] + 1u);	// isolate the lowest set bit without negating a signed value
 47 |                                 if (v[i]) c--;
 48 |                                 if (v[i] & 0x0000ffff) c -= 16;
 49 |                                 if (v[i] & 0x00ff00ff) c -= 8;
 50 |                                 if (v[i] & 0x0f0f0f0f) c -= 4;
 51 |                                 if (v[i] & 0x33333333) c -= 2;
 52 |                                 if (v[i] & 0x55555555) c -= 1;
 53 |                                 break;
 54 |                         }
 55 |                         else
 56 |                                 c += 32;
 57 |                 }
 58 |                 coloring[node] = c;		
 59 | 	}
 60 | }
 61 | 
 62 | void conflictDetect(int m, int nnz, int *csrRowPtr, int *csrColInd, Worklist &inwl, Worklist &outwl, int *coloring)
 63 | {
 64 | 	// Scans the inwl positions [start, end) and pushes onto outwl each node whose color
 65 | 	// equals the color of a neighbor with a larger id. A node is pushed at most once.
 66 | 	//
 67 | 	// m and nnz are not used by this function.
 68 | 	const unsigned first = inwl.start;
 69 | 	const unsigned last = inwl.end;
 70 | 	int pos;
 71 | 
 72 | 	#ifdef ENABLE_OPENMP	
 73 | 	#pragma omp parallel for
 74 | 	#endif
 75 | 	for (pos = first; pos < last; pos++) {
 76 | 		const int node = inwl.getItem(pos);
 77 | 
 78 | 		// The node's adjacency segment in csrColInd.
 79 | 		const int row_begin = csrRowPtr[node];
 80 | 		const int row_end = csrRowPtr[node + 1];
 81 | 
 82 | 		int edge = row_begin;
 83 | 		while (edge < row_end) {
 84 | 			const int other = csrColInd[edge];
 85 | 			const bool same_color = (coloring[node] == coloring[other]);
 86 | 
 87 | 			if (same_color && node < other) {
 88 | 				// Only the lower-numbered endpoint of a conflicting edge is queued.
 89 | 				outwl.push(node);
 90 | 				break;
 91 | 			}
 92 | 
 93 | 			++edge;
 94 | 		}
 95 | 	}
 96 | }
 97 | 
 98 | void findMax(int *coloring, int n, int *ncolors) {	// Raises *ncolors to the largest entry of coloring[0..n) when that entry exceeds the incoming value.
 99 | 	int best = *ncolors;
100 | 	for (int k = 0; k < n; ++k) {
101 | 		const int c = coloring[k];
102 | 		if (c > best) best = c;
103 | 	}
104 | 	*ncolors = best;
105 | }
106 | 
107 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *ncolors, int *coloring)	// Colors the CSR graph once, stores the largest color found in *ncolors and prints the runtime and color count.
108 | {
109 | 	Worklist inwl, outwl, *inwlptr, *outwlptr, *tmp;
110 | 	
111 | 	double starttime, endtime;
112 | 	double runtime;
113 | 	
114 | 	//int nnodes = graph.nnodes;
115 | 	
116 | 	inwl.ensureSpace(m);
117 | 	outwl.ensureSpace(m);
118 | 	inwlptr = &inwl;
119 | 	outwlptr = &outwl;
120 | 	
121 | 	unsigned *range;
122 | 	range = (unsigned *)malloc(m * sizeof(unsigned));	// NOTE(review): never freed in this function - a leak unless pushRange takes ownership; confirm
123 | 	for (unsigned i = 0; i < m; i++)
124 | 		range[i] = i;
125 | 	//inwl.pushRange(graph.srcsrc, nnodes);
126 | 	inwl.pushRange(range, m);	// seed the worklist with every vertex id
127 | 
128 | 	int iteration = 0;
129 | 	unsigned wlsz = inwl.getSize();
130 | 	//printf("wlsz=%d, outwl=%d\n", wlsz, outwl.getSize());
131 | 	//printf("solving.\n");
132 | 		
133 | 	starttime = rtclock();	
134 | 	#ifdef ENABLE_OPENMP
135 | 	while (wlsz) {	// OpenMP build only: repeat until conflictDetect reports no conflicting node
136 | 		++iteration;
137 | 	#endif
138 | 	
139 | 		//FirstFit(graph, *inwlptr, coloring);
140 | 		FirstFit(m, nnz, csrRowPtr, csrColInd, *inwlptr, coloring);	// the serial build runs this exactly once
141 | 		#ifdef ENABLE_OPENMP
142 | 		__syncthreads();
143 | 		//printf("ok\n");
144 | 		//conflictDetect(graph, *inwlptr, *outwlptr, coloring);
145 | 		conflictDetect(m, nnz, csrRowPtr, csrColInd, *inwlptr, *outwlptr, coloring);
146 | 		__syncthreads();
147 | 		//printf("ok\n");
148 | 
149 | 		//printf("iteration %d:inwl=%d, outwl=%d\n", iteration, wlsz, outwlptr->getSize());
150 | 		wlsz = outwlptr->getSize();
151 | 
152 | 		tmp = inwlptr; inwlptr = outwlptr; outwlptr = tmp;	// conflicting nodes become the next input
153 | 		outwlptr->clear();
154 | 	}
155 | 		#endif
156 | 	endtime = rtclock();
157 | 	
158 | 	//verify<<<(nnodes - 1) / 1024 + 1, 1024>>>(graph, coloring, correct);
159 | 	//CUDA_SAFE_CALL(cudaDeviceSynchronize());
160 | 	//if (*correct) {
161 | 		//findMax<<<(nnodes - 1) / 1024 + 1, 1024>>>(coloring, nnodes, ncolors);
162 | 	findMax(coloring, m, ncolors);
163 | 		//CUDA_SAFE_CALL(cudaDeviceSynchronize());
164 | 	//}
165 | 	
166 | 	runtime = (1000.0f * (endtime - starttime));	// rtclock() is in seconds; printed in milliseconds
167 | 	printf("runtime=%f\tcolors=%d\t", runtime, *ncolors);
168 | }
169 | 


--------------------------------------------------------------------------------
/src/data/kernel_ldb.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | #include <cub/cub.cuh>
  4 | #include "gbar.cuh"
  5 | #include "cuda_launch_config.hpp"
  6 | #include "cutil_subset.h"
  7 | #include "common.h"
  8 | #include <thrust/fill.h>
  9 | #include <thrust/sequence.h>
 10 | #include <thrust/count.h>
 11 | #include <thrust/reduce.h>
 12 | #include <thrust/functional.h>
 13 | #include <thrust/execution_policy.h>
 14 | #include <b40c_test_util.h>
 15 | #include <b40c/graph/builder/dimacs.cuh>
 16 | #include <b40c/graph/color/csr_problem.cuh>
 17 | #include <b40c/graph/csr_graph.cuh>
 18 | //#include <b40c/graph/color/enactor_hybrid.cuh>
 19 | #include <b40c/graph/color/enactor_two_phase.cuh>
 20 | #define INIT_VAL -1
 21 | using namespace b40c;
 22 | using namespace graph;
 23 | 
 24 | void verify_color(unsigned *dist, int m, int *csrRowPtr, int *csrColInd, unsigned *nerr) {
 25 | 	for (int nn = 0; nn < m; nn ++) {
 26 | 		int neighbor_offset = csrRowPtr[nn];
 27 | 		int neighbor_size = csrRowPtr[nn + 1] - neighbor_offset;
 28 | 		for (unsigned ii = 0; ii < neighbor_size; ++ii) {
 29 | 			int v = csrColInd[neighbor_offset + ii];
 30 | 			unsigned wt = 1;
 31 | 			if (wt > 0 && dist[nn] + wt < dist[v]) {
 32 | 				//printf("%d %d %d %d\n", nn, v, dist[nn], dist[v]);
 33 | 				++*nerr;
 34 | 			}
 35 | 		}
 36 | 	}	
 37 | }
 38 | 
 39 | void write_solution(const char *fname, int m, unsigned *h_dist) {
 40 | 	//unsigned *h_dist;
 41 | 	//h_dist = (unsigned *) malloc(m * sizeof(unsigned));
 42 | 	assert(h_dist != NULL);
 43 | 	//CUDA_SAFE_CALL(cudaMemcpy(h_dist, dist, m * sizeof(foru), cudaMemcpyDeviceToHost));
 44 | 	printf("Writing solution to %s\n", fname);
 45 | 	FILE *f = fopen(fname, "w");
 46 | 	fprintf(f, "Computed solution (source dist): [");
 47 | 	for(int node = 0; node < m; node++) {
 48 | 		fprintf(f, "%d:%d\n ", node, h_dist[node]);
 49 | 	}
 50 | 	fprintf(f, "]");
 51 | 	free(h_dist);
 52 | }
 53 | 
 54 | void color_ldb(int m, int nnz, int *csrRowPtr, int *csrColInd, int *ncolors, int *coloring, int num_SMs) {
 55 | 	printf("Graph coloring data-driven load-balance version\n");
 56 | 	typedef int VertexId;
 57 | 	typedef unsigned Value;
 58 | 	typedef int SizeT;
 59 | 	int *d_csrRowPtr, *d_csrColInd;
 60 | 	int *d_coloring;
 61 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int)));
 62 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int)));
 63 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int)));
 64 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
 65 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice));
 66 | 	CUDA_SAFE_CALL(cudaMemset(d_coloring, INIT_VAL, m  * sizeof(int)));
 67 | 
 68 | 	graph::CsrGraph<VertexId, Value, SizeT> csr_graph;
 69 | 	csr_graph.FromScratch<true>(m, nnz);
 70 | 	CUDA_SAFE_CALL(cudaMemcpy(csr_graph.row_offsets, d_csrRowPtr, sizeof(SizeT) * (m + 1), cudaMemcpyDeviceToHost));
 71 | 	CUDA_SAFE_CALL(cudaMemcpy(csr_graph.column_indices, d_csrColInd, sizeof(VertexId) * nnz, cudaMemcpyDeviceToHost));
 72 | 
 73 | 	typedef color::CsrProblem<VertexId, SizeT, false> CsrProblem;
 74 | 	color::EnactorTwoPhase<false> two_phase(false);
 75 | 	//color::EnactorHybrid<false> hybrid(false);
 76 | 	CsrProblem csr_problem;
 77 | 	if (csr_problem.FromHostProblem(false, csr_graph.nodes, csr_graph.edges, csr_graph.column_indices, csr_graph.row_offsets, 1)) exit(1);
 78 | 	cudaError_t	retval = cudaSuccess;
 79 | 	double runtime[ITERATIONS];
 80 | 	int colors[ITERATIONS];
 81 | 	double starttime, endtime;
 82 | 	for (int i = 0; i < ITERATIONS; i++) {
 83 | 		starttime = rtclock();
 84 | 		//if (retval = csr_problem.Reset(hybrid.GetFrontierType(), 1.3))
 85 | 		if (retval = csr_problem.Reset(two_phase.GetFrontierType(), 1.3))
 86 | 			return;
 87 | 		//if (retval = hybrid.EnactSearch(csr_problem, 0)) {
 88 | 		if (retval = two_phase.EnactIterativeSearch(csr_problem, 0)) {
 89 | 			if (retval && (retval != cudaErrorInvalidDeviceFunction)) {
 90 | 				exit(1);
 91 | 			}
 92 | 		}
 93 | 		CUDA_SAFE_CALL(cudaDeviceSynchronize());
 94 | 		endtime = rtclock();
 95 | 		runtime[i] = 1000.0f * (endtime - starttime);
 96 | 		//colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum<int>());
 97 | 	}
 98 | 	unsigned *h_dist;
 99 | 	h_dist = (unsigned *) malloc(m * sizeof(unsigned));
100 | 	assert(h_dist != NULL);
101 | 	if (csr_problem.ExtractResults((int *) h_dist)) exit(1);
102 | 	for(int i = 0; i < m; i++)
103 | 		if((signed) h_dist[i] == -1)
104 | 			h_dist[i] = 1000000000;
105 | 	printf("Done!\n");
106 | 	unsigned nerr = 0;
107 | 	printf("verifying.\n");
108 | 	verify_color(h_dist, m, csrRowPtr, csrColInd, &nerr);
109 | 	printf("\tno of errors = %d.\n", nerr);
110 | 	write_solution("color-output.txt", m, h_dist);
111 | 	exit(0);
112 | 	/*
113 | 	cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost);
114 | 	*ncolors = colors[ITERATIONS - 1];
115 | 	double totaltime = 0.0;
116 | 	int totalcolors = 0;
117 | 	for (int i = 0; i < ITERATIONS; i++) {
118 | 		totaltime += runtime[i];
119 | 		totalcolors += colors[i];
120 | 		printf("[%d %f] ", colors[i], runtime[i]);
121 | 	}
122 | 	double avgtime = (double)totaltime / ITERATIONS;
123 | 	double avgcolors = (double)totalcolors / ITERATIONS;
124 | 	printf("\navgtime=%f ms, avgcolors = %f\n", avgtime, avgcolors);
125 | 	*/
126 | }
127 | 


--------------------------------------------------------------------------------
/include/util.h:
--------------------------------------------------------------------------------
  1 | #include <sys/mman.h>
  2 | #include <sys/stat.h>
  3 | #include <sys/types.h>
  4 | #include <fcntl.h>
  5 | #include <unistd.h>
  6 | #include <cassert>
  7 | #include <inttypes.h>
  8 | 
  9 | 
 10 | #define FORMATSTR	"%d %d %d"
 11 | 
 12 | unsigned allocOnHost(Graph &gg) {
 13 | 	gg.destination = (unsigned int *)malloc((gg.nedges+1) * sizeof(unsigned int));	// first entry acts as null.
 14 | 	gg.weight = (foru *)malloc((gg.nedges+1) * sizeof(foru));	// first entry acts as null.
 15 | 	gg.psrc = (unsigned int *)calloc(gg.nnodes+1, sizeof(unsigned int));	// init to null.
 16 | 	gg.psrc[gg.nnodes] = gg.nedges;	// last entry points to end of edges, to avoid thread divergence in drelax.
 17 | 	gg.noutgoing = (unsigned int *)calloc(gg.nnodes, sizeof(unsigned int));	// init to 0.
 18 | 	gg.srcsrc = (unsigned int *)malloc(gg.nnodes * sizeof(unsigned int));
 19 | 
 20 | 	return 0;
 21 | }
 22 | void progressPrint(unsigned maxii, unsigned ii) {
 23 | 	const unsigned nsteps = 10;
 24 | 	unsigned ineachstep = (maxii / nsteps);
 25 | 	if (ii % ineachstep == 0) {
 26 | 		printf("\t%3d%%\r", ii*100/maxii + 1);
 27 | 		fflush(stdout);
 28 | 	}
 29 | }
 30 | unsigned readFromEdges(char file[], Graph &gg) {
 31 | 	std::ifstream cfile;
 32 | 	cfile.open(file);
 33 | 
 34 | 	std::string str;
 35 | 	getline(cfile, str);
 36 | 	sscanf(str.c_str(), "%d %d", &gg.nnodes, &gg.nedges);
 37 | 
 38 | 	printf("file %s: nnodes=%d, nedges=%d.\n", file, gg.nnodes, gg.nedges);
 39 | 	allocOnHost(gg);
 40 | 	for (unsigned ii = 0; ii < gg.nnodes; ++ii) {
 41 | 		gg.srcsrc[ii] = ii;
 42 | 	}
 43 | 
 44 | 
 45 | 	unsigned int prevnode = 0;
 46 | 	unsigned int tempsrcnode;
 47 | 	unsigned int ncurroutgoing = 0;
 48 | 	unsigned unweightedgraph = 0;
 49 | 	for (unsigned ii = 0; ii < gg.nedges; ++ii) {
 50 | 		getline(cfile, str);
 51 | 		if (unweightedgraph) {
 52 | 			sscanf(str.c_str(), "%d %d", &tempsrcnode, &gg.destination[ii+1]);
 53 | 			gg.weight[ii+1] = 0;
 54 | 		} else {
 55 | 			sscanf(str.c_str(), FORMATSTR, &tempsrcnode, &gg.destination[ii+1], &gg.weight[ii+1]);
 56 | 		}
 57 | 		if (prevnode == tempsrcnode) {
 58 | 			if (ii == 0) {
 59 | 				gg.psrc[tempsrcnode] = ii + 1;
 60 | 			}
 61 | 			++ncurroutgoing;
 62 | 		} else {
 63 | 			gg.psrc[tempsrcnode] = ii + 1;
 64 | 			if (ncurroutgoing) {
 65 | 				gg.noutgoing[prevnode] = ncurroutgoing;
 66 | 			}
 67 | 			prevnode = tempsrcnode;
 68 | 			ncurroutgoing = 1;	// not 0.
 69 | 		}
 70 | 
 71 | 		progressPrint(gg.nedges, ii);
 72 | 	}
 73 | 	gg.noutgoing[prevnode] = ncurroutgoing;	// last entries.
 74 | 
 75 | 	printf("\n");
 76 | 	cfile.close();
 77 | 	return 0;
 78 | }
 79 | 
 80 | unsigned readFromGR(char file[], Graph &gg) {
 81 | 	std::ifstream cfile;
 82 | 	cfile.open(file);
 83 | 
 84 | 	// copied from GaloisCpp/trunk/src/FileGraph.h
 85 | 	int masterFD = open(file, O_RDONLY);
 86 |   	if (masterFD == -1) {
 87 | 	printf("FileGraph::structureFromFile: unable to open %s.\n", file);
 88 | 	return 1;
 89 |   	}
 90 | 
 91 |   	struct stat buf;
 92 | 	int f = fstat(masterFD, &buf);
 93 |   	if (f == -1) {
 94 |     		printf("FileGraph::structureFromFile: unable to stat %s.\n", file);
 95 |     		abort();
 96 |   	}
 97 |   	size_t masterLength = buf.st_size;
 98 | 
 99 |   	int _MAP_BASE = MAP_PRIVATE;
100 | //#ifdef MAP_POPULATE
101 | //  _MAP_BASE  |= MAP_POPULATE;
102 | //#endif
103 | 
104 |   	void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0);
105 |   	if (m == MAP_FAILED) {
106 |     		m = 0;
107 |     		printf("FileGraph::structureFromFile: mmap failed.\n");
108 |     		abort();
109 |   	}
110 | 
111 |   	//parse file
112 |   	uint64_t* fptr = (uint64_t*)m;
113 |   	__attribute__((unused)) uint64_t version = le64toh(*fptr++);
114 |   	assert(version == 1);
115 |   	uint64_t sizeEdgeTy = le64toh(*fptr++);
116 |   	uint64_t numNodes = le64toh(*fptr++);
117 |   	uint64_t numEdges = le64toh(*fptr++);
118 |   	uint64_t *outIdx = fptr;
119 |   	fptr += numNodes;
120 |   	uint32_t *fptr32 = (uint32_t*)fptr;
121 |   	uint32_t *outs = fptr32; 
122 |   	fptr32 += numEdges;
123 |   	if (numEdges % 2) fptr32 += 1;
124 |   	foru  *edgeData = (foru *)fptr32;
125 | 	
126 | 	// cuda.
127 | 	gg.nnodes = numNodes;
128 | 	gg.nedges = numEdges;
129 | 
130 | 	printf("file %s: nnodes=%d, nedges=%d.\n", file, gg.nnodes, gg.nedges);
131 | 	allocOnHost(gg);
132 | 
133 | 	for (unsigned ii = 0; ii < gg.nnodes; ++ii) {
134 | 		// fill unsigned *noutgoing, *nincoming, *srcsrc, *psrc, *destination; unsigned *weight;
135 | 		gg.srcsrc[ii] = ii;
136 | 		if (ii > 0) {
137 | 			gg.psrc[ii] = le64toh(outIdx[ii - 1]) + 1;
138 | 			gg.noutgoing[ii] = le64toh(outIdx[ii]) - le64toh(outIdx[ii - 1]);
139 | 		} else {
140 | 			gg.psrc[0] = 1;
141 | 			gg.noutgoing[0] = le64toh(outIdx[0]);
142 | 		}
143 | 		for (unsigned jj = 0; jj < gg.noutgoing[ii]; ++jj) {
144 | 			unsigned edgeindex = gg.psrc[ii] + jj;
145 | 			unsigned dst = le32toh(outs[edgeindex - 1]);
146 | 			if (dst >= gg.nnodes) printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, edgeindex);
147 | 			gg.destination[edgeindex] = dst;
148 | 			gg.weight[edgeindex] = edgeData[edgeindex - 1];	// Weighted.
149 | 			//gg.weight[edgeindex] = 1;			// Unweighted like wikipedia.
150 | 
151 | 		}
152 | 		progressPrint(gg.nnodes, ii);
153 | 	}
154 | 	printf("\n");
155 | 
156 | 	cfile.close();	// probably galois doesn't close its file due to mmap.
157 | 	return 0;
158 | }
159 | unsigned readInput(char file[], Graph &gg) {
160 | 	if (strstr(file, ".edges") || strstr(file, ".undirected")) {
161 | 		return readFromEdges(file, gg);
162 | 	} else if (strstr(file, ".gr")) {
163 | 		return readFromGR(file, gg);
164 | 	}
165 | 	return 0;
166 | }
167 | 
168 | 


--------------------------------------------------------------------------------
/src/topo/kernel.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | #include "cuda_launch_config.hpp"
  4 | #include "cutil_subset.h"
  5 | #include "common.h"
  6 | #include <thrust/fill.h>
  7 | #include <thrust/count.h>
  8 | #include <thrust/reduce.h>
  9 | #include <thrust/functional.h>
 10 | #include <thrust/execution_policy.h>
 11 | #define	MAXCOLOR 128
 12 | 
 13 | __global__ void initialize(int *coloring, bool *colored, int m) {
 14 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
 15 | 	if (id < m) {
 16 | 		coloring[id] = MAXCOLOR;
 17 | 		colored[id] = false;
 18 | 	}
 19 | }
 20 | 
 21 | __global__ void firstFit(int m, int *csrRowPtr, int *csrColInd, int *coloring, bool *changed) {
 22 | 	int id = blockIdx.x * blockDim.x + threadIdx.x;	
 23 | 	bool forbiddenColors[MAXCOLOR+1];
 24 | 	if (coloring[id] == MAXCOLOR) {
 25 | 		for (int i = 0; i < MAXCOLOR; i++)
 26 | 			forbiddenColors[i] = false;
 27 | 		int row_begin = csrRowPtr[id];
 28 | 		int row_end = csrRowPtr[id + 1];
 29 | 		for (int offset = row_begin; offset < row_end; offset ++) {
 30 | 			int neighbor = csrColInd[offset];
 31 | 			int color = coloring[neighbor];
 32 | 			forbiddenColors[color] = true;
 33 | 		}
 34 | 		int vertex_color;
 35 | 		for (vertex_color = 0; vertex_color < MAXCOLOR; vertex_color++) {
 36 | 			if (!forbiddenColors[vertex_color]) {
 37 | 				coloring[id] = vertex_color;
 38 | 				break;
 39 | 			}
 40 | 		}
 41 | 		assert(vertex_color < MAXCOLOR);
 42 | 		*changed = true;
 43 | 	}
 44 | }
 45 | 
 46 | __global__ void conflictResolve(int m, int *csrRowPtr, int *csrColInd, int *coloring, bool *colored) {
 47 | 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 48 | 	if (!colored[id]) {
 49 | 		int row_begin = csrRowPtr[id];
 50 | 		int row_end = csrRowPtr[id + 1];
 51 | 		int offset;
 52 | 		for (offset = row_begin; offset < row_end; offset ++) {
 53 | 			int neighbor = csrColInd[offset];
 54 | 			if (coloring[id] == coloring[neighbor] && id < neighbor) {
 55 | 				coloring[id] = MAXCOLOR;
 56 | 				break;
 57 | 			}
 58 | 		}
 59 | 		if(offset == row_end)
 60 | 			colored[id] = true;
 61 | 	}
 62 | }
 63 | 
 64 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int blksz) {
 65 | 	double starttime, endtime, t1, t2;
 66 | 	double runtime[ITERATIONS];
 67 | 	int colors[ITERATIONS];
 68 | 	int iterations[ITERATIONS];
 69 | 	double avgtime, avgcolors;
 70 | 	int *d_csrRowPtr, *d_csrColInd, *d_coloring;
 71 | 	bool *changed, hchanged;
 72 | 	bool *d_colored;
 73 | 	
 74 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int)));
 75 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int)));
 76 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int)));
 77 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_colored, m * sizeof(int)));
 78 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
 79 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice));
 80 | 
 81 | 	int device = 0;
 82 | 	int deviceCount = 0;
 83 | 	CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
 84 | 	cudaDeviceProp deviceProp;
 85 | 	cudaGetDeviceProperties(&deviceProp, device);
 86 | 	int nSM = deviceProp.multiProcessorCount;
 87 | 	fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n",
 88 | 			deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor));
 89 | 	const size_t max_blocks_1 = maximum_residency(firstFit, blksz, 0);
 90 | 	const size_t max_blocks_2 = maximum_residency(conflictResolve, blksz, 0);
 91 | 	printf("max_blocks_1=%d, max_blocks_2=%d\n", max_blocks_1, max_blocks_2);
 92 | 
 93 | 	for (int i = 0; i < ITERATIONS; i++) {
 94 | 		CUDA_SAFE_CALL(cudaMalloc((void **)&changed, sizeof(bool)));
 95 | 		initialize <<<((m - 1) / blksz + 1), blksz>>> (d_coloring, d_colored, m);
 96 | 		iterations[i] = 0;
 97 | 		starttime = rtclock();	
 98 | 		do {
 99 | 			iterations[i] ++;
100 | 			hchanged = false;
101 | 			CUDA_SAFE_CALL(cudaMemcpy(changed, &hchanged, sizeof(hchanged), cudaMemcpyHostToDevice));
102 | 			int nblocks = (m - 1) / blksz + 1;
103 | 			firstFit<<<nblocks, blksz>>>(m, d_csrRowPtr, d_csrColInd, d_coloring, changed);
104 | 			conflictResolve<<<nblocks, blksz>>>(m, d_csrRowPtr, d_csrColInd, d_coloring, d_colored);
105 | 			CUDA_SAFE_CALL(cudaMemcpy(&hchanged, changed, sizeof(hchanged), cudaMemcpyDeviceToHost));
106 | 			//left = (int)thrust::count(thrust::device, conflicted, conflicted + m, 1);
107 | 		} while (hchanged);
108 | 		CUDA_SAFE_CALL(cudaDeviceSynchronize());
109 | 		endtime = rtclock();
110 | 		runtime[i] = 1000.0f * (endtime - starttime);
111 | 		colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum<int>()) + 1;
112 | 	}
113 | 	cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost);
114 | 	double total_time = 0.0;
115 | 	int total_colors = 0;
116 | 	int total_iterations = 0;
117 | 	for (int i = 0; i < ITERATIONS; i++) {
118 | 		total_time += runtime[i];
119 | 		total_colors += colors[i];
120 | 		total_iterations += iterations[i];
121 | 		printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]);
122 | 	}
123 | 	double avg_time = (double)total_time / ITERATIONS;
124 | 	double avg_colors = (double)total_colors / ITERATIONS;
125 | 	double avg_iterations = (double)total_iterations / ITERATIONS;
126 | 	printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations);
127 | 	CUDA_SAFE_CALL(cudaFree(d_csrRowPtr));
128 | 	CUDA_SAFE_CALL(cudaFree(d_csrColInd));
129 | 	CUDA_SAFE_CALL(cudaFree(d_coloring));
130 | }
131 | 


--------------------------------------------------------------------------------
/include/gbar.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
  4 |  * 
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  * 
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /******************************************************************************
 30 |  * Software Global Barrier
 31 |  ******************************************************************************/
 32 | 
 33 | #pragma once
 34 | 
 35 | //#include <cub/cub.cuh>
 36 | #include "cutil_subset.h"
 37 | 
 38 | /**
 39 |  * Manages device storage needed for implementing a global software barrier
 40 |  * between CTAs in a single grid
 41 |  */
 42 | class GlobalBarrier
 43 | {
 44 | public:
 45 | 
 46 | 	typedef unsigned int SyncFlag;
 47 | 
 48 | protected :
 49 | 
 50 | 
 51 | 	// Counters in global device memory
 52 | 	SyncFlag* d_sync;
 53 | 
 54 | 	/**
 55 | 	 * Simple wrapper for returning a CG-loaded SyncFlag at the specified pointer
 56 | 	 */
 57 | 	__device__ __forceinline__ SyncFlag LoadCG(SyncFlag* d_ptr) const
 58 | 	{
 59 | 		SyncFlag retval;
 60 | 		//retval = cub::ThreadLoad<cub::LOAD_CG>(d_ptr);
 61 | 		retval = d_ptr[0];
 62 | 		return retval;
 63 | 	}
 64 | 
 65 | public:
 66 | 
 67 | 	/**
 68 | 	 * Constructor
 69 | 	 */
 70 | 	GlobalBarrier() : d_sync(NULL) {}
 71 | 
 72 | 
 73 | 	/**
 74 | 	 * Synchronize
 75 | 	 */
 76 | 	__device__ __forceinline__ void Sync() const
 77 | 	{
 78 |         volatile SyncFlag *d_vol_sync = d_sync;
 79 | 
 80 |         // Threadfence and syncthreads to make sure global writes are visible before
 81 | 		// thread-0 reports in with its sync counter
 82 | 		__threadfence();
 83 | 		__syncthreads();
 84 | 
 85 | 		if (blockIdx.x == 0) {
 86 | 
 87 | 			// Report in ourselves
 88 | 			if (threadIdx.x == 0) {
 89 | 			    d_vol_sync[blockIdx.x] = 1;
 90 | 			}
 91 | 
 92 | 			__syncthreads();
 93 | 
 94 | 			// Wait for everyone else to report in
 95 | 			for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) {
 96 | 				while (LoadCG(d_sync + peer_block) == 0) {
 97 | 					__threadfence_block();
 98 | 				}
 99 | 			}
100 | 
101 | 			__syncthreads();
102 | 
103 | 			// Let everyone know it's safe to read their prefix sums
104 | 			for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) {
105 | 			    d_vol_sync[peer_block] = 0;
106 | 			}
107 | 
108 | 		} else {
109 | 
110 | 			if (threadIdx.x == 0) {
111 | 				// Report in
112 | 			    d_vol_sync[blockIdx.x] = 1;
113 | 
114 | 				// Wait for acknowledgement
115 | 				while (LoadCG(d_sync + blockIdx.x) == 1) {
116 | 					__threadfence_block();
117 | 				}
118 | 			}
119 | 
120 | 			__syncthreads();
121 | 		}
122 | 	}
123 | };
124 | 
125 | 
126 | /**
127 |  * Version of global barrier with storage lifetime management.
128 |  *
129 |  * We can use this in host enactors, and pass the base GlobalBarrier
130 |  * as parameters to kernels.
131 |  */
132 | class GlobalBarrierLifetime : public GlobalBarrier
133 | {
134 | protected:
135 | 
136 | 	// Number of bytes backed by d_sync
137 | 	size_t sync_bytes;
138 | 
139 | public:
140 | 
141 | 	/**
142 | 	 * Constructor
143 | 	 */
144 | 	GlobalBarrierLifetime() : GlobalBarrier(), sync_bytes(0) {}
145 | 
146 | 
147 | 	/**
148 | 	 * Deallocates and resets the progress counters
149 | 	 */
150 | 	cudaError_t HostReset()
151 | 	{
152 | 		cudaError_t retval = cudaSuccess;
153 | 		if (d_sync) {
154 | 			CUDA_SAFE_CALL(cudaFree(d_sync));
155 | 			d_sync = NULL;
156 | 		}
157 | 		sync_bytes = 0;
158 | 		return retval;
159 | 	}
160 | 
161 | 
162 | 	/**
163 | 	 * Destructor
164 | 	 */
165 | 	virtual ~GlobalBarrierLifetime()
166 | 	{
167 | 		HostReset();
168 | 	}
169 | 
170 | 
171 | 	/**
172 | 	 * Sets up the progress counters for the next kernel launch (lazily
173 | 	 * allocating and initializing them if necessary)
174 | 	 */
175 | 	cudaError_t Setup(int sweep_grid_size)
176 | 	{
177 | 		cudaError_t retval = cudaSuccess;
178 | 		do {
179 | 			size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
180 | 			if (new_sync_bytes > sync_bytes) {
181 | 
182 | 				if (d_sync) {
183 | 					CUDA_SAFE_CALL(cudaFree(d_sync));
184 | 					retval = cudaSuccess;
185 | 				}
186 | 
187 | 				sync_bytes = new_sync_bytes;
188 | 
189 | 				CUDA_SAFE_CALL(cudaMalloc((void**) &d_sync, sync_bytes));
190 | 				retval = cudaSuccess;
191 | 
192 | 				// Initialize to zero
193 | 				CUDA_SAFE_CALL(cudaMemset(d_sync, 0, sweep_grid_size * sizeof(SyncFlag)));
194 | 
195 | 			}
196 | 		} while (0);
197 | 
198 | 		return retval;
199 | 	}
200 | };
201 | 


--------------------------------------------------------------------------------
/include/component.h:
--------------------------------------------------------------------------------
  1 | struct ComponentSpace {
  2 | 	ComponentSpace(unsigned nelements);
  3 | 	
  4 | 	__device__ unsigned numberOfElements();
  5 | 	__device__ unsigned numberOfComponents();
  6 | 	__device__ bool isBoss(unsigned element);
  7 | 	__device__ unsigned find(unsigned lelement, bool compresspath = true);
  8 | 	__device__ bool unify(unsigned one, unsigned two);
  9 | 	__device__ void print1x1();
 10 | 	__host__   void print();
 11 |         __host__   void copy(ComponentSpace &two);
 12 |         void dump_to_file(const char *F);
 13 | 	void allocate();
 14 | 	void init();
 15 | 	unsigned numberOfComponentsHost();
 16 | 
 17 | 	unsigned nelements;
 18 | 	unsigned *ncomponents,			// number of components.
 19 | 		 *complen, 			// lengths of components.
 20 | 		 *ele2comp;			// components of elements.
 21 | };
 22 | ComponentSpace::ComponentSpace(unsigned nelements) {
 23 | 	this->nelements = nelements;
 24 | 
 25 | 	allocate();
 26 | 	init();
 27 | }
 28 | 
 29 | void ComponentSpace::dump_to_file(const char *F)
 30 | {
 31 |   static FILE *f;
 32 |   static unsigned *mem;
 33 | 
 34 |   if(!f)
 35 |     {
 36 |       f = fopen(F, "w");
 37 |       mem = (unsigned *) calloc(nelements, sizeof(unsigned));
 38 |     }
 39 | 
 40 |   assert(cudaMemcpy(mem, ele2comp, nelements * sizeof(unsigned), cudaMemcpyDeviceToHost) == cudaSuccess);
 41 | 
 42 |   int i;
 43 |   for(i = 0; i < nelements; i++)
 44 |     {
 45 |       fprintf(f, "%d %d\n", i, mem[i]);
 46 |     }
 47 |   fprintf(f, "\n");
 48 | }
 49 | 
 50 | void ComponentSpace::copy(ComponentSpace &two)
 51 | {
 52 |   assert(cudaMemcpy(two.ncomponents, ncomponents, sizeof(unsigned), cudaMemcpyDeviceToDevice) == 0);
 53 |   assert(cudaMemcpy(two.ele2comp, ele2comp, sizeof(unsigned) * nelements, cudaMemcpyDeviceToDevice) == 0);
 54 |   assert(cudaMemcpy(two.complen, complen, sizeof(unsigned) * nelements, cudaMemcpyDeviceToDevice) == 0);
 55 | }
 56 | __device__ void ComponentSpace::print1x1() {
 57 | 	printf("\t\t-----------------\n");
 58 | 	for (unsigned ii = 0; ii < nelements; ++ii) {
 59 | 		printf("\t\t%d -> %d\n", ii, ele2comp[ii]);
 60 | 	}	
 61 | 	printf("\t\t-----------------\n");
 62 | }
 63 | __global__ void print1x1(ComponentSpace cs) {
 64 | 	cs.print1x1();
 65 | }
 66 | __host__ void ComponentSpace::print() {
 67 | 	::print1x1<<<1,1>>>(*this);
 68 | 	CudaTest("cs.print1x1 failed");
 69 | }
 70 | __device__ unsigned ComponentSpace::numberOfElements() {
 71 | 	return nelements;
 72 | }
 73 | __device__ unsigned ComponentSpace::numberOfComponents() {
 74 | 	return *ncomponents;
 75 | }
 76 | unsigned ComponentSpace::numberOfComponentsHost() {
 77 | 	unsigned hncomponents = 0;
 78 | 	cudaMemcpy(&hncomponents, ncomponents, sizeof(unsigned), cudaMemcpyDeviceToHost);
 79 | 	return hncomponents;
 80 | }
 81 | void ComponentSpace::allocate() {
 82 | 	if (cudaMalloc((void **)&ncomponents, 1 * sizeof(unsigned)) != cudaSuccess) 
 83 | 		CudaTest("allocating ncomponents failed");
 84 | 	if (cudaMalloc((void **)&complen, nelements * sizeof(unsigned)) != cudaSuccess) 
 85 | 		CudaTest("allocating complen failed");
 86 | 	if (cudaMalloc((void **)&ele2comp, nelements * sizeof(unsigned)) != cudaSuccess) 
 87 | 		CudaTest("allocating ele2comp failed");
 88 | }
 89 | __global__ void dinitcs(unsigned nelements, unsigned *complen, unsigned *ele2comp) {
 90 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
 91 | 	if (id < nelements) {
 92 | 		//elements[id] 	= id;
 93 | 		complen[id]	= 1;
 94 | 		ele2comp[id]	= id;
 95 | 	}
 96 | }
 97 | void ComponentSpace::init() {
 98 | 	// init the elements.
 99 | 	unsigned blocksize = MAXBLOCKSIZE;	////
100 | 	unsigned nblocks = (nelements + blocksize - 1) / blocksize;
101 | 	dinitcs<<<nblocks, blocksize>>>(nelements, complen, ele2comp);
102 | 	CudaTest("dinitcs failed");
103 | 
104 | 	// init number of components.
105 | 	cudaMemcpy(ncomponents, &nelements, sizeof(unsigned), cudaMemcpyHostToDevice);
106 | }
107 | __device__ bool ComponentSpace::isBoss(unsigned element) {
108 |   return atomicCAS(&ele2comp[element],element,element) == element;
109 | }
110 | __device__ unsigned ComponentSpace::find(unsigned lelement, bool compresspath/*= true*/) {
111 | 	// do we need to worry about concurrency in this function?
112 | 	// for other finds, no synchronization necessary as the data-structure is a tree.
113 | 	// for other unifys, synchornization is not required considering that unify is going to affect only bosses, while find is going to affect only non-bosses.
114 | 	unsigned element = lelement;
115 | 	while (isBoss(element) == false) {
116 | 	  element = ele2comp[element];
117 | 	}
118 | 	if (compresspath) ele2comp[lelement] = element;	// path compression.
119 | 	return element;
120 | }
// Tries to merge the two components whose representatives are `one` and `two`.
//
// Both arguments must currently be bosses; if either is not, or if they are
// the same element ("duplicate" edges due to symmetry), nothing is changed and
// false is returned. Otherwise the element with the smaller id is attached
// below the one with the larger id (ordering by id breaks cycles; ordering by
// component length could create them).
//
// Returns true only when this thread performed the merge. A false return also
// covers the case where another thread re-parented the subordinate first.
//
// Notes from the original author: if the client guarantees that a component
// is unified as a source only once, no synchronization would be needed. That
// holds for MST, but because of the id-based swap below a node may act as the
// source several times, hence the compare-and-swap.
__device__ bool ComponentSpace::unify(unsigned one, unsigned two) {
	if (!isBoss(one)) return false;
	if (!isBoss(two)) return false;
	if (one == two) return false;	// "duplicate" edges due to symmetry

	// The larger id becomes the boss.
	unsigned boss = two;
	unsigned subordinate = one;
	if (boss < subordinate) {
		boss = one;
		subordinate = two;
	}

	// Attach the subordinate only if it is still its own boss.
	unsigned oldboss = atomicCAS(&ele2comp[subordinate], subordinate, boss);
	if (oldboss != subordinate) {
		// Someone else updated the boss first. Nothing has to be restored:
		// ele2comp[subordinate] was not written by this thread.
		return false;
	}

	dprintf("\t\tunifying %d -> %d\n", subordinate, boss);
	// complen[subordinate] does not matter afterwards, since find() will
	// resolve the subordinate to its boss.
	atomicAdd(&complen[boss], complen[subordinate]);

	// One component fewer.
	unsigned ncomp = atomicSub(ncomponents, 1);
	dprintf("\t%d: ncomponents = %d\n", threadIdx.x, ncomp);
	return true;
}
166 | 


--------------------------------------------------------------------------------
/src/data/kernel_fusion.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu>
  3 | // Data-driven version with Kernel Fusion technique
  4 | #include <cub/cub.cuh>
  5 | #include "gbar.cuh"
  6 | #include "cuda_launch_config.hpp"
  7 | #include "cutil_subset.h"
  8 | #include "common.h"
  9 | #include <thrust/fill.h>
 10 | #include <thrust/sequence.h>
 11 | #include <thrust/count.h>
 12 | #include <thrust/reduce.h>
 13 | #include <thrust/functional.h>
 14 | #include <thrust/execution_policy.h>
 15 | #include "worklistc.h"
 16 | #define	MAXCOLOR 128
 17 | 
 18 | typedef cub::BlockScan<int, BLKSIZE> BlockScan;
 19 | 
 20 | __global__ void initialize(int *coloring, int m) {
 21 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
 22 | 	if (id < m) {
 23 | 		coloring[id] = MAXCOLOR;
 24 | 	}   
 25 | }
 26 | 
 27 | __device__ void firstFit(int m, int *csrRowPtr, int *csrColInd, Worklist2 &inwl, Worklist2 &outwl, int *coloring) {
 28 | 	int tid = blockIdx.x * blockDim.x + threadIdx.x;
 29 | 	bool forbiddenColors[MAXCOLOR+1];
 30 | 	int total_inputs = (*inwl.dindex + gridDim.x * blockDim.x - 1)/(gridDim.x * blockDim.x);
 31 | 	for (int id = tid; total_inputs > 0; id += blockDim.x * gridDim.x, total_inputs--) {
 32 | 	//int perthread = (*inwl.dindex - 1) / (gridDim.x * blockDim.x) + 1;
 33 | 	//int start = tid * perthread;
 34 | 	//int end = start + perthread;
 35 | 	//for (int id = start; id < end; id ++) {
 36 | 		int vertex;
 37 | 		if (inwl.pop_id(id, vertex)) {
 38 | 			for (int j = 0; j < MAXCOLOR; j++)
 39 | 				forbiddenColors[j] = false;
 40 | 			int row_begin = csrRowPtr[vertex];
 41 | 			int row_end = csrRowPtr[vertex + 1];
 42 | 			for (int offset = row_begin; offset < row_end; offset ++) {
 43 | 				int neighbor = csrColInd[offset];
 44 | 				int color = coloring[neighbor];
 45 | 				forbiddenColors[color] = true;
 46 | 			}
 47 | 			int vertex_color;
 48 | 			for (vertex_color = 0; vertex_color < MAXCOLOR; vertex_color ++) {
 49 | 				if (!forbiddenColors[vertex_color]) {
 50 | 					coloring[vertex] = vertex_color;
 51 | 					break;
 52 | 				}
 53 | 			}
 54 | 			assert(vertex_color < MAXCOLOR);
 55 | 		}
 56 | 	}
 57 | }
 58 | 
 59 | __device__ void conflictResolve(int m, int *csrRowPtr, int *csrColInd, Worklist2 &inwl, Worklist2 &outwl, int *coloring) {
 60 | 	int tid = blockIdx.x * blockDim.x + threadIdx.x;
 61 | 	int total_inputs = (*inwl.dindex + gridDim.x * blockDim.x - 1)/(gridDim.x * blockDim.x);
 62 | 	for (int id = tid; total_inputs > 0; id += blockDim.x * gridDim.x, total_inputs--) {
 63 | 	//int perthread = (*inwl.dindex - 1) / (gridDim.x * blockDim.x) + 1;
 64 | 	//int start = tid * perthread;
 65 | 	//int end = start + perthread;
 66 | 	//for (int id = start; id < end; id ++) {
 67 | 		int vertex;
 68 | 		int conflicted = 0;
 69 | 		if (inwl.pop_id(id, vertex)) {
 70 | 			int row_begin = csrRowPtr[vertex];
 71 | 			int row_end = csrRowPtr[vertex + 1];
 72 | 			for (int offset = row_begin; offset < row_end; offset ++) {
 73 | 				int neighbor = csrColInd[offset];
 74 | 				if (coloring[vertex] == coloring[neighbor] && vertex < neighbor) {
 75 | 					conflicted = 1;
 76 | 					coloring[vertex] = MAXCOLOR;
 77 | 					break;
 78 | 				}
 79 | 			}
 80 | 		}
 81 | 		outwl.push_1item<BlockScan>(conflicted, vertex, BLKSIZE);
 82 | 	}
 83 | }
 84 | 
 85 | __global__ void color_kernel(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, Worklist2 outwl, int *coloring, GlobalBarrier gb) {
 86 | 	Worklist2 *in;
 87 | 	Worklist2 *out;
 88 | 	Worklist2 *tmp;	
 89 | 	in = &inwl; out = &outwl;
 90 | 	while (*in->dindex > 0) {
 91 | 		firstFit(m, csrRowPtr, csrColInd, *in, *out, coloring);
 92 | 		gb.Sync();
 93 | 		conflictResolve(m, csrRowPtr, csrColInd, *in, *out, coloring);
 94 | 		gb.Sync();
 95 | 		tmp = in;
 96 | 		in = out;
 97 | 		out = tmp;
 98 | 		*out->dindex = 0;
 99 | 	}
100 | }
101 | 
// Host driver for the data-driven "Kernel Fusion" coloring variant.
//
//   m, nnz        number of vertices / number of stored adjacency entries
//   csrRowPtr     host CSR row offsets, m + 1 entries
//   csrColInd     host CSR column indices, nnz entries
//   coloring      host output array, m entries; receives the colors produced
//                 by the last of the ITERATIONS runs
//   num_SMs       unused; the SM count is queried from device 0 instead
//
// The graph is copied to the device once. Each run re-initialises the
// coloring, fills the input worklist with 0..m-1, launches color_kernel and
// records the elapsed time and the number of colors used. Per-run results and
// their averages are printed to stdout.
void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) {
	double starttime, endtime;
	double runtime[ITERATIONS];
	int colors[ITERATIONS];
	int *d_csrRowPtr, *d_csrColInd, *d_coloring;
	printf("Graph coloring data-driven Kernel Fusion version\n");
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int)));
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int)));
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int)));
	CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice));

	CUDA_SAFE_CALL(cudaDeviceSynchronize());
	int device = 0;
	int deviceCount = 0;
	CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
	cudaDeviceProp deviceProp;
	CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, device));
	int nSM = deviceProp.multiProcessorCount;
	fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 
			deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor));
	// The grid is sized as (number of SMs) x (blocks of BLKSIZE threads per SM
	// reported by maximum_residency); the same count is given to the barrier.
	const size_t max_blocks = maximum_residency(color_kernel, BLKSIZE, 0);
	GlobalBarrierLifetime gb;
	gb.Setup(nSM * max_blocks);
	for (int i = 0; i < ITERATIONS; i++) {
		Worklist2 inwl(m), outwl(m);
		// The input worklist starts out holding all m vertices.
		CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice));
		initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m);
		CUDA_SAFE_CALL(cudaDeviceSynchronize());

		starttime = rtclock();
		thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m);
		color_kernel<<<nSM * max_blocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, inwl, outwl, d_coloring, gb);
		CUDA_SAFE_CALL(cudaDeviceSynchronize());
		endtime = rtclock();

		runtime[i] = 1000.0f * (endtime - starttime);
		// Colors are numbered from 0, so the count is the largest color + 1.
		colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum<int>()) + 1;
	}
	CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost));
	double total_time = 0.0;
	int total_colors = 0;
	for (int i = 0; i < ITERATIONS; i++) {
		total_time += runtime[i];
		total_colors += colors[i];
		// This variant does not count rounds on the host, so only the color
		// count and the time are reported per run.
		printf("[%d %.2f] ", colors[i], runtime[i]);
	}
	double avg_time = (double)total_time / ITERATIONS;
	double avg_colors = (double)total_colors / ITERATIONS;
	printf("\navg_time %f ms, avg_colors %.2f\n", avg_time, avg_colors);
	CUDA_SAFE_CALL(cudaFree(d_csrRowPtr));
	CUDA_SAFE_CALL(cudaFree(d_csrColInd));
	CUDA_SAFE_CALL(cudaFree(d_coloring));
}
158 | 


--------------------------------------------------------------------------------
/src/data/kernel_bitset.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | #include <cub/cub.cuh>
  4 | #include "gbar.cuh"
  5 | #include "cuda_launch_config.hpp"
  6 | #include "cutil_subset.h"
  7 | #include "common.h"
  8 | #include <thrust/fill.h>
  9 | #include <thrust/sequence.h>
 10 | #include <thrust/count.h>
 11 | #include <thrust/reduce.h>
 12 | #include <thrust/functional.h>
 13 | #include <thrust/execution_policy.h>
 14 | #include "worklistc.h"
 15 | #define	SCRATCHSIZE BLKSIZE
 16 | #define	MAXCOLOR 128 // assume graph can be colored with less than 128 colors
 17 | typedef cub::BlockScan<int, BLKSIZE> BlockScan;
 18 | 
 19 | __global__ void initialize(int *coloring, int m) {
 20 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
 21 | 	if (id < m) {
 22 | 		coloring[id] = MAXCOLOR;
 23 | 	}   
 24 | }
 25 | 
 26 | __device__ __forceinline__ void assignColor(unsigned int *forbiddenColors, int *coloring, int node) {
 27 | 	int i;
 28 | /*
 29 | 	int c = 32;
 30 | 	for (i = 0; i < MAXCOLOR/32; i++) {
 31 | 		if (forbiddenColors[i] != 0) {
 32 | 			forbiddenColors[i] &= -(signed)forbiddenColors[i];
 33 | 			if (forbiddenColors[i]) c--;
 34 | 			if (forbiddenColors[i] & 0x0000ffff) c -= 16;
 35 | 	        	if (forbiddenColors[i] & 0x00ff00ff) c -= 8;
 36 |         		if (forbiddenColors[i] & 0x0f0f0f0f) c -= 4;
 37 | 		        if (forbiddenColors[i] & 0x33333333) c -= 2;
 38 |         		if (forbiddenColors[i] & 0x55555555) c -= 1;
 39 | 			coloring[node] = c;
 40 | 			break;
 41 | 		}
 42 | 		else
 43 | 			c += 32;
 44 | 	}
 45 | //*/
 46 | ///*
 47 | 	for (i = 0; i < MAXCOLOR/32; i++) {
 48 | 		int pos = __ffs(forbiddenColors[i]);
 49 | 		if(pos) {
 50 | 			coloring[node] = i * 32 + pos - 1;
 51 | 			break;
 52 | 		}
 53 | 	}
 54 | 	assert(i < MAXCOLOR/32);
 55 | //*/
 56 | }
 57 | 
 58 | __global__ void firstFit(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, int *coloring) {
 59 | 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 60 | 	unsigned forbiddenColors[MAXCOLOR/32+1];
 61 | 	int vertex;
 62 | 	if (inwl.pop_id(id, vertex)) {
 63 | 		int row_begin = csrRowPtr[vertex];
 64 | 		int row_end = csrRowPtr[vertex + 1];
 65 | 		for (int j = 0; j < MAXCOLOR/32; j++)
 66 | 			forbiddenColors[j] = 0xffffffff;
 67 | 		for (int offset = row_begin; offset < row_end; offset ++) {
 68 | 			int neighbor = csrColInd[offset];
 69 | 			int color = coloring[neighbor];
 70 | 			forbiddenColors[color / 32] &= ~(1 << (color % 32));
 71 | 		}
 72 | 		assignColor(forbiddenColors, coloring, vertex);
 73 | 	}
 74 | }
 75 | 
 76 | __global__ void conflictResolve(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, Worklist2 outwl, int *coloring) {
 77 | 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 78 | 	int vertex;
 79 | 	int conflicted = 0;
 80 | 	if (inwl.pop_id(id, vertex)) {
 81 | 		int row_begin = csrRowPtr[vertex];
 82 | 		int row_end = csrRowPtr[vertex + 1];
 83 | 		for (int offset = row_begin; offset < row_end; offset ++) {
 84 | 			int neighbor = csrColInd[offset];
 85 | 			if (coloring[vertex] == coloring[neighbor] && vertex < neighbor) {
 86 | 				conflicted = 1;
 87 | 				coloring[vertex] = MAXCOLOR;
 88 | 				break;
 89 | 			}
 90 | 		}
 91 | 	}
 92 | 	outwl.push_1item<BlockScan>(conflicted, vertex, BLKSIZE);
 93 | }
 94 | 
 95 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) {
 96 | 	double starttime, endtime;
 97 | 	double runtime[ITERATIONS];
 98 | 	int colors[ITERATIONS];
 99 | 	int iterations[ITERATIONS];
100 | 	int *d_csrRowPtr, *d_csrColInd, *d_coloring;
101 | 	printf("Graph coloring data-driven Bitset version\n");
102 | 	
103 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int)));
104 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int)));
105 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int)));
106 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
107 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice));
108 | 	cudaDeviceSynchronize();
109 | 	int device = 0;
110 | 	int deviceCount = 0;
111 | 	CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
112 | 	cudaDeviceProp deviceProp;
113 | 	cudaGetDeviceProperties(&deviceProp, device);
114 | 	int nSM = deviceProp.multiProcessorCount;
115 | 	fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 
116 | 			deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor));
117 | 	const size_t max_blocks_1 = maximum_residency(firstFit, BLKSIZE, 0); 
118 | 	const size_t max_blocks_2 = maximum_residency(conflictResolve, BLKSIZE, 0); 
119 | 	printf("max_blocks_1=%d, max_blocks_2=%d\n", max_blocks_1, max_blocks_2);
120 | 
121 | 	for (int i = 0; i < ITERATIONS; i++) {
122 | 		Worklist2 inwl(m), outwl(m);
123 | 		Worklist2 *inwlptr = &inwl, *outwlptr = &outwl;
124 | 		CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice));
125 | 		initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m);
126 | 		int nitems = m;
127 | 		iterations[i] = 0;
128 | 
129 | 		starttime = rtclock();
130 | 		thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m);
131 | 		int iteration = 0;
132 | 		while (nitems > 0) {
133 | 			iterations[i] ++;
134 | 			int nblocks = (nitems - 1) / BLKSIZE + 1;
135 | 			firstFit<<<nblocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_coloring);
136 | 			conflictResolve<<<nblocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_coloring);
137 | 			nitems = outwlptr->nitems();
138 | 			Worklist2 * tmp = inwlptr;
139 | 			inwlptr = outwlptr;
140 | 			outwlptr = tmp;
141 | 			outwlptr->reset();
142 | 		}
143 | 		cudaDeviceSynchronize();
144 | 		endtime = rtclock();
145 | 		runtime[i] = 1000.0f * (endtime - starttime);
146 | 		colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum<int>()) + 1;
147 | 	}
148 | 	CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost));
149 | 	double total_time = 0.0;
150 | 	int total_colors = 0;
151 | 	int total_iterations = 0;
152 | 	for (int i = 0; i < ITERATIONS; i++) {
153 | 		total_time += runtime[i];
154 | 		total_colors += colors[i];
155 | 		total_iterations += iterations[i];
156 | 		printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]);
157 | 	}
158 | 	double avg_time = (double)total_time / ITERATIONS;
159 | 	double avg_colors = (double)total_colors / ITERATIONS;
160 | 	double avg_iterations = (double)total_iterations / ITERATIONS;
161 | 	printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations);
162 | 	CUDA_SAFE_CALL(cudaFree(d_csrRowPtr));
163 | 	CUDA_SAFE_CALL(cudaFree(d_csrColInd));
164 | 	CUDA_SAFE_CALL(cudaFree(d_coloring));
165 | }
166 | 


--------------------------------------------------------------------------------
/src/data/kernel_tc.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu>
  3 | // Data-driven version with Thread Coarsening technique
  4 | #include <cub/cub.cuh>
  5 | #include "gbar.cuh"
  6 | #include "cuda_launch_config.hpp"
  7 | #include "cutil_subset.h"
  8 | #include "common.h"
  9 | #include <thrust/fill.h>
 10 | #include <thrust/sequence.h>
 11 | #include <thrust/count.h>
 12 | #include <thrust/reduce.h>
 13 | #include <thrust/functional.h>
 14 | #include <thrust/execution_policy.h>
 15 | #include "worklistc.h"
 16 | #define	MAXCOLOR 128
 17 | typedef cub::BlockScan<int, BLKSIZE> BlockScan;
 18 | 
 19 | __global__ void initialize(int *coloring, int m) {
 20 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
 21 | 	if (id < m) {
 22 | 		coloring[id] = MAXCOLOR;
 23 | 	}   
 24 | }
 25 | 
 26 | __global__ void FirstFit(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, int *coloring) {
 27 | 	int tid = blockIdx.x * blockDim.x + threadIdx.x;
 28 | 	bool forbiddenColors[MAXCOLOR+1];
 29 | 	int id = tid;
 30 | 	//int total_inputs = (*inwl.dindex + gridDim.x * blockDim.x - 1)/(gridDim.x * blockDim.x);
 31 | 	//for (int id = tid; total_inputs > 0; id += blockDim.x * gridDim.x, total_inputs--) {
 32 | 	//int perthread = (*inwl.dindex - 1) / (gridDim.x * blockDim.x) + 1;
 33 | 	//int start = tid * perthread;
 34 | 	//int end = start + perthread;
 35 | 	//for (int id = start; id < end; id ++) {
 36 | 		int vertex;
 37 | 		if (inwl.pop_id(id, vertex)) {
 38 | 			for (int j = 0; j < MAXCOLOR; j++)
 39 | 				forbiddenColors[j] = false;
 40 | 			int row_begin = csrRowPtr[vertex];
 41 | 			int row_end = csrRowPtr[vertex + 1];
 42 | 			for (int offset = row_begin; offset < row_end; offset ++) {
 43 | 				int neighbor = csrColInd[offset];
 44 | 				int color = coloring[neighbor];
 45 | 				forbiddenColors[color] = true;
 46 | 			}
 47 | 			int vertex_color;
 48 | 			for (vertex_color = 0; vertex_color < MAXCOLOR; vertex_color ++) {
 49 | 				if (!forbiddenColors[vertex_color]) {
 50 | 					coloring[vertex] = vertex_color;
 51 | 					break;
 52 | 				}
 53 | 			}
 54 | 			assert(vertex_color < MAXCOLOR);
 55 | 		}
 56 | 	//}
 57 | }
 58 | 
 59 | __global__ void conflictDetect(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, Worklist2 outwl, int *coloring) {
 60 | 	int tid = blockIdx.x * blockDim.x + threadIdx.x;
 61 | 	//int id = tid;
 62 | 	int total_inputs = (*inwl.dindex + gridDim.x * blockDim.x - 1)/(gridDim.x * blockDim.x);
 63 | 	for (int id = tid; total_inputs > 0; id += blockDim.x * gridDim.x, total_inputs--) {
 64 | 	//int perthread = (*inwl.dindex - 1) / (gridDim.x * blockDim.x) + 1;
 65 | 	//int start = tid * perthread;
 66 | 	//int end = start + perthread;
 67 | 	//for (int id = start; id < end; id ++) {
 68 | 		int vertex;
 69 | 		int conflicted = 0;
 70 | 		if (inwl.pop_id(id, vertex)) {
 71 | 			int row_begin = csrRowPtr[vertex];
 72 | 			int row_end = csrRowPtr[vertex + 1];
 73 | 			for (int offset = row_begin; offset < row_end; offset ++) {
 74 | 				int neighbor = csrColInd[offset];
 75 | 				if (coloring[vertex] == coloring[neighbor] && vertex < neighbor) {
 76 | 					conflicted = 1;
 77 | 					coloring[vertex] = MAXCOLOR;
 78 | 					break;
 79 | 				}
 80 | 			}
 81 | 		}
 82 | 		outwl.push_1item<BlockScan>(conflicted, vertex, BLKSIZE);
 83 | 	}
 84 | }
 85 | 
 86 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) {
 87 | 	double starttime, endtime;
 88 | 	double runtime[ITERATIONS];
 89 | 	int colors[ITERATIONS];
 90 | 	int iterations[ITERATIONS];
 91 | 	int *d_csrRowPtr, *d_csrColInd, *d_coloring;
 92 | 	printf("Graph coloring data-driven Thread Coarsening version\n");
 93 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int)));
 94 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int)));
 95 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int)));
 96 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
 97 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice));
 98 | 	CUDA_SAFE_CALL(cudaDeviceSynchronize());
 99 | 	int device = 0;
100 | 	int deviceCount = 0;
101 | 	CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
102 | 	cudaDeviceProp deviceProp;
103 | 	cudaGetDeviceProperties(&deviceProp, device);
104 | 	int nSM = deviceProp.multiProcessorCount;
105 | 	fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 
106 | 			deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor));
107 | 
108 | 	const size_t max_blocks_1 = maximum_residency(FirstFit, BLKSIZE, 0);
109 | 	const size_t max_blocks_2 = maximum_residency(conflictDetect, BLKSIZE, 0);
110 | 	printf("max_blocks_1=%d, max_blocks_2=%d\n", max_blocks_1, max_blocks_2);
111 | 
112 | 	for (int i = 0; i < ITERATIONS; i++) {
113 | 		Worklist2 inwl(m), outwl(m);
114 | 		Worklist2 *inwlptr = &inwl, *outwlptr = &outwl;
115 | 		CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice));
116 | 		initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m);
117 | 		int nitems = m;
118 | 		iterations[i] = 0;
119 | 
120 | 		starttime = rtclock();
121 | 		thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m);
122 | 		while (nitems > 0) {
123 | 			iterations[i] ++;
124 | 			//printf("nitems=%d\n", nitems);
125 | 			int nblocks_1 = nSM * max_blocks_1;
126 | 			int nblocks_2 = nSM * max_blocks_2;
127 | 			int nblocks = (nitems - 1) / BLKSIZE + 1;
128 | 			if(nblocks < nblocks_1) nblocks_1 = nblocks;
129 | 			if(nblocks < nblocks_2) nblocks_2 = nblocks;
130 | 			FirstFit<<<nblocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_coloring);
131 | 			conflictDetect<<<nblocks_2, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_coloring);
132 | 			nitems = outwlptr->nitems();
133 | 			Worklist2 * tmp = inwlptr;
134 | 			inwlptr = outwlptr;
135 | 			outwlptr = tmp;
136 | 			outwlptr->reset();
137 | 		}
138 | 		cudaDeviceSynchronize();
139 | 		endtime = rtclock();
140 | 		runtime[i] = 1000.0f * (endtime - starttime);
141 | 		colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum<int>()) + 1;
142 | 	}
143 | 	CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost));
144 | 	double total_time = 0.0;
145 | 	int total_colors = 0;
146 | 	int total_iterations = 0;
147 | 	for (int i = 0; i < ITERATIONS; i++) {
148 | 		total_time += runtime[i];
149 | 		total_colors += colors[i];
150 | 		total_iterations += iterations[i];
151 | 		printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]);
152 | 	}
153 | 	double avg_time = (double)total_time / ITERATIONS;
154 | 	double avg_colors = (double)total_colors / ITERATIONS;
155 | 	double avg_iterations = (double)total_iterations / ITERATIONS;
156 | 	printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations);
157 | 	CUDA_SAFE_CALL(cudaFree(d_csrRowPtr));
158 | 	CUDA_SAFE_CALL(cudaFree(d_csrColInd));
159 | 	CUDA_SAFE_CALL(cudaFree(d_coloring));
160 | }
161 | 


--------------------------------------------------------------------------------
/src/data/kernel_pq.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | #include <cub/cub.cuh>
  4 | #include "gbar.cuh"
  5 | #include "cuda_launch_config.hpp"
  6 | #include "cutil_subset.h"
  7 | #include "common.h"
  8 | #include <thrust/fill.h>
  9 | #include <thrust/sequence.h>
 10 | #include <thrust/count.h>
 11 | #include <thrust/reduce.h>
 12 | #include <thrust/functional.h>
 13 | #include <thrust/execution_policy.h>
 14 | #include "worklistc.h"
 15 | #define	MAXCOLOR 128
 16 | typedef cub::BlockScan<int, BLKSIZE> BlockScan;
 17 | 
 18 | __global__ void initialize(int *coloring, int m) {
 19 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
 20 | 	if (id < m) {
 21 | 		coloring[id] = MAXCOLOR;
 22 | 	}
 23 | }
 24 | 
 25 | __global__ void FirstFit(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, int *degree, int *coloring) {
 26 | 	int id = blockIdx.x * blockDim.x + threadIdx.x;	
 27 | 	bool forbiddenColors[MAXCOLOR + 1];
 28 | 	int vertex;
 29 | 	if (inwl.pop_id(id, vertex)) {
 30 | 		int row_begin = csrRowPtr[vertex];
 31 | 		int row_end = csrRowPtr[vertex + 1];
 32 | 		for (int j = 0; j < MAXCOLOR; j++)
 33 | 			forbiddenColors[j] = false;
 34 | 		for (int offset = row_begin; offset < row_end; offset ++) {
 35 | 			int neighbor = csrColInd[offset];
 36 | 			int color = coloring[neighbor];
 37 | 			forbiddenColors[color] = true;
 38 | 		}
 39 | 		int vertex_color;
 40 | 		for (vertex_color = 0; vertex_color < MAXCOLOR; vertex_color++) {
 41 | 			if (!forbiddenColors[vertex_color]) {
 42 | 				coloring[vertex] = vertex_color;
 43 | 				break;
 44 | 			}
 45 | 		}
 46 | 		assert(vertex_color < MAXCOLOR);
 47 | 	}
 48 | }
 49 | 
 50 | __global__ void conflictResolve(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, Worklist2 outwl, int * degree, int *coloring) {
 51 | 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 52 | 	int conflicted = 0;
 53 | 	int vertex;
 54 | 	if (inwl.pop_id(id, vertex)) {
 55 | 		int row_begin = csrRowPtr[vertex];
 56 | 		int row_end = csrRowPtr[vertex + 1];
 57 | 		for (int offset = row_begin; offset < row_end; offset ++) {
 58 | 			int neighbor = csrColInd[offset];
 59 | 			if (coloring[vertex] == coloring[neighbor]) {
 60 | 				bool is_victim;
 61 | 				if(degree[vertex] == degree[neighbor])
 62 | 					is_victim = (vertex < neighbor) ? true : false;
 63 | 				else is_victim = (degree[vertex] < degree[neighbor]) ? true : false;
 64 | 				if(is_victim) {
 65 | 					conflicted = 1;
 66 | 					coloring[vertex] = MAXCOLOR;
 67 | 					break;
 68 | 				}
 69 | 			}
 70 | 		}
 71 | 	}
 72 | 	outwl.push_1item<BlockScan>(conflicted, vertex, BLKSIZE); // push to outwl if conflicted
 73 | }
 74 | 
 75 | __global__ void gatherKey(int *degree, int *key, Worklist2 wl, int n) {
 76 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
 77 | 	int vertex;
 78 | 	if (id < n) {
 79 | 		wl.pop_id(id, vertex);
 80 | 		key[id] = degree[vertex];
 81 | 	}
 82 | }
 83 | 
 84 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) {
 85 | 	double starttime, endtime;
 86 | 	double runtime[ITERATIONS];
 87 | 	int colors[ITERATIONS];
 88 | 	int iterations[ITERATIONS];
 89 | 	int *d_csrRowPtr, *d_csrColInd, *d_coloring, *d_degree, *d_key;
 90 | 	printf("Graph coloring data-driven Priority Queue version\n");
 91 | 	int *degree = (int *)malloc(m * sizeof(int));
 92 | 	for(int i = 0; i < m; i ++) {
 93 | 		degree[i] = csrRowPtr[i + 1] - csrRowPtr[i];
 94 | 	}
 95 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int)));
 96 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int)));
 97 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_degree, m * sizeof(int)));
 98 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_key, m * sizeof(int)));
 99 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int)));
100 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
101 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice));
102 | 	CUDA_SAFE_CALL(cudaDeviceSynchronize());
103 | 	int device = 0;
104 | 	int deviceCount = 0;
105 | 	CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
106 | 	cudaDeviceProp deviceProp;
107 | 	cudaGetDeviceProperties(&deviceProp, device);
108 | 	int nSM = deviceProp.multiProcessorCount;
109 | 	fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 
110 | 			deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor));
111 | 
112 | 	for (int i = 0; i < ITERATIONS; i++) {
113 | 		Worklist2 inwl(m), outwl(m);
114 | 		Worklist2 *inwlptr = &inwl, *outwlptr = &outwl;
115 | 		CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice));
116 | 		CUDA_SAFE_CALL(cudaMemcpy(d_degree, degree, m * sizeof(int), cudaMemcpyHostToDevice));
117 | 		CUDA_SAFE_CALL(cudaMemcpy(d_key, degree, m * sizeof(int), cudaMemcpyHostToDevice));
118 | 		initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m);
119 | 		CUDA_SAFE_CALL(cudaDeviceSynchronize());
120 | 		iterations[i] = 0;
121 | 
122 | 		starttime = rtclock();
123 | 		int nitems = m;
124 | 		thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m);
125 | 		//thrust::sort_by_key(thrust::device, d_key, d_key + m, inwl.dwl, thrust::greater<int>());
126 | 		while (nitems > 0) {
127 | 			iterations[i] ++;
128 | 			int nblocks = (nitems - 1) / BLKSIZE + 1;
129 | 			FirstFit<<<nblocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_degree, d_coloring);
130 | 			conflictResolve<<<nblocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_degree, d_coloring);
131 | 			nitems = outwlptr->nitems();
132 | 			//gatherKey<<<((nitems - 1) / BLKSIZE + 1), BLKSIZE>>>(d_degree, d_key, *outwlptr, nitems);
133 | 			//thrust::sort_by_key(thrust::device, d_key, d_key + nitems, inwl.dwl, thrust::greater<int>());
134 | 			Worklist2 * tmp = inwlptr;
135 | 			inwlptr = outwlptr;
136 | 			outwlptr = tmp;
137 | 			outwlptr->reset();
138 | 		}
139 | 		CUDA_SAFE_CALL(cudaDeviceSynchronize());
140 | 		endtime = rtclock();
141 | 		runtime[i] = 1000.0f * (endtime - starttime);
142 | 		colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum<int>()) + 1;
143 | 	}
144 | 	CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost));
145 | 	double total_time = 0.0;
146 | 	int total_colors = 0;
147 | 	int total_iterations = 0;
148 | 	for (int i = 0; i < ITERATIONS; i++) {
149 | 		total_time += runtime[i];
150 | 		total_colors += colors[i];
151 | 		total_iterations += iterations[i];
152 | 		printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]);
153 | 	}
154 | 	double avg_time = (double)total_time / ITERATIONS;
155 | 	double avg_colors = (double)total_colors / ITERATIONS;
156 | 	double avg_iterations = (double)total_iterations / ITERATIONS;
157 | 	printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations);
158 | 	CUDA_SAFE_CALL(cudaFree(d_csrRowPtr));
159 | 	CUDA_SAFE_CALL(cudaFree(d_csrColInd));
160 | 	CUDA_SAFE_CALL(cudaFree(d_coloring));
161 | }
162 | 


--------------------------------------------------------------------------------
/src/serial/graph_io.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | 
  4 | // transfer R-MAT generated gr graph to CSR format
  5 | void gr2csr(char *gr, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
  6 | 	printf("Reading RMAT (.gr) input file %s\n", gr);
  7 | 	std::ifstream cfile;
  8 | 	cfile.open(gr);
  9 | 	std::string str;
 10 | 	getline(cfile, str);
 11 | 	char c;
 12 | 	sscanf(str.c_str(), "%c", &c);
 13 | 	while (c == 'c') {
 14 | 		getline(cfile, str);
 15 | 		sscanf(str.c_str(), "%c", &c);
 16 | 	}
 17 | 	char sp[3];
 18 | 	sscanf(str.c_str(), "%c %s %d %d", &c, sp, &m, &nnz);
 19 | 	printf("num_vertices %d num_edges %d\n", m, nnz);
 20 | 	//printf("%c %s %d %d\n", c, sp, m, nnz);
 21 | 	vector<set<int> > svector;
 22 | 	set<int> s;
 23 | 	for (int i = 0; i < m; i++)
 24 | 		svector.push_back(s);
 25 | 	int dst, src;
 26 | 	for (int i = 0; i < nnz; i++) {
 27 | 		getline(cfile, str);
 28 | 		sscanf(str.c_str(), "%c %d %d", &c, &src, &dst);
 29 | 
 30 | 		if (c != 'a')
 31 | 			printf("line %d\n", __LINE__);
 32 | 		dst--;
 33 | 		src--;
 34 | 		svector[src].insert(dst);
 35 | 		svector[dst].insert(src);
 36 | 	}
 37 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
 38 | 	int count = 0;
 39 | 	for (int i = 0; i < m; i++) {
 40 | 		csrRowPtr[i] = count;
 41 | 		count += svector[i].size();
 42 | 	}
 43 | 	csrRowPtr[m] = count;
 44 | 	if (count != nnz) {
 45 | 		printf("This graph is not symmetric\n");
 46 | 		nnz = count;
 47 | 	}
 48 | 	double avgdeg;
 49 | 	double variance = 0.0;
 50 | 	int maxdeg = 0;
 51 | 	int mindeg = m;
 52 | 	avgdeg = (double)nnz / m;
 53 | 	for (int i = 0; i < m; i++) {
 54 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
 55 | 		if (deg_i > maxdeg)
 56 | 			maxdeg = deg_i;
 57 | 		if (deg_i < mindeg)
 58 | 			mindeg = deg_i;
 59 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
 60 | 	}
 61 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
 62 | 	csrColInd = (int *)malloc(count * sizeof(int));
 63 | 	set<int>::iterator site;
 64 | 	for (int i = 0, index = 0; i < m; i++) {
 65 | 		site = svector[i].begin();
 66 | 		while (site != svector[i].end()) {
 67 | 			csrColInd[index++] = *site;
 68 | 			site++;
 69 | 		}
 70 | 	}
 71 | }
 72 | 
 73 | // transfer *.graph file to CSR format
 74 | void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
 75 | 	printf("Reading .graph input file %s\n", graph);
 76 | 	std::ifstream cfile;
 77 | 	cfile.open(graph);
 78 | 	std::string str;
 79 | 	getline(cfile, str);
 80 | 	sscanf(str.c_str(), "%d %d", &m, &nnz);
 81 | 	printf("num_vertices %d num_edges %d\n", m, nnz);
 82 | 	vector<set<int> > svector;
 83 | 	set<int> s;
 84 | 	for (int i = 0; i < m; i++)
 85 | 		svector.push_back(s);
 86 | 	int dst;
 87 | 	for (int i = 0; i < m; i++) {
 88 | 		getline(cfile, str);
 89 | 		istringstream istr;
 90 | 		istr.str(str);
 91 | 		while(istr>>dst) {
 92 | 			dst --;
 93 | 			svector[i].insert(dst);
 94 | 			svector[dst].insert(i);
 95 | 		}
 96 | 		istr.clear();
 97 | 	}
 98 |     cfile.close();
 99 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
100 | 	int count = 0;
101 | 	for (int i = 0; i < m; i++) {
102 | 		csrRowPtr[i] = count;
103 | 		count += svector[i].size();
104 | 	}
105 | 	csrRowPtr[m] = count;
106 | 	if (count != nnz) {
107 | 		printf("This graph is not symmetric\n");
108 | 		nnz = count;
109 | 	}
110 | 	double avgdeg;
111 | 	double variance = 0.0;
112 | 	int maxdeg = 0;
113 | 	int mindeg = m;
114 | 	avgdeg = (double)nnz / m;
115 | 	for (int i = 0; i < m; i++) {
116 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
117 | 		if (deg_i > maxdeg)
118 | 			maxdeg = deg_i;
119 | 		if (deg_i < mindeg)
120 | 			mindeg = deg_i;
121 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
122 | 	}
123 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
124 | 	csrColInd = (int *)malloc(count * sizeof(int));
125 | 	set<int>::iterator site;
126 | 	for (int i = 0, index = 0; i < m; i++) {
127 | 		site = svector[i].begin();
128 | 		while (site != svector[i].end()) {
129 | 			csrColInd[index++] = *site;
130 | 			site++;
131 | 		}
132 | 	}
133 | }
134 | 
// Convert a (.mtx) coordinate file to CSR format.
//
// Layout read by this function: optional comment lines whose first character
// is '%', a size line "<rows> <cols> <entries>", then one line per entry that
// starts with two 1-based indices (anything after them is ignored). Every
// entry is stored in both directions and duplicates are dropped.
//
//   mtx       - path of the input file
//   m         - out: number of vertices (rows)
//   nnz       - out: number of CSR entries; replaced by the real count when it
//               differs from the value in the size line
//   csrRowPtr - out: malloc'ed array of m + 1 row offsets (caller frees)
//   csrColInd - out: malloc'ed array of nnz column indices, sorted per row
//               (caller frees)
//
// Exits with a failure status when the file cannot be opened, the size line is
// missing/malformed, the matrix is not square, or memory runs out. Entry lines
// that cannot be parsed or that index outside [1, m] are skipped and counted
// in a warning.
void mtx2csr(char *mtx, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
	printf("Reading (.mtx) input file %s\n", mtx);
	std::ifstream cfile(mtx);
	if (!cfile.is_open()) {
		fprintf(stderr, "Error: cannot open input file %s\n", mtx);
		exit(EXIT_FAILURE);
	}
	// skip comment ('%') and blank lines; the loop ends at end of file, so a
	// file without a size line can no longer spin forever
	std::string str;
	char c = '\0';
	bool have_header = false;
	while (!have_header && std::getline(cfile, str))
		have_header = (sscanf(str.c_str(), "%c", &c) == 1 && c != '%');
	int n = 0;
	if (!have_header || sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz) != 3 || m < 0 || nnz < 0) {
		fprintf(stderr, "Error: missing or malformed size line in %s\n", mtx);
		exit(EXIT_FAILURE);
	}
	if (m != n) {
		// an error must not report success to the calling shell
		fprintf(stderr, "Error: %s is not a square matrix (%d x %d)\n", mtx, m, n);
		exit(EXIT_FAILURE);
	}
	printf("num_vertices %d num_edges %d\n", m, nnz);
	std::vector<std::set<int> > svector(m);
	int skipped = 0;
	for (int i = 0; i < nnz; i++) {
		if (!std::getline(cfile, str)) {
			fprintf(stderr, "Warning: %s ends after %d of %d entry lines\n", mtx, i, nnz);
			break;
		}
		int dst = 0, src = 0;
		if (sscanf(str.c_str(), "%d %d", &dst, &src) != 2) {
			skipped++;
			continue;
		}
		// file indices are 1-based
		dst--;
		src--;
		if (src < 0 || src >= m || dst < 0 || dst >= m) {
			skipped++;
			continue;
		}
		svector[src].insert(dst);
		svector[dst].insert(src);
	}
	cfile.close();
	if (skipped > 0)
		fprintf(stderr, "Warning: skipped %d malformed or out-of-range entry lines\n", skipped);
	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
	if (csrRowPtr == NULL) {
		fprintf(stderr, "Error: out of memory\n");
		exit(EXIT_FAILURE);
	}
	int count = 0;
	for (int i = 0; i < m; i++) {
		csrRowPtr[i] = count;
		count += svector[i].size();
	}
	csrRowPtr[m] = count;
	if (count != nnz) {
		printf("This graph is not symmetric\n");
		nnz = count;
	}
	// degree statistics, printed for information only
	double variance = 0.0;
	int maxdeg = 0;
	int mindeg = m;
	double avgdeg = (double)nnz / m;
	for (int i = 0; i < m; i++) {
		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
		if (deg_i > maxdeg)
			maxdeg = deg_i;
		if (deg_i < mindeg)
			mindeg = deg_i;
		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
	}
	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
	csrColInd = (int *)malloc(count * sizeof(int));
	if (count > 0 && csrColInd == NULL) {
		fprintf(stderr, "Error: out of memory\n");
		exit(EXIT_FAILURE);
	}
	int index = 0;
	for (int i = 0; i < m; i++) {
		for (std::set<int>::const_iterator it = svector[i].begin(); it != svector[i].end(); ++it)
			csrColInd[index++] = *it;
	}
}
204 | 
// Store the color of every vertex, one decimal value per line.
//
//   fname    - path of the output file (created or truncated)
//   coloring - array of at least n colors
//   n        - number of vertices to write
//
// If the file cannot be opened an error is printed to stderr and nothing is
// written (the original dereferenced the NULL stream).
void write_solution(char *fname, int *coloring, int n) {
	FILE *fp = fopen(fname, "w");
	if (fp == NULL) {
		fprintf(stderr, "Error: cannot open %s for writing\n", fname);
		return;
	}
	for (int i = 0; i < n; i++) {
		//fprintf(fp, "%d:%d\n", i, coloring[i]);
		fprintf(fp, "%d\n", coloring[i]);
	}
	// fclose flushes buffered output, so a failure here means lost data
	if (fclose(fp) != 0)
		fprintf(stderr, "Error: failed to finish writing %s\n", fname);
}
216 | 
// Check whether a coloring is proper: no vertex may share its color with a
// neighbour other than itself (self loops are ignored).
//
//   m         - number of vertices
//   nnz       - number of CSR entries (unused)
//   csrRowPtr - m + 1 row offsets
//   csrColInd - column indices
//   coloring  - color of every vertex
//   correct   - set to 0 when a conflict is found; never written otherwise, so
//               the caller must initialise it before the call
//
// Returns as soon as the first conflict is seen, since nothing else can change
// the verdict.
void verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) {
	(void)nnz;
	for (int v = 0; v < m; v++) {
		const int row_end = csrRowPtr[v + 1];
		for (int offset = csrRowPtr[v]; offset < row_end; offset++) {
			const int u = csrColInd[offset];
			if (u != v && coloring[u] == coloring[v]) {
				*correct = 0;
				//printf("coloring[%d] = coloring[%d] = %d\n", v, u, coloring[v]);
				return;
			}
		}
	}
}
231 | 


--------------------------------------------------------------------------------
/src/topo/graph_io.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | 
  4 | // transfer R-MAT generated gr graph to CSR format
  5 | void gr2csr(char *gr, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
  6 | 	printf("Reading RMAT (.gr) input file %s\n", gr);
  7 | 	std::ifstream cfile;
  8 | 	cfile.open(gr);
  9 | 	std::string str;
 10 | 	getline(cfile, str);
 11 | 	char c;
 12 | 	sscanf(str.c_str(), "%c", &c);
 13 | 	while (c == 'c') {
 14 | 		getline(cfile, str);
 15 | 		sscanf(str.c_str(), "%c", &c);
 16 | 	}
 17 | 	char sp[3];
 18 | 	sscanf(str.c_str(), "%c %s %d %d", &c, sp, &m, &nnz);
 19 | 	printf("num_vertices %d num_edges %d\n", m, nnz);
 20 | 	//printf("%c %s %d %d\n", c, sp, m, nnz);
 21 | 	vector<set<int> > svector;
 22 | 	set<int> s;
 23 | 	for (int i = 0; i < m; i++)
 24 | 		svector.push_back(s);
 25 | 	int dst, src;
 26 | 	for (int i = 0; i < nnz; i++) {
 27 | 		getline(cfile, str);
 28 | 		sscanf(str.c_str(), "%c %d %d", &c, &src, &dst);
 29 | 
 30 | 		if (c != 'a')
 31 | 			printf("line %d\n", __LINE__);
 32 | 		dst--;
 33 | 		src--;
 34 | 		svector[src].insert(dst);
 35 | 		svector[dst].insert(src);
 36 | 	}
 37 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
 38 | 	int count = 0;
 39 | 	for (int i = 0; i < m; i++) {
 40 | 		csrRowPtr[i] = count;
 41 | 		count += svector[i].size();
 42 | 	}
 43 | 	csrRowPtr[m] = count;
 44 | 	if (count != nnz) {
 45 | 		printf("This graph is not symmetric\n");
 46 | 		nnz = count;
 47 | 	}
 48 | 	double avgdeg;
 49 | 	double variance = 0.0;
 50 | 	int maxdeg = 0;
 51 | 	int mindeg = m;
 52 | 	avgdeg = (double)nnz / m;
 53 | 	for (int i = 0; i < m; i++) {
 54 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
 55 | 		if (deg_i > maxdeg)
 56 | 			maxdeg = deg_i;
 57 | 		if (deg_i < mindeg)
 58 | 			mindeg = deg_i;
 59 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
 60 | 	}
 61 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
 62 | 	csrColInd = (int *)malloc(count * sizeof(int));
 63 | 	set<int>::iterator site;
 64 | 	for (int i = 0, index = 0; i < m; i++) {
 65 | 		site = svector[i].begin();
 66 | 		while (site != svector[i].end()) {
 67 | 			csrColInd[index++] = *site;
 68 | 			site++;
 69 | 		}
 70 | 	}
 71 | }
 72 | 
 73 | // transfer *.graph file to CSR format
 74 | void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
 75 | 	printf("Reading .graph input file %s\n", graph);
 76 | 	std::ifstream cfile;
 77 | 	cfile.open(graph);
 78 | 	std::string str;
 79 | 	getline(cfile, str);
 80 | 	sscanf(str.c_str(), "%d %d", &m, &nnz);
 81 | 	printf("num_vertices %d num_edges %d\n", m, nnz);
 82 | 	vector<set<int> > svector;
 83 | 	set<int> s;
 84 | 	for (int i = 0; i < m; i++)
 85 | 		svector.push_back(s);
 86 | 	int dst;
 87 | 	for (int i = 0; i < m; i++) {
 88 | 		getline(cfile, str);
 89 | 		istringstream istr;
 90 | 		istr.str(str);
 91 | 		while(istr>>dst) {
 92 | 			dst --;
 93 | 			svector[i].insert(dst);
 94 | 			svector[dst].insert(i);
 95 | 		}
 96 | 		istr.clear();
 97 | 	}
 98 |     cfile.close();
 99 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
100 | 	int count = 0;
101 | 	for (int i = 0; i < m; i++) {
102 | 		csrRowPtr[i] = count;
103 | 		count += svector[i].size();
104 | 	}
105 | 	csrRowPtr[m] = count;
106 | 	if (count != nnz) {
107 | 		printf("This graph is not symmetric\n");
108 | 		nnz = count;
109 | 	}
110 | 	double avgdeg;
111 | 	double variance = 0.0;
112 | 	int maxdeg = 0;
113 | 	int mindeg = m;
114 | 	avgdeg = (double)nnz / m;
115 | 	for (int i = 0; i < m; i++) {
116 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
117 | 		if (deg_i > maxdeg)
118 | 			maxdeg = deg_i;
119 | 		if (deg_i < mindeg)
120 | 			mindeg = deg_i;
121 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
122 | 	}
123 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
124 | 	csrColInd = (int *)malloc(count * sizeof(int));
125 | 	set<int>::iterator site;
126 | 	for (int i = 0, index = 0; i < m; i++) {
127 | 		site = svector[i].begin();
128 | 		while (site != svector[i].end()) {
129 | 			csrColInd[index++] = *site;
130 | 			site++;
131 | 		}
132 | 	}
133 | }
134 | 
// Convert a (.mtx) coordinate file to CSR format.
//
// Layout read by this function: optional comment lines whose first character
// is '%', a size line "<rows> <cols> <entries>", then one line per entry that
// starts with two 1-based indices (anything after them is ignored). Every
// entry is stored in both directions and duplicates are dropped.
//
//   mtx       - path of the input file
//   m         - out: number of vertices (rows)
//   nnz       - out: number of CSR entries; replaced by the real count when it
//               differs from the value in the size line
//   csrRowPtr - out: malloc'ed array of m + 1 row offsets (caller frees)
//   csrColInd - out: malloc'ed array of nnz column indices, sorted per row
//               (caller frees)
//
// Exits with a failure status when the file cannot be opened, the size line is
// missing/malformed, the matrix is not square, or memory runs out. Entry lines
// that cannot be parsed or that index outside [1, m] are skipped and counted
// in a warning.
void mtx2csr(char *mtx, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
	printf("Reading (.mtx) input file %s\n", mtx);
	std::ifstream cfile(mtx);
	if (!cfile.is_open()) {
		fprintf(stderr, "Error: cannot open input file %s\n", mtx);
		exit(EXIT_FAILURE);
	}
	// skip comment ('%') and blank lines; the loop ends at end of file, so a
	// file without a size line can no longer spin forever
	std::string str;
	char c = '\0';
	bool have_header = false;
	while (!have_header && std::getline(cfile, str))
		have_header = (sscanf(str.c_str(), "%c", &c) == 1 && c != '%');
	int n = 0;
	if (!have_header || sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz) != 3 || m < 0 || nnz < 0) {
		fprintf(stderr, "Error: missing or malformed size line in %s\n", mtx);
		exit(EXIT_FAILURE);
	}
	if (m != n) {
		// an error must not report success to the calling shell
		fprintf(stderr, "Error: %s is not a square matrix (%d x %d)\n", mtx, m, n);
		exit(EXIT_FAILURE);
	}
	printf("num_vertices %d num_edges %d\n", m, nnz);
	std::vector<std::set<int> > svector(m);
	int skipped = 0;
	for (int i = 0; i < nnz; i++) {
		if (!std::getline(cfile, str)) {
			fprintf(stderr, "Warning: %s ends after %d of %d entry lines\n", mtx, i, nnz);
			break;
		}
		int dst = 0, src = 0;
		if (sscanf(str.c_str(), "%d %d", &dst, &src) != 2) {
			skipped++;
			continue;
		}
		// file indices are 1-based
		dst--;
		src--;
		if (src < 0 || src >= m || dst < 0 || dst >= m) {
			skipped++;
			continue;
		}
		svector[src].insert(dst);
		svector[dst].insert(src);
	}
	cfile.close();
	if (skipped > 0)
		fprintf(stderr, "Warning: skipped %d malformed or out-of-range entry lines\n", skipped);
	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
	if (csrRowPtr == NULL) {
		fprintf(stderr, "Error: out of memory\n");
		exit(EXIT_FAILURE);
	}
	int count = 0;
	for (int i = 0; i < m; i++) {
		csrRowPtr[i] = count;
		count += svector[i].size();
	}
	csrRowPtr[m] = count;
	if (count != nnz) {
		printf("This graph is not symmetric\n");
		nnz = count;
	}
	// degree statistics, printed for information only
	double variance = 0.0;
	int maxdeg = 0;
	int mindeg = m;
	double avgdeg = (double)nnz / m;
	for (int i = 0; i < m; i++) {
		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
		if (deg_i > maxdeg)
			maxdeg = deg_i;
		if (deg_i < mindeg)
			mindeg = deg_i;
		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
	}
	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
	csrColInd = (int *)malloc(count * sizeof(int));
	if (count > 0 && csrColInd == NULL) {
		fprintf(stderr, "Error: out of memory\n");
		exit(EXIT_FAILURE);
	}
	int index = 0;
	for (int i = 0; i < m; i++) {
		for (std::set<int>::const_iterator it = svector[i].begin(); it != svector[i].end(); ++it)
			csrColInd[index++] = *it;
	}
}
204 | 
// Store the color of every vertex, one decimal value per line.
//
//   fname    - path of the output file (created or truncated)
//   coloring - array of at least n colors
//   n        - number of vertices to write
//
// If the file cannot be opened an error is printed to stderr and nothing is
// written (the original dereferenced the NULL stream).
void write_solution(char *fname, int *coloring, int n) {
	FILE *fp = fopen(fname, "w");
	if (fp == NULL) {
		fprintf(stderr, "Error: cannot open %s for writing\n", fname);
		return;
	}
	for (int i = 0; i < n; i++) {
		//fprintf(fp, "%d:%d\n", i, coloring[i]);
		fprintf(fp, "%d\n", coloring[i]);
	}
	// fclose flushes buffered output, so a failure here means lost data
	if (fclose(fp) != 0)
		fprintf(stderr, "Error: failed to finish writing %s\n", fname);
}
216 | 
// Check whether a coloring is proper: no vertex may share its color with a
// neighbour other than itself (self loops are ignored).
//
//   m         - number of vertices
//   nnz       - number of CSR entries (unused)
//   csrRowPtr - m + 1 row offsets
//   csrColInd - column indices
//   coloring  - color of every vertex
//   correct   - set to 0 when a conflict is found; never written otherwise, so
//               the caller must initialise it before the call
//
// Returns as soon as the first conflict is seen, since nothing else can change
// the verdict.
void verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) {
	(void)nnz;
	for (int v = 0; v < m; v++) {
		const int row_end = csrRowPtr[v + 1];
		for (int offset = csrRowPtr[v]; offset < row_end; offset++) {
			const int u = csrColInd[offset];
			if (u != v && coloring[u] == coloring[v]) {
				*correct = 0;
				//printf("coloring[%d] = coloring[%d] = %d\n", v, u, coloring[v]);
				return;
			}
		}
	}
}
231 | 


--------------------------------------------------------------------------------
/src/omp/worklist.h:
--------------------------------------------------------------------------------
  1 | /* 
  2 |  * use atomicInc to automatically wrap around.
  3 |  */
  4 | 
  5 | #include <stdio.h>
  6 | #include <time.h>
  7 | #include <fstream>
  8 | #include <string>
  9 | #include <vector>
 10 | #include <iostream>
 11 | #include <string.h>
 12 | #include "common.h"
 13 | #define MINCAPACITY	65535
 14 | #define MAXOVERFLOWS	1
 15 | 
// Worklist of unsigned work items kept in a malloc'ed buffer (`items`) and
// addressed through the `start` / `end` indices.
// NOTE(review): instances are passed by value (append, printWorklist) and the
// destructor is empty, so copies share one buffer and nothing frees it
// automatically — confirm this is intended.
typedef struct Worklist {
	// Copies nitems values into the list; returns 0 on success (or when there
	// is nothing to push) and 1 on overflow.
	unsigned pushRange(unsigned *start, unsigned nitems);
	// Pushes a single value via pushRange.
	unsigned push(unsigned work);
	// Copies up to nitems values out of the list; returns how many were copied.
	unsigned popRange(unsigned *start, unsigned nitems);
	// Pops a single value via popRange.
	unsigned pop(unsigned &work);
	// Empties the list (setSize(0)).
	void clear();
	// Declared only; no definition is visible in this part of the file.
	void myItems(unsigned &start, unsigned &end);
	// Returns items[at] if at < count(), otherwise (unsigned)-1.
	unsigned getItem(unsigned at);
	// Returns items[at] if at < hsize, otherwise (unsigned)-1.
	unsigned getItemWithin(unsigned at, unsigned hsize);
	// Number of items computed from start/end/capacity.
	unsigned count();

	// init() is init(0): empty list without storage.
	void init();
	// Sets the capacity, empties the list and allocates storage when
	// initialcapacity is non-zero.
	void init(unsigned initialcapacity);
	// Moves `end` so that the list holds hsize items counted from `start`.
	void setSize(unsigned hsize);
	// Same value as calculateSize(start, end).
	unsigned getSize();
	void setCapacity(unsigned hcapacity);
	unsigned getCapacity();
	// Resets start and end to 0; the argument is ignored by the definition.
	void setInitialSize(unsigned hsize);
	// Number of items between two indices, taking wrap-around into account.
	unsigned calculateSize(unsigned hstart, unsigned hend);
	// Copies the live items of an old buffer to the front of a new buffer.
	void copyOldToNew(unsigned *olditems, unsigned *newitems, unsigned oldsize, unsigned oldcapacity);
	// Copies the items of wl behind the items of this list.
	void append(Worklist wl);

	Worklist();
	~Worklist();
	// Returns 0 if `space` free slots exist, otherwise calls realloc and
	// returns 1.
	unsigned ensureSpace(unsigned space);
	// malloc wrapper; prints a message and returns NULL on failure.
	unsigned *alloc(unsigned allocsize);
	// Replaces the buffer by a new one; returns 0 on success, 1 on failure.
	unsigned realloc(unsigned space);
	// Frees the buffer and resets start/end; always returns 0.
	unsigned dealloc();
	// getCapacity() - getSize().
	unsigned freeSize();
	unsigned *items;	// backing buffer (NULL until storage is allocated)
	unsigned start, end;	// index of the first item / index one past the last item
	unsigned capacity;	// number of slots recorded for `items`
	unsigned noverflows;	// set to 0 by init(); not otherwise used in the visible code
} Worklist;
 50 | 
 51 | Worklist::Worklist() {
 52 | 	init();
 53 | }
 54 | 
 55 | void Worklist::init() {
 56 | 	init(0);
 57 | }
 58 | 
 59 | void Worklist::init(unsigned initialcapacity) {
 60 | 	setCapacity(initialcapacity);
 61 | 	setInitialSize(0);
 62 | 	items = NULL;
 63 | 	if (initialcapacity) items = alloc(initialcapacity);
 64 | 	noverflows = 0;
 65 | }
 66 | 
 67 | unsigned *Worklist::alloc(unsigned allocsize) {
 68 | 	unsigned *ptr = NULL;
 69 | 	if(allocsize > 0)
 70 | 		ptr = (unsigned *)malloc(allocsize * sizeof(unsigned));
 71 | 	if(ptr == NULL)
 72 | 		printf("%s(%d): Allocating %d failed.\n", __FILE__, __LINE__, allocsize);
 73 | 	return ptr;
 74 | }
 75 | 
 76 | unsigned Worklist::getCapacity() {
 77 | 	return capacity;
 78 | }
 79 | 
 80 | unsigned Worklist::calculateSize(unsigned hstart, unsigned hend) {
 81 | 	if (hend >= hstart) {
 82 | 		return hend - hstart;
 83 | 	}
 84 | 	// circular queue.
 85 | 	unsigned cap = getCapacity();
 86 | 	return hend + (cap - hstart + 1);
 87 | }
 88 | 
 89 | unsigned Worklist::getSize() {
 90 | 	return calculateSize(start, end);
 91 | }
 92 | 
 93 | void Worklist::setCapacity(unsigned cap) {
 94 | 	capacity = cap;
 95 | }
 96 | 
 97 | void Worklist::setInitialSize(unsigned size) {
 98 | 	start = 0;
 99 | 	end = 0;
100 | }
101 | 
102 | void Worklist::setSize(unsigned size) {
103 | 	unsigned cap = getCapacity();
104 | 	if (size > cap) {
105 | 		printf("%s(%d): buffer overflow, setting size=%d, when capacity=%d.\n", __FILE__, __LINE__, size, cap);
106 | 		return;
107 | 	}
108 | 	if (start + size < cap) {
109 | 		end   = start + size;
110 | 	} else {
111 | 		size -= cap - start;
112 | 		end   = size;
113 | 	}
114 | }
115 | 
116 | void Worklist::copyOldToNew(unsigned *olditems, unsigned *newitems, unsigned oldsize, unsigned oldcapacity) {
117 | 	if (start < end) {	// no wrap-around.
118 | 		memcpy(newitems, olditems + start, oldsize * sizeof(unsigned));
119 | 	} else {
120 | 		memcpy(newitems, olditems + start, (oldcapacity - start) * sizeof(unsigned));
121 | 		memcpy(newitems + (oldcapacity - start), olditems, end * sizeof(unsigned));
122 | 	}
123 | }
124 | 
125 | unsigned Worklist::realloc(unsigned space) {
126 | 	unsigned cap = getCapacity();
127 | 	unsigned newcapacity = (space > MINCAPACITY ? space : MINCAPACITY);
128 | 	if (cap == 0) {
129 | 		setCapacity(newcapacity);
130 | 		items = alloc(newcapacity);
131 | 		if (items == NULL) {
132 | 			return 1;
133 | 		}
134 | 		//printf("\tworklist capacity set to %d.\n", getCapacity());
135 | 	} else {
136 | 		unsigned *itemsrealloc = alloc(newcapacity);
137 | 		if (itemsrealloc == NULL) {
138 | 			return 1;
139 | 		}
140 | 		unsigned oldsize = getSize();
141 | 		copyOldToNew(items, itemsrealloc, oldsize, cap);
142 | 		dealloc();
143 | 		items = itemsrealloc;
144 | 		setCapacity(newcapacity);
145 | 		start = 0;
146 | 		end = oldsize;
147 | 		printf("\tworklist capacity reset to %d.\n", getCapacity());
148 | 	}
149 | 	return 0;
150 | }
151 | 
152 | unsigned Worklist::freeSize() {
153 | 	return getCapacity() - getSize();
154 | }
155 | 
156 | unsigned Worklist::ensureSpace(unsigned space) {
157 | 	if (freeSize() >= space) {
158 | 		return 0;
159 | 	}
160 | 	realloc(space);
161 | 	return 1;
162 | }
163 | 
164 | unsigned Worklist::dealloc() {
165 | 	free(items);
166 | 	setInitialSize(0);
167 | 	return 0;
168 | }
169 | 
// Destructor: does nothing; in particular the `items` buffer is not freed
// here (dealloc() has to be called explicitly).
// NOTE(review): presumably left empty because Worklist objects are copied by
// value and the copies share `items` — confirm.
Worklist::~Worklist() {
}
172 | 
// Copies `nitems` values from `copyfrom` into the list.
// Returns 0 when the items were stored (or there was nothing to store) and 1
// when the list is full.
unsigned Worklist::pushRange(unsigned *copyfrom, unsigned nitems) {
	if (copyfrom == NULL || nitems == 0) return 0;

	unsigned lcap = capacity;
	// reserve nitems slots by advancing `end`; offset is taken to be the value
	// of `end` before the addition — presumably an atomic fetch-and-add, verify
	// against my_fetch_add in common.h
	unsigned offset = my_fetch_add<unsigned>(&end, nitems);
	if (offset >= lcap) {	// overflow.
		// undo the reservation
		my_fetch_sub<unsigned>(&end, nitems);
		return 1;
	}
	// NOTE(review): only the first reserved index is compared with the
	// capacity; when offset + nitems exceeds it, the writes below wrap to the
	// front of the buffer while `end` keeps growing past the capacity — confirm
	// this is intended.
	for (unsigned ii = 0; ii < nitems; ++ii) {
		items[(offset + ii) % lcap] = copyfrom[ii];
	}
	return 0;
}
187 | 
188 | unsigned Worklist::push(unsigned work) {
189 | 	return pushRange(&work, 1);
190 | }
191 | 
// Copies up to `nitems` values from the front of the list into `copyto` and
// advances `start`. Returns the number of values copied, which is smaller than
// requested when the list holds fewer items.
unsigned Worklist::popRange(unsigned *copyto, unsigned nitems) {
	unsigned currsize = count();
	// never hand out more than the list currently holds
	if (currsize < nitems) {
		nitems = currsize;
	}
	unsigned offset = 0;
	unsigned lcap = capacity;
	if (nitems) {
		if (start + nitems < lcap) {
			// offset is taken to be the value of `start` before the addition —
			// verify against my_fetch_add in common.h
			offset = my_fetch_add<unsigned>(&start, nitems);
		} else {
			// NOTE(review): this adds (start + nitems - lcap) to `start` rather
			// than setting `start` to that wrapped position — confirm intent.
			offset = my_fetch_add<unsigned>(&start, start + nitems - lcap);
		}
	}
	// copy nitems starting from offset.
	for (unsigned ii = 0; ii < nitems; ++ii) {
		copyto[ii] = items[(offset + ii) % lcap];
	}
	return nitems;
}
212 | 
213 | unsigned Worklist::pop(unsigned &work) {
214 | 	return popRange(&work, 1);
215 | }
216 | 
217 | void Worklist::clear() {
218 | 	setSize(0);
219 | }
220 | 
221 | unsigned Worklist::getItem(unsigned at) {
222 | 	unsigned size = count();
223 | 	return getItemWithin(at, size);
224 | }
225 | 
226 | unsigned Worklist::getItemWithin(unsigned at, unsigned size) {
227 | 	if (at < size) {
228 | 		return items[at];
229 | 	}
230 | 	return -1;
231 | }
232 | 
233 | unsigned Worklist::count() {
234 | 	if (end >= start) {
235 | 		return end - start;
236 | 	} else {
237 | 		return end + (capacity - start + 1);
238 | 	}
239 | }
240 | 
// Swaps two unsigned lvalues through a temporary. Expands to a bare { } block,
// and each argument is evaluated twice.
#define SWAPDEV(a, b)	{ unsigned tmp = a; a = b; b = tmp; }
242 | void printWorklist(Worklist wl) {
243 | 	printf("\t");
244 | 	for (unsigned ii = wl.start; ii < wl.end; ++ii) {
245 | 		printf("%d,", wl.getItem(ii));
246 | 	}
247 | 	printf("\n");
248 | }
249 | 
250 | void Worklist::append(Worklist wl) {
251 | 	unsigned size = getSize();
252 | 	for (unsigned ii = 0; ii < wl.count(); ++ii) {
253 | 		items[size + ii] = wl.items[ii];
254 | 	}
255 | 	end += wl.getSize();
256 | }
257 | 
258 | 


--------------------------------------------------------------------------------
/src/data/kernel_base.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | #include <cub/cub.cuh>
  4 | #include "gbar.cuh"
  5 | #include "cuda_launch_config.hpp"
  6 | #include "cutil_subset.h"
  7 | #include "common.h"
  8 | #include <thrust/fill.h>
  9 | #include <thrust/sequence.h>
 10 | #include <thrust/count.h>
 11 | #include <thrust/reduce.h>
 12 | #include <thrust/functional.h>
 13 | #include <thrust/execution_policy.h>
 14 | #include "worklistc.h"
 15 | #define TIMING
 16 | #define	SCRATCHSIZE BLKSIZE
 17 | #define	MAXCOLOR 128 // assume graph can be colored with less than 128 colors
 18 | typedef cub::BlockScan<int, BLKSIZE> BlockScan;
 19 | 
 20 | __global__ void initialize(int *coloring, int m) {
 21 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
 22 | 	if (id < m) {
 23 | 		coloring[id] = MAXCOLOR;
 24 | 	}   
 25 | }
 26 | 
 27 | __global__ void firstFit(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, int *coloring) {
 28 | 	int id = blockIdx.x * blockDim.x + threadIdx.x;	
 29 | 	bool forbiddenColors[MAXCOLOR+1];
 30 | 	int vertex;
 31 | 	// get vertex from worklist according to thread id
 32 | 	if (inwl.pop_id(id, vertex)) {
 33 | 		int row_begin = csrRowPtr[vertex];
 34 | 		int row_end = csrRowPtr[vertex + 1];
 35 | 		for (int j = 0; j < MAXCOLOR; j++)
 36 | 			forbiddenColors[j] = false;
 37 | 		// traverse all neighbors of current vertex
 38 | 		for (int offset = row_begin; offset < row_end; offset ++) {
 39 | 			int neighbor = csrColInd[offset];
 40 | 			int color = coloring[neighbor];
 41 | 			//int color = cub::ThreadLoad<cub::LOAD_CG>(coloring + neighbor);
 42 | 			if(color != MAXCOLOR)
 43 | 				forbiddenColors[color] = true; // mask the color
 44 | 		}
 45 | 		// assign the smallest unforbidden color to vertex
 46 | 		int vertex_color;
 47 | 		for (vertex_color = 0; vertex_color < MAXCOLOR; vertex_color++) {
 48 | 			if (!forbiddenColors[vertex_color]) {
 49 | 				coloring[vertex] = vertex_color;
 50 | 				break;
 51 | 			}
 52 | 		}
 53 | 		assert(vertex_color < MAXCOLOR);
 54 | 	}
 55 | }
 56 | 
 57 | __global__ void conflictResolve(int m, int *csrRowPtr, int *csrColInd, Worklist2 inwl, Worklist2 outwl, int *coloring) {
 58 | 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 59 | 	int conflicted = 0; // assume vertex not conflicted
 60 | 	int vertex;
 61 | 	if (inwl.pop_id(id, vertex)) {
 62 | 		int row_begin = csrRowPtr[vertex];
 63 | 		int row_end = csrRowPtr[vertex + 1];
 64 | 		for (int offset = row_begin; offset < row_end; offset ++) {
 65 | 			int neighbor = csrColInd[offset];
 66 | 			// if at least one neighbor was assigned the same color as vertex, 
 67 | 			// and its vertex number is bigger than vertex,
 68 | 			// then vertex is regarded as conflicting
 69 | 			if (coloring[vertex] == coloring[neighbor] && vertex < neighbor) {
 70 | 				conflicted = 1;
 71 | 				coloring[vertex] = MAXCOLOR; // reset color
 72 | 				break;
 73 | 			}
 74 | 		}
 75 | 	}
 76 | 	outwl.push_1item<BlockScan>(conflicted, vertex, BLKSIZE); // push to outwl if conflicted
 77 | }
 78 | 
 79 | void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) {
 80 | 	double starttime, endtime;
 81 | 	double runtime[ITERATIONS];
 82 | 	int colors[ITERATIONS];
 83 | 	int iterations[ITERATIONS];
 84 | 	int *d_csrRowPtr, *d_csrColInd, *d_coloring;
 85 | 	printf("Graph coloring data-driven Base version\n");
 86 | 	
 87 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int)));
 88 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int)));
 89 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int)));
 90 | 	CUDA_SAFE_CALL(cudaDeviceSynchronize());
 91 | #ifdef TIMING
 92 | 	double t1 = rtclock();
 93 | #endif
 94 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
 95 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice));
 96 | 	CUDA_SAFE_CALL(cudaDeviceSynchronize());
 97 | #ifdef TIMING
 98 | 	double t2 = rtclock();
 99 | 	printf("Time of init:%f\n", 1000.0f * (t2 - t1));
100 | #endif
101 | 	int device = 0;
102 | 	int deviceCount = 0;
103 | 	CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
104 | 	cudaDeviceProp deviceProp;
105 | 	cudaGetDeviceProperties(&deviceProp, device);
106 | 	int nSM = deviceProp.multiProcessorCount;
107 | 	fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 
108 | 		deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor));
109 | 	const size_t max_blocks_1 = maximum_residency(firstFit, BLKSIZE, 0); 
110 | 	const size_t max_blocks_2 = maximum_residency(conflictResolve, BLKSIZE, 0); 
111 | 	printf("max_blocks_1=%d, max_blocks_2=%d\n", max_blocks_1, max_blocks_2);
112 | 
113 | 	for (int i = 0; i < ITERATIONS; i++) {
114 | 		Worklist2 inwl(m), outwl(m);
115 | 		Worklist2 *inwlptr = &inwl, *outwlptr = &outwl;
116 | 		CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice));
117 | 		initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m);
118 | 		iterations[i] = 0;
119 | 
120 | 		starttime = rtclock();
121 | 		int nitems = m;
122 | 		thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m);
123 | 		while (nitems > 0) {
124 | 			iterations[i] ++;
125 | 			int nblocks = (nitems - 1) / BLKSIZE + 1;
126 | 			//printf("nitems=%d, nblocks=%d\n", nitems, nblocks);
127 | 			firstFit<<<nblocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_coloring);
128 | 			conflictResolve<<<nblocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_coloring);
129 | 			nitems = outwlptr->nitems();
130 | 			// swap inwlptr and outwlptr
131 | 			Worklist2 * tmp = inwlptr;
132 | 			inwlptr = outwlptr;
133 | 			outwlptr = tmp;
134 | 			outwlptr->reset(); // clear outwl
135 | 		}
136 | 		CUDA_SAFE_CALL(cudaDeviceSynchronize());
137 | 		endtime = rtclock();
138 | 		//printf("iteration=%d\n", iterations[i]);
139 | 		runtime[i] = 1000.0f * (endtime - starttime);
140 | 		colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum<int>()) + 1;
141 | 	}
142 | #ifdef TIMING
143 | 	double t3, t4;
144 | 	t3 = rtclock();
145 | #endif
146 | 	CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost));
147 | #ifdef TIMING
148 | 	t4 = rtclock();
149 | 	printf("Time of copy back:%f\n", 1000.0f * (t4 - t3));
150 | #endif
151 | 	double total_time = 0.0;
152 | 	int total_colors = 0;
153 | 	int total_iterations = 0;
154 | 	for (int i = 0; i < ITERATIONS; i++) {
155 | 		total_time += runtime[i];
156 | 		total_colors += colors[i];
157 | 		total_iterations += iterations[i];
158 | 		printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]);
159 | 	}
160 | 	double avg_time = (double)total_time / ITERATIONS;
161 | 	double avg_colors = (double)total_colors / ITERATIONS;
162 | 	double avg_iterations = (double)total_iterations / ITERATIONS;
163 | 	printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations);
164 | 	CUDA_SAFE_CALL(cudaFree(d_csrRowPtr));
165 | 	CUDA_SAFE_CALL(cudaFree(d_csrColInd));
166 | 	CUDA_SAFE_CALL(cudaFree(d_coloring));
167 | }
168 | 


--------------------------------------------------------------------------------
/src/data/kernel_ldg.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | #include <cub/cub.cuh>
  4 | #include "gbar.cuh"
  5 | #include "cuda_launch_config.hpp"
  6 | #include "cutil_subset.h"
  7 | #include "common.h"
  8 | #include <thrust/fill.h>
  9 | #include <thrust/sequence.h>
 10 | #include <thrust/count.h>
 11 | #include <thrust/reduce.h>
 12 | #include <thrust/functional.h>
 13 | #include <thrust/execution_policy.h>
 14 | #include "worklistc.h"
 15 | #define	SCRATCHSIZE BLKSIZE
 16 | #define	MAXCOLOR 128 // assume graph can be colored with less than 128 colors
 17 | //#define TEXTURE
 18 | 
 19 | typedef cub::BlockScan<int, BLKSIZE> BlockScan;
 20 | #ifdef TEXTURE
 21 | texture <int, 1, cudaReadModeElementType> rowPtr;
 22 | texture <int, 1, cudaReadModeElementType> colInd;
 23 | #endif
 24 | 
 25 | __global__ void initialize(int *coloring, int m) {
 26 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
 27 | 	if (id < m) {
 28 | 		coloring[id] = MAXCOLOR;
 29 | 	}   
 30 | }
 31 | 
 32 | #ifdef TEXTURE
 33 | __global__ void FirstFit(int m, Worklist2 inwl, int *coloring) {
 34 | #else
 35 | __global__ void FirstFit(int m, const int * __restrict__  csrRowPtr, const int * __restrict__ csrColInd, Worklist2 inwl, int *coloring) {
 36 | #endif
 37 | 	int id = blockIdx.x * blockDim.x + threadIdx.x;	
 38 | 	bool forbiddenColors[MAXCOLOR+1];
 39 | 	int vertex;
 40 | 	if (inwl.pop_id(id, vertex)) {
 41 | #ifdef TEXTURE
 42 | 		int row_begin = tex1Dfetch(rowPtr, vertex);
 43 | 		int row_end = tex1Dfetch(rowPtr, vertex + 1);
 44 | #else
 45 | 		int row_begin = __ldg(csrRowPtr + vertex);
 46 | 		int row_end = __ldg(csrRowPtr + vertex + 1);
 47 | #endif
 48 | 		for (int i = 0; i < MAXCOLOR; i ++)
 49 | 			forbiddenColors[i] = false;
 50 | 		for (int offset = row_begin; offset < row_end; offset ++) {
 51 | #ifdef TEXTURE
 52 | 			int neighbor = tex1Dfetch(colInd, offset);
 53 | #else
 54 | 			int neighbor = __ldg(csrColInd + offset);
 55 | #endif
 56 | 			int color = coloring[neighbor];
 57 | 			forbiddenColors[color] = true;
 58 | 		}
 59 | 		int vertex_color;
 60 | 		for (vertex_color = 0; vertex_color < MAXCOLOR; vertex_color++) {
 61 | 			if (!forbiddenColors[vertex_color]) {
 62 | 				coloring[vertex] = vertex_color;
 63 | 				break;
 64 | 			}
 65 | 		}
 66 | 		assert(vertex_color < MAXCOLOR);
 67 | 	}
 68 | }
 69 | 
 70 | #ifdef TEXTURE
 71 | __global__ void conflictResolve(int m, Worklist2 inwl, Worklist2 outwl, int *coloring) {
 72 | #else
 73 | __global__ void conflictResolve(int m, const int * __restrict__  csrRowPtr, const int * __restrict__  csrColInd, Worklist2 inwl, Worklist2 outwl, int *coloring) {
 74 | #endif
 75 | 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 76 | 	int conflicted = 0;
 77 | 	int vertex;
 78 | 	if (inwl.pop_id(id, vertex)) {
 79 | #ifdef TEXTURE
 80 | 		int row_begin = tex1Dfetch(rowPtr, vertex);
 81 | 		int row_end= tex1Dfetch(rowPtr, vertex + 1);
 82 | #else
 83 | 		int row_begin = __ldg(csrRowPtr + vertex);
 84 | 		int row_end= __ldg(csrRowPtr + vertex + 1);
 85 | #endif
 86 | 		for (int offset = row_begin; offset < row_end; offset ++) {
 87 | #ifdef TEXTURE
 88 | 			int neighbor = tex1Dfetch(colInd, offset);
 89 | #else
 90 | 			int neighbor = __ldg(csrColInd + offset);
 91 | #endif
 92 | 			if (coloring[vertex] == coloring[neighbor] && vertex < neighbor) {
 93 | 				conflicted = 1;
 94 | 				coloring[vertex] = MAXCOLOR;
 95 | 				break;
 96 | 			}
 97 | 		}
 98 | 	}
 99 | 	outwl.push_1item<BlockScan>(conflicted, vertex, BLKSIZE);
100 | }
101 | 
// Host driver for the data-driven coloring variant that reads the CSR arrays
// via __ldg (or via texture fetches when TEXTURE is defined).
//
// The graph is copied to the device once. Each of the ITERATIONS timed runs
// then resets the coloring, seeds the input worklist with all m vertices and
// alternates FirstFit / conflictResolve until no vertex is re-queued. Per-run
// color count, runtime (ms) and round count are printed, followed by averages.
//
// m, nnz               : number of vertices / CSR neighbor entries
// csrRowPtr, csrColInd : host CSR arrays holding m + 1 and nnz ints
// coloring             : host output buffer of m ints (result of the last run)
// num_SMs              : accepted for interface compatibility, unused here
void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) {
	double starttime, endtime;
	double runtime[ITERATIONS];
	int colors[ITERATIONS];
	int iterations[ITERATIONS];
	int *d_csrRowPtr, *d_csrColInd, *d_coloring;
	printf("Graph coloring data-driven LDG version\n");

	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int)));
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int)));
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int)));
	CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice));
#ifdef TEXTURE
	// Textures are fetched on the device, so they must be bound to the device
	// copies, and the binding must not extend past the nnz-int allocation.
	CUDA_SAFE_CALL(cudaBindTexture(0, rowPtr, d_csrRowPtr, (m + 1) * sizeof(int)));
	CUDA_SAFE_CALL(cudaBindTexture(0, colInd, d_csrColInd, nnz * sizeof(int)));
#endif
	CUDA_SAFE_CALL(cudaDeviceSynchronize());
	int device = 0;
	int deviceCount = 0;
	CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
	cudaDeviceProp deviceProp;
	CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, device));
	int nSM = deviceProp.multiProcessorCount;
	fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 
			deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor));

	for (int i = 0; i < ITERATIONS; i++) {
		Worklist2 inwl(m), outwl(m);
		Worklist2 *inwlptr = &inwl, *outwlptr = &outwl;
		// The first round processes every vertex: item count m, items 0..m-1.
		CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice));
		initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m);
		CUDA_SAFE_CALL(cudaDeviceSynchronize());
		iterations[i] = 0;

		starttime = rtclock();
		int nitems = m;
		thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m);
		while (nitems > 0) {
			iterations[i] ++;
			int nblocks = (nitems - 1) / BLKSIZE + 1;
#ifdef TEXTURE
			FirstFit<<<nblocks, BLKSIZE>>>(m, *inwlptr, d_coloring);
			conflictResolve<<<nblocks, BLKSIZE>>>(m, *inwlptr, *outwlptr, d_coloring);
#else
			FirstFit<<<nblocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_coloring);
			conflictResolve<<<nblocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_coloring);
#endif
			// Launch failures are not reported by the launch itself.
			CUDA_SAFE_CALL(cudaGetLastError());
			nitems = outwlptr->nitems();
			// Vertices that lost a conflict become the next round's input.
			Worklist2 * tmp = inwlptr;
			inwlptr = outwlptr;
			outwlptr = tmp;
			outwlptr->reset();
		}
		CUDA_SAFE_CALL(cudaDeviceSynchronize());
		endtime = rtclock();
		runtime[i] = 1000.0f * (endtime - starttime);
		// Colors are 0-based, so the number used is the largest color plus one.
		colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum<int>()) + 1;
	}
	CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost));
	double total_time = 0.0;
	int total_colors = 0;
	int total_iterations = 0;
	for (int i = 0; i < ITERATIONS; i++) {
		total_time += runtime[i];
		total_colors += colors[i];
		total_iterations += iterations[i];
		printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]);
	}   
	double avg_time = (double)total_time / ITERATIONS;
	double avg_colors = (double)total_colors / ITERATIONS;
	double avg_iterations = (double)total_iterations / ITERATIONS;
	printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations);
#ifdef TEXTURE
	// Release the bindings before the memory behind them is freed.
	CUDA_SAFE_CALL(cudaUnbindTexture(rowPtr));
	CUDA_SAFE_CALL(cudaUnbindTexture(colInd));
#endif
	CUDA_SAFE_CALL(cudaFree(d_csrRowPtr));
	CUDA_SAFE_CALL(cudaFree(d_csrColInd));
	CUDA_SAFE_CALL(cudaFree(d_coloring));
}
180 | 


--------------------------------------------------------------------------------
/src/data/kernel_comb.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | #include <cub/cub.cuh>
  4 | #include "gbar.cuh"
  5 | #include "cuda_launch_config.hpp"
  6 | #include "cutil_subset.h"
  7 | #include "common.h"
  8 | #include <thrust/fill.h>
  9 | #include <thrust/sequence.h>
 10 | #include <thrust/count.h>
 11 | #include <thrust/reduce.h>
 12 | #include <thrust/functional.h>
 13 | #include <thrust/execution_policy.h>
 14 | #include "worklistc.h"
// Alias for the block size; not referenced by the code visible in this header.
#define	SCRATCHSIZE BLKSIZE
// Number of usable colors (0 .. MAXCOLOR-1). The value MAXCOLOR itself is what
// initialize() and the conflict detectors store for a vertex without a color.
#define	MAXCOLOR 128
// Block-wide scan type handed to Worklist2::push_1item as its template argument.
typedef cub::BlockScan<int, BLKSIZE> BlockScan;
 18 | 
 19 | __global__ void initialize(int *coloring, int m) {
 20 | 	unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
 21 | 	if (id < m) {
 22 | 		coloring[id] = MAXCOLOR;
 23 | 	}   
 24 | }
 25 | 
 26 | __device__ __forceinline__ void assignColor(unsigned *forbiddenColors, int *coloring, int vertex) {
 27 | 	int vertex_color;
 28 | 	for (vertex_color = 0; vertex_color < MAXCOLOR/32; vertex_color++) {
 29 | 		int pos = __ffs(forbiddenColors[vertex_color]);
 30 | 		if(pos) {
 31 | 			coloring[vertex] = vertex_color * 32 + pos - 1;
 32 | 			break;
 33 | 		}
 34 | 	}
 35 | 	assert(vertex_color < MAXCOLOR);
 36 | }
 37 | 
 38 | __global__ void firstFit(int m, const int* __restrict__ csrRowPtr, const int* __restrict__ csrColInd, Worklist2 inwl, int *coloring) {
 39 | //__global__ void firstFit(int m, int* csrRowPtr, int* csrColInd, Worklist2 inwl, int *coloring) {
 40 | 	int tid = blockIdx.x * blockDim.x + threadIdx.x;
 41 | 	unsigned forbiddenColors[MAXCOLOR/32+1];
 42 | 	int id = tid;
 43 | 	//int total_inputs = (*inwl.dindex + gridDim.x * blockDim.x - 1)/(gridDim.x * blockDim.x);
 44 | 	//for (int id = tid; total_inputs > 0; id += blockDim.x * gridDim.x, total_inputs--) {
 45 | 		int vertex;
 46 | 		if (inwl.pop_id(id, vertex)) {
 47 | 			//int row_begin = csrRowPtr[vertex];
 48 | 			//int row_end = csrRowPtr[vertex + 1]; 
 49 | 			int row_begin = __ldg(csrRowPtr + vertex);
 50 | 			int row_end= __ldg(csrRowPtr + vertex + 1);
 51 | 			for (int j = 0; j < MAXCOLOR/32; j++)
 52 | 				forbiddenColors[j] = 0xffffffff;
 53 | 			for (int offset = row_begin; offset < row_end; offset ++) {
 54 | 				//int neighbor = csrColInd[offset];
 55 | 				int neighbor = __ldg(csrColInd + offset);
 56 | 				int color = coloring[neighbor];
 57 | 				forbiddenColors[color / 32] &= ~(1 << (color % 32));
 58 | 			}
 59 | 			assignColor(forbiddenColors, coloring, vertex);
 60 | 		}
 61 | 	//}
 62 | }
 63 | 
 64 | __device__ __forceinline__ void conflictDetect1(int src, int dst, int *coloring, bool &is_conflict) {
 65 |     int color_s = coloring[src];
 66 | 	int color_d = coloring[dst];
 67 | 	if (color_s == color_d && src < dst) {
 68 | 		is_conflict = 1;
 69 | 		coloring[src] = MAXCOLOR;
 70 | 	}
 71 | }
 72 | 
 73 | __device__ __forceinline__ bool conflictDetect2(int src, int dst, int *coloring, int *degree, bool &is_conflict) {
 74 | 	if (coloring[src] == coloring[dst]) {
 75 | 		bool is_victim;
 76 | 		if (degree[src] == degree[dst])
 77 | 			is_victim = (src < dst) ? true : false;
 78 | 		else is_victim = (degree[src] < degree[dst]) ? true : false;
 79 | 		if (is_victim) {
 80 | 			is_conflict = 1;
 81 | 			coloring[src] = MAXCOLOR;
 82 | 		}
 83 | 	}
 84 | }
 85 | 
 86 | __global__ void conflictResolve(int m, const int* __restrict__ csrRowPtr, const int* __restrict__ csrColInd, Worklist2 inwl, Worklist2 outwl, int * degree, int *coloring) {
 87 | //__global__ void conflictResolve(int m, int* csrRowPtr, int* csrColInd, Worklist2 inwl, Worklist2 outwl, int * degree, int *coloring) {
 88 | 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 89 | 	bool is_conflict = 0;
 90 | 	int vertex;
 91 | 	if (inwl.pop_id(id, vertex)) {
 92 | 		//int row_begin = csrRowPtr[vertex];
 93 | 		//int row_end = csrRowPtr[vertex + 1]; 
 94 | 		int row_begin = __ldg(csrRowPtr + vertex);
 95 | 		int row_end= __ldg(csrRowPtr + vertex + 1);
 96 | 		for (int offset = row_begin; offset < row_end; offset ++) {
 97 | 			//int neighbor = csrColInd[offset];
 98 | 			int neighbor = __ldg(csrColInd + offset);
 99 | 			//conflictDetect1(vertex, neighbor, coloring, is_conflict);
100 | 			conflictDetect2(vertex, neighbor, coloring, degree, is_conflict);
101 | 			if(is_conflict) break;
102 | 		}
103 | 	}   
104 | 	outwl.push_1item<BlockScan>((int)is_conflict, vertex, BLKSIZE);
105 | }
106 | 
// Host driver for the "Combination A" data-driven coloring variant.
//
// The graph and the per-vertex degrees are copied to the device once. Each of
// the ITERATIONS timed runs then resets the coloring, seeds the input worklist
// with all m vertices and alternates firstFit / conflictResolve until no vertex
// is re-queued. Per-run color count, runtime (ms) and round count are printed,
// followed by averages.
//
// m, nnz               : number of vertices / CSR neighbor entries
// csrRowPtr, csrColInd : host CSR arrays holding m + 1 and nnz ints
// coloring             : host output buffer of m ints (result of the last run)
// num_SMs              : accepted for interface compatibility, unused here
void color(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int num_SMs) {
	double starttime, endtime;
	double runtime[ITERATIONS];
	int colors[ITERATIONS];
	int iterations[ITERATIONS];
	int *d_csrRowPtr, *d_csrColInd, *d_coloring, *d_degree;
	printf("Graph coloring data-driven Combination A version\n");
	// Degree of a vertex = length of its CSR row.
	int *degree = (int *)malloc(m * sizeof(int));
	if (degree == NULL && m > 0) {
		fprintf(stderr, "Out of memory allocating the degree array\n");
		exit(1);
	}
	for(int i = 0; i < m; i ++) {
		degree[i] = csrRowPtr[i + 1] - csrRowPtr[i];
	}
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int)));
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int)));
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_degree, m * sizeof(int)));
	CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int)));
	CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice));
	// The kernels only read the degrees, so a single upload serves every run.
	CUDA_SAFE_CALL(cudaMemcpy(d_degree, degree, m * sizeof(int), cudaMemcpyHostToDevice));
	free(degree);
	CUDA_SAFE_CALL(cudaDeviceSynchronize());
	int device = 0;
	int deviceCount = 0;
	CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
	cudaDeviceProp deviceProp;
	CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, device));
	int nSM = deviceProp.multiProcessorCount;
	fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n",
		deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor));
	
	const size_t max_blocks = maximum_residency(firstFit, BLKSIZE, 0); 
	// size_t does not match %d; convert explicitly for printing.
	printf("max_blocks=%d\n", (int)max_blocks);

	for (int i = 0; i < ITERATIONS; i++) {
		Worklist2 inwl(m), outwl(m);
		Worklist2 *inwlptr = &inwl, *outwlptr = &outwl;
		// The first round processes every vertex: item count m, items 0..m-1.
		CUDA_SAFE_CALL(cudaMemcpy(inwl.dindex, &m, sizeof(int), cudaMemcpyHostToDevice));
		initialize <<<((m - 1) / BLKSIZE + 1), BLKSIZE>>> (d_coloring, m);
		CUDA_SAFE_CALL(cudaDeviceSynchronize());
		iterations[i] = 0;

		starttime = rtclock();
		int nitems = m;
		thrust::sequence(thrust::device, inwl.dwl, inwl.dwl + m);
		while (nitems > 0) {
			iterations[i] ++;
			int nblocks = (nitems - 1) / BLKSIZE + 1;
			firstFit<<<nblocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, d_coloring);
			conflictResolve<<<nblocks, BLKSIZE>>>(m, d_csrRowPtr, d_csrColInd, *inwlptr, *outwlptr, d_degree, d_coloring);
			// Launch failures are not reported by the launch itself.
			CUDA_SAFE_CALL(cudaGetLastError());
			nitems = outwlptr->nitems();
			// Vertices that lost a conflict become the next round's input.
			Worklist2 * tmp = inwlptr;
			inwlptr = outwlptr;
			outwlptr = tmp;
			outwlptr->reset();
		}
		CUDA_SAFE_CALL(cudaDeviceSynchronize());
		endtime = rtclock();
		runtime[i] = 1000.0f * (endtime - starttime);
		// Colors are 0-based, so the number used is the largest color plus one.
		colors[i] = thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum<int>()) + 1;
	}
	CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost));
	double total_time = 0.0;
	int total_colors = 0;
	int total_iterations = 0;
	for (int i = 0; i < ITERATIONS; i++) {
		total_time += runtime[i];
		total_colors += colors[i];
		total_iterations += iterations[i];
		printf("[%d %.2f %d] ", colors[i], runtime[i], iterations[i]);
	}
	double avg_time = (double)total_time / ITERATIONS;
	double avg_colors = (double)total_colors / ITERATIONS;
	double avg_iterations = (double)total_iterations / ITERATIONS;
	printf("\navg_time %f ms, avg_colors %.2f avg_iterations %.2f\n", avg_time, avg_colors, avg_iterations);
	CUDA_SAFE_CALL(cudaFree(d_csrRowPtr));
	CUDA_SAFE_CALL(cudaFree(d_csrColInd));
	CUDA_SAFE_CALL(cudaFree(d_degree));
	CUDA_SAFE_CALL(cudaFree(d_coloring));
}
187 | 


--------------------------------------------------------------------------------
/src/data/main.cu:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include <iostream>
  6 | #include <fstream>
  7 | #include <sstream>
  8 | #include <string>
  9 | #include <vector>
 10 | #include <set>
 11 | #include "lonestargpu.h"
 12 | #include "variants.h"
 13 | using namespace std;
 14 | 
 15 | #ifndef	ITERATIONS
 16 | #define	ITERATIONS 1
 17 | #endif
 18 | #ifndef	BLKSIZE
 19 | #define	BLKSIZE 128
 20 | #endif
 21 | 
 22 | // transfer R-MAT generated gr graph to CSR format
 23 | void gr2csr(char *gr, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
 24 | 	printf("Reading RMAT (.gr) input file %s\n", gr);
 25 | 	std::ifstream cfile;
 26 | 	cfile.open(gr);
 27 | 	std::string str;
 28 | 	getline(cfile, str);
 29 | 	char c;
 30 | 	sscanf(str.c_str(), "%c", &c);
 31 | 	while (c == 'c') {
 32 | 		getline(cfile, str);
 33 | 		sscanf(str.c_str(), "%c", &c);
 34 | 	}
 35 | 	char sp[3];
 36 | 	sscanf(str.c_str(), "%c %s %d %d", &c, sp, &m, &nnz);
 37 | 	printf("num_vertices %d num_edges %d\n", m, nnz);
 38 | 	//printf("%c %s %d %d\n", c, sp, m, nnz);
 39 | 	vector<set<int> > svector;
 40 | 	set<int> s;
 41 | 	for (int i = 0; i < m; i++)
 42 | 		svector.push_back(s);
 43 | 	int dst, src;
 44 | 	for (int i = 0; i < nnz; i++) {
 45 | 		getline(cfile, str);
 46 | 		sscanf(str.c_str(), "%c %d %d", &c, &src, &dst);
 47 | 
 48 | 		if (c != 'a')
 49 | 			printf("line %d\n", __LINE__);
 50 | 		dst--;
 51 | 		src--;
 52 | 		svector[src].insert(dst);
 53 | 		svector[dst].insert(src);
 54 | 	}
 55 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
 56 | 	int count = 0;
 57 | 	for (int i = 0; i < m; i++) {
 58 | 		csrRowPtr[i] = count;
 59 | 		count += svector[i].size();
 60 | 	}
 61 | 	csrRowPtr[m] = count;
 62 | 	if (count != nnz) {
 63 | 		printf("This graph is not symmetric\n");
 64 | 		nnz = count;
 65 | 	}
 66 | 	double avgdeg;
 67 | 	double variance = 0.0;
 68 | 	int maxdeg = 0;
 69 | 	int mindeg = m;
 70 | 	avgdeg = (double)nnz / m;
 71 | 	for (int i = 0; i < m; i++) {
 72 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
 73 | 		if (deg_i > maxdeg)
 74 | 			maxdeg = deg_i;
 75 | 		if (deg_i < mindeg)
 76 | 			mindeg = deg_i;
 77 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
 78 | 	}
 79 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
 80 | 	csrColInd = (int *)malloc(count * sizeof(int));
 81 | 	set<int>::iterator site;
 82 | 	for (int i = 0, index = 0; i < m; i++) {
 83 | 		site = svector[i].begin();
 84 | 		while (site != svector[i].end()) {
 85 | 			csrColInd[index++] = *site;
 86 | 			site++;
 87 | 		}
 88 | 	}
 89 | }
 90 | 
 91 | // transfer *.graph file to CSR format
 92 | void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
 93 | 	printf("Reading .graph input file %s\n", graph);
 94 | 	std::ifstream cfile;
 95 | 	cfile.open(graph);
 96 | 	std::string str;
 97 | 	getline(cfile, str);
 98 | 	sscanf(str.c_str(), "%d %d", &m, &nnz);
 99 | 	printf("num_vertices %d num_edges %d\n", m, nnz);
100 | 	vector<set<int> > svector;
101 | 	set<int> s;
102 | 	for (int i = 0; i < m; i++)
103 | 		svector.push_back(s);
104 | 	int dst;
105 | 	for (int i = 0; i < m; i++) {
106 | 		getline(cfile, str);
107 | 		istringstream istr;
108 | 		istr.str(str);
109 | 		while(istr>>dst) {
110 | 			dst --;
111 | 			svector[i].insert(dst);
112 | 			svector[dst].insert(i);
113 | 		}
114 | 		istr.clear();
115 | 	}
116 |     cfile.close();
117 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
118 | 	int count = 0;
119 | 	for (int i = 0; i < m; i++) {
120 | 		csrRowPtr[i] = count;
121 | 		count += svector[i].size();
122 | 	}
123 | 	csrRowPtr[m] = count;
124 | 	if (count != nnz) {
125 | 		printf("This graph is not symmetric\n");
126 | 		nnz = count;
127 | 	}
128 | 	double avgdeg;
129 | 	double variance = 0.0;
130 | 	int maxdeg = 0;
131 | 	int mindeg = m;
132 | 	avgdeg = (double)nnz / m;
133 | 	for (int i = 0; i < m; i++) {
134 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
135 | 		if (deg_i > maxdeg)
136 | 			maxdeg = deg_i;
137 | 		if (deg_i < mindeg)
138 | 			mindeg = deg_i;
139 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
140 | 	}
141 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
142 | 	csrColInd = (int *)malloc(count * sizeof(int));
143 | 	set<int>::iterator site;
144 | 	for (int i = 0, index = 0; i < m; i++) {
145 | 		site = svector[i].begin();
146 | 		while (site != svector[i].end()) {
147 | 			csrColInd[index++] = *site;
148 | 			site++;
149 | 		}
150 | 	}
151 | }
152 | 
// Convert a Matrix Market (.mtx) coordinate file to CSR format.
//
// Expected layout: comment lines starting with '%', a size line
// "<rows> <cols> <entries>", then one "<i> <j> ..." line per entry with
// 1-based indices. Every entry is stored in both directions and duplicates are
// merged, so the resulting adjacency is symmetric.
//
// mtx       : path of the input file
// m         : out, number of vertices (the matrix must be square)
// nnz       : out, number of CSR entries actually stored
// csrRowPtr : out, malloc'ed array of m + 1 row offsets (caller frees)
// csrColInd : out, malloc'ed array of nnz neighbor ids, sorted within each row
//             (caller frees)
//
// Exits with status 1 if the file cannot be opened, the size line is invalid
// or the matrix is not square. Malformed or out-of-range entries are reported
// and skipped.
void mtx2csr(char *mtx, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
	printf("Reading (.mtx) input file %s\n", mtx);
	std::ifstream cfile(mtx);
	if (!cfile.is_open()) {
		fprintf(stderr, "Cannot open input file %s\n", mtx);
		exit(1);
	}
	std::string str;
	char c = '\0';
	// Skip the leading comment lines; stop on EOF instead of spinning forever.
	do {
		c = '\0';
		if (!getline(cfile, str))
			break;
		sscanf(str.c_str(), "%c", &c);
	} while (c == '%');
	int n = 0;
	m = 0;
	nnz = 0;
	if (sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz) != 3 || m < 0 || nnz < 0) {
		fprintf(stderr, "Malformed size line in %s\n", mtx);
		exit(1);
	}
	if (m != n) {
		// A failure must not report success to the calling shell.
		printf("error!\n");
		exit(1);
	}
	printf("num_vertices %d num_edges %d\n", m, nnz);
	// One sorted, duplicate-free neighbor set per vertex.
	std::vector<std::set<int> > svector(m);
	for (int i = 0; i < nnz; i++) {
		if (!getline(cfile, str)) {
			fprintf(stderr, "warning: expected %d entries but input ended after %d\n", nnz, i);
			break;
		}
		int dst = 0, src = 0;
		// Indices outside 1..m would index svector out of bounds.
		if (sscanf(str.c_str(), "%d %d", &dst, &src) != 2 ||
				src < 1 || src > m || dst < 1 || dst > m) {
			fprintf(stderr, "warning: skipping malformed entry line: %s\n", str.c_str());
			continue;
		}
		// The file is 1-based, CSR is 0-based.
		dst--;
		src--;
		svector[src].insert(dst);
		svector[dst].insert(src);
	}
	cfile.close();
	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
	if (csrRowPtr == NULL) {
		fprintf(stderr, "Out of memory building CSR row pointers\n");
		exit(1);
	}
	// Row offsets are the running sum of the neighbor counts.
	int count = 0;
	for (int i = 0; i < m; i++) {
		csrRowPtr[i] = count;
		count += (int)svector[i].size();
	}
	csrRowPtr[m] = count;
	if (count != nnz) {
		printf("This graph is not symmetric\n");
		nnz = count;
	}
	// Degree statistics, printed for information only.
	const double avgdeg = (m > 0) ? (double)nnz / m : 0.0;
	double variance = 0.0;
	int maxdeg = 0;
	int mindeg = m;
	for (int i = 0; i < m; i++) {
		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
		if (deg_i > maxdeg)
			maxdeg = deg_i;
		if (deg_i < mindeg)
			mindeg = deg_i;
		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
	}
	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
	csrColInd = (int *)malloc(count * sizeof(int));
	if (csrColInd == NULL && count > 0) {
		fprintf(stderr, "Out of memory building CSR column indices\n");
		exit(1);
	}
	int index = 0;
	for (int i = 0; i < m; i++) {
		for (std::set<int>::const_iterator it = svector[i].begin(); it != svector[i].end(); ++it)
			csrColInd[index++] = *it;
	}
}
222 | 
// Write the color of every vertex to a text file, one decimal value per line
// in vertex order.
//
// fname    : path of the output file (created or truncated)
// coloring : array of n colors
// n        : number of vertices
//
// If the file cannot be opened, a message is printed to stderr and nothing is
// written. fname is const so that string literals can be passed legally.
void write_solution(const char *fname, int *coloring, int n) {
	FILE *fp = fopen(fname, "w");
	if (fp == NULL) {
		fprintf(stderr, "Cannot open %s for writing\n", fname);
		return;
	}
	for (int i = 0; i < n; i++) {
		fprintf(fp, "%d\n", coloring[i]);
	}
	fclose(fp);
}
234 | 
// Check that a coloring is proper: no vertex may share its color with a
// neighbor other than itself.
//
// m, nnz               : number of vertices / CSR entries (nnz is not used)
// csrRowPtr, csrColInd : CSR adjacency of the graph
// coloring             : color of each vertex
// correct              : set to 0 when a violation is found; never set to 1
//                        here, so the caller must initialize it
void verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) {
	for (int v = 0; v < m; v++) {
		bool clash = false;
		int pos = csrRowPtr[v];
		// One violation per vertex is enough; move on to the next vertex.
		while (!clash && pos < csrRowPtr[v + 1]) {
			const int u = csrColInd[pos];
			// Self-loops do not count as conflicts.
			clash = (coloring[v] == coloring[u]) && (u != v);
			pos++;
		}
		if (clash)
			*correct = 0;
	}
}
249 | 
250 | int main(int argc, char *argv[]) {
251 | 	if (argc < 2) {
252 | 		printf("Usage: %s <graph> <num_SMs>\n", argv[0]);
253 | 		exit(1);
254 | 	}
255 | 	int m, nnz, *csrRowPtr = NULL, *csrColInd = NULL;
256 | 	// read graph
257 | 	if (strstr(argv[1], ".mtx"))
258 | 		mtx2csr(argv[1], m, nnz, csrRowPtr, csrColInd);
259 | 	else if (strstr(argv[1], ".graph"))
260 | 		graph2csr(argv[1], m, nnz, csrRowPtr, csrColInd);
261 | 	else if (strstr(argv[1], ".gr"))
262 | 		gr2csr(argv[1], m, nnz, csrRowPtr, csrColInd);
263 | 	else { printf("Unrecognizable input file format\n"); exit(0); }
264 | 	int *coloring = (int *)calloc(m, sizeof(int));
265 | 	int correct = 1;
266 | 	int num_SMs;
267 | 	if (argc > 2) {
268 | 		num_SMs = atoi(argv[2]);
269 | 		printf("block_size=%d, num_SMs=%d\n", BLKSIZE, num_SMs);
270 | 	}
271 | #if VARIANT==DATA_LDB
272 | 	color_ldb(m, nnz, csrRowPtr, csrColInd, coloring, num_SMs);
273 | #else
274 | 	color(m, nnz, csrRowPtr, csrColInd, coloring, num_SMs);
275 | #endif
276 | 	write_solution("color.txt", coloring, m);
277 | 	verify(m, nnz, csrRowPtr, csrColInd, coloring, &correct);
278 | 	if (correct)
279 | 		printf("correct.\n");
280 | 	else
281 | 		printf("incorrect.\n");
282 | 	return 0;
283 | }
284 | 


--------------------------------------------------------------------------------
/src/GM/tree.cpp:
--------------------------------------------------------------------------------
  1 | #include "tree.h"
  2 | #include <iostream>
  3 | 
  4 | using namespace std;
  5 | 
  6 | 
  7 | /*
  8 | int main(){
  9 | 	tree graph;
 10 | 	node *temp;
 11 | 	
 12 | 	
 13 | 	node *nodes = new node[7];
 14 | 	int degList[7] = {4,2,4,5,10,4,7};
 15 | 	
 16 | 	for (int i=0; i<7; i++){
 17 | 		nodes[i].setKSD(i, 0, degList[i]);
 18 | 		graph.insert(&nodes[i]);
 19 | 	}
 20 | 	
 21 | 	
 22 | 	cout << "RML" << endl;
 23 | 	graph.displayTreeRML(graph.getTop()); 
 24 | 	
 25 | 	
 26 | 	
 27 | 	temp = graph.remove(1,0,2);
 28 | 	cout << endl << "Deleted: "; temp->displayNode();
 29 | 	
 30 | 	cout << "RML" << endl;
 31 | 	graph.displayTreeRML(graph.getTop());
 32 | 	
 33 | 	
 34 | 	cout << endl << "Deleted: "; temp = graph.remove(6,0,7);
 35 | 	temp->displayNode();
 36 | 	
 37 | 	cout << "RML" << endl;
 38 | 	graph.displayTreeRML(graph.getTop());
 39 | 	
 40 | 	
 41 | 	return 0;
 42 | }
 43 | */
 44 | 
 45 | 
 46 | 
 47 | node::node(){
 48 | 	key = saturation = degree = color = -1;
 49 | 	
 50 | 	left = NULL;
 51 | 	right = NULL;
 52 | }
 53 | 
 54 | node::node(int index, int sat, int deg){
 55 | 	key = index;
 56 | 	saturation = sat;
 57 | 	degree = deg;
 58 | 	
 59 | 	color = -1;
 60 | 	
 61 | 	left = NULL;
 62 | 	right = NULL;
 63 | }
 64 | 
 65 | node::node(int index, int sat, int deg, int col, node *L, node *R){
 66 | 	key = index;
 67 | 	saturation = sat;
 68 | 	degree = deg;
 69 | 	color = col;
 70 | 	left = L;
 71 | 	right = R;
 72 | }
 73 | 
 74 | 
 75 | int node::getKey(){
 76 | 	return key;
 77 | }
 78 | 
 79 | int node::getSaturation(){
 80 | 	return saturation;
 81 | }
 82 | 
 83 | int node::getDegree(){
 84 | 	return degree;
 85 | }
 86 | 
 87 | int node::getColor(){
 88 | 	return color;
 89 | }
 90 | 
 91 | node* node::getLeft(){
 92 | 	return left;
 93 | }
 94 | 
 95 | node* node::getRight(){
 96 | 	return right;
 97 | }
 98 | 
 99 | 
100 | void node::setKey(int index){
101 | 	key = index;
102 | }
103 | 
104 | void node::setSaturation(int sat){
105 | 	saturation = sat;
106 | }
107 | 
108 | void node::setDegree(int deg){
109 | 	degree = deg;
110 | }
111 | 
112 | void node::setKSD(int index, int sat, int deg){
113 | 	key = index;
114 | 	saturation = sat;
115 | 	degree = deg;
116 | 	color = -1;
117 | 	left = NULL;
118 | 	right = NULL;
119 | }
120 | 
121 | void node::setColor(int c){
122 | 	color = c;
123 | }
124 | 
125 | void node::setLeft(node *L){
126 | 	left = L;
127 | }
128 | 
129 | void node::setRight(node *R){
130 | 	right = R;
131 | }
132 | 
133 | 
134 | void node::displayNode(){
135 | 	cout <<  key  << " :  Sat: " << saturation << " , Deg: " << degree << endl; 
136 | }
137 | 
138 | 
139 | node::~node(){
140 | 	left = NULL;
141 | 	right = NULL;
142 | }
143 | 
144 | 
145 | 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | tree::tree(){
152 | 	top = NULL;
153 | }
154 | 
155 | void tree::insert(node *x){
156 | 	node *current, *previous;
157 | 	bool left;
158 | 	
159 | 	if (top == NULL)
160 | 		top = x;
161 | 	else 
162 | 	{
163 | 		current = top;
164 | 
165 | 		// Check to see where to insert
166 | 		while (current != NULL){
167 | 			previous = current;
168 | 			
169 | 			if (current->getSaturation() < x->getSaturation()){				
170 | 				current = current->getRight();
171 | 				left = false;
172 | 			}
173 | 			else 
174 | 				if (current->getSaturation() > x->getSaturation()){
175 | 					current = current->getLeft();
176 | 					left = true;
177 | 				}
178 | 				else 
179 | 					if (current->getDegree() < x->getDegree()){
180 | 						current = current->getRight();
181 | 						left = false;
182 | 					}
183 | 					else 
184 | 						if (current->getDegree() >= x->getDegree()){
185 | 							current = current->getLeft();
186 | 							left = true;
187 | 						}
188 | 		}
189 | 		
190 | 		// Insert item
191 | 		if (left == true)
192 | 			previous->setLeft(x);
193 | 		else 
194 | 			previous->setRight(x);
195 | 	}
196 | }
197 | 
198 | 
199 | node* tree::findNode(int index, int saturation, int degree){
200 | 	node *current, *previous;
201 | 	
202 | 	current = top;
203 | 	
204 | 	while ((current != NULL) && (current->getKey() != index)){
205 | 		previous = current;
206 | 		
207 | 		if (current->getSaturation() < saturation)	
208 | 			current = current->getRight();
209 | 		else 
210 | 			if (current->getSaturation() > saturation)
211 | 				current = current->getLeft();
212 | 			else 
213 | 				if (current->getDegree() < degree)
214 | 					current = current->getRight();
215 | 				else 
216 | 					if (current->getDegree() >= degree)
217 | 						current = current->getLeft();
218 | 	}
219 | 	
220 | 	return current;
221 | }
222 | 
223 | node* tree::remove(int index, int saturation, int degree){
224 | 	node *current, *previous, *parent, *nodeToDel;
225 | 	bool left, parentLeft;
226 | 	parent = previous = current = top;
227 | 	//node blank;
228 | 	
229 | 	if (top == NULL){
230 | 		cout << "Tree is empty!!!" << endl;
231 | 	}
232 | 	else{
233 | 		// step1: find the node
234 | 		while ((current != NULL) && (current->getKey() != index)){
235 | 			previous = current;
236 | 			
237 | 			if (current->getSaturation() < saturation){				
238 | 				current = current->getRight();
239 | 				left = false;
240 | 			}
241 | 			else 
242 | 				if (current->getSaturation() > saturation){
243 | 					current = current->getLeft();
244 | 					left = true;
245 | 				}
246 | 				else 
247 | 					if (current->getDegree() < degree){
248 | 						current = current->getRight();
249 | 						left = false;
250 | 					}
251 | 					else 
252 | 						if (current->getDegree() >= degree){
253 | 							current = current->getLeft();
254 | 							left = true;
255 | 						}
256 | 		}
257 | 		
258 | 		
259 | 		// Not found!!!
260 | 		if (current == NULL){
261 | 			cout << "Node not found!!!" << endl;
262 | 			return NULL;
263 | 		}
264 | 		
265 | 		
266 | 		
267 | 		// Replace
268 | 		parent = previous;
269 | 		parentLeft = left;
270 | 		nodeToDel = current;
271 | 		
272 | 	//	blank.setKey(nodeToDel->getKey());
273 | 	//	blank.setSaturation(nodeToDel->getSaturation());
274 | 	//	blank.setDegree(nodeToDel->getDegree());
275 | 		
276 | 		
277 | 		// Option 1: A leaf; replace by nothing!!!
278 | 		if ((current->getLeft() == NULL) && (current->getRight() == NULL)){
279 | 			if (parentLeft == true)
280 | 				parent->setLeft(NULL);
281 | 			else 
282 | 				parent->setRight(NULL);
283 | 			
284 | 			if (top == nodeToDel)
285 | 				top = NULL;
286 | 			
287 | 			return nodeToDel;
288 | 		}
289 | 		
290 | 		//Option 2: Node had only 1 child
291 | 		if ((current->getLeft() == NULL) || (current->getRight() == NULL)){
292 | 			if (current->getLeft() == NULL) 
293 | 				current = current->getRight();
294 | 			else 
295 | 				current = current->getLeft();
296 | 			
297 | 			if (top == nodeToDel)
298 | 				top = current;
299 | 			else
300 | 				if (parentLeft == true)
301 | 					parent->setLeft(current);
302 | 				else 
303 | 					parent->setRight(current);
304 | 			
305 | 			return nodeToDel;
306 | 		}
307 | 		
308 | 		
309 | 		
310 | 		//Option 3: Node had 2 Children - the painful one: replace by node slightly biggest (normally the rightmost of the left node)
311 | 		previous = current;
312 | 		current = current->getLeft();
313 | 		
314 | 		if (current->getRight() == NULL){
315 | 			if (top == nodeToDel)
316 | 				top = current;
317 | 			else
318 | 				if (parentLeft == true)
319 | 					parent->setLeft(current);
320 | 				else 
321 | 					parent->setRight(current);
322 | 			
323 | 			current->setRight(nodeToDel->getRight());
324 | 		}
325 | 		else{
326 | 			while (current->getRight() != NULL){
327 | 				previous = current;
328 | 				current = current->getRight();
329 | 			}
330 | 			
331 | 			
332 | 			if (current->getLeft() == NULL)	// replaced node is a leaf
333 | 				previous->setRight(NULL);
334 | 			else 
335 | 				previous->setRight(current->getLeft());	// node has left children
336 | 			
337 | 			
338 | 			current->setLeft(nodeToDel->getLeft());
339 | 			current->setRight(nodeToDel->getRight());
340 | 			
341 | 			if (top == nodeToDel)
342 | 				top = current;
343 | 			else
344 | 				if (parentLeft == true)
345 | 					parent->setLeft(current);
346 | 				else 
347 | 					parent->setRight(current);
348 | 		}
349 | 		
350 | 		return nodeToDel;
351 | 	}
352 | }
353 | 
354 | 
355 | void tree::findBiggest(int &index, int &saturation, int &degree){
356 | 	node *current, temp;
357 | 	
358 | 	current = top;
359 | 	
360 | 	while (current->getRight() != NULL){
361 | 		current = current->getRight();
362 | 	}
363 | 	
364 | 	index = current->getKey();
365 | 	saturation = current->getSaturation();
366 | 	degree = current->getDegree();
367 | }
368 | 
369 | 
370 | 
371 | void tree::displayTreeRML(node *current){
372 | 	
373 | 	if (current != NULL) 
374 | 	{
375 | 		displayTreeRML(current->getRight());
376 | 		current->displayNode();
377 | 		displayTreeRML(current->getLeft());
378 | 	}
379 | }
380 | 
381 | void tree::displayTreeLMR(node *current){
382 | 	
383 | 	if (current != NULL) 
384 | 	{
385 | 		displayTreeLMR(current->getLeft());
386 | 		current->displayNode();
387 | 		displayTreeLMR(current->getRight());
388 | 	}
389 | }
390 | 
391 | void tree::displayTreeMLR(node *current){
392 | 	
393 | 	if (current != NULL) 
394 | 	{
395 | 		current->displayNode();
396 | 		displayTreeMLR(current->getLeft());
397 | 		displayTreeMLR(current->getRight());
398 | 	}
399 | }
400 | 
401 | 
402 | node* tree::getTop(){
403 | 	return top;
404 | }
405 | 
406 | tree::~tree(){
407 | 	top = NULL;
408 | }
409 | 
410 | 


--------------------------------------------------------------------------------
/src/serial/greedy.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include <iostream>
  6 | #include <fstream>
  7 | #include <string>
  8 | #include <vector>
  9 | #include <set>
 10 | #include <sstream>
 11 | #include <string.h>
 12 | #include <time.h>
 13 | #include <assert.h>
 14 | #include <sys/time.h>
 15 | using namespace std;
 16 | #include "graph_io.h"
 17 | /*
 18 | // transfer *.graph file to CSR format
 19 | void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
 20 | 	printf("Reading .graph input file %s\n", graph);
 21 | 	std::ifstream cfile;
 22 | 	cfile.open(graph);
 23 | 	std::string str;
 24 | 	getline(cfile, str);
 25 | 	sscanf(str.c_str(), "%d %d", &m, &nnz);
 26 | 	printf("num_vertices %d  num_edges %d\n", m, nnz);
 27 | 	vector<set<int> > svector;
 28 | 	set<int> s;
 29 | 	for (int i = 0; i < m; i++)
 30 | 		svector.push_back(s);
 31 | 	int dst;
 32 | 	for (int i = 0; i < m; i++) {
 33 | 		getline(cfile, str);
 34 | 		istringstream istr;
 35 | 		istr.str(str);
 36 | 		while(istr>>dst) {
 37 | 			dst --;
 38 | 			svector[i].insert(dst);
 39 | 			svector[dst].insert(i);
 40 | 		}
 41 | 		istr.clear();
 42 | 	}
 43 |     cfile.close();
 44 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
 45 | 	int count = 0;
 46 | 	for (int i = 0; i < m; i++) {
 47 | 		csrRowPtr[i] = count;
 48 | 		count += svector[i].size();
 49 | 	}
 50 | 	csrRowPtr[m] = count;
 51 | 	if (count != nnz) {
 52 | 		printf("The graph is not symmetric\n");
 53 | 		nnz = count;
 54 | 	}
 55 | 	double avgdeg;
 56 | 	double variance = 0.0;
 57 | 	int maxdeg = 0;
 58 | 	int mindeg = m;
 59 | 	avgdeg = (double)nnz / m;
 60 | 	for (int i = 0; i < m; i++) {
 61 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
 62 | 		if (deg_i > maxdeg)
 63 | 			maxdeg = deg_i;
 64 | 		if (deg_i < mindeg)
 65 | 			mindeg = deg_i;
 66 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
 67 | 	}
 68 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
 69 | 	csrColInd = (int *)malloc(count * sizeof(int));
 70 | 	set<int>::iterator site;
 71 | 	for (int i = 0, index = 0; i < m; i++) {
 72 | 		site = svector[i].begin();
 73 | 		while (site != svector[i].end()) {
 74 | 			csrColInd[index++] = *site;
 75 | 			site++;
 76 | 		}
 77 | 	}
 78 | }
 79 | 
 80 | void gr2csr(char *gr, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
 81 | 	printf("Reading RMAT (.gr) input file %s\n", gr);
 82 | 	std::ifstream cfile;
 83 | 	cfile.open(gr);
 84 | 	std::string str;
 85 | 	getline(cfile, str);
 86 | 	char c;
 87 | 	sscanf(str.c_str(), "%c", &c);
 88 | 	while (c == 'c') {
 89 | 		getline(cfile, str);
 90 | 		sscanf(str.c_str(), "%c", &c);
 91 | 	}
 92 | 	char sp[3];
 93 | 	sscanf(str.c_str(), "%c %s %d %d", &c, sp, &m, &nnz);
 94 | 	printf("num_vertices %d  num_edges %d\n", m, nnz);
 95 | 	//printf("%c %s %d %d\n", c, sp, m, nnz);
 96 | 	vector<set<int> > svector;
 97 | 	set<int> s;
 98 | 	for (int i = 0; i < m; i++)
 99 | 		svector.push_back(s);
100 | 	int dst, src;
101 | 	for (int i = 0; i < nnz; i++) {
102 | 		getline(cfile, str);
103 | 		sscanf(str.c_str(), "%c %d %d", &c, &src, &dst);
104 | 		if (c != 'a')
105 | 			printf("line %d\n", __LINE__);
106 | 		dst--;
107 | 		src--;
108 | 		svector[src].insert(dst);
109 | 		svector[dst].insert(src);
110 | 	}
111 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
112 | 	int count = 0;
113 | 	for (int i = 0; i < m; i++) {
114 | 		csrRowPtr[i] = count;
115 | 		count += svector[i].size();
116 | 	}
117 | 	csrRowPtr[m] = count;
118 | 	if (count != nnz) {
119 | 		printf("The graph is not symmetric\n");
120 | 		nnz = count;
121 | 	}
122 | 	double avgdeg;
123 | 	double variance = 0.0;
124 | 	int maxdeg = 0;
125 | 	int mindeg = m;
126 | 	avgdeg = (double)nnz / m;
127 | 	for (int i = 0; i < m; i++) {
128 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
129 | 		if (deg_i > maxdeg)
130 | 			maxdeg = deg_i;
131 | 		if (deg_i < mindeg)
132 | 			mindeg = deg_i;
133 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
134 | 	}
135 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
136 | 	csrColInd = (int *)malloc(count * sizeof(int));
137 | 	set<int>::iterator site;
138 | 	for (int i = 0, index = 0; i < m; i++) {
139 | 		site = svector[i].begin();
140 | 		while (site != svector[i].end()) {
141 | 			csrColInd[index++] = *site;
142 | 			site++;
143 | 		}
144 | 	}
145 | }
146 | 
147 | void mtx2csr(char *mtx, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
148 | 	printf("Reading .mtx input file %s\n", mtx);
149 | 	std::ifstream cfile;
150 | 	cfile.open(mtx);
151 | 	std::string str;
152 | 	getline(cfile, str);
153 | 	char c;
154 | 	sscanf(str.c_str(), "%c", &c);
155 | 	while (c == '%') {
156 | 		getline(cfile, str);
157 | 		sscanf(str.c_str(), "%c", &c);
158 | 	}
159 | 	int n;
160 | 	sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz);
161 | 	if (m != n) {
162 | 		printf("error!\n");
163 | 		exit(0);
164 | 	}
165 | 	printf("num_vertices %d  num_edges %d\n", m, nnz);
166 | 	vector<set<int> > svector;
167 | 	set<int> s;
168 | 	for (int i = 0; i < m; i++)
169 | 		svector.push_back(s);
170 | 	int dst, src;
171 | 	for (int i = 0; i < nnz; i++) {
172 | 		getline(cfile, str);
173 | 		sscanf(str.c_str(), "%d %d", &dst, &src);
174 | 		dst--;
175 | 		src--;
176 | 		svector[src].insert(dst);
177 | 		svector[dst].insert(src);
178 | 	}
179 | 	cfile.close();
180 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
181 | 	int count = 0;
182 | 	for (int i = 0; i < m; i++) {
183 | 		csrRowPtr[i] = count;
184 | 		count += svector[i].size();
185 | 	}
186 | 	csrRowPtr[m] = count;
187 | 	if (count != nnz) {
188 | 		printf("The graph is not symmetric\n");
189 | 		nnz = count;
190 | 	}
191 | 	double avgdeg;
192 | 	double variance = 0.0;
193 | 	int maxdeg = 0;
194 | 	int mindeg = m;
195 | 	avgdeg = (double)nnz / m;
196 | 	for (int i = 0; i < m; i++) {
197 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
198 | 		if (deg_i > maxdeg)
199 | 			maxdeg = deg_i;
200 | 		if (deg_i < mindeg)
201 | 			mindeg = deg_i;
202 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
203 | 	}
204 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
205 | 	csrColInd = (int *)malloc(count * sizeof(int));
206 | 	set<int>::iterator site;
207 | 	for (int i = 0, index = 0; i < m; i++) {
208 | 		site = svector[i].begin();
209 | 		while (site != svector[i].end()) {
210 | 			csrColInd[index++] = *site;
211 | 			site++;
212 | 		}
213 | 	}
214 | }
215 | 
216 | void write_solution(char *fname, int *coloring, int n) {
217 | 	int i;
218 | 	FILE *fp;
219 | 	fp = fopen(fname, "w");
220 | 	for (i = 0; i < n; i++) {
221 | 		fprintf(fp, "%d\n", coloring[i]);
222 | 	}
223 | 	fclose(fp);
224 | }
225 | 
226 | void verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) {
227 | 	int i, offset, neighbor_j;
228 | 	for (i = 0; i < m; i++) {
229 | 		for (offset = csrRowPtr[i]; offset < csrRowPtr[i + 1]; offset++) {
230 | 			neighbor_j = csrColInd[offset];
231 | 			if (coloring[i] == coloring[neighbor_j] && neighbor_j != i) {
232 | 				*correct = 0;
233 | 				printf("coloring[%d] = coloring[%d] = %d\n", i, neighbor_j, coloring[i]);
234 | 				return;
235 | 			}
236 | 		}	
237 | 	}
238 | }
239 | //*/
240 | 
// Returns the current wall-clock time in seconds as a double, built from the
// seconds and microseconds reported by gettimeofday().
// A failing gettimeofday() is reported on stdout; the value is still returned.
double rtclock() {
    struct timeval Tp;
    // The timezone argument of gettimeofday() is obsolete and its use is
    // unspecified by POSIX, so pass NULL instead of a struct timezone.
    int stat = gettimeofday (&Tp, NULL);
    if (stat != 0) printf("Error return from gettimeofday: %d",stat);
    return(Tp.tv_sec + Tp.tv_usec*1.0e-6);
}
249 | 
#define MAXCOLOR 128
// Sequential first-fit greedy colouring of a graph given in CSR form.
//   m          - number of vertices
//   nnz        - number of stored adjacency entries (not used by the algorithm)
//   csrRowPtr  - m + 1 row offsets into csrColInd
//   csrColInd  - neighbour indices
//   ncolors    - out: number of colours used
//   coloring   - in/out: one entry per vertex; 0 means "not coloured yet" and
//                every entry must be 0 on entry (main() memsets it). Colours
//                written are 1, 2, 3, ...
// MAXCOLOR is only the initial size of the bookkeeping table: the table grows
// on demand, so graphs that need MAXCOLOR or more colours are handled instead
// of tripping an assert (or overflowing the table in NDEBUG builds).
void FirstFit(int m, int nnz, int *csrRowPtr, int *csrColInd, int *ncolors, int *coloring) {
	(void)nnz;
	int max_color = 1;	// one past the largest colour handed out so far
	// forbiddenColors[c] == v means colour c is taken by a neighbour of v.
	// Invariant: forbiddenColors.size() > max_color, so every colour that has
	// been assigned is a valid index.
	std::vector<int> forbiddenColors(MAXCOLOR + 1, -1);
	for (int vertex = 0; vertex < m; vertex++) {
		int row_begin = csrRowPtr[vertex];
		int row_end = csrRowPtr[vertex + 1];
		// Mark the colours of all neighbours as forbidden for this vertex.
		for (int offset = row_begin; offset < row_end; offset++) {
			int neighbor = csrColInd[offset];
			forbiddenColors[coloring[neighbor]] = vertex;
		}
		// Pick the smallest colour >= 1 that no neighbour uses.
		int vertex_color = 1;
		while (vertex_color < max_color && forbiddenColors[vertex_color] == vertex)
			vertex_color++;
		if (vertex_color == max_color) {
			// A brand-new colour is needed; grow the table if necessary.
			max_color++;
			if ((size_t)max_color >= forbiddenColors.size())
				forbiddenColors.resize(2 * forbiddenColors.size(), -1);
		}
		coloring[vertex] = vertex_color;
	}
	*ncolors = max_color - 1;
}
274 | 
275 | int main(int argc, char *argv[]) {
276 | 	int iteration = 0;
277 | 	int m, nnz, *csrRowPtr = NULL, *csrColInd = NULL;
278 | 	if (strstr(argv[1], ".mtx"))
279 | 		mtx2csr(argv[1], m, nnz, csrRowPtr, csrColInd);
280 | 	else if (strstr(argv[1], ".graph"))
281 | 		graph2csr(argv[1], m, nnz, csrRowPtr, csrColInd);
282 | 	else if (strstr(argv[1], ".gr"))
283 | 		gr2csr(argv[1], m, nnz, csrRowPtr, csrColInd);
284 | 	else
285 | 		{ printf("Unrecognizable input file format\n"); exit(0); }
286 | 	int ncolors, *coloring, correct;
287 | 	ncolors = 0;
288 | 	coloring = (int *)calloc(m, sizeof(int));
289 | 	correct = 1;
290 | 	double starttime, endtime;
291 | 	double runtime[10];
292 | 	int colors[10];
293 | 	for (int i = 0; i < 10; i++) {
294 | 		memset(coloring, 0, m * sizeof(int));	
295 | 		starttime = rtclock();
296 | 		FirstFit(m, nnz, csrRowPtr, csrColInd, &ncolors, coloring);
297 | 		endtime = rtclock();
298 | 		runtime[i] = (1000.0f) * (endtime - starttime);
299 | 		colors[i] = ncolors;
300 | 	}
301 | 	double total_time = 0;
302 | 	int total_colors = 0;
303 | 	double avg_time;
304 | 	double avg_colors;
305 | 	for (int i = 0; i < 10; i++) {
306 | 		printf("[%.2f %d] ", runtime[i], colors[i]);
307 | 		total_time += runtime[i];
308 | 		total_colors += colors[i];
309 | 	}
310 | 	printf("\navg_time %f ms, avg_colors %.2f\n", total_time / 10, (double)total_colors / 10);
311 | 	write_solution("color.txt", coloring, m);
312 | 	verify(m, nnz, csrRowPtr, csrColInd, coloring, &correct);
313 | 	if (correct)
314 | 		printf("correct.\n");
315 | 	else
316 | 		printf("incorrect.\n");
317 | 	return 0;
318 | }
319 | 


--------------------------------------------------------------------------------
/src/omp/main.cc:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include <iostream>
  6 | #include <fstream>
  7 | #include <sstream>
  8 | #include <cassert>
  9 | #include <inttypes.h>
 10 | #include <fcntl.h>
 11 | #include <unistd.h>
 12 | #include <sys/stat.h>
 13 | #include <sys/mman.h>
 14 | #include <vector>
 15 | #include <set>
 16 | #include "common.h"
 17 | #include "worklist.h"
 18 | typedef unsigned foru;
 19 | #include "graph.h"
 20 | int num_omp_threads;
 21 | using namespace std;
 22 | //#include "kernel1.h"
 23 | #include "kernel2.h"
 24 | 
 25 | void gr2csr(char *gr, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
 26 | 	printf("Reading RMAT (.gr) input file %s\n", gr);
 27 | 	std::ifstream cfile;
 28 | 	cfile.open(gr);
 29 | 	std::string str;
 30 | 	getline(cfile, str);
 31 | 	char c;
 32 | 	sscanf(str.c_str(), "%c", &c);
 33 | 	while (c == 'c') {
 34 | 		getline(cfile, str);
 35 | 		sscanf(str.c_str(), "%c", &c);
 36 | 	}
 37 | 	char sp[3];
 38 | 	sscanf(str.c_str(), "%c %s %d %d", &c, sp, &m, &nnz);
 39 | 	printf("num_vertices %d num_edges %d\n", m, nnz);
 40 | 	vector<set<int> > svector;
 41 | 	set<int> s;
 42 | 	for (int i = 0; i < m; i++)
 43 | 		svector.push_back(s);
 44 | 	int dst, src;
 45 | 	for (int i = 0; i < nnz; i++) {
 46 | 		getline(cfile, str);
 47 | 		sscanf(str.c_str(), "%c %d %d", &c, &src, &dst);
 48 | 
 49 | 		if (c != 'a')
 50 | 			printf("line %d\n", __LINE__);
 51 | 		dst--;
 52 | 		src--;
 53 | 		svector[src].insert(dst);
 54 | 		svector[dst].insert(src);
 55 | 	}
 56 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
 57 | 	int count = 0;
 58 | 	for (int i = 0; i < m; i++) {
 59 | 		csrRowPtr[i] = count;
 60 | 		count += svector[i].size();
 61 | 	}
 62 | 	csrRowPtr[m] = count;
 63 | 	nnz = count;
 64 | 	double avgdeg;
 65 | 	double variance = 0.0;
 66 | 	int maxdeg = 0;
 67 | 	int mindeg = m;
 68 | 	avgdeg = (double)nnz / m;
 69 | 	for (int i = 0; i < m; i++) {
 70 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
 71 | 		if (deg_i > maxdeg)
 72 | 			maxdeg = deg_i;
 73 | 		if (deg_i < mindeg)
 74 | 			mindeg = deg_i;
 75 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
 76 | 	}
 77 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
 78 | 	csrColInd = (int *)malloc(count * sizeof(int));
 79 | 	set<int>::iterator site;
 80 | 	for (int i = 0, index = 0; i < m; i++) {
 81 | 		site = svector[i].begin();
 82 | 		while (site != svector[i].end()) {
 83 | 			csrColInd[index++] = *site;
 84 | 			site++;
 85 | 		}
 86 | 	}
 87 | }
 88 | 
 89 | 
 90 | // transfer *.graph file to CSR format
 91 | void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
 92 | 	printf("Reading .graph input file %s\n", graph);
 93 | 	std::ifstream cfile;
 94 | 	cfile.open(graph);
 95 | 	std::string str;
 96 | 	getline(cfile, str);
 97 | 	sscanf(str.c_str(), "%d %d", &m, &nnz);
 98 | 	printf("num_vertices %d num_edges %d\n", m, nnz);
 99 | 	vector<set<int> > svector;
100 | 	set<int> s;
101 | 	for (int i = 0; i < m; i++)
102 | 		svector.push_back(s);
103 | 	int dst;
104 | 	for (int i = 0; i < m; i++) {
105 | 		getline(cfile, str);
106 | 		istringstream istr;
107 | 		istr.str(str);
108 | 		while(istr>>dst) {
109 | 			dst --;
110 | 			svector[i].insert(dst);
111 | 			svector[dst].insert(i);
112 | 		}
113 | 		istr.clear();
114 | 	}
115 | 	cfile.close();
116 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
117 | 	int count = 0;
118 | 	for (int i = 0; i < m; i++) {
119 | 		csrRowPtr[i] = count;
120 | 		count += svector[i].size();
121 | 	}
122 | 	csrRowPtr[m] = count;
123 | 	if (count != nnz) {
124 | 		printf("The graph is not symmetric\n");
125 | 		nnz = count;
126 | 	}
127 | 	double avgdeg;
128 | 	double variance = 0.0;
129 | 	int maxdeg = 0;
130 | 	int mindeg = m;
131 | 	avgdeg = (double)nnz / m;
132 | 	for (int i = 0; i < m; i++) {
133 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
134 | 		if (deg_i > maxdeg)
135 | 			maxdeg = deg_i;
136 | 		if (deg_i < mindeg)
137 | 			mindeg = deg_i;
138 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
139 | 	}
140 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
141 | 	csrColInd = (int *)malloc(count * sizeof(int));
142 | 	set<int>::iterator site;
143 | 	for (int i = 0, index = 0; i < m; i++) {
144 | 		site = svector[i].begin();
145 | 		while (site != svector[i].end()) {
146 | 			csrColInd[index++] = *site;
147 | 			site++;
148 | 		}
149 | 	}
150 | }
151 | 
// Reads a Matrix Market style ".mtx" text file (leading lines starting with
// '%', one size line "<rows> <cols> <entries>", then one "<i> <j> ..." line
// per entry with 1-based indices) and builds a symmetrised CSR structure.
//   mtx        - path of the input file
//   m          - out: number of vertices (rows; must equal cols)
//   nnz        - out: number of CSR entries (replaced by the real count when
//                it differs from the header value)
//   csrRowPtr  - out: malloc'ed array of m + 1 row offsets
//   csrColInd  - out: malloc'ed array of nnz sorted neighbour ids (0-based)
// Exits with status 1 if the file cannot be opened, the size line cannot be
// parsed or memory runs out, and with status 0 (as before) for a non-square
// matrix. Entries that do not parse or are out of range are reported and
// skipped.
void mtx2csr(char *mtx, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
	printf("Reading (.mtx) input file %s\n", mtx);
	std::ifstream cfile;
	cfile.open(mtx);
	if (!cfile.is_open()) {
		printf("error: unable to open %s\n", mtx);
		exit(1);
	}
	std::string str;
	// Skip the banner/comment lines; stop at the first line not starting with '%'.
	bool have_size_line = false;
	while (getline(cfile, str)) {
		char c = '%';
		if (sscanf(str.c_str(), "%c", &c) == 1 && c != '%') {
			have_size_line = true;
			break;
		}
	}
	int n = 0;
	m = 0;
	nnz = 0;
	if (!have_size_line ||
	    sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz) != 3 ||
	    m < 0 || nnz < 0) {
		printf("error: malformed size line in %s\n", mtx);
		exit(1);
	}
	if (m != n) {
		printf("error!\n");
		exit(0);
	}
	printf("num_vertices %d num_edges %d\n", m, nnz);
	// One ordered neighbour set per vertex: removes duplicates, sorts columns.
	std::vector<std::set<int> > svector(m);
	int dst, src;
	for (int i = 0; i < nnz; i++) {
		if (!getline(cfile, str)) {
			printf("warning: %s ends after %d of %d entries\n", mtx, i, nnz);
			break;
		}
		if (sscanf(str.c_str(), "%d %d", &dst, &src) != 2) {
			printf("warning: skipping unparsable entry %d\n", i);
			continue;
		}
		dst--;	// file indices are 1-based
		src--;
		if (src < 0 || src >= m || dst < 0 || dst >= m) {
			printf("warning: skipping out-of-range entry %d\n", i);
			continue;
		}
		// Store both directions so the adjacency is symmetric.
		svector[src].insert(dst);
		svector[dst].insert(src);
	}
	cfile.close();
	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
	if (csrRowPtr == NULL) {
		printf("error: out of memory\n");
		exit(1);
	}
	int count = 0;
	for (int i = 0; i < m; i++) {
		csrRowPtr[i] = count;
		count += (int)svector[i].size();
	}
	csrRowPtr[m] = count;
	if (count != nnz) {
		printf("This graph is not symmetric\n");
		nnz = count;
	}
	// Degree statistics (informational only).
	double variance = 0.0;
	int maxdeg = 0;
	int mindeg = m;
	double avgdeg = (m > 0) ? (double)nnz / m : 0.0;
	for (int i = 0; i < m; i++) {
		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
		if (deg_i > maxdeg)
			maxdeg = deg_i;
		if (deg_i < mindeg)
			mindeg = deg_i;
		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
	}
	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
	csrColInd = (int *)malloc(count * sizeof(int));
	if (csrColInd == NULL && count > 0) {
		printf("error: out of memory\n");
		exit(1);
	}
	int index = 0;
	for (int i = 0; i < m; i++) {
		for (std::set<int>::iterator site = svector[i].begin(); site != svector[i].end(); ++site)
			csrColInd[index++] = *site;
	}
}
220 | 
// Checks a colouring against a CSR graph.
//   m          - number of vertices
//   nnz        - number of CSR entries (not used)
//   csrRowPtr  - m + 1 row offsets into csrColInd
//   csrColInd  - neighbour indices
//   coloring   - one colour per vertex
//   correct    - set to 0 when a conflict is found; never set to 1 here
// For every vertex the first neighbour (other than the vertex itself) that
// shares its colour is printed, then the scan moves on to the next vertex.
void verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) {
	(void)nnz;
	for (int v = 0; v < m; ++v) {
		for (int e = csrRowPtr[v]; e < csrRowPtr[v + 1]; ++e) {
			const int u = csrColInd[e];
			// Self loops and differently coloured neighbours are fine.
			if (u == v || coloring[v] != coloring[u])
				continue;
			*correct = 0;
			printf("coloring[%d] = coloring[%d] = %d\n", v, u, coloring[v]);
			break;	// one report per vertex is enough
		}
	}
}
234 | 
// Writes the colouring to a text file, one colour per line.
//   fname    - path of the output file (created or truncated)
//   nnodes   - number of entries in coloring
//   coloring - colour of each vertex
// If the file cannot be opened an error is printed and nothing is written.
void write_solution(char *fname, int nnodes, int *coloring) {
	FILE *fp = fopen(fname, "w");
	// fopen returns NULL on failure; writing through it would crash.
	if (fp == NULL) {
		printf("error: unable to open %s for writing\n", fname);
		return;
	}
	for (int i = 0; i < nnodes; i++) {
		fprintf(fp, "%d\n", coloring[i]);
	}
	fclose(fp);
}
243 | 
// Converts a Matrix Market style ".mtx" file into an edge-list text file.
// The output starts with "<vertices> <edge count>" followed by one
// "<src> <dst>" line per directed edge (0-based ids, both directions of every
// input entry, duplicates removed, sorted by src then dst).
//   mtx   - path of the input file
//   edges - path of the output file (created or truncated)
// Exits with status 1 if either file cannot be opened or the size line cannot
// be parsed, and with status 0 (as before) for a non-square matrix. Entries
// that do not parse or are out of range are reported and skipped.
void mtx2edges(char *mtx, char *edges) {
	std::ifstream cfile;
	cfile.open(mtx);
	if (!cfile.is_open()) {
		printf("error: unable to open %s\n", mtx);
		exit(1);
	}
	std::string str;
	// Skip the banner/comment lines; stop at the first line not starting with '%'.
	bool have_size_line = false;
	while (getline(cfile, str)) {
		char c = '%';
		if (sscanf(str.c_str(), "%c", &c) == 1 && c != '%') {
			have_size_line = true;
			break;
		}
	}
	int m = 0, n = 0, nnz = 0;
	if (!have_size_line ||
	    sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz) != 3 ||
	    m < 0 || nnz < 0) {
		printf("error: malformed size line in %s\n", mtx);
		exit(1);
	}
	if (m != n) {
		printf("error!\n");
		exit(0);
	}
	// One ordered neighbour set per vertex: removes duplicates, sorts output.
	std::vector<std::set<int> > svector(m);

	FILE *fp = fopen(edges, "w");
	if (fp == NULL) {
		printf("error: unable to open %s for writing\n", edges);
		exit(1);
	}

	int dst, src;
	for (int i = 0; i < nnz; i++) {
		if (!getline(cfile, str)) {
			printf("warning: %s ends after %d of %d entries\n", mtx, i, nnz);
			break;
		}
		if (sscanf(str.c_str(), "%d %d", &dst, &src) != 2) {
			printf("warning: skipping unparsable entry %d\n", i);
			continue;
		}
		dst--;	// file indices are 1-based
		src--;
		if (src < 0 || src >= m || dst < 0 || dst >= m) {
			printf("warning: skipping out-of-range entry %d\n", i);
			continue;
		}
		svector[src].insert(dst);
		svector[dst].insert(src);
	}
	cfile.close();
	int count = 0;
	for (int i = 0; i < m; i++) {
		count += (int)svector[i].size();
	}
	fprintf(fp, "%d %d\n", m, count);
	for (int i = 0; i < m; i++) {
		for (std::set<int>::iterator site = svector[i].begin(); site != svector[i].end(); ++site)
			fprintf(fp, "%d %d\n", i, *site);
	}
	fclose(fp);
}
293 | 
294 | void verify(Graph &graph, int *coloring, int *correct) {
295 | 	int nnodes = graph.nnodes;
296 | 	int i, j, neighbors, neighbor_j;	
297 | 	for (i = 0; i < nnodes; i++) {
298 | 		neighbors = graph.noutgoing[i];
299 | 		for (j = 0; j < neighbors; j++) {
300 | 			neighbor_j = graph.edgessrcdst[graph.psrc[i] + j];
301 | 			if (coloring[i] == coloring[neighbor_j] && neighbor_j != i) {
302 | 				*correct = 0;
303 | 				printf("colors[%d] = colors[%d] = %d\n", i, neighbor_j, coloring[i]);
304 | 				break;
305 | 			}
306 | 		}	
307 | 	}
308 | }
309 | 
310 | int main(int argc, char *argv[]) {
311 | 	if (argc != 3) {
312 | 		printf("Usage: %s <nThreads> <graph>\n", argv[0]);
313 | 		exit(1);
314 | 	}
315 | 	num_omp_threads = atoi(argv[1]);
316 | #ifdef ENABLE_OPENMP
317 | 	omp_set_num_threads(num_omp_threads);
318 | 	printf("OpenMP graph coloring by Xuhao Chen, num_omp_threads=%d\n", num_omp_threads);
319 | #endif
320 | 	int m, nnz, *csrRowPtr = NULL, *csrColInd = NULL;
321 | 	if (strstr(argv[2], ".mtx"))
322 | 		mtx2csr(argv[2], m, nnz, csrRowPtr, csrColInd);
323 | 	else if (strstr(argv[2], ".graph"))
324 | 		graph2csr(argv[2], m, nnz, csrRowPtr, csrColInd);
325 | 	else if (strstr(argv[2], ".gr"))
326 | 		gr2csr(argv[2], m, nnz, csrRowPtr, csrColInd);
327 | 	else
328 | 		{ printf("Unrecognizable input file format\n"); exit(0); }
329 | 	int *coloring, correct;
330 | 	coloring = (int *)calloc(m, sizeof(int));
331 | 	correct = 1;
332 | 	color(m, nnz, csrRowPtr, csrColInd, coloring);
333 | 	verify(m, nnz, csrRowPtr, csrColInd, coloring, &correct);
334 | 	if (correct)
335 | 		printf("correct\n");
336 | 	else
337 | 		printf("incorrect\n");
338 | 	write_solution("coloring.txt", m, coloring);
339 | 	return 0;
340 | }
341 | 


--------------------------------------------------------------------------------
/src/omp/graph.h:
--------------------------------------------------------------------------------
  1 | #ifndef LSG_GRAPH
  2 | #define LSG_GRAPH
  3 | 
  4 | #define MYINFINITY	1000000000
  5 | #define DISTANCETHRESHOLD	150
  6 | #define THRESHOLDDEGREE		10
  7 | 
// Graph stored as per-node offsets into flat edge arrays. Slot 0 of the edge
// arrays is reserved ("acts as null"), so edge indices are 1-based.
typedef struct Graph {
	// Where the arrays currently live: init() sets NotAllocated, allocOnHost()
	// sets AllocatedOnHost, and dealloc() dispatches on this value.
	enum {NotAllocated, AllocatedOnHost, AllocatedOnDevice} memory;

	// Loads a graph; dispatches on the file name (".edges" or ".gr").
	unsigned read(char file[]);
	// Declared only; no definition is visible in this header.
	long unsigned cudaCopy(struct Graph &copygraph);
	// Runs optimizeone() followed by optimizetwo().
	unsigned optimize();
	// Declared only; no definitions are visible in this header.
	unsigned printStats();
	void     print();

	Graph();
	~Graph();
	// Resets the members to an empty, unallocated state.
	unsigned init();
	// Allocates the host arrays for the current nnodes / nedges.
	unsigned allocOnHost();
	// Declared only; no definition is visible in this header.
	unsigned allocOnDevice();
	// Releases the storage according to `memory`.
	unsigned dealloc();
	// Frees the arrays obtained by allocOnHost().
	unsigned deallocOnHost();
	// Declared only; no definition is visible in this header.
	unsigned deallocOnDevice();
	// The two reordering passes invoked by optimize().
	unsigned optimizeone();
	unsigned optimizetwo();
	// Declared only; no definitions are visible in this header.
	void allocLevels();
	void freeLevels();
	// Prints a percentage progress indicator for step ii of maxii.
	void progressPrint(unsigned maxii, unsigned ii);
	// Text edge-list reader and binary ".gr" reader used by read().
	unsigned readFromEdges(char file[]);
	unsigned readFromGR(char file[]);
	// Bounds-checked accessors.
	unsigned getOutDegree(unsigned src);
	unsigned getDestination(unsigned src, unsigned nthedge);
	unsigned getFirstEdge(unsigned src);
	foru getWeight(unsigned src, unsigned nthedge);

	// Number of nodes and edges.
	unsigned nnodes, nedges;
	// noutgoing[v]   - out-degree of node v
	// nincoming[v]   - number of edges whose destination is v
	// srcsrc[v]      - node indirection; the readers fill it with the identity
	//                  and getFirstEdge() indexes psrc through it
	// psrc[v]        - 1-based index of the first edge of v (nnodes + 1
	//                  entries; psrc[nnodes] is set to nedges)
	// edgessrcdst[e] - destination of edge e (nedges + 1 entries, slot 0 unused)
	unsigned *noutgoing, *nincoming, *srcsrc, *psrc, *edgessrcdst;
	// edgessrcwt[e] - weight of edge e (nedges + 1 entries, slot 0 unused)
	foru *edgessrcwt;
	// Presumably managed by allocLevels()/freeLevels(), whose definitions are
	// not visible here - TODO confirm.
	unsigned *levels;
	// Reset to 0 by init().
	unsigned source;

	// Heap-allocated single counters, zeroed by allocOnHost().
	unsigned *maxOutDegree, *maxInDegree;
	// Reset by init() to 0 / false. Presumably filled in by printStats() -
	// TODO confirm.
	unsigned diameter;
	bool foundStats;

} Graph;
 48 | 
 49 | unsigned Graph::init() {
 50 | 	noutgoing = nincoming = srcsrc = psrc = edgessrcdst = NULL;
 51 | 	edgessrcwt = NULL;
 52 | 	source = 0;
 53 | 	nnodes = nedges = 0;
 54 | 	memory = NotAllocated;
 55 | 
 56 | 	maxOutDegree = maxInDegree = NULL;
 57 | 	diameter = 0;
 58 | 	foundStats = false;
 59 | 
 60 | 	return 0;
 61 | }
 62 | 
 63 | unsigned Graph::allocOnHost() {
 64 | 	edgessrcdst = (unsigned int *)malloc((nedges+1) * sizeof(unsigned int));	// first entry acts as null.
 65 | 	edgessrcwt = (foru *)malloc((nedges+1) * sizeof(foru));	// first entry acts as null.
 66 | 	psrc = (unsigned int *)calloc(nnodes+1, sizeof(unsigned int));	// init to null.
 67 | 	psrc[nnodes] = nedges;	// last entry points to end of edges, to avoid thread divergence in drelax.
 68 | 	noutgoing = (unsigned int *)calloc(nnodes, sizeof(unsigned int));	// init to 0.
 69 | 	nincoming = (unsigned int *)calloc(nnodes, sizeof(unsigned int));	// init to 0.
 70 | 	srcsrc = (unsigned int *)malloc(nnodes * sizeof(unsigned int));
 71 | 
 72 | 	maxOutDegree = (unsigned *)malloc(sizeof(unsigned));
 73 | 	maxInDegree = (unsigned *)malloc(sizeof(unsigned));
 74 | 	*maxOutDegree = 0;
 75 | 	*maxInDegree = 0;
 76 | 
 77 | 	memory = AllocatedOnHost;
 78 | 	return 0;
 79 | }
 80 | 
 81 | unsigned Graph::deallocOnHost() {
 82 | 	free(noutgoing);
 83 | 	free(nincoming);
 84 | 	free(srcsrc);
 85 | 	free(psrc);
 86 | 	free(edgessrcdst);
 87 | 	free(edgessrcwt);
 88 | 
 89 | 	free(maxOutDegree);
 90 | 	free(maxInDegree);
 91 | 	return 0;
 92 | }
 93 | 
 94 | unsigned Graph::dealloc() {
 95 | 	switch (memory) {
 96 | 		case AllocatedOnHost:
 97 | 			printf("dealloc on host.\n");
 98 | 			deallocOnHost();
 99 | 			break;
100 | 		case AllocatedOnDevice:
101 | 			printf("dealloc on device.\n");
102 | //			deallocOnDevice();
103 | 			break;
104 | 	}
105 | 	return 0;
106 | }
107 | 
108 | Graph::Graph() {
109 | 	init();
110 | }
111 | 
// Destructor: intentionally empty. It does not call dealloc(), so storage
// obtained through allocOnHost() is only released by an explicit dealloc().
Graph::~Graph() {
}
114 | 
115 | //TODO: make optimizations use the graph api.
// First reordering pass: permutes psrc / srcsrc entries in place.
// For each position ii, if the node srcsrc[ii] has outgoing edges and the
// srcsrc entry of its first destination is more than DISTANCETHRESHOLD above
// it, the entries of its destinations are swapped towards `insertindex`.
// The pass stops as soon as insertindex reaches nnodes. Always returns 0.
// NOTE(review): the intent appears to be to bring neighbours close together
// in the node ordering - confirm against the original LonestarGPU sources.
unsigned Graph::optimizeone() {
	unsigned int nvv = nnodes;	// no of vertices to be optimized.
	unsigned int insertindex = 1;	// because ii starts with 0.

	for (unsigned ii = 0; ii < nvv; ++ii) {
		unsigned src = srcsrc[ii];
		unsigned dstindex = psrc[src];	// index of src's first edge
		unsigned degree = noutgoing[src];
		// Only act when src has edges and its first destination is "far".
		if (degree && srcsrc[edgessrcdst[dstindex]] > src + DISTANCETHRESHOLD) {
			unsigned int nee = degree;
			for (unsigned ee = 0; ee < nee; ++ee) {
				unsigned dst = edgessrcdst[dstindex + ee];
				unsigned dstentry = srcsrc[dst];
				// swap insertindex and dst.
				unsigned temp = psrc[insertindex];
				psrc[insertindex] = psrc[dstentry];
				psrc[dstentry] = temp;

				// NOTE(review): this swaps srcsrc[ii] (not srcsrc[insertindex])
				// with srcsrc[dst] - verify that this is intended.
				temp = srcsrc[ii];
				srcsrc[ii] = srcsrc[dst];
				srcsrc[dst] = temp;

				// Stop once every slot has been used as an insertion point.
				if (++insertindex >= nnodes) {
					break;
				}
			}
			if (insertindex >= nnodes) {
				break;
			}
		}
	}
	return 0;
}
149 | 
150 | unsigned Graph::optimizetwo() {
151 | 	// load balance.
152 | 	unsigned int nvv = nnodes / 2;
153 | 	bool firsthalfsmaller = true;
154 | 	unsigned int temp;
155 | 
156 | 	for (unsigned ii = 0; ii < nvv; ++ii) {
157 | 		unsigned one = ii;
158 | 		unsigned two = nvv + ii;
159 | 		unsigned degreeone = noutgoing[one];
160 | 		unsigned degreetwo = noutgoing[two];
161 | 
162 | 		if (degreeone > degreetwo && degreeone - degreetwo > THRESHOLDDEGREE && !firsthalfsmaller || degreetwo > degreeone && degreetwo - degreeone > THRESHOLDDEGREE && firsthalfsmaller) {
163 | 			temp = srcsrc[one];
164 | 			srcsrc[one] = srcsrc[two];
165 | 			srcsrc[two] = temp;
166 | 
167 | 			temp = psrc[one];
168 | 			psrc[one] = psrc[two];
169 | 			psrc[two] = temp;
170 | 			firsthalfsmaller = !firsthalfsmaller;
171 | 		}
172 | 	}
173 | 	return 0;
174 | }
175 | 
176 | unsigned Graph::optimize() {
177 | 	optimizeone();
178 | 	optimizetwo();
179 | 	return 0;
180 | }
181 | 
182 | void Graph::progressPrint(unsigned maxii, unsigned ii) {
183 | 	const unsigned nsteps = 10;
184 | 	unsigned ineachstep = (maxii / nsteps);
185 | 	if(ineachstep == 0) ineachstep = 1;
186 | 	/*if (ii == maxii) {
187 | 		printf("\t100%%\n");
188 | 	} else*/ if (ii % ineachstep == 0) {
189 | 		printf("\t%3d%%\r", ii*100/maxii + 1);
190 | 		fflush(stdout);
191 | 	}
192 | }
193 | 
194 | unsigned Graph::readFromEdges(char file[]) {
195 | 	std::ifstream cfile;
196 | 	cfile.open(file);
197 | 
198 | 	std::string str;
199 | 	getline(cfile, str);
200 | 	sscanf(str.c_str(), "%d %d", &nnodes, &nedges);
201 | 
202 | 	allocOnHost();
203 | 	for (unsigned ii = 0; ii < nnodes; ++ii) {
204 | 		srcsrc[ii] = ii;
205 | 	}
206 | 
207 | 
208 | 	unsigned int prevnode = 0;
209 | 	unsigned int tempsrcnode;
210 | 	unsigned int ncurroutgoing = 0;
211 | 	for (unsigned ii = 0; ii < nedges; ++ii) {
212 | 		getline(cfile, str);
213 | 		sscanf(str.c_str(), "%d %d %d", &tempsrcnode, &edgessrcdst[ii+1], &edgessrcwt[ii+1]);
214 | 		if (prevnode == tempsrcnode) {
215 | 			if (ii == 0) {
216 | 				psrc[tempsrcnode] = ii + 1;
217 | 			}
218 | 			++ncurroutgoing;
219 | 		} else {
220 | 			psrc[tempsrcnode] = ii + 1;
221 | 			if (ncurroutgoing) {
222 | 				noutgoing[prevnode] = ncurroutgoing;
223 | 			}
224 | 			prevnode = tempsrcnode;
225 | 			ncurroutgoing = 1;	// not 0.
226 | 		}
227 | 		++nincoming[edgessrcdst[ii+1]];
228 | 
229 | 		progressPrint(nedges, ii);
230 | 	}
231 | 	noutgoing[prevnode] = ncurroutgoing;	// last entries.
232 | 
233 | 	cfile.close();
234 | 	return 0;
235 | }
236 | 
237 | unsigned Graph::readFromGR(char file[]) {
238 | 	std::ifstream cfile;
239 | 	cfile.open(file);
240 | 
241 | 	// copied from GaloisCpp/trunk/src/FileGraph.h
242 | 	int masterFD = open(file, O_RDONLY);
243 |   	if (masterFD == -1) {
244 | 	printf("FileGraph::structureFromFile: unable to open %s.\n", file);
245 | 	return 1;
246 |   	}
247 | 
248 |   	struct stat buf;
249 | 	int f = fstat(masterFD, &buf);
250 |   	if (f == -1) {
251 |     		printf("FileGraph::structureFromFile: unable to stat %s.\n", file);
252 |     		abort();
253 |   	}
254 |   	size_t masterLength = buf.st_size;
255 | 
256 |   	int _MAP_BASE = MAP_PRIVATE;
257 | //#ifdef MAP_POPULATE
258 | //  _MAP_BASE  |= MAP_POPULATE;
259 | //#endif
260 | 
261 |   	void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0);
262 |   	if (m == MAP_FAILED) {
263 |     		m = 0;
264 |     		printf("FileGraph::structureFromFile: mmap failed.\n");
265 |     		abort();
266 |   	}
267 | 
268 | 	double starttime, endtime;
269 | 	starttime = rtclock();
270 | 
271 |   	//parse file
272 |   	uint64_t* fptr = (uint64_t*)m;
273 |   	__attribute__((unused)) uint64_t version = le64toh(*fptr++);
274 |   	assert(version == 1);
275 |   	uint64_t sizeEdgeTy = le64toh(*fptr++);
276 |   	uint64_t numNodes = le64toh(*fptr++);
277 |   	uint64_t numEdges = le64toh(*fptr++);
278 |   	uint64_t *outIdx = fptr;
279 |   	fptr += numNodes;
280 |   	uint32_t *fptr32 = (uint32_t*)fptr;
281 |   	uint32_t *outs = fptr32; 
282 |   	fptr32 += numEdges;
283 |   	if (numEdges % 2) fptr32 += 1;
284 |   	unsigned  *edgeData = (unsigned *)fptr32;
285 | 
286 | 	
287 | 	// cuda.
288 | 	nnodes = numNodes;
289 | 	nedges = numEdges;
290 | 
291 | 	printf("nnodes=%d, nedges=%d.\n", nnodes, nedges);
292 | 	allocOnHost();
293 | 
294 | 	for (unsigned ii = 0; ii < nnodes; ++ii) {
295 | 		// fill unsigned *noutgoing, *nincoming, *srcsrc, *psrc, *edgessrcdst; foru *edgessrcwt;
296 | 		srcsrc[ii] = ii;
297 | 		if (ii > 0) {
298 | 			psrc[ii] = le64toh(outIdx[ii - 1]) + 1;
299 | 			noutgoing[ii] = le64toh(outIdx[ii]) - le64toh(outIdx[ii - 1]);
300 | 		} else {
301 | 			psrc[0] = 1;
302 | 			noutgoing[0] = le64toh(outIdx[0]);
303 | 		}
304 | 		for (unsigned jj = 0; jj < noutgoing[ii]; ++jj) {
305 | 			unsigned edgeindex = psrc[ii] + jj;
306 | 			unsigned dst = le32toh(outs[edgeindex - 1]);
307 | 			if (dst >= nnodes) printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, edgeindex);
308 | 			edgessrcdst[edgeindex] = dst;
309 | 			edgessrcwt[edgeindex] = edgeData[edgeindex - 1];
310 | 
311 | 			++nincoming[dst];
312 | 			//if (ii == 194 || ii == 352) {
313 | 			//	printf("edge %d: %d->%d, wt=%d.\n", edgeindex, ii, dst, edgessrcwt[edgeindex]);
314 | 			//}
315 | 		}
316 | 		progressPrint(nnodes, ii);
317 | 	}
318 | 
319 | 	cfile.close();	// probably galois doesn't close its file due to mmap.
320 | 
321 | 	endtime = rtclock();
322 | 
323 | 	printf("read %lld bytes in %0.2f ms (%0.2f MB/s)\n", masterLength, 1000 * (endtime - starttime), (masterLength / 1048576) / (endtime - starttime));
324 | 
325 | 	return 0;
326 | }
327 | unsigned Graph::read(char file[]) {
328 | 	if (strstr(file, ".edges")) {
329 | 		return readFromEdges(file);
330 | 	} else if (strstr(file, ".gr")) {
331 | 		return readFromGR(file);
332 | 	}
333 | 	return 0;
334 | }
335 | 
336 | unsigned Graph::getOutDegree(unsigned src) {
337 | 	if (src < nnodes) {
338 | 		return noutgoing[src];
339 | 	}
340 | 	return 0;
341 | }
342 | 
343 | unsigned Graph::getDestination(unsigned src, unsigned nthedge) {
344 | 	if (src < nnodes && nthedge < getOutDegree(src)) {
345 | 		unsigned edge = getFirstEdge(src) + nthedge;
346 | 		if (edge && edge < nedges + 1) {
347 | 			return edgessrcdst[edge];
348 | 		}
349 | 		return nnodes;
350 | 	}
351 | 	if (src < nnodes) {
352 | 		printf("Error: %s(%d): node %d: edge %d out of bounds %d.\n", __FILE__, __LINE__, src, nthedge, getOutDegree(src));
353 | 	} else {
354 | 		printf("Error: %s(%d): node %d out of bounds %d.\n", __FILE__, __LINE__, src, nnodes);
355 | 	}
356 | 	return nnodes;
357 | }
358 | 
359 | foru Graph::getWeight(unsigned src, unsigned nthedge) {
360 | 	if (src < nnodes && nthedge < getOutDegree(src)) {
361 | 		unsigned edge = getFirstEdge(src) + nthedge;
362 | 		if (edge && edge < nedges + 1) {
363 | 			return edgessrcwt[edge];
364 | 		}
365 | 		return MYINFINITY;
366 | 	}
367 | 	if (src < nnodes) {
368 | 		printf("Error: %s(%d): node %d: edge %d out of bounds %d.\n", __FILE__, __LINE__, src, nthedge, getOutDegree(src));
369 | 	} else {
370 | 		printf("Error: %s(%d): node %d out of bounds %d.\n", __FILE__, __LINE__, src, nnodes);
371 | 	}
372 | 	return MYINFINITY;
373 | }
374 | 
375 | unsigned Graph::getFirstEdge(unsigned src) {
376 | 	if (src < nnodes) {
377 | 		unsigned srcnout = getOutDegree(src);
378 | 		if (srcnout > 0 && srcsrc[src] < nnodes) {
379 | 			return psrc[srcsrc[src]];
380 | 		}
381 | 		printf("Error: %s(%d): edge %d out of bounds %d.\n", __FILE__, __LINE__, 0, srcnout);
382 | 		return 0;
383 | 	}
384 | 	printf("Error: %s(%d): node %d out of bounds %d.\n", __FILE__, __LINE__, src, nnodes);
385 | 	return 0;
386 | }
387 | 
388 | #endif
389 | 


--------------------------------------------------------------------------------
/src/csrcolor/csrcolor.cu:
--------------------------------------------------------------------------------
  1 | // Copyright 2016, National University of Defense Technology
  2 | // Authors: Xuhao Chen <cxh@illinois.edu> and Pingfan Li <lipingfan@163.com>
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include <iostream>
  6 | #include <fstream>
  7 | #include <sstream>
  8 | #include <string>
  9 | #include "cusparse.h"
 10 | #include "cuda.h"
 11 | #include <thrust/reduce.h>
 12 | #include <thrust/functional.h>
 13 | #include <thrust/execution_policy.h>
 14 | #include "cutil_subset.h"
 15 | #include "common.h"
 16 | #include <vector>
 17 | #include <set>
 18 | using namespace std;
 19 | 
 20 | void gr2csr(char *gr, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
 21 | 	printf("Reading RMAT (.gr) input file %s\n", gr);
 22 | 	std::ifstream cfile;
 23 | 	cfile.open(gr);
 24 | 	std::string str;
 25 | 	getline(cfile, str);
 26 | 	char c;
 27 | 	sscanf(str.c_str(), "%c", &c);
 28 | 	while (c == 'c') {
 29 | 		getline(cfile, str);
 30 | 		sscanf(str.c_str(), "%c", &c);
 31 | 	}
 32 | 	char sp[3];
 33 | 	sscanf(str.c_str(), "%c %s %d %d", &c, sp, &m, &nnz);
 34 | 	//printf("%c %s %d %d\n", c, sp, m, nnz);
 35 | 	printf("num_vertices %d  num_edges %d\n", m, nnz);
 36 | 	vector<set<int> > svector;
 37 | 	set<int> s;
 38 | 	for (int i = 0; i < m; i++)
 39 | 		svector.push_back(s);
 40 | 	int dst, src;
 41 | 	for (int i = 0; i < nnz; i++) {
 42 | 		getline(cfile, str);
 43 | 		sscanf(str.c_str(), "%c %d %d", &c, &src, &dst);
 44 | 		if (c != 'a')
 45 | 			printf("line %d\n", __LINE__);
 46 | 		dst--;
 47 | 		src--;
 48 | 		svector[src].insert(dst);
 49 | 		svector[dst].insert(src);
 50 | 	}
 51 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
 52 | 	int count = 0;
 53 | 	for (int i = 0; i < m; i++) {
 54 | 		csrRowPtr[i] = count;
 55 | 		count += svector[i].size();
 56 | 	}
 57 | 	csrRowPtr[m] = count;
 58 | 	if (count != nnz) {
 59 | 		printf("The graph is not symmetric\n");
 60 | 		nnz = count;
 61 | 	}
 62 | 	double avgdeg;
 63 | 	double variance = 0.0;
 64 | 	int maxdeg = 0;
 65 | 	int mindeg = m;
 66 | 	avgdeg = (double)nnz / m;
 67 | 	for (int i = 0; i < m; i++) {
 68 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
 69 | 		if (deg_i > maxdeg)
 70 | 			maxdeg = deg_i;
 71 | 		if (deg_i < mindeg)
 72 | 			mindeg = deg_i;
 73 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
 74 | 	}
 75 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
 76 | 	csrColInd = (int *)malloc(count * sizeof(int));
 77 | 	set<int>::iterator site;
 78 | 	for (int i = 0, index = 0; i < m; i++) {
 79 | 		site = svector[i].begin();
 80 | 		while (site != svector[i].end()) {
 81 | 			csrColInd[index++] = *site;
 82 | 			site++;
 83 | 		}
 84 | 	}
 85 | }
 86 | 
 87 | // transfer *.graph file to CSR format
 88 | void graph2csr(char *graph, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
 89 | 	printf("Reading .graph input file %s\n", graph);
 90 | 	std::ifstream cfile;
 91 | 	cfile.open(graph);
 92 | 	std::string str;
 93 | 	getline(cfile, str);
 94 | 	sscanf(str.c_str(), "%d %d", &m, &nnz);
 95 | 	printf("num_vertices %d num_edges %d\n", m, nnz);
 96 | 	vector<set<int> > svector;
 97 | 	set<int> s;
 98 | 	for (int i = 0; i < m; i++)
 99 | 		svector.push_back(s);
100 | 	int dst;
101 | 	for (int i = 0; i < m; i++) {
102 | 		getline(cfile, str);
103 | 		istringstream istr;
104 | 		istr.str(str);
105 | 		while(istr>>dst) {
106 | 			dst --;
107 | 			svector[i].insert(dst);
108 | 			svector[dst].insert(i);
109 | 		}
110 | 		istr.clear();
111 | 	}
112 | 	cfile.close();
113 | 	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
114 | 	int count = 0;
115 | 	for (int i = 0; i < m; i++) {
116 | 		csrRowPtr[i] = count;
117 | 		count += svector[i].size();
118 | 	}
119 | 	csrRowPtr[m] = count;
120 | 	if (count != nnz) {
121 | 		printf("The graph is not symmetric\n");
122 | 		nnz = count;
123 | 	}
124 | 	double avgdeg;
125 | 	double variance = 0.0;
126 | 	int maxdeg = 0;
127 | 	int mindeg = m;
128 | 	avgdeg = (double)nnz / m;
129 | 	for (int i = 0; i < m; i++) {
130 | 		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
131 | 		if (deg_i > maxdeg)
132 | 			maxdeg = deg_i;
133 | 		if (deg_i < mindeg)
134 | 			mindeg = deg_i;
135 | 		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
136 | 	}
137 | 	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);
138 | 	csrColInd = (int *)malloc(count * sizeof(int));
139 | 	set<int>::iterator site;
140 | 	for (int i = 0, index = 0; i < m; i++) {
141 | 		site = svector[i].begin();
142 | 		while (site != svector[i].end()) {
143 | 			csrColInd[index++] = *site;
144 | 			site++;
145 | 		}
146 | 	}
147 | }
148 | 
// Writes the coloring to `fname`, one color per line in vertex order.
//
// fname    - path of the output file (created or truncated)
// m        - number of entries of `coloring` to write
// coloring - host array holding at least m colors
//
// If the file cannot be opened, an error is printed to stderr and nothing is
// written; the function returns normally so the caller can carry on.
void write_solution(char *fname, int m, int *coloring) {
	FILE *fp = fopen(fname, "w");
	if (fp == NULL) {
		// Without this check the fprintf below would dereference NULL.
		fprintf(stderr, "Error: cannot open %s for writing\n", fname);
		return;
	}
	for (int i = 0; i < m; i++)
		fprintf(fp, "%d\n", coloring[i]);
	fclose(fp);
}
157 | 
// Checks that no vertex shares a color with any of its neighbours.
//
// m         - number of vertices
// nnz       - number of CSR entries (unused; kept for the existing signature)
// csrRowPtr - m + 1 row offsets
// csrColInd - column indices (neighbour ids)
// coloring  - color of each vertex
// correct   - set to 0 when a conflict is found; never set to 1 here, so the
//             caller must initialise it
//
// Self-loops (a vertex listed as its own neighbour) are ignored. At most one
// conflict is printed per vertex.
void verify(int m, int nnz, int *csrRowPtr, int *csrColInd, int *coloring, int *correct) {
	(void)nnz;
	for (int i = 0; i < m; i++) {
		const int row_end = csrRowPtr[i + 1];
		for (int e = csrRowPtr[i]; e < row_end; e++) {
			const int neighbor = csrColInd[e];
			if (neighbor != i && coloring[i] == coloring[neighbor]) {
				*correct = 0;
				printf("coloring[%d] = coloring[%d] = %d\n", i, neighbor, coloring[i]);
				// Stop only once a conflict is found. The old unconditional
				// break left the loop after the first neighbour, so later
				// neighbours were never examined.
				break;
			}
		}
	}
}
173 | 
// Reads a Matrix Market style coordinate file and builds an undirected,
// duplicate-free adjacency structure in CSR form.
//
// Layout as parsed below:
//   % <comment>                 lines starting with '%' are skipped
//   <rows> <cols> <entries>     size line; rows must equal cols
//   <i> <j> [...]               one line per entry, 1-based indices; anything
//                               after the two indices is ignored
//
// Every entry is stored in both directions and duplicates collapse
// (std::set); `nnz` is overwritten with the number of entries actually stored.
//
// mtx       - path of the input file
// m         - out: number of vertices (matrix dimension)
// nnz       - out: number of CSR entries actually stored
// csrRowPtr - out: malloc'ed array of m + 1 row offsets (caller frees)
// csrColInd - out: malloc'ed array of nnz column indices, ascending within
//             each row (caller frees)
//
// Calls exit(1) when the file cannot be opened, the size line cannot be
// parsed, the matrix is not square, or an allocation fails. Malformed or
// out-of-range entries are skipped with a warning on stderr.
void mtx2csr(char *mtx, int &m, int &nnz, int *&csrRowPtr, int *&csrColInd) {
	printf("Reading .mtx input file %s\n", mtx);
	std::ifstream cfile(mtx);
	if (!cfile.is_open()) {
		fprintf(stderr, "Error: cannot open input file %s\n", mtx);
		exit(1);
	}

	// The size line is the first line that is neither blank nor a '%' comment.
	// Driving the loop with getline() guarantees termination at end of file.
	std::string str;
	bool have_header = false;
	while (std::getline(cfile, str)) {
		std::string::size_type first = str.find_first_not_of(" \t\r");
		if (first == std::string::npos || str[first] == '%')
			continue;
		have_header = true;
		break;
	}
	int n = 0;
	if (!have_header || sscanf(str.c_str(), "%d %d %d", &m, &n, &nnz) != 3 || m < 0 || nnz < 0) {
		fprintf(stderr, "Error: missing or malformed size line in %s\n", mtx);
		exit(1);
	}
	if (m != n) {
		// Only square matrices can be read as a graph adjacency structure.
		fprintf(stderr, "Error: %s is not square (%d x %d)\n", mtx, m, n);
		exit(1);
	}
	printf("num_vertices %d, num_edges %d\n", m, nnz);

	// One ordered neighbour set per vertex.
	std::vector<std::set<int> > svector(m);
	int entries_read = 0;
	while (entries_read < nnz && std::getline(cfile, str)) {
		std::string::size_type first = str.find_first_not_of(" \t\r");
		if (first == std::string::npos || str[first] == '%')
			continue;	// blank or comment line: does not count as an entry
		int dst = 0, src = 0;
		if (sscanf(str.c_str(), "%d %d", &dst, &src) != 2) {
			fprintf(stderr, "Warning: skipping malformed line in %s: %s\n", mtx, str.c_str());
			continue;
		}
		entries_read++;
		// The file uses 1-based indices; CSR uses 0-based.
		dst--;
		src--;
		if (src < 0 || src >= m || dst < 0 || dst >= m) {
			fprintf(stderr, "Warning: skipping out-of-range entry in %s: %s\n", mtx, str.c_str());
			continue;
		}
		svector[src].insert(dst);
		svector[dst].insert(src);
	}
	if (entries_read < nnz)
		fprintf(stderr, "Warning: %s announces %d entries but only %d were found\n", mtx, nnz, entries_read);
	cfile.close();

	// Row offsets are the running sum of the neighbour-set sizes.
	csrRowPtr = (int *)malloc((m + 1) * sizeof(int));
	if (csrRowPtr == NULL) {
		fprintf(stderr, "Error: out of memory allocating csrRowPtr\n");
		exit(1);
	}
	int count = 0;
	for (int i = 0; i < m; i++) {
		csrRowPtr[i] = count;
		count += (int)svector[i].size();
	}
	csrRowPtr[m] = count;
	if (count != nnz) {
		printf("The graph is not symmetric\n");
		nnz = count;
	}

	// Degree statistics (informational only).
	double avgdeg = (m > 0) ? (double)nnz / m : 0.0;
	double variance = 0.0;
	int maxdeg = 0;
	int mindeg = m;
	for (int i = 0; i < m; i++) {
		int deg_i = csrRowPtr[i + 1] - csrRowPtr[i];
		if (deg_i > maxdeg)
			maxdeg = deg_i;
		if (deg_i < mindeg)
			mindeg = deg_i;
		variance += (deg_i - avgdeg) * (deg_i - avgdeg) / m;
	}
	printf("mindeg %d maxdeg %d avgdeg %.2f variance %.2f\n", mindeg, maxdeg, avgdeg, variance);

	// Flatten the neighbour sets into the column-index array.
	csrColInd = (int *)malloc(count * sizeof(int));
	if (csrColInd == NULL && count > 0) {
		fprintf(stderr, "Error: out of memory allocating csrColInd\n");
		exit(1);
	}
	int index = 0;
	for (int i = 0; i < m; i++) {
		for (std::set<int>::const_iterator it = svector[i].begin(); it != svector[i].end(); ++it)
			csrColInd[index++] = *it;
	}
}
244 | 
245 | 
246 | int main(int argc, char *argv[]) {
247 | 	if (argc != 2) {
248 | 		printf("Usage: %s <graph>\n", argv[0]);
249 | 		exit(1);
250 | 	}
251 | 	int m, nnz, *csrRowPtr = NULL, *csrColInd = NULL;
252 | 	if (strstr(argv[1], ".mtx"))
253 | 		mtx2csr(argv[1], m, nnz, csrRowPtr, csrColInd);
254 | 	else if (strstr(argv[1], ".graph"))
255 | 		graph2csr(argv[1], m, nnz, csrRowPtr, csrColInd);
256 | 	else if (strstr(argv[1], ".gr"))
257 | 		gr2csr(argv[1], m, nnz, csrRowPtr, csrColInd);
258 | 	else
259 | 		{ printf("Unrecognizable input file format\n"); exit(0); }
260 | 	if (csrRowPtr == NULL)
261 | 		printf("csrRowPtr is NULL\n");
262 | 	if (csrColInd == NULL)
263 | 		printf("csrColInd is NULL\n");
264 | 	double t1, t2, t3, t4, t5, t6;
265 | 	int *d_csrRowPtr, *d_csrColInd;
266 | 	float *d_csrVal;
267 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int))); 
268 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrColInd, nnz * sizeof(int))); 
269 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_csrVal, nnz * sizeof(float))); 
270 | 	int ncolors = 0, *coloring;
271 | 	int *d_coloring, *d_reordering;
272 | 	float fraction = 1.0;
273 | 	coloring = (int *)calloc(m, sizeof(int));
274 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_coloring, m * sizeof(int)));	
275 | 	CUDA_SAFE_CALL(cudaMalloc((void **)&d_reordering, m * sizeof(int))); 
276 | 	CUDA_SAFE_CALL(cudaMemset(d_reordering, 0, m * sizeof(int))); 
277 | 	CUDA_SAFE_CALL(cudaDeviceSynchronize());
278 | 	t1 = rtclock();
279 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrRowPtr, csrRowPtr, (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
280 | 	CUDA_SAFE_CALL(cudaMemcpy(d_csrColInd, csrColInd, nnz * sizeof(int), cudaMemcpyHostToDevice));
281 | 	cudaDeviceSynchronize();
282 | 	t2 = rtclock();
283 | 	//printf("time of init:%f ms\n", 1000.0f * (t2 - t1));	
284 | 
285 | 	int device = 0;
286 | 	int deviceCount = 0;
287 | 	CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
288 | 	cudaDeviceProp deviceProp;
289 | 	CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, device));
290 | 	int nSM = deviceProp.multiProcessorCount;
291 | 	fprintf(stdout, "Found %d devices, using device %d (%s), compute capability %d.%d, cores %d*%d.\n", 
292 | 			deviceCount, device, deviceProp.name, deviceProp.major, deviceProp.minor, nSM, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor));
293 | 
294 | 	cusparseStatus_t status;
295 | 	cusparseHandle_t handle;
296 | 	status = cusparseCreate(&handle);
297 | 	if (status != CUSPARSE_STATUS_SUCCESS) {
298 | 		printf("error!");
299 | 		exit(1);
300 | 	}
301 | 	cusparseMatDescr_t descr;
302 | 	status = cusparseCreateMatDescr(&descr);
303 | 	if (status != CUSPARSE_STATUS_SUCCESS) {
304 | 		printf("error!");
305 | 		exit(1);
306 | 	}
307 | 	cusparseColorInfo_t info;
308 | 	status = cusparseCreateColorInfo(&info);
309 | 	if (status != CUSPARSE_STATUS_SUCCESS) {
310 | 		printf("error!");
311 | 		exit(1);
312 | 	}	
313 | 	double runtime[10];
314 | 	int colors[10];
315 | 	for (int i = 0; i < 10; i++) {
316 | 		t5 = rtclock();
317 | 		status = cusparseScsrcolor(handle, m, nnz, descr, d_csrVal, d_csrRowPtr, d_csrColInd, &fraction, &ncolors, d_coloring, d_reordering, info);
318 | 		t6 = rtclock();
319 | 		runtime[i] = 1000.0f * (t6 - t5);
320 | 		colors[i] = 1 + thrust::reduce(thrust::device, d_coloring, d_coloring + m, 0, thrust::maximum<int>());
321 | 	}
322 | 	double total_time = 0;
323 | 	int total_colors = 0;
324 | 	double avg_time;
325 | 	double avg_colors;
326 | 	for (int i = 0; i < 10; i++) {
327 | 		printf("[%.2f %d] ", runtime[i], colors[i]);
328 | 		total_time += runtime[i];
329 | 		total_colors += colors[i];
330 | 	}
331 | 	printf("\navg_time %f ms, avg_colors %.2f\n", total_time / 10, (double)total_colors / 10);
332 | 	switch (status) {
333 | 		case CUSPARSE_STATUS_SUCCESS:
334 | 			//printf("success\n");
335 | 			break;
336 | 		case CUSPARSE_STATUS_NOT_INITIALIZED:
337 | 			printf("not initialed\n");
338 | 		case CUSPARSE_STATUS_ALLOC_FAILED:
339 | 			printf("alloc failed\n");
340 | 			break;
341 | 		case CUSPARSE_STATUS_INVALID_VALUE:
342 | 			printf("invalid value\n");
343 | 			break;
344 | 		case CUSPARSE_STATUS_ARCH_MISMATCH:
345 | 			printf("mismatch\n");
346 | 			break;
347 | 		case CUSPARSE_STATUS_INTERNAL_ERROR:
348 | 			printf("internal error\n");
349 | 			break;
350 | 		case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
351 | 			printf("not supported\n");
352 | 			break;
353 | 		default:
354 | 			printf("unknown error\n");
355 | 			break;
356 | 	};
357 | 	t3 = rtclock();
358 | 	CUDA_SAFE_CALL(cudaMemcpy(coloring, d_coloring, m * sizeof(int), cudaMemcpyDeviceToHost));
359 | 	CUDA_SAFE_CALL(cudaDeviceSynchronize());
360 | 	t4 = rtclock();
361 | 	//printf("time of copy back:%f ms\n", 1000.0f * (t4 - t3));	
362 | 	write_solution("color.txt", m, coloring);
363 | 	int correct = 1;
364 | 	verify(m, nnz, csrRowPtr, csrColInd, coloring, &correct);
365 | 	if (correct)
366 | 		printf("correct.\n");
367 | 	else
368 | 		printf("incorrect.\n");
369 | 	return 0;
370 | }
371 | 


--------------------------------------------------------------------------------