├── 70799.pdf ├── results ├── analysis.ods ├── gtx285.txt ├── testa.txt └── tesla-c2050.txt ├── A Study on Connected Components Labeling algorithms using GPUs.pdf ├── make.sh ├── textures.cuh ├── README ├── spiral └── spiral.py ├── ccl_gold.cpp ├── image_set └── generate_dataset.py ├── perf_tests ├── coalescing_test.cu └── atomics_test.cu ├── pgm.h ├── LICENSE ├── ccl_test.cu ├── run.py ├── ccl_naive_prop.cu ├── distance_transform.cu ├── distance_transform2.cu ├── ccl_lequiv.cu └── ccl_uf.cu /70799.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victoroliv2/CCL-GPU/HEAD/70799.pdf -------------------------------------------------------------------------------- /results/analysis.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victoroliv2/CCL-GPU/HEAD/results/analysis.ods -------------------------------------------------------------------------------- /A Study on Connected Components Labeling algorithms using GPUs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victoroliv2/CCL-GPU/HEAD/A Study on Connected Components Labeling algorithms using GPUs.pdf -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | /usr/local/cuda/bin/nvcc -O3 -DTHREADED -arch sm_13 ccl_test.cu -o bin/ccl_test 2 | /usr/local/cuda/bin/nvcc -O3 -Xptxas -v -arch sm_13 --maxrregcount=20 distance_transform2.cu -o bin/distance_transform2 3 | -------------------------------------------------------------------------------- /textures.cuh: -------------------------------------------------------------------------------- 1 | #ifndef textures_cuh 2 | #define textures_cuh 3 | 4 | texture imgtex; 5 | texture Ltex; 6 | texture Rtex; 7 | 8 | #endif 9 | 
-------------------------------------------------------------------------------- /results/gtx285.txt: -------------------------------------------------------------------------------- 1 | == Finals Results == 2 | 3 | union-find (gpu) 4 | mean:1.61 5 | std:0.10 6 | max:1.85 7 | min:1.45 8 | union-find (gpu+cpu) 9 | mean:1.28 10 | std:0.08 11 | max:1.47 12 | min:1.16 13 | Label Equivalence 14 | mean:2.05 15 | std:0.46 16 | max:3.54 17 | min:0.69 18 | -------------------------------------------------------------------------------- /results/testa.txt: -------------------------------------------------------------------------------- 1 | == Finals Results == 2 | 3 | union-find (gpu) 4 | mean:6.59 5 | std:1.56 6 | max:10.45 7 | min:4.09 8 | union-find (gpu+cpu) 9 | mean:3.29 10 | std:0.78 11 | max:5.21 12 | min:2.05 13 | Label Equivalence 14 | mean:4.95 15 | std:1.50 16 | max:10.07 17 | min:2.36 18 | 19 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Algorithms implemented for "A Study on Connected Components Labeling algorithms using GPUs", published in: 2 | 3 | Oliveira, V.M., Lotufo, R.A.: A study on connected components labeling algorithms using GPUs. In: SIBGRAPI. vol. 3, p. 4 (2010) 4 | 5 | ccl_uf.cu -> new algorithm described in the paper 6 | ccl_naive_prop.cu -> naive label propagation 7 | ccl_lequiv.cu -> label equivalence 8 | ccl_gold.cpp -> ground truth for verification 9 | ccl_test.cu -> runs all 3 on the gpu (main function) 10 | generate_dataset.py -> downloads the dataset used in the paper 11 | run.py -> runs ccl_test on the downloaded dataset 12 | spiral.py -> utility to create images with spirals 13 | 14 | I don’t have the sources for the CPU CCL implementation used in the paper because it is proprietary (it used SSE2, threads and hand-optimized assembly code). 
15 | -------------------------------------------------------------------------------- /spiral/spiral.py: -------------------------------------------------------------------------------- 1 | import Image, ImageDraw 2 | import numpy 3 | 4 | def gen(n): 5 | d = 4*n + 6 6 | im = Image.new("L", (d,d)) 7 | draw = ImageDraw.Draw(im) 8 | 9 | r = d/2 - 1 10 | 11 | size = 0 12 | posx = r 13 | posy = r 14 | 15 | for k in range(d): 16 | if size % 4 == 0: 17 | draw.line((posx, posy, posx, posy+size), fill=255) 18 | posy+=size+1 19 | 20 | elif size % 4 == 1: 21 | draw.line((posx, posy, posx+size, posy), fill=255) 22 | posx+=size+1 23 | 24 | elif size % 4 == 2: 25 | draw.line((posx, posy, posx, posy-size), fill=255) 26 | posy-=size+1 27 | 28 | elif size % 4 == 3: 29 | draw.line((posx, posy, posx-size, posy), fill=255) 30 | posx-=size+1 31 | 32 | size +=1 33 | 34 | del draw 35 | return im 36 | 37 | i = 1000 38 | im = gen(i) 39 | im.save("out.pgm") 40 | -------------------------------------------------------------------------------- /ccl_gold.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | namespace gold { 5 | 6 | int find(int* label, int x) { 7 | if (label[x] == x) { 8 | return x; 9 | }else { 10 | int v = find(label, label[x]); 11 | label[x] = v; 12 | return v; 13 | } 14 | } 15 | 16 | void merge(int* label, int x, int y){ 17 | x = find(label, x); 18 | y = find(label, y); 19 | 20 | if (label[x] < label[y]) { 21 | label[y] = x; 22 | }else { 23 | label[x] = y; 24 | } 25 | } 26 | 27 | void CCL(unsigned char* img, int w, int h, int* label) { 28 | 29 | for (int i=0; i0 && img[i]==img[i-1]) merge(label, i, i-1); 35 | if (i/w>0 && img[i]==img[i-w]) merge(label, i, i-w); 36 | } 37 | for (int i=0; i 128).astype(numpy.uint8)*255 24 | img = mmareaclose(img, 20) 25 | img = mmareaopen(img, 20) 26 | pil_img = Image.fromarray(img, "L") 27 | pil_img = pil_img.resize(newsize) 28 | pil_img.save( "output/"+f+".png" ) 29 | 
os.system("convert -compress none %s %s" % 30 | ("output/"+f+".png", "output/"+f+".pgm")) 31 | 32 | import sys 33 | sys.exit(0) 34 | -------------------------------------------------------------------------------- /perf_tests/coalescing_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define BLOCK_X 256 7 | 8 | __global__ void 9 | coalesced_read_1(float* f, int N) { 10 | int x = blockIdx.x*blockDim.x + threadIdx.x; 11 | 12 | float v; 13 | for (int i=0; i<100; i++) { 14 | v= f[x]; 15 | } 16 | 17 | v *= v; 18 | f[x] = v; 19 | } 20 | 21 | 22 | __global__ void 23 | coalesced_read_2(float* f, int N) { 24 | //int x = (gridDim.x-blockIdx.x-1)*blockDim.x + threadIdx.x; 25 | int x = blockIdx.x*blockDim.x + threadIdx.x; 26 | 27 | float v; 28 | 29 | for (int i=0; i<100; i++) { 30 | v = f[(x+1)%N]; 31 | } 32 | 33 | v *= v; 34 | f[x] = v; 35 | } 36 | 37 | 38 | int main(int argc, char *argv[]) { 39 | 40 | int N = 1<<20; 41 | float* data; 42 | cudaMalloc(&data, sizeof(float)*N); 43 | 44 | coalesced_read_1 <<>> (data, N); 45 | cudaThreadSynchronize(); 46 | 47 | coalesced_read_2 <<>> (data, N); 48 | cudaThreadSynchronize(); 49 | 50 | cudaFree(data); 51 | 52 | return 0; 53 | } 54 | -------------------------------------------------------------------------------- /pgm.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | typedef unsigned char uchar; 5 | 6 | uchar* load_ppm(const char _imagefile[], int* _w, int* _h) { 7 | FILE* f; 8 | f = fopen(_imagefile, "r"); 9 | 10 | uchar* image; 11 | 12 | int i,w,h; 13 | int g; 14 | char v[100]; 15 | 16 | fgets(v, sizeof(v), f); 17 | //fgets(v, sizeof(v), f); 18 | fscanf(f, "%d %d", &w, &h); 19 | //printf("image size: %d %d\n", w, h); 20 | 21 | fgets(v, sizeof(v), f); 22 | fgets(v, sizeof(v), f); 23 | 24 | image = (uchar*)malloc(sizeof(uchar)*w*h); 25 | 26 | for(i=0; i 2 | #include 3 
| 4 | #include "pgm.h" 5 | 6 | #include "ccl_gold.cpp" 7 | #include "ccl_uf.cu" 8 | #include "ccl_lequiv.cu" 9 | #include "ccl_naive_prop.cu" 10 | 11 | 12 | int number_cc(int* label, int w, int h) { 13 | bool* mask = (bool*)malloc(w*h*sizeof(bool)); 14 | for (int i=0; i 0) 57 | 58 | v1 = [] 59 | for j in range(10): 60 | t0 = time.time() 61 | lbl = mmlabel(k) 62 | t1 = time.time() 63 | v1.append(1000*(t1-t0)) 64 | 65 | RESULTS = res+[min(v1)] 66 | 67 | l_uf[n] = RESULTS[2] 68 | l_uf_hybrid[n] = RESULTS[3] 69 | l_lequiv[n] = RESULTS[4] 70 | l_stephano[n] = RESULTS[5] 71 | 72 | print(f) 73 | print("\tcc:\t%d\n\tgold:\t%f\n\tuf:\t%f\n\tuf_hybrid:\t%f\n\tlequiv:\t%f\n\tstephano:\t%f\n" % tuple(RESULTS) ) 74 | Writer.writerow(RESULTS) 75 | 76 | print(" == Finals Results == \n") 77 | s1 = l_stephano/l_uf 78 | s2 = l_stephano/l_uf_hybrid 79 | s3 = l_stephano/l_lequiv 80 | 81 | print("[EXECUTION TIME]") 82 | print("union-find (gpu)\n\tmean:%4.2f\n\tstd:%4.2f\n\tmax:%4.2f\n\tmin:%4.2f" % (numpy.mean(l_uf), numpy.std(l_uf), numpy.max(l_uf), numpy.min(l_uf)) ) 83 | print("union-find (gpu+cpu)\n\tmean:%4.2f\n\tstd:%4.2f\n\tmax:%4.2f\n\tmin:%4.2f" % (numpy.mean(l_uf_hybrid), numpy.std(l_uf_hybrid), numpy.max(l_uf_hybrid), numpy.min(l_uf_hybrid)) ) 84 | print("Label Equivalence\n\tmean:%4.2f\n\tstd:%4.2f\n\tmax:%4.2f\n\tmin:%4.2f" % (numpy.mean(l_lequiv), numpy.std(l_lequiv), numpy.max(l_lequiv), numpy.min(l_lequiv)) ) 85 | print("Stephano\n\tmean:%4.2f\n\tstd:%4.2f\n\tmax:%4.2f\n\tmin:%4.2f" % (numpy.mean(l_stephano), numpy.std(l_stephano), numpy.max(l_stephano), numpy.min(l_stephano)) ) 86 | 87 | print("[SPEEDUP]") 88 | print("union-find (gpu)\n\tmean:%4.2f\n\tstd:%4.2f\n\tmax:%4.2f\n\tmin:%4.2f" % (numpy.mean(s1), numpy.std(s1), numpy.max(s1), numpy.min(s1)) ) 89 | print("union-find (gpu+cpu)\n\tmean:%4.2f\n\tstd:%4.2f\n\tmax:%4.2f\n\tmin:%4.2f" % (numpy.mean(s2), numpy.std(s2), numpy.max(s2), numpy.min(s2)) ) 90 | print("Label 
Equivalence\n\tmean:%4.2f\n\tstd:%4.2f\n\tmax:%4.2f\n\tmin:%4.2f" % (numpy.mean(s3), numpy.std(s3), numpy.max(s3), numpy.min(s3)) ) 91 | 92 | import sys 93 | sys.exit(0) 94 | -------------------------------------------------------------------------------- /ccl_naive_prop.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "textures.cuh" 7 | 8 | namespace naive_prop { 9 | 10 | const int BLOCK_X = 16; 11 | const int BLOCK_Y = 16; 12 | 13 | __global__ void PROP_prescan(int* R, int w, int h) { 14 | int x = blockIdx.x*blockDim.x + threadIdx.x; 15 | int y = blockIdx.y*blockDim.y + threadIdx.y; 16 | int index = x+y*w; 17 | 18 | if (x < w && y < h) { 19 | R[index] = index; 20 | } 21 | } 22 | 23 | __global__ void PROP_scan(int* R, int w, int h, int* d_stop) { 24 | int x = blockIdx.x*blockDim.x + threadIdx.x; 25 | int y = blockIdx.y*blockDim.y + threadIdx.y; 26 | int index = x+y*w; 27 | 28 | if (x < w && y < h) { 29 | unsigned char v = tex2D(imgtex, x, y); 30 | int label = R[index]; 31 | int newlabel = w*h; 32 | 33 | if (y>0 && tex2D(imgtex, x, y-1) == v) { 34 | newlabel = min(newlabel, R[index-w]); 35 | } 36 | if (y0 && tex2D(imgtex, x-1, y) == v) { 40 | newlabel = min(newlabel, R[index-1]); 41 | } 42 | if (x(); 59 | cudaMallocArray(&imgarray, &uchardesc, w, h); 60 | 61 | int* R; 62 | cudaMalloc((void**)&R, w*h*sizeof(int)); 63 | 64 | err = cudaGetLastError(); 65 | if (err != cudaSuccess) { 66 | printf("startERROR: %s\n", cudaGetErrorString(err)); 67 | return; 68 | } 69 | 70 | cudaChannelFormatDesc intdesc = 71 | cudaCreateChannelDesc(); 72 | cudaBindTextureToArray(imgtex, imgarray, uchardesc); 73 | cudaBindTexture(NULL, Rtex, R, intdesc, w*h*sizeof(int)); 74 | 75 | int stop; 76 | int* d_stop; 77 | cudaMalloc((void**)&d_stop, sizeof(int)); 78 | 79 | dim3 block (BLOCK_X, BLOCK_Y); 80 | dim3 grid ((w+BLOCK_X-1)/BLOCK_X, 81 | (h+BLOCK_Y-1)/BLOCK_Y); 82 | 83 | 
cudaMemcpyToArray(imgarray, 0, 0, img, 84 | w*h*sizeof(unsigned char), 85 | cudaMemcpyHostToDevice); 86 | 87 | err = cudaGetLastError(); 88 | if (err != cudaSuccess) { 89 | printf("midERROR: %s\n", cudaGetErrorString(err)); 90 | return; 91 | } 92 | 93 | PROP_prescan <<>> 94 | (R, w, h); 95 | 96 | stop = 0; 97 | while (stop == 0) { 98 | 99 | cudaMemset(d_stop, 0xFF, sizeof(int)); 100 | 101 | PROP_scan <<>> 102 | (R, w, h, d_stop); 103 | 104 | cudaMemcpy(&stop, d_stop, sizeof(int), 105 | cudaMemcpyDeviceToHost); 106 | } 107 | 108 | cudaMemcpy(label, R, w*h*sizeof(int), 109 | cudaMemcpyDeviceToHost); 110 | 111 | cudaFree(d_stop); 112 | cudaFree(R); 113 | cudaFreeArray(imgarray); 114 | 115 | err = cudaGetLastError(); 116 | if (err != cudaSuccess) { 117 | printf("endERROR: %s\n", cudaGetErrorString(err)); 118 | return; 119 | } 120 | } 121 | 122 | } 123 | -------------------------------------------------------------------------------- /distance_transform.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "pgm.h" 7 | 8 | #define BLOCK_SIZE 256 9 | 10 | __device__ float 11 | distance(float x, float y, float ex, float ey) { 12 | return sqrtf(powf(ex-x,2.0f)+powf(ey-y,2.0f)); 13 | } 14 | 15 | __global__ void 16 | euclidian_distance_transform(uchar2* img, 17 | float2* dist, int w, int h) { 18 | 19 | //each thread process 4 pixels 20 | __shared__ uchar2 img_line [BLOCK_SIZE]; 21 | 22 | int i = blockIdx.x*blockDim.x + threadIdx.x; 23 | float N = w*h; 24 | 25 | int2 ox = {(2*i) % w, (2*i+1) % w}; 26 | int2 oy = {(2*i) / w, (2*i+1) / w}; 27 | 28 | if (2*i < N) { 29 | float2 d = {N, N}; 30 | 31 | for(int bi=0; bi>> ((uchar2*)d_img, (float2*)d_dist, w, h); 105 | cudaThreadSynchronize(); 106 | 107 | err = cudaGetLastError(); 108 | if (err != cudaSuccess) { 109 | printf("ERROR: %s\n", cudaGetErrorString(err)); 110 | return 1; 111 | } 112 | 113 | fprintf(stderr, "GO!"); 114 | 115 | 
cudaMemcpy(dist, d_dist, w*h*sizeof(float), cudaMemcpyDeviceToHost); 116 | 117 | err = cudaGetLastError(); 118 | if (err != cudaSuccess) { 119 | printf("ERROR: %s\n", cudaGetErrorString(err)); 120 | return 1; 121 | } 122 | 123 | cudaFree(d_img); 124 | cudaFree(d_dist); 125 | free(dist); 126 | free(img); 127 | 128 | return 0; 129 | } 130 | -------------------------------------------------------------------------------- /distance_transform2.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "pgm.h" 7 | 8 | #define BLOCK_SIZE 128 9 | 10 | __device__ float 11 | distance(float x, float y, float ex, float ey) { 12 | return sqrtf(powf(ex-x,2.0f)+powf(ey-y,2.0f)); 13 | } 14 | 15 | __global__ void 16 | euclidian_distance_transform(uchar4* img, 17 | float4* dist, int w, int h) { 18 | 19 | //each thread process 4 pixels 20 | __shared__ uchar4 img_line [BLOCK_SIZE]; 21 | 22 | int i = blockIdx.x*blockDim.x + threadIdx.x; 23 | float N = w*h; 24 | 25 | int4 ox = {(4*i) % w, (4*i+1) % w, (4*i+2) % w, (4*i+3) % w}; 26 | int4 oy = {(4*i) / w, (4*i+1) / w, (4*i+2) / w, (4*i+3) / w}; 27 | 28 | if (4*i < N) { 29 | float4 d = {N, N, N, N}; 30 | 31 | for(int bi=0; bi>> ((uchar4*)d_img, (float4*)d_dist, w, h); 111 | cudaThreadSynchronize(); 112 | 113 | err = cudaGetLastError(); 114 | if (err != cudaSuccess) { 115 | printf("ERROR: %s\n", cudaGetErrorString(err)); 116 | return 1; 117 | } 118 | 119 | fprintf(stderr, "GO!"); 120 | 121 | cudaMemcpy(dist, d_dist, w*h*sizeof(float), cudaMemcpyDeviceToHost); 122 | 123 | err = cudaGetLastError(); 124 | if (err != cudaSuccess) { 125 | printf("ERROR: %s\n", cudaGetErrorString(err)); 126 | return 1; 127 | } 128 | 129 | cudaFree(d_img); 130 | cudaFree(d_dist); 131 | free(dist); 132 | free(img); 133 | 134 | return 0; 135 | } 136 | -------------------------------------------------------------------------------- /perf_tests/atomics_test.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ void global_coalesced_test_1(int* data) { 4 | int x = blockIdx.x*blockDim.x + threadIdx.x; 5 | int v = data[x]+1; 6 | data[x] = v; 7 | } 8 | 9 | __global__ void global_atomics_test_1(int* data) { 10 | int x = blockIdx.x*blockDim.x + threadIdx.x; 11 | atomicAdd(&data[x], 1); 12 | } 13 | 14 | __global__ void local_coalesced_test_1(int* data) { 15 | int x = blockIdx.x*blockDim.x + threadIdx.x; 16 | 17 | __shared__ int s_data[256]; 18 | s_data[threadIdx.x] = data[x]; 19 | __syncthreads(); 20 | 21 | s_data[threadIdx.x]++; 22 | __syncthreads(); 23 | 24 | data[x] = s_data[threadIdx.x]; 25 | } 26 | 27 | __global__ void local_atomics_test_1(int* data) { 28 | int x = blockIdx.x*blockDim.x + threadIdx.x; 29 | 30 | __shared__ int s_data[256]; 31 | s_data[threadIdx.x] = data[x]; 32 | __syncthreads(); 33 | 34 | atomicAdd(&s_data[threadIdx.x], 1); 35 | __syncthreads(); 36 | 37 | data[x] = s_data[threadIdx.x]; 38 | } 39 | 40 | ///////////////////// 41 | 42 | __global__ void global_atomics_test_2(int* data) { 43 | int x = blockIdx.x*blockDim.x + threadIdx.x; 44 | int v = data[x]; 45 | 46 | atomicAdd(&data[blockIdx.x], v); 47 | } 48 | 49 | __global__ void local_atomics_test_2(int* data) { 50 | int x = blockIdx.x*blockDim.x + threadIdx.x; 51 | 52 | __shared__ int s_data[256]; 53 | s_data[threadIdx.x] = data[x]; 54 | __syncthreads(); 55 | 56 | atomicAdd(&s_data[0], s_data[threadIdx.x]); 57 | __syncthreads(); 58 | 59 | data[blockIdx.x] = s_data[0]; 60 | } 61 | 62 | //////////////////////////////// 63 | 64 | __global__ void global_atomics_test_3(int* data) { 65 | int x = blockIdx.x*blockDim.x + threadIdx.x; 66 | int v = data[x]; 67 | 68 | atomicAdd(&data[0], v); 69 | } 70 | 71 | //////////////////////////////// 72 | 73 | 74 | #define START_TIME cudaEventRecord(start,0) 75 | #define STOP_TIME cudaEventRecord(stop,0 ); \ 76 | cudaEventSynchronize(stop); \ 77 | 
cudaEventElapsedTime( &et, start, stop ) 78 | 79 | #define TRIES 10 80 | float et_v[TRIES]; 81 | 82 | float MIN_ET() { 83 | float et = et_v[0]; 84 | for (int t=0; t>> (data); 106 | STOP_TIME; 107 | cudaThreadSynchronize(); 108 | et_v[t] = et; 109 | } 110 | printf("global_coalesced_test_1: \t %lf\n", MIN_ET()); 111 | 112 | for (int t=0;t>> (data); 115 | STOP_TIME; 116 | cudaThreadSynchronize(); 117 | et_v[t] = et; 118 | } 119 | printf("global_atomics_test_1: \t %lf\n", MIN_ET()); 120 | 121 | for (int t=0;t>> (data); 124 | STOP_TIME; 125 | cudaThreadSynchronize(); 126 | et_v[t] = et; 127 | } 128 | printf("local_coalesced_test_1: \t %lf\n", MIN_ET()); 129 | 130 | for (int t=0;t>> (data); 133 | STOP_TIME; 134 | cudaThreadSynchronize(); 135 | et_v[t] = et; 136 | } 137 | printf("locals_atomics_test_1: \t %lf\n", MIN_ET()); 138 | 139 | for (int t=0;t>> (data); 142 | STOP_TIME; 143 | cudaThreadSynchronize(); 144 | et_v[t] = et; 145 | } 146 | printf("global_atomics_test_2: \t %lf\n", MIN_ET()); 147 | 148 | for (int t=0;t>> (data); 151 | STOP_TIME; 152 | cudaThreadSynchronize(); 153 | et_v[t] = et; 154 | } 155 | printf("local_atomics_test_2: \t %lf\n", MIN_ET()); 156 | 157 | for (int t=0;t>> (data); 160 | STOP_TIME; 161 | cudaThreadSynchronize(); 162 | et_v[t] = et; 163 | } 164 | printf("local_atomics_test_3: \t %lf\n", MIN_ET()); 165 | 166 | cudaFree(data); 167 | 168 | cudaError_t err; 169 | err = cudaGetLastError(); 170 | if (err != cudaSuccess) { 171 | printf("ERROR: %s\n", cudaGetErrorString(err)); 172 | return 1; 173 | } 174 | 175 | return 0; 176 | } 177 | -------------------------------------------------------------------------------- /ccl_lequiv.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "textures.cuh" 7 | 8 | namespace lequiv { 9 | 10 | #define LEQUIV_BLOCK_SIZE_X 16 11 | #define LEQUIV_BLOCK_SIZE_Y 16 12 | 13 | 14 | __global__ void LEQUIV_prescan(int* L, 
int* R, int w, int h) { 15 | int x = blockIdx.x*blockDim.x + threadIdx.x; 16 | int y = blockIdx.y*blockDim.y + threadIdx.y; 17 | int index = x+y*w; 18 | 19 | if (x < w && y < h) { 20 | L[index] = index; 21 | R[index] = index; 22 | } 23 | } 24 | 25 | __global__ void LEQUIV_scan(int* R, int w, int h, int* d_stop) { 26 | int x = blockIdx.x*blockDim.x + threadIdx.x; 27 | int y = blockIdx.y*blockDim.y + threadIdx.y; 28 | int index = x+y*w; 29 | 30 | if (x < w && y < h) { 31 | unsigned char v = tex2D(imgtex, x, y); 32 | int label = tex1Dfetch(Ltex, index); 33 | int newlabel = w*h; 34 | 35 | if (y>0 && tex2D(imgtex, x, y-1) == v) { 36 | newlabel = min(newlabel, tex1Dfetch(Ltex, index-w)); 37 | } 38 | if (y0 && tex2D(imgtex, x-1, y) == v) { 42 | newlabel = min(newlabel, tex1Dfetch(Ltex, index-1)); 43 | } 44 | if (x0) { 69 | rf = label; 70 | label = tex1Dfetch(Rtex, rf); 71 | deep--; 72 | } 73 | //texture will be invalid 74 | R[index] = label; 75 | } 76 | } 77 | } 78 | 79 | __global__ void LEQUIV_labeling(int* L, int w, int h) { 80 | int x = blockIdx.x*blockDim.x + threadIdx.x; 81 | int y = blockIdx.y*blockDim.y + threadIdx.y; 82 | int index = x+y*w; 83 | if (x < w && y < h) { 84 | int label = L[index]; 85 | int cc = tex1Dfetch(Rtex, label); 86 | L[index] = tex1Dfetch(Rtex, cc); 87 | } 88 | } 89 | 90 | void CCL(unsigned char* img, int w, int h, int* label) { 91 | cudaError_t err; 92 | 93 | cudaArray* imgarray; 94 | cudaChannelFormatDesc uchardesc = 95 | cudaCreateChannelDesc(); 96 | cudaMallocArray(&imgarray, &uchardesc, w, h); 97 | 98 | int* L; 99 | cudaMalloc((void**)&L, w*h*sizeof(int)); 100 | int* R; 101 | cudaMalloc((void**)&R, w*h*sizeof(int)); 102 | 103 | err = cudaGetLastError(); 104 | if (err != cudaSuccess) { 105 | printf("startERROR: %s\n", cudaGetErrorString(err)); 106 | return; 107 | } 108 | 109 | cudaChannelFormatDesc intdesc = 110 | cudaCreateChannelDesc(); 111 | cudaBindTextureToArray(imgtex, imgarray, uchardesc); 112 | cudaBindTexture(NULL, Ltex, L, 
intdesc, w*h*sizeof(int)); 113 | cudaBindTexture(NULL, Rtex, R, intdesc, w*h*sizeof(int)); 114 | 115 | int stop; 116 | int* d_stop; 117 | cudaMalloc((void**)&d_stop, sizeof(int)); 118 | 119 | dim3 block (LEQUIV_BLOCK_SIZE_X, LEQUIV_BLOCK_SIZE_Y); 120 | dim3 grid ((w+LEQUIV_BLOCK_SIZE_X-1)/LEQUIV_BLOCK_SIZE_X, 121 | (h+LEQUIV_BLOCK_SIZE_Y-1)/LEQUIV_BLOCK_SIZE_Y); 122 | 123 | cudaMemcpyToArray(imgarray, 0, 0, img, 124 | w*h*sizeof(unsigned char), 125 | cudaMemcpyHostToDevice); 126 | 127 | err = cudaGetLastError(); 128 | if (err != cudaSuccess) { 129 | printf("midERROR: %s\n", cudaGetErrorString(err)); 130 | return; 131 | } 132 | 133 | LEQUIV_prescan <<>> 134 | (L, R, w, h); 135 | 136 | stop = 0; 137 | while (stop == 0) { 138 | 139 | cudaMemset(d_stop, 0xFF, sizeof(int)); 140 | 141 | LEQUIV_scan <<>> 142 | (R, w, h, d_stop); 143 | 144 | LEQUIV_analysis <<>> 145 | (L, R, w, h); 146 | 147 | LEQUIV_labeling <<>> 148 | (L, w, h); 149 | 150 | cudaMemcpy(&stop, d_stop, sizeof(int), 151 | cudaMemcpyDeviceToHost); 152 | } 153 | 154 | cudaMemcpy(label, L, w*h*sizeof(int), 155 | cudaMemcpyDeviceToHost); 156 | 157 | cudaFree(d_stop); 158 | cudaFree(L); 159 | cudaFree(R); 160 | cudaFreeArray(imgarray); 161 | 162 | err = cudaGetLastError(); 163 | if (err != cudaSuccess) { 164 | printf("endERROR: %s\n", cudaGetErrorString(err)); 165 | return; 166 | } 167 | } 168 | 169 | } 170 | -------------------------------------------------------------------------------- /ccl_uf.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "textures.cuh" 7 | 8 | #include 9 | 10 | namespace uf { 11 | 12 | #define UF_BLOCK_SIZE_X 32 13 | #define UF_BLOCK_SIZE_Y 16 14 | 15 | class UnionFind 16 | { 17 | public: 18 | unsigned char* img; 19 | int* label; 20 | int width, height; 21 | int size; 22 | int divide; 23 | 24 | inline unsigned char F(int p) { return this->img[p]; } 25 | 26 | UnionFind(unsigned 
char* img, int* label, int w, int h, int divide); 27 | 28 | int find(int x); 29 | void merge(int x, int y); 30 | void build(); 31 | void build(int ls, int le, int divide); 32 | }; 33 | 34 | UnionFind::UnionFind(unsigned char* _img, int* _label, int _width, int _height, int _divide): 35 | width(_width), height(_height), size(_width*_height), divide(_divide), img(_img), label(_label) 36 | {} 37 | 38 | #ifdef THREADED 39 | typedef struct _ThreadArg { 40 | UnionFind *uf; 41 | int ls; 42 | int le; 43 | int d; 44 | } ThreadArg; 45 | 46 | void BuildUnionFindThread(void *ptr) 47 | { 48 | ThreadArg* arg = (ThreadArg* )ptr; 49 | UnionFind *uf = arg->uf; 50 | uf->build(arg->ls, arg->le, arg->d); 51 | pthread_exit(0); 52 | } 53 | #endif 54 | 55 | void UnionFind::build() 56 | { 57 | int nyblocks = (this->height+UF_BLOCK_SIZE_Y-1)/UF_BLOCK_SIZE_Y; 58 | this->build(0, nyblocks-1, this->divide); 59 | 60 | for(int i=0; iwidth*this->height; i++) 61 | this->find(i); 62 | } 63 | 64 | void UnionFind::build(int ls, int le, int d) 65 | { 66 | int rls = ls*UF_BLOCK_SIZE_Y; 67 | int rle = (le+1)*UF_BLOCK_SIZE_Y; 68 | if (rle > this->height) rle = this->height; 69 | 70 | //we have an image with labelled blocks of UF_BLOCK_SIZE_X x UF_BLOCK_SIZE_Y 71 | if (d == 0) 72 | { 73 | //fprintf(stderr, "%d\t%d\n", ls, le); 74 | 75 | //we have to merge blocks 76 | 77 | for(int y=rls; ywidth; 82 | for(int x=UF_BLOCK_SIZE_X; xwidth; x+=UF_BLOCK_SIZE_X) 83 | { 84 | this->merge(offset+x-1, offset+x); 85 | } 86 | } 87 | 88 | 89 | for(int y=rls+UF_BLOCK_SIZE_Y; ywidth; 92 | for(int x=0; xwidth; x++) 93 | { 94 | this->merge((offset-this->width)+x, offset+x); 95 | } 96 | } 97 | 98 | } 99 | else 100 | { 101 | int m = (ls+le)/2; 102 | #ifdef THREADED 103 | pthread_t thread1, thread2; 104 | ThreadArg arg1, arg2; 105 | arg1.uf = this; 106 | arg2.uf = this; 107 | arg1.ls = ls; 108 | arg1.le = m; 109 | arg1.d = d-1; 110 | arg2.ls = m+1; 111 | arg2.le = le; 112 | arg2.d = d-1; 113 | pthread_create(&thread1, NULL, 
(void *(*) (void *))&BuildUnionFindThread, (void *)&arg1); 114 | pthread_create(&thread2, NULL, (void *(*) (void *))&BuildUnionFindThread, (void *)&arg2); 115 | pthread_join(thread1, NULL); 116 | pthread_join(thread2, NULL); 117 | #else 118 | this->build(ls, m, d-1); 119 | this->build(m+1, le, d-1); 120 | #endif 121 | 122 | int b2 = ((m+1)*UF_BLOCK_SIZE_Y)*this->width; 123 | int b1 = b2+this->width; 124 | 125 | for(int x=0; xwidth; x++) 126 | { 127 | this->merge(b1+x,b2+x); 128 | } 129 | 130 | } 131 | } 132 | 133 | int UnionFind::find(int x) 134 | { 135 | int cur = x; 136 | int next = this->label[cur]; 137 | while (next != cur) 138 | { 139 | cur = next; 140 | next = this->label[cur]; 141 | } 142 | 143 | int root = next; 144 | 145 | cur = x; 146 | next = this->label[cur]; 147 | while (next != cur) 148 | { 149 | this->label[cur] = root; 150 | cur = next; 151 | next = this->label[x]; 152 | } 153 | 154 | return root; 155 | } 156 | 157 | void UnionFind::merge(int x, int y) 158 | { 159 | if (F(x) == F(y)) 160 | { 161 | x = this->find(x); 162 | y = this->find(y); 163 | 164 | assert(x == this->label[x]); 165 | assert(y == this->label[y]); 166 | 167 | if (x < y) { 168 | this->label[y] = x; 169 | } else { 170 | this->label[x] = y; 171 | } 172 | } 173 | } 174 | 175 | //CUDA 176 | 177 | __device__ int find(int* buf, int x) { 178 | while (x != buf[x]) { 179 | x = buf[x]; 180 | } 181 | return x; 182 | } 183 | 184 | __device__ void findAndUnion(int* buf, int g1, int g2) { 185 | bool done; 186 | do { 187 | 188 | g1 = find(buf, g1); 189 | g2 = find(buf, g2); 190 | 191 | // it should hold that g1 == buf[g1] and g2 == buf[g2] now 192 | 193 | if (g1 < g2) { 194 | int old = atomicMin(&buf[g2], g1); 195 | done = (old == g2); 196 | g2 = old; 197 | } else if (g2 < g1) { 198 | int old = atomicMin(&buf[g1], g2); 199 | done = (old == g1); 200 | g1 = old; 201 | } else { 202 | done = true; 203 | } 204 | 205 | } while(!done); 206 | } 207 | 208 | __global__ void UF_local(int* label, int w, int 
h) { 209 | int x = blockIdx.x*blockDim.x + threadIdx.x; 210 | int y = blockIdx.y*blockDim.y + threadIdx.y; 211 | int global_index = x+y*w; 212 | int block_index = UF_BLOCK_SIZE_X * threadIdx.y + threadIdx.x; 213 | 214 | __shared__ int s_buffer[UF_BLOCK_SIZE_X * UF_BLOCK_SIZE_Y]; 215 | __shared__ unsigned char s_img[UF_BLOCK_SIZE_X * UF_BLOCK_SIZE_Y]; 216 | 217 | bool in_limits = x < w && y < h; 218 | 219 | s_buffer[block_index] = block_index; 220 | s_img[block_index] = in_limits? tex2D(imgtex, x, y) : 0xFF; 221 | __syncthreads(); 222 | 223 | unsigned char v = s_img[block_index]; 224 | 225 | if (in_limits && threadIdx.x>0 && s_img[block_index-1] == v) { 226 | findAndUnion(s_buffer, block_index, block_index - 1); 227 | } 228 | 229 | __syncthreads(); 230 | 231 | if (in_limits && threadIdx.y>0 && s_img[block_index-UF_BLOCK_SIZE_X] == v) { 232 | findAndUnion(s_buffer, block_index, block_index - UF_BLOCK_SIZE_X); 233 | } 234 | 235 | __syncthreads(); 236 | 237 | if (in_limits) { 238 | int f = find(s_buffer, block_index); 239 | int fx = f % UF_BLOCK_SIZE_X; 240 | int fy = f / UF_BLOCK_SIZE_X; 241 | label[global_index] = (blockIdx.y*UF_BLOCK_SIZE_Y + fy)*w + 242 | (blockIdx.x*blockDim.x + fx); 243 | } 244 | 245 | } 246 | 247 | __global__ void UF_global(int* label, int w, int h) { 248 | int x = blockIdx.x*blockDim.x + threadIdx.x; 249 | int y = blockIdx.y*blockDim.y + threadIdx.y; 250 | int global_index = x+y*w; 251 | 252 | bool in_limits = x < w && y < h; 253 | unsigned char v = (in_limits? 
tex2D(imgtex, x, y) : 0xFF); 254 | 255 | if (in_limits && y>0 && threadIdx.y==0 && tex2D(imgtex, x, y-1) == v) { 256 | findAndUnion(label, global_index, global_index - w); 257 | } 258 | 259 | if (in_limits && x>0 && threadIdx.x==0 && tex2D(imgtex, x-1, y) == v) { 260 | findAndUnion(label, global_index, global_index - 1); 261 | } 262 | 263 | } 264 | 265 | 266 | __global__ void UF_final(int* label, int w, int h) { 267 | int x = blockIdx.x*blockDim.x + threadIdx.x; 268 | int y = blockIdx.y*blockDim.y + threadIdx.y; 269 | int global_index = x+y*w; 270 | 271 | bool in_limits = x < w && y < h; 272 | 273 | if (in_limits) { 274 | label[global_index] = find(label, global_index); 275 | } 276 | } 277 | 278 | 279 | void CCL(unsigned char* img, int w, int h, int* label, bool use_cpu=false) { 280 | cudaError_t err; 281 | cudaArray* imgarray; 282 | cudaChannelFormatDesc uchardesc = 283 | cudaCreateChannelDesc(); 284 | cudaMallocArray(&imgarray, &uchardesc, w, h); 285 | cudaBindTextureToArray(imgtex, imgarray, uchardesc); 286 | 287 | cudaMemcpyToArray(imgarray, 0, 0, img, 288 | w*h*sizeof(unsigned char), 289 | cudaMemcpyHostToDevice); 290 | 291 | int* d_label; 292 | cudaMalloc((void**)&d_label, w*h*sizeof(int)); 293 | 294 | dim3 block (UF_BLOCK_SIZE_X, UF_BLOCK_SIZE_Y); 295 | dim3 grid ((w+UF_BLOCK_SIZE_X-1)/UF_BLOCK_SIZE_X, 296 | (h+UF_BLOCK_SIZE_Y-1)/UF_BLOCK_SIZE_Y); 297 | 298 | err = cudaGetLastError(); 299 | if (err != cudaSuccess) { 300 | printf("startERROR: %s\n", cudaGetErrorString(err)); 301 | return; 302 | } 303 | 304 | cudaThreadSetCacheConfig(cudaFuncCachePreferShared); 305 | 306 | UF_local <<>> 307 | (d_label, w, h); 308 | 309 | if (use_cpu) 310 | { 311 | cudaMemcpy(label, d_label, w*h*sizeof(int), 312 | cudaMemcpyDeviceToHost); 313 | 314 | UnionFind m(img, label, w, h, 0); 315 | m.build(); 316 | } 317 | else 318 | { 319 | cudaThreadSetCacheConfig(cudaFuncCachePreferL1); 320 | 321 | UF_global <<>> 322 | (d_label, w, h); 323 | 324 | UF_final <<>> 325 | (d_label, w, h); 
326 | 327 | cudaMemcpy(label, d_label, w*h*sizeof(int), 328 | cudaMemcpyDeviceToHost); 329 | } 330 | 331 | cudaFree(d_label); 332 | cudaFreeArray(imgarray); 333 | 334 | err = cudaGetLastError(); 335 | if (err != cudaSuccess) { 336 | printf("endERROR: %s\n", cudaGetErrorString(err)); 337 | return; 338 | } 339 | } 340 | 341 | } 342 | -------------------------------------------------------------------------------- /results/tesla-c2050.txt: -------------------------------------------------------------------------------- 1 | 78004 2 | cc: 67 3 | gold: 943.566000 4 | uf: 163.940000 5 | uf_hybrid: 325.304000 6 | lequiv: 340.237000 7 | stephano: 1151.246071 8 | 9 | 167083 10 | cc: 125 11 | gold: 955.980000 12 | uf: 162.636000 13 | uf_hybrid: 325.016000 14 | lequiv: 215.041000 15 | stephano: 1068.804026 16 | 17 | 106024 18 | cc: 15 19 | gold: 951.461000 20 | uf: 162.730000 21 | uf_hybrid: 328.186000 22 | lequiv: 233.502000 23 | stephano: 1517.715931 24 | 25 | 33039 26 | cc: 192 27 | gold: 962.669000 28 | uf: 160.584000 29 | uf_hybrid: 327.118000 30 | lequiv: 217.126000 31 | stephano: 1135.164022 32 | 33 | 65033 34 | cc: 82 35 | gold: 950.218000 36 | uf: 162.060000 37 | uf_hybrid: 329.303000 38 | lequiv: 246.124000 39 | stephano: 860.104084 40 | 41 | 260058 42 | cc: 16 43 | gold: 944.598000 44 | uf: 162.652000 45 | uf_hybrid: 326.439000 46 | lequiv: 232.798000 47 | stephano: 1699.454069 48 | 49 | 58060 50 | cc: 114 51 | gold: 950.029000 52 | uf: 162.319000 53 | uf_hybrid: 325.738000 54 | lequiv: 233.675000 55 | stephano: 786.159039 56 | 57 | 119082 58 | cc: 31 59 | gold: 947.006000 60 | uf: 164.063000 61 | uf_hybrid: 326.743000 62 | lequiv: 395.292000 63 | stephano: 1152.688026 64 | 65 | 253055 66 | cc: 37 67 | gold: 949.946000 68 | uf: 163.562000 69 | uf_hybrid: 327.960000 70 | lequiv: 196.683000 71 | stephano: 1499.053001 72 | 73 | 14037 74 | cc: 14 75 | gold: 950.160000 76 | uf: 163.249000 77 | uf_hybrid: 327.209000 78 | lequiv: 245.189000 79 | stephano: 831.691027 80 | 
81 | 302008 82 | cc: 45 83 | gold: 946.159000 84 | uf: 163.654000 85 | uf_hybrid: 327.019000 86 | lequiv: 213.263000 87 | stephano: 898.051977 88 | 89 | 385039 90 | cc: 57 91 | gold: 952.209000 92 | uf: 163.822000 93 | uf_hybrid: 328.530000 94 | lequiv: 209.297000 95 | stephano: 1024.386168 96 | 97 | 241004 98 | cc: 37 99 | gold: 959.062000 100 | uf: 164.201000 101 | uf_hybrid: 328.452000 102 | lequiv: 196.701000 103 | stephano: 1220.497131 104 | 105 | 376043 106 | cc: 157 107 | gold: 960.424000 108 | uf: 163.274000 109 | uf_hybrid: 325.097000 110 | lequiv: 202.894000 111 | stephano: 1198.030949 112 | 113 | 147091 114 | cc: 35 115 | gold: 958.567000 116 | uf: 164.490000 117 | uf_hybrid: 331.042000 118 | lequiv: 209.253000 119 | stephano: 1252.583981 120 | 121 | 85048 122 | cc: 93 123 | gold: 963.897000 124 | uf: 163.344000 125 | uf_hybrid: 328.892000 126 | lequiv: 259.602000 127 | stephano: 1328.937054 128 | 129 | 148026 130 | cc: 172 131 | gold: 944.071000 132 | uf: 163.833000 133 | uf_hybrid: 325.916000 134 | lequiv: 203.718000 135 | stephano: 1140.581131 136 | 137 | 304074 138 | cc: 72 139 | gold: 961.643000 140 | uf: 164.418000 141 | uf_hybrid: 327.996000 142 | lequiv: 177.227000 143 | stephano: 1141.694069 144 | 145 | 42049 146 | cc: 28 147 | gold: 948.245000 148 | uf: 164.784000 149 | uf_hybrid: 325.521000 150 | lequiv: 233.854000 151 | stephano: 1675.723076 152 | 153 | 62096 154 | cc: 28 155 | gold: 950.801000 156 | uf: 164.461000 157 | uf_hybrid: 327.874000 158 | lequiv: 258.596000 159 | stephano: 1278.414011 160 | 161 | 97033 162 | cc: 40 163 | gold: 949.132000 164 | uf: 164.553000 165 | uf_hybrid: 326.565000 166 | lequiv: 247.061000 167 | stephano: 1269.778013 168 | 169 | 105025 170 | cc: 45 171 | gold: 951.425000 172 | uf: 163.986000 173 | uf_hybrid: 327.886000 174 | lequiv: 172.041000 175 | stephano: 1688.418865 176 | 177 | 210088 178 | cc: 53 179 | gold: 955.043000 180 | uf: 163.455000 181 | uf_hybrid: 327.147000 182 | lequiv: 176.308000 183 | 
stephano: 941.483021 184 | 185 | 295087 186 | cc: 38 187 | gold: 953.501000 188 | uf: 164.409000 189 | uf_hybrid: 329.632000 190 | lequiv: 234.180000 191 | stephano: 825.227022 192 | 193 | 208001 194 | cc: 70 195 | gold: 943.916000 196 | uf: 164.045000 197 | uf_hybrid: 325.316000 198 | lequiv: 163.517000 199 | stephano: 855.585814 200 | 201 | 167062 202 | cc: 9 203 | gold: 945.607000 204 | uf: 165.520000 205 | uf_hybrid: 327.209000 206 | lequiv: 234.014000 207 | stephano: 1125.083923 208 | 209 | 108070 210 | cc: 88 211 | gold: 957.546000 212 | uf: 163.415000 213 | uf_hybrid: 329.522000 214 | lequiv: 184.498000 215 | stephano: 767.425060 216 | 217 | 38092 218 | cc: 45 219 | gold: 955.654000 220 | uf: 163.456000 221 | uf_hybrid: 326.853000 222 | lequiv: 234.495000 223 | stephano: 1480.608940 224 | 225 | 134035 226 | cc: 253 227 | gold: 960.771000 228 | uf: 162.207000 229 | uf_hybrid: 328.068000 230 | lequiv: 135.186000 231 | stephano: 1019.999981 232 | 233 | 101085 234 | cc: 76 235 | gold: 951.126000 236 | uf: 163.405000 237 | uf_hybrid: 327.417000 238 | lequiv: 214.167000 239 | stephano: 1026.417017 240 | 241 | 220075 242 | cc: 63 243 | gold: 958.167000 244 | uf: 163.751000 245 | uf_hybrid: 328.081000 246 | lequiv: 222.174000 247 | stephano: 918.323994 248 | 249 | 86000 250 | cc: 109 251 | gold: 956.960000 252 | uf: 162.647000 253 | uf_hybrid: 329.464000 254 | lequiv: 239.376000 255 | stephano: 899.216175 256 | 257 | 156065 258 | cc: 87 259 | gold: 959.200000 260 | uf: 163.882000 261 | uf_hybrid: 330.911000 262 | lequiv: 259.402000 263 | stephano: 955.286026 264 | 265 | 236037 266 | cc: 218 267 | gold: 961.869000 268 | uf: 162.441000 269 | uf_hybrid: 325.627000 270 | lequiv: 222.804000 271 | stephano: 983.510971 272 | 273 | 229036 274 | cc: 151 275 | gold: 955.883000 276 | uf: 162.785000 277 | uf_hybrid: 329.438000 278 | lequiv: 209.932000 279 | stephano: 1085.268974 280 | 281 | 12084 282 | cc: 201 283 | gold: 952.488000 284 | uf: 161.487000 285 | uf_hybrid: 
327.736000 286 | lequiv: 222.655000 287 | stephano: 829.557896 288 | 289 | 351093 290 | cc: 88 291 | gold: 954.882000 292 | uf: 163.843000 293 | uf_hybrid: 324.560000 294 | lequiv: 842.052000 295 | stephano: 1164.966822 296 | 297 | 145086 298 | cc: 72 299 | gold: 948.261000 300 | uf: 162.931000 301 | uf_hybrid: 329.146000 302 | lequiv: 184.832000 303 | stephano: 1119.440079 304 | 305 | 304034 306 | cc: 149 307 | gold: 955.314000 308 | uf: 162.655000 309 | uf_hybrid: 328.604000 310 | lequiv: 408.625000 311 | stephano: 931.329012 312 | 313 | 361010 314 | cc: 38 315 | gold: 951.489000 316 | uf: 164.109000 317 | uf_hybrid: 330.266000 318 | lequiv: 171.575000 319 | stephano: 707.834959 320 | 321 | 101087 322 | cc: 30 323 | gold: 959.529000 324 | uf: 165.148000 325 | uf_hybrid: 327.509000 326 | lequiv: 226.459000 327 | stephano: 1308.543921 328 | 329 | 8023 330 | cc: 65 331 | gold: 951.221000 332 | uf: 163.804000 333 | uf_hybrid: 328.824000 334 | lequiv: 197.444000 335 | stephano: 1136.039019 336 | 337 | 76053 338 | cc: 88 339 | gold: 960.103000 340 | uf: 163.246000 341 | uf_hybrid: 325.115000 342 | lequiv: 234.471000 343 | stephano: 1036.014080 344 | 345 | 157055 346 | cc: 81 347 | gold: 974.806000 348 | uf: 164.768000 349 | uf_hybrid: 328.794000 350 | lequiv: 246.777000 351 | stephano: 1504.856110 352 | 353 | 87046 354 | cc: 117 355 | gold: 954.128000 356 | uf: 163.967000 357 | uf_hybrid: 325.644000 358 | lequiv: 172.374000 359 | stephano: 1305.327892 360 | 361 | 126007 362 | cc: 56 363 | gold: 943.805000 364 | uf: 163.317000 365 | uf_hybrid: 329.469000 366 | lequiv: 234.188000 367 | stephano: 777.559042 368 | 369 | 216081 370 | cc: 53 371 | gold: 956.302000 372 | uf: 163.677000 373 | uf_hybrid: 330.502000 374 | lequiv: 221.538000 375 | stephano: 905.107975 376 | 377 | 296007 378 | cc: 53 379 | gold: 957.376000 380 | uf: 164.371000 381 | uf_hybrid: 328.790000 382 | lequiv: 196.996000 383 | stephano: 1418.195963 384 | 385 | 123074 386 | cc: 63 387 | gold: 955.869000 388 
| uf: 164.388000 389 | uf_hybrid: 327.625000 390 | lequiv: 196.918000 391 | stephano: 1199.217796 392 | 393 | 143090 394 | cc: 7 395 | gold: 948.463000 396 | uf: 165.315000 397 | uf_hybrid: 325.698000 398 | lequiv: 220.979000 399 | stephano: 885.404110 400 | 401 | 89072 402 | cc: 86 403 | gold: 951.201000 404 | uf: 163.839000 405 | uf_hybrid: 328.629000 406 | lequiv: 227.612000 407 | stephano: 1144.986153 408 | 409 | 41033 410 | cc: 23 411 | gold: 949.466000 412 | uf: 164.532000 413 | uf_hybrid: 330.185000 414 | lequiv: 196.695000 415 | stephano: 1053.777933 416 | 417 | 196073 418 | cc: 92 419 | gold: 963.036000 420 | uf: 164.259000 421 | uf_hybrid: 329.168000 422 | lequiv: 234.690000 423 | stephano: 1653.605938 424 | 425 | 299086 426 | cc: 53 427 | gold: 948.071000 428 | uf: 163.368000 429 | uf_hybrid: 328.020000 430 | lequiv: 234.336000 431 | stephano: 825.743914 432 | 433 | 300091 434 | cc: 15 435 | gold: 953.557000 436 | uf: 165.478000 437 | uf_hybrid: 330.239000 438 | lequiv: 208.961000 439 | stephano: 846.010923 440 | 441 | 42012 442 | cc: 33 443 | gold: 951.435000 444 | uf: 164.779000 445 | uf_hybrid: 324.271000 446 | lequiv: 238.702000 447 | stephano: 966.609001 448 | 449 | 175032 450 | cc: 273 451 | gold: 948.680000 452 | uf: 161.034000 453 | uf_hybrid: 326.897000 454 | lequiv: 253.900000 455 | stephano: 938.195944 456 | 457 | 38082 458 | cc: 64 459 | gold: 952.194000 460 | uf: 162.897000 461 | uf_hybrid: 328.198000 462 | lequiv: 221.912000 463 | stephano: 801.743031 464 | 465 | 86016 466 | cc: 42 467 | gold: 945.944000 468 | uf: 165.367000 469 | uf_hybrid: 328.920000 470 | lequiv: 209.088000 471 | stephano: 1678.879023 472 | 473 | 21077 474 | cc: 82 475 | gold: 961.858000 476 | uf: 163.709000 477 | uf_hybrid: 330.224000 478 | lequiv: 234.710000 479 | stephano: 1086.477041 480 | 481 | 241048 482 | cc: 67 483 | gold: 951.374000 484 | uf: 163.794000 485 | uf_hybrid: 327.585000 486 | lequiv: 259.466000 487 | stephano: 1548.092842 488 | 489 | 271035 490 | cc: 
78 491 | gold: 962.012000 492 | uf: 163.740000 493 | uf_hybrid: 328.556000 494 | lequiv: 290.620000 495 | stephano: 1325.858116 496 | 497 | 160068 498 | cc: 64 499 | gold: 955.786000 500 | uf: 162.492000 501 | uf_hybrid: 332.868000 502 | lequiv: 222.039000 503 | stephano: 953.741074 504 | 505 | 108005 506 | cc: 77 507 | gold: 956.979000 508 | uf: 163.648000 509 | uf_hybrid: 328.930000 510 | lequiv: 221.295000 511 | stephano: 788.589001 512 | 513 | 45096 514 | cc: 9 515 | gold: 947.925000 516 | uf: 164.975000 517 | uf_hybrid: 325.652000 518 | lequiv: 220.648000 519 | stephano: 733.196974 520 | 521 | 86068 522 | cc: 46 523 | gold: 969.614000 524 | uf: 163.164000 525 | uf_hybrid: 330.907000 526 | lequiv: 234.433000 527 | stephano: 1406.522036 528 | 529 | 296059 530 | cc: 29 531 | gold: 947.218000 532 | uf: 164.381000 533 | uf_hybrid: 325.026000 534 | lequiv: 270.598000 535 | stephano: 1042.436123 536 | 537 | 43074 538 | cc: 28 539 | gold: 940.646000 540 | uf: 164.095000 541 | uf_hybrid: 327.491000 542 | lequiv: 307.525000 543 | stephano: 672.955990 544 | 545 | 3096 546 | cc: 15 547 | gold: 951.716000 548 | uf: 164.347000 549 | uf_hybrid: 326.315000 550 | lequiv: 196.843000 551 | stephano: 1103.888988 552 | 553 | 19021 554 | cc: 56 555 | gold: 945.871000 556 | uf: 163.986000 557 | uf_hybrid: 328.947000 558 | lequiv: 209.136000 559 | stephano: 810.664892 560 | 561 | 69040 562 | cc: 108 563 | gold: 949.348000 564 | uf: 163.694000 565 | uf_hybrid: 325.594000 566 | lequiv: 221.895000 567 | stephano: 726.333857 568 | 569 | 108082 570 | cc: 44 571 | gold: 949.922000 572 | uf: 162.710000 573 | uf_hybrid: 327.608000 574 | lequiv: 196.721000 575 | stephano: 784.332991 576 | 577 | 189080 578 | cc: 13 579 | gold: 958.231000 580 | uf: 164.353000 581 | uf_hybrid: 325.203000 582 | lequiv: 226.170000 583 | stephano: 1242.074966 584 | 585 | 55073 586 | cc: 105 587 | gold: 949.159000 588 | uf: 163.243000 589 | uf_hybrid: 326.269000 590 | lequiv: 176.264000 591 | stephano: 786.144018 
592 | 593 | 109053 594 | cc: 82 595 | gold: 950.105000 596 | uf: 163.500000 597 | uf_hybrid: 327.246000 598 | lequiv: 197.172000 599 | stephano: 870.304823 600 | 601 | 54082 602 | cc: 28 603 | gold: 950.148000 604 | uf: 162.647000 605 | uf_hybrid: 325.669000 606 | lequiv: 238.874000 607 | stephano: 884.322882 608 | 609 | 16077 610 | cc: 55 611 | gold: 958.133000 612 | uf: 163.454000 613 | uf_hybrid: 324.203000 614 | lequiv: 159.596000 615 | stephano: 1090.094090 616 | 617 | 197017 618 | cc: 19 619 | gold: 959.217000 620 | uf: 163.789000 621 | uf_hybrid: 328.033000 622 | lequiv: 220.977000 623 | stephano: 1169.651985 624 | 625 | 170057 626 | cc: 75 627 | gold: 959.231000 628 | uf: 163.018000 629 | uf_hybrid: 330.142000 630 | lequiv: 234.618000 631 | stephano: 1209.020853 632 | 633 | 69015 634 | cc: 36 635 | gold: 941.758000 636 | uf: 163.927000 637 | uf_hybrid: 327.524000 638 | lequiv: 238.759000 639 | stephano: 729.502201 640 | 641 | 182053 642 | cc: 79 643 | gold: 953.155000 644 | uf: 165.018000 645 | uf_hybrid: 325.861000 646 | lequiv: 258.969000 647 | stephano: 1474.170923 648 | 649 | 219090 650 | cc: 47 651 | gold: 958.289000 652 | uf: 163.399000 653 | uf_hybrid: 325.888000 654 | lequiv: 209.028000 655 | stephano: 1128.741026 656 | 657 | 285079 658 | cc: 76 659 | gold: 953.862000 660 | uf: 163.213000 661 | uf_hybrid: 326.208000 662 | lequiv: 213.589000 663 | stephano: 838.757038 664 | 665 | 306005 666 | cc: 62 667 | gold: 961.915000 668 | uf: 164.194000 669 | uf_hybrid: 328.888000 670 | lequiv: 209.142000 671 | stephano: 942.543030 672 | 673 | 103070 674 | cc: 55 675 | gold: 955.420000 676 | uf: 163.172000 677 | uf_hybrid: 327.091000 678 | lequiv: 183.758000 679 | stephano: 829.061985 680 | 681 | 291000 682 | cc: 174 683 | gold: 964.009000 684 | uf: 163.723000 685 | uf_hybrid: 326.844000 686 | lequiv: 160.046000 687 | stephano: 1546.474934 688 | 689 | 253027 690 | cc: 135 691 | gold: 954.060000 692 | uf: 161.839000 693 | uf_hybrid: 327.811000 694 | lequiv: 
247.543000 695 | stephano: 949.643850 696 | 697 | 24077 698 | cc: 59 699 | gold: 957.488000 700 | uf: 163.399000 701 | uf_hybrid: 327.042000 702 | lequiv: 271.868000 703 | stephano: 1190.967083 704 | 705 | 66053 706 | cc: 15 707 | gold: 951.275000 708 | uf: 164.938000 709 | uf_hybrid: 329.546000 710 | lequiv: 245.592000 711 | stephano: 804.369926 712 | 713 | 163085 714 | cc: 55 715 | gold: 943.163000 716 | uf: 164.239000 717 | uf_hybrid: 325.461000 718 | lequiv: 221.205000 719 | stephano: 774.659872 720 | 721 | 159008 722 | cc: 73 723 | gold: 949.389000 724 | uf: 162.794000 725 | uf_hybrid: 326.861000 726 | lequiv: 159.724000 727 | stephano: 1046.375036 728 | 729 | 102061 730 | cc: 54 731 | gold: 949.991000 732 | uf: 162.843000 733 | uf_hybrid: 326.611000 734 | lequiv: 252.244000 735 | stephano: 1110.358953 736 | 737 | 227092 738 | cc: 31 739 | gold: 945.329000 740 | uf: 163.445000 741 | uf_hybrid: 322.935000 742 | lequiv: 378.037000 743 | stephano: 1087.686777 744 | 745 | 175043 746 | cc: 244 747 | gold: 962.684000 748 | uf: 162.074000 749 | uf_hybrid: 328.718000 750 | lequiv: 222.501000 751 | stephano: 994.480133 752 | 753 | 37073 754 | cc: 16 755 | gold: 951.255000 756 | uf: 164.473000 757 | uf_hybrid: 329.348000 758 | lequiv: 233.602000 759 | stephano: 677.886963 760 | 761 | 130026 762 | cc: 96 763 | gold: 955.693000 764 | uf: 165.398000 765 | uf_hybrid: 329.638000 766 | lequiv: 160.090000 767 | stephano: 974.393129 768 | 769 | 148089 770 | cc: 121 771 | gold: 955.507000 772 | uf: 163.030000 773 | uf_hybrid: 329.699000 774 | lequiv: 210.319000 775 | stephano: 1021.447897 776 | 777 | 69020 778 | cc: 43 779 | gold: 949.456000 780 | uf: 163.689000 781 | uf_hybrid: 328.229000 782 | lequiv: 209.310000 783 | stephano: 830.579996 784 | 785 | 223061 786 | cc: 131 787 | gold: 950.201000 788 | uf: 164.527000 789 | uf_hybrid: 328.091000 790 | lequiv: 209.688000 791 | stephano: 1327.404976 792 | 793 | 41069 794 | cc: 123 795 | gold: 953.016000 796 | uf: 162.815000 797 | 
uf_hybrid: 326.030000 798 | lequiv: 222.345000 799 | stephano: 1419.590950 800 | 801 | == Finals Results == 802 | 803 | [EXECUTION TIME] 804 | union-find (gpu) 805 | mean:163.63 806 | std:0.93 807 | max:165.52 808 | min:160.58 809 | union-find (gpu+cpu) 810 | mean:327.64 811 | std:1.80 812 | max:332.87 813 | min:322.94 814 | Label Equivalence 815 | mean:231.30 816 | std:75.13 817 | max:842.05 818 | min:135.19 819 | Stephano 820 | mean:1077.75 821 | std:258.60 822 | max:1699.45 823 | min:672.96 824 | [SPEEDUP] 825 | union-find (gpu) 826 | mean:6.59 827 | std:1.57 828 | max:10.45 829 | min:4.10 830 | union-find (gpu+cpu) 831 | mean:3.29 832 | std:0.79 833 | max:5.21 834 | min:2.05 835 | Label Equivalence 836 | mean:4.90 837 | std:1.50 838 | max:9.81 839 | min:1.38 840 | --------------------------------------------------------------------------------