├── README.md
├── benchmark.sh
├── images
└── C-SAW1_modified.png
├── non-stream
├── Makefile
├── WG
│ ├── beg.bin
│ └── csr.bin
├── api.cuh
├── functions.cuh
├── gpu_graph.cuh
├── graph.h
├── graph.hpp
├── header.h
├── herror.h
├── main.cu
├── mpi_main.cpp
├── run.sh
├── sample_class.cuh
├── sampler.cuh
├── util.h
└── wtime.h
└── streaming
├── Makefile
├── README.md
├── WG
├── beg.bin
└── csr.bin
├── gpu_graph.cuh
├── graph.h
├── graph.hpp
├── header.h
├── herror.h
├── mpi_main.cpp
├── run.sh
├── sample_class.cuh
├── sampler.cuh
├── streaming_sampling.cu
├── util.h
└── wtime.h
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ![C-SAW overview](images/C-SAW1_modified.png)
3 |
7 | #### C-SAW: A Framework for Graph Sampling and Random Walk on GPUs
8 | ---
9 | C-SAW is a GPU-based framework for implementing variants of graph sampling and random walk algorithms.
10 |
11 | This repo contains two folders: `streaming` holds the streaming sampler for graphs larger than GPU memory, and `non-stream` holds the non-streaming sampler for graphs that fit in GPU memory.
12 |
13 |
14 | C-SAW samples graphs stored in CSR format. The web-Google dataset is included in the repo as an example (the `WG` folder). Adjacency lists for most datasets are available here:
15 | http://snap.stanford.edu/data/index.html
16 |
17 | The adjacency list can be converted into CSR using this library:
18 | https://github.com/asherliu/graph_project_start
19 |
20 |
21 |
22 | Generate the CSR files and place the dataset folder in the main directory of both the non-streaming and streaming samplers.
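
For reference, here is a minimal sketch of the CSR layout the loader in graph.hpp expects: `beg.bin` holds one offset per vertex plus a sentinel, and `csr.bin` holds the concatenated adjacency lists. The toy graph and the `long` element type are illustrative assumptions:

```
// Toy graph: 0 -> {1,2}, 1 -> {2}, 2 -> {0}
long beg_pos[]  = {0, 2, 3, 4};  // contents of beg.bin (vert_count+1 entries)
long adj_list[] = {1, 2, 2, 0};  // contents of csr.bin (edge_count entries)
// Neighbors of vertex v are adj_list[beg_pos[v] .. beg_pos[v+1]),
// so degree(v) = beg_pos[v+1] - beg_pos[v], e.g. degree(0) = 2.
```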
23 |
24 | To run:
25 |
26 | Step 1: Define the required API in api.cuh inside the non-stream folder.
27 |
28 | Step 2: Go to the streaming or non-stream folder and run `make`.
29 |
30 | Step 3: Update the dataset name in the run.sh file.
31 |
32 | Step 4: `./run.sh <# of samples> <frontier size> <neighbor size> <depth> <# of GPUs>` (run.sh forwards these five positional arguments to sampling.bin)
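
For example, `./run.sh 4000 2 32 3 1` requests 4000 samples with a frontier size of 2, 32 neighbors per step, depth 3, on one GPU (illustrative values; pick parameters that match your algorithm).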
33 |
34 | To change the depth of the sampling or the length of the random walk, update DEPTH_LIMIT within the Sampling class in sample_class.cuh in the non-stream folder. You can also change the memory allocation and other parameters within the Sampling class.
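
The relevant members of the `Sampling` class in sample_class.cuh look like this (abridged from this repo):

```
class Sampling{
  public:
    int DEPTH_LIMIT;    // sampling depth / random-walk length (set from the depth argument)
    int BUCKETS=32;     // hash-table buckets used for deduplication
    int max_NL=90000;   // per-warp neighbor buffer; update this value for dynamic allocation
    // ...
};
```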
35 |
36 | The sampled graph is stored as an edge list in GPU memory, in the per-instance class variable Si defined in sample_class.cuh. The output format is:
37 | ```
38 | Edges sampled, dataset name, min-time, max-time
39 | ```
40 |
41 | `min-time` and `max-time` are the same on a single GPU. SEPS (sampled edges per second) can be computed as `Edges sampled/max-time`.
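
For example, if 2,000,000 edges are sampled and `max-time` is 0.5 seconds, SEPS = 2,000,000 / 0.5 = 4,000,000 (illustrative numbers).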
42 |
43 |
44 | For more details, please refer to our [paper](https://arxiv.org/abs/2009.09103).
45 |
46 | Citation:
47 |
48 | ```
49 | @INPROCEEDINGS {,
50 | author = {S. Pandey and L. Li and A. Hoisie and X. Li and H. Liu},
51 | booktitle = {2020 SC20: International Conference for High Performance Computing, Networking, Storage and Analysis (SC)},
52 | title = {C-SAW: A Framework for Graph Sampling and Random Walk on GPUs},
53 | year = {2020},
54 | volume = {},
55 | issn = {},
56 | pages = {780-794},
57 | keywords = {},
58 | doi = {10.1109/SC41405.2020.00060},
59 | url = {https://doi.ieeecomputersociety.org/10.1109/SC41405.2020.00060},
60 | publisher = {IEEE Computer Society},
61 | address = {Los Alamitos, CA, USA},
62 | month = {nov}
63 | }
64 | ```
65 |
--------------------------------------------------------------------------------
/benchmark.sh:
--------------------------------------------------------------------------------
1 | echo "FF"
2 |
3 | echo " ITS Re-"
4 | for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do
5 | ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 1 1 1 0
6 | done
7 |
8 | echo "select-Baseline"
9 | for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do
10 | ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 0 1 1 0
11 | done
12 |
13 |
14 | echo " Normalize"
15 | for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do
16 | ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 2 1 1 0
17 | done
18 |
19 | # echo "Normalize + bitmap"
20 | # for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do
21 | # ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 1 1 0 0 0
22 | # done
23 |
24 | # echo "hash"
25 | # for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do
26 | # ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 1 1 1 0
27 | # done
28 |
29 | # echo "hash+cache"
30 | # for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do
31 | # ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 1 1 1 1
32 | # done
33 |
34 | # echo "combined"
35 | # for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do
36 | # echo $d
37 | # ./baseline.bin $d/beg.bin $d/csr.bin 100 32 2000
38 | # done
39 |
40 | # echo "baseline"
41 | # echo $d
42 | # for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do
43 | # ./combined.bin $d/beg.bin $d/csr.bin 100 32 2000
44 | # done
--------------------------------------------------------------------------------
/images/C-SAW1_modified.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/concept-inversion/C-SAW/d91ff3ac896a90a3ea7b71e9251d5e79f67f8c6c/images/C-SAW1_modified.png
--------------------------------------------------------------------------------
/non-stream/Makefile:
--------------------------------------------------------------------------------
1 | exe=sampling.bin
2 | N=1
3 | d=WG
4 | cucc= "$(shell which nvcc)"
5 | cc= "$(shell which mpicxx)"
6 | commflags=-lcudart -L"$(shell dirname $(cucc))"/../lib64
7 | cuflags= --compiler-options -v -Xcudafe -\# --resource-usage
8 | cuflags+= -std=c++11
9 | objs = $(patsubst %.cu,%.o,$(wildcard *.cu)) \
10 | $(patsubst %.cpp,%.o,$(wildcard *.cpp))
11 |
12 | deps = $(wildcard ./*.cuh) \
13 | $(wildcard ./*.hpp) \
14 | $(wildcard ./*.h) \
15 |
16 |
17 | %.o:%.cu $(deps)
18 | $(cucc) -c $(cuflags) $< -o $@
19 |
20 | %.o:%.cpp $(deps)
21 | $(cc) -c $< -o $@
22 |
23 | $(exe):$(objs)
24 | $(cc) $(objs) $(commflags) -O3 -o $(exe)
25 |
26 |
27 | test:$(exe)
28 | #Multidimensional random walk
29 | mpirun -n $(N) $(exe) $(d) $(d)/beg.bin $(d)/csr.bin 315 32 4000 2000 1 2000 1
30 |
31 | clean:
32 | rm -rf *.o ${exe}
33 |
--------------------------------------------------------------------------------
/non-stream/WG/beg.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/concept-inversion/C-SAW/d91ff3ac896a90a3ea7b71e9251d5e79f67f8c6c/non-stream/WG/beg.bin
--------------------------------------------------------------------------------
/non-stream/WG/csr.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/concept-inversion/C-SAW/d91ff3ac896a90a3ea7b71e9251d5e79f67f8c6c/non-stream/WG/csr.bin
--------------------------------------------------------------------------------
/non-stream/api.cuh:
--------------------------------------------------------------------------------
1 | #ifndef API_H
2 | #define API_H
3 |
4 | __device__ int
5 | VertexBias(int vertexID, gpu_graph *graph)
6 | {
7 | // For multi-dimensional random walk (MDRW): bias frontier selection by degree
8 | return graph->degree_list[vertexID];
9 | // For other applications, use a uniform vertex bias instead:
10 | // return 1;
11 | }
12 |
13 | __device__ int
14 | EdgeBias(int vertexID, gpu_graph *graph)
15 | {
16 | // For biased neighbor sampling (BNS) and layer sampling (LS): degree bias
17 | return graph->degree_list[vertexID];
18 | // For biased random walk (BRW), use a uniform edge bias:
19 | // return 1;
20 | }
21 |
22 |
23 | __device__ int
24 | Update(gpu_graph *G, int selected, int source)
25 | {
26 | return selected;
27 | }
28 |
29 |
30 | #endif
31 |
--------------------------------------------------------------------------------
/non-stream/functions.cuh:
--------------------------------------------------------------------------------
1 | #ifndef FNC
2 | #define FNC
3 | #include "herror.h"
4 | #include "header.h"
5 | #include <stdio.h> // bracketed headers on lines 5-7 were stripped in extraction; restored
6 | #include <curand.h>
7 | #include <curand_kernel.h>
8 | #include "sampler.cuh"
9 | #include "api.cuh"
10 | #define profil
11 | __device__
12 | int binary_search(int start,int end,float value, float *arr)
13 | {
14 | //printf("low:%d,high:%d,value:%f\n",start,end,value);
15 | int low=start;
16 | int high=end;
17 | int index=start;
18 |
19 | // atomicAdd(&counter[0],1);
20 | while (low<=high)
21 | {
22 | index=((low+high)/2);
23 | if (value<arr[index])
24 | {
25 | // set high to index-1
26 | high = index-1;
27 | }
28 | else if (value>arr[index])
30 | {
31 | // set low to index+1
32 | low = index+1;
33 | //printf("low:%d\n",low);
34 |
35 | }
36 | else
37 | {
38 | break;
39 | }
40 |
41 | }
42 | return index;
43 | }
44 |
45 | __device__
46 | int bitmap_search(int *bitmap, int bitmap_start, int index)
47 | {
48 | int bitmap_width=32;
49 | int bitmap_pos= index;
50 | // #ifdef not_reversed
51 | int bit_block_index = bitmap_pos / bitmap_width; // find the address of bitmap
52 | int bit_block_pos= bitmap_pos % bitmap_width; // position within a address
53 | // #endif
54 | // reversed------------
55 |
56 | //#ifdef reversed
57 | // int bit_block_pos = bitmap_pos / bitmap_width;
58 | // int bit_block_index= bitmap_pos % bitmap_width;
59 | //#endif
60 |
61 | int initial_mask=1;
62 | int mask = (initial_mask << bit_block_pos);
63 | int status=atomicOr(&bitmap[bit_block_index+bitmap_start],mask);
64 | // int status=mask;
65 | int is_in= (mask & status) >> bit_block_pos;
66 | if(is_in!=0){is_in=1;}
67 | //is_in= 0x00000001 & (status >> bit_block_pos);
68 | //printf("thread: %d, index:%d, bit_block_index:%d, bit_block_pos:%d, mask:%d, status: %d,shift: %d, is_in:%d\n",threadIdx.x,index,bit_block_index,bit_block_pos,mask,status,(mask & status),is_in);
69 | return is_in;
70 | }
71 |
72 | __device__
73 | int linear_search(int *bitmap, int bitmap_start, int index)
74 | {
75 |
76 | int warpTID=threadIdx.x%32;
77 | int pos= warpTID;
78 | int temp_status= 0;
79 | while(pos<256){
80 | if (bitmap[index]==1)
81 | {
82 | temp_status=1;
83 | }
84 | pos+=warpSize;
85 | }
86 |
87 | int bitmap_width=32;
88 | int bitmap_pos= index;
89 | // #ifdef not_reversed
90 | int bit_block_index = bitmap_pos / bitmap_width; // find the address of bitmap
91 | int bit_block_pos= bitmap_pos % bitmap_width; // position within a address
92 | // #endif
93 | // reversed------------
94 |
95 | //#ifdef reversed
96 | // int bit_block_pos = bitmap_pos / bitmap_width;
97 | // int bit_block_index= bitmap_pos % bitmap_width;
98 | //#endif
99 |
100 | int initial_mask=1;
101 | int mask = (initial_mask << bit_block_pos);
102 | int status=atomicOr(&bitmap[bit_block_index+bitmap_start],mask);
103 | // int status=mask;
104 | int is_in= (mask & status) >> bit_block_pos;
105 | if(is_in!=0){is_in=1;}
106 | //is_in= 0x00000001 & (status >> bit_block_pos);
107 | //printf("thread: %d, index:%d, bit_block_index:%d, bit_block_pos:%d, mask:%d, status: %d,shift: %d, is_in:%d\n",threadIdx.x,index,bit_block_index,bit_block_pos,mask,status,(mask & status),is_in);
108 | return is_in;
109 | }
110 |
111 |
112 | __device__
113 | void gpu_prefix(int total_step,int warp_tid,float *degree_l, int offset_d_n, int warpsize, int len)
114 | {
115 | warpsize=32;
116 | for (int i=0; i< total_step; i++)
117 | {
118 | // Loop the threads
119 | int req_thread = len/(powf(2,(i+1)));
120 | for (int iid= warp_tid; iid<=req_thread; iid+=warpsize)
121 | {
122 |
123 | int tid_offset = iid*powf(2,i+1);
124 | // calculate the index
125 | int i1= (tid_offset) +(powf(2,i))-1+offset_d_n;
126 | int i2= (tid_offset) +powf(2,i+1)-1+offset_d_n;
127 | if(i1> (offset_d_n+len-1)){break; }
128 | //printf("i:%d, Index1 %d: %f,Index2 %d: %f, thread:%d\n",i,i1,degree_l[i1],i2,degree_l[i2],threadIdx.x);
129 | // load the values to shared mem
130 | int temp1= degree_l[i1];
131 | int temp2= degree_l[i2];
132 | degree_l[i2] = temp2+ temp1;
133 | //printf("Index:%d, Value:%d \n",i2,temp[i2]);
134 | }
135 | }
136 | // __syncthreads();
137 | degree_l[len-1+offset_d_n]=0;
138 | //printf("\nDownstep:%d\n",degree_l[len-1]);
139 | for (int i=(total_step-1);i >= 0; i-- )
140 | {
141 | // Loop the threads
142 | int req_thread = len/(powf(2,(i+1)));
143 | for (int iid= warp_tid; iid<=req_thread; iid+=warpsize)
144 | {
145 | int tid_offset = iid * powf(2,i+1);
146 | int i1= (tid_offset) + (powf(2,i))-1+offset_d_n;
147 | int i2= (tid_offset) + powf(2,i+1)-1+offset_d_n;
148 | if(i1 > (offset_d_n+len-1)){break;}
149 | // printf("temp1: %d, temp2: %d, thread:%d\n",i1,i2,threadIdx.x);
150 | // printf("Index1 %d: %f,Index2 %d: %f, thread:%d\n",i1,degree_l[i1],i2,degree_l[i2],threadIdx.x);
151 | int temp1 = degree_l[i1];
152 | int temp2 = degree_l[i2];
153 | degree_l[i1]=temp2;
154 | degree_l[i2]=temp2+temp1;
155 | //printf("Index:%d, Value:%d \n",i2,temp[i2]);
156 | }
157 | }
158 | }
159 |
160 | __device__ void
161 | ITS(float *degree_l,int offset_d_n,int warpsize, int neighbor_length){
162 | float bits = log2f(neighbor_length);
163 | int raise = ceilf(bits);
164 | int max_bit = powf(2,raise);
165 | int len=max_bit;
166 | int total_step= log2f(max_bit);
167 | int warp_tid = threadIdx.x%32;
168 | // __syncthreads();
169 | gpu_prefix(total_step,warp_tid,degree_l,offset_d_n,warpsize,len);
170 | float sum = degree_l[neighbor_length-1+offset_d_n];
171 | for (int i = warp_tid; i < neighbor_length; i+=warpsize)
172 | {
173 | degree_l[i]=degree_l[i]/((double)sum);
174 | // printf("i:%d, degree:%.2f\n",i,degree_l[i]);
175 | }
176 | }
177 |
178 | __device__ int
179 | max(int *data, int len)
180 | {
181 | int max=data[0];
182 | for(int i=0;i<len;i++)
183 | {
184 | if(data[i]>max){max=data[i];}
185 | // printf("data: %d\n",data[i]);
186 | }
187 | return max;
188 | }
189 |
190 | __device__ void
191 | read_prob(float *degree_l, float *prob, int len, int offset){
192 | int index=threadIdx.x %32;
193 | while(index<len){
194 | degree_l[index]= prob[index+offset];
195 | index+=32;
196 | }
197 | }
198 |
199 | __device__ void
200 | write_prob(float *degree_l, float *prob, int len, int offset){
201 | int index=threadIdx.x %32;
202 | while(index<len){
203 | prob[index+offset]= degree_l[index];
204 | index+=32;
205 | }
206 | }
207 |
208 | __device__ int
209 | rand_integer(curandState local_state, int max)
210 | {
211 | float r= curand_uniform(&local_state);
212 | return ((int)(r*max))%max;
213 | }
214 |
215 | /* Lines 193-225 were lost in extraction. read_prob, write_prob and
216 | rand_integer are reconstructed from their call sites. The name and
217 | signature of the selector below are reconstructed from its surviving
218 | body, which draws N distinct neighbors by inverse transform sampling
219 | with a bitmap-based retry loop; treat both as assumptions. */
220 | __device__ void
221 | ITS_select(Wv *wvar, int warpsize, int N, int overN, curandState local_state, gpu_graph *G, int source)
222 | {
223 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
224 | int warpTID= threadIdx.x%32;
225 | float *degree_l = wvar->degree;
226 | int *bitmap = wvar->bitmap;
227 | int *selected_list = wvar->selected;
228 | int neighbor_length =wvar->NL;
229 | int neighbor_start= wvar->NS;
230 | int *total_counter = wvar->total_counter;
231 | warpsize=32;
232 | int prefix=0, index=0;
233 | int new_neighbor;
234 | clock_t start_time,stop_time;
235 | float pref_time;
236 | int counter;
237 | wvar->sindex[0]=0;
238 | // decide if prefix sum is required
239 | if(neighbor_length>N) { prefix=1; }
240 | if(prefix==1)
241 | {
242 | start_time = clock();
243 | ITS(degree_l, 0, warpsize, neighbor_length);
244 | stop_time =clock();
245 | pref_time= float(stop_time-start_time);
246 | index=warpTID;
247 | start_time= clock();
248 |
249 | // reset bitmaps
250 | int start=warpTID;
251 | int end = neighbor_length/32 + 1;
252 | for(int i=start;i<end;i+=32)
253 | {
254 | bitmap[i]=0;
255 | }
256 | counter=0;
257 | // (lines 252-258 reconstructed) retry until N distinct draws land
258 | while(1){ counter+=1; if(counter>4000){break;}
259 | // if(warpTID==0){printf("Iteration: %d\n",counter);}
260 | index=warpTID;
261 | while(index<overN)
262 | {
263 | int is_in=0;
264 | float r= curand_uniform(&local_state);
265 | int selected= binary_search(0,neighbor_length,r,degree_l);
266 | // remember this lane's draw (lines 261-267 reconstructed)
267 | wvar->tempSelected[index]= selected;
268 | is_in= bitmap_search(bitmap,0,selected);
269 | if(is_in==0){
270 | int pos= atomicAdd(&wvar->sindex[0],1);
271 | if(pos<N)
272 | {
273 | selected_list[pos]= G->adj_list[selected+neighbor_start];
274 | }
275 | }
276 | index+=warpsize;
277 | }
278 | __syncwarp();
279 | if(wvar->sindex[0]>=N){break;}
280 |
281 | // (lines 271-282 reconstructed; the commented line below survives)
282 | // new_neighbor= G->adj_list[selected+neighbor_start];
283 | // selected_list[index]= new_neighbor;
284 |
285 | }
286 | __syncwarp();
287 | stop_time= clock();
288 | float samp_time = float(stop_time-start_time);
289 | if(warpTID==0){
290 | total_counter[0]+=samp_time;
291 | // printf("%d, %d, %d, %d, %.0f, %.0f\n",counter,neighbor_length, N,source,pref_time,samp_time);
292 | }
293 | }
294 | else
295 | {
296 | index=warpTID;
297 | while(index<neighbor_length)
298 | {
299 | new_neighbor= G->adj_list[index + neighbor_start];
300 | selected_list[index]= new_neighbor;
301 | index+=warpsize;
302 | }
303 | }
304 | }
305 |
306 | __device__ void
307 | select(Wv *wvar, Cp *cache, int N,int overN, curandState local_state, gpu_graph *G, int *colcount, int source, int *Gmax, int bitflag, int Fcache)
308 | {
309 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
310 | int warpID = tid/32;
311 | int warpTID= threadIdx.x%32;
312 | float *degree_l = wvar->degree;
313 | int *bitmap = wvar->bitmap;
314 | int *selected_list = wvar->selected;
315 | int neighbor_length =wvar->NL;
316 | int neighbor_start= wvar->NS;
317 | int *total_counter = wvar->total_counter;
318 | int warpsize=32;
319 | int prefix=0, index=0;
320 | int new_neighbor;
321 | clock_t start_time,stop_time;
322 | float pref_time;
323 | // if(source%2==0){N=N+1;}
324 | // decide if prefix sum is required
325 | if(neighbor_length>N) { prefix=1; }
326 | if(prefix==1)
327 | {
328 | start_time = clock();
329 | if(Fcache){
330 | int offset=G->beg_pos[source];
331 | if(cache->status[source]==1){
332 | // if(warpTID==0){printf("avoided.\n");}
333 | read_prob(degree_l,cache->probability,neighbor_length,offset);
334 | }
335 | else{
336 | ITS(degree_l, 0, warpsize, neighbor_length);
337 | write_prob(degree_l,cache->probability,neighbor_length,offset);
338 | if(warpTID==0){cache->status[source]=1;}
339 | }
340 | }
341 | else{ITS(degree_l, 0, warpsize, neighbor_length);}
342 |
343 | stop_time =clock();
344 | pref_time= float(stop_time-start_time);
345 | index=warpTID;
346 | while(index<N)
347 | {
348 | colcount[index]=0;
349 | // (lines 346-370 reconstructed from the surviving loop tail below)
350 | // redraw via inverse transform sampling until a neighbor not yet
351 | // in the sample is found, counting collisions in colcount
352 | while(1)
353 | {
354 | float r= curand_uniform(&local_state);
355 | int selected= binary_search(0,neighbor_length,r,degree_l);
356 | colcount[index]+=1;
357 | int is_in=0;
358 | if(bitflag){is_in= bitmap_search(bitmap,0,selected);}
359 | #ifdef profile
360 | // printf("r: %.2f, selected: %d, is_in: %d\n",r,selected,is_in);
361 | #endif
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 | if(is_in==0){
370 | new_neighbor= G->adj_list[selected+neighbor_start];
371 | selected_list[index]= new_neighbor;
372 | #ifdef profile
373 | printf("Added %d to sampled.\n",selected);
374 | #endif
375 |
376 | break;}
377 | if(colcount[index]>400){
378 | selected_list[index]= 0;
379 | break;
380 | }
381 | }
382 | // Add new neighbor to
383 | // printf("Index: %d, count: %d\n",index,colcount[index]);
384 | index+=warpsize;
385 | }
386 | //
387 | //
388 | __syncwarp();
389 | stop_time= clock();
390 | float samp_time = float(stop_time-start_time);
391 | if(warpTID==0){
392 | int longer=max(colcount,N);
393 | atomicAdd(&Gmax[0], longer);
394 | total_counter[0]+=samp_time;
395 | // printf("%d, %d, %d, %d, %.0f, %.0f\n",longer,neighbor_length, N,source,pref_time,samp_time);
396 | }
397 |
398 | }
399 | else
400 | // pick up neighors
401 | {
402 | index=warpTID;
403 | while(index<neighbor_length)
404 | {
405 | new_neighbor= G->adj_list[index + neighbor_start];
406 | selected_list[index]= new_neighbor;
407 | index+=warpsize;
408 | }
409 | }
410 | }
411 |
412 | __device__ void
413 | naive_ITS(Wv *wvar, Cp *cache, int N,int overN, curandState local_state, gpu_graph *G, int *colcount, int source, int *Gmax, int bitflag, int Fcache)
414 | {
415 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
416 | int warpID = tid/32;
417 | int warpTID= threadIdx.x%32;
418 | float *degree_l = wvar->degree;
419 | int *bitmap = wvar->bitmap;
420 | int *selected_list = wvar->selected;
421 | int neighbor_length =wvar->NL;
422 | int neighbor_start= wvar->NS;
423 | int *total_counter = wvar->total_counter;
424 | int warpsize=32;
425 | int prefix=0, index=0;
426 | int new_neighbor;
427 | clock_t start_time,stop_time;
428 | float pref_time;
429 | // if(source%2==0){N=N+1;}
430 | // decide if prefix sum is required
431 | if(neighbor_length>N) { prefix=1; }
432 | if(prefix==1)
433 | {
434 | start_time = clock();
435 | if(Fcache){
436 | int offset=G->beg_pos[source];
437 | if(cache->status[source]==1){
438 | // if(warpTID==0){printf("avoided.\n");}
439 | read_prob(degree_l,cache->probability,neighbor_length,offset);
440 | }
441 | else{
442 | ITS(degree_l, 0, warpsize, neighbor_length);
443 | write_prob(degree_l,cache->probability,neighbor_length,offset);
444 | if(warpTID==0){cache->status[source]=1;}
445 | }
446 | }
447 | else{ITS(degree_l, 0, warpsize, neighbor_length);}
448 |
449 | stop_time =clock();
450 | pref_time= float(stop_time-start_time);
451 | index=warpTID;
452 | while(index<N)
453 | {
454 | colcount[index]=0;
455 | // (lines 452-476 reconstructed from the surviving loop tail below)
456 | // naive ITS: redraw until a neighbor not yet in the sample is found
457 | while(1)
458 | {
459 | float r= curand_uniform(&local_state);
460 | int selected= binary_search(0,neighbor_length,r,degree_l);
461 | colcount[index]+=1;
462 | int is_in=0;
463 | if(bitflag){is_in= bitmap_search(bitmap,0,selected);}
464 | #ifdef profile
465 | // printf("r: %.2f, selected: %d, is_in: %d\n",r,selected,is_in);
466 | #endif
467 |
468 |
469 |
470 |
471 |
472 |
473 |
474 |
475 | if(is_in==0){
476 | new_neighbor= G->adj_list[selected+neighbor_start];
477 | selected_list[index]= new_neighbor;
478 | #ifdef profile
479 | printf("Added %d to sampled.\n",selected);
480 | #endif
481 |
482 | break;}
483 | if(colcount[index]>400){
484 | selected_list[index]= 0;
485 | break;
486 | }
487 | }
488 | // Add new neighbor to
489 | // printf("Index: %d, count: %d\n",index,colcount[index]);
490 | index+=warpsize;
491 | }
492 | //
493 | if(N<4){
494 | ITS(degree_l, 0, warpsize, neighbor_length);
495 | float r= curand_uniform(&local_state);
496 | int selected= binary_search(0,neighbor_length,r,degree_l);
497 | }
498 | //
499 | __syncwarp();
500 | stop_time= clock();
501 | float samp_time = float(stop_time-start_time);
502 | if(warpTID==0){
503 | int longer=max(colcount,N);
504 | atomicAdd(&Gmax[0], longer);
505 | total_counter[0]+=samp_time;
506 | // printf("%d, %d, %d, %d, %.0f, %.0f\n",longer,neighbor_length, N,source,pref_time,samp_time);
507 | }
508 |
509 | }
510 | else
511 | // pick up neighors
512 | {
513 | index=warpTID;
514 | while(index<neighbor_length)
515 | {
516 | new_neighbor= G->adj_list[index + neighbor_start];
517 | selected_list[index]= new_neighbor;
518 | index+=warpsize;
519 | }
520 | }
521 | }
522 |
523 | __device__ void
524 | normalize_over_select(Wv *wvar, int warpsize, int N, int overN, curandState local_state, gpu_graph *G, int *colcount, int source, int *Gmax)
525 | {
526 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
527 | int warpID = tid/32;
528 | int warpTID= threadIdx.x%32;
529 | float *degree_l = wvar->degree;
530 | int *bitmap = wvar->bitmap;
531 | int *selected_list = wvar->selected;
532 | int neighbor_length =wvar->NL;
533 | int neighbor_start= wvar->NS;
534 | int *total_counter = wvar->total_counter;
535 | warpsize=32;
536 | int prefix=0, index=0;
537 | int new_neighbor;
538 | clock_t start_time,stop_time;
539 | float pref_time;
540 | wvar->sindex[0]=0;
541 | if(neighbor_length>N) { prefix=1; }
542 | if(prefix==1)
543 | {
544 | start_time = clock();
545 | ITS(degree_l, 0, warpsize, neighbor_length);
546 | stop_time =clock();
547 | pref_time= float(stop_time-start_time);
548 | index=warpTID;
549 |
550 | // reset bitmaps
551 | int start=warpTID;
552 | int end = neighbor_length/32 + 1;
553 | // if(warpTID==0)printf("Bitmap end:%d\n",end);
554 | for(int i=start;i<end;i+=32)
555 | {
556 | bitmap[i]=0;
557 | }
558 | __syncwarp();
559 | // (lines 554-581 reconstructed; originals lost in extraction)
560 | while(index<N)
561 | {
562 | colcount[index]=0;
563 | float r= curand_uniform(&local_state);
564 | float lb=0, hb=1;
565 | float a,b,temp;
566 | int selected=0, is_in=1;
567 | while(1)
568 | {
569 | colcount[index]+=1;
570 | selected= binary_search(0,neighbor_length,r,degree_l);
571 | is_in= bitmap_search(bitmap,0,selected);
572 | if(is_in==0)
573 | {
574 | // the first fresh draw claims a slot in the sample
575 | int pos=atomicAdd(&wvar->sindex[0],1);
576 | if(pos<N){break;}
577 | }
578 | if(is_in==1){
579 | // duplicate: renormalize r over the remaining probability mass
580 | float value= degree_l[selected];
581 | if(r>value){
582 | a= degree_l[selected];
583 | b= degree_l[selected+1];}
584 | else{
585 | a= degree_l[selected-1];
586 | b= degree_l[selected];}
587 | // if(lb==a && hb==b){};
588 | // float temp = 0.23;
589 | temp= (float) (a-lb)/(a-lb+hb-b);
590 | // printf("a: %.2f, b: %.2f,lb: %.2f, hb: %.2f, temp: %.2f\n",a,b,lb,hb,temp);
591 | if(r< temp)
592 | {
593 | // printf("Update hb.\n");
594 | r= (lb+r*(a-lb));
595 | hb=a;}
596 | else
597 | {
598 | // printf("Update lb.\n");
599 | r= (b+(hb-b)*r);
600 | lb=b;}
601 | // printf("\nNew r: %.2f, lb: %.2f, hb: %.2f\n",r,lb,hb);
602 | }
603 |
604 | if(colcount[index]>80){break;}
605 | // else{
606 | // // atomicAdd(&colcount[0],1);
607 | // printf("Repeated. Index: %d, selected: %d\n",index,selected);
608 | // }
609 | }
610 | new_neighbor= G->adj_list[selected+neighbor_start];
611 | selected_list[index]= new_neighbor;
612 | index+=warpsize;
613 | }
614 | __syncwarp();
615 | stop_time= clock();
616 | float samp_time = float(stop_time-start_time);
617 | if(warpTID==0){
618 | int longer=max(colcount,N);
619 | total_counter[0]+=samp_time;
620 | }
621 |
622 | }
623 | else{
624 | index=warpTID;
625 | while(index<neighbor_length)
626 | {
627 | new_neighbor= G->adj_list[index + neighbor_start];
628 | selected_list[index]= new_neighbor;
629 | index+=warpsize;
630 | }
631 | }
632 | }
633 |
634 |
635 |
636 | __device__ void
637 | normalize(Wv *wvar, int warpsize, int N, int overN, curandState local_state, gpu_graph *G, int *colcount, int source, int *Gmax)
638 | {
639 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
640 | int warpID = tid/32;
641 | int warpTID= threadIdx.x%32;
642 | float *degree_l = wvar->degree;
643 | int *bitmap = wvar->bitmap;
644 | int *selected_list = wvar->selected;
645 | int neighbor_length =wvar->NL;
646 | int neighbor_start= wvar->NS;
647 | int *total_counter = wvar->total_counter;
648 | warpsize=32;
649 | int prefix=0, index=0;
650 | int new_neighbor;
651 | clock_t start_time,stop_time;
652 | float pref_time;
653 | if(neighbor_length>N) { prefix=1; }
654 | if(prefix==1)
655 | {
656 | start_time = clock();
657 | ITS(degree_l, 0, warpsize, neighbor_length);
658 | stop_time =clock();
659 | pref_time= float(stop_time-start_time);
660 | index=warpTID;
661 |
662 | // reset bitmaps
663 | int start=warpTID;
664 | int end = neighbor_length/32 + 1;
665 | // if(warpTID==0)printf("Bitmap end:%d\n",end);
666 | for(int i=start;i<end;i+=32)
667 | {
668 | bitmap[i]=0;
669 | }
670 | __syncwarp();
671 | // (lines 666-689 reconstructed; originals lost in extraction)
672 | while(index<N)
673 | {
674 | colcount[index]=0;
675 | float r= curand_uniform(&local_state);
676 | float lb=0, hb=1;
677 | float a,b,temp;
678 | int selected=0, is_in=1;
679 | while(1)
680 | {
681 | colcount[index]+=1;
682 | selected= binary_search(0,neighbor_length,r,degree_l);
683 | is_in= bitmap_search(bitmap,0,selected);
684 | if(is_in==0){break;}
685 | if(is_in==1){
686 | // duplicate: renormalize r over the remaining probability mass
687 | // (the if/else below picks the CDF bounds around the duplicate)
688 | float value= degree_l[selected];
689 | if(r>value){
690 | a= degree_l[selected];
691 | b= degree_l[selected+1];}
692 | else{
693 | a= degree_l[selected-1];
694 | b= degree_l[selected];}
695 | // if(lb==a && hb==b){};
696 | // float temp = 0.23;
697 | temp= (float) (a-lb)/(a-lb+hb-b);
698 | // printf("a: %.2f, b: %.2f,lb: %.2f, hb: %.2f, temp: %.2f\n",a,b,lb,hb,temp);
699 | if(r< temp)
700 | {
701 | // printf("Update hb.\n");
702 | r= (lb+r*(a-lb));
703 | hb=a;}
704 | else
705 | {
706 | // printf("Update lb.\n");
707 | r= (b+(hb-b)*r);
708 | lb=b;}
709 | // printf("\nNew r: %.2f, lb: %.2f, hb: %.2f\n",r,lb,hb);
710 | }
711 |
712 | if(colcount[index]>1000){break;}
713 | // else{
714 | // // atomicAdd(&colcount[0],1);
715 | // printf("Repeated. Index: %d, selected: %d\n",index,selected);
716 | // }
717 | }
718 | new_neighbor= G->adj_list[selected+neighbor_start];
719 | selected_list[index]= new_neighbor;
720 | index+=warpsize;
721 | }
722 | __syncwarp();
723 | stop_time= clock();
724 | float samp_time = float(stop_time-start_time);
725 | if(warpTID==0){
726 | int longer=max(colcount,N);
727 | // printf("%d, %d, %d, %d, %.0f, %.0f\n",longer,neighbor_length, N,source,pref_time,samp_time);
728 | total_counter[0]+=samp_time;
729 | }
730 |
731 | }
732 | else{
733 | index=warpTID;
734 | while(index<neighbor_length)
735 | {
736 | new_neighbor= G->adj_list[index + neighbor_start];
737 | selected_list[index]= new_neighbor;
738 | index+=warpsize;
739 | }
740 | }
741 | }
742 |
743 | __device__ void
744 | heur_normalize(Wv *wvar, Cp *cache, int N, int overN, curandState local_state, gpu_graph *G, int *colcount, int source, int *Gmax,int bitflag,int Fcache)
745 | {
746 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
747 | int warpID = tid/32;
748 | int warpTID= threadIdx.x%32;
749 | float *degree_l = wvar->degree;
750 | int *bitmap = wvar->bitmap;
751 | int *selected_list = wvar->selected;
752 | int neighbor_length =wvar->NL;
753 | int neighbor_start= wvar->NS;
754 | int *total_counter = wvar->total_counter;
755 | int warpsize=32;
756 | int prefix=0, index=0;
757 | int new_neighbor;
758 | clock_t start_time,stop_time;
759 | float pref_time;
760 | // For forest fire
761 | // if(source%2==0){N=N+1;}
762 | if(neighbor_length>N) { prefix=1; }
763 | if(prefix==1)
764 | {
765 | start_time = clock();
766 | if(Fcache){
767 | int offset=G->beg_pos[source];
768 | if(cache->status[source]==1){
769 | // if(warpTID==0){printf("avoided.\n");}
770 | read_prob(degree_l,cache->probability,neighbor_length,offset);
771 | }
772 | else{
773 | ITS(degree_l, 0, warpsize, neighbor_length);
774 | write_prob(degree_l,cache->probability,neighbor_length,offset);
775 | if(warpTID==0){cache->status[source]=1;}
776 | }
777 | }
778 | else{ITS(degree_l, 0, warpsize, neighbor_length);}
779 | stop_time =clock();
780 | pref_time= float(stop_time-start_time);
781 | index=warpTID;
782 |
783 | // reset bitmaps
784 | int start=warpTID;
785 | int end = neighbor_length/32 + 1;
786 | // if(warpTID==0)printf("Bitmap end:%d\n",end);
787 | for(int i=start;i<end;i+=32)
788 | {
789 | bitmap[i]=0;
790 | }
791 | __syncwarp();
792 | // (lines 787-809 reconstructed; originals lost in extraction)
793 | while(index<N)
794 | {
795 | colcount[index]=0;
796 | int localCount=0;
797 | float lb=0, hb=1;
798 | float a,b;
799 | float r= curand_uniform(&local_state);
800 | int selected= binary_search(0,neighbor_length,r,degree_l);
801 | int is_in= bitmap_search(bitmap,0,selected);
802 | // heuristic renormalization: instead of redrawing blindly, fold r
803 | // back into the probability mass not yet sampled
804 | while(1)
805 | {
806 | colcount[index]+=1;
807 | if(is_in==0)
808 | {
809 | new_neighbor= G->adj_list[selected+neighbor_start];
810 | selected_list[index]= new_neighbor;
811 | break;
812 | }
813 | if(is_in==1){
814 | float value = degree_l[selected];
815 | if(r>value){
816 | a= degree_l[selected];
817 | b= degree_l[selected+1];}
818 | else{
819 | a= degree_l[selected-1];
820 | b= degree_l[selected];}
821 | // if(lb==a && hb==b){};
822 | // float temp = 0.23;
823 | float lambda= (float) (a-lb)/(a-lb+hb-b);
824 | float delta= (float) (b-a)/(hb-lb);
825 | r= (float) r/lambda;
826 | // printf("a: %.2f, b: %.2f,lb: %.2f, hb: %.2f, temp: %.2f\n",a,b,lb,hb,temp);
827 | if(r< a)
828 | {
829 | hb=a;}
830 | else
831 | {r= r + delta;
832 | lb=b;}
833 | // printf("index: %d,random number: %.2f, selected: %d, is_in: %d\n",index,r,selected,is_in);
834 | localCount+=1;
835 |
836 | selected= binary_search(0,neighbor_length,r,degree_l);
837 | is_in= bitmap_search(bitmap,0,selected);
838 | if(is_in==0){
839 | new_neighbor= G->adj_list[selected+neighbor_start];
840 | selected_list[index]= new_neighbor;
841 | break;
842 | }
843 | }
844 | if(colcount[index]>400){break;}
845 | }
846 | index+=warpsize;
847 | }
848 | __syncwarp();
849 | stop_time= clock();
850 | float samp_time = float(stop_time-start_time);
851 | if(warpTID==0){
852 | int longer=max(colcount,N);
853 | atomicAdd(&Gmax[0], longer);
854 | // printf("%d, %d, %d, %d, %.0f, %.0f\n",longer,neighbor_length, N,source,pref_time,samp_time);
855 | // total_counter[0]+=samp_time;
856 | }
857 |
858 | }
859 | else{
860 | index=warpTID;
861 | while(index<neighbor_length)
862 | {
863 | new_neighbor= G->adj_list[index + neighbor_start];
864 | selected_list[index]= new_neighbor;
865 | index+=warpsize;
866 | }
867 | }
868 | }
869 |
870 |
871 |
872 | __device__ int
873 | get_neighbors(gpu_graph *graph, int vertex, Wv *wvar, int VertCount){
874 | int warpTID= threadIdx.x%32;
875 | int index= warpTID;
876 | int len= graph->degree_list[vertex];
877 | wvar->NL=len;
878 | int neighbor_start=graph->beg_pos[vertex];
879 | wvar->NS= neighbor_start;
880 | #ifdef profile
881 | // if(warpTID==0){printf("Source: %d, NLen: %d, Nstart: %d\n",vertex, len,neighbor_start);}
882 | #endif
883 | while(index<len)
884 | {
885 | int neighbor= graph->adj_list[neighbor_start + index];
886 | wvar->neighbors[index]= neighbor;
887 | wvar->degree[index]= EdgeBias(neighbor,graph);
888 | // {printf("Neighbor:%d, tid:%d\n",wvar->neighbors[index],index);}
889 | index+=warpSize;
890 | }
891 | return len;
892 | }
893 |
894 | // __device__ void
895 | // next(S){
896 |
897 | // }
898 |
899 | __device__ int
900 | linear_search(int neighbor,int *partition1, int *bin_count, int bin, int BIN_SIZE, int BUCKETS)
901 | {
902 | if(bin>=32){printf("Bin error.\n");}
903 | int len = bin_count[bin];
904 | int i = bin;
905 | // printf("\nL: %d, I:%d\n",len,i);
906 | int step=0;
907 | while(step<len)
908 | {
909 | if(partition1[i]==neighbor)
910 | {
911 | return 1;
912 | }
913 | // entries of bin b live at partition1[k*BUCKETS+b] (see add_hash)
914 | i+=BUCKETS;
915 | step+=1;
916 | }
917 | return 0;
918 | }
919 |
920 | /* Lines 907-931 were lost in extraction. The loop above and the
921 | duplicate-check helper below are reconstructed from their call
922 | sites; frontier() calls it as duplicate(&S->hashtable[SampleID],
923 | vertex). Treat the name and signature as assumptions. */
924 | __device__ int
925 | duplicate(Ht *hashtable, int vertex)
926 | {
927 | // membership test in the per-instance hash table
928 |
929 |
930 |
931 | int BUCKETS = hashtable->BUCKETS;
932 | int bin= vertex % BUCKETS;
933 | int BIN_SIZE = hashtable->bin_size;
934 | // #ifdef profile
935 | // printf("Bucket %d, bin: %d\n",BUCKETS,bin);
936 | // #endif
937 | int is_in=linear_search(vertex,hashtable->hash,hashtable->bin_counter,bin,BIN_SIZE,BUCKETS);
938 | // // if(is_in==1){printf("Duplicated Found: %d\n",new_neighbor);}
939 | return is_in;
940 | }
941 |
942 | __device__ void
943 | add_hash(Ht *hashtable, int vertex)
944 | {
945 | int BUCKETS = hashtable->BUCKETS;
946 | int bin= vertex % BUCKETS;
947 | int index=atomicAdd(&hashtable->bin_counter[bin],1);
948 | if(index>100){printf("error. %d\n",index);}
949 | #ifdef profile
950 | printf("Add: %d, bin: %d, INdex: %d\n",vertex,bin,index);
951 | #endif
952 | hashtable->hash[index*BUCKETS+ bin]=vertex;
953 | }
954 |
955 |
956 |
957 | __device__ int
958 | linear_duplicate(Si *samples, int vertex){
959 | int warpTID=threadIdx.x%32;
960 | int index= warpTID;
961 | while(index<samples->start[0])
962 | {
963 | if(vertex==samples->edge[index]){
964 | return 1;
965 | }
966 | index+=warpSize; // advance the lane; the original loop never advanced and could spin forever
967 | }
968 | return 0;
969 | }
970 |
971 | __device__ void
972 | frontier(gpu_graph *G,Sampling *S, int warpId,int SampleID, int N, int source, int sourceIndex, int hash, int Depth)
973 | {
974 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
975 | int warpTID= threadIdx.x%32;
976 | int *selected=S->wvar[warpId].selected;
977 | int index=warpTID;
978 | int is_in=0;
979 | while(index<N)
980 | {
981 | int vertex= selected[index];
982 | // deduplicate via the per-instance hash table (lines 979-983 reconstructed)
983 | if(hash==1){is_in= duplicate(&S->hashtable[SampleID], vertex);}
984 | // else{is_in= linear_duplicate(&S->samples[SampleID], vertex);}
985 | int pos=atomicAdd(&S->samples[SampleID].start[0],1);
986 | // total count
987 | atomicAdd(&S->sampled_count[0],1);
988 | #ifdef profile
989 |
990 | // printf("Added to sampled.\n SID: %d, Updated: %d, pos: %d, is_in: %d\n",SampleID,vertex,pos,is_in);
991 | #endif
992 | S->samples[SampleID].vertex[pos]=source;
993 | S->samples[SampleID].edge[pos]=vertex;
994 | if(is_in==0)
995 | {
996 | // add_hash(&S->hashtable[SampleID], vertex);
997 | int currDepth= S->candidate.depth[sourceIndex];
998 | if(currDepth < (Depth-1)){
999 | // #ifdef profile
1000 | // printf("warpID: %d, Curr:%d, Added %d to queue.\n",tid/32,currDepth,vertex);
1001 | // #endif
1002 | int Qid= atomicAdd(&S->candidate.end[0],1);
1003 | S->candidate.vertices[Qid]= vertex;
1004 | S->candidate.instance_ID[Qid]= S->candidate.instance_ID[sourceIndex];
1005 | S->candidate.depth[Qid]= currDepth+1;
1006 | }
1007 | }
1008 | index+=warpSize;
1009 | }
1010 | // __syncwarp();
1011 | }
1012 |
1013 |
1014 | __device__ int
1015 | ITS_MDRW(Wv *wvar,curandState local_state, gpu_graph *G, int neighbor_length, float r)
1016 | {
1017 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
1018 | int warpID = tid/32;
1019 | int warpTID= threadIdx.x%32;
1020 | float *degree_l = wvar->degree;
1021 | int neighbor_start= wvar->NS;
1022 | int *total_counter = wvar->total_counter;
1023 | int warpsize=32;
1024 | int prefix=0, index=0;
1025 | int new_neighbor;
1026 | clock_t start_time,stop_time;
1027 | float pref_time;
1028 | // if(source%2==0){N=N+1;}
1029 | // decide if prefix sum is required
1030 | if(neighbor_length>1) { prefix=1; }
1031 | if(prefix==1)
1032 | {
1033 | start_time = clock();
1034 | ITS(degree_l, 0, warpsize, neighbor_length);
1035 | __syncwarp();
1036 | #ifdef profile
1037 | if(threadIdx.x==0){
1038 | for(int i=0;i<neighbor_length;i++)
1039 | {
1040 | printf("%f, ",degree_l[i]);
1041 | }
1042 | printf("\n");
1043 | }
1044 | #endif
1045 | // (lines 1038-1052 reconstructed; originals lost in extraction)
1046 | __syncwarp();
1047 | stop_time= clock();
1048 | pref_time= float(stop_time-start_time);
1049 | // one ITS draw over the frontier's bias prefix sums
1050 | int selected= binary_search(0,neighbor_length,r,degree_l);
1051 | __syncwarp();
1052 | new_neighbor = G->adj_list[selected+neighbor_start];
1053 | return selected;
1054 | }
1055 | else
1056 | {
1057 | return 0;
1058 | // new_neighbor = G->adj_list[neighbor_start];
1059 | }
1060 | }
1061 |
1062 |
1063 | #endif
1064 |
1065 |
1066 |
--------------------------------------------------------------------------------
/non-stream/gpu_graph.cuh:
--------------------------------------------------------------------------------
1 | //10/03/2016
2 | //Graph data structure on GPUs
3 | #ifndef _GPU_GRAPH_H_
4 | #define _GPU_GRAPH_H_
5 | #include <iostream> // bracketed header restored; stripped in extraction
6 | #include "header.h"
7 | #include "util.h"
8 | #include "graph.h"
9 |
10 | class gpu_graph
11 | {
12 | public:
13 | vertex_t *adj_list;
14 | weight_t *weight_list;
15 | index_t *beg_pos;
16 | vertex_t *degree_list;
17 |
18 | index_t vert_count;
19 | index_t edge_count;
20 | index_t avg_degree;
21 |
22 | public:
23 | ~gpu_graph(){}
24 |
25 | gpu_graph(
26 | graph<long, long, long, vertex_t, index_t, weight_t> *ginst)
27 | {
28 | vert_count=ginst->vert_count;
29 | edge_count=ginst->edge_count;
30 | avg_degree = ginst->edge_count/ginst->vert_count;
31 |
32 | // size_t weight_sz=sizeof(weight_t)*edge_count;
33 | size_t adj_sz=sizeof(vertex_t)*edge_count;
34 | size_t deg_sz=sizeof(vertex_t)*edge_count;
35 | size_t beg_sz=sizeof(index_t)*(vert_count+1);
36 | vertex_t *cpu_degree_list=(vertex_t*)malloc(sizeof(vertex_t)*edge_count);
37 | /* Alloc GPU space */
38 | H_ERR(cudaMalloc((void **)&adj_list, adj_sz));
39 | H_ERR(cudaMalloc((void **)&degree_list, deg_sz));
40 | H_ERR(cudaMalloc((void **)&beg_pos, beg_sz));
41 | //H_ERR(cudaMalloc((void **)&weight_list, weight_sz));
42 |
43 | for(int i=0; i<(ginst->edge_count); i++)
44 | {
45 | int neighbor= ginst->adj_list[i];
46 | //cout<<"Index: "<<i<<" Neighbor: "<<neighbor<<endl;
47 | cpu_degree_list[i]= ginst->beg_pos[neighbor+1] - ginst->beg_pos[neighbor];
48 | if((cpu_degree_list[i]>1950) & (cpu_degree_list[i]<2050))
49 | {
50 | //printf("V: %d, Degree:%d\n",neighbor,cpu_degree_list[i]);
51 | }
52 | }
53 |
54 | /* copy it to GPU */
55 | H_ERR(cudaMemcpy(adj_list,ginst->adj_list,
56 | adj_sz, cudaMemcpyHostToDevice));
57 | H_ERR(cudaMemcpy(beg_pos,ginst->beg_pos,
58 | beg_sz, cudaMemcpyHostToDevice));
59 | H_ERR(cudaMemcpy(degree_list,cpu_degree_list,
60 | deg_sz, cudaMemcpyHostToDevice)); // was beg_sz; deg_sz is the allocated size of degree_list
61 |
62 | //H_ERR(cudaMemcpy(weight_list,ginst->weight,
63 | // weight_sz, cudaMemcpyHostToDevice));
64 | }
65 | };
66 |
67 | #endif
68 |
--------------------------------------------------------------------------------
/non-stream/graph.h:
--------------------------------------------------------------------------------
1 | #ifndef __GRAPH_H__
2 | #define __GRAPH_H__
3 | #include <stdio.h> // bracketed headers on lines 3-6 and 8-10 were stripped in extraction; restored
4 | #include <stdlib.h>
5 | #include <sys/stat.h>
6 | #include <sys/types.h>
7 | #include "wtime.h"
8 | #include <unistd.h>
9 | #include <fcntl.h>
10 | #include <assert.h>
11 | inline off_t fsize(const char *filename) {
12 | struct stat st;
13 | if (stat(filename, &st) == 0)
14 | return st.st_size;
15 | return -1;
16 | }
17 |
18 | template<
19 | typename file_vert_t, typename file_index_t, typename file_weight_t,
20 | typename new_vert_t, typename new_index_t, typename new_weight_t>
21 | class graph
22 | {
23 | public:
24 | new_index_t *beg_pos;
25 | new_vert_t *adj_list;
26 | new_weight_t *weight;
27 | new_vert_t *degree_list;
28 | new_index_t vert_count;
29 | new_index_t edge_count;
30 |
31 | public:
32 | graph(){};
33 | ~graph(){};
34 | graph(const char *beg_file,
35 | const char *adj_list_file,
36 | const char *weight_file);
37 |
38 | graph(file_vert_t *csr,
39 | file_index_t *beg_pos,
40 | file_weight_t *weight_list,
41 | file_index_t vert_count,
42 | file_index_t edge_count)
43 | {
44 | this->beg_pos = beg_pos;
45 | this->adj_list = csr;
46 | this->weight = weight_list;
47 | //this->degree_list= degree_list;
48 | this->edge_count = edge_count;
49 | this->vert_count = vert_count;
50 | };
51 | };
52 | #include "graph.hpp"
53 | #endif
54 |
--------------------------------------------------------------------------------
/non-stream/graph.hpp:
--------------------------------------------------------------------------------
1 | #include "graph.h"
2 | #include <iostream> // bracketed header restored; stripped in extraction
3 |
4 | template<
5 | typename file_vert_t, typename file_index_t, typename file_weight_t,
6 | typename new_vert_t, typename new_index_t, typename new_weight_t>
7 | graph<file_vert_t, file_index_t, file_weight_t,
8 | new_vert_t, new_index_t, new_weight_t>
9 | ::graph(
10 | const char *beg_file,
11 | const char *adj_file,
12 | const char *weight_file)
13 | {
14 | double tm=wtime();
15 | FILE *file=NULL;
16 | file_index_t ret;
17 |
18 | vert_count=fsize(beg_file)/sizeof(file_index_t) - 1;
19 | edge_count=fsize(adj_file)/sizeof(file_vert_t);
20 |
21 | file=fopen(beg_file, "rb");
22 | if(file!=NULL)
23 | {
24 | file_index_t *tmp_beg_pos=NULL;
25 |
26 | if(posix_memalign((void **)&tmp_beg_pos, getpagesize(),
27 | sizeof(file_index_t)*(vert_count+1)))
28 | perror("posix_memalign");
29 |
30 | ret=fread(tmp_beg_pos, sizeof(file_index_t),
31 | vert_count+1, file);
32 | assert(ret==vert_count+1);
33 | fclose(file);
34 | edge_count=tmp_beg_pos[vert_count];
35 | //std::cout<<"Expected edge count: "<<edge_count<<"\n";
36 | assert(vert_count>0);
37 | assert(edge_count>0);
38 |
39 | //converting to new type when different
40 | if(sizeof(file_index_t)!=sizeof(new_index_t))
41 | {
42 | if(posix_memalign((void **)&beg_pos, getpagesize(),
43 | sizeof(new_index_t)*(vert_count+1)))
44 | perror("posix_memalign");
45 | for(new_index_t i=0;i<vert_count+1;++i)
46 | beg_pos[i]=(new_index_t)tmp_beg_pos[i];
47 | free(tmp_beg_pos);
48 | }
49 | else beg_pos=(new_index_t*)tmp_beg_pos;
50 | }
51 |
52 | /* Everything from here through main.cu line 1 was lost in extraction.
53 | The adjacency-list loader below is reconstructed to mirror the
54 | beg_pos loader above and the linked graph_project_start library. */
55 | file=fopen(adj_file, "rb");
56 | if(file!=NULL)
57 | {
58 | file_vert_t *tmp_adj_list=NULL;
59 | if(posix_memalign((void **)&tmp_adj_list, getpagesize(),
60 | sizeof(file_vert_t)*edge_count))
61 | perror("posix_memalign");
62 | ret=fread(tmp_adj_list, sizeof(file_vert_t), edge_count, file);
63 | assert(ret==edge_count);
64 | fclose(file);
65 |
66 | if(sizeof(file_vert_t)!=sizeof(new_vert_t))
67 | {
68 | if(posix_memalign((void **)&adj_list, getpagesize(),
69 | sizeof(new_vert_t)*edge_count))
70 | perror("posix_memalign");
71 | for(new_index_t i=0;i<edge_count;++i)
72 | adj_list[i]=(new_vert_t)tmp_adj_list[i];
73 | free(tmp_adj_list);
74 | }
75 | else adj_list=(new_vert_t*)tmp_adj_list;
76 | }
77 | std::cout<<"Graph load (success): "<<vert_count<<" verts, "
78 | <<edge_count<<" edges "<<wtime()-tm<<" second(s)\n";
79 | }
--------------------------------------------------------------------------------
/non-stream/header.h:
--------------------------------------------------------------------------------
1 | /* Restored: this file fell inside the extraction gap above. The
2 | typedefs are inferred from how the graph template is instantiated
3 | in main.cu; treat them as assumptions. */
4 | #ifndef HEADER_H
5 | #define HEADER_H
6 | typedef int vertex_t;
7 | typedef long index_t;
8 | typedef float weight_t;
9 | #endif
--------------------------------------------------------------------------------
/non-stream/herror.h:
--------------------------------------------------------------------------------
1 | /* Restored: this file fell inside the extraction gap above. HRR and
2 | H_ERR mirror the handler that survives in util.h. */
3 | #ifndef HERROR_H
4 | #define HERROR_H
5 | #include <stdio.h>
6 | #include <stdlib.h>
7 | static void HandleErr( cudaError_t err, const char *file, int line ) {
8 | if (err != cudaSuccess) {
9 | printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
10 | exit( EXIT_FAILURE );
11 | }
12 | }
13 | #ifndef H_ERR
14 | #define H_ERR( err ) (HandleErr( err, __FILE__, __LINE__ ))
15 | #endif
16 | #define HRR( err ) (HandleErr( err, __FILE__, __LINE__ ))
17 | #endif
--------------------------------------------------------------------------------
/non-stream/main.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include "graph.h"
3 | #include "wtime.h"
4 | #include <stdlib.h> // bracketed headers on lines 4-6, 8-9 and 11-14 were stripped in extraction; restored
5 | #include <iostream>
6 | #include <math.h>
7 | #include "gpu_graph.cuh"
8 | #include <curand.h>
9 | #include <curand_kernel.h>
10 | #include "herror.h"
11 | #include <random>
12 | #include <assert.h>
13 | #include <time.h>
14 | #include <unistd.h>
15 | #include "api.cuh"
16 | #include "sampler.cuh"
17 | #include "sample_class.cuh"
18 | #include "functions.cuh"
19 | using namespace std;
20 |
21 | __global__ void
22 | check(Sampling *S, gpu_graph G,curandState *global_state,int n_subgraph, int FrontierSize, int NeighborSize, int Depth)
23 | {
24 | float prefix_time,local_d_time,global_d_time;
25 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
26 | int hash=1, cache=0, bitflag=1, NORMALIZE=1;
27 | #ifdef profile
28 | if(tid==0){
29 | printf("\n");
30 | for(int i=0; i<n_subgraph; i++)
31 | { // (reconstructed) iterate the seed list for debugging
32 | int vertex= S->candidate.vertices[i];
33 | }}
34 | #endif
35 | __syncwarp();
36 | int warpId = tid/32;
37 | int warpTid=threadIdx.x%32;
38 | clock_t start_time,stop_time;
39 | int i=0, warpsize=100;
40 | curandState local_state=global_state[threadIdx.x];
41 | curand_init(tid, 0, 0, &local_state); // sequence created with different seed and same sequence
42 | int __shared__ l_search[256];
43 | int __shared__ max_find[256];
44 | S->candidate.start[0] = 0;
45 | int sourceIndex=warpId,source=0;
46 | if(warpTid==0){
47 | atomicAdd(&S->candidate.start[0],1);
48 | }
49 | // sourceIndex= __shfl_sync(0xffffffff,sourceIndex,0);
50 | __syncwarp();
51 |
52 | #ifdef profile
53 | // if(threadIdx.x==0){printf("warpID:%d, sourceIndex:%d,start: %d\n",warpId, sourceIndex, S->candidate.start[0]);}
54 | #endif
55 | // start loop
56 | S->candidate.end[0]= n_subgraph;
57 | // clock_t start = clock();
58 |
59 | // Subgraph number should be higher than the number of warps assigned
60 | while(sourceIndex < S->candidate.end[0])
61 | {
62 | int VertCount=1;
63 | source= S->candidate.vertices[sourceIndex];
64 | int SampleID= S->candidate.instance_ID[sourceIndex];
65 | int NL= G.degree_list[source];
66 | #ifdef profile
67 | // if(warpTid==0){printf(" Source: %d, len: %d\n",source,NL);}
68 | #endif
69 | if((NL==0)){ // Skip empty vertices
70 | if(warpTid==0){sourceIndex=atomicAdd(&S->candidate.start[0],1);}
71 | sourceIndex= __shfl_sync(0xffffffff,sourceIndex,0);
72 | __syncwarp();
73 | continue;
74 | }
75 | int len= get_neighbors(&G,source,&S->wvar[warpId],VertCount);
76 |
77 | if(NORMALIZE==0){
78 | select(&S->wvar[warpId],&S->cache,NeighborSize,1,local_state, &G,S->count.colcount, source,S->max,bitflag,cache);
79 | }
80 | else{
81 | heur_normalize(&S->wvar[warpId],&S->cache,NeighborSize,1,local_state, &G,S->count.colcount, source, S->max,bitflag,cache);
82 | }
83 | frontier(&G,S,warpId,SampleID,NeighborSize,source,sourceIndex, hash, Depth);
84 |
85 | __syncwarp();
86 | if(warpTid==0){
87 | sourceIndex=atomicAdd(&S->candidate.start[0],1);
88 | }
89 | sourceIndex= __shfl_sync(0xffffffff,sourceIndex,0);
90 | __syncwarp();
91 | }
92 | if(tid==0){printf("%d,",S->sampled_count[0]);}
93 | }
94 |
95 | __global__ void
96 | check_layer(Sampling *S, gpu_graph G,curandState *global_state,int n_subgraph, int FrontierSize, int NeighborSize, int Depth)
97 | {
98 | float prefix_time,local_d_time,global_d_time;
99 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
100 | int hash=1, cache=0, bitflag=1, NORMALIZE=1;
101 | int warpId = tid/32;
102 |
103 | // __syncwarp();
104 |
105 | int warpTid=threadIdx.x%32;
106 | clock_t start_time,stop_time;
107 | int i=0, warpsize=100;
108 | curandState local_state=global_state[threadIdx.x];
109 | curand_init(tid, 0, 0, &local_state); // sequence created with different seed and same sequence
110 | int __shared__ l_search[256];
111 | int __shared__ max_find[256];
112 | S->candidate.start[0] = 0;
113 | int sourceIndex=warpId,source=0;
114 | if(warpTid==0){
115 | atomicAdd(&S->candidate.start[0],1);
116 | }
117 | // sourceIndex= __shfl_sync(0xffffffff,sourceIndex,0);
118 | // __syncwarp();
119 |
120 | #ifdef profile
121 | // if(threadIdx.x==0){printf("Block:%d, sourceIndex:%d, start: %d\n",blockIdx.x, sourceIndex, S->candidate.vertices[0]);}
122 | #endif
123 | S->candidate.end[0]= n_subgraph;
124 | // get degree for all frontiers
125 | int index= tid;
126 | // for(int i=0;i<(n_subgraph*FrontierSize);i++)
127 | // {
128 | // int vert= S->candidate.vertices[i];
129 | // S->frontier_degree[i] = G.degree_list[source];
130 | // }
131 | while(sourceIndex < n_subgraph)
132 | {
133 | int curr_depth=0;
134 | while(curr_depth<Depth)
135 | {
136 | // (lines 134-146 reconstructed; originals lost in extraction)
137 | // each instance owns FrontierSize consecutive slots in the
138 | // candidate pool; gather VertexBias for each frontier vertex
139 | // before the warp-wide ITS draw below
140 | int start_index= sourceIndex*FrontierSize;
141 | int end_index= start_index+FrontierSize;
142 | for(index=start_index+warpTid; index<end_index; index+=32)
143 | {
144 | // bias supplied by the user API in api.cuh
145 |
146 | int vert= S->candidate.vertices[index];
147 | int bias = VertexBias(vert, &G);
148 | S->wvar[warpId].degree[index-start_index]= (float)bias;
149 | #ifdef profile
150 | // printf("Vert: %d, Bias: %d\n",vert,bias);
151 | #endif
152 | }
153 | // __syncwarp();
154 | // pick one with ITS
155 | float r = curand_uniform(&local_state);
156 | int selectedIndex= ITS_MDRW(&S->wvar[warpId], local_state, &G, FrontierSize,r);
157 | if(threadIdx.x==0){
158 | int selected = S->candidate.vertices[selectedIndex];
159 | #ifdef profile
160 | if(warpTid==0){printf("Random selected: %d, vertex: %d\n",selectedIndex, selected);}
161 | #endif
162 | int NL= G.degree_list[selected];
163 | if(NL==0){curr_depth+=1;continue;}
164 | // generate one random integer with range of (0,NL);
165 | int r=rand_integer(local_state,NL);
166 | int neighbor_start= G.beg_pos[selected];
167 | int sample= G.adj_list[r+neighbor_start] ;
168 | #ifdef profile
169 | if(warpTid==0){printf("NL: %d, New selected: %d, vertex: %d\n",NL,r, sample);}
170 | #endif
171 | int SampleID=sourceIndex;
172 | int pos=atomicAdd(&S->samples[SampleID].start[0],1);
173 | S->samples[SampleID].vertex[pos]=selected;
174 | S->samples[SampleID].edge[pos]=sample;
175 | if(warpTid==0){atomicAdd(&S->sampled_count[0],1);}
176 | // update the degree and frontier
177 | S->candidate.vertices[selectedIndex] = sample;
178 | S->frontier_degree[selectedIndex] = G.degree_list[sample];
179 | #ifdef profile
180 | if(warpTid==0){printf("Next level. Curr Depth: %d\n",curr_depth);}
181 | #endif
182 | }
183 | // __syncwarp();
184 | curr_depth+=1;
185 | }
186 | if(warpTid==0){
187 | sourceIndex=atomicAdd(&S->candidate.start[0],1);
188 | }
189 | sourceIndex= __shfl_sync(0xffffffff,sourceIndex,0);
190 | // if(warpTid==0){printf("Next source. %d\n",sourceIndex);}
191 | // __syncwarp();
192 | }
193 | if(tid==0){printf("%d,",S->sampled_count[0]);}
194 | }
195 |
196 | struct arguments Sampler(char beg[100], char csr[100],int n_blocks, int n_threads, int n_subgraph, int FrontierSize, int NeighborSize, int Depth, struct arguments args, int rank)
197 | {
198 | int *total=(int *)malloc(sizeof(int)*n_subgraph);
199 | int *host_counter=(int *)malloc(sizeof(int));
200 | int T_Group=n_threads/32;
201 | int each_subgraph=Depth*NeighborSize;
202 | int total_length=each_subgraph*n_subgraph;
203 | int neighbor_length_max=n_blocks*6000*T_Group;
204 | int PER_BLOCK_WARP= T_Group;
205 | int BUCKET_SIZE=125;
206 | int BUCKETS=32;
207 | int warps = n_blocks * T_Group;
208 |
209 | int total_mem_for_hash=n_blocks*PER_BLOCK_WARP*BUCKETS*BUCKET_SIZE;
210 | int total_mem_for_bitmap=n_blocks*PER_BLOCK_WARP*300;
211 | //std::cout<<"Input: ./exe beg csr nblocks nthreads\n";
212 | int *bitmap, *node, *qstop_global, *qstart_global, *sample_id, *depth_tracker, *g_sub_index, *degree_l, *counter, *pre_counter;
213 | int *seeds=(int *)malloc(sizeof(int)*n_subgraph*FrontierSize);
214 | int *h_sample_id=(int *)malloc(sizeof(int)*n_subgraph*FrontierSize);
215 | int *h_depth_tracker=(int *)malloc(sizeof(int)*n_subgraph*FrontierSize);
216 |
217 | const char *beg_file=beg;
218 | const char *csr_file=csr;
219 | const char *weight_file=csr;
220 |
221 | graph<long, long, long, vertex_t, index_t, weight_t>
222 | *ginst = new graph
223 | <long, long, long, vertex_t, index_t, weight_t>
224 | (beg_file,csr_file,weight_file);
225 | gpu_graph ggraph(ginst);
226 | curandState *d_state;
227 | cudaMalloc(&d_state,sizeof(curandState));
228 | // int *host_counter=(int *)malloc(sizeof(int));
229 | int *host_prefix_counter=(int *)malloc(sizeof(int));
230 | int *node_list=(int *)malloc(sizeof(int)*total_length);
231 | int *set_list=(int *)malloc(sizeof(int)*total_length);
232 |
233 | int *degree_list=(int *)malloc(sizeof(int)*ginst->edge_count);
234 | std::random_device rd;
235 | std::mt19937 gen(56);
236 | std::uniform_int_distribution<> dis(1,10000);
237 | int numBlocks;
238 | //cudaGetDevice(&device);
239 | //cudaGetDeviceProperties(&prop, device);
240 |
241 | // cudaOccupancyMaxActiveBlocksPerMultiprocessor(
242 | // &numBlocks,
243 | // check,
244 | // n_threads,
245 | // 0);
246 | // cout<<"Max allocatable Blocks:"<<numBlocks<<endl;
247 | // (lines 246-254 reconstructed; originals lost in extraction)
248 | double start_time, total_time;
249 | Sampling *sampler;
250 |
251 |
252 |
253 | // host-side Sampling object; its device copy is staged below
254 | Sampling S(ginst->edge_count, warps, 10000, n_subgraph, BUCKETS*BUCKET_SIZE, Depth*NeighborSize, FrontierSize, Depth);
255 | H_ERR(cudaMalloc((void **)&sampler, sizeof(Sampling)));
256 |
257 | for(int n=0;n<n_subgraph*FrontierSize;n++)
258 | {
259 | seeds[n]= dis(gen);
260 | h_sample_id[n]= n/FrontierSize;
261 | h_depth_tracker[n]= 0;
262 | }
263 | // (lines 257-283 reconstructed; originals lost in extraction)
264 | // stage the seeds and per-seed bookkeeping in the device-side queue
265 | HRR(cudaMemcpy(S.candidate.vertices, seeds,
266 | sizeof(int)*n_subgraph*FrontierSize, cudaMemcpyHostToDevice));
267 | HRR(cudaMemcpy(S.candidate.instance_ID, h_sample_id,
268 | sizeof(int)*n_subgraph*FrontierSize, cudaMemcpyHostToDevice));
269 | HRR(cudaMemcpy(S.candidate.depth, h_depth_tracker,
270 | sizeof(int)*n_subgraph*FrontierSize, cudaMemcpyHostToDevice));
271 | HRR(cudaMemcpy(sampler, &S, sizeof(Sampling), cudaMemcpyHostToDevice));
272 | start_time= wtime();
273 | // printf("Seeds copied.\n");
274 |
275 |
276 |
277 |
278 |
279 |
280 | if(FrontierSize==1)
281 | {
282 | printf("Warp call\n");
283 | check<<<n_blocks,n_threads>>>(sampler, ggraph, d_state, n_subgraph, FrontierSize, NeighborSize, Depth);
284 | }
285 | else{
286 | printf("Layer call\n");
287 | check_layer<<<n_blocks,n_threads>>>(sampler, ggraph, d_state, n_subgraph, FrontierSize, NeighborSize, Depth);
288 | }
289 | HRR(cudaDeviceSynchronize());
290 | // HRR(cudaMemcpy(host_counter, sampler->sampled_count, sizeof(int), cudaMemcpyDeviceToHost));
291 |
292 | // int total_count=0;
293 | // for(int i=0; i < n_subgraph;i++){
294 | // int count= S.samples[i].start[0];
295 | // printf("Sampled: %d\n",host_counter[0]);
296 | // total_count+=count;
297 | // }
298 | total_time= wtime()-start_time;
299 | // printf("%s,SamplingTime:%.6f\n",argv[1],total_time);
300 | // Copy the sampled graph to CPU
301 | /*
302 | The sampled graph is stored as edge list. To get the samples in the CPU memory, copy each array from class Si to CPU allocated memory.
303 | */
304 | // printf("Sampled edges:%d\n",host_counter[0]);
305 | // args.sampled_edges=host_counter[0];
306 | args.time=total_time;
307 | return args;
308 | }
309 |
--------------------------------------------------------------------------------
/non-stream/mpi_main.cpp:
--------------------------------------------------------------------------------
1 | #include <stdio.h> // bracketed headers on lines 1-3 and 5-14 were stripped in extraction; restored
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include "sampler.cuh"
5 | #include <mpi.h>
6 | #include <math.h>
7 | #include <iostream>
8 | #include <fstream>
9 | #include <sstream>
10 | #include <assert.h>
11 | #include <time.h>
12 | #include <sys/time.h>
13 | #include <unistd.h>
14 | #include <vector>
15 | using namespace std;
16 |
17 | int main(int argc, char *argv[])
18 | {
19 | if(argc!=11){std::cout<<"Input: ./exe <dataset> <beg file> <csr file> <# of blocks> <# of threads> <# of samples> <frontier size> <neighbor size> <depth> <#GPUs>\n";exit(0);}
20 | // SampleSize, FrontierSize, NeighborSize
21 | // printf("MPI started\n");
22 | int n_blocks = atoi(argv[4]);
23 | int n_threads = atoi(argv[5]);
24 | int SampleSize = atoi(argv[6]);
25 | int FrontierSize = atoi(argv[7]);
26 | int NeighborSize = atoi(argv[8]);
27 | int Depth= atoi(argv[9]);
28 | int total_GPU = atoi(argv[10]);
29 |
30 | MPI_Status status;
31 | int myrank;
32 | double global_max_time, global_min_time;
33 | int global_sampled_edges=0; // never reduced across ranks in this version
34 | struct arguments args;
35 | MPI_Init(&argc, &argv);
36 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
37 | int global_sum;
38 | SampleSize = SampleSize/total_GPU;
39 |
40 | args= Sampler(argv[2],argv[3], n_blocks, n_threads, SampleSize, FrontierSize, NeighborSize, Depth, args,myrank);
41 | MPI_Reduce(&args.time, &global_max_time, 1, MPI_DOUBLE,MPI_MAX, 0, MPI_COMM_WORLD);
42 | MPI_Reduce(&args.time, &global_min_time, 1, MPI_DOUBLE,MPI_MIN, 0, MPI_COMM_WORLD);
43 | float rate = global_sampled_edges/global_max_time/1000000;
44 | if(myrank==0)
45 | {
46 | printf("%s,%f,%f\n",argv[1],global_min_time,global_max_time);
47 | }
48 | MPI_Finalize();
49 | return 0;
50 | }
--------------------------------------------------------------------------------
/non-stream/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ./sampling.bin WG WG/beg.bin WG/csr.bin 400 128 $1 $2 $3 $4 $5
3 |
--------------------------------------------------------------------------------
/non-stream/sample_class.cuh:
--------------------------------------------------------------------------------
1 | #ifndef SAMPLER_H
2 | #define SAMPLER_H
3 |
4 | #include "herror.h"
5 | #include "header.h"
6 | #include <stdio.h> // bracketed headers on lines 6-8 were stripped in extraction; restored
7 | #include <stdlib.h>
8 | #include <assert.h>
9 |
10 | class Cd{
11 | /*
12 | Candidate list shared by all instances
13 | */
14 | public:
15 | int s=5;
16 | int *instance_ID, *vertices, *depth;
17 | int *start, *end;
18 | ~Cd(){};
19 | Cd(){};
20 | Cd(int len ){
21 | H_ERR(cudaMalloc((void **)&instance_ID, sizeof(int)*len));
22 | H_ERR(cudaMalloc((void **)&vertices, sizeof(int)*len));
23 | H_ERR(cudaMalloc((void **)&depth, sizeof(int)*len));
24 | H_ERR(cudaMalloc((void **)&start, sizeof(int)*2));
25 | H_ERR(cudaMalloc((void **)&end, sizeof(int)*2));
26 | }
27 | };
28 |
29 | class Dimension{
30 | public:
31 | int *pool;
32 | ~Dimension(){};
33 | Dimension(){};
34 | void init(int FrontierSize){
35 | H_ERR(cudaMalloc((void **)&pool, sizeof(int)*FrontierSize));
36 | }
37 | };
38 |
39 | class Wv{
40 | /*
41 | Warp variables
42 | */
43 | public:
44 | int test=1;
45 | int *total_counter;
46 | int *frontier, *findex;
47 | int *neighbors, *nindex;
48 | float *degree;
49 | int *dindex;
50 | int *selected, *sindex;
51 | int *tempSelected;
52 | int *bitmap, *bindex;
53 | int NL, NS;
54 | int *max;
55 | ~Wv(){};
56 | Wv(){}
57 | void init(int flen,int nlen, int dlen, int slen){
58 | H_ERR(cudaMalloc((void **)&frontier, sizeof(int)*flen));
59 | H_ERR(cudaMalloc((void **)&neighbors, sizeof(int)*nlen));
60 | H_ERR(cudaMalloc((void **)&degree, sizeof(float)*dlen));
61 | H_ERR(cudaMalloc((void **)&bitmap, sizeof(int)*(dlen/32)));
62 | H_ERR(cudaMalloc((void **)&selected, sizeof(int)*slen));
63 | H_ERR(cudaMalloc((void **)&tempSelected, sizeof(int)*slen)); // was a duplicated &selected allocation; tempSelected was never allocated
64 | H_ERR(cudaMalloc((void **)&findex, sizeof(int)*2));
65 | H_ERR(cudaMalloc((void **)&nindex, sizeof(int)*2));
66 | H_ERR(cudaMalloc((void **)&dindex, sizeof(int)*2));
67 | H_ERR(cudaMalloc((void **)&sindex, sizeof(int)*2));
68 | H_ERR(cudaMalloc((void **)&bindex, sizeof(int)*2));
69 | H_ERR(cudaMalloc((void **)&max, sizeof(int)*2));
70 | H_ERR(cudaMalloc((void **)&total_counter, sizeof(int)*2));
71 | }
72 | };
73 |
74 |
75 |
76 | class Si{
77 | /*
78 | sampled graph for instances. Each instance has its own sample graph.
79 | */
80 | public:
81 | int *vertex,*edge;
82 | int *start;
83 | ~Si(){};
84 | Si(){}
85 | void init(int len){
86 | H_ERR(cudaMalloc((void **)&vertex, sizeof(int)*len));
87 | H_ERR(cudaMalloc((void **)&edge, sizeof(int)*len));
88 | H_ERR(cudaMalloc((void **)&start, sizeof(int)*2));
89 | }
90 | };
91 |
92 | class Ht{
93 | /*
94 | Hashtable for each instance
95 | */
96 | public:
97 | int *hash;
98 | int *bin_counter;
99 | int BUCKETS;
100 | int bin_size=125;
101 | ~Ht(){};
102 | Ht(){}
103 | void init(int bin_count){
104 | BUCKETS=bin_count;
105 | H_ERR(cudaMalloc((void **)&hash, sizeof(int)*bin_count*bin_size));
106 | H_ERR(cudaMalloc((void **)&bin_counter, sizeof(int)*bin_count));
107 | }
108 | };
109 |
110 | class Co{
111 | /*
112 | Counters used in sampling.
113 | */
114 | public:
115 | int max_NL=90000; // Update this value for dynamic allocation
116 | int *counter, *pre_counter, *total, *colcount, *max;
117 | ~Co(){};
118 | Co(){};
119 | Co(int total){
120 | HRR(cudaMalloc((void **) &counter,sizeof(int)*2));
121 | HRR(cudaMalloc((void **) &max,sizeof(int)*max_NL));
122 | HRR(cudaMalloc((void **) &pre_counter,sizeof(int)*2));
123 | HRR(cudaMalloc((void **) &colcount,sizeof(int)*50));
124 | HRR(cudaMalloc((void **) &total,sizeof(int)*total));
125 | }
126 | };
127 |
128 | class Cp{
129 | /*
130 | Cache probability for each vertex in the graph.
131 | */
132 | public:
133 | int *status;
134 | float *probability;
135 | int *counter;
136 | ~Cp(){};
137 | Cp(){};
138 | Cp(int len){
139 | // HRR(cudaMalloc((void **) &status,sizeof(int)*len));
140 | // HRR(cudaMalloc((void **) &probability,sizeof(float)*len));
141 | HRR(cudaMalloc((void **) &counter,sizeof(int)*2));
142 | }
143 | };
144 |
145 |
146 | class Sampling{
147 | /*
148 | Collection of objects for sampling
149 | */
150 | public:
151 | Cd candidate;
152 | Si samples[20100];
153 | Ht hashtable[20100];
154 | Co count;
155 | Wv wvar[2000];
156 | Dimension front[4000];
157 | Cp cache;
158 | int *max,*sampled_count,*frontier_degree;
159 | int n_child=1;
160 | int DEPTH_LIMIT;
161 | int BUCKETS=32;
162 | int max_NL=90000; // Update this value for dynamic allocation
163 | ~Sampling(){};
164 | Sampling(int edgecount,int warpCount, int qlen, int seeds, int C_len, int sampleSize, int FrontierSize, int depth){
165 | DEPTH_LIMIT=depth;
166 | count= Co(seeds);
167 | candidate= Cd(seeds*max_NL);
168 | cache= Cp(edgecount);
169 | HRR(cudaMalloc((void **) &max,sizeof(int)*2));
170 | HRR(cudaMalloc((void **) &frontier_degree,sizeof(int)*sampleSize*FrontierSize));
171 | HRR(cudaMalloc((void **) &sampled_count,sizeof(int)));
172 | for(int i=0;i<seeds;i++)
173 | {
174 | samples[i].init(sampleSize);
175 | hashtable[i].init(BUCKETS);
176 | }
177 | /* Everything from here through util.h line 4 was lost in extraction;
178 | the loop below and the two files that follow are reconstructed from
179 | their declarations and call sites. Treat them as assumptions. */
180 | for(int i=0;i<warpCount;i++)
181 | {
182 | wvar[i].init(qlen, max_NL, max_NL, sampleSize);
183 | }
184 | }
185 | };
186 |
187 | #endif
--------------------------------------------------------------------------------
/non-stream/sampler.cuh:
--------------------------------------------------------------------------------
1 | /* Restored: this header fell inside the extraction gap above; its
2 | contents are inferred from its uses in main.cu and mpi_main.cpp. */
3 | #ifndef SAMPLER_CUH
4 | #define SAMPLER_CUH
5 |
6 | struct arguments
7 | {
8 | int sampled_edges;
9 | double time;
10 | };
11 |
12 | struct arguments Sampler(char beg[100], char csr[100], int n_blocks,
13 | int n_threads, int n_subgraph, int FrontierSize, int NeighborSize,
14 | int Depth, struct arguments args, int rank);
15 |
16 | #endif
--------------------------------------------------------------------------------
/non-stream/util.h:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_H
2 | #define UTIL_H
3 |
4 | #include <stdio.h>
5 | static void HandleError( cudaError_t err,
6 | const char *file,
7 | int line ) {
8 | if (err != cudaSuccess) {
9 | printf( "%s in %s at line %d\n", \
10 | cudaGetErrorString( err ),
11 | file, line );
12 | exit( EXIT_FAILURE );
13 | }
14 | }
15 | #define H_ERR( err ) \
16 | (HandleError( err, __FILE__, __LINE__ ))
17 |
18 |
19 | #define SML_MID 32
20 | #define MID_LRG 1024
21 | //#define SWITCH_TO (float)10.3
22 | #define SWITCH_TO (float)0.2
23 | #define SWITCH_BACK (float)0.4
24 | //#define SWITCH_BACK (float)0.3
25 |
26 | //#define SML_MID 0
27 | //#define MID_LRG 6553600
28 |
29 | //#define SML_MID 0
30 | //#define MID_LRG 0
31 | #define GPUID 0
32 | #define THDS_NUM 512
33 | #define BLKS_NUM 512
34 | //#define BLKS_NUM 96
35 |
36 | #endif
37 |
--------------------------------------------------------------------------------
/non-stream/wtime.h:
--------------------------------------------------------------------------------
1 | #ifndef __TIME_H__
2 | #define __TIME_H__
3 |
4 | #include <sys/time.h> // bracketed headers restored; stripped in extraction
5 | #include <stdlib.h>
6 |
7 | double wtime()
8 | {
9 | double time[2];
10 | struct timeval time1;
11 | gettimeofday(&time1, NULL);
12 |
13 | time[0]=time1.tv_sec;
14 | time[1]=time1.tv_usec;
15 |
16 | return time[0]+time[1]*1.0e-6;
17 | }
18 |
19 | #endif
20 |
--------------------------------------------------------------------------------
/streaming/Makefile:
--------------------------------------------------------------------------------
1 | exe=streaming.bin
2 | N=1
3 | cucc= "$(shell which nvcc)"
4 | cc= "$(shell which mpicxx)"
5 | commflags=-lcudart -L"$(shell dirname $(cucc))"/../lib64
6 | cuflags= --compiler-options -v -Xcudafe -\# --resource-usage
7 | cuflags+= -std=c++11
8 | objs = $(patsubst %.cu,%.o,$(wildcard *.cu)) \
9 | $(patsubst %.cpp,%.o,$(wildcard *.cpp))
10 |
11 | deps = $(wildcard ./*.cuh) \
12 | $(wildcard ./*.hpp) \
13 | $(wildcard ./*.h) \
14 |
15 |
16 | %.o:%.cu $(deps)
17 | $(cucc) -c $(cuflags) $< -o $@
18 |
19 | %.o:%.cpp $(deps)
20 | $(cc) -c $< -o $@
21 |
22 | $(exe):$(objs)
23 | $(cc) $(objs) $(commflags) -O3 -o $(exe)
24 |
25 |
26 | test:$(exe)
27 | mpirun -n $(N) $(exe) WG WG/beg.bin WG/csr.bin 10 128 2 40 5 3 1
28 | clean:
29 | rm -rf *.o ${exe}
30 |
--------------------------------------------------------------------------------
/streaming/README.md:
--------------------------------------------------------------------------------
1 | #### Streaming version:
2 |
3 | ##### Input format
4 | Input: ./exe <dataset> <beg file> <csr file> <# of blocks> <# of threads> <# of samples> <frontier size> <neighbor size> <depth> <#GPUs>
5 |
6 | Neighbor size represents how many neighbors to sample for each vertex.
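
For example, with neighbor size 5 each frontier vertex contributes up to 5 sampled edges per expansion; if every sampled neighbor is expanded again, a depth-3 sample touches at most 5 + 25 + 125 = 155 edges per seed (illustrative arithmetic).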
7 |
8 | ##### Example:
9 | mpirun -n 1 streaming.bin WG WG/beg.bin WG/csr.bin 10 128 1 40 5 3 1
10 |
11 | or
12 |
13 | ./run.sh <# of samples> <frontier size> <neighbor size> <depth> <# of GPUs>
14 |
15 |
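For example, assuming the bundled WG dataset, the following should be equivalent to the mpirun command above on one GPU:

./run.sh 1 40 5 3 1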
16 |
17 | ##### Note:
18 | The current source code only supports dividing the graph into four partitions and streaming two partitions into the GPU at a time. A dynamic version may be uploaded later. Memory allocation (especially for storing the samples and the queue) may need to be increased depending on the sampling parameters.
19 | Note: the current version of this code works only with a single GPU.
20 |
--------------------------------------------------------------------------------
/streaming/WG/beg.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/concept-inversion/C-SAW/d91ff3ac896a90a3ea7b71e9251d5e79f67f8c6c/streaming/WG/beg.bin
--------------------------------------------------------------------------------
/streaming/WG/csr.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/concept-inversion/C-SAW/d91ff3ac896a90a3ea7b71e9251d5e79f67f8c6c/streaming/WG/csr.bin
--------------------------------------------------------------------------------
/streaming/gpu_graph.cuh:
--------------------------------------------------------------------------------
1 | //10/03/2016
2 | //Graph data structure on GPUs
3 | #ifndef _GPU_GRAPH_H_
4 | #define _GPU_GRAPH_H_
5 | #include <iostream>
6 | #include "header.h"
7 | #include "util.h"
8 | #include "graph.h"
9 |
10 | class gpu_graph
11 | {
12 | public:
13 | vertex_t *adj_list;
14 | weight_t *weight_list;
15 | index_t *beg_pos;
16 | vertex_t *degree_list;
17 |
18 | index_t vert_count;
19 | index_t edge_count;
20 | index_t avg_degree;
21 |
22 | public:
23 | ~gpu_graph(){}
24 |
25 | gpu_graph(
26 | graph *ginst)
27 | {
28 | vert_count=ginst->vert_count;
29 | edge_count=ginst->edge_count;
30 | avg_degree = ginst->edge_count/ginst->vert_count;
31 |
32 | // size_t weight_sz=sizeof(weight_t)*edge_count;
33 | size_t adj_sz=sizeof(vertex_t)*edge_count;
34 | size_t deg_sz=sizeof(vertex_t)*edge_count;
35 | size_t beg_sz=sizeof(index_t)*(vert_count+1);
36 | vertex_t *cpu_degree_list=(vertex_t*)malloc(sizeof(vertex_t)*edge_count);
37 | /* Alloc GPU space */
38 | H_ERR(cudaMalloc((void **)&adj_list, adj_sz));
39 | H_ERR(cudaMalloc((void **)&degree_list, deg_sz));
40 | H_ERR(cudaMalloc((void **)&beg_pos, beg_sz));
41 | //H_ERR(cudaMalloc((void **)&weight_list, weight_sz));
42 |
43 | for(int i=0; i<(ginst->edge_count); i++)
44 | {
45 | int neighbor= ginst->adj_list[i];
46 | //cout<<"Index: "<<i<<", Neighbor: "<<neighbor<<"\n";
47 | cpu_degree_list[i]= ginst->beg_pos[neighbor+1] - ginst->beg_pos[neighbor];
48 | }
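// After this loop cpu_degree_list[i] holds the out-degree of adj_list[i],
// i.e. a per-edge weight that the sampling kernels can use to bias
// neighbor selection toward high-degree destinations.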
49 |
50 | /* copy it to GPU */
51 | H_ERR(cudaMemcpy(adj_list,ginst->adj_list,
52 | adj_sz, cudaMemcpyHostToDevice));
53 | H_ERR(cudaMemcpy(beg_pos,ginst->beg_pos,
54 | beg_sz, cudaMemcpyHostToDevice));
55 | H_ERR(cudaMemcpy(degree_list,cpu_degree_list,
56 | deg_sz, cudaMemcpyHostToDevice));
57 |
58 | //H_ERR(cudaMemcpy(weight_list,ginst->weight,
59 | // weight_sz, cudaMemcpyHostToDevice));
60 | }
61 | };
62 |
63 | #endif
64 |
--------------------------------------------------------------------------------
/streaming/graph.h:
--------------------------------------------------------------------------------
1 | #ifndef __GRAPH_H__
2 | #define __GRAPH_H__
3 | #include <iostream>
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <assert.h>
7 | #include "wtime.h"
8 | #include <sys/stat.h>
9 | #include <unistd.h>
10 | #include <string.h>
11 | inline off_t fsize(const char *filename) {
12 | struct stat st;
13 | if (stat(filename, &st) == 0)
14 | return st.st_size;
15 | return -1;
16 | }
17 |
18 | template<
19 | typename file_vert_t, typename file_index_t, typename file_weight_t,
20 | typename new_vert_t, typename new_index_t, typename new_weight_t>
21 | class graph
22 | {
23 | public:
24 | new_index_t *beg_pos;
25 | new_vert_t *adj_list;
26 | new_weight_t *weight;
27 | new_vert_t *degree_list;
28 | new_index_t vert_count;
29 | new_index_t edge_count;
30 |
31 | public:
32 | graph(){};
33 | ~graph(){};
34 | graph(const char *beg_file,
35 | const char *adj_list_file,
36 | const char *weight_file);
37 |
38 | graph(file_vert_t *csr,
39 | file_index_t *beg_pos,
40 | file_weight_t *weight_list,
41 | file_index_t vert_count,
42 | file_index_t edge_count)
43 | {
44 | this->beg_pos = beg_pos;
45 | this->adj_list = csr;
46 | this->weight = weight_list;
47 | //this->degree_list= degree_list;
48 | this->edge_count = edge_count;
49 | this->vert_count = vert_count;
50 | };
51 | };
52 | #include "graph.hpp"
53 | #endif
54 |
--------------------------------------------------------------------------------
/streaming/graph.hpp:
--------------------------------------------------------------------------------
1 | #include "graph.h"
2 | #include <iostream>
3 |
4 | template<
5 | typename file_vert_t, typename file_index_t, typename file_weight_t,
6 | typename new_vert_t, typename new_index_t, typename new_weight_t>
7 | graph<file_vert_t, file_index_t, file_weight_t,
8 | new_vert_t, new_index_t, new_weight_t>
9 | ::graph(
10 | const char *beg_file,
11 | const char *adj_file,
12 | const char *weight_file)
13 | {
14 | double tm=wtime();
15 | FILE *file=NULL;
16 | file_index_t ret;
17 |
18 | vert_count=fsize(beg_file)/sizeof(file_index_t) - 1;
19 | edge_count=fsize(adj_file)/sizeof(file_vert_t);
20 |
21 | file=fopen(beg_file, "rb");
22 | if(file!=NULL)
23 | {
24 | file_index_t *tmp_beg_pos=NULL;
25 |
26 | if(posix_memalign((void **)&tmp_beg_pos, getpagesize(),
27 | sizeof(file_index_t)*(vert_count+1)))
28 | perror("posix_memalign");
29 |
30 | ret=fread(tmp_beg_pos, sizeof(file_index_t),
31 | vert_count+1, file);
32 | assert(ret==vert_count+1);
33 | fclose(file);
34 | edge_count=tmp_beg_pos[vert_count];
35 | //std::cout<<"Expected edge count: "<<edge_count<<"\n";
36 |
37 | assert(edge_count>0);
38 |
39 | //converting to new type when different
40 | if(sizeof(file_index_t)!=sizeof(new_index_t))
41 | {
42 | if(posix_memalign((void **)&beg_pos, getpagesize(),
43 | sizeof(new_index_t)*(vert_count+1)))
44 | perror("posix_memalign");
45 | for(new_index_t i=0;i<vert_count+1;++i)
46 | beg_pos[i]=(new_index_t)tmp_beg_pos[i];
--------------------------------------------------------------------------------
/streaming/mpi_main.cpp:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include "sampler.cuh"
5 | #include <iostream>
6 | #include <fstream>
7 | #include <sstream>
8 | #include <string>
9 | #include <assert.h>
10 | #include <math.h>
11 | #include <time.h>
12 | #include <unistd.h>
13 | #include <vector>
14 | #include <random>
15 | using namespace std;
16 |
17 | int main(int argc, char *argv[])
18 | {
19 | if(argc!=11){std::cout<<"Input: ./exe <dataset folder> <beg file> <csr file> <# of blocks> <# of threads> <# of samples> <frontier size> <neighbor size> <depth> <# of GPUs>\n";exit(0);}
20 | // SampleSize, FrontierSize, NeighborSize
21 | // printf("MPI started\n");
22 | int n_blocks = atoi(argv[4]);
23 | int n_threads = atoi(argv[5]);
24 | int n_subgraph = atoi(argv[6]);
25 | int FrontierSize = atoi(argv[7]);
26 | int NeighborSize = atoi(argv[8]);
27 | int Depth= atoi(argv[9]);
28 | int total_GPU = atoi(argv[10]);
29 |
30 | MPI_Status status;
31 | int myrank;
32 | double global_max_time, global_min_time;
33 | int global_sampled_edges;
34 | struct arguments args;
35 | MPI_Init(&argc, &argv);
36 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
37 | int global_sum;
38 | //SampleSize = SampleSize/total_GPU;
39 | args=Sampler(argv[2],argv[3], n_blocks, n_threads, n_subgraph, FrontierSize, NeighborSize, Depth, args, myrank);
40 | MPI_Reduce(&args.time, &global_max_time, 1, MPI_DOUBLE,MPI_MAX, 0, MPI_COMM_WORLD);
41 | MPI_Reduce(&args.time, &global_min_time, 1, MPI_DOUBLE,MPI_MIN, 0, MPI_COMM_WORLD);
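// The per-rank sampling time is reduced to the fastest (min) and slowest
// (max) GPU; rank 0 prints both, so min-time == max-time on a single GPU.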
42 | float rate = global_sampled_edges/global_max_time/1000000;
43 | if(myrank==0)
44 | {
45 | printf("%s,%f,%f\n",argv[1],global_min_time,global_max_time);
46 | }
47 | MPI_Finalize();
48 | return 0;
49 | }
50 |
--------------------------------------------------------------------------------
/streaming/run.sh:
--------------------------------------------------------------------------------
1 |
2 | ./streaming.bin WG WG/beg.bin WG/csr.bin 10 128 $1 $2 $3 $4 $5
3 |
4 |
--------------------------------------------------------------------------------
/streaming/sample_class.cuh:
--------------------------------------------------------------------------------
1 | #ifndef SAMPLER_H
2 | #define SAMPLER_H
3 |
4 | #include "herror.h"
5 | #include "header.h"
6 | #include
7 | #include
8 | #include
9 |
10 | class Cd{
11 | /*
12 | Candidate list shared by all instances
13 | */
14 | public:
15 | int s=5;
16 | int *instance_ID, *vertices, *depth;
17 | int *start, *end;
18 | ~Cd(){};
19 | Cd(){};
20 | Cd(int len ){
21 | H_ERR(cudaMalloc((void **)&instance_ID, sizeof(int)*len));
22 | H_ERR(cudaMalloc((void **)&vertices, sizeof(int)*len));
23 | H_ERR(cudaMalloc((void **)&depth, sizeof(int)*len));
24 | H_ERR(cudaMalloc((void **)&start, sizeof(int)*2));
25 | H_ERR(cudaMalloc((void **)&end, sizeof(int)*2));
26 | }
27 | };
28 |
29 | class Dimnesion{
30 | public:
31 | int *pool;
32 | ~Dimnesion(){};
33 | Dimnesion(){};
34 | void init(int FrontierSize){
35 | H_ERR(cudaMalloc((void **)&pool, sizeof(int)*FrontierSize));
36 | }
37 | };
38 |
39 | class Wv{
40 | /*
41 | Warp variables
42 | */
43 | public:
44 | int test=1;
45 | int *total_counter;
46 | int *frontier, *findex;
47 | int *neighbors, *nindex;
48 | float *degree;
49 | int *dindex;
50 | int *selected, *sindex;
51 | int *tempSelected;
52 | int *bitmap, *bindex;
53 | int NL, NS;
54 | int *max;
55 | ~Wv(){};
56 | Wv(){}
57 | void init(int flen,int nlen, int dlen, int slen){
58 | H_ERR(cudaMalloc((void **)&frontier, sizeof(int)*flen));
59 | H_ERR(cudaMalloc((void **)&neighbors, sizeof(int)*nlen));
60 | H_ERR(cudaMalloc((void **)&degree, sizeof(float)*dlen));
61 | H_ERR(cudaMalloc((void **)&bitmap, sizeof(int)*(dlen/32)));
62 | H_ERR(cudaMalloc((void **)&selected, sizeof(int)*slen));
63 | H_ERR(cudaMalloc((void **)&tempSelected, sizeof(int)*slen));
64 | H_ERR(cudaMalloc((void **)&findex, sizeof(int)*2));
65 | H_ERR(cudaMalloc((void **)&nindex, sizeof(int)*2));
66 | H_ERR(cudaMalloc((void **)&dindex, sizeof(int)*2));
67 | H_ERR(cudaMalloc((void **)&sindex, sizeof(int)*2));
68 | H_ERR(cudaMalloc((void **)&bindex, sizeof(int)*2));
69 | H_ERR(cudaMalloc((void **)&max, sizeof(int)*2));
70 | H_ERR(cudaMalloc((void **)&total_counter, sizeof(int)*2));
71 | }
72 | };
73 |
74 |
75 |
76 | class Si{
77 | /*
78 | Sampled graph for instances. Each instance has its own sampled graph.
79 | */
80 | public:
81 | int *vertex,*edge;
82 | int *start;
83 | ~Si(){};
84 | Si(){}
85 | void init(int len){
86 | H_ERR(cudaMalloc((void **)&vertex, sizeof(int)*len));
87 | H_ERR(cudaMalloc((void **)&edge, sizeof(int)*len));
88 | H_ERR(cudaMalloc((void **)&start, sizeof(int)*2));
89 | }
90 | };
91 |
92 | class Ht{
93 | /*
94 | Hashtable for each instance
95 | */
96 | public:
97 | int *hash;
98 | int *bin_counter;
99 | int BUCKETS;
100 | int bin_size=125;
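// Layout note: the hash array is bucket-strided, so slot s of bucket b is
// stored at s*BUCKETS + b; a lookup walks one bucket in strides of BUCKETS.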
101 | ~Ht(){};
102 | Ht(){}
103 | void init(int bin_count){
104 | BUCKETS=bin_count;
105 | H_ERR(cudaMalloc((void **)&hash, sizeof(int)*bin_count*bin_size));
106 | H_ERR(cudaMalloc((void **)&bin_counter, sizeof(int)*bin_count));
107 | }
108 | };
109 |
110 | class Co{
111 | /*
112 | Counters used in sampling.
113 | */
114 | public:
115 | int *counter, *pre_counter, *total, *colcount, *max;
116 | int max_NL=90000; // Update this to dynamic allocation
117 | ~Co(){};
118 | Co(){};
119 | Co(int total){
120 | HRR(cudaMalloc((void **) &counter,sizeof(int)*2));
121 | HRR(cudaMalloc((void **) &max,sizeof(int)*max_NL));
122 | HRR(cudaMalloc((void **) &pre_counter,sizeof(int)*2));
123 | HRR(cudaMalloc((void **) &colcount,sizeof(int)*50));
124 | HRR(cudaMalloc((void **) &total,sizeof(int)*total));
125 | }
126 | };
127 |
128 | class Cp{
129 | /*
130 | Cache probability for each vertex in the graph.
131 | */
132 | public:
133 | int *status;
134 | float *probability;
135 | int *counter;
136 | ~Cp(){};
137 | Cp(){};
138 | Cp(int len){
139 | // HRR(cudaMalloc((void **) &status,sizeof(int)*len));
140 | // HRR(cudaMalloc((void **) &probability,sizeof(float)*len));
141 | HRR(cudaMalloc((void **) &counter,sizeof(int)*2));
142 | }
143 | };
144 |
145 |
146 | class Sampling{
147 | /*
148 | Collection of objects for sampling
149 | */
150 | public:
151 | Cd candidate;
152 | Si samples[20100];
153 | Ht hashtable[20100];
154 | Co count;
155 | Wv wvar[2000];
156 | Dimnesion front[4000];
157 | Cp cache;
158 | int *max,*sampled_count,*frontier_degree;
159 | int n_child=1;
160 | int DEPTH_LIMIT;
161 | int BUCKETS=32;
162 | int max_NL=90000; // Update this to dynamic allocation
163 | ~Sampling(){};
164 | Sampling(int edgecount,int warpCount, int qlen, int seeds, int C_len, int sampleSize, int FrontierSize, int depth){
165 | DEPTH_LIMIT=depth;
166 | count= Co(seeds);
167 | candidate= Cd(seeds*max_NL);
168 | cache= Cp(edgecount);
169 | HRR(cudaMalloc((void **) &max,sizeof(int)*2));
170 | HRR(cudaMalloc((void **) &frontier_degree,sizeof(int)*sampleSize*FrontierSize));
171 | HRR(cudaMalloc((void **) &sampled_count,sizeof(int)));
172 | for(int i=0;i
--------------------------------------------------------------------------------
/streaming/streaming_sampling.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <iostream>
4 | #include <assert.h>
5 | #include <math.h>
6 | #include <time.h>
7 | #include <random>
8 | #include <cuda.h>
9 | #include <curand.h>
10 | #include <curand_kernel.h>
11 | #include "gpu_graph.cuh"
12 | #include "graph.h"
13 | #include "herror.h"
14 | #include "sampler.cuh"
15 | #include "wtime.h"
16 | using namespace std;
17 |
18 | // int RAND_MAX=10000;
19 | int sum(int length, int *a) {
20 | int total = 0;
21 | // std::cout<<"\n size:"<<length<<"\n";
22 | for (int i = 0; i < length; i++) {
23 | total += a[i];
24 | }
25 | return total;
26 | }
27 |
28 | __device__ int binary_search(int start, int end, float value, float *arr) {
29 | int low = start;
30 | int high = end;
31 | int index = start;
32 | while (low <= high) {
33 | index = ((low + high) / 2);
34 | if (value < arr[index]) {
35 | // set high to index-1
36 | high = index - 1;
37 | } else if (value > arr[index]) {
62 | // set low to index+1
63 | low = index + 1;
64 | // printf("low:%d\n",low);
65 |
66 | } else {
67 | break;
68 | }
69 | }
70 | return index;
71 | }
72 |
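/* Inverse-transform sampling step: binary-search the normalized prefix-sum
   array for a uniform random value, then atomically set the chosen slot's
   bit in `bitmap` so a repeated pick can be detected through is_in. */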
73 | __device__ int bitmap_binary_search(int start, int end, float value, float *arr,
74 | int *bitmap, int bitmap_start, int &is_in) {
75 | // printf("low:%d,high:%d,value:%f\n",start,end,value);
76 | int low = start;
77 | int high = end;
78 | int index = start;
79 | int bitmap_width = 32;
80 | while (low <= high) {
81 | index = ((low + high) / 2);
82 | if (value < arr[index]) {
83 | // set high to index-1
84 | high = index - 1;
85 | // printf("high:%d\n",high);
86 | } else if (value > arr[index]) {
87 | // set low to index+1
88 | low = index + 1;
89 | // printf("low:%d\n",low);
90 | } else {
91 | break;
92 | }
93 | }
94 | int bitmap_pos = index;
95 | int bit_block_index =
96 | bitmap_pos / bitmap_width; // find the address of bitmap
97 | int bit_block_pos = bitmap_pos % bitmap_width; // position within a address
98 | // reversed------------
99 |
100 | // int bit_block_pos = bitmap_pos / bitmap_width;
101 | // int bit_block_index= bitmap_pos % bitmap_width;
102 | int initial_mask = 1;
103 | int mask = (initial_mask << bit_block_pos);
104 | int status = atomicOr(&bitmap[bit_block_index + bitmap_start], mask);
105 | is_in = (mask & status) >> bit_block_pos;
106 |
107 | // is_in= 0x00000001 & (status >> bit_block_pos);
108 | // printf("thread: %d, index:%d, bit_block_index:%d, bit_block_pos:%d,
109 | // mask:%d, status: %d,shift: %d,
110 | // is_in:%d\n",threadIdx.x,index,bit_block_index,bit_block_pos,mask,status,(mask
111 | // & status),is_in);
112 | return index;
113 | }
114 |
115 | void r2() {
116 | std::random_device rd;
117 | std::mt19937 gen(rd());
118 | std::uniform_real_distribution<> dis(0, 1);
119 | for (int n = 0; n < 10; ++n) {
120 | std::cout << dis(gen) << ' ';
121 | }
122 | }
123 |
124 | __device__ float frandom(curandState *global) {
125 | // curand_init(1000,threadIdx.x,10,&global[threadIdx.x]);
126 | float x = ((curand_uniform(&global[0])));
127 | return x;
128 | }
129 |
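/* Duplicate check: walk bucket `bin` of this sample's hash table; the slots
   of one bucket sit BUCKETS apart, and a hit returns 1. */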
130 | __device__ int linear_search(int neighbor, int *partition1, int *bin_count,
131 | int bin, int BIN_OFFSET, int BIN_START,
132 | int BUCKETS) {
133 | int len = bin_count[bin + BIN_OFFSET];
134 |
135 | int i = bin + BIN_START;
136 | // printf("\nL: %d, I:%d\n",len,i);
137 | int step = 0;
138 | while (step < len) {
139 | int test = partition1[i];
140 | // printf("Neighbor: %d, Test: %d, address: %d\n",neighbor,test,i);
141 | if (test == neighbor) {
142 | // printf("Duplicate detected
143 | // -------------------------------------------------------\n");
144 | return 1;
145 | } else {
146 | i += BUCKETS;
147 | }
148 | step += 1;
149 | }
150 | return 0;
151 | }
152 |
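/* Warp-cooperative Blelloch-style exclusive prefix sum over
   degree_l[offset_d_n .. offset_d_n+len): the first loop is the up-sweep
   (pairwise partial sums), the second the down-sweep that distributes them;
   len is the neighbor count padded to the next power of two. */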
153 | __device__ void gpu_prefix(int total_step, int warp_tid, float *degree_l,
154 | int offset_d_n, int warpsize, int len) {
155 | for (int i = 0; i < total_step; i++) {
156 | // Loop the threads
157 | int req_thread = len / (powf(2, (i + 1)));
158 | for (int iid = warp_tid; iid <= req_thread; iid += warpsize) {
159 | int tid_offset = iid * powf(2, i + 1);
160 | // calculate the index
161 | int i1 = (tid_offset) + (powf(2, i)) - 1 + offset_d_n;
162 | int i2 = (tid_offset) + powf(2, i + 1) - 1 + offset_d_n;
163 | if (i1 > (offset_d_n + len - 1)) {
164 | break;
165 | }
166 | // printf("i:%d, Index1 %d: %f,Index2 %d: %f,
167 | // thread:%d\n",i,i1,degree_l[i1],i2,degree_l[i2],threadIdx.x);
168 | // load the values to shared mem
169 | int temp1 = degree_l[i1];
170 | int temp2 = degree_l[i2];
171 | degree_l[i2] = temp2 + temp1;
172 | // printf("Index:%d, Value:%d \n",i2,temp[i2]);
173 | }
174 | }
175 | degree_l[len - 1 + offset_d_n] = 0;
176 | // printf("\nDownstep:%d\n",degree_l[len-1]);
177 | for (int i = (total_step - 1); i >= 0; i--) {
178 | // Loop the threads
179 | int req_thread = len / (powf(2, (i + 1)));
180 | for (int iid = warp_tid; iid <= req_thread; iid += warpsize) {
181 | int tid_offset = iid * powf(2, i + 1);
182 | int i1 = (tid_offset) + (powf(2, i)) - 1 + offset_d_n;
183 | int i2 = (tid_offset) + powf(2, i + 1) - 1 + offset_d_n;
184 | if (i1 > (offset_d_n + len - 1)) {
185 | break;
186 | }
187 | // printf("temp1: %d, temp2: %d, thread:%d\n",i1,i2,threadIdx.x);
188 | // printf("Index1 %d: %f,Index2 %d: %f,
189 | // thread:%d\n",i1,degree_l[i1],i2,degree_l[i2],threadIdx.x);
190 | int temp1 = degree_l[i1];
191 | int temp2 = degree_l[i2];
192 | degree_l[i1] = temp2;
193 | degree_l[i2] = temp2 + temp1;
194 | // printf("Index:%d, Value:%d \n",i2,temp[i2]);
195 | }
196 | }
197 | }
198 |
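/* Sampling kernel: each warp pops a vertex from this partition's frontier
   queue, picks up to n_child of its neighbors (degree-biased via the prefix
   sum and binary search when the adjacency list is long enough), rejects
   duplicates through the bitmap and per-sample hash table, emits the sampled
   edge, and pushes surviving neighbors into the queue of the partition that
   owns them. */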
199 | __global__ void check(int Graph_block_size, int streamid, int block_id,
200 | vertex_t *adj_list, index_t *beg_pos,
201 | weight_t *weight_list, int vertex_count,
202 | curandState *global_state, int *g_node_list,
203 | int *g_edge_list, int *neigh_l, float *degree_l,
204 | int n_blocks, int *d_seed, int n_threads, int *total,
205 | int *hashtable, int *bitmap, int total_subgraphs,
206 | int *node, int *queue, int *sample_id, int *depth_tracker,
207 | int *qstart_global, int *qstop_global, int *g_sub_index,
208 | int n_child, int depth_limit, int sample_size, int queue_size) {
209 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
210 | // int __shared__ q1_start, q2_end, depth, q2_start, q2_stop;
211 | int temp_queue_start = qstart_global[block_id];
212 | int temp_queue_stop = qstop_global[block_id];
213 | //-----------------We may require a barrier here for storing temp
214 | //queue---------------------//
215 | int __shared__ bin_count[128];
216 | int warp_tid = threadIdx.x % 32;
217 | int G_warpID = tid / 32;
218 | int warpId = threadIdx.x / 32;
219 | int warpsize = 32;
220 | int offset_d_n = G_warpID * 4000;
221 | int BUCKETS = 32;
222 | int BINsize = BUCKETS * 6;
223 | int bitmap_size = 100;
224 | int Graph_block = 4;
225 | float prefix_time, local_d_time, global_d_time;
226 | clock_t start_time, stop_time;
227 | int __shared__ prefix;
228 | int seed_index;
229 | int BIN_OFFSET = 0;
230 | int depthcount, edges_traversed, q_stop, vertex, total_work;
231 | int q_start;
232 | int queue_start_address = block_id * queue_size;
233 | curandState local_state = global_state[threadIdx.x];
234 | curand_init(
235 | tid, 0, 0,
236 | &local_state); // sequence created with different seed and same sequence
237 | int depth_flag = 0;
238 | edges_traversed = 0;
239 | // add all items to the combined queue: Number of threads must be greater than
240 | // samples
241 | if ((qstop_global[block_id] - qstart_global[block_id]) != 0) {
242 | if (warp_tid == 0) {
243 | q_start = atomicAdd(&qstart_global[block_id], 1);
244 | }
245 | q_start = __shfl_sync(0xffffffff, q_start, 0);
246 | __syncwarp();
247 |
248 | while (q_start < qstop_global[block_id]) {
249 | vertex = queue[q_start + queue_start_address];
250 | //if(warp_tid==0){printf("Block_id:%d, StreamId: %d, G_warpID: %d,SampleID:%d, vertex:%d, q_stop:%d,q_start:%d,depth:%d\n",block_id,streamid,G_warpID,sample_id[q_start+queue_start_address],vertex,qstop_global[block_id],q_start,depth_tracker[q_start+queue_start_address]);}
251 | int neighbor_start = beg_pos[vertex];
252 | int neighbor_end = beg_pos[vertex + 1];
253 | int neighbor_length = neighbor_end - neighbor_start;
254 | edges_traversed += neighbor_length;
255 | if (neighbor_length == 0) {
256 | if (warp_tid == 0) {
257 | q_start = atomicAdd(&qstart_global[block_id], 1);
258 | }
259 | q_start = __shfl_sync(0xffffffff, q_start, 0);
260 | __syncwarp();
261 | continue;
262 | }
263 | int is_in = 0;
264 | int new_neighbor;
265 | int selected = 0;
266 | if (neighbor_length < n_child) {
267 | prefix = 0;
268 | } else {
269 | prefix = 1;
270 | }
271 | int thread_flag = 0;
272 | if ((warp_tid < n_child) && (warp_tid < neighbor_length)) {
273 | thread_flag = 1;
274 | }
275 | if (prefix) {
276 | // For each neighbor, calculate the degree of its neighbor
277 | int index = offset_d_n + warp_tid; // use block and thread Id for index
278 | for (int i = warp_tid + neighbor_start; i < neighbor_end;
279 | i += warpsize) {
280 | // neighbor ID
281 | int temp = adj_list[i];
282 | // if((temp>Graph_block_size)&(warp_tid==0)){printf("Reading from outside.\n");}
283 | // degree of neighbor
284 | degree_l[index] = float(beg_pos[temp + 1] - beg_pos[temp]);
285 | // printf("%d has a degree of %f found by
286 | // %d,index:%d\n",temp,degree_l[index],threadIdx.x,index);
287 | index += warpsize;
288 | }
289 | int i_start_neigh = offset_d_n;
290 | int i_end_neigh = i_start_neigh + neighbor_length;
291 | // printf("Starting prefix_sum\n");
292 | // start_time = clock();
293 | float bits = log2f(neighbor_length);
294 | int raise = ceilf(bits);
295 | int max_bit = powf(2, raise);
296 | int len = max_bit;
297 | int total_step = log2f(max_bit);
298 | gpu_prefix(total_step, warp_tid, degree_l, offset_d_n, warpsize, len);
299 | float sum = degree_l[neighbor_length - 1 + offset_d_n];
300 | for (int i = warp_tid + i_start_neigh; i < i_end_neigh; i += warpsize) {
301 | // printf("i:%d, degree:%.2f\n",i,degree_l[i]);
302 | degree_l[i] = degree_l[i] / ((double)sum);
303 | }
304 | // start_time = clock();
305 | int bitmap_start = G_warpID * bitmap_size;
306 | if (warp_tid < n_child) {
307 | float r = curand_uniform(&local_state);
308 | //------------------------------------Using
309 | //bitmaps----------------------------------------------
310 | selected =
311 | bitmap_binary_search(i_start_neigh, i_end_neigh, r, degree_l,
312 | bitmap, bitmap_start, is_in);
313 | new_neighbor = adj_list[selected + neighbor_start - offset_d_n];
314 | // if(is_in==0) {printf("Index: %d, New N: %d, Thread:
315 | // %d\n",selected,new_neighbor,threadIdx.x);}
316 | //--------------------------------------------------------------------------------------------
317 | }
318 | // Reset Bitmaps
319 | int start = bitmap_start + warp_tid;
320 | int end = bitmap_start + bitmap_size;
321 | for (int i = start; i < end; i += warpsize) {
322 | bitmap[i] = 0;
323 | // printf("Bitmap cleared at %d\n",i);
324 | }
325 | }
326 | else {
327 | if (thread_flag) {
328 | new_neighbor =
329 | adj_list[warp_tid + neighbor_start]; // unwanted thread also may
330 | // get some child but will be
331 | // neglected in next section
332 | }
333 | // printf("New Neighbor: %d, thread: %d\n",new_neighbor,threadIdx.x);
334 | }
335 | /* Use hashtable for detecting duplicates*/
336 | int BIN_START = sample_id[q_start] * BINsize;
337 | if (is_in == 0 && thread_flag) {
338 | int bin = new_neighbor % BUCKETS;
339 | is_in = linear_search(new_neighbor, hashtable, bin_count, bin,
340 | BIN_OFFSET, BIN_START, BUCKETS);
341 | // if(is_in==1){printf("Duplicated Found: %d\n",new_neighbor);}
342 | }
343 | //-------------------------------------------------------------------
344 | if (is_in == 0 && thread_flag) {
345 | //------------------------Store in
346 | //hashtable-----------------------------//
347 | int bin = new_neighbor % BUCKETS;
348 | // int index= warpId;
349 | int index = atomicAdd(&bin_count[bin + BIN_OFFSET], 1);
350 | hashtable[index * BUCKETS + bin + BIN_START] = new_neighbor;
352 | int g_sub_start = sample_id[q_start] * sample_size;
353 | int g_to = atomicAdd(&g_sub_index[sample_id[q_start]], 1);
354 | //g_node_list[g_to + g_sub_start] = vertex;
355 | //g_edge_list[g_to + g_sub_start] = new_neighbor;
356 | printf("%d,%d,%d,%d\n",vertex,new_neighbor,sample_id[q_start],depth_tracker[q_start + queue_start_address]);
357 | //Added to sample:752601,328138,20,0,2
358 | // add to the expand queue
359 | if (depth_tracker[q_start] < depth_limit) {
360 | int new_bin = new_neighbor / Graph_block_size;
361 | int new_queue_start = new_bin * queue_size;
362 | // if(new_bin!=0)
363 | // { printf("Block:%d, Added to block:%d\n",block_id,new_bin);}
364 | int to = atomicAdd(&qstop_global[new_bin], 1);
365 | queue[to + new_queue_start] = new_neighbor;
366 | sample_id[to + new_queue_start] =
367 | sample_id[q_start + queue_start_address];
368 | depth_tracker[to + new_queue_start] =
369 | depth_tracker[q_start + queue_start_address] + 1;
370 | //printf("Added: %d, to queue at index %d and block %d, local_index: %d, offset: %d, new_d: %d, prev_d: %d\n",new_neighbor,to + new_queue_start,new_bin, to, new_queue_start,depth_tracker[to + new_queue_start], depth_tracker[q_start + queue_start_address]);
371 | }
372 | }
373 | // q_start+=1;
374 | if ((qstart_global[block_id] > qstop_global[block_id])) {
375 | break;
376 | }
377 | if (warp_tid == 0) {
378 | q_start = atomicAdd(&qstart_global[block_id], 1);
379 | }
380 | q_start = __shfl_sync(0xffffffff, q_start, 0);
381 | __syncwarp();
382 | }
383 | }
384 | }
385 |
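/* Bins the seed vertices into graph partitions and returns the start of the
   window of block_window_size consecutive partitions holding the most seeds;
   presumably used to decide which partitions to stream first. */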
386 | int build_histogram(int n_subgraph, int *input, int *frequency,
387 | int block_window_size, int block_size, int vert_count,
388 | int vertex_block_count) {
389 | int max_index = 0, max_value = 0;
390 | for (int i = 0; i < n_subgraph; i++) {
391 | int block = input[i] / block_size;
392 | if (block > vertex_block_count) {
393 | block = vertex_block_count;
394 | }
395 | // cout<<"Value:"<<input[i]<<"\n";
396 | frequency[block] += 1;
397 | }
398 | for (int j = 0; j < vertex_block_count; j++) {
399 | int combined_freq = 0;
400 | for (int k = 0; k < block_window_size; k++) {
401 | combined_freq += frequency[j + k];
402 | }
403 | if (combined_freq > max_value) {
404 | max_index = j;
405 | max_value = combined_freq;
406 | }
407 | }
408 | cout << "Max_index:" << max_index << "Max_value:" << max_value << "\n";
409 | return max_index;
410 | }
411 |
412 | int block_augument(int blocks, int vertex_count, index_t *beg_pos,
413 | int *beg_size_list, int *adj_size_list) {
414 | int block_size = (vertex_count) / blocks;
415 | for (int i = 0; i < (blocks + 1); i += 1) {
416 | int start_block = i * block_size;
417 | if (i == blocks) {
418 | start_block = vertex_count;
419 | }
420 | beg_size_list[i] = start_block;
421 | int start_adj = beg_pos[block_size * i];
422 | adj_size_list[i] = start_adj;
423 | }
424 | return 0;
425 | }
426 |
427 | struct arguments Sampler(char beg[100], char csr[100], int n_blocks,
428 | int n_threads, int n_subgraph, int frontier_size,
429 | int neighbor_size, int depth, struct arguments args,
430 | int rank) {
431 | // if(args!=7){std::cout<<"Wrong input\n"; return -1;}
432 | //n_child, depth, each_subgraph, queue_size
433 | // cout<<"\nblocks:"<<n_blocks<<", threads:"<<n_threads<<"\n";
457 | graph<long, long, long, vertex_t, index_t, weight_t> *ginst =
458 | new graph<long, long, long, vertex_t, index_t, weight_t>(
459 | beg_file, csr_file, weight_file);
460 | int vertex_count = ginst->vert_count;
461 | int edge_count = ginst->edge_count;
462 | int Graph_block_size = vertex_count / Graph_block;
463 | // int Graph_block_size=2000;
464 | /*
465 | printf("Size of blocks\n");
466 | for (int i = 0; i < 4; i++) {
467 | printf("%d,%d\n", i,
468 | ginst->beg_pos[(i + 1) * Graph_block_size] -
469 | ginst->beg_pos[(i)*Graph_block_size]);
470 | }
471 | */
472 | curandState *d_state;
473 | cudaMalloc(&d_state, sizeof(curandState) * n_threads); // one RNG state per thread
474 | gpu_graph ggraph(ginst);
475 | int *node_list = (int *)malloc(sizeof(int) * total_length);
476 | int *set_list = (int *)malloc(sizeof(int) * total_length);
477 | float *n_random = (float *)malloc(sizeof(float) * n_threads);
478 | int *seeds = (int *)malloc(sizeof(int) * total_queue_memory);
479 | int *seeds_counter = (int *)malloc(sizeof(int) * Graph_block);
480 | int *start_queue = (int *)malloc(sizeof(int) * Graph_block);
481 | int *degree_list = (int *)malloc(sizeof(int) * ginst->edge_count);
482 | int *adj_size_list = (int *)malloc(sizeof(int) * (Graph_block + 1));
483 | int *beg_size_list = (int *)malloc(sizeof(int) * (Graph_block + 1));
484 | for (int n = 0; n < Graph_block; n++) {
485 | seeds_counter[n] = 0;
486 | start_queue[n] = 0;
487 | }
488 | std::random_device rd;
489 | // 200 --> 370 Mteps
490 | int numBlocks;
491 | // cudaGetDevice(&device);
492 | // cudaGetDeviceProperties(&prop, device);
493 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, check, n_threads,
494 | 0);
495 |
496 | int deviceCount;
497 | HRR(cudaGetDeviceCount(&deviceCount));
498 | // printf("My rank: %d, totaldevice: %d\n", rank,deviceCount);
499 | // HRR(cudaSetDevice(rank%deviceCount));
500 | // cout<<"Max allocatable Blocks:"<<numBlocks<<"\n";
508 | for (int i = 0; i < (ginst->edge_count); i++) {
509 | int neighbor = ginst->adj_list[i];
510 | degree_list[i] = ginst->beg_pos[neighbor + 1] - ginst->beg_pos[neighbor];
511 | }
512 | int *hashtable, *bitmap, *node, *queue, *qstop_global, *qstart_global,
513 | *sample_id, *depth_tracker, *g_sub_index, *degree_l, *prefix_status;
514 | // Size of blocks
515 | HRR(cudaMalloc((void **)&d_total, sizeof(int) * n_subgraph));
516 | HRR(cudaMalloc((void **)&node, sizeof(int) * 2));
517 | HRR(cudaMalloc((void **)&degree_l, sizeof(int) * ginst->edge_count));
518 | HRR(cudaMalloc((void **)&prefix_status, sizeof(int) * ginst->edge_count));
519 | // HRR(cudaMalloc((void **)&d_degree_l, sizeof(float) * ginst->edge_count)); // superseded by the neighbor_length_max allocation below
520 | HRR(cudaMalloc((void **)&qstart_global, sizeof(int) * Graph_block));
521 | HRR(cudaMalloc((void **)&qstop_global, sizeof(int) * Graph_block));
522 | HRR(cudaMalloc((void **)&d_node_list, sizeof(int) * total_length));
523 | HRR(cudaMalloc((void **)&d_edge_list, sizeof(int) * total_length));
524 | HRR(cudaMalloc((void **)&d_neigh_l, sizeof(int) * neighbor_length_max));
525 | HRR(cudaMalloc((void **)&hashtable, sizeof(int) * total_mem_for_hash));
526 | HRR(cudaMalloc((void **)&bitmap, sizeof(int) * total_mem_for_bitmap));
527 | HRR(cudaMalloc((void **)&d_degree_l, sizeof(float) * neighbor_length_max));
528 | HRR(cudaMalloc((void **)&queue, sizeof(int) * total_queue_memory));
529 | HRR(cudaMalloc((void **)&sample_id, sizeof(int) * total_queue_memory));
530 | HRR(cudaMalloc((void **)&depth_tracker, sizeof(int) * total_queue_memory));
531 | HRR(cudaMalloc((void **)&g_sub_index, sizeof(int) * total_queue_memory));
532 | int *h_sample_id = (int *)malloc(sizeof(int) * total_queue_memory);
533 | int *h_depth_tracker = (int *)malloc(sizeof(int) * total_queue_memory);
534 | std::mt19937 gen(57);
535 | std::uniform_int_distribution<> dis(1, vertex_count / 4);
536 |
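// Seed placement: each randomly drawn seed vertex goes into the frontier
// queue of the partition (bin) that owns it, tagged with its sample id and
// an initial depth of 0.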
537 | for (int n = 0; n < n_subgraph; n++) {
538 | int new_seed = dis(gen);
539 | int bin_new = new_seed / Graph_block_size;
540 | if (bin_new >= Graph_block) {
541 | bin_new = Graph_block - 1;
542 | }
543 | int pos = bin_new * (queue_size) + seeds_counter[bin_new];
544 | assert(pos < total_queue_memory);
545 | seeds_counter[bin_new]++;
546 | seeds[pos] = new_seed;
547 | h_sample_id[pos] = n;
548 | h_depth_tracker[pos] = 0;
549 | // printf("N_subgraph: %d, Seed:%d, Bin:%d\n",n,new_seed,bin_new);
550 | }
551 | /* For streaming partition */
552 |
553 | HRR(cudaMemcpy(queue, seeds, sizeof(int) * total_queue_memory,
554 | cudaMemcpyHostToDevice));
555 | HRR(cudaMemcpy(qstart_global, start_queue, sizeof(int) * Graph_block,
556 | cudaMemcpyHostToDevice));
557 | HRR(cudaMemcpy(qstop_global, seeds_counter, sizeof(int) * Graph_block,
558 | cudaMemcpyHostToDevice));
559 | HRR(cudaMemcpy(sample_id, h_sample_id, sizeof(int) * total_queue_memory,
560 | cudaMemcpyHostToDevice));
561 | HRR(cudaMemcpy(depth_tracker, h_depth_tracker,
562 | sizeof(int) * total_queue_memory, cudaMemcpyHostToDevice));
563 | // create four cuda streams
564 |
565 | cudaStream_t stream1, stream2, stream3, stream4;
566 | cudaStreamCreate(&stream1);
567 | cudaStreamCreate(&stream2);
568 | cudaStreamCreate(&stream3);
569 | cudaStreamCreate(&stream4);
570 | cudaEvent_t event;
571 | cudaEventCreate(&event);
572 | // find top 3 blocks
573 | int sampling_complete = false;
574 | int i = 0, block_id1 = 0, block_id2 = 1, block_id3 = 2, block_id4 = 3;
575 |
576 | int q_count, max, value;
577 | block_augument(Graph_block, vertex_count, ginst->beg_pos, beg_size_list,
578 | adj_size_list);
579 | int *block_active = (int *)malloc(sizeof(int) * (Graph_block));
580 | int *frontiers_count = (int *)malloc(sizeof(int) * (Graph_block));
581 |
582 | for (int j = 0; j < Graph_block; j++) {
583 | frontiers_count[j] = seeds_counter[j] - start_queue[j];
584 | // printf("Value: %d, j: %d,Q_count:\n",frontiers_count[j],j);
585 | if (frontiers_count[j] == 0) {
586 | block_active[j] = 0;
587 | } else {
588 | block_active[j] = 1;
589 | }
590 | }
591 | printf("\nsource, destination, sample_id, depth\n");
592 | // display(block_active,Graph_block);
593 | // block[1]=1;
594 | // block[2]=1;
595 | // printf("Start while loop.\n");
596 | double time_start = wtime();
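// Streaming loop: for each active partition, asynchronously copy its slice
// of the CSR (adjacency plus offsets) to the GPU on a dedicated stream and
// launch `check` on that same stream, overlapping transfers with sampling.
// Partition 0 is launched unconditionally; the others only while active.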
597 | while (sampling_complete == false) {
598 | // display(block_active,Graph_block);
599 | if (1) {
600 | H_ERR(cudaMemcpyAsync(&ggraph.adj_list[adj_size_list[block_id1]],
601 | &ginst->adj_list[adj_size_list[block_id1]],
602 | (adj_size_list[block_id2] - adj_size_list[block_id1]) * sizeof(vertex_t),
603 | cudaMemcpyHostToDevice, stream1));
604 | H_ERR(cudaMemcpyAsync(&ggraph.beg_pos[beg_size_list[block_id1]],
605 | &ginst->beg_pos[beg_size_list[block_id1]],
606 | (beg_size_list[block_id2] - beg_size_list[block_id1]) * sizeof(index_t),
607 | cudaMemcpyHostToDevice, stream1));
608 |
609 | check<<<n_blocks, n_threads, 0, stream1>>>(
610 | Graph_block_size, 0, block_id1, ggraph.adj_list, ggraph.beg_pos,
611 | ggraph.weight_list, ggraph.vert_count, d_state, d_node_list,
612 | d_edge_list, d_neigh_l, d_degree_l, n_blocks, d_seed, n_threads,
613 | d_total, hashtable, bitmap, n_subgraph, node, queue, sample_id,
614 | depth_tracker, qstart_global, qstop_global, g_sub_index,
615 | n_child, depth, each_subgraph, queue_size);
616 | }
617 |
618 | if (block_active[1]) {
619 | H_ERR(cudaMemcpyAsync(&ggraph.adj_list[adj_size_list[block_id2]],
620 | &ginst->adj_list[adj_size_list[block_id2]],
621 | (adj_size_list[block_id3] - adj_size_list[block_id2]) * sizeof(vertex_t),
622 | cudaMemcpyHostToDevice, stream2));
623 | H_ERR(cudaMemcpyAsync(&ggraph.beg_pos[beg_size_list[block_id2]],
624 | &ginst->beg_pos[beg_size_list[block_id2]],
625 | (beg_size_list[block_id3] - beg_size_list[block_id2]) * sizeof(index_t),
626 | cudaMemcpyHostToDevice, stream2));
627 | check<<<n_blocks, n_threads, 0, stream2>>>(
628 | Graph_block_size, 1, block_id2, ggraph.adj_list, ggraph.beg_pos,
629 | ggraph.weight_list, ggraph.vert_count, d_state, d_node_list,
630 | d_edge_list, d_neigh_l, d_degree_l, n_blocks, d_seed, n_threads,
631 | d_total, hashtable, bitmap, n_subgraph, node, queue, sample_id,
632 | depth_tracker, qstart_global, qstop_global, g_sub_index,
633 | n_child, depth, each_subgraph, queue_size);
634 | }
635 |
636 | if (block_active[2]) {
637 | H_ERR(cudaMemcpyAsync(&ggraph.adj_list[adj_size_list[block_id3]],
638 | &ginst->adj_list[adj_size_list[block_id3]],
639 | (adj_size_list[block_id4] - adj_size_list[block_id3]) * sizeof(vertex_t),
640 | cudaMemcpyHostToDevice, stream3));
641 | H_ERR(cudaMemcpyAsync(&ggraph.beg_pos[beg_size_list[block_id3]],
642 | &ginst->beg_pos[beg_size_list[block_id3]],
643 | (beg_size_list[block_id4] - beg_size_list[block_id3]) * sizeof(index_t),
644 | cudaMemcpyHostToDevice, stream3));
645 | check<<<n_blocks, n_threads, 0, stream3>>>(
646 | Graph_block_size, 2, block_id3, ggraph.adj_list, ggraph.beg_pos,
647 | ggraph.weight_list, ggraph.vert_count, d_state, d_node_list,
648 | d_edge_list, d_neigh_l, d_degree_l, n_blocks, d_seed, n_threads,
649 | d_total, hashtable, bitmap, n_subgraph, node, queue, sample_id,
650 | depth_tracker, qstart_global, qstop_global, g_sub_index,
651 | n_child, depth, each_subgraph, queue_size);
652 | }
653 |
654 | if (block_active[3]) {
655 | H_ERR(cudaMemcpyAsync(&ggraph.adj_list[adj_size_list[block_id4]],
656 | &ginst->adj_list[adj_size_list[block_id4]],
657 | (adj_size_list[4] - adj_size_list[block_id4]) * sizeof(vertex_t),
658 | cudaMemcpyHostToDevice, stream4));
659 | H_ERR(cudaMemcpyAsync(&ggraph.beg_pos[beg_size_list[block_id4]],
660 | &ginst->beg_pos[beg_size_list[block_id4]],
661 | (beg_size_list[4] - beg_size_list[block_id4]) * sizeof(index_t),
662 | cudaMemcpyHostToDevice, stream4));
663 | check<<<n_blocks, n_threads, 0, stream4>>>(
664 | Graph_block_size, 3, block_id4, ggraph.adj_list, ggraph.beg_pos,
665 | ggraph.weight_list, ggraph.vert_count, d_state, d_node_list,
666 | d_edge_list, d_neigh_l, d_degree_l, n_blocks, d_seed, n_threads,
667 | d_total, hashtable, bitmap, n_subgraph, node, queue, sample_id,
668 | depth_tracker, qstart_global, qstop_global, g_sub_index,
669 | n_child, depth, each_subgraph, queue_size);
670 | }
671 | // wait for completion
672 | // find new top 3 blocks
673 | int status1 = cudaStreamQuery(stream1);
674 | // cout<<"Status1: "<<status1<<"\n";
725 | args.sampled_edges = counted;
726 | args.time = cmp_time;
727 | return args;
728 | }
729 |
730 | // void blocks_allocator(int n_blocks,int *Block, )
731 |
--------------------------------------------------------------------------------
/streaming/util.h:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_H
2 | #define UTIL_H
3 |
4 | #include <stdio.h>
5 | static void HandleError( cudaError_t err,
6 | const char *file,
7 | int line ) {
8 | if (err != cudaSuccess) {
9 | printf( "%s in %s at line %d\n", \
10 | cudaGetErrorString( err ),
11 | file, line );
12 | exit( EXIT_FAILURE );
13 | }
14 | }
15 | #define H_ERR( err ) \
16 | (HandleError( err, __FILE__, __LINE__ ))
17 |
18 |
19 | #define SML_MID 32
20 | #define MID_LRG 1024
21 | //#define SWITCH_TO (float)10.3
22 | #define SWITCH_TO (float)0.2
23 | #define SWITCH_BACK (float)0.4
24 | //#define SWITCH_BACK (float)0.3
25 |
26 | //#define SML_MID 0
27 | //#define MID_LRG 6553600
28 |
29 | //#define SML_MID 0
30 | //#define MID_LRG 0
31 | #define GPUID 0
32 | #define THDS_NUM 512
33 | #define BLKS_NUM 512
34 | //#define BLKS_NUM 96
35 |
36 | #endif
37 |
--------------------------------------------------------------------------------
/streaming/wtime.h:
--------------------------------------------------------------------------------
1 | #ifndef __TIME_H__
2 | #define __TIME_H__
3 |
4 | #include <stdio.h>
5 | #include <sys/time.h>
6 |
7 | double wtime()
8 | {
9 | double time[2];
10 | struct timeval time1;
11 | gettimeofday(&time1, NULL);
12 |
13 | time[0]=time1.tv_sec;
14 | time[1]=time1.tv_usec;
15 |
16 | return time[0]+time[1]*1.0e-6;
17 | }
18 |
19 | #endif
20 |
--------------------------------------------------------------------------------