├── README.md
├── benchmark.sh
├── images
│   └── C-SAW1_modified.png
├── non-stream
│   ├── Makefile
│   ├── WG
│   │   ├── beg.bin
│   │   └── csr.bin
│   ├── api.cuh
│   ├── functions.cuh
│   ├── gpu_graph.cuh
│   ├── graph.h
│   ├── graph.hpp
│   ├── header.h
│   ├── herror.h
│   ├── main.cu
│   ├── mpi_main.cpp
│   ├── run.sh
│   ├── sample_class.cuh
│   ├── sampler.cuh
│   ├── util.h
│   └── wtime.h
└── streaming
    ├── Makefile
    ├── README.md
    ├── WG
    │   ├── beg.bin
    │   └── csr.bin
    ├── gpu_graph.cuh
    ├── graph.h
    ├── graph.hpp
    ├── header.h
    ├── herror.h
    ├── mpi_main.cpp
    ├── run.sh
    ├── sample_class.cuh
    ├── sampler.cuh
    ├── streaming_sampling.cu
    ├── util.h
    └── wtime.h
/README.md: --------------------------------------------------------------------------------
1 | 2 |

3 | 4 |

5 | 6 | 7 | #### C-SAW: A Framework for Graph Sampling and Random Walk on GPUs
8 | ---
9 | C-SAW is a GPU-based framework that can be used to implement variants of graph sampling and random walk algorithms.
10 | 11 | This repo contains two folders: one for streaming sampling of large graphs, and another for non-streaming sampling of graphs that fit in GPU memory.
12 | 13 | 14 | C-SAW uses the CSR format of a graph for sampling. The web-Google dataset is included in the repo as an example. Adjacency lists for most datasets are available here:
15 | http://snap.stanford.edu/data/index.html
16 | 17 | The adjacency list can be converted into CSR using this library:
18 | https://github.com/asherliu/graph_project_start
19 | 20 | 21 | 22 | Generate the CSR and put the folder in the main directory of both the non-streaming and the streaming sampling.
23 | 24 | To run:
25 | 26 | Step 1: Define the required API in api.cuh inside the non-stream folder.
27 | 28 | Step 2: Go to the streaming or non-stream folder and run the make command.
29 | 30 | Step 3: Update the dataset name in the run.sh file.
31 | 32 | Step 4: ./run.sh <# of samples> <frontier size> <neighbor size> <depth> <# of GPUs>
33 | 34 | To change the depth of the sampling or the length of the random walk, update DEPTH_LIMIT within the Sampling class in sample_class.cuh in the non-stream folder. You can also change the memory allocation and other parameters within the Sampling class.
35 | 36 | The sampled graph is stored as an edge list in GPU memory in the class variable Si found in sample_class.cuh. The output format is:
37 | ``` 38 | Edges sampled, dataset name, min-time, max-time 39 | ```
40 | 41 | `min-time` and `max-time` are the same for a single GPU. SEPS (sampled edges per second) can be computed as `Edges sampled/max-time`.
42 | 43 | 44 | For more details, please refer to our [paper](https://arxiv.org/abs/2009.09103).
45 | 46 | Citation:
47 | 48 | ``` 49 | @INPROCEEDINGS {pandey2020csaw,
50 | author = {S. Pandey and L. Li and A. Hoisie and X. Li and H.
Liu}, 51 | booktitle = {2020 SC20: International Conference for High Performance Computing, Networking, Storage and Analysis (SC)}, 52 | title = {C-SAW: A Framework for Graph Sampling and Random Walk on GPUs}, 53 | year = {2020}, 54 | volume = {}, 55 | issn = {}, 56 | pages = {780-794}, 57 | keywords = {}, 58 | doi = {10.1109/SC41405.2020.00060}, 59 | url = {https://doi.ieeecomputersociety.org/10.1109/SC41405.2020.00060}, 60 | publisher = {IEEE Computer Society}, 61 | address = {Los Alamitos, CA, USA}, 62 | month = {nov} 63 | } 64 | ``` 65 | -------------------------------------------------------------------------------- /benchmark.sh: -------------------------------------------------------------------------------- 1 | echo "FF" 2 | 3 | echo " ITS Re-" 4 | for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do 5 | ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 1 1 1 0 6 | done 7 | 8 | echo "select-Baseline" 9 | for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do 10 | ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 0 1 1 0 11 | done 12 | 13 | 14 | echo " Normalize" 15 | for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do 16 | ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 2 1 1 0 17 | done 18 | 19 | # echo "Normalize + bitmap" 20 | # for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do 21 | # ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 1 1 0 0 0 22 | # done 23 | 24 | # echo "hash" 25 | # for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do 26 | # ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 1 1 1 0 27 | # done 28 | 29 | # echo "hash+cache" 30 | # for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do 31 | # ./sampling.bin $d $d/beg.bin $d/csr.bin 100 32 2000 1 1 1 1 32 | # done 33 | 34 | # echo "combined" 35 | # for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do 36 | # echo $d 37 | # ./baseline.bin $d/beg.bin $d/csr.bin 100 32 2000 38 | # done 39 | 40 | # echo "baseline" 41 | # echo $d 42 | # for d in /gpfs/alpine/proj-shared/csc289/Sampling/*/ ; do 43 | # ./combined.bin $d/beg.bin $d/csr.bin 100 32 2000 44 | # done -------------------------------------------------------------------------------- /images/C-SAW1_modified.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/concept-inversion/C-SAW/d91ff3ac896a90a3ea7b71e9251d5e79f67f8c6c/images/C-SAW1_modified.png -------------------------------------------------------------------------------- /non-stream/Makefile: -------------------------------------------------------------------------------- 1 | exe=sampling.bin 2 | N=1 3 | d=WG 4 | cucc= "$(shell which nvcc)" 5 | cc= "$(shell which mpicxx)" 6 | commflags=-lcudart -L"$(shell dirname $(cucc))"/../lib64 7 | cuflags= --compiler-options -v -Xcudafe -\# --resource-usage 8 | cuflags+= -std=c++11 9 | objs = $(patsubst %.cu,%.o,$(wildcard *.cu)) \ 10 | $(patsubst %.cpp,%.o,$(wildcard *.cpp)) 11 | 12 | deps = $(wildcard ./*.cuh) \ 13 | $(wildcard ./*.hpp) \ 14 | $(wildcard ./*.h) \ 15 | 16 | 17 | %.o:%.cu $(deps) 18 | $(cucc) -c $(cuflags) $< -o $@ 19 | 20 | %.o:%.cpp $(deps) 21 | $(cc) -c $< -o $@ 22 | 23 | $(exe):$(objs) 24 | $(cc) $(objs) $(commflags) -O3 -o $(exe) 25 | 26 | 27 | test:$(exe) 28 | #Multidimensional random walk 29 | mpirun -n $(N) $(exe) $(d) $(d)/beg.bin $(d)/csr.bin 315 32 4000 2000 1 2000 1 30 | 31 | clean: 32 | rm -rf *.o ${exe} 33 | -------------------------------------------------------------------------------- /non-stream/WG/beg.bin: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/concept-inversion/C-SAW/d91ff3ac896a90a3ea7b71e9251d5e79f67f8c6c/non-stream/WG/beg.bin -------------------------------------------------------------------------------- /non-stream/WG/csr.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/concept-inversion/C-SAW/d91ff3ac896a90a3ea7b71e9251d5e79f67f8c6c/non-stream/WG/csr.bin -------------------------------------------------------------------------------- /non-stream/api.cuh: -------------------------------------------------------------------------------- 1 | #ifndef API_H 2 | #define API_H 3 | 4 | __device__ int 5 | VertexBias(int vertexID, gpu_graph *graph) 6 | { 7 | // For MDRW 8 | return graph->degree_list[vertexID]; 9 | // For other 10 | // return 1; 11 | } 12 | 13 | __device__ int 14 | EdgeBias(int vertexID, gpu_graph *graph) 15 | { 16 | // For BNS, LS 17 | return graph->degree_list[vertexID]; 18 | // For BRW, 19 | // return 1; 20 | } 21 | 22 | 23 | __device__ int 24 | Update(gpu_graph *G, int selected, int source) 25 | { 26 | return selected; 27 | } 28 | 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /non-stream/functions.cuh: -------------------------------------------------------------------------------- 1 | #ifndef FNC 2 | #define FNC 3 | #include "herror.h" 4 | #include "header.h" 5 | #include 6 | #include 7 | #include 8 | #include "sampler.cuh" 9 | #include "api.cuh" 10 | #define profil 11 | __device__ 12 | int binary_search(int start,int end,float value, float *arr) 13 | { 14 | //printf("low:%d,high:%d,value:%f\n",start,end,value); 15 | int low=start; 16 | int high=end; 17 | int index=start; 18 | 19 | // atomicAdd(&counter[0],1); 20 | while (low<=high) 21 | { 22 | index=((low+high)/2); 23 | if (valuearr[index]) 30 | { 31 | // set low to index+1 32 | low = index+1; 33 | //printf("low:%d\n",low); 34 | 35 | } 36 | else 37 | { 38 | break; 39 | } 40 | 41 | } 42 | return index; 43 | } 44 | 45 | __device__ 46 | int bitmap_search(int *bitmap, int bitmap_start, int index) 47 | { 48 | int bitmap_width=32; 49 | int bitmap_pos= index; 50 | // #ifdef not_reversed 51 | int bit_block_index = bitmap_pos / bitmap_width; // find the address of bitmap 52 | int bit_block_pos= bitmap_pos % bitmap_width; // position within a address 53 | // #endif 54 | // reversed------------ 55 | 56 | //#ifdef reversed 57 | // int bit_block_pos = bitmap_pos / bitmap_width; 58 | // int bit_block_index= bitmap_pos % bitmap_width; 59 | //#endif 60 | 61 | int initial_mask=1; 62 | int mask = (initial_mask << bit_block_pos); 63 | int status=atomicOr(&bitmap[bit_block_index+bitmap_start],mask); 64 | // int status=mask; 65 | int is_in= (mask & status) >> bit_block_pos; 66 | if(is_in!=0){is_in=1;} 67 | //is_in= 0x00000001 & (status >> bit_block_pos); 68 | //printf("thread: %d, index:%d, bit_block_index:%d, bit_block_pos:%d, mask:%d, status: %d,shift: %d, is_in:%d\n",threadIdx.x,index,bit_block_index,bit_block_pos,mask,status,(mask & status),is_in); 69 | return is_in; 70 | } 71 | 72 | __device__ 73 | int linear_search(int *bitmap, int bitmap_start, int index) 74 | { 75 | 76 | int warpTID=threadIdx.x%32; 77 | int pos= warpTID; 78 | int temp_status= 0; 79 | while(pos<256){ 80 | if (bitmap[index]==1) 81 | { 82 | temp_status=1; 83 | } 84 | pos+=warpSize; 85 | } 86 | 87 | int bitmap_width=32; 88 | int bitmap_pos= index; 89 | // #ifdef 
not_reversed 90 | int bit_block_index = bitmap_pos / bitmap_width; // find the address of bitmap 91 | int bit_block_pos= bitmap_pos % bitmap_width; // position within a address 92 | // #endif 93 | // reversed------------ 94 | 95 | //#ifdef reversed 96 | // int bit_block_pos = bitmap_pos / bitmap_width; 97 | // int bit_block_index= bitmap_pos % bitmap_width; 98 | //#endif 99 | 100 | int initial_mask=1; 101 | int mask = (initial_mask << bit_block_pos); 102 | int status=atomicOr(&bitmap[bit_block_index+bitmap_start],mask); 103 | // int status=mask; 104 | int is_in= (mask & status) >> bit_block_pos; 105 | if(is_in!=0){is_in=1;} 106 | //is_in= 0x00000001 & (status >> bit_block_pos); 107 | //printf("thread: %d, index:%d, bit_block_index:%d, bit_block_pos:%d, mask:%d, status: %d,shift: %d, is_in:%d\n",threadIdx.x,index,bit_block_index,bit_block_pos,mask,status,(mask & status),is_in); 108 | return is_in; 109 | } 110 | 111 | 112 | __device__ 113 | void gpu_prefix(int total_step,int warp_tid,float *degree_l, int offset_d_n, int warpsize, int len) 114 | { 115 | warpsize=32; 116 | for (int i=0; i< total_step; i++) 117 | { 118 | // Loop the threads 119 | int req_thread = len/(powf(2,(i+1))); 120 | for (int iid= warp_tid; iid<=req_thread; iid+=warpsize) 121 | { 122 | 123 | int tid_offset = iid*powf(2,i+1); 124 | // calculate the index 125 | int i1= (tid_offset) +(powf(2,i))-1+offset_d_n; 126 | int i2= (tid_offset) +powf(2,i+1)-1+offset_d_n; 127 | if(i1> (offset_d_n+len-1)){break; } 128 | //printf("i:%d, Index1 %d: %f,Index2 %d: %f, thread:%d\n",i,i1,degree_l[i1],i2,degree_l[i2],threadIdx.x); 129 | // load the values to shared mem 130 | int temp1= degree_l[i1]; 131 | int temp2= degree_l[i2]; 132 | degree_l[i2] = temp2+ temp1; 133 | //printf("Index:%d, Value:%d \n",i2,temp[i2]); 134 | } 135 | } 136 | // __syncthreads(); 137 | degree_l[len-1+offset_d_n]=0; 138 | //printf("\nDownstep:%d\n",degree_l[len-1]); 139 | for (int i=(total_step-1);i >= 0; i-- ) 140 | { 141 | // Loop the threads 142 | int req_thread = len/(powf(2,(i+1))); 143 | for (int iid= warp_tid; iid<=req_thread; iid+=warpsize) 144 | { 145 | int tid_offset = iid * powf(2,i+1); 146 | int i1= (tid_offset) + (powf(2,i))-1+offset_d_n; 147 | int i2= (tid_offset) + powf(2,i+1)-1+offset_d_n; 148 | if(i1 > (offset_d_n+len-1)){break;} 149 | // printf("temp1: %d, temp2: %d, thread:%d\n",i1,i2,threadIdx.x); 150 | // printf("Index1 %d: %f,Index2 %d: %f, thread:%d\n",i1,degree_l[i1],i2,degree_l[i2],threadIdx.x); 151 | int temp1 = degree_l[i1]; 152 | int temp2 = degree_l[i2]; 153 | degree_l[i1]=temp2; 154 | degree_l[i2]=temp2+temp1; 155 | //printf("Index:%d, Value:%d \n",i2,temp[i2]); 156 | } 157 | } 158 | } 159 | 160 | __device__ void 161 | ITS(float *degree_l,int offset_d_n,int warpsize, int neighbor_length){ 162 | float bits = log2f(neighbor_length); 163 | int raise = ceilf(bits); 164 | int max_bit = powf(2,raise); 165 | int len=max_bit; 166 | int total_step= log2f(max_bit); 167 | int warp_tid = threadIdx.x%32; 168 | // __syncthreads(); 169 | gpu_prefix(total_step,warp_tid,degree_l,offset_d_n,warpsize,len); 170 | float sum = degree_l[neighbor_length-1+offset_d_n]; 171 | for (int i = warp_tid; i < neighbor_length; i+=warpsize) 172 | { 173 | degree_l[i]=degree_l[i]/((double)sum); 174 | // printf("i:%d, degree:%.2f\n",i,degree_l[i]); 175 | } 176 | } 177 | 178 | __device__ int 179 | max(int *data, int len) 180 | { 181 | int max=data[0]; 182 | for(int i=0;imax){max=data[i];} 185 | // printf("data: %d\n",data[i]); 186 | } 187 | return max; 188 | } 189 | 190 | 
__device__ void 191 | read_prob(float *degree_l, float *prob, int len, int offset){ 192 | int index=threadIdx.x %32; 193 | while(indexdegree; 226 | int *bitmap = wvar->bitmap; 227 | int *selected_list = wvar->selected; 228 | int neighbor_length =wvar->NL; 229 | int neighbor_start= wvar->NS; 230 | int *total_counter = wvar->total_counter; 231 | warpsize=32; 232 | int prefix=0, index=0; 233 | int new_neighbor; 234 | clock_t start_time,stop_time; 235 | float pref_time; 236 | int counter; 237 | wvar->sindex[0]=0; 238 | // decide if prefix sum is required 239 | if(neighbor_length>N) { prefix=1; } 240 | if(prefix==1) 241 | { 242 | start_time = clock(); 243 | ITS(degree_l, 0, warpsize, neighbor_length); 244 | stop_time =clock(); 245 | pref_time= float(stop_time-start_time); 246 | index=warpTID; 247 | start_time= clock(); 248 | 249 | // reset bitmaps 250 | int start=warpTID; 251 | int end = neighbor_length/32 + 1; 252 | for(int i=start;i4000){break;} 259 | // if(warpTID==0){printf("Iteration: %d\n",counter);} 260 | index=warpTID; 261 | while(indextempSelected[index]= selected; 268 | is_in= bitmap_search(bitmap,0,selected); 269 | if(is_in==0){ 270 | int pos= atomicAdd(&wvar->sindex[0],1); 271 | if(possindex[0]adj_list[selected+neighbor_start]; 283 | // selected_list[index]= new_neighbor; 284 | 285 | } 286 | __syncwarp(); 287 | stop_time= clock(); 288 | float samp_time = float(stop_time-start_time); 289 | if(warpTID==0){ 290 | total_counter[0]+=samp_time; 291 | // printf("%d, %d, %d, %d, %.0f, %.0f\n",counter,neighbor_length, N,source,pref_time,samp_time); 292 | } 293 | } 294 | else 295 | { 296 | index=warpTID; 297 | while(indexadj_list[index + neighbor_start]; 300 | selected_list[index]= new_neighbor; 301 | index+=warpsize; 302 | } 303 | } 304 | } 305 | 306 | __device__ void 307 | select(Wv *wvar, Cp *cache, int N,int overN, curandState local_state, gpu_graph *G, int *colcount, int source, int *Gmax, int bitflag, int Fcache) 308 | { 309 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 310 | int warpID = tid/32; 311 | int warpTID= threadIdx.x%32; 312 | float *degree_l = wvar->degree; 313 | int *bitmap = wvar->bitmap; 314 | int *selected_list = wvar->selected; 315 | int neighbor_length =wvar->NL; 316 | int neighbor_start= wvar->NS; 317 | int *total_counter = wvar->total_counter; 318 | int warpsize=32; 319 | int prefix=0, index=0; 320 | int new_neighbor; 321 | clock_t start_time,stop_time; 322 | float pref_time; 323 | // if(source%2==0){N=N+1;} 324 | // decide if prefix sum is required 325 | if(neighbor_length>N) { prefix=1; } 326 | if(prefix==1) 327 | { 328 | start_time = clock(); 329 | if(Fcache){ 330 | int offset=G->beg_pos[source]; 331 | if(cache->status[source]==1){ 332 | // if(warpTID==0){printf("avoided.\n");} 333 | read_prob(degree_l,cache->probability,neighbor_length,offset); 334 | } 335 | else{ 336 | ITS(degree_l, 0, warpsize, neighbor_length); 337 | write_prob(degree_l,cache->probability,neighbor_length,offset); 338 | if(warpTID==0){cache->status[source]=1;} 339 | } 340 | } 341 | else{ITS(degree_l, 0, warpsize, neighbor_length);} 342 | 343 | stop_time =clock(); 344 | pref_time= float(stop_time-start_time); 345 | index=warpTID; 346 | while(indexadj_list[selected+neighbor_start]; 371 | selected_list[index]= new_neighbor; 372 | #ifdef profile 373 | printf("Added %d to sampled.\n",selected); 374 | #endif 375 | 376 | break;} 377 | if(colcount[index]>400){ 378 | selected_list[index]= 0; 379 | break; 380 | } 381 | } 382 | // Add new neighbor to 383 | // printf("Index: %d, count: 
%d\n",index,colcount[index]); 384 | index+=warpsize; 385 | } 386 | // 387 | // 388 | __syncwarp(); 389 | stop_time= clock(); 390 | float samp_time = float(stop_time-start_time); 391 | if(warpTID==0){ 392 | int longer=max(colcount,N); 393 | atomicAdd(&Gmax[0], longer); 394 | total_counter[0]+=samp_time; 395 | // printf("%d, %d, %d, %d, %.0f, %.0f\n",longer,neighbor_length, N,source,pref_time,samp_time); 396 | } 397 | 398 | } 399 | else 400 | // pick up neighors 401 | { 402 | index=warpTID; 403 | while(indexadj_list[index + neighbor_start]; 406 | selected_list[index]= new_neighbor; 407 | index+=warpsize; 408 | } 409 | } 410 | } 411 | 412 | __device__ void 413 | naive_ITS(Wv *wvar, Cp *cache, int N,int overN, curandState local_state, gpu_graph *G, int *colcount, int source, int *Gmax, int bitflag, int Fcache) 414 | { 415 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 416 | int warpID = tid/32; 417 | int warpTID= threadIdx.x%32; 418 | float *degree_l = wvar->degree; 419 | int *bitmap = wvar->bitmap; 420 | int *selected_list = wvar->selected; 421 | int neighbor_length =wvar->NL; 422 | int neighbor_start= wvar->NS; 423 | int *total_counter = wvar->total_counter; 424 | int warpsize=32; 425 | int prefix=0, index=0; 426 | int new_neighbor; 427 | clock_t start_time,stop_time; 428 | float pref_time; 429 | // if(source%2==0){N=N+1;} 430 | // decide if prefix sum is required 431 | if(neighbor_length>N) { prefix=1; } 432 | if(prefix==1) 433 | { 434 | start_time = clock(); 435 | if(Fcache){ 436 | int offset=G->beg_pos[source]; 437 | if(cache->status[source]==1){ 438 | // if(warpTID==0){printf("avoided.\n");} 439 | read_prob(degree_l,cache->probability,neighbor_length,offset); 440 | } 441 | else{ 442 | ITS(degree_l, 0, warpsize, neighbor_length); 443 | write_prob(degree_l,cache->probability,neighbor_length,offset); 444 | if(warpTID==0){cache->status[source]=1;} 445 | } 446 | } 447 | else{ITS(degree_l, 0, warpsize, neighbor_length);} 448 | 449 | stop_time =clock(); 450 | pref_time= float(stop_time-start_time); 451 | index=warpTID; 452 | while(indexadj_list[selected+neighbor_start]; 477 | selected_list[index]= new_neighbor; 478 | #ifdef profile 479 | printf("Added %d to sampled.\n",selected); 480 | #endif 481 | 482 | break;} 483 | if(colcount[index]>400){ 484 | selected_list[index]= 0; 485 | break; 486 | } 487 | } 488 | // Add new neighbor to 489 | // printf("Index: %d, count: %d\n",index,colcount[index]); 490 | index+=warpsize; 491 | } 492 | // 493 | if(N<4){ 494 | ITS(degree_l, 0, warpsize, neighbor_length); 495 | float r= curand_uniform(&local_state); 496 | int selected= binary_search(0,neighbor_length,r,degree_l); 497 | } 498 | // 499 | __syncwarp(); 500 | stop_time= clock(); 501 | float samp_time = float(stop_time-start_time); 502 | if(warpTID==0){ 503 | int longer=max(colcount,N); 504 | atomicAdd(&Gmax[0], longer); 505 | total_counter[0]+=samp_time; 506 | // printf("%d, %d, %d, %d, %.0f, %.0f\n",longer,neighbor_length, N,source,pref_time,samp_time); 507 | } 508 | 509 | } 510 | else 511 | // pick up neighors 512 | { 513 | index=warpTID; 514 | while(indexadj_list[index + neighbor_start]; 517 | selected_list[index]= new_neighbor; 518 | index+=warpsize; 519 | } 520 | } 521 | } 522 | 523 | __device__ void 524 | normalize_over_select(Wv *wvar, int warpsize, int N, int overN, curandState local_state, gpu_graph *G, int *colcount, int source, int *Gmax) 525 | { 526 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 527 | int warpID = tid/32; 528 | int warpTID= threadIdx.x%32; 529 | float *degree_l = 
wvar->degree; 530 | int *bitmap = wvar->bitmap; 531 | int *selected_list = wvar->selected; 532 | int neighbor_length =wvar->NL; 533 | int neighbor_start= wvar->NS; 534 | int *total_counter = wvar->total_counter; 535 | warpsize=32; 536 | int prefix=0, index=0; 537 | int new_neighbor; 538 | clock_t start_time,stop_time; 539 | float pref_time; 540 | wvar->sindex[0]=0; 541 | if(neighbor_length>N) { prefix=1; } 542 | if(prefix==1) 543 | { 544 | start_time = clock(); 545 | ITS(degree_l, 0, warpsize, neighbor_length); 546 | stop_time =clock(); 547 | pref_time= float(stop_time-start_time); 548 | index=warpTID; 549 | 550 | // reset bitmaps 551 | int start=warpTID; 552 | int end = neighbor_length/32 + 1; 553 | // if(warpTID==0)printf("Bitmap end:%d\n",end); 554 | for(int i=start;isindex[0],1); 576 | if(posvalue){ 582 | a= degree_l[selected]; 583 | b= degree_l[selected+1];} 584 | else{ 585 | a= degree_l[selected-1]; 586 | b= degree_l[selected];} 587 | // if(lb==a && hb==b){}; 588 | // float temp = 0.23; 589 | temp= (float) (a-lb)/(a-lb+hb-b); 590 | // printf("a: %.2f, b: %.2f,lb: %.2f, hb: %.2f, temp: %.2f\n",a,b,lb,hb,temp); 591 | if(r< temp) 592 | { 593 | // printf("Update hb.\n"); 594 | r= (lb+r*(a-lb)); 595 | hb=a;} 596 | else 597 | { 598 | // printf("Update lb.\n"); 599 | r= (b+(hb-b)*r); 600 | lb=b;} 601 | // printf("\nNew r: %.2f, lb: %.2f, hb: %.2f\n",r,lb,hb); 602 | } 603 | 604 | if(colcount[index]>80){break;} 605 | // else{ 606 | // // atomicAdd(&colcount[0],1); 607 | // printf("Repeated. Index: %d, selected: %d\n",index,selected); 608 | // } 609 | } 610 | new_neighbor= G->adj_list[selected+neighbor_start]; 611 | selected_list[index]= new_neighbor; 612 | index+=warpsize; 613 | } 614 | __syncwarp(); 615 | stop_time= clock(); 616 | float samp_time = float(stop_time-start_time); 617 | if(warpTID==0){ 618 | int longer=max(colcount,N); 619 | total_counter[0]+=samp_time; 620 | } 621 | 622 | } 623 | else{ 624 | index=warpTID; 625 | while(indexadj_list[index + neighbor_start]; 628 | selected_list[index]= new_neighbor; 629 | index+=warpsize; 630 | } 631 | } 632 | } 633 | 634 | 635 | 636 | __device__ void 637 | normalize(Wv *wvar, int warpsize, int N, int overN, curandState local_state, gpu_graph *G, int *colcount, int source, int *Gmax) 638 | { 639 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 640 | int warpID = tid/32; 641 | int warpTID= threadIdx.x%32; 642 | float *degree_l = wvar->degree; 643 | int *bitmap = wvar->bitmap; 644 | int *selected_list = wvar->selected; 645 | int neighbor_length =wvar->NL; 646 | int neighbor_start= wvar->NS; 647 | int *total_counter = wvar->total_counter; 648 | warpsize=32; 649 | int prefix=0, index=0; 650 | int new_neighbor; 651 | clock_t start_time,stop_time; 652 | float pref_time; 653 | if(neighbor_length>N) { prefix=1; } 654 | if(prefix==1) 655 | { 656 | start_time = clock(); 657 | ITS(degree_l, 0, warpsize, neighbor_length); 658 | stop_time =clock(); 659 | pref_time= float(stop_time-start_time); 660 | index=warpTID; 661 | 662 | // reset bitmaps 663 | int start=warpTID; 664 | int end = neighbor_length/32 + 1; 665 | // if(warpTID==0)printf("Bitmap end:%d\n",end); 666 | for(int i=start;ivalue){ 690 | a= degree_l[selected]; 691 | b= degree_l[selected+1];} 692 | else{ 693 | a= degree_l[selected-1]; 694 | b= degree_l[selected];} 695 | // if(lb==a && hb==b){}; 696 | // float temp = 0.23; 697 | temp= (float) (a-lb)/(a-lb+hb-b); 698 | // printf("a: %.2f, b: %.2f,lb: %.2f, hb: %.2f, temp: %.2f\n",a,b,lb,hb,temp); 699 | if(r< temp) 700 | { 701 | // printf("Update 
hb.\n"); 702 | r= (lb+r*(a-lb)); 703 | hb=a;} 704 | else 705 | { 706 | // printf("Update lb.\n"); 707 | r= (b+(hb-b)*r); 708 | lb=b;} 709 | // printf("\nNew r: %.2f, lb: %.2f, hb: %.2f\n",r,lb,hb); 710 | } 711 | 712 | if(colcount[index]>1000){break;} 713 | // else{ 714 | // // atomicAdd(&colcount[0],1); 715 | // printf("Repeated. Index: %d, selected: %d\n",index,selected); 716 | // } 717 | } 718 | new_neighbor= G->adj_list[selected+neighbor_start]; 719 | selected_list[index]= new_neighbor; 720 | index+=warpsize; 721 | } 722 | __syncwarp(); 723 | stop_time= clock(); 724 | float samp_time = float(stop_time-start_time); 725 | if(warpTID==0){ 726 | int longer=max(colcount,N); 727 | // printf("%d, %d, %d, %d, %.0f, %.0f\n",longer,neighbor_length, N,source,pref_time,samp_time); 728 | total_counter[0]+=samp_time; 729 | } 730 | 731 | } 732 | else{ 733 | index=warpTID; 734 | while(indexadj_list[index + neighbor_start]; 737 | selected_list[index]= new_neighbor; 738 | index+=warpsize; 739 | } 740 | } 741 | } 742 | 743 | __device__ void 744 | heur_normalize(Wv *wvar, Cp *cache, int N, int overN, curandState local_state, gpu_graph *G, int *colcount, int source, int *Gmax,int bitflag,int Fcache) 745 | { 746 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 747 | int warpID = tid/32; 748 | int warpTID= threadIdx.x%32; 749 | float *degree_l = wvar->degree; 750 | int *bitmap = wvar->bitmap; 751 | int *selected_list = wvar->selected; 752 | int neighbor_length =wvar->NL; 753 | int neighbor_start= wvar->NS; 754 | int *total_counter = wvar->total_counter; 755 | int warpsize=32; 756 | int prefix=0, index=0; 757 | int new_neighbor; 758 | clock_t start_time,stop_time; 759 | float pref_time; 760 | // For forest fire 761 | // if(source%2==0){N=N+1;} 762 | if(neighbor_length>N) { prefix=1; } 763 | if(prefix==1) 764 | { 765 | start_time = clock(); 766 | if(Fcache){ 767 | int offset=G->beg_pos[source]; 768 | if(cache->status[source]==1){ 769 | // if(warpTID==0){printf("avoided.\n");} 770 | read_prob(degree_l,cache->probability,neighbor_length,offset); 771 | } 772 | else{ 773 | ITS(degree_l, 0, warpsize, neighbor_length); 774 | write_prob(degree_l,cache->probability,neighbor_length,offset); 775 | if(warpTID==0){cache->status[source]=1;} 776 | } 777 | } 778 | else{ITS(degree_l, 0, warpsize, neighbor_length);} 779 | stop_time =clock(); 780 | pref_time= float(stop_time-start_time); 781 | index=warpTID; 782 | 783 | // reset bitmaps 784 | int start=warpTID; 785 | int end = neighbor_length/32 + 1; 786 | // if(warpTID==0)printf("Bitmap end:%d\n",end); 787 | for(int i=start;iadj_list[selected+neighbor_start]; 810 | selected_list[index]= new_neighbor; 811 | break; 812 | } 813 | if(is_in==1){ 814 | float value = degree_l[selected]; 815 | if(r>value){ 816 | a= degree_l[selected]; 817 | b= degree_l[selected+1];} 818 | else{ 819 | a= degree_l[selected-1]; 820 | b= degree_l[selected];} 821 | // if(lb==a && hb==b){}; 822 | // float temp = 0.23; 823 | float lambda= (float) (a-lb)/(a-lb+hb-b); 824 | float delta= (float) (b-a)/(hb-lb); 825 | r= (float) r/lambda; 826 | // printf("a: %.2f, b: %.2f,lb: %.2f, hb: %.2f, temp: %.2f\n",a,b,lb,hb,temp); 827 | if(r< a) 828 | { 829 | hb=a;} 830 | else 831 | {r= r + delta; 832 | lb=b;} 833 | // printf("index: %d,random number: %.2f, selected: %d, is_in: %d\n",index,r,selected,is_in); 834 | localCount+=1; 835 | 836 | selected= binary_search(0,neighbor_length,r,degree_l); 837 | is_in= bitmap_search(bitmap,0,selected); 838 | if(is_in==0){ 839 | new_neighbor= G->adj_list[selected+neighbor_start]; 840 
| selected_list[index]= new_neighbor; 841 | break; 842 | } 843 | } 844 | if(colcount[index]>400){break;} 845 | } 846 | index+=warpsize; 847 | } 848 | __syncwarp(); 849 | stop_time= clock(); 850 | float samp_time = float(stop_time-start_time); 851 | if(warpTID==0){ 852 | int longer=max(colcount,N); 853 | atomicAdd(&Gmax[0], longer); 854 | // printf("%d, %d, %d, %d, %.0f, %.0f\n",longer,neighbor_length, N,source,pref_time,samp_time); 855 | // total_counter[0]+=samp_time; 856 | } 857 | 858 | } 859 | else{ 860 | index=warpTID; 861 | while(indexadj_list[index + neighbor_start]; 864 | selected_list[index]= new_neighbor; 865 | index+=warpsize; 866 | } 867 | } 868 | } 869 | 870 | 871 | 872 | __device__ int 873 | get_neighbors(gpu_graph *graph, int vertex, Wv *wvar, int VertCount){ 874 | int warpTID= threadIdx.x%32; 875 | int index= warpTID; 876 | int len= graph->degree_list[vertex]; 877 | wvar->NL=len; 878 | int neighbor_start=graph->beg_pos[vertex]; 879 | wvar->NS= neighbor_start; 880 | #ifdef profile 881 | // if(warpTID==0){printf("Source: %d, NLen: %d, Nstart: %d\n",vertex, len,neighbor_start);} 882 | #endif 883 | while(indexadj_list[neighbor_start + index]; 886 | wvar->neighbors[index]= neighbor; 887 | wvar->degree[index]= EdgeBias(neighbor,graph); 888 | // {printf("Neighbor:%d, tid:%d\n",wvar->neighbors[index],index);} 889 | index+=warpSize; 890 | } 891 | return len; 892 | } 893 | 894 | // __device__ void 895 | // next(S){ 896 | 897 | // } 898 | 899 | __device__ int 900 | linear_search(int neighbor,int *partition1, int *bin_count, int bin, int BIN_SIZE, int BUCKETS) 901 | { 902 | if(bin>=32){printf("Bin error.\n");} 903 | int len = bin_count[bin]; 904 | int i = bin; 905 | // printf("\nL: %d, I:%d\n",len,i); 906 | int step=0; 907 | while(stepBUCKETS; 932 | int bin= vertex % BUCKETS; 933 | int BIN_SIZE = hashtable->bin_size; 934 | // #ifdef profile 935 | // printf("Bucket %d, bin: %d\n",BUCKETS,bin); 936 | // #endif 937 | int is_in=linear_search(vertex,hashtable->hash,hashtable->bin_counter,bin,BIN_SIZE,BUCKETS); 938 | // // if(is_in==1){printf("Duplicated Found: %d\n",new_neighbor);} 939 | return is_in; 940 | } 941 | 942 | __device__ void 943 | add_hash(Ht *hashtable, int vertex) 944 | { 945 | int BUCKETS = hashtable->BUCKETS; 946 | int bin= vertex % BUCKETS; 947 | int index=atomicAdd(&hashtable->bin_counter[bin],1); 948 | if(index>100){printf("error. 
%d\n",index);} 949 | #ifdef profile 950 | printf("Add: %d, bin: %d, INdex: %d\n",vertex,bin,index); 951 | #endif 952 | hashtable->hash[index*BUCKETS+ bin]=vertex; 953 | } 954 | 955 | 956 | 957 | __device__ int 958 | linear_duplicate(Si *samples, int vertex){ 959 | int warpTID=threadIdx.x%32; 960 | int index= warpTID; 961 | while(indexstart[0]) 962 | { 963 | if(vertex==samples->edge[index]){ 964 | return 1; 965 | break; 966 | } 967 | } 968 | return 0; 969 | } 970 | 971 | __device__ void 972 | frontier(gpu_graph *G,Sampling *S, int warpId,int SampleID, int N, int source, int sourceIndex, int hash, int Depth) 973 | { 974 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 975 | int warpTID= threadIdx.x%32; 976 | int *selected=S->wvar[warpId].selected; 977 | int index=warpTID; 978 | int is_in=0; 979 | while(indexhashtable[SampleID], vertex);} 984 | // else{is_in= linear_duplicate(&S->samples[SampleID], vertex);} 985 | int pos=atomicAdd(&S->samples[SampleID].start[0],1); 986 | // total count 987 | atomicAdd(&S->sampled_count[0],1); 988 | #ifdef profile 989 | 990 | // printf("Added to sampled.\n SID: %d, Updated: %d, pos: %d, is_in: %d\n",SampleID,vertex,pos,is_in); 991 | #endif 992 | S->samples[SampleID].vertex[pos]=source; 993 | S->samples[SampleID].edge[pos]=vertex; 994 | if(is_in==0) 995 | { 996 | // add_hash(&S->hashtable[SampleID], vertex); 997 | int currDepth= S->candidate.depth[sourceIndex]; 998 | if(currDepth < (Depth-1)){ 999 | // #ifdef profile 1000 | // printf("warpID: %d, Curr:%d, Added %d to queue.\n",tid/32,currDepth,vertex); 1001 | // #endif 1002 | int Qid= atomicAdd(&S->candidate.end[0],1); 1003 | S->candidate.vertices[Qid]= vertex; 1004 | S->candidate.instance_ID[Qid]= S->candidate.instance_ID[sourceIndex]; 1005 | S->candidate.depth[Qid]= currDepth+1; 1006 | } 1007 | } 1008 | index+=warpSize; 1009 | } 1010 | // __syncwarp(); 1011 | } 1012 | 1013 | 1014 | __device__ int 1015 | ITS_MDRW(Wv *wvar,curandState local_state, gpu_graph *G, int neighbor_length, float r) 1016 | { 1017 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 1018 | int warpID = tid/32; 1019 | int warpTID= threadIdx.x%32; 1020 | float *degree_l = wvar->degree; 1021 | int neighbor_start= wvar->NS; 1022 | int *total_counter = wvar->total_counter; 1023 | int warpsize=32; 1024 | int prefix=0, index=0; 1025 | int new_neighbor; 1026 | clock_t start_time,stop_time; 1027 | float pref_time; 1028 | // if(source%2==0){N=N+1;} 1029 | // decide if prefix sum is required 1030 | if(neighbor_length>1) { prefix=1; } 1031 | if(prefix==1) 1032 | { 1033 | start_time = clock(); 1034 | ITS(degree_l, 0, warpsize, neighbor_length); 1035 | __syncwarp(); 1036 | #ifdef profile 1037 | if(threadIdx.x==0){ 1038 | for(int i=0;iadj_list[selected+neighbor_start]; 1053 | return selected; 1054 | } 1055 | else 1056 | { 1057 | return 0; 1058 | // new_neighbor = G->adj_list[neighbor_start]; 1059 | } 1060 | } 1061 | 1062 | 1063 | #endif 1064 | 1065 | 1066 | -------------------------------------------------------------------------------- /non-stream/gpu_graph.cuh: -------------------------------------------------------------------------------- 1 | //10/03/2016 2 | //Graph data structure on GPUs 3 | #ifndef _GPU_GRAPH_H_ 4 | #define _GPU_GRAPH_H_ 5 | #include 6 | #include "header.h" 7 | #include "util.h" 8 | #include "graph.h" 9 | 10 | class gpu_graph 11 | { 12 | public: 13 | vertex_t *adj_list; 14 | weight_t *weight_list; 15 | index_t *beg_pos; 16 | vertex_t *degree_list; 17 | 18 | index_t vert_count; 19 | index_t edge_count; 20 | index_t avg_degree; 
21 | 22 | public: 23 | ~gpu_graph(){} 24 | 25 | gpu_graph( 26 | graph<long, long, long, vertex_t, index_t, weight_t> *ginst) 27 | { 28 | vert_count=ginst->vert_count; 29 | edge_count=ginst->edge_count; 30 | avg_degree = ginst->edge_count/ginst->vert_count; 31 | 32 | // size_t weight_sz=sizeof(weight_t)*edge_count; 33 | size_t adj_sz=sizeof(vertex_t)*edge_count; 34 | size_t deg_sz=sizeof(vertex_t)*edge_count; 35 | size_t beg_sz=sizeof(index_t)*(vert_count+1); 36 | vertex_t *cpu_degree_list=(vertex_t*)malloc(sizeof(vertex_t)*edge_count); 37 | /* Alloc GPU space */ 38 | H_ERR(cudaMalloc((void **)&adj_list, adj_sz)); 39 | H_ERR(cudaMalloc((void **)&degree_list, deg_sz)); 40 | H_ERR(cudaMalloc((void **)&beg_pos, beg_sz)); 41 | //H_ERR(cudaMalloc((void **)&weight_list, weight_sz)); 42 | 43 | for(int i=0; i<(ginst->edge_count); i++) 44 | { 45 | int neighbor= ginst->adj_list[i]; 46 | //cout<<"Index: "<<i<<", Neighbor: "<<neighbor<<"\n"; 47 | cpu_degree_list[i]= ginst->beg_pos[neighbor+1] - ginst->beg_pos[neighbor]; 48 | if((cpu_degree_list[i]>1950) && (cpu_degree_list[i]<2050)) 49 | { 50 | //printf("V: %d, Degree:%d\n",neighbor,cpu_degree_list[i]); 51 | } 52 | } 53 | 54 | /* copy it to GPU */ 55 | H_ERR(cudaMemcpy(adj_list,ginst->adj_list, 56 | adj_sz, cudaMemcpyHostToDevice)); 57 | H_ERR(cudaMemcpy(beg_pos,ginst->beg_pos, 58 | beg_sz, cudaMemcpyHostToDevice)); 59 | H_ERR(cudaMemcpy(degree_list,cpu_degree_list, 60 | deg_sz, cudaMemcpyHostToDevice)); 61 | 62 | //H_ERR(cudaMemcpy(weight_list,ginst->weight, 63 | // weight_sz, cudaMemcpyHostToDevice)); 64 | } 65 | }; 66 | 67 | #endif 68 |
-------------------------------------------------------------------------------- /non-stream/graph.h: --------------------------------------------------------------------------------
1 | #ifndef __GRAPH_H__ 2 | #define __GRAPH_H__ 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "wtime.h" 8 | #include 9 | #include 10 | #include 11 | inline off_t fsize(const char *filename) { 12 | struct stat st; 13 | if (stat(filename, &st) == 0) 14 | return st.st_size; 15 | return -1; 16 | } 17 | 18 | template< 19 | typename file_vert_t, typename file_index_t, typename file_weight_t, 20 | typename new_vert_t, typename new_index_t, typename new_weight_t> 21 | class graph 22 | { 23 | public: 24 | new_index_t *beg_pos; 25 | new_vert_t *adj_list; 26 | new_weight_t *weight; 27 | new_vert_t *degree_list; 28 | new_index_t vert_count; 29 | new_index_t edge_count; 30 | 31 | public: 32 | graph(){}; 33 | ~graph(){}; 34 | graph(const char *beg_file, 35 | const char *adj_list_file, 36 | const char *weight_file); 37 | 38 | graph(file_vert_t *csr, 39 | file_index_t *beg_pos, 40 | file_weight_t *weight_list, 41 | file_index_t vert_count, 42 | file_index_t edge_count) 43 | { 44 | this->beg_pos = beg_pos; 45 | this->adj_list = csr; 46 | this->weight = weight_list; 47 | //this->degree_list= degree_list; 48 | this->edge_count = edge_count; 49 | this->vert_count = vert_count; 50 | }; 51 | }; 52 | #include "graph.hpp" 53 | #endif 54 |
-------------------------------------------------------------------------------- /non-stream/graph.hpp: --------------------------------------------------------------------------------
1 | #include "graph.h" 2 | #include 3 | 4 | template< 5 | typename file_vert_t, typename file_index_t, typename file_weight_t, 6 | typename new_vert_t, typename new_index_t, typename new_weight_t> 7 | graph<file_vert_t, file_index_t, file_weight_t, 8 | new_vert_t, new_index_t, new_weight_t> 9 | ::graph( 10 | const char *beg_file, 11 | const char *adj_file, 12 | const char *weight_file) 13 | { 14 | double tm=wtime(); 15 | FILE *file=NULL; 16 | file_index_t ret; 17 | 18 | vert_count=fsize(beg_file)/sizeof(file_index_t) - 1; 19 |
edge_count=fsize(adj_file)/sizeof(file_vert_t); 20 | 21 | file=fopen(beg_file, "rb"); 22 | if(file!=NULL) 23 | { 24 | file_index_t *tmp_beg_pos=NULL; 25 | 26 | if(posix_memalign((void **)&tmp_beg_pos, getpagesize(), 27 | sizeof(file_index_t)*(vert_count+1))) 28 | perror("posix_memalign"); 29 | 30 | ret=fread(tmp_beg_pos, sizeof(file_index_t), 31 | vert_count+1, file); 32 | assert(ret==vert_count+1); 33 | fclose(file); 34 | edge_count=tmp_beg_pos[vert_count]; 35 | //std::cout<<"Expected edge count: "<0); 38 | 39 | //converting to new type when different 40 | if(sizeof(file_index_t)!=sizeof(new_index_t)) 41 | { 42 | if(posix_memalign((void **)&beg_pos, getpagesize(), 43 | sizeof(new_index_t)*(vert_count+1))) 44 | perror("posix_memalign"); 45 | for(new_index_t i=0;i 2 | #include "graph.h" 3 | #include "wtime.h" 4 | #include 5 | #include 6 | #include 7 | #include "gpu_graph.cuh" 8 | #include 9 | #include 10 | #include "herror.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "api.cuh" 16 | #include "sampler.cuh" 17 | #include "sample_class.cuh" 18 | #include "functions.cuh" 19 | using namespace std; 20 | 21 | __global__ void 22 | check(Sampling *S, gpu_graph G,curandState *global_state,int n_subgraph, int FrontierSize, int NeighborSize, int Depth) 23 | { 24 | float prefix_time,local_d_time,global_d_time; 25 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 26 | int hash=1, cache=0, bitflag=1, NORMALIZE=1; 27 | #ifdef profile 28 | if(tid==0){ 29 | printf("\n"); 30 | for(int i=0; icandidate.vertices[i]; 33 | }} 34 | #endif 35 | __syncwarp(); 36 | int warpId = tid/32; 37 | int warpTid=threadIdx.x%32; 38 | clock_t start_time,stop_time; 39 | int i=0, warpsize=100; 40 | curandState local_state=global_state[threadIdx.x]; 41 | curand_init(tid, 0, 0, &local_state); // sequence created with different seed and same sequence 42 | int __shared__ l_search[256]; 43 | int __shared__ max_find[256]; 44 | S->candidate.start[0] = 0; 45 | int sourceIndex=warpId,source=0; 46 | if(warpTid==0){ 47 | atomicAdd(&S->candidate.start[0],1); 48 | } 49 | // sourceIndex= __shfl_sync(0xffffffff,sourceIndex,0); 50 | __syncwarp(); 51 | 52 | #ifdef profile 53 | // if(threadIdx.x==0){printf("warpID:%d, sourceIndex:%d,start: %d\n",warpId, sourceIndex, S->candidate.start[0]);} 54 | #endif 55 | // start loop 56 | S->candidate.end[0]= n_subgraph; 57 | // clock_t start = clock(); 58 | 59 | // Subgraph number should be higher than the number of warps assigned 60 | while(sourceIndex < S->candidate.end[0]) 61 | { 62 | int VertCount=1; 63 | source= S->candidate.vertices[sourceIndex]; 64 | int SampleID= S->candidate.instance_ID[sourceIndex]; 65 | int NL= G.degree_list[source]; 66 | #ifdef profile 67 | // if(warpTid==0){printf(" Source: %d, len: %d\n",source,NL);} 68 | #endif 69 | if((NL==0)){ // Skip empty vertices 70 | if(warpTid==0){sourceIndex=atomicAdd(&S->candidate.start[0],1);} 71 | sourceIndex= __shfl_sync(0xffffffff,sourceIndex,0); 72 | __syncwarp(); 73 | continue; 74 | } 75 | int len= get_neighbors(&G,source,&S->wvar[warpId],VertCount); 76 | 77 | if(NORMALIZE==0){ 78 | select(&S->wvar[warpId],&S->cache,NeighborSize,1,local_state, &G,S->count.colcount, source,S->max,bitflag,cache); 79 | } 80 | else{ 81 | heur_normalize(&S->wvar[warpId],&S->cache,NeighborSize,1,local_state, &G,S->count.colcount, source, S->max,bitflag,cache); 82 | } 83 | frontier(&G,S,warpId,SampleID,NeighborSize,source,sourceIndex, hash, Depth); 84 | 85 | __syncwarp(); 86 | if(warpTid==0){ 87 | 
sourceIndex=atomicAdd(&S->candidate.start[0],1); 88 | } 89 | sourceIndex= __shfl_sync(0xffffffff,sourceIndex,0); 90 | __syncwarp(); 91 | } 92 | if(tid==0){printf("%d,",S->sampled_count[0]);} 93 | } 94 | 95 | __global__ void 96 | check_layer(Sampling *S, gpu_graph G,curandState *global_state,int n_subgraph, int FrontierSize, int NeighborSize, int Depth) 97 | { 98 | float prefix_time,local_d_time,global_d_time; 99 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 100 | int hash=1, cache=0, bitflag=1, NORMALIZE=1; 101 | int warpId = tid/32; 102 | 103 | // __syncwarp(); 104 | 105 | int warpTid=threadIdx.x%32; 106 | clock_t start_time,stop_time; 107 | int i=0, warpsize=100; 108 | curandState local_state=global_state[threadIdx.x]; 109 | curand_init(tid, 0, 0, &local_state); // sequence created with different seed and same sequence 110 | int __shared__ l_search[256]; 111 | int __shared__ max_find[256]; 112 | S->candidate.start[0] = 0; 113 | int sourceIndex=warpId,source=0; 114 | if(warpTid==0){ 115 | atomicAdd(&S->candidate.start[0],1); 116 | } 117 | // sourceIndex= __shfl_sync(0xffffffff,sourceIndex,0); 118 | // __syncwarp(); 119 | 120 | #ifdef profile 121 | // if(threadIdx.x==0){printf("Block:%d, sourceIndex:%d, start: %d\n",blockIdx.x, sourceIndex, S->candidate.vertices[0]);} 122 | #endif 123 | S->candidate.end[0]= n_subgraph; 124 | // get degree for all frontiers 125 | int index= tid; 126 | // for(int i=0;i<(n_subgraph*FrontierSize);i++) 127 | // { 128 | // int vert= S->candidate.vertices[i]; 129 | // S->frontier_degree[i] = G.degree_list[source]; 130 | // } 131 | while(sourceIndex < n_subgraph) 132 | { 133 | int curr_depth=0; 134 | while(curr_depthcandidate.vertices[index]; 147 | int bias = VertexBias(vert, &G); 148 | S->wvar[warpId].degree[index-start_index]= (float)bias; 149 | #ifdef profile 150 | // printf("Vert: %d, Bias: %d\n",vert,bias); 151 | #endif 152 | } 153 | // __syncwarp(); 154 | // pick one with ITS 155 | float r = curand_uniform(&local_state); 156 | int selectedIndex= ITS_MDRW(&S->wvar[warpId], local_state, &G, FrontierSize,r); 157 | if(threadIdx.x==0){ 158 | int selected = S->candidate.vertices[selectedIndex]; 159 | #ifdef profile 160 | if(warpTid==0){printf("Random selected: %d, vertex: %d\n",selectedIndex, selected);} 161 | #endif 162 | int NL= G.degree_list[selected]; 163 | if(NL==0){curr_depth+=1;continue;} 164 | // generate one random integer with range of (0,NL); 165 | int r=rand_integer(local_state,NL); 166 | int neighbor_start= G.beg_pos[selected]; 167 | int sample= G.adj_list[r+neighbor_start] ; 168 | #ifdef profile 169 | if(warpTid==0){printf("NL: %d, New selected: %d, vertex: %d\n",NL,r, sample);} 170 | #endif 171 | int SampleID=sourceIndex; 172 | int pos=atomicAdd(&S->samples[SampleID].start[0],1); 173 | S->samples[SampleID].vertex[pos]=selected; 174 | S->samples[SampleID].edge[pos]=sample; 175 | if(warpTid==0){atomicAdd(&S->sampled_count[0],1);} 176 | // update the degree and frontier 177 | S->candidate.vertices[selectedIndex] = sample; 178 | S->frontier_degree[selectedIndex] = G.degree_list[sample]; 179 | #ifdef profile 180 | if(warpTid==0){printf("Next level. Curr Depth: %d\n",curr_depth);} 181 | #endif 182 | } 183 | // __syncwarp(); 184 | curr_depth+=1; 185 | } 186 | if(warpTid==0){ 187 | sourceIndex=atomicAdd(&S->candidate.start[0],1); 188 | } 189 | sourceIndex= __shfl_sync(0xffffffff,sourceIndex,0); 190 | // if(warpTid==0){printf("Next source. 
%d\n",sourceIndex);} 191 | // __syncwarp(); 192 | } 193 | if(tid==0){printf("%d,",S->sampled_count[0]);} 194 | } 195 | 196 | struct arguments Sampler(char beg[100], char csr[100],int n_blocks, int n_threads, int n_subgraph, int FrontierSize, int NeighborSize, int Depth, struct arguments args, int rank) 197 | { 198 | int *total=(int *)malloc(sizeof(int)*n_subgraph); 199 | int *host_counter=(int *)malloc(sizeof(int)); 200 | int T_Group=n_threads/32; 201 | int each_subgraph=Depth*NeighborSize; 202 | int total_length=each_subgraph*n_subgraph; 203 | int neighbor_length_max=n_blocks*6000*T_Group; 204 | int PER_BLOCK_WARP= T_Group; 205 | int BUCKET_SIZE=125; 206 | int BUCKETS=32; 207 | int warps = n_blocks * T_Group; 208 | 209 | int total_mem_for_hash=n_blocks*PER_BLOCK_WARP*BUCKETS*BUCKET_SIZE; 210 | int total_mem_for_bitmap=n_blocks*PER_BLOCK_WARP*300; 211 | //std::cout<<"Input: ./exe beg csr nblocks nthreads\n"; 212 | int *bitmap, *node, *qstop_global, *qstart_global, *sample_id, *depth_tracker, *g_sub_index, *degree_l, *counter, *pre_counter; 213 | int *seeds=(int *)malloc(sizeof(int)*n_subgraph*FrontierSize); 214 | int *h_sample_id=(int *)malloc(sizeof(int)*n_subgraph*FrontierSize); 215 | int *h_depth_tracker=(int *)malloc(sizeof(int)*n_subgraph*FrontierSize); 216 | 217 | const char *beg_file=beg; 218 | const char *csr_file=csr; 219 | const char *weight_file=csr; 220 | 221 | graph 222 | *ginst = new graph 223 | 224 | (beg_file,csr_file,weight_file); 225 | gpu_graph ggraph(ginst); 226 | curandState *d_state; 227 | cudaMalloc(&d_state,sizeof(curandState)); 228 | // int *host_counter=(int *)malloc(sizeof(int)); 229 | int *host_prefix_counter=(int *)malloc(sizeof(int)); 230 | int *node_list=(int *)malloc(sizeof(int)*total_length); 231 | int *set_list=(int *)malloc(sizeof(int)*total_length); 232 | 233 | int *degree_list=(int *)malloc(sizeof(int)*ginst->edge_count); 234 | std::random_device rd; 235 | std::mt19937 gen(56); 236 | std::uniform_int_distribution<> dis(1,10000); 237 | int numBlocks; 238 | //cudaGetDevice(&device); 239 | //cudaGetDeviceProperties(&prop, device); 240 | 241 | // cudaOccupancyMaxActiveBlocksPerMultiprocessor( 242 | // &numBlocks, 243 | // check, 244 | // n_threads, 245 | // 0); 246 | // cout<<"Max allocatable Blocks:"<edge_count, warps, 10000,n_subgraph, BUCKETS*BUCKET_SIZE, Depth*NeighborSize, FrontierSize, Depth); 255 | H_ERR(cudaMalloc((void **)&sampler, sizeof(Sampling))); 256 | 257 | for(int n=0;n>>(sampler, ggraph, d_state, n_subgraph, FrontierSize, NeighborSize, Depth); 284 | } 285 | else{ 286 | printf("Layer call\n"); 287 | check_layer<<>>(sampler, ggraph, d_state, n_subgraph, FrontierSize, NeighborSize, Depth); 288 | } 289 | HRR(cudaDeviceSynchronize()); 290 | // HRR(cudaMemcpy(host_counter, sampler->sampled_count, sizeof(int), cudaMemcpyDeviceToHost)); 291 | 292 | // int total_count=0; 293 | // for(int i=0; i < n_subgraph;i++){ 294 | // int count= S.samples[i].start[0]; 295 | // printf("Sampled: %d\n",host_counter[0]); 296 | // total_count+=count; 297 | // } 298 | total_time= wtime()-start_time; 299 | // printf("%s,SamplingTime:%.6f\n",argv[1],total_time); 300 | // Copy the sampled graph to CPU 301 | /* 302 | The sampled graph is stored as edge list. To get the samples in the CPU memory, copy each array from class Si to CPU allocated memory. 
303 | */ 304 | // printf("Sampled edges:%d\n",host_counter[0]); 305 | // args.sampled_edges=host_counter[0]; 306 | args.time=total_time; 307 | return args; 308 | } 309 | -------------------------------------------------------------------------------- /non-stream/mpi_main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "sampler.cuh" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | using namespace std; 16 | 17 | int main(int argc, char *argv[]) 18 | { 19 | if(argc!=11){std::cout<<"Input: ./exe <# of samples> <#GPUs>\n";exit(0);} 20 | // SampleSize, FrontierSize, NeighborSize 21 | // printf("MPI started\n"); 22 | int n_blocks = atoi(argv[4]); 23 | int n_threads = atoi(argv[5]); 24 | int SampleSize = atoi(argv[6]); 25 | int FrontierSize = atoi(argv[7]); 26 | int NeighborSize = atoi(argv[8]); 27 | int Depth= atoi(argv[9]); 28 | int total_GPU = atoi(argv[10]); 29 | 30 | MPI_Status status; 31 | int myrank; 32 | double global_max_time, global_min_time; 33 | int global_sampled_edges; 34 | struct arguments args; 35 | MPI_Init(&argc, &argv); 36 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank); 37 | int global_sum; 38 | SampleSize = SampleSize/total_GPU; 39 | 40 | args= Sampler(argv[2],argv[3], n_blocks, n_threads, SampleSize, FrontierSize, NeighborSize, Depth, args,myrank); 41 | MPI_Reduce(&args.time, &global_max_time, 1, MPI_DOUBLE,MPI_MAX, 0, MPI_COMM_WORLD); 42 | MPI_Reduce(&args.time, &global_min_time, 1, MPI_DOUBLE,MPI_MIN, 0, MPI_COMM_WORLD); 43 | float rate = global_sampled_edges/global_max_time/1000000; 44 | if(myrank==0) 45 | { 46 | printf("%s,%f,%f\n",argv[1],global_min_time,global_max_time); 47 | } 48 | MPI_Finalize(); 49 | return 0; 50 | } -------------------------------------------------------------------------------- /non-stream/run.sh: -------------------------------------------------------------------------------- 1 | 2 | ./sampling.bin WG WG/beg.bin WG/csr.bin 400 128 $1 $2 $3 $4 $5 3 | -------------------------------------------------------------------------------- /non-stream/sample_class.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SAMPLER_H 2 | #define SAMPLER_H 3 | 4 | #include "herror.h" 5 | #include "header.h" 6 | #include 7 | #include 8 | #include 9 | 10 | class Cd{ 11 | /* 12 | Candidate list shared by all instances 13 | */ 14 | public: 15 | int s=5; 16 | int *instance_ID, *vertices, *depth; 17 | int *start, *end; 18 | ~Cd(){}; 19 | Cd(){}; 20 | Cd(int len ){ 21 | H_ERR(cudaMalloc((void **)&instance_ID, sizeof(int)*len)); 22 | H_ERR(cudaMalloc((void **)&vertices, sizeof(int)*len)); 23 | H_ERR(cudaMalloc((void **)&depth, sizeof(int)*len)); 24 | H_ERR(cudaMalloc((void **)&start, sizeof(int)*2)); 25 | H_ERR(cudaMalloc((void **)&end, sizeof(int)*2)); 26 | } 27 | }; 28 | 29 | class Dimnesion{ 30 | public: 31 | int *pool; 32 | ~Dimnesion(){}; 33 | Dimnesion(){}; 34 | void init(int FrontierSize){ 35 | H_ERR(cudaMalloc((void **)&pool, sizeof(int)*FrontierSize)); 36 | } 37 | }; 38 | 39 | class Wv{ 40 | /* 41 | Warp variables 42 | */ 43 | public: 44 | int test=1; 45 | int *total_counter; 46 | int *frontier, *findex; 47 | int *neighbors, *nindex; 48 | float *degree; 49 | int *dindex; 50 | int *selected, *sindex; 51 | int *tempSelected; 52 | int *bitmap, *bindex; 53 | int NL, NS; 54 | int *max; 55 | ~Wv(){}; 56 | Wv(){} 57 | void init(int flen,int nlen, int dlen, 
int slen){ 58 | H_ERR(cudaMalloc((void **)&frontier, sizeof(int)*flen)); 59 | H_ERR(cudaMalloc((void **)&neighbors, sizeof(int)*nlen)); 60 | H_ERR(cudaMalloc((void **)°ree, sizeof(float)*dlen)); 61 | H_ERR(cudaMalloc((void **)&bitmap, sizeof(int)*(dlen/32))); 62 | H_ERR(cudaMalloc((void **)&selected, sizeof(int)*slen)); 63 | H_ERR(cudaMalloc((void **)&selected, sizeof(int)*slen)); 64 | H_ERR(cudaMalloc((void **)&findex, sizeof(int)*2)); 65 | H_ERR(cudaMalloc((void **)&nindex, sizeof(int)*2)); 66 | H_ERR(cudaMalloc((void **)&dindex, sizeof(int)*2)); 67 | H_ERR(cudaMalloc((void **)&sindex, sizeof(int)*2)); 68 | H_ERR(cudaMalloc((void **)&bindex, sizeof(int)*2)); 69 | H_ERR(cudaMalloc((void **)&max, sizeof(int)*2)); 70 | H_ERR(cudaMalloc((void **)&total_counter, sizeof(int)*2)); 71 | } 72 | }; 73 | 74 | 75 | 76 | class Si{ 77 | /* 78 | sampled graph for instances. Each instance have its own sample graph. 79 | */ 80 | public: 81 | int *vertex,*edge; 82 | int *start; 83 | ~Si(){}; 84 | Si(){} 85 | void init(int len){ 86 | H_ERR(cudaMalloc((void **)&vertex, sizeof(int)*len)); 87 | H_ERR(cudaMalloc((void **)&edge, sizeof(int)*len)); 88 | H_ERR(cudaMalloc((void **)&start, sizeof(int)*2)); 89 | } 90 | }; 91 | 92 | class Ht{ 93 | /* 94 | Hashtable for each instance 95 | */ 96 | public: 97 | int *hash; 98 | int *bin_counter; 99 | int BUCKETS; 100 | int bin_size=125; 101 | ~Ht(){}; 102 | Ht(){} 103 | void init(int bin_count){ 104 | BUCKETS=bin_count; 105 | H_ERR(cudaMalloc((void **)&hash, sizeof(int)*bin_count*bin_size)); 106 | H_ERR(cudaMalloc((void **)&bin_counter, sizeof(int)*bin_count)); 107 | } 108 | }; 109 | 110 | class Co{ 111 | /* 112 | Counters used in sampling. 113 | */ 114 | public: 115 | int max_NL=90000; // Update this value for dynamic allocation 116 | int *counter, *pre_counter, *total, *colcount, *max; 117 | ~Co(){}; 118 | Co(){}; 119 | Co(int total){ 120 | HRR(cudaMalloc((void **) &counter,sizeof(int)*2)); 121 | HRR(cudaMalloc((void **) &max,sizeof(int)*max_NL)); 122 | HRR(cudaMalloc((void **) &pre_counter,sizeof(int)*2)); 123 | HRR(cudaMalloc((void **) &colcount,sizeof(int)*50)); 124 | HRR(cudaMalloc((void **) &total,sizeof(int)*total)); 125 | } 126 | }; 127 | 128 | class Cp{ 129 | /* 130 | Cache probability for each vertex in the graph. 
131 | */ 132 | public: 133 | int *status; 134 | float *probability; 135 | int *counter; 136 | ~Cp(){}; 137 | Cp(){}; 138 | Cp(int len){ 139 | // HRR(cudaMalloc((void **) &status,sizeof(int)*len)); 140 | // HRR(cudaMalloc((void **) &probability,sizeof(float)*len)); 141 | HRR(cudaMalloc((void **) &counter,sizeof(int)*2)); 142 | } 143 | }; 144 | 145 | 146 | class Sampling{ 147 | /* 148 | Collection of objects for sampling 149 | */ 150 | public: 151 | Cd candidate; 152 | Si samples[20100]; 153 | Ht hashtable[20100]; 154 | Co count; 155 | Wv wvar[2000]; 156 | Dimnesion front[4000]; 157 | Cp cache; 158 | int *max,*sampled_count,*frontier_degree; 159 | int n_child=1; 160 | int DEPTH_LIMIT; 161 | int BUCKETS=32; 162 | int max_NL=90000; // Update this value for dynamic allocation 163 | ~Sampling(){}; 164 | Sampling(int edgecount,int warpCount, int qlen, int seeds, int C_len, int sampleSize, int FrontierSize, int depth){ 165 | DEPTH_LIMIT=depth; 166 | count= Co(seeds); 167 | candidate= Cd(seeds*max_NL); 168 | cache= Cp(edgecount); 169 | HRR(cudaMalloc((void **) &max,sizeof(int)*2)); 170 | HRR(cudaMalloc((void **) &frontier_degree,sizeof(int)*sampleSize*FrontierSize)); 171 | HRR(cudaMalloc((void **) &sampled_count,sizeof(int))); 172 | for(int i=0;i 5 | static void HandleError( cudaError_t err, 6 | const char *file, 7 | int line ) { 8 | if (err != cudaSuccess) { 9 | printf( "%s in %s at line %d\n", \ 10 | cudaGetErrorString( err ), 11 | file, line ); 12 | exit( EXIT_FAILURE ); 13 | } 14 | } 15 | #define H_ERR( err ) \ 16 | (HandleError( err, __FILE__, __LINE__ )) 17 | 18 | 19 | #define SML_MID 32 20 | #define MID_LRG 1024 21 | //#define SWITCH_TO (float)10.3 22 | #define SWITCH_TO (float)0.2 23 | #define SWITCH_BACK (float)0.4 24 | //#define SWITCH_BACK (float)0.3 25 | 26 | //#define SML_MID 0 27 | //#define MID_LRG 6553600 28 | 29 | //#define SML_MID 0 30 | //#define MID_LRG 0 31 | #define GPUID 0 32 | #define THDS_NUM 512 33 | #define BLKS_NUM 512 34 | //#define BLKS_NUM 96 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /non-stream/wtime.h: -------------------------------------------------------------------------------- 1 | #ifndef __TIME_H__ 2 | #define __TIME_H__ 3 | 4 | #include 5 | #include 6 | 7 | double wtime() 8 | { 9 | double time[2]; 10 | struct timeval time1; 11 | gettimeofday(&time1, NULL); 12 | 13 | time[0]=time1.tv_sec; 14 | time[1]=time1.tv_usec; 15 | 16 | return time[0]+time[1]*1.0e-6; 17 | } 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /streaming/Makefile: -------------------------------------------------------------------------------- 1 | exe=streaming.bin 2 | N=1 3 | cucc= "$(shell which nvcc)" 4 | cc= "$(shell which mpicxx)" 5 | commflags=-lcudart -L"$(shell dirname $(cucc))"/../lib64 6 | cuflags= --compiler-options -v -Xcudafe -\# --resource-usage 7 | cuflags+= -std=c++11 8 | objs = $(patsubst %.cu,%.o,$(wildcard *.cu)) \ 9 | $(patsubst %.cpp,%.o,$(wildcard *.cpp)) 10 | 11 | deps = $(wildcard ./*.cuh) \ 12 | $(wildcard ./*.hpp) \ 13 | $(wildcard ./*.h) \ 14 | 15 | 16 | %.o:%.cu $(deps) 17 | $(cucc) -c $(cuflags) $< -o $@ 18 | 19 | %.o:%.cpp $(deps) 20 | $(cc) -c $< -o $@ 21 | 22 | $(exe):$(objs) 23 | $(cc) $(objs) $(commflags) -O3 -o $(exe) 24 | 25 | 26 | test:$(exe) 27 | mpirun -n $(N) $(exe) WG WG/beg.bin WG/csr.bin 10 128 2 40 5 3 1 28 | clean: 29 | rm -rf *.o ${exe} 30 | -------------------------------------------------------------------------------- 
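
The neighbor selection underneath these kernels is inverse transform sampling (ITS): non-stream/functions.cuh builds a prefix sum over the per-neighbor edge biases (gpu_prefix), normalizes it into a CDF (ITS), and binary-searches that CDF with a uniform random number (binary_search). The sketch below mirrors that flow on the host for a single vertex. It is a minimal illustration, not repo code: its_select and biases are hypothetical names, the scan is sequential rather than warp-parallel, and rand() stands in for curand_uniform().

```
// Minimal host-side sketch of the ITS selection used in non-stream/functions.cuh.
// 'its_select' and 'biases' are illustrative names, not part of C-SAW itself.
#include <cstdio>
#include <cstdlib>
#include <vector>

int its_select(const std::vector<float> &biases, float r)
{
    // 1. Prefix sum of the edge biases (the kernel does this warp-parallel in gpu_prefix()).
    std::vector<float> cdf(biases.size());
    float sum = 0.0f;
    for (size_t i = 0; i < biases.size(); ++i) {
        sum += biases[i];
        cdf[i] = sum;
    }
    // 2. Normalize into a CDF over (0,1] (cf. ITS()).
    for (float &v : cdf) v /= sum;
    // 3. Find the first CDF entry >= r (cf. binary_search()).
    int low = 0, high = (int)cdf.size() - 1;
    while (low < high) {
        int mid = (low + high) / 2;
        if (cdf[mid] < r) low = mid + 1;
        else high = mid;
    }
    return low;   // neighbor index, chosen with probability biases[i]/sum
}

int main()
{
    std::vector<float> biases = {4, 1, 2, 1};  // e.g., neighbor degrees (EdgeBias)
    float r = (float)rand() / (float)RAND_MAX; // curand_uniform() on the GPU
    printf("selected neighbor index: %d\n", its_select(biases, r));
    return 0;
}
```

With neighbor degrees as the biases (which is what EdgeBias in api.cuh returns for biased sampling), a neighbor is picked with probability proportional to its degree.
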
/streaming/README.md: --------------------------------------------------------------------------------
1 | #### Streaming version:
2 | 3 | ##### Input format
4 | Input: ./exe <dataset name> <beg file> <csr file> <# of blocks> <# of threads> <# of samples> <frontier size> <neighbor size> <depth> <#GPUs>
5 | 6 | Neighbor size represents how many neighbors to sample for each vertex.
7 | 8 | ##### Example:
9 | mpirun -n 1 streaming.bin WG WG/beg.bin WG/csr.bin 10 128 1 40 5 3 1
10 | 11 | or
12 | 13 | ./run.sh <# of samples> <frontier size> <neighbor size> <depth> <#GPUs>
14 | 15 | 16 | 17 | ##### Note:
18 | The current source code only supports dividing the graph into four partitions and streaming two partitions into the GPU. The dynamic version may be uploaded later. Memory allocation (especially for storing the samples and the queue) may need to be increased depending on the sampling parameters.
19 | Note that the current version of this code works only with a single GPU.
20 | -------------------------------------------------------------------------------- /streaming/WG/beg.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/concept-inversion/C-SAW/d91ff3ac896a90a3ea7b71e9251d5e79f67f8c6c/streaming/WG/beg.bin -------------------------------------------------------------------------------- /streaming/WG/csr.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/concept-inversion/C-SAW/d91ff3ac896a90a3ea7b71e9251d5e79f67f8c6c/streaming/WG/csr.bin -------------------------------------------------------------------------------- /streaming/gpu_graph.cuh: --------------------------------------------------------------------------------
1 | //10/03/2016 2 | //Graph data structure on GPUs 3 | #ifndef _GPU_GRAPH_H_ 4 | #define _GPU_GRAPH_H_ 5 | #include 6 | #include "header.h" 7 | #include "util.h" 8 | #include "graph.h" 9 | 10 | class gpu_graph 11 | { 12 | public: 13 | vertex_t *adj_list; 14 | weight_t *weight_list; 15 | index_t *beg_pos; 16 | vertex_t *degree_list; 17 | 18 | index_t vert_count; 19 | index_t edge_count; 20 | index_t avg_degree; 21 | 22 | public: 23 | ~gpu_graph(){} 24 | 25 | gpu_graph( 26 | graph<long, long, long, vertex_t, index_t, weight_t> *ginst) 27 | { 28 | vert_count=ginst->vert_count; 29 | edge_count=ginst->edge_count; 30 | avg_degree = ginst->edge_count/ginst->vert_count; 31 | 32 | // size_t weight_sz=sizeof(weight_t)*edge_count; 33 | size_t adj_sz=sizeof(vertex_t)*edge_count; 34 | size_t deg_sz=sizeof(vertex_t)*edge_count; 35 | size_t beg_sz=sizeof(index_t)*(vert_count+1); 36 | vertex_t *cpu_degree_list=(vertex_t*)malloc(sizeof(vertex_t)*edge_count); 37 | /* Alloc GPU space */ 38 | H_ERR(cudaMalloc((void **)&adj_list, adj_sz)); 39 | H_ERR(cudaMalloc((void **)&degree_list, deg_sz)); 40 | H_ERR(cudaMalloc((void **)&beg_pos, beg_sz)); 41 | //H_ERR(cudaMalloc((void **)&weight_list, weight_sz)); 42 | 43 | for(int i=0; i<(ginst->edge_count); i++) 44 | { 45 | int neighbor= ginst->adj_list[i]; 46 | //cout<<"Index: "<<i<<", Neighbor: "<<neighbor<<"\n"; 47 | cpu_degree_list[i]= ginst->beg_pos[neighbor+1] - ginst->beg_pos[neighbor]; 48 | } 49 | 50 | /* copy it to GPU */ 51 | H_ERR(cudaMemcpy(adj_list,ginst->adj_list, 52 | adj_sz, cudaMemcpyHostToDevice)); 53 | H_ERR(cudaMemcpy(beg_pos,ginst->beg_pos, 54 | beg_sz, cudaMemcpyHostToDevice)); 55 | H_ERR(cudaMemcpy(degree_list,cpu_degree_list, 56 | deg_sz, cudaMemcpyHostToDevice)); 57 | 58 | //H_ERR(cudaMemcpy(weight_list,ginst->weight, 59 | // weight_sz, cudaMemcpyHostToDevice)); 60 | } 61 | }; 62 | 63 | #endif 64 |
-------------------------------------------------------------------------------- /streaming/graph.h:
-------------------------------------------------------------------------------- 1 | #ifndef __GRAPH_H__ 2 | #define __GRAPH_H__ 3 | #include <stdio.h> 4 | #include <stdlib.h> 5 | #include <sys/stat.h> 6 | #include <sys/types.h> 7 | #include "wtime.h" 8 | #include <iostream> 9 | #include <assert.h> 10 | #include <unistd.h> 11 | inline off_t fsize(const char *filename) { 12 | struct stat st; 13 | if (stat(filename, &st) == 0) 14 | return st.st_size; 15 | return -1; 16 | } 17 | 18 | template< 19 | typename file_vert_t, typename file_index_t, typename file_weight_t, 20 | typename new_vert_t, typename new_index_t, typename new_weight_t> 21 | class graph 22 | { 23 | public: 24 | new_index_t *beg_pos; 25 | new_vert_t *adj_list; 26 | new_weight_t *weight; 27 | new_vert_t *degree_list; 28 | new_index_t vert_count; 29 | new_index_t edge_count; 30 | 31 | public: 32 | graph(){}; 33 | ~graph(){}; 34 | graph(const char *beg_file, 35 | const char *adj_list_file, 36 | const char *weight_file); 37 | 38 | graph(file_vert_t *csr, 39 | file_index_t *beg_pos, 40 | file_weight_t *weight_list, 41 | file_index_t vert_count, 42 | file_index_t edge_count) 43 | { 44 | this->beg_pos = beg_pos; 45 | this->adj_list = csr; 46 | this->weight = weight_list; 47 | //this->degree_list= degree_list; 48 | this->edge_count = edge_count; 49 | this->vert_count = vert_count; 50 | }; 51 | }; 52 | #include "graph.hpp" 53 | #endif 54 | -------------------------------------------------------------------------------- /streaming/graph.hpp: -------------------------------------------------------------------------------- 1 | #include "graph.h" 2 | #include <string.h> 3 | 4 | template< 5 | typename file_vert_t, typename file_index_t, typename file_weight_t, 6 | typename new_vert_t, typename new_index_t, typename new_weight_t> 7 | graph<file_vert_t, file_index_t, file_weight_t, 8 | new_vert_t, new_index_t, new_weight_t> 9 | ::graph( 10 | const char *beg_file, 11 | const char *adj_file, 12 | const char *weight_file) 13 | { 14 | double tm=wtime(); 15 | FILE *file=NULL; 16 | file_index_t ret; 17 | 18 | vert_count=fsize(beg_file)/sizeof(file_index_t) - 1; 19 | edge_count=fsize(adj_file)/sizeof(file_vert_t); 20 | 21 | file=fopen(beg_file, "rb"); 22 | if(file!=NULL) 23 | { 24 | file_index_t *tmp_beg_pos=NULL; 25 | 26 | if(posix_memalign((void **)&tmp_beg_pos, getpagesize(), 27 | sizeof(file_index_t)*(vert_count+1))) 28 | perror("posix_memalign"); 29 | 30 | ret=fread(tmp_beg_pos, sizeof(file_index_t), 31 | vert_count+1, file); 32 | assert(ret==vert_count+1); 33 | fclose(file); 34 | edge_count=tmp_beg_pos[vert_count]; 35 | //std::cout<<"Expected edge count: "<<edge_count<<"\n"; 36 | 37 | assert(edge_count>0); 38 | 39 | //converting to new type when different 40 | if(sizeof(file_index_t)!=sizeof(new_index_t)) 41 | { 42 | if(posix_memalign((void **)&beg_pos, getpagesize(), 43 | sizeof(new_index_t)*(vert_count+1))) 44 | perror("posix_memalign"); 45 | for(new_index_t i=0;i -------------------------------------------------------------------------------- /streaming/mpi_main.cpp: -------------------------------------------------------------------------------- 1 | #include <mpi.h> 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include "sampler.cuh" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | using namespace std; 16 | 17 | int main(int argc, char *argv[]) 18 | { 19 | if(argc!=11){std::cout<<"Input: ./exe <dataset> <beg file> <csr file> <blocks> <threads> <# of samples> <frontier size> <neighbor size> <depth> <#GPUs>\n";exit(0);} 20 | // SampleSize, FrontierSize, NeighborSize 21 | // printf("MPI started\n"); 22 | int n_blocks = atoi(argv[4]); 23 | int n_threads = atoi(argv[5]); 24 | int n_subgraph = atoi(argv[6]); 25 | int FrontierSize = atoi(argv[7]); 26 | int NeighborSize = atoi(argv[8]); 27 | int Depth= atoi(argv[9]); 28 | int total_GPU = atoi(argv[10]); 29 | 30 | MPI_Status status; 31 | int myrank; 32 | double global_max_time, global_min_time; 33 | int global_sampled_edges; 34 | struct
arguments args; 35 | MPI_Init(&argc, &argv); 36 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank); 37 | int global_sum; 38 | //SampleSize = SampleSize/total_GPU; 39 | args=Sampler(argv[2],argv[3], n_blocks, n_threads, n_subgraph, FrontierSize, NeighborSize, Depth, args, myrank); 40 | MPI_Reduce(&args.time, &global_max_time, 1, MPI_DOUBLE,MPI_MAX, 0, MPI_COMM_WORLD); 41 | MPI_Reduce(&args.time, &global_min_time, 1, MPI_DOUBLE,MPI_MIN, 0, MPI_COMM_WORLD); 42 | MPI_Reduce(&args.sampled_edges, &global_sampled_edges, 1, MPI_INT,MPI_SUM, 0, MPI_COMM_WORLD); // sum the per-rank edge counts so the rate below is defined 43 | float rate = global_sampled_edges/global_max_time/1000000; 44 | if(myrank==0) 45 | { 46 | printf("%s,%f,%f\n",argv[1],global_min_time,global_max_time); 47 | } 48 | MPI_Finalize(); 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /streaming/run.sh: -------------------------------------------------------------------------------- 1 | 2 | ./streaming.bin WG WG/beg.bin WG/csr.bin 10 128 $1 $2 $3 $4 $5 3 | 4 | -------------------------------------------------------------------------------- /streaming/sample_class.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SAMPLER_H 2 | #define SAMPLER_H 3 | 4 | #include "herror.h" 5 | #include "header.h" 6 | #include 7 | #include 8 | #include 9 | 10 | class Cd{ 11 | /* 12 | Candidate list shared by all instances 13 | */ 14 | public: 15 | int s=5; 16 | int *instance_ID, *vertices, *depth; 17 | int *start, *end; 18 | ~Cd(){}; 19 | Cd(){}; 20 | Cd(int len ){ 21 | H_ERR(cudaMalloc((void **)&instance_ID, sizeof(int)*len)); 22 | H_ERR(cudaMalloc((void **)&vertices, sizeof(int)*len)); 23 | H_ERR(cudaMalloc((void **)&depth, sizeof(int)*len)); 24 | H_ERR(cudaMalloc((void **)&start, sizeof(int)*2)); 25 | H_ERR(cudaMalloc((void **)&end, sizeof(int)*2)); 26 | } 27 | }; 28 | 29 | class Dimnesion{ 30 | public: 31 | int *pool; 32 | ~Dimnesion(){}; 33 | Dimnesion(){}; 34 | void init(int FrontierSize){ 35 | H_ERR(cudaMalloc((void **)&pool, sizeof(int)*FrontierSize)); 36 | } 37 | }; 38 | 39 | class Wv{ 40 | /* 41 | Warp variables 42 | */ 43 | public: 44 | int test=1; 45 | int *total_counter; 46 | int *frontier, *findex; 47 | int *neighbors, *nindex; 48 | float *degree; 49 | int *dindex; 50 | int *selected, *sindex; 51 | int *tempSelected; 52 | int *bitmap, *bindex; 53 | int NL, NS; 54 | int *max; 55 | ~Wv(){}; 56 | Wv(){} 57 | void init(int flen,int nlen, int dlen, int slen){ 58 | H_ERR(cudaMalloc((void **)&frontier, sizeof(int)*flen)); 59 | H_ERR(cudaMalloc((void **)&neighbors, sizeof(int)*nlen)); 60 | H_ERR(cudaMalloc((void **)&degree, sizeof(float)*dlen)); 61 | H_ERR(cudaMalloc((void **)&bitmap, sizeof(int)*(dlen/32))); 62 | H_ERR(cudaMalloc((void **)&selected, sizeof(int)*slen)); 63 | H_ERR(cudaMalloc((void **)&tempSelected, sizeof(int)*slen)); 64 | H_ERR(cudaMalloc((void **)&findex, sizeof(int)*2)); 65 | H_ERR(cudaMalloc((void **)&nindex, sizeof(int)*2)); 66 | H_ERR(cudaMalloc((void **)&dindex, sizeof(int)*2)); 67 | H_ERR(cudaMalloc((void **)&sindex, sizeof(int)*2)); 68 | H_ERR(cudaMalloc((void **)&bindex, sizeof(int)*2)); 69 | H_ERR(cudaMalloc((void **)&max, sizeof(int)*2)); 70 | H_ERR(cudaMalloc((void **)&total_counter, sizeof(int)*2)); 71 | } 72 | }; 73 | 74 | 75 | 76 | class Si{ 77 | /* 78 | sampled graph for instances. Each instance has its own sampled graph.
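The vertex and edge arrays are parallel device arrays holding the two endpoints of every sampled edge (the edge-list output described in the top-level README), and start appears to serve as the device-side write cursor.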
79 | */ 80 | public: 81 | int *vertex,*edge; 82 | int *start; 83 | ~Si(){}; 84 | Si(){} 85 | void init(int len){ 86 | H_ERR(cudaMalloc((void **)&vertex, sizeof(int)*len)); 87 | H_ERR(cudaMalloc((void **)&edge, sizeof(int)*len)); 88 | H_ERR(cudaMalloc((void **)&start, sizeof(int)*2)); 89 | } 90 | }; 91 | 92 | class Ht{ 93 | /* 94 | Hashtable for each instance 95 | */ 96 | public: 97 | int *hash; 98 | int *bin_counter; 99 | int BUCKETS; 100 | int bin_size=125; 101 | ~Ht(){}; 102 | Ht(){} 103 | void init(int bin_count){ 104 | BUCKETS=bin_count; 105 | H_ERR(cudaMalloc((void **)&hash, sizeof(int)*bin_count*bin_size)); 106 | H_ERR(cudaMalloc((void **)&bin_counter, sizeof(int)*bin_count)); 107 | } 108 | }; 109 | 110 | class Co{ 111 | /* 112 | Counters used in sampling. 113 | */ 114 | public: 115 | int *counter, *pre_counter, *total, *colcount, *max; 116 | int max_NL=90000; // Update this to dynamic allocation 117 | ~Co(){}; 118 | Co(){}; 119 | Co(int total){ 120 | HRR(cudaMalloc((void **) &counter,sizeof(int)*2)); 121 | HRR(cudaMalloc((void **) &max,sizeof(int)*max_NL)); 122 | HRR(cudaMalloc((void **) &pre_counter,sizeof(int)*2)); 123 | HRR(cudaMalloc((void **) &colcount,sizeof(int)*50)); 124 | HRR(cudaMalloc((void **) &total,sizeof(int)*total)); 125 | } 126 | }; 127 | 128 | class Cp{ 129 | /* 130 | Cache probability for each vertex in the graph. 131 | */ 132 | public: 133 | int *status; 134 | float *probability; 135 | int *counter; 136 | ~Cp(){}; 137 | Cp(){}; 138 | Cp(int len){ 139 | // HRR(cudaMalloc((void **) &status,sizeof(int)*len)); 140 | // HRR(cudaMalloc((void **) &probability,sizeof(float)*len)); 141 | HRR(cudaMalloc((void **) &counter,sizeof(int)*2)); 142 | } 143 | }; 144 | 145 | 146 | class Sampling{ 147 | /* 148 | Collection of objects for sampling 149 | */ 150 | public: 151 | Cd candidate; 152 | Si samples[20100]; 153 | Ht hashtable[20100]; 154 | Co count; 155 | Wv wvar[2000]; 156 | Dimnesion front[4000]; 157 | Cp cache; 158 | int *max,*sampled_count,*frontier_degree; 159 | int n_child=1; 160 | int DEPTH_LIMIT; 161 | int BUCKETS=32; 162 | int max_NL=90000; // Update this to dynamic allocation 163 | ~Sampling(){}; 164 | Sampling(int edgecount,int warpCount, int qlen, int seeds, int C_len, int sampleSize, int FrontierSize, int depth){ 165 | DEPTH_LIMIT=depth; 166 | count= Co(seeds); 167 | candidate= Cd(seeds*max_NL); 168 | cache= Cp(edgecount); 169 | HRR(cudaMalloc((void **) &max,sizeof(int)*2)); 170 | HRR(cudaMalloc((void **) &frontier_degree,sizeof(int)*sampleSize*FrontierSize)); 171 | HRR(cudaMalloc((void **) &sampled_count,sizeof(int))); 172 | for(int i=0;i -------------------------------------------------------------------------------- /streaming/streaming_sampling.cu: -------------------------------------------------------------------------------- 1 | #include <assert.h> 2 | #include <cuda.h> 3 | #include <curand.h> 4 | #include <curand_kernel.h> 5 | #include <iostream> 6 | #include <math.h> 7 | #include <random> 8 | #include <stdio.h> 9 | #include <stdlib.h> 10 | #include <string.h> 11 | #include "gpu_graph.cuh" 12 | #include "graph.h" 13 | #include "herror.h" 14 | #include "sampler.cuh" 15 | #include "wtime.h" 16 | using namespace std; 17 | 18 | // int RAND_MAX=10000; 19 | int sum(int length, int *a) { 20 | int total = 0; 21 | // std::cout<<"\n size:"<<length<<"\n"; 22 | for (int i = 0; i < length; i++) { 23 | total += a[i]; 24 | } 25 | return total; 26 | } 27 | 49 | __device__ int binary_search(int start, int end, float value, float *arr) { 50 | // printf("low:%d,high:%d,value:%f\n",start,end,value); 51 | int low = start; 52 | int high = end; 53 | int index = start; 54 | while (low <= high) { 55 | index = ((low + high) / 2); 56 | if (value < arr[index]) { 57 | // set high to index-1 58 | high = index - 1; 59 | // printf("high:%d\n",high); 60 | 61 | } else if (value > arr[index]) { 62 | // set low to index+1 63 | low = index + 1; 64 | // printf("low:%d\n",low); 65 | 66 | } else { 67 | break; 68 | } 69 | } 70 | return index; 71 | } 72 | 73 | __device__ int bitmap_binary_search(int start, int end, float value, float *arr, 74 | int *bitmap, int bitmap_start, int &is_in) { 75 | // printf("low:%d,high:%d,value:%f\n",start,end,value); 76 | int low = start; 77 | int high = end; 78 | int index = start; 79 | int bitmap_width = 32; 80 | while (low <= high) { 81 | index = ((low + high) / 2); 82 | if (value <
arr[index]) { 83 | // set high to index-1 84 | high = index - 1; 85 | // printf("high:%d\n",high); 86 | } else if (value > arr[index]) { 87 | // set low to index+1 88 | low = index + 1; 89 | // printf("low:%d\n",low); 90 | } else { 91 | break; 92 | } 93 | } 94 | int bitmap_pos = index; 95 | int bit_block_index = 96 | bitmap_pos / bitmap_width; // find the address of bitmap 97 | int bit_block_pos = bitmap_pos % bitmap_width; // position within an address 98 | // reversed------------ 99 | 100 | // int bit_block_pos = bitmap_pos / bitmap_width; 101 | // int bit_block_index= bitmap_pos % bitmap_width; 102 | int initial_mask = 1; 103 | int mask = (initial_mask << bit_block_pos); 104 | int status = atomicOr(&bitmap[bit_block_index + bitmap_start], mask); 105 | is_in = (mask & status) >> bit_block_pos; 106 | 107 | // is_in= 0x00000001 & (status >> bit_block_pos); 108 | // printf("thread: %d, index:%d, bit_block_index:%d, bit_block_pos:%d, 109 | // mask:%d, status: %d,shift: %d, 110 | // is_in:%d\n",threadIdx.x,index,bit_block_index,bit_block_pos,mask,status,(mask 111 | // & status),is_in); 112 | return index; 113 | } 114 | 115 | void r2() { 116 | std::random_device rd; 117 | std::mt19937 gen(rd()); 118 | std::uniform_real_distribution<> dis(0, 1); 119 | for (int n = 0; n < 10; ++n) { 120 | std::cout << dis(gen) << ' '; 121 | } 122 | } 123 | 124 | __device__ float frandom(curandState *global) { 125 | // curand_init(1000,threadIdx.x,10,&global[threadIdx.x]); 126 | float x = ((curand_uniform(&global[0]))); 127 | return x; 128 | } 129 | 130 | __device__ int linear_search(int neighbor, int *partition1, int *bin_count, 131 | int bin, int BIN_OFFSET, int BIN_START, 132 | int BUCKETS) { 133 | int len = bin_count[bin + BIN_OFFSET]; 134 | 135 | int i = bin + BIN_START; 136 | // printf("\nL: %d, I:%d\n",len,i); 137 | int step = 0; 138 | while (step < len) { 139 | int test = partition1[i]; 140 | // printf("Neighbor: %d, Test: %d, address: %d\n",neighbor,test,i); 141 | if (test == neighbor) { 142 | // printf("Duplicate detected 143 | // -------------------------------------------------------\n"); 144 | return 1; 145 | } else { 146 | i += BUCKETS; 147 | } 148 | step += 1; 149 | } 150 | return 0; 151 | } 152 | 153 | __device__ void gpu_prefix(int total_step, int warp_tid, float *degree_l, 154 | int offset_d_n, int warpsize, int len) { 155 | for (int i = 0; i < total_step; i++) { 156 | // Loop the threads 157 | int req_thread = len / (powf(2, (i + 1))); 158 | for (int iid = warp_tid; iid <= req_thread; iid += warpsize) { 159 | int tid_offset = iid * powf(2, i + 1); 160 | // calculate the index 161 | int i1 = (tid_offset) + (powf(2, i)) - 1 + offset_d_n; 162 | int i2 = (tid_offset) + powf(2, i + 1) - 1 + offset_d_n; 163 | if (i1 > (offset_d_n + len - 1)) { 164 | break; 165 | } 166 | // printf("i:%d, Index1 %d: %f,Index2 %d: %f, 167 | // thread:%d\n",i,i1,degree_l[i1],i2,degree_l[i2],threadIdx.x); 168 | // load the values to shared mem 169 | int temp1 = degree_l[i1]; 170 | int temp2 = degree_l[i2]; 171 | degree_l[i2] = temp2 + temp1; 172 | // printf("Index:%d, Value:%d \n",i2,temp[i2]); 173 | } 174 | } 175 | degree_l[len - 1 + offset_d_n] = 0; 176 | // printf("\nDownstep:%d\n",degree_l[len-1]); 177 | for (int i = (total_step - 1); i >= 0; i--) { 178 | // Loop the threads 179 | int req_thread = len / (powf(2, (i + 1))); 180 | for (int iid = warp_tid; iid <= req_thread; iid += warpsize) { 181 | int tid_offset = iid * powf(2, i + 1); 182 | int i1 = (tid_offset) + (powf(2, i)) - 1 + offset_d_n; 183 | int i2 =
(tid_offset) + powf(2, i + 1) - 1 + offset_d_n; 184 | if (i1 > (offset_d_n + len - 1)) { 185 | break; 186 | } 187 | // printf("temp1: %d, temp2: %d, thread:%d\n",i1,i2,threadIdx.x); 188 | // printf("Index1 %d: %f,Index2 %d: %f, 189 | // thread:%d\n",i1,degree_l[i1],i2,degree_l[i2],threadIdx.x); 190 | int temp1 = degree_l[i1]; 191 | int temp2 = degree_l[i2]; 192 | degree_l[i1] = temp2; 193 | degree_l[i2] = temp2 + temp1; 194 | // printf("Index:%d, Value:%d \n",i2,temp[i2]); 195 | } 196 | } 197 | } 198 | 199 | __global__ void check(int Graph_block_size, int streamid, int block_id, 200 | vertex_t *adj_list, index_t *beg_pos, 201 | weight_t *weight_list, int vertex_count, 202 | curandState *global_state, int *g_node_list, 203 | int *g_edge_list, int *neigh_l, float *degree_l, 204 | int n_blocks, int *d_seed, int n_threads, int *total, 205 | int *hashtable, int *bitmap, int total_subgraphs, 206 | int *node, int *queue, int *sample_id, int *depth_tracker, 207 | int *qstart_global, int *qstop_global, int *g_sub_index, 208 | int n_child, int depth_limit, int sample_size, int queue_size) { 209 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 210 | // int __shared__ q1_start, q2_end, depth, q2_start, q2_stop; 211 | int temp_queue_start = qstart_global[block_id]; 212 | int temp_queue_stop = qstop_global[block_id]; 213 | //-----------------We may require a barrier here for storing temp 214 | //queue---------------------// 215 | int __shared__ bin_count[128]; 216 | int warp_tid = threadIdx.x % 32; 217 | int G_warpID = tid / 32; 218 | int warpId = threadIdx.x / 32; 219 | int warpsize = 32; 220 | int offset_d_n = G_warpID * 4000; 221 | int BUCKETS = 32; 222 | int BINsize = BUCKETS * 6; 223 | int bitmap_size = 100; 224 | int Graph_block = 4; 225 | float prefix_time, local_d_time, global_d_time; 226 | clock_t start_time, stop_time; 227 | int __shared__ prefix; 228 | int seed_index; 229 | int BIN_OFFSET = 0; 230 | int depthcount, edges_traversed, q_stop, vertex, total_work; 231 | int q_start; 232 | int queue_start_address = block_id * queue_size; 233 | curandState local_state = global_state[threadIdx.x]; 234 | curand_init( 235 | tid, 0, 0, 236 | &local_state); // sequence created with different seed and same sequence 237 | int depth_flag = 0; 238 | edges_traversed = 0; 239 | // add all items to the combined queue: Number of threads must be greater than 240 | // samples 241 | if ((qstop_global[block_id] - qstart_global[block_id]) != 0) { 242 | if (warp_tid == 0) { 243 | q_start = atomicAdd(&qstart_global[block_id], 1); 244 | } 245 | q_start = __shfl_sync(0xffffffff, q_start, 0); 246 | __syncwarp(); 247 | 248 | while (q_start < qstop_global[block_id]) { 249 | vertex = queue[q_start + queue_start_address]; 250 | //if(warp_tid==0){printf("Block_id:%d, StreamId: %d, G_warpID: %d,SampleID:%d, vertex:%d, q_stop:%d,q_start:%d,depth:%d\n",block_id,streamid,G_warpID,sample_id[q_start+queue_start_address],vertex,qstop_global[block_id],q_start,depth_tracker[q_start+queue_start_address]);} 251 | int neighbor_start = beg_pos[vertex]; 252 | int neighbor_end = beg_pos[vertex + 1]; 253 | int neighbor_length = neighbor_end - neighbor_start; 254 | edges_traversed += neighbor_length; 255 | if (neighbor_length == 0) { 256 | if (warp_tid == 0) { 257 | q_start = atomicAdd(&qstart_global[block_id], 1); 258 | } 259 | q_start = __shfl_sync(0xffffffff, q_start, 0); 260 | __syncwarp(); 261 | continue; 262 | } 263 | int is_in = 0; 264 | int new_neighbor; 265 | int selected = 0; 266 | if (neighbor_length < n_child) { 267 | prefix = 0; 
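// the warp has fewer neighbors than the n_child fanout, so the prefix-sum/ITS path is skipped and neighbors are taken directly in the else branch below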
268 | } else { 269 | prefix = 1; 270 | } 271 | int thread_flag = 0; 272 | if ((warp_tid < n_child) && (warp_tid < neighbor_length)) { 273 | thread_flag = 1; 274 | } 275 | if (prefix) { 276 | // For each neighbor, calculate the degree of its neighbor 277 | int index = offset_d_n + warp_tid; // use block and thread Id for index 278 | for (int i = warp_tid + neighbor_start; i < neighbor_end; 279 | i += warpsize) { 280 | // neighbor ID 281 | int temp = adj_list[i]; 282 | // if((temp>Graph_block_size)& (warp_tid==0)){printf("Reading from 283 | // outside.\n");} degree of neighbor 284 | degree_l[index] = float(beg_pos[temp + 1] - beg_pos[temp]); 285 | // printf("%d has a degree of %f found by 286 | // %d,index:%d\n",temp,degree_l[index],threadIdx.x,index); 287 | index += warpsize; 288 | } 289 | int i_start_neigh = offset_d_n; 290 | int i_end_neigh = i_start_neigh + neighbor_length; 291 | // printf("Starting prefix_sum\n"); 292 | // start_time = clock(); 293 | float bits = log2f(neighbor_length); 294 | int raise = ceilf(bits); 295 | int max_bit = powf(2, raise); 296 | int len = max_bit; 297 | int total_step = log2f(max_bit); 298 | gpu_prefix(total_step, warp_tid, degree_l, offset_d_n, warpsize, len); 299 | float sum = degree_l[neighbor_length - 1 + offset_d_n]; 300 | for (int i = warp_tid + i_start_neigh; i < i_end_neigh; i += warpsize) { 301 | // printf("i:%d, degree:%.2f\n",i,degree_l[i]); 302 | degree_l[i] = degree_l[i] / ((double)sum); 303 | } 304 | // start_time = clock(); 305 | int bitmap_start = G_warpID * bitmap_size; 306 | if (warp_tid < n_child) { 307 | float r = curand_uniform(&local_state); 308 | //------------------------------------Using 309 | //bitmaps---------------------------------------------- 310 | selected = 311 | bitmap_binary_search(i_start_neigh, i_end_neigh, r, degree_l, 312 | bitmap, bitmap_start, is_in); 313 | new_neighbor = adj_list[selected + neighbor_start - offset_d_n]; 314 | // if(is_in==0) {printf("Index: %d, New N: %d, Thread: 315 | // %d\n",selected,new_neighbor,threadIdx.x);} 316 | //-------------------------------------------------------------------------------------------- 317 | } 318 | // Reset Bitmaps 319 | int start = bitmap_start + warp_tid; 320 | int end = bitmap_start + bitmap_size; 321 | for (int i = start; i < end; i += warpsize) { 322 | bitmap[i] = 0; 323 | // printf("Bitmap cleared at %d\n",i); 324 | } 325 | } 326 | else { 327 | if (thread_flag) { 328 | new_neighbor = 329 | adj_list[warp_tid + neighbor_start]; // unwanted thread also may 330 | // get some child but will be 331 | // neglected in next section 332 | } 333 | // printf("New Neighbor: %d, thread: %d\n",new_neighbor,threadIdx.x); 334 | } 335 | /* Use hashtable for detecting duplicates*/ 336 | int BIN_START = sample_id[q_start + queue_start_address] * BINsize; 337 | if (is_in == 0 && thread_flag) { 338 | int bin = new_neighbor % BUCKETS; 339 | is_in = linear_search(new_neighbor, hashtable, bin_count, bin, 340 | BIN_OFFSET, BIN_START, BUCKETS); 341 | // if(is_in==1){printf("Duplicated Found: %d\n",new_neighbor);} 342 | } 343 | //------------------------------------------------------------------- 344 | if (is_in == 0 && thread_flag) { 345 | //------------------------Store in 346 | //hashtable-----------------------------// 347 | int bin = new_neighbor % BUCKETS; 348 | // int index= warpId; 349 | int index = atomicAdd(&bin_count[bin + BIN_OFFSET], 1); 350 | // hashtable[index] = new_neighbor; // stale unbucketed write; the store below matches linear_search's layout 351 | hashtable[index * BUCKETS + bin + BIN_START] = new_neighbor; 352 | int g_sub_start = sample_id[q_start + queue_start_address] * sample_size;
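// each sample instance owns a sample_size-long segment of the global output edge list; g_sub_index[sample] holds its next free slot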
353 | int g_to = atomicAdd(&g_sub_index[sample_id[q_start + queue_start_address]], 1); 354 | //g_node_list[g_to + g_sub_start] = vertex; 355 | //g_edge_list[g_to + g_sub_start] = new_neighbor; 356 | printf("%d,%d,%d,%d\n",vertex,new_neighbor,sample_id[q_start + queue_start_address],depth_tracker[q_start + queue_start_address]); 357 | //Added to sample:752601,328138,20,0,2 358 | // add to the expand queue 359 | if (depth_tracker[q_start + queue_start_address] < depth_limit) { 360 | int new_bin = new_neighbor / Graph_block_size; 361 | int new_queue_start = new_bin * queue_size; 362 | // if(new_bin!=0) 363 | // { printf("Block:%d, Added to block:%d\n",block_id,new_bin);} 364 | int to = atomicAdd(&qstop_global[new_bin], 1); 365 | queue[to + new_queue_start] = new_neighbor; 366 | sample_id[to + new_queue_start] = 367 | sample_id[q_start + queue_start_address]; 368 | depth_tracker[to + new_queue_start] = 369 | depth_tracker[q_start + queue_start_address] + 1; 370 | //printf("Added: %d, to queue at index %d and block %d, local_index: %d, offset: %d, new_d: %d, prev_d: %d\n",new_neighbor,to + new_queue_start,new_bin, to, new_queue_start,depth_tracker[to + new_queue_start], depth_tracker[q_start + queue_start_address]); 371 | } 372 | } 373 | // q_start+=1; 374 | if ((qstart_global[block_id] > qstop_global[block_id])) { 375 | break; 376 | } 377 | if (warp_tid == 0) { 378 | q_start = atomicAdd(&qstart_global[block_id], 1); 379 | } 380 | q_start = __shfl_sync(0xffffffff, q_start, 0); 381 | __syncwarp(); 382 | } 383 | } 384 | } 385 | 386 | int build_histogram(int n_subgraph, int *input, int *frequency, 387 | int block_window_size, int block_size, int vert_count, 388 | int vertex_block_count) { 389 | int max_index = 0, max_value = 0; 390 | for (int i = 0; i < n_subgraph; i++) { 391 | int block = input[i] / block_size; 392 | if (block > vertex_block_count) { 393 | block = vertex_block_count; 394 | } 395 | // cout<<"Value:"<<input[i]<<" Block:"<<block<<"\n"; 396 | frequency[block] += 1; 397 | } 398 | 399 | for (int j = 0; j < vertex_block_count; j++) { 400 | int combined_freq = 0; 401 | for (int k = j; k < j + block_window_size && k <= vertex_block_count; k++) 402 | combined_freq += frequency[k]; 403 | if (combined_freq > max_index) { 404 | max_index = j; 405 | max_value = combined_freq; 406 | } 407 | } 408 | cout << "Max_index:" << max_index << "Max_value:" << max_value << "\n"; 409 | return max_index; 410 | } 411 | 412 | int block_augument(int blocks, int vertex_count, index_t *beg_pos, 413 | int *beg_size_list, int *adj_size_list) { 414 | int block_size = (vertex_count) / blocks; 415 | for (int i = 0; i < (blocks + 1); i += 1) { 416 | int start_block = i * block_size; 417 | if (i == blocks) { 418 | start_block = vertex_count; 419 | } 420 | beg_size_list[i] = start_block; 421 | int start_adj = beg_pos[start_block]; 422 | adj_size_list[i] = start_adj; 423 | } 424 | return 0; 425 | } 426 | 427 | struct arguments Sampler(char beg[100], char csr[100], int n_blocks, 428 | int n_threads, int n_subgraph, int frontier_size, 429 | int neighbor_size, int depth, struct arguments args, 430 | int rank) { 431 | // if(args!=7){std::cout<<"Wrong input\n"; return -1;} 432 | //n_child, depth, each_subgraph, queue_size 433 | // cout<<"\nblocks:"< 457 | graph<long, long, long, vertex_t, index_t, weight_t> *ginst = 458 | new graph<long, long, long, vertex_t, index_t, weight_t>( 459 | beg_file, csr_file, weight_file); 460 | int vertex_count = ginst->vert_count; 461 | int edge_count = ginst->edge_count; 462 | int Graph_block_size = vertex_count / Graph_block; 463 | // int Graph_block_size=2000; 464 | /* 465 | printf("Size of blocks\n"); 466 | for (int i = 0; i < 4; i++) { 467 | printf("%d,%d\n", i, 468 | ginst->beg_pos[(i + 1) * Graph_block_size] - 469 | ginst->beg_pos[(i)*Graph_block_size]); 470 | } 471 | */ 472 | curandState *d_state; 473 | cudaMalloc(&d_state, sizeof(curandState)); 474 | gpu_graph ggraph(ginst); 475 | int *node_list = (int *)malloc(sizeof(int) *
total_length); 476 | int *set_list = (int *)malloc(sizeof(int) * total_length); 477 | float *n_random = (float *)malloc(sizeof(float) * n_threads); 478 | int *seeds = (int *)malloc(sizeof(int) * total_queue_memory); 479 | int *seeds_counter = (int *)malloc(sizeof(int) * Graph_block); 480 | int *start_queue = (int *)malloc(sizeof(int) * Graph_block); 481 | int *degree_list = (int *)malloc(sizeof(int) * ginst->edge_count); 482 | int *adj_size_list = (int *)malloc(sizeof(int) * (Graph_block + 1)); 483 | int *beg_size_list = (int *)malloc(sizeof(int) * (Graph_block + 1)); 484 | for (int n = 0; n < Graph_block; n++) { 485 | seeds_counter[n] = 0; 486 | start_queue[n] = 0; 487 | } 488 | std::random_device rd; 489 | // 200 --> 370 Mteps 490 | int numBlocks; 491 | // cudaGetDevice(&device); 492 | // cudaGetDeviceProperties(&prop, device); 493 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, check, n_threads, 494 | 0); 495 | 496 | int deviceCount; 497 | HRR(cudaGetDeviceCount(&deviceCount)); 498 | // printf("My rank: %d, totaldevice: %d\n", rank,deviceCount); 499 | // HRR(cudaSetDevice(rank%deviceCount)); 500 | // cout<<"Max allocatable Blocks:"<<numBlocks<<"\n"; 508 | for (int i = 0; i < (ginst->edge_count); i++) { 509 | int neighbor = ginst->adj_list[i]; 510 | degree_list[i] = ginst->beg_pos[neighbor + 1] - ginst->beg_pos[neighbor]; 511 | } 512 | int *hashtable, *bitmap, *node, *queue, *qstop_global, *qstart_global, 513 | *sample_id, *depth_tracker, *g_sub_index, *degree_l, *prefix_status; 514 | // Size of blocks 515 | HRR(cudaMalloc((void **)&d_total, sizeof(int) * n_subgraph)); 516 | HRR(cudaMalloc((void **)&node, sizeof(int) * 2)); 517 | HRR(cudaMalloc((void **)&degree_l, sizeof(int) * ginst->edge_count)); 518 | HRR(cudaMalloc((void **)&prefix_status, sizeof(int) * ginst->edge_count)); 519 | // HRR(cudaMalloc((void **)&d_degree_l, sizeof(float) * ginst->edge_count)); // superseded by the per-warp allocation below 520 | HRR(cudaMalloc((void **)&qstart_global, sizeof(int) * Graph_block)); 521 | HRR(cudaMalloc((void **)&qstop_global, sizeof(int) * Graph_block)); 522 | HRR(cudaMalloc((void **)&d_node_list, sizeof(int) * total_length)); 523 | HRR(cudaMalloc((void **)&d_edge_list, sizeof(int) * total_length)); 524 | HRR(cudaMalloc((void **)&d_neigh_l, sizeof(int) * neighbor_length_max)); 525 | HRR(cudaMalloc((void **)&hashtable, sizeof(int) * total_mem_for_hash)); 526 | HRR(cudaMalloc((void **)&bitmap, sizeof(int) * total_mem_for_bitmap)); 527 | HRR(cudaMalloc((void **)&d_degree_l, sizeof(float) * neighbor_length_max)); 528 | HRR(cudaMalloc((void **)&queue, sizeof(int) * total_queue_memory)); 529 | HRR(cudaMalloc((void **)&sample_id, sizeof(int) * total_queue_memory)); 530 | HRR(cudaMalloc((void **)&depth_tracker, sizeof(int) * total_queue_memory)); 531 | HRR(cudaMalloc((void **)&g_sub_index, sizeof(int) * total_queue_memory)); 532 | int *h_sample_id = (int *)malloc(sizeof(int) * total_queue_memory); 533 | int *h_depth_tracker = (int *)malloc(sizeof(int) * total_queue_memory); 534 | std::mt19937 gen(57); 535 | std::uniform_int_distribution<> dis(1, vertex_count / 4); 536 | 537 | for (int n = 0; n < n_subgraph; n++) { 538 | int new_seed = dis(gen); 539 | int bin_new = new_seed / Graph_block_size; 540 | if (bin_new >= Graph_block) { 541 | bin_new = Graph_block - 1; 542 | } 543 | int pos = bin_new * (queue_size) + seeds_counter[bin_new]; 544 | assert(pos < total_queue_memory); 545 | seeds_counter[bin_new]++; 546 | seeds[pos] = new_seed; 547 | h_sample_id[pos] = n; 548 | h_depth_tracker[pos] = 0; 549 | // printf("N_subgraph: %d, Seed:%d, Bin:%d\n",n,new_seed,bin_new); 550 | } 551 | /* For streaming
partition */ 552 | 553 | HRR(cudaMemcpy(queue, seeds, sizeof(int) * total_queue_memory, 554 | cudaMemcpyHostToDevice)); 555 | HRR(cudaMemcpy(qstart_global, start_queue, sizeof(int) * Graph_block, 556 | cudaMemcpyHostToDevice)); 557 | HRR(cudaMemcpy(qstop_global, seeds_counter, sizeof(int) * Graph_block, 558 | cudaMemcpyHostToDevice)); 559 | HRR(cudaMemcpy(sample_id, h_sample_id, sizeof(int) * total_queue_memory, 560 | cudaMemcpyHostToDevice)); 561 | HRR(cudaMemcpy(depth_tracker, h_depth_tracker, 562 | sizeof(int) * total_queue_memory, cudaMemcpyHostToDevice)); 563 | // create four cuda streams 564 | 565 | cudaStream_t stream1, stream2, stream3, stream4; 566 | cudaStreamCreate(&stream1); 567 | cudaStreamCreate(&stream2); 568 | cudaStreamCreate(&stream3); 569 | cudaStreamCreate(&stream4); 570 | cudaEvent_t event; 571 | cudaEventCreate(&event); 572 | // find top 3 blocks 573 | int sampling_complete = false; 574 | int i = 0, block_id1 = 0, block_id2 = 1, block_id3 = 2, block_id4 = 3; 575 | 576 | int q_count, max, value; 577 | block_augument(Graph_block, vertex_count, ginst->beg_pos, beg_size_list, 578 | adj_size_list); 579 | int *block_active = (int *)malloc(sizeof(int) * (Graph_block)); 580 | int *frontiers_count = (int *)malloc(sizeof(int) * (Graph_block)); 581 | 582 | for (int j = 0; j < Graph_block; j++) { 583 | frontiers_count[j] = seeds_counter[j] - start_queue[j]; 584 | // printf("Value: %d, j: %d,Q_count:\n",frontiers_count[j],j); 585 | if (frontiers_count[j] == 0) { 586 | block_active[j] = 0; 587 | } else { 588 | block_active[j] = 1; 589 | } 590 | } 591 | printf("\nsource, destination, sample_id, depth\n"); 592 | // display(block_active,Graph_block); 593 | // block[1]=1; 594 | // block[2]=1; 595 | // printf("Start while loop.\n"); 596 | double time_start = wtime(); 597 | while (sampling_complete == false) { 598 | // display(block_active,Graph_block); 599 | if (1) { 600 | H_ERR(cudaMemcpyAsync(&ggraph.adj_list[adj_size_list[block_id1]], 601 | &ginst->adj_list[adj_size_list[block_id1]], 602 | (adj_size_list[block_id2] - adj_size_list[block_id1]) * sizeof(vertex_t), 603 | cudaMemcpyHostToDevice, stream1)); 604 | H_ERR(cudaMemcpyAsync(&ggraph.beg_pos[beg_size_list[block_id1]], 605 | &ginst->beg_pos[beg_size_list[block_id1]], 606 | (beg_size_list[block_id2] - beg_size_list[block_id1]) * sizeof(index_t), 607 | cudaMemcpyHostToDevice, stream1)); 608 | 609 | check<<<n_blocks, n_threads, 0, stream1>>>( 610 | Graph_block_size, 0, block_id1, ggraph.adj_list, ggraph.beg_pos, 611 | ggraph.weight_list, ggraph.vert_count, d_state, d_node_list, 612 | d_edge_list, d_neigh_l, d_degree_l, n_blocks, d_seed, n_threads, 613 | d_total, hashtable, bitmap, n_subgraph, node, queue, sample_id, 614 | depth_tracker, qstart_global, qstop_global, g_sub_index, 615 | n_child, depth, each_subgraph, queue_size); 616 | } 617 | 618 | if (block_active[1]) { 619 | H_ERR(cudaMemcpyAsync(&ggraph.adj_list[adj_size_list[block_id2]], 620 | &ginst->adj_list[adj_size_list[block_id2]], 621 | (adj_size_list[block_id3] - adj_size_list[block_id2]) * sizeof(vertex_t), 622 | cudaMemcpyHostToDevice, stream2)); 623 | H_ERR(cudaMemcpyAsync(&ggraph.beg_pos[beg_size_list[block_id2]], 624 | &ginst->beg_pos[beg_size_list[block_id2]], 625 | (beg_size_list[block_id3] - beg_size_list[block_id2]) * sizeof(index_t), 626 | cudaMemcpyHostToDevice, stream2)); 627 | check<<<n_blocks, n_threads, 0, stream2>>>( 628 | Graph_block_size, 1, block_id2, ggraph.adj_list, ggraph.beg_pos, 629 | ggraph.weight_list, ggraph.vert_count, d_state, d_node_list, 630 | d_edge_list, d_neigh_l, d_degree_l, n_blocks, d_seed, n_threads, 631 | d_total, hashtable, bitmap, n_subgraph, node, queue, sample_id, 632 |
depth_tracker, qstart_global, qstop_global, g_sub_index, 633 | n_child, depth, each_subgraph, queue_size); 634 | } 635 | 636 | if (block_active[2]) { 637 | H_ERR(cudaMemcpyAsync(&ggraph.adj_list[adj_size_list[block_id3]], 638 | &ginst->adj_list[adj_size_list[block_id3]], 639 | (adj_size_list[block_id4] - adj_size_list[block_id3]) * sizeof(vertex_t), 640 | cudaMemcpyHostToDevice, stream3)); 641 | H_ERR(cudaMemcpyAsync(&ggraph.beg_pos[beg_size_list[block_id3]], 642 | &ginst->beg_pos[beg_size_list[block_id3]], 643 | (beg_size_list[block_id4] - beg_size_list[block_id3]) * sizeof(index_t), 644 | cudaMemcpyHostToDevice, stream3)); 645 | check<<<n_blocks, n_threads, 0, stream3>>>( 646 | Graph_block_size, 2, block_id3, ggraph.adj_list, ggraph.beg_pos, 647 | ggraph.weight_list, ggraph.vert_count, d_state, d_node_list, 648 | d_edge_list, d_neigh_l, d_degree_l, n_blocks, d_seed, n_threads, 649 | d_total, hashtable, bitmap, n_subgraph, node, queue, sample_id, 650 | depth_tracker, qstart_global, qstop_global, g_sub_index, 651 | n_child, depth, each_subgraph, queue_size); 652 | } 653 | 654 | if (block_active[3]) { 655 | H_ERR(cudaMemcpyAsync(&ggraph.adj_list[adj_size_list[block_id4]], 656 | &ginst->adj_list[adj_size_list[block_id4]], 657 | (adj_size_list[4] - adj_size_list[block_id4]) * sizeof(vertex_t), 658 | cudaMemcpyHostToDevice, stream4)); 659 | H_ERR(cudaMemcpyAsync(&ggraph.beg_pos[beg_size_list[block_id4]], 660 | &ginst->beg_pos[beg_size_list[block_id4]], 661 | (beg_size_list[4] - beg_size_list[block_id4]) * sizeof(index_t), 662 | cudaMemcpyHostToDevice, stream4)); 663 | check<<<n_blocks, n_threads, 0, stream4>>>( 664 | Graph_block_size, 3, block_id4, ggraph.adj_list, ggraph.beg_pos, 665 | ggraph.weight_list, ggraph.vert_count, d_state, d_node_list, 666 | d_edge_list, d_neigh_l, d_degree_l, n_blocks, d_seed, n_threads, 667 | d_total, hashtable, bitmap, n_subgraph, node, queue, sample_id, 668 | depth_tracker, qstart_global, qstop_global, g_sub_index, 669 | n_child, depth, each_subgraph, queue_size); 670 | } 671 | // wait for completion 672 | // find new top 3 blocks 673 | int status1 = cudaStreamQuery(stream1); 674 | // cout<<"Status1: "<\n"); 725 | args.sampled_edges = counted; 726 | args.time = cmp_time; 727 | return args; 728 | } 729 | 730 | // void blocks_allocator(int n_blocks,int *Block, ) 731 | -------------------------------------------------------------------------------- /streaming/util.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_H 2 | #define UTIL_H 3 | 4 | #include <stdio.h> 5 | static void HandleError( cudaError_t err, 6 | const char *file, 7 | int line ) { 8 | if (err != cudaSuccess) { 9 | printf( "%s in %s at line %d\n", \ 10 | cudaGetErrorString( err ), 11 | file, line ); 12 | exit( EXIT_FAILURE ); 13 | } 14 | } 15 | #define H_ERR( err ) \ 16 | (HandleError( err, __FILE__, __LINE__ )) 17 | 18 | 19 | #define SML_MID 32 20 | #define MID_LRG 1024 21 | //#define SWITCH_TO (float)10.3 22 | #define SWITCH_TO (float)0.2 23 | #define SWITCH_BACK (float)0.4 24 | //#define SWITCH_BACK (float)0.3 25 | 26 | //#define SML_MID 0 27 | //#define MID_LRG 6553600 28 | 29 | //#define SML_MID 0 30 | //#define MID_LRG 0 31 | #define GPUID 0 32 | #define THDS_NUM 512 33 | #define BLKS_NUM 512 34 | //#define BLKS_NUM 96 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /streaming/wtime.h: -------------------------------------------------------------------------------- 1 | #ifndef __TIME_H__ 2 | #define __TIME_H__ 3 | 4 | #include <sys/time.h> 5 | #include <stdlib.h> 6 | 7 | double wtime() 8 | { 9 | double time[2]; 10 | struct timeval time1; 11 | gettimeofday(&time1, NULL);
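// seconds and microseconds are combined below into a single double-precision timestamp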
12 | 13 | time[0]=time1.tv_sec; 14 | time[1]=time1.tv_usec; 15 | 16 | return time[0]+time[1]*1.0e-6; 17 | } 18 | 19 | #endif 20 | --------------------------------------------------------------------------------
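Taken together, gpu_prefix and binary_search in streaming_sampling.cu implement inverse transform sampling: the kernel turns a vertex's neighbor degrees into a normalized prefix sum (a CDF) and binary-searches a uniform random draw to pick a neighbor with probability proportional to its degree. A self-contained host-side C++ sketch of that selection step, using hypothetical names (`pick_neighbor`, `cdf`) rather than anything from the repository:

```cpp
#include <cstdio>
#include <random>
#include <vector>

// Host-side sketch of the selection step in the check kernel: build an
// exclusive prefix sum of the neighbor degrees (as gpu_prefix does),
// normalize it into a CDF, then binary-search a uniform random draw
// (as binary_search does) -- i.e., inverse transform sampling.
int pick_neighbor(const std::vector<int> &neighbor_degrees, std::mt19937 &gen) {
    int n = (int)neighbor_degrees.size();
    std::vector<float> cdf(n);
    float running = 0.0f;
    for (int i = 0; i < n; i++) { // exclusive prefix sum
        cdf[i] = running;
        running += (float)neighbor_degrees[i];
    }
    for (int i = 0; i < n; i++) cdf[i] /= running; // normalize to [0,1)

    std::uniform_real_distribution<float> dis(0.0f, 1.0f);
    float r = dis(gen);

    int low = 0, high = n - 1, index = 0;
    while (low <= high) { // same loop shape as binary_search
        index = (low + high) / 2;
        if (r < cdf[index]) high = index - 1;
        else if (r > cdf[index]) low = index + 1;
        else break;
    }
    if (index > 0 && cdf[index] > r) index--; // last entry <= r
    return index;
}

int main() {
    std::mt19937 gen(57); // same seed as the host-side generator in Sampler
    std::vector<int> degrees = {3, 1, 8, 2}; // toy neighbor degrees
    for (int t = 0; t < 5; t++)
        printf("picked neighbor %d\n", pick_neighbor(degrees, gen));
    return 0;
}
```

Degree-proportional selection falls out of the normalized prefix sum directly: neighbor i owns the CDF interval of width degree[i]/sum, so a uniform draw lands in it with exactly that probability.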