├── .gitignore ├── BarnesHutParticleSystem.cpp ├── BarnesHutParticleSystem.h ├── LICENSE ├── Lock.cuh ├── Makefile ├── ParticleSystem.cpp ├── ParticleSystem.h ├── Particle_cuda.cu ├── Particle_cuda.cuh ├── README.md ├── SimulationParameters.h ├── StellarSolverVisualizer.cpp ├── StellarSolverVisualizer.h ├── debug.cpp ├── debug.h ├── kernels.cu ├── kernels.cuh └── main.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | -------------------------------------------------------------------------------- /BarnesHutParticleSystem.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "debug.h" 7 | #include "BarnesHutParticleSystem.h" 8 | #include "Particle_cuda.cuh" 9 | 10 | #include 11 | #include 12 | // ========================================================================================== 13 | // CUDA ERROR CHECKING CODE 14 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 15 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 16 | { 17 | if (code != cudaSuccess) 18 | { 19 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 20 | if (abort) getchar(); 21 | } 22 | } 23 | 24 | // ========================================================================================== 25 | 26 | 27 | 28 | BarnesHutParticleSystem::BarnesHutParticleSystem(const SimulationParameters p, const int n) 29 | { 30 | parameters = p; 31 | step = 0; 32 | 
numParticles = n; 33 | numNodes = 2*n+12000; 34 | 35 | // allocate host data 36 | h_left = new float; 37 | h_right = new float; 38 | h_bottom = new float; 39 | h_top = new float; 40 | h_mass = new float[numNodes]; 41 | h_x = new float[numNodes]; 42 | h_y = new float[numNodes]; 43 | h_vx = new float[numNodes]; 44 | h_vy = new float[numNodes]; 45 | h_ax = new float[numNodes]; 46 | h_ay = new float[numNodes]; 47 | h_child = new int[4*numNodes]; 48 | h_start = new int[numNodes]; 49 | h_sorted = new int[numNodes]; 50 | h_count = new int[numNodes]; 51 | h_output = new float[2*numNodes]; 52 | 53 | // allocate device data 54 | gpuErrchk(cudaMalloc((void**)&d_left, sizeof(float))); 55 | gpuErrchk(cudaMalloc((void**)&d_right, sizeof(float))); 56 | gpuErrchk(cudaMalloc((void**)&d_bottom, sizeof(float))); 57 | gpuErrchk(cudaMalloc((void**)&d_top, sizeof(float))); 58 | gpuErrchk(cudaMemset(d_left, 0, sizeof(float))); 59 | gpuErrchk(cudaMemset(d_right, 0, sizeof(float))); 60 | gpuErrchk(cudaMemset(d_bottom, 0, sizeof(float))); 61 | gpuErrchk(cudaMemset(d_top, 0, sizeof(float))); 62 | 63 | gpuErrchk(cudaMalloc((void**)&d_mass, numNodes*sizeof(float))); 64 | gpuErrchk(cudaMalloc((void**)&d_x, numNodes*sizeof(float))); 65 | gpuErrchk(cudaMalloc((void**)&d_y, numNodes*sizeof(float))); 66 | gpuErrchk(cudaMalloc((void**)&d_vx, numNodes*sizeof(float))); 67 | gpuErrchk(cudaMalloc((void**)&d_vy, numNodes*sizeof(float))); 68 | gpuErrchk(cudaMalloc((void**)&d_ax, numNodes*sizeof(float))); 69 | gpuErrchk(cudaMalloc((void**)&d_ay, numNodes*sizeof(float))); 70 | 71 | gpuErrchk(cudaMalloc((void**)&d_index, sizeof(int))); 72 | gpuErrchk(cudaMalloc((void**)&d_child, 4*numNodes*sizeof(int))); 73 | gpuErrchk(cudaMalloc((void**)&d_start, numNodes*sizeof(int))); 74 | gpuErrchk(cudaMalloc((void**)&d_sorted, numNodes*sizeof(int))); 75 | gpuErrchk(cudaMalloc((void**)&d_count, numNodes*sizeof(int))); 76 | gpuErrchk(cudaMalloc((void**)&d_mutex, sizeof(int))); 77 | 78 | gpuErrchk(cudaMemset(d_start, -1, 
numNodes*sizeof(int))); 79 | gpuErrchk(cudaMemset(d_sorted, 0, numNodes*sizeof(int))); 80 | 81 | int memSize = sizeof(float) * 2 * numParticles; 82 | 83 | gpuErrchk(cudaMalloc((void**)&d_output, 2*numNodes*sizeof(float))); 84 | } 85 | 86 | 87 | BarnesHutParticleSystem::BarnesHutParticleSystem(const BarnesHutParticleSystem &system) 88 | { 89 | parameters = system.parameters; 90 | step = system.step; 91 | numParticles = system.numParticles; 92 | numNodes = system.numNodes; 93 | 94 | // allocate host data 95 | h_left = new float; 96 | h_right = new float; 97 | h_bottom = new float; 98 | h_top = new float; 99 | h_mass = new float[numNodes]; 100 | h_x = new float[numNodes]; 101 | h_y = new float[numNodes]; 102 | h_vx = new float[numNodes]; 103 | h_vy = new float[numNodes]; 104 | h_ax = new float[numNodes]; 105 | h_ay = new float[numNodes]; 106 | h_child = new int[4*numNodes]; 107 | h_start = new int[numNodes]; 108 | h_sorted = new int[numNodes]; 109 | h_count = new int[numNodes]; 110 | h_output = new float[2*numNodes]; 111 | 112 | // allocate device data 113 | gpuErrchk(cudaMalloc((void**)&d_left, sizeof(float))); 114 | gpuErrchk(cudaMalloc((void**)&d_right, sizeof(float))); 115 | gpuErrchk(cudaMalloc((void**)&d_bottom, sizeof(float))); 116 | gpuErrchk(cudaMalloc((void**)&d_top, sizeof(float))); 117 | gpuErrchk(cudaMemset(d_left, 0, sizeof(float))); 118 | gpuErrchk(cudaMemset(d_right, 0, sizeof(float))); 119 | gpuErrchk(cudaMemset(d_bottom, 0, sizeof(float))); 120 | gpuErrchk(cudaMemset(d_top, 0, sizeof(float))); 121 | 122 | gpuErrchk(cudaMalloc((void**)&d_mass, numNodes*sizeof(float))); 123 | gpuErrchk(cudaMalloc((void**)&d_x, numNodes*sizeof(float))); 124 | gpuErrchk(cudaMalloc((void**)&d_y, numNodes*sizeof(float))); 125 | gpuErrchk(cudaMalloc((void**)&d_vx, numNodes*sizeof(float))); 126 | gpuErrchk(cudaMalloc((void**)&d_vy, numNodes*sizeof(float))); 127 | gpuErrchk(cudaMalloc((void**)&d_ax, numNodes*sizeof(float))); 128 | gpuErrchk(cudaMalloc((void**)&d_ay, 
numNodes*sizeof(float))); 129 | 130 | gpuErrchk(cudaMalloc((void**)&d_index, sizeof(int))); 131 | gpuErrchk(cudaMalloc((void**)&d_child, 4*numNodes*sizeof(int))); 132 | gpuErrchk(cudaMalloc((void**)&d_start, numNodes*sizeof(int))); 133 | gpuErrchk(cudaMalloc((void**)&d_sorted, numNodes*sizeof(int))); 134 | gpuErrchk(cudaMalloc((void**)&d_count, numNodes*sizeof(int))); 135 | gpuErrchk(cudaMalloc((void**)&d_mutex, sizeof(int))); 136 | 137 | gpuErrchk(cudaMemset(d_start, -1, numNodes*sizeof(int))); 138 | gpuErrchk(cudaMemset(d_sorted, 0, numNodes*sizeof(int))); 139 | 140 | int memSize = sizeof(float) * 2 * numParticles; 141 | 142 | gpuErrchk(cudaMalloc((void**)&d_output, 2*numNodes*sizeof(float))); 143 | } 144 | 145 | 146 | BarnesHutParticleSystem& BarnesHutParticleSystem::operator=(const BarnesHutParticleSystem &system) 147 | { 148 | if(this != &system){ 149 | delete h_left; 150 | delete h_right; 151 | delete h_bottom; 152 | delete h_top; 153 | delete [] h_mass; 154 | delete [] h_x; 155 | delete [] h_y; 156 | delete [] h_vx; 157 | delete [] h_vy; 158 | delete [] h_ax; 159 | delete [] h_ay; 160 | delete [] h_child; 161 | delete [] h_start; 162 | delete [] h_sorted; 163 | delete [] h_count; 164 | delete [] h_output; 165 | 166 | gpuErrchk(cudaFree(d_left)); 167 | gpuErrchk(cudaFree(d_right)); 168 | gpuErrchk(cudaFree(d_bottom)); 169 | gpuErrchk(cudaFree(d_top)); 170 | 171 | gpuErrchk(cudaFree(d_mass)); 172 | gpuErrchk(cudaFree(d_x)); 173 | gpuErrchk(cudaFree(d_y)); 174 | gpuErrchk(cudaFree(d_vx)); 175 | gpuErrchk(cudaFree(d_vy)); 176 | gpuErrchk(cudaFree(d_ax)); 177 | gpuErrchk(cudaFree(d_ay)); 178 | 179 | gpuErrchk(cudaFree(d_index)); 180 | gpuErrchk(cudaFree(d_child)); 181 | gpuErrchk(cudaFree(d_start)); 182 | gpuErrchk(cudaFree(d_sorted)); 183 | gpuErrchk(cudaFree(d_count)); 184 | 185 | gpuErrchk(cudaFree(d_mutex)); 186 | 187 | gpuErrchk(cudaFree(d_output)); 188 | 189 | parameters = system.parameters; 190 | step = system.step; 191 | numParticles = 
system.numParticles; 192 | numNodes = system.numNodes; 193 | 194 | // allocate host data 195 | h_left = new float; 196 | h_right = new float; 197 | h_bottom = new float; 198 | h_top = new float; 199 | h_mass = new float[numNodes]; 200 | h_x = new float[numNodes]; 201 | h_y = new float[numNodes]; 202 | h_vx = new float[numNodes]; 203 | h_vy = new float[numNodes]; 204 | h_ax = new float[numNodes]; 205 | h_ay = new float[numNodes]; 206 | h_child = new int[4*numNodes]; 207 | h_start = new int[numNodes]; 208 | h_sorted = new int[numNodes]; 209 | h_count = new int[numNodes]; 210 | h_output = new float[2*numNodes]; 211 | 212 | // allocate device data 213 | gpuErrchk(cudaMalloc((void**)&d_left, sizeof(float))); 214 | gpuErrchk(cudaMalloc((void**)&d_right, sizeof(float))); 215 | gpuErrchk(cudaMalloc((void**)&d_bottom, sizeof(float))); 216 | gpuErrchk(cudaMalloc((void**)&d_top, sizeof(float))); 217 | gpuErrchk(cudaMemset(d_left, 0, sizeof(float))); 218 | gpuErrchk(cudaMemset(d_right, 0, sizeof(float))); 219 | gpuErrchk(cudaMemset(d_bottom, 0, sizeof(float))); 220 | gpuErrchk(cudaMemset(d_top, 0, sizeof(float))); 221 | 222 | gpuErrchk(cudaMalloc((void**)&d_mass, numNodes*sizeof(float))); 223 | gpuErrchk(cudaMalloc((void**)&d_x, numNodes*sizeof(float))); 224 | gpuErrchk(cudaMalloc((void**)&d_y, numNodes*sizeof(float))); 225 | gpuErrchk(cudaMalloc((void**)&d_vx, numNodes*sizeof(float))); 226 | gpuErrchk(cudaMalloc((void**)&d_vy, numNodes*sizeof(float))); 227 | gpuErrchk(cudaMalloc((void**)&d_ax, numNodes*sizeof(float))); 228 | gpuErrchk(cudaMalloc((void**)&d_ay, numNodes*sizeof(float))); 229 | 230 | gpuErrchk(cudaMalloc((void**)&d_index, sizeof(int))); 231 | gpuErrchk(cudaMalloc((void**)&d_child, 4*numNodes*sizeof(int))); 232 | gpuErrchk(cudaMalloc((void**)&d_start, numNodes*sizeof(int))); 233 | gpuErrchk(cudaMalloc((void**)&d_sorted, numNodes*sizeof(int))); 234 | gpuErrchk(cudaMalloc((void**)&d_count, numNodes*sizeof(int))); 235 | gpuErrchk(cudaMalloc((void**)&d_mutex, 
sizeof(int))); 236 | 237 | gpuErrchk(cudaMemset(d_start, -1, numNodes*sizeof(int))); 238 | gpuErrchk(cudaMemset(d_sorted, 0, numNodes*sizeof(int))); 239 | 240 | int memSize = sizeof(float) * 2 * numParticles; 241 | 242 | gpuErrchk(cudaMalloc((void**)&d_output, 2*numNodes*sizeof(float))); 243 | } 244 | 245 | return *this; 246 | } 247 | 248 | 249 | BarnesHutParticleSystem::~BarnesHutParticleSystem() 250 | { 251 | delete h_left; 252 | delete h_right; 253 | delete h_bottom; 254 | delete h_top; 255 | delete [] h_mass; 256 | delete [] h_x; 257 | delete [] h_y; 258 | delete [] h_vx; 259 | delete [] h_vy; 260 | delete [] h_ax; 261 | delete [] h_ay; 262 | delete [] h_child; 263 | delete [] h_start; 264 | delete [] h_sorted; 265 | delete [] h_count; 266 | delete [] h_output; 267 | 268 | gpuErrchk(cudaFree(d_left)); 269 | gpuErrchk(cudaFree(d_right)); 270 | gpuErrchk(cudaFree(d_bottom)); 271 | gpuErrchk(cudaFree(d_top)); 272 | 273 | gpuErrchk(cudaFree(d_mass)); 274 | gpuErrchk(cudaFree(d_x)); 275 | gpuErrchk(cudaFree(d_y)); 276 | gpuErrchk(cudaFree(d_vx)); 277 | gpuErrchk(cudaFree(d_vy)); 278 | gpuErrchk(cudaFree(d_ax)); 279 | gpuErrchk(cudaFree(d_ay)); 280 | 281 | gpuErrchk(cudaFree(d_index)); 282 | gpuErrchk(cudaFree(d_child)); 283 | gpuErrchk(cudaFree(d_start)); 284 | gpuErrchk(cudaFree(d_sorted)); 285 | gpuErrchk(cudaFree(d_count)); 286 | 287 | gpuErrchk(cudaFree(d_mutex)); 288 | 289 | gpuErrchk(cudaFree(d_output)); 290 | 291 | cudaDeviceSynchronize(); 292 | } 293 | 294 | 295 | int BarnesHutParticleSystem::getNumParticles() 296 | { 297 | return numParticles; 298 | } 299 | 300 | 301 | void BarnesHutParticleSystem::update() 302 | { 303 | float elapsedTime; 304 | cudaEventCreate(&start); 305 | cudaEventCreate(&stop); 306 | cudaEventRecord(start,0); 307 | 308 | ResetArrays(d_mutex, d_x, d_y, d_mass, d_count, d_start, d_sorted, d_child, d_index, d_left, d_right, d_bottom, d_top, numParticles, numNodes); 309 | ComputeBoundingBox(d_mutex, d_x, d_y, d_left, d_right, d_bottom, 
d_top, numParticles); 310 | BuildQuadTree(d_x, d_y, d_mass, d_count, d_start, d_child, d_index, d_left, d_right, d_bottom, d_top, numParticles, numNodes); 311 | ComputeCentreOfMass(d_x, d_y, d_mass, d_index, numParticles); 312 | SortParticles(d_count, d_start, d_sorted, d_child, d_index, numParticles); 313 | CalculateForces(d_x, d_y, d_vx, d_vy, d_ax, d_ay, d_mass, d_sorted, d_child, d_left, d_right, numParticles, parameters.gravity); 314 | IntegrateParticles(d_x, d_y, d_vx, d_vy, d_ax, d_ay, numParticles, parameters.timestep, parameters.dampening); 315 | FillOutputArray(d_x, d_y, d_output, numNodes); 316 | 317 | cudaEventRecord(stop,0); 318 | cudaEventSynchronize(stop); 319 | cudaEventElapsedTime(&elapsedTime, start, stop); 320 | cudaEventDestroy(start); 321 | cudaEventDestroy(stop); 322 | 323 | if(parameters.benchmark == true){ 324 | std::cout<<"Timestep: "< distribution(0, 1.0); 396 | std::uniform_real_distribution distribution2(0, 0.1); 397 | std::uniform_real_distribution distribution_phi(0.0, 2 * pi); 398 | std::uniform_real_distribution distribution_theta(-1.0, 1.0); 399 | 400 | // loop through all particles 401 | for (int i = 0; i < n; i++){ 402 | float phi = distribution_phi(generator); 403 | float theta = acos(distribution_theta(generator)); 404 | float r = a / sqrt(pow(distribution(generator), -0.666666) - 1); 405 | 406 | // set mass and position of particle 407 | mass[i] = 1.0; 408 | x[i] = r*cos(phi); 409 | y[i] = r*sin(phi); 410 | 411 | // set velocity of particle 412 | float s = 0.0; 413 | float t = 0.1; 414 | while(t > s*s*pow(1.0 - s*s, 3.5)){ 415 | s = distribution(generator); 416 | t = distribution2(generator); 417 | } 418 | float v = 100*s*sqrt(2)*pow(1.0 + r*r, -0.25); 419 | phi = distribution_phi(generator); 420 | theta = acos(distribution_theta(generator)); 421 | x_vel[i] = v*cos(phi); 422 | y_vel[i] = v*sin(phi); 423 | 424 | // set acceleration to zero 425 | x_acc[i] = 0.0; 426 | y_acc[i] = 0.0; 427 | } 428 | } 429 | 430 | 431 | 432 | 
//**************************************************************************************** 433 | // Simple disk galaxy 434 | // 435 | // 436 | // 437 | // 438 | //**************************************************************************************** 439 | void BarnesHutParticleSystem::diskModel(float *mass, float *x, float* y, float *x_vel, float *y_vel, float *x_acc, float *y_acc, int n) 440 | { 441 | float a = 1.0; 442 | float pi = 3.14159265; 443 | std::default_random_engine generator; 444 | std::uniform_real_distribution uniform(0.0, 2.0); 445 | std::uniform_real_distribution distribution(1.5, 12.0); 446 | std::uniform_real_distribution distribution_theta(0.0, 2 * pi); 447 | 448 | // loop through all particles 449 | for (int i = 0; i < n; i++){ 450 | float theta = distribution_theta(generator); 451 | float r = distribution(generator); 452 | 453 | // set mass and position of particle 454 | if(i==0){ 455 | mass[i] = 200000; 456 | x[i] = 0; 457 | y[i] = 0; 458 | } 459 | else{ 460 | mass[i] = 1.0; 461 | x[i] = r*cos(theta); 462 | y[i] = r*sin(theta); 463 | } 464 | 465 | 466 | // set velocity of particle 467 | float rotation = -1; // 1: clockwise -1: counter-clockwise 468 | float v = 1.0*sqrt(parameters.gravity*200000.0 / r); 469 | if(i==0){ 470 | x_vel[0] = 0; 471 | y_vel[0] = 0; 472 | } 473 | else{ 474 | x_vel[i] = rotation*v*sin(theta); 475 | y_vel[i] = -rotation*v*cos(theta); 476 | } 477 | 478 | // set acceleration to zero 479 | x_acc[i] = 0.0; 480 | y_acc[i] = 0.0; 481 | } 482 | 483 | } 484 | 485 | 486 | 487 | //**************************************************************************************** 488 | // Two galaxies colliding disk galaxy 489 | // 490 | // 491 | // 492 | // 493 | //**************************************************************************************** 494 | void BarnesHutParticleSystem::collidingDiskModel(float *mass, float *x, float* y, float *x_vel, float *y_vel, float *x_acc, float *y_acc, int n) 495 | { 496 | float a = 1.0; 497 | 
float pi = 3.14159265; 498 | std::default_random_engine generator; 499 | std::uniform_real_distribution distribution1(1.5, 12.0); 500 | std::uniform_real_distribution distribution2(1, 5.0); 501 | std::uniform_real_distribution distribution_theta(0.0, 2 * pi); 502 | 503 | 504 | // loop through all particles 505 | for (int i = 0; i < n; i++){ 506 | float theta = distribution_theta(generator); 507 | float r1 = distribution1(generator); 508 | float r2 = distribution2(generator); 509 | 510 | // set mass and position of particle 511 | if(i==0){ 512 | mass[i] = 100000; 513 | x[i] = 0; 514 | y[i] = 0; 515 | } 516 | else if(i==1){ 517 | mass[i] = 25000; 518 | x[i] = 20*cos(theta); 519 | y[i] = 20*sin(theta); 520 | } 521 | else if(i<=3*n/4){ 522 | mass[i] = 1.0; 523 | x[i] = r1*cos(theta); 524 | y[i] = r1*sin(theta); 525 | } 526 | else{ 527 | mass[i] = 1.0; 528 | x[i] = r2*cos(theta) + x[1]; 529 | y[i] = r2*sin(theta) + y[1]; 530 | } 531 | 532 | 533 | // set velocity of particle 534 | float rotation = 1; // 1: clockwise -1: counter-clockwise 535 | float v1 = 1.0*sqrt(parameters.gravity*100000.0 / r1); 536 | float v2 = 1.0*sqrt(parameters.gravity*25000.0 / r2); 537 | float v = 1.0*sqrt(parameters.gravity*100000.0 / sqrt(800)); 538 | if(i==0){ 539 | x_vel[0] = 0; 540 | y_vel[0] = 0; 541 | } 542 | else if(i==1){ 543 | x_vel[i] = 0.0;//rotation*v*sin(theta); 544 | y_vel[i] = 0.0;//-rotation*v*cos(theta); 545 | } 546 | else if(i<=3*n/4){ 547 | x_vel[i] = rotation*v1*sin(theta); 548 | y_vel[i] = -rotation*v1*cos(theta); 549 | } 550 | else{ 551 | x_vel[i] = rotation*v2*sin(theta); 552 | y_vel[i] = -rotation*v2*cos(theta); 553 | } 554 | 555 | // set acceleration to zero 556 | x_acc[i] = 0.0; 557 | y_acc[i] = 0.0; 558 | } 559 | 560 | } 561 | -------------------------------------------------------------------------------- /BarnesHutParticleSystem.h: -------------------------------------------------------------------------------- 1 | #ifndef __BARNESHUTPARTICLESYSTEM_H__ 2 | #define 
__BARNESHUTPARTICLESYSTEM_H__ 3 | 4 | #include 5 | #include 6 | #include "SimulationParameters.h" 7 | 8 | 9 | class BarnesHutParticleSystem 10 | { 11 | private: 12 | SimulationParameters parameters; 13 | int step; 14 | int numParticles; 15 | int numNodes; 16 | 17 | float *h_left; 18 | float *h_right; 19 | float *h_bottom; 20 | float *h_top; 21 | 22 | float *h_mass; 23 | float *h_x; 24 | float *h_y; 25 | float *h_vx; 26 | float *h_vy; 27 | float *h_ax; 28 | float *h_ay; 29 | 30 | int *h_child; 31 | int *h_start; 32 | int *h_sorted; 33 | int *h_count; 34 | 35 | float *d_left; 36 | float *d_right; 37 | float *d_bottom; 38 | float *d_top; 39 | 40 | float *d_mass; 41 | float *d_x; 42 | float *d_y; 43 | float *d_vx; 44 | float *d_vy; 45 | float *d_ax; 46 | float *d_ay; 47 | 48 | int *d_index; 49 | int *d_child; 50 | int *d_start; 51 | int *d_sorted; 52 | int *d_count; 53 | 54 | int *d_mutex; //used for locking 55 | 56 | cudaEvent_t start, stop; // used for timing 57 | 58 | float *h_output; //host output array for visualization 59 | float *d_output; //device output array for visualization 60 | 61 | public: 62 | BarnesHutParticleSystem(const SimulationParameters p, const int n); 63 | BarnesHutParticleSystem(const BarnesHutParticleSystem &system); 64 | BarnesHutParticleSystem& operator=(const BarnesHutParticleSystem &system); 65 | ~BarnesHutParticleSystem(); 66 | 67 | int getNumParticles(); 68 | void update(); 69 | void reset(); 70 | 71 | const float* getOutputBuffer(); 72 | 73 | private: 74 | void plummerModel(float *mass, float *x, float* y, float *x_vel, float *y_vel, float *x_acc, float *y_acc, int n); 75 | void diskModel(float *mass, float *x, float* y, float *x_vel, float *y_vel, float *x_acc, float *y_acc, int n); 76 | void collidingDiskModel(float *mass, float *x, float* y, float *x_vel, float *y_vel, float *x_acc, float *y_acc, int n); 77 | 78 | }; 79 | 80 | 81 | 82 | #endif 83 | -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Zhiwei Fang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Lock.cuh: --------------------------------------------------------------------------------
#ifndef __LOCK_H__
#define __LOCK_H__


// Spin lock backed by a single int allocated with cudaMalloc (0 = free, 1 = held).
//
// The host constructs the object, which allocates the flag and sets it to 0;
// device code brackets a critical section with lock() / unlock().
//
// NOTE(review): the struct frees the flag in its destructor but keeps the
// compiler-generated copy operations, so every copy shares the same device
// pointer and each copy's destructor calls cudaFree on it. Confirm how callers
// hand the lock to kernels before relying on the destructor.
// NOTE(review): lock() busy-waits; whether threads of the same warp can contend
// for it safely depends on the target architecture -- confirm against the
// -arch the project is built for.
struct Lock{
    int *mutex;

    // Allocates the flag on the device and initialises it to 0 (unlocked).
    // If the allocation fails, mutex is left null so the destructor's cudaFree
    // is a harmless no-op instead of freeing an indeterminate pointer.
    Lock() : mutex(0) {
        int state = 0;
        if (cudaMalloc((void**)&mutex, sizeof(int)) != cudaSuccess){
            mutex = 0;
            return;
        }
        cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
    }

    // Releases the device flag (cudaFree accepts a null pointer).
    ~Lock(){
        cudaFree(mutex);
    }

    // Spins until this thread swaps the flag from 0 to 1.
    __device__ void lock(){
        while (atomicCAS(mutex, 0, 1) != 0);
        // Atomics do not order surrounding memory operations; fence so that
        // reads inside the critical section are issued after the lock is taken.
        __threadfence();
    }

    // Publishes the critical section's writes, then clears the flag.
    __device__ void unlock(){
        // Make every write done while holding the lock visible device-wide
        // before another thread can observe the flag as free.
        __threadfence();
        atomicExch(mutex, 0);
    }
};


#endif
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | objects= main.o StellarSolverVisualizer.o kernels.o BarnesHutParticleSystem.o Particle_cuda.o debug.o 2 | NVCC= nvcc #cuda c compiler 3 | CPP= g++ #c++ compiler 4 | opt= -O2 -g -G #optimization flag 5 | ARCH= -arch=sm_30 #cuda compute capability 6 | #LIBS= -lglut -lGLU -lGL 7 | #LIBS= -lGLU -lGL -lGLEW -lm -lsfml-graphics -lsfml-window -lsfml-system 8 | LIBS= -lGL -lGLEW -lsfml-graphics -lsfml-window -lsfml-system 9 | INCLUDE = -I/home/james/glm 10 | execname= app 11 | 12 | build: $(objects) 13 | $(NVCC) $(opt) -o $(execname) $(objects) $(LIBS) 14 | 15 | 16 | kernels.o: kernels.cu 17 | $(NVCC) $(opt) $(ARCH) -maxrregcount=32 -c kernels.cu 18 | BarnesHutParticleSystem.o: BarnesHutParticleSystem.cpp 19 | $(NVCC) $(opt) -std=c++11 -c BarnesHutParticleSystem.cpp 20 | Particle_cuda.o: Particle_cuda.cu 21 | $(NVCC) $(opt) $(ARCH) -c Particle_cuda.cu 22 | debug.o: debug.cpp 23 | $(NVCC) $(opt) $(ARCH) -c debug.cpp 24 | StellarSolverVisualizer.o: StellarSolverVisualizer.cpp 25 | $(NVCC) $(opt) $(ARCH) -c $(INCLUDE) StellarSolverVisualizer.cpp 26 | main.o: main.cpp 27 | $(NVCC) $(opt) $(ARCH) -c $(INCLUDE) main.cpp 28 | 29 | 30 | clean: 31 | rm $(objects) 32 | 
-------------------------------------------------------------------------------- /ParticleSystem.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "ParticleSystem.h" 4 | 5 | 6 | //**************************************************************************************** 7 | // Plummer model for spherical galaxy 8 | // 9 | // rho = 3*M_h/4*pi * (a^2 / (r^2 + a^2)^2.5) 10 | // 11 | // M(r) = M_h * (r^3 / (r^2 + a^2)^1.5) 12 | //**************************************************************************************** 13 | void ParticleSystem::plummerModel(float *mass, float *x, float* y, float *x_vel, float *y_vel, float *x_acc, float *y_acc, int n) 14 | { 15 | float a = 1.0; 16 | float pi = 3.14159265; 17 | std::default_random_engine generator; 18 | std::uniform_real_distribution distribution(0, 1.0); 19 | std::uniform_real_distribution distribution_phi(0.0, 2 * pi); 20 | std::uniform_real_distribution distribution_theta(-1.0, 1.0); 21 | 22 | // loop through all particles 23 | for (int i = 0; i < n; i++){ 24 | float phi = distribution_phi(generator); 25 | float theta = acos(distribution_theta(generator)); 26 | float r = a*pow(distribution(generator), 0.3333) / sqrt(1 - pow(distribution(generator), 0.66667)); 27 | 28 | // set mass and position of particle 29 | if(i==0){ 30 | mass[i] = 100000; 31 | x[i] = 0; 32 | y[i] = 0; 33 | } 34 | else{ 35 | mass[i] = 1.0; 36 | x[i] = r*cos(phi); 37 | y[i] = r*sin(phi); 38 | } 39 | 40 | // set velocity of particle 41 | float rotation = 1; // 1: clockwise -1: counter-clockwise 42 | float v = 1.0*sqrt(parameters.gravity*100000.0 / r); 43 | if(i==0){ 44 | x_vel[0] = 0; 45 | y_vel[0] = 0; 46 | } 47 | else{ 48 | x_vel[i] = rotation*v*sin(phi); 49 | y_vel[i] = -rotation*v*cos(phi); 50 | } 51 | 52 | // set acceleration to zero 53 | x_acc[i] = 0.0; 54 | y_acc[i] = 0.0; 55 | } 56 | } 57 | 58 | 59 | 60 | 
//**************************************************************************************** 61 | // Simple disk galaxy 62 | // 63 | // 64 | // 65 | // 66 | //**************************************************************************************** 67 | void ParticleSystem::diskModel(float *mass, float *x, float* y, float *x_vel, float *y_vel, float *x_acc, float *y_acc, int n) 68 | { 69 | float a = 1.0; 70 | float pi = 3.14159265; 71 | std::default_random_engine generator; 72 | std::uniform_real_distribution distribution(1.5, 12.0); 73 | std::uniform_real_distribution distribution_theta(0.0, 2 * pi); 74 | 75 | // loop through all particles 76 | for (int i = 0; i < n; i++){ 77 | float theta = distribution_theta(generator); 78 | float r = distribution(generator); 79 | 80 | // set mass and position of particle 81 | if(i==0){ 82 | mass[i] = 100000; 83 | x[i] = 0; 84 | y[i] = 0; 85 | } 86 | else{ 87 | mass[i] = 1.0; 88 | x[i] = r*cos(theta); 89 | y[i] = r*sin(theta); 90 | } 91 | 92 | 93 | // set velocity of particle 94 | float rotation = 1; // 1: clockwise -1: counter-clockwise 95 | float v = 1.0*sqrt(parameters.gravity*100000.0 / r); 96 | if(i==0){ 97 | x_vel[0] = 0; 98 | y_vel[0] = 0; 99 | } 100 | else{ 101 | x_vel[i] = rotation*v*sin(theta); 102 | y_vel[i] = -rotation*v*cos(theta); 103 | } 104 | 105 | // set acceleration to zero 106 | x_acc[i] = 0.0; 107 | y_acc[i] = 0.0; 108 | } 109 | 110 | } 111 | 112 | 113 | 114 | //**************************************************************************************** 115 | // Two galaxies colliding disk galaxy 116 | // 117 | // 118 | // 119 | // 120 | //**************************************************************************************** 121 | void ParticleSystem::collidingDiskModel(float *mass, float *x, float* y, float *x_vel, float *y_vel, float *x_acc, float *y_acc, int n) 122 | { 123 | float a = 1.0; 124 | float pi = 3.14159265; 125 | std::default_random_engine generator; 126 | std::uniform_real_distribution 
distribution1(1.5, 12.0); 127 | std::uniform_real_distribution distribution2(1, 5.0); 128 | std::uniform_real_distribution distribution_theta(0.0, 2 * pi); 129 | 130 | 131 | // loop through all particles 132 | for (int i = 0; i < n; i++){ 133 | float theta = distribution_theta(generator); 134 | float r1 = distribution1(generator); 135 | float r2 = distribution2(generator); 136 | 137 | // set mass and position of particle 138 | if(i==0){ 139 | mass[i] = 100000; 140 | x[i] = 0; 141 | y[i] = 0; 142 | } 143 | else if(i==1){ 144 | mass[i] = 1; 145 | x[i] = 15*cos(theta); 146 | y[i] = 15*sin(theta); 147 | } 148 | else if(i<=n){ 149 | mass[i] = 1.0; 150 | x[i] = r1*cos(theta); 151 | y[i] = r1*sin(theta); 152 | } 153 | else{ 154 | mass[i] = 1.0; 155 | x[i] = r2*cos(theta) + x[1]; 156 | y[i] = r2*sin(theta) + y[1]; 157 | } 158 | 159 | 160 | // set velocity of particle 161 | float rotation = 1; // 1: clockwise -1: counter-clockwise 162 | float v1 = 1.0*sqrt(parameters.gravity*100000.0 / r1); 163 | float v2 = 1.0*sqrt(parameters.gravity*50000.0 / r2); 164 | float v = 1.0*sqrt(parameters.gravity*100000.0 / sqrt(450)); 165 | if(i==0){ 166 | x_vel[0] = 0; 167 | y_vel[0] = 0; 168 | } 169 | else if(i==1){ 170 | x_vel[i] = rotation*v*sin(theta); 171 | y_vel[i] = -rotation*v*cos(theta); 172 | } 173 | else if(i<=n){ 174 | x_vel[i] = rotation*v1*sin(theta); 175 | y_vel[i] = -rotation*v1*cos(theta); 176 | } 177 | else{ 178 | x_vel[i] = rotation*v2*sin(theta); 179 | y_vel[i] = -rotation*v2*cos(theta); 180 | } 181 | 182 | // set acceleration to zero 183 | x_acc[i] = 0.0; 184 | y_acc[i] = 0.0; 185 | } 186 | 187 | } -------------------------------------------------------------------------------- /ParticleSystem.h: -------------------------------------------------------------------------------- 1 | #ifndef __PARTICLESYSTEM_H__ 2 | #define __PARTICLESYSTEM_H__ 3 | 4 | #include 5 | #include "SimulationParameters.h" 6 | 7 | class ParticleSystem 8 | { 9 | public: 10 | SimulationParameters 
parameters; 11 | 12 | ParticleSystem(const SimulationParameters p, const int n){} 13 | virtual ~ParticleSystem(){}; 14 | 15 | virtual int getNumParticles() = 0; 16 | virtual void update() = 0; 17 | virtual void reset() = 0; 18 | virtual float* getOutputBuffer() = 0; 19 | 20 | void plummerModel(float *mass, float *x, float* y, float *x_vel, float *y_vel, float *x_acc, float *y_acc, int n); 21 | void diskModel(float *mass, float *x, float* y, float *x_vel, float *y_vel, float *x_acc, float *y_acc, int n); 22 | void collidingDiskModel(float *mass, float *x, float* y, float *x_vel, float *y_vel, float *x_acc, float *y_acc, int n); 23 | }; 24 | 25 | 26 | #endif -------------------------------------------------------------------------------- /Particle_cuda.cu: -------------------------------------------------------------------------------- 1 | #include "Particle_cuda.cuh" 2 | #include "kernels.cuh" 3 | 4 | 5 | dim3 gridSize = 512; 6 | dim3 blockSize = 256; 7 | 8 | void SetDrawArray(float *ptr, float *x, float *y, int n) 9 | { 10 | set_draw_array_kernel<<< gridSize, blockSize>>>(ptr, x, y, n); 11 | } 12 | 13 | 14 | void ResetArrays(int *mutex, float *x, float *y, float *mass, int *count, int *start, int *sorted, int *child, int *index, float *left, float *right, float *bottom, float *top, int n, int m) 15 | { 16 | reset_arrays_kernel<<< gridSize, blockSize >>>(mutex, x, y, mass, count, start, sorted, child, index, left, right, bottom, top, n, m); 17 | } 18 | 19 | 20 | void ComputeBoundingBox(int *mutex, float *x, float *y, float *left, float *right, float *bottom, float *top, int n) 21 | { 22 | compute_bounding_box_kernel<<< gridSize, blockSize >>>(mutex, x, y, left, right, bottom, top, n); 23 | } 24 | 25 | 26 | void BuildQuadTree(float *x, float *y, float *mass, int *count, int *start, int *child, int *index, float *left, float *right, float *bottom, float *top, int n, int m) 27 | { 28 | build_tree_kernel<<< gridSize, blockSize >>>(x, y, mass, count, start, child, index, 
left, right, bottom, top, n, m); 29 | } 30 | 31 | 32 | void ComputeCentreOfMass(float *x, float *y, float *mass, int *index, int n) 33 | { 34 | centre_of_mass_kernel<<>>(x, y, mass, index, n); 35 | } 36 | 37 | 38 | void SortParticles(int *count, int *start, int *sorted, int *child, int *index, int n) 39 | { 40 | sort_kernel<<< gridSize, blockSize >>>(count, start, sorted, child, index, n); 41 | } 42 | 43 | 44 | void CalculateForces(float* x, float *y, float *vx, float *vy, float *ax, float *ay, float *mass, int *sorted, int *child, float *left, float *right, int n, float g) 45 | { 46 | compute_forces_kernel<<< gridSize, blockSize >>>(x, y, vx, vy, ax, ay, mass, sorted, child, left, right, n, g); 47 | } 48 | 49 | 50 | void IntegrateParticles(float *x, float *y, float *vx, float *vy, float *ax, float *ay, int n, float dt, float d) 51 | { 52 | update_kernel<<>>(x, y, vx, vy, ax, ay, n, dt, d); 53 | } 54 | 55 | 56 | void FillOutputArray(float *x, float *y, float *out, int n) 57 | { 58 | copy_kernel<<>>(x, y, out, n); 59 | } -------------------------------------------------------------------------------- /Particle_cuda.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __PARTICLE_CUDA_CUH__ 2 | #define __PARTICLE_CUDA_CUH__ 3 | 4 | 5 | void SetDrawArray(float *ptr, float *x, float *y, int n); 6 | void ResetArrays(int *mutex, float *x, float *y, float *mass, int *count, int *start, int *sorted, int *child, int *index, float *left, float *right, float *bottom, float *top, int n, int m); 7 | void ComputeBoundingBox(int *mutex, float *x, float *y, float *left, float *right, float *bottom, float *top, int n); 8 | void BuildQuadTree(float *x, float *y, float *mass, int *count, int *start, int *child, int *index, float *left, float *right, float *bottom, float *top, int n, int m); 9 | void ComputeCentreOfMass(float *x, float *y, float *mass, int *index, int n); 10 | void SortParticles(int *count, int *start, int *sorted, int 
*child, int *index, int n); 11 | void CalculateForces(float* x, float *y, float *vx, float *vy, float *ax, float *ay, float *mass, int *sorted, int *child, float *left, float *right, int n, float g); 12 | void IntegrateParticles(float *x, float *y, float *vx, float *vy, float *ax, float *ay, int n, float dt, float d); 13 | void FillOutputArray(float *x, float *y, float *out, int n); 14 | 15 | 16 | #endif -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # StellarSolver: High-Performance N-Body Simulation with CUDA and Barnes-Hut Algorithm 2 | 3 | ## Overview 4 | 5 | StellarSolver is a comprehensive tool designed for simulating the n-body problem, utilizing the Barnes-Hut algorithm powered by CUDA. The project provides visualization through OpenGL, following Nvidia's CUDA toolkit examples. Currently, the visualization process is executed by the host, transferring data back at each time-step without utilizing CUDA-OpenGL interoperability due to system restrictions during development. 6 | 7 | ## Prerequisites 8 | 9 | StellarSolver necessitates the installation of Nvidia's CUDA toolkit on a system with a CUDA-capable device and a GCC compiler. The visualization component uses OpenGL, SFML, GLEW (OpenGL Extension Wrangler Library), and GLM (OpenGL Mathematics). 10 | 11 | For CUDA installation, refer to the [Nvidia CUDA download page](https://developer.nvidia.com/cuda-downloads) and the CUDA Quick Start Guide. On Ubuntu, install SFML and GLEW by executing the following commands: 12 | 13 | ```bash 14 | sudo apt-get install libsfml-dev 15 | sudo apt-get install libglew-dev 16 | ``` 17 | 18 | GLM, a collection of header files, can be acquired [here](http://glm.g-truc.net/0.9.8/index.html). Ensure to update the makefile INCLUDE variable to set the path to the GLM directory. 
19 | 20 | ## Compilation 21 | 22 | To compile the code, execute the following commands: 23 | 24 | ```bash 25 | make clean 26 | make build 27 | ``` 28 | 29 | ## Execution 30 | 31 | StellarSolver offers multiple command-line arguments for customization. 32 | 33 | The standard execution of the Barnes-Hut algorithm with OpenGL visualization: 34 | 35 | ```bash 36 | ./app -barnes-hut -opengl 37 | ``` 38 | 39 | Execution with benchmark statistics for 500 iterations: 40 | 41 | ```bash 42 | ./app -barnes-hut -benchmark -iterations=500 43 | ``` 44 | 45 | Additional command-line options are detailed below: 46 | 47 | * `-disk` : Use a simple disk model (default). 48 | * `-plummer` : Use a Plummer model. 49 | * `-colliding-disks` : Use two colliding disks. 50 | * `-opengl` : Enable OpenGL visualization. 51 | * `-benchmark` : Output time statistics. 52 | * `-debug` : Run debug tests. 53 | * `-iterations=<n>` : Define the number of iterations (defaults to 50). 54 | * `-gravity=<value>` : Adjust the gravity parameter (defaults to 1.0). 55 | * `-dampening=<value>` : Adjust the velocity dampening parameter (defaults to 1.0). 56 | 57 | ## Additional Notes 58 | 59 | Make sure to manually match the 'numbodies' variable in main.cpp and the 'blockSize' variables in kernels.cu and Particle_cuda.cu. For instance, if you set `numbodies = 64*64` in main.cpp, also set `blockSize = 64` in kernels.cu, and `blockSize = 64, gridSize = 64` in Particle_cuda.cu. 
60 | -------------------------------------------------------------------------------- /SimulationParameters.h: -------------------------------------------------------------------------------- 1 | #ifndef __SIMULATIONPARAMETERS_H__ 2 | #define __SIMULATIONPARAMETERS_H__ 3 | 4 | 5 | 6 | typedef enum Model 7 | { 8 | disk_model, 9 | colliding_disk_model, 10 | plummer_model 11 | }Model; 12 | 13 | 14 | typedef struct SimulationParameters 15 | { 16 | Model model; 17 | bool opengl; 18 | bool debug; 19 | bool benchmark; 20 | bool fullscreen; 21 | float iterations; 22 | float timestep; 23 | float gravity; 24 | float dampening; 25 | }SimulationParameters; 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /StellarSolverVisualizer.cpp: -------------------------------------------------------------------------------- 1 | #include "StellarSolverVisualizer.h" 2 | 3 | // Shader sources 4 | const GLchar* vertexSource = 5 | "#version 130\n" 6 | "in vec2 position;" 7 | "uniform mat4 model;" 8 | "uniform mat4 view;" 9 | "uniform mat4 projection;" 10 | "void main()" 11 | "{" 12 | " gl_Position = projection * view * model *vec4(position, 0.0, 1.0);" 13 | "}"; 14 | const GLchar* fragmentSource = 15 | "#version 130\n" 16 | //"out vec4 outColor;" 17 | "void main()" 18 | "{" 19 | " gl_FragColor = vec4(1.0, 1.0, 1.0, 0.1);" 20 | "}"; 21 | 22 | 23 | 24 | 25 | StellarSolverVisualizer::StellarSolverVisualizer(const SimulationParameters p, const int numBodies) 26 | { 27 | numOfBodies = numBodies; 28 | parameters = p; 29 | particles = new BarnesHutParticleSystem(parameters, numOfBodies); 30 | 31 | // opengl initialization 32 | if(parameters.opengl){ 33 | settings = new sf::ContextSettings(); 34 | settings->depthBits = 24; 35 | settings->stencilBits = 8; 36 | window = new sf::Window(sf::VideoMode(1000, 1000, 32), "N body Solver", sf::Style::Titlebar | sf::Style::Close, *settings); 37 | 38 | glewExperimental = GL_TRUE; 39 | glewInit(); 40 | 41 | // 
Create and compile the vertex shader 42 | vertexShader = glCreateShader(GL_VERTEX_SHADER); 43 | glShaderSource(vertexShader, 1, &vertexSource, NULL); 44 | glCompileShader(vertexShader); 45 | 46 | // Create and compile the fragment shader 47 | fragmentShader = glCreateShader(GL_FRAGMENT_SHADER); 48 | glShaderSource(fragmentShader, 1, &fragmentSource, NULL); 49 | glCompileShader(fragmentShader); 50 | 51 | // Link the vertex and fragment shader into a shader program 52 | shaderProgram = glCreateProgram(); 53 | glAttachShader(shaderProgram, vertexShader); 54 | glAttachShader(shaderProgram, fragmentShader); 55 | glBindFragDataLocation(shaderProgram, 0, "outColor"); 56 | glLinkProgram(shaderProgram); 57 | glUseProgram(shaderProgram); 58 | } 59 | } 60 | 61 | 62 | StellarSolverVisualizer::StellarSolverVisualizer(const StellarSolverVisualizer &visualizer) 63 | { 64 | numOfBodies = visualizer.numOfBodies; 65 | parameters = visualizer.parameters; 66 | 67 | particles = new BarnesHutParticleSystem(parameters, numOfBodies); 68 | 69 | if(parameters.opengl){ 70 | settings = new sf::ContextSettings(); 71 | settings->depthBits = 24; 72 | settings->stencilBits = 8; 73 | window = new sf::Window(sf::VideoMode(1000, 1000, 32), "N body Solver", sf::Style::Titlebar | sf::Style::Close, *settings); 74 | 75 | glewExperimental = GL_TRUE; 76 | glewInit(); 77 | 78 | // Create and compile the vertex shader 79 | vertexShader = glCreateShader(GL_VERTEX_SHADER); 80 | glShaderSource(vertexShader, 1, &vertexSource, NULL); 81 | glCompileShader(vertexShader); 82 | 83 | // Create and compile the fragment shader 84 | fragmentShader = glCreateShader(GL_FRAGMENT_SHADER); 85 | glShaderSource(fragmentShader, 1, &fragmentSource, NULL); 86 | glCompileShader(fragmentShader); 87 | 88 | // Link the vertex and fragment shader into a shader program 89 | shaderProgram = glCreateProgram(); 90 | glAttachShader(shaderProgram, vertexShader); 91 | glAttachShader(shaderProgram, fragmentShader); 92 | 
glBindFragDataLocation(shaderProgram, 0, "outColor"); 93 | glLinkProgram(shaderProgram); 94 | glUseProgram(shaderProgram); 95 | } 96 | } 97 | 98 | 99 | StellarSolverVisualizer& StellarSolverVisualizer::operator=(const StellarSolverVisualizer &visualizer) 100 | { 101 | if(this != &visualizer){ 102 | numOfBodies = visualizer.numOfBodies; 103 | parameters = visualizer.parameters; 104 | 105 | delete particles; 106 | particles = new BarnesHutParticleSystem(parameters, numOfBodies); 107 | 108 | if(parameters.opengl){ 109 | delete settings; 110 | delete window; 111 | 112 | settings = new sf::ContextSettings(); 113 | settings->depthBits = 24; 114 | settings->stencilBits = 8; 115 | window = new sf::Window(sf::VideoMode(1000, 1000, 32), "N body Solver", sf::Style::Titlebar | sf::Style::Close, *settings); 116 | 117 | glewExperimental = GL_TRUE; 118 | glewInit(); 119 | 120 | glDeleteProgram(shaderProgram); 121 | glDeleteShader(fragmentShader); 122 | glDeleteShader(vertexShader); 123 | 124 | // Create and compile the vertex shader 125 | vertexShader = glCreateShader(GL_VERTEX_SHADER); 126 | glShaderSource(vertexShader, 1, &vertexSource, NULL); 127 | glCompileShader(vertexShader); 128 | 129 | // Create and compile the fragment shader 130 | fragmentShader = glCreateShader(GL_FRAGMENT_SHADER); 131 | glShaderSource(fragmentShader, 1, &fragmentSource, NULL); 132 | glCompileShader(fragmentShader); 133 | 134 | // Link the vertex and fragment shader into a shader program 135 | shaderProgram = glCreateProgram(); 136 | glAttachShader(shaderProgram, vertexShader); 137 | glAttachShader(shaderProgram, fragmentShader); 138 | glBindFragDataLocation(shaderProgram, 0, "outColor"); 139 | glLinkProgram(shaderProgram); 140 | glUseProgram(shaderProgram); 141 | } 142 | } 143 | 144 | return *this; 145 | } 146 | 147 | 148 | StellarSolverVisualizer::~StellarSolverVisualizer() 149 | { 150 | delete particles; 151 | 152 | if(parameters.opengl){ 153 | delete settings; 154 | delete window; 155 | 156 | 
glDeleteProgram(shaderProgram); 157 | glDeleteShader(fragmentShader); 158 | glDeleteShader(vertexShader); 159 | } 160 | } 161 | 162 | 163 | void StellarSolverVisualizer::displayDeviceProperties() 164 | { 165 | // Set up CUDA device 166 | cudaDeviceProp properties; 167 | 168 | cudaGetDeviceProperties(&properties,0); 169 | 170 | int fact = 1024; 171 | int driverVersion, runtimeVersion; 172 | 173 | cudaDriverGetVersion(&driverVersion); 174 | cudaRuntimeGetVersion(&runtimeVersion); 175 | 176 | std::cout << "************************************************************************" << std::endl; 177 | std::cout << " GPU Device Properties " << std::endl; 178 | std::cout << "************************************************************************" << std::endl; 179 | std::cout << "Name: " << properties.name << std::endl; 180 | std::cout << "CUDA driver/runtime version: " << driverVersion/1000 << "." << (driverVersion%100)/10 << "/" << runtimeVersion/1000 << "." << (runtimeVersion%100)/10 << std::endl; 181 | std::cout << "CUDA compute capabilitiy: " << properties.major << "." 
<< properties.minor << std::endl; 182 | std::cout << "Number of multiprocessors: " << properties.multiProcessorCount << std::endl; 183 | std::cout << "GPU clock rate: " << properties.clockRate/fact << " (MHz)" << std::endl; 184 | std::cout << "Memory clock rate: " << properties.memoryClockRate/fact << " (MHz)" << std::endl; 185 | std::cout << "Memory bus width: " << properties.memoryBusWidth << "-bit" << std::endl; 186 | std::cout << "Theoretical memory bandwidth: " << (properties.memoryClockRate/fact*(properties.memoryBusWidth/8)*2)/fact <<" (GB/s)" << std::endl; 187 | std::cout << "Device global memory: " << properties.totalGlobalMem/(fact*fact) << " (MB)" << std::endl; 188 | std::cout << "Shared memory per block: " << properties.sharedMemPerBlock/fact <<" (KB)" << std::endl; 189 | std::cout << "Constant memory: " << properties.totalConstMem/fact << " (KB)" << std::endl; 190 | std::cout << "Maximum number of threads per block: " << properties.maxThreadsPerBlock << std::endl; 191 | std::cout << "Maximum thread dimension: [" << properties.maxThreadsDim[0] << ", " << properties.maxThreadsDim[1] << ", " << properties.maxThreadsDim[2] << "]" << std::endl; 192 | std::cout << "Maximum grid size: [" << properties.maxGridSize[0] << ", " << properties.maxGridSize[1] << ", " << properties.maxGridSize[2] << "]" << std::endl; 193 | std::cout << "**************************************************************************" << std::endl; 194 | std::cout << " " << std::endl; 195 | std::cout << "**************************************************************************" << std::endl; 196 | } 197 | 198 | 199 | void StellarSolverVisualizer::runSimulation() 200 | { 201 | displayDeviceProperties(); 202 | 203 | particles->reset(); 204 | 205 | for(int i=0;iupdate(); 207 | 208 | if(parameters.opengl){ 209 | const float* vertices = particles->getOutputBuffer(); 210 | 211 | glGenVertexArrays(1, &vao); 212 | glBindVertexArray(vao); 213 | 214 | glGenBuffers(1, &vbo); //generate a buffer 215 | 
glBindBuffer(GL_ARRAY_BUFFER, vbo); //make buffer active 216 | glBufferData(GL_ARRAY_BUFFER, 2*particles->getNumParticles()*sizeof(float), vertices, GL_DYNAMIC_DRAW); //copy data to active buffer 217 | 218 | // Specify the layout of the vertex data 219 | GLint posAttrib = glGetAttribLocation(shaderProgram, "position"); 220 | glEnableVertexAttribArray(posAttrib); 221 | glVertexAttribPointer(posAttrib, 2, GL_FLOAT, GL_FALSE, 0, 0); 222 | 223 | glBlendFunc(GL_SRC_ALPHA, GL_ONE); 224 | glEnable(GL_BLEND); 225 | 226 | // model, view, and projection matrices 227 | glm::mat4 model = glm::mat4(1.0f); 228 | glm::mat4 view = glm::mat4(1.0f); 229 | // view = glm::rotate(view, float(2*i), glm::vec3(0.0f, 1.0f, 0.0f)); 230 | glm::mat4 projection = glm::ortho(-25.0f, 25.0f, -25.0f, 25.0f, -10.0f, 10.0f); 231 | 232 | // link matrices with shader program 233 | GLint modelLoc = glGetUniformLocation(shaderProgram, "model"); 234 | GLint viewLoc = glGetUniformLocation(shaderProgram, "view"); 235 | GLint projLoc = glGetUniformLocation(shaderProgram, "projection"); 236 | glUniformMatrix4fv(modelLoc, 1, GL_FALSE, glm::value_ptr(model)); 237 | glUniformMatrix4fv(viewLoc, 1, GL_FALSE, glm::value_ptr(view)); 238 | glUniformMatrix4fv(projLoc, 1, GL_FALSE, glm::value_ptr(projection)); 239 | 240 | // Clear the screen to black 241 | glClearColor(0.0f, 0.0f, 0.0f, 0.5f); 242 | glClear(GL_COLOR_BUFFER_BIT); 243 | 244 | // Draw points 245 | glDrawArrays(GL_POINTS, 0, particles->getNumParticles()); 246 | 247 | // Swap buffers 248 | window->display(); 249 | 250 | glDeleteBuffers(1, &vbo); 251 | 252 | glDeleteVertexArrays(1, &vao); 253 | } 254 | } 255 | 256 | if(parameters.opengl){ 257 | window->close(); 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /StellarSolverVisualizer.h: -------------------------------------------------------------------------------- 1 | #ifndef __STELLARSOLVERVISUALIZER_H__ 2 | #define __STELLARSOLVERVISUALIZER_H__ 3 | 4 
| 5 | #include 6 | #include "SimulationParameters.h" 7 | #include "BarnesHutParticleSystem.h" 8 | 9 | 10 | #define GLEW_STATIC 11 | #include 12 | #include 13 | #include 14 | 15 | #include "glm/glm.hpp" 16 | #include "glm/gtc/matrix_transform.hpp" 17 | #include "glm/gtc/type_ptr.hpp" 18 | 19 | 20 | 21 | class StellarSolverVisualizer 22 | { 23 | private: 24 | int numOfBodies; 25 | BarnesHutParticleSystem *particles; 26 | SimulationParameters parameters; 27 | 28 | sf::ContextSettings *settings; 29 | sf::Window *window; 30 | 31 | GLuint vao; 32 | GLuint vbo; 33 | 34 | GLuint vertexShader; 35 | GLuint fragmentShader; 36 | GLuint shaderProgram; 37 | 38 | void displayDeviceProperties(); 39 | 40 | public: 41 | StellarSolverVisualizer(const SimulationParameters p, const int numBodies); 42 | StellarSolverVisualizer(const StellarSolverVisualizer &visualizer); 43 | StellarSolverVisualizer& operator=(const StellarSolverVisualizer &visualizer); 44 | ~StellarSolverVisualizer(); 45 | 46 | void runSimulation(); 47 | }; 48 | 49 | 50 | 51 | #endif -------------------------------------------------------------------------------- /debug.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "debug.h" 4 | 5 | 6 | void DEBUG_RUN_TESTS(float *x, float *y, float *mass, int *count, int *start, int *sorted, int *child, float *left, float *right, float *bottom, float *top, int n, int m) 7 | { 8 | long test; // parameter determining whether test passed or failed 9 | 10 | 11 | //*********************************************************************** 12 | // x, y, & mass array tests 13 | //*********************************************************************** 14 | 15 | 16 | 17 | 18 | 19 | //*********************************************************************** 20 | // count array tests 21 | //*********************************************************************** 22 | 23 | // test 1 24 | test = 1; 25 | for(int i=n;i=0 && 
/*
 * Computes the bounding box of the first n bodies and merges it into
 * *left, *right, *bottom and *top (min for left/bottom, max for right/top).
 * The four outputs are read before being written, so they must hold valid
 * values on entry.
 *
 * Each thread folds the elements index, index + stride, ... into private
 * min/max values, the block reduces them in shared memory, and thread 0 of
 * each block merges the block result into the outputs while holding the
 * spin lock *mutex (0 = free, 1 = taken).
 *
 * Launch requirements: 1D grid and block, blockDim.x a power of two and no
 * larger than the compile-time blockSize used to size the shared caches.
 */
__global__ void compute_bounding_box_kernel(int *mutex, float *x, float *y, float *left, float *right, float *bottom, float *top, int n)
{
    // Nothing to bound. The exit is uniform across the grid, so no thread is
    // left behind at the barriers below.
    if(n <= 0){
        return;
    }

    int index = threadIdx.x + blockDim.x*blockIdx.x;
    int stride = blockDim.x*gridDim.x;

    // Threads beyond the end of the data must not read x[index]/y[index].
    // They seed from element 0 instead, which is valid and leaves min/max unchanged.
    int seed = (index < n) ? index : 0;
    float x_min = x[seed];
    float x_max = x[seed];
    float y_min = y[seed];
    float y_max = y[seed];

    __shared__ float left_cache[blockSize];
    __shared__ float right_cache[blockSize];
    __shared__ float bottom_cache[blockSize];
    __shared__ float top_cache[blockSize];

    // Fold the remaining elements owned by this thread.
    int offset = stride;
    while(index + offset < n){
        x_min = fminf(x_min, x[index + offset]);
        x_max = fmaxf(x_max, x[index + offset]);
        y_min = fminf(y_min, y[index + offset]);
        y_max = fmaxf(y_max, y[index + offset]);
        offset += stride;
    }

    left_cache[threadIdx.x] = x_min;
    right_cache[threadIdx.x] = x_max;
    bottom_cache[threadIdx.x] = y_min;
    top_cache[threadIdx.x] = y_max;

    __syncthreads();

    // Tree reduction in shared memory; assumes blockDim.x is a power of 2!
    int i = blockDim.x/2;
    while(i != 0){
        if(threadIdx.x < i){
            left_cache[threadIdx.x] = fminf(left_cache[threadIdx.x], left_cache[threadIdx.x + i]);
            right_cache[threadIdx.x] = fmaxf(right_cache[threadIdx.x], right_cache[threadIdx.x + i]);
            bottom_cache[threadIdx.x] = fminf(bottom_cache[threadIdx.x], bottom_cache[threadIdx.x + i]);
            top_cache[threadIdx.x] = fmaxf(top_cache[threadIdx.x], top_cache[threadIdx.x + i]);
        }
        __syncthreads();
        i /= 2;
    }

    // One thread per block merges the block result into the global box.
    if(threadIdx.x == 0){
        while (atomicCAS(mutex, 0 ,1) != 0); // lock
        *left = fminf(*left, left_cache[0]);
        *right = fmaxf(*right, right_cache[0]);
        *bottom = fminf(*bottom, bottom_cache[0]);
        *top = fmaxf(*top, top_cache[0]);
        atomicExch(mutex, 0); // unlock
    }
}
if(x[bodyIndex + offset] < 0.5*(l+r)){ 166 | childPath += 1; 167 | r = 0.5*(l+r); 168 | } 169 | else{ 170 | l = 0.5*(l+r); 171 | } 172 | if(y[bodyIndex + offset] < 0.5*(b+t)){ 173 | childPath += 2; 174 | t = 0.5*(t+b); 175 | } 176 | else{ 177 | b = 0.5*(t+b); 178 | } 179 | 180 | atomicAdd(&x[temp], mass[bodyIndex + offset]*x[bodyIndex + offset]); 181 | atomicAdd(&y[temp], mass[bodyIndex + offset]*y[bodyIndex + offset]); 182 | atomicAdd(&mass[temp], mass[bodyIndex + offset]); 183 | atomicAdd(&count[temp], 1); 184 | childIndex = child[4*temp + childPath]; 185 | } 186 | 187 | 188 | if(childIndex != -2){ 189 | int locked = temp*4 + childPath; 190 | if(atomicCAS(&child[locked], childIndex, -2) == childIndex){ 191 | if(childIndex == -1){ 192 | child[locked] = bodyIndex + offset; 193 | } 194 | else{ 195 | //int patch = 2*n; 196 | int patch = 4*n; 197 | while(childIndex >= 0 && childIndex < n){ 198 | 199 | int cell = atomicAdd(index,1); 200 | patch = min(patch, cell); 201 | if(patch != cell){ 202 | child[4*temp + childPath] = cell; 203 | } 204 | 205 | // insert old particle 206 | childPath = 0; 207 | if(x[childIndex] < 0.5*(l+r)){ 208 | childPath += 1; 209 | } 210 | if(y[childIndex] < 0.5*(b+t)){ 211 | childPath += 2; 212 | } 213 | 214 | if(DEBUG){ 215 | // if(cell >= 2*n){ 216 | if(cell >= m){ 217 | printf("%s\n", "error cell index is too large!!"); 218 | printf("cell: %d\n", cell); 219 | } 220 | } 221 | x[cell] += mass[childIndex]*x[childIndex]; 222 | y[cell] += mass[childIndex]*y[childIndex]; 223 | mass[cell] += mass[childIndex]; 224 | count[cell] += count[childIndex]; 225 | child[4*cell + childPath] = childIndex; 226 | 227 | start[cell] = -1; 228 | 229 | 230 | // insert new particle 231 | temp = cell; 232 | childPath = 0; 233 | if(x[bodyIndex + offset] < 0.5*(l+r)){ 234 | childPath += 1; 235 | r = 0.5*(l+r); 236 | } 237 | else{ 238 | l = 0.5*(l+r); 239 | } 240 | if(y[bodyIndex + offset] < 0.5*(b+t)){ 241 | childPath += 2; 242 | t = 0.5*(t+b); 243 | } 244 | else{ 245 | 
b = 0.5*(t+b); 246 | } 247 | x[cell] += mass[bodyIndex + offset]*x[bodyIndex + offset]; 248 | y[cell] += mass[bodyIndex + offset]*y[bodyIndex + offset]; 249 | mass[cell] += mass[bodyIndex + offset]; 250 | count[cell] += count[bodyIndex + offset]; 251 | childIndex = child[4*temp + childPath]; 252 | } 253 | 254 | child[4*temp + childPath] = bodyIndex + offset; 255 | 256 | __threadfence(); // we have been writing to global memory arrays (child, x, y, mass) thus need to fence 257 | 258 | child[locked] = patch; 259 | } 260 | 261 | // __threadfence(); // we have been writing to global memory arrays (child, x, y, mass) thus need to fence 262 | 263 | offset += stride; 264 | newBody = true; 265 | } 266 | 267 | } 268 | 269 | __syncthreads(); // not strictly needed 270 | } 271 | } 272 | 273 | 274 | 275 | __global__ void centre_of_mass_kernel(float *x, float *y, float *mass, int *index, int n) 276 | { 277 | int bodyIndex = threadIdx.x + blockIdx.x*blockDim.x; 278 | int stride = blockDim.x*gridDim.x; 279 | int offset = 0; 280 | 281 | bodyIndex += n; 282 | while(bodyIndex + offset < *index){ 283 | x[bodyIndex + offset] /= mass[bodyIndex + offset]; 284 | y[bodyIndex + offset] /= mass[bodyIndex + offset]; 285 | 286 | offset += stride; 287 | } 288 | } 289 | 290 | 291 | 292 | __global__ void sort_kernel(int *count, int *start, int *sorted, int *child, int *index, int n) 293 | { 294 | int bodyIndex = threadIdx.x + blockIdx.x*blockDim.x; 295 | int stride = blockDim.x*gridDim.x; 296 | int offset = 0; 297 | 298 | int s = 0; 299 | if(threadIdx.x == 0){ 300 | for(int i=0;i<4;i++){ 301 | int node = child[i]; 302 | 303 | if(node >= n){ // not a leaf node 304 | start[node] = s; 305 | s += count[node]; 306 | } 307 | else if(node >= 0){ // leaf node 308 | sorted[s] = node; 309 | s++; 310 | } 311 | } 312 | } 313 | 314 | int cell = n + bodyIndex; 315 | int ind = *index; 316 | while((cell + offset) < ind){ 317 | s = start[cell + offset]; 318 | 319 | if(s >= 0){ 320 | 321 | for(int i=0;i<4;i++){ 
/*
 * Walks the quadtree for every body and writes the resulting acceleration
 * into ax/ay. Bodies are processed in the order given by sorted[].
 *
 * Each warp shares one traversal stack in shared memory (stack[] holds node
 * indices, depth[] the matching opening threshold); lane 0 of the warp writes
 * it and all lanes read it. A cell is used as a single pseudo-body only when
 * every participating lane of the warp agrees it is far enough away;
 * otherwise the warp descends into it together.
 *
 * Launch requirements: 1D grid and block, blockDim.x a multiple of the warp
 * size and no larger than the compile-time blockSize. Requires the *_sync
 * warp intrinsics (CUDA 9+).
 *
 * vx, vy and g are not used by this kernel.
 * NOTE(review): g is presumably the gravity parameter, so it is currently not
 * applied to the force - confirm against the caller before changing.
 */
__global__ void compute_forces_kernel(float* x, float *y, float *vx, float *vy, float *ax, float *ay, float *mass, int *sorted, int *child, float *left, float *right, int n, float g)
{
    const unsigned int fullWarp = 0xffffffffu;

    int stride = blockDim.x*gridDim.x;

    __shared__ float depth[stackSize*blockSize/warp];
    __shared__ int stack[stackSize*blockSize/warp]; // stack controlled by one thread per warp

    float radius = 0.5f*(*right - (*left));

    // Index of the last occupied root slot; some of the first four entries of
    // child may be -1 (otherwise jj would simply be 3).
    int jj = -1;
    for(int i=0;i<4;i++){
        if(child[i] != -1){
            jj++;
        }
    }

    int counter = threadIdx.x % warp;
    int stackStartIndex = stackSize*(threadIdx.x / warp);

    // The loop runs on the block-uniform base index so that every thread of the
    // block executes the same number of iterations. This keeps the
    // __syncthreads() calls and the full-mask warp intrinsics below legal even
    // in the tail, where some threads have no body; those threads stay in the
    // loop as inactive participants.
    for(int blockBase = blockIdx.x*blockDim.x; blockBase < n; blockBase += stride){
        int body = blockBase + threadIdx.x;
        bool active = body < n;

        int sortedIndex = active ? sorted[body] : 0;
        float pos_x = active ? x[sortedIndex] : 0.0f;
        float pos_y = active ? y[sortedIndex] : 0.0f;
        float acc_x = 0;
        float acc_y = 0;

        // initialize stack with the occupied root children
        int top = jj + stackStartIndex;
        if(counter == 0){
            int temp = 0;
            for(int i=0;i<4;i++){
                if(child[i] != -1){
                    stack[stackStartIndex + temp] = child[i];
                    depth[stackStartIndex + temp] = radius*radius/theta;
                    temp++;
                }
            }
        }

        __syncthreads();

        // while stack is not empty
        while(top >= stackStartIndex){
            // Make lane 0's pushes from the previous pass visible to the whole warp.
            __syncwarp(fullWarp);

            int node = stack[top];
            float dp = 0.25f*depth[top];
            for(int i=0;i<4;i++){
                int ch = child[4*node + i]; // identical across the warp

                if(ch >= 0){
                    float dx = 0.0f;
                    float dy = 0.0f;
                    float r = 0.0f;
                    // Inactive lanes vote "far enough" so that they never force
                    // the active lanes to open a cell.
                    bool farEnough = true;
                    if(active){
                        dx = x[ch] - pos_x;
                        dy = y[ch] - pos_y;
                        r = dx*dx + dy*dy + eps2;
                        farEnough = (dp <= r);
                    }

                    if(ch < n /*is leaf node*/ || __all_sync(fullWarp, farEnough)/*meets criterion*/){
                        if(active){
                            r = rsqrtf(r);
                            float f = mass[ch] * r * r * r;

                            acc_x += f*dx;
                            acc_y += f*dy;
                        }
                    }
                    else{
                        // Descend: push the cell. The vote above guarantees every
                        // lane has already read stack[top]/depth[top] for this pass.
                        if(counter == 0){
                            stack[top] = ch;
                            depth[top] = dp;
                        }
                        top++;
                    }
                }
            }

            top--;
        }

        if(active){
            ax[sortedIndex] = acc_x;
            ay[sortedIndex] = acc_y;
        }

        // The stack is re-initialised on the next pass; wait for every warp to finish.
        __syncthreads();
    }
}
/*
 * Integration step for the first n bodies: each velocity component is
 * advanced by dt times the matching acceleration, then each position
 * component by d*dt times the freshly updated velocity.
 * Every thread handles the elements i, i + step, i + 2*step, ... where step
 * is the total number of threads in the (1D) launch.
 */
__global__ void update_kernel(float *x, float *y, float *vx, float *vy, float *ax, float *ay, int n, float dt, float d)
{
    const int step = blockDim.x*gridDim.x;

    for(int i = threadIdx.x + blockIdx.x*blockDim.x; i < n; i += step){
        // velocities first ...
        vx[i] += dt*ax[i];
        vy[i] += dt*ay[i];

        // ... then positions, using the velocities just written
        x[i] += d*dt*vx[i];
        y[i] += d*dt*vy[i];
    }
}
/*
 * Packs the first n coordinates into out as interleaved pairs:
 * out[2*i] = x[i] and out[2*i + 1] = y[i].
 * Every thread handles the elements i, i + step, i + 2*step, ... where step
 * is the total number of threads in the (1D) launch.
 */
__global__ void copy_kernel(float *x, float *y, float *out, int n)
{
    const int step = blockDim.x*gridDim.x;

    for(int i = threadIdx.x + blockIdx.x*blockDim.x; i < n; i += step){
        const int slot = 2*i;
        out[slot] = x[i];
        out[slot + 1] = y[i];
    }
}
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "SimulationParameters.h" 4 | #include "StellarSolverVisualizer.h" 5 | 6 | bool checkCmdLineFlag(const int argc, char** argv, const char* string) 7 | { 8 | bool flag = false; 9 | if(argc > 1){ 10 | for(int i=1;i 1){ 27 | for(int i=1;i 1){ 52 | for(int i=1;i