├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── README.md
├── data
    └── bfs
    │   └── simple.txt
├── example
    ├── CMakeLists.txt
    ├── bfs
    │   ├── CMakeLists.txt
    │   ├── bfs_const_size_array.cc
    │   ├── bfs_const_size_array.cu
    │   ├── bfs_inlined_dynamic_array.cc
    │   ├── bfs_loader.h
    │   └── bfs_std_vector.cc
    ├── n_body.cc
    ├── particle_simulation.cc
    └── traffic
    │   ├── CMakeLists.txt
    │   ├── benchmark.h
    │   ├── configuration.h
    │   ├── entities
    │       ├── car.inc
    │       ├── cell.inc
    │       ├── priority_yield_traffic_controller.inc
    │       ├── shared_signal_group.inc
    │       ├── simulation.inc
    │       └── traffic_light.inc
    │   ├── run_bench_traffic.sh
    │   ├── simulation_adapter.cu
    │   ├── simulation_adapter.h
    │   ├── simulation_converter.inc
    │   ├── statistics.inc
    │   └── traffic.cu
├── ikra
    ├── CMakeLists.txt
    ├── executor
    │   ├── cuda_executor.h
    │   ├── executor.h
    │   ├── iterator.h
    │   └── util.h
    └── soa
    │   ├── CMakeLists.txt
    │   ├── addressable_field_shared.inc
    │   ├── array_field.h
    │   ├── array_shared.inc
    │   ├── class_initialization.h
    │   ├── constants.h
    │   ├── cuda.h
    │   ├── field.h
    │   ├── field_shared.inc
    │   ├── field_type_generator.h
    │   ├── layout.h
    │   ├── partially_inlined_array_field.h
    │   ├── preprocessor.h
    │   ├── soa.h
    │   ├── storage.h
    │   └── util.h
└── test
    ├── CMakeLists.txt
    ├── array_cuda_test.cu
    ├── executor_test.cc
    ├── executor_test_layout.inc
    ├── minimum_cuda_test.cu
    └── soa
        ├── array_test.cc
        ├── array_test_layout.inc
        ├── basic_class_test.cc
        ├── basic_class_test_layout.inc
        ├── benchmarks
            ├── .gitignore
            ├── array
            │   ├── aos_style.cu
            │   ├── aos_style_cpu.cc
            │   ├── benchmark.h
            │   ├── configuration.h
            │   ├── inlined_soa_style.cu
            │   └── soa_style.cu
            ├── assembly
            │   ├── clang++-5.0_-O0_DynamicStorage_0_Soa.S
            │   ├── clang++-5.0_-O0_DynamicStorage_4_Soa.S
            │   ├── clang++-5.0_-O0_StaticStorage_0_Aos.S
            │   ├── clang++-5.0_-O0_StaticStorage_0_Soa.S
            │   ├── clang++-5.0_-O0_StaticStorage_4_Soa.S
            │   ├── clang++-5.0_-O3_DynamicStorage_0_Soa.S
            │   ├── clang++-5.0_-O3_DynamicStorage_4_Soa.S
            │   ├── clang++-5.0_-O3_StaticStorage_0_Aos.S
            │   ├── clang++-5.0_-O3_StaticStorage_0_Soa.S
            │   ├── clang++-5.0_-O3_StaticStorage_4_Soa.S
            │   ├── clang++-5.0_nbody_aos.S
            │   ├── clang++-5.0_nbody_ikracpp.S
            │   ├── clang++-5.0_nbody_ikracpp_field.S
            │   ├── clang++-5.0_nbody_ikracpp_inversed.S
            │   ├── clang++-5.0_nbody_soa.S
            │   ├── cuda_array_aos_style.S
            │   ├── cuda_array_inlined_soa_style.S
            │   ├── cuda_array_soa_style.S
            │   ├── cuda_codegen_test.S
            │   ├── cuda_nbody_ikracpp_inversed_field_gpu.S
            │   ├── cuda_nbody_ikracpp_inversed_gpu.S
            │   ├── cuda_nbody_soa_inversed_gpu.S
            │   ├── g++_-O0_DynamicStorage_0_Soa.S
            │   ├── g++_-O0_DynamicStorage_4_Soa.S
            │   ├── g++_-O0_StaticStorage_0_Aos.S
            │   ├── g++_-O0_StaticStorage_0_Soa.S
            │   ├── g++_-O0_StaticStorage_4_Soa.S
            │   ├── g++_-O3_DynamicStorage_0_Soa.S
            │   ├── g++_-O3_DynamicStorage_4_Soa.S
            │   ├── g++_-O3_StaticStorage_0_Aos.S
            │   ├── g++_-O3_StaticStorage_0_Soa.S
            │   ├── g++_-O3_StaticStorage_4_Soa.S
            │   ├── g++_nbody_aos.S
            │   ├── g++_nbody_ikracpp.S
            │   ├── g++_nbody_ikracpp_field.S
            │   ├── g++_nbody_ikracpp_inversed.S
            │   └── g++_nbody_soa.S
            ├── build.sh
            ├── codegen_test.cc
            ├── cuda_codegen_test.cu
            └── nbody
            │   ├── aos.cc
            │   ├── benchmark.h
            │   ├── ikracpp.cc
            │   ├── ikracpp_field.cc
            │   ├── ikracpp_inversed.cc
            │   ├── ikracpp_inversed_field_gpu.cu
            │   ├── ikracpp_inversed_gpu.cu
            │   ├── soa.cc
            │   └── soa_inversed.cu
        ├── cuda_array_test.cu
        ├── cuda_inline_array_memcpy_aos_test.cu
        ├── cuda_inline_array_memcpy_test.cu
        └── pointer_arithmetics_test.cc


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.i
 2 | *.ii
 3 | *.gpu
 4 | *.ptx
 5 | *.cubin
 6 | *.fatbin
 7 | 
 8 | # Prerequisites
 9 | *.d
10 | 
11 | # Compiled Object files
12 | *.slo
13 | *.lo
14 | *.o
15 | *.obj
16 | 
17 | # Precompiled Headers
18 | *.gch
19 | *.pch
20 | 
21 | # Compiled Dynamic libraries
22 | *.so
23 | *.dylib
24 | *.dll
25 | 
26 | # Fortran module files
27 | *.mod
28 | *.smod
29 | 
30 | # Compiled Static libraries
31 | *.lai
32 | *.la
33 | *.a
34 | *.lib
35 | 
36 | # Executables
37 | *.exe
38 | *.out
39 | *.app
40 | 
41 | test/experiment_test.cc
42 | 
43 | # CMake
44 | CMakeCache.txt
45 | CMakeFiles
46 | CMakeScripts
47 | Testing
48 | Makefile
49 | cmake_install.cmake
50 | install_manifest.txt
51 | compile_commands.json
52 | CTestTestfile.cmake
53 | bin/
54 | *.pc
55 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "lib/googletest"]
2 | 	path = lib/googletest
3 | 	url = git@github.com:google/googletest.git
4 | [submodule "example/traffic/traffic-simulation"]
5 | 	path = example/traffic/traffic-simulation
6 | 	url = git@github.com:prg-titech/traffic-simulation.git
7 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.2 FATAL_ERROR)
 2 | project(ikra-cpp)
 3 | 
 4 | # Set up CUDA
 5 | # TODO: -gencode arch=compute_52,code=sm_52
 6 | # TODO: This should be CUDA 9.0 but the version check is broken on my machine.
 7 | find_package(CUDA 8.0 REQUIRED)
 8 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -std=c++11 --expt-extended-lambda)
 9 | 
10 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
11 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
12 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
13 | 
14 | set(CMAKE_CXX_STANDARD 11)
15 | 
16 | if(NOT CMAKE_BUILD_TYPE)
17 |   set(CMAKE_BUILD_TYPE Debug)
18 | endif()
19 | 
20 | set(CMAKE_CXX_FLAGS "-Wall -Wextra")
21 | set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
22 | set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -Wno-unused-parameter")
23 | 
24 | add_subdirectory(lib/googletest)
25 | enable_testing()
26 | 
27 | add_subdirectory(ikra)
28 | add_subdirectory(test)
29 | add_subdirectory(example)
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Programming Research Group @ TokyoTech
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Ikra-Cpp
  2 | Ikra-Cpp is a C++/CUDA DSL for object-oriented programming with Structure of Arrays (SOA) data layout. Ikra-Cpp provides an AOS-style OOP notation that is similar to standard C++ but layouts data as SOA. This gives programmers the performance benefit of SOA and the expressivness of AOS-style object-oriented programming at the same time.
  3 | 
  4 | ## Example
  5 | ```c++
  6 | // N-Body Simulation
  7 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm
  8 | 
  9 | #include "executor/executor.h"
 10 | #include "soa/soa.h"
 11 | 
 12 | using namespace std;
 13 | using ikra::executor::execute;
 14 | using ikra::soa::SoaLayout;
 15 | 
 16 | static const int kNumBodies = 25;
 17 | static const double kMaxMass = 1000;
 18 | static const double kTimeInterval = 0.5;
 19 | static const double kGravityConstant = 6.673e-11;   // gravitational constant
 20 | static const int kIterations = 1000;
 21 | 
 22 | #define RAND (1.0 * rand() / RAND_MAX)
 23 | 
 24 | class Body : public SoaLayout<Body, kNumBodies> {
 25 |  public:
 26 |   IKRA_INITIALIZE_CLASS
 27 | 
 28 |   Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y)
 29 |       : mass_(mass) {
 30 |     position_[0] = pos_x;
 31 |     position_[1] = pos_y;
 32 |     velocity_[0] = vel_x;
 33 |     velocity_[1] = vel_y;
 34 |     this->reset_force();
 35 |   }
 36 | 
 37 |   double_ mass_;
 38 |   array_(double, 2) position_;
 39 |   array_(double, 2) velocity_;
 40 |   array_(double, 2) force_;
 41 | 
 42 |   void add_force(Body* body) {
 43 |     if (this == body) return;
 44 | 
 45 |     double EPS = 0.01;    // Softening parameter (just to avoid infinities).
 46 |     double dx = body->position_[0] - position_[0];
 47 |     double dy = body->position_[1] - position_[1];
 48 |     double dist = sqrt(dx*dx + dy*dy);
 49 |     double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS);
 50 |     force_[0] += F*dx / dist;
 51 |     force_[1] += F*dy / dist;
 52 |   }
 53 | 
 54 |   void add_force_to_all() {
 55 |     execute(&Body::add_force, this);
 56 |   }
 57 | 
 58 |   void reset_force() {
 59 |     force_[0] = 0;
 60 |     force_[1] = 0;
 61 |   }
 62 | 
 63 |   void update(double dt) {
 64 |     velocity_[0] += force_[0]*dt / mass_;
 65 |     velocity_[1] += force_[1]*dt / mass_;
 66 |     position_[0] += velocity_[0]*dt;
 67 |     position_[1] += velocity_[1]*dt;
 68 | 
 69 |     // Reflect bodies.
 70 |     if (position_[0] < -1 || position_[0] > 1) velocity_[0] = -velocity_[0];
 71 |     if (position_[1] < -1 || position_[1] > 1) velocity_[1] = -velocity_[1];
 72 |   }
 73 | };
 74 | IKRA_HOST_STORAGE(Body)
 75 | 
 76 | int main() {
 77 |   // Initialize object storage.
 78 |   Body::initialize_storage();
 79 | 
 80 |   // Create objects.
 81 |   for (int i = 0; i < kNumBodies; ++i) {
 82 |     double mass = (RAND/2 + 0.5) * kMaxMass;
 83 |     double pos_x = RAND*2 - 1;
 84 |     double pos_y = RAND*2 - 1;
 85 |     double vel_x = (RAND - 0.5) / 1000;
 86 |     double vel_y = (RAND - 0.5) / 1000;
 87 |     new Body(mass, pos_x, pos_y, vel_x, vel_y);
 88 |   }
 89 | 
 90 |   // Run simulation.
 91 |   for (int i = 0; i < kIterations; ++i) {
 92 |     // Reset forces.
 93 |     execute(&Body::reset_force);
 94 | 
 95 |     // Update forces.
 96 |     execute(&Body::add_force_to_all);
 97 | 
 98 |     // Update velocities and positions.
 99 |     execute(&Body::update, kTimeInterval);
100 |   }
101 |   
102 |   return 0;
103 | }
104 | ```
105 | 
106 | ## Installation
107 | ### Prerequisites
108 | * gcc version 5.4.0 or higher
109 | * For visual examples (n-body simulation): SDL 2.0
110 | * For GPU support: CUDA Toolkit 9.0 or higher
111 | * C++11 compiler support in CPU mode, C++14 compiler support in GPU mode
112 | 
113 | ### Installation via Git
114 | ``` sh
115 | git clone --recursive git@github.com:prg-titech/ikra-cpp.git
116 | cmake CMakeLists.txt
117 | make
118 | # Check if everything is working.
119 | ctest
120 | bin/n_body
121 | ```
122 | 
123 | 


--------------------------------------------------------------------------------
/data/bfs/simple.txt:
--------------------------------------------------------------------------------
 1 | 0 1
 2 | 0 2
 3 | 1 0
 4 | 2 0
 5 | 2 3
 6 | 2 5
 7 | 3 2
 8 | 3 4
 9 | 3 7
10 | 3 8
11 | 4 3
12 | 5 2
13 | 5 8
14 | 5 6
15 | 6 5
16 | 6 7
17 | 7 4
18 | 7 6
19 | 8 3
20 | 8 5
21 | 


--------------------------------------------------------------------------------
/example/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | find_package(SDL2 REQUIRED)
 2 | include_directories(${SDL2_INCLUDE_DIRS})
 3 | string(STRIP "${SDL2_LIBRARIES}" SDL2_LIBRARIES)
 4 | 
 5 | add_executable(particle_simulation particle_simulation.cc)
 6 | target_link_libraries(particle_simulation ikra ${SDL2_LIBRARIES})
 7 | 
 8 | add_executable(n_body n_body.cc)
 9 | target_link_libraries(n_body ikra ${SDL2_LIBRARIES})
10 | 
11 | add_subdirectory(bfs)
12 | 
13 | add_subdirectory(traffic)
14 | 


--------------------------------------------------------------------------------
/example/bfs/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(bfs_const_size_array bfs_const_size_array.cc)
 2 | target_link_libraries(bfs_const_size_array ikra)
 3 | 
 4 | add_executable(bfs_inlined_dynamic_array bfs_inlined_dynamic_array.cc)
 5 | target_link_libraries(bfs_inlined_dynamic_array ikra)
 6 | 
 7 | add_executable(bfs_std_vector bfs_std_vector.cc)
 8 | target_link_libraries(bfs_std_vector ikra)
 9 | 
10 | 
11 | # CUDA
12 | # TODO: cuda_add_library does not support INTERFACE libraries
13 | include_directories(${CMAKE_SOURCE_DIR}/ikra)
14 | cuda_add_executable(bfs_const_size_array_cuda bfs_const_size_array.cu)
15 | target_link_libraries(bfs_const_size_array_cuda ikra)
16 | 


--------------------------------------------------------------------------------
/example/bfs/bfs_const_size_array.cc:
--------------------------------------------------------------------------------
  1 | #include <limits>
  2 | #include <math.h>
  3 | #include <stdint.h>
  4 | 
  5 | #include "bfs_loader.h"
  6 | #include "executor/executor.h"
  7 | #include "soa/soa.h"
  8 | 
  9 | static const int kMaxDegree = 10;
 10 | static const int kMaxVertices = 20000;
 11 | 
 12 | using ikra::soa::IndexType;
 13 | using ikra::soa::SoaLayout;
 14 | using ikra::executor::execute;
 15 | using ikra::executor::execute_and_reduce;
 16 | 
 17 | class Vertex : public SoaLayout<Vertex, kMaxVertices> {
 18 |  public:
 19 |   IKRA_INITIALIZE_CLASS
 20 | 
 21 |   Vertex(const std::vector<IndexType>& neighbors) {
 22 |     // If this check fails, we the dataset cannot be run with this
 23 |     // implementation.
 24 |     assert(neighbors.size() <= kMaxDegree);
 25 |     adj_list_size_ = neighbors.size();
 26 | 
 27 |     for (int i = 0; i < num_neighbors(); ++i) {
 28 |       adj_list_[i] = Vertex::get_uninitialized(neighbors[i]);
 29 |     }
 30 |   }
 31 | 
 32 |   int num_neighbors() {
 33 |     return adj_list_size_;
 34 |   }
 35 | 
 36 |   bool update_distance(int distance) {
 37 |     if (distance < distance_) {
 38 |       distance_ = distance;
 39 |       return true;
 40 |     } else {
 41 |       return false;
 42 |     }
 43 |   }
 44 | 
 45 |   // Visit the vertex, i.e., update the distances of all neighbors if this
 46 |   // vertex is in the frontier, as indicated by the "iteration" field. Returns
 47 |   // "true" if at least one neighbor was updated.
 48 |   bool visit(int iteration) {
 49 |     bool updated = false;
 50 | 
 51 |     if (distance_ == iteration) {
 52 |       for (int i = 0; i < adj_list_size_; ++i) {
 53 |         Vertex* neighbor = adj_list_[i];
 54 |         updated |= neighbor->update_distance(distance_ + 1);
 55 |       }
 56 |     }
 57 | 
 58 |     return updated;
 59 |   }
 60 | 
 61 |   void print_distance() {
 62 |     printf("distance[%lu] = %i\n", id(), (int) distance_);
 63 |   }
 64 | 
 65 |   int_ distance_ = std::numeric_limits<int>::max();
 66 |   int_ adj_list_size_;
 67 | 
 68 |   // A fully inlined SOA array.
 69 |   array_(Vertex*, kMaxDegree, fully_inlined) adj_list_;
 70 | };
 71 | 
 72 | IKRA_HOST_STORAGE(Vertex)
 73 | 
 74 | 
 75 | int run() {
 76 |   int counter = 0;
 77 |   bool running = true;
 78 | 
 79 |   for (int iteration = 0; running; ++iteration) {
 80 |     running = execute_and_reduce(&Vertex::visit,
 81 |                                  [](bool a, bool b) { return a || b; },
 82 |                                  /*default_value=*/ false,
 83 |                                  /*args...=*/ iteration);
 84 |     counter++;
 85 |   }
 86 | 
 87 |   return counter;
 88 | }
 89 | 
 90 | int main(int argc, char* argv[]) {
 91 |   // Load vertices from file.
 92 |   if (argc != 4) {
 93 |     printf("Usage: %s filename num_vertices start_vertex\n", argv[0]);
 94 |     exit(1);
 95 |   }
 96 | 
 97 |   Vertex::initialize_storage();
 98 |   load_file<Vertex>(argv[1], atoi(argv[2]));
 99 | 
100 |   // Set start vertex.
101 |   Vertex::get(atoi(argv[3]))->distance_ = 0;
102 |   // Start algorithm.
103 |   int iterations = run();
104 | 
105 |   // Print results.
106 |   printf("Iterations: %i\n", iterations);
107 |   execute(&Vertex::print_distance);
108 | }
109 | 


--------------------------------------------------------------------------------
/example/bfs/bfs_const_size_array.cu:
--------------------------------------------------------------------------------
  1 | #include <limits>
  2 | #include <math.h>
  3 | #include <stdint.h>
  4 | 
  5 | #include "bfs_loader.h"
  6 | #include "executor/cuda_executor.h"
  7 | #include "executor/executor.h"
  8 | #include "soa/soa.h"
  9 | 
 10 | static const int kMaxDegree = 10;
 11 | static const int kMaxVertices = 20000;
 12 | 
 13 | using ikra::soa::IndexType;
 14 | using ikra::soa::SoaLayout;
 15 | using ikra::executor::execute;
 16 | 
 17 | class Vertex : public SoaLayout<Vertex, kMaxVertices> {
 18 |  public:
 19 |   IKRA_INITIALIZE_CLASS
 20 | 
 21 |   Vertex(const std::vector<IndexType>& neighbors) {
 22 |     // If this check fails, we the dataset cannot be run with this
 23 |     // implementation.
 24 |     assert(neighbors.size() <= kMaxDegree);
 25 |     adj_list_size_ = neighbors.size();
 26 | 
 27 |     for (int i = 0; i < num_neighbors(); ++i) {
 28 |       Vertex* vertex = Vertex::get_uninitialized(neighbors[i]);
 29 |       adj_list_[i] = vertex;
 30 |     }
 31 |   }
 32 | 
 33 |   __host__ __device__ int num_neighbors() {
 34 |     return adj_list_size_;
 35 |   }
 36 | 
 37 |   // Visit the vertex, i.e., update the distances of all neighbors if this
 38 |   // vertex is in the frontier, as indicated by the "iteration" field. Returns
 39 |   // "true" if at least one neighbor was updated.
 40 |   __device__ bool visit(int iteration) {
 41 |     bool updated = false;
 42 | 
 43 |     if (distance_ == iteration) {
 44 |       for (int i = 0; i < num_neighbors(); ++i) {
 45 |         Vertex* neighbor = adj_list_[i];
 46 |         updated |= neighbor->update_distance(distance_ + 1);
 47 |       }
 48 |     }
 49 | 
 50 |     return updated;
 51 |   }
 52 | 
 53 |   void print_distance() {
 54 |     printf("distance[%lu] = %i\n", id(), (int) distance_);
 55 |   }
 56 | 
 57 |   void set_distance(int value) {
 58 |     distance_ = value;
 59 |   }
 60 | 
 61 |   __device__ bool update_distance(int distance) {
 62 |     if (distance < distance_) {
 63 |       distance_ = distance;
 64 |       return true;
 65 |     } else {
 66 |       return false;
 67 |     }
 68 |   }
 69 | 
 70 |   int_ distance_ = std::numeric_limits<int>::max();
 71 |   int_ adj_list_size_;
 72 | 
 73 |   // A fully inlined SOA array.
 74 |   array_(Vertex*, kMaxDegree, fully_inlined) adj_list_;
 75 | };
 76 | 
 77 | IKRA_DEVICE_STORAGE(Vertex)
 78 | 
 79 | 
 80 | int run() {
 81 |   int iteration = 0;
 82 |   bool running = true;
 83 | 
 84 |   while (running) {
 85 |     auto reducer = [](bool a, bool b) { return a || b; };
 86 |     running = cuda_execute_and_reduce(&Vertex::visit,
 87 |                                       reducer,
 88 |                                       iteration);
 89 | 
 90 |     ++iteration;
 91 |   }
 92 | 
 93 |   return iteration;
 94 | }
 95 | 
 96 | int main(int argc, char* argv[]) {
 97 |   // Load vertices from file.
 98 |   if (argc != 4) {
 99 |     printf("Usage: %s filename num_vertices start_vertex\n", argv[0]);
100 |     exit(1);
101 |   }
102 | 
103 |   Vertex::initialize_storage();
104 |   load_file<Vertex>(argv[1], atoi(argv[2]));
105 | 
106 |   // Set start vertex.
107 |   Vertex* start_vertex = Vertex::get(atoi(argv[3]));
108 |   start_vertex->set_distance(0);
109 | 
110 |   // Start algorithm.
111 |   int iterations = run();
112 | 
113 |   // Note: execute is host side, cuda_execute is device side.
114 |   printf("Iterations: %i\n", iterations);
115 |   execute(&Vertex::print_distance);
116 | 
117 |   // Ensure nothing went wrong on the GPU.
118 |   gpuErrchk(cudaPeekAtLastError());
119 | }
120 | 


--------------------------------------------------------------------------------
/example/bfs/bfs_inlined_dynamic_array.cc:
--------------------------------------------------------------------------------
  1 | #include <limits>
  2 | #include <math.h>
  3 | #include <stdint.h>
  4 | 
  5 | #include "bfs_loader.h"
  6 | #include "executor/executor.h"
  7 | #include "soa/soa.h"
  8 | 
  9 | static const int kInlineSize = 2;
 10 | static const int kMaxVertices = 20000;
 11 | static const int kMaxEdges = 100000;
 12 | 
 13 | using ikra::soa::IndexType;
 14 | using ikra::soa::SoaLayout;
 15 | using ikra::soa::StaticStorageWithArena;
 16 | using ikra::soa::kAddressModeZero;
 17 | using ikra::executor::execute;
 18 | using ikra::executor::execute_and_reduce;
 19 | 
 20 | 
 21 | class Vertex : public SoaLayout<
 22 |     Vertex, kMaxVertices, kAddressModeZero,
 23 |     StaticStorageWithArena<kMaxEdges*sizeof(Vertex*)>> {
 24 |  public:
 25 |   IKRA_INITIALIZE_CLASS
 26 | 
 27 |   Vertex(const std::vector<IndexType>& neighbors)
 28 |       : adj_list_(neighbors.size()) {
 29 |     adj_list_size_ = neighbors.size();
 30 |     for (int i = 0; i < num_neighbors(); ++i) {
 31 |       adj_list_[i] = Vertex::get_uninitialized(neighbors[i]);
 32 |     }
 33 |   }
 34 | 
 35 |   int num_neighbors() {
 36 |     return adj_list_size_;
 37 |   }
 38 | 
 39 |   bool update_distance(int distance) {
 40 |     if (distance < distance_) {
 41 |       distance_ = distance;
 42 |       return true;
 43 |     } else {
 44 |       return false;
 45 |     }
 46 |   }
 47 | 
 48 |   // Visit the vertex, i.e., update the distances of all neighbors if this
 49 |   // vertex is in the frontier, as indicated by the "iteration" field. Returns
 50 |   // "true" if at least one neighbor was updated.
 51 |   bool visit(int iteration) {
 52 |     bool updated = false;
 53 | 
 54 |     if (distance_ == iteration) {
 55 |       for (int i = 0; i < num_neighbors(); ++i) {
 56 |         Vertex* neighbor = adj_list_[i];
 57 |         updated |= neighbor->update_distance(distance_ + 1);
 58 |       }
 59 |     }
 60 | 
 61 |     return updated;
 62 |   }
 63 | 
 64 |   void print_distance() {
 65 |     printf("distance[%lu] = %i\n", id(), (int) distance_);
 66 |   }
 67 | 
 68 |   int_ distance_ = std::numeric_limits<int>::max();
 69 |   int_ adj_list_size_;
 70 | 
 71 |   // Some elements are stored in the external storage.
 72 |   array_(Vertex*, kInlineSize, partially_inlined) adj_list_;
 73 | };
 74 | 
 75 | IKRA_HOST_STORAGE(Vertex)
 76 | 
 77 | 
 78 | int run() {
 79 |   int counter = 0;
 80 |   bool running = true;
 81 | 
 82 |   for (int iteration = 0; running; ++iteration) {
 83 |     running = execute_and_reduce(&Vertex::visit,
 84 |                                  [](bool a, bool b) { return a || b; },
 85 |                                  /*default_value=*/ false,
 86 |                                  /*args...=*/ iteration);
 87 |     counter++;
 88 |   }
 89 | 
 90 |   return counter;
 91 | }
 92 | 
 93 | int main(int argc, char* argv[]) {
 94 |   // Load vertices from file.
 95 |   if (argc != 4) {
 96 |     printf("Usage: %s filename num_vertices start_vertex\n", argv[0]);
 97 |     exit(1);
 98 |   }
 99 | 
100 |   Vertex::initialize_storage();
101 |   load_file<Vertex>(argv[1], atoi(argv[2]));
102 | 
103 |   // Set start vertex.
104 |   Vertex::get(atoi(argv[3]))->distance_ = 0;
105 |   // Start algorithm.
106 |   int iterations = run();
107 | 
108 |   // Print results.
109 |   printf("Iterations: %i\n", iterations);
110 |   execute(&Vertex::print_distance);
111 | }
112 | 


--------------------------------------------------------------------------------
/example/bfs/bfs_loader.h:
--------------------------------------------------------------------------------
 1 | #ifndef EXAMPLE_BFS_BFS_LOADER_H
 2 | #define EXAMPLE_BFS_BFS_LOADER_H
 3 | 
 4 | #include <cassert>
 5 | #include <fstream>
 6 | #include <string>
 7 | #include <vector>
 8 | 
 9 | #include "soa/soa.h"
10 | 
11 | using namespace std;
12 | using ikra::soa::IndexType;
13 | 
14 | template<typename VertexT>
15 | void load_file(const char* filename, IndexType num_vertices) {
16 |   ifstream file;
17 |   file.open(filename);
18 | 
19 |   if (!file) {
20 |     printf("Unable to open file: %s\n", filename);
21 |     exit(1);
22 |   }
23 | 
24 |   // Build adj. list until we see a new vertex.
25 |   vector<IndexType> adj_list;
26 |   IndexType last_v_from = 0;
27 |   IndexType v_from, v_to;
28 | 
29 |   while (file >> v_from && file >> v_to) {
30 |     // We assume that the edges are soreted by edge source.
31 |     assert(v_from >= last_v_from);
32 | 
33 |     for (; last_v_from < v_from; ++last_v_from) {
34 |       VertexT* vertex = new VertexT(adj_list);
35 |       assert(vertex->id() == last_v_from);
36 | 
37 |       adj_list.clear();
38 |     }
39 | 
40 |     adj_list.push_back(v_to);
41 |   }
42 | 
43 |   for (; last_v_from < num_vertices; ++last_v_from) {
44 |     VertexT* vertex = new VertexT(adj_list);
45 |     assert(vertex->id() == last_v_from);
46 | 
47 |     adj_list.clear();
48 |   }
49 | }
50 | 
51 | #endif  // EXAMPLE_BFS_BFS_LOADER_H
52 | 


--------------------------------------------------------------------------------
/example/bfs/bfs_std_vector.cc:
--------------------------------------------------------------------------------
  1 | #include <limits>
  2 | #include <math.h>
  3 | #include <stdint.h>
  4 | #include <vector>
  5 | 
  6 | #include "bfs_loader.h"
  7 | #include "executor/executor.h"
  8 | #include "soa/soa.h"
  9 | 
 10 | static const int kMaxVertices = 20000;
 11 | 
 12 | using ikra::soa::IndexType;
 13 | using ikra::soa::SoaLayout;
 14 | using ikra::executor::execute;
 15 | using ikra::executor::execute_and_reduce;
 16 | 
 17 | class Vertex : public SoaLayout<Vertex, kMaxVertices> {
 18 |  public:
 19 |   IKRA_INITIALIZE_CLASS
 20 | 
 21 |   Vertex(const std::vector<IndexType>& neighbors) {
 22 |     adj_list_size_ = neighbors.size();
 23 | 
 24 |     for (int i = 0; i < num_neighbors(); ++i) {
 25 |       adj_list_->push_back(Vertex::get_uninitialized(neighbors[i]));
 26 |     }
 27 |   }
 28 | 
 29 |   int num_neighbors() {
 30 |     return adj_list_size_;
 31 |   }
 32 | 
 33 |   bool update_distance(int distance) {
 34 |     if (distance < distance_) {
 35 |       distance_ = distance;
 36 |       return true;
 37 |     } else {
 38 |       return false;
 39 |     }
 40 |   }
 41 | 
 42 |   // Visit the vertex, i.e., update the distances of all neighbors if this
 43 |   // vertex is in the frontier, as indicated by the "iteration" field. Returns
 44 |   // "true" if at least one neighbor was updated.
 45 |   bool visit(int iteration) {
 46 |     bool updated = false;
 47 | 
 48 |     if (distance_ == iteration) {
 49 |       for (int i = 0; i < adj_list_size_; ++i) {
 50 |         Vertex* neighbor = (*adj_list_)[i]; //->operator[](i);
 51 |         updated |= neighbor->update_distance(distance_ + 1);
 52 |       }
 53 |     }
 54 | 
 55 |     return updated;
 56 |   }
 57 | 
 58 |   void print_distance() {
 59 |     printf("distance[%lu] = %i\n", id(), (int) distance_);
 60 |   }
 61 | 
 62 |   int_ distance_ = std::numeric_limits<int>::max();
 63 |   int_ adj_list_size_;
 64 |   field_(std::vector<Vertex*>) adj_list_;
 65 | };
 66 | 
 67 | IKRA_HOST_STORAGE(Vertex)
 68 | 
 69 | 
 70 | int run() {
 71 |   int counter = 0;
 72 |   bool running = true;
 73 | 
 74 |   for (int iteration = 0; running; ++iteration) {
 75 |     running = execute_and_reduce(&Vertex::visit,
 76 |                                  [](bool a, bool b) { return a || b; },
 77 |                                  /*default_value=*/ false,
 78 |                                  /*args...=*/ iteration);
 79 |     counter++;
 80 |   }
 81 | 
 82 |   return counter;
 83 | }
 84 | 
 85 | int main(int argc, char* argv[]) {
 86 |   // Ensure that the std::vector is layouted correctly.
 87 |   static_assert(
 88 |       Vertex::ObjectSize::value == sizeof(std::vector<void*>) + sizeof(int)*2,
 89 |       "Expected different SOA object size.");
 90 | 
 91 |   // Load vertices from file.
 92 |   if (argc != 4) {
 93 |     printf("Usage: %s filename num_vertices start_vertex\n", argv[0]);
 94 |     exit(1);
 95 |   }
 96 | 
 97 |   Vertex::initialize_storage();
 98 |   load_file<Vertex>(argv[1], atoi(argv[2]));
 99 | 
100 |   // Set start vertex.
101 |   Vertex::get(atoi(argv[3]))->distance_ = 0;
102 |   // Start algorithm.
103 |   int iterations = run();
104 | 
105 |   // Print results.
106 |   printf("Iterations: %i\n", iterations);
107 |   execute(&Vertex::print_distance);
108 | }
109 | 


--------------------------------------------------------------------------------
/example/n_body.cc:
--------------------------------------------------------------------------------
  1 | // N-Body Simulation
  2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm
  3 | 
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | #include <cmath>
  7 | 
  8 | #include "SDL2/SDL.h"
  9 | 
 10 | #include "executor/executor.h"
 11 | #include "soa/soa.h"
 12 | 
 13 | using namespace std;
 14 | using ikra::executor::execute;
 15 | using ikra::executor::Iterator;
 16 | using ikra::soa::SoaLayout;
 17 | 
 18 | static const int kNumBodies = 25;
 19 | static const double kMaxMass = 1000;
 20 | static const double kTimeInterval = 0.5;
 21 | 
 22 | static const double kGravityConstant = 6.673e-11;   // gravitational constant
 23 | 
 24 | static const int kWindowWidth = 1000;
 25 | static const int kWindowHeight = 1000;
 26 | static const int kMaxRect = 20;
 27 | 
 28 | #define RAND (1.0 * rand() / RAND_MAX)
 29 | 
 30 | static void render_rect(SDL_Renderer* renderer, double x, double y, double mass) {
 31 |   SDL_Rect rect;
 32 |   rect.w = rect.h = mass / kMaxMass * kMaxRect;
 33 |   rect.x = (x/2 + 0.5) * kWindowWidth - rect.w/2;
 34 |   rect.y = (y/2 + 0.5) * kWindowHeight - rect.h/2;
 35 |   SDL_RenderDrawRect(renderer, &rect);
 36 | }
 37 | 
 38 | class Body : public SoaLayout<Body, kNumBodies> {
 39 |  public:
 40 |   IKRA_INITIALIZE_CLASS
 41 | 
 42 |   Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y)
 43 |       : mass_(mass) {
 44 |     position_[0] = pos_x;
 45 |     position_[1] = pos_y;
 46 |     velocity_[0] = vel_x;
 47 |     velocity_[1] = vel_y;
 48 |     this->reset_force();
 49 |   }
 50 | 
 51 |   double_ mass_;
 52 |   array_(double, 2, fully_inlined) position_;
 53 |   array_(double, 2, fully_inlined) velocity_;
 54 |   array_(double, 2, fully_inlined) force_;
 55 | 
 56 |   void add_force(Body* body) {
 57 |     if (this == body) return;
 58 | 
 59 |     double EPS = 0.01;    // Softening parameter (just to avoid infinities).
 60 |     double dx = body->position_[0] - position_[0];
 61 |     double dy = body->position_[1] - position_[1];
 62 |     double dist = sqrt(dx*dx + dy*dy);
 63 |     double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS);
 64 |     force_[0] += F*dx / dist;
 65 |     force_[1] += F*dy / dist;
 66 |   }
 67 | 
 68 |   void add_force_to_all() {
 69 |     execute(&Body::add_force, this);
 70 |   }
 71 | 
 72 |   void reset_force() {
 73 |     force_[0] = 0;
 74 |     force_[1] = 0;
 75 |   }
 76 | 
 77 |   void update(double dt) {
 78 |     velocity_[0] += force_[0]*dt / mass_;
 79 |     velocity_[1] += force_[1]*dt / mass_;
 80 |     position_[0] += velocity_[0]*dt;
 81 |     position_[1] += velocity_[1]*dt;
 82 | 
 83 |     if (position_[0] < -1 || position_[0] > 1) {
 84 |       velocity_[0] = -velocity_[0];
 85 |     }
 86 |     if (position_[1] < -1 || position_[1] > 1) {
 87 |       velocity_[1] = -velocity_[1];
 88 |     }
 89 |   }
 90 | 
 91 |   void draw(SDL_Renderer* renderer) {
 92 |     render_rect(renderer, position_[0], position_[1], mass_);
 93 |   }
 94 | };
 95 | 
 96 | IKRA_HOST_STORAGE(Body)
 97 | 
 98 | 
 99 | int main() {
100 |   if (SDL_Init(SDL_INIT_VIDEO) != 0) {
101 |     printf("Could not initialize SDL!\n");
102 |     exit(1);
103 |   }
104 | 
105 |   SDL_Window* window = nullptr;
106 |   SDL_Renderer* renderer = nullptr;
107 | 
108 |   if (SDL_CreateWindowAndRenderer(kWindowWidth, kWindowHeight, 0,
109 |         &window, &renderer) != 0) {
110 |     printf("Could not create window/render!\n");
111 |     exit(1);
112 |   }
113 | 
114 |   // Initialize object storage.
115 |   Body::initialize_storage();
116 | 
117 |   // Create objects.
118 |   for (int i = 0; i < kNumBodies; ++i) {
119 |     double mass = (RAND/2 + 0.5) * kMaxMass;
120 |     double pos_x = RAND*2 - 1;
121 |     double pos_y = RAND*2 - 1;
122 |     double vel_x = (RAND - 0.5) / 1000;
123 |     double vel_y = (RAND - 0.5) / 1000;
124 |     new Body(mass, pos_x, pos_y, vel_x, vel_y);
125 |   }
126 | 
127 |   SDL_bool done = SDL_FALSE;
128 |   while (!done) {
129 |     // Reset forces.
130 |     execute(&Body::reset_force);
131 | 
132 |     // Update forces.
133 |     execute(&Body::add_force_to_all);
134 | 
135 |     // Update velocities and positions.
136 |     execute(&Body::update, kTimeInterval);
137 | 
138 |     // Draw scene.
139 |     SDL_SetRenderDrawColor(renderer, 255, 255, 255, SDL_ALPHA_OPAQUE);
140 |     SDL_RenderClear(renderer);
141 |     SDL_SetRenderDrawColor(renderer, 0, 0, 0, SDL_ALPHA_OPAQUE);
142 |     execute(&Body::draw, renderer);
143 |     SDL_RenderPresent(renderer);
144 |     // done = SDL_TRUE;
145 | 
146 |     // Wait for user to close the window.
147 |     SDL_Event event;
148 |     while (SDL_PollEvent(&event)) {
149 |       if (event.type == SDL_QUIT) {
150 |         done = SDL_TRUE;
151 |       }
152 |     }
153 |   }
154 | 
155 |   SDL_DestroyRenderer(renderer);
156 |   SDL_DestroyWindow(window);
157 |   SDL_Quit();
158 |   return 0;
159 | }
160 | 


--------------------------------------------------------------------------------
/example/particle_simulation.cc:
--------------------------------------------------------------------------------
  1 | // Particle-Well Simulation
  2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm
  3 | 
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | #include <cmath>
  7 | 
  8 | #include "SDL2/SDL.h"
  9 | 
 10 | #include "executor/executor.h"
 11 | #include "soa/soa.h"
 12 | 
 13 | using namespace std;
 14 | using ikra::executor::execute;
 15 | using ikra::executor::Iterator;
 16 | using ikra::soa::SoaLayout;
 17 | 
 18 | static const int kNumWells = 1;
 19 | static const int kNumParticles = 20;
 20 | static const double kTimeInterval = 1500;
 21 | 
 22 | static const double kGravityConstant = 6.673e-11;   // gravitational constant
 23 | static const double kSolarMass = 1.98892e30;
 24 | static const double kEarthMass = 5.972e24;
 25 | static const double kEarthSunDistance = 149600000000;
 26 | static const double kEarthVelocity = 30000;
 27 | 
 28 | static const int kWindowWidth = 1000;
 29 | static const int kWindowHeight = 1000;
 30 | static const double kScalingFactor = 2.5e-9;
 31 | 
 32 | // Draw a rectangle with equal height/width at a given position.
 33 | static void render_rect(SDL_Renderer* renderer, double x, double y, int side) {
 34 |   SDL_Rect rect;
 35 |   rect.x = x*kScalingFactor + kWindowWidth/2 - side/2;
 36 |   rect.y = y*kScalingFactor + kWindowHeight/2 - side/2;
 37 |   rect.w = side;
 38 |   rect.h = side;
 39 |   SDL_RenderDrawRect(renderer, &rect);
 40 | }
 41 | 
 42 | class Particle;
 43 | 
 44 | class Well : public SoaLayout<Well, kNumWells> {
 45 |  public:
 46 |   IKRA_INITIALIZE_CLASS
 47 | 
 48 |   Well(double mass, double pos_x, double pos_y) : mass_(mass) {
 49 |     position_[0] = pos_x;
 50 |     position_[1] = pos_y;
 51 |   }
 52 | 
 53 |   void add_force();
 54 | 
 55 |   void draw(SDL_Renderer* renderer) {
 56 |     render_rect(renderer, position_[0], position_[1], 50);
 57 |     render_rect(renderer, position_[0], position_[1], 60);
 58 |   }
 59 | 
 60 |   double_ mass_;
 61 |   array_(double, 2, fully_inlined) position_;
 62 | };
 63 | 
 64 | IKRA_HOST_STORAGE(Well)
 65 | 
 66 | 
 67 | class Particle : public SoaLayout<Particle, kNumParticles> {
 68 |  public:
 69 |   IKRA_INITIALIZE_CLASS
 70 | 
 71 |   Particle(double mass, double pos_x, double pos_y, double vel_x, double vel_y)
 72 |       : mass_(mass) {
 73 |     position_[0] = pos_x;
 74 |     position_[1] = pos_y;
 75 |     velocity_[0] = vel_x;
 76 |     velocity_[1] = vel_y;
 77 |     this->reset_force();
 78 |   }
 79 | 
 80 |   double_ mass_;
 81 |   array_(double, 2, fully_inlined) position_;
 82 |   array_(double, 2, fully_inlined) velocity_;
 83 |   array_(double, 2, fully_inlined) force_;
 84 | 
 85 |   void add_force(Well* well) {
 86 |     double EPS = 3e4;    // Softening parameter (just to avoid infinities).
 87 |     double dx = well->position_[0] - position_[0];
 88 |     double dy = well->position_[1] - position_[1];
 89 |     double dist = sqrt(dx*dx + dy*dy);
 90 |     double F = kGravityConstant*mass_*well->mass_ / (dist*dist + EPS*EPS);
 91 |     force_[0] += F*dx / dist;
 92 |     force_[1] += F*dy / dist;
 93 |   }
 94 | 
 95 |   void reset_force() {
 96 |     force_[0] = 0.0f;
 97 |     force_[1] = 0.0f;
 98 |   }
 99 | 
100 |   void update(double dt) {
101 |     velocity_[0] += force_[0]*dt / mass_;
102 |     velocity_[1] += force_[1]*dt / mass_;
103 |     position_[0] += velocity_[0]*dt;
104 |     position_[1] += velocity_[1]*dt;
105 |   }
106 | 
107 |   void draw(SDL_Renderer* renderer) {
108 |     render_rect(renderer, position_[0], position_[1], 30);
109 |   }
110 | };
111 | 
112 | IKRA_HOST_STORAGE(Particle)
113 | 
114 | 
115 | // Defined here because definition depends on Particle.
116 | void Well::add_force() {
117 |   // Add force induced by this well to all particles.
118 |   execute(&Particle::add_force, this);
119 | }
120 | 
121 | 
122 | int main() {
123 |   if (SDL_Init(SDL_INIT_VIDEO) != 0) {
124 |     printf("Could not initialize SDL!\n");
125 |     exit(1);
126 |   }
127 | 
128 |   SDL_Window* window = nullptr;
129 |   SDL_Renderer* renderer = nullptr;
130 | 
131 |   if (SDL_CreateWindowAndRenderer(kWindowWidth, kWindowHeight, 0,
132 |         &window, &renderer) != 0) {
133 |     printf("Could not create window/render!\n");
134 |     exit(1);
135 |   }
136 | 
137 |   // Initialize object storage.
138 |   Particle::initialize_storage();
139 |   Well::initialize_storage();
140 | 
141 |   // Create objects.
142 |   for (int i = 0; i < kNumWells; ++i) {
143 |     new Well(kSolarMass, 0, 0);
144 |   }
145 | 
146 |   for (int i = 0; i < kNumParticles; ++i) {
147 |     new Particle(kEarthMass, kEarthSunDistance, 0, 0, kEarthVelocity);
148 |   }
149 | 
150 |   SDL_bool done = SDL_FALSE;
151 |   while (!done) {
152 |     // Reset forces.
153 |     execute(&Particle::reset_force);
154 | 
155 |     // Update forces.
156 |     execute(&Well::add_force);
157 | 
158 |     // Update velocities and positions.
159 |     execute(&Particle::update, kTimeInterval);
160 | 
161 |     // Draw scene.
162 |     SDL_SetRenderDrawColor(renderer, 255, 255, 255, SDL_ALPHA_OPAQUE);
163 |     SDL_RenderClear(renderer);
164 |     SDL_SetRenderDrawColor(renderer, 0, 0, 0, SDL_ALPHA_OPAQUE);
165 |     execute(&Well::draw, renderer);
166 |     execute(&Particle::draw, renderer);
167 |     SDL_RenderPresent(renderer);
168 | 
169 |     // Wait for user to close the window.
170 |     SDL_Event event;
171 |     while (SDL_PollEvent(&event)) {
172 |       if (event.type == SDL_QUIT) {
173 |         done = SDL_TRUE;
174 |       }
175 |     }
176 |   }
177 | 
178 |   SDL_DestroyRenderer(renderer);
179 |   SDL_DestroyWindow(window);
180 |   SDL_Quit();
181 |   return 0;
182 | }
183 | 
184 | int codegen_test(Particle* p) {
185 |   return p->mass_;
186 | }
187 | 


--------------------------------------------------------------------------------
/example/traffic/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(BENCH_CELL_IN_MODE "--" CACHE STRING "")
 2 | ADD_DEFINITIONS(-DBENCH_CELL_IN_MODE=${BENCH_CELL_IN_MODE})
 3 | 
 4 | set(BENCH_CELL_OUT_MODE "--" CACHE STRING "")
 5 | ADD_DEFINITIONS(-DBENCH_CELL_OUT_MODE=${BENCH_CELL_OUT_MODE})
 6 | 
 7 | set(BENCH_CAR_MODE "--" CACHE STRING "")
 8 | ADD_DEFINITIONS(-DBENCH_CAR_MODE=${BENCH_CAR_MODE})
 9 | 
10 | set(BENCH_SIGNAL_GROUP_MODE "--" CACHE STRING "")
11 | ADD_DEFINITIONS(-DBENCH_SIGNAL_GROUP_MODE=${BENCH_SIGNAL_GROUP_MODE})
12 | 
13 | set(BENCH_TRAFFIC_LIGHT_MODE "--" CACHE STRING "")
14 | ADD_DEFINITIONS(-DBENCH_TRAFFIC_LIGHT_MODE=${BENCH_TRAFFIC_LIGHT_MODE})
15 | 
16 | set(BENCH_LAYOUT_MODE "--" CACHE STRING "")
17 | ADD_DEFINITIONS(-DBENCH_LAYOUT_MODE=${BENCH_LAYOUT_MODE})
18 | 
19 | set(BENCHMARK_MODE "0" CACHE STRING "")
20 | ADD_DEFINITIONS(-DBENCHMARK_MODE=${BENCHMARK_MODE})
21 | 
22 | 
23 | # CUDA
24 | # TODO: cuda_add_library does not support INTERFACE libraries
25 | include_directories(${CMAKE_SOURCE_DIR}/ikra)
26 | cuda_add_executable(traffic traffic.cu simulation_adapter.cu traffic-simulation/graphml_network_builder.cc traffic-simulation/traffic.cc traffic-simulation/simple_network_builder.cc traffic-simulation/drawing.cc traffic-simulation/traffic_aos_int_cuda.cu traffic-simulation/traffic_aos_int.cc)
27 | target_link_libraries(traffic ikra SDL2 SDL2_gfx)
28 | 


--------------------------------------------------------------------------------
/example/traffic/benchmark.h:
--------------------------------------------------------------------------------
 1 | #ifndef EXAMPLE_TRAFFIC_BENCHMARK_H
 2 | #define EXAMPLE_TRAFFIC_BENCHMARK_H
 3 | 
 4 | #include <chrono>
 5 | 
 6 | template<typename TimeT = std::chrono::microseconds>
 7 | struct measure
 8 | {
 9 |     template<typename F, typename ...Args>
10 |     static typename TimeT::rep execution(F&& func, Args&&... args)
11 |     {
12 |         auto start = std::chrono::steady_clock::now();
13 |         std::forward<decltype(func)>(func)(std::forward<Args>(args)...);
14 |         auto duration = std::chrono::duration_cast< TimeT> 
15 |                             (std::chrono::steady_clock::now() - start);
16 |         return duration.count();
17 |     }
18 | };
19 | 
20 | #endif  // EXAMPLE_TRAFFIC_BENCHMARK_H
21 | 


--------------------------------------------------------------------------------
/example/traffic/configuration.h:
--------------------------------------------------------------------------------
  1 | #ifndef EXAMPLE_TRAFFIC_CONFIGURATION_H
  2 | #define EXAMPLE_TRAFFIC_CONFIGURATION_H
  3 | 
  4 | using ikra::soa::kLayoutModeSoa;
  5 | using ikra::soa::kLayoutModeAos;
  6 | 
  7 | class Cell;
  8 | class SharedSignalGroup;
  9 | 
 10 | // Simulation logic.
 11 | #define SMART_TRAFFIC_LIGHT
 12 | #define MOVE_ONTO_LARGE_STREETS
 13 | 
 14 | // Statistics for benchmark.
 15 | static const uint32_t kNumIterations = 1000;
 16 | static const uint32_t kNumBenchmarkRuns = 5;
 17 | 
 18 | // Number of objects.
 19 | #define LARGE_DATASET
 20 | #ifdef LARGE_DATASET
 21 | // Adapted for dataset: OSM-urbanized, Denver_Aurora_CO
 22 | static const uint32_t kNumCells = 8339802;
 23 | const uint32_t kNumCars = 229376;
 24 | static const uint32_t kNumSharedSignalGroups = 205129;
 25 | static const uint32_t kNumTrafficLights = 62819;
 26 | static const uint32_t kNumPriorityCtrls = 4792;
 27 | #else
 28 | // Small dataset can run on my laptop.
 29 | static const uint32_t kNumCells = 950778;
 30 | const uint32_t kNumCars = 50000;
 31 | static const uint32_t kNumSharedSignalGroups = 344418;
 32 | static const uint32_t kNumTrafficLights = 8000;
 33 | static const uint32_t kNumPriorityCtrls = 1000;
 34 | #endif  // LARGE
 35 | 
 36 | // Maximum array sizes.
 37 | static const uint32_t kMaxNumCellIncoming = 8;
 38 | static const uint32_t kMaxNumCellOutgoing = 8;
 39 | static const uint32_t kMaxNumCarPath = 30;
 40 | static const uint32_t kMaxNumSharedSignalGroupCells = 8;
 41 | static const uint32_t kMaxNumTrafficLightGroups = 6;
 42 | 
 43 | // Manual padding for AOS mode.
 44 | static const uint32_t kCellAlignmentPadding = 2;
 45 | static const uint32_t kCarAlignmentPadding = 7;
 46 | static const uint32_t kSharedSignalGroupAlignmentPadding = 4;
 47 | static const uint32_t kPriorityCtrlAlignmentPadding = 4;
 48 | // No padding required for traffic lights.
 49 | // static const uint32_t kTrafficLightAlignmentPadding = 0;
 50 | 
 51 | 
 52 | #if BENCHMARK_MODE != 1
 53 | // Dummy configuration
 54 | #define BENCH_CELL_IN_MODE -1
 55 | #define BENCH_CELL_OUT_MODE -1
 56 | #define BENCH_CAR_MODE -1
 57 | #define BENCH_SIGNAL_GROUP_MODE -1
 58 | #define BENCH_TRAFFIC_LIGHT_MODE -1
 59 | #define BENCH_LAYOUT_MODE kLayoutModeSoa
 60 | #endif
 61 | 
 62 | // Layout mode: SOA or AOS.
 63 | static const int kCellLayoutMode = BENCH_LAYOUT_MODE;
 64 | static const int kCarLayoutMode = BENCH_LAYOUT_MODE;
 65 | static const int kSharedSignalGroupLayoutMode = BENCH_LAYOUT_MODE;
 66 | static const int kPriorityCtrlLayoutMode = BENCH_LAYOUT_MODE;
 67 | static const int kTrafficLightLayoutMode = BENCH_LAYOUT_MODE;
 68 | 
 69 | // Array configurations.
 70 | #if BENCH_CELL_IN_MODE == -1
 71 | #define ARRAY_CELL_INCOMING \
 72 |     array_(Cell*, kMaxNumCellIncoming, object)
 73 | #elif BENCH_CELL_IN_MODE == -2
 74 | #define ARRAY_CELL_INCOMING \
 75 |     array_(Cell*, kMaxNumCellIncoming, fully_inlined)
 76 | #elif !defined(BENCH_CELL_IN_MODE)
 77 |     #error BENCH_CELL_IN_MODE is undefined
 78 | #else
 79 | static const uint32_t kArrInlineIncomingCells = BENCH_CELL_IN_MODE;   // 4
 80 | #define ARRAY_CELL_INCOMING \
 81 |     array_(Cell*, kArrInlineIncomingCells, partially_inlined)
 82 | #define ARRAY_CELL_IN_IS_PARTIAL
 83 | #endif
 84 | 
 85 | #if BENCH_CELL_OUT_MODE == -1
 86 | #define ARRAY_CELL_OUTGOING \
 87 |     array_(Cell*, kMaxNumCellOutgoing, object)
 88 | #elif BENCH_CELL_OUT_MODE == -2
 89 | #define ARRAY_CELL_OUTGOING \
 90 |     array_(Cell*, kMaxNumCellOutgoing, fully_inlined)
 91 | #elif !defined(BENCH_CELL_OUT_MODE)
 92 |     #error BENCH_CELL_OUT_MODE is undefined
 93 | #else
 94 | static const uint32_t kArrInlineOutgoingCells = BENCH_CELL_OUT_MODE;   // 4
 95 | #define ARRAY_CELL_OUTGOING \
 96 |     array_(Cell*, kArrInlineOutgoingCells, partially_inlined)
 97 | #define ARRAY_CELL_OUT_IS_PARTIAL
 98 | #endif
 99 | 
100 | #if BENCH_CAR_MODE == -1
101 | #define ARRAY_CAR_PATH \
102 |     array_(Cell*, kMaxNumCarPath, object)
103 | #elif BENCH_CAR_MODE == -2
104 | #define ARRAY_CAR_PATH \
105 |     array_(Cell*, kMaxNumCarPath, fully_inlined)
106 | #elif !defined(BENCH_CAR_MODE)
107 |     #error BENCH_CAR_MODE is undefined
108 | #else
109 | static const uint32_t kArrInlinePath = BENCH_CAR_MODE;   // 5
110 | #define ARRAY_CAR_PATH \
111 |     array_(Cell*, kArrInlinePath, partially_inlined)
112 | #define ARRAY_CAR_IS_PARTIAL
113 | #endif
114 | 
115 | #if BENCH_SIGNAL_GROUP_MODE == -1
116 | #define ARRAY_SIGNAL_GROUP_CELLS \
117 |     array_(Cell*, kMaxNumSharedSignalGroupCells, object)
118 | #elif BENCH_SIGNAL_GROUP_MODE == -2
119 | #define ARRAY_SIGNAL_GROUP_CELLS \
120 |     array_(Cell*, kMaxNumSharedSignalGroupCells, fully_inlined)
121 | #elif !defined(BENCH_SIGNAL_GROUP_MODE)
122 |     #error BENCH_SIGNAL_GROUP_MODE is undefined
123 | #else
124 | static const uint32_t kArrInlineSignalGroupCells =
125 |     BENCH_SIGNAL_GROUP_MODE;    // 4
126 | #define ARRAY_SIGNAL_GROUP_CELLS \
127 |     array_(Cell*, kArrInlineSignalGroupCells, partially_inlined)
128 | #define ARRAY_SIGNAL_GROUP_IS_PARTIAL
129 | #endif
130 | 
131 | #if BENCH_TRAFFIC_LIGHT_MODE == -1
132 | #define ARRAY_TRAFFIC_LIGHT_SIGNAL_GROUPS \
133 |     array_(SharedSignalGroup*, kMaxNumTrafficLightGroups, object)
134 | #elif BENCH_TRAFFIC_LIGHT_MODE == -2
135 | #define ARRAY_TRAFFIC_LIGHT_SIGNAL_GROUPS \
136 |     array_(SharedSignalGroup*, kMaxNumTrafficLightGroups, fully_inlined)
137 | #elif !defined(BENCH_TRAFFIC_LIGHT_MODE)
138 |     #error BENCH_TRAFFIC_LIGHT_MODE is undefined
139 | #else
140 | static const uint32_t kArrInlineTrafficLightSignalGroups =
141 |     BENCH_TRAFFIC_LIGHT_MODE;    // 4
142 | #define ARRAY_TRAFFIC_LIGHT_SIGNAL_GROUPS \
143 |     array_(SharedSignalGroup*, kArrInlineTrafficLightSignalGroups, \
144 |            partially_inlined)
145 | #define ARRAY_TRAFFIC_LIGHT_IS_PARTIAL
146 | #endif
147 | 
148 | // Sizes of arena.
149 | // Worse case: avg = 1.05 each, so share 3 slots/instance.
150 | static const uint32_t kCellArenaSize = kNumCells*3*sizeof(Cell*);
151 | // Worse case: avg is around 20.
152 | static const uint32_t kCarArenaSize = kNumCars*21*sizeof(Cell*);
153 | static const uint32_t kSharedSignalGroupArenaSize =
154 |     kNumSharedSignalGroups*4*sizeof(Cell*);
155 | // Worse case: ~40% have 4 signal groups or more.
156 | static const uint32_t kTrafficLightArenaSize =
157 |     kNumTrafficLights*3*sizeof(SharedSignalGroup*);
158 | static const uint32_t kPriorityCtrlArenaSize = 0;
159 | 
160 | // Types of other arrays.
161 | // Priority controller have always two signal groups.
162 | #define ARRAY_PRIORITY_CTRL_SIGNAL_GROUPS \
163 |     array_(SharedSignalGroup*, 2, fully_inlined)
164 | 
165 | #endif  // EXAMPLE_TRAFFIC_CONFIGURATION_H
166 | 


--------------------------------------------------------------------------------
/example/traffic/entities/car.inc:
--------------------------------------------------------------------------------
  1 | class Car : public SoaLayout<Car, kNumCars, kAddressModeZero,
  2 |                              StaticStorageWithArena<kCarArenaSize>,
  3 |                              kCarLayoutMode> {
  4 |  public:
  5 |   IKRA_INITIALIZE_CLASS
  6 | 
  7 |   __device__ __host__ Car(bool is_active, uint32_t velocity,
  8 |                           uint32_t max_velocity, uint32_t random_state,
  9 |                           Cell* position)
 10 |       : is_active_(is_active), velocity_(velocity), path_length_(0),
 11 | #ifdef ARRAY_CAR_IS_PARTIAL
 12 |         path_(max_velocity),
 13 | #endif  // ARRAY_CAR_IS_PARTIAL
 14 |         max_velocity_(max_velocity), random_state_(random_state),
 15 |         position_(position) {
 16 |     assert(is_active);
 17 |   }
 18 | 
 19 |   // The velocity of the car in cells/iteration.
 20 |   // [ OFFSET = 0 ]
 21 |   uint32_t_ velocity_;
 22 | 
 23 |   // The max. possible velocity of this car.
 24 |   // [ OFFSET = 4 ]
 25 |   uint32_t_ max_velocity_;
 26 | 
 27 |   // The lengths of the precalculated path.
 28 |   // [ OFFSET = 8 ]
 29 |   uint32_t_ path_length_;
 30 | 
 31 |   // Every car has a random state to allow for reproducible results.
 32 |   // [ OFFSET = 12 ]
 33 |   uint32_t_ random_state_;
 34 | 
 35 |   // An array of cells that the car will move onto next.
 36 |   // [ OFFSET = 16 ]
 37 |   ARRAY_CAR_PATH path_;
 38 | 
 39 |   // The current position of the car.
 40 |   // [ OFFSET = 16 + k*PTR_SIZE ]
 41 |   field_(Cell*) position_;
 42 | 
 43 |   // If a car enters a sink, it is removed from the simulation (inactive)
 44 |   // for a short time.
 45 |   // [ OFFSET = 16 + k*PTR_SIZE ]
 46 |   bool_ is_active_;
 47 | 
 48 |   // Add dummy field to make sure that objects are properly aligned in AOS.
 49 |   // [ OFFSET = 17 + k*PTR_SIZE ]
 50 |   array_(char, kCarAlignmentPadding, object) dummy_alignment_;
 51 | 
 52 |   __device__ uint32_t velocity() const { return velocity_; }
 53 | 
 54 |   __device__ uint32_t max_velocity() const { return max_velocity_; }
 55 | 
 56 |   __device__ Cell* position() const { return position_; }
 57 | 
 58 |   __device__ bool is_active() const { return is_active_; }
 59 | 
 60 |   __device__ uint32_t rand32() {
 61 |     // Advance and return random state.
 62 |     // Source: https://en.wikipedia.org/wiki/Lehmer_random_number_generator
 63 |     random_state_ = static_cast<uint32_t>(
 64 |         static_cast<uint64_t>(random_state()) * 279470273u) % 0xfffffffb;
 65 |     return random_state_;
 66 |   }
 67 | 
 68 |   __device__ uint32_t rand32(uint32_t max_value) {
 69 |     return rand32() % max_value;
 70 |   }
 71 | 
 72 |   __device__ uint32_t random_state() const {
 73 |     return random_state_;
 74 |   }
 75 | 
 76 |   __device__ void step_prepare_path() {
 77 |     if (is_active()) {
 78 |       step_initialize_iteration();
 79 |       step_accelerate();
 80 |       step_extend_path();
 81 |       step_constraint_velocity();
 82 |       step_slow_down();
 83 |     }
 84 |   }
 85 | 
 86 |   __device__ Cell* next_step(Cell* cell);
 87 | 
 88 |   __device__ void step_initialize_iteration();
 89 | 
 90 |   __device__ void step_accelerate();
 91 | 
 92 |   __device__ void step_extend_path();
 93 | 
 94 |   __device__ void step_constraint_velocity();
 95 | 
 96 |   __device__ void step_move();
 97 | 
 98 |   __device__ void step_reactivate();
 99 | 
100 |   __device__ void step_slow_down() {
101 |     if (rand32(1000) < 200 && velocity_ > 0) {
102 |       velocity_ = velocity_ - 1;
103 |     }
104 |   }
105 | };
106 | 
107 | IKRA_DEVICE_STORAGE(Car);
108 | 
109 | 
110 | __device__ Cell* Car::next_step(Cell* position) {
111 |   // Almost random walk.
112 |   const uint32_t num_outgoing = position->num_outgoing_cells();
113 |   assert(num_outgoing > 0);
114 | 
115 | #ifdef MOVE_ONTO_LARGE_STREETS
116 |   // Collect probabilities of outgoing cells.
117 |   int prob_sum = 0;
118 |   for (int i = 0; i < num_outgoing; ++i) {
119 |     Cell* candidate = position->outgoing_cell(i);
120 | 
121 |     // More likely to drive onto larger roads.
122 |     int difference = candidate->type() - position->type() + Cell::kMaxType;
123 |     prob_sum += 100*difference;
124 |   }
125 | 
126 |   // Now choose a path randomly.
127 |   int rand_select = rand32(prob_sum);
128 |   int threshold = 0;
129 |   for (int i = 0; i < num_outgoing; ++i) {
130 |     Cell* candidate = position->outgoing_cell(i);
131 | 
132 |     // More likely to drive onto larger roads.
133 |     int difference = candidate->type() - position->type() + Cell::kMaxType;
134 |     threshold += 100*difference;
135 | 
136 |     if (rand_select < threshold) {
137 |       assert(candidate != position);
138 |       return candidate;
139 |     }
140 |   }
141 | 
142 |   // We should never end up here.
143 |   assert(false);
144 | #endif  // MOVE_ONTO_LARGE_STREETS
145 | 
146 |   // Need some kind of return statement here.
147 |   return position->outgoing_cell(rand32(num_outgoing));
148 | }
149 | 
150 | 
151 | __device__ void Car::step_initialize_iteration() {
152 |   // Reset calculated path. This forces cars with a random moving behavior to
153 |   // select a new path in every iteration. Otherwise, cars might get "stucjk"
154 |   // on a full network if many cars are waiting for the one in front of them in
155 |   // a cycle.
156 |   // TODO: Check if we can keep the path at least partially somehow.
157 |   path_length_ = 0;
158 | }
159 | 
160 | __device__ void Car::step_accelerate() {
161 |   // Speed up the car by 1 or 2 units.
162 |   uint32_t speedup = rand32(2) + 1;
163 |   velocity_ = max_velocity_ < velocity_ + speedup
164 |       ? static_cast<uint32_t>(max_velocity_) : velocity_ + speedup;
165 | }
166 | 
167 | __device__ void Car::step_extend_path() {
168 |   Cell* cell = position_;
169 |   Cell* next_cell;
170 | 
171 |   for (uint32_t i = 0; i < velocity_; ++i) {
172 |     if (cell->is_sink()) {
173 |       break;
174 |     }
175 | 
176 |     next_cell = next_step(cell);
177 |     assert(next_cell != cell);
178 | 
179 |     cell = next_cell;
180 |     path_[i] = cell;
181 |     path_length_ = path_length_ + 1;
182 |   }
183 | 
184 |   velocity_ = path_length_;
185 | }
186 | 
187 | __device__ void Car::step_constraint_velocity() {
188 |   // This is actually only needed for the very first iteration, because a car
189 |   // may be positioned on a traffic light cell.
190 |   if (velocity_ > position()->max_velocity()) {
191 |     velocity_ = position()->max_velocity();
192 |   }
193 | 
194 |   uint32_t path_index = 0;
195 |   int distance = 1;
196 | 
197 |   while (distance <= velocity_) {
198 |     // Invariant: Movement of up to `distance - 1` many cells at `velocity_`
199 |     //            is allowed.
200 |     // Now check if next cell can be entered.
201 |     Cell* next_cell = path_[path_index];
202 | 
203 |     // Avoid collision.
204 |     if (!next_cell->is_free()) {
205 |       // Cannot enter cell.
206 |       --distance;
207 |       velocity_ = distance;
208 |       break;
209 |     } // else: Can enter next cell.
210 | 
211 |     if (velocity_ > next_cell->max_velocity()) {
212 |       // Car is too fast for this cell.
213 |       if (next_cell->max_velocity() > distance - 1) {
214 |         // Even if we slow down, we would still make progress.
215 |         velocity_ = next_cell->max_velocity();
216 |       } else {
217 |         // Do not enter the next cell.
218 |         --distance;
219 |         assert(distance >= 0);
220 | 
221 |         velocity_ = distance;
222 |         break;
223 |       }
224 |     }
225 | 
226 |     ++distance;
227 |     ++path_index;
228 |   }
229 | 
230 |   --distance;
231 | 
232 | 
233 |   for (uint32_t i = 0; i < velocity_; ++i) {
234 |     assert(path_[i]->is_free());
235 |     assert(i == 0 || path_[i - 1] != path_[i]);
236 |   }
237 |   // TODO: Check why the cast is necessary.
238 |   assert(distance <= (int) velocity());
239 | }
240 | 
241 | __device__ void Car::step_move() {
242 |   if (is_active()) {
243 |     Cell* cell = position_;
244 |     for (uint32_t i = 0; i < velocity_; ++i) {
245 |       assert(path_[i] != cell);
246 | 
247 |       cell = path_[i];
248 |       assert(cell->is_free());
249 | 
250 |       position()->release();
251 |       cell->occupy(this);
252 |       position_ = cell;
253 |     }
254 | 
255 |     if (position()->is_sink()) {
256 |       // Remove car from the simulation. Will be added again in the next
257 |       // iteration.
258 |       position()->release();
259 |       path_length_ = 0;
260 |       is_active_ = false;
261 |     }
262 |   }
263 | }
264 | 


--------------------------------------------------------------------------------
/example/traffic/entities/cell.inc:
--------------------------------------------------------------------------------
  1 | class Cell : public SoaLayout<Cell, kNumCells, kAddressModeZero,
  2 |                               StaticStorageWithArena<kCellArenaSize>,
  3 |                               kCellLayoutMode> {
  4 |  public:
  5 |   IKRA_INITIALIZE_CLASS
  6 | 
  7 |   enum Type {
  8 |     // Sorted from smallest to largest.
  9 |     kService,
 10 |     kResidential,
 11 |     kUnclassified,
 12 |     kTertiary,
 13 |     kSecondary,
 14 |     kPrimary,
 15 |     kTrunk,
 16 |     kMotorway,
 17 | 
 18 |     kMaxType
 19 |   };
 20 | 
 21 |   // Default constructor, where incoming/outgoing cell lists are passed as
 22 |   // Cell pointers.
 23 |   __host__ __device__ Cell(uint32_t max_velocity, double x, double y,
 24 |                            uint32_t num_incoming, Cell** incoming,
 25 |                            uint32_t num_outgoing, Cell** outgoing,
 26 |                            Car* car, bool is_free, bool is_sink,
 27 |                            uint32_t controller_max_velocity,
 28 |                            Type type = kResidential)
 29 |       : max_velocity_(max_velocity), x_(x), y_(y), type_(type),
 30 |         num_incoming_cells_(num_incoming), num_outgoing_cells_(num_outgoing),
 31 | #ifdef ARRAY_CELL_OUT_IS_PARTIAL
 32 |         outgoing_cells_(num_outgoing),
 33 | #endif  // ARRAY_CELL_OUT_IS_PARTIAL
 34 | #ifdef ARRAY_CELL_IN_IS_PARTIAL
 35 |         incoming_cells_(num_incoming),
 36 | #endif  // ARRAY_CELL_IN_IS_PARTIAL
 37 |         car_(car), is_free_(is_free), is_sink_(is_sink),
 38 |         controller_max_velocity_(controller_max_velocity) {
 39 |     for (uint32_t i = 0; i < num_incoming; ++i) {
 40 |       incoming_cells_[i] = incoming[i];
 41 |     }
 42 | 
 43 |     for (uint32_t i = 0; i < num_outgoing; ++i) {
 44 |       outgoing_cells_[i] = outgoing[i];
 45 |     }
 46 |   }
 47 | 
 48 |   // Overload: Provide cell indices instead of pointers.
 49 |   __host__ __device__ Cell(uint32_t max_velocity, double x, double y,
 50 |                            uint32_t num_incoming, unsigned int* incoming,
 51 |                            uint32_t num_outgoing, unsigned int* outgoing,
 52 |                            Car* car, bool is_free, bool is_sink,
 53 |                            uint32_t controller_max_velocity,
 54 |                            Type type = kResidential)
 55 |       : max_velocity_(max_velocity), x_(x), y_(y), type_(type),
 56 |         num_incoming_cells_(num_incoming), num_outgoing_cells_(num_outgoing),
 57 | #ifdef ARRAY_CELL_OUT_IS_PARTIAL
 58 |         outgoing_cells_(num_outgoing),
 59 | #endif  // ARRAY_CELL_OUT_IS_PARTIAL
 60 | #ifdef ARRAY_CELL_IN_IS_PARTIAL
 61 |         incoming_cells_(num_incoming),
 62 | #endif  // ARRAY_CELL_IN_IS_PARTIAL
 63 |         car_(car), is_free_(is_free), is_sink_(is_sink) {
 64 |     for (uint32_t i = 0; i < num_incoming; ++i) {
 65 |       incoming_cells_[i] = Cell::get_uninitialized(incoming[i]);
 66 |     }
 67 | 
 68 |     for (uint32_t i = 0; i < num_outgoing; ++i) {
 69 |       outgoing_cells_[i] = Cell::get_uninitialized(outgoing[i]);
 70 |     }
 71 | 
 72 |     controller_max_velocity_ = max_velocity_;
 73 |   }
 74 | 
 75 |   // Return the maximum velocity that is allowed on this street in general.
 76 |   // [ OFFSET = 0 ]
 77 |   uint32_t_ max_velocity_;
 78 | 
 79 |   // Return max. velocity allowed with respect to traffic controllers.
 80 |   // [ OFFSET = 4 ]
 81 |   uint32_t_ controller_max_velocity_;
 82 | 
 83 |   // Number of incoming cells.
 84 |   // [ OFFSET = 8 ]
 85 |   uint32_t_ num_incoming_cells_;
 86 | 
 87 |   // Number of outgoing cells.
 88 |   // [ OFFSET = 12 ]
 89 |   uint32_t_ num_outgoing_cells_;
 90 | 
 91 |   // x and y coordinates, only for rendering and debugging purposes.
 92 |   // [ OFFSET = 16 ]
 93 |   double_ x_;
 94 |   double_ y_;
 95 | 
 96 |   // Incoming cells.
 97 |   // [ OFFSET = 32 ]
 98 |   ARRAY_CELL_INCOMING incoming_cells_;
 99 | 
100 |   // Outgoing cells.
101 |   // [ OFFSET = 32 + k*PTR_SIZE ]
102 |   ARRAY_CELL_OUTGOING outgoing_cells_;
103 | 
104 |   // The car that is currently occupying this cell (if any).
105 |   // [ OFFSET = 32 + k*PTR_SIZE ]
106 |   field_(Car*) car_;
107 | 
108 |   // The type of this cell according to OSM data. Size of enum is 4 bytes.
109 |   // [ OFFSET = 32 + k*PTR_SIZE ]
110 |   field_(Type) type_;
111 | 
112 |   // A cell is free if is does not contain a car.
113 |   // [ OFFSET = 36 + k*PTR_SIZE ]
114 |   bool_ is_free_;
115 | 
116 |   // A cell is usually a sink if does not have any outgoing edges.
117 |   // [ OFFSET = 37 + k*PTR_SIZE ]
118 |   bool_ is_sink_;
119 | 
120 |   // Add dummy field to make sure that objects are properly aligned in AOS.
121 |   // [ OFFSET = 38 + k*PTR_SIZE ]. Need 2 bytes padding.
122 |   array_(char, kCellAlignmentPadding, object) dummy_alignment_;
123 | 
124 |   // Returns the maximum velocity allowed on this cell at this moment. This
125 |   // function takes into account velocity limitations due to traffic lights.
126 |   __device__ uint32_t max_velocity() const {
127 |     return controller_max_velocity_ < max_velocity_
128 |         ? controller_max_velocity_
129 |         : max_velocity_;
130 |   }
131 | 
132 |   // The maximum velocity allowed on this cell regardless of
133 |   // traffic controllers.
134 |   __device__ uint32_t street_max_velocity() const {
135 |     return max_velocity_;
136 |   }
137 | 
138 |   // Sets the maximum temporary speed limit (traffic controller).
139 |   __device__ void set_controller_max_velocity(uint32_t velocity) {
140 |     controller_max_velocity_ = velocity;
141 |   }
142 | 
143 |   // Removes the maximum temporary speed limit.
144 |   __device__ void remove_controller_max_velocity() {
145 |     controller_max_velocity_ = max_velocity_;
146 |   }
147 | 
148 |   __device__ uint32_t num_incoming_cells() const {
149 |     return num_incoming_cells_;
150 |   }
151 |   __device__ Cell* incoming_cell(uint32_t index) const {
152 |     return incoming_cells_[index];
153 |   }
154 | 
155 |   __device__ uint32_t num_outgoing_cells() const {
156 |     return num_outgoing_cells_;
157 |   }
158 |   __device__ Cell* outgoing_cell(uint32_t index) const {
159 |     return outgoing_cells_[index];
160 |   }
161 | 
162 |   // A car enters this cell.
163 |   __device__ void occupy(Car* car) {
164 |     assert((bool) is_free_);
165 |     assert(car_ == nullptr);
166 |     
167 |     car_ = car;
168 |     is_free_ = false;
169 |   }
170 | 
171 |   // A car leaves this cell.
172 |   __device__ void release() {
173 |     car_ = nullptr;
174 |     is_free_ = true;
175 |   }
176 | 
177 |   __device__ double x() const { return x_; }
178 | 
179 |   __device__ double y() const { return y_; }
180 | 
181 |   __device__ Type type() const { return type_; }
182 | 
183 |   __device__ bool is_free() const { return is_free_; }
184 | 
185 |   __device__ bool is_sink() const { return is_sink_; }
186 | };
187 | 
188 | IKRA_DEVICE_STORAGE(Cell);
189 | 


--------------------------------------------------------------------------------
/example/traffic/entities/priority_yield_traffic_controller.inc:
--------------------------------------------------------------------------------
 1 | class PriorityYieldTrafficController : public SoaLayout<
 2 |     PriorityYieldTrafficController, kNumPriorityCtrls, kAddressModeZero,
 3 |     StaticStorageWithArena<kPriorityCtrlArenaSize>,
 4 |     kPriorityCtrlLayoutMode> {
 5 |  public:
 6 |   IKRA_INITIALIZE_CLASS
 7 | 
 8 |   __device__ PriorityYieldTrafficController(unsigned int* signal_groups,
 9 |                                             uint32_t num_signal_groups) :
10 | #ifdef ARRAY_PRIORITY_CTRL_IS_PARTIAL 
11 |         signal_groups_(num_signal_groups),
12 | #endif  // ARRAY_PRIORITY_CTRL_IS_PARTIAL
13 |         num_signal_groups_(num_signal_groups) {
14 |     for (uint32_t i = 0; i < num_signal_groups; ++i) {
15 |       signal_groups_[i] =
16 |           SharedSignalGroup::get_uninitialized(signal_groups[i]);
17 |     }
18 |   }
19 | 
20 |   __device__ void initialize() {
21 |     for (uint32_t i = 0; i < num_signal_groups_; ++i) {
22 |       signal_groups_[i]->signal_stop();
23 |     }
24 |   }
25 | 
26 |   __device__ void step() {
27 |     bool found_traffic = false;
28 |     // Cells are sorted by priority.
29 |     for (uint32_t i = 0; i < num_signal_groups_; ++i) {
30 |       SharedSignalGroup* next_group = signal_groups_[i];
31 |       bool has_incoming = has_incoming_traffic(next_group);
32 |       uint32_t num_cells = next_group->num_cells();
33 | 
34 |       if (!found_traffic && has_incoming) {
35 |         found_traffic = true;
36 |         // Allow traffic to flow.
37 |         for (uint32_t j = 0; j < num_cells; ++j) {
38 |           next_group->cell(j)->remove_controller_max_velocity();
39 |         }
40 |       } else if (has_incoming) {
41 |         // Traffic with higher priority is incoming.
42 |         for (uint32_t j = 0; j < num_cells; ++j) {
43 |           next_group->cell(j)->set_controller_max_velocity(0);
44 |         }
45 |       }
46 |     }
47 |   }
48 | 
49 |   // Signal groups controlled by this controller.
50 |   // [ PADDING = 0]
51 |   ARRAY_PRIORITY_CTRL_SIGNAL_GROUPS signal_groups_;
52 | 
53 |   // Number of signal groups.
54 |   // [ PADDING = k*PTR_SIZE ]
55 |   uint32_t_ num_signal_groups_;
56 | 
57 |   // Add dummy field to make sure that objects are properly aligned in AOS.
58 |   // [ OFFSET = 4 + k*PTR_SIZE ]
59 |   array_(char, kPriorityCtrlAlignmentPadding, object) dummy_alignment_;
60 | 
61 |   __device__ bool has_incoming_traffic(SharedSignalGroup* group) const {
62 |     const uint32_t num_cells = group->num_cells();
63 |     for (uint32_t i = 0; i < num_cells; ++i) {
64 |       Cell* next_cell = group->cell(i);
65 | 
66 |       // Report incoming traffic if at least one cells in the group reports
67 |       // incoming traffic.
68 |       if (has_incoming_traffic(next_cell, next_cell->street_max_velocity())) {
69 |         return true;
70 |       }
71 |     }
72 |     return false;
73 |   }
74 | 
75 |   __device__ bool has_incoming_traffic(Cell* cell, uint32_t lookahead) const {
76 |     if (lookahead == 0) {
77 |       return !cell->is_free();
78 |     }
79 | 
80 |     // Check incoming cells. This is BFS.
81 |     const uint32_t num_incoming = cell->num_incoming_cells();
82 |     for (uint32_t i = 0; i < num_incoming; ++i) {
83 |       Cell* next_cell = cell->incoming_cell(i);
84 |       if (has_incoming_traffic(next_cell, lookahead - 1)) {
85 |         return true;
86 |       }
87 |     }
88 | 
89 |     return !cell->is_free();
90 |   }
91 | };
92 | 
93 | IKRA_DEVICE_STORAGE(PriorityYieldTrafficController)
94 | 


--------------------------------------------------------------------------------
/example/traffic/entities/shared_signal_group.inc:
--------------------------------------------------------------------------------
 1 | class SharedSignalGroup : public SoaLayout<
 2 |     SharedSignalGroup, kNumSharedSignalGroups, kAddressModeZero,
 3 |     StaticStorageWithArena<kSharedSignalGroupArenaSize>,
 4 |     kSharedSignalGroupLayoutMode> {
 5 |  public:
 6 |   IKRA_INITIALIZE_CLASS
 7 |   
 8 |   __device__ SharedSignalGroup(uint32_t num_cells, unsigned int* cells) :
 9 | #ifdef ARRAY_SIGNAL_GROUP_IS_PARTIAL 
10 |       cells_(num_cells),
11 | #endif  // ARRAY_SIGNAL_GROUP_IS_PARTIAL
12 |       num_cells_(num_cells) {
13 |     for (uint32_t i = 0; i < num_cells; ++i) {
14 |       cells_[i] = Cell::get_uninitialized(cells[i]);
15 |     }
16 |   }
17 | 
18 |   __device__ void signal_go() {
19 |     for (uint32_t i = 0; i < num_cells_; ++i) {
20 |       cells_[i]->remove_controller_max_velocity();
21 |     }
22 |   }
23 | 
24 |   __device__ void signal_stop() {
25 |     for (uint32_t i = 0; i < num_cells_; ++i) {
26 |       cells_[i]->set_controller_max_velocity(0);
27 |     }
28 |   }
29 | 
30 |   // Cells controlled by this group.
31 |   // [ PADDING = 0 ]
32 |   ARRAY_SIGNAL_GROUP_CELLS cells_;
33 | 
34 |   // Number of cells.
35 |   // [ PADDING = k*PTR_SIZE ]
36 |   uint32_t_ num_cells_;
37 | 
38 |   // Add dummy field to make sure that objects are properly aligned in AOS.
39 |   // [ OFFSET = 4 + k*PTR_SIZE ]
40 |   array_(char, kSharedSignalGroupAlignmentPadding, object) dummy_alignment_;
41 | 
42 |   __device__ uint32_t num_cells() const { return num_cells_; }
43 |   __device__ Cell* cell(uint32_t index) { return cells_[index]; }
44 | };
45 | 
46 | IKRA_DEVICE_STORAGE(SharedSignalGroup);
47 | 


--------------------------------------------------------------------------------
/example/traffic/entities/simulation.inc:
--------------------------------------------------------------------------------
 1 | class Simulation : public SoaLayout<Simulation, 1> {
 2 |  public:
 3 |   IKRA_INITIALIZE_CLASS
 4 | 
 5 |   __device__ Simulation() : random_state_(123) {}
 6 | 
 7 |   uint32_t_ random_state_;
 8 | 
 9 |   __device__ uint32_t rand32() {
10 |     // Advance and return random state.
11 |     // Source: https://en.wikipedia.org/wiki/Lehmer_random_number_generator
12 |     random_state_ = static_cast<uint32_t>(
13 |         static_cast<uint64_t>(random_state()) * 279470273u) % 0xfffffffb;
14 |     return random_state_;
15 |   }
16 | 
17 |   __device__ uint32_t rand32(uint32_t max_value) {
18 |     return rand32() % max_value;
19 |   }
20 | 
21 |   __device__ uint32_t random_state() const {
22 |     return random_state_;
23 |   }
24 | 
25 |   __device__ void step_random_state() {
26 |     rand32();
27 |   }
28 | 
29 |   __device__ Cell* random_free_cell(Car* car) {
30 |     uint64_t state = random_state();
31 |     uint64_t num_cars = Car::size();
32 |     uint64_t num_cells = Cell::size();
33 |     uint32_t max_tries = num_cells / num_cars;
34 | 
35 |     for (uint32_t i = 0; i < max_tries; ++i) {
36 |       uint64_t cell_id = (num_cells * (car->id() + state) / num_cars + i)
37 |                          % num_cells;
38 |       Cell* next_cell = Cell::get(cell_id);
39 |       if (next_cell->is_free()) {
40 |         return next_cell;
41 |       }
42 |     }
43 | 
44 |     // Could not find free cell. Try again in next iteration.
45 |     return nullptr;
46 |   }
47 | };
48 | 
49 | IKRA_DEVICE_STORAGE(Simulation);
50 | 
51 | 
52 | // Defined here because Simulation must be defined.
53 | __device__ void Car::step_reactivate() {
54 |   if (!is_active()) {
55 |     Cell* free_cell = Simulation::get(0)->random_free_cell(this);
56 | 
57 |     if (free_cell != nullptr) {
58 |       assert(free_cell->is_free());
59 |       position_ = free_cell;
60 |       is_active_ = true;
61 |       path_length_ = 0;
62 |       velocity_ = 0;
63 |       free_cell->occupy(this);
64 |     }
65 |   }
66 | }
67 | 


--------------------------------------------------------------------------------
/example/traffic/entities/traffic_light.inc:
--------------------------------------------------------------------------------
  1 | 
  2 | class TrafficLight : public SoaLayout<
  3 |     TrafficLight, kNumTrafficLights, kAddressModeZero, 
  4 |     StaticStorageWithArena<kTrafficLightArenaSize>,
  5 |     kTrafficLightLayoutMode> {
  6 |  public:
  7 |    IKRA_INITIALIZE_CLASS
  8 | 
  9 |   __device__ TrafficLight(uint32_t timer, uint32_t phase_time, uint32_t phase,
 10 |                           unsigned int* signal_groups,
 11 |                           uint32_t num_signal_groups)
 12 |       : timer_(timer), phase_time_(phase_time), phase_(phase),
 13 | #ifdef ARRAY_TRAFFIC_LIGHT_IS_PARTIAL
 14 |         signal_groups_(num_signal_groups),
 15 | #endif  // ARRAY_TRAFFIC_LIGHT_IS_PARTIAL
 16 |         num_signal_groups_(num_signal_groups) {
 17 |     for (uint32_t i = 0; i < num_signal_groups; ++i) {
 18 |       signal_groups_[i] =
 19 |           SharedSignalGroup::get_uninitialized(signal_groups[i]);
 20 |     }
 21 |   }
 22 | 
 23 |   __device__ void initialize() {
 24 |     for (uint32_t i = 0; i < num_signal_groups_; ++i) {
 25 |       signal_groups_[i]->signal_stop();
 26 |     }
 27 |   }
 28 | 
 29 |   __device__ bool has_incoming_traffic(SharedSignalGroup* group) const {
 30 |     const uint32_t num_cells = group->num_cells();
 31 |     for (uint32_t i = 0; i < num_cells; ++i) {
 32 |       Cell* next_cell = group->cell(i);
 33 | 
 34 |       // Report incoming traffic if at least one cells in the group reports
 35 |       // incoming traffic.
 36 |       if (has_incoming_traffic(next_cell, next_cell->street_max_velocity())) {
 37 |         return true;
 38 |       }
 39 |     }
 40 |     return false;
 41 |   }
 42 | 
 43 |   __device__ bool has_incoming_traffic(Cell* cell, uint32_t lookahead) const {
 44 |     if (lookahead == 0) {
 45 |       return !cell->is_free();
 46 |     }
 47 | 
 48 |     // Check incoming cells. This is BFS.
 49 |     const uint32_t num_incoming = cell->num_incoming_cells();
 50 |     for (uint32_t i = 0; i < num_incoming; ++i) {
 51 |       Cell* next_cell = cell->incoming_cell(i);
 52 |       if (has_incoming_traffic(next_cell, lookahead - 1)) {
 53 |         return true;
 54 |       }
 55 |     }
 56 | 
 57 |     return !cell->is_free();
 58 |   }
 59 | 
 60 |   __device__ void step() {
 61 | #ifdef SMART_TRAFFIC_LIGHT
 62 |   // This is a smart traffic light. If there are cars coming from only one
 63 |   // street, give them a green light.
 64 |   uint32_t num_found_traffic = 0;
 65 |   uint32_t index_last_found = 0;
 66 |   for (uint32_t i = 0; i < num_signal_groups_; ++i) {
 67 |     SharedSignalGroup* next_group = signal_groups_[i];
 68 |     bool has_incoming = has_incoming_traffic(next_group);
 69 | 
 70 |     if (has_incoming) {
 71 |       num_found_traffic++;
 72 |       index_last_found = i;
 73 |     }
 74 | 
 75 |     if (num_found_traffic > 1) break;
 76 |   }
 77 | 
 78 |   if (num_found_traffic == 1) {
 79 |     // Set this one to green.
 80 |     for (uint32_t i = 0; i < num_signal_groups_; ++i) {
 81 |       if (i == index_last_found) {
 82 |         signal_groups_[i]->signal_go();
 83 |       } else {
 84 |         signal_groups_[i]->signal_stop();
 85 |       }
 86 |     }
 87 |     timer_ = 0;
 88 |     phase_ = index_last_found;
 89 |   } else {
 90 |     timer_ = (timer_ + 1) % phase_time_;
 91 | 
 92 |     if (timer_ == 0) {
 93 |       signal_groups_[phase_]->signal_stop();
 94 |       phase_ = (phase_ + 1) % num_signal_groups_;
 95 |       signal_groups_[phase_]->signal_go();
 96 |     }
 97 |   }
 98 | #else
 99 |     timer_ = (timer_ + 1) % phase_time_;
100 | 
101 |     if (timer_ == 0) {
102 |       signal_groups_[phase_]->signal_stop();
103 |       phase_ = (phase_ + 1) % num_signal_groups_;
104 |       signal_groups_[phase_]->signal_go();
105 |     }
106 | #endif  // SMART_TRAFFIC_LIGHT
107 |   }
108 | 
109 |   // This timer is increased with every step.
110 |   // [ PADDING = 0 ]
111 |   uint32_t_ timer_;
112 | 
113 |   // The length of one phase, i.e., number of iterations until the signal
114 |   // changes.
115 |   // [ PADDING = 4 ]
116 |   uint32_t_ phase_time_;
117 | 
118 |   // An index into signal_groups_, marking the group that currently has a
119 |   // green signal.
120 |   // [ PADDING = 8 ]
121 |   uint32_t_ phase_;
122 | 
123 |   // Number of signal groups.
124 |   // [ PADDING = 12 ]
125 |   uint32_t_ num_signal_groups_;
126 |   __device__ uint32_t num_signal_groups() const { return num_signal_groups_; }
127 | 
128 |   // Signal groups controlled by this traffic lights.
129 |   // [ PADDING = 16 ]
130 |   ARRAY_TRAFFIC_LIGHT_SIGNAL_GROUPS signal_groups_;
131 | 
132 |   // No additional padding required for AOS.
133 | };
134 | 
135 | IKRA_DEVICE_STORAGE(TrafficLight);
136 | 


--------------------------------------------------------------------------------
/example/traffic/run_bench_traffic.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # MODES: object == -1
 3 | #        fully inlined == -2
 4 | 
 5 | # Change directory to root.
 6 | cd "$(dirname "$0")"
 7 | cd ../..
 8 | 
 9 | function build_and_run() {
10 |   cmake -DCMAKE_BUILD_TYPE=Release \
11 |         -DBENCHMARK_MODE=1 \
12 |         -DBENCH_CELL_IN_MODE=$1 \
13 |         -DBENCH_CELL_OUT_MODE=$2 \
14 |         -DBENCH_CAR_MODE=$3 \
15 |         -DBENCH_SIGNAL_GROUP_MODE=$4 \
16 |         -DBENCH_TRAFFIC_LIGHT_MODE=$5 \
17 |         -DBENCH_LAYOUT_MODE=$6 \
18 |         CMakeLists.txt
19 |   make
20 | 
21 |   output=$((
22 |   (
23 |     bin/traffic ~/Downloads/graphml_urbanized/Denver--Aurora__CO_Urbanized_Area_23527.graphml
24 |   ) 1>/dev/null
25 |   ) 2>&1)
26 | 
27 |   echo ${output} >> result_bench.txt
28 | }
29 | 
30 | for v_layout_mode in "kLayoutModeSoa" "kLayoutModeAos"
31 | do
32 |   # cell_in: [0; 8]
33 |   for i in $(seq -2 8)
34 |   do
35 |     build_and_run ${i} -2 -2 -2 -2 ${v_layout_mode}
36 |   done
37 | 
38 |   # cell_out: [0; 8]
39 |   for i in $(seq -2 8)
40 |   do
41 |     build_and_run -2 ${i} -2 -2 -2 ${v_layout_mode}
42 |   done
43 | 
44 |   # car: [0; 30]
45 |   for i in $(seq -2 30)
46 |   do
47 |     build_and_run -2 -2 ${i} -2 -2 ${v_layout_mode}
48 |   done
49 | 
50 |   # signal group: [0; 8]
51 |   for i in $(seq -2 8)
52 |   do
53 |     build_and_run -2 -2 -2 ${i} -2 ${v_layout_mode}
54 |   done
55 | 
56 |   # traffic light [0; 6]
57 |   for i in $(seq -2 6)
58 |   do
59 |     build_and_run -2 -2 -2 -2 ${i} ${v_layout_mode}
60 |   done
61 | 
62 |   build_and_run -1 -1 -1 -1 -1 ${v_layout_mode}
63 |   build_and_run -1 -1 -1 -1 -1 ${v_layout_mode}
64 |   build_and_run 3 3 5 4 4 ${v_layout_mode}
65 | done
66 | 
67 | 


--------------------------------------------------------------------------------
/example/traffic/simulation_adapter.cu:
--------------------------------------------------------------------------------
 1 | #include "simulation_adapter.h"
 2 | 
 3 | namespace simulation {
 4 | namespace standard {
 5 | class Simulation;
 6 | Simulation* instance;
 7 | }  // namespace standard
 8 | 
 9 | namespace aos_int {
10 | class Simulation;
11 | Simulation* instance;
12 | }  // namespace aos_int
13 | 
14 | namespace aos_int_cuda {
15 | void initialize();
16 | void step();
17 | }  // namespace aos_int_cuda
18 | }  // namespace simulation
19 | 
20 | #include "traffic-simulation/option_standard.inc"
21 | #include "traffic-simulation/graphml_simulation.inc"
22 | 
23 | using namespace std;
24 | 
25 | #include "traffic-simulation/traffic_aos_int.h"
26 | 
27 | void load_simulation(int argc, char** argv, unsigned int num_cars) {
28 |   instance = build_simulation(argc, argv, num_cars);
29 |   simulation::aos_int::instance = new simulation::aos_int::Simulation(
30 |       simulation::standard::instance);
31 |   simulation::aos_int_cuda::initialize();
32 | }
33 | 
34 | 


--------------------------------------------------------------------------------
/example/traffic/simulation_adapter.h:
--------------------------------------------------------------------------------
1 | #ifndef EXAMPLE_TRAFFIC_SIMULATION_ADAPTER_H
2 | #define EXAMPLE_TRAFFIC_SIMULATION_ADAPTER_H
3 | 
4 | void load_simulation(int argc, char** argv, unsigned int num_cars);
5 | 
6 | #endif  // EXAMPLE_TRAFFIC_SIMULATION_ADAPTER_H
7 | 


--------------------------------------------------------------------------------
/example/traffic/simulation_converter.inc:
--------------------------------------------------------------------------------
  1 | using IndexType = unsigned int;
  2 | 
  3 | // Data storage.
  4 | namespace simulation {
  5 | namespace aos_int {
  6 | extern IndexType s_size_Cell;
  7 | extern IndexType s_size_outgoing_cells;
  8 | extern IndexType s_size_incoming_cells;
  9 | extern IndexType s_size_Car;
 10 | extern IndexType s_size_car_paths;
 11 | extern IndexType s_size_inactive_cars;
 12 | extern IndexType s_size_TrafficLight;
 13 | extern IndexType s_size_PriorityYieldTrafficController;
 14 | extern IndexType s_size_SharedSignalGroup;
 15 | extern IndexType s_size_traffic_light_signal_groups;
 16 | extern IndexType s_size_priority_ctrl_signal_groups;
 17 | extern IndexType s_size_signal_group_cells;
 18 | }  // namespace aos_int
 19 | 
 20 | namespace aos_int_cuda {
 21 | extern Cell* dev_Cell;
 22 | extern IndexType* dev_outgoing_cells;
 23 | extern IndexType* dev_incoming_cells;
 24 | extern Car* dev_Car;
 25 | extern IndexType* dev_car_paths;
 26 | extern IndexType* dev_inactive_cars;
 27 | extern TrafficLight* dev_TrafficLight;
 28 | extern PriorityYieldTrafficController* dev_PriorityYieldTrafficController;
 29 | extern SharedSignalGroup* dev_SharedSignalGroup;
 30 | extern IndexType* dev_traffic_light_signal_groups;
 31 | extern IndexType* dev_priority_ctrl_signal_groups;
 32 | extern IndexType* dev_signal_group_cells;
 33 | }  // namespace aos_int_cuda
 34 | }  // namespace simulation
 35 | 
 36 | __global__ void convert_to_ikra_cpp_cells(
 37 |     IndexType s_size_Cell,
 38 |     simulation::aos_int_cuda::Cell* s_Cell,
 39 |     IndexType s_size_outgoing_cells,
 40 |     IndexType* s_outgoing_cells,
 41 |     IndexType s_size_incoming_cells,
 42 |     IndexType* s_incoming_cells) {
 43 |   unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x;
 44 | 
 45 |   if (tid < s_size_Cell) {
 46 |     simulation::aos_int_cuda::Cell& cell = s_Cell[tid];
 47 |     Car* car_ptr = cell.car_ == 4294967295
 48 |       ? nullptr : Car::get_uninitialized(cell.car_);
 49 | 
 50 |     Cell* new_cell = new(Cell::get_uninitialized(tid)) Cell(
 51 |         cell.max_velocity_, cell.x_, cell.y_,
 52 |         cell.num_incoming_cells_,
 53 |         s_incoming_cells + cell.first_incoming_cell_idx_,
 54 |         cell.num_outgoing_cells_,
 55 |         s_outgoing_cells + cell.first_outgoing_cell_idx_,
 56 |         car_ptr, cell.is_free_, cell.is_sink_,
 57 |         cell.controller_max_velocity_,
 58 |         (Cell::Type) cell.type_);
 59 |     assert(new_cell->id() == tid);
 60 |   }
 61 | 
 62 |   if (tid == 0) {
 63 |     Cell::storage().increase_size(s_size_Cell);
 64 |   }
 65 | }
 66 | 
 67 | __global__ void convert_to_ikra_cpp_cars(
 68 |     IndexType s_size_Car,
 69 |     simulation::aos_int_cuda::Car* s_Car,
 70 |     IndexType s_size_Cell) {
 71 |   unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x;
 72 | 
 73 |   if (tid < s_size_Car) {
 74 |     simulation::aos_int_cuda::Car& car = s_Car[tid];
 75 | 
 76 |     // Every car must have a position.
 77 |     assert(car.position_ != 4294967295);
 78 |     assert(car.position_ < s_size_Cell);
 79 | 
 80 |     Cell* cell_ptr = car.position_ == 4294967295
 81 |       ? nullptr : Cell::get_uninitialized(car.position_);
 82 | 
 83 |     Car* new_car = new(Car::get_uninitialized(tid)) Car(
 84 |         car.is_active_, car.velocity_, car.max_velocity_,
 85 |         car.random_state_, cell_ptr);
 86 |     assert(new_car->id() == tid);
 87 |   }
 88 | 
 89 |   if (tid == 0) {
 90 |     Car::storage().increase_size(s_size_Car);
 91 |     new Simulation();
 92 |   }
 93 | }
 94 | 
 95 | __global__ void convert_to_ikra_cpp_signal_groups(
 96 |     IndexType s_size_SharedSignalGroup,
 97 |     simulation::aos_int_cuda::SharedSignalGroup* s_SharedSignalGroup,
 98 |     IndexType* s_cells) {
 99 |   unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x;
100 | 
101 |   if (tid < s_size_SharedSignalGroup) {
102 |     simulation::aos_int_cuda::SharedSignalGroup& group =
103 |         s_SharedSignalGroup[tid];
104 | 
105 |     SharedSignalGroup* new_group =
106 |         new(SharedSignalGroup::get_uninitialized(tid)) SharedSignalGroup(
107 |             group.num_cells_, s_cells + group.first_cell_idx_);
108 |     assert(new_group->id() == tid);
109 |   }
110 | 
111 |   if (tid == 0) {
112 |     SharedSignalGroup::storage().increase_size(s_size_SharedSignalGroup);
113 |   }
114 | }
115 | 
116 | __global__ void convert_to_ikra_cpp_traffic_lights(
117 |     IndexType s_size_TrafficLight,
118 |     simulation::aos_int_cuda::TrafficLight* s_TrafficLight,
119 |     IndexType* s_signal_groups) {
120 |   unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x;
121 | 
122 |   if (tid < s_size_TrafficLight) {
123 |     simulation::aos_int_cuda::TrafficLight& light = s_TrafficLight[tid];
124 | 
125 |     TrafficLight* new_light = new(TrafficLight::get_uninitialized(tid))
126 |         TrafficLight(light.timer_, light.phase_time_, light.phase_,
127 |                      s_signal_groups + light.first_signal_group_idx_,
128 |                      light.num_signal_groups_);
129 |     assert(new_light->id() == tid);
130 |   }
131 | 
132 |   if (tid == 0) {
133 |     TrafficLight::storage().increase_size(s_size_TrafficLight);
134 |   }
135 | 
136 | }
137 | 
138 | __global__ void convert_to_ikra_cpp_priority_ctrls(
139 |     IndexType s_size_PriorityYieldTrafficController,
140 |     simulation::aos_int_cuda::PriorityYieldTrafficController*
141 |         s_PriorityYieldTrafficController,
142 |     IndexType* s_signal_groups) {
143 |   unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x;
144 | 
145 |   if (tid < s_size_PriorityYieldTrafficController) {
146 |     simulation::aos_int_cuda::PriorityYieldTrafficController& ctrl =
147 |         s_PriorityYieldTrafficController[tid];
148 | 
149 |     PriorityYieldTrafficController* new_ctrl =
150 |         new(PriorityYieldTrafficController::get_uninitialized(tid))
151 |         PriorityYieldTrafficController(
152 |             s_signal_groups + ctrl.first_group_idx_,
153 |             ctrl.num_groups_);
154 |     assert(new_ctrl->id() == tid);
155 |   }
156 | 
157 |   if (tid == 0) {
158 |     PriorityYieldTrafficController::storage().increase_size(
159 |         s_size_PriorityYieldTrafficController);
160 |   }
161 | }
162 | 
163 | void convert_simulation() {
164 |   assert(simulation::aos_int::s_size_Car <= kNumCars);
165 |   assert(simulation::aos_int::s_size_Cell <= kNumCells);
166 |   assert(simulation::aos_int::s_size_SharedSignalGroup
167 |       <= kNumSharedSignalGroups);
168 |   assert(simulation::aos_int::s_size_TrafficLight <= kNumTrafficLights);
169 |   assert(simulation::aos_int::s_size_PriorityYieldTrafficController
170 |       <= kNumPriorityCtrls);
171 | 
172 |   convert_to_ikra_cpp_cells<<<kNumCells/1024 + 1, 1024>>>(
173 |       simulation::aos_int::s_size_Cell,
174 |       simulation::aos_int_cuda::dev_Cell,
175 |       simulation::aos_int::s_size_outgoing_cells,
176 |       simulation::aos_int_cuda::dev_outgoing_cells,
177 |       simulation::aos_int::s_size_incoming_cells,
178 |       simulation::aos_int_cuda::dev_incoming_cells);
179 |   gpuErrchk(cudaDeviceSynchronize());
180 | 
181 |   convert_to_ikra_cpp_cars<<<kNumCars/1024 + 1, 1024>>>(
182 |       simulation::aos_int::s_size_Car,
183 |       simulation::aos_int_cuda::dev_Car,
184 |       simulation::aos_int::s_size_Cell);
185 |   gpuErrchk(cudaDeviceSynchronize());
186 | 
187 |   convert_to_ikra_cpp_signal_groups<<<kNumSharedSignalGroups/1024 + 1, 1024>>>(
188 |       simulation::aos_int::s_size_SharedSignalGroup,
189 |       simulation::aos_int_cuda::dev_SharedSignalGroup,
190 |       simulation::aos_int_cuda::dev_signal_group_cells);
191 |   gpuErrchk(cudaDeviceSynchronize());
192 | 
193 |   convert_to_ikra_cpp_traffic_lights<<<kNumTrafficLights/1024 + 1, 1024>>>(
194 |       simulation::aos_int::s_size_TrafficLight,
195 |       simulation::aos_int_cuda::dev_TrafficLight,
196 |       simulation::aos_int_cuda::dev_traffic_light_signal_groups);
197 |   gpuErrchk(cudaDeviceSynchronize());
198 | 
199 |   convert_to_ikra_cpp_priority_ctrls<<<kNumPriorityCtrls/1024 + 1, 1024>>>(
200 |     simulation::aos_int::s_size_PriorityYieldTrafficController,
201 |     simulation::aos_int_cuda::dev_PriorityYieldTrafficController,
202 |     simulation::aos_int_cuda::dev_priority_ctrl_signal_groups);
203 |   gpuErrchk(cudaDeviceSynchronize());
204 | }
205 | 


--------------------------------------------------------------------------------
/example/traffic/statistics.inc:
--------------------------------------------------------------------------------
  1 | 
  2 | #define STRINGIFY(s) STRINGIFY2(s)
  3 | #define STRINGIFY2(val) #val
  4 | 
  5 | #define PRINT_HISTORAM(class, function, maxval) \
  6 | { \
  7 |   printf("------------------------------------------------\n"); \
  8 |   int counter[maxval];\
  9 |   uint64_t sum = 0; \
 10 |   uint64_t squared_sum = 0; \
 11 |   for (int i = 0; i < maxval; ++i) { \
 12 |     counter[i] = 0; \
 13 |   } \
 14 |   for (uint32_t i = 0; i < class::size(); ++i) { \
 15 |     counter[class::get(i)->function()]++; \
 16 |     sum += class::get(i)->function(); \
 17 |     squared_sum += class::get(i)->function() * class::get(i)->function(); \
 18 |   } \
 19 |   for (int i = 0; i < maxval; ++i) { \
 20 |     if (counter[i] > 0) { \
 21 |       printf(STRINGIFY(class) "::" STRINGIFY(function) "[%i] = %i  [ %f % ]\n", \
 22 |              i, counter[i], (float) counter[i] * 100.0f / class::size()); \
 23 |     } \
 24 |   } \
 25 |   double mean = sum / (double) class::size(); \
 26 |   double variance = (squared_sum - class::size() * mean * mean) \
 27 |       / (class::size() - 1.0f); \
 28 |   printf("Mean = %f,   Variance = %f,   stddev = %f\n", \
 29 |          mean, variance, sqrt(variance)); \
 30 | }
 31 | 
 32 | __global__ void print_histograms_1() {
 33 |   PRINT_HISTORAM(Car, velocity, 50);
 34 |   PRINT_HISTORAM(Car, max_velocity, 50);
 35 | }
 36 | 
 37 | __global__ void print_histograms_2() {
 38 |   PRINT_HISTORAM(SharedSignalGroup, num_cells, 50);
 39 |   PRINT_HISTORAM(TrafficLight, num_signal_groups, 50);
 40 | }
 41 | 
 42 | __global__ void print_histograms_3() {
 43 |   PRINT_HISTORAM(Cell, num_incoming_cells, 50);
 44 | }
 45 | 
 46 | __global__ void print_histograms_4() {
 47 |   PRINT_HISTORAM(Cell, num_outgoing_cells, 50);
 48 | }
 49 | 
 50 | __global__ void print_histograms_5() {
 51 |   PRINT_HISTORAM(Cell, street_max_velocity, 50);
 52 | }
 53 | 
 54 | __global__ void caluclate_checksum() {
 55 |   uint64_t c = 17;
 56 |   for (uint32_t i = 0; i < Car::size(); ++i) {
 57 |     Cell* position = Car::get(i)->position();
 58 |     c += position->x() + position->y();
 59 |     c %= UINT64_MAX;
 60 |   }
 61 |   int result = c % 1234567;
 62 | 
 63 |   printf("Checksum: %i\n", result);
 64 | }
 65 | 
 66 | void print_statistics(unsigned long time_cars, unsigned long time_controllers,
 67 |                       unsigned long time_total) {
 68 |   unsigned long utilization_Cell = Cell::storage().arena_utilization();
 69 |   unsigned long utilization_Car = Car::storage().arena_utilization();
 70 |   unsigned long utilization_SharedSignalGroup =
 71 |       SharedSignalGroup::storage().arena_utilization();
 72 |   unsigned long utilization_TrafficLight =
 73 |       TrafficLight::storage().arena_utilization();
 74 |   unsigned long utilization_PriorityCtrl = 
 75 |       PriorityYieldTrafficController::storage().arena_utilization();
 76 |   unsigned long total_utilization =
 77 |       utilization_Cell + utilization_Car + utilization_SharedSignalGroup
 78 |       + utilization_TrafficLight + utilization_PriorityCtrl;
 79 | 
 80 |   unsigned long long int storage_Cell = Cell::size()*Cell::ObjectSize::value;
 81 |   unsigned long long int storage_Car = Car::size()*Car::ObjectSize::value;
 82 |   unsigned long long int storage_SignalGroup =
 83 |       SharedSignalGroup::size()*SharedSignalGroup::ObjectSize::value;
 84 |   unsigned long long int storage_TrafficLight =
 85 |       TrafficLight::size()*TrafficLight::ObjectSize::value;
 86 |   unsigned long long int storage_PriorityCtrl =
 87 |      PriorityYieldTrafficController::size()
 88 |           *PriorityYieldTrafficController::ObjectSize::value;
 89 | 
 90 |   unsigned long long int regular_storage_size =
 91 |       storage_Cell + storage_Car + storage_SignalGroup
 92 |       + storage_TrafficLight + storage_PriorityCtrl;
 93 | 
 94 |   fprintf(stderr,
 95 |       "%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu,%llu,%llu,%llu,%llu,%llu,%llu,"
 96 |       STRINGIFY(BENCH_LAYOUT_MODE) ","
 97 |       STRINGIFY(BENCH_CELL_IN_MODE) ","
 98 |       STRINGIFY(BENCH_CELL_OUT_MODE) ","
 99 |       STRINGIFY(BENCH_CAR_MODE) ","
100 |       STRINGIFY(BENCH_SIGNAL_GROUP_MODE) ","
101 |       STRINGIFY(BENCH_TRAFFIC_LIGHT_MODE) "\n",
102 |       time_cars,
103 |       time_controllers,
104 |       time_total,
105 |       utilization_Cell,
106 |       utilization_Car,
107 |       utilization_SharedSignalGroup,
108 |       utilization_TrafficLight,
109 |       utilization_PriorityCtrl,
110 |       total_utilization,
111 |       storage_Cell,
112 |       storage_Car,
113 |       storage_SignalGroup,
114 |       storage_TrafficLight,
115 |       storage_PriorityCtrl,
116 |       regular_storage_size);
117 | }
118 | 


--------------------------------------------------------------------------------
/example/traffic/traffic.cu:
--------------------------------------------------------------------------------
  1 | #include <limits>
  2 | #include <iostream>
  3 | 
  4 | #include "executor/executor.h"
  5 | #include "soa/soa.h"
  6 | #include "executor/cuda_executor.h"
  7 | 
  8 | #include "benchmark.h"
  9 | #include "configuration.h"
 10 | #include "simulation_adapter.h"
 11 | #include "traffic-simulation/traffic_aos_int_cuda.h"
 12 | 
 13 | using ikra::soa::SoaLayout;
 14 | using ikra::soa::kAddressModeZero;
 15 | using ikra::soa::StaticStorageWithArena;
 16 | 
 17 | class Car;
 18 | 
 19 | // TODO: Figure out how extern __device__ variables work in CUDA, so that we
 20 | // can use normal header files instead of textual headers.
 21 | #include "entities/cell.inc"
 22 | #include "entities/car.inc"
 23 | #include "entities/shared_signal_group.inc"
 24 | #include "entities/traffic_light.inc"
 25 | #include "entities/priority_yield_traffic_controller.inc"
 26 | #include "entities/simulation.inc"
 27 | #include "simulation_converter.inc"
 28 | #include "statistics.inc"
 29 | 
 30 | 
 31 | void run_traffic_controllers() {
 32 |   cuda_execute(&Simulation::step_random_state);
 33 |   cuda_execute(&TrafficLight::step);
 34 |   cuda_execute(&PriorityYieldTrafficController::step);
 35 |   cudaDeviceSynchronize();
 36 | }
 37 | 
 38 | void run_cars() {
 39 |   cuda_execute(&Car::step_prepare_path);
 40 |   cudaDeviceSynchronize();
 41 | 
 42 |   cuda_execute(&Car::step_move);
 43 |   cudaDeviceSynchronize();
 44 | 
 45 |   cuda_execute(&Car::step_reactivate);
 46 |   cudaDeviceSynchronize();
 47 | }
 48 | 
 49 | void benchmark() {
 50 |   uint64_t time_controllers[kNumBenchmarkRuns] = {0};
 51 |   uint64_t time_cars[kNumBenchmarkRuns] = {0};
 52 |   uint64_t time_total[kNumBenchmarkRuns] = {0};
 53 | 
 54 |   for (uint32_t r = 0; r < kNumBenchmarkRuns; ++r) {
 55 |     Cell::initialize_storage();
 56 |     Car::initialize_storage();
 57 |     SharedSignalGroup::initialize_storage();
 58 |     TrafficLight::initialize_storage();
 59 |     PriorityYieldTrafficController::initialize_storage();
 60 | 
 61 |     convert_simulation();
 62 |     cuda_execute(&TrafficLight::initialize);
 63 |     cuda_execute(&PriorityYieldTrafficController::initialize);
 64 |     cudaDeviceSynchronize();
 65 | 
 66 |     for (uint32_t i = 0; i < kNumIterations; ++i) {
 67 |       uint64_t t_ctrl = measure<>::execution(run_traffic_controllers);
 68 |       time_controllers[r] += t_ctrl;
 69 | 
 70 |       uint64_t t_car = measure<>::execution(run_cars);
 71 |       time_cars[r] += t_car;
 72 | 
 73 |       time_total[r] = time_controllers[r] + time_cars[r];
 74 |     }
 75 |     gpuErrchk(cudaPeekAtLastError());
 76 |   }
 77 | 
 78 |   // Find best run.
 79 |   uint64_t best_time = std::numeric_limits<uint64_t>::max();
 80 |   uint32_t best_index = -1;
 81 |   for (uint32_t r = 0; r < kNumBenchmarkRuns; ++r) {
 82 |     if (time_total[r] < best_time) {
 83 |       best_time = time_total[r];
 84 |       best_index = r;
 85 |     }
 86 |   }
 87 | 
 88 |   // Print best run.
 89 |   print_statistics(time_cars[best_index]/1000,
 90 |                    time_controllers[best_index]/1000,
 91 |                    time_total[best_index]/1000);
 92 | }
 93 | 
 94 | int main(int argc, char** argv) {
 95 |   load_simulation(argc, argv, kNumCars);
 96 |   benchmark();
 97 | 
 98 |   print_histograms_1<<<1, 1>>>();
 99 |   gpuErrchk(cudaDeviceSynchronize());
100 |   print_histograms_2<<<1, 1>>>();
101 |   gpuErrchk(cudaDeviceSynchronize());
102 |   print_histograms_3<<<1, 1>>>();
103 |   gpuErrchk(cudaDeviceSynchronize());
104 |   print_histograms_4<<<1, 1>>>();
105 |   gpuErrchk(cudaDeviceSynchronize());
106 |   print_histograms_5<<<1, 1>>>();
107 |   gpuErrchk(cudaDeviceSynchronize());
108 |   caluclate_checksum<<<1,1>>>();
109 |   gpuErrchk(cudaDeviceSynchronize());
110 | }
111 | 


--------------------------------------------------------------------------------
/ikra/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_library(ikra INTERFACE)
2 | target_include_directories(ikra INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
3 | 
4 | 
5 | 


--------------------------------------------------------------------------------
/ikra/executor/executor.h:
--------------------------------------------------------------------------------
  1 | #ifndef EXECUTOR_EXECUTOR_H
  2 | #define EXECUTOR_EXECUTOR_H
  3 | 
  4 | // Asserts active only in debug mode (NDEBUG).
  5 | #include <cassert>
  6 | #include <cstdlib>
  7 | 
  8 | #include <cstdint>
  9 | #include <type_traits>
 10 | 
 11 | #include "executor/util.h"
 12 | #include "soa/constants.h"
 13 | 
 14 | namespace ikra {
 15 | namespace executor {
 16 | 
 17 | using ikra::soa::IndexType;
 18 | 
 19 | // A convenience class storing two iterators, i.e., a range of SOA objects.
 20 | template<typename T>
 21 | class IteratorRange {
 22 |  public:
 23 |   IteratorRange(T begin, T end) : begin_(begin), end_(end) {}
 24 | 
 25 |   T begin() {
 26 |     return begin_;
 27 |   }
 28 | 
 29 |   T end() {
 30 |     return end_;
 31 |   }
 32 | 
 33 |  private:
 34 |   T begin_;
 35 |   T end_;
 36 | };
 37 | 
 38 | template<typename T, typename... Args>
 39 | T* construct(size_t count, Args... args) {
 40 |   assert(count > 0);
 41 |   T* first = new T(args...);
 42 |   for (size_t i = 1; i < count; ++i) {
 43 |     new T(args...);
 44 |   }
 45 |   return first;
 46 | }
 47 | 
 48 | // This function executes a method on all SOA objects that are enumerated by an
 49 | // iterator (between begin and end). This is currently a fully sequential
 50 | // operation.
 51 | template<typename T, typename F, typename... Args>
 52 | void execute(T begin, T end, F function, Args... args) {
 53 |   for (auto iter = begin; iter != end; ++iter) {
 54 |     // TODO: Should we override and use operator->* here?
 55 |     ((**iter).*function)(args...);
 56 |   }
 57 | }
 58 | 
 59 | // This function executes a method on all SOA objects that are enumerated by an
 60 | // iterator (between begin and end). It also reduces all return values.
 61 | template<typename T, typename F, typename... Args, typename G, typename D>
 62 | D execute_and_reduce(T begin, T end, F function, G reduce_function,
 63 |                      D default_value, Args... args) {
 64 |   if (begin == end) {
 65 |     return default_value;
 66 |   }
 67 | 
 68 |   D reduced_value = ((**begin).*function)(args...);
 69 |   for (auto iter = ++begin; iter != end; ++iter) {
 70 |     // TODO: Should we override and use operator->* here?
 71 |     reduced_value = reduce_function(reduced_value,
 72 |                                     ((**iter).*function)(args...));
 73 |   }
 74 |   return reduced_value;
 75 | }
 76 | 
 77 | // This function executes a method on all SOA objects that are enumerated by an
 78 | // iterator (between begin and end). This is currently a fully sequential
 79 | // operation.
 80 | template<typename T, typename F, typename... Args>
 81 | void execute(IteratorRange<T> range, F function, Args... args) {
 82 |   for (auto iter = range.begin(); iter != range.end(); ++iter) {
 83 |     ((**iter).*function)(args...);
 84 |   }
 85 | }
 86 | 
 87 | // This function executes a method on all SOA objects that are enumerated by an
 88 | // iterator (length many elements). This is currently a fully sequential
 89 | // operation.
 90 | template<typename T, typename F, typename... Args>
 91 | void execute(T begin, IndexType length, F function, Args... args) {
 92 |   T iter = begin;
 93 |   for (uint32_t i = 0; i < length; ++i, ++iter) {
 94 |     ((**iter).*function)(args...);
 95 |   }
 96 | }
 97 | 
 98 | // This function executes a method on all SOA objects of a given class.
 99 | // This is currently a fully sequential operation.
100 | template<typename F, typename... Args,
101 |          class T = typename FunctionTypeHelper<F>::class_type>
102 | void execute(F function, Args... args) {
103 |   IndexType num_instances = T::size();
104 |   for (IndexType i = 0; i < num_instances; ++i) {
105 |     ((*T::get(i)).*function)(args...);
106 |   }
107 | }
108 | 
109 | // This function executes a method on all SOA objects of a given class,
110 | // starting from ID 0 up to a given ID (template parameter).
111 | // This is currently a fully sequential operation.
112 | template<IndexType MaxId, typename F, typename... Args,
113 |          class T = typename FunctionTypeHelper<F>::class_type>
114 | void execute(F function, Args... args) {
115 |   for (IndexType i = 0; i < MaxId; ++i) {
116 |     ((*T::get(i)).*function)(args...);
117 |   }
118 | }
119 | 
120 | // This function executes a method on all SOA objects of a given class and
121 | // reduces the return values.
122 | template<typename F, typename... Args, typename G, typename D,
123 |          class T = typename FunctionTypeHelper<F>::class_type>
124 | D execute_and_reduce(F function, G reduce_function, D default_value,
125 |                      Args... args) {
126 |   return execute_and_reduce(T::begin(), T::end(),
127 |                             function, reduce_function, default_value, args...);
128 | }
129 | 
130 | }  // namespace executor
131 | }  // namespace ikra
132 | 
133 | #endif  // EXECUTOR_EXECUTOR_H
134 | 


--------------------------------------------------------------------------------
/ikra/executor/iterator.h:
--------------------------------------------------------------------------------
  1 | #ifndef EXECUTOR_ITERATOR_H
  2 | #define EXECUTOR_ITERATOR_H
  3 | 
  4 | #include <type_traits>
  5 | #include "soa/constants.h"
  6 | 
  7 | namespace ikra {
  8 | namespace executor {
  9 | 
 10 | using ikra::soa::IndexType;
 11 | 
 12 | // This class implements an iterator for SOA objects, starting a given ID. It
 13 | // points to an object with ID i and forwards the pointer to i + 1. This class
 14 | // can handle both valid and zero addressing mode.
 15 | template<typename T>
 16 | class Iterator {
 17 |  private:
 18 |   using InnerT = typename std::remove_pointer<T>::type;
 19 | 
 20 |  public:
 21 |   // This iterator should be used on pointers to SOA objects.
 22 |   static_assert(std::is_pointer<T>::value, "Expected pointer type.");
 23 | 
 24 |   T& operator*() {
 25 |     return position_;
 26 |   }
 27 | 
 28 |   // This iterator can go in both directions.
 29 |   template<int A = InnerT::kAddressMode>
 30 |   typename std::enable_if<A == soa::kAddressModeZero, Iterator<T>>::type&
 31 |   operator+=(IndexType distance) {
 32 |     auto next = reinterpret_cast<uintptr_t>(position_) + distance;
 33 |     position_ = reinterpret_cast<T>(next);
 34 |     return *this;
 35 |   }
 36 | 
 37 |   template<int A = InnerT::kAddressMode>
 38 |   typename std::enable_if<A != soa::kAddressModeZero, Iterator<T>>::type&
 39 |   operator+=(IndexType distance) {
 40 |     auto next = reinterpret_cast<uintptr_t>(position_) + A*distance;
 41 |     position_ = reinterpret_cast<T>(next);
 42 |     return *this;
 43 |   }
 44 | 
 45 |   template<int A = InnerT::kAddressMode>
 46 |   typename std::enable_if<A == soa::kAddressModeZero, Iterator<T>>::type&
 47 |   operator-=(IndexType distance) {
 48 |     auto next = reinterpret_cast<uintptr_t>(position_) - distance;
 49 |     position_ = reinterpret_cast<T>(next);
 50 |     return *this;
 51 |   }
 52 | 
 53 |   template<int A = InnerT::kAddressMode>
 54 |   typename std::enable_if<A != soa::kAddressModeZero, Iterator<T>>::type&
 55 |   operator-=(IndexType distance) {
 56 |     auto next = reinterpret_cast<uintptr_t>(position_) - A*distance;
 57 |     position_ = reinterpret_cast<T>(next);
 58 |     return *this;
 59 |   }
 60 | 
 61 |   template<int A = InnerT::kAddressMode>
 62 |   typename std::enable_if<A == soa::kAddressModeZero, Iterator<T>>::type
 63 |   operator+(IndexType delta) const {
 64 |     Iterator<T> copy = *this;
 65 |     auto new_pos = reinterpret_cast<uintptr_t>(copy.position_) + delta;
 66 |     copy.position_ = reinterpret_cast<T>(new_pos);
 67 |     return copy;
 68 |   }
 69 | 
 70 |   template<int A = InnerT::kAddressMode>
 71 |   typename std::enable_if<A != soa::kAddressModeZero, Iterator<T>>::type
 72 |   operator+(IndexType delta) const {
 73 |     Iterator<T> copy = *this;
 74 |     auto new_pos = reinterpret_cast<uintptr_t>(copy.position_) + A*delta;
 75 |     copy.position_ = reinterpret_cast<T>(new_pos);
 76 |     return copy;
 77 |   }
 78 | 
 79 |   Iterator<T>& operator++() {
 80 |     return this->operator+=(1);
 81 |   }
 82 | 
 83 |   Iterator<T>& operator--() {
 84 |     return this->operator-=(1);
 85 |   }
 86 | 
 87 |   bool operator!=(Iterator<T> other) const {
 88 |     return position_ != other.position_;
 89 |   }
 90 | 
 91 |   bool operator==(Iterator<T> other) const {
 92 |     return position_ == other.position_;
 93 |   }
 94 | 
 95 |   Iterator(T position) : position_(position) {}
 96 | 
 97 |  private:
 98 |   T position_;
 99 | };
100 | 
101 | // Helper function/constructor for Iterator for automatic template parameter
102 | // deduction.
103 | template<typename T>
104 | Iterator<T> make_iterator(T position) {
105 |   return Iterator<T>(position);
106 | }
107 | 
108 | }  // namespace executor
109 | }  // namespace ikra
110 | 
111 | #endif  // EXECUTOR_ITERATOR_H


--------------------------------------------------------------------------------
/ikra/executor/util.h:
--------------------------------------------------------------------------------
 1 | #ifndef EXECUTOR_UTIL_H
 2 | #define EXECUTOR_UTIL_H
 3 | 
 4 | namespace ikra {
 5 | namespace executor {
 6 | 
 7 | template<class F>
 8 | struct FunctionTypeHelper;
 9 | 
10 | template<typename R, typename C, typename... Args>
11 | struct FunctionTypeHelper<R (C::*)(Args...)> {
12 |   using return_type = R;
13 |   using class_type = C;
14 | };
15 | 
16 | }  // namespace executor
17 | }  // namespace ikra
18 | 
19 | #endif  // EXECUTOR_UTIL_H
20 | 


--------------------------------------------------------------------------------
/ikra/soa/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_library(soa INTERFACE)
2 | target_include_directories(soa INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/..)
3 | 


--------------------------------------------------------------------------------
/ikra/soa/addressable_field_shared.inc:
--------------------------------------------------------------------------------
  1 |   // This file should be included for field types that have an address. E.g.,
  2 |   // AOS-style arrays have a single address, but SOA-style arrays do not.
  3 | 
  4 | public:
  5 | // TODO: Disable operator if T is a SOA class. SOA object pointer cannot be
  6 | // dereferenced.
  7 | __ikra_device__ T& operator*() const {
  8 |   return *data_ptr();
  9 | }
 10 | 
 11 | __ikra_device__ T* operator->() const {
 12 |   return data_ptr();
 13 | }
 14 | 
 15 | __ikra_device__ T* operator&() const {
 16 |   return data_ptr();
 17 | }
 18 | 
 19 | // Get the value of this field. This method is not usually needed and the
 20 | // preferred way to retrieve a value is through implicit conversion.
 21 | __ikra_device__ T& get() const {
 22 |   return *data_ptr();
 23 | }
 24 | 
 25 | #if defined(__CUDA_ARCH__) || !defined(__CUDACC__)
 26 | // Operator for implicit conversion to type T. Either running device code or
 27 | // not running in CUDA mode at all.
 28 | __ikra_device__ operator T&() const {
 29 |   return *data_ptr();
 30 | }
 31 | 
 32 | // Assignment operator.
 33 | __ikra_device__ Self& operator=(T value) {
 34 |   *data_ptr() = value;
 35 |   return *this;
 36 | }
 37 | #else
 38 | T* device_data_ptr() const {
 39 |   auto h_data_ptr = reinterpret_cast<uintptr_t>(data_ptr());
 40 |   auto h_storage_data = reinterpret_cast<uintptr_t>(&Owner::storage());
 41 |   auto data_offset = h_data_ptr - h_storage_data;
 42 |   auto d_storage_ptr = reinterpret_cast<uintptr_t>(
 43 |       Owner::storage().device_ptr());
 44 |   return reinterpret_cast<T*>(d_storage_ptr + data_offset);
 45 | }
 46 | 
 47 | void copy_from_device(T* target) const {
 48 |   cudaMemcpy(target, device_data_ptr(), sizeof(T), cudaMemcpyDeviceToHost);
 49 | }
 50 | 
 51 | T copy_from_device() const {
 52 |   T host_data;
 53 |   copy_from_device(&host_data);
 54 |   return host_data;
 55 | }
 56 | 
 57 | // Operator for implicit conversion to type T. Running in CUDA mode on the
 58 | // host. Data must be copied.
 59 | // TODO: This method is broken when compiling in CUDA mode but host execution
 60 | // is intended.
 61 | operator T() const {
 62 |   return copy_from_device();
 63 | }
 64 | 
 65 | // Assignment operator.
 66 | // TODO: Probably need to handle SOA pointer differently here.
 67 | Self& operator=(T value) {
 68 |   cudaMemcpy(device_data_ptr(), &value, sizeof(T), cudaMemcpyHostToDevice);
 69 |   return *this;
 70 | }
 71 | #endif  // __CUDA_ARCH__
 72 | 
 73 | protected:
 74 | __ikra_device__ T* data_ptr() const {
 75 |   return data_ptr_uninitialized();
 76 | }
 77 | 
 78 | // Calculate the address of this field based on the "this" pointer of this
 79 | // Field instance.
 80 | template<int A = AddressMode, int L = LayoutMode>
 81 | __ikra_device__ typename std::enable_if<A != kAddressModeZero &&
 82 |                                         L == kLayoutModeSoa, T*>::type
 83 | data_ptr_uninitialized() const {
 84 |   auto p_this = reinterpret_cast<uintptr_t>(this);
 85 |   auto p_base = reinterpret_cast<uintptr_t>(Owner::storage().data_ptr());
 86 |   auto p_result = (p_this - p_base - A)/A*sizeof(T) + p_base +
 87 |                    (Capacity+1)*Offset;
 88 |   return reinterpret_cast<T*>(p_result);
 89 | }
 90 | 
 91 | // Calculate the address of this field based on the "this" pointer of this
 92 | // Field instance.
 93 | template<int A = AddressMode, int S = StorageMode, int L = LayoutMode>
 94 | __ikra_device__ typename std::enable_if<A == kAddressModeZero &&
 95 |                                         S == kStorageModeStatic &&
 96 |                                         L == kLayoutModeAos, T*>::type
 97 | data_ptr_uninitialized() const {
 98 |   // TODO: Fix field alignment.
 99 |   return reinterpret_cast<T*>(
100 |       reinterpret_cast<char*>(const_cast<Self*>(this)) + Offset);
101 | }
102 | 
103 | template<int A = AddressMode, int S = StorageMode, int L = LayoutMode>
104 | __ikra_device__ typename std::enable_if<A == kAddressModeZero &&
105 |                                         S == kStorageModeStatic &&
106 |                                         L == kLayoutModeSoa, T*>::type
107 | data_ptr_uninitialized() const {
108 |   // Use constant-folded value for address computation.
109 |   constexpr auto cptr_data_offset =
110 |       StorageDataOffset<typename Owner::Storage>::value;
111 |   constexpr auto cptr_storage_buffer = Owner::storage_buffer();
112 |   constexpr auto array_location =
113 |       cptr_storage_buffer + cptr_data_offset + (Capacity+1)*Offset;
114 | 
115 | #ifdef __clang__
116 |   // Clang does not allow reinterpret_cast in constexprs.
117 |   constexpr T* soa_array = IKRA_fold(reinterpret_cast<T*>(array_location));
118 | #else
119 |   constexpr T* soa_array = reinterpret_cast<T*>(array_location);
120 | #endif  // __clang__
121 | 
122 |   // Check alignment.
123 |   assert(reinterpret_cast<uintptr_t>(soa_array) % 8 == 0);
124 | 
125 |   return soa_array + reinterpret_cast<uintptr_t>(this);
126 | }
127 | 
128 | template<int A = AddressMode, int S = StorageMode, int L = LayoutMode>
129 | __ikra_device__ typename std::enable_if<A == kAddressModeZero &&
130 |                                         S == kStorageModeDynamic &&
131 |                                         L == kLayoutModeSoa, T*>::type
132 | data_ptr_uninitialized() const {
133 |   // Cannot constant fold dynamically allocated storage.
134 |   auto p_base = Owner::storage().data_reference();
135 |   return reinterpret_cast<T*>(
136 |       p_base + (Capacity+1)*Offset +
137 |       reinterpret_cast<uintptr_t>(this)*sizeof(T));
138 | }
139 | 
140 | 


--------------------------------------------------------------------------------
/ikra/soa/array_shared.inc:
--------------------------------------------------------------------------------
 1 |  // This implementation is shared by all array classes. B is the base type
 2 |  // of the array. The most important functionality in here is interoperability
 3 |  // between host and device code.
 4 | 
 5 | public:
 6 | // Implement std::array interface.
 7 | 
 8 | #if defined(__CUDA_ARCH__) || !defined(__CUDACC__)
 9 | // Not running in CUDA mode or running on device: Can return a reference to
10 | // array values.
11 | 
12 | __ikra_device__ B& operator[](size_t pos) const {
13 |   return *this->array_data_ptr(pos);
14 | }
15 | 
16 | __ikra_device__ B& at(size_t pos) const {
17 |   // TODO: This function should throw an exception.
18 |   assert(pos < ArraySize);
19 |   return this->operator[](pos);
20 | }
21 | 
22 | template<size_t Pos>
23 | __ikra_device__ B& at() const {
24 |   static_assert(Pos < ArraySize, "Index out of bounds.");
25 |   return *array_data_ptr<Pos>();
26 | }
27 | 
28 | __ikra_device__ B& front() const {
29 |   return at<0>();
30 | }
31 | 
32 | __ikra_device__ B& back() const {
33 |   return at<ArraySize - 1>();
34 | }
35 | #else
36 | 
37 | // A helper class with an overridden operator= method. This class allows
38 | // clients to the "[]=" syntax for array assignment, even if the array is
39 | // physically located on the device.
40 | // Addresses of instances point to host data locations.
41 | class AssignmentHelper {
42 |  public:
43 |   // TODO: Assuming zero addressing mode. Must translate addresses in valid
44 |   // addressing mode.
45 |   void copy_from_device(B* target) {
46 |     auto dev_ptr = device_ptr();
47 |     cudaMemcpy(target, dev_ptr, sizeof(B), cudaMemcpyDeviceToHost);
48 |     assert(cudaPeekAtLastError() == cudaSuccess);
49 |   }
50 | 
51 |   B copy_from_device() {
52 |     B host_data;
53 |     copy_from_device(&host_data);
54 |     return host_data;
55 |   }
56 | 
57 |   // Implicit conversion: Copy from device.
58 |   operator B() {
59 |     return copy_from_device();
60 |   }
61 | 
62 |   // TODO: Assuming zero addressing mode.
63 |   AssignmentHelper& operator=(B value) {
64 |     cudaMemcpy(device_ptr(), &value, sizeof(B), cudaMemcpyHostToDevice);
65 |     assert(cudaPeekAtLastError() == cudaSuccess);
66 |     return *this;
67 |   }
68 | 
69 |   AssignmentHelper& operator+=(B value) {
70 |     // TODO: Implement.
71 |     printf("Warning: Calling unimplemented function AssignmentHelper+=.\n");
72 |     assert(false);
73 |   }
74 | 
75 |   B operator->() {
76 |     return copy_from_device();
77 |   }
78 | 
79 |  private:
80 |   B* device_ptr() {
81 |     return Owner::storage().translate_address_host_to_device(
82 |         reinterpret_cast<B*>(this));
83 |   }
84 | };
85 | 
86 | AssignmentHelper& operator[](size_t pos) const {
87 |   return *reinterpret_cast<AssignmentHelper*>(array_data_ptr(pos));
88 | }
89 | 
90 | AssignmentHelper& at(size_t pos) const {
91 |   // TODO: This function should throw an exception.
92 |   assert(pos < ArraySize);
93 |   return this->operator[](pos);
94 | }
95 | 
96 | // TODO: Implement template-based accessor methods.
97 | #endif
98 | 


--------------------------------------------------------------------------------
/ikra/soa/class_initialization.h:
--------------------------------------------------------------------------------
 1 | #ifndef SOA_CLASS_INITIALIZATION_H
 2 | #define SOA_CLASS_INITIALIZATION_H
 3 | 
 4 | #include "soa/util.h"
 5 | 
 6 | #define IKRA_INITIALIZE_CLASS \
 7 | /* This class is used to calculate field offsets inside a SOA class. For every
 8 |  * field declaration, a new class specialization will be generated using
 9 |  * macro expansion. "N" is index of the field and "value" is the offset of
10 |  * that field in bytes. "DummyT" is required only because full template
11 |  * specializations are not allowed for nested classes in C++. */ \
12 | template<uint32_t N, typename DummyT = void> \
13 | struct OffsetCounter { \
14 |   /* Used by ObjectSize_. Indicates that the no more fields are available. */ \
15 |   static const bool kReachedGeneralCase = true; \
16 | \
17 |   static void DBG_print_offsets() { /* Nothing to do. */ } \
18 | }; \
19 | \
20 | /* Base case for first field declaration. */ \
21 | template<typename DummyT> \
22 | struct OffsetCounter<0, DummyT> { \
23 |   /* Offset of field 0 in bytes. */ \
24 |   static const uint32_t value = 0; \
25 | \
26 |   /* Used by ObjectSize_. Indicates that this template instantiation was
27 |   * generated by a field declaration. */ \
28 |   static const bool kIsSpecialization = true; \
29 | \
30 |   static void DBG_print_offsets() { \
31 |     printf("OffsetCounter<%i>::value = %u\n", 0, value); \
32 |     OffsetCounter<1>::DBG_print_offsets(); \
33 |   } \
34 | }; \
35 | \
36 | /* This class calculates object size and number of fields.
37 |  * Note: This class cannot be used from within a SOA class. Only when all
38 |  * fields are defined, members of this class may be accessed. */ \
39 | template<uint32_t N> \
40 | struct ObjectSize_ { \
41 |  private: \
42 |   template<int N0 = N> \
43 |   static typename std::enable_if< \
44 |       OffsetCounter<N0>::kIsSpecialization, \
45 |       std::integral_constant<int, ObjectSize_<N0 + 1>::kNumFields>>::type \
46 |   sfinae_dummy() {} \
47 | \
48 |   template<int N0 = N> \
49 |   static typename std::enable_if< \
50 |       OffsetCounter<N0>::kReachedGeneralCase, \
51 |       std::integral_constant<int, N0 - 1>>::type \
52 |   sfinae_dummy() {} \
53 | \
54 |  public: \
55 |   static const int kNumFields = decltype(sfinae_dummy())::value; \
56 |   static const uintptr_t value = OffsetCounter<kNumFields>::value; \
57 | }; \
58 | \
59 | /* Alias for simplier access of object size. */ \
60 | using ObjectSize = ObjectSize_<0>; \
61 | /* Function that returns a reference to the storage buffer.
62 |  * Will be inlined. Pointer dereference will be optimized out. */ \
63 | __ikra_host_device__ static constexpr char* storage_buffer(); \
64 | \
65 | static const uint32_t kCounterFirstIndex = __COUNTER__; \
66 | /* Debug function printing offsets. */ \
67 | static void DBG_print_offsets() { \
68 |   OffsetCounter<0>::DBG_print_offsets(); \
69 | }
70 | 
71 | #endif  // SOA_CLASS_INITIALIZATION_H
72 | 


--------------------------------------------------------------------------------
/ikra/soa/constants.h:
--------------------------------------------------------------------------------
 1 | #ifndef SOA_CONSTANTS_H
 2 | #define SOA_CONSTANTS_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace ikra {
 7 | namespace soa {
 8 | 
 9 | // In Zero Addressing Mode, the address of an object is its ID. E.g., the
10 | // address of the first object (ID 0) is nullptr. Pointer arithmetics is not
11 | // possible in Zero Addressing Mode, because the size of a SOA class is zero.
12 | // I.e., sizeof(MyClass) = 0. Increasing the size to 1 does not work, because
13 | // instance create will attempt to zero-initialize data at an invalid address.
14 | // Zero Addressing Mode results in efficient assembly code for field reads/
15 | // writes. The address of a field `i` in object with ID t is defined as:
16 | // this*sizeof(type(i)) + Capacity*Offset(i) + ClassStorageBase.
17 | // Notice that the second operand is a compile-time constant if the container
18 | // size (max. number of elements of a class) is a compile-time constant.
19 | // See test/soa/benchmarks/codegen_test.cc for inspection of assembly code.
20 | static const int kAddressModeZero = 0;
21 | 
22 | // In Valid Addressing Mode, the address of an object is the address of its
23 | // first field, i.e., the beginning of the class storage data chunk
24 | // (ClassStorageBase) plus ID*sizeof(first field type). The size of an object
25 | // now defined as the size of the first field, allowing for pointer arithmetics
26 | // on objects. Generated assembly code is less efficient, because the address
27 | // of a field is now complex:
28 | // (this - ClassStorageBase - sizeof(first field type))
29 | //         / sizeof(first field type) * sizeof(type(i)) + 
30 | //     Capacity*Offset(i) + ClassStorageBase
31 | // See test/soa/pointer_arithmetics_test.cc for pointer arithmetics examples.
32 | static const int kAddressModeValid = 1;
33 | 
34 | static const int kLayoutModeAos = 0;
35 | 
36 | static const int kLayoutModeSoa = 1;
37 | 
38 | // The type that is used to represent indices.
39 | using IndexType = uintptr_t;
40 | 
41 | }  // namespace soa
42 | }  // namespace ikra
43 | 
44 | #endif  // SOA_CONSTANTS_H
45 | 


--------------------------------------------------------------------------------
/ikra/soa/cuda.h:
--------------------------------------------------------------------------------
 1 | #ifndef SOA_CUDA_H
 2 | #define SOA_CUDA_H
 3 | 
 4 | #ifdef __CUDACC__
 5 | 
 6 | #ifdef __CUDA_ARCH__
 7 | // Make all SOA functions device functions when compiling device code.
 8 | #define __ikra_device__ __device__
 9 | #else
10 | #define __ikra_device__
11 | #endif  // __CUDA_ARCH__
12 | 
13 | #define __ikra_host_device__ __host__ __device__
14 | 
15 | // Error checking code taken from:
16 | // https://stackoverflow.com/questions/14038589/
17 | // what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
18 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
19 | inline void gpuAssert(cudaError_t code, const char *file, int line,
20 |                       bool abort = true) {
21 |    if (code != cudaSuccess) {
22 |       fprintf(stderr,"GPUassert: %s %s %d\n",
23 |               cudaGetErrorString(code), file, line);
24 |       if (abort) exit(code);
25 |    }
26 | }
27 | 
28 | #else
29 | 
30 | #define __ikra_device__
31 | #define __ikra_host_device__
32 | 
33 | #endif  // __CUDACC__
34 | 
35 | #endif  // SOA_CUDA_H
36 | 


--------------------------------------------------------------------------------
/ikra/soa/field.h:
--------------------------------------------------------------------------------
 1 | #ifndef SOA_FIELD_H
 2 | #define SOA_FIELD_H
 3 | 
 4 | #include "soa/constants.h"
 5 | #include "soa/cuda.h"
 6 | #include "soa/storage.h"
 7 | #include "soa/util.h"
 8 | 
 9 | // This marco is expanded for every inplace assignment operator and forwards
10 | // the operation to the wrapped data.
11 | #define IKRA_DEFINE_FIELD_ASSIGNMENT(symbol) \
12 |   __ikra_device__ Self operator symbol ## = (T value) { \
13 |     *data_ptr() symbol ## = value; \
14 |     return *this; \
15 |   }
16 | 
17 | 
18 | namespace ikra {
19 | namespace soa {
20 | 
21 | template<typename T,
22 |          IndexType Capacity,
23 |          uint32_t Offset,
24 |          int AddressMode,
25 |          int StorageMode,
26 |          int LayoutMode,
27 |          class Owner>
28 | class Field_ {
29 |  private:
30 |   using Self = Field_<T, Capacity, Offset, AddressMode,
31 |                       StorageMode, LayoutMode, Owner>;
32 | 
33 |  public:
34 |   static const uint32_t DBG_OFFSET = Offset;  // For debugging.
35 | 
36 |   // This class may only be used to declare fields of classes.
37 |   void* operator new(size_t count) = delete;
38 | 
39 |   // Define inplace assignment operators.
40 |   IKRA_DEFINE_FIELD_ASSIGNMENT(+);
41 |   IKRA_DEFINE_FIELD_ASSIGNMENT(-);
42 |   IKRA_DEFINE_FIELD_ASSIGNMENT(*);
43 |   IKRA_DEFINE_FIELD_ASSIGNMENT(/);
44 |   IKRA_DEFINE_FIELD_ASSIGNMENT(%);
45 |   IKRA_DEFINE_FIELD_ASSIGNMENT(&);
46 |   IKRA_DEFINE_FIELD_ASSIGNMENT(|);
47 |   IKRA_DEFINE_FIELD_ASSIGNMENT(^);
48 |   IKRA_DEFINE_FIELD_ASSIGNMENT(<<);
49 |   IKRA_DEFINE_FIELD_ASSIGNMENT(>>);
50 | 
51 |   // TODO: Implement special operators:
52 |   // http://en.cppreference.com/w/cpp/language/operator_logical
53 | 
54 |  protected:
55 | #if defined(__CUDA_ARCH__) || !defined(__CUDACC__)
56 |   // Only Owner can create new fields for itself.
57 |   friend Owner;
58 | #else
59 |   // Only Owner can create new fields for itself.
60 |   // Friend template is broken. This is an nvcc bug.
61 |   friend typename NvccWorkaroundIdentityClassHolder<Owner>::type;
62 | #endif
63 | 
64 |   __ikra_device__ Field_() {}
65 | 
66 |   // Initialize the field with a given value. Used in class constructor.
67 |   __ikra_device__ Field_(T value) {
68 |     *this = value;
69 |   }
70 | 
71 |   // Initialize the field with the value of another field. Used in class
72 |   // constructor. TODO: Unclear why exactly we need this one...
73 |   __ikra_device__ Field_(const Field_& /*other*/) {}
74 | 
75 |   // Not sure why we need this. Required to make field inplace initialization
76 |   // work.
77 |   __ikra_device__ Field_(Field_&& /*other*/) {}
78 | 
79 | 
80 | #include "soa/field_shared.inc"
81 | #include "soa/addressable_field_shared.inc"
82 | };
83 | 
84 | }  // namespace soa
85 | }  // namespace ikra
86 | 
87 | #undef IKRA_DEFINE_FIELD_ASSIGNMENT
88 | 
89 | #endif  // SOA_FIELD_H
90 | 


--------------------------------------------------------------------------------
/ikra/soa/field_shared.inc:
--------------------------------------------------------------------------------
 1 | // This file contains methods that are common to Field_ and
 2 | // FullyInlinedArrayField_. Cannot use inheritance here because sizeof
 3 | // cannot be set to 0 in CUDA with inheritance.
 4 | 
 5 | protected:
 6 | template<int A = AddressMode>
 7 | __ikra_device__
 8 | typename std::enable_if<A != kAddressModeZero, IndexType>::type
 9 | id() const {
10 |   return (reinterpret_cast<uintptr_t>(this) -
11 |           reinterpret_cast<uintptr_t>(Owner::storage().data_ptr()))
12 |              / A - 1 - 1;
13 | }
14 | 
15 | template<int A = AddressMode, int L = LayoutMode>
16 | __ikra_device__
17 | typename std::enable_if<A == kAddressModeZero, IndexType>::type
18 | id() const {
19 |   static_assert(sizeof(Self) == 0,
20 |                 "Zero addressing mode not supported by compiler.");
21 |   // TODO: Check if there's a slowdown here due to forwarding this.
22 |   return reinterpret_cast<Owner*>(this)->id();
23 | }
24 | 
25 | // Force size of this class to be 0.
26 | char dummy_[0];
27 | 
28 | 


--------------------------------------------------------------------------------
/ikra/soa/field_type_generator.h:
--------------------------------------------------------------------------------
 1 | // This macro is used when generating field types. A field type in user code
 2 | // such as int__(idx) should expand to an offset counter increment and a field
 3 | // type template instantiation (e.g., int_) at the computed offset.
 4 | // Note: This macro must not be undefined even though it is only called within
 5 | // this class. It is used from other macros and cannot be called anymore when
 6 | // undefined here.
 7 | #define IKRA_FIELD_TYPE_GENERATOR(type, field_id) \
 8 |   template<typename DummyT> \
 9 |   struct OffsetCounter<field_id + 1, DummyT> { \
10 |     static const uint32_t value = OffsetCounter<field_id>::value + \
11 |                                   sizeof(type); \
12 |     static const bool kIsSpecialization = true; \
13 |     static void DBG_print_offsets() { \
14 |       printf("OffsetCounter<%i>::value = %u\n", field_id + 1, value); \
15 |       OffsetCounter<field_id + 2>::DBG_print_offsets(); \
16 |     } \
17 |   }; \
18 |   soa_ ## type<OffsetCounter<field_id>::value>
19 | 
20 | #define IKRA_ARRAY_FIELD_TYPE_GENERATOR(type, size, layout, field_id) \
21 |   template<typename DummyT> \
22 |   struct OffsetCounter<field_id + 1, DummyT> { \
23 |     static const uint32_t value = OffsetCounter<field_id>::value + \
24 |         array::layout<type, size, OffsetCounter<field_id>::value>::kSize; \
25 |     static const bool kIsSpecialization = true; \
26 |     static void DBG_print_offsets() { \
27 |       printf("OffsetCounter<%i>::value = %u    [ ARRAY ]\n", \
28 |              field_id + 1, value); \
29 |       OffsetCounter<field_id + 2>::DBG_print_offsets(); \
30 |     } \
31 |   }; \
32 |   array::layout<type, size, OffsetCounter<field_id>::value>
33 | 
34 | #define IKRA_CUSTOM_FIELD_TYPE_GENERATOR(type, field_id) \
35 |   template<typename DummyT> \
36 |   struct OffsetCounter<field_id + 1, DummyT> { \
37 |     static const uint32_t value = OffsetCounter<field_id>::value + \
38 |                                   sizeof(type); \
39 |     static const bool kIsSpecialization = true; \
40 |     static void DBG_print_offsets() { \
41 |       printf("OffsetCounter<%i>::value = %u    [ CUSTOM ]\n", \
42 |              field_id + 1, value); \
43 |       OffsetCounter<field_id + 2>::DBG_print_offsets(); \
44 |     } \
45 |   }; \
46 |   Field<type, OffsetCounter<field_id>::value>
47 | 
48 | // Generate types that keep track of offsets by themselves. Add more types
49 | // as needed.
50 | #define bool__(field_id) IKRA_FIELD_TYPE_GENERATOR(bool, field_id)
51 | #define char__(field_id) IKRA_FIELD_TYPE_GENERATOR(char, field_id)
52 | #define double__(field_id) IKRA_FIELD_TYPE_GENERATOR(double, field_id)
53 | #define float__(field_id) IKRA_FIELD_TYPE_GENERATOR(float, field_id)
54 | #define int__(field_id) IKRA_FIELD_TYPE_GENERATOR(int, field_id)
55 | #define uint8_t__(field_id) IKRA_FIELD_TYPE_GENERATOR(uint8_t, field_id)
56 | #define uint16_t__(field_id) IKRA_FIELD_TYPE_GENERATOR(uint16_t, field_id)
57 | #define uint32_t__(field_id) IKRA_FIELD_TYPE_GENERATOR(uint32_t, field_id)
58 | #define uint64_t__(field_id) IKRA_FIELD_TYPE_GENERATOR(uint64_t, field_id)
59 | 
60 | #define array__(...) PP_CONCAT(ARRAY__, PP_NARG(__VA_ARGS__))(__VA_ARGS__)
61 | #define ARRAY__4(type, size, layout, field_id) \
62 |     IKRA_ARRAY_FIELD_TYPE_GENERATOR(type, size, layout, field_id)
63 | #define ARRAY__3(type, size, field_id) ARRAY_3(type, size, soa, field_id)
64 | #define ARRAY__2(arg1, arg2) \
65 |     static_assert(false, "At least three arguments required for array__.")
66 | #define ARRAY__1(arg1) ARRAY_2(0, 0)
67 | #define ARRAY__0() ARRAY_2(0, 0)
68 | 
69 | #define field__(type, field_id) \
70 |     IKRA_CUSTOM_FIELD_TYPE_GENERATOR(type, field_id)
71 | #define ref__(type, field_id) \
72 |     IKRA_CUSTOM_FIELD_TYPE_GENERATOR(type*, field_id)
73 | 
74 | // Generate types that keep track of offsets and field indices by themselves.
75 | // Add more types are needed.
76 | #define IKRA_NEXT_FIELD_ID __COUNTER__ - kCounterFirstIndex - 1
77 | #define bool_ IKRA_FIELD_TYPE_GENERATOR(bool, IKRA_NEXT_FIELD_ID)
78 | #define char_ IKRA_FIELD_TYPE_GENERATOR(char, IKRA_NEXT_FIELD_ID)
79 | #define double_ IKRA_FIELD_TYPE_GENERATOR(double, IKRA_NEXT_FIELD_ID)
80 | #define float_ IKRA_FIELD_TYPE_GENERATOR(float, IKRA_NEXT_FIELD_ID)
81 | #define int_ IKRA_FIELD_TYPE_GENERATOR(int, IKRA_NEXT_FIELD_ID)
82 | #define uint8_t_ IKRA_FIELD_TYPE_GENERATOR(uint8_t, IKRA_NEXT_FIELD_ID)
83 | #define uint16_t_ IKRA_FIELD_TYPE_GENERATOR(uint16_t, IKRA_NEXT_FIELD_ID)
84 | #define uint32_t_ IKRA_FIELD_TYPE_GENERATOR(uint32_t, IKRA_NEXT_FIELD_ID)
85 | #define uint64_t_ IKRA_FIELD_TYPE_GENERATOR(uint64_t, IKRA_NEXT_FIELD_ID)
86 | 
87 | #define array_(...) PP_CONCAT(ARRAY_, PP_NARG(__VA_ARGS__))(__VA_ARGS__)
88 | #define ARRAY_3(type, size, layout) \
89 |     IKRA_ARRAY_FIELD_TYPE_GENERATOR(type, size, layout, IKRA_NEXT_FIELD_ID)
90 | #define ARRAY_2(type, size) ARRAY_3(type, size, soa)
91 | #define ARRAY_1(arg) \
92 |     static_assert(false, "At least two arguments required for array_.")
93 | #define ARRAY_0() ARRAY_1(0)
94 | 
95 | #define field_(type) IKRA_CUSTOM_FIELD_TYPE_GENERATOR(type, IKRA_NEXT_FIELD_ID)
96 | #define ref_(type) IKRA_CUSTOM_FIELD_TYPE_GENERATOR(type*, IKRA_NEXT_FIELD_ID)
97 | 


--------------------------------------------------------------------------------
/ikra/soa/preprocessor.h:
--------------------------------------------------------------------------------
 1 | // Code adapted from:
 2 | // https://stackoverflow.com/questions/3046889/optional-parameters-with-c-macros
 3 | 
 4 | #ifndef SOA_PREPROCESSOR_H
 5 | #define SOA_PREPROCESSOR_H
 6 | 
 7 | #define PP_NARG(...) \
 8 |     PP_NARG_(__VA_ARGS__,PP_RSEQ_N())
 9 | #define PP_NARG_(...) \
10 |     PP_ARG_N(__VA_ARGS__)
11 | #define PP_ARG_N( \
12 |     _1, _2, _3, _4, _5, _6, _7, _8, _9,_10, \
13 |     _11,_12,_13,_14,_15,_16,_17,_18,_19,_20, \
14 |     _21,_22,_23,_24,_25,_26,_27,_28,_29,_30, \
15 |     _31,_32,_33,_34,_35,_36,_37,_38,_39,_40, \
16 |     _41,_42,_43,_44,_45,_46,_47,_48,_49,_50, \
17 |     _51,_52,_53,_54,_55,_56,_57,_58,_59,_60, \
18 |     _61,_62,_63,N,...) N
19 | #define PP_RSEQ_N() \
20 |     63,62,61,60,                   \
21 |     59,58,57,56,55,54,53,52,51,50, \
22 |     49,48,47,46,45,44,43,42,41,40, \
23 |     39,38,37,36,35,34,33,32,31,30, \
24 |     29,28,27,26,25,24,23,22,21,20, \
25 |     19,18,17,16,15,14,13,12,11,10, \
26 |     9,8,7,6,5,4,3,2,1,0
27 | 
28 | #define PP_CONCAT(a,b) PP_CONCAT_(a,b)
29 | #define PP_CONCAT_(a,b) a ## b
30 | 
31 | #endif  // SOA_PREPROCESSOR_H
32 | 


--------------------------------------------------------------------------------
/ikra/soa/soa.h:
--------------------------------------------------------------------------------
 1 | #ifndef SOA_SOA_H
 2 | #define SOA_SOA_H
 3 | 
 4 | #if __cplusplus < 201103L
 5 |   #error This library needs at least a C++11 compliant compiler.
 6 | #endif
 7 | 
 8 | #if defined(__clang__)
 9 |   #warning Loop vectorization is broken when using clang.
10 | #elif defined(__GNUC__) || defined(__GNUG__)
11 |   // Everything OK with gcc.
12 | #else
13 |   #warning ikra-cpp was not tested with this compiler.
14 | #endif
15 | 
16 | #include <array>
17 | #include <cstddef>
18 | #include <cstdint>
19 | #include <type_traits>
20 | 
21 | // Asserts active only in debug mode (NDEBUG).
22 | #include <cassert>
23 | 
24 | #include "executor/iterator.h"
25 | #include "soa/array_field.h"
26 | #include "soa/constants.h"
27 | #include "soa/field.h"
28 | #include "soa/field_type_generator.h"
29 | #include "soa/partially_inlined_array_field.h"
30 | #include "soa/layout.h"
31 | #include "soa/preprocessor.h"
32 | #include "soa/storage.h"
33 | 
34 | #endif  // SOA_SOA_H
35 | 


--------------------------------------------------------------------------------
/ikra/soa/util.h:
--------------------------------------------------------------------------------
 1 | #ifndef SOA_UTIL_H
 2 | #define SOA_UTIL_H
 3 | 
 4 | 
 5 | namespace ikra {
 6 | namespace soa {
 7 | 
 8 | // This class exports its single template parameter as "type". Used as
 9 | // workaround for an nvcc bug with friend declarations.
10 | template<typename T>
11 | struct NvccWorkaroundIdentityClassHolder {
12 |   using type = T;
13 | };
14 | 
15 | // This helper macro can be used to allow reinterpret_casts in constexpr.
16 | // Only needed for clang.
17 | #define IKRA_fold(x) (__builtin_constant_p(x) ? (x) : (x))
18 | 
19 | // This class is used to check if the compiler supports the selected addressing
20 | // mode.
21 | template<uintptr_t N>
22 | class AddressingModeCompilerCheck {
23 |   char ___[N];
24 | };
25 | 
26 | }  // soa
27 | }  // ikra
28 | 
29 | #endif  // SOA_UTIL_H
30 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(array_test soa/array_test.cc)
 2 | target_link_libraries(array_test gtest gtest_main ikra)
 3 | add_test(array_test ${CMAKE_BINARY_DIR}/bin/array_test)
 4 | 
 5 | add_executable(basic_class_test soa/basic_class_test.cc)
 6 | target_link_libraries(basic_class_test gtest gtest_main ikra)
 7 | add_test(basic_class_test ${CMAKE_BINARY_DIR}/bin/basic_class_test)
 8 | 
 9 | add_executable(pointer_arithmetics_test soa/pointer_arithmetics_test.cc)
10 | target_link_libraries(pointer_arithmetics_test gtest gtest_main ikra)
11 | add_test(pointer_arithmetics_test ${CMAKE_BINARY_DIR}/bin/pointer_arithmetics_test)
12 | 
13 | add_executable(executor_test executor_test.cc)
14 | target_link_libraries(executor_test gtest gtest_main ikra)
15 | add_test(executor_test ${CMAKE_BINARY_DIR}/bin/executor_test)
16 | 
17 | 
18 | # CUDA Tests
19 | # TODO: cuda_add_library does not support INTERFACE libraries
20 | include_directories(${CMAKE_SOURCE_DIR}/ikra ${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
21 | cuda_add_executable(minimum_cuda_test minimum_cuda_test.cu)
22 | target_link_libraries(minimum_cuda_test gtest gtest_main ikra)
23 | add_test(minimum_cuda_test ${CMAKE_BINARY_DIR}/bin/minimum_cuda_test)
24 | 
25 | include_directories(${CMAKE_SOURCE_DIR}/ikra ${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
26 | cuda_add_executable(array_cuda_test array_cuda_test.cu)
27 | target_link_libraries(array_cuda_test gtest gtest_main ikra)
28 | add_test(array_cuda_test ${CMAKE_BINARY_DIR}/bin/array_cuda_test)
29 | 
30 | include_directories(${CMAKE_SOURCE_DIR}/ikra ${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
31 | cuda_add_executable(cuda_array_test soa/cuda_array_test.cu)
32 | target_link_libraries(cuda_array_test gtest gtest_main ikra)
33 | add_test(cuda_array_test ${CMAKE_BINARY_DIR}/bin/cuda_array_test)
34 | 
35 | include_directories(${CMAKE_SOURCE_DIR}/ikra ${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
36 | cuda_add_executable(cuda_inline_array_memcpy_test soa/cuda_inline_array_memcpy_test.cu)
37 | target_link_libraries(cuda_inline_array_memcpy_test gtest gtest_main ikra)
38 | add_test(cuda_inline_array_memcpy_test ${CMAKE_BINARY_DIR}/bin/cuda_inline_array_memcpy_test)
39 | 
40 | include_directories(${CMAKE_SOURCE_DIR}/ikra ${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
41 | cuda_add_executable(cuda_inline_array_memcpy_aos_test soa/cuda_inline_array_memcpy_aos_test.cu)
42 | target_link_libraries(cuda_inline_array_memcpy_aos_test gtest gtest_main ikra)
43 | add_test(cuda_inline_array_memcpy_aos_test ${CMAKE_BINARY_DIR}/bin/cuda_inline_array_memcpy_aos_test)
44 | 


--------------------------------------------------------------------------------
/test/array_cuda_test.cu:
--------------------------------------------------------------------------------
 1 | #include <array>
 2 | 
 3 | #include "gtest/gtest.h"
 4 | 
 5 | #include "executor/cuda_executor.h"
 6 | #include "soa/soa.h"
 7 | 
 8 | using ikra::soa::IndexType;
 9 | using ikra::soa::SoaLayout;
10 | using ikra::executor::cuda::construct;
11 | 
12 | const static int kTestSize = 12;
13 | 
14 | class TestClass : public SoaLayout<TestClass, 1000> {
15 |  public:
16 |   IKRA_INITIALIZE_CLASS
17 | 
18 |   TestClass(std::array<double, 2> f0, std::array<double, 2> f1) {
19 |     field0[0] = f0[0];
20 |     field0[1] = f0[1];
21 |     field1[0] = f1[0];
22 |     field1[1] = f1[1];
23 |   }
24 | 
25 |   array_(double, 2, fully_inlined) field0;
26 |   array_(double, 2, fully_inlined) field1;
27 | 
28 |   __device__ void add_fields() {
29 |     field0[0] = field0[0] + field0[1] + field1[0] + field1[1];
30 |   }
31 | };
32 | 
33 | IKRA_DEVICE_STORAGE(TestClass);
34 | 
35 | 
36 | // Cannot run "cuda_execute" inside gtest case.
37 | void run_test_set_from_host_and_read() {
38 |   TestClass::initialize_storage();
39 |   EXPECT_EQ(TestClass::size(), 0UL);
40 | 
41 |   for (int i = 0; i < kTestSize; ++i) {
42 |     new TestClass({{ 1.0, 2.0 }}, {{ 3.0, 4.0 }});
43 |   }
44 |   gpuErrchk(cudaPeekAtLastError());
45 | 
46 |   TestClass* first = TestClass::get(0);
47 |   cuda_execute(&TestClass::add_fields, first, kTestSize);
48 | 
49 |   // Check result.
50 |   for (int i = 0; i < kTestSize; ++i) {
51 |     double actual = TestClass::get(i)->field0[0];
52 |     double expected = 1.0 + 2.0 + 3.0 + 4.0;
53 |     EXPECT_EQ(actual, expected);
54 |   }
55 | 
56 |   // Copy size to host memory and compare.
57 |   EXPECT_EQ(TestClass::size(), static_cast<IndexType>(kTestSize));
58 | 
59 |   // Make sure that we had no CUDA failures.
60 |   gpuErrchk(cudaPeekAtLastError());
61 | }
62 | 
63 | TEST(ArrayCudaTest, SetFromHostAndRead) {
64 |   // This test ensures that address computation for array fields is done
65 |   // correctly.
66 |   run_test_set_from_host_and_read();
67 | }
68 | 


--------------------------------------------------------------------------------
/test/executor_test.cc:
--------------------------------------------------------------------------------
  1 | #include "gtest/gtest.h"
  2 | #include "executor/executor.h"
  3 | #include "executor/iterator.h"
  4 | #include "soa/soa.h"
  5 | 
  6 | namespace {
  7 | using ikra::soa::SoaLayout;
  8 | using ikra::soa::kAddressModeZero;
  9 | using ikra::executor::construct;
 10 | using ikra::executor::execute;
 11 | using ikra::executor::make_iterator;
 12 | 
 13 | static const int kClassMaxInst = 1024;
 14 | static const int kTestSize = 40;
 15 | 
 16 | // Zero addressing mode.
 17 | #define IKRA_TEST_CLASSNAME TestClassZ
 18 | #define IKRA_TEST_ADDRESS_MODE kAddressModeZero
 19 | #include "executor_test_layout.inc"
 20 | #undef IKRA_TEST_CLASSNAME
 21 | #undef IKRA_TEST_ADDRESS_MODE
 22 | IKRA_HOST_STORAGE(TestClassZ)
 23 | 
 24 | // Valid addressing mode.
 25 | #define IKRA_TEST_CLASSNAME TestClassV
 26 | #define IKRA_TEST_ADDRESS_MODE sizeof(int)
 27 | #include "executor_test_layout.inc"
 28 | #undef IKRA_TEST_CLASSNAME
 29 | #undef IKRA_TEST_ADDRESS_MODE
 30 | IKRA_HOST_STORAGE(TestClassV)
 31 | 
 32 | template<typename T>
 33 | class ExecutorTest : public testing::Test {};
 34 | 
 35 | TYPED_TEST_CASE_P(ExecutorTest);
 36 | 
 37 | TYPED_TEST_P(ExecutorTest, StdArray) {
 38 |   TypeParam::initialize_storage();
 39 |   std::array<TypeParam*, kTestSize> arr;
 40 | 
 41 |   for (int i = 0; i < kTestSize; ++i) {
 42 |     arr[i] = new TypeParam(i + 1, 100*i + 1);
 43 |   }
 44 | 
 45 |   execute(arr.begin(), arr.end(), &TypeParam::add_field1_and_a_to_field0, 50);
 46 | 
 47 |   // Check result
 48 |   for (int i = 0; i < kTestSize; ++i) {
 49 |     int expected = i + 1 + 100*i + 1 + 50;
 50 |     EXPECT_EQ(arr[i]->field0, expected);
 51 |   }
 52 | }
 53 | 
 54 | TYPED_TEST_P(ExecutorTest, IteratorFromPointers) {
 55 |   TypeParam::initialize_storage();
 56 |   std::array<TypeParam*, kTestSize> arr;
 57 | 
 58 |   for (int i = 0; i < kTestSize; ++i) {
 59 |     arr[i] = new TypeParam(i + 1, 200*i + 1);
 60 |   }
 61 | 
 62 |   execute(make_iterator(arr[0]),
 63 |           ++make_iterator(arr[kTestSize - 1]),
 64 |           &TypeParam::add_field1_and_a_to_field0,
 65 |           50);
 66 | 
 67 |   // Check result
 68 |   for (int i = 0; i < kTestSize; ++i) {
 69 |     int expected = i + 1 + 200*i + 1 + 50;
 70 |     EXPECT_EQ(arr[i]->field0, expected);
 71 |   }
 72 | }
 73 | 
 74 | TYPED_TEST_P(ExecutorTest, ExecuteAndReduce) {
 75 |   TypeParam::initialize_storage();
 76 |   std::array<TypeParam*, kTestSize> arr;
 77 | 
 78 |   for (int i = 0; i < kTestSize; ++i) {
 79 |     arr[i] = new TypeParam(i, 100*i);
 80 |   }
 81 | 
 82 |   int expected = ((kTestSize - 1)*kTestSize)/2*101 + kTestSize;
 83 |   int actual = execute_and_reduce(make_iterator(arr[0]),
 84 |                                   ++make_iterator(arr[kTestSize - 1]),
 85 |                                   &TypeParam::sum_plus_delta,
 86 |                                   [](int a, int b) { return a + b; }, 0, 1);
 87 | 
 88 |   EXPECT_EQ(actual, expected);
 89 | }
 90 | 
 91 | TYPED_TEST_P(ExecutorTest, Construct) {
 92 |   TypeParam::initialize_storage();
 93 |   TypeParam* first = construct<TypeParam>(kTestSize, 1, 2);
 94 | 
 95 |   int expected = (20 + 1 + 2)*kTestSize;
 96 |   int actual = execute_and_reduce(make_iterator(first),
 97 |                                   make_iterator(first) + kTestSize,
 98 |                                   &TypeParam::sum_plus_delta,
 99 |                                   [](int a, int b) { return a + b; }, 0, 20);
100 |   EXPECT_EQ(actual, expected);
101 | }
102 | 
103 | REGISTER_TYPED_TEST_CASE_P(ExecutorTest,
104 |                            StdArray,
105 |                            IteratorFromPointers,
106 |                            ExecuteAndReduce,
107 |                            Construct);
108 | 
109 | INSTANTIATE_TYPED_TEST_CASE_P(Valid, ExecutorTest, TestClassV);
110 | INSTANTIATE_TYPED_TEST_CASE_P(Zero, ExecutorTest, TestClassZ);
111 | 
112 | }  // namespace
113 | 
114 | 


--------------------------------------------------------------------------------
/test/executor_test_layout.inc:
--------------------------------------------------------------------------------
 1 | class IKRA_TEST_CLASSNAME : public SoaLayout<IKRA_TEST_CLASSNAME,
 2 |                                              kClassMaxInst,
 3 |                                              IKRA_TEST_ADDRESS_MODE> {
 4 |  public:
 5 |   IKRA_INITIALIZE_CLASS
 6 | 
 7 |   int_ field0;
 8 |   int_ field1;
 9 | 
10 |   IKRA_TEST_CLASSNAME(int a, int b) : field0(a), field1(b) {}
11 | 
12 |   void add_field1_and_a_to_field0(int a) {
13 |     field0 += field1 + a;
14 |   }
15 | 
16 |   int sum_plus_delta(int delta) {
17 |     return field0 + field1 + delta;
18 |   }
19 | };
20 | 


--------------------------------------------------------------------------------
/test/minimum_cuda_test.cu:
--------------------------------------------------------------------------------
  1 | #include <array>
  2 | 
  3 | #include "gtest/gtest.h"
  4 | 
  5 | #include "executor/cuda_executor.h"
  6 | #include "soa/soa.h"
  7 | 
  8 | using ikra::soa::IndexType;
  9 | using ikra::soa::SoaLayout;
 10 | using ikra::executor::cuda::construct;
 11 | 
 12 | const static int kTestSize = 12;
 13 | 
 14 | class Vertex : public SoaLayout<Vertex, 1000> {
 15 |  public:
 16 |   IKRA_INITIALIZE_CLASS
 17 | 
 18 |   __host__ __device__ Vertex(int f0, int f1) : field0(f0), field1(f1) {}
 19 | 
 20 |   int_ field0;
 21 |   int_ field1;
 22 | 
 23 |   __device__ void add_fields(int increment) {
 24 |     field0 = field0 + field1 + increment + this->id();
 25 |   }
 26 | };
 27 | 
 28 | IKRA_DEVICE_STORAGE(Vertex);
 29 | 
 30 | 
 31 | // Cannot run "cuda_execute" inside gtest case.
 32 | void run_test_construct_and_execute() {
 33 |   Vertex::initialize_storage();
 34 |   EXPECT_EQ(Vertex::size(), 0UL);
 35 | 
 36 |   Vertex* first = construct<Vertex>(kTestSize, 5, 6);
 37 |   gpuErrchk(cudaPeekAtLastError());
 38 | 
 39 |   cuda_execute(&Vertex::add_fields, first, 12, 10);
 40 | 
 41 |   // Check result.
 42 |   for (int i = 0; i < kTestSize; ++i) {
 43 |     int actual = Vertex::get(i)->field0;
 44 |     int expected = 10 + 5 + 6 + i;
 45 |     EXPECT_EQ(actual, expected);
 46 |   }
 47 | 
 48 |   // Copy size to host memory and compare.
 49 |   EXPECT_EQ(Vertex::size(), static_cast<IndexType>(kTestSize));
 50 | 
 51 |   // Make sure that we had no CUDA failures.
 52 |   gpuErrchk(cudaPeekAtLastError());
 53 | }
 54 | 
 55 | void run_test_host_side_assignment() {
 56 |   Vertex::initialize_storage();
 57 |   EXPECT_EQ(Vertex::size(), 0UL);
 58 | 
 59 |   Vertex* first = construct<Vertex>(kTestSize, 5, 6);
 60 |   cuda_execute(&Vertex::add_fields, first, kTestSize, 10);
 61 | 
 62 |   for (int i = 0; i < kTestSize; ++i) {
 63 |     Vertex::get(i)->field0 = Vertex::get(i)->field0*Vertex::get(i)->field0;
 64 |   }
 65 | 
 66 |   // Check result.
 67 |   for (int i = 0; i < kTestSize; ++i) {
 68 |     int actual = Vertex::get(i)->field0;
 69 |     int expected = (10 + 5 + 6 + i)*(10 + 5 + 6 + i);
 70 |     EXPECT_EQ(actual, expected);
 71 |   }
 72 | 
 73 |   // Copy size to host memory and compare.
 74 |   EXPECT_EQ(Vertex::size(), static_cast<IndexType>(kTestSize));
 75 | 
 76 |   // Make sure that we had no CUDA failures.
 77 |   gpuErrchk(cudaPeekAtLastError());
 78 | }
 79 | 
 80 | void run_test_host_side_new() {
 81 |   Vertex::initialize_storage();
 82 |   EXPECT_EQ(Vertex::size(), 0UL);
 83 | 
 84 |   std::array<Vertex*, kTestSize> vertices;
 85 | 
 86 |   for (int i = 0; i < kTestSize; ++i) {
 87 |     vertices[i] = new Vertex(i + 1, i * i);
 88 |     EXPECT_EQ(vertices[i]->id(), static_cast<IndexType>(i));
 89 |   }
 90 | 
 91 |   cuda_execute(&Vertex::add_fields, vertices[0], kTestSize, 10);
 92 | 
 93 |   // Check result.
 94 |   for (int i = 0; i < kTestSize; ++i) {
 95 |     int actual = Vertex::get(i)->field0;
 96 |     int expected = 10 + i + (i + 1) + (i*i);
 97 |     EXPECT_EQ(actual, expected);
 98 | 
 99 |     actual = vertices[i]->field1;
100 |     expected = i*i;
101 |     EXPECT_EQ(actual, expected);
102 |   }
103 | 
104 |   // Copy size to host memory and compare.
105 |   EXPECT_EQ(Vertex::size(), static_cast<IndexType>(kTestSize));
106 | 
107 |   // Make sure that we had no CUDA failures.
108 |   gpuErrchk(cudaPeekAtLastError());
109 | }
110 | 
111 | void run_test_placement_new() {
112 |   Vertex::initialize_storage();
113 |   EXPECT_EQ(Vertex::size(), 0UL);
114 | 
115 |   Vertex* v1 = new(Vertex::get_uninitialized(4)) Vertex(10, 20);
116 |   EXPECT_EQ(Vertex::size(), 0UL);
117 |   EXPECT_EQ(v1->field0, 10);
118 |   EXPECT_EQ(v1->field1, 20);
119 |   EXPECT_EQ(v1->id(), 4);
120 | 
121 |   // Make sure that we had no CUDA failures.
122 |   gpuErrchk(cudaPeekAtLastError());
123 | }
124 | 
125 | TEST(MinimumCudaTest, ConstructAndExecute) {
126 |   run_test_construct_and_execute();
127 | }
128 | 
129 | TEST(MinimumCudaTest, HostSideAssignment) {
130 |   run_test_host_side_assignment();
131 | }
132 | 
133 | TEST(MinimumCudaTest, HostSideNew) {
134 |   run_test_host_side_new();
135 | }
136 | 
137 | TEST(MinimumCudaTest, PlacementNew) {
138 |   run_test_placement_new();
139 | }
140 | 
141 | 


--------------------------------------------------------------------------------
/test/soa/array_test.cc:
--------------------------------------------------------------------------------
  1 | #include "gtest/gtest.h"
  2 | #include "soa/soa.h"
  3 | 
  4 | namespace {
  5 | using ikra::soa::SoaLayout;
  6 | using ikra::soa::kAddressModeZero;
  7 | 
  8 | static const int kClassMaxInst = 1024;
  9 | static const int kTestSize = 9;
 10 | 
 11 | // Zero addressing mode.
 12 | #define IKRA_TEST_CLASSNAME TestClassZ
 13 | #define IKRA_TEST_ADDRESS_MODE kAddressModeZero
 14 | #include "array_test_layout.inc"
 15 | #undef IKRA_TEST_CLASSNAME
 16 | #undef IKRA_TEST_ADDRESS_MODE
 17 | IKRA_HOST_STORAGE(TestClassZ)
 18 | 
 19 | // Valid addressing mode.
 20 | #define IKRA_TEST_CLASSNAME TestClassV
 21 | #define IKRA_TEST_ADDRESS_MODE sizeof(int)
 22 | #include "array_test_layout.inc"
 23 | #undef IKRA_TEST_CLASSNAME
 24 | #undef IKRA_TEST_ADDRESS_MODE
 25 | IKRA_HOST_STORAGE(TestClassV)
 26 | 
 27 | template<typename T>
 28 | class ArrayTest : public testing::Test {};
 29 | 
 30 | TYPED_TEST_CASE_P(ArrayTest);
 31 | 
 32 | TYPED_TEST_P(ArrayTest, AosArray) {
 33 |   TypeParam::initialize_storage();
 34 |   TypeParam** instances = new TypeParam*[kTestSize];
 35 | 
 36 |   // Create a few instances
 37 |   for (int i = 0; i < kTestSize; ++i) {
 38 |     instances[i] = new TypeParam();
 39 |     instances[i]->field0 = i + 1;
 40 |     EXPECT_EQ(instances[i]->field0, i + 1);
 41 |     instances[i]->field2 = i*i;
 42 |   }
 43 | 
 44 |   // Set values.
 45 |   for (int i = 0; i < kTestSize; ++i) {
 46 |     instances[i]->field1[0] = 1*i + 1;
 47 |     instances[i]->field1[1] = 2*i + 2;
 48 |     // TODO: Should use "." for consistency.
 49 |     instances[i]->field1->at(2) = 3*i + 3;
 50 |   }
 51 | 
 52 |   // Check result.
 53 |   for (int i = 0; i < kTestSize; ++i) {
 54 |     // Ensure that no data was overwritten.
 55 |     EXPECT_EQ(instances[i]->field0, i + 1);
 56 |     EXPECT_EQ(instances[i]->field2, i*i);
 57 | 
 58 |     int expected = i+1 + 2*i+2 + 3*i+3;
 59 |     EXPECT_EQ(instances[i]->field1_sum(), expected);
 60 |   }
 61 | }
 62 | 
 63 | TYPED_TEST_P(ArrayTest, SoaArray) {
 64 |   TypeParam::initialize_storage();
 65 |   TypeParam** instances = new TypeParam*[kTestSize];
 66 | 
 67 |   // Create a few instances
 68 |   for (int i = 0; i < kTestSize; ++i) {
 69 |     instances[i] = new TypeParam();
 70 |   }
 71 | 
 72 |   // Set values.
 73 |   for (int i = 0; i < kTestSize; ++i) {
 74 |     instances[i]->field3[0] = 1*i + 1;
 75 |     instances[i]->field3[1] = 2*i + 2;
 76 |     instances[i]->field3->at(2) = 3*i + 3;
 77 |   }
 78 | 
 79 |   // Check result.
 80 |   for (int i = 0; i < kTestSize; ++i) {
 81 |     int expected = i+1 + 2*i+2 + 3*i+3;
 82 |     EXPECT_EQ(instances[i]->field3_sum(), expected);
 83 |   }
 84 | }
 85 | 
 86 | TYPED_TEST_P(ArrayTest, SoaArrayStaticGet) {
 87 |   TypeParam::initialize_storage();
 88 |   TypeParam** instances = new TypeParam*[kTestSize];
 89 | 
 90 |   // Create a few instances
 91 |   for (int i = 0; i < kTestSize; ++i) {
 92 |     instances[i] = new TypeParam();
 93 |   }
 94 | 
 95 |   // Set values.
 96 |   for (int i = 0; i < kTestSize; ++i) {
 97 |     instances[i]->field3.template at<0>() = 1*i + 1;
 98 |     instances[i]->field3.template at<1>() = 2*i + 2;
 99 |     instances[i]->field3.template at<2>() = 3*i + 3;
100 |   }
101 | 
102 |   // Check result.
103 |   for (int i = 0; i < kTestSize; ++i) {
104 |     int expected = i+1 + 2*i+2 + 3*i+3;
105 |     EXPECT_EQ(instances[i]->field3_sum(), expected);
106 |   }
107 | }
108 | 
109 | REGISTER_TYPED_TEST_CASE_P(ArrayTest,
110 |                            AosArray,
111 |                            SoaArray,
112 |                            SoaArrayStaticGet);
113 | 
114 | INSTANTIATE_TYPED_TEST_CASE_P(Valid, ArrayTest, TestClassV);
115 | INSTANTIATE_TYPED_TEST_CASE_P(Zero, ArrayTest, TestClassZ);
116 | 
117 | }  // namespace
118 | 


--------------------------------------------------------------------------------
/test/soa/array_test_layout.inc:
--------------------------------------------------------------------------------
 1 | class IKRA_TEST_CLASSNAME
 2 |     : public SoaLayout<IKRA_TEST_CLASSNAME, kClassMaxInst,
 3 |                        IKRA_TEST_ADDRESS_MODE> {
 4 |  public:
 5 |   IKRA_INITIALIZE_CLASS
 6 | 
 7 |   int_ field0;
 8 | 
 9 |   // Array has size 12 bytes.
10 |   array_(int, 3, object) field1;
11 |   int_ field2;
12 | 
13 |   // Array has size 12 bytes.
14 |   array_(int, 3, fully_inlined) field3;
15 |   int_ field4;
16 | 
17 |   int field1_sum() {
18 |     int result = 0;
19 |     for (int i = 0; i < 3; ++i) {
20 |       result = result + field1[i];
21 |     }
22 |     return result;
23 |   }
24 | 
25 |   int field3_sum() {
26 |     int result = 0;
27 |     for (int i = 0; i < 3; ++i) {
28 |       result = result + field3[i];
29 |     }
30 |     return result;
31 |   }
32 | };
33 | 
34 | 


--------------------------------------------------------------------------------
/test/soa/basic_class_test.cc:
--------------------------------------------------------------------------------
  1 | #include "gtest/gtest.h"
  2 | #include "soa/soa.h"
  3 | 
  4 | namespace {
  5 | using ikra::soa::SoaLayout;
  6 | using ikra::soa::StaticStorage;
  7 | using ikra::soa::kAddressModeZero;
  8 | using ikra::soa::kLayoutModeSoa;
  9 | using ikra::soa::kLayoutModeAos;
 10 | 
 11 | static const int kClassMaxInst = 1024;
 12 | static const int kTestSize = 40;
 13 | 
 14 | // Zero addressing mode.
 15 | #define IKRA_TEST_CLASSNAME TestClassZ_Soa
 16 | #define IKRA_TEST_ADDRESS_MODE kAddressModeZero
 17 | #define IKRA_TEST_LAYOUT_MODE kLayoutModeSoa
 18 | #include "basic_class_test_layout.inc"
 19 | #undef IKRA_TEST_CLASSNAME
 20 | #undef IKRA_TEST_ADDRESS_MODE
 21 | IKRA_HOST_STORAGE(TestClassZ_Soa)
 22 | 
 23 | #define IKRA_TEST_CLASSNAME TestClassZ_Aos
 24 | #define IKRA_TEST_ADDRESS_MODE kAddressModeZero
 25 | #define IKRA_TEST_LAYOUT_MODE kLayoutModeAos
 26 | #include "basic_class_test_layout.inc"
 27 | #undef IKRA_TEST_CLASSNAME
 28 | #undef IKRA_TEST_ADDRESS_MODE
 29 | IKRA_HOST_STORAGE(TestClassZ_Aos)
 30 | 
 31 | // Valid addressing mode.
 32 | #define IKRA_TEST_CLASSNAME TestClassV
 33 | #define IKRA_TEST_ADDRESS_MODE sizeof(int)
 34 | #define IKRA_TEST_LAYOUT_MODE kLayoutModeSoa
 35 | #include "basic_class_test_layout.inc"
 36 | #undef IKRA_TEST_CLASSNAME
 37 | #undef IKRA_TEST_ADDRESS_MODE
 38 | IKRA_HOST_STORAGE(TestClassV)
 39 | 
 40 | 
 41 | template<typename T>
 42 | class BasicClassTest : public testing::Test {};
 43 | 
 44 | TYPED_TEST_CASE_P(BasicClassTest);
 45 | 
 46 | TYPED_TEST_P(BasicClassTest, Fields) {
 47 |   TypeParam::initialize_storage();
 48 |   TypeParam** instances = new TypeParam*[kTestSize];
 49 | 
 50 |   // Create a few instances
 51 |   for (int i = 0; i < kTestSize; ++i) {
 52 |     instances[i] = new TypeParam();
 53 |   }
 54 | 
 55 |   // Set values.
 56 |   for (int i = 0; i < kTestSize; ++i) {
 57 |     instances[i]->field0 = i + 99;
 58 |     EXPECT_EQ(instances[i]->field0, i + 99);
 59 | 
 60 |     instances[i]->field4 = i*i;
 61 |     EXPECT_EQ(instances[i]->field4, i*i);
 62 | 
 63 |     instances[i]->field2 = i % 3 == 0;
 64 |     EXPECT_EQ(instances[i]->field2, i % 3 == 0);
 65 | 
 66 |     int expected = i % 3 == 0 ? i*i : -1;
 67 |     EXPECT_EQ(instances[i]->get_field4_if_field2(), expected);
 68 |   }
 69 | 
 70 |   // Do increment.
 71 |   for (int i = 0; i < kTestSize; ++i) {
 72 |     instances[i]->add_field0_to_field4();
 73 |   }
 74 | 
 75 |   // Check result.
 76 |   for (int i = 0; i < kTestSize; ++i) {
 77 |     int expected = i % 3 == 0 ? i + i*i + 99: -1;
 78 |     EXPECT_EQ(instances[i]->get_field4_if_field2(), expected);
 79 |   }
 80 | }
 81 | 
 82 | TYPED_TEST_P(BasicClassTest, SetFieldsAfterNew) {
 83 |   TypeParam::initialize_storage();
 84 |   TypeParam** instances = new TypeParam*[kTestSize];
 85 | 
 86 |   // Create a few instances
 87 |   for (int i = 0; i < kTestSize; ++i) {
 88 |     instances[i] = new TypeParam();
 89 | 
 90 |     instances[i]->field0 = i;
 91 |     EXPECT_EQ(instances[i]->field0, i);
 92 | 
 93 |     instances[i]->field4 = i*i;
 94 |     EXPECT_EQ(instances[i]->field4, i*i);
 95 | 
 96 |     instances[i]->field2 = i % 3 == 0;
 97 |     EXPECT_EQ(instances[i]->field2, i % 3 == 0);
 98 | 
 99 |     int expected = i % 3 == 0 ? i*i : -1;
100 |     EXPECT_EQ(instances[i]->get_field4_if_field2(), expected);
101 |   }
102 | 
103 |   // Do increment.
104 |   for (int i = 0; i < kTestSize; ++i) {
105 |     instances[i]->add_field0_to_field4();
106 |   }
107 | 
108 |   // Check result.
109 |   for (int i = 0; i < kTestSize; ++i) {
110 |     int expected = i % 3 == 0 ? i + i*i : -1;
111 |     EXPECT_EQ(instances[i]->get_field4_if_field2(), expected);
112 |   }
113 | }
114 | 
115 | TYPED_TEST_P(BasicClassTest, Constructor) {
116 |   TypeParam::initialize_storage();
117 |   TypeParam** instances = new TypeParam*[kTestSize];
118 | 
119 |   // Create a few instances
120 |   for (int i = 0; i < kTestSize; ++i) {
121 |     instances[i] = new TypeParam(3*i, 5*i);
122 |   }
123 | 
124 |   // Check result.
125 |   for (int i = 0; i < kTestSize; ++i) {
126 |     EXPECT_EQ((int) instances[i]->field0, 3*i);
127 |     EXPECT_EQ((int) instances[i]->field4, 5*i);
128 |   }
129 | }
130 | 
131 | TYPED_TEST_P(BasicClassTest, PlacementNew) {
132 |   TypeParam::initialize_storage();
133 |   EXPECT_EQ(TypeParam::size(), 0UL);
134 | 
135 |   TypeParam* obj = new(TypeParam::get_uninitialized(4)) TypeParam();
136 |   EXPECT_EQ(obj->id(), 4);
137 | }
138 | 
139 | REGISTER_TYPED_TEST_CASE_P(BasicClassTest,
140 |                            Fields,
141 |                            SetFieldsAfterNew,
142 |                            Constructor,
143 |                            PlacementNew);
144 | 
145 | INSTANTIATE_TYPED_TEST_CASE_P(Valid, BasicClassTest, TestClassV);
146 | INSTANTIATE_TYPED_TEST_CASE_P(Zero, BasicClassTest, TestClassZ_Soa);
147 | INSTANTIATE_TYPED_TEST_CASE_P(ZeroAos, BasicClassTest, TestClassZ_Aos);
148 | 
149 | }  // namespace
150 | 


--------------------------------------------------------------------------------
/test/soa/basic_class_test_layout.inc:
--------------------------------------------------------------------------------
 1 | class IKRA_TEST_CLASSNAME : public SoaLayout<IKRA_TEST_CLASSNAME,
 2 |                                              kClassMaxInst,
 3 |                                              IKRA_TEST_ADDRESS_MODE,
 4 |                                              StaticStorage,
 5 |                                              IKRA_TEST_LAYOUT_MODE> {
 6 |  public:
 7 |   IKRA_INITIALIZE_CLASS
 8 | 
 9 |   int_ field0;
10 |   double_ field1;
11 |   bool_ field2;
12 |   int_ field4;
13 | 
14 |   IKRA_TEST_CLASSNAME() {}
15 | 
16 |   IKRA_TEST_CLASSNAME(int field0_a, int field4_a) : field4(field4_a) {
17 |     field0 = field0_a;
18 |   }
19 | 
20 |   void add_field0_to_field4() {
21 |     field4 = field4 + field0;
22 |   }
23 | 
24 |   int get_field4_if_field2() {
25 |     if (field2) {
26 |       return field4;
27 |     } else {
28 |       return -1;
29 |     }
30 |   }
31 | };
32 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/.gitignore:
--------------------------------------------------------------------------------
1 | bin/
2 | 
3 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/array/aos_style.cu:
--------------------------------------------------------------------------------
 1 | #include "executor/cuda_executor.h"
 2 | #include "soa/soa.h"
 3 | 
 4 | #include "benchmark.h"
 5 | #include "configuration.h"
 6 | 
 7 | using ikra::soa::IndexType;
 8 | using ikra::soa::SoaLayout;
 9 | using ikra::executor::cuda::construct;
10 | 
11 | class DummyClass : public SoaLayout<DummyClass, NUM_INST> {
12 |  public:
13 |   IKRA_INITIALIZE_CLASS
14 | 
15 |   __device__ DummyClass(int f0, int f2): field0(f0), field2(f2) {
16 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
17 |       field1[i] = id()*17 + i;
18 |     }
19 |   }
20 | 
21 |   int_ field0;
22 | 
23 |   array_(int, ARRAY_SIZE, object) field1;
24 | 
25 |   int_ field2;
26 | 
27 |   __device__ void update_field1(int increment) {
28 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
29 |       field1[i] += increment + field0 + field2;
30 |     }
31 |   }
32 | };
33 | 
34 | IKRA_DEVICE_STORAGE(DummyClass);
35 | 
36 | 
37 | void action() {
38 |   DummyClass::initialize_storage();
39 |   DummyClass* first = construct<DummyClass>(NUM_INST, 29, 1357);
40 |   gpuErrchk(cudaPeekAtLastError());
41 | 
42 |   cuda_execute(&DummyClass::update_field1, first, NUM_INST, 19);
43 |   gpuErrchk(cudaPeekAtLastError());
44 | 
45 |   cudaDeviceSynchronize();
46 | }
47 | 
48 | void run_test_construct_and_execute() {
49 |   uint64_t time_action = measure<>::execution(action);
50 |   printf("[AOS] Time for action: %lu\n", time_action);
51 | 
52 | #ifndef NDEBUG
53 |   // Check result (some samples).
54 |   for (int k = 0; k < 100; ++k) {
55 |     int i = rand() % NUM_INST;
56 |     for (int j = 0; j < ARRAY_SIZE; ++j) {
57 |       int actual1 = DummyClass::get(i)->field1[j];
58 |       int expected1 = i*17 + j + 19 + 29 + 1357;
59 |       if (actual1 != expected1) {
60 |         printf("[AOS] Dummy[%i].field1[%i]: Expected %i, but found %i\n",
61 |                i, j, expected1, actual1);
62 |         exit(1);
63 |       }
64 |     }
65 |   }
66 | #endif  // NDEBUG
67 | }
68 | 
69 | int main() {
70 |   run_test_construct_and_execute();
71 | }
72 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/array/aos_style_cpu.cc:
--------------------------------------------------------------------------------
 1 | #include "executor/executor.h"
 2 | #include "soa/soa.h"
 3 | 
 4 | #include "benchmark.h"
 5 | #include "configuration.h"
 6 | 
 7 | using ikra::soa::IndexType;
 8 | using ikra::soa::SoaLayout;
 9 | using ikra::executor::construct;
10 | using ikra::executor::execute;
11 | 
12 | class DummyClass : public SoaLayout<DummyClass, NUM_INST> {
13 |  public:
14 |   IKRA_INITIALIZE_CLASS
15 | 
16 |   DummyClass(int f0, int f2): field0(f0), field2(f2) {
17 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
18 |       field1[i] = id()*17 + i;
19 |     }
20 |   }
21 | 
22 |   int_ field0;
23 | 
24 |   array_(int, ARRAY_SIZE, object) field1;
25 | 
26 |   int_ field2;
27 | 
28 |   void update_field1(int increment) {
29 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
30 |       field1[i] += increment + field0 + field2;
31 |     }
32 |   }
33 | };
34 | 
35 | IKRA_HOST_STORAGE(DummyClass);
36 | 
37 | 
38 | void action() {
39 |   DummyClass::DBG_print_offsets();
40 |   DummyClass::initialize_storage();
41 |   DummyClass* first = construct<DummyClass>(NUM_INST, 29, 1357);
42 | 
43 | #ifndef NDEBUG
44 |   // Check values (some samples).
45 |   for (int k = 0; k < 100; ++k) {
46 |     int i = rand() % NUM_INST;
47 |     assert((int) DummyClass::get(i)->field0 == 29);
48 |     assert((int) DummyClass::get(i)->field2 == 1357);
49 |   }
50 | #endif  // NDEBUG
51 | 
52 |   execute(&DummyClass::update_field1, 19);
53 | }
54 | 
55 | void run_test_construct_and_execute() {
56 |   uint64_t time_action = measure<>::execution(action);
57 |   printf("[AOS-CPU] Time for action: %lu\n", time_action);
58 | 
59 | #ifndef NDEBUG
60 |   // Check result (some samples).
61 |   for (int k = 0; k < 100; ++k) {
62 |     int i = rand() % NUM_INST;
63 |     for (int j = 0; j < ARRAY_SIZE; ++j) {
64 |       int actual1 = DummyClass::get(i)->field1[j];
65 |       int expected1 = i*17 + j + 19 + 29 + 1357;// + 19 + 29 + 1357;
66 |       if (actual1 != expected1) {
67 |         printf("[AOS-CPU] Dummy[%i].field1[%i]: Expected %i, but found %i\n",
68 |                i, j, expected1, actual1);
69 |         exit(1);
70 |       }
71 |     }
72 |   }
73 | #endif  // NDEBUG
74 | }
75 | 
76 | int main() {
77 |   run_test_construct_and_execute();
78 | }
79 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/array/benchmark.h:
--------------------------------------------------------------------------------
 1 | // Taken from: https://stackoverflow.com/questions/2808398/easily-measure-elapsed-time
 2 | 
 3 | #ifndef TEST_SOA_BENCHMARK_ARRAY_BENCHMARK_H
 4 | #define TEST_SOA_BENCHMARK_ARRAY_BENCHMARK_H
 5 | 
 6 | #include <iostream>
 7 | #include <chrono>
 8 | 
 9 | template<typename TimeT = std::chrono::milliseconds>
10 | struct measure
11 | {
12 |     template<typename F, typename ...Args>
13 |     static typename TimeT::rep execution(F&& func, Args&&... args)
14 |     {
15 |         auto start = std::chrono::steady_clock::now();
16 |         std::forward<decltype(func)>(func)(std::forward<Args>(args)...);
17 |         auto duration = std::chrono::duration_cast< TimeT> 
18 |                             (std::chrono::steady_clock::now() - start);
19 |         return duration.count();
20 |     }
21 | };
22 | 
23 | #endif  // TEST_SOA_BENCHMARK_ARRAY_BENCHMARK_H
24 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/array/configuration.h:
--------------------------------------------------------------------------------
1 | //#define NDEBUG
2 | 
3 | #define NUM_INST 32768
4 | #define ARRAY_SIZE 64
5 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/array/inlined_soa_style.cu:
--------------------------------------------------------------------------------
 1 | #include "executor/cuda_executor.h"
 2 | #include "soa/soa.h"
 3 | 
 4 | #include "benchmark.h"
 5 | #include "configuration.h"
 6 | 
 7 | using ikra::soa::IndexType;
 8 | using ikra::soa::SoaLayout;
 9 | using ikra::executor::cuda::construct;
10 | using ikra::soa::StaticStorageWithArena;
11 | using ikra::soa::kAddressModeZero;
12 | 
13 | #define READ_FROM_ARENA_ELEMENTS 8
14 | #define EXTRA_BYTES (READ_FROM_ARENA_ELEMENTS*sizeof(int)*NUM_INST)
15 | #define INLINE_ARR_SIZE (ARRAY_SIZE - READ_FROM_ARENA_ELEMENTS)
16 | 
17 | class DummyClass : public SoaLayout<DummyClass, NUM_INST, kAddressModeZero,
18 |     StaticStorageWithArena<EXTRA_BYTES>> {
19 |  public:
20 |   IKRA_INITIALIZE_CLASS
21 | 
22 |   __device__ DummyClass(int f0, int f2): field0(f0), field2(f2),
23 |                                          field1(ARRAY_SIZE) {
24 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
25 |       field1[i] = id()*17 + i;
26 |     }
27 |   }
28 | 
29 |   int_ field0;
30 | 
31 |   array_(int, INLINE_ARR_SIZE, partially_inlined) field1;
32 | 
33 |   int_ field2;
34 | 
35 |   __device__ void update_field1(int increment) {
36 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
37 |       field1[i] += increment + field0 + field2;
38 |     }
39 |   }
40 | };
41 | 
42 | IKRA_DEVICE_STORAGE(DummyClass);
43 | 
44 | 
45 | void action() {
46 |   DummyClass::initialize_storage();
47 |   DummyClass* first = construct<DummyClass>(NUM_INST, 29, 1357);
48 |   gpuErrchk(cudaPeekAtLastError());
49 | 
50 |   cuda_execute(&DummyClass::update_field1, first, NUM_INST, 19);
51 |   gpuErrchk(cudaPeekAtLastError());
52 | 
53 |   cudaDeviceSynchronize();
54 | }
55 | 
56 | void run_test_construct_and_execute() {
57 |   uint64_t time_action = measure<>::execution(action);
58 |   printf("[INLINE-%i/%i SOA] Time for action: %lu\n",
59 |          INLINE_ARR_SIZE, ARRAY_SIZE, time_action);
60 | 
61 | #ifndef NDEBUG
62 |   // Check result (some samples).
63 |   for (int k = 0; k < 100; ++k) {
64 |     int i = rand() % NUM_INST;
65 |     for (int j = 0; j < ARRAY_SIZE; ++j) {
66 |       int actual1 = DummyClass::get(i)->field1[j];
67 |       int expected1 = i*17 + j + 19 + 29 + 1357;
68 |       if (actual1 != expected1) {
69 |         printf("[INLINE] Dummy[%i].field1[%i]: Expected %i, but found %i\n",
70 |                i, j, expected1, actual1);
71 |         exit(1);
72 |       }
73 |     }
74 |   }
75 | #endif  // NDEBUG
76 | }
77 | 
78 | int main() {
79 |   run_test_construct_and_execute();
80 | }
81 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/array/soa_style.cu:
--------------------------------------------------------------------------------
 1 | #include "executor/cuda_executor.h"
 2 | #include "soa/soa.h"
 3 | 
 4 | #include "benchmark.h"
 5 | #include "configuration.h"
 6 | 
 7 | using ikra::soa::IndexType;
 8 | using ikra::soa::SoaLayout;
 9 | using ikra::executor::cuda::construct;
10 | 
11 | class DummyClass : public SoaLayout<DummyClass, NUM_INST> {
12 |  public:
13 |   IKRA_INITIALIZE_CLASS
14 | 
15 |   __device__ DummyClass(int f0, int f2): field0(f0), field2(f2) {
16 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
17 |       field1[i] = id()*17 + i;
18 |     }
19 |   }
20 | 
21 |   int_ field0;
22 | 
23 |   array_(int, ARRAY_SIZE, fully_inlined) field1;
24 | 
25 |   int_ field2;
26 | 
27 |   __device__ void update_field1(int increment) {
28 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
29 |       field1[i] += increment + field0 + field2;
30 |     }
31 |   }
32 | };
33 | 
34 | IKRA_DEVICE_STORAGE(DummyClass);
35 | 
36 | 
37 | void action() {
38 |   DummyClass::initialize_storage();
39 |   DummyClass* first = construct<DummyClass>(NUM_INST, 29, 1357);
40 |   gpuErrchk(cudaPeekAtLastError());
41 | 
42 |   cuda_execute(&DummyClass::update_field1, first, NUM_INST, 19);
43 |   gpuErrchk(cudaPeekAtLastError());
44 | 
45 |   cudaDeviceSynchronize();
46 | }
47 | 
48 | void run_test_construct_and_execute() {
49 |   uint64_t time_action = measure<>::execution(action);
50 |   printf("[SOA] Time for action: %lu\n", time_action);
51 | 
52 | #ifndef NDEBUG
53 |   // Check result (some samples).
54 |   for (int k = 0; k < 100; ++k) {
55 |     int i = rand() % NUM_INST;
56 |     for (int j = 0; j < ARRAY_SIZE; ++j) {
57 |       int actual1 = DummyClass::get(i)->field1[j];
58 |       int expected1 = i*17 + j + 19 + 29 + 1357;
59 |       if (actual1 != expected1) {
60 |         printf("[SOA] Dummy[%i].field1[%i]: Expected %i, but found %i\n",
61 |                i, j, expected1, actual1);
62 |         exit(1);
63 |       }
64 |     }
65 |   }
66 | #endif  // NDEBUG
67 | }
68 | 
69 | int main() {
70 |   run_test_construct_and_execute();
71 | }
72 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/build.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | echo "Script will stop on error or incorrect result."
  4 | extra_args="-march=native -fomit-frame-pointer"
  5 | clang_bin="clang++-5.0"
  6 | 
  7 | mkdir -p bin
  8 | mkdir -p assembly
  9 | 
 10 | for v_compiler in "g++" "${clang_bin}"
 11 | do
 12 |   for v_storage in "StaticStorage" "DynamicStorage"
 13 |   do
 14 |     for v_opt_mode in "-O0" "-O3"
 15 |     do
 16 |       for v_addr_mode in "0" "4"
 17 |       do
 18 |         out_name="${v_compiler}_${v_opt_mode}_${v_storage}_${v_addr_mode}_Soa"
 19 |         ${v_compiler} -std=c++11 ${v_opt_mode} ${extra_args} \
 20 |             -DSTORAGE_STRATEGY=${v_storage} \
 21 |             -DADDRESS_MODE=${v_addr_mode} \
 22 |             -DLAYOUT_MODE=Soa \
 23 |             codegen_test.cc \
 24 |             -I../../../ikra \
 25 |             -o bin/${out_name}
 26 |         bin/${out_name}
 27 |         objdump -S bin/${out_name} > assembly/${out_name}.S
 28 |       done
 29 |     done
 30 |   done
 31 | done
 32 | 
 33 | for v_compiler in "g++" "${clang_bin}"
 34 | do
 35 |   for v_opt_mode in "-O0" "-O3"
 36 |   do
 37 |     for v_layout_mode in "Aos" "Soa"
 38 |     do
 39 |       out_name="${v_compiler}_${v_opt_mode}_StaticStorage_0_Aos"
 40 |       ${v_compiler} -std=c++11 ${v_opt_mode} ${extra_args} \
 41 |           -DSTORAGE_STRATEGY=StaticStorage \
 42 |           -DADDRESS_MODE=0 \
 43 |           -DLAYOUT_MODE=Aos \
 44 |           codegen_test.cc \
 45 |           -I../../../ikra \
 46 |           -o bin/${out_name}
 47 |       bin/${out_name}
 48 |       objdump -S bin/${out_name} > assembly/${out_name}.S
 49 |     done
 50 |   done
 51 | done
 52 | 
 53 | 
 54 | # CUDA test
 55 | /usr/local/cuda/bin/nvcc \
 56 |     -std=c++11 \
 57 |     --expt-extended-lambda \
 58 |     -O3 \
 59 |     -I../../../ikra \
 60 |     -o bin/cuda_codegen_test \
 61 |     cuda_codegen_test.cu
 62 | bin/cuda_codegen_test
 63 | /usr/local/cuda/bin/cuobjdump bin/cuda_codegen_test -ptx -sass -res-usage \
 64 |     > assembly/cuda_codegen_test.S
 65 | 
 66 | g++ -std=c++11 -O3 -I../../../ikra -o bin/cuda_aos_style_cpu array/aos_style_cpu.cc
 67 | bin/cuda_aos_style_cpu
 68 | 
 69 | /usr/local/cuda/bin/nvcc \
 70 |     -std=c++11 \
 71 |     --expt-extended-lambda \
 72 |     -O3 \
 73 |     -I../../../ikra \
 74 |     -o bin/cuda_array_aos_style \
 75 |     array/aos_style.cu
 76 | bin/cuda_array_aos_style
 77 | /usr/local/cuda/bin/cuobjdump bin/cuda_array_aos_style -ptx -sass -res-usage \
 78 |     > assembly/cuda_array_aos_style.S
 79 | 
 80 | /usr/local/cuda/bin/nvcc \
 81 |     -std=c++11 \
 82 |     --expt-extended-lambda \
 83 |     -O3 \
 84 |     -I../../../ikra \
 85 |     -o bin/cuda_array_soa_style \
 86 |     array/soa_style.cu
 87 | bin/cuda_array_soa_style
 88 | /usr/local/cuda/bin/cuobjdump bin/cuda_array_soa_style -ptx -sass -res-usage \
 89 |     > assembly/cuda_array_soa_style.S
 90 | 
 91 | /usr/local/cuda/bin/nvcc \
 92 |     -std=c++11 \
 93 |     --expt-extended-lambda \
 94 |     -O3 \
 95 |     -I../../../ikra \
 96 |     -o bin/cuda_array_inlined_soa_style \
 97 |     array/inlined_soa_style.cu
 98 | bin/cuda_array_inlined_soa_style
 99 | /usr/local/cuda/bin/cuobjdump bin/cuda_array_inlined_soa_style -ptx -sass -res-usage \
100 |     > assembly/cuda_array_inlined_soa_style.S
101 | 
102 | extra_args="-march=native -fomit-frame-pointer -Ofast"
103 | for v_compiler in "g++" "${clang_bin}"
104 | do
105 |   # Build nbody
106 |   out_name="${v_compiler}_nbody_ikracpp"
107 |   ${v_compiler} -O3 ${extra_args} nbody/ikracpp.cc -std=c++11 -I../../../ikra \
108 |       -o bin/${out_name}
109 |   objdump -S bin/${out_name} > assembly/${out_name}.S
110 |   echo -n "."
111 | 
112 |   out_name="${v_compiler}_nbody_ikracpp_field"
113 |   ${v_compiler} -O3 ${extra_args} nbody/ikracpp_field.cc -std=c++11 -I../../../ikra \
114 |       -o bin/${out_name}
115 |   objdump -S bin/${out_name} > assembly/${out_name}.S
116 |   echo -n "."
117 | 
118 |   out_name="${v_compiler}_nbody_ikracpp_inversed"
119 |   ${v_compiler} -O3 ${extra_args} nbody/ikracpp_inversed.cc -std=c++11 -I../../../ikra \
120 |       -o bin/${out_name}
121 |   objdump -S bin/${out_name} > assembly/${out_name}.S
122 |   echo -n "."
123 | 
124 |   # No vectorization with: -fno-tree-vectorize
125 |   out_name="${v_compiler}_nbody_soa"
126 |   ${v_compiler} -O3 ${extra_args} nbody/soa.cc -std=c++11 -o bin/${out_name}
127 |   objdump -S bin/${out_name} > assembly/${out_name}.S
128 |   echo -n "."
129 | 
130 |   out_name="${v_compiler}_nbody_aos"
131 |   ${v_compiler} -O3 ${extra_args} nbody/aos.cc -std=c++11 -o bin/${out_name}
132 |   objdump -S bin/${out_name} > assembly/${out_name}.S
133 |   echo -n "."
134 | done
135 | 
136 | /usr/local/cuda/bin/nvcc \
137 |     -std=c++11 \
138 |     --expt-extended-lambda \
139 |     -O3 \
140 |     -I../../../ikra \
141 |     -o bin/cuda_nbody_ikracpp_inversed_gpu \
142 |     nbody/ikracpp_inversed_gpu.cu
143 | /usr/local/cuda/bin/cuobjdump bin/cuda_nbody_ikracpp_inversed_gpu \
144 |     -ptx -sass -res-usage > assembly/cuda_nbody_ikracpp_inversed_gpu.S
145 | echo -n "."
146 | 
147 | /usr/local/cuda/bin/nvcc \
148 |     -std=c++11 \
149 |     --expt-extended-lambda \
150 |     -O3 \
151 |     -I../../../ikra \
152 |     -o bin/cuda_nbody_soa_inversed_gpu \
153 |     nbody/soa_inversed.cu
154 | /usr/local/cuda/bin/cuobjdump bin/cuda_nbody_soa_inversed_gpu \
155 |     -ptx -sass -res-usage > assembly/cuda_nbody_soa_inversed_gpu.S
156 | echo -n "."
157 | 
158 | /usr/local/cuda/bin/nvcc \
159 |     -std=c++11 \
160 |     --expt-extended-lambda \
161 |     -O3 \
162 |     -I../../../ikra \
163 |     -o bin/cuda_nbody_ikracpp_inversed_field_gpu \
164 |     nbody/ikracpp_inversed_field_gpu.cu
165 | /usr/local/cuda/bin/cuobjdump bin/cuda_nbody_ikracpp_inversed_field_gpu \
166 |     -ptx -sass -res-usage > assembly/cuda_nbody_ikracpp_inversed_field_gpu.S
167 | echo "."
168 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/codegen_test.cc:
--------------------------------------------------------------------------------
  1 | // This is a minimal test case whose purpose is to check if the compiler
  2 | // generates efficient assembly code. Should be built with -O3 optimization
  3 | // and not through Bazel (is not set up to generate such optimized code).
  4 | 
  5 | #include <stdio.h>
  6 | 
  7 | #define NDEBUG    // No asserts.
  8 | #include "soa/soa.h"
  9 | #include "soa/storage.h"
 10 | 
 11 | using ikra::soa::SoaLayout;
 12 | using ikra::soa::kAddressModeZero;
 13 | using ikra::soa::DynamicStorage;
 14 | using ikra::soa::StaticStorage;
 15 | using ikra::soa::kLayoutModeAos;
 16 | using ikra::soa::kLayoutModeSoa;
 17 | 
 18 | 
 19 | // Compiler flags determine addressing mode and storage strategy.
 20 | #ifndef ADDRESS_MODE
 21 | #error Address mode undefined
 22 | #endif
 23 | 
 24 | #ifndef STORAGE_STRATEGY
 25 | #error Storage strategy undefined
 26 | #endif
 27 | 
 28 | #ifndef LAYOUT_MODE
 29 | #error Layout mode undefined
 30 | #endif
 31 | 
 32 | static const uint32_t kClassMaxInst = 0x1234;
 33 | 
 34 | class TestClass : public SoaLayout<TestClass, kClassMaxInst,
 35 |                                    ADDRESS_MODE, STORAGE_STRATEGY,
 36 |                                    PP_CONCAT(kLayoutMode, LAYOUT_MODE)> {
 37 |  public:
 38 |   IKRA_INITIALIZE_CLASS
 39 | 
 40 |   int_ field0;
 41 |   int_ field1;
 42 | 
 43 |   void increase_field0() {
 44 |     field0 *= 0x5555;
 45 |   }
 46 | 
 47 |   void increase_field1() {
 48 |     field1 *= 0x4444;
 49 |   }
 50 | };
 51 | 
 52 | IKRA_HOST_STORAGE(TestClass);
 53 | 
 54 | class TestClassCompare {
 55 |  public:
 56 |   int field0;
 57 |   int field1;
 58 | 
 59 |   void increase_field0() {
 60 |     field0 *= 0x5555;
 61 |   }
 62 | 
 63 |   void increase_field1() {
 64 |     field1 *= 0x4444;
 65 |   }
 66 | };
 67 | 
 68 | TestClassCompare test_compare_array[kClassMaxInst];
 69 | 
 70 | 
 71 | int main() {
 72 |   TestClass::initialize_storage();
 73 | 
 74 |   TestClass* instance = new TestClass();
 75 |   instance->field0 = 0x7777;
 76 |   instance->field1 = 0x8888;
 77 |   instance->increase_field0();
 78 |   instance->increase_field1();
 79 | 
 80 |   // Expected output: FIELD0: 668085635, FIELD1: 610821152
 81 |   printf("FIELD0: %i, FIELD1: %i\n", (int) instance->field0,
 82 |                                      (int) instance->field1);
 83 | 
 84 |   // Return 0 if correct result.
 85 |   return !(instance->field0 == 668085635 && instance->field1 == 610821152);
 86 | }
 87 | 
 88 | // Extra methods for code isolation: Will show up in .S file.
 89 | TestClass* new_instance() {
 90 |   return new TestClass();
 91 | }
 92 | 
 93 | void write_field0(TestClass* instance) {
 94 |   instance->field0 = 0x7777;
 95 | }
 96 | 
 97 | int read_field0(TestClass* instance) {
 98 |   return instance->field0;
 99 | }
100 | 
101 | void write_field1(TestClass* instance) {
102 |   instance->field1 = 0x7777;
103 | }
104 | 
105 | int read_field1(TestClass* instance) {
106 |   return instance->field1;
107 | }
108 | 
109 | // Compare with explicit, hand-written SOA code.
110 | int explicit_field0[100];
111 | 
112 | void explicit_write_field0(uintptr_t id) {
113 |   explicit_field0[id] = 0x7777;
114 | }
115 | 
116 | int explicit_read_field0(uintptr_t id) {
117 |   return explicit_field0[id];
118 | }
119 | 
120 | void explicit_write_field0_aos(TestClassCompare* obj) {
121 |   obj->field0 = 0x7777;
122 | }
123 | 
124 | int explicit_read_field0_aos(TestClassCompare* obj) {
125 |   return obj->field0;
126 | }
127 | 
128 | void explicit_write_field1_aos(TestClassCompare* obj) {
129 |   obj->field1 = 0x7777;
130 | }
131 | 
132 | int explicit_read_field1_aos(TestClassCompare* obj) {
133 |   return obj->field1;
134 | }
135 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/cuda_codegen_test.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define NDEBUG    // No asserts.
 4 | 
 5 | #include "executor/cuda_executor.h"
 6 | #include "soa/soa.h"
 7 | 
 8 | using ikra::soa::SoaLayout;
 9 | using ikra::executor::cuda::construct;
10 | 
11 | const static int kTestSize = 12;
12 | 
13 | class TestClass : public SoaLayout<TestClass, 1000> {
14 |  public:
15 |   IKRA_INITIALIZE_CLASS
16 | 
17 |   __device__ TestClass(int f0, int f1) : field0(f0), field1(f1) {}
18 | 
19 |   int_ field0;
20 |   int_ field1;
21 | 
22 |   __device__ void add_fields(int increment) {
23 |     field0 = field0 + field1 + increment + this->id() + 0x7777;
24 | 
25 |     TestClass::get(0)->field0.get();
26 |   }
27 | };
28 | 
29 | IKRA_DEVICE_STORAGE(TestClass)
30 | 
31 | 
32 | int main() {
33 |   TestClass::initialize_storage();
34 | 
35 |   TestClass* first = construct<TestClass>(kTestSize, 0x6666, 0x5555);
36 |   cuda_execute(&TestClass::add_fields, 0x8888);
37 | 
38 |   // Check result.
39 |   for (int i = 0; i < kTestSize; ++i) {
40 |     int actual = TestClass::get_uninitialized(i)->field0;
41 |     int expected = 0x7777 + 0x6666 + 0x5555 + i + 0x8888;
42 | 
43 |     if (actual == expected) {
44 |       printf("[OK] ");
45 |     } else {
46 |       printf("[FAIL]\n");
47 |       return 1;
48 |     }
49 |   }
50 | 
51 |   printf("\n");
52 | 
53 |   // Make sure that we had no CUDA failures.
54 |   gpuErrchk(cudaPeekAtLastError());
55 |   return 0;
56 | }
57 | 
58 | // Extra methods for code isolation: Will show up in .S file.
59 | __device__ int int_result;
60 | 
61 | __global__ void write_field0(TestClass* instance) {
62 |   instance->field0 = 0x7777;
63 | }
64 | 
65 | __global__ void read_field0(TestClass* instance) {
66 |   int_result = instance->field0;
67 | }
68 | 
69 | __global__ void write_field1(TestClass* instance) {
70 |   instance->field1 = 0x7777;
71 | }
72 | 
73 | __global__ void read_field1(TestClass* instance) {
74 |   int_result = instance->field1;
75 | }
76 | 
77 | // Compare with explicit, hand-written SOA code.
78 | __device__ int explicit_field0[100];
79 | 
80 | __global__ void explicit_write_field0(uintptr_t id) {
81 |   explicit_field0[id] = 0x7777;
82 | }
83 | 
84 | __global__ void explicit_read_field0(uintptr_t id) {
85 |   int_result = explicit_field0[id];
86 | }
87 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/nbody/aos.cc:
--------------------------------------------------------------------------------
  1 | // N-Body Simulation
  2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm
  3 | 
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | #include <cmath>
  7 | #include <array>
  8 | 
  9 | #include "benchmark.h"
 10 | 
 11 | using namespace std;
 12 | 
 13 | static const int kIterations = 5;
 14 | static const int kNumBodies = 8000;
 15 | static const double kMaxMass = 1000;
 16 | static const double kTimeInterval = 0.5;
 17 | 
 18 | static const double kGravityConstant = 6.673e-11;   // gravitational constant
 19 | 
 20 | #define RAND (1.0 * rand() / RAND_MAX)
 21 | 
 22 | char Body_arena[kNumBodies*8*8];
 23 | char* Body_arena_head = Body_arena;
 24 | 
 25 | class Body {
 26 |  public:
 27 |   Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y)
 28 |       : mass_(mass) {
 29 |     position_[0] = pos_x;
 30 |     position_[1] = pos_y;
 31 |     velocity_[0] = vel_x;
 32 |     velocity_[1] = vel_y;
 33 |     this->reset_force();
 34 |   }
 35 | 
 36 |   double mass_;
 37 |   std::array<double, 2> position_;
 38 |   std::array<double, 2> velocity_;
 39 |   std::array<double, 2> force_;
 40 | 
 41 |   void add_force(Body* body) {
 42 |     if (this == body) return;
 43 | 
 44 |     double EPS = 0.01;    // Softening parameter (just to avoid infinities).
 45 |     double dx = body->position_[0] - position_[0];
 46 |     double dy = body->position_[1] - position_[1];
 47 |     double dist = sqrt(dx*dx + dy*dy);
 48 |     double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS);
 49 |     force_[0] += F*dx / dist;
 50 |     force_[1] += F*dy / dist;
 51 |   }
 52 | 
 53 |   void add_force_to_all() {
 54 |     for (int i = 0; i < kNumBodies; ++i) {
 55 |       Body* body = reinterpret_cast<Body*>(Body_arena + sizeof(Body)*i);
 56 |       body->add_force(this);
 57 |     }
 58 |   }
 59 | 
 60 |   void reset_force() {
 61 |     force_[0] = 0.0;
 62 |     force_[1] = 0.0;
 63 |   }
 64 | 
 65 |   void update(double dt) {
 66 |     velocity_[0] += force_[0]*dt / mass_;
 67 |     velocity_[1] += force_[1]*dt / mass_;
 68 |     position_[0] += velocity_[0]*dt;
 69 |     position_[1] += velocity_[1]*dt;
 70 | 
 71 |     if (position_[0] < -1 || position_[0] > 1) {
 72 |       velocity_[0] = -velocity_[0];
 73 |     }
 74 |     if (position_[1] < -1 || position_[1] > 1) {
 75 |       velocity_[1] = -velocity_[1];
 76 |     }
 77 |   }
 78 | 
 79 |   void codengen_simple_update(double dt) {
 80 |     for (int i = 0; i < 100; ++i) {
 81 |       velocity_[0] += force_[0]*dt / mass_;
 82 |       velocity_[1] += force_[1]*dt / mass_;
 83 |       position_[0] += velocity_[0]*dt;
 84 |       position_[1] += velocity_[1]*dt;
 85 |     }
 86 |   }
 87 | };
 88 | 
 89 | 
 90 | void instantiation() {
 91 |   srand(42);
 92 | 
 93 |   // Create objects.
 94 |   for (int i = 0; i < kNumBodies; ++i) {
 95 |     double mass = (RAND/2 + 0.5) * kMaxMass;
 96 |     double pos_x = RAND*2 - 1;
 97 |     double pos_y = RAND*2 - 1;
 98 |     double vel_x = (RAND - 0.5) / 1000;
 99 |     double vel_y = (RAND - 0.5) / 1000;
100 |     new(Body_arena_head) Body(mass, pos_x, pos_y, vel_x, vel_y);
101 |     Body_arena_head += sizeof(Body);
102 |   }
103 | }
104 | 
105 | void run_simulation() {
106 |   for (int i = 0; i < kIterations; ++i) {
107 |     // Reset forces.
108 |     for (int j = 0; j < kNumBodies; ++j) {
109 |       Body* body = reinterpret_cast<Body*>(Body_arena + sizeof(Body)*j);
110 |       body->reset_force();
111 |     }
112 | 
113 |     // Update forces.
114 |     for (int j = 0; j < kNumBodies; ++j) {
115 |       Body* body = reinterpret_cast<Body*>(Body_arena + sizeof(Body)*j);
116 |       body->add_force_to_all();
117 |     }
118 | 
119 |     // Update velocities and positions.
120 |     for (int j = 0; j < kNumBodies; ++j) {
121 |       Body* body = reinterpret_cast<Body*>(Body_arena + sizeof(Body)*j);
122 |       body->update(kTimeInterval);
123 |     }
124 |   }
125 | }
126 | 
127 | void run_simple() {
128 |   for (int i = 0; i < kIterations*100; ++i) {
129 |     for (int j = 0; j < kNumBodies; ++j) {
130 |       Body* body = reinterpret_cast<Body*>(Body_arena + sizeof(Body)*j);
131 |       body->codengen_simple_update(kTimeInterval);
132 |     }
133 |   }
134 | }
135 | 
136 | int main() {
137 |   uint64_t time_instantiation = measure<>::execution(instantiation);
138 |   uint64_t time_simulation = measure<>::execution(run_simulation);
139 |   uint64_t time_simple = measure<>::execution(run_simple);
140 | 
141 |   // Calculate checksum
142 |   int checksum = 11;
143 |   for (uintptr_t i = 0; i < kNumBodies; i++) {
144 |     checksum += reinterpret_cast<int>(r_float2int(
145 |         reinterpret_cast<Body*>(Body_arena + sizeof(Body)*i)->position_[0]));
146 |     checksum += reinterpret_cast<int>(r_float2int(
147 |         reinterpret_cast<Body*>(Body_arena + sizeof(Body)*i)->position_[1]));
148 |     checksum = checksum % 1234567;
149 | 
150 |     if (i < 10) {
151 |       printf("VALUE[%lu] = %f, %f\n", i,
152 |           reinterpret_cast<Body*>(Body_arena + sizeof(Body)*i)->position_[0],
153 |           reinterpret_cast<Body*>(Body_arena + sizeof(Body)*i)->position_[1]);
154 |     }
155 |   }
156 | 
157 |   printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n",
158 |          time_instantiation, time_simulation, time_simple, checksum);
159 |   return 0;
160 | }
161 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/nbody/benchmark.h:
--------------------------------------------------------------------------------
 1 | // Taken from: https://stackoverflow.com/questions/2808398/easily-measure-elapsed-time
 2 | 
 3 | #ifndef TEST_SOA_BENCHMARK_NBODY_BENCHMARK_H
 4 | #define TEST_SOA_BENCHMARK_NBODY_BENCHMARK_H
 5 | 
 6 | #include <iostream>
 7 | #include <chrono>
 8 | 
 9 | template<typename TimeT = std::chrono::milliseconds>
10 | struct measure
11 | {
12 |     template<typename F, typename ...Args>
13 |     static typename TimeT::rep execution(F&& func, Args&&... args)
14 |     {
15 |         auto start = std::chrono::steady_clock::now();
16 |         std::forward<decltype(func)>(func)(std::forward<Args>(args)...);
17 |         auto duration = std::chrono::duration_cast< TimeT> 
18 |                             (std::chrono::steady_clock::now() - start);
19 |         return duration.count();
20 |     }
21 | };
22 | 
23 | int r_float2int(float value) {
24 |   return *reinterpret_cast<int*>(&value);
25 | }
26 | 
27 | #endif  // TEST_SOA_BENCHMARK_NBODY_BENCHMARK_H
28 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/nbody/ikracpp.cc:
--------------------------------------------------------------------------------
  1 | // N-Body Simulation
  2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm
  3 | 
  4 | #define NDEBUG    // No asserts.
  5 | 
  6 | #include <cstdio>
  7 | #include <cstdlib>
  8 | #include <cmath>
  9 | 
 10 | #include "benchmark.h"
 11 | #include "executor/executor.h"
 12 | #include "soa/soa.h"
 13 | 
 14 | using namespace std;
 15 | using ikra::executor::execute;
 16 | using ikra::executor::Iterator;
 17 | using ikra::soa::SoaLayout;
 18 | 
 19 | static const int kIterations = 5;
 20 | static const int kNumBodies = 8000;
 21 | static const double kMaxMass = 1000;
 22 | static const double kTimeInterval = 0.5;
 23 | 
 24 | static const double kGravityConstant = 6.673e-11;   // gravitational constant
 25 | 
 26 | #define RAND (1.0 * rand() / RAND_MAX)
 27 | 
 28 | class Body : public SoaLayout<Body, kNumBodies> {
 29 |  public:
 30 |   IKRA_INITIALIZE_CLASS
 31 | 
 32 |   Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y)
 33 |       : mass_(mass) {
 34 |     position_[0] = pos_x;
 35 |     position_[1] = pos_y;
 36 |     velocity_[0] = vel_x;
 37 |     velocity_[1] = vel_y;
 38 |     this->reset_force();
 39 |   }
 40 | 
 41 |   double_ mass_;
 42 |   array_(double, 2, fully_inlined) position_;
 43 |   array_(double, 2, fully_inlined) velocity_;
 44 |   array_(double, 2, fully_inlined) force_;
 45 | 
 46 |   void add_force(Body* body) {
 47 |     if (this == body) return;
 48 | 
 49 |     double EPS = 0.01;    // Softening parameter (just to avoid infinities).
 50 |     double dx = body->position_[0] - position_[0];
 51 |     double dy = body->position_[1] - position_[1];
 52 |     double dist = sqrt(dx*dx + dy*dy);
 53 |     double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS);
 54 |     force_[0] += F*dx / dist;
 55 |     force_[1] += F*dy / dist;
 56 |   }
 57 | 
 58 |   void add_force_to_all() {
 59 |     execute<kNumBodies>(&Body::add_force, this);
 60 |   }
 61 | 
 62 |   void reset_force() {
 63 |     force_[0] = 0.0;
 64 |     force_[1] = 0.0;
 65 |   }
 66 | 
 67 |   void update(double dt) {
 68 |     velocity_[0] += force_[0]*dt / mass_;
 69 |     velocity_[1] += force_[1]*dt / mass_;
 70 |     position_[0] += velocity_[0]*dt;
 71 |     position_[1] += velocity_[1]*dt;
 72 | 
 73 |     if (position_[0] < -1 || position_[0] > 1) {
 74 |       velocity_[0] = -velocity_[0];
 75 |     }
 76 |     if (position_[1] < -1 || position_[1] > 1) {
 77 |       velocity_[1] = -velocity_[1];
 78 |     }
 79 |   }
 80 | 
 81 |   void codengen_simple_update_templated(double dt) {
 82 |     velocity_.at<0>() += force_.at<0>()*dt / mass_;
 83 |     velocity_.at<1>() += force_.at<1>()*dt / mass_;
 84 |     position_.at<0>() += velocity_.at<0>()*dt;
 85 |     position_.at<1>() += velocity_.at<1>()*dt;
 86 |   }
 87 | 
 88 |   void codengen_simple_update(double dt) {
 89 |     for (int i = 0; i < 100; ++i) {
 90 |       velocity_[0] += force_[0]*dt / mass_;
 91 |       velocity_[1] += force_[1]*dt / mass_;
 92 |       position_[0] += velocity_[0]*dt;
 93 |       position_[1] += velocity_[1]*dt;
 94 |     }
 95 |   }
 96 | };
 97 | 
 98 | IKRA_HOST_STORAGE(Body)
 99 | 
100 | void instantiation() {
101 |   srand(42);
102 | 
103 |   // Create objects.
104 |   for (int i = 0; i < kNumBodies; ++i) {
105 |     double mass = (RAND/2 + 0.5) * kMaxMass;
106 |     double pos_x = RAND*2 - 1;
107 |     double pos_y = RAND*2 - 1;
108 |     double vel_x = (RAND - 0.5) / 1000;
109 |     double vel_y = (RAND - 0.5) / 1000;
110 |     new Body(mass, pos_x, pos_y, vel_x, vel_y);
111 |   }
112 | }
113 | 
114 | void run_simulation() {
115 |   for (int i = 0; i < kIterations; ++i) {
116 |     // Reset forces.
117 |     execute<kNumBodies>(&Body::reset_force);
118 | 
119 |     // Update forces.
120 |     execute<kNumBodies>(&Body::add_force_to_all);
121 | 
122 |     // Update velocities and positions.
123 |     execute<kNumBodies>(&Body::update, kTimeInterval);
124 |   }
125 | }
126 | 
127 | void run_simple() {
128 |   for (int i = 0; i < kIterations*100; ++i) {
129 |     execute<kNumBodies>(&Body::codengen_simple_update, kTimeInterval);
130 |   }
131 | }
132 | 
133 | int main() {
134 |     // Initialize object storage.
135 |   Body::initialize_storage();
136 | 
137 |   uint64_t time_instantiation = measure<>::execution(instantiation);
138 |   uint64_t time_simulation = measure<>::execution(run_simulation);
139 |   uint64_t time_simple = measure<>::execution(run_simple);
140 | 
141 |   // Calculate checksum
142 |   int checksum = 11;
143 |   for (uintptr_t i = 0; i < kNumBodies; i++) {
144 |     checksum += reinterpret_cast<int>(
145 |         r_float2int(Body::get(i)->position_[0]));
146 |     checksum += reinterpret_cast<int>(
147 |         r_float2int(Body::get(i)->position_[1]));
148 |     checksum = checksum % 1234567;
149 | 
150 |     if (i < 10) {
151 |       printf("VALUE[%lu] = %f, %f\n", i,
152 |              (double) Body::get(i)->position_[0],
153 |              (double) Body::get(i)->position_[1]);
154 |     }
155 |   }
156 | 
157 |   printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n",
158 |          time_instantiation, time_simulation, time_simple, checksum);
159 |   return 0;
160 | }
161 | 
162 | void codegen_reset_force(Body* body) {
163 |   body->reset_force();
164 | }
165 | 
166 | void codegen_add_force(Body* self, Body* other) {
167 |   self->add_force(other);
168 | }
169 | 
170 | void codegen_update(Body* self, double dt) {
171 |   self->update(dt);
172 | }
173 | 
174 | void codegen_reset_force_manual(Body* body) {
175 |   body->force_[0] = 0.0;
176 |   body->force_[1] = 0.0;
177 | }
178 | 
179 | void codegen_reset_force_half(Body* body) {
180 |   body->force_[0] = 0.0;
181 | }
182 | 
183 | void codegen_reset_mass(Body* body) {
184 |   body->mass_ = 0.0;
185 | }
186 | 
187 | void codengen_simple_update(Body* body, double dt) {
188 |   body->codengen_simple_update(dt);
189 | }
190 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/nbody/ikracpp_field.cc:
--------------------------------------------------------------------------------
  1 | // N-Body Simulation
  2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm
  3 | 
  4 | #define NDEBUG    // No asserts.
  5 | 
  6 | #include <cstdio>
  7 | #include <cstdlib>
  8 | #include <cmath>
  9 | 
 10 | #include "benchmark.h"
 11 | #include "executor/executor.h"
 12 | #include "soa/soa.h"
 13 | 
 14 | using namespace std;
 15 | using ikra::executor::execute;
 16 | using ikra::executor::Iterator;
 17 | using ikra::soa::SoaLayout;
 18 | 
 19 | static const int kIterations = 5;
 20 | static const int kNumBodies = 8000;
 21 | static const double kMaxMass = 1000;
 22 | static const double kTimeInterval = 0.5;
 23 | 
 24 | static const double kGravityConstant = 6.673e-11;   // gravitational constant
 25 | 
 26 | #define RAND (1.0 * rand() / RAND_MAX)
 27 | 
 28 | class Body : public SoaLayout<Body, kNumBodies> {
 29 |  public:
 30 |   IKRA_INITIALIZE_CLASS
 31 | 
 32 |   Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y)
 33 |       : mass_(mass) {
 34 |     position0_ = pos_x;
 35 |     position1_ = pos_y;
 36 |     velocity0_ = vel_x;
 37 |     velocity1_ = vel_y;
 38 |     this->reset_force();
 39 |   }
 40 | 
 41 |   double_ mass_;
 42 |   double_ position0_;
 43 |   double_ position1_;
 44 |   double_ velocity0_;
 45 |   double_ velocity1_;
 46 |   double_ force0_;
 47 |   double_ force1_;
 48 | 
 49 |   void add_force(Body* body) {
 50 |     if (this == body) return;
 51 | 
 52 |     double EPS = 0.01;    // Softening parameter (just to avoid infinities).
 53 |     double dx = body->position0_ - position0_;
 54 |     double dy = body->position1_ - position1_;
 55 |     double dist = sqrt(dx*dx + dy*dy);
 56 |     double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS);
 57 |     force0_ += F*dx / dist;
 58 |     force1_ += F*dy / dist;
 59 |   }
 60 | 
 61 |   void add_force_to_all() {
 62 |     execute<kNumBodies>(&Body::add_force, this);
 63 |   }
 64 | 
 65 |   void reset_force() {
 66 |     force0_ = 0.0;
 67 |     force1_ = 0.0;
 68 |   }
 69 | 
 70 |   void update(double dt) {
 71 |     velocity0_ += force0_*dt / mass_;
 72 |     velocity1_ += force1_*dt / mass_;
 73 |     position0_ += velocity0_*dt;
 74 |     position1_ += velocity1_*dt;
 75 | 
 76 |     if (position0_ < -1 || position0_ > 1) {
 77 |       velocity0_ = -velocity0_;
 78 |     }
 79 |     if (position1_ < -1 || position1_ > 1) {
 80 |       velocity1_ = -velocity1_;
 81 |     }
 82 |   }
 83 | 
 84 |   void codengen_simple_update(double dt) {
 85 |     for (int i = 0; i < 100; ++i) {
 86 |       velocity0_ += force0_*dt / mass_;
 87 |       velocity1_ += force1_*dt / mass_;
 88 |       position0_ += velocity0_*dt;
 89 |       position1_ += velocity1_*dt;
 90 |     }
 91 |   }
 92 | };
 93 | 
 94 | IKRA_HOST_STORAGE(Body)
 95 | 
 96 | void instantiation() {
 97 |   srand(42);
 98 | 
 99 |   // Create objects.
100 |   for (int i = 0; i < kNumBodies; ++i) {
101 |     double mass = (RAND/2 + 0.5) * kMaxMass;
102 |     double pos_x = RAND*2 - 1;
103 |     double pos_y = RAND*2 - 1;
104 |     double vel_x = (RAND - 0.5) / 1000;
105 |     double vel_y = (RAND - 0.5) / 1000;
106 |     new Body(mass, pos_x, pos_y, vel_x, vel_y);
107 |   }
108 | }
109 | 
110 | void run_simulation() {
111 |   for (int i = 0; i < kIterations; ++i) {
112 |     // Reset forces.
113 |     execute<kNumBodies>(&Body::reset_force);
114 | 
115 |     // Update forces.
116 |     execute<kNumBodies>(&Body::add_force_to_all);
117 | 
118 |     // Update velocities and positions.
119 |     execute<kNumBodies>(&Body::update, kTimeInterval);
120 |   }
121 | }
122 | 
123 | void run_simple() {
124 |   for (int i = 0; i < kIterations*100; ++i) {
125 |     execute<kNumBodies>(&Body::codengen_simple_update, kTimeInterval);
126 |   }
127 | }
128 | 
129 | int main() {
130 |     // Initialize object storage.
131 |   Body::initialize_storage();
132 | 
133 |   uint64_t time_instantiation = measure<>::execution(instantiation);
134 |   uint64_t time_simulation = measure<>::execution(run_simulation);
135 |   uint64_t time_simple = measure<>::execution(run_simple);
136 | 
137 |   // Calculate checksum
138 |   int checksum = 11;
139 |   for (uintptr_t i = 0; i < kNumBodies; i++) {
140 |     checksum += reinterpret_cast<int>(
141 |         r_float2int(Body::get(i)->position0_));
142 |     checksum += reinterpret_cast<int>(
143 |         r_float2int(Body::get(i)->position1_));
144 |     checksum = checksum % 1234567;
145 | 
146 |     if (i < 10) {
147 |       printf("VALUE[%lu] = %f, %f\n", i,
148 |              (double) Body::get(i)->position0_,
149 |              (double) Body::get(i)->position1_);
150 |     }
151 |   }
152 | 
153 |   printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n",
154 |          time_instantiation, time_simulation, time_simple, checksum);
155 |   return 0;
156 | }
157 | 
158 | void codegen_reset_force(Body* body) {
159 |   body->reset_force();
160 | }
161 | 
162 | void codegen_add_force(Body* self, Body* other) {
163 |   self->add_force(other);
164 | }
165 | 
166 | void codegen_update(Body* self, double dt) {
167 |   self->update(dt);
168 | }
169 | 
170 | void codegen_reset_force_manual(Body* body) {
171 |   body->force0_ = 0.0;
172 |   body->force1_ = 0.0;
173 | }
174 | 
175 | void codegen_reset_force_half(Body* body) {
176 |   body->force0_ = 0.0;
177 | }
178 | 
179 | void codegen_reset_mass(Body* body) {
180 |   body->mass_ = 0.0;
181 | }
182 | 
183 | void codengen_simple_update(Body* body, double dt) {
184 |   body->codengen_simple_update(dt);
185 | }
186 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/nbody/ikracpp_inversed.cc:
--------------------------------------------------------------------------------
  1 | // N-Body Simulation
  2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm
  3 | 
  4 | #define NDEBUG    // No asserts.
  5 | 
  6 | #include <cstdio>
  7 | #include <cstdlib>
  8 | #include <cmath>
  9 | 
 10 | #include "benchmark.h"
 11 | #include "executor/executor.h"
 12 | #include "soa/soa.h"
 13 | 
 14 | using namespace std;
 15 | using ikra::executor::execute;
 16 | using ikra::executor::Iterator;
 17 | using ikra::soa::SoaLayout;
 18 | 
 19 | static const int kIterations = 5;
 20 | static const int kNumBodies = 8000;
 21 | static const double kMaxMass = 1000;
 22 | static const double kTimeInterval = 0.5;
 23 | 
 24 | static const double kGravityConstant = 6.673e-11;   // gravitational constant
 25 | 
 26 | #define RAND (1.0 * rand() / RAND_MAX)
 27 | 
 28 | class Body : public SoaLayout<Body, kNumBodies> {
 29 |  public:
 30 |   IKRA_INITIALIZE_CLASS
 31 | 
 32 |   Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y)
 33 |       : mass_(mass) {
 34 |     position_[0] = pos_x;
 35 |     position_[1] = pos_y;
 36 |     velocity_[0] = vel_x;
 37 |     velocity_[1] = vel_y;
 38 |     this->reset_force();
 39 |   }
 40 | 
 41 |   double_ mass_;
 42 |   array_(double, 2, fully_inlined) position_;
 43 |   array_(double, 2, fully_inlined) velocity_;
 44 |   array_(double, 2, fully_inlined) force_;
 45 | 
 46 |   void add_force(Body* body) {
 47 |     if (this == body) return;
 48 | 
 49 |     double EPS = 0.01;    // Softening parameter (just to avoid infinities).
 50 |     double dx = body->position_[0] - position_[0];
 51 |     double dy = body->position_[1] - position_[1];
 52 |     double dist = sqrt(dx*dx + dy*dy);
 53 |     double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS);
 54 |     force_[0] += F*dx / dist;
 55 |     force_[1] += F*dy / dist;
 56 |   }
 57 | 
 58 |   void add_all_forces_to_this() {
 59 |     for (uintptr_t i = 0; i < kNumBodies; ++i) {
 60 |       this->add_force(Body::get(i));
 61 |     }
 62 |   }
 63 | 
 64 |   void reset_force() {
 65 |     force_[0] = 0.0;
 66 |     force_[1] = 0.0;
 67 |   }
 68 | 
 69 |   void update(double dt) {
 70 |     velocity_[0] += force_[0]*dt / mass_;
 71 |     velocity_[1] += force_[1]*dt / mass_;
 72 |     position_[0] += velocity_[0]*dt;
 73 |     position_[1] += velocity_[1]*dt;
 74 | 
 75 |     if (position_[0] < -1 || position_[0] > 1) {
 76 |       velocity_[0] = -velocity_[0];
 77 |     }
 78 |     if (position_[1] < -1 || position_[1] > 1) {
 79 |       velocity_[1] = -velocity_[1];
 80 |     }
 81 |   }
 82 | 
 83 |   void codengen_simple_update_templated(double dt) {
 84 |     velocity_.at<0>() += force_.at<0>()*dt / mass_;
 85 |     velocity_.at<1>() += force_.at<1>()*dt / mass_;
 86 |     position_.at<0>() += velocity_.at<0>()*dt;
 87 |     position_.at<1>() += velocity_.at<1>()*dt;
 88 |   }
 89 | 
 90 |   void codengen_simple_update(double dt) {
 91 |     for (int i = 0; i < 100; ++i) {
 92 |       velocity_[0] += force_[0]*dt / mass_;
 93 |       velocity_[1] += force_[1]*dt / mass_;
 94 |       position_[0] += velocity_[0]*dt;
 95 |       position_[1] += velocity_[1]*dt;
 96 |     }
 97 |   }
 98 | };
 99 | 
100 | IKRA_HOST_STORAGE(Body)
101 | 
102 | void instantiation() {
103 |   srand(42);
104 | 
105 |   // Create objects.
106 |   for (int i = 0; i < kNumBodies; ++i) {
107 |     double mass = (RAND/2 + 0.5) * kMaxMass;
108 |     double pos_x = RAND*2 - 1;
109 |     double pos_y = RAND*2 - 1;
110 |     double vel_x = (RAND - 0.5) / 1000;
111 |     double vel_y = (RAND - 0.5) / 1000;
112 |     new Body(mass, pos_x, pos_y, vel_x, vel_y);
113 |   }
114 | }
115 | 
116 | void run_simulation() {
117 |   for (int i = 0; i < kIterations; ++i) {
118 |     // Reset forces.
119 |     execute<kNumBodies>(&Body::reset_force);
120 | 
121 |     // Update forces.
122 |     execute<kNumBodies>(&Body::add_all_forces_to_this);
123 | 
124 |     // Update velocities and positions.
125 |     execute<kNumBodies>(&Body::update, kTimeInterval);
126 |   }
127 | }
128 | 
129 | void run_simple() {
130 |   for (int i = 0; i < kIterations*100; ++i) {
131 |     execute<kNumBodies>(&Body::codengen_simple_update, kTimeInterval);
132 |   }
133 | }
134 | 
135 | int main() {
136 |     // Initialize object storage.
137 |   Body::initialize_storage();
138 | 
139 |   uint64_t time_instantiation = measure<>::execution(instantiation);
140 |   uint64_t time_simulation = measure<>::execution(run_simulation);
141 |   uint64_t time_simple = measure<>::execution(run_simple);
142 | 
143 |   // Calculate checksum
144 |   int checksum = 11;
145 |   for (uintptr_t i = 0; i < kNumBodies; i++) {
146 |     checksum += reinterpret_cast<int>(
147 |         r_float2int(Body::get(i)->position_[0]));
148 |     checksum += reinterpret_cast<int>(
149 |         r_float2int(Body::get(i)->position_[1]));
150 |     checksum = checksum % 1234567;
151 | 
152 |     if (i < 10) {
153 |       printf("VALUE[%lu] = %f, %f\n", i,
154 |              (double) Body::get(i)->position_[0],
155 |              (double) Body::get(i)->position_[1]);
156 |     }
157 |   }
158 | 
159 |   printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n",
160 |          time_instantiation, time_simulation, time_simple, checksum);
161 |   return 0;
162 | }
163 | 
164 | void codegen_reset_force(Body* body) {
165 |   body->reset_force();
166 | }
167 | 
168 | void codegen_add_force(Body* self, Body* other) {
169 |   self->add_force(other);
170 | }
171 | 
172 | void codegen_update(Body* self, double dt) {
173 |   self->update(dt);
174 | }
175 | 
176 | void codegen_reset_force_manual(Body* body) {
177 |   body->force_[0] = 0.0;
178 |   body->force_[1] = 0.0;
179 | }
180 | 
181 | void codegen_reset_force_half(Body* body) {
182 |   body->force_[0] = 0.0;
183 | }
184 | 
185 | void codegen_reset_mass(Body* body) {
186 |   body->mass_ = 0.0;
187 | }
188 | 
189 | void codengen_simple_update(Body* body, double dt) {
190 |   body->codengen_simple_update(dt);
191 | }
192 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/nbody/ikracpp_inversed_field_gpu.cu:
--------------------------------------------------------------------------------
  1 | // N-Body Simulation
  2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm
  3 | 
  4 | #define NDEBUG    // No asserts.
  5 | 
  6 | #include <cstdio>
  7 | #include <cstdlib>
  8 | #include <cmath>
  9 | 
 10 | #include "benchmark.h"
 11 | #include "executor/cuda_executor.h"
 12 | #include "executor/executor.h"
 13 | #include "soa/soa.h"
 14 | 
 15 | using namespace std;
 16 | using ikra::executor::execute;
 17 | using ikra::executor::Iterator;
 18 | using ikra::soa::SoaLayout;
 19 | 
 20 | static const int kIterations = 5;
 21 | static const int kNumBodies = 8000;
 22 | static const double kMaxMass = 1000;
 23 | static const double kTimeInterval = 0.5;
 24 | 
 25 | static const double kGravityConstant = 6.673e-11;   // gravitational constant
 26 | 
 27 | #define RAND (1.0 * rand() / RAND_MAX)
 28 | 
 29 | class Body : public SoaLayout<Body, kNumBodies> {
 30 |  public:
 31 |   IKRA_INITIALIZE_CLASS
 32 | 
 33 |   Body(double mass, double pos_x, double pos_y,
 34 |        double vel_x, double vel_y) : mass_(mass) {
 35 |     position0_ = pos_x;
 36 |     position1_ = pos_y;
 37 |     velocity0_ = vel_x;
 38 |     velocity1_ = vel_y;
 39 |     this->reset_force();
 40 |   }
 41 | 
 42 |   double_ mass_;
 43 |   double_ position0_;
 44 |   double_ position1_;
 45 |   double_ velocity0_;
 46 |   double_ velocity1_;
 47 |   double_ force0_;
 48 |   double_ force1_;
 49 | 
 50 |   __device__ void add_force(Body* body) {
 51 |     if (this == body) return;
 52 | 
 53 |     double EPS = 0.01;    // Softening parameter (just to avoid infinities).
 54 |     double dx = body->position0_ - position0_;
 55 |     double dy = body->position1_ - position1_;
 56 |     double dist = sqrt(dx*dx + dy*dy);
 57 |     double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS);
 58 |     force0_ += F*dx / dist;
 59 |     force1_ += F*dy / dist;
 60 |   }
 61 | 
 62 |   __device__ void add_all_forces_to_this() {
 63 |     for (uintptr_t i = 0; i < kNumBodies; ++i) {
 64 |       this->add_force(Body::get(i));
 65 |     }
 66 |   }
 67 | 
 68 |   __host__ __device__ void reset_force() {
 69 |     force0_ = 0.0;
 70 |     force1_ = 0.0;
 71 |   }
 72 | 
 73 |   __device__ void update(double dt) {
 74 |     velocity0_ += force0_*dt / mass_;
 75 |     velocity1_ += force1_*dt / mass_;
 76 |     position0_ += velocity0_*dt;
 77 |     position1_ += velocity1_*dt;
 78 | 
 79 |     if (position0_ < -1 || position0_ > 1) {
 80 |       velocity0_ = -velocity0_;
 81 |     }
 82 |     if (position1_ < -1 || position1_ > 1) {
 83 |       velocity1_ = -velocity1_;
 84 |     }
 85 |   }
 86 | 
 87 |   __device__ void codengen_simple_update(double dt) {
 88 |     for (int i = 0; i < 1000; ++i) {
 89 |       velocity0_ += force0_*dt / mass_;
 90 |       velocity1_ += force1_*dt / mass_;
 91 |       position0_ += velocity0_*dt;
 92 |       position1_ += velocity1_*dt;
 93 |     }
 94 |   }
 95 | };
 96 | 
 97 | IKRA_DEVICE_STORAGE(Body)
 98 | 
 99 | // TODO: Benchmark instance creation on GPU.
100 | void instantiation() {
101 |   srand(42);
102 | 
103 |   // Create objects.
104 |   for (int i = 0; i < kNumBodies; ++i) {
105 |     double mass = (RAND/2 + 0.5) * kMaxMass;
106 |     double pos_x = RAND*2 - 1;
107 |     double pos_y = RAND*2 - 1;
108 |     double vel_x = (RAND - 0.5) / 1000;
109 |     double vel_y = (RAND - 0.5) / 1000;
110 |     new Body(mass, pos_x, pos_y, vel_x, vel_y);
111 |   }
112 | }
113 | 
114 | void run_simulation() {
115 |   for (int i = 0; i < kIterations; ++i) {
116 |     // Reset forces.
117 |     cuda_execute(&Body::reset_force);
118 | 
119 |     // Update forces.
120 |     cuda_execute(&Body::add_all_forces_to_this);
121 | 
122 |     // Update velocities and positions.
123 |     cuda_execute(&Body::update, kTimeInterval);
124 |   }
125 | }
126 | 
127 | void run_simple() {
128 |   for (int i = 0; i < kIterations*100; ++i) {
129 |     cuda_execute(&Body::codengen_simple_update, kTimeInterval);
130 |   }
131 | }
132 | 
133 | int main() {
134 |     // Initialize object storage.
135 |   Body::initialize_storage();
136 | 
137 |   uint64_t time_instantiation = measure<>::execution(instantiation);
138 |   gpuErrchk(cudaPeekAtLastError());
139 | 
140 |   uint64_t time_simulation = measure<>::execution(run_simulation);
141 |   gpuErrchk(cudaPeekAtLastError());
142 | 
143 |   uint64_t time_simple = measure<>::execution(run_simple);
144 |   gpuErrchk(cudaPeekAtLastError());
145 | 
146 |   // Calculate checksum
147 |   int checksum = 11;
148 |   for (uintptr_t i = 0; i < kNumBodies; i++) {
149 |     checksum += reinterpret_cast<int>(
150 |         r_float2int(Body::get(i)->position0_));
151 |     checksum += reinterpret_cast<int>(
152 |         r_float2int(Body::get(i)->position1_));
153 |     checksum = checksum % 1234567;
154 | 
155 |     if (i < 10) {
156 |       printf("VALUE[%lu] = %f, %f\n", i,
157 |              (double) Body::get(i)->position0_,
158 |              (double) Body::get(i)->position1_);
159 |     }
160 |   }
161 | 
162 |   printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n",
163 |          time_instantiation, time_simulation, time_simple, checksum);
164 |   return 0;
165 | }
166 | 
167 | __global__ void codegen_reset_force(Body* body) {
168 |   body->reset_force();
169 | }
170 | 
171 | __global__  void codegen_add_force(Body* self, Body* other) {
172 |   self->add_force(other);
173 | }
174 | 
175 | __global__  void codegen_update(Body* self, double dt) {
176 |   self->update(dt);
177 | }
178 | 
179 | __global__  void codegen_reset_force_manual(Body* body) {
180 |   body->force0_ = 0.0;
181 |   body->force1_ = 0.0;
182 | }
183 | 
184 | __global__  void codegen_reset_force_half(Body* body) {
185 |   body->force0_ = 0.0;
186 | }
187 | 
188 | __global__  void codegen_reset_mass(Body* body) {
189 |   body->mass_ = 0.0;
190 | }
191 | 
192 | __global__  void codengen_simple_update(Body* body, double dt) {
193 |   body->codengen_simple_update(dt);
194 | }
195 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/nbody/ikracpp_inversed_gpu.cu:
--------------------------------------------------------------------------------
  1 | // N-Body Simulation
  2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm
  3 | 
  4 | #define NDEBUG    // No asserts.
  5 | 
  6 | #include <cstdio>
  7 | #include <cstdlib>
  8 | #include <cmath>
  9 | 
 10 | #include "benchmark.h"
 11 | #include "executor/cuda_executor.h"
 12 | #include "executor/executor.h"
 13 | #include "soa/soa.h"
 14 | 
 15 | using namespace std;
 16 | using ikra::executor::execute;
 17 | using ikra::executor::Iterator;
 18 | using ikra::soa::SoaLayout;
 19 | 
 20 | static const int kIterations = 5;
 21 | static const int kNumBodies = 8000;
 22 | static const double kMaxMass = 1000;
 23 | static const double kTimeInterval = 0.5;
 24 | 
 25 | static const double kGravityConstant = 6.673e-11;   // gravitational constant
 26 | 
 27 | #define RAND (1.0 * rand() / RAND_MAX)
 28 | 
 29 | class Body : public SoaLayout<Body, kNumBodies> {
 30 |  public:
 31 |   IKRA_INITIALIZE_CLASS
 32 | 
 33 |   Body(double mass, double pos_x, double pos_y,
 34 |        double vel_x, double vel_y) : mass_(mass) {
 35 |     position_[0] = pos_x;
 36 |     position_[1] = pos_y;
 37 |     velocity_[0] = vel_x;
 38 |     velocity_[1] = vel_y;
 39 |     this->reset_force();
 40 |   }
 41 | 
 42 |   double_ mass_;
 43 |   array_(double, 2, fully_inlined) position_;
 44 |   array_(double, 2, fully_inlined) velocity_;
 45 |   array_(double, 2, fully_inlined) force_;
 46 | 
 47 |   __device__ void add_force(Body* body) {
 48 |     if (this == body) return;
 49 | 
 50 |     double EPS = 0.01;    // Softening parameter (just to avoid infinities).
 51 |     double dx = body->position_[0] - position_[0];
 52 |     double dy = body->position_[1] - position_[1];
 53 |     double dist = sqrt(dx*dx + dy*dy);
 54 |     double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS);
 55 |     force_[0] += F*dx / dist;
 56 |     force_[1] += F*dy / dist;
 57 |   }
 58 | 
 59 |   __device__ void add_all_forces_to_this() {
 60 |     for (uintptr_t i = 0; i < kNumBodies; ++i) {
 61 |       this->add_force(Body::get(i));
 62 |     }
 63 |   }
 64 | 
 65 |   __host__ __device__ void reset_force() {
 66 |     force_[0] = 0.0;
 67 |     force_[1] = 0.0;
 68 |   }
 69 | 
 70 |   __device__ void update(double dt) {
 71 |     velocity_[0] += force_[0]*dt / mass_;
 72 |     velocity_[1] += force_[1]*dt / mass_;
 73 |     position_[0] += velocity_[0]*dt;
 74 |     position_[1] += velocity_[1]*dt;
 75 | 
 76 |     if (position_[0] < -1 || position_[0] > 1) {
 77 |       velocity_[0] = -velocity_[0];
 78 |     }
 79 |     if (position_[1] < -1 || position_[1] > 1) {
 80 |       velocity_[1] = -velocity_[1];
 81 |     }
 82 |   }
 83 | 
 84 |   __device__ void codengen_simple_update(double dt) {
 85 |     for (int i = 0; i < 1000; ++i) {
 86 |       velocity_[0] += force_[0]*dt / mass_;
 87 |       velocity_[1] += force_[1]*dt / mass_;
 88 |       position_[0] += velocity_[0]*dt;
 89 |       position_[1] += velocity_[1]*dt;
 90 |     }
 91 |   }
 92 | };
 93 | 
 94 | IKRA_DEVICE_STORAGE(Body)
 95 | 
 96 | // TODO: Benchmark instance creation on GPU.
 97 | void instantiation() {
 98 |   srand(42);
 99 | 
100 |   // Create objects.
101 |   for (int i = 0; i < kNumBodies; ++i) {
102 |     double mass = (RAND/2 + 0.5) * kMaxMass;
103 |     double pos_x = RAND*2 - 1;
104 |     double pos_y = RAND*2 - 1;
105 |     double vel_x = (RAND - 0.5) / 1000;
106 |     double vel_y = (RAND - 0.5) / 1000;
107 |     new Body(mass, pos_x, pos_y, vel_x, vel_y);
108 |   }
109 | }
110 | 
111 | void run_simulation() {
112 |   for (int i = 0; i < kIterations; ++i) {
113 |     // Reset forces.
114 |     cuda_execute(&Body::reset_force);
115 | 
116 |     // Update forces.
117 |     cuda_execute(&Body::add_all_forces_to_this);
118 | 
119 |     // Update velocities and positions.
120 |     cuda_execute(&Body::update, kTimeInterval);
121 |   }
122 | }
123 | 
124 | void run_simple() {
125 |   for (int i = 0; i < kIterations*100; ++i) {
126 |     cuda_execute(&Body::codengen_simple_update, kTimeInterval);
127 |   }
128 | }
129 | 
130 | int main() {
131 |     // Initialize object storage.
132 |   Body::initialize_storage();
133 | 
134 |   uint64_t time_instantiation = measure<>::execution(instantiation);
135 |   gpuErrchk(cudaPeekAtLastError());
136 | 
137 |   uint64_t time_simulation = measure<>::execution(run_simulation);
138 |   gpuErrchk(cudaPeekAtLastError());
139 | 
140 |   uint64_t time_simple = measure<>::execution(run_simple);
141 |   gpuErrchk(cudaPeekAtLastError());
142 | 
143 |   // Calculate checksum
144 |   int checksum = 11;
145 |   for (uintptr_t i = 0; i < kNumBodies; i++) {
146 |     checksum += reinterpret_cast<int>(
147 |         r_float2int(Body::get(i)->position_[0]));
148 |     checksum += reinterpret_cast<int>(
149 |         r_float2int(Body::get(i)->position_[1]));
150 |     checksum = checksum % 1234567;
151 | 
152 |     if (i < 10) {
153 |       printf("VALUE[%lu] = %f, %f\n", i,
154 |              (double) Body::get(i)->position_[0],
155 |              (double) Body::get(i)->position_[1]);
156 |     }
157 |   }
158 | 
159 |   printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n",
160 |          time_instantiation, time_simulation, time_simple, checksum);
161 |   return 0;
162 | }
163 | 
164 | __global__ void codegen_reset_force(Body* body) {
165 |   body->reset_force();
166 | }
167 | 
168 | __global__  void codegen_add_force(Body* self, Body* other) {
169 |   self->add_force(other);
170 | }
171 | 
172 | __global__  void codegen_update(Body* self, double dt) {
173 |   self->update(dt);
174 | }
175 | 
176 | __global__  void codegen_reset_force_manual(Body* body) {
177 |   body->force_[0] = 0.0;
178 |   body->force_[1] = 0.0;
179 | }
180 | 
181 | __global__  void codegen_reset_force_half(Body* body) {
182 |   body->force_[0] = 0.0;
183 | }
184 | 
185 | __global__  void codegen_reset_mass(Body* body) {
186 |   body->mass_ = 0.0;
187 | }
188 | 
189 | __global__  void codengen_simple_update(Body* body, double dt) {
190 |   body->codengen_simple_update(dt);
191 | }
192 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/nbody/soa.cc:
--------------------------------------------------------------------------------
  1 | // N-Body Simulation
  2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm
  3 | 
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | #include <cmath>
  7 | 
  8 | #include "benchmark.h"
  9 | 
 10 | using namespace std;
 11 | using IdType = uintptr_t;
 12 | 
 13 | static const int kIterations = 5;
 14 | static const int kNumBodies = 8000;
 15 | static const double kMaxMass = 1000;
 16 | static const double kTimeInterval = 0.5;
 17 | 
 18 | static const double kGravityConstant = 6.673e-11;   // gravitational constant
 19 | 
 20 | double a_Body_mass[kNumBodies];
 21 | double a_Body_position_0[kNumBodies];
 22 | double a_Body_position_1[kNumBodies];
 23 | double a_Body_velocity_0[kNumBodies];
 24 | double a_Body_velocity_1[kNumBodies];
 25 | double a_Body_force_0[kNumBodies];
 26 | double a_Body_force_1[kNumBodies];
 27 | 
 28 | #define RAND (1.0 * rand() / RAND_MAX)
 29 | 
 30 | void Body_initialize(IdType i, double mass, double pos_x, double pos_y,
 31 |                      double vel_x, double vel_y) {
 32 |   a_Body_mass[i] = mass;
 33 |   a_Body_position_0[i] = pos_x;
 34 |   a_Body_position_1[i] = pos_y;
 35 |   a_Body_velocity_0[i] = vel_x;
 36 |   a_Body_velocity_1[i] = vel_y;
 37 |   a_Body_force_0[i] = 0.0;
 38 |   a_Body_force_1[i] = 0.0;
 39 | }
 40 | 
 41 | void Body_add_force(IdType self, IdType body) {
 42 |   if (self == body) return;
 43 | 
 44 |   double EPS = 0.01;    // Softening parameter (just to avoid infinities).
 45 |   double dx = a_Body_position_0[body] - a_Body_position_0[self];
 46 |   double dy = a_Body_position_1[body] - a_Body_position_1[self];
 47 |   double dist = sqrt(dx*dx + dy*dy);
 48 |   double F = kGravityConstant*a_Body_mass[self]*a_Body_mass[body]
 49 |              / (dist*dist + EPS*EPS);
 50 |   a_Body_force_0[self] += F*dx / dist;
 51 |   a_Body_force_1[self] += F*dy / dist;
 52 | }
 53 | 
 54 | void Body_add_force_to_all(IdType self) {
 55 |   for (IdType i = 0; i < kNumBodies; ++i) {
 56 |     Body_add_force(i, self);
 57 |   }
 58 | }
 59 | 
 60 | void Body_reset_force(IdType self) {
 61 |   a_Body_force_0[self] = 0.0;
 62 |   a_Body_force_1[self] = 0.0;
 63 | }
 64 | 
 65 | void Body_update(IdType self, double dt) {
 66 |   a_Body_velocity_0[self] += a_Body_force_0[self]*dt / a_Body_mass[self];
 67 |   a_Body_velocity_1[self] += a_Body_force_1[self]*dt / a_Body_mass[self];
 68 |   a_Body_position_0[self] += a_Body_velocity_0[self]*dt;
 69 |   a_Body_position_1[self] += a_Body_velocity_1[self]*dt;
 70 | 
 71 |   if (a_Body_position_0[self] < -1 || a_Body_position_0[self] > 1) {
 72 |     a_Body_velocity_0[self] = -a_Body_velocity_0[self];
 73 |   }
 74 |   if (a_Body_position_1[self] < -1 || a_Body_position_1[self] > 1) {
 75 |     a_Body_velocity_1[self] = -a_Body_velocity_1[self];
 76 |   }
 77 | }
 78 | 
 79 | void Body_codegen_simple_update(IdType self, double dt) {
 80 |   for (int i = 0; i < 100; ++i) {
 81 |     a_Body_velocity_0[self] += a_Body_force_0[self]*dt / a_Body_mass[self];
 82 |     a_Body_velocity_1[self] += a_Body_force_1[self]*dt / a_Body_mass[self];
 83 |     a_Body_position_0[self] += a_Body_velocity_0[self]*dt;
 84 |     a_Body_position_1[self] += a_Body_velocity_1[self]*dt;
 85 |   }
 86 | }
 87 | 
 88 | void instantiation() {
 89 |   srand(42);
 90 | 
 91 |   // Create objects.
 92 |   for (IdType i = 0; i < kNumBodies; ++i) {
 93 |     double mass = (RAND/2 + 0.5) * kMaxMass;
 94 |     double pos_x = RAND*2 - 1;
 95 |     double pos_y = RAND*2 - 1;
 96 |     double vel_x = (RAND - 0.5) / 1000;
 97 |     double vel_y = (RAND - 0.5) / 1000;
 98 |     Body_initialize(i, mass, pos_x, pos_y, vel_x, vel_y);
 99 |   }
100 | }
101 | 
102 | void run_simulation() {
103 |   for (int i = 0; i < kIterations; ++i) {
104 |     // Reset forces.
105 |     for (IdType j = 0; j < kNumBodies; ++j) {
106 |       Body_reset_force(j);
107 |     }
108 | 
109 |     // Update forces.
110 |     for (IdType j = 0; j < kNumBodies; ++j) {
111 |       Body_add_force_to_all(j);
112 |     }
113 | 
114 |     // Update velocities and positions.
115 |     for (IdType j = 0; j < kNumBodies; ++j) {
116 |       Body_update(j, kTimeInterval);
117 |     }
118 |   }
119 | }
120 | 
121 | void run_simple() {
122 |   for (int i = 0; i < kIterations*100; ++i) {
123 |     for (IdType j = 0; j < kNumBodies; ++j) {
124 |       Body_codegen_simple_update(j, kTimeInterval);
125 |     }
126 |   }
127 | }
128 | 
129 | int main() {
130 |   uint64_t time_instantiation = measure<>::execution(instantiation);
131 |   uint64_t time_simulation = measure<>::execution(run_simulation);
132 |   uint64_t time_simple = measure<>::execution(run_simple);
133 | 
134 |   // Calculate checksum
135 |   int checksum = 11;
136 |   for (uintptr_t i = 0; i < kNumBodies; i++) {
137 |     checksum += reinterpret_cast<int>(r_float2int(a_Body_position_0[i]));
138 |     checksum += reinterpret_cast<int>(r_float2int(a_Body_position_1[i]));
139 |     checksum = checksum % 1234567;
140 | 
141 |     if (i < 10) {
142 |       printf("VALUE[%lu] = %f, %f\n", i,
143 |              a_Body_position_0[i],
144 |              a_Body_position_1[i]);
145 |     }
146 |   }
147 | 
148 |   printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\nchecksum: %i\n",
149 |          time_instantiation, time_simulation, time_simple, checksum);
150 |   return 0;
151 | }
152 | 


--------------------------------------------------------------------------------
/test/soa/benchmarks/nbody/soa_inversed.cu:
--------------------------------------------------------------------------------
  1 | // N-Body Simulation
  2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm
  3 | 
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | #include <cmath>
  7 | 
  8 | #include "benchmark.h"
  9 | #include "executor/cuda_executor.h"
 10 | 
 11 | using namespace std;
 12 | using IdType = uintptr_t;
 13 | 
 14 | static const int kIterations = 5;
 15 | static const int kNumBodies = 8000;
 16 | static const double kMaxMass = 1000;
 17 | static const double kTimeInterval = 0.5;
 18 | 
 19 | static const double kGravityConstant = 6.673e-11;   // gravitational constant
 20 | 
 21 | struct Container {
 22 |   double a_Body_mass[kNumBodies];
 23 |   double a_Body_position_0[kNumBodies];
 24 |   double a_Body_position_1[kNumBodies];
 25 |   double a_Body_velocity_0[kNumBodies];
 26 |   double a_Body_velocity_1[kNumBodies];
 27 |   double a_Body_force_0[kNumBodies];
 28 |   double a_Body_force_1[kNumBodies];
 29 | };
 30 | 
 31 | __device__ Container d_container;
 32 | Container h_container;
 33 | 
 34 | #define RAND (1.0 * rand() / RAND_MAX)
 35 | 
 36 | void Body_initialize(IdType i, double mass, double pos_x, double pos_y,
 37 |                      double vel_x, double vel_y) {
 38 |   h_container.a_Body_mass[i] = mass;
 39 |   h_container.a_Body_position_0[i] = pos_x;
 40 |   h_container.a_Body_position_1[i] = pos_y;
 41 |   h_container.a_Body_velocity_0[i] = vel_x;
 42 |   h_container.a_Body_velocity_1[i] = vel_y;
 43 |   h_container.a_Body_force_0[i] = 0.0;
 44 |   h_container.a_Body_force_1[i] = 0.0;
 45 | }
 46 | 
 47 | __device__ void Body_add_force(IdType self, IdType body) {
 48 |   if (self == body) return;
 49 |   double EPS = 0.01;    // Softening parameter (just to avoid infinities).
 50 |   double dx = d_container.a_Body_position_0[body]
 51 |               - d_container.a_Body_position_0[self];
 52 |   double dy = d_container.a_Body_position_1[body]
 53 |               - d_container.a_Body_position_1[self];
 54 |   double dist = sqrt(dx*dx + dy*dy);
 55 |   double F = kGravityConstant
 56 |              * d_container.a_Body_mass[self]*d_container.a_Body_mass[body]
 57 |              / (dist*dist + EPS*EPS);
 58 |   d_container.a_Body_force_0[self] += F*dx / dist;
 59 |   d_container.a_Body_force_1[self] += F*dy / dist;
 60 | }
 61 | 
 62 | __device__  void Body_add_all_forces_to_this(IdType self) {
 63 |   for (IdType i = 0; i < kNumBodies; ++i) {
 64 |     Body_add_force(self, i);
 65 |   }
 66 | }
 67 | 
 68 | __global__ void kernel_Body_add_all_forces_to_this() {
 69 |   int tid = threadIdx.x + blockIdx.x * blockDim.x;
 70 |   if (tid < kNumBodies) {
 71 |     Body_add_all_forces_to_this(tid);
 72 |   }
 73 | }
 74 | 
 75 | __device__ void Body_reset_force(IdType self) {
 76 |   d_container.a_Body_force_0[self] = 0.0;
 77 |   d_container.a_Body_force_1[self] = 0.0;
 78 | }
 79 | 
 80 | __global__ void kernel_Body_reset_force() {
 81 |   int tid = threadIdx.x + blockIdx.x * blockDim.x;
 82 |   if (tid < kNumBodies) {
 83 |     Body_reset_force(tid);
 84 |   }
 85 | }
 86 | 
 87 | __device__ void Body_update(IdType self, double dt) {
 88 |   d_container.a_Body_velocity_0[self] +=
 89 |       d_container.a_Body_force_0[self]*dt / d_container.a_Body_mass[self];
 90 |   d_container.a_Body_velocity_1[self] +=
 91 |       d_container.a_Body_force_1[self]*dt / d_container.a_Body_mass[self];
 92 |   d_container.a_Body_position_0[self] +=
 93 |       d_container.a_Body_velocity_0[self]*dt;
 94 |   d_container.a_Body_position_1[self] +=
 95 |       d_container.a_Body_velocity_1[self]*dt;
 96 | 
 97 |   if (d_container.a_Body_position_0[self] < -1
 98 |         || d_container.a_Body_position_0[self] > 1) {
 99 |     d_container.a_Body_velocity_0[self] = -d_container.a_Body_velocity_0[self];
100 |   }
101 |   if (d_container.a_Body_position_1[self] < -1
102 |         || d_container.a_Body_position_1[self] > 1) {
103 |     d_container.a_Body_velocity_1[self] = -d_container.a_Body_velocity_1[self];
104 |   }
105 | }
106 | 
107 | __global__ void kernel_Body_update(double dt) {
108 |   int tid = threadIdx.x + blockIdx.x * blockDim.x;
109 |   if (tid < kNumBodies) {
110 |     Body_update(tid, dt);
111 |   }
112 | }
113 | 
114 | __device__ void Body_codegen_simple_update(IdType self, double dt) {
115 |   for (int i = 0; i < 1000; ++i) {
116 |     d_container.a_Body_velocity_0[self] +=
117 |         d_container.a_Body_force_0[self]*dt / d_container.a_Body_mass[self];
118 |     d_container.a_Body_velocity_1[self] +=
119 |         d_container.a_Body_force_1[self]*dt / d_container.a_Body_mass[self];
120 |     d_container.a_Body_position_0[self] +=
121 |         d_container.a_Body_velocity_0[self]*dt;
122 |     d_container.a_Body_position_1[self] +=
123 |         d_container.a_Body_velocity_1[self]*dt;
124 |   }
125 | }
126 | 
127 | __global__ void kernel_Body_codegen_simple_update(double dt) {
128 |   int tid = threadIdx.x + blockIdx.x * blockDim.x;
129 |   if (tid < kNumBodies) {
130 |     Body_codegen_simple_update(tid, dt);
131 |   }
132 | }
133 | 
134 | void instantiation() {
135 |   srand(42);
136 | 
137 |   // Create objects.
138 |   for (IdType i = 0; i < kNumBodies; ++i) {
139 |     double mass = (RAND/2 + 0.5) * kMaxMass;
140 |     double pos_x = RAND*2 - 1;
141 |     double pos_y = RAND*2 - 1;
142 |     double vel_x = (RAND - 0.5) / 1000;
143 |     double vel_y = (RAND - 0.5) / 1000;
144 |     Body_initialize(i, mass, pos_x, pos_y, vel_x, vel_y);
145 |   }
146 | 
147 |   // Transfer data to GPU.
148 |   cudaMemcpyToSymbol(d_container, &h_container, sizeof(Container), 0,
149 |                      cudaMemcpyHostToDevice);
150 |   cudaDeviceSynchronize();
151 |   assert(cudaPeekAtLastError() == cudaSuccess);
152 | }
153 | 
154 | void run_simulation() {
155 |   uintptr_t num_blocks = ikra::executor::cuda::cuda_blocks_1d(kNumBodies);
156 |   uintptr_t num_threads = ikra::executor::cuda::cuda_threads_1d(kNumBodies);
157 | 
158 |   for (int i = 0; i < kIterations; ++i) {
159 |     // Reset forces.
160 |     kernel_Body_reset_force<<<num_blocks, num_threads>>>();
161 | 
162 |     // Update forces.
163 |     kernel_Body_add_all_forces_to_this<<<num_blocks, num_threads>>>();
164 | 
165 |     // Update velocities and positions.
166 |     kernel_Body_update<<<num_blocks, num_threads>>>(kTimeInterval);
167 |   }
168 | 
169 |   cudaDeviceSynchronize();
170 |   assert(cudaPeekAtLastError() == cudaSuccess);
171 | }
172 | 
173 | void run_simple() {
174 |   uintptr_t num_blocks = ikra::executor::cuda::cuda_blocks_1d(kNumBodies);
175 |   uintptr_t num_threads = ikra::executor::cuda::cuda_threads_1d(kNumBodies);
176 | 
177 |   for (int i = 0; i < kIterations*100; ++i) {
178 |     kernel_Body_codegen_simple_update<<<num_blocks, num_threads>>>(
179 |         kTimeInterval);
180 |   }
181 | 
182 |   cudaDeviceSynchronize();
183 |   assert(cudaPeekAtLastError() == cudaSuccess);
184 | }
185 | 
186 | int main() {
187 |   uint64_t time_instantiation = measure<>::execution(instantiation);
188 |   gpuErrchk(cudaPeekAtLastError());
189 | 
190 |   uint64_t time_simulation = measure<>::execution(run_simulation);
191 |   gpuErrchk(cudaPeekAtLastError());
192 | 
193 |   uint64_t time_simple = measure<>::execution(run_simple);
194 |   gpuErrchk(cudaPeekAtLastError());
195 | 
196 |   // Transfer data to CPU.
197 |   cudaMemcpyFromSymbol(&h_container, d_container, sizeof(Container), 0,
198 |                        cudaMemcpyDeviceToHost);
199 |   gpuErrchk(cudaPeekAtLastError());
200 | 
201 |   // Calculate checksum
202 |   int checksum = 11;
203 |   for (uintptr_t i = 0; i < kNumBodies; i++) {
204 |     checksum += reinterpret_cast<int>(
205 |         r_float2int(h_container.a_Body_position_0[i]));
206 |     checksum += reinterpret_cast<int>(
207 |         r_float2int(h_container.a_Body_position_1[i]));
208 |     checksum = checksum % 1234567;
209 | 
210 |     if (i < 10) {
211 |       printf("VALUE[%lu] = %f, %f\n", i,
212 |              h_container.a_Body_position_0[i],
213 |              h_container.a_Body_position_1[i]);
214 |     }
215 |   }
216 | 
217 |   printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\nchecksum: %i\n",
218 |          time_instantiation, time_simulation, time_simple, checksum);
219 |   return 0;
220 | }
221 | 
222 | __global__  void codengen_simple_update(IdType body, double dt) {
223 |   Body_codegen_simple_update(body, dt);
224 | }
225 | 


--------------------------------------------------------------------------------
/test/soa/cuda_array_test.cu:
--------------------------------------------------------------------------------
  1 | #include <array>
  2 | 
  3 | #include "gtest/gtest.h"
  4 | 
  5 | #include "executor/cuda_executor.h"
  6 | #include "soa/soa.h"
  7 | 
  8 | using ikra::soa::IndexType;
  9 | using ikra::soa::SoaLayout;
 10 | using ikra::executor::cuda::construct;
 11 | 
 12 | const static int kTestSize = 12;
 13 | 
 14 | #define CUDA_THREAD_ID (threadIdx.x + blockIdx.x * blockDim.x)
 15 | 
 16 | class DummyClass : public SoaLayout<DummyClass, 1000> {
 17 |  public:
 18 |   IKRA_INITIALIZE_CLASS
 19 | 
 20 |   __device__ DummyClass(int v2, int v4) : field2(v2), field4(v4) {
 21 |     for (int i = 0; i < 3; ++i) {
 22 |       field1[i] = CUDA_THREAD_ID*1000 + i*31;
 23 |       field3[i] = CUDA_THREAD_ID*500 + i*59;
 24 |     }
 25 |   }
 26 | 
 27 |   int_ field0;
 28 | 
 29 |   // Array has size 12 bytes.
 30 |   array_(int, 3, object) field1;
 31 |   int_ field2;
 32 | 
 33 |   // Array has size 12 bytes.
 34 |   array_(int, 3, fully_inlined) field3;
 35 |   int_ field4;
 36 | 
 37 |   __device__ void update_field1(int increment) {
 38 |     for (int i = 0; i < 3; ++i) {
 39 |       field1[i] += field2 + increment;
 40 |     }
 41 |   }
 42 | 
 43 |   __device__ void update_field3(int increment) {
 44 |     for (int i = 0; i < 3; ++i) {
 45 |       field3[i] += field4 + increment;
 46 |     }
 47 |   }
 48 | 
 49 |   __device__ void update_field3_second() {
 50 |     for (int i = 0; i < 3; ++i) {
 51 |       field3[i] += field1[i];
 52 |     }
 53 |   }
 54 | };
 55 | 
 56 | IKRA_DEVICE_STORAGE(DummyClass);
 57 | 
 58 | 
 59 | // Cannot run "cuda_execute" inside gtest case.
 60 | void run_test_construct_and_execute() {
 61 |   DummyClass::initialize_storage();
 62 |   EXPECT_EQ(DummyClass::size(), 0UL);
 63 | 
 64 |   DummyClass* first = construct<DummyClass>(kTestSize, 17, 29);
 65 |   gpuErrchk(cudaDeviceSynchronize());
 66 |   gpuErrchk(cudaPeekAtLastError());
 67 | 
 68 |   // Check result.
 69 |   for (int i = 0; i < kTestSize; ++i) {
 70 |     for (int j = 0; j < 3; ++j) {
 71 |       int actual1 = DummyClass::get(i)->field1[j];
 72 |       int actual3 = DummyClass::get(i)->field3[j];
 73 |       int expected1 = i*1000 + j*31;
 74 |       int expected3 = i*500 + j*59;
 75 |       EXPECT_EQ(actual1, expected1);
 76 |       EXPECT_EQ(actual3, expected3);
 77 |     }
 78 |   }
 79 | 
 80 |   cuda_execute(&DummyClass::update_field1, first, kTestSize, 2);
 81 |   gpuErrchk(cudaPeekAtLastError());
 82 | 
 83 |   // Check result.
 84 |   for (int i = 0; i < kTestSize; ++i) {
 85 |     for (int j = 0; j < 3; ++j) {
 86 |       int actual1 = DummyClass::get(i)->field1[j];
 87 |       int actual3 = DummyClass::get(i)->field3[j];
 88 |       int expected1 = i*1000 + j*31 + 2 + 17;
 89 |       int expected3 = i*500 + j*59;
 90 |       EXPECT_EQ(actual1, expected1);
 91 |       EXPECT_EQ(actual3, expected3);
 92 |     }
 93 |   }
 94 | 
 95 |   cuda_execute(&DummyClass::update_field3, first, kTestSize, 5);
 96 |   gpuErrchk(cudaPeekAtLastError());
 97 | 
 98 |   // Check result.
 99 |   for (int i = 0; i < kTestSize; ++i) {
100 |     for (int j = 0; j < 3; ++j) {
101 |       int actual1 = DummyClass::get(i)->field1[j];
102 |       int actual3 = DummyClass::get(i)->field3[j];
103 |       int expected1 = i*1000 + j*31 + 2 + 17;
104 |       int expected3 = i*500 + j*59 + 5 + 29;
105 |       EXPECT_EQ(actual1, expected1);
106 |       EXPECT_EQ(actual3, expected3);
107 |     }
108 |   }
109 | 
110 |   cuda_execute(&DummyClass::update_field3_second, first, kTestSize);
111 |   gpuErrchk(cudaPeekAtLastError());
112 | 
113 |   // Check result.
114 |   for (int i = 0; i < kTestSize; ++i) {
115 |     for (int j = 0; j < 3; ++j) {
116 |       int actual1 = DummyClass::get(i)->field1[j];
117 |       int actual3 = DummyClass::get(i)->field3[j];
118 |       int expected1 = i*1000 + j*31 + 2 + 17;
119 |       int expected3 = i*500 + j*59 + 5 + 29 + expected1;
120 |       EXPECT_EQ(actual1, expected1);
121 |       EXPECT_EQ(actual3, expected3);
122 |     }
123 |   }
124 | 
125 |   // Copy size to host memory and compare.
126 |   EXPECT_EQ(DummyClass::size(), static_cast<IndexType>(kTestSize));
127 | 
128 |   // Make sure that we had no CUDA failures.
129 |   gpuErrchk(cudaPeekAtLastError());
130 | }
131 | 
132 | 
133 | TEST(CudaArrayTest, ConstructAndExecute) {
134 |   run_test_construct_and_execute();
135 | }
136 | 
137 | 


--------------------------------------------------------------------------------
/test/soa/cuda_inline_array_memcpy_aos_test.cu:
--------------------------------------------------------------------------------
 1 | #include "gtest/gtest.h"
 2 | 
 3 | #include "executor/cuda_executor.h"
 4 | #include "executor/executor.h"
 5 | #include "soa/soa.h"
 6 | 
 7 | using ikra::soa::IndexType;
 8 | using ikra::soa::SoaLayout;
 9 | using ikra::executor::cuda::construct;
10 | using ikra::soa::StaticStorageWithArena;
11 | using ikra::soa::kAddressModeZero;
12 | 
13 | // Size of inline SOA array.
14 | #define ARRAY_SIZE 100
15 | // Number of DummyClass instances.
16 | #define NUM_INST 121
17 | // Number of instances constructed on the device.
18 | #define CONSTRUCT_DEVICE 79
19 | // Number of instances constructed on the host.
20 | #define CONSTRUCT_HOST (NUM_INST - CONSTRUCT_DEVICE)
21 | // Number of array elements stored in the area.
22 | #define READ_FROM_ARENA_ELEMENTS 32
23 | // Size of the arena in bytes.
24 | // TODO: Why is the +1 required here?
25 | #define EXTRA_BYTES (READ_FROM_ARENA_ELEMENTS*sizeof(int)*NUM_INST+1)
26 | // Number of array elements stored in inline storage.
27 | #define INLINE_ARR_SIZE (ARRAY_SIZE - READ_FROM_ARENA_ELEMENTS)
28 | 
29 | // TODO: Alignment is broken in AOS mode.
30 | class DummyClass : public SoaLayout<DummyClass, NUM_INST, kAddressModeZero,
31 |     StaticStorageWithArena<EXTRA_BYTES>, ikra::soa::kLayoutModeAos> {
32 |  public:
33 |   IKRA_INITIALIZE_CLASS
34 | 
35 |   __host__ __device__ DummyClass(int f0, int f2) : field0(f0), field2(f2),
36 |                                                    field1(ARRAY_SIZE) {
37 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
38 |       field1[i] = id()*17 + i;
39 |     }
40 |   }
41 | 
42 |   int_ field0;
43 | 
44 |   int_ field2;
45 | 
46 |   array_(int, INLINE_ARR_SIZE, partially_inlined) field1;
47 | 
48 |   __device__ void update_field1(int increment) {
49 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
50 |       field1[i] += increment + field0 + field2;
51 |     }
52 |   }
53 | };
54 | 
55 | IKRA_DEVICE_STORAGE(DummyClass);
56 | 
57 | 
58 | void run_test_construct_and_execute() {
59 |   DummyClass::initialize_storage();
60 |   // Create most instances on the device.
61 |   DummyClass* first = construct<DummyClass>(CONSTRUCT_DEVICE, 29, 1357);
62 |   gpuErrchk(cudaPeekAtLastError());
63 | 
64 |   // Create some instances on the host.
65 |   ikra::executor::construct<DummyClass>(CONSTRUCT_HOST, 29, 1357);
66 | 
67 |   cuda_execute(&DummyClass::update_field1, first, NUM_INST, 19);
68 |   gpuErrchk(cudaPeekAtLastError());
69 | 
70 |   // Check result.
71 |   for (int i = 0; i < NUM_INST; ++i) {
72 |     for (int j = 0; j < ARRAY_SIZE; ++j) {
73 |       int actual1 = DummyClass::get(i)->field1[j];
74 |       int expected1 = i*17 + j + 19 + 29 + 1357;
75 |       EXPECT_EQ(actual1, expected1);
76 |     }
77 |   }
78 | }
79 | 
80 | TEST(CudaInlineArrayMemcpyAosTest, ConstructAndExecute) {
81 |   run_test_construct_and_execute();
82 | }
83 | 


--------------------------------------------------------------------------------
/test/soa/cuda_inline_array_memcpy_test.cu:
--------------------------------------------------------------------------------
 1 | #include "gtest/gtest.h"
 2 | 
 3 | #include "executor/cuda_executor.h"
 4 | #include "executor/executor.h"
 5 | #include "soa/soa.h"
 6 | 
 7 | using ikra::soa::IndexType;
 8 | using ikra::soa::SoaLayout;
 9 | using ikra::executor::cuda::construct;
10 | using ikra::soa::StaticStorageWithArena;
11 | using ikra::soa::kAddressModeZero;
12 | 
13 | // Size of inline SOA array.
14 | #define ARRAY_SIZE 100
15 | // Number of DummyClass instances.
16 | #define NUM_INST 121
17 | // Number of instances constructed on the device.
18 | #define CONSTRUCT_DEVICE 79
19 | // Number of instances constructed on the host.
20 | #define CONSTRUCT_HOST (NUM_INST - CONSTRUCT_DEVICE)
21 | // Number of array elements stored in the area.
22 | #define READ_FROM_ARENA_ELEMENTS 32
23 | // Size of the arena in bytes.
24 | // TODO: Why is the +1 required here?
25 | #define EXTRA_BYTES (READ_FROM_ARENA_ELEMENTS*sizeof(int)*NUM_INST+1)
26 | // Number of array elements stored in inline storage.
27 | #define INLINE_ARR_SIZE (ARRAY_SIZE - READ_FROM_ARENA_ELEMENTS)
28 | 
29 | class DummyClass : public SoaLayout<DummyClass, NUM_INST, kAddressModeZero,
30 |     StaticStorageWithArena<EXTRA_BYTES>, ikra::soa::kLayoutModeSoa> {
31 |  public:
32 |   IKRA_INITIALIZE_CLASS
33 | 
34 |   __host__ __device__ DummyClass(int f0, int f2) : field0(f0), field2(f2),
35 |                                                    field1(ARRAY_SIZE) {
36 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
37 |       field1[i] = id()*17 + i;
38 |     }
39 |   }
40 | 
41 |   int_ field0;
42 | 
43 |   array_(int, INLINE_ARR_SIZE, partially_inlined) field1;
44 | 
45 |   int_ field2;
46 | 
47 |   __device__ void update_field1(int increment) {
48 |     for (int i = 0; i < ARRAY_SIZE; ++i) {
49 |       field1[i] += increment + field0 + field2;
50 |     }
51 |   }
52 | };
53 | 
54 | IKRA_DEVICE_STORAGE(DummyClass);
55 | 
56 | 
57 | void run_test_construct_and_execute() {
58 |   DummyClass::initialize_storage();
59 |   // Create most instances on the device.
60 |   DummyClass* first = construct<DummyClass>(CONSTRUCT_DEVICE, 29, 1357);
61 |   gpuErrchk(cudaPeekAtLastError());
62 | 
63 |   // Create some instances on the host.
64 |   ikra::executor::construct<DummyClass>(CONSTRUCT_HOST, 29, 1357);
65 | 
66 |   cuda_execute(&DummyClass::update_field1, first, NUM_INST, 19);
67 |   gpuErrchk(cudaPeekAtLastError());
68 | 
69 |   // Check result.
70 |   for (int i = 0; i < NUM_INST; ++i) {
71 |     for (int j = 0; j < ARRAY_SIZE; ++j) {
72 |       int actual1 = DummyClass::get(i)->field1[j];
73 |       int expected1 = i*17 + j + 19 + 29 + 1357;
74 |       EXPECT_EQ(actual1, expected1);
75 |     }
76 |   }
77 | }
78 | 
79 | TEST(CudaInlineArrayMemcpyTest, ConstructAndExecute) {
80 |   run_test_construct_and_execute();
81 | }
82 | 


--------------------------------------------------------------------------------
/test/soa/pointer_arithmetics_test.cc:
--------------------------------------------------------------------------------
 1 | #include "gtest/gtest.h"
 2 | #include "soa/soa.h"
 3 | 
 4 | namespace {
 5 | using ikra::soa::SoaLayout;
 6 | 
 7 | static const int kClassMaxInst = 1024;
 8 | static const uint32_t kTestSize = 40;
 9 | 
10 | // Pointer arithmetics works only in valid addressing mode.
11 | class TestClass : public SoaLayout<TestClass, kClassMaxInst, sizeof(int)> {
12 |  public:
13 |   IKRA_INITIALIZE_CLASS
14 | 
15 |   int_ field0;
16 | };
17 | 
18 | IKRA_HOST_STORAGE(TestClass)
19 | 
20 | 
21 | TEST(PointerArithmeticsTest, IncrementDecrement) {
22 |   TestClass::initialize_storage();
23 | 
24 |   auto first = new TestClass[kTestSize];
25 |   EXPECT_EQ(TestClass::storage().size(), kTestSize);
26 | 
27 |   int counter = 0;
28 |   for (auto it = first; it < first + kTestSize; ++it, counter += 2) {
29 |     it->field0 = counter;
30 |   }
31 | 
32 |   for (int i = 0; i < static_cast<int>(kTestSize); ++i) {
33 |     EXPECT_EQ(TestClass::get(i)->field0, 2*i);
34 |   }
35 | }
36 | 
37 | }  // namespace
38 | 


--------------------------------------------------------------------------------