├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── data └── bfs │ └── simple.txt ├── example ├── CMakeLists.txt ├── bfs │ ├── CMakeLists.txt │ ├── bfs_const_size_array.cc │ ├── bfs_const_size_array.cu │ ├── bfs_inlined_dynamic_array.cc │ ├── bfs_loader.h │ └── bfs_std_vector.cc ├── n_body.cc ├── particle_simulation.cc └── traffic │ ├── CMakeLists.txt │ ├── benchmark.h │ ├── configuration.h │ ├── entities │ ├── car.inc │ ├── cell.inc │ ├── priority_yield_traffic_controller.inc │ ├── shared_signal_group.inc │ ├── simulation.inc │ └── traffic_light.inc │ ├── run_bench_traffic.sh │ ├── simulation_adapter.cu │ ├── simulation_adapter.h │ ├── simulation_converter.inc │ ├── statistics.inc │ └── traffic.cu ├── ikra ├── CMakeLists.txt ├── executor │ ├── cuda_executor.h │ ├── executor.h │ ├── iterator.h │ └── util.h └── soa │ ├── CMakeLists.txt │ ├── addressable_field_shared.inc │ ├── array_field.h │ ├── array_shared.inc │ ├── class_initialization.h │ ├── constants.h │ ├── cuda.h │ ├── field.h │ ├── field_shared.inc │ ├── field_type_generator.h │ ├── layout.h │ ├── partially_inlined_array_field.h │ ├── preprocessor.h │ ├── soa.h │ ├── storage.h │ └── util.h └── test ├── CMakeLists.txt ├── array_cuda_test.cu ├── executor_test.cc ├── executor_test_layout.inc ├── minimum_cuda_test.cu └── soa ├── array_test.cc ├── array_test_layout.inc ├── basic_class_test.cc ├── basic_class_test_layout.inc ├── benchmarks ├── .gitignore ├── array │ ├── aos_style.cu │ ├── aos_style_cpu.cc │ ├── benchmark.h │ ├── configuration.h │ ├── inlined_soa_style.cu │ └── soa_style.cu ├── assembly │ ├── clang++-5.0_-O0_DynamicStorage_0_Soa.S │ ├── clang++-5.0_-O0_DynamicStorage_4_Soa.S │ ├── clang++-5.0_-O0_StaticStorage_0_Aos.S │ ├── clang++-5.0_-O0_StaticStorage_0_Soa.S │ ├── clang++-5.0_-O0_StaticStorage_4_Soa.S │ ├── clang++-5.0_-O3_DynamicStorage_0_Soa.S │ ├── clang++-5.0_-O3_DynamicStorage_4_Soa.S │ ├── clang++-5.0_-O3_StaticStorage_0_Aos.S │ ├── clang++-5.0_-O3_StaticStorage_0_Soa.S │ ├── clang++-5.0_-O3_StaticStorage_4_Soa.S │ ├── clang++-5.0_nbody_aos.S │ ├── clang++-5.0_nbody_ikracpp.S │ ├── clang++-5.0_nbody_ikracpp_field.S │ ├── clang++-5.0_nbody_ikracpp_inversed.S │ ├── clang++-5.0_nbody_soa.S │ ├── cuda_array_aos_style.S │ ├── cuda_array_inlined_soa_style.S │ ├── cuda_array_soa_style.S │ ├── cuda_codegen_test.S │ ├── cuda_nbody_ikracpp_inversed_field_gpu.S │ ├── cuda_nbody_ikracpp_inversed_gpu.S │ ├── cuda_nbody_soa_inversed_gpu.S │ ├── g++_-O0_DynamicStorage_0_Soa.S │ ├── g++_-O0_DynamicStorage_4_Soa.S │ ├── g++_-O0_StaticStorage_0_Aos.S │ ├── g++_-O0_StaticStorage_0_Soa.S │ ├── g++_-O0_StaticStorage_4_Soa.S │ ├── g++_-O3_DynamicStorage_0_Soa.S │ ├── g++_-O3_DynamicStorage_4_Soa.S │ ├── g++_-O3_StaticStorage_0_Aos.S │ ├── g++_-O3_StaticStorage_0_Soa.S │ ├── g++_-O3_StaticStorage_4_Soa.S │ ├── g++_nbody_aos.S │ ├── g++_nbody_ikracpp.S │ ├── g++_nbody_ikracpp_field.S │ ├── g++_nbody_ikracpp_inversed.S │ └── g++_nbody_soa.S ├── build.sh ├── codegen_test.cc ├── cuda_codegen_test.cu └── nbody │ ├── aos.cc │ ├── benchmark.h │ ├── ikracpp.cc │ ├── ikracpp_field.cc │ ├── ikracpp_inversed.cc │ ├── ikracpp_inversed_field_gpu.cu │ ├── ikracpp_inversed_gpu.cu │ ├── soa.cc │ └── soa_inversed.cu ├── cuda_array_test.cu ├── cuda_inline_array_memcpy_aos_test.cu ├── cuda_inline_array_memcpy_test.cu └── pointer_arithmetics_test.cc /.gitignore: -------------------------------------------------------------------------------- 1 | *.i 2 | *.ii 3 | *.gpu 4 | *.ptx 5 | *.cubin 6 | *.fatbin 7 | 8 | # Prerequisites 9 | *.d 10 | 11 | # Compiled Object files 12 | *.slo 13 | *.lo 14 | *.o 15 | *.obj 16 | 17 | # Precompiled Headers 18 | *.gch 19 | *.pch 20 | 21 | # Compiled Dynamic libraries 22 | *.so 23 | *.dylib 24 | *.dll 25 | 26 | # Fortran module files 27 | *.mod 28 | *.smod 29 | 30 | # Compiled Static libraries 31 | *.lai 32 | *.la 33 | *.a 34 | *.lib 35 | 36 | # Executables 37 | *.exe 38 | *.out 39 | *.app 40 | 41 | test/experiment_test.cc 42 | 43 | # CMake 44 | CMakeCache.txt 45 | CMakeFiles 46 | CMakeScripts 47 | Testing 48 | Makefile 49 | cmake_install.cmake 50 | install_manifest.txt 51 | compile_commands.json 52 | CTestTestfile.cmake 53 | bin/ 54 | *.pc 55 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/googletest"] 2 | path = lib/googletest 3 | url = git@github.com:google/googletest.git 4 | [submodule "example/traffic/traffic-simulation"] 5 | path = example/traffic/traffic-simulation 6 | url = git@github.com:prg-titech/traffic-simulation.git 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.2 FATAL_ERROR) 2 | project(ikra-cpp) 3 | 4 | # Set up CUDA 5 | # TODO: -gencode arch=compute_52,code=sm_52 6 | # TODO: This should be CUDA 9.0 but the version check is broken on my machine. 7 | find_package(CUDA 8.0 REQUIRED) 8 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -std=c++11 --expt-extended-lambda) 9 | 10 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 11 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 12 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 13 | 14 | set(CMAKE_CXX_STANDARD 11) 15 | 16 | if(NOT CMAKE_BUILD_TYPE) 17 | set(CMAKE_BUILD_TYPE Debug) 18 | endif() 19 | 20 | set(CMAKE_CXX_FLAGS "-Wall -Wextra") 21 | set(CMAKE_CXX_FLAGS_DEBUG "-g -O0") 22 | set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -Wno-unused-parameter") 23 | 24 | add_subdirectory(lib/googletest) 25 | enable_testing() 26 | 27 | add_subdirectory(ikra) 28 | add_subdirectory(test) 29 | add_subdirectory(example) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Programming Research Group @ TokyoTech 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ikra-Cpp 2 | Ikra-Cpp is a C++/CUDA DSL for object-oriented programming with Structure of Arrays (SOA) data layout. Ikra-Cpp provides an AOS-style OOP notation that is similar to standard C++ but layouts data as SOA. This gives programmers the performance benefit of SOA and the expressivness of AOS-style object-oriented programming at the same time. 3 | 4 | ## Example 5 | ```c++ 6 | // N-Body Simulation 7 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm 8 | 9 | #include "executor/executor.h" 10 | #include "soa/soa.h" 11 | 12 | using namespace std; 13 | using ikra::executor::execute; 14 | using ikra::soa::SoaLayout; 15 | 16 | static const int kNumBodies = 25; 17 | static const double kMaxMass = 1000; 18 | static const double kTimeInterval = 0.5; 19 | static const double kGravityConstant = 6.673e-11; // gravitational constant 20 | static const int kIterations = 1000; 21 | 22 | #define RAND (1.0 * rand() / RAND_MAX) 23 | 24 | class Body : public SoaLayout { 25 | public: 26 | IKRA_INITIALIZE_CLASS 27 | 28 | Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y) 29 | : mass_(mass) { 30 | position_[0] = pos_x; 31 | position_[1] = pos_y; 32 | velocity_[0] = vel_x; 33 | velocity_[1] = vel_y; 34 | this->reset_force(); 35 | } 36 | 37 | double_ mass_; 38 | array_(double, 2) position_; 39 | array_(double, 2) velocity_; 40 | array_(double, 2) force_; 41 | 42 | void add_force(Body* body) { 43 | if (this == body) return; 44 | 45 | double EPS = 0.01; // Softening parameter (just to avoid infinities). 46 | double dx = body->position_[0] - position_[0]; 47 | double dy = body->position_[1] - position_[1]; 48 | double dist = sqrt(dx*dx + dy*dy); 49 | double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS); 50 | force_[0] += F*dx / dist; 51 | force_[1] += F*dy / dist; 52 | } 53 | 54 | void add_force_to_all() { 55 | execute(&Body::add_force, this); 56 | } 57 | 58 | void reset_force() { 59 | force_[0] = 0; 60 | force_[1] = 0; 61 | } 62 | 63 | void update(double dt) { 64 | velocity_[0] += force_[0]*dt / mass_; 65 | velocity_[1] += force_[1]*dt / mass_; 66 | position_[0] += velocity_[0]*dt; 67 | position_[1] += velocity_[1]*dt; 68 | 69 | // Reflect bodies. 70 | if (position_[0] < -1 || position_[0] > 1) velocity_[0] = -velocity_[0]; 71 | if (position_[1] < -1 || position_[1] > 1) velocity_[1] = -velocity_[1]; 72 | } 73 | }; 74 | IKRA_HOST_STORAGE(Body) 75 | 76 | int main() { 77 | // Initialize object storage. 78 | Body::initialize_storage(); 79 | 80 | // Create objects. 81 | for (int i = 0; i < kNumBodies; ++i) { 82 | double mass = (RAND/2 + 0.5) * kMaxMass; 83 | double pos_x = RAND*2 - 1; 84 | double pos_y = RAND*2 - 1; 85 | double vel_x = (RAND - 0.5) / 1000; 86 | double vel_y = (RAND - 0.5) / 1000; 87 | new Body(mass, pos_x, pos_y, vel_x, vel_y); 88 | } 89 | 90 | // Run simulation. 91 | for (int i = 0; i < kIterations; ++i) { 92 | // Reset forces. 93 | execute(&Body::reset_force); 94 | 95 | // Update forces. 96 | execute(&Body::add_force_to_all); 97 | 98 | // Update velocities and positions. 99 | execute(&Body::update, kTimeInterval); 100 | } 101 | 102 | return 0; 103 | } 104 | ``` 105 | 106 | ## Installation 107 | ### Prerequisites 108 | * gcc version 5.4.0 or higher 109 | * For visual examples (n-body simulation): SDL 2.0 110 | * For GPU support: CUDA Toolkit 9.0 or higher 111 | * C++11 compiler support in CPU mode, C++14 compiler support in GPU mode 112 | 113 | ### Installation via Git 114 | ``` sh 115 | git clone --recursive git@github.com:prg-titech/ikra-cpp.git 116 | cmake CMakeLists.txt 117 | make 118 | # Check if everything is working. 119 | ctest 120 | bin/n_body 121 | ``` 122 | 123 | -------------------------------------------------------------------------------- /data/bfs/simple.txt: -------------------------------------------------------------------------------- 1 | 0 1 2 | 0 2 3 | 1 0 4 | 2 0 5 | 2 3 6 | 2 5 7 | 3 2 8 | 3 4 9 | 3 7 10 | 3 8 11 | 4 3 12 | 5 2 13 | 5 8 14 | 5 6 15 | 6 5 16 | 6 7 17 | 7 4 18 | 7 6 19 | 8 3 20 | 8 5 21 | -------------------------------------------------------------------------------- /example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(SDL2 REQUIRED) 2 | include_directories(${SDL2_INCLUDE_DIRS}) 3 | string(STRIP "${SDL2_LIBRARIES}" SDL2_LIBRARIES) 4 | 5 | add_executable(particle_simulation particle_simulation.cc) 6 | target_link_libraries(particle_simulation ikra ${SDL2_LIBRARIES}) 7 | 8 | add_executable(n_body n_body.cc) 9 | target_link_libraries(n_body ikra ${SDL2_LIBRARIES}) 10 | 11 | add_subdirectory(bfs) 12 | 13 | add_subdirectory(traffic) 14 | -------------------------------------------------------------------------------- /example/bfs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(bfs_const_size_array bfs_const_size_array.cc) 2 | target_link_libraries(bfs_const_size_array ikra) 3 | 4 | add_executable(bfs_inlined_dynamic_array bfs_inlined_dynamic_array.cc) 5 | target_link_libraries(bfs_inlined_dynamic_array ikra) 6 | 7 | add_executable(bfs_std_vector bfs_std_vector.cc) 8 | target_link_libraries(bfs_std_vector ikra) 9 | 10 | 11 | # CUDA 12 | # TODO: cuda_add_library does not support INTERFACE libraries 13 | include_directories(${CMAKE_SOURCE_DIR}/ikra) 14 | cuda_add_executable(bfs_const_size_array_cuda bfs_const_size_array.cu) 15 | target_link_libraries(bfs_const_size_array_cuda ikra) 16 | -------------------------------------------------------------------------------- /example/bfs/bfs_const_size_array.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "bfs_loader.h" 6 | #include "executor/executor.h" 7 | #include "soa/soa.h" 8 | 9 | static const int kMaxDegree = 10; 10 | static const int kMaxVertices = 20000; 11 | 12 | using ikra::soa::IndexType; 13 | using ikra::soa::SoaLayout; 14 | using ikra::executor::execute; 15 | using ikra::executor::execute_and_reduce; 16 | 17 | class Vertex : public SoaLayout { 18 | public: 19 | IKRA_INITIALIZE_CLASS 20 | 21 | Vertex(const std::vector& neighbors) { 22 | // If this check fails, we the dataset cannot be run with this 23 | // implementation. 24 | assert(neighbors.size() <= kMaxDegree); 25 | adj_list_size_ = neighbors.size(); 26 | 27 | for (int i = 0; i < num_neighbors(); ++i) { 28 | adj_list_[i] = Vertex::get_uninitialized(neighbors[i]); 29 | } 30 | } 31 | 32 | int num_neighbors() { 33 | return adj_list_size_; 34 | } 35 | 36 | bool update_distance(int distance) { 37 | if (distance < distance_) { 38 | distance_ = distance; 39 | return true; 40 | } else { 41 | return false; 42 | } 43 | } 44 | 45 | // Visit the vertex, i.e., update the distances of all neighbors if this 46 | // vertex is in the frontier, as indicated by the "iteration" field. Returns 47 | // "true" if at least one neighbor was updated. 48 | bool visit(int iteration) { 49 | bool updated = false; 50 | 51 | if (distance_ == iteration) { 52 | for (int i = 0; i < adj_list_size_; ++i) { 53 | Vertex* neighbor = adj_list_[i]; 54 | updated |= neighbor->update_distance(distance_ + 1); 55 | } 56 | } 57 | 58 | return updated; 59 | } 60 | 61 | void print_distance() { 62 | printf("distance[%lu] = %i\n", id(), (int) distance_); 63 | } 64 | 65 | int_ distance_ = std::numeric_limits::max(); 66 | int_ adj_list_size_; 67 | 68 | // A fully inlined SOA array. 69 | array_(Vertex*, kMaxDegree, fully_inlined) adj_list_; 70 | }; 71 | 72 | IKRA_HOST_STORAGE(Vertex) 73 | 74 | 75 | int run() { 76 | int counter = 0; 77 | bool running = true; 78 | 79 | for (int iteration = 0; running; ++iteration) { 80 | running = execute_and_reduce(&Vertex::visit, 81 | [](bool a, bool b) { return a || b; }, 82 | /*default_value=*/ false, 83 | /*args...=*/ iteration); 84 | counter++; 85 | } 86 | 87 | return counter; 88 | } 89 | 90 | int main(int argc, char* argv[]) { 91 | // Load vertices from file. 92 | if (argc != 4) { 93 | printf("Usage: %s filename num_vertices start_vertex\n", argv[0]); 94 | exit(1); 95 | } 96 | 97 | Vertex::initialize_storage(); 98 | load_file(argv[1], atoi(argv[2])); 99 | 100 | // Set start vertex. 101 | Vertex::get(atoi(argv[3]))->distance_ = 0; 102 | // Start algorithm. 103 | int iterations = run(); 104 | 105 | // Print results. 106 | printf("Iterations: %i\n", iterations); 107 | execute(&Vertex::print_distance); 108 | } 109 | -------------------------------------------------------------------------------- /example/bfs/bfs_const_size_array.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "bfs_loader.h" 6 | #include "executor/cuda_executor.h" 7 | #include "executor/executor.h" 8 | #include "soa/soa.h" 9 | 10 | static const int kMaxDegree = 10; 11 | static const int kMaxVertices = 20000; 12 | 13 | using ikra::soa::IndexType; 14 | using ikra::soa::SoaLayout; 15 | using ikra::executor::execute; 16 | 17 | class Vertex : public SoaLayout { 18 | public: 19 | IKRA_INITIALIZE_CLASS 20 | 21 | Vertex(const std::vector& neighbors) { 22 | // If this check fails, we the dataset cannot be run with this 23 | // implementation. 24 | assert(neighbors.size() <= kMaxDegree); 25 | adj_list_size_ = neighbors.size(); 26 | 27 | for (int i = 0; i < num_neighbors(); ++i) { 28 | Vertex* vertex = Vertex::get_uninitialized(neighbors[i]); 29 | adj_list_[i] = vertex; 30 | } 31 | } 32 | 33 | __host__ __device__ int num_neighbors() { 34 | return adj_list_size_; 35 | } 36 | 37 | // Visit the vertex, i.e., update the distances of all neighbors if this 38 | // vertex is in the frontier, as indicated by the "iteration" field. Returns 39 | // "true" if at least one neighbor was updated. 40 | __device__ bool visit(int iteration) { 41 | bool updated = false; 42 | 43 | if (distance_ == iteration) { 44 | for (int i = 0; i < num_neighbors(); ++i) { 45 | Vertex* neighbor = adj_list_[i]; 46 | updated |= neighbor->update_distance(distance_ + 1); 47 | } 48 | } 49 | 50 | return updated; 51 | } 52 | 53 | void print_distance() { 54 | printf("distance[%lu] = %i\n", id(), (int) distance_); 55 | } 56 | 57 | void set_distance(int value) { 58 | distance_ = value; 59 | } 60 | 61 | __device__ bool update_distance(int distance) { 62 | if (distance < distance_) { 63 | distance_ = distance; 64 | return true; 65 | } else { 66 | return false; 67 | } 68 | } 69 | 70 | int_ distance_ = std::numeric_limits::max(); 71 | int_ adj_list_size_; 72 | 73 | // A fully inlined SOA array. 74 | array_(Vertex*, kMaxDegree, fully_inlined) adj_list_; 75 | }; 76 | 77 | IKRA_DEVICE_STORAGE(Vertex) 78 | 79 | 80 | int run() { 81 | int iteration = 0; 82 | bool running = true; 83 | 84 | while (running) { 85 | auto reducer = [](bool a, bool b) { return a || b; }; 86 | running = cuda_execute_and_reduce(&Vertex::visit, 87 | reducer, 88 | iteration); 89 | 90 | ++iteration; 91 | } 92 | 93 | return iteration; 94 | } 95 | 96 | int main(int argc, char* argv[]) { 97 | // Load vertices from file. 98 | if (argc != 4) { 99 | printf("Usage: %s filename num_vertices start_vertex\n", argv[0]); 100 | exit(1); 101 | } 102 | 103 | Vertex::initialize_storage(); 104 | load_file(argv[1], atoi(argv[2])); 105 | 106 | // Set start vertex. 107 | Vertex* start_vertex = Vertex::get(atoi(argv[3])); 108 | start_vertex->set_distance(0); 109 | 110 | // Start algorithm. 111 | int iterations = run(); 112 | 113 | // Note: execute is host side, cuda_execute is device side. 114 | printf("Iterations: %i\n", iterations); 115 | execute(&Vertex::print_distance); 116 | 117 | // Ensure nothing went wrong on the GPU. 118 | gpuErrchk(cudaPeekAtLastError()); 119 | } 120 | -------------------------------------------------------------------------------- /example/bfs/bfs_inlined_dynamic_array.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "bfs_loader.h" 6 | #include "executor/executor.h" 7 | #include "soa/soa.h" 8 | 9 | static const int kInlineSize = 2; 10 | static const int kMaxVertices = 20000; 11 | static const int kMaxEdges = 100000; 12 | 13 | using ikra::soa::IndexType; 14 | using ikra::soa::SoaLayout; 15 | using ikra::soa::StaticStorageWithArena; 16 | using ikra::soa::kAddressModeZero; 17 | using ikra::executor::execute; 18 | using ikra::executor::execute_and_reduce; 19 | 20 | 21 | class Vertex : public SoaLayout< 22 | Vertex, kMaxVertices, kAddressModeZero, 23 | StaticStorageWithArena> { 24 | public: 25 | IKRA_INITIALIZE_CLASS 26 | 27 | Vertex(const std::vector& neighbors) 28 | : adj_list_(neighbors.size()) { 29 | adj_list_size_ = neighbors.size(); 30 | for (int i = 0; i < num_neighbors(); ++i) { 31 | adj_list_[i] = Vertex::get_uninitialized(neighbors[i]); 32 | } 33 | } 34 | 35 | int num_neighbors() { 36 | return adj_list_size_; 37 | } 38 | 39 | bool update_distance(int distance) { 40 | if (distance < distance_) { 41 | distance_ = distance; 42 | return true; 43 | } else { 44 | return false; 45 | } 46 | } 47 | 48 | // Visit the vertex, i.e., update the distances of all neighbors if this 49 | // vertex is in the frontier, as indicated by the "iteration" field. Returns 50 | // "true" if at least one neighbor was updated. 51 | bool visit(int iteration) { 52 | bool updated = false; 53 | 54 | if (distance_ == iteration) { 55 | for (int i = 0; i < num_neighbors(); ++i) { 56 | Vertex* neighbor = adj_list_[i]; 57 | updated |= neighbor->update_distance(distance_ + 1); 58 | } 59 | } 60 | 61 | return updated; 62 | } 63 | 64 | void print_distance() { 65 | printf("distance[%lu] = %i\n", id(), (int) distance_); 66 | } 67 | 68 | int_ distance_ = std::numeric_limits::max(); 69 | int_ adj_list_size_; 70 | 71 | // Some elements are stored in the external storage. 72 | array_(Vertex*, kInlineSize, partially_inlined) adj_list_; 73 | }; 74 | 75 | IKRA_HOST_STORAGE(Vertex) 76 | 77 | 78 | int run() { 79 | int counter = 0; 80 | bool running = true; 81 | 82 | for (int iteration = 0; running; ++iteration) { 83 | running = execute_and_reduce(&Vertex::visit, 84 | [](bool a, bool b) { return a || b; }, 85 | /*default_value=*/ false, 86 | /*args...=*/ iteration); 87 | counter++; 88 | } 89 | 90 | return counter; 91 | } 92 | 93 | int main(int argc, char* argv[]) { 94 | // Load vertices from file. 95 | if (argc != 4) { 96 | printf("Usage: %s filename num_vertices start_vertex\n", argv[0]); 97 | exit(1); 98 | } 99 | 100 | Vertex::initialize_storage(); 101 | load_file(argv[1], atoi(argv[2])); 102 | 103 | // Set start vertex. 104 | Vertex::get(atoi(argv[3]))->distance_ = 0; 105 | // Start algorithm. 106 | int iterations = run(); 107 | 108 | // Print results. 109 | printf("Iterations: %i\n", iterations); 110 | execute(&Vertex::print_distance); 111 | } 112 | -------------------------------------------------------------------------------- /example/bfs/bfs_loader.h: -------------------------------------------------------------------------------- 1 | #ifndef EXAMPLE_BFS_BFS_LOADER_H 2 | #define EXAMPLE_BFS_BFS_LOADER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "soa/soa.h" 10 | 11 | using namespace std; 12 | using ikra::soa::IndexType; 13 | 14 | template 15 | void load_file(const char* filename, IndexType num_vertices) { 16 | ifstream file; 17 | file.open(filename); 18 | 19 | if (!file) { 20 | printf("Unable to open file: %s\n", filename); 21 | exit(1); 22 | } 23 | 24 | // Build adj. list until we see a new vertex. 25 | vector adj_list; 26 | IndexType last_v_from = 0; 27 | IndexType v_from, v_to; 28 | 29 | while (file >> v_from && file >> v_to) { 30 | // We assume that the edges are soreted by edge source. 31 | assert(v_from >= last_v_from); 32 | 33 | for (; last_v_from < v_from; ++last_v_from) { 34 | VertexT* vertex = new VertexT(adj_list); 35 | assert(vertex->id() == last_v_from); 36 | 37 | adj_list.clear(); 38 | } 39 | 40 | adj_list.push_back(v_to); 41 | } 42 | 43 | for (; last_v_from < num_vertices; ++last_v_from) { 44 | VertexT* vertex = new VertexT(adj_list); 45 | assert(vertex->id() == last_v_from); 46 | 47 | adj_list.clear(); 48 | } 49 | } 50 | 51 | #endif // EXAMPLE_BFS_BFS_LOADER_H 52 | -------------------------------------------------------------------------------- /example/bfs/bfs_std_vector.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "bfs_loader.h" 7 | #include "executor/executor.h" 8 | #include "soa/soa.h" 9 | 10 | static const int kMaxVertices = 20000; 11 | 12 | using ikra::soa::IndexType; 13 | using ikra::soa::SoaLayout; 14 | using ikra::executor::execute; 15 | using ikra::executor::execute_and_reduce; 16 | 17 | class Vertex : public SoaLayout { 18 | public: 19 | IKRA_INITIALIZE_CLASS 20 | 21 | Vertex(const std::vector& neighbors) { 22 | adj_list_size_ = neighbors.size(); 23 | 24 | for (int i = 0; i < num_neighbors(); ++i) { 25 | adj_list_->push_back(Vertex::get_uninitialized(neighbors[i])); 26 | } 27 | } 28 | 29 | int num_neighbors() { 30 | return adj_list_size_; 31 | } 32 | 33 | bool update_distance(int distance) { 34 | if (distance < distance_) { 35 | distance_ = distance; 36 | return true; 37 | } else { 38 | return false; 39 | } 40 | } 41 | 42 | // Visit the vertex, i.e., update the distances of all neighbors if this 43 | // vertex is in the frontier, as indicated by the "iteration" field. Returns 44 | // "true" if at least one neighbor was updated. 45 | bool visit(int iteration) { 46 | bool updated = false; 47 | 48 | if (distance_ == iteration) { 49 | for (int i = 0; i < adj_list_size_; ++i) { 50 | Vertex* neighbor = (*adj_list_)[i]; //->operator[](i); 51 | updated |= neighbor->update_distance(distance_ + 1); 52 | } 53 | } 54 | 55 | return updated; 56 | } 57 | 58 | void print_distance() { 59 | printf("distance[%lu] = %i\n", id(), (int) distance_); 60 | } 61 | 62 | int_ distance_ = std::numeric_limits::max(); 63 | int_ adj_list_size_; 64 | field_(std::vector) adj_list_; 65 | }; 66 | 67 | IKRA_HOST_STORAGE(Vertex) 68 | 69 | 70 | int run() { 71 | int counter = 0; 72 | bool running = true; 73 | 74 | for (int iteration = 0; running; ++iteration) { 75 | running = execute_and_reduce(&Vertex::visit, 76 | [](bool a, bool b) { return a || b; }, 77 | /*default_value=*/ false, 78 | /*args...=*/ iteration); 79 | counter++; 80 | } 81 | 82 | return counter; 83 | } 84 | 85 | int main(int argc, char* argv[]) { 86 | // Ensure that the std::vector is layouted correctly. 87 | static_assert( 88 | Vertex::ObjectSize::value == sizeof(std::vector) + sizeof(int)*2, 89 | "Expected different SOA object size."); 90 | 91 | // Load vertices from file. 92 | if (argc != 4) { 93 | printf("Usage: %s filename num_vertices start_vertex\n", argv[0]); 94 | exit(1); 95 | } 96 | 97 | Vertex::initialize_storage(); 98 | load_file(argv[1], atoi(argv[2])); 99 | 100 | // Set start vertex. 101 | Vertex::get(atoi(argv[3]))->distance_ = 0; 102 | // Start algorithm. 103 | int iterations = run(); 104 | 105 | // Print results. 106 | printf("Iterations: %i\n", iterations); 107 | execute(&Vertex::print_distance); 108 | } 109 | -------------------------------------------------------------------------------- /example/n_body.cc: -------------------------------------------------------------------------------- 1 | // N-Body Simulation 2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "SDL2/SDL.h" 9 | 10 | #include "executor/executor.h" 11 | #include "soa/soa.h" 12 | 13 | using namespace std; 14 | using ikra::executor::execute; 15 | using ikra::executor::Iterator; 16 | using ikra::soa::SoaLayout; 17 | 18 | static const int kNumBodies = 25; 19 | static const double kMaxMass = 1000; 20 | static const double kTimeInterval = 0.5; 21 | 22 | static const double kGravityConstant = 6.673e-11; // gravitational constant 23 | 24 | static const int kWindowWidth = 1000; 25 | static const int kWindowHeight = 1000; 26 | static const int kMaxRect = 20; 27 | 28 | #define RAND (1.0 * rand() / RAND_MAX) 29 | 30 | static void render_rect(SDL_Renderer* renderer, double x, double y, double mass) { 31 | SDL_Rect rect; 32 | rect.w = rect.h = mass / kMaxMass * kMaxRect; 33 | rect.x = (x/2 + 0.5) * kWindowWidth - rect.w/2; 34 | rect.y = (y/2 + 0.5) * kWindowHeight - rect.h/2; 35 | SDL_RenderDrawRect(renderer, &rect); 36 | } 37 | 38 | class Body : public SoaLayout { 39 | public: 40 | IKRA_INITIALIZE_CLASS 41 | 42 | Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y) 43 | : mass_(mass) { 44 | position_[0] = pos_x; 45 | position_[1] = pos_y; 46 | velocity_[0] = vel_x; 47 | velocity_[1] = vel_y; 48 | this->reset_force(); 49 | } 50 | 51 | double_ mass_; 52 | array_(double, 2, fully_inlined) position_; 53 | array_(double, 2, fully_inlined) velocity_; 54 | array_(double, 2, fully_inlined) force_; 55 | 56 | void add_force(Body* body) { 57 | if (this == body) return; 58 | 59 | double EPS = 0.01; // Softening parameter (just to avoid infinities). 60 | double dx = body->position_[0] - position_[0]; 61 | double dy = body->position_[1] - position_[1]; 62 | double dist = sqrt(dx*dx + dy*dy); 63 | double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS); 64 | force_[0] += F*dx / dist; 65 | force_[1] += F*dy / dist; 66 | } 67 | 68 | void add_force_to_all() { 69 | execute(&Body::add_force, this); 70 | } 71 | 72 | void reset_force() { 73 | force_[0] = 0; 74 | force_[1] = 0; 75 | } 76 | 77 | void update(double dt) { 78 | velocity_[0] += force_[0]*dt / mass_; 79 | velocity_[1] += force_[1]*dt / mass_; 80 | position_[0] += velocity_[0]*dt; 81 | position_[1] += velocity_[1]*dt; 82 | 83 | if (position_[0] < -1 || position_[0] > 1) { 84 | velocity_[0] = -velocity_[0]; 85 | } 86 | if (position_[1] < -1 || position_[1] > 1) { 87 | velocity_[1] = -velocity_[1]; 88 | } 89 | } 90 | 91 | void draw(SDL_Renderer* renderer) { 92 | render_rect(renderer, position_[0], position_[1], mass_); 93 | } 94 | }; 95 | 96 | IKRA_HOST_STORAGE(Body) 97 | 98 | 99 | int main() { 100 | if (SDL_Init(SDL_INIT_VIDEO) != 0) { 101 | printf("Could not initialize SDL!\n"); 102 | exit(1); 103 | } 104 | 105 | SDL_Window* window = nullptr; 106 | SDL_Renderer* renderer = nullptr; 107 | 108 | if (SDL_CreateWindowAndRenderer(kWindowWidth, kWindowHeight, 0, 109 | &window, &renderer) != 0) { 110 | printf("Could not create window/render!\n"); 111 | exit(1); 112 | } 113 | 114 | // Initialize object storage. 115 | Body::initialize_storage(); 116 | 117 | // Create objects. 118 | for (int i = 0; i < kNumBodies; ++i) { 119 | double mass = (RAND/2 + 0.5) * kMaxMass; 120 | double pos_x = RAND*2 - 1; 121 | double pos_y = RAND*2 - 1; 122 | double vel_x = (RAND - 0.5) / 1000; 123 | double vel_y = (RAND - 0.5) / 1000; 124 | new Body(mass, pos_x, pos_y, vel_x, vel_y); 125 | } 126 | 127 | SDL_bool done = SDL_FALSE; 128 | while (!done) { 129 | // Reset forces. 130 | execute(&Body::reset_force); 131 | 132 | // Update forces. 133 | execute(&Body::add_force_to_all); 134 | 135 | // Update velocities and positions. 136 | execute(&Body::update, kTimeInterval); 137 | 138 | // Draw scene. 139 | SDL_SetRenderDrawColor(renderer, 255, 255, 255, SDL_ALPHA_OPAQUE); 140 | SDL_RenderClear(renderer); 141 | SDL_SetRenderDrawColor(renderer, 0, 0, 0, SDL_ALPHA_OPAQUE); 142 | execute(&Body::draw, renderer); 143 | SDL_RenderPresent(renderer); 144 | // done = SDL_TRUE; 145 | 146 | // Wait for user to close the window. 147 | SDL_Event event; 148 | while (SDL_PollEvent(&event)) { 149 | if (event.type == SDL_QUIT) { 150 | done = SDL_TRUE; 151 | } 152 | } 153 | } 154 | 155 | SDL_DestroyRenderer(renderer); 156 | SDL_DestroyWindow(window); 157 | SDL_Quit(); 158 | return 0; 159 | } 160 | -------------------------------------------------------------------------------- /example/particle_simulation.cc: -------------------------------------------------------------------------------- 1 | // Particle-Well Simulation 2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "SDL2/SDL.h" 9 | 10 | #include "executor/executor.h" 11 | #include "soa/soa.h" 12 | 13 | using namespace std; 14 | using ikra::executor::execute; 15 | using ikra::executor::Iterator; 16 | using ikra::soa::SoaLayout; 17 | 18 | static const int kNumWells = 1; 19 | static const int kNumParticles = 20; 20 | static const double kTimeInterval = 1500; 21 | 22 | static const double kGravityConstant = 6.673e-11; // gravitational constant 23 | static const double kSolarMass = 1.98892e30; 24 | static const double kEarthMass = 5.972e24; 25 | static const double kEarthSunDistance = 149600000000; 26 | static const double kEarthVelocity = 30000; 27 | 28 | static const int kWindowWidth = 1000; 29 | static const int kWindowHeight = 1000; 30 | static const double kScalingFactor = 2.5e-9; 31 | 32 | // Draw a rectangle with equal height/width at a given position. 33 | static void render_rect(SDL_Renderer* renderer, double x, double y, int side) { 34 | SDL_Rect rect; 35 | rect.x = x*kScalingFactor + kWindowWidth/2 - side/2; 36 | rect.y = y*kScalingFactor + kWindowHeight/2 - side/2; 37 | rect.w = side; 38 | rect.h = side; 39 | SDL_RenderDrawRect(renderer, &rect); 40 | } 41 | 42 | class Particle; 43 | 44 | class Well : public SoaLayout { 45 | public: 46 | IKRA_INITIALIZE_CLASS 47 | 48 | Well(double mass, double pos_x, double pos_y) : mass_(mass) { 49 | position_[0] = pos_x; 50 | position_[1] = pos_y; 51 | } 52 | 53 | void add_force(); 54 | 55 | void draw(SDL_Renderer* renderer) { 56 | render_rect(renderer, position_[0], position_[1], 50); 57 | render_rect(renderer, position_[0], position_[1], 60); 58 | } 59 | 60 | double_ mass_; 61 | array_(double, 2, fully_inlined) position_; 62 | }; 63 | 64 | IKRA_HOST_STORAGE(Well) 65 | 66 | 67 | class Particle : public SoaLayout { 68 | public: 69 | IKRA_INITIALIZE_CLASS 70 | 71 | Particle(double mass, double pos_x, double pos_y, double vel_x, double vel_y) 72 | : mass_(mass) { 73 | position_[0] = pos_x; 74 | position_[1] = pos_y; 75 | velocity_[0] = vel_x; 76 | velocity_[1] = vel_y; 77 | this->reset_force(); 78 | } 79 | 80 | double_ mass_; 81 | array_(double, 2, fully_inlined) position_; 82 | array_(double, 2, fully_inlined) velocity_; 83 | array_(double, 2, fully_inlined) force_; 84 | 85 | void add_force(Well* well) { 86 | double EPS = 3e4; // Softening parameter (just to avoid infinities). 87 | double dx = well->position_[0] - position_[0]; 88 | double dy = well->position_[1] - position_[1]; 89 | double dist = sqrt(dx*dx + dy*dy); 90 | double F = kGravityConstant*mass_*well->mass_ / (dist*dist + EPS*EPS); 91 | force_[0] += F*dx / dist; 92 | force_[1] += F*dy / dist; 93 | } 94 | 95 | void reset_force() { 96 | force_[0] = 0.0f; 97 | force_[1] = 0.0f; 98 | } 99 | 100 | void update(double dt) { 101 | velocity_[0] += force_[0]*dt / mass_; 102 | velocity_[1] += force_[1]*dt / mass_; 103 | position_[0] += velocity_[0]*dt; 104 | position_[1] += velocity_[1]*dt; 105 | } 106 | 107 | void draw(SDL_Renderer* renderer) { 108 | render_rect(renderer, position_[0], position_[1], 30); 109 | } 110 | }; 111 | 112 | IKRA_HOST_STORAGE(Particle) 113 | 114 | 115 | // Defined here because definition depends on Particle. 116 | void Well::add_force() { 117 | // Add force induced by this well to all particles. 118 | execute(&Particle::add_force, this); 119 | } 120 | 121 | 122 | int main() { 123 | if (SDL_Init(SDL_INIT_VIDEO) != 0) { 124 | printf("Could not initialize SDL!\n"); 125 | exit(1); 126 | } 127 | 128 | SDL_Window* window = nullptr; 129 | SDL_Renderer* renderer = nullptr; 130 | 131 | if (SDL_CreateWindowAndRenderer(kWindowWidth, kWindowHeight, 0, 132 | &window, &renderer) != 0) { 133 | printf("Could not create window/render!\n"); 134 | exit(1); 135 | } 136 | 137 | // Initialize object storage. 138 | Particle::initialize_storage(); 139 | Well::initialize_storage(); 140 | 141 | // Create objects. 142 | for (int i = 0; i < kNumWells; ++i) { 143 | new Well(kSolarMass, 0, 0); 144 | } 145 | 146 | for (int i = 0; i < kNumParticles; ++i) { 147 | new Particle(kEarthMass, kEarthSunDistance, 0, 0, kEarthVelocity); 148 | } 149 | 150 | SDL_bool done = SDL_FALSE; 151 | while (!done) { 152 | // Reset forces. 153 | execute(&Particle::reset_force); 154 | 155 | // Update forces. 156 | execute(&Well::add_force); 157 | 158 | // Update velocities and positions. 159 | execute(&Particle::update, kTimeInterval); 160 | 161 | // Draw scene. 162 | SDL_SetRenderDrawColor(renderer, 255, 255, 255, SDL_ALPHA_OPAQUE); 163 | SDL_RenderClear(renderer); 164 | SDL_SetRenderDrawColor(renderer, 0, 0, 0, SDL_ALPHA_OPAQUE); 165 | execute(&Well::draw, renderer); 166 | execute(&Particle::draw, renderer); 167 | SDL_RenderPresent(renderer); 168 | 169 | // Wait for user to close the window. 170 | SDL_Event event; 171 | while (SDL_PollEvent(&event)) { 172 | if (event.type == SDL_QUIT) { 173 | done = SDL_TRUE; 174 | } 175 | } 176 | } 177 | 178 | SDL_DestroyRenderer(renderer); 179 | SDL_DestroyWindow(window); 180 | SDL_Quit(); 181 | return 0; 182 | } 183 | 184 | int codegen_test(Particle* p) { 185 | return p->mass_; 186 | } 187 | -------------------------------------------------------------------------------- /example/traffic/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(BENCH_CELL_IN_MODE "--" CACHE STRING "") 2 | ADD_DEFINITIONS(-DBENCH_CELL_IN_MODE=${BENCH_CELL_IN_MODE}) 3 | 4 | set(BENCH_CELL_OUT_MODE "--" CACHE STRING "") 5 | ADD_DEFINITIONS(-DBENCH_CELL_OUT_MODE=${BENCH_CELL_OUT_MODE}) 6 | 7 | set(BENCH_CAR_MODE "--" CACHE STRING "") 8 | ADD_DEFINITIONS(-DBENCH_CAR_MODE=${BENCH_CAR_MODE}) 9 | 10 | set(BENCH_SIGNAL_GROUP_MODE "--" CACHE STRING "") 11 | ADD_DEFINITIONS(-DBENCH_SIGNAL_GROUP_MODE=${BENCH_SIGNAL_GROUP_MODE}) 12 | 13 | set(BENCH_TRAFFIC_LIGHT_MODE "--" CACHE STRING "") 14 | ADD_DEFINITIONS(-DBENCH_TRAFFIC_LIGHT_MODE=${BENCH_TRAFFIC_LIGHT_MODE}) 15 | 16 | set(BENCH_LAYOUT_MODE "--" CACHE STRING "") 17 | ADD_DEFINITIONS(-DBENCH_LAYOUT_MODE=${BENCH_LAYOUT_MODE}) 18 | 19 | set(BENCHMARK_MODE "0" CACHE STRING "") 20 | ADD_DEFINITIONS(-DBENCHMARK_MODE=${BENCHMARK_MODE}) 21 | 22 | 23 | # CUDA 24 | # TODO: cuda_add_library does not support INTERFACE libraries 25 | include_directories(${CMAKE_SOURCE_DIR}/ikra) 26 | cuda_add_executable(traffic traffic.cu simulation_adapter.cu traffic-simulation/graphml_network_builder.cc traffic-simulation/traffic.cc traffic-simulation/simple_network_builder.cc traffic-simulation/drawing.cc traffic-simulation/traffic_aos_int_cuda.cu traffic-simulation/traffic_aos_int.cc) 27 | target_link_libraries(traffic ikra SDL2 SDL2_gfx) 28 | -------------------------------------------------------------------------------- /example/traffic/benchmark.h: -------------------------------------------------------------------------------- 1 | #ifndef EXAMPLE_TRAFFIC_BENCHMARK_H 2 | #define EXAMPLE_TRAFFIC_BENCHMARK_H 3 | 4 | #include 5 | 6 | template 7 | struct measure 8 | { 9 | template 10 | static typename TimeT::rep execution(F&& func, Args&&... args) 11 | { 12 | auto start = std::chrono::steady_clock::now(); 13 | std::forward(func)(std::forward(args)...); 14 | auto duration = std::chrono::duration_cast< TimeT> 15 | (std::chrono::steady_clock::now() - start); 16 | return duration.count(); 17 | } 18 | }; 19 | 20 | #endif // EXAMPLE_TRAFFIC_BENCHMARK_H 21 | -------------------------------------------------------------------------------- /example/traffic/configuration.h: -------------------------------------------------------------------------------- 1 | #ifndef EXAMPLE_TRAFFIC_CONFIGURATION_H 2 | #define EXAMPLE_TRAFFIC_CONFIGURATION_H 3 | 4 | using ikra::soa::kLayoutModeSoa; 5 | using ikra::soa::kLayoutModeAos; 6 | 7 | class Cell; 8 | class SharedSignalGroup; 9 | 10 | // Simulation logic. 11 | #define SMART_TRAFFIC_LIGHT 12 | #define MOVE_ONTO_LARGE_STREETS 13 | 14 | // Statistics for benchmark. 15 | static const uint32_t kNumIterations = 1000; 16 | static const uint32_t kNumBenchmarkRuns = 5; 17 | 18 | // Number of objects. 19 | #define LARGE_DATASET 20 | #ifdef LARGE_DATASET 21 | // Adapted for dataset: OSM-urbanized, Denver_Aurora_CO 22 | static const uint32_t kNumCells = 8339802; 23 | const uint32_t kNumCars = 229376; 24 | static const uint32_t kNumSharedSignalGroups = 205129; 25 | static const uint32_t kNumTrafficLights = 62819; 26 | static const uint32_t kNumPriorityCtrls = 4792; 27 | #else 28 | // Small dataset can run on my laptop. 29 | static const uint32_t kNumCells = 950778; 30 | const uint32_t kNumCars = 50000; 31 | static const uint32_t kNumSharedSignalGroups = 344418; 32 | static const uint32_t kNumTrafficLights = 8000; 33 | static const uint32_t kNumPriorityCtrls = 1000; 34 | #endif // LARGE 35 | 36 | // Maximum array sizes. 37 | static const uint32_t kMaxNumCellIncoming = 8; 38 | static const uint32_t kMaxNumCellOutgoing = 8; 39 | static const uint32_t kMaxNumCarPath = 30; 40 | static const uint32_t kMaxNumSharedSignalGroupCells = 8; 41 | static const uint32_t kMaxNumTrafficLightGroups = 6; 42 | 43 | // Manual padding for AOS mode. 44 | static const uint32_t kCellAlignmentPadding = 2; 45 | static const uint32_t kCarAlignmentPadding = 7; 46 | static const uint32_t kSharedSignalGroupAlignmentPadding = 4; 47 | static const uint32_t kPriorityCtrlAlignmentPadding = 4; 48 | // No padding required for traffic lights. 49 | // static const uint32_t kTrafficLightAlignmentPadding = 0; 50 | 51 | 52 | #if BENCHMARK_MODE != 1 53 | // Dummy configuration 54 | #define BENCH_CELL_IN_MODE -1 55 | #define BENCH_CELL_OUT_MODE -1 56 | #define BENCH_CAR_MODE -1 57 | #define BENCH_SIGNAL_GROUP_MODE -1 58 | #define BENCH_TRAFFIC_LIGHT_MODE -1 59 | #define BENCH_LAYOUT_MODE kLayoutModeSoa 60 | #endif 61 | 62 | // Layout mode: SOA or AOS. 63 | static const int kCellLayoutMode = BENCH_LAYOUT_MODE; 64 | static const int kCarLayoutMode = BENCH_LAYOUT_MODE; 65 | static const int kSharedSignalGroupLayoutMode = BENCH_LAYOUT_MODE; 66 | static const int kPriorityCtrlLayoutMode = BENCH_LAYOUT_MODE; 67 | static const int kTrafficLightLayoutMode = BENCH_LAYOUT_MODE; 68 | 69 | // Array configurations. 70 | #if BENCH_CELL_IN_MODE == -1 71 | #define ARRAY_CELL_INCOMING \ 72 | array_(Cell*, kMaxNumCellIncoming, object) 73 | #elif BENCH_CELL_IN_MODE == -2 74 | #define ARRAY_CELL_INCOMING \ 75 | array_(Cell*, kMaxNumCellIncoming, fully_inlined) 76 | #elif !defined(BENCH_CELL_IN_MODE) 77 | #error BENCH_CELL_IN_MODE is undefined 78 | #else 79 | static const uint32_t kArrInlineIncomingCells = BENCH_CELL_IN_MODE; // 4 80 | #define ARRAY_CELL_INCOMING \ 81 | array_(Cell*, kArrInlineIncomingCells, partially_inlined) 82 | #define ARRAY_CELL_IN_IS_PARTIAL 83 | #endif 84 | 85 | #if BENCH_CELL_OUT_MODE == -1 86 | #define ARRAY_CELL_OUTGOING \ 87 | array_(Cell*, kMaxNumCellOutgoing, object) 88 | #elif BENCH_CELL_OUT_MODE == -2 89 | #define ARRAY_CELL_OUTGOING \ 90 | array_(Cell*, kMaxNumCellOutgoing, fully_inlined) 91 | #elif !defined(BENCH_CELL_OUT_MODE) 92 | #error BENCH_CELL_OUT_MODE is undefined 93 | #else 94 | static const uint32_t kArrInlineOutgoingCells = BENCH_CELL_OUT_MODE; // 4 95 | #define ARRAY_CELL_OUTGOING \ 96 | array_(Cell*, kArrInlineOutgoingCells, partially_inlined) 97 | #define ARRAY_CELL_OUT_IS_PARTIAL 98 | #endif 99 | 100 | #if BENCH_CAR_MODE == -1 101 | #define ARRAY_CAR_PATH \ 102 | array_(Cell*, kMaxNumCarPath, object) 103 | #elif BENCH_CAR_MODE == -2 104 | #define ARRAY_CAR_PATH \ 105 | array_(Cell*, kMaxNumCarPath, fully_inlined) 106 | #elif !defined(BENCH_CAR_MODE) 107 | #error BENCH_CAR_MODE is undefined 108 | #else 109 | static const uint32_t kArrInlinePath = BENCH_CAR_MODE; // 5 110 | #define ARRAY_CAR_PATH \ 111 | array_(Cell*, kArrInlinePath, partially_inlined) 112 | #define ARRAY_CAR_IS_PARTIAL 113 | #endif 114 | 115 | #if BENCH_SIGNAL_GROUP_MODE == -1 116 | #define ARRAY_SIGNAL_GROUP_CELLS \ 117 | array_(Cell*, kMaxNumSharedSignalGroupCells, object) 118 | #elif BENCH_SIGNAL_GROUP_MODE == -2 119 | #define ARRAY_SIGNAL_GROUP_CELLS \ 120 | array_(Cell*, kMaxNumSharedSignalGroupCells, fully_inlined) 121 | #elif !defined(BENCH_SIGNAL_GROUP_MODE) 122 | #error BENCH_SIGNAL_GROUP_MODE is undefined 123 | #else 124 | static const uint32_t kArrInlineSignalGroupCells = 125 | BENCH_SIGNAL_GROUP_MODE; // 4 126 | #define ARRAY_SIGNAL_GROUP_CELLS \ 127 | array_(Cell*, kArrInlineSignalGroupCells, partially_inlined) 128 | #define ARRAY_SIGNAL_GROUP_IS_PARTIAL 129 | #endif 130 | 131 | #if BENCH_TRAFFIC_LIGHT_MODE == -1 132 | #define ARRAY_TRAFFIC_LIGHT_SIGNAL_GROUPS \ 133 | array_(SharedSignalGroup*, kMaxNumTrafficLightGroups, object) 134 | #elif BENCH_TRAFFIC_LIGHT_MODE == -2 135 | #define ARRAY_TRAFFIC_LIGHT_SIGNAL_GROUPS \ 136 | array_(SharedSignalGroup*, kMaxNumTrafficLightGroups, fully_inlined) 137 | #elif !defined(BENCH_TRAFFIC_LIGHT_MODE) 138 | #error BENCH_TRAFFIC_LIGHT_MODE is undefined 139 | #else 140 | static const uint32_t kArrInlineTrafficLightSignalGroups = 141 | BENCH_TRAFFIC_LIGHT_MODE; // 4 142 | #define ARRAY_TRAFFIC_LIGHT_SIGNAL_GROUPS \ 143 | array_(SharedSignalGroup*, kArrInlineTrafficLightSignalGroups, \ 144 | partially_inlined) 145 | #define ARRAY_TRAFFIC_LIGHT_IS_PARTIAL 146 | #endif 147 | 148 | // Sizes of arena. 149 | // Worse case: avg = 1.05 each, so share 3 slots/instance. 150 | static const uint32_t kCellArenaSize = kNumCells*3*sizeof(Cell*); 151 | // Worse case: avg is around 20. 152 | static const uint32_t kCarArenaSize = kNumCars*21*sizeof(Cell*); 153 | static const uint32_t kSharedSignalGroupArenaSize = 154 | kNumSharedSignalGroups*4*sizeof(Cell*); 155 | // Worse case: ~40% have 4 signal groups or more. 156 | static const uint32_t kTrafficLightArenaSize = 157 | kNumTrafficLights*3*sizeof(SharedSignalGroup*); 158 | static const uint32_t kPriorityCtrlArenaSize = 0; 159 | 160 | // Types of other arrays. 161 | // Priority controller have always two signal groups. 162 | #define ARRAY_PRIORITY_CTRL_SIGNAL_GROUPS \ 163 | array_(SharedSignalGroup*, 2, fully_inlined) 164 | 165 | #endif // EXAMPLE_TRAFFIC_CONFIGURATION_H 166 | -------------------------------------------------------------------------------- /example/traffic/entities/car.inc: -------------------------------------------------------------------------------- 1 | class Car : public SoaLayout, 3 | kCarLayoutMode> { 4 | public: 5 | IKRA_INITIALIZE_CLASS 6 | 7 | __device__ __host__ Car(bool is_active, uint32_t velocity, 8 | uint32_t max_velocity, uint32_t random_state, 9 | Cell* position) 10 | : is_active_(is_active), velocity_(velocity), path_length_(0), 11 | #ifdef ARRAY_CAR_IS_PARTIAL 12 | path_(max_velocity), 13 | #endif // ARRAY_CAR_IS_PARTIAL 14 | max_velocity_(max_velocity), random_state_(random_state), 15 | position_(position) { 16 | assert(is_active); 17 | } 18 | 19 | // The velocity of the car in cells/iteration. 20 | // [ OFFSET = 0 ] 21 | uint32_t_ velocity_; 22 | 23 | // The max. possible velocity of this car. 24 | // [ OFFSET = 4 ] 25 | uint32_t_ max_velocity_; 26 | 27 | // The lengths of the precalculated path. 28 | // [ OFFSET = 8 ] 29 | uint32_t_ path_length_; 30 | 31 | // Every car has a random state to allow for reproducible results. 32 | // [ OFFSET = 12 ] 33 | uint32_t_ random_state_; 34 | 35 | // An array of cells that the car will move onto next. 36 | // [ OFFSET = 16 ] 37 | ARRAY_CAR_PATH path_; 38 | 39 | // The current position of the car. 40 | // [ OFFSET = 16 + k*PTR_SIZE ] 41 | field_(Cell*) position_; 42 | 43 | // If a car enters a sink, it is removed from the simulation (inactive) 44 | // for a short time. 45 | // [ OFFSET = 16 + k*PTR_SIZE ] 46 | bool_ is_active_; 47 | 48 | // Add dummy field to make sure that objects are properly aligned in AOS. 49 | // [ OFFSET = 17 + k*PTR_SIZE ] 50 | array_(char, kCarAlignmentPadding, object) dummy_alignment_; 51 | 52 | __device__ uint32_t velocity() const { return velocity_; } 53 | 54 | __device__ uint32_t max_velocity() const { return max_velocity_; } 55 | 56 | __device__ Cell* position() const { return position_; } 57 | 58 | __device__ bool is_active() const { return is_active_; } 59 | 60 | __device__ uint32_t rand32() { 61 | // Advance and return random state. 62 | // Source: https://en.wikipedia.org/wiki/Lehmer_random_number_generator 63 | random_state_ = static_cast( 64 | static_cast(random_state()) * 279470273u) % 0xfffffffb; 65 | return random_state_; 66 | } 67 | 68 | __device__ uint32_t rand32(uint32_t max_value) { 69 | return rand32() % max_value; 70 | } 71 | 72 | __device__ uint32_t random_state() const { 73 | return random_state_; 74 | } 75 | 76 | __device__ void step_prepare_path() { 77 | if (is_active()) { 78 | step_initialize_iteration(); 79 | step_accelerate(); 80 | step_extend_path(); 81 | step_constraint_velocity(); 82 | step_slow_down(); 83 | } 84 | } 85 | 86 | __device__ Cell* next_step(Cell* cell); 87 | 88 | __device__ void step_initialize_iteration(); 89 | 90 | __device__ void step_accelerate(); 91 | 92 | __device__ void step_extend_path(); 93 | 94 | __device__ void step_constraint_velocity(); 95 | 96 | __device__ void step_move(); 97 | 98 | __device__ void step_reactivate(); 99 | 100 | __device__ void step_slow_down() { 101 | if (rand32(1000) < 200 && velocity_ > 0) { 102 | velocity_ = velocity_ - 1; 103 | } 104 | } 105 | }; 106 | 107 | IKRA_DEVICE_STORAGE(Car); 108 | 109 | 110 | __device__ Cell* Car::next_step(Cell* position) { 111 | // Almost random walk. 112 | const uint32_t num_outgoing = position->num_outgoing_cells(); 113 | assert(num_outgoing > 0); 114 | 115 | #ifdef MOVE_ONTO_LARGE_STREETS 116 | // Collect probabilities of outgoing cells. 117 | int prob_sum = 0; 118 | for (int i = 0; i < num_outgoing; ++i) { 119 | Cell* candidate = position->outgoing_cell(i); 120 | 121 | // More likely to drive onto larger roads. 122 | int difference = candidate->type() - position->type() + Cell::kMaxType; 123 | prob_sum += 100*difference; 124 | } 125 | 126 | // Now choose a path randomly. 127 | int rand_select = rand32(prob_sum); 128 | int threshold = 0; 129 | for (int i = 0; i < num_outgoing; ++i) { 130 | Cell* candidate = position->outgoing_cell(i); 131 | 132 | // More likely to drive onto larger roads. 133 | int difference = candidate->type() - position->type() + Cell::kMaxType; 134 | threshold += 100*difference; 135 | 136 | if (rand_select < threshold) { 137 | assert(candidate != position); 138 | return candidate; 139 | } 140 | } 141 | 142 | // We should never end up here. 143 | assert(false); 144 | #endif // MOVE_ONTO_LARGE_STREETS 145 | 146 | // Need some kind of return statement here. 147 | return position->outgoing_cell(rand32(num_outgoing)); 148 | } 149 | 150 | 151 | __device__ void Car::step_initialize_iteration() { 152 | // Reset calculated path. This forces cars with a random moving behavior to 153 | // select a new path in every iteration. Otherwise, cars might get "stucjk" 154 | // on a full network if many cars are waiting for the one in front of them in 155 | // a cycle. 156 | // TODO: Check if we can keep the path at least partially somehow. 157 | path_length_ = 0; 158 | } 159 | 160 | __device__ void Car::step_accelerate() { 161 | // Speed up the car by 1 or 2 units. 162 | uint32_t speedup = rand32(2) + 1; 163 | velocity_ = max_velocity_ < velocity_ + speedup 164 | ? static_cast(max_velocity_) : velocity_ + speedup; 165 | } 166 | 167 | __device__ void Car::step_extend_path() { 168 | Cell* cell = position_; 169 | Cell* next_cell; 170 | 171 | for (uint32_t i = 0; i < velocity_; ++i) { 172 | if (cell->is_sink()) { 173 | break; 174 | } 175 | 176 | next_cell = next_step(cell); 177 | assert(next_cell != cell); 178 | 179 | cell = next_cell; 180 | path_[i] = cell; 181 | path_length_ = path_length_ + 1; 182 | } 183 | 184 | velocity_ = path_length_; 185 | } 186 | 187 | __device__ void Car::step_constraint_velocity() { 188 | // This is actually only needed for the very first iteration, because a car 189 | // may be positioned on a traffic light cell. 190 | if (velocity_ > position()->max_velocity()) { 191 | velocity_ = position()->max_velocity(); 192 | } 193 | 194 | uint32_t path_index = 0; 195 | int distance = 1; 196 | 197 | while (distance <= velocity_) { 198 | // Invariant: Movement of up to `distance - 1` many cells at `velocity_` 199 | // is allowed. 200 | // Now check if next cell can be entered. 201 | Cell* next_cell = path_[path_index]; 202 | 203 | // Avoid collision. 204 | if (!next_cell->is_free()) { 205 | // Cannot enter cell. 206 | --distance; 207 | velocity_ = distance; 208 | break; 209 | } // else: Can enter next cell. 210 | 211 | if (velocity_ > next_cell->max_velocity()) { 212 | // Car is too fast for this cell. 213 | if (next_cell->max_velocity() > distance - 1) { 214 | // Even if we slow down, we would still make progress. 215 | velocity_ = next_cell->max_velocity(); 216 | } else { 217 | // Do not enter the next cell. 218 | --distance; 219 | assert(distance >= 0); 220 | 221 | velocity_ = distance; 222 | break; 223 | } 224 | } 225 | 226 | ++distance; 227 | ++path_index; 228 | } 229 | 230 | --distance; 231 | 232 | 233 | for (uint32_t i = 0; i < velocity_; ++i) { 234 | assert(path_[i]->is_free()); 235 | assert(i == 0 || path_[i - 1] != path_[i]); 236 | } 237 | // TODO: Check why the cast is necessary. 238 | assert(distance <= (int) velocity()); 239 | } 240 | 241 | __device__ void Car::step_move() { 242 | if (is_active()) { 243 | Cell* cell = position_; 244 | for (uint32_t i = 0; i < velocity_; ++i) { 245 | assert(path_[i] != cell); 246 | 247 | cell = path_[i]; 248 | assert(cell->is_free()); 249 | 250 | position()->release(); 251 | cell->occupy(this); 252 | position_ = cell; 253 | } 254 | 255 | if (position()->is_sink()) { 256 | // Remove car from the simulation. Will be added again in the next 257 | // iteration. 258 | position()->release(); 259 | path_length_ = 0; 260 | is_active_ = false; 261 | } 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /example/traffic/entities/cell.inc: -------------------------------------------------------------------------------- 1 | class Cell : public SoaLayout, 3 | kCellLayoutMode> { 4 | public: 5 | IKRA_INITIALIZE_CLASS 6 | 7 | enum Type { 8 | // Sorted from smallest to largest. 9 | kService, 10 | kResidential, 11 | kUnclassified, 12 | kTertiary, 13 | kSecondary, 14 | kPrimary, 15 | kTrunk, 16 | kMotorway, 17 | 18 | kMaxType 19 | }; 20 | 21 | // Default constructor, where incoming/outgoing cell lists are passed as 22 | // Cell pointers. 23 | __host__ __device__ Cell(uint32_t max_velocity, double x, double y, 24 | uint32_t num_incoming, Cell** incoming, 25 | uint32_t num_outgoing, Cell** outgoing, 26 | Car* car, bool is_free, bool is_sink, 27 | uint32_t controller_max_velocity, 28 | Type type = kResidential) 29 | : max_velocity_(max_velocity), x_(x), y_(y), type_(type), 30 | num_incoming_cells_(num_incoming), num_outgoing_cells_(num_outgoing), 31 | #ifdef ARRAY_CELL_OUT_IS_PARTIAL 32 | outgoing_cells_(num_outgoing), 33 | #endif // ARRAY_CELL_OUT_IS_PARTIAL 34 | #ifdef ARRAY_CELL_IN_IS_PARTIAL 35 | incoming_cells_(num_incoming), 36 | #endif // ARRAY_CELL_IN_IS_PARTIAL 37 | car_(car), is_free_(is_free), is_sink_(is_sink), 38 | controller_max_velocity_(controller_max_velocity) { 39 | for (uint32_t i = 0; i < num_incoming; ++i) { 40 | incoming_cells_[i] = incoming[i]; 41 | } 42 | 43 | for (uint32_t i = 0; i < num_outgoing; ++i) { 44 | outgoing_cells_[i] = outgoing[i]; 45 | } 46 | } 47 | 48 | // Overload: Provide cell indices instead of pointers. 49 | __host__ __device__ Cell(uint32_t max_velocity, double x, double y, 50 | uint32_t num_incoming, unsigned int* incoming, 51 | uint32_t num_outgoing, unsigned int* outgoing, 52 | Car* car, bool is_free, bool is_sink, 53 | uint32_t controller_max_velocity, 54 | Type type = kResidential) 55 | : max_velocity_(max_velocity), x_(x), y_(y), type_(type), 56 | num_incoming_cells_(num_incoming), num_outgoing_cells_(num_outgoing), 57 | #ifdef ARRAY_CELL_OUT_IS_PARTIAL 58 | outgoing_cells_(num_outgoing), 59 | #endif // ARRAY_CELL_OUT_IS_PARTIAL 60 | #ifdef ARRAY_CELL_IN_IS_PARTIAL 61 | incoming_cells_(num_incoming), 62 | #endif // ARRAY_CELL_IN_IS_PARTIAL 63 | car_(car), is_free_(is_free), is_sink_(is_sink) { 64 | for (uint32_t i = 0; i < num_incoming; ++i) { 65 | incoming_cells_[i] = Cell::get_uninitialized(incoming[i]); 66 | } 67 | 68 | for (uint32_t i = 0; i < num_outgoing; ++i) { 69 | outgoing_cells_[i] = Cell::get_uninitialized(outgoing[i]); 70 | } 71 | 72 | controller_max_velocity_ = max_velocity_; 73 | } 74 | 75 | // Return the maximum velocity that is allowed on this street in general. 76 | // [ OFFSET = 0 ] 77 | uint32_t_ max_velocity_; 78 | 79 | // Return max. velocity allowed with respect to traffic controllers. 80 | // [ OFFSET = 4 ] 81 | uint32_t_ controller_max_velocity_; 82 | 83 | // Number of incoming cells. 84 | // [ OFFSET = 8 ] 85 | uint32_t_ num_incoming_cells_; 86 | 87 | // Number of outgoing cells. 88 | // [ OFFSET = 12 ] 89 | uint32_t_ num_outgoing_cells_; 90 | 91 | // x and y coordinates, only for rendering and debugging purposes. 92 | // [ OFFSET = 16 ] 93 | double_ x_; 94 | double_ y_; 95 | 96 | // Incoming cells. 97 | // [ OFFSET = 32 ] 98 | ARRAY_CELL_INCOMING incoming_cells_; 99 | 100 | // Outgoing cells. 101 | // [ OFFSET = 32 + k*PTR_SIZE ] 102 | ARRAY_CELL_OUTGOING outgoing_cells_; 103 | 104 | // The car that is currently occupying this cell (if any). 105 | // [ OFFSET = 32 + k*PTR_SIZE ] 106 | field_(Car*) car_; 107 | 108 | // The type of this cell according to OSM data. Size of enum is 4 bytes. 109 | // [ OFFSET = 32 + k*PTR_SIZE ] 110 | field_(Type) type_; 111 | 112 | // A cell is free if is does not contain a car. 113 | // [ OFFSET = 36 + k*PTR_SIZE ] 114 | bool_ is_free_; 115 | 116 | // A cell is usually a sink if does not have any outgoing edges. 117 | // [ OFFSET = 37 + k*PTR_SIZE ] 118 | bool_ is_sink_; 119 | 120 | // Add dummy field to make sure that objects are properly aligned in AOS. 121 | // [ OFFSET = 38 + k*PTR_SIZE ]. Need 2 bytes padding. 122 | array_(char, kCellAlignmentPadding, object) dummy_alignment_; 123 | 124 | // Returns the maximum velocity allowed on this cell at this moment. This 125 | // function takes into account velocity limitations due to traffic lights. 126 | __device__ uint32_t max_velocity() const { 127 | return controller_max_velocity_ < max_velocity_ 128 | ? controller_max_velocity_ 129 | : max_velocity_; 130 | } 131 | 132 | // The maximum velocity allowed on this cell regardless of 133 | // traffic controllers. 134 | __device__ uint32_t street_max_velocity() const { 135 | return max_velocity_; 136 | } 137 | 138 | // Sets the maximum temporary speed limit (traffic controller). 139 | __device__ void set_controller_max_velocity(uint32_t velocity) { 140 | controller_max_velocity_ = velocity; 141 | } 142 | 143 | // Removes the maximum temporary speed limit. 144 | __device__ void remove_controller_max_velocity() { 145 | controller_max_velocity_ = max_velocity_; 146 | } 147 | 148 | __device__ uint32_t num_incoming_cells() const { 149 | return num_incoming_cells_; 150 | } 151 | __device__ Cell* incoming_cell(uint32_t index) const { 152 | return incoming_cells_[index]; 153 | } 154 | 155 | __device__ uint32_t num_outgoing_cells() const { 156 | return num_outgoing_cells_; 157 | } 158 | __device__ Cell* outgoing_cell(uint32_t index) const { 159 | return outgoing_cells_[index]; 160 | } 161 | 162 | // A car enters this cell. 163 | __device__ void occupy(Car* car) { 164 | assert((bool) is_free_); 165 | assert(car_ == nullptr); 166 | 167 | car_ = car; 168 | is_free_ = false; 169 | } 170 | 171 | // A car leaves this cell. 172 | __device__ void release() { 173 | car_ = nullptr; 174 | is_free_ = true; 175 | } 176 | 177 | __device__ double x() const { return x_; } 178 | 179 | __device__ double y() const { return y_; } 180 | 181 | __device__ Type type() const { return type_; } 182 | 183 | __device__ bool is_free() const { return is_free_; } 184 | 185 | __device__ bool is_sink() const { return is_sink_; } 186 | }; 187 | 188 | IKRA_DEVICE_STORAGE(Cell); 189 | -------------------------------------------------------------------------------- /example/traffic/entities/priority_yield_traffic_controller.inc: -------------------------------------------------------------------------------- 1 | class PriorityYieldTrafficController : public SoaLayout< 2 | PriorityYieldTrafficController, kNumPriorityCtrls, kAddressModeZero, 3 | StaticStorageWithArena, 4 | kPriorityCtrlLayoutMode> { 5 | public: 6 | IKRA_INITIALIZE_CLASS 7 | 8 | __device__ PriorityYieldTrafficController(unsigned int* signal_groups, 9 | uint32_t num_signal_groups) : 10 | #ifdef ARRAY_PRIORITY_CTRL_IS_PARTIAL 11 | signal_groups_(num_signal_groups), 12 | #endif // ARRAY_PRIORITY_CTRL_IS_PARTIAL 13 | num_signal_groups_(num_signal_groups) { 14 | for (uint32_t i = 0; i < num_signal_groups; ++i) { 15 | signal_groups_[i] = 16 | SharedSignalGroup::get_uninitialized(signal_groups[i]); 17 | } 18 | } 19 | 20 | __device__ void initialize() { 21 | for (uint32_t i = 0; i < num_signal_groups_; ++i) { 22 | signal_groups_[i]->signal_stop(); 23 | } 24 | } 25 | 26 | __device__ void step() { 27 | bool found_traffic = false; 28 | // Cells are sorted by priority. 29 | for (uint32_t i = 0; i < num_signal_groups_; ++i) { 30 | SharedSignalGroup* next_group = signal_groups_[i]; 31 | bool has_incoming = has_incoming_traffic(next_group); 32 | uint32_t num_cells = next_group->num_cells(); 33 | 34 | if (!found_traffic && has_incoming) { 35 | found_traffic = true; 36 | // Allow traffic to flow. 37 | for (uint32_t j = 0; j < num_cells; ++j) { 38 | next_group->cell(j)->remove_controller_max_velocity(); 39 | } 40 | } else if (has_incoming) { 41 | // Traffic with higher priority is incoming. 42 | for (uint32_t j = 0; j < num_cells; ++j) { 43 | next_group->cell(j)->set_controller_max_velocity(0); 44 | } 45 | } 46 | } 47 | } 48 | 49 | // Signal groups controlled by this controller. 50 | // [ PADDING = 0] 51 | ARRAY_PRIORITY_CTRL_SIGNAL_GROUPS signal_groups_; 52 | 53 | // Number of signal groups. 54 | // [ PADDING = k*PTR_SIZE ] 55 | uint32_t_ num_signal_groups_; 56 | 57 | // Add dummy field to make sure that objects are properly aligned in AOS. 58 | // [ OFFSET = 4 + k*PTR_SIZE ] 59 | array_(char, kPriorityCtrlAlignmentPadding, object) dummy_alignment_; 60 | 61 | __device__ bool has_incoming_traffic(SharedSignalGroup* group) const { 62 | const uint32_t num_cells = group->num_cells(); 63 | for (uint32_t i = 0; i < num_cells; ++i) { 64 | Cell* next_cell = group->cell(i); 65 | 66 | // Report incoming traffic if at least one cells in the group reports 67 | // incoming traffic. 68 | if (has_incoming_traffic(next_cell, next_cell->street_max_velocity())) { 69 | return true; 70 | } 71 | } 72 | return false; 73 | } 74 | 75 | __device__ bool has_incoming_traffic(Cell* cell, uint32_t lookahead) const { 76 | if (lookahead == 0) { 77 | return !cell->is_free(); 78 | } 79 | 80 | // Check incoming cells. This is BFS. 81 | const uint32_t num_incoming = cell->num_incoming_cells(); 82 | for (uint32_t i = 0; i < num_incoming; ++i) { 83 | Cell* next_cell = cell->incoming_cell(i); 84 | if (has_incoming_traffic(next_cell, lookahead - 1)) { 85 | return true; 86 | } 87 | } 88 | 89 | return !cell->is_free(); 90 | } 91 | }; 92 | 93 | IKRA_DEVICE_STORAGE(PriorityYieldTrafficController) 94 | -------------------------------------------------------------------------------- /example/traffic/entities/shared_signal_group.inc: -------------------------------------------------------------------------------- 1 | class SharedSignalGroup : public SoaLayout< 2 | SharedSignalGroup, kNumSharedSignalGroups, kAddressModeZero, 3 | StaticStorageWithArena, 4 | kSharedSignalGroupLayoutMode> { 5 | public: 6 | IKRA_INITIALIZE_CLASS 7 | 8 | __device__ SharedSignalGroup(uint32_t num_cells, unsigned int* cells) : 9 | #ifdef ARRAY_SIGNAL_GROUP_IS_PARTIAL 10 | cells_(num_cells), 11 | #endif // ARRAY_SIGNAL_GROUP_IS_PARTIAL 12 | num_cells_(num_cells) { 13 | for (uint32_t i = 0; i < num_cells; ++i) { 14 | cells_[i] = Cell::get_uninitialized(cells[i]); 15 | } 16 | } 17 | 18 | __device__ void signal_go() { 19 | for (uint32_t i = 0; i < num_cells_; ++i) { 20 | cells_[i]->remove_controller_max_velocity(); 21 | } 22 | } 23 | 24 | __device__ void signal_stop() { 25 | for (uint32_t i = 0; i < num_cells_; ++i) { 26 | cells_[i]->set_controller_max_velocity(0); 27 | } 28 | } 29 | 30 | // Cells controlled by this group. 31 | // [ PADDING = 0 ] 32 | ARRAY_SIGNAL_GROUP_CELLS cells_; 33 | 34 | // Number of cells. 35 | // [ PADDING = k*PTR_SIZE ] 36 | uint32_t_ num_cells_; 37 | 38 | // Add dummy field to make sure that objects are properly aligned in AOS. 39 | // [ OFFSET = 4 + k*PTR_SIZE ] 40 | array_(char, kSharedSignalGroupAlignmentPadding, object) dummy_alignment_; 41 | 42 | __device__ uint32_t num_cells() const { return num_cells_; } 43 | __device__ Cell* cell(uint32_t index) { return cells_[index]; } 44 | }; 45 | 46 | IKRA_DEVICE_STORAGE(SharedSignalGroup); 47 | -------------------------------------------------------------------------------- /example/traffic/entities/simulation.inc: -------------------------------------------------------------------------------- 1 | class Simulation : public SoaLayout { 2 | public: 3 | IKRA_INITIALIZE_CLASS 4 | 5 | __device__ Simulation() : random_state_(123) {} 6 | 7 | uint32_t_ random_state_; 8 | 9 | __device__ uint32_t rand32() { 10 | // Advance and return random state. 11 | // Source: https://en.wikipedia.org/wiki/Lehmer_random_number_generator 12 | random_state_ = static_cast( 13 | static_cast(random_state()) * 279470273u) % 0xfffffffb; 14 | return random_state_; 15 | } 16 | 17 | __device__ uint32_t rand32(uint32_t max_value) { 18 | return rand32() % max_value; 19 | } 20 | 21 | __device__ uint32_t random_state() const { 22 | return random_state_; 23 | } 24 | 25 | __device__ void step_random_state() { 26 | rand32(); 27 | } 28 | 29 | __device__ Cell* random_free_cell(Car* car) { 30 | uint64_t state = random_state(); 31 | uint64_t num_cars = Car::size(); 32 | uint64_t num_cells = Cell::size(); 33 | uint32_t max_tries = num_cells / num_cars; 34 | 35 | for (uint32_t i = 0; i < max_tries; ++i) { 36 | uint64_t cell_id = (num_cells * (car->id() + state) / num_cars + i) 37 | % num_cells; 38 | Cell* next_cell = Cell::get(cell_id); 39 | if (next_cell->is_free()) { 40 | return next_cell; 41 | } 42 | } 43 | 44 | // Could not find free cell. Try again in next iteration. 45 | return nullptr; 46 | } 47 | }; 48 | 49 | IKRA_DEVICE_STORAGE(Simulation); 50 | 51 | 52 | // Defined here because Simulation must be defined. 53 | __device__ void Car::step_reactivate() { 54 | if (!is_active()) { 55 | Cell* free_cell = Simulation::get(0)->random_free_cell(this); 56 | 57 | if (free_cell != nullptr) { 58 | assert(free_cell->is_free()); 59 | position_ = free_cell; 60 | is_active_ = true; 61 | path_length_ = 0; 62 | velocity_ = 0; 63 | free_cell->occupy(this); 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /example/traffic/entities/traffic_light.inc: -------------------------------------------------------------------------------- 1 | 2 | class TrafficLight : public SoaLayout< 3 | TrafficLight, kNumTrafficLights, kAddressModeZero, 4 | StaticStorageWithArena, 5 | kTrafficLightLayoutMode> { 6 | public: 7 | IKRA_INITIALIZE_CLASS 8 | 9 | __device__ TrafficLight(uint32_t timer, uint32_t phase_time, uint32_t phase, 10 | unsigned int* signal_groups, 11 | uint32_t num_signal_groups) 12 | : timer_(timer), phase_time_(phase_time), phase_(phase), 13 | #ifdef ARRAY_TRAFFIC_LIGHT_IS_PARTIAL 14 | signal_groups_(num_signal_groups), 15 | #endif // ARRAY_TRAFFIC_LIGHT_IS_PARTIAL 16 | num_signal_groups_(num_signal_groups) { 17 | for (uint32_t i = 0; i < num_signal_groups; ++i) { 18 | signal_groups_[i] = 19 | SharedSignalGroup::get_uninitialized(signal_groups[i]); 20 | } 21 | } 22 | 23 | __device__ void initialize() { 24 | for (uint32_t i = 0; i < num_signal_groups_; ++i) { 25 | signal_groups_[i]->signal_stop(); 26 | } 27 | } 28 | 29 | __device__ bool has_incoming_traffic(SharedSignalGroup* group) const { 30 | const uint32_t num_cells = group->num_cells(); 31 | for (uint32_t i = 0; i < num_cells; ++i) { 32 | Cell* next_cell = group->cell(i); 33 | 34 | // Report incoming traffic if at least one cells in the group reports 35 | // incoming traffic. 36 | if (has_incoming_traffic(next_cell, next_cell->street_max_velocity())) { 37 | return true; 38 | } 39 | } 40 | return false; 41 | } 42 | 43 | __device__ bool has_incoming_traffic(Cell* cell, uint32_t lookahead) const { 44 | if (lookahead == 0) { 45 | return !cell->is_free(); 46 | } 47 | 48 | // Check incoming cells. This is BFS. 49 | const uint32_t num_incoming = cell->num_incoming_cells(); 50 | for (uint32_t i = 0; i < num_incoming; ++i) { 51 | Cell* next_cell = cell->incoming_cell(i); 52 | if (has_incoming_traffic(next_cell, lookahead - 1)) { 53 | return true; 54 | } 55 | } 56 | 57 | return !cell->is_free(); 58 | } 59 | 60 | __device__ void step() { 61 | #ifdef SMART_TRAFFIC_LIGHT 62 | // This is a smart traffic light. If there are cars coming from only one 63 | // street, give them a green light. 64 | uint32_t num_found_traffic = 0; 65 | uint32_t index_last_found = 0; 66 | for (uint32_t i = 0; i < num_signal_groups_; ++i) { 67 | SharedSignalGroup* next_group = signal_groups_[i]; 68 | bool has_incoming = has_incoming_traffic(next_group); 69 | 70 | if (has_incoming) { 71 | num_found_traffic++; 72 | index_last_found = i; 73 | } 74 | 75 | if (num_found_traffic > 1) break; 76 | } 77 | 78 | if (num_found_traffic == 1) { 79 | // Set this one to green. 80 | for (uint32_t i = 0; i < num_signal_groups_; ++i) { 81 | if (i == index_last_found) { 82 | signal_groups_[i]->signal_go(); 83 | } else { 84 | signal_groups_[i]->signal_stop(); 85 | } 86 | } 87 | timer_ = 0; 88 | phase_ = index_last_found; 89 | } else { 90 | timer_ = (timer_ + 1) % phase_time_; 91 | 92 | if (timer_ == 0) { 93 | signal_groups_[phase_]->signal_stop(); 94 | phase_ = (phase_ + 1) % num_signal_groups_; 95 | signal_groups_[phase_]->signal_go(); 96 | } 97 | } 98 | #else 99 | timer_ = (timer_ + 1) % phase_time_; 100 | 101 | if (timer_ == 0) { 102 | signal_groups_[phase_]->signal_stop(); 103 | phase_ = (phase_ + 1) % num_signal_groups_; 104 | signal_groups_[phase_]->signal_go(); 105 | } 106 | #endif // SMART_TRAFFIC_LIGHT 107 | } 108 | 109 | // This timer is increased with every step. 110 | // [ PADDING = 0 ] 111 | uint32_t_ timer_; 112 | 113 | // The length of one phase, i.e., number of iterations until the signal 114 | // changes. 115 | // [ PADDING = 4 ] 116 | uint32_t_ phase_time_; 117 | 118 | // An index into signal_groups_, marking the group that currently has a 119 | // green signal. 120 | // [ PADDING = 8 ] 121 | uint32_t_ phase_; 122 | 123 | // Number of signal groups. 124 | // [ PADDING = 12 ] 125 | uint32_t_ num_signal_groups_; 126 | __device__ uint32_t num_signal_groups() const { return num_signal_groups_; } 127 | 128 | // Signal groups controlled by this traffic lights. 129 | // [ PADDING = 16 ] 130 | ARRAY_TRAFFIC_LIGHT_SIGNAL_GROUPS signal_groups_; 131 | 132 | // No additional padding required for AOS. 133 | }; 134 | 135 | IKRA_DEVICE_STORAGE(TrafficLight); 136 | -------------------------------------------------------------------------------- /example/traffic/run_bench_traffic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # MODES: object == -1 3 | # fully inlined == -2 4 | 5 | # Change directory to root. 6 | cd "$(dirname "$0")" 7 | cd ../.. 8 | 9 | function build_and_run() { 10 | cmake -DCMAKE_BUILD_TYPE=Release \ 11 | -DBENCHMARK_MODE=1 \ 12 | -DBENCH_CELL_IN_MODE=$1 \ 13 | -DBENCH_CELL_OUT_MODE=$2 \ 14 | -DBENCH_CAR_MODE=$3 \ 15 | -DBENCH_SIGNAL_GROUP_MODE=$4 \ 16 | -DBENCH_TRAFFIC_LIGHT_MODE=$5 \ 17 | -DBENCH_LAYOUT_MODE=$6 \ 18 | CMakeLists.txt 19 | make 20 | 21 | output=$(( 22 | ( 23 | bin/traffic ~/Downloads/graphml_urbanized/Denver--Aurora__CO_Urbanized_Area_23527.graphml 24 | ) 1>/dev/null 25 | ) 2>&1) 26 | 27 | echo ${output} >> result_bench.txt 28 | } 29 | 30 | for v_layout_mode in "kLayoutModeSoa" "kLayoutModeAos" 31 | do 32 | # cell_in: [0; 8] 33 | for i in $(seq -2 8) 34 | do 35 | build_and_run ${i} -2 -2 -2 -2 ${v_layout_mode} 36 | done 37 | 38 | # cell_out: [0; 8] 39 | for i in $(seq -2 8) 40 | do 41 | build_and_run -2 ${i} -2 -2 -2 ${v_layout_mode} 42 | done 43 | 44 | # car: [0; 30] 45 | for i in $(seq -2 30) 46 | do 47 | build_and_run -2 -2 ${i} -2 -2 ${v_layout_mode} 48 | done 49 | 50 | # signal group: [0; 8] 51 | for i in $(seq -2 8) 52 | do 53 | build_and_run -2 -2 -2 ${i} -2 ${v_layout_mode} 54 | done 55 | 56 | # traffic light [0; 6] 57 | for i in $(seq -2 6) 58 | do 59 | build_and_run -2 -2 -2 -2 ${i} ${v_layout_mode} 60 | done 61 | 62 | build_and_run -1 -1 -1 -1 -1 ${v_layout_mode} 63 | build_and_run -1 -1 -1 -1 -1 ${v_layout_mode} 64 | build_and_run 3 3 5 4 4 ${v_layout_mode} 65 | done 66 | 67 | -------------------------------------------------------------------------------- /example/traffic/simulation_adapter.cu: -------------------------------------------------------------------------------- 1 | #include "simulation_adapter.h" 2 | 3 | namespace simulation { 4 | namespace standard { 5 | class Simulation; 6 | Simulation* instance; 7 | } // namespace standard 8 | 9 | namespace aos_int { 10 | class Simulation; 11 | Simulation* instance; 12 | } // namespace aos_int 13 | 14 | namespace aos_int_cuda { 15 | void initialize(); 16 | void step(); 17 | } // namespace aos_int_cuda 18 | } // namespace simulation 19 | 20 | #include "traffic-simulation/option_standard.inc" 21 | #include "traffic-simulation/graphml_simulation.inc" 22 | 23 | using namespace std; 24 | 25 | #include "traffic-simulation/traffic_aos_int.h" 26 | 27 | void load_simulation(int argc, char** argv, unsigned int num_cars) { 28 | instance = build_simulation(argc, argv, num_cars); 29 | simulation::aos_int::instance = new simulation::aos_int::Simulation( 30 | simulation::standard::instance); 31 | simulation::aos_int_cuda::initialize(); 32 | } 33 | 34 | -------------------------------------------------------------------------------- /example/traffic/simulation_adapter.h: -------------------------------------------------------------------------------- 1 | #ifndef EXAMPLE_TRAFFIC_SIMULATION_ADAPTER_H 2 | #define EXAMPLE_TRAFFIC_SIMULATION_ADAPTER_H 3 | 4 | void load_simulation(int argc, char** argv, unsigned int num_cars); 5 | 6 | #endif // EXAMPLE_TRAFFIC_SIMULATION_ADAPTER_H 7 | -------------------------------------------------------------------------------- /example/traffic/simulation_converter.inc: -------------------------------------------------------------------------------- 1 | using IndexType = unsigned int; 2 | 3 | // Data storage. 4 | namespace simulation { 5 | namespace aos_int { 6 | extern IndexType s_size_Cell; 7 | extern IndexType s_size_outgoing_cells; 8 | extern IndexType s_size_incoming_cells; 9 | extern IndexType s_size_Car; 10 | extern IndexType s_size_car_paths; 11 | extern IndexType s_size_inactive_cars; 12 | extern IndexType s_size_TrafficLight; 13 | extern IndexType s_size_PriorityYieldTrafficController; 14 | extern IndexType s_size_SharedSignalGroup; 15 | extern IndexType s_size_traffic_light_signal_groups; 16 | extern IndexType s_size_priority_ctrl_signal_groups; 17 | extern IndexType s_size_signal_group_cells; 18 | } // namespace aos_int 19 | 20 | namespace aos_int_cuda { 21 | extern Cell* dev_Cell; 22 | extern IndexType* dev_outgoing_cells; 23 | extern IndexType* dev_incoming_cells; 24 | extern Car* dev_Car; 25 | extern IndexType* dev_car_paths; 26 | extern IndexType* dev_inactive_cars; 27 | extern TrafficLight* dev_TrafficLight; 28 | extern PriorityYieldTrafficController* dev_PriorityYieldTrafficController; 29 | extern SharedSignalGroup* dev_SharedSignalGroup; 30 | extern IndexType* dev_traffic_light_signal_groups; 31 | extern IndexType* dev_priority_ctrl_signal_groups; 32 | extern IndexType* dev_signal_group_cells; 33 | } // namespace aos_int_cuda 34 | } // namespace simulation 35 | 36 | __global__ void convert_to_ikra_cpp_cells( 37 | IndexType s_size_Cell, 38 | simulation::aos_int_cuda::Cell* s_Cell, 39 | IndexType s_size_outgoing_cells, 40 | IndexType* s_outgoing_cells, 41 | IndexType s_size_incoming_cells, 42 | IndexType* s_incoming_cells) { 43 | unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x; 44 | 45 | if (tid < s_size_Cell) { 46 | simulation::aos_int_cuda::Cell& cell = s_Cell[tid]; 47 | Car* car_ptr = cell.car_ == 4294967295 48 | ? nullptr : Car::get_uninitialized(cell.car_); 49 | 50 | Cell* new_cell = new(Cell::get_uninitialized(tid)) Cell( 51 | cell.max_velocity_, cell.x_, cell.y_, 52 | cell.num_incoming_cells_, 53 | s_incoming_cells + cell.first_incoming_cell_idx_, 54 | cell.num_outgoing_cells_, 55 | s_outgoing_cells + cell.first_outgoing_cell_idx_, 56 | car_ptr, cell.is_free_, cell.is_sink_, 57 | cell.controller_max_velocity_, 58 | (Cell::Type) cell.type_); 59 | assert(new_cell->id() == tid); 60 | } 61 | 62 | if (tid == 0) { 63 | Cell::storage().increase_size(s_size_Cell); 64 | } 65 | } 66 | 67 | __global__ void convert_to_ikra_cpp_cars( 68 | IndexType s_size_Car, 69 | simulation::aos_int_cuda::Car* s_Car, 70 | IndexType s_size_Cell) { 71 | unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x; 72 | 73 | if (tid < s_size_Car) { 74 | simulation::aos_int_cuda::Car& car = s_Car[tid]; 75 | 76 | // Every car must have a position. 77 | assert(car.position_ != 4294967295); 78 | assert(car.position_ < s_size_Cell); 79 | 80 | Cell* cell_ptr = car.position_ == 4294967295 81 | ? nullptr : Cell::get_uninitialized(car.position_); 82 | 83 | Car* new_car = new(Car::get_uninitialized(tid)) Car( 84 | car.is_active_, car.velocity_, car.max_velocity_, 85 | car.random_state_, cell_ptr); 86 | assert(new_car->id() == tid); 87 | } 88 | 89 | if (tid == 0) { 90 | Car::storage().increase_size(s_size_Car); 91 | new Simulation(); 92 | } 93 | } 94 | 95 | __global__ void convert_to_ikra_cpp_signal_groups( 96 | IndexType s_size_SharedSignalGroup, 97 | simulation::aos_int_cuda::SharedSignalGroup* s_SharedSignalGroup, 98 | IndexType* s_cells) { 99 | unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x; 100 | 101 | if (tid < s_size_SharedSignalGroup) { 102 | simulation::aos_int_cuda::SharedSignalGroup& group = 103 | s_SharedSignalGroup[tid]; 104 | 105 | SharedSignalGroup* new_group = 106 | new(SharedSignalGroup::get_uninitialized(tid)) SharedSignalGroup( 107 | group.num_cells_, s_cells + group.first_cell_idx_); 108 | assert(new_group->id() == tid); 109 | } 110 | 111 | if (tid == 0) { 112 | SharedSignalGroup::storage().increase_size(s_size_SharedSignalGroup); 113 | } 114 | } 115 | 116 | __global__ void convert_to_ikra_cpp_traffic_lights( 117 | IndexType s_size_TrafficLight, 118 | simulation::aos_int_cuda::TrafficLight* s_TrafficLight, 119 | IndexType* s_signal_groups) { 120 | unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x; 121 | 122 | if (tid < s_size_TrafficLight) { 123 | simulation::aos_int_cuda::TrafficLight& light = s_TrafficLight[tid]; 124 | 125 | TrafficLight* new_light = new(TrafficLight::get_uninitialized(tid)) 126 | TrafficLight(light.timer_, light.phase_time_, light.phase_, 127 | s_signal_groups + light.first_signal_group_idx_, 128 | light.num_signal_groups_); 129 | assert(new_light->id() == tid); 130 | } 131 | 132 | if (tid == 0) { 133 | TrafficLight::storage().increase_size(s_size_TrafficLight); 134 | } 135 | 136 | } 137 | 138 | __global__ void convert_to_ikra_cpp_priority_ctrls( 139 | IndexType s_size_PriorityYieldTrafficController, 140 | simulation::aos_int_cuda::PriorityYieldTrafficController* 141 | s_PriorityYieldTrafficController, 142 | IndexType* s_signal_groups) { 143 | unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x; 144 | 145 | if (tid < s_size_PriorityYieldTrafficController) { 146 | simulation::aos_int_cuda::PriorityYieldTrafficController& ctrl = 147 | s_PriorityYieldTrafficController[tid]; 148 | 149 | PriorityYieldTrafficController* new_ctrl = 150 | new(PriorityYieldTrafficController::get_uninitialized(tid)) 151 | PriorityYieldTrafficController( 152 | s_signal_groups + ctrl.first_group_idx_, 153 | ctrl.num_groups_); 154 | assert(new_ctrl->id() == tid); 155 | } 156 | 157 | if (tid == 0) { 158 | PriorityYieldTrafficController::storage().increase_size( 159 | s_size_PriorityYieldTrafficController); 160 | } 161 | } 162 | 163 | void convert_simulation() { 164 | assert(simulation::aos_int::s_size_Car <= kNumCars); 165 | assert(simulation::aos_int::s_size_Cell <= kNumCells); 166 | assert(simulation::aos_int::s_size_SharedSignalGroup 167 | <= kNumSharedSignalGroups); 168 | assert(simulation::aos_int::s_size_TrafficLight <= kNumTrafficLights); 169 | assert(simulation::aos_int::s_size_PriorityYieldTrafficController 170 | <= kNumPriorityCtrls); 171 | 172 | convert_to_ikra_cpp_cells<<>>( 173 | simulation::aos_int::s_size_Cell, 174 | simulation::aos_int_cuda::dev_Cell, 175 | simulation::aos_int::s_size_outgoing_cells, 176 | simulation::aos_int_cuda::dev_outgoing_cells, 177 | simulation::aos_int::s_size_incoming_cells, 178 | simulation::aos_int_cuda::dev_incoming_cells); 179 | gpuErrchk(cudaDeviceSynchronize()); 180 | 181 | convert_to_ikra_cpp_cars<<>>( 182 | simulation::aos_int::s_size_Car, 183 | simulation::aos_int_cuda::dev_Car, 184 | simulation::aos_int::s_size_Cell); 185 | gpuErrchk(cudaDeviceSynchronize()); 186 | 187 | convert_to_ikra_cpp_signal_groups<<>>( 188 | simulation::aos_int::s_size_SharedSignalGroup, 189 | simulation::aos_int_cuda::dev_SharedSignalGroup, 190 | simulation::aos_int_cuda::dev_signal_group_cells); 191 | gpuErrchk(cudaDeviceSynchronize()); 192 | 193 | convert_to_ikra_cpp_traffic_lights<<>>( 194 | simulation::aos_int::s_size_TrafficLight, 195 | simulation::aos_int_cuda::dev_TrafficLight, 196 | simulation::aos_int_cuda::dev_traffic_light_signal_groups); 197 | gpuErrchk(cudaDeviceSynchronize()); 198 | 199 | convert_to_ikra_cpp_priority_ctrls<<>>( 200 | simulation::aos_int::s_size_PriorityYieldTrafficController, 201 | simulation::aos_int_cuda::dev_PriorityYieldTrafficController, 202 | simulation::aos_int_cuda::dev_priority_ctrl_signal_groups); 203 | gpuErrchk(cudaDeviceSynchronize()); 204 | } 205 | -------------------------------------------------------------------------------- /example/traffic/statistics.inc: -------------------------------------------------------------------------------- 1 | 2 | #define STRINGIFY(s) STRINGIFY2(s) 3 | #define STRINGIFY2(val) #val 4 | 5 | #define PRINT_HISTORAM(class, function, maxval) \ 6 | { \ 7 | printf("------------------------------------------------\n"); \ 8 | int counter[maxval];\ 9 | uint64_t sum = 0; \ 10 | uint64_t squared_sum = 0; \ 11 | for (int i = 0; i < maxval; ++i) { \ 12 | counter[i] = 0; \ 13 | } \ 14 | for (uint32_t i = 0; i < class::size(); ++i) { \ 15 | counter[class::get(i)->function()]++; \ 16 | sum += class::get(i)->function(); \ 17 | squared_sum += class::get(i)->function() * class::get(i)->function(); \ 18 | } \ 19 | for (int i = 0; i < maxval; ++i) { \ 20 | if (counter[i] > 0) { \ 21 | printf(STRINGIFY(class) "::" STRINGIFY(function) "[%i] = %i [ %f % ]\n", \ 22 | i, counter[i], (float) counter[i] * 100.0f / class::size()); \ 23 | } \ 24 | } \ 25 | double mean = sum / (double) class::size(); \ 26 | double variance = (squared_sum - class::size() * mean * mean) \ 27 | / (class::size() - 1.0f); \ 28 | printf("Mean = %f, Variance = %f, stddev = %f\n", \ 29 | mean, variance, sqrt(variance)); \ 30 | } 31 | 32 | __global__ void print_histograms_1() { 33 | PRINT_HISTORAM(Car, velocity, 50); 34 | PRINT_HISTORAM(Car, max_velocity, 50); 35 | } 36 | 37 | __global__ void print_histograms_2() { 38 | PRINT_HISTORAM(SharedSignalGroup, num_cells, 50); 39 | PRINT_HISTORAM(TrafficLight, num_signal_groups, 50); 40 | } 41 | 42 | __global__ void print_histograms_3() { 43 | PRINT_HISTORAM(Cell, num_incoming_cells, 50); 44 | } 45 | 46 | __global__ void print_histograms_4() { 47 | PRINT_HISTORAM(Cell, num_outgoing_cells, 50); 48 | } 49 | 50 | __global__ void print_histograms_5() { 51 | PRINT_HISTORAM(Cell, street_max_velocity, 50); 52 | } 53 | 54 | __global__ void caluclate_checksum() { 55 | uint64_t c = 17; 56 | for (uint32_t i = 0; i < Car::size(); ++i) { 57 | Cell* position = Car::get(i)->position(); 58 | c += position->x() + position->y(); 59 | c %= UINT64_MAX; 60 | } 61 | int result = c % 1234567; 62 | 63 | printf("Checksum: %i\n", result); 64 | } 65 | 66 | void print_statistics(unsigned long time_cars, unsigned long time_controllers, 67 | unsigned long time_total) { 68 | unsigned long utilization_Cell = Cell::storage().arena_utilization(); 69 | unsigned long utilization_Car = Car::storage().arena_utilization(); 70 | unsigned long utilization_SharedSignalGroup = 71 | SharedSignalGroup::storage().arena_utilization(); 72 | unsigned long utilization_TrafficLight = 73 | TrafficLight::storage().arena_utilization(); 74 | unsigned long utilization_PriorityCtrl = 75 | PriorityYieldTrafficController::storage().arena_utilization(); 76 | unsigned long total_utilization = 77 | utilization_Cell + utilization_Car + utilization_SharedSignalGroup 78 | + utilization_TrafficLight + utilization_PriorityCtrl; 79 | 80 | unsigned long long int storage_Cell = Cell::size()*Cell::ObjectSize::value; 81 | unsigned long long int storage_Car = Car::size()*Car::ObjectSize::value; 82 | unsigned long long int storage_SignalGroup = 83 | SharedSignalGroup::size()*SharedSignalGroup::ObjectSize::value; 84 | unsigned long long int storage_TrafficLight = 85 | TrafficLight::size()*TrafficLight::ObjectSize::value; 86 | unsigned long long int storage_PriorityCtrl = 87 | PriorityYieldTrafficController::size() 88 | *PriorityYieldTrafficController::ObjectSize::value; 89 | 90 | unsigned long long int regular_storage_size = 91 | storage_Cell + storage_Car + storage_SignalGroup 92 | + storage_TrafficLight + storage_PriorityCtrl; 93 | 94 | fprintf(stderr, 95 | "%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu,%llu,%llu,%llu,%llu,%llu,%llu," 96 | STRINGIFY(BENCH_LAYOUT_MODE) "," 97 | STRINGIFY(BENCH_CELL_IN_MODE) "," 98 | STRINGIFY(BENCH_CELL_OUT_MODE) "," 99 | STRINGIFY(BENCH_CAR_MODE) "," 100 | STRINGIFY(BENCH_SIGNAL_GROUP_MODE) "," 101 | STRINGIFY(BENCH_TRAFFIC_LIGHT_MODE) "\n", 102 | time_cars, 103 | time_controllers, 104 | time_total, 105 | utilization_Cell, 106 | utilization_Car, 107 | utilization_SharedSignalGroup, 108 | utilization_TrafficLight, 109 | utilization_PriorityCtrl, 110 | total_utilization, 111 | storage_Cell, 112 | storage_Car, 113 | storage_SignalGroup, 114 | storage_TrafficLight, 115 | storage_PriorityCtrl, 116 | regular_storage_size); 117 | } 118 | -------------------------------------------------------------------------------- /example/traffic/traffic.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "executor/executor.h" 5 | #include "soa/soa.h" 6 | #include "executor/cuda_executor.h" 7 | 8 | #include "benchmark.h" 9 | #include "configuration.h" 10 | #include "simulation_adapter.h" 11 | #include "traffic-simulation/traffic_aos_int_cuda.h" 12 | 13 | using ikra::soa::SoaLayout; 14 | using ikra::soa::kAddressModeZero; 15 | using ikra::soa::StaticStorageWithArena; 16 | 17 | class Car; 18 | 19 | // TODO: Figure out how extern __device__ variables work in CUDA, so that we 20 | // can use normal header files instead of textual headers. 21 | #include "entities/cell.inc" 22 | #include "entities/car.inc" 23 | #include "entities/shared_signal_group.inc" 24 | #include "entities/traffic_light.inc" 25 | #include "entities/priority_yield_traffic_controller.inc" 26 | #include "entities/simulation.inc" 27 | #include "simulation_converter.inc" 28 | #include "statistics.inc" 29 | 30 | 31 | void run_traffic_controllers() { 32 | cuda_execute(&Simulation::step_random_state); 33 | cuda_execute(&TrafficLight::step); 34 | cuda_execute(&PriorityYieldTrafficController::step); 35 | cudaDeviceSynchronize(); 36 | } 37 | 38 | void run_cars() { 39 | cuda_execute(&Car::step_prepare_path); 40 | cudaDeviceSynchronize(); 41 | 42 | cuda_execute(&Car::step_move); 43 | cudaDeviceSynchronize(); 44 | 45 | cuda_execute(&Car::step_reactivate); 46 | cudaDeviceSynchronize(); 47 | } 48 | 49 | void benchmark() { 50 | uint64_t time_controllers[kNumBenchmarkRuns] = {0}; 51 | uint64_t time_cars[kNumBenchmarkRuns] = {0}; 52 | uint64_t time_total[kNumBenchmarkRuns] = {0}; 53 | 54 | for (uint32_t r = 0; r < kNumBenchmarkRuns; ++r) { 55 | Cell::initialize_storage(); 56 | Car::initialize_storage(); 57 | SharedSignalGroup::initialize_storage(); 58 | TrafficLight::initialize_storage(); 59 | PriorityYieldTrafficController::initialize_storage(); 60 | 61 | convert_simulation(); 62 | cuda_execute(&TrafficLight::initialize); 63 | cuda_execute(&PriorityYieldTrafficController::initialize); 64 | cudaDeviceSynchronize(); 65 | 66 | for (uint32_t i = 0; i < kNumIterations; ++i) { 67 | uint64_t t_ctrl = measure<>::execution(run_traffic_controllers); 68 | time_controllers[r] += t_ctrl; 69 | 70 | uint64_t t_car = measure<>::execution(run_cars); 71 | time_cars[r] += t_car; 72 | 73 | time_total[r] = time_controllers[r] + time_cars[r]; 74 | } 75 | gpuErrchk(cudaPeekAtLastError()); 76 | } 77 | 78 | // Find best run. 79 | uint64_t best_time = std::numeric_limits::max(); 80 | uint32_t best_index = -1; 81 | for (uint32_t r = 0; r < kNumBenchmarkRuns; ++r) { 82 | if (time_total[r] < best_time) { 83 | best_time = time_total[r]; 84 | best_index = r; 85 | } 86 | } 87 | 88 | // Print best run. 89 | print_statistics(time_cars[best_index]/1000, 90 | time_controllers[best_index]/1000, 91 | time_total[best_index]/1000); 92 | } 93 | 94 | int main(int argc, char** argv) { 95 | load_simulation(argc, argv, kNumCars); 96 | benchmark(); 97 | 98 | print_histograms_1<<<1, 1>>>(); 99 | gpuErrchk(cudaDeviceSynchronize()); 100 | print_histograms_2<<<1, 1>>>(); 101 | gpuErrchk(cudaDeviceSynchronize()); 102 | print_histograms_3<<<1, 1>>>(); 103 | gpuErrchk(cudaDeviceSynchronize()); 104 | print_histograms_4<<<1, 1>>>(); 105 | gpuErrchk(cudaDeviceSynchronize()); 106 | print_histograms_5<<<1, 1>>>(); 107 | gpuErrchk(cudaDeviceSynchronize()); 108 | caluclate_checksum<<<1,1>>>(); 109 | gpuErrchk(cudaDeviceSynchronize()); 110 | } 111 | -------------------------------------------------------------------------------- /ikra/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(ikra INTERFACE) 2 | target_include_directories(ikra INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) 3 | 4 | 5 | -------------------------------------------------------------------------------- /ikra/executor/executor.h: -------------------------------------------------------------------------------- 1 | #ifndef EXECUTOR_EXECUTOR_H 2 | #define EXECUTOR_EXECUTOR_H 3 | 4 | // Asserts active only in debug mode (NDEBUG). 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include "executor/util.h" 12 | #include "soa/constants.h" 13 | 14 | namespace ikra { 15 | namespace executor { 16 | 17 | using ikra::soa::IndexType; 18 | 19 | // A convenience class storing two iterators, i.e., a range of SOA objects. 20 | template 21 | class IteratorRange { 22 | public: 23 | IteratorRange(T begin, T end) : begin_(begin), end_(end) {} 24 | 25 | T begin() { 26 | return begin_; 27 | } 28 | 29 | T end() { 30 | return end_; 31 | } 32 | 33 | private: 34 | T begin_; 35 | T end_; 36 | }; 37 | 38 | template 39 | T* construct(size_t count, Args... args) { 40 | assert(count > 0); 41 | T* first = new T(args...); 42 | for (size_t i = 1; i < count; ++i) { 43 | new T(args...); 44 | } 45 | return first; 46 | } 47 | 48 | // This function executes a method on all SOA objects that are enumerated by an 49 | // iterator (between begin and end). This is currently a fully sequential 50 | // operation. 51 | template 52 | void execute(T begin, T end, F function, Args... args) { 53 | for (auto iter = begin; iter != end; ++iter) { 54 | // TODO: Should we override and use operator->* here? 55 | ((**iter).*function)(args...); 56 | } 57 | } 58 | 59 | // This function executes a method on all SOA objects that are enumerated by an 60 | // iterator (between begin and end). It also reduces all return values. 61 | template 62 | D execute_and_reduce(T begin, T end, F function, G reduce_function, 63 | D default_value, Args... args) { 64 | if (begin == end) { 65 | return default_value; 66 | } 67 | 68 | D reduced_value = ((**begin).*function)(args...); 69 | for (auto iter = ++begin; iter != end; ++iter) { 70 | // TODO: Should we override and use operator->* here? 71 | reduced_value = reduce_function(reduced_value, 72 | ((**iter).*function)(args...)); 73 | } 74 | return reduced_value; 75 | } 76 | 77 | // This function executes a method on all SOA objects that are enumerated by an 78 | // iterator (between begin and end). This is currently a fully sequential 79 | // operation. 80 | template 81 | void execute(IteratorRange range, F function, Args... args) { 82 | for (auto iter = range.begin(); iter != range.end(); ++iter) { 83 | ((**iter).*function)(args...); 84 | } 85 | } 86 | 87 | // This function executes a method on all SOA objects that are enumerated by an 88 | // iterator (length many elements). This is currently a fully sequential 89 | // operation. 90 | template 91 | void execute(T begin, IndexType length, F function, Args... args) { 92 | T iter = begin; 93 | for (uint32_t i = 0; i < length; ++i, ++iter) { 94 | ((**iter).*function)(args...); 95 | } 96 | } 97 | 98 | // This function executes a method on all SOA objects of a given class. 99 | // This is currently a fully sequential operation. 100 | template::class_type> 102 | void execute(F function, Args... args) { 103 | IndexType num_instances = T::size(); 104 | for (IndexType i = 0; i < num_instances; ++i) { 105 | ((*T::get(i)).*function)(args...); 106 | } 107 | } 108 | 109 | // This function executes a method on all SOA objects of a given class, 110 | // starting from ID 0 up to a given ID (template parameter). 111 | // This is currently a fully sequential operation. 112 | template::class_type> 114 | void execute(F function, Args... args) { 115 | for (IndexType i = 0; i < MaxId; ++i) { 116 | ((*T::get(i)).*function)(args...); 117 | } 118 | } 119 | 120 | // This function executes a method on all SOA objects of a given class and 121 | // reduces the return values. 122 | template::class_type> 124 | D execute_and_reduce(F function, G reduce_function, D default_value, 125 | Args... args) { 126 | return execute_and_reduce(T::begin(), T::end(), 127 | function, reduce_function, default_value, args...); 128 | } 129 | 130 | } // namespace executor 131 | } // namespace ikra 132 | 133 | #endif // EXECUTOR_EXECUTOR_H 134 | -------------------------------------------------------------------------------- /ikra/executor/iterator.h: -------------------------------------------------------------------------------- 1 | #ifndef EXECUTOR_ITERATOR_H 2 | #define EXECUTOR_ITERATOR_H 3 | 4 | #include 5 | #include "soa/constants.h" 6 | 7 | namespace ikra { 8 | namespace executor { 9 | 10 | using ikra::soa::IndexType; 11 | 12 | // This class implements an iterator for SOA objects, starting a given ID. It 13 | // points to an object with ID i and forwards the pointer to i + 1. This class 14 | // can handle both valid and zero addressing mode. 15 | template 16 | class Iterator { 17 | private: 18 | using InnerT = typename std::remove_pointer::type; 19 | 20 | public: 21 | // This iterator should be used on pointers to SOA objects. 22 | static_assert(std::is_pointer::value, "Expected pointer type."); 23 | 24 | T& operator*() { 25 | return position_; 26 | } 27 | 28 | // This iterator can go in both directions. 29 | template 30 | typename std::enable_if>::type& 31 | operator+=(IndexType distance) { 32 | auto next = reinterpret_cast(position_) + distance; 33 | position_ = reinterpret_cast(next); 34 | return *this; 35 | } 36 | 37 | template 38 | typename std::enable_if>::type& 39 | operator+=(IndexType distance) { 40 | auto next = reinterpret_cast(position_) + A*distance; 41 | position_ = reinterpret_cast(next); 42 | return *this; 43 | } 44 | 45 | template 46 | typename std::enable_if>::type& 47 | operator-=(IndexType distance) { 48 | auto next = reinterpret_cast(position_) - distance; 49 | position_ = reinterpret_cast(next); 50 | return *this; 51 | } 52 | 53 | template 54 | typename std::enable_if>::type& 55 | operator-=(IndexType distance) { 56 | auto next = reinterpret_cast(position_) - A*distance; 57 | position_ = reinterpret_cast(next); 58 | return *this; 59 | } 60 | 61 | template 62 | typename std::enable_if>::type 63 | operator+(IndexType delta) const { 64 | Iterator copy = *this; 65 | auto new_pos = reinterpret_cast(copy.position_) + delta; 66 | copy.position_ = reinterpret_cast(new_pos); 67 | return copy; 68 | } 69 | 70 | template 71 | typename std::enable_if>::type 72 | operator+(IndexType delta) const { 73 | Iterator copy = *this; 74 | auto new_pos = reinterpret_cast(copy.position_) + A*delta; 75 | copy.position_ = reinterpret_cast(new_pos); 76 | return copy; 77 | } 78 | 79 | Iterator& operator++() { 80 | return this->operator+=(1); 81 | } 82 | 83 | Iterator& operator--() { 84 | return this->operator-=(1); 85 | } 86 | 87 | bool operator!=(Iterator other) const { 88 | return position_ != other.position_; 89 | } 90 | 91 | bool operator==(Iterator other) const { 92 | return position_ == other.position_; 93 | } 94 | 95 | Iterator(T position) : position_(position) {} 96 | 97 | private: 98 | T position_; 99 | }; 100 | 101 | // Helper function/constructor for Iterator for automatic template parameter 102 | // deduction. 103 | template 104 | Iterator make_iterator(T position) { 105 | return Iterator(position); 106 | } 107 | 108 | } // namespace executor 109 | } // namespace ikra 110 | 111 | #endif // EXECUTOR_ITERATOR_H -------------------------------------------------------------------------------- /ikra/executor/util.h: -------------------------------------------------------------------------------- 1 | #ifndef EXECUTOR_UTIL_H 2 | #define EXECUTOR_UTIL_H 3 | 4 | namespace ikra { 5 | namespace executor { 6 | 7 | template 8 | struct FunctionTypeHelper; 9 | 10 | template 11 | struct FunctionTypeHelper { 12 | using return_type = R; 13 | using class_type = C; 14 | }; 15 | 16 | } // namespace executor 17 | } // namespace ikra 18 | 19 | #endif // EXECUTOR_UTIL_H 20 | -------------------------------------------------------------------------------- /ikra/soa/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(soa INTERFACE) 2 | target_include_directories(soa INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/..) 3 | -------------------------------------------------------------------------------- /ikra/soa/addressable_field_shared.inc: -------------------------------------------------------------------------------- 1 | // This file should be included for field types that have an address. E.g., 2 | // AOS-style arrays have a single address, but SOA-style arrays do not. 3 | 4 | public: 5 | // TODO: Disable operator if T is a SOA class. SOA object pointer cannot be 6 | // dereferenced. 7 | __ikra_device__ T& operator*() const { 8 | return *data_ptr(); 9 | } 10 | 11 | __ikra_device__ T* operator->() const { 12 | return data_ptr(); 13 | } 14 | 15 | __ikra_device__ T* operator&() const { 16 | return data_ptr(); 17 | } 18 | 19 | // Get the value of this field. This method is not usually needed and the 20 | // preferred way to retrieve a value is through implicit conversion. 21 | __ikra_device__ T& get() const { 22 | return *data_ptr(); 23 | } 24 | 25 | #if defined(__CUDA_ARCH__) || !defined(__CUDACC__) 26 | // Operator for implicit conversion to type T. Either running device code or 27 | // not running in CUDA mode at all. 28 | __ikra_device__ operator T&() const { 29 | return *data_ptr(); 30 | } 31 | 32 | // Assignment operator. 33 | __ikra_device__ Self& operator=(T value) { 34 | *data_ptr() = value; 35 | return *this; 36 | } 37 | #else 38 | T* device_data_ptr() const { 39 | auto h_data_ptr = reinterpret_cast(data_ptr()); 40 | auto h_storage_data = reinterpret_cast(&Owner::storage()); 41 | auto data_offset = h_data_ptr - h_storage_data; 42 | auto d_storage_ptr = reinterpret_cast( 43 | Owner::storage().device_ptr()); 44 | return reinterpret_cast(d_storage_ptr + data_offset); 45 | } 46 | 47 | void copy_from_device(T* target) const { 48 | cudaMemcpy(target, device_data_ptr(), sizeof(T), cudaMemcpyDeviceToHost); 49 | } 50 | 51 | T copy_from_device() const { 52 | T host_data; 53 | copy_from_device(&host_data); 54 | return host_data; 55 | } 56 | 57 | // Operator for implicit conversion to type T. Running in CUDA mode on the 58 | // host. Data must be copied. 59 | // TODO: This method is broken when compiling in CUDA mode but host execution 60 | // is intended. 61 | operator T() const { 62 | return copy_from_device(); 63 | } 64 | 65 | // Assignment operator. 66 | // TODO: Probably need to handle SOA pointer differently here. 67 | Self& operator=(T value) { 68 | cudaMemcpy(device_data_ptr(), &value, sizeof(T), cudaMemcpyHostToDevice); 69 | return *this; 70 | } 71 | #endif // __CUDA_ARCH__ 72 | 73 | protected: 74 | __ikra_device__ T* data_ptr() const { 75 | return data_ptr_uninitialized(); 76 | } 77 | 78 | // Calculate the address of this field based on the "this" pointer of this 79 | // Field instance. 80 | template 81 | __ikra_device__ typename std::enable_if::type 83 | data_ptr_uninitialized() const { 84 | auto p_this = reinterpret_cast(this); 85 | auto p_base = reinterpret_cast(Owner::storage().data_ptr()); 86 | auto p_result = (p_this - p_base - A)/A*sizeof(T) + p_base + 87 | (Capacity+1)*Offset; 88 | return reinterpret_cast(p_result); 89 | } 90 | 91 | // Calculate the address of this field based on the "this" pointer of this 92 | // Field instance. 93 | template 94 | __ikra_device__ typename std::enable_if::type 97 | data_ptr_uninitialized() const { 98 | // TODO: Fix field alignment. 99 | return reinterpret_cast( 100 | reinterpret_cast(const_cast(this)) + Offset); 101 | } 102 | 103 | template 104 | __ikra_device__ typename std::enable_if::type 107 | data_ptr_uninitialized() const { 108 | // Use constant-folded value for address computation. 109 | constexpr auto cptr_data_offset = 110 | StorageDataOffset::value; 111 | constexpr auto cptr_storage_buffer = Owner::storage_buffer(); 112 | constexpr auto array_location = 113 | cptr_storage_buffer + cptr_data_offset + (Capacity+1)*Offset; 114 | 115 | #ifdef __clang__ 116 | // Clang does not allow reinterpret_cast in constexprs. 117 | constexpr T* soa_array = IKRA_fold(reinterpret_cast(array_location)); 118 | #else 119 | constexpr T* soa_array = reinterpret_cast(array_location); 120 | #endif // __clang__ 121 | 122 | // Check alignment. 123 | assert(reinterpret_cast(soa_array) % 8 == 0); 124 | 125 | return soa_array + reinterpret_cast(this); 126 | } 127 | 128 | template 129 | __ikra_device__ typename std::enable_if::type 132 | data_ptr_uninitialized() const { 133 | // Cannot constant fold dynamically allocated storage. 134 | auto p_base = Owner::storage().data_reference(); 135 | return reinterpret_cast( 136 | p_base + (Capacity+1)*Offset + 137 | reinterpret_cast(this)*sizeof(T)); 138 | } 139 | 140 | -------------------------------------------------------------------------------- /ikra/soa/array_shared.inc: -------------------------------------------------------------------------------- 1 | // This implementation is shared by all array classes. B is the base type 2 | // of the array. The most important functionality in here is interoperability 3 | // between host and device code. 4 | 5 | public: 6 | // Implement std::array interface. 7 | 8 | #if defined(__CUDA_ARCH__) || !defined(__CUDACC__) 9 | // Not running in CUDA mode or running on device: Can return a reference to 10 | // array values. 11 | 12 | __ikra_device__ B& operator[](size_t pos) const { 13 | return *this->array_data_ptr(pos); 14 | } 15 | 16 | __ikra_device__ B& at(size_t pos) const { 17 | // TODO: This function should throw an exception. 18 | assert(pos < ArraySize); 19 | return this->operator[](pos); 20 | } 21 | 22 | template 23 | __ikra_device__ B& at() const { 24 | static_assert(Pos < ArraySize, "Index out of bounds."); 25 | return *array_data_ptr(); 26 | } 27 | 28 | __ikra_device__ B& front() const { 29 | return at<0>(); 30 | } 31 | 32 | __ikra_device__ B& back() const { 33 | return at(); 34 | } 35 | #else 36 | 37 | // A helper class with an overridden operator= method. This class allows 38 | // clients to the "[]=" syntax for array assignment, even if the array is 39 | // physically located on the device. 40 | // Addresses of instances point to host data locations. 41 | class AssignmentHelper { 42 | public: 43 | // TODO: Assuming zero addressing mode. Must translate addresses in valid 44 | // addressing mode. 45 | void copy_from_device(B* target) { 46 | auto dev_ptr = device_ptr(); 47 | cudaMemcpy(target, dev_ptr, sizeof(B), cudaMemcpyDeviceToHost); 48 | assert(cudaPeekAtLastError() == cudaSuccess); 49 | } 50 | 51 | B copy_from_device() { 52 | B host_data; 53 | copy_from_device(&host_data); 54 | return host_data; 55 | } 56 | 57 | // Implicit conversion: Copy from device. 58 | operator B() { 59 | return copy_from_device(); 60 | } 61 | 62 | // TODO: Assuming zero addressing mode. 63 | AssignmentHelper& operator=(B value) { 64 | cudaMemcpy(device_ptr(), &value, sizeof(B), cudaMemcpyHostToDevice); 65 | assert(cudaPeekAtLastError() == cudaSuccess); 66 | return *this; 67 | } 68 | 69 | AssignmentHelper& operator+=(B value) { 70 | // TODO: Implement. 71 | printf("Warning: Calling unimplemented function AssignmentHelper+=.\n"); 72 | assert(false); 73 | } 74 | 75 | B operator->() { 76 | return copy_from_device(); 77 | } 78 | 79 | private: 80 | B* device_ptr() { 81 | return Owner::storage().translate_address_host_to_device( 82 | reinterpret_cast(this)); 83 | } 84 | }; 85 | 86 | AssignmentHelper& operator[](size_t pos) const { 87 | return *reinterpret_cast(array_data_ptr(pos)); 88 | } 89 | 90 | AssignmentHelper& at(size_t pos) const { 91 | // TODO: This function should throw an exception. 92 | assert(pos < ArraySize); 93 | return this->operator[](pos); 94 | } 95 | 96 | // TODO: Implement template-based accessor methods. 97 | #endif 98 | -------------------------------------------------------------------------------- /ikra/soa/class_initialization.h: -------------------------------------------------------------------------------- 1 | #ifndef SOA_CLASS_INITIALIZATION_H 2 | #define SOA_CLASS_INITIALIZATION_H 3 | 4 | #include "soa/util.h" 5 | 6 | #define IKRA_INITIALIZE_CLASS \ 7 | /* This class is used to calculate field offsets inside a SOA class. For every 8 | * field declaration, a new class specialization will be generated using 9 | * macro expansion. "N" is index of the field and "value" is the offset of 10 | * that field in bytes. "DummyT" is required only because full template 11 | * specializations are not allowed for nested classes in C++. */ \ 12 | template \ 13 | struct OffsetCounter { \ 14 | /* Used by ObjectSize_. Indicates that the no more fields are available. */ \ 15 | static const bool kReachedGeneralCase = true; \ 16 | \ 17 | static void DBG_print_offsets() { /* Nothing to do. */ } \ 18 | }; \ 19 | \ 20 | /* Base case for first field declaration. */ \ 21 | template \ 22 | struct OffsetCounter<0, DummyT> { \ 23 | /* Offset of field 0 in bytes. */ \ 24 | static const uint32_t value = 0; \ 25 | \ 26 | /* Used by ObjectSize_. Indicates that this template instantiation was 27 | * generated by a field declaration. */ \ 28 | static const bool kIsSpecialization = true; \ 29 | \ 30 | static void DBG_print_offsets() { \ 31 | printf("OffsetCounter<%i>::value = %u\n", 0, value); \ 32 | OffsetCounter<1>::DBG_print_offsets(); \ 33 | } \ 34 | }; \ 35 | \ 36 | /* This class calculates object size and number of fields. 37 | * Note: This class cannot be used from within a SOA class. Only when all 38 | * fields are defined, members of this class may be accessed. */ \ 39 | template \ 40 | struct ObjectSize_ { \ 41 | private: \ 42 | template \ 43 | static typename std::enable_if< \ 44 | OffsetCounter::kIsSpecialization, \ 45 | std::integral_constant::kNumFields>>::type \ 46 | sfinae_dummy() {} \ 47 | \ 48 | template \ 49 | static typename std::enable_if< \ 50 | OffsetCounter::kReachedGeneralCase, \ 51 | std::integral_constant>::type \ 52 | sfinae_dummy() {} \ 53 | \ 54 | public: \ 55 | static const int kNumFields = decltype(sfinae_dummy())::value; \ 56 | static const uintptr_t value = OffsetCounter::value; \ 57 | }; \ 58 | \ 59 | /* Alias for simplier access of object size. */ \ 60 | using ObjectSize = ObjectSize_<0>; \ 61 | /* Function that returns a reference to the storage buffer. 62 | * Will be inlined. Pointer dereference will be optimized out. */ \ 63 | __ikra_host_device__ static constexpr char* storage_buffer(); \ 64 | \ 65 | static const uint32_t kCounterFirstIndex = __COUNTER__; \ 66 | /* Debug function printing offsets. */ \ 67 | static void DBG_print_offsets() { \ 68 | OffsetCounter<0>::DBG_print_offsets(); \ 69 | } 70 | 71 | #endif // SOA_CLASS_INITIALIZATION_H 72 | -------------------------------------------------------------------------------- /ikra/soa/constants.h: -------------------------------------------------------------------------------- 1 | #ifndef SOA_CONSTANTS_H 2 | #define SOA_CONSTANTS_H 3 | 4 | #include 5 | 6 | namespace ikra { 7 | namespace soa { 8 | 9 | // In Zero Addressing Mode, the address of an object is its ID. E.g., the 10 | // address of the first object (ID 0) is nullptr. Pointer arithmetics is not 11 | // possible in Zero Addressing Mode, because the size of a SOA class is zero. 12 | // I.e., sizeof(MyClass) = 0. Increasing the size to 1 does not work, because 13 | // instance create will attempt to zero-initialize data at an invalid address. 14 | // Zero Addressing Mode results in efficient assembly code for field reads/ 15 | // writes. The address of a field `i` in object with ID t is defined as: 16 | // this*sizeof(type(i)) + Capacity*Offset(i) + ClassStorageBase. 17 | // Notice that the second operand is a compile-time constant if the container 18 | // size (max. number of elements of a class) is a compile-time constant. 19 | // See test/soa/benchmarks/codegen_test.cc for inspection of assembly code. 20 | static const int kAddressModeZero = 0; 21 | 22 | // In Valid Addressing Mode, the address of an object is the address of its 23 | // first field, i.e., the beginning of the class storage data chunk 24 | // (ClassStorageBase) plus ID*sizeof(first field type). The size of an object 25 | // now defined as the size of the first field, allowing for pointer arithmetics 26 | // on objects. Generated assembly code is less efficient, because the address 27 | // of a field is now complex: 28 | // (this - ClassStorageBase - sizeof(first field type)) 29 | // / sizeof(first field type) * sizeof(type(i)) + 30 | // Capacity*Offset(i) + ClassStorageBase 31 | // See test/soa/pointer_arithmetics_test.cc for pointer arithmetics examples. 32 | static const int kAddressModeValid = 1; 33 | 34 | static const int kLayoutModeAos = 0; 35 | 36 | static const int kLayoutModeSoa = 1; 37 | 38 | // The type that is used to represent indices. 39 | using IndexType = uintptr_t; 40 | 41 | } // namespace soa 42 | } // namespace ikra 43 | 44 | #endif // SOA_CONSTANTS_H 45 | -------------------------------------------------------------------------------- /ikra/soa/cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef SOA_CUDA_H 2 | #define SOA_CUDA_H 3 | 4 | #ifdef __CUDACC__ 5 | 6 | #ifdef __CUDA_ARCH__ 7 | // Make all SOA functions device functions when compiling device code. 8 | #define __ikra_device__ __device__ 9 | #else 10 | #define __ikra_device__ 11 | #endif // __CUDA_ARCH__ 12 | 13 | #define __ikra_host_device__ __host__ __device__ 14 | 15 | // Error checking code taken from: 16 | // https://stackoverflow.com/questions/14038589/ 17 | // what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api 18 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 19 | inline void gpuAssert(cudaError_t code, const char *file, int line, 20 | bool abort = true) { 21 | if (code != cudaSuccess) { 22 | fprintf(stderr,"GPUassert: %s %s %d\n", 23 | cudaGetErrorString(code), file, line); 24 | if (abort) exit(code); 25 | } 26 | } 27 | 28 | #else 29 | 30 | #define __ikra_device__ 31 | #define __ikra_host_device__ 32 | 33 | #endif // __CUDACC__ 34 | 35 | #endif // SOA_CUDA_H 36 | -------------------------------------------------------------------------------- /ikra/soa/field.h: -------------------------------------------------------------------------------- 1 | #ifndef SOA_FIELD_H 2 | #define SOA_FIELD_H 3 | 4 | #include "soa/constants.h" 5 | #include "soa/cuda.h" 6 | #include "soa/storage.h" 7 | #include "soa/util.h" 8 | 9 | // This marco is expanded for every inplace assignment operator and forwards 10 | // the operation to the wrapped data. 11 | #define IKRA_DEFINE_FIELD_ASSIGNMENT(symbol) \ 12 | __ikra_device__ Self operator symbol ## = (T value) { \ 13 | *data_ptr() symbol ## = value; \ 14 | return *this; \ 15 | } 16 | 17 | 18 | namespace ikra { 19 | namespace soa { 20 | 21 | template 28 | class Field_ { 29 | private: 30 | using Self = Field_; 32 | 33 | public: 34 | static const uint32_t DBG_OFFSET = Offset; // For debugging. 35 | 36 | // This class may only be used to declare fields of classes. 37 | void* operator new(size_t count) = delete; 38 | 39 | // Define inplace assignment operators. 40 | IKRA_DEFINE_FIELD_ASSIGNMENT(+); 41 | IKRA_DEFINE_FIELD_ASSIGNMENT(-); 42 | IKRA_DEFINE_FIELD_ASSIGNMENT(*); 43 | IKRA_DEFINE_FIELD_ASSIGNMENT(/); 44 | IKRA_DEFINE_FIELD_ASSIGNMENT(%); 45 | IKRA_DEFINE_FIELD_ASSIGNMENT(&); 46 | IKRA_DEFINE_FIELD_ASSIGNMENT(|); 47 | IKRA_DEFINE_FIELD_ASSIGNMENT(^); 48 | IKRA_DEFINE_FIELD_ASSIGNMENT(<<); 49 | IKRA_DEFINE_FIELD_ASSIGNMENT(>>); 50 | 51 | // TODO: Implement special operators: 52 | // http://en.cppreference.com/w/cpp/language/operator_logical 53 | 54 | protected: 55 | #if defined(__CUDA_ARCH__) || !defined(__CUDACC__) 56 | // Only Owner can create new fields for itself. 57 | friend Owner; 58 | #else 59 | // Only Owner can create new fields for itself. 60 | // Friend template is broken. This is an nvcc bug. 61 | friend typename NvccWorkaroundIdentityClassHolder::type; 62 | #endif 63 | 64 | __ikra_device__ Field_() {} 65 | 66 | // Initialize the field with a given value. Used in class constructor. 67 | __ikra_device__ Field_(T value) { 68 | *this = value; 69 | } 70 | 71 | // Initialize the field with the value of another field. Used in class 72 | // constructor. TODO: Unclear why exactly we need this one... 73 | __ikra_device__ Field_(const Field_& /*other*/) {} 74 | 75 | // Not sure why we need this. Required to make field inplace initialization 76 | // work. 77 | __ikra_device__ Field_(Field_&& /*other*/) {} 78 | 79 | 80 | #include "soa/field_shared.inc" 81 | #include "soa/addressable_field_shared.inc" 82 | }; 83 | 84 | } // namespace soa 85 | } // namespace ikra 86 | 87 | #undef IKRA_DEFINE_FIELD_ASSIGNMENT 88 | 89 | #endif // SOA_FIELD_H 90 | -------------------------------------------------------------------------------- /ikra/soa/field_shared.inc: -------------------------------------------------------------------------------- 1 | // This file contains methods that are common to Field_ and 2 | // FullyInlinedArrayField_. Cannot use inheritance here because sizeof 3 | // cannot be set to 0 in CUDA with inheritance. 4 | 5 | protected: 6 | template 7 | __ikra_device__ 8 | typename std::enable_if::type 9 | id() const { 10 | return (reinterpret_cast(this) - 11 | reinterpret_cast(Owner::storage().data_ptr())) 12 | / A - 1 - 1; 13 | } 14 | 15 | template 16 | __ikra_device__ 17 | typename std::enable_if::type 18 | id() const { 19 | static_assert(sizeof(Self) == 0, 20 | "Zero addressing mode not supported by compiler."); 21 | // TODO: Check if there's a slowdown here due to forwarding this. 22 | return reinterpret_cast(this)->id(); 23 | } 24 | 25 | // Force size of this class to be 0. 26 | char dummy_[0]; 27 | 28 | -------------------------------------------------------------------------------- /ikra/soa/field_type_generator.h: -------------------------------------------------------------------------------- 1 | // This macro is used when generating field types. A field type in user code 2 | // such as int__(idx) should expand to an offset counter increment and a field 3 | // type template instantiation (e.g., int_) at the computed offset. 4 | // Note: This macro must not be undefined even though it is only called within 5 | // this class. It is used from other macros and cannot be called anymore when 6 | // undefined here. 7 | #define IKRA_FIELD_TYPE_GENERATOR(type, field_id) \ 8 | template \ 9 | struct OffsetCounter { \ 10 | static const uint32_t value = OffsetCounter::value + \ 11 | sizeof(type); \ 12 | static const bool kIsSpecialization = true; \ 13 | static void DBG_print_offsets() { \ 14 | printf("OffsetCounter<%i>::value = %u\n", field_id + 1, value); \ 15 | OffsetCounter::DBG_print_offsets(); \ 16 | } \ 17 | }; \ 18 | soa_ ## type::value> 19 | 20 | #define IKRA_ARRAY_FIELD_TYPE_GENERATOR(type, size, layout, field_id) \ 21 | template \ 22 | struct OffsetCounter { \ 23 | static const uint32_t value = OffsetCounter::value + \ 24 | array::layout::value>::kSize; \ 25 | static const bool kIsSpecialization = true; \ 26 | static void DBG_print_offsets() { \ 27 | printf("OffsetCounter<%i>::value = %u [ ARRAY ]\n", \ 28 | field_id + 1, value); \ 29 | OffsetCounter::DBG_print_offsets(); \ 30 | } \ 31 | }; \ 32 | array::layout::value> 33 | 34 | #define IKRA_CUSTOM_FIELD_TYPE_GENERATOR(type, field_id) \ 35 | template \ 36 | struct OffsetCounter { \ 37 | static const uint32_t value = OffsetCounter::value + \ 38 | sizeof(type); \ 39 | static const bool kIsSpecialization = true; \ 40 | static void DBG_print_offsets() { \ 41 | printf("OffsetCounter<%i>::value = %u [ CUSTOM ]\n", \ 42 | field_id + 1, value); \ 43 | OffsetCounter::DBG_print_offsets(); \ 44 | } \ 45 | }; \ 46 | Field::value> 47 | 48 | // Generate types that keep track of offsets by themselves. Add more types 49 | // as needed. 50 | #define bool__(field_id) IKRA_FIELD_TYPE_GENERATOR(bool, field_id) 51 | #define char__(field_id) IKRA_FIELD_TYPE_GENERATOR(char, field_id) 52 | #define double__(field_id) IKRA_FIELD_TYPE_GENERATOR(double, field_id) 53 | #define float__(field_id) IKRA_FIELD_TYPE_GENERATOR(float, field_id) 54 | #define int__(field_id) IKRA_FIELD_TYPE_GENERATOR(int, field_id) 55 | #define uint8_t__(field_id) IKRA_FIELD_TYPE_GENERATOR(uint8_t, field_id) 56 | #define uint16_t__(field_id) IKRA_FIELD_TYPE_GENERATOR(uint16_t, field_id) 57 | #define uint32_t__(field_id) IKRA_FIELD_TYPE_GENERATOR(uint32_t, field_id) 58 | #define uint64_t__(field_id) IKRA_FIELD_TYPE_GENERATOR(uint64_t, field_id) 59 | 60 | #define array__(...) PP_CONCAT(ARRAY__, PP_NARG(__VA_ARGS__))(__VA_ARGS__) 61 | #define ARRAY__4(type, size, layout, field_id) \ 62 | IKRA_ARRAY_FIELD_TYPE_GENERATOR(type, size, layout, field_id) 63 | #define ARRAY__3(type, size, field_id) ARRAY_3(type, size, soa, field_id) 64 | #define ARRAY__2(arg1, arg2) \ 65 | static_assert(false, "At least three arguments required for array__.") 66 | #define ARRAY__1(arg1) ARRAY_2(0, 0) 67 | #define ARRAY__0() ARRAY_2(0, 0) 68 | 69 | #define field__(type, field_id) \ 70 | IKRA_CUSTOM_FIELD_TYPE_GENERATOR(type, field_id) 71 | #define ref__(type, field_id) \ 72 | IKRA_CUSTOM_FIELD_TYPE_GENERATOR(type*, field_id) 73 | 74 | // Generate types that keep track of offsets and field indices by themselves. 75 | // Add more types are needed. 76 | #define IKRA_NEXT_FIELD_ID __COUNTER__ - kCounterFirstIndex - 1 77 | #define bool_ IKRA_FIELD_TYPE_GENERATOR(bool, IKRA_NEXT_FIELD_ID) 78 | #define char_ IKRA_FIELD_TYPE_GENERATOR(char, IKRA_NEXT_FIELD_ID) 79 | #define double_ IKRA_FIELD_TYPE_GENERATOR(double, IKRA_NEXT_FIELD_ID) 80 | #define float_ IKRA_FIELD_TYPE_GENERATOR(float, IKRA_NEXT_FIELD_ID) 81 | #define int_ IKRA_FIELD_TYPE_GENERATOR(int, IKRA_NEXT_FIELD_ID) 82 | #define uint8_t_ IKRA_FIELD_TYPE_GENERATOR(uint8_t, IKRA_NEXT_FIELD_ID) 83 | #define uint16_t_ IKRA_FIELD_TYPE_GENERATOR(uint16_t, IKRA_NEXT_FIELD_ID) 84 | #define uint32_t_ IKRA_FIELD_TYPE_GENERATOR(uint32_t, IKRA_NEXT_FIELD_ID) 85 | #define uint64_t_ IKRA_FIELD_TYPE_GENERATOR(uint64_t, IKRA_NEXT_FIELD_ID) 86 | 87 | #define array_(...) PP_CONCAT(ARRAY_, PP_NARG(__VA_ARGS__))(__VA_ARGS__) 88 | #define ARRAY_3(type, size, layout) \ 89 | IKRA_ARRAY_FIELD_TYPE_GENERATOR(type, size, layout, IKRA_NEXT_FIELD_ID) 90 | #define ARRAY_2(type, size) ARRAY_3(type, size, soa) 91 | #define ARRAY_1(arg) \ 92 | static_assert(false, "At least two arguments required for array_.") 93 | #define ARRAY_0() ARRAY_1(0) 94 | 95 | #define field_(type) IKRA_CUSTOM_FIELD_TYPE_GENERATOR(type, IKRA_NEXT_FIELD_ID) 96 | #define ref_(type) IKRA_CUSTOM_FIELD_TYPE_GENERATOR(type*, IKRA_NEXT_FIELD_ID) 97 | -------------------------------------------------------------------------------- /ikra/soa/preprocessor.h: -------------------------------------------------------------------------------- 1 | // Code adapted from: 2 | // https://stackoverflow.com/questions/3046889/optional-parameters-with-c-macros 3 | 4 | #ifndef SOA_PREPROCESSOR_H 5 | #define SOA_PREPROCESSOR_H 6 | 7 | #define PP_NARG(...) \ 8 | PP_NARG_(__VA_ARGS__,PP_RSEQ_N()) 9 | #define PP_NARG_(...) \ 10 | PP_ARG_N(__VA_ARGS__) 11 | #define PP_ARG_N( \ 12 | _1, _2, _3, _4, _5, _6, _7, _8, _9,_10, \ 13 | _11,_12,_13,_14,_15,_16,_17,_18,_19,_20, \ 14 | _21,_22,_23,_24,_25,_26,_27,_28,_29,_30, \ 15 | _31,_32,_33,_34,_35,_36,_37,_38,_39,_40, \ 16 | _41,_42,_43,_44,_45,_46,_47,_48,_49,_50, \ 17 | _51,_52,_53,_54,_55,_56,_57,_58,_59,_60, \ 18 | _61,_62,_63,N,...) N 19 | #define PP_RSEQ_N() \ 20 | 63,62,61,60, \ 21 | 59,58,57,56,55,54,53,52,51,50, \ 22 | 49,48,47,46,45,44,43,42,41,40, \ 23 | 39,38,37,36,35,34,33,32,31,30, \ 24 | 29,28,27,26,25,24,23,22,21,20, \ 25 | 19,18,17,16,15,14,13,12,11,10, \ 26 | 9,8,7,6,5,4,3,2,1,0 27 | 28 | #define PP_CONCAT(a,b) PP_CONCAT_(a,b) 29 | #define PP_CONCAT_(a,b) a ## b 30 | 31 | #endif // SOA_PREPROCESSOR_H 32 | -------------------------------------------------------------------------------- /ikra/soa/soa.h: -------------------------------------------------------------------------------- 1 | #ifndef SOA_SOA_H 2 | #define SOA_SOA_H 3 | 4 | #if __cplusplus < 201103L 5 | #error This library needs at least a C++11 compliant compiler. 6 | #endif 7 | 8 | #if defined(__clang__) 9 | #warning Loop vectorization is broken when using clang. 10 | #elif defined(__GNUC__) || defined(__GNUG__) 11 | // Everything OK with gcc. 12 | #else 13 | #warning ikra-cpp was not tested with this compiler. 14 | #endif 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | // Asserts active only in debug mode (NDEBUG). 22 | #include 23 | 24 | #include "executor/iterator.h" 25 | #include "soa/array_field.h" 26 | #include "soa/constants.h" 27 | #include "soa/field.h" 28 | #include "soa/field_type_generator.h" 29 | #include "soa/partially_inlined_array_field.h" 30 | #include "soa/layout.h" 31 | #include "soa/preprocessor.h" 32 | #include "soa/storage.h" 33 | 34 | #endif // SOA_SOA_H 35 | -------------------------------------------------------------------------------- /ikra/soa/util.h: -------------------------------------------------------------------------------- 1 | #ifndef SOA_UTIL_H 2 | #define SOA_UTIL_H 3 | 4 | 5 | namespace ikra { 6 | namespace soa { 7 | 8 | // This class exports its single template parameter as "type". Used as 9 | // workaround for an nvcc bug with friend declarations. 10 | template 11 | struct NvccWorkaroundIdentityClassHolder { 12 | using type = T; 13 | }; 14 | 15 | // This helper macro can be used to allow reinterpret_casts in constexpr. 16 | // Only needed for clang. 17 | #define IKRA_fold(x) (__builtin_constant_p(x) ? (x) : (x)) 18 | 19 | // This class is used to check if the compiler supports the selected addressing 20 | // mode. 21 | template 22 | class AddressingModeCompilerCheck { 23 | char ___[N]; 24 | }; 25 | 26 | } // soa 27 | } // ikra 28 | 29 | #endif // SOA_UTIL_H 30 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(array_test soa/array_test.cc) 2 | target_link_libraries(array_test gtest gtest_main ikra) 3 | add_test(array_test ${CMAKE_BINARY_DIR}/bin/array_test) 4 | 5 | add_executable(basic_class_test soa/basic_class_test.cc) 6 | target_link_libraries(basic_class_test gtest gtest_main ikra) 7 | add_test(basic_class_test ${CMAKE_BINARY_DIR}/bin/basic_class_test) 8 | 9 | add_executable(pointer_arithmetics_test soa/pointer_arithmetics_test.cc) 10 | target_link_libraries(pointer_arithmetics_test gtest gtest_main ikra) 11 | add_test(pointer_arithmetics_test ${CMAKE_BINARY_DIR}/bin/pointer_arithmetics_test) 12 | 13 | add_executable(executor_test executor_test.cc) 14 | target_link_libraries(executor_test gtest gtest_main ikra) 15 | add_test(executor_test ${CMAKE_BINARY_DIR}/bin/executor_test) 16 | 17 | 18 | # CUDA Tests 19 | # TODO: cuda_add_library does not support INTERFACE libraries 20 | include_directories(${CMAKE_SOURCE_DIR}/ikra ${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) 21 | cuda_add_executable(minimum_cuda_test minimum_cuda_test.cu) 22 | target_link_libraries(minimum_cuda_test gtest gtest_main ikra) 23 | add_test(minimum_cuda_test ${CMAKE_BINARY_DIR}/bin/minimum_cuda_test) 24 | 25 | include_directories(${CMAKE_SOURCE_DIR}/ikra ${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) 26 | cuda_add_executable(array_cuda_test array_cuda_test.cu) 27 | target_link_libraries(array_cuda_test gtest gtest_main ikra) 28 | add_test(array_cuda_test ${CMAKE_BINARY_DIR}/bin/array_cuda_test) 29 | 30 | include_directories(${CMAKE_SOURCE_DIR}/ikra ${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) 31 | cuda_add_executable(cuda_array_test soa/cuda_array_test.cu) 32 | target_link_libraries(cuda_array_test gtest gtest_main ikra) 33 | add_test(cuda_array_test ${CMAKE_BINARY_DIR}/bin/cuda_array_test) 34 | 35 | include_directories(${CMAKE_SOURCE_DIR}/ikra ${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) 36 | cuda_add_executable(cuda_inline_array_memcpy_test soa/cuda_inline_array_memcpy_test.cu) 37 | target_link_libraries(cuda_inline_array_memcpy_test gtest gtest_main ikra) 38 | add_test(cuda_inline_array_memcpy_test ${CMAKE_BINARY_DIR}/bin/cuda_inline_array_memcpy_test) 39 | 40 | include_directories(${CMAKE_SOURCE_DIR}/ikra ${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) 41 | cuda_add_executable(cuda_inline_array_memcpy_aos_test soa/cuda_inline_array_memcpy_aos_test.cu) 42 | target_link_libraries(cuda_inline_array_memcpy_aos_test gtest gtest_main ikra) 43 | add_test(cuda_inline_array_memcpy_aos_test ${CMAKE_BINARY_DIR}/bin/cuda_inline_array_memcpy_aos_test) 44 | -------------------------------------------------------------------------------- /test/array_cuda_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "gtest/gtest.h" 4 | 5 | #include "executor/cuda_executor.h" 6 | #include "soa/soa.h" 7 | 8 | using ikra::soa::IndexType; 9 | using ikra::soa::SoaLayout; 10 | using ikra::executor::cuda::construct; 11 | 12 | const static int kTestSize = 12; 13 | 14 | class TestClass : public SoaLayout { 15 | public: 16 | IKRA_INITIALIZE_CLASS 17 | 18 | TestClass(std::array f0, std::array f1) { 19 | field0[0] = f0[0]; 20 | field0[1] = f0[1]; 21 | field1[0] = f1[0]; 22 | field1[1] = f1[1]; 23 | } 24 | 25 | array_(double, 2, fully_inlined) field0; 26 | array_(double, 2, fully_inlined) field1; 27 | 28 | __device__ void add_fields() { 29 | field0[0] = field0[0] + field0[1] + field1[0] + field1[1]; 30 | } 31 | }; 32 | 33 | IKRA_DEVICE_STORAGE(TestClass); 34 | 35 | 36 | // Cannot run "cuda_execute" inside gtest case. 37 | void run_test_set_from_host_and_read() { 38 | TestClass::initialize_storage(); 39 | EXPECT_EQ(TestClass::size(), 0UL); 40 | 41 | for (int i = 0; i < kTestSize; ++i) { 42 | new TestClass({{ 1.0, 2.0 }}, {{ 3.0, 4.0 }}); 43 | } 44 | gpuErrchk(cudaPeekAtLastError()); 45 | 46 | TestClass* first = TestClass::get(0); 47 | cuda_execute(&TestClass::add_fields, first, kTestSize); 48 | 49 | // Check result. 50 | for (int i = 0; i < kTestSize; ++i) { 51 | double actual = TestClass::get(i)->field0[0]; 52 | double expected = 1.0 + 2.0 + 3.0 + 4.0; 53 | EXPECT_EQ(actual, expected); 54 | } 55 | 56 | // Copy size to host memory and compare. 57 | EXPECT_EQ(TestClass::size(), static_cast(kTestSize)); 58 | 59 | // Make sure that we had no CUDA failures. 60 | gpuErrchk(cudaPeekAtLastError()); 61 | } 62 | 63 | TEST(ArrayCudaTest, SetFromHostAndRead) { 64 | // This test ensures that address computation for array fields is done 65 | // correctly. 66 | run_test_set_from_host_and_read(); 67 | } 68 | -------------------------------------------------------------------------------- /test/executor_test.cc: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "executor/executor.h" 3 | #include "executor/iterator.h" 4 | #include "soa/soa.h" 5 | 6 | namespace { 7 | using ikra::soa::SoaLayout; 8 | using ikra::soa::kAddressModeZero; 9 | using ikra::executor::construct; 10 | using ikra::executor::execute; 11 | using ikra::executor::make_iterator; 12 | 13 | static const int kClassMaxInst = 1024; 14 | static const int kTestSize = 40; 15 | 16 | // Zero addressing mode. 17 | #define IKRA_TEST_CLASSNAME TestClassZ 18 | #define IKRA_TEST_ADDRESS_MODE kAddressModeZero 19 | #include "executor_test_layout.inc" 20 | #undef IKRA_TEST_CLASSNAME 21 | #undef IKRA_TEST_ADDRESS_MODE 22 | IKRA_HOST_STORAGE(TestClassZ) 23 | 24 | // Valid addressing mode. 25 | #define IKRA_TEST_CLASSNAME TestClassV 26 | #define IKRA_TEST_ADDRESS_MODE sizeof(int) 27 | #include "executor_test_layout.inc" 28 | #undef IKRA_TEST_CLASSNAME 29 | #undef IKRA_TEST_ADDRESS_MODE 30 | IKRA_HOST_STORAGE(TestClassV) 31 | 32 | template 33 | class ExecutorTest : public testing::Test {}; 34 | 35 | TYPED_TEST_CASE_P(ExecutorTest); 36 | 37 | TYPED_TEST_P(ExecutorTest, StdArray) { 38 | TypeParam::initialize_storage(); 39 | std::array arr; 40 | 41 | for (int i = 0; i < kTestSize; ++i) { 42 | arr[i] = new TypeParam(i + 1, 100*i + 1); 43 | } 44 | 45 | execute(arr.begin(), arr.end(), &TypeParam::add_field1_and_a_to_field0, 50); 46 | 47 | // Check result 48 | for (int i = 0; i < kTestSize; ++i) { 49 | int expected = i + 1 + 100*i + 1 + 50; 50 | EXPECT_EQ(arr[i]->field0, expected); 51 | } 52 | } 53 | 54 | TYPED_TEST_P(ExecutorTest, IteratorFromPointers) { 55 | TypeParam::initialize_storage(); 56 | std::array arr; 57 | 58 | for (int i = 0; i < kTestSize; ++i) { 59 | arr[i] = new TypeParam(i + 1, 200*i + 1); 60 | } 61 | 62 | execute(make_iterator(arr[0]), 63 | ++make_iterator(arr[kTestSize - 1]), 64 | &TypeParam::add_field1_and_a_to_field0, 65 | 50); 66 | 67 | // Check result 68 | for (int i = 0; i < kTestSize; ++i) { 69 | int expected = i + 1 + 200*i + 1 + 50; 70 | EXPECT_EQ(arr[i]->field0, expected); 71 | } 72 | } 73 | 74 | TYPED_TEST_P(ExecutorTest, ExecuteAndReduce) { 75 | TypeParam::initialize_storage(); 76 | std::array arr; 77 | 78 | for (int i = 0; i < kTestSize; ++i) { 79 | arr[i] = new TypeParam(i, 100*i); 80 | } 81 | 82 | int expected = ((kTestSize - 1)*kTestSize)/2*101 + kTestSize; 83 | int actual = execute_and_reduce(make_iterator(arr[0]), 84 | ++make_iterator(arr[kTestSize - 1]), 85 | &TypeParam::sum_plus_delta, 86 | [](int a, int b) { return a + b; }, 0, 1); 87 | 88 | EXPECT_EQ(actual, expected); 89 | } 90 | 91 | TYPED_TEST_P(ExecutorTest, Construct) { 92 | TypeParam::initialize_storage(); 93 | TypeParam* first = construct(kTestSize, 1, 2); 94 | 95 | int expected = (20 + 1 + 2)*kTestSize; 96 | int actual = execute_and_reduce(make_iterator(first), 97 | make_iterator(first) + kTestSize, 98 | &TypeParam::sum_plus_delta, 99 | [](int a, int b) { return a + b; }, 0, 20); 100 | EXPECT_EQ(actual, expected); 101 | } 102 | 103 | REGISTER_TYPED_TEST_CASE_P(ExecutorTest, 104 | StdArray, 105 | IteratorFromPointers, 106 | ExecuteAndReduce, 107 | Construct); 108 | 109 | INSTANTIATE_TYPED_TEST_CASE_P(Valid, ExecutorTest, TestClassV); 110 | INSTANTIATE_TYPED_TEST_CASE_P(Zero, ExecutorTest, TestClassZ); 111 | 112 | } // namespace 113 | 114 | -------------------------------------------------------------------------------- /test/executor_test_layout.inc: -------------------------------------------------------------------------------- 1 | class IKRA_TEST_CLASSNAME : public SoaLayout { 4 | public: 5 | IKRA_INITIALIZE_CLASS 6 | 7 | int_ field0; 8 | int_ field1; 9 | 10 | IKRA_TEST_CLASSNAME(int a, int b) : field0(a), field1(b) {} 11 | 12 | void add_field1_and_a_to_field0(int a) { 13 | field0 += field1 + a; 14 | } 15 | 16 | int sum_plus_delta(int delta) { 17 | return field0 + field1 + delta; 18 | } 19 | }; 20 | -------------------------------------------------------------------------------- /test/minimum_cuda_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "gtest/gtest.h" 4 | 5 | #include "executor/cuda_executor.h" 6 | #include "soa/soa.h" 7 | 8 | using ikra::soa::IndexType; 9 | using ikra::soa::SoaLayout; 10 | using ikra::executor::cuda::construct; 11 | 12 | const static int kTestSize = 12; 13 | 14 | class Vertex : public SoaLayout { 15 | public: 16 | IKRA_INITIALIZE_CLASS 17 | 18 | __host__ __device__ Vertex(int f0, int f1) : field0(f0), field1(f1) {} 19 | 20 | int_ field0; 21 | int_ field1; 22 | 23 | __device__ void add_fields(int increment) { 24 | field0 = field0 + field1 + increment + this->id(); 25 | } 26 | }; 27 | 28 | IKRA_DEVICE_STORAGE(Vertex); 29 | 30 | 31 | // Cannot run "cuda_execute" inside gtest case. 32 | void run_test_construct_and_execute() { 33 | Vertex::initialize_storage(); 34 | EXPECT_EQ(Vertex::size(), 0UL); 35 | 36 | Vertex* first = construct(kTestSize, 5, 6); 37 | gpuErrchk(cudaPeekAtLastError()); 38 | 39 | cuda_execute(&Vertex::add_fields, first, 12, 10); 40 | 41 | // Check result. 42 | for (int i = 0; i < kTestSize; ++i) { 43 | int actual = Vertex::get(i)->field0; 44 | int expected = 10 + 5 + 6 + i; 45 | EXPECT_EQ(actual, expected); 46 | } 47 | 48 | // Copy size to host memory and compare. 49 | EXPECT_EQ(Vertex::size(), static_cast(kTestSize)); 50 | 51 | // Make sure that we had no CUDA failures. 52 | gpuErrchk(cudaPeekAtLastError()); 53 | } 54 | 55 | void run_test_host_side_assignment() { 56 | Vertex::initialize_storage(); 57 | EXPECT_EQ(Vertex::size(), 0UL); 58 | 59 | Vertex* first = construct(kTestSize, 5, 6); 60 | cuda_execute(&Vertex::add_fields, first, kTestSize, 10); 61 | 62 | for (int i = 0; i < kTestSize; ++i) { 63 | Vertex::get(i)->field0 = Vertex::get(i)->field0*Vertex::get(i)->field0; 64 | } 65 | 66 | // Check result. 67 | for (int i = 0; i < kTestSize; ++i) { 68 | int actual = Vertex::get(i)->field0; 69 | int expected = (10 + 5 + 6 + i)*(10 + 5 + 6 + i); 70 | EXPECT_EQ(actual, expected); 71 | } 72 | 73 | // Copy size to host memory and compare. 74 | EXPECT_EQ(Vertex::size(), static_cast(kTestSize)); 75 | 76 | // Make sure that we had no CUDA failures. 77 | gpuErrchk(cudaPeekAtLastError()); 78 | } 79 | 80 | void run_test_host_side_new() { 81 | Vertex::initialize_storage(); 82 | EXPECT_EQ(Vertex::size(), 0UL); 83 | 84 | std::array vertices; 85 | 86 | for (int i = 0; i < kTestSize; ++i) { 87 | vertices[i] = new Vertex(i + 1, i * i); 88 | EXPECT_EQ(vertices[i]->id(), static_cast(i)); 89 | } 90 | 91 | cuda_execute(&Vertex::add_fields, vertices[0], kTestSize, 10); 92 | 93 | // Check result. 94 | for (int i = 0; i < kTestSize; ++i) { 95 | int actual = Vertex::get(i)->field0; 96 | int expected = 10 + i + (i + 1) + (i*i); 97 | EXPECT_EQ(actual, expected); 98 | 99 | actual = vertices[i]->field1; 100 | expected = i*i; 101 | EXPECT_EQ(actual, expected); 102 | } 103 | 104 | // Copy size to host memory and compare. 105 | EXPECT_EQ(Vertex::size(), static_cast(kTestSize)); 106 | 107 | // Make sure that we had no CUDA failures. 108 | gpuErrchk(cudaPeekAtLastError()); 109 | } 110 | 111 | void run_test_placement_new() { 112 | Vertex::initialize_storage(); 113 | EXPECT_EQ(Vertex::size(), 0UL); 114 | 115 | Vertex* v1 = new(Vertex::get_uninitialized(4)) Vertex(10, 20); 116 | EXPECT_EQ(Vertex::size(), 0UL); 117 | EXPECT_EQ(v1->field0, 10); 118 | EXPECT_EQ(v1->field1, 20); 119 | EXPECT_EQ(v1->id(), 4); 120 | 121 | // Make sure that we had no CUDA failures. 122 | gpuErrchk(cudaPeekAtLastError()); 123 | } 124 | 125 | TEST(MinimumCudaTest, ConstructAndExecute) { 126 | run_test_construct_and_execute(); 127 | } 128 | 129 | TEST(MinimumCudaTest, HostSideAssignment) { 130 | run_test_host_side_assignment(); 131 | } 132 | 133 | TEST(MinimumCudaTest, HostSideNew) { 134 | run_test_host_side_new(); 135 | } 136 | 137 | TEST(MinimumCudaTest, PlacementNew) { 138 | run_test_placement_new(); 139 | } 140 | 141 | -------------------------------------------------------------------------------- /test/soa/array_test.cc: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "soa/soa.h" 3 | 4 | namespace { 5 | using ikra::soa::SoaLayout; 6 | using ikra::soa::kAddressModeZero; 7 | 8 | static const int kClassMaxInst = 1024; 9 | static const int kTestSize = 9; 10 | 11 | // Zero addressing mode. 12 | #define IKRA_TEST_CLASSNAME TestClassZ 13 | #define IKRA_TEST_ADDRESS_MODE kAddressModeZero 14 | #include "array_test_layout.inc" 15 | #undef IKRA_TEST_CLASSNAME 16 | #undef IKRA_TEST_ADDRESS_MODE 17 | IKRA_HOST_STORAGE(TestClassZ) 18 | 19 | // Valid addressing mode. 20 | #define IKRA_TEST_CLASSNAME TestClassV 21 | #define IKRA_TEST_ADDRESS_MODE sizeof(int) 22 | #include "array_test_layout.inc" 23 | #undef IKRA_TEST_CLASSNAME 24 | #undef IKRA_TEST_ADDRESS_MODE 25 | IKRA_HOST_STORAGE(TestClassV) 26 | 27 | template 28 | class ArrayTest : public testing::Test {}; 29 | 30 | TYPED_TEST_CASE_P(ArrayTest); 31 | 32 | TYPED_TEST_P(ArrayTest, AosArray) { 33 | TypeParam::initialize_storage(); 34 | TypeParam** instances = new TypeParam*[kTestSize]; 35 | 36 | // Create a few instances 37 | for (int i = 0; i < kTestSize; ++i) { 38 | instances[i] = new TypeParam(); 39 | instances[i]->field0 = i + 1; 40 | EXPECT_EQ(instances[i]->field0, i + 1); 41 | instances[i]->field2 = i*i; 42 | } 43 | 44 | // Set values. 45 | for (int i = 0; i < kTestSize; ++i) { 46 | instances[i]->field1[0] = 1*i + 1; 47 | instances[i]->field1[1] = 2*i + 2; 48 | // TODO: Should use "." for consistency. 49 | instances[i]->field1->at(2) = 3*i + 3; 50 | } 51 | 52 | // Check result. 53 | for (int i = 0; i < kTestSize; ++i) { 54 | // Ensure that no data was overwritten. 55 | EXPECT_EQ(instances[i]->field0, i + 1); 56 | EXPECT_EQ(instances[i]->field2, i*i); 57 | 58 | int expected = i+1 + 2*i+2 + 3*i+3; 59 | EXPECT_EQ(instances[i]->field1_sum(), expected); 60 | } 61 | } 62 | 63 | TYPED_TEST_P(ArrayTest, SoaArray) { 64 | TypeParam::initialize_storage(); 65 | TypeParam** instances = new TypeParam*[kTestSize]; 66 | 67 | // Create a few instances 68 | for (int i = 0; i < kTestSize; ++i) { 69 | instances[i] = new TypeParam(); 70 | } 71 | 72 | // Set values. 73 | for (int i = 0; i < kTestSize; ++i) { 74 | instances[i]->field3[0] = 1*i + 1; 75 | instances[i]->field3[1] = 2*i + 2; 76 | instances[i]->field3->at(2) = 3*i + 3; 77 | } 78 | 79 | // Check result. 80 | for (int i = 0; i < kTestSize; ++i) { 81 | int expected = i+1 + 2*i+2 + 3*i+3; 82 | EXPECT_EQ(instances[i]->field3_sum(), expected); 83 | } 84 | } 85 | 86 | TYPED_TEST_P(ArrayTest, SoaArrayStaticGet) { 87 | TypeParam::initialize_storage(); 88 | TypeParam** instances = new TypeParam*[kTestSize]; 89 | 90 | // Create a few instances 91 | for (int i = 0; i < kTestSize; ++i) { 92 | instances[i] = new TypeParam(); 93 | } 94 | 95 | // Set values. 96 | for (int i = 0; i < kTestSize; ++i) { 97 | instances[i]->field3.template at<0>() = 1*i + 1; 98 | instances[i]->field3.template at<1>() = 2*i + 2; 99 | instances[i]->field3.template at<2>() = 3*i + 3; 100 | } 101 | 102 | // Check result. 103 | for (int i = 0; i < kTestSize; ++i) { 104 | int expected = i+1 + 2*i+2 + 3*i+3; 105 | EXPECT_EQ(instances[i]->field3_sum(), expected); 106 | } 107 | } 108 | 109 | REGISTER_TYPED_TEST_CASE_P(ArrayTest, 110 | AosArray, 111 | SoaArray, 112 | SoaArrayStaticGet); 113 | 114 | INSTANTIATE_TYPED_TEST_CASE_P(Valid, ArrayTest, TestClassV); 115 | INSTANTIATE_TYPED_TEST_CASE_P(Zero, ArrayTest, TestClassZ); 116 | 117 | } // namespace 118 | -------------------------------------------------------------------------------- /test/soa/array_test_layout.inc: -------------------------------------------------------------------------------- 1 | class IKRA_TEST_CLASSNAME 2 | : public SoaLayout { 4 | public: 5 | IKRA_INITIALIZE_CLASS 6 | 7 | int_ field0; 8 | 9 | // Array has size 12 bytes. 10 | array_(int, 3, object) field1; 11 | int_ field2; 12 | 13 | // Array has size 12 bytes. 14 | array_(int, 3, fully_inlined) field3; 15 | int_ field4; 16 | 17 | int field1_sum() { 18 | int result = 0; 19 | for (int i = 0; i < 3; ++i) { 20 | result = result + field1[i]; 21 | } 22 | return result; 23 | } 24 | 25 | int field3_sum() { 26 | int result = 0; 27 | for (int i = 0; i < 3; ++i) { 28 | result = result + field3[i]; 29 | } 30 | return result; 31 | } 32 | }; 33 | 34 | -------------------------------------------------------------------------------- /test/soa/basic_class_test.cc: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "soa/soa.h" 3 | 4 | namespace { 5 | using ikra::soa::SoaLayout; 6 | using ikra::soa::StaticStorage; 7 | using ikra::soa::kAddressModeZero; 8 | using ikra::soa::kLayoutModeSoa; 9 | using ikra::soa::kLayoutModeAos; 10 | 11 | static const int kClassMaxInst = 1024; 12 | static const int kTestSize = 40; 13 | 14 | // Zero addressing mode. 15 | #define IKRA_TEST_CLASSNAME TestClassZ_Soa 16 | #define IKRA_TEST_ADDRESS_MODE kAddressModeZero 17 | #define IKRA_TEST_LAYOUT_MODE kLayoutModeSoa 18 | #include "basic_class_test_layout.inc" 19 | #undef IKRA_TEST_CLASSNAME 20 | #undef IKRA_TEST_ADDRESS_MODE 21 | IKRA_HOST_STORAGE(TestClassZ_Soa) 22 | 23 | #define IKRA_TEST_CLASSNAME TestClassZ_Aos 24 | #define IKRA_TEST_ADDRESS_MODE kAddressModeZero 25 | #define IKRA_TEST_LAYOUT_MODE kLayoutModeAos 26 | #include "basic_class_test_layout.inc" 27 | #undef IKRA_TEST_CLASSNAME 28 | #undef IKRA_TEST_ADDRESS_MODE 29 | IKRA_HOST_STORAGE(TestClassZ_Aos) 30 | 31 | // Valid addressing mode. 32 | #define IKRA_TEST_CLASSNAME TestClassV 33 | #define IKRA_TEST_ADDRESS_MODE sizeof(int) 34 | #define IKRA_TEST_LAYOUT_MODE kLayoutModeSoa 35 | #include "basic_class_test_layout.inc" 36 | #undef IKRA_TEST_CLASSNAME 37 | #undef IKRA_TEST_ADDRESS_MODE 38 | IKRA_HOST_STORAGE(TestClassV) 39 | 40 | 41 | template 42 | class BasicClassTest : public testing::Test {}; 43 | 44 | TYPED_TEST_CASE_P(BasicClassTest); 45 | 46 | TYPED_TEST_P(BasicClassTest, Fields) { 47 | TypeParam::initialize_storage(); 48 | TypeParam** instances = new TypeParam*[kTestSize]; 49 | 50 | // Create a few instances 51 | for (int i = 0; i < kTestSize; ++i) { 52 | instances[i] = new TypeParam(); 53 | } 54 | 55 | // Set values. 56 | for (int i = 0; i < kTestSize; ++i) { 57 | instances[i]->field0 = i + 99; 58 | EXPECT_EQ(instances[i]->field0, i + 99); 59 | 60 | instances[i]->field4 = i*i; 61 | EXPECT_EQ(instances[i]->field4, i*i); 62 | 63 | instances[i]->field2 = i % 3 == 0; 64 | EXPECT_EQ(instances[i]->field2, i % 3 == 0); 65 | 66 | int expected = i % 3 == 0 ? i*i : -1; 67 | EXPECT_EQ(instances[i]->get_field4_if_field2(), expected); 68 | } 69 | 70 | // Do increment. 71 | for (int i = 0; i < kTestSize; ++i) { 72 | instances[i]->add_field0_to_field4(); 73 | } 74 | 75 | // Check result. 76 | for (int i = 0; i < kTestSize; ++i) { 77 | int expected = i % 3 == 0 ? i + i*i + 99: -1; 78 | EXPECT_EQ(instances[i]->get_field4_if_field2(), expected); 79 | } 80 | } 81 | 82 | TYPED_TEST_P(BasicClassTest, SetFieldsAfterNew) { 83 | TypeParam::initialize_storage(); 84 | TypeParam** instances = new TypeParam*[kTestSize]; 85 | 86 | // Create a few instances 87 | for (int i = 0; i < kTestSize; ++i) { 88 | instances[i] = new TypeParam(); 89 | 90 | instances[i]->field0 = i; 91 | EXPECT_EQ(instances[i]->field0, i); 92 | 93 | instances[i]->field4 = i*i; 94 | EXPECT_EQ(instances[i]->field4, i*i); 95 | 96 | instances[i]->field2 = i % 3 == 0; 97 | EXPECT_EQ(instances[i]->field2, i % 3 == 0); 98 | 99 | int expected = i % 3 == 0 ? i*i : -1; 100 | EXPECT_EQ(instances[i]->get_field4_if_field2(), expected); 101 | } 102 | 103 | // Do increment. 104 | for (int i = 0; i < kTestSize; ++i) { 105 | instances[i]->add_field0_to_field4(); 106 | } 107 | 108 | // Check result. 109 | for (int i = 0; i < kTestSize; ++i) { 110 | int expected = i % 3 == 0 ? i + i*i : -1; 111 | EXPECT_EQ(instances[i]->get_field4_if_field2(), expected); 112 | } 113 | } 114 | 115 | TYPED_TEST_P(BasicClassTest, Constructor) { 116 | TypeParam::initialize_storage(); 117 | TypeParam** instances = new TypeParam*[kTestSize]; 118 | 119 | // Create a few instances 120 | for (int i = 0; i < kTestSize; ++i) { 121 | instances[i] = new TypeParam(3*i, 5*i); 122 | } 123 | 124 | // Check result. 125 | for (int i = 0; i < kTestSize; ++i) { 126 | EXPECT_EQ((int) instances[i]->field0, 3*i); 127 | EXPECT_EQ((int) instances[i]->field4, 5*i); 128 | } 129 | } 130 | 131 | TYPED_TEST_P(BasicClassTest, PlacementNew) { 132 | TypeParam::initialize_storage(); 133 | EXPECT_EQ(TypeParam::size(), 0UL); 134 | 135 | TypeParam* obj = new(TypeParam::get_uninitialized(4)) TypeParam(); 136 | EXPECT_EQ(obj->id(), 4); 137 | } 138 | 139 | REGISTER_TYPED_TEST_CASE_P(BasicClassTest, 140 | Fields, 141 | SetFieldsAfterNew, 142 | Constructor, 143 | PlacementNew); 144 | 145 | INSTANTIATE_TYPED_TEST_CASE_P(Valid, BasicClassTest, TestClassV); 146 | INSTANTIATE_TYPED_TEST_CASE_P(Zero, BasicClassTest, TestClassZ_Soa); 147 | INSTANTIATE_TYPED_TEST_CASE_P(ZeroAos, BasicClassTest, TestClassZ_Aos); 148 | 149 | } // namespace 150 | -------------------------------------------------------------------------------- /test/soa/basic_class_test_layout.inc: -------------------------------------------------------------------------------- 1 | class IKRA_TEST_CLASSNAME : public SoaLayout { 6 | public: 7 | IKRA_INITIALIZE_CLASS 8 | 9 | int_ field0; 10 | double_ field1; 11 | bool_ field2; 12 | int_ field4; 13 | 14 | IKRA_TEST_CLASSNAME() {} 15 | 16 | IKRA_TEST_CLASSNAME(int field0_a, int field4_a) : field4(field4_a) { 17 | field0 = field0_a; 18 | } 19 | 20 | void add_field0_to_field4() { 21 | field4 = field4 + field0; 22 | } 23 | 24 | int get_field4_if_field2() { 25 | if (field2) { 26 | return field4; 27 | } else { 28 | return -1; 29 | } 30 | } 31 | }; 32 | -------------------------------------------------------------------------------- /test/soa/benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | 3 | -------------------------------------------------------------------------------- /test/soa/benchmarks/array/aos_style.cu: -------------------------------------------------------------------------------- 1 | #include "executor/cuda_executor.h" 2 | #include "soa/soa.h" 3 | 4 | #include "benchmark.h" 5 | #include "configuration.h" 6 | 7 | using ikra::soa::IndexType; 8 | using ikra::soa::SoaLayout; 9 | using ikra::executor::cuda::construct; 10 | 11 | class DummyClass : public SoaLayout { 12 | public: 13 | IKRA_INITIALIZE_CLASS 14 | 15 | __device__ DummyClass(int f0, int f2): field0(f0), field2(f2) { 16 | for (int i = 0; i < ARRAY_SIZE; ++i) { 17 | field1[i] = id()*17 + i; 18 | } 19 | } 20 | 21 | int_ field0; 22 | 23 | array_(int, ARRAY_SIZE, object) field1; 24 | 25 | int_ field2; 26 | 27 | __device__ void update_field1(int increment) { 28 | for (int i = 0; i < ARRAY_SIZE; ++i) { 29 | field1[i] += increment + field0 + field2; 30 | } 31 | } 32 | }; 33 | 34 | IKRA_DEVICE_STORAGE(DummyClass); 35 | 36 | 37 | void action() { 38 | DummyClass::initialize_storage(); 39 | DummyClass* first = construct(NUM_INST, 29, 1357); 40 | gpuErrchk(cudaPeekAtLastError()); 41 | 42 | cuda_execute(&DummyClass::update_field1, first, NUM_INST, 19); 43 | gpuErrchk(cudaPeekAtLastError()); 44 | 45 | cudaDeviceSynchronize(); 46 | } 47 | 48 | void run_test_construct_and_execute() { 49 | uint64_t time_action = measure<>::execution(action); 50 | printf("[AOS] Time for action: %lu\n", time_action); 51 | 52 | #ifndef NDEBUG 53 | // Check result (some samples). 54 | for (int k = 0; k < 100; ++k) { 55 | int i = rand() % NUM_INST; 56 | for (int j = 0; j < ARRAY_SIZE; ++j) { 57 | int actual1 = DummyClass::get(i)->field1[j]; 58 | int expected1 = i*17 + j + 19 + 29 + 1357; 59 | if (actual1 != expected1) { 60 | printf("[AOS] Dummy[%i].field1[%i]: Expected %i, but found %i\n", 61 | i, j, expected1, actual1); 62 | exit(1); 63 | } 64 | } 65 | } 66 | #endif // NDEBUG 67 | } 68 | 69 | int main() { 70 | run_test_construct_and_execute(); 71 | } 72 | -------------------------------------------------------------------------------- /test/soa/benchmarks/array/aos_style_cpu.cc: -------------------------------------------------------------------------------- 1 | #include "executor/executor.h" 2 | #include "soa/soa.h" 3 | 4 | #include "benchmark.h" 5 | #include "configuration.h" 6 | 7 | using ikra::soa::IndexType; 8 | using ikra::soa::SoaLayout; 9 | using ikra::executor::construct; 10 | using ikra::executor::execute; 11 | 12 | class DummyClass : public SoaLayout { 13 | public: 14 | IKRA_INITIALIZE_CLASS 15 | 16 | DummyClass(int f0, int f2): field0(f0), field2(f2) { 17 | for (int i = 0; i < ARRAY_SIZE; ++i) { 18 | field1[i] = id()*17 + i; 19 | } 20 | } 21 | 22 | int_ field0; 23 | 24 | array_(int, ARRAY_SIZE, object) field1; 25 | 26 | int_ field2; 27 | 28 | void update_field1(int increment) { 29 | for (int i = 0; i < ARRAY_SIZE; ++i) { 30 | field1[i] += increment + field0 + field2; 31 | } 32 | } 33 | }; 34 | 35 | IKRA_HOST_STORAGE(DummyClass); 36 | 37 | 38 | void action() { 39 | DummyClass::DBG_print_offsets(); 40 | DummyClass::initialize_storage(); 41 | DummyClass* first = construct(NUM_INST, 29, 1357); 42 | 43 | #ifndef NDEBUG 44 | // Check values (some samples). 45 | for (int k = 0; k < 100; ++k) { 46 | int i = rand() % NUM_INST; 47 | assert((int) DummyClass::get(i)->field0 == 29); 48 | assert((int) DummyClass::get(i)->field2 == 1357); 49 | } 50 | #endif // NDEBUG 51 | 52 | execute(&DummyClass::update_field1, 19); 53 | } 54 | 55 | void run_test_construct_and_execute() { 56 | uint64_t time_action = measure<>::execution(action); 57 | printf("[AOS-CPU] Time for action: %lu\n", time_action); 58 | 59 | #ifndef NDEBUG 60 | // Check result (some samples). 61 | for (int k = 0; k < 100; ++k) { 62 | int i = rand() % NUM_INST; 63 | for (int j = 0; j < ARRAY_SIZE; ++j) { 64 | int actual1 = DummyClass::get(i)->field1[j]; 65 | int expected1 = i*17 + j + 19 + 29 + 1357;// + 19 + 29 + 1357; 66 | if (actual1 != expected1) { 67 | printf("[AOS-CPU] Dummy[%i].field1[%i]: Expected %i, but found %i\n", 68 | i, j, expected1, actual1); 69 | exit(1); 70 | } 71 | } 72 | } 73 | #endif // NDEBUG 74 | } 75 | 76 | int main() { 77 | run_test_construct_and_execute(); 78 | } 79 | -------------------------------------------------------------------------------- /test/soa/benchmarks/array/benchmark.h: -------------------------------------------------------------------------------- 1 | // Taken from: https://stackoverflow.com/questions/2808398/easily-measure-elapsed-time 2 | 3 | #ifndef TEST_SOA_BENCHMARK_ARRAY_BENCHMARK_H 4 | #define TEST_SOA_BENCHMARK_ARRAY_BENCHMARK_H 5 | 6 | #include 7 | #include 8 | 9 | template 10 | struct measure 11 | { 12 | template 13 | static typename TimeT::rep execution(F&& func, Args&&... args) 14 | { 15 | auto start = std::chrono::steady_clock::now(); 16 | std::forward(func)(std::forward(args)...); 17 | auto duration = std::chrono::duration_cast< TimeT> 18 | (std::chrono::steady_clock::now() - start); 19 | return duration.count(); 20 | } 21 | }; 22 | 23 | #endif // TEST_SOA_BENCHMARK_ARRAY_BENCHMARK_H 24 | -------------------------------------------------------------------------------- /test/soa/benchmarks/array/configuration.h: -------------------------------------------------------------------------------- 1 | //#define NDEBUG 2 | 3 | #define NUM_INST 32768 4 | #define ARRAY_SIZE 64 5 | -------------------------------------------------------------------------------- /test/soa/benchmarks/array/inlined_soa_style.cu: -------------------------------------------------------------------------------- 1 | #include "executor/cuda_executor.h" 2 | #include "soa/soa.h" 3 | 4 | #include "benchmark.h" 5 | #include "configuration.h" 6 | 7 | using ikra::soa::IndexType; 8 | using ikra::soa::SoaLayout; 9 | using ikra::executor::cuda::construct; 10 | using ikra::soa::StaticStorageWithArena; 11 | using ikra::soa::kAddressModeZero; 12 | 13 | #define READ_FROM_ARENA_ELEMENTS 8 14 | #define EXTRA_BYTES (READ_FROM_ARENA_ELEMENTS*sizeof(int)*NUM_INST) 15 | #define INLINE_ARR_SIZE (ARRAY_SIZE - READ_FROM_ARENA_ELEMENTS) 16 | 17 | class DummyClass : public SoaLayout> { 19 | public: 20 | IKRA_INITIALIZE_CLASS 21 | 22 | __device__ DummyClass(int f0, int f2): field0(f0), field2(f2), 23 | field1(ARRAY_SIZE) { 24 | for (int i = 0; i < ARRAY_SIZE; ++i) { 25 | field1[i] = id()*17 + i; 26 | } 27 | } 28 | 29 | int_ field0; 30 | 31 | array_(int, INLINE_ARR_SIZE, partially_inlined) field1; 32 | 33 | int_ field2; 34 | 35 | __device__ void update_field1(int increment) { 36 | for (int i = 0; i < ARRAY_SIZE; ++i) { 37 | field1[i] += increment + field0 + field2; 38 | } 39 | } 40 | }; 41 | 42 | IKRA_DEVICE_STORAGE(DummyClass); 43 | 44 | 45 | void action() { 46 | DummyClass::initialize_storage(); 47 | DummyClass* first = construct(NUM_INST, 29, 1357); 48 | gpuErrchk(cudaPeekAtLastError()); 49 | 50 | cuda_execute(&DummyClass::update_field1, first, NUM_INST, 19); 51 | gpuErrchk(cudaPeekAtLastError()); 52 | 53 | cudaDeviceSynchronize(); 54 | } 55 | 56 | void run_test_construct_and_execute() { 57 | uint64_t time_action = measure<>::execution(action); 58 | printf("[INLINE-%i/%i SOA] Time for action: %lu\n", 59 | INLINE_ARR_SIZE, ARRAY_SIZE, time_action); 60 | 61 | #ifndef NDEBUG 62 | // Check result (some samples). 63 | for (int k = 0; k < 100; ++k) { 64 | int i = rand() % NUM_INST; 65 | for (int j = 0; j < ARRAY_SIZE; ++j) { 66 | int actual1 = DummyClass::get(i)->field1[j]; 67 | int expected1 = i*17 + j + 19 + 29 + 1357; 68 | if (actual1 != expected1) { 69 | printf("[INLINE] Dummy[%i].field1[%i]: Expected %i, but found %i\n", 70 | i, j, expected1, actual1); 71 | exit(1); 72 | } 73 | } 74 | } 75 | #endif // NDEBUG 76 | } 77 | 78 | int main() { 79 | run_test_construct_and_execute(); 80 | } 81 | -------------------------------------------------------------------------------- /test/soa/benchmarks/array/soa_style.cu: -------------------------------------------------------------------------------- 1 | #include "executor/cuda_executor.h" 2 | #include "soa/soa.h" 3 | 4 | #include "benchmark.h" 5 | #include "configuration.h" 6 | 7 | using ikra::soa::IndexType; 8 | using ikra::soa::SoaLayout; 9 | using ikra::executor::cuda::construct; 10 | 11 | class DummyClass : public SoaLayout { 12 | public: 13 | IKRA_INITIALIZE_CLASS 14 | 15 | __device__ DummyClass(int f0, int f2): field0(f0), field2(f2) { 16 | for (int i = 0; i < ARRAY_SIZE; ++i) { 17 | field1[i] = id()*17 + i; 18 | } 19 | } 20 | 21 | int_ field0; 22 | 23 | array_(int, ARRAY_SIZE, fully_inlined) field1; 24 | 25 | int_ field2; 26 | 27 | __device__ void update_field1(int increment) { 28 | for (int i = 0; i < ARRAY_SIZE; ++i) { 29 | field1[i] += increment + field0 + field2; 30 | } 31 | } 32 | }; 33 | 34 | IKRA_DEVICE_STORAGE(DummyClass); 35 | 36 | 37 | void action() { 38 | DummyClass::initialize_storage(); 39 | DummyClass* first = construct(NUM_INST, 29, 1357); 40 | gpuErrchk(cudaPeekAtLastError()); 41 | 42 | cuda_execute(&DummyClass::update_field1, first, NUM_INST, 19); 43 | gpuErrchk(cudaPeekAtLastError()); 44 | 45 | cudaDeviceSynchronize(); 46 | } 47 | 48 | void run_test_construct_and_execute() { 49 | uint64_t time_action = measure<>::execution(action); 50 | printf("[SOA] Time for action: %lu\n", time_action); 51 | 52 | #ifndef NDEBUG 53 | // Check result (some samples). 54 | for (int k = 0; k < 100; ++k) { 55 | int i = rand() % NUM_INST; 56 | for (int j = 0; j < ARRAY_SIZE; ++j) { 57 | int actual1 = DummyClass::get(i)->field1[j]; 58 | int expected1 = i*17 + j + 19 + 29 + 1357; 59 | if (actual1 != expected1) { 60 | printf("[SOA] Dummy[%i].field1[%i]: Expected %i, but found %i\n", 61 | i, j, expected1, actual1); 62 | exit(1); 63 | } 64 | } 65 | } 66 | #endif // NDEBUG 67 | } 68 | 69 | int main() { 70 | run_test_construct_and_execute(); 71 | } 72 | -------------------------------------------------------------------------------- /test/soa/benchmarks/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "Script will stop on error or incorrect result." 4 | extra_args="-march=native -fomit-frame-pointer" 5 | clang_bin="clang++-5.0" 6 | 7 | mkdir -p bin 8 | mkdir -p assembly 9 | 10 | for v_compiler in "g++" "${clang_bin}" 11 | do 12 | for v_storage in "StaticStorage" "DynamicStorage" 13 | do 14 | for v_opt_mode in "-O0" "-O3" 15 | do 16 | for v_addr_mode in "0" "4" 17 | do 18 | out_name="${v_compiler}_${v_opt_mode}_${v_storage}_${v_addr_mode}_Soa" 19 | ${v_compiler} -std=c++11 ${v_opt_mode} ${extra_args} \ 20 | -DSTORAGE_STRATEGY=${v_storage} \ 21 | -DADDRESS_MODE=${v_addr_mode} \ 22 | -DLAYOUT_MODE=Soa \ 23 | codegen_test.cc \ 24 | -I../../../ikra \ 25 | -o bin/${out_name} 26 | bin/${out_name} 27 | objdump -S bin/${out_name} > assembly/${out_name}.S 28 | done 29 | done 30 | done 31 | done 32 | 33 | for v_compiler in "g++" "${clang_bin}" 34 | do 35 | for v_opt_mode in "-O0" "-O3" 36 | do 37 | for v_layout_mode in "Aos" "Soa" 38 | do 39 | out_name="${v_compiler}_${v_opt_mode}_StaticStorage_0_Aos" 40 | ${v_compiler} -std=c++11 ${v_opt_mode} ${extra_args} \ 41 | -DSTORAGE_STRATEGY=StaticStorage \ 42 | -DADDRESS_MODE=0 \ 43 | -DLAYOUT_MODE=Aos \ 44 | codegen_test.cc \ 45 | -I../../../ikra \ 46 | -o bin/${out_name} 47 | bin/${out_name} 48 | objdump -S bin/${out_name} > assembly/${out_name}.S 49 | done 50 | done 51 | done 52 | 53 | 54 | # CUDA test 55 | /usr/local/cuda/bin/nvcc \ 56 | -std=c++11 \ 57 | --expt-extended-lambda \ 58 | -O3 \ 59 | -I../../../ikra \ 60 | -o bin/cuda_codegen_test \ 61 | cuda_codegen_test.cu 62 | bin/cuda_codegen_test 63 | /usr/local/cuda/bin/cuobjdump bin/cuda_codegen_test -ptx -sass -res-usage \ 64 | > assembly/cuda_codegen_test.S 65 | 66 | g++ -std=c++11 -O3 -I../../../ikra -o bin/cuda_aos_style_cpu array/aos_style_cpu.cc 67 | bin/cuda_aos_style_cpu 68 | 69 | /usr/local/cuda/bin/nvcc \ 70 | -std=c++11 \ 71 | --expt-extended-lambda \ 72 | -O3 \ 73 | -I../../../ikra \ 74 | -o bin/cuda_array_aos_style \ 75 | array/aos_style.cu 76 | bin/cuda_array_aos_style 77 | /usr/local/cuda/bin/cuobjdump bin/cuda_array_aos_style -ptx -sass -res-usage \ 78 | > assembly/cuda_array_aos_style.S 79 | 80 | /usr/local/cuda/bin/nvcc \ 81 | -std=c++11 \ 82 | --expt-extended-lambda \ 83 | -O3 \ 84 | -I../../../ikra \ 85 | -o bin/cuda_array_soa_style \ 86 | array/soa_style.cu 87 | bin/cuda_array_soa_style 88 | /usr/local/cuda/bin/cuobjdump bin/cuda_array_soa_style -ptx -sass -res-usage \ 89 | > assembly/cuda_array_soa_style.S 90 | 91 | /usr/local/cuda/bin/nvcc \ 92 | -std=c++11 \ 93 | --expt-extended-lambda \ 94 | -O3 \ 95 | -I../../../ikra \ 96 | -o bin/cuda_array_inlined_soa_style \ 97 | array/inlined_soa_style.cu 98 | bin/cuda_array_inlined_soa_style 99 | /usr/local/cuda/bin/cuobjdump bin/cuda_array_inlined_soa_style -ptx -sass -res-usage \ 100 | > assembly/cuda_array_inlined_soa_style.S 101 | 102 | extra_args="-march=native -fomit-frame-pointer -Ofast" 103 | for v_compiler in "g++" "${clang_bin}" 104 | do 105 | # Build nbody 106 | out_name="${v_compiler}_nbody_ikracpp" 107 | ${v_compiler} -O3 ${extra_args} nbody/ikracpp.cc -std=c++11 -I../../../ikra \ 108 | -o bin/${out_name} 109 | objdump -S bin/${out_name} > assembly/${out_name}.S 110 | echo -n "." 111 | 112 | out_name="${v_compiler}_nbody_ikracpp_field" 113 | ${v_compiler} -O3 ${extra_args} nbody/ikracpp_field.cc -std=c++11 -I../../../ikra \ 114 | -o bin/${out_name} 115 | objdump -S bin/${out_name} > assembly/${out_name}.S 116 | echo -n "." 117 | 118 | out_name="${v_compiler}_nbody_ikracpp_inversed" 119 | ${v_compiler} -O3 ${extra_args} nbody/ikracpp_inversed.cc -std=c++11 -I../../../ikra \ 120 | -o bin/${out_name} 121 | objdump -S bin/${out_name} > assembly/${out_name}.S 122 | echo -n "." 123 | 124 | # No vectorization with: -fno-tree-vectorize 125 | out_name="${v_compiler}_nbody_soa" 126 | ${v_compiler} -O3 ${extra_args} nbody/soa.cc -std=c++11 -o bin/${out_name} 127 | objdump -S bin/${out_name} > assembly/${out_name}.S 128 | echo -n "." 129 | 130 | out_name="${v_compiler}_nbody_aos" 131 | ${v_compiler} -O3 ${extra_args} nbody/aos.cc -std=c++11 -o bin/${out_name} 132 | objdump -S bin/${out_name} > assembly/${out_name}.S 133 | echo -n "." 134 | done 135 | 136 | /usr/local/cuda/bin/nvcc \ 137 | -std=c++11 \ 138 | --expt-extended-lambda \ 139 | -O3 \ 140 | -I../../../ikra \ 141 | -o bin/cuda_nbody_ikracpp_inversed_gpu \ 142 | nbody/ikracpp_inversed_gpu.cu 143 | /usr/local/cuda/bin/cuobjdump bin/cuda_nbody_ikracpp_inversed_gpu \ 144 | -ptx -sass -res-usage > assembly/cuda_nbody_ikracpp_inversed_gpu.S 145 | echo -n "." 146 | 147 | /usr/local/cuda/bin/nvcc \ 148 | -std=c++11 \ 149 | --expt-extended-lambda \ 150 | -O3 \ 151 | -I../../../ikra \ 152 | -o bin/cuda_nbody_soa_inversed_gpu \ 153 | nbody/soa_inversed.cu 154 | /usr/local/cuda/bin/cuobjdump bin/cuda_nbody_soa_inversed_gpu \ 155 | -ptx -sass -res-usage > assembly/cuda_nbody_soa_inversed_gpu.S 156 | echo -n "." 157 | 158 | /usr/local/cuda/bin/nvcc \ 159 | -std=c++11 \ 160 | --expt-extended-lambda \ 161 | -O3 \ 162 | -I../../../ikra \ 163 | -o bin/cuda_nbody_ikracpp_inversed_field_gpu \ 164 | nbody/ikracpp_inversed_field_gpu.cu 165 | /usr/local/cuda/bin/cuobjdump bin/cuda_nbody_ikracpp_inversed_field_gpu \ 166 | -ptx -sass -res-usage > assembly/cuda_nbody_ikracpp_inversed_field_gpu.S 167 | echo "." 168 | -------------------------------------------------------------------------------- /test/soa/benchmarks/codegen_test.cc: -------------------------------------------------------------------------------- 1 | // This is a minimal test case whose purpose is to check if the compiler 2 | // generates efficient assembly code. Should be built with -O3 optimization 3 | // and not through Bazel (is not set up to generate such optimized code). 4 | 5 | #include 6 | 7 | #define NDEBUG // No asserts. 8 | #include "soa/soa.h" 9 | #include "soa/storage.h" 10 | 11 | using ikra::soa::SoaLayout; 12 | using ikra::soa::kAddressModeZero; 13 | using ikra::soa::DynamicStorage; 14 | using ikra::soa::StaticStorage; 15 | using ikra::soa::kLayoutModeAos; 16 | using ikra::soa::kLayoutModeSoa; 17 | 18 | 19 | // Compiler flags determine addressing mode and storage strategy. 20 | #ifndef ADDRESS_MODE 21 | #error Address mode undefined 22 | #endif 23 | 24 | #ifndef STORAGE_STRATEGY 25 | #error Storage strategy undefined 26 | #endif 27 | 28 | #ifndef LAYOUT_MODE 29 | #error Layout mode undefined 30 | #endif 31 | 32 | static const uint32_t kClassMaxInst = 0x1234; 33 | 34 | class TestClass : public SoaLayout { 37 | public: 38 | IKRA_INITIALIZE_CLASS 39 | 40 | int_ field0; 41 | int_ field1; 42 | 43 | void increase_field0() { 44 | field0 *= 0x5555; 45 | } 46 | 47 | void increase_field1() { 48 | field1 *= 0x4444; 49 | } 50 | }; 51 | 52 | IKRA_HOST_STORAGE(TestClass); 53 | 54 | class TestClassCompare { 55 | public: 56 | int field0; 57 | int field1; 58 | 59 | void increase_field0() { 60 | field0 *= 0x5555; 61 | } 62 | 63 | void increase_field1() { 64 | field1 *= 0x4444; 65 | } 66 | }; 67 | 68 | TestClassCompare test_compare_array[kClassMaxInst]; 69 | 70 | 71 | int main() { 72 | TestClass::initialize_storage(); 73 | 74 | TestClass* instance = new TestClass(); 75 | instance->field0 = 0x7777; 76 | instance->field1 = 0x8888; 77 | instance->increase_field0(); 78 | instance->increase_field1(); 79 | 80 | // Expected output: FIELD0: 668085635, FIELD1: 610821152 81 | printf("FIELD0: %i, FIELD1: %i\n", (int) instance->field0, 82 | (int) instance->field1); 83 | 84 | // Return 0 if correct result. 85 | return !(instance->field0 == 668085635 && instance->field1 == 610821152); 86 | } 87 | 88 | // Extra methods for code isolation: Will show up in .S file. 89 | TestClass* new_instance() { 90 | return new TestClass(); 91 | } 92 | 93 | void write_field0(TestClass* instance) { 94 | instance->field0 = 0x7777; 95 | } 96 | 97 | int read_field0(TestClass* instance) { 98 | return instance->field0; 99 | } 100 | 101 | void write_field1(TestClass* instance) { 102 | instance->field1 = 0x7777; 103 | } 104 | 105 | int read_field1(TestClass* instance) { 106 | return instance->field1; 107 | } 108 | 109 | // Compare with explicit, hand-written SOA code. 110 | int explicit_field0[100]; 111 | 112 | void explicit_write_field0(uintptr_t id) { 113 | explicit_field0[id] = 0x7777; 114 | } 115 | 116 | int explicit_read_field0(uintptr_t id) { 117 | return explicit_field0[id]; 118 | } 119 | 120 | void explicit_write_field0_aos(TestClassCompare* obj) { 121 | obj->field0 = 0x7777; 122 | } 123 | 124 | int explicit_read_field0_aos(TestClassCompare* obj) { 125 | return obj->field0; 126 | } 127 | 128 | void explicit_write_field1_aos(TestClassCompare* obj) { 129 | obj->field1 = 0x7777; 130 | } 131 | 132 | int explicit_read_field1_aos(TestClassCompare* obj) { 133 | return obj->field1; 134 | } 135 | -------------------------------------------------------------------------------- /test/soa/benchmarks/cuda_codegen_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define NDEBUG // No asserts. 4 | 5 | #include "executor/cuda_executor.h" 6 | #include "soa/soa.h" 7 | 8 | using ikra::soa::SoaLayout; 9 | using ikra::executor::cuda::construct; 10 | 11 | const static int kTestSize = 12; 12 | 13 | class TestClass : public SoaLayout { 14 | public: 15 | IKRA_INITIALIZE_CLASS 16 | 17 | __device__ TestClass(int f0, int f1) : field0(f0), field1(f1) {} 18 | 19 | int_ field0; 20 | int_ field1; 21 | 22 | __device__ void add_fields(int increment) { 23 | field0 = field0 + field1 + increment + this->id() + 0x7777; 24 | 25 | TestClass::get(0)->field0.get(); 26 | } 27 | }; 28 | 29 | IKRA_DEVICE_STORAGE(TestClass) 30 | 31 | 32 | int main() { 33 | TestClass::initialize_storage(); 34 | 35 | TestClass* first = construct(kTestSize, 0x6666, 0x5555); 36 | cuda_execute(&TestClass::add_fields, 0x8888); 37 | 38 | // Check result. 39 | for (int i = 0; i < kTestSize; ++i) { 40 | int actual = TestClass::get_uninitialized(i)->field0; 41 | int expected = 0x7777 + 0x6666 + 0x5555 + i + 0x8888; 42 | 43 | if (actual == expected) { 44 | printf("[OK] "); 45 | } else { 46 | printf("[FAIL]\n"); 47 | return 1; 48 | } 49 | } 50 | 51 | printf("\n"); 52 | 53 | // Make sure that we had no CUDA failures. 54 | gpuErrchk(cudaPeekAtLastError()); 55 | return 0; 56 | } 57 | 58 | // Extra methods for code isolation: Will show up in .S file. 59 | __device__ int int_result; 60 | 61 | __global__ void write_field0(TestClass* instance) { 62 | instance->field0 = 0x7777; 63 | } 64 | 65 | __global__ void read_field0(TestClass* instance) { 66 | int_result = instance->field0; 67 | } 68 | 69 | __global__ void write_field1(TestClass* instance) { 70 | instance->field1 = 0x7777; 71 | } 72 | 73 | __global__ void read_field1(TestClass* instance) { 74 | int_result = instance->field1; 75 | } 76 | 77 | // Compare with explicit, hand-written SOA code. 78 | __device__ int explicit_field0[100]; 79 | 80 | __global__ void explicit_write_field0(uintptr_t id) { 81 | explicit_field0[id] = 0x7777; 82 | } 83 | 84 | __global__ void explicit_read_field0(uintptr_t id) { 85 | int_result = explicit_field0[id]; 86 | } 87 | -------------------------------------------------------------------------------- /test/soa/benchmarks/nbody/aos.cc: -------------------------------------------------------------------------------- 1 | // N-Body Simulation 2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "benchmark.h" 10 | 11 | using namespace std; 12 | 13 | static const int kIterations = 5; 14 | static const int kNumBodies = 8000; 15 | static const double kMaxMass = 1000; 16 | static const double kTimeInterval = 0.5; 17 | 18 | static const double kGravityConstant = 6.673e-11; // gravitational constant 19 | 20 | #define RAND (1.0 * rand() / RAND_MAX) 21 | 22 | char Body_arena[kNumBodies*8*8]; 23 | char* Body_arena_head = Body_arena; 24 | 25 | class Body { 26 | public: 27 | Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y) 28 | : mass_(mass) { 29 | position_[0] = pos_x; 30 | position_[1] = pos_y; 31 | velocity_[0] = vel_x; 32 | velocity_[1] = vel_y; 33 | this->reset_force(); 34 | } 35 | 36 | double mass_; 37 | std::array position_; 38 | std::array velocity_; 39 | std::array force_; 40 | 41 | void add_force(Body* body) { 42 | if (this == body) return; 43 | 44 | double EPS = 0.01; // Softening parameter (just to avoid infinities). 45 | double dx = body->position_[0] - position_[0]; 46 | double dy = body->position_[1] - position_[1]; 47 | double dist = sqrt(dx*dx + dy*dy); 48 | double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS); 49 | force_[0] += F*dx / dist; 50 | force_[1] += F*dy / dist; 51 | } 52 | 53 | void add_force_to_all() { 54 | for (int i = 0; i < kNumBodies; ++i) { 55 | Body* body = reinterpret_cast(Body_arena + sizeof(Body)*i); 56 | body->add_force(this); 57 | } 58 | } 59 | 60 | void reset_force() { 61 | force_[0] = 0.0; 62 | force_[1] = 0.0; 63 | } 64 | 65 | void update(double dt) { 66 | velocity_[0] += force_[0]*dt / mass_; 67 | velocity_[1] += force_[1]*dt / mass_; 68 | position_[0] += velocity_[0]*dt; 69 | position_[1] += velocity_[1]*dt; 70 | 71 | if (position_[0] < -1 || position_[0] > 1) { 72 | velocity_[0] = -velocity_[0]; 73 | } 74 | if (position_[1] < -1 || position_[1] > 1) { 75 | velocity_[1] = -velocity_[1]; 76 | } 77 | } 78 | 79 | void codengen_simple_update(double dt) { 80 | for (int i = 0; i < 100; ++i) { 81 | velocity_[0] += force_[0]*dt / mass_; 82 | velocity_[1] += force_[1]*dt / mass_; 83 | position_[0] += velocity_[0]*dt; 84 | position_[1] += velocity_[1]*dt; 85 | } 86 | } 87 | }; 88 | 89 | 90 | void instantiation() { 91 | srand(42); 92 | 93 | // Create objects. 94 | for (int i = 0; i < kNumBodies; ++i) { 95 | double mass = (RAND/2 + 0.5) * kMaxMass; 96 | double pos_x = RAND*2 - 1; 97 | double pos_y = RAND*2 - 1; 98 | double vel_x = (RAND - 0.5) / 1000; 99 | double vel_y = (RAND - 0.5) / 1000; 100 | new(Body_arena_head) Body(mass, pos_x, pos_y, vel_x, vel_y); 101 | Body_arena_head += sizeof(Body); 102 | } 103 | } 104 | 105 | void run_simulation() { 106 | for (int i = 0; i < kIterations; ++i) { 107 | // Reset forces. 108 | for (int j = 0; j < kNumBodies; ++j) { 109 | Body* body = reinterpret_cast(Body_arena + sizeof(Body)*j); 110 | body->reset_force(); 111 | } 112 | 113 | // Update forces. 114 | for (int j = 0; j < kNumBodies; ++j) { 115 | Body* body = reinterpret_cast(Body_arena + sizeof(Body)*j); 116 | body->add_force_to_all(); 117 | } 118 | 119 | // Update velocities and positions. 120 | for (int j = 0; j < kNumBodies; ++j) { 121 | Body* body = reinterpret_cast(Body_arena + sizeof(Body)*j); 122 | body->update(kTimeInterval); 123 | } 124 | } 125 | } 126 | 127 | void run_simple() { 128 | for (int i = 0; i < kIterations*100; ++i) { 129 | for (int j = 0; j < kNumBodies; ++j) { 130 | Body* body = reinterpret_cast(Body_arena + sizeof(Body)*j); 131 | body->codengen_simple_update(kTimeInterval); 132 | } 133 | } 134 | } 135 | 136 | int main() { 137 | uint64_t time_instantiation = measure<>::execution(instantiation); 138 | uint64_t time_simulation = measure<>::execution(run_simulation); 139 | uint64_t time_simple = measure<>::execution(run_simple); 140 | 141 | // Calculate checksum 142 | int checksum = 11; 143 | for (uintptr_t i = 0; i < kNumBodies; i++) { 144 | checksum += reinterpret_cast(r_float2int( 145 | reinterpret_cast(Body_arena + sizeof(Body)*i)->position_[0])); 146 | checksum += reinterpret_cast(r_float2int( 147 | reinterpret_cast(Body_arena + sizeof(Body)*i)->position_[1])); 148 | checksum = checksum % 1234567; 149 | 150 | if (i < 10) { 151 | printf("VALUE[%lu] = %f, %f\n", i, 152 | reinterpret_cast(Body_arena + sizeof(Body)*i)->position_[0], 153 | reinterpret_cast(Body_arena + sizeof(Body)*i)->position_[1]); 154 | } 155 | } 156 | 157 | printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n", 158 | time_instantiation, time_simulation, time_simple, checksum); 159 | return 0; 160 | } 161 | -------------------------------------------------------------------------------- /test/soa/benchmarks/nbody/benchmark.h: -------------------------------------------------------------------------------- 1 | // Taken from: https://stackoverflow.com/questions/2808398/easily-measure-elapsed-time 2 | 3 | #ifndef TEST_SOA_BENCHMARK_NBODY_BENCHMARK_H 4 | #define TEST_SOA_BENCHMARK_NBODY_BENCHMARK_H 5 | 6 | #include 7 | #include 8 | 9 | template 10 | struct measure 11 | { 12 | template 13 | static typename TimeT::rep execution(F&& func, Args&&... args) 14 | { 15 | auto start = std::chrono::steady_clock::now(); 16 | std::forward(func)(std::forward(args)...); 17 | auto duration = std::chrono::duration_cast< TimeT> 18 | (std::chrono::steady_clock::now() - start); 19 | return duration.count(); 20 | } 21 | }; 22 | 23 | int r_float2int(float value) { 24 | return *reinterpret_cast(&value); 25 | } 26 | 27 | #endif // TEST_SOA_BENCHMARK_NBODY_BENCHMARK_H 28 | -------------------------------------------------------------------------------- /test/soa/benchmarks/nbody/ikracpp.cc: -------------------------------------------------------------------------------- 1 | // N-Body Simulation 2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm 3 | 4 | #define NDEBUG // No asserts. 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "benchmark.h" 11 | #include "executor/executor.h" 12 | #include "soa/soa.h" 13 | 14 | using namespace std; 15 | using ikra::executor::execute; 16 | using ikra::executor::Iterator; 17 | using ikra::soa::SoaLayout; 18 | 19 | static const int kIterations = 5; 20 | static const int kNumBodies = 8000; 21 | static const double kMaxMass = 1000; 22 | static const double kTimeInterval = 0.5; 23 | 24 | static const double kGravityConstant = 6.673e-11; // gravitational constant 25 | 26 | #define RAND (1.0 * rand() / RAND_MAX) 27 | 28 | class Body : public SoaLayout { 29 | public: 30 | IKRA_INITIALIZE_CLASS 31 | 32 | Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y) 33 | : mass_(mass) { 34 | position_[0] = pos_x; 35 | position_[1] = pos_y; 36 | velocity_[0] = vel_x; 37 | velocity_[1] = vel_y; 38 | this->reset_force(); 39 | } 40 | 41 | double_ mass_; 42 | array_(double, 2, fully_inlined) position_; 43 | array_(double, 2, fully_inlined) velocity_; 44 | array_(double, 2, fully_inlined) force_; 45 | 46 | void add_force(Body* body) { 47 | if (this == body) return; 48 | 49 | double EPS = 0.01; // Softening parameter (just to avoid infinities). 50 | double dx = body->position_[0] - position_[0]; 51 | double dy = body->position_[1] - position_[1]; 52 | double dist = sqrt(dx*dx + dy*dy); 53 | double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS); 54 | force_[0] += F*dx / dist; 55 | force_[1] += F*dy / dist; 56 | } 57 | 58 | void add_force_to_all() { 59 | execute(&Body::add_force, this); 60 | } 61 | 62 | void reset_force() { 63 | force_[0] = 0.0; 64 | force_[1] = 0.0; 65 | } 66 | 67 | void update(double dt) { 68 | velocity_[0] += force_[0]*dt / mass_; 69 | velocity_[1] += force_[1]*dt / mass_; 70 | position_[0] += velocity_[0]*dt; 71 | position_[1] += velocity_[1]*dt; 72 | 73 | if (position_[0] < -1 || position_[0] > 1) { 74 | velocity_[0] = -velocity_[0]; 75 | } 76 | if (position_[1] < -1 || position_[1] > 1) { 77 | velocity_[1] = -velocity_[1]; 78 | } 79 | } 80 | 81 | void codengen_simple_update_templated(double dt) { 82 | velocity_.at<0>() += force_.at<0>()*dt / mass_; 83 | velocity_.at<1>() += force_.at<1>()*dt / mass_; 84 | position_.at<0>() += velocity_.at<0>()*dt; 85 | position_.at<1>() += velocity_.at<1>()*dt; 86 | } 87 | 88 | void codengen_simple_update(double dt) { 89 | for (int i = 0; i < 100; ++i) { 90 | velocity_[0] += force_[0]*dt / mass_; 91 | velocity_[1] += force_[1]*dt / mass_; 92 | position_[0] += velocity_[0]*dt; 93 | position_[1] += velocity_[1]*dt; 94 | } 95 | } 96 | }; 97 | 98 | IKRA_HOST_STORAGE(Body) 99 | 100 | void instantiation() { 101 | srand(42); 102 | 103 | // Create objects. 104 | for (int i = 0; i < kNumBodies; ++i) { 105 | double mass = (RAND/2 + 0.5) * kMaxMass; 106 | double pos_x = RAND*2 - 1; 107 | double pos_y = RAND*2 - 1; 108 | double vel_x = (RAND - 0.5) / 1000; 109 | double vel_y = (RAND - 0.5) / 1000; 110 | new Body(mass, pos_x, pos_y, vel_x, vel_y); 111 | } 112 | } 113 | 114 | void run_simulation() { 115 | for (int i = 0; i < kIterations; ++i) { 116 | // Reset forces. 117 | execute(&Body::reset_force); 118 | 119 | // Update forces. 120 | execute(&Body::add_force_to_all); 121 | 122 | // Update velocities and positions. 123 | execute(&Body::update, kTimeInterval); 124 | } 125 | } 126 | 127 | void run_simple() { 128 | for (int i = 0; i < kIterations*100; ++i) { 129 | execute(&Body::codengen_simple_update, kTimeInterval); 130 | } 131 | } 132 | 133 | int main() { 134 | // Initialize object storage. 135 | Body::initialize_storage(); 136 | 137 | uint64_t time_instantiation = measure<>::execution(instantiation); 138 | uint64_t time_simulation = measure<>::execution(run_simulation); 139 | uint64_t time_simple = measure<>::execution(run_simple); 140 | 141 | // Calculate checksum 142 | int checksum = 11; 143 | for (uintptr_t i = 0; i < kNumBodies; i++) { 144 | checksum += reinterpret_cast( 145 | r_float2int(Body::get(i)->position_[0])); 146 | checksum += reinterpret_cast( 147 | r_float2int(Body::get(i)->position_[1])); 148 | checksum = checksum % 1234567; 149 | 150 | if (i < 10) { 151 | printf("VALUE[%lu] = %f, %f\n", i, 152 | (double) Body::get(i)->position_[0], 153 | (double) Body::get(i)->position_[1]); 154 | } 155 | } 156 | 157 | printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n", 158 | time_instantiation, time_simulation, time_simple, checksum); 159 | return 0; 160 | } 161 | 162 | void codegen_reset_force(Body* body) { 163 | body->reset_force(); 164 | } 165 | 166 | void codegen_add_force(Body* self, Body* other) { 167 | self->add_force(other); 168 | } 169 | 170 | void codegen_update(Body* self, double dt) { 171 | self->update(dt); 172 | } 173 | 174 | void codegen_reset_force_manual(Body* body) { 175 | body->force_[0] = 0.0; 176 | body->force_[1] = 0.0; 177 | } 178 | 179 | void codegen_reset_force_half(Body* body) { 180 | body->force_[0] = 0.0; 181 | } 182 | 183 | void codegen_reset_mass(Body* body) { 184 | body->mass_ = 0.0; 185 | } 186 | 187 | void codengen_simple_update(Body* body, double dt) { 188 | body->codengen_simple_update(dt); 189 | } 190 | -------------------------------------------------------------------------------- /test/soa/benchmarks/nbody/ikracpp_field.cc: -------------------------------------------------------------------------------- 1 | // N-Body Simulation 2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm 3 | 4 | #define NDEBUG // No asserts. 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "benchmark.h" 11 | #include "executor/executor.h" 12 | #include "soa/soa.h" 13 | 14 | using namespace std; 15 | using ikra::executor::execute; 16 | using ikra::executor::Iterator; 17 | using ikra::soa::SoaLayout; 18 | 19 | static const int kIterations = 5; 20 | static const int kNumBodies = 8000; 21 | static const double kMaxMass = 1000; 22 | static const double kTimeInterval = 0.5; 23 | 24 | static const double kGravityConstant = 6.673e-11; // gravitational constant 25 | 26 | #define RAND (1.0 * rand() / RAND_MAX) 27 | 28 | class Body : public SoaLayout { 29 | public: 30 | IKRA_INITIALIZE_CLASS 31 | 32 | Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y) 33 | : mass_(mass) { 34 | position0_ = pos_x; 35 | position1_ = pos_y; 36 | velocity0_ = vel_x; 37 | velocity1_ = vel_y; 38 | this->reset_force(); 39 | } 40 | 41 | double_ mass_; 42 | double_ position0_; 43 | double_ position1_; 44 | double_ velocity0_; 45 | double_ velocity1_; 46 | double_ force0_; 47 | double_ force1_; 48 | 49 | void add_force(Body* body) { 50 | if (this == body) return; 51 | 52 | double EPS = 0.01; // Softening parameter (just to avoid infinities). 53 | double dx = body->position0_ - position0_; 54 | double dy = body->position1_ - position1_; 55 | double dist = sqrt(dx*dx + dy*dy); 56 | double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS); 57 | force0_ += F*dx / dist; 58 | force1_ += F*dy / dist; 59 | } 60 | 61 | void add_force_to_all() { 62 | execute(&Body::add_force, this); 63 | } 64 | 65 | void reset_force() { 66 | force0_ = 0.0; 67 | force1_ = 0.0; 68 | } 69 | 70 | void update(double dt) { 71 | velocity0_ += force0_*dt / mass_; 72 | velocity1_ += force1_*dt / mass_; 73 | position0_ += velocity0_*dt; 74 | position1_ += velocity1_*dt; 75 | 76 | if (position0_ < -1 || position0_ > 1) { 77 | velocity0_ = -velocity0_; 78 | } 79 | if (position1_ < -1 || position1_ > 1) { 80 | velocity1_ = -velocity1_; 81 | } 82 | } 83 | 84 | void codengen_simple_update(double dt) { 85 | for (int i = 0; i < 100; ++i) { 86 | velocity0_ += force0_*dt / mass_; 87 | velocity1_ += force1_*dt / mass_; 88 | position0_ += velocity0_*dt; 89 | position1_ += velocity1_*dt; 90 | } 91 | } 92 | }; 93 | 94 | IKRA_HOST_STORAGE(Body) 95 | 96 | void instantiation() { 97 | srand(42); 98 | 99 | // Create objects. 100 | for (int i = 0; i < kNumBodies; ++i) { 101 | double mass = (RAND/2 + 0.5) * kMaxMass; 102 | double pos_x = RAND*2 - 1; 103 | double pos_y = RAND*2 - 1; 104 | double vel_x = (RAND - 0.5) / 1000; 105 | double vel_y = (RAND - 0.5) / 1000; 106 | new Body(mass, pos_x, pos_y, vel_x, vel_y); 107 | } 108 | } 109 | 110 | void run_simulation() { 111 | for (int i = 0; i < kIterations; ++i) { 112 | // Reset forces. 113 | execute(&Body::reset_force); 114 | 115 | // Update forces. 116 | execute(&Body::add_force_to_all); 117 | 118 | // Update velocities and positions. 119 | execute(&Body::update, kTimeInterval); 120 | } 121 | } 122 | 123 | void run_simple() { 124 | for (int i = 0; i < kIterations*100; ++i) { 125 | execute(&Body::codengen_simple_update, kTimeInterval); 126 | } 127 | } 128 | 129 | int main() { 130 | // Initialize object storage. 131 | Body::initialize_storage(); 132 | 133 | uint64_t time_instantiation = measure<>::execution(instantiation); 134 | uint64_t time_simulation = measure<>::execution(run_simulation); 135 | uint64_t time_simple = measure<>::execution(run_simple); 136 | 137 | // Calculate checksum 138 | int checksum = 11; 139 | for (uintptr_t i = 0; i < kNumBodies; i++) { 140 | checksum += reinterpret_cast( 141 | r_float2int(Body::get(i)->position0_)); 142 | checksum += reinterpret_cast( 143 | r_float2int(Body::get(i)->position1_)); 144 | checksum = checksum % 1234567; 145 | 146 | if (i < 10) { 147 | printf("VALUE[%lu] = %f, %f\n", i, 148 | (double) Body::get(i)->position0_, 149 | (double) Body::get(i)->position1_); 150 | } 151 | } 152 | 153 | printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n", 154 | time_instantiation, time_simulation, time_simple, checksum); 155 | return 0; 156 | } 157 | 158 | void codegen_reset_force(Body* body) { 159 | body->reset_force(); 160 | } 161 | 162 | void codegen_add_force(Body* self, Body* other) { 163 | self->add_force(other); 164 | } 165 | 166 | void codegen_update(Body* self, double dt) { 167 | self->update(dt); 168 | } 169 | 170 | void codegen_reset_force_manual(Body* body) { 171 | body->force0_ = 0.0; 172 | body->force1_ = 0.0; 173 | } 174 | 175 | void codegen_reset_force_half(Body* body) { 176 | body->force0_ = 0.0; 177 | } 178 | 179 | void codegen_reset_mass(Body* body) { 180 | body->mass_ = 0.0; 181 | } 182 | 183 | void codengen_simple_update(Body* body, double dt) { 184 | body->codengen_simple_update(dt); 185 | } 186 | -------------------------------------------------------------------------------- /test/soa/benchmarks/nbody/ikracpp_inversed.cc: -------------------------------------------------------------------------------- 1 | // N-Body Simulation 2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm 3 | 4 | #define NDEBUG // No asserts. 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "benchmark.h" 11 | #include "executor/executor.h" 12 | #include "soa/soa.h" 13 | 14 | using namespace std; 15 | using ikra::executor::execute; 16 | using ikra::executor::Iterator; 17 | using ikra::soa::SoaLayout; 18 | 19 | static const int kIterations = 5; 20 | static const int kNumBodies = 8000; 21 | static const double kMaxMass = 1000; 22 | static const double kTimeInterval = 0.5; 23 | 24 | static const double kGravityConstant = 6.673e-11; // gravitational constant 25 | 26 | #define RAND (1.0 * rand() / RAND_MAX) 27 | 28 | class Body : public SoaLayout { 29 | public: 30 | IKRA_INITIALIZE_CLASS 31 | 32 | Body(double mass, double pos_x, double pos_y, double vel_x, double vel_y) 33 | : mass_(mass) { 34 | position_[0] = pos_x; 35 | position_[1] = pos_y; 36 | velocity_[0] = vel_x; 37 | velocity_[1] = vel_y; 38 | this->reset_force(); 39 | } 40 | 41 | double_ mass_; 42 | array_(double, 2, fully_inlined) position_; 43 | array_(double, 2, fully_inlined) velocity_; 44 | array_(double, 2, fully_inlined) force_; 45 | 46 | void add_force(Body* body) { 47 | if (this == body) return; 48 | 49 | double EPS = 0.01; // Softening parameter (just to avoid infinities). 50 | double dx = body->position_[0] - position_[0]; 51 | double dy = body->position_[1] - position_[1]; 52 | double dist = sqrt(dx*dx + dy*dy); 53 | double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS); 54 | force_[0] += F*dx / dist; 55 | force_[1] += F*dy / dist; 56 | } 57 | 58 | void add_all_forces_to_this() { 59 | for (uintptr_t i = 0; i < kNumBodies; ++i) { 60 | this->add_force(Body::get(i)); 61 | } 62 | } 63 | 64 | void reset_force() { 65 | force_[0] = 0.0; 66 | force_[1] = 0.0; 67 | } 68 | 69 | void update(double dt) { 70 | velocity_[0] += force_[0]*dt / mass_; 71 | velocity_[1] += force_[1]*dt / mass_; 72 | position_[0] += velocity_[0]*dt; 73 | position_[1] += velocity_[1]*dt; 74 | 75 | if (position_[0] < -1 || position_[0] > 1) { 76 | velocity_[0] = -velocity_[0]; 77 | } 78 | if (position_[1] < -1 || position_[1] > 1) { 79 | velocity_[1] = -velocity_[1]; 80 | } 81 | } 82 | 83 | void codengen_simple_update_templated(double dt) { 84 | velocity_.at<0>() += force_.at<0>()*dt / mass_; 85 | velocity_.at<1>() += force_.at<1>()*dt / mass_; 86 | position_.at<0>() += velocity_.at<0>()*dt; 87 | position_.at<1>() += velocity_.at<1>()*dt; 88 | } 89 | 90 | void codengen_simple_update(double dt) { 91 | for (int i = 0; i < 100; ++i) { 92 | velocity_[0] += force_[0]*dt / mass_; 93 | velocity_[1] += force_[1]*dt / mass_; 94 | position_[0] += velocity_[0]*dt; 95 | position_[1] += velocity_[1]*dt; 96 | } 97 | } 98 | }; 99 | 100 | IKRA_HOST_STORAGE(Body) 101 | 102 | void instantiation() { 103 | srand(42); 104 | 105 | // Create objects. 106 | for (int i = 0; i < kNumBodies; ++i) { 107 | double mass = (RAND/2 + 0.5) * kMaxMass; 108 | double pos_x = RAND*2 - 1; 109 | double pos_y = RAND*2 - 1; 110 | double vel_x = (RAND - 0.5) / 1000; 111 | double vel_y = (RAND - 0.5) / 1000; 112 | new Body(mass, pos_x, pos_y, vel_x, vel_y); 113 | } 114 | } 115 | 116 | void run_simulation() { 117 | for (int i = 0; i < kIterations; ++i) { 118 | // Reset forces. 119 | execute(&Body::reset_force); 120 | 121 | // Update forces. 122 | execute(&Body::add_all_forces_to_this); 123 | 124 | // Update velocities and positions. 125 | execute(&Body::update, kTimeInterval); 126 | } 127 | } 128 | 129 | void run_simple() { 130 | for (int i = 0; i < kIterations*100; ++i) { 131 | execute(&Body::codengen_simple_update, kTimeInterval); 132 | } 133 | } 134 | 135 | int main() { 136 | // Initialize object storage. 137 | Body::initialize_storage(); 138 | 139 | uint64_t time_instantiation = measure<>::execution(instantiation); 140 | uint64_t time_simulation = measure<>::execution(run_simulation); 141 | uint64_t time_simple = measure<>::execution(run_simple); 142 | 143 | // Calculate checksum 144 | int checksum = 11; 145 | for (uintptr_t i = 0; i < kNumBodies; i++) { 146 | checksum += reinterpret_cast( 147 | r_float2int(Body::get(i)->position_[0])); 148 | checksum += reinterpret_cast( 149 | r_float2int(Body::get(i)->position_[1])); 150 | checksum = checksum % 1234567; 151 | 152 | if (i < 10) { 153 | printf("VALUE[%lu] = %f, %f\n", i, 154 | (double) Body::get(i)->position_[0], 155 | (double) Body::get(i)->position_[1]); 156 | } 157 | } 158 | 159 | printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n", 160 | time_instantiation, time_simulation, time_simple, checksum); 161 | return 0; 162 | } 163 | 164 | void codegen_reset_force(Body* body) { 165 | body->reset_force(); 166 | } 167 | 168 | void codegen_add_force(Body* self, Body* other) { 169 | self->add_force(other); 170 | } 171 | 172 | void codegen_update(Body* self, double dt) { 173 | self->update(dt); 174 | } 175 | 176 | void codegen_reset_force_manual(Body* body) { 177 | body->force_[0] = 0.0; 178 | body->force_[1] = 0.0; 179 | } 180 | 181 | void codegen_reset_force_half(Body* body) { 182 | body->force_[0] = 0.0; 183 | } 184 | 185 | void codegen_reset_mass(Body* body) { 186 | body->mass_ = 0.0; 187 | } 188 | 189 | void codengen_simple_update(Body* body, double dt) { 190 | body->codengen_simple_update(dt); 191 | } 192 | -------------------------------------------------------------------------------- /test/soa/benchmarks/nbody/ikracpp_inversed_field_gpu.cu: -------------------------------------------------------------------------------- 1 | // N-Body Simulation 2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm 3 | 4 | #define NDEBUG // No asserts. 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "benchmark.h" 11 | #include "executor/cuda_executor.h" 12 | #include "executor/executor.h" 13 | #include "soa/soa.h" 14 | 15 | using namespace std; 16 | using ikra::executor::execute; 17 | using ikra::executor::Iterator; 18 | using ikra::soa::SoaLayout; 19 | 20 | static const int kIterations = 5; 21 | static const int kNumBodies = 8000; 22 | static const double kMaxMass = 1000; 23 | static const double kTimeInterval = 0.5; 24 | 25 | static const double kGravityConstant = 6.673e-11; // gravitational constant 26 | 27 | #define RAND (1.0 * rand() / RAND_MAX) 28 | 29 | class Body : public SoaLayout { 30 | public: 31 | IKRA_INITIALIZE_CLASS 32 | 33 | Body(double mass, double pos_x, double pos_y, 34 | double vel_x, double vel_y) : mass_(mass) { 35 | position0_ = pos_x; 36 | position1_ = pos_y; 37 | velocity0_ = vel_x; 38 | velocity1_ = vel_y; 39 | this->reset_force(); 40 | } 41 | 42 | double_ mass_; 43 | double_ position0_; 44 | double_ position1_; 45 | double_ velocity0_; 46 | double_ velocity1_; 47 | double_ force0_; 48 | double_ force1_; 49 | 50 | __device__ void add_force(Body* body) { 51 | if (this == body) return; 52 | 53 | double EPS = 0.01; // Softening parameter (just to avoid infinities). 54 | double dx = body->position0_ - position0_; 55 | double dy = body->position1_ - position1_; 56 | double dist = sqrt(dx*dx + dy*dy); 57 | double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS); 58 | force0_ += F*dx / dist; 59 | force1_ += F*dy / dist; 60 | } 61 | 62 | __device__ void add_all_forces_to_this() { 63 | for (uintptr_t i = 0; i < kNumBodies; ++i) { 64 | this->add_force(Body::get(i)); 65 | } 66 | } 67 | 68 | __host__ __device__ void reset_force() { 69 | force0_ = 0.0; 70 | force1_ = 0.0; 71 | } 72 | 73 | __device__ void update(double dt) { 74 | velocity0_ += force0_*dt / mass_; 75 | velocity1_ += force1_*dt / mass_; 76 | position0_ += velocity0_*dt; 77 | position1_ += velocity1_*dt; 78 | 79 | if (position0_ < -1 || position0_ > 1) { 80 | velocity0_ = -velocity0_; 81 | } 82 | if (position1_ < -1 || position1_ > 1) { 83 | velocity1_ = -velocity1_; 84 | } 85 | } 86 | 87 | __device__ void codengen_simple_update(double dt) { 88 | for (int i = 0; i < 1000; ++i) { 89 | velocity0_ += force0_*dt / mass_; 90 | velocity1_ += force1_*dt / mass_; 91 | position0_ += velocity0_*dt; 92 | position1_ += velocity1_*dt; 93 | } 94 | } 95 | }; 96 | 97 | IKRA_DEVICE_STORAGE(Body) 98 | 99 | // TODO: Benchmark instance creation on GPU. 100 | void instantiation() { 101 | srand(42); 102 | 103 | // Create objects. 104 | for (int i = 0; i < kNumBodies; ++i) { 105 | double mass = (RAND/2 + 0.5) * kMaxMass; 106 | double pos_x = RAND*2 - 1; 107 | double pos_y = RAND*2 - 1; 108 | double vel_x = (RAND - 0.5) / 1000; 109 | double vel_y = (RAND - 0.5) / 1000; 110 | new Body(mass, pos_x, pos_y, vel_x, vel_y); 111 | } 112 | } 113 | 114 | void run_simulation() { 115 | for (int i = 0; i < kIterations; ++i) { 116 | // Reset forces. 117 | cuda_execute(&Body::reset_force); 118 | 119 | // Update forces. 120 | cuda_execute(&Body::add_all_forces_to_this); 121 | 122 | // Update velocities and positions. 123 | cuda_execute(&Body::update, kTimeInterval); 124 | } 125 | } 126 | 127 | void run_simple() { 128 | for (int i = 0; i < kIterations*100; ++i) { 129 | cuda_execute(&Body::codengen_simple_update, kTimeInterval); 130 | } 131 | } 132 | 133 | int main() { 134 | // Initialize object storage. 135 | Body::initialize_storage(); 136 | 137 | uint64_t time_instantiation = measure<>::execution(instantiation); 138 | gpuErrchk(cudaPeekAtLastError()); 139 | 140 | uint64_t time_simulation = measure<>::execution(run_simulation); 141 | gpuErrchk(cudaPeekAtLastError()); 142 | 143 | uint64_t time_simple = measure<>::execution(run_simple); 144 | gpuErrchk(cudaPeekAtLastError()); 145 | 146 | // Calculate checksum 147 | int checksum = 11; 148 | for (uintptr_t i = 0; i < kNumBodies; i++) { 149 | checksum += reinterpret_cast( 150 | r_float2int(Body::get(i)->position0_)); 151 | checksum += reinterpret_cast( 152 | r_float2int(Body::get(i)->position1_)); 153 | checksum = checksum % 1234567; 154 | 155 | if (i < 10) { 156 | printf("VALUE[%lu] = %f, %f\n", i, 157 | (double) Body::get(i)->position0_, 158 | (double) Body::get(i)->position1_); 159 | } 160 | } 161 | 162 | printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n", 163 | time_instantiation, time_simulation, time_simple, checksum); 164 | return 0; 165 | } 166 | 167 | __global__ void codegen_reset_force(Body* body) { 168 | body->reset_force(); 169 | } 170 | 171 | __global__ void codegen_add_force(Body* self, Body* other) { 172 | self->add_force(other); 173 | } 174 | 175 | __global__ void codegen_update(Body* self, double dt) { 176 | self->update(dt); 177 | } 178 | 179 | __global__ void codegen_reset_force_manual(Body* body) { 180 | body->force0_ = 0.0; 181 | body->force1_ = 0.0; 182 | } 183 | 184 | __global__ void codegen_reset_force_half(Body* body) { 185 | body->force0_ = 0.0; 186 | } 187 | 188 | __global__ void codegen_reset_mass(Body* body) { 189 | body->mass_ = 0.0; 190 | } 191 | 192 | __global__ void codengen_simple_update(Body* body, double dt) { 193 | body->codengen_simple_update(dt); 194 | } 195 | -------------------------------------------------------------------------------- /test/soa/benchmarks/nbody/ikracpp_inversed_gpu.cu: -------------------------------------------------------------------------------- 1 | // N-Body Simulation 2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm 3 | 4 | #define NDEBUG // No asserts. 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "benchmark.h" 11 | #include "executor/cuda_executor.h" 12 | #include "executor/executor.h" 13 | #include "soa/soa.h" 14 | 15 | using namespace std; 16 | using ikra::executor::execute; 17 | using ikra::executor::Iterator; 18 | using ikra::soa::SoaLayout; 19 | 20 | static const int kIterations = 5; 21 | static const int kNumBodies = 8000; 22 | static const double kMaxMass = 1000; 23 | static const double kTimeInterval = 0.5; 24 | 25 | static const double kGravityConstant = 6.673e-11; // gravitational constant 26 | 27 | #define RAND (1.0 * rand() / RAND_MAX) 28 | 29 | class Body : public SoaLayout { 30 | public: 31 | IKRA_INITIALIZE_CLASS 32 | 33 | Body(double mass, double pos_x, double pos_y, 34 | double vel_x, double vel_y) : mass_(mass) { 35 | position_[0] = pos_x; 36 | position_[1] = pos_y; 37 | velocity_[0] = vel_x; 38 | velocity_[1] = vel_y; 39 | this->reset_force(); 40 | } 41 | 42 | double_ mass_; 43 | array_(double, 2, fully_inlined) position_; 44 | array_(double, 2, fully_inlined) velocity_; 45 | array_(double, 2, fully_inlined) force_; 46 | 47 | __device__ void add_force(Body* body) { 48 | if (this == body) return; 49 | 50 | double EPS = 0.01; // Softening parameter (just to avoid infinities). 51 | double dx = body->position_[0] - position_[0]; 52 | double dy = body->position_[1] - position_[1]; 53 | double dist = sqrt(dx*dx + dy*dy); 54 | double F = kGravityConstant*mass_*body->mass_ / (dist*dist + EPS*EPS); 55 | force_[0] += F*dx / dist; 56 | force_[1] += F*dy / dist; 57 | } 58 | 59 | __device__ void add_all_forces_to_this() { 60 | for (uintptr_t i = 0; i < kNumBodies; ++i) { 61 | this->add_force(Body::get(i)); 62 | } 63 | } 64 | 65 | __host__ __device__ void reset_force() { 66 | force_[0] = 0.0; 67 | force_[1] = 0.0; 68 | } 69 | 70 | __device__ void update(double dt) { 71 | velocity_[0] += force_[0]*dt / mass_; 72 | velocity_[1] += force_[1]*dt / mass_; 73 | position_[0] += velocity_[0]*dt; 74 | position_[1] += velocity_[1]*dt; 75 | 76 | if (position_[0] < -1 || position_[0] > 1) { 77 | velocity_[0] = -velocity_[0]; 78 | } 79 | if (position_[1] < -1 || position_[1] > 1) { 80 | velocity_[1] = -velocity_[1]; 81 | } 82 | } 83 | 84 | __device__ void codengen_simple_update(double dt) { 85 | for (int i = 0; i < 1000; ++i) { 86 | velocity_[0] += force_[0]*dt / mass_; 87 | velocity_[1] += force_[1]*dt / mass_; 88 | position_[0] += velocity_[0]*dt; 89 | position_[1] += velocity_[1]*dt; 90 | } 91 | } 92 | }; 93 | 94 | IKRA_DEVICE_STORAGE(Body) 95 | 96 | // TODO: Benchmark instance creation on GPU. 97 | void instantiation() { 98 | srand(42); 99 | 100 | // Create objects. 101 | for (int i = 0; i < kNumBodies; ++i) { 102 | double mass = (RAND/2 + 0.5) * kMaxMass; 103 | double pos_x = RAND*2 - 1; 104 | double pos_y = RAND*2 - 1; 105 | double vel_x = (RAND - 0.5) / 1000; 106 | double vel_y = (RAND - 0.5) / 1000; 107 | new Body(mass, pos_x, pos_y, vel_x, vel_y); 108 | } 109 | } 110 | 111 | void run_simulation() { 112 | for (int i = 0; i < kIterations; ++i) { 113 | // Reset forces. 114 | cuda_execute(&Body::reset_force); 115 | 116 | // Update forces. 117 | cuda_execute(&Body::add_all_forces_to_this); 118 | 119 | // Update velocities and positions. 120 | cuda_execute(&Body::update, kTimeInterval); 121 | } 122 | } 123 | 124 | void run_simple() { 125 | for (int i = 0; i < kIterations*100; ++i) { 126 | cuda_execute(&Body::codengen_simple_update, kTimeInterval); 127 | } 128 | } 129 | 130 | int main() { 131 | // Initialize object storage. 132 | Body::initialize_storage(); 133 | 134 | uint64_t time_instantiation = measure<>::execution(instantiation); 135 | gpuErrchk(cudaPeekAtLastError()); 136 | 137 | uint64_t time_simulation = measure<>::execution(run_simulation); 138 | gpuErrchk(cudaPeekAtLastError()); 139 | 140 | uint64_t time_simple = measure<>::execution(run_simple); 141 | gpuErrchk(cudaPeekAtLastError()); 142 | 143 | // Calculate checksum 144 | int checksum = 11; 145 | for (uintptr_t i = 0; i < kNumBodies; i++) { 146 | checksum += reinterpret_cast( 147 | r_float2int(Body::get(i)->position_[0])); 148 | checksum += reinterpret_cast( 149 | r_float2int(Body::get(i)->position_[1])); 150 | checksum = checksum % 1234567; 151 | 152 | if (i < 10) { 153 | printf("VALUE[%lu] = %f, %f\n", i, 154 | (double) Body::get(i)->position_[0], 155 | (double) Body::get(i)->position_[1]); 156 | } 157 | } 158 | 159 | printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\n checksum: %i\n", 160 | time_instantiation, time_simulation, time_simple, checksum); 161 | return 0; 162 | } 163 | 164 | __global__ void codegen_reset_force(Body* body) { 165 | body->reset_force(); 166 | } 167 | 168 | __global__ void codegen_add_force(Body* self, Body* other) { 169 | self->add_force(other); 170 | } 171 | 172 | __global__ void codegen_update(Body* self, double dt) { 173 | self->update(dt); 174 | } 175 | 176 | __global__ void codegen_reset_force_manual(Body* body) { 177 | body->force_[0] = 0.0; 178 | body->force_[1] = 0.0; 179 | } 180 | 181 | __global__ void codegen_reset_force_half(Body* body) { 182 | body->force_[0] = 0.0; 183 | } 184 | 185 | __global__ void codegen_reset_mass(Body* body) { 186 | body->mass_ = 0.0; 187 | } 188 | 189 | __global__ void codengen_simple_update(Body* body, double dt) { 190 | body->codengen_simple_update(dt); 191 | } 192 | -------------------------------------------------------------------------------- /test/soa/benchmarks/nbody/soa.cc: -------------------------------------------------------------------------------- 1 | // N-Body Simulation 2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "benchmark.h" 9 | 10 | using namespace std; 11 | using IdType = uintptr_t; 12 | 13 | static const int kIterations = 5; 14 | static const int kNumBodies = 8000; 15 | static const double kMaxMass = 1000; 16 | static const double kTimeInterval = 0.5; 17 | 18 | static const double kGravityConstant = 6.673e-11; // gravitational constant 19 | 20 | double a_Body_mass[kNumBodies]; 21 | double a_Body_position_0[kNumBodies]; 22 | double a_Body_position_1[kNumBodies]; 23 | double a_Body_velocity_0[kNumBodies]; 24 | double a_Body_velocity_1[kNumBodies]; 25 | double a_Body_force_0[kNumBodies]; 26 | double a_Body_force_1[kNumBodies]; 27 | 28 | #define RAND (1.0 * rand() / RAND_MAX) 29 | 30 | void Body_initialize(IdType i, double mass, double pos_x, double pos_y, 31 | double vel_x, double vel_y) { 32 | a_Body_mass[i] = mass; 33 | a_Body_position_0[i] = pos_x; 34 | a_Body_position_1[i] = pos_y; 35 | a_Body_velocity_0[i] = vel_x; 36 | a_Body_velocity_1[i] = vel_y; 37 | a_Body_force_0[i] = 0.0; 38 | a_Body_force_1[i] = 0.0; 39 | } 40 | 41 | void Body_add_force(IdType self, IdType body) { 42 | if (self == body) return; 43 | 44 | double EPS = 0.01; // Softening parameter (just to avoid infinities). 45 | double dx = a_Body_position_0[body] - a_Body_position_0[self]; 46 | double dy = a_Body_position_1[body] - a_Body_position_1[self]; 47 | double dist = sqrt(dx*dx + dy*dy); 48 | double F = kGravityConstant*a_Body_mass[self]*a_Body_mass[body] 49 | / (dist*dist + EPS*EPS); 50 | a_Body_force_0[self] += F*dx / dist; 51 | a_Body_force_1[self] += F*dy / dist; 52 | } 53 | 54 | void Body_add_force_to_all(IdType self) { 55 | for (IdType i = 0; i < kNumBodies; ++i) { 56 | Body_add_force(i, self); 57 | } 58 | } 59 | 60 | void Body_reset_force(IdType self) { 61 | a_Body_force_0[self] = 0.0; 62 | a_Body_force_1[self] = 0.0; 63 | } 64 | 65 | void Body_update(IdType self, double dt) { 66 | a_Body_velocity_0[self] += a_Body_force_0[self]*dt / a_Body_mass[self]; 67 | a_Body_velocity_1[self] += a_Body_force_1[self]*dt / a_Body_mass[self]; 68 | a_Body_position_0[self] += a_Body_velocity_0[self]*dt; 69 | a_Body_position_1[self] += a_Body_velocity_1[self]*dt; 70 | 71 | if (a_Body_position_0[self] < -1 || a_Body_position_0[self] > 1) { 72 | a_Body_velocity_0[self] = -a_Body_velocity_0[self]; 73 | } 74 | if (a_Body_position_1[self] < -1 || a_Body_position_1[self] > 1) { 75 | a_Body_velocity_1[self] = -a_Body_velocity_1[self]; 76 | } 77 | } 78 | 79 | void Body_codegen_simple_update(IdType self, double dt) { 80 | for (int i = 0; i < 100; ++i) { 81 | a_Body_velocity_0[self] += a_Body_force_0[self]*dt / a_Body_mass[self]; 82 | a_Body_velocity_1[self] += a_Body_force_1[self]*dt / a_Body_mass[self]; 83 | a_Body_position_0[self] += a_Body_velocity_0[self]*dt; 84 | a_Body_position_1[self] += a_Body_velocity_1[self]*dt; 85 | } 86 | } 87 | 88 | void instantiation() { 89 | srand(42); 90 | 91 | // Create objects. 92 | for (IdType i = 0; i < kNumBodies; ++i) { 93 | double mass = (RAND/2 + 0.5) * kMaxMass; 94 | double pos_x = RAND*2 - 1; 95 | double pos_y = RAND*2 - 1; 96 | double vel_x = (RAND - 0.5) / 1000; 97 | double vel_y = (RAND - 0.5) / 1000; 98 | Body_initialize(i, mass, pos_x, pos_y, vel_x, vel_y); 99 | } 100 | } 101 | 102 | void run_simulation() { 103 | for (int i = 0; i < kIterations; ++i) { 104 | // Reset forces. 105 | for (IdType j = 0; j < kNumBodies; ++j) { 106 | Body_reset_force(j); 107 | } 108 | 109 | // Update forces. 110 | for (IdType j = 0; j < kNumBodies; ++j) { 111 | Body_add_force_to_all(j); 112 | } 113 | 114 | // Update velocities and positions. 115 | for (IdType j = 0; j < kNumBodies; ++j) { 116 | Body_update(j, kTimeInterval); 117 | } 118 | } 119 | } 120 | 121 | void run_simple() { 122 | for (int i = 0; i < kIterations*100; ++i) { 123 | for (IdType j = 0; j < kNumBodies; ++j) { 124 | Body_codegen_simple_update(j, kTimeInterval); 125 | } 126 | } 127 | } 128 | 129 | int main() { 130 | uint64_t time_instantiation = measure<>::execution(instantiation); 131 | uint64_t time_simulation = measure<>::execution(run_simulation); 132 | uint64_t time_simple = measure<>::execution(run_simple); 133 | 134 | // Calculate checksum 135 | int checksum = 11; 136 | for (uintptr_t i = 0; i < kNumBodies; i++) { 137 | checksum += reinterpret_cast(r_float2int(a_Body_position_0[i])); 138 | checksum += reinterpret_cast(r_float2int(a_Body_position_1[i])); 139 | checksum = checksum % 1234567; 140 | 141 | if (i < 10) { 142 | printf("VALUE[%lu] = %f, %f\n", i, 143 | a_Body_position_0[i], 144 | a_Body_position_1[i]); 145 | } 146 | } 147 | 148 | printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\nchecksum: %i\n", 149 | time_instantiation, time_simulation, time_simple, checksum); 150 | return 0; 151 | } 152 | -------------------------------------------------------------------------------- /test/soa/benchmarks/nbody/soa_inversed.cu: -------------------------------------------------------------------------------- 1 | // N-Body Simulation 2 | // Code adapted from: http://physics.princeton.edu/~fpretori/Nbody/code.htm 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "benchmark.h" 9 | #include "executor/cuda_executor.h" 10 | 11 | using namespace std; 12 | using IdType = uintptr_t; 13 | 14 | static const int kIterations = 5; 15 | static const int kNumBodies = 8000; 16 | static const double kMaxMass = 1000; 17 | static const double kTimeInterval = 0.5; 18 | 19 | static const double kGravityConstant = 6.673e-11; // gravitational constant 20 | 21 | struct Container { 22 | double a_Body_mass[kNumBodies]; 23 | double a_Body_position_0[kNumBodies]; 24 | double a_Body_position_1[kNumBodies]; 25 | double a_Body_velocity_0[kNumBodies]; 26 | double a_Body_velocity_1[kNumBodies]; 27 | double a_Body_force_0[kNumBodies]; 28 | double a_Body_force_1[kNumBodies]; 29 | }; 30 | 31 | __device__ Container d_container; 32 | Container h_container; 33 | 34 | #define RAND (1.0 * rand() / RAND_MAX) 35 | 36 | void Body_initialize(IdType i, double mass, double pos_x, double pos_y, 37 | double vel_x, double vel_y) { 38 | h_container.a_Body_mass[i] = mass; 39 | h_container.a_Body_position_0[i] = pos_x; 40 | h_container.a_Body_position_1[i] = pos_y; 41 | h_container.a_Body_velocity_0[i] = vel_x; 42 | h_container.a_Body_velocity_1[i] = vel_y; 43 | h_container.a_Body_force_0[i] = 0.0; 44 | h_container.a_Body_force_1[i] = 0.0; 45 | } 46 | 47 | __device__ void Body_add_force(IdType self, IdType body) { 48 | if (self == body) return; 49 | double EPS = 0.01; // Softening parameter (just to avoid infinities). 50 | double dx = d_container.a_Body_position_0[body] 51 | - d_container.a_Body_position_0[self]; 52 | double dy = d_container.a_Body_position_1[body] 53 | - d_container.a_Body_position_1[self]; 54 | double dist = sqrt(dx*dx + dy*dy); 55 | double F = kGravityConstant 56 | * d_container.a_Body_mass[self]*d_container.a_Body_mass[body] 57 | / (dist*dist + EPS*EPS); 58 | d_container.a_Body_force_0[self] += F*dx / dist; 59 | d_container.a_Body_force_1[self] += F*dy / dist; 60 | } 61 | 62 | __device__ void Body_add_all_forces_to_this(IdType self) { 63 | for (IdType i = 0; i < kNumBodies; ++i) { 64 | Body_add_force(self, i); 65 | } 66 | } 67 | 68 | __global__ void kernel_Body_add_all_forces_to_this() { 69 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 70 | if (tid < kNumBodies) { 71 | Body_add_all_forces_to_this(tid); 72 | } 73 | } 74 | 75 | __device__ void Body_reset_force(IdType self) { 76 | d_container.a_Body_force_0[self] = 0.0; 77 | d_container.a_Body_force_1[self] = 0.0; 78 | } 79 | 80 | __global__ void kernel_Body_reset_force() { 81 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 82 | if (tid < kNumBodies) { 83 | Body_reset_force(tid); 84 | } 85 | } 86 | 87 | __device__ void Body_update(IdType self, double dt) { 88 | d_container.a_Body_velocity_0[self] += 89 | d_container.a_Body_force_0[self]*dt / d_container.a_Body_mass[self]; 90 | d_container.a_Body_velocity_1[self] += 91 | d_container.a_Body_force_1[self]*dt / d_container.a_Body_mass[self]; 92 | d_container.a_Body_position_0[self] += 93 | d_container.a_Body_velocity_0[self]*dt; 94 | d_container.a_Body_position_1[self] += 95 | d_container.a_Body_velocity_1[self]*dt; 96 | 97 | if (d_container.a_Body_position_0[self] < -1 98 | || d_container.a_Body_position_0[self] > 1) { 99 | d_container.a_Body_velocity_0[self] = -d_container.a_Body_velocity_0[self]; 100 | } 101 | if (d_container.a_Body_position_1[self] < -1 102 | || d_container.a_Body_position_1[self] > 1) { 103 | d_container.a_Body_velocity_1[self] = -d_container.a_Body_velocity_1[self]; 104 | } 105 | } 106 | 107 | __global__ void kernel_Body_update(double dt) { 108 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 109 | if (tid < kNumBodies) { 110 | Body_update(tid, dt); 111 | } 112 | } 113 | 114 | __device__ void Body_codegen_simple_update(IdType self, double dt) { 115 | for (int i = 0; i < 1000; ++i) { 116 | d_container.a_Body_velocity_0[self] += 117 | d_container.a_Body_force_0[self]*dt / d_container.a_Body_mass[self]; 118 | d_container.a_Body_velocity_1[self] += 119 | d_container.a_Body_force_1[self]*dt / d_container.a_Body_mass[self]; 120 | d_container.a_Body_position_0[self] += 121 | d_container.a_Body_velocity_0[self]*dt; 122 | d_container.a_Body_position_1[self] += 123 | d_container.a_Body_velocity_1[self]*dt; 124 | } 125 | } 126 | 127 | __global__ void kernel_Body_codegen_simple_update(double dt) { 128 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 129 | if (tid < kNumBodies) { 130 | Body_codegen_simple_update(tid, dt); 131 | } 132 | } 133 | 134 | void instantiation() { 135 | srand(42); 136 | 137 | // Create objects. 138 | for (IdType i = 0; i < kNumBodies; ++i) { 139 | double mass = (RAND/2 + 0.5) * kMaxMass; 140 | double pos_x = RAND*2 - 1; 141 | double pos_y = RAND*2 - 1; 142 | double vel_x = (RAND - 0.5) / 1000; 143 | double vel_y = (RAND - 0.5) / 1000; 144 | Body_initialize(i, mass, pos_x, pos_y, vel_x, vel_y); 145 | } 146 | 147 | // Transfer data to GPU. 148 | cudaMemcpyToSymbol(d_container, &h_container, sizeof(Container), 0, 149 | cudaMemcpyHostToDevice); 150 | cudaDeviceSynchronize(); 151 | assert(cudaPeekAtLastError() == cudaSuccess); 152 | } 153 | 154 | void run_simulation() { 155 | uintptr_t num_blocks = ikra::executor::cuda::cuda_blocks_1d(kNumBodies); 156 | uintptr_t num_threads = ikra::executor::cuda::cuda_threads_1d(kNumBodies); 157 | 158 | for (int i = 0; i < kIterations; ++i) { 159 | // Reset forces. 160 | kernel_Body_reset_force<<>>(); 161 | 162 | // Update forces. 163 | kernel_Body_add_all_forces_to_this<<>>(); 164 | 165 | // Update velocities and positions. 166 | kernel_Body_update<<>>(kTimeInterval); 167 | } 168 | 169 | cudaDeviceSynchronize(); 170 | assert(cudaPeekAtLastError() == cudaSuccess); 171 | } 172 | 173 | void run_simple() { 174 | uintptr_t num_blocks = ikra::executor::cuda::cuda_blocks_1d(kNumBodies); 175 | uintptr_t num_threads = ikra::executor::cuda::cuda_threads_1d(kNumBodies); 176 | 177 | for (int i = 0; i < kIterations*100; ++i) { 178 | kernel_Body_codegen_simple_update<<>>( 179 | kTimeInterval); 180 | } 181 | 182 | cudaDeviceSynchronize(); 183 | assert(cudaPeekAtLastError() == cudaSuccess); 184 | } 185 | 186 | int main() { 187 | uint64_t time_instantiation = measure<>::execution(instantiation); 188 | gpuErrchk(cudaPeekAtLastError()); 189 | 190 | uint64_t time_simulation = measure<>::execution(run_simulation); 191 | gpuErrchk(cudaPeekAtLastError()); 192 | 193 | uint64_t time_simple = measure<>::execution(run_simple); 194 | gpuErrchk(cudaPeekAtLastError()); 195 | 196 | // Transfer data to CPU. 197 | cudaMemcpyFromSymbol(&h_container, d_container, sizeof(Container), 0, 198 | cudaMemcpyDeviceToHost); 199 | gpuErrchk(cudaPeekAtLastError()); 200 | 201 | // Calculate checksum 202 | int checksum = 11; 203 | for (uintptr_t i = 0; i < kNumBodies; i++) { 204 | checksum += reinterpret_cast( 205 | r_float2int(h_container.a_Body_position_0[i])); 206 | checksum += reinterpret_cast( 207 | r_float2int(h_container.a_Body_position_1[i])); 208 | checksum = checksum % 1234567; 209 | 210 | if (i < 10) { 211 | printf("VALUE[%lu] = %f, %f\n", i, 212 | h_container.a_Body_position_0[i], 213 | h_container.a_Body_position_1[i]); 214 | } 215 | } 216 | 217 | printf("instantiation: %lu\nsimulation: %lu\nsimple: %lu\nchecksum: %i\n", 218 | time_instantiation, time_simulation, time_simple, checksum); 219 | return 0; 220 | } 221 | 222 | __global__ void codengen_simple_update(IdType body, double dt) { 223 | Body_codegen_simple_update(body, dt); 224 | } 225 | -------------------------------------------------------------------------------- /test/soa/cuda_array_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "gtest/gtest.h" 4 | 5 | #include "executor/cuda_executor.h" 6 | #include "soa/soa.h" 7 | 8 | using ikra::soa::IndexType; 9 | using ikra::soa::SoaLayout; 10 | using ikra::executor::cuda::construct; 11 | 12 | const static int kTestSize = 12; 13 | 14 | #define CUDA_THREAD_ID (threadIdx.x + blockIdx.x * blockDim.x) 15 | 16 | class DummyClass : public SoaLayout { 17 | public: 18 | IKRA_INITIALIZE_CLASS 19 | 20 | __device__ DummyClass(int v2, int v4) : field2(v2), field4(v4) { 21 | for (int i = 0; i < 3; ++i) { 22 | field1[i] = CUDA_THREAD_ID*1000 + i*31; 23 | field3[i] = CUDA_THREAD_ID*500 + i*59; 24 | } 25 | } 26 | 27 | int_ field0; 28 | 29 | // Array has size 12 bytes. 30 | array_(int, 3, object) field1; 31 | int_ field2; 32 | 33 | // Array has size 12 bytes. 34 | array_(int, 3, fully_inlined) field3; 35 | int_ field4; 36 | 37 | __device__ void update_field1(int increment) { 38 | for (int i = 0; i < 3; ++i) { 39 | field1[i] += field2 + increment; 40 | } 41 | } 42 | 43 | __device__ void update_field3(int increment) { 44 | for (int i = 0; i < 3; ++i) { 45 | field3[i] += field4 + increment; 46 | } 47 | } 48 | 49 | __device__ void update_field3_second() { 50 | for (int i = 0; i < 3; ++i) { 51 | field3[i] += field1[i]; 52 | } 53 | } 54 | }; 55 | 56 | IKRA_DEVICE_STORAGE(DummyClass); 57 | 58 | 59 | // Cannot run "cuda_execute" inside gtest case. 60 | void run_test_construct_and_execute() { 61 | DummyClass::initialize_storage(); 62 | EXPECT_EQ(DummyClass::size(), 0UL); 63 | 64 | DummyClass* first = construct(kTestSize, 17, 29); 65 | gpuErrchk(cudaDeviceSynchronize()); 66 | gpuErrchk(cudaPeekAtLastError()); 67 | 68 | // Check result. 69 | for (int i = 0; i < kTestSize; ++i) { 70 | for (int j = 0; j < 3; ++j) { 71 | int actual1 = DummyClass::get(i)->field1[j]; 72 | int actual3 = DummyClass::get(i)->field3[j]; 73 | int expected1 = i*1000 + j*31; 74 | int expected3 = i*500 + j*59; 75 | EXPECT_EQ(actual1, expected1); 76 | EXPECT_EQ(actual3, expected3); 77 | } 78 | } 79 | 80 | cuda_execute(&DummyClass::update_field1, first, kTestSize, 2); 81 | gpuErrchk(cudaPeekAtLastError()); 82 | 83 | // Check result. 84 | for (int i = 0; i < kTestSize; ++i) { 85 | for (int j = 0; j < 3; ++j) { 86 | int actual1 = DummyClass::get(i)->field1[j]; 87 | int actual3 = DummyClass::get(i)->field3[j]; 88 | int expected1 = i*1000 + j*31 + 2 + 17; 89 | int expected3 = i*500 + j*59; 90 | EXPECT_EQ(actual1, expected1); 91 | EXPECT_EQ(actual3, expected3); 92 | } 93 | } 94 | 95 | cuda_execute(&DummyClass::update_field3, first, kTestSize, 5); 96 | gpuErrchk(cudaPeekAtLastError()); 97 | 98 | // Check result. 99 | for (int i = 0; i < kTestSize; ++i) { 100 | for (int j = 0; j < 3; ++j) { 101 | int actual1 = DummyClass::get(i)->field1[j]; 102 | int actual3 = DummyClass::get(i)->field3[j]; 103 | int expected1 = i*1000 + j*31 + 2 + 17; 104 | int expected3 = i*500 + j*59 + 5 + 29; 105 | EXPECT_EQ(actual1, expected1); 106 | EXPECT_EQ(actual3, expected3); 107 | } 108 | } 109 | 110 | cuda_execute(&DummyClass::update_field3_second, first, kTestSize); 111 | gpuErrchk(cudaPeekAtLastError()); 112 | 113 | // Check result. 114 | for (int i = 0; i < kTestSize; ++i) { 115 | for (int j = 0; j < 3; ++j) { 116 | int actual1 = DummyClass::get(i)->field1[j]; 117 | int actual3 = DummyClass::get(i)->field3[j]; 118 | int expected1 = i*1000 + j*31 + 2 + 17; 119 | int expected3 = i*500 + j*59 + 5 + 29 + expected1; 120 | EXPECT_EQ(actual1, expected1); 121 | EXPECT_EQ(actual3, expected3); 122 | } 123 | } 124 | 125 | // Copy size to host memory and compare. 126 | EXPECT_EQ(DummyClass::size(), static_cast(kTestSize)); 127 | 128 | // Make sure that we had no CUDA failures. 129 | gpuErrchk(cudaPeekAtLastError()); 130 | } 131 | 132 | 133 | TEST(CudaArrayTest, ConstructAndExecute) { 134 | run_test_construct_and_execute(); 135 | } 136 | 137 | -------------------------------------------------------------------------------- /test/soa/cuda_inline_array_memcpy_aos_test.cu: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include "executor/cuda_executor.h" 4 | #include "executor/executor.h" 5 | #include "soa/soa.h" 6 | 7 | using ikra::soa::IndexType; 8 | using ikra::soa::SoaLayout; 9 | using ikra::executor::cuda::construct; 10 | using ikra::soa::StaticStorageWithArena; 11 | using ikra::soa::kAddressModeZero; 12 | 13 | // Size of inline SOA array. 14 | #define ARRAY_SIZE 100 15 | // Number of DummyClass instances. 16 | #define NUM_INST 121 17 | // Number of instances constructed on the device. 18 | #define CONSTRUCT_DEVICE 79 19 | // Number of instances constructed on the host. 20 | #define CONSTRUCT_HOST (NUM_INST - CONSTRUCT_DEVICE) 21 | // Number of array elements stored in the area. 22 | #define READ_FROM_ARENA_ELEMENTS 32 23 | // Size of the arena in bytes. 24 | // TODO: Why is the +1 required here? 25 | #define EXTRA_BYTES (READ_FROM_ARENA_ELEMENTS*sizeof(int)*NUM_INST+1) 26 | // Number of array elements stored in inline storage. 27 | #define INLINE_ARR_SIZE (ARRAY_SIZE - READ_FROM_ARENA_ELEMENTS) 28 | 29 | // TODO: Alignment is broken in AOS mode. 30 | class DummyClass : public SoaLayout, ikra::soa::kLayoutModeAos> { 32 | public: 33 | IKRA_INITIALIZE_CLASS 34 | 35 | __host__ __device__ DummyClass(int f0, int f2) : field0(f0), field2(f2), 36 | field1(ARRAY_SIZE) { 37 | for (int i = 0; i < ARRAY_SIZE; ++i) { 38 | field1[i] = id()*17 + i; 39 | } 40 | } 41 | 42 | int_ field0; 43 | 44 | int_ field2; 45 | 46 | array_(int, INLINE_ARR_SIZE, partially_inlined) field1; 47 | 48 | __device__ void update_field1(int increment) { 49 | for (int i = 0; i < ARRAY_SIZE; ++i) { 50 | field1[i] += increment + field0 + field2; 51 | } 52 | } 53 | }; 54 | 55 | IKRA_DEVICE_STORAGE(DummyClass); 56 | 57 | 58 | void run_test_construct_and_execute() { 59 | DummyClass::initialize_storage(); 60 | // Create most instances on the device. 61 | DummyClass* first = construct(CONSTRUCT_DEVICE, 29, 1357); 62 | gpuErrchk(cudaPeekAtLastError()); 63 | 64 | // Create some instances on the host. 65 | ikra::executor::construct(CONSTRUCT_HOST, 29, 1357); 66 | 67 | cuda_execute(&DummyClass::update_field1, first, NUM_INST, 19); 68 | gpuErrchk(cudaPeekAtLastError()); 69 | 70 | // Check result. 71 | for (int i = 0; i < NUM_INST; ++i) { 72 | for (int j = 0; j < ARRAY_SIZE; ++j) { 73 | int actual1 = DummyClass::get(i)->field1[j]; 74 | int expected1 = i*17 + j + 19 + 29 + 1357; 75 | EXPECT_EQ(actual1, expected1); 76 | } 77 | } 78 | } 79 | 80 | TEST(CudaInlineArrayMemcpyAosTest, ConstructAndExecute) { 81 | run_test_construct_and_execute(); 82 | } 83 | -------------------------------------------------------------------------------- /test/soa/cuda_inline_array_memcpy_test.cu: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include "executor/cuda_executor.h" 4 | #include "executor/executor.h" 5 | #include "soa/soa.h" 6 | 7 | using ikra::soa::IndexType; 8 | using ikra::soa::SoaLayout; 9 | using ikra::executor::cuda::construct; 10 | using ikra::soa::StaticStorageWithArena; 11 | using ikra::soa::kAddressModeZero; 12 | 13 | // Size of inline SOA array. 14 | #define ARRAY_SIZE 100 15 | // Number of DummyClass instances. 16 | #define NUM_INST 121 17 | // Number of instances constructed on the device. 18 | #define CONSTRUCT_DEVICE 79 19 | // Number of instances constructed on the host. 20 | #define CONSTRUCT_HOST (NUM_INST - CONSTRUCT_DEVICE) 21 | // Number of array elements stored in the area. 22 | #define READ_FROM_ARENA_ELEMENTS 32 23 | // Size of the arena in bytes. 24 | // TODO: Why is the +1 required here? 25 | #define EXTRA_BYTES (READ_FROM_ARENA_ELEMENTS*sizeof(int)*NUM_INST+1) 26 | // Number of array elements stored in inline storage. 27 | #define INLINE_ARR_SIZE (ARRAY_SIZE - READ_FROM_ARENA_ELEMENTS) 28 | 29 | class DummyClass : public SoaLayout, ikra::soa::kLayoutModeSoa> { 31 | public: 32 | IKRA_INITIALIZE_CLASS 33 | 34 | __host__ __device__ DummyClass(int f0, int f2) : field0(f0), field2(f2), 35 | field1(ARRAY_SIZE) { 36 | for (int i = 0; i < ARRAY_SIZE; ++i) { 37 | field1[i] = id()*17 + i; 38 | } 39 | } 40 | 41 | int_ field0; 42 | 43 | array_(int, INLINE_ARR_SIZE, partially_inlined) field1; 44 | 45 | int_ field2; 46 | 47 | __device__ void update_field1(int increment) { 48 | for (int i = 0; i < ARRAY_SIZE; ++i) { 49 | field1[i] += increment + field0 + field2; 50 | } 51 | } 52 | }; 53 | 54 | IKRA_DEVICE_STORAGE(DummyClass); 55 | 56 | 57 | void run_test_construct_and_execute() { 58 | DummyClass::initialize_storage(); 59 | // Create most instances on the device. 60 | DummyClass* first = construct(CONSTRUCT_DEVICE, 29, 1357); 61 | gpuErrchk(cudaPeekAtLastError()); 62 | 63 | // Create some instances on the host. 64 | ikra::executor::construct(CONSTRUCT_HOST, 29, 1357); 65 | 66 | cuda_execute(&DummyClass::update_field1, first, NUM_INST, 19); 67 | gpuErrchk(cudaPeekAtLastError()); 68 | 69 | // Check result. 70 | for (int i = 0; i < NUM_INST; ++i) { 71 | for (int j = 0; j < ARRAY_SIZE; ++j) { 72 | int actual1 = DummyClass::get(i)->field1[j]; 73 | int expected1 = i*17 + j + 19 + 29 + 1357; 74 | EXPECT_EQ(actual1, expected1); 75 | } 76 | } 77 | } 78 | 79 | TEST(CudaInlineArrayMemcpyTest, ConstructAndExecute) { 80 | run_test_construct_and_execute(); 81 | } 82 | -------------------------------------------------------------------------------- /test/soa/pointer_arithmetics_test.cc: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "soa/soa.h" 3 | 4 | namespace { 5 | using ikra::soa::SoaLayout; 6 | 7 | static const int kClassMaxInst = 1024; 8 | static const uint32_t kTestSize = 40; 9 | 10 | // Pointer arithmetics works only in valid addressing mode. 11 | class TestClass : public SoaLayout { 12 | public: 13 | IKRA_INITIALIZE_CLASS 14 | 15 | int_ field0; 16 | }; 17 | 18 | IKRA_HOST_STORAGE(TestClass) 19 | 20 | 21 | TEST(PointerArithmeticsTest, IncrementDecrement) { 22 | TestClass::initialize_storage(); 23 | 24 | auto first = new TestClass[kTestSize]; 25 | EXPECT_EQ(TestClass::storage().size(), kTestSize); 26 | 27 | int counter = 0; 28 | for (auto it = first; it < first + kTestSize; ++it, counter += 2) { 29 | it->field0 = counter; 30 | } 31 | 32 | for (int i = 0; i < static_cast(kTestSize); ++i) { 33 | EXPECT_EQ(TestClass::get(i)->field0, 2*i); 34 | } 35 | } 36 | 37 | } // namespace 38 | --------------------------------------------------------------------------------