├── .gitignore ├── examples ├── performance_tests │ ├── CMakeLists.txt │ └── plot ├── smoothed_particle_hydrodynamics │ ├── CMakeLists.txt │ ├── kernels.h │ └── kernels.c ├── CMakeLists.txt ├── gauss │ ├── CMakeLists.txt │ └── filter_c99.c ├── jacobi │ ├── CMakeLists.txt │ └── update_c99.c └── lbm │ ├── CMakeLists.txt │ ├── generator │ ├── main.cu │ ├── update_lbm_cuda_flat_array.h │ ├── cudalineupdatefunctorprototype.h │ ├── util.h │ ├── update_lbm_object_oriented.h │ ├── update_lbm_classic.h │ ├── flatarray_implementation_0.cu │ ├── flatarray_implementation_1.cu │ ├── flatarray_implementation_2.cu │ └── flatarray_implementation_3.cu ├── AUTHORS ├── .appveyor.yml ├── include └── libflatarray │ ├── short_vec_base.hpp │ ├── number_of_members.hpp │ ├── coord.hpp │ ├── aggregated_member_size.hpp │ ├── detail │ ├── init_kernel.hpp │ ├── sqrt_reference.hpp │ ├── sibling_short_vec_switch.hpp │ ├── streaming_short_vec_switch.hpp │ ├── simple_streak.hpp │ ├── offset.hpp │ ├── generate_cuda_launch_config.hpp │ ├── copy_functor.hpp │ ├── generic_destruct.hpp │ ├── set_byte_size_functor.hpp │ ├── staging_buffer.hpp │ ├── construct_functor.hpp │ ├── destroy_functor.hpp │ ├── dual_callback_helper.hpp │ ├── get_instance_functor.hpp │ ├── save_functor.hpp │ ├── set_instance_functor.hpp │ ├── short_vec_helpers.hpp │ ├── load_functor.hpp │ ├── short_vec_mic_double_8.hpp │ └── short_vec_scalar_int_2.hpp │ ├── member_ptr_to_offset.hpp │ ├── alignment.hpp │ ├── testbed │ ├── gpu_benchmark.hpp │ ├── benchmark.hpp │ ├── evaluate.hpp │ └── cpu_benchmark.hpp │ ├── flat_array.hpp │ ├── soa_accessor.hpp │ ├── cuda_allocator.hpp │ ├── preprocessor.hpp │ ├── ilp_to_arity.hpp │ ├── streaming_short_vec.hpp │ ├── estimate_optimum_short_vec_type.hpp │ ├── aligned_allocator.hpp │ ├── soa_vector.hpp │ └── loop_peeler.hpp ├── test ├── short_vec_additional_test.cpp ├── cuda_allocator_test.cu ├── aligned_allocator_test.cpp ├── loop_peeler_test.cpp ├── soa_array_cuda_test.cu ├── test.hpp ├── estimate_optimum_short_vec_type_test.cpp ├── preprocessor_test.cpp └── CMakeLists.txt ├── .circleci └── config.yml ├── LICENSE ├── README └── CMakeModules └── FindSilo.cmake /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | *~ 3 | 4 | -------------------------------------------------------------------------------- /examples/performance_tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(performance_tests main.cpp) 2 | target_link_libraries(performance_tests ${libflatarray_LIBS}) 3 | -------------------------------------------------------------------------------- /examples/smoothed_particle_hydrodynamics/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(WITH_SILO AND WITH_CPP14) 2 | add_executable(sph main.cpp kernels.c) 3 | include_directories(${Silo_INCLUDE_DIR}) 4 | target_link_libraries(sph ${Silo_LIBRARY}) 5 | endif() 6 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(OpenMP) 2 | 3 | if(NOT MSVC) 4 | add_subdirectory(jacobi) 5 | add_subdirectory(gauss) 6 | endif() 7 | add_subdirectory(lbm) 8 | add_subdirectory(performance_tests) 9 | add_subdirectory(smoothed_particle_hydrodynamics) 10 | -------------------------------------------------------------------------------- /examples/gauss/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_executable(gauss main.cpp filter_c99.c) 2 | target_link_libraries(gauss ${libflatarray_LIBS}) 3 | 4 | if(OPENMP_FOUND) 5 | if(CMAKE_VERSION VERSION_GREATER 2.8.11) 6 | target_compile_options(gauss PRIVATE ${OpenMP_CXX_FLAGS}) 7 | endif() 8 | set_target_properties(gauss PROPERTIES LINK_FLAGS ${OpenMP_CXX_FLAGS}) 9 | endif() 10 | -------------------------------------------------------------------------------- /examples/jacobi/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(jacobi main.cpp update_c99.c) 2 | target_link_libraries(jacobi ${libflatarray_LIBS}) 3 | 4 | if(OPENMP_FOUND) 5 | if(CMAKE_VERSION VERSION_GREATER 2.8.11) 6 | target_compile_options(jacobi PRIVATE ${OpenMP_CXX_FLAGS}) 7 | endif() 8 | set_target_properties(jacobi PROPERTIES LINK_FLAGS ${OpenMP_CXX_FLAGS}) 9 | endif() 10 | -------------------------------------------------------------------------------- /examples/lbm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(WITH_CUDA) 2 | lfa_cuda_add_executable(lbm2 main.cu flatarray_implementation_0.cu flatarray_implementation_1.cu flatarray_implementation_2.cu flatarray_implementation_3.cu flatarray_implementation_4.cu flatarray_implementation_5.cu flatarray_implementation_6.cu flatarray_implementation_7.cu flatarray_implementation_8.cu flatarray_implementation_9.cu flatarray_implementation_10.cu) 3 | endif() 4 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Copyright: 2 | 3 | Year(s) Name Affiliation Email 4 | --------- -------------------- ----------- --------------------------------- 5 | 2012-2015 Andreas Schäfer FAU gentryx@gmx.de 6 | 2014-2015 Kurt Kanzenbach FAU kurt@kmk-computers.de 7 | 2015-2015 Di Xiao (Larry) SJTU xiaodi@sjtu.edu.cn 8 | 9 | Affiliation Abbreviations: 10 | -------------------------- 11 | 12 | FAU = Friedrich-Alexander-Universität Erlangen-Nürnberg 13 | SJTU = Shanghai Jiao Tong University 14 | -------------------------------------------------------------------------------- /.appveyor.yml: -------------------------------------------------------------------------------- 1 | version: 1.0.{build} 2 | 3 | shallow_clone: true 4 | 5 | matrix: 6 | fast_finish: true 7 | 8 | environment: 9 | matrix: 10 | - GENERATOR: "Visual Studio 14" 11 | CONFIG: Debug 12 | 13 | - GENERATOR: "Visual Studio 14" 14 | CONFIG: Release 15 | 16 | os: Visual Studio 2015 17 | 18 | build_script: 19 | - cmake "-G%GENERATOR%" -H. -B_builds 20 | - cmake --build _builds --config "%CONFIG%" 21 | - cmake --build _builds --config "%CONFIG%" --target tests 22 | 23 | # - _builds\test\Debug\api_traits_test.exe 24 | -------------------------------------------------------------------------------- /include/libflatarray/short_vec_base.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_SHORT_VEC_BASE_HPP 9 | #define FLAT_ARRAY_SHORT_VEC_BASE_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | template 14 | class short_vec_base 15 | { 16 | public: 17 | static inline 18 | std::size_t size() 19 | { 20 | return ARITY; 21 | } 22 | 23 | }; 24 | 25 | } 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /include/libflatarray/number_of_members.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_NUMBER_OF_MEMBERS_HPP 9 | #define FLAT_ARRAY_NUMBER_OF_MEMBERS_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | /** 14 | * Allow the user to access the number of data members of the SoA type. 15 | * 16 | * Will be instantiated by LIBFLATARRAY_REGISTER_SOA(). 17 | */ 18 | template 19 | class number_of_members; 20 | 21 | } 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /test/short_vec_additional_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | // include here to have another object file and check if linking with 9 | // the original test still works. 10 | #include 11 | 12 | // globally disable some warnings with MSVC, that are issued not for a 13 | // specific header, but rather for the interaction of system headers 14 | // and LibFlatArray source: 15 | #ifdef _MSC_BUILD 16 | #pragma warning( disable : 4710 ) 17 | #endif 18 | -------------------------------------------------------------------------------- /include/libflatarray/coord.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_COORD_HPP 9 | #define FLAT_ARRAY_COORD_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | /** 14 | * A utility class to specify (relative) coordinates. The class is to 15 | * be used with soa_accessor. 16 | * 17 | * Since the coordinates are fixed at compile time, all dependent 18 | * address calculations can be done at compile time. 19 | */ 20 | template 21 | class coord 22 | {}; 23 | 24 | } 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | docker: 5 | - image: circleci/ruby:stretch 6 | steps: 7 | - checkout 8 | - run: mkdir build 9 | - run: sudo apt-get update && sudo apt-get install -y cmake 10 | - run: cd build && cmake .. 11 | - run: cd build && make 12 | test: 13 | docker: 14 | - image: circleci/ruby:stretch 15 | steps: 16 | - checkout 17 | - run: mkdir build 18 | - run: sudo apt-get update && sudo apt-get install -y cmake 19 | - run: cd build && cmake ..
20 | - run: cd build && make check 21 | workflows: 22 | version: 2 23 | build_and_test: 24 | jobs: 25 | - build 26 | - test 27 | -------------------------------------------------------------------------------- /include/libflatarray/aggregated_member_size.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_AGGREGATED_MEMBER_SIZE_HPP 9 | #define FLAT_ARRAY_AGGREGATED_MEMBER_SIZE_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | /** 14 | * Accumulate the sizes of the individual data members. This may be 15 | * lower than sizeof(CELL_TYPE) as structs/objects in C++ may need 16 | * padding. We can avoid the padding of individual members in a SoA 17 | * memory layout. 18 | * 19 | * Will be instantiated by LIBFLATARRAY_REGISTER_SOA(). 20 | */ 21 | template 22 | class aggregated_member_size; 23 | 24 | } 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /examples/lbm/generator: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | intervals = [ 4 | [32, 64, 128, 136, 192, 200, 256, 264, 512, 520, 1032], 5 | [32, 64, 128, 136, 192, 200, 256, 264, 512, 520, 1032], 6 | [32, 64, 128, 136, 192, 200, 256, 264, 512, 520, 1032] 7 | ] 8 | 9 | counter = -1 10 | 11 | intervals[0].size.times do |x1| 12 | counter += 1 13 | File.open("flatarray_implementation_#{counter}.cu", "w") do |f| 14 | f.puts < 16 | #include "cudalineupdatefunctorprototype.h" 17 | 18 | EOF 19 | 20 | xA = intervals[0][x1] 21 | xB = xA 22 | 23 | intervals[1].size.times do |y1| 24 | yA = intervals[1][y1] 25 | yB = yA 26 | 27 | intervals[2].size.times do |z1| 28 | zA = intervals[2][z1] 29 | zB = zA 30 | 31 | f.puts "IMPLEMENTATION(CellLBM, #{xA}, #{yA}, #{zA}, #{xB}, #{yB}, #{zB})" 32 | end 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /include/libflatarray/detail/init_kernel.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Google 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_INIT_KERNEL_HPP 9 | #define FLAT_ARRAY_DETAIL_INIT_KERNEL_HPP 10 | 11 | #include 12 | 13 | namespace LibFlatArray { 14 | 15 | namespace detail { 16 | 17 | namespace flat_array { 18 | 19 | #ifdef LIBFLATARRAY_WITH_CUDA 20 | #ifdef __CUDACC__ 21 | 22 | template 23 | __global__ 24 | void init_kernel(CELL source, CELL *target, long count) 25 | { 26 | long thread_index = blockDim.x * blockIdx.x + threadIdx.x; 27 | if (thread_index >= count) { 28 | return; 29 | } 30 | 31 | target[thread_index] = source; 32 | } 33 | 34 | #endif 35 | #endif 36 | 37 | } 38 | 39 | } 40 | 41 | } 42 | 43 | #endif 44 | 45 | -------------------------------------------------------------------------------- /include/libflatarray/detail/sqrt_reference.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_SQRT_REFERENCE_HPP 9 | #define FLAT_ARRAY_DETAIL_SQRT_REFERENCE_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | template 14 | class sqrt_reference; 15 | 16 | template 17 | short_vec operator/(const sqrt_reference& a, const short_vec& b) 18 | { 19 | return short_vec(a) / b; 20 | } 21 | 22 | template 23 | inline short_vec operator/(const sqrt_reference& a, const CARGO b) 24 | { 25 | return short_vec(a) / b; 26 | } 27 | 28 | } 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /include/libflatarray/detail/sibling_short_vec_switch.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_SIBLING_SHORT_VEC_SWITCH_HPP 9 | #define FLAT_ARRAY_DETAIL_SIBLING_SHORT_VEC_SWITCH_HPP 10 | 11 | namespace LibFlatArray { 12 | namespace detail { 13 | namespace flat_array { 14 | 15 | template 16 | class sibling_short_vec_switch; 17 | 18 | template< 19 | template class SHORT_VEC_TEMPLATE, 20 | typename CARGO, 21 | std::size_t ARITY, 22 | std::size_t TARGET_ARITY> 23 | class sibling_short_vec_switch, TARGET_ARITY> 24 | { 25 | public: 26 | typedef SHORT_VEC_TEMPLATE VALUE; 27 | }; 28 | 29 | } 30 | } 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /include/libflatarray/detail/streaming_short_vec_switch.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_STREAMING_SHORT_VEC_SWITCH_HPP 9 | #define FLAT_ARRAY_DETAIL_STREAMING_SHORT_VEC_SWITCH_HPP 10 | 11 | #include 12 | #include 13 | 14 | namespace LibFlatArray { 15 | namespace detail { 16 | namespace flat_array { 17 | 18 | template 19 | class streaming_short_vec_switch 20 | { 21 | public: 22 | typedef streaming_short_vec VALUE; 23 | }; 24 | 25 | template 26 | class streaming_short_vec_switch 27 | { 28 | public: 29 | typedef short_vec VALUE; 30 | }; 31 | 32 | } 33 | } 34 | } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /examples/lbm/main.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013-2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | #include "cell.h" 12 | #include "util.h" 13 | #include "update_lbm_classic.h" 14 | #include "update_lbm_object_oriented.h" 15 | #include "update_lbm_cuda_flat_array.h" 16 | 17 | int main(int argc, char **argv) 18 | { 19 | if (argc != 2) { 20 | std::cerr << "usage: " << argv[0] << " CUDA_DEVICE\n"; 21 | return 1; 22 | } 23 | 24 | std::stringstream s; 25 | s << argv[1]; 26 | int cudaDevice; 27 | s >> cudaDevice; 28 | cudaSetDevice(cudaDevice); 29 | 30 | std::cout << "# test name ; dim ; performance\n"; 31 | benchmark_lbm_cuda_object_oriented().evaluate(); 32 | benchmark_lbm_cuda_classic().evaluate(); 33 | benchmark_lbm_cuda_flat_array().evaluate(); 34 | 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /include/libflatarray/member_ptr_to_offset.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_MEMBER_PTR_TO_OFFSET_HPP 9 | #define FLAT_ARRAY_MEMBER_PTR_TO_OFFSET_HPP 10 | 11 | #include 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | /** 17 | * Lets user code discover a member's offset in the SoA layout from 18 | * the member pointer of the original cell type. See test 19 | * TestMemberPtrToOffset for an explanation. 20 | * 21 | * Will be instantiated by LIBFLATARRAY_REGISTER_SOA(). 22 | */ 23 | class member_ptr_to_offset 24 | { 25 | public: 26 | template 27 | int operator()(MEMBER_TYPE CELL_TYPE:: *member_ptr) 28 | { 29 | return detail::flat_array::offset< 30 | CELL_TYPE, 31 | number_of_members::VALUE>()(member_ptr); 32 | } 33 | }; 34 | 35 | } 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /include/libflatarray/alignment.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_ALIGNMENT_HPP 9 | #define FLAT_ARRAY_ALIGNMENT_HPP 10 | 11 | #include 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | template 17 | class alignment; 18 | 19 | template 20 | class alignment > 21 | { 22 | public: 23 | typedef typename short_vec::strategy strategy; 24 | typedef typename strategy::template alignment align; 25 | const static std::size_t VALUE = align::ALIGNMENT; 26 | }; 27 | 28 | template 29 | class alignment > 30 | { 31 | public: 32 | typedef typename short_vec::strategy strategy; 33 | typedef typename strategy::template alignment align; 34 | const static std::size_t VALUE = align::ALIGNMENT; 35 | }; 36 | 37 | } 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /examples/smoothed_particle_hydrodynamics/kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBFLATARRAY_EXAMPLES_SMOOTHED_PARTICLE_HYDRODYNAMICS_KERNELS_H 2 | #define LIBFLATARRAY_EXAMPLES_SMOOTHED_PARTICLE_HYDRODYNAMICS_KERNELS_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | void compute_density( 9 | int n, 10 | float *rho, 11 | float *pos_x, 12 | float *pos_y, 13 | float h, 14 | float mass); 15 | 16 | void compute_accel( 17 | int n, 18 | float *rho, 19 | float *pos_x, 20 | float *pos_y, 21 | float *v_x, 22 | float *v_y, 23 | float *a_x, 24 | float *a_y, 25 | float mass, 26 | float g, 27 | float h, 28 | float k, 29 | float rho0, 30 | float mu); 31 | 32 | void leapfrog( 33 | int n, 34 | float *pos_x, 35 | float *pos_y, 36 | float *v_x, 37 | float *v_y, 38 | float *a_x, 39 | float *a_y, 40 | double dt); 41 | 42 | void reflect_bc( 43 | int n, 44 | float *pos_x, 45 | float *pos_y, 46 | float *v_x, 47 | float *v_y); 48 | 49 | #ifdef __cplusplus 50 | } 51 | #endif 52 | 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /include/libflatarray/testbed/gpu_benchmark.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_TESTBED_GPU_BENCHMARK_HPP 9 | #define FLAT_ARRAY_TESTBED_GPU_BENCHMARK_HPP 10 | 11 | #include 12 | 13 | // disable certain warnings from system headers when compiling with 14 | // Microsoft Visual Studio: 15 | #ifdef _MSC_BUILD 16 | #pragma warning( push ) 17 | #pragma warning( disable : 4514 ) 18 | #endif 19 | 20 | #include 21 | 22 | #ifdef _MSC_BUILD 23 | #pragma warning( pop ) 24 | #endif 25 | 26 | namespace LibFlatArray { 27 | 28 | class gpu_benchmark : benchmark 29 | { 30 | public: 31 | std::string order() 32 | { 33 | return "GPU"; 34 | } 35 | 36 | std::string device() 37 | { 38 | int cudaDevice; 39 | cudaGetDevice(&cudaDevice); 40 | cudaDeviceProp properties; 41 | cudaGetDeviceProperties(&properties, cudaDevice); 42 | std::string cudaDeviceID = properties.name; 43 | 44 | return cudaDeviceID; 45 | } 46 | }; 47 | 48 | 49 | } 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /include/libflatarray/flat_array.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012-2016 Andreas Schäfer 3 | * Copyright 2017 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_FLAT_ARRAY_HPP 10 | #define FLAT_ARRAY_FLAT_ARRAY_HPP 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #ifdef __CUDACC__ 17 | #include 18 | #endif 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /include/libflatarray/detail/simple_streak.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_SIMPLE_STREAK_HPP 9 | #define FLAT_ARRAY_DETAIL_SIMPLE_STREAK_HPP 10 | 11 | // Don't warn about these functions being stripped from an executable 12 | // as they're not being used, that's actually expected behavior. 
13 | #ifdef _MSC_BUILD 14 | #pragma warning( push ) 15 | #pragma warning( disable : 4514 ) 16 | #endif 17 | 18 | namespace LibFlatArray { 19 | 20 | namespace detail { 21 | 22 | namespace flat_array { 23 | 24 | class simple_streak { 25 | public: 26 | explicit simple_streak(std::size_t x = 0, std::size_t y = 0, std::size_t z = 0, std::size_t count = 0) : 27 | count(count) 28 | { 29 | origin[0] = x; 30 | origin[1] = y; 31 | origin[2] = z; 32 | } 33 | 34 | std::size_t length() const 35 | { 36 | return count; 37 | } 38 | 39 | std::size_t origin[3]; 40 | std::size_t count; 41 | }; 42 | 43 | } 44 | 45 | } 46 | 47 | } 48 | 49 | #ifdef _MSC_BUILD 50 | #pragma warning( pop ) 51 | #endif 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /examples/jacobi/update_c99.c: -------------------------------------------------------------------------------- 1 | #ifdef __ICC 2 | #include 3 | #endif 4 | 5 | /** 6 | * Recommended reference for multi-dimensional array handling in C99 7 | * by Jeff Hammond: 8 | * 9 | * https://github.com/jeffhammond/HPCInfo/blob/master/c99/array3d.c 10 | */ 11 | void update_c99(double *data_new, const double *data_old, int dim_x, int dim_y, int dim_z) 12 | { 13 | // cast types here to maintain a C++-compatible signature: 14 | double (* const restrict grid_old)[dim_y][dim_x] = (double (* const)[dim_y][dim_x])data_old; 15 | double (* restrict grid_new)[dim_y][dim_x] = (double (* )[dim_y][dim_x])data_new; 16 | 17 | #pragma omp parallel for schedule(static) 18 | for (int z = 1; z < (dim_z - 1); ++z) { 19 | for (int y = 1; y < (dim_y - 1); ++y) { 20 | #ifdef __ICC 21 | #pragma vector always nontemporal 22 | #endif 23 | for (int x = 1; x < (dim_x - 1); ++x) { 24 | grid_new[z][y][x] = 25 | (grid_old[z - 1][y ][x ] + 26 | grid_old[z ][y - 1][x ] + 27 | grid_old[z ][y ][x - 1] + 28 | grid_old[z ][y ][x + 1] + 29 | grid_old[z ][y + 1][x ] + 30 | grid_old[z + 1][y ][x ]) * (1.0 / 6.0); 31 | } 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Boost Software License - Version 1.0 - August 17, 2003 2 | 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | 10 | The copyright notices in the Software and this entire statement, including 11 | the above license grant, this restriction and the following disclaimer, 12 | must be included in all copies of the Software, in whole or in part, and 13 | all derivative works of the Software, unless such copies or derivative 14 | works are solely in the form of machine-executable object code generated by 15 | a source language processor. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT 20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /include/libflatarray/soa_accessor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_SOA_ACCESSOR_HPP 9 | #define FLAT_ARRAY_SOA_ACCESSOR_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | /** 14 | * This class provides an object-oriented view to a "Struct of 15 | * Arrays"-style grid. It requires the user to register the type CELL 16 | * using the macro LIBFLATARRAY_REGISTER_SOA. 17 | * 18 | * All registered members will be available via functions of the same 19 | * name, so if "Cell" had two members "float a" and "char b", then 20 | * these would be accessible via soa_accessor::a() and 21 | * soa_accessor::b(). 22 | * 23 | * soa_accessor<> also provides an operator[] which can be used to 24 | * access neighboring cells. 25 | */ 26 | template 27 | class soa_accessor; 28 | 29 | template 30 | class const_soa_accessor; 31 | 32 | template 33 | class soa_accessor_light; 34 | 35 | template 36 | class const_soa_accessor_light; 37 | 38 | } 39 | 40 | #endif 41 | 42 | -------------------------------------------------------------------------------- /include/libflatarray/detail/offset.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_OFFSET_HPP 10 | #define FLAT_ARRAY_DETAIL_OFFSET_HPP 11 | 12 | // disable certain warnings from system headers when compiling with 13 | // Microsoft Visual Studio: 14 | #ifdef _MSC_BUILD 15 | #pragma warning( push ) 16 | #pragma warning( disable : 4514 4548 4626 4710 4711 4820 4996 5027 ) 17 | #endif 18 | 19 | #include 20 | 21 | #ifdef _MSC_BUILD 22 | #pragma warning( pop ) 23 | #endif 24 | 25 | #ifdef _MSC_BUILD 26 | #pragma warning( push ) 27 | #pragma warning( disable : 4710 4711 ) 28 | #endif 29 | 30 | namespace LibFlatArray { 31 | 32 | namespace detail { 33 | 34 | namespace flat_array { 35 | 36 | template 37 | class offset; 38 | 39 | template 40 | class offset 41 | { 42 | public: 43 | static const long OFFSET = 0; 44 | 45 | template 46 | int operator()(MEMBER_TYPE CELL::* /* member_ptr */) 47 | { 48 | throw std::invalid_argument("member was not registered with LibFlatArray"); 49 | } 50 | }; 51 | 52 | } 53 | 54 | } 55 | 56 | } 57 | 58 | #ifdef _MSC_BUILD 59 | #pragma warning( pop ) 60 | #endif 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /include/libflatarray/cuda_allocator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0.
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_CUDA_ALLOCATOR_HPP 9 | #define FLAT_ARRAY_CUDA_ALLOCATOR_HPP 10 | 11 | #ifdef __CUDACC__ 12 | 13 | #ifdef __ICC 14 | // disabling this warning, as implicit type conversion here is an intended feature for dim3 15 | #pragma warning push 16 | #pragma warning (disable: 2304) 17 | #endif 18 | 19 | // disable certain warnings from system headers when compiling with 20 | // Microsoft Visual Studio: 21 | #ifdef _MSC_BUILD 22 | #pragma warning( push ) 23 | #pragma warning( disable : 4514 ) 24 | #endif 25 | 26 | #include 27 | 28 | #ifdef _MSC_BUILD 29 | #pragma warning( pop ) 30 | #endif 31 | 32 | #ifdef __ICC 33 | #pragma warning pop 34 | #endif 35 | 36 | namespace LibFlatArray { 37 | 38 | template 39 | class cuda_allocator 40 | { 41 | public: 42 | typedef ptrdiff_t difference_type; 43 | typedef T* pointer; 44 | typedef const T* const_pointer; 45 | typedef T& reference; 46 | typedef const T& const_reference; 47 | typedef T value_type; 48 | 49 | pointer allocate(std::size_t n, const void* = 0) 50 | { 51 | pointer ret; 52 | cudaMalloc(&ret, n * sizeof(T)); 53 | return ret; 54 | } 55 | 56 | void deallocate(pointer p, std::size_t) 57 | { 58 | cudaFree(p); 59 | } 60 | }; 61 | 62 | } 63 | 64 | #endif 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /test/cuda_allocator_test.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | #include "test.hpp" 12 | 13 | using namespace LibFlatArray; 14 | 15 | ADD_TEST(basic) 16 | { 17 | cuda_allocator allocator; 18 | 19 | double *devArray1 = allocator.allocate( 50); 20 | double *devArray2 = allocator.allocate(110); 21 | BOOST_TEST(devArray1 != devArray2); 22 | 23 | std::vector hostArray1(120, -1); 24 | std::vector hostArray2(130, -2); 25 | 26 | for (int i = 0; i < 50; ++i) { 27 | hostArray1[i] = i + 0.5; 28 | 29 | BOOST_TEST(hostArray2[i] == -2); 30 | } 31 | 32 | std::size_t byteSize = 50 * sizeof(double); 33 | cudaMemcpy(devArray1, &hostArray1[0], byteSize, cudaMemcpyHostToDevice); 34 | cudaMemcpy(devArray2, devArray1, byteSize, cudaMemcpyDeviceToDevice); 35 | cudaMemcpy(&hostArray2[0], devArray2, byteSize, cudaMemcpyDeviceToHost); 36 | 37 | for (int i = 0; i < 50; ++i) { 38 | double expected = i + 0.5; 39 | BOOST_TEST(hostArray2[i] == expected); 40 | } 41 | } 42 | 43 | ADD_TEST(null_allocation) 44 | { 45 | cuda_allocator allocator; 46 | double *p = allocator.allocate(0); 47 | allocator.deallocate(p, 0); 48 | BOOST_TEST(p == 0); 49 | } 50 | 51 | int main(int argc, char **argv) 52 | { 53 | return 0; 54 | } 55 | -------------------------------------------------------------------------------- /include/libflatarray/detail/generate_cuda_launch_config.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0.
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_GENERATE_CUDA_LAUNCH_CONFIG_HPP 10 | #define FLAT_ARRAY_DETAIL_GENERATE_CUDA_LAUNCH_CONFIG_HPP 11 | 12 | #include 13 | 14 | #ifdef LIBFLATARRAY_WITH_CUDA 15 | #ifdef __CUDACC__ 16 | 17 | namespace LibFlatArray { 18 | 19 | namespace detail { 20 | 21 | namespace flat_array { 22 | 23 | /** 24 | * Returns a somewhat sensible decomposition of the grid into thread 25 | * blocks for launching CUDA kernels. 26 | */ 27 | class generate_cuda_launch_config 28 | { 29 | public: 30 | void operator()(dim3 *grid_dim, dim3 *block_dim, int x, int y, int z) 31 | { 32 | if (y >= 4) { 33 | *block_dim = dim3(128, 4, 1); 34 | } else { 35 | *block_dim = dim3(512, 1, 1); 36 | } 37 | 38 | grid_dim->x = divide_and_round_up(x, block_dim->x); 39 | grid_dim->y = divide_and_round_up(y, block_dim->y); 40 | grid_dim->z = divide_and_round_up(z, block_dim->z); 41 | } 42 | 43 | private: 44 | int divide_and_round_up(int i, int dividend) 45 | { 46 | int ret = i / dividend; 47 | if (i % dividend) { 48 | ret += 1; 49 | } 50 | 51 | return ret; 52 | } 53 | }; 54 | 55 | } 56 | 57 | } 58 | 59 | } 60 | 61 | #endif 62 | #endif 63 | 64 | #endif 65 | 66 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ABOUT 2 | ===== 3 | 4 | LibFlatArray acts as a highly efficient multi-dimensional array of 5 | arbitrary objects (array of structs, AoS), but really uses a struct of 6 | arrays (SoA) memory layout. It's great for writing vectorized code and 7 | its lightning-fast iterators give you access to neighboring elements 8 | with zero address generation overhead. 9 | 10 | Use cases include: 11 | - computer simulations (e.g. stencil codes such as Lattice Boltzmann Methods) 12 | - image processing (e.g. Gaussian filters) 13 | - numerical methods (e.g. multiplication of complex matrices) 14 | 15 | The library is written in C++ and uses templates to shift the burden 16 | of address computation from runtime to compile time. It shares some 17 | infrastructure with its parent project LibGeoDecomp. 18 | 19 | Further information: 20 | - homepage: http://www.libgeodecomp.org/libflatarray.html 21 | - mailing list: http://www.libgeodecomp.org/mailing_lists.html 22 | - source repository: https://bitbucket.org/gentryx/libflatarray 23 | - contributors: see file "AUTHORS" 24 | 25 | DEPENDENCIES 26 | ============ 27 | 28 | - C++ compiler (min. C++98, tested with GCC's g++, Clang's clang++, 29 | and Intel's icpc) 30 | 31 | - CMake (min. 2.8.10) 32 | 33 | - build tool supported by CMake (e.g. make, ninja) 34 | 35 | BUILDING 36 | ======== 37 | 38 | For compiling LibFlatArray you'll need CMake (http://www.cmake.org) 39 | installed. We recommend an out-of-source build: 40 | 41 | BUILD_DIR=build/`uname -ms | sed s/\ /-/g` 42 | mkdir -p $BUILD_DIR 43 | cd $BUILD_DIR 44 | cmake ../../ 45 | make 46 | -------------------------------------------------------------------------------- /include/libflatarray/detail/copy_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_COPY_FUNCTOR_HPP 9 | #define FLAT_ARRAY_DETAIL_COPY_FUNCTOR_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | namespace detail { 14 | 15 | namespace flat_array { 16 | 17 | /** 18 | * Will copy all members of all grid cells by invoking std::copy on 19 | * all member instances. We can't just memcpy() as members may still 20 | * be C++ objects that need to run their copy c-tors for 21 | * allocation/deallocation 22 | */ 23 | template 24 | class copy_functor 25 | { 26 | public: 27 | copy_functor( 28 | std::size_t dim_x, 29 | std::size_t dim_y, 30 | std::size_t dim_z) : 31 | dim_x(dim_x), 32 | dim_y(dim_y), 33 | dim_z(dim_z) 34 | {} 35 | 36 | template 37 | void operator()(ACCESSOR1& source_accessor, ACCESSOR2 target_accessor) const 38 | { 39 | for (std::size_t z = 0; z < dim_z; ++z) { 40 | for (std::size_t y = 0; y < dim_y; ++y) { 41 | target_accessor.index() = ACCESSOR1::gen_index(0, y, z); 42 | source_accessor.index() = target_accessor.index(); 43 | target_accessor.copy_members(source_accessor, dim_x); 44 | } 45 | } 46 | } 47 | 48 | private: 49 | std::size_t dim_x; 50 | std::size_t dim_y; 51 | std::size_t dim_z; 52 | }; 53 | 54 | } 55 | 56 | } 57 | 58 | } 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /include/libflatarray/detail/generic_destruct.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_GENERIC_DESTRUCT_HPP 9 | #define FLAT_ARRAY_DETAIL_GENERIC_DESTRUCT_HPP 10 | 11 | // this fixes compilation for non-cuda builds 12 | #ifndef __host__ 13 | #define __host__ 14 | #endif 15 | 16 | #ifndef __device__ 17 | #define __device__ 18 | #endif 19 | 20 | // Don't warn about these functions being stripped from an executable 21 | // as they're not being used, that's actually expected behavior. 22 | #ifdef _MSC_BUILD 23 | #pragma warning( push ) 24 | #pragma warning( disable : 4514 ) 25 | #endif 26 | 27 | namespace LibFlatArray { 28 | 29 | namespace detail { 30 | 31 | namespace flat_array { 32 | 33 | template 34 | __host__ __device__ 35 | inline void generic_destruct(TYPENAME *member) 36 | { 37 | member->~TYPENAME(); 38 | } 39 | 40 | // primitive types don't have d-tors: 41 | __host__ __device__ 42 | inline void generic_destruct(char *) 43 | {} 44 | 45 | __host__ __device__ 46 | inline void generic_destruct(float *) 47 | {} 48 | 49 | __host__ __device__ 50 | inline void generic_destruct(double *) 51 | {} 52 | 53 | __host__ __device__ 54 | inline void generic_destruct(int *) 55 | {} 56 | 57 | __host__ __device__ 58 | inline void generic_destruct(unsigned *) 59 | {} 60 | 61 | __host__ __device__ 62 | inline void generic_destruct(long *) 63 | {} 64 | 65 | } 66 | 67 | } 68 | 69 | } 70 | 71 | #ifdef _MSC_BUILD 72 | #pragma warning( pop ) 73 | #endif 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /include/libflatarray/testbed/benchmark.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_TESTBED_BENCHMARK_HPP 10 | #define FLAT_ARRAY_TESTBED_BENCHMARK_HPP 11 | 12 | // disable certain warnings from system headers when compiling with 13 | // Microsoft Visual Studio: 14 | #ifdef _MSC_BUILD 15 | #pragma warning( push ) 16 | #pragma warning( disable : 4514 4548 4668 4711 4820 4996 ) 17 | #endif 18 | 19 | #include 20 | #include 21 | 22 | #ifdef _WIN32 23 | #include 24 | #else 25 | #include 26 | #endif 27 | 28 | #ifdef _MSC_BUILD 29 | #pragma warning( pop ) 30 | #endif 31 | 32 | namespace LibFlatArray { 33 | 34 | class benchmark 35 | { 36 | public: 37 | virtual ~benchmark() 38 | {} 39 | 40 | virtual std::string order() = 0; 41 | virtual std::string family() = 0; 42 | virtual std::string species() = 0; 43 | virtual double performance(std::vector dim) = 0; 44 | virtual std::string unit() = 0; 45 | virtual std::string device() = 0; 46 | 47 | static 48 | inline double time() 49 | { 50 | #ifdef _WIN32 51 | LARGE_INTEGER time; 52 | LARGE_INTEGER freq; 53 | QueryPerformanceCounter(&time); 54 | QueryPerformanceFrequency(&freq); 55 | return 1.0 * time.QuadPart / freq.QuadPart; 56 | #else 57 | timeval t; 58 | gettimeofday(&t, 0); 59 | return t.tv_sec + t.tv_usec * 1.0e-6; 60 | #endif 61 | } 62 | 63 | }; 64 | 65 | 66 | } 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /include/libflatarray/preprocessor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer, 3 | * heavily based on the Boost Preprocessor library by Paul Mensonides (copyright 2002) 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_PREPROCESSOR_HPP 10 | #define FLAT_ARRAY_PREPROCESSOR_HPP 11 | 12 | #include 13 | 14 | /** 15 | * Returns the element of LIST at position INDEX. Assumes 0-based 16 | * addressing. 17 | */ 18 | #define LIBFLATARRAY_ELEM(INDEX, LIST) LIBFLATARRAY_ELEM_I(INDEX, LIST) 19 | 20 | /** 21 | * Returns the length of LIST. LIST is assumed to be of the form 22 | * 23 | * (foo)(bar)(goo) 24 | * 25 | * i.e. all elements are enclosed in parentheses. 26 | */ 27 | #define LIBFLATARRAY_SIZE(LIST) LIBFLATARRAY_SIZE_I(LIBFLATARRAY_SIZE_0 LIST) 28 | 29 | // Expands to an empty string, useful for deleting arguments from a 30 | // list. 31 | #define LIBFLATARRAY_NULL(_) 32 | 33 | // Returns a list which is identical to LIST, but with the first 34 | // element removed. Will fail for empty lists. 35 | #define LIBFLATARRAY_DEQUEUE(LIST) LIBFLATARRAY_NULL LIST 36 | 37 | /** 38 | * Will instantiate MACRO for each element of LIST with three parameters: 39 | * 1. an integer index, starting at 0, 40 | * 2. DEFAULT_ARG 41 | * 3. the element of LIST at the given index. 42 | */ 43 | #define LIBFLATARRAY_FOR_EACH(MACRO, DEFAULT_ARG, LIST) LIBFLATARRAY_FOR_EACH_I(MACRO, DEFAULT_ARG, LIBFLATARRAY_DEQUEUE(LIST), LIST) 44 | 45 | /** 46 | * Will expand to A if the size of LIST is less than LENGTH. Will 47 | * expand to B if the number of elements in LIST is equal to or larger 48 | * than LENGTH.
49 | */ 50 | #define LIBFLATARRAY_IF_SHORTER(LIST, LENGTH, A, B) LIBFLATARRAY_IF_SHORTER_I(LIBFLATARRAY_IF_SHORTER_ ## LENGTH, LIST, A, B) 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /include/libflatarray/detail/set_byte_size_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_SET_BYTE_SIZE_FUNCTOR_HPP 9 | #define FLAT_ARRAY_DETAIL_SET_BYTE_SIZE_FUNCTOR_HPP 10 | 11 | #include 12 | 13 | namespace LibFlatArray { 14 | 15 | namespace detail { 16 | 17 | namespace flat_array { 18 | 19 | /** 20 | * This helper class uses the dimension specified in the accessor to 21 | * compute how many bytes a grid needs to allocate in memory. 22 | */ 23 | template 24 | class set_byte_size_functor 25 | { 26 | public: 27 | explicit set_byte_size_functor( 28 | std::size_t *byte_size, 29 | std::size_t *extent_x, 30 | std::size_t *extent_y, 31 | std::size_t *extent_z) : 32 | byte_size(byte_size), 33 | extent_x(extent_x), 34 | extent_y(extent_y), 35 | extent_z(extent_z) 36 | {} 37 | 38 | template 39 | void operator()(const soa_accessor& /* accessor */) const 40 | { 41 | // Overflow is fine here (it's actually to be expected for 42 | // 32-bit builds) as such large grids can't be instantiated at 43 | // runtime anyway. 44 | #ifdef _MSC_BUILD 45 | #pragma warning( push ) 46 | #pragma warning( disable : 4307 ) 47 | #endif 48 | 49 | *byte_size = aggregated_member_size::VALUE * DIM_X * DIM_Y * DIM_Z; 50 | 51 | #ifdef _MSC_BUILD 52 | #pragma warning( pop ) 53 | #endif 54 | 55 | *extent_x = DIM_X; 56 | *extent_y = DIM_Y; 57 | *extent_z = DIM_Z; 58 | } 59 | 60 | private: 61 | std::size_t *byte_size; 62 | std::size_t *extent_x; 63 | std::size_t *extent_y; 64 | std::size_t *extent_z; 65 | }; 66 | 67 | } 68 | 69 | } 70 | 71 | } 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /include/libflatarray/ilp_to_arity.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_ILP_TO_ARITY_HPP 9 | #define FLAT_ARRAY_ILP_TO_ARITY_HPP 10 | 11 | #include 12 | 13 | namespace LibFlatArray { 14 | 15 | /** 16 | * This class allows users to select the arity of a short_vec type by 17 | * specifying the desired degree of instruction level parallelism 18 | * (i.e. loop unrolling factor). For instance, setting ILP to 4 for 19 | * double on an AVX-capable CPU would yield short_vec, but 20 | * for an SSE-only CPU it would return a short_vec. 21 | */ 22 | template 23 | class ilp_to_arity 24 | { 25 | public: 26 | // Revert to scalar values when running on a CUDA device.
The 27 | // vector unit is much wider, but from a programming PoV it's 28 | // scalar: 29 | #ifdef __CUDA_ARCH__ 30 | static const std::size_t ARITY = 1; 31 | #else 32 | // for IBM Blue Gene/Q's QPX, which is mutually exclusive to 33 | // Intel/AMD's AVX/SSE or ARM's NEON ISAs: 34 | # ifdef __VECTOR4DOUBLE__ 35 | static const int BIT_WIDTH = 256; 36 | # endif 37 | 38 | // Dito for ARM NEON: 39 | # ifdef __ARM_NEON__ 40 | static const int BIT_WIDTH = 128; 41 | # endif 42 | 43 | // Only the case of the IBM PC is complicated. No thanks to you, 44 | // history! 45 | # if !defined(__CUDA_ARCH__) && !defined(__ARM_NEON__) && !defined(__MIC__) 46 | # ifdef LFA_AVX512_HELPER 47 | static const int BIT_WIDTH = 512; 48 | # else 49 | # ifdef __AVX__ 50 | static const int BIT_WIDTH = 256; 51 | # else 52 | # ifdef __SSE__ 53 | static const int BIT_WIDTH = 128; 54 | # else 55 | static const int BIT_WIDTH = sizeof(CARGO) * 8; 56 | # endif 57 | # endif 58 | # endif 59 | # endif 60 | static const std::size_t ARITY = ILP * BIT_WIDTH / sizeof(CARGO) / 8; 61 | #endif 62 | 63 | }; 64 | 65 | } 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /examples/lbm/update_lbm_cuda_flat_array.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013-2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_CUDA_FLAT_ARRAY_H 9 | #define LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_CUDA_FLAT_ARRAY_H 10 | 11 | #include 12 | #include 13 | 14 | #include "util.h" 15 | #include "cudalineupdatefunctorprototype.h" 16 | 17 | class benchmark_lbm_cuda_flat_array : public benchmark_lbm_cuda 18 | { 19 | virtual double cudaExec(int dim, dim3 dimBlock, dim3 dimGrid, int repeats) 20 | { 21 | LibFlatArray::soa_grid gridA(dim, dim, 256); 22 | LibFlatArray::soa_grid gridB(dim, dim, 256); 23 | // fixme: init grid? 
24 | 25 | char *dataA = gridA.data(); 26 | char *dataB = gridB.data(); 27 | 28 | char *buf; 29 | cudaMalloc(reinterpret_cast(&buf), gridA.byte_size()); 30 | gridA.set_data(buf); 31 | cudaMalloc(reinterpret_cast(&buf), gridB.byte_size()); 32 | gridB.set_data(buf); 33 | 34 | LibFlatArray::soa_grid *gridOld = &gridA; 35 | LibFlatArray::soa_grid *gridNew = &gridB; 36 | 37 | cudaDeviceSynchronize(); 38 | double t_start = LibFlatArray::benchmark::time(); 39 | 40 | CudaLineUpdateFunctorPrototype updater(dimBlock, dimGrid); 41 | 42 | for (int t = 0; t < repeats; ++t) { 43 | gridOld->callback(gridNew, updater); 44 | std::swap(gridOld, gridNew); 45 | } 46 | 47 | cudaDeviceSynchronize(); 48 | double t_end = LibFlatArray::benchmark::time(); 49 | check_cuda_error(); 50 | 51 | cudaFree(gridA.data()); 52 | cudaFree(gridB.data()); 53 | 54 | gridA.set_data(dataA); 55 | gridB.set_data(dataB); 56 | 57 | return t_end - t_start; 58 | } 59 | 60 | virtual std::string name() 61 | { 62 | return "lbm_cuda_flat_array"; 63 | } 64 | }; 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /examples/lbm/cudalineupdatefunctorprototype.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBFLATARRAY_EXAMPLES_LBM_CUDALINEUPDATEFUNCTORPROTOTYPE_H 2 | #define LIBFLATARRAY_EXAMPLES_LBM_CUDALINEUPDATEFUNCTORPROTOTYPE_H 3 | 4 | #include "cell.h" 5 | 6 | template 7 | __global__ 8 | void update(ACCESSOR1 accessor1, ACCESSOR2 accessor2) 9 | { 10 | ACCESSOR1 accessorOld(accessor1.data(), 0); 11 | ACCESSOR2 accessorNew(accessor2.data(), 0); 12 | 13 | CELL::updateLine( 14 | accessorOld, &accessorOld.index(), 15 | accessorNew, &accessorNew.index(), 2, 256 - 2); 16 | } 17 | 18 | template 19 | class CudaLineUpdateFunctorPrototypeImplementation 20 | { 21 | public: 22 | CudaLineUpdateFunctorPrototypeImplementation(dim3 dim_block, dim3 dim_grid) : 23 | dim_block(dim_block), 24 | dim_grid(dim_grid) 25 | {} 26 | 27 | template 28 | void operator()(ACCESSOR1 accessor1, ACCESSOR2 accessor2) const 29 | { 30 | update<<>>(accessor1, accessor2); 31 | } 32 | 33 | private: 34 | dim3 dim_block; 35 | dim3 dim_grid; 36 | }; 37 | 38 | template 39 | class CudaLineUpdateFunctorPrototype 40 | { 41 | public: 42 | CudaLineUpdateFunctorPrototype(dim3 dim_block, dim3 dim_grid) : 43 | dim_block(dim_block), 44 | dim_grid(dim_grid) 45 | {} 46 | 47 | template 48 | void operator()(ACCESSOR1 accessor1, ACCESSOR2 accessor2) const; 49 | 50 | private: 51 | dim3 dim_block; 52 | dim3 dim_grid; 53 | }; 54 | 55 | #define IMPLEMENTATION(CELL, X1, Y1, Z1, X2, Y2, Z2) \ 56 | template<> \ 57 | template<> \ 58 | void CudaLineUpdateFunctorPrototype::operator()( \ 59 | LibFlatArray::soa_accessor accessor1, \ 60 | LibFlatArray::soa_accessor accessor2) const \ 61 | { \ 62 | CudaLineUpdateFunctorPrototypeImplementation i(dim_block, dim_grid); \ 63 | i(accessor1, accessor2); \ 64 | } 65 | 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /include/libflatarray/detail/staging_buffer.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_STAGING_BUFFER_HPP 9 | #define FLAT_ARRAY_DETAIL_STAGING_BUFFER_HPP 10 | 11 | #include 12 | 13 | namespace LibFlatArray { 14 | 15 | namespace detail { 16 | 17 | namespace flat_array { 18 | 19 | /** 20 | * Dummy class which presents the same interface as cuda_array, 21 | * but won't actually buffer the data. Instead the pointers are 22 | * forwarded directly so no additional copies of the data need to 23 | * be made. 24 | */ 25 | template 26 | class staging_buffer 27 | { 28 | public: 29 | void resize(std::size_t /* unused */) 30 | { 31 | // intentionally left blank 32 | } 33 | 34 | void load(const CELL *new_data) 35 | { 36 | data_pointer = const_cast(new_data); 37 | } 38 | 39 | void save(CELL* /* new_data */) const 40 | { 41 | // intentionally left blank 42 | } 43 | 44 | const CELL *data() const 45 | { 46 | return data_pointer; 47 | } 48 | 49 | CELL *data() 50 | { 51 | return data_pointer; 52 | } 53 | 54 | void prep(CELL *new_data) 55 | { 56 | data_pointer = new_data; 57 | } 58 | private: 59 | CELL *data_pointer; 60 | }; 61 | 62 | #ifdef __CUDACC__ 63 | 64 | template 65 | class staging_buffer 66 | { 67 | public: 68 | void resize(std::size_t n) 69 | { 70 | delegate.resize(n); 71 | } 72 | 73 | void load(const CELL *new_data) 74 | { 75 | delegate.load(new_data); 76 | } 77 | 78 | void save(CELL *new_data) const 79 | { 80 | delegate.save(new_data); 81 | } 82 | 83 | const CELL *data() const 84 | { 85 | return delegate.data(); 86 | } 87 | 88 | CELL *data() 89 | { 90 | return delegate.data(); 91 | } 92 | 93 | void prep(CELL* /* new_data */) 94 | { 95 | // intentionally left blank 96 | } 97 | 98 | private: 99 | cuda_array delegate; 100 | }; 101 | 102 | #endif 103 | 104 | } 105 | 106 | } 107 | 108 | } 109 | 110 | #endif 111 | 112 | -------------------------------------------------------------------------------- /test/aligned_allocator_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | // globally disable some warnings with MSVC, that are issued not for a 9 | // specific header, but rather for the interaction of system headers 10 | // and LibFlatArray source: 11 | #ifdef _MSC_BUILD 12 | #pragma warning( disable : 4710 ) 13 | #endif 14 | 15 | #include 16 | 17 | // disable certain warnings from system headers when compiling with 18 | // Microsoft Visual Studio: 19 | #ifdef _MSC_BUILD 20 | #pragma warning( push ) 21 | #pragma warning( disable : 4514 ) 22 | #endif 23 | 24 | #include 25 | 26 | #ifdef _MSC_BUILD 27 | #pragma warning( pop ) 28 | #endif 29 | 30 | #include "test.hpp" 31 | 32 | using namespace LibFlatArray; 33 | 34 | ADD_TEST(test_alignment_64) 35 | { 36 | int *p = aligned_allocator().allocate(3); 37 | BOOST_TEST(0 == (long(p) % 64)); 38 | aligned_allocator().deallocate(p, 3); 39 | } 40 | 41 | ADD_TEST(test_alignment_128) 42 | { 43 | char *p = aligned_allocator().allocate(199); 44 | BOOST_TEST(0 == (long(p) % 128)); 45 | aligned_allocator().deallocate(p, 199); 46 | } 47 | 48 | ADD_TEST(test_alignment_512) 49 | { 50 | long *p = aligned_allocator().allocate(256); 51 | BOOST_TEST(0 == (long(p) % 512)); 52 | aligned_allocator().deallocate(p, 256); 53 | } 54 | 55 | ADD_TEST(test_usage_with_std_vector) 56 | { 57 | typedef std::vector > vec_type; 58 | vec_type vec(40, -1); 59 | 60 | BOOST_TEST(0 == (std::size_t(&vec[0])) % 64); 61 | 62 | for (vec_type::iterator i = vec.begin(); i != vec.end(); ++i) { 63 | BOOST_TEST(-1 == *i); 64 | } 65 | 66 | vec.resize(80); 67 | for (int i = 0; i < 80; ++i) { 68 | vec[std::size_t(i)] = 4711 + i; 69 | } 70 | for (int i = 0; i < 80; ++i) { 71 | BOOST_TEST((4711 + i) == vec[std::size_t(i)]); 72 | } 73 | 74 | vec.resize(0); 75 | for (int i = 0; i < 90; ++i) { 76 | vec.push_back(23 + i); 77 | } 78 | for (int i = 0; i < 90; ++i) { 79 | BOOST_TEST((23 + i) == vec[std::size_t(i)]); 80 | } 81 | 82 | vec.resize(0); 83 | vec.reserve(95); 84 | for (int i = 0; i < 95; ++i) { 85 | vec.push_back(69 + i); 86 | } 87 | for (int i = 0; i < 95; ++i) { 88 | BOOST_TEST((69 + i) == vec[std::size_t(i)]); 89 | } 90 | 91 | } 92 | 93 | int main(int /* argc */, char** /* argv */) 94 | { 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /include/libflatarray/streaming_short_vec.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_STREAMING_SHORT_VEC_HPP 10 | #define FLAT_ARRAY_STREAMING_SHORT_VEC_HPP 11 | 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | #ifdef __ICC 17 | // disabling this warning as implicit type conversion is exactly our goal here: 18 | #pragma warning push 19 | #pragma warning (disable: 2304) 20 | #endif 21 | 22 | template 23 | class streaming_short_vec; 24 | 25 | template 26 | inline bool any(const streaming_short_vec& vec) 27 | { 28 | return vec.any(); 29 | } 30 | 31 | // Don't warn about these functions being stripped from an executable 32 | // as they're not being used, that's actually expected behavior. 
33 | #ifdef _MSC_BUILD 34 | #pragma warning( push ) 35 | #pragma warning( disable : 4514 ) 36 | #endif 37 | 38 | /** 39 | * Wraps functionality of short_vec, but replaces all stores by 40 | * streaming (i.e. non-temporal) stores. Downside: all store addresses 41 | * must be aligned. 42 | */ 43 | template 44 | class streaming_short_vec : public short_vec 45 | { 46 | public: 47 | 48 | inline 49 | streaming_short_vec(const CARGO val = 0) : short_vec(val) 50 | {} 51 | 52 | inline 53 | streaming_short_vec(const CARGO *data) : short_vec(data) 54 | {} 55 | 56 | inline 57 | streaming_short_vec(short_vec&& val) : short_vec(std::move(val)) 58 | {} 59 | 60 | #ifdef LIBFLATARRAY_WITH_CPP14 61 | inline 62 | streaming_short_vec(const std::initializer_list& list) 63 | { 64 | const CARGO *ptr = static_cast(&(*list.begin())); 65 | load(ptr); 66 | } 67 | #endif 68 | 69 | using short_vec::load; 70 | 71 | inline 72 | void store(CARGO *data) 73 | { 74 | short_vec::store_nt(data); 75 | } 76 | 77 | inline 78 | void store_aligned(CARGO *data) 79 | { 80 | short_vec::store_nt(data); 81 | } 82 | }; 83 | 84 | #ifdef __ICC 85 | #pragma warning pop 86 | #endif 87 | 88 | template 89 | inline 90 | void operator<<(double *data, const streaming_short_vec& vec) 91 | { 92 | vec.store_nt(data); 93 | } 94 | 95 | #ifdef _MSC_BUILD 96 | #pragma warning( pop ) 97 | #endif 98 | 99 | } 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /include/libflatarray/detail/construct_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_CONSTRUCT_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_CONSTRUCT_FUNCTOR_HPP 11 | 12 | #include 13 | #include 14 | 15 | namespace LibFlatArray { 16 | 17 | namespace detail { 18 | 19 | namespace flat_array { 20 | 21 | /** 22 | * Will initialize all grid cells, relies on the SoA (Struct of 23 | * Arrays) accessor to initialize a cell's members individually. 
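 *
 * Usage sketch (illustrative only; "MyCell" stands for any cell type
 * registered via LIBFLATARRAY_REGISTER_SOA, and the owning container is
 * assumed to be the soa_grid facade of this library):
 *
 *   LibFlatArray::soa_grid<MyCell> grid(128, 64, 32);
 *   // while setting up its storage the grid applies construct_functor,
 *   // so each of the 128 * 64 * 32 cells is default-constructed in place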
24 | */ 25 | template 26 | class construct_functor 27 | { 28 | public: 29 | construct_functor( 30 | std::size_t dim_x, 31 | std::size_t dim_y, 32 | std::size_t dim_z) : 33 | dim_x(dim_x), 34 | dim_y(dim_y), 35 | dim_z(dim_z) 36 | {} 37 | 38 | template 39 | void operator()(soa_accessor& accessor) const 40 | { 41 | for (std::size_t z = 0; z < dim_z; ++z) { 42 | for (std::size_t y = 0; y < dim_y; ++y) { 43 | accessor.index() = soa_accessor::gen_index(0, y, z); 44 | 45 | for (std::size_t x = 0; x < dim_x; ++x) { 46 | accessor.construct_members(); 47 | ++accessor; 48 | } 49 | } 50 | } 51 | } 52 | 53 | private: 54 | std::size_t dim_x; 55 | std::size_t dim_y; 56 | std::size_t dim_z; 57 | }; 58 | 59 | #ifdef LIBFLATARRAY_WITH_CUDA 60 | #ifdef __CUDACC__ 61 | 62 | template 63 | __global__ 64 | void construct_kernel(char *data, long dim_x, long dim_y, long dim_z) 65 | { 66 | long x = blockDim.x * blockIdx.x + threadIdx.x; 67 | long y = blockDim.y * blockIdx.y + threadIdx.y; 68 | long z = blockDim.z * blockIdx.z + threadIdx.z; 69 | 70 | if (x >= dim_x) { 71 | return; 72 | } 73 | 74 | if (y >= dim_y) { 75 | return; 76 | } 77 | 78 | if (z >= dim_z) { 79 | return; 80 | } 81 | 82 | typedef soa_accessor_light accessor_type; 83 | 84 | long index = accessor_type::gen_index(x, y, z); 85 | accessor_type accessor(data, index); 86 | accessor.construct_members(); 87 | } 88 | 89 | /** 90 | * Specialization for CUDA 91 | */ 92 | template 93 | class construct_functor 94 | { 95 | public: 96 | construct_functor( 97 | std::size_t dim_x, 98 | std::size_t dim_y, 99 | std::size_t dim_z) : 100 | dim_x(dim_x), 101 | dim_y(dim_y), 102 | dim_z(dim_z) 103 | {} 104 | 105 | template 106 | void operator()(soa_accessor& accessor) const 107 | { 108 | dim3 grid_dim; 109 | dim3 block_dim; 110 | generate_cuda_launch_config()(&grid_dim, &block_dim, dim_x, dim_y, dim_z); 111 | 112 | construct_kernel<<>>(accessor.data(), dim_x, dim_y, dim_z); 113 | } 114 | 115 | private: 116 | std::size_t dim_x; 117 | std::size_t dim_y; 118 | std::size_t dim_z; 119 | }; 120 | 121 | #endif 122 | #endif 123 | 124 | } 125 | 126 | } 127 | 128 | } 129 | 130 | #endif 131 | -------------------------------------------------------------------------------- /include/libflatarray/detail/destroy_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_DESTROY_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_DESTROY_FUNCTOR_HPP 11 | 12 | #include 13 | #include 14 | 15 | namespace LibFlatArray { 16 | 17 | namespace detail { 18 | 19 | namespace flat_array { 20 | 21 | /** 22 | * Will call the destructor on all grid cells, relies on the SoA 23 | * (Struct of Arrays) accessor to destroy a cell's members 24 | * individually. 
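 *
 * This functor is the counterpart to construct_functor: a container that
 * has constructed cells in a region of SoA storage is expected to apply
 * destroy_functor to that same region before releasing or shrinking the
 * storage, so that member destructors run exactly once per cell.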
25 | */ 26 | template 27 | class destroy_functor 28 | { 29 | public: 30 | destroy_functor( 31 | std::size_t dim_x, 32 | std::size_t dim_y, 33 | std::size_t dim_z) : 34 | dim_x(dim_x), 35 | dim_y(dim_y), 36 | dim_z(dim_z) 37 | {} 38 | 39 | template 40 | void operator()(soa_accessor& accessor) const 41 | { 42 | for (std::size_t z = 0; z < dim_z; ++z) { 43 | for (std::size_t y = 0; y < dim_y; ++y) { 44 | accessor.index() = long(soa_accessor::gen_index(0, y, z)); 45 | 46 | for (std::size_t x = 0; x < dim_x; ++x) { 47 | accessor.destroy_members(); 48 | ++accessor; 49 | } 50 | } 51 | } 52 | } 53 | 54 | private: 55 | std::size_t dim_x; 56 | std::size_t dim_y; 57 | std::size_t dim_z; 58 | }; 59 | 60 | #ifdef LIBFLATARRAY_WITH_CUDA 61 | #ifdef __CUDACC__ 62 | 63 | template 64 | __global__ 65 | void destroy_kernel(char *data, long dim_x, long dim_y, long dim_z) 66 | { 67 | long x = blockDim.x * blockIdx.x + threadIdx.x; 68 | long y = blockDim.y * blockIdx.y + threadIdx.y; 69 | long z = blockDim.z * blockIdx.z + threadIdx.z; 70 | 71 | if (x >= dim_x) { 72 | return; 73 | } 74 | 75 | if (y >= dim_y) { 76 | return; 77 | } 78 | 79 | if (z >= dim_z) { 80 | return; 81 | } 82 | 83 | typedef soa_accessor_light accessor_type; 84 | 85 | long index = accessor_type::gen_index(x, y, z); 86 | accessor_type accessor(data, index); 87 | accessor.destroy_members(); 88 | } 89 | 90 | /** 91 | * Specialization for CUDA 92 | */ 93 | template 94 | class destroy_functor 95 | { 96 | public: 97 | destroy_functor( 98 | std::size_t dim_x, 99 | std::size_t dim_y, 100 | std::size_t dim_z) : 101 | dim_x(dim_x), 102 | dim_y(dim_y), 103 | dim_z(dim_z) 104 | {} 105 | 106 | template 107 | void operator()(soa_accessor& accessor) const 108 | { 109 | dim3 grid_dim; 110 | dim3 block_dim; 111 | generate_cuda_launch_config()(&grid_dim, &block_dim, dim_x, dim_y, dim_z); 112 | 113 | destroy_kernel<<>>(accessor.data(), dim_x, dim_y, dim_z); 114 | } 115 | 116 | private: 117 | std::size_t dim_x; 118 | std::size_t dim_y; 119 | std::size_t dim_z; 120 | }; 121 | 122 | #endif 123 | #endif 124 | 125 | } 126 | 127 | } 128 | 129 | } 130 | 131 | #endif 132 | 133 | -------------------------------------------------------------------------------- /include/libflatarray/detail/dual_callback_helper.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_DUAL_CALLBACK_HELPER_HPP 9 | #define FLAT_ARRAY_DETAIL_DUAL_CALLBACK_HELPER_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | namespace detail { 14 | 15 | namespace flat_array { 16 | 17 | template 18 | class dual_callback_helper2 19 | { 20 | public: 21 | dual_callback_helper2(ACCESSOR1& accessor1, FUNCTOR& functor) : 22 | accessor1(accessor1), 23 | functor(functor) 24 | {} 25 | 26 | template 27 | void operator()(ACCESSOR2& accessor2) 28 | { 29 | functor(accessor1, accessor2); 30 | } 31 | 32 | private: 33 | ACCESSOR1& accessor1; 34 | FUNCTOR& functor; 35 | }; 36 | 37 | template 38 | class dual_callback_helper1 39 | { 40 | public: 41 | dual_callback_helper1(GRID2 *grid2, const FUNCTOR& functor) : 42 | grid2(grid2), 43 | functor(functor) 44 | {} 45 | 46 | template 47 | void operator()(ACCESSOR1& accessor1) const 48 | { 49 | dual_callback_helper2 helper(accessor1, functor); 50 | grid2->callback(helper); 51 | } 52 | 53 | private: 54 | GRID2 *grid2; 55 | FUNCTOR& functor; 56 | }; 57 | 58 | class dual_callback_helper 59 | { 60 | public: 61 | template 62 | void operator()(GRID1 *gridOld, GRID2 *gridNew, FUNCTOR& functor) 63 | { 64 | dual_callback_helper1 helper(gridNew, functor); 65 | gridOld->callback(helper); 66 | } 67 | }; 68 | 69 | template 70 | class dual_callback_helper_symmetric 71 | { 72 | public: 73 | dual_callback_helper_symmetric(GRID_TYPE *other_grid, FUNCTOR& functor) : 74 | other_grid(other_grid), 75 | functor(functor) 76 | {} 77 | 78 | template 79 | void operator()(ACCESSOR& accessor1) const 80 | { 81 | ACCESSOR accessor2(other_grid->data()); 82 | 83 | functor(accessor1, accessor2); 84 | } 85 | 86 | private: 87 | GRID_TYPE *other_grid; 88 | FUNCTOR& functor; 89 | }; 90 | 91 | // Hardwire this warning to off as MSVC would otherwise complain about 92 | // an assignment operator missing -- which is clearly there: 93 | #ifdef _MSC_BUILD 94 | #pragma warning( push ) 95 | #pragma warning( disable : 4626 4710 ) 96 | #endif 97 | 98 | template 99 | class const_dual_callback_helper_symmetric 100 | { 101 | public: 102 | 103 | #ifdef LIBFLATARRAY_WITH_CPP14 104 | inline const_dual_callback_helper_symmetric(const const_dual_callback_helper_symmetric& other) = default; 105 | inline const_dual_callback_helper_symmetric(const_dual_callback_helper_symmetric&& other) = default; 106 | #endif 107 | 108 | const_dual_callback_helper_symmetric(GRID_TYPE *other_grid, const FUNCTOR& functor) : 109 | other_grid(other_grid), 110 | functor(functor) 111 | {} 112 | 113 | template 114 | void operator()(ACCESSOR& accessor1) const 115 | { 116 | ACCESSOR accessor2(other_grid->data(), 0); 117 | 118 | functor(accessor1, accessor2); 119 | } 120 | 121 | private: 122 | GRID_TYPE *other_grid; 123 | const FUNCTOR& functor; 124 | }; 125 | 126 | #ifdef _MSC_BUILD 127 | #pragma warning( pop ) 128 | #endif 129 | 130 | } 131 | 132 | } 133 | 134 | } 135 | 136 | #endif 137 | -------------------------------------------------------------------------------- /CMakeModules/FindSilo.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2008-2012 Sandia Corporation, Kitware Inc. 2 | # Copyright (c) 2014-2014 Andreas Schäfer 3 | # 4 | # Sandia National Laboratories, New Mexico 5 | # PO Box 5800 6 | # Albuquerque, NM 87185 7 | # 8 | # Kitware Inc. 
9 | # 28 Corporate Drive 10 | # Clifton Park, NY 12065 11 | # USA 12 | # 13 | # Andreas Schäfer 14 | # Informatik 3 15 | # Martensstr. 3 16 | # 91058 Erlangen 17 | # Germany 18 | # 19 | # Under the terms of Contract DE-AC04-94AL85000, there is a 20 | # non-exclusive license for use of this work by or on behalf of the 21 | # U.S. Government. 22 | # 23 | # Redistribution and use in source and binary forms, with or without 24 | # modification, are permitted provided that the following conditions are 25 | # met: 26 | # 27 | # * Redistributions of source code must retain the above copyright 28 | # notice, this list of conditions and the following disclaimer. 29 | # 30 | # * Redistributions in binary form must reproduce the above copyright 31 | # notice, this list of conditions and the following disclaimer in the 32 | # documentation and/or other materials provided with the 33 | # distribution. 34 | # 35 | # * Neither the name of Kitware nor the names of any contributors may 36 | # be used to endorse or promote products derived from this software 37 | # without specific prior written permission. 38 | # 39 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 40 | # ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 41 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 42 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR 43 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 44 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 45 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 46 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 47 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 48 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 49 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 50 | # ======================================================================== 51 | # 52 | # Try to find Silo library and headers. Define Silo_ROOT if Silo is 53 | # installed in a non-standard directory. 54 | # 55 | # This file sets the following variables: 56 | # 57 | # Silo_INCLUDE_DIR, where to find silo.h, etc. 58 | # Silo_LIBRARIES, the libraries to link against 59 | # Silo_FOUND, If false, do not try to use Silo. 60 | # 61 | # Also defined, but not for general use are: 62 | # Silo_LIBRARY, the full path to the silo library. 
63 | # Silo_INCLUDE_PATH, for CMake backward compatibility 64 | 65 | FIND_PATH( Silo_INCLUDE_DIR silo.h 66 | PATHS /usr/local/include 67 | /usr/include 68 | ${Silo_ROOT}/include 69 | ) 70 | 71 | FIND_LIBRARY( Silo_LIBRARY NAMES siloh5 silo 72 | PATHS /usr/lib 73 | /usr/lib64 74 | /usr/local/lib 75 | ${Silo_ROOT}/lib 76 | ${Silo_ROOT}/lib64 77 | ) 78 | 79 | SET(Silo_FOUND "NO" ) 80 | IF(Silo_INCLUDE_DIR) 81 | IF(Silo_LIBRARY) 82 | 83 | SET(Silo_LIBRARIES ${Silo_LIBRARY}) 84 | SET(Silo_FOUND "YES" ) 85 | 86 | ELSE(Silo_LIBRARY) 87 | IF(Silo_FIND_REQURIED) 88 | message(SEND_ERROR "Unable to find the requested Silo libraries.") 89 | ENDIF(Silo_FIND_REQURIED) 90 | ENDIF(Silo_LIBRARY) 91 | ENDIF(Silo_INCLUDE_DIR) 92 | 93 | # handle the QUIETLY and REQUIRED arguments and set Silo_FOUND to TRUE if 94 | # all listed variables are TRUE 95 | INCLUDE(FindPackageHandleStandardArgs) 96 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(Silo DEFAULT_MSG Silo_LIBRARY Silo_INCLUDE_DIR) 97 | 98 | MARK_AS_ADVANCED( 99 | Silo_INCLUDE_DIR 100 | Silo_LIBRARY 101 | ) 102 | -------------------------------------------------------------------------------- /test/loop_peeler_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | // globally disable some warnings with MSVC, that are issued not for a 10 | // specific header, but rather for the interaction of system headers 11 | // and LibFlatArray source: 12 | #ifdef _MSC_BUILD 13 | #pragma warning( disable : 4710 ) 14 | #endif 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "test.hpp" 23 | 24 | template 25 | LIBFLATARRAY_INLINE 26 | void scaler(int& i, int endX, double *data, double factor) 27 | { 28 | for (; i < endX; i += SHORT_VEC::ARITY) { 29 | SHORT_VEC vec(data + i); 30 | vec *= factor; 31 | (data + i) << vec; 32 | } 33 | } 34 | 35 | ADD_TEST(TestLoopPeelerFunctionality) 36 | { 37 | std::vector > foo; 38 | for (int i = 0; i < 123; ++i) { 39 | foo.push_back(1000 + i); 40 | } 41 | 42 | int x = 3; 43 | typedef LibFlatArray::short_vec short_vec_type; 44 | LIBFLATARRAY_LOOP_PEELER(short_vec_type, int, x, 113, scaler, &foo[0], 2.5); 45 | 46 | for (std::size_t i = 0; i < 123; ++i) { 47 | double expected = 1000 + i; 48 | if ((i >= 3) && (i < 113)) { 49 | expected *= 2.5; 50 | } 51 | 52 | BOOST_TEST_EQ(expected, foo[i]); 53 | } 54 | } 55 | 56 | ADD_TEST(TestLoopPeelerInteroperabilityWithStreamingShortVecs) 57 | { 58 | std::vector > foo; 59 | for (int i = 0; i < 1234; ++i) { 60 | foo.push_back(1000 + i); 61 | } 62 | 63 | int x = 13; 64 | typedef LibFlatArray::streaming_short_vec short_vec_type; 65 | LIBFLATARRAY_LOOP_PEELER(short_vec_type, int, x, 1113, scaler, &foo[0], 2.5); 66 | 67 | for (std::size_t i = 0; i < 1234; ++i) { 68 | double expected = 1000 + i; 69 | if ((i >= 13) && (i < 1113)) { 70 | expected *= 2.5; 71 | } 72 | 73 | BOOST_TEST_EQ(expected, foo[i]); 74 | } 75 | } 76 | 77 | #ifdef LIBFLATARRAY_WITH_CPP14 78 | #ifndef LIBFLATARRAY_WITH_CUDA 79 | #ifndef LIBFLATARRAY_WITH_FORCED_CPP11 80 | 81 | ADD_TEST(TestCpp14StyleLoopPeeler) 82 | { 83 | unsigned i = 5; 84 | unsigned end = 43; 85 | std::vector > foo(64, 0); 86 | 87 | // Actually MSVC is wrong here to assume we're not referencing 88 | // my_float in the following lamda. 
We're just not referencing its 89 | // value, just the type: 90 | #ifdef _MSC_BUILD 91 | #pragma warning( push ) 92 | #pragma warning( disable : 4100 ) 93 | #endif 94 | 95 | LibFlatArray::loop_peeler >(&i, end, [&foo](auto my_float, unsigned *i, unsigned end) { 96 | typedef decltype(my_float) FLOAT; 97 | for (; *i < end; *i += FLOAT::ARITY) { 98 | &foo[*i] << FLOAT(1.0); 99 | } 100 | }); 101 | 102 | #ifdef _MSC_BUILD 103 | #pragma warning( pop ) 104 | #endif 105 | 106 | 107 | for (std::size_t c = 0; c < 5; ++c) { 108 | BOOST_TEST_EQ(0.0, foo[c]); 109 | } 110 | for (std::size_t c = 5; c < 43; ++c) { 111 | BOOST_TEST_EQ(1.0, foo[c]); 112 | } 113 | for (std::size_t c = 43; c < 64; ++c) { 114 | BOOST_TEST_EQ(0.0, foo[c]); 115 | } 116 | } 117 | 118 | #endif 119 | #endif 120 | #endif 121 | 122 | int main(int /* argc */, char** /* argv */) 123 | { 124 | return 0; 125 | } 126 | -------------------------------------------------------------------------------- /examples/gauss/filter_c99.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * Copyright 2017 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifdef __ICC 10 | #include 11 | #endif 12 | 13 | #ifdef _MSC_BUILD 14 | #pragma warning( push ) 15 | #pragma warning( disable : 4514 ) 16 | #endif 17 | 18 | #include 19 | 20 | #ifdef _MSC_BUILD 21 | #pragma warning( pop ) 22 | #endif 23 | 24 | /** 25 | * Computes a 2D gaussian filter with a 5x5 stencil accross the YZ-plane. 26 | */ 27 | void filter_c99(double *data_new, const double *data_old, int dim_x, int dim_y, int dim_z) 28 | { 29 | // cast types here to maintain a C++-compatible signature: 30 | double (* const restrict grid_old)[dim_y][dim_x] = (double (* const)[dim_y][dim_x])data_old; 31 | double (* restrict grid_new)[dim_y][dim_x] = (double (* )[dim_y][dim_x])data_new; 32 | 33 | double weights[5][5]; 34 | double sum = 0; 35 | 36 | for (int y = 0; y < 5; ++y) { 37 | for (int x = 0; x < 5; ++x) { 38 | double x_component = x - 2; 39 | double y_component = y - 2; 40 | weights[y][x] = exp(-0.5 * (x_component * x_component + 41 | y_component * y_component)) / 2 / 3.14159265358979323846; 42 | sum += weights[y][x]; 43 | } 44 | } 45 | for (int y = 0; y < 5; ++y) { 46 | for (int x = 0; x < 5; ++x) { 47 | weights[y][x] /= sum; 48 | } 49 | } 50 | 51 | // we exploit symmetry to avoid redudant loads of weights: 52 | double weight_00 = weights[2][2]; 53 | double weight_01 = weights[2][1]; 54 | double weight_02 = weights[2][0]; 55 | double weight_11 = weights[1][1]; 56 | double weight_12 = weights[1][0]; 57 | double weight_22 = weights[0][0]; 58 | 59 | #pragma omp parallel for schedule(static) 60 | for (int z = 2; z < (dim_z - 2); ++z) { 61 | for (int y = 2; y < (dim_y - 2); ++y) { 62 | #ifdef __ICC 63 | #pragma vector always nontemporal 64 | #endif 65 | for (int x = 0; x < dim_x; ++x) { 66 | grid_new[z][y][x] = 67 | grid_old[z - 2][y - 2][x] * weight_22 + 68 | grid_old[z - 2][y - 1][x] * weight_12 + 69 | grid_old[z - 2][y + 0][x] * weight_02 + 70 | grid_old[z - 2][y + 1][x] * weight_12 + 71 | grid_old[z - 2][y + 2][x] * weight_22 + 72 | 73 | grid_old[z - 1][y - 2][x] * weight_12 + 74 | grid_old[z - 1][y - 1][x] * weight_11 + 75 | grid_old[z - 1][y + 0][x] * weight_01 + 76 | grid_old[z - 1][y + 1][x] * weight_11 + 77 | grid_old[z - 1][y + 2][x] * weight_12 + 78 | 79 | grid_old[z + 0][y - 2][x] * 
weight_02 + 80 | grid_old[z + 0][y - 1][x] * weight_01 + 81 | grid_old[z + 0][y + 0][x] * weight_00 + 82 | grid_old[z + 0][y + 1][x] * weight_01 + 83 | grid_old[z + 0][y + 2][x] * weight_02 + 84 | 85 | grid_old[z + 1][y - 2][x] * weight_12 + 86 | grid_old[z + 1][y - 1][x] * weight_11 + 87 | grid_old[z + 1][y + 0][x] * weight_01 + 88 | grid_old[z + 1][y + 1][x] * weight_11 + 89 | grid_old[z + 1][y + 2][x] * weight_12 + 90 | 91 | grid_old[z + 2][y - 2][x] * weight_22 + 92 | grid_old[z + 2][y - 1][x] * weight_12 + 93 | grid_old[z + 2][y + 0][x] * weight_02 + 94 | grid_old[z + 2][y + 1][x] * weight_12 + 95 | grid_old[z + 2][y + 2][x] * weight_22; 96 | } 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /examples/performance_tests/plot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | require 'pp' 3 | require 'set' 4 | 5 | def parse_logfile(filename) 6 | data = [] 7 | 8 | lines = File.readlines(filename) 9 | header = lines.shift[1..-1] 10 | header = header.split(";").map do |token| 11 | token.strip 12 | end 13 | 14 | lines.each do |line| 15 | tokens = line.split(";") 16 | entry = {} 17 | 18 | tokens.size.times do |i| 19 | entry[header[i]] = tokens[i].strip 20 | end 21 | 22 | data.push entry 23 | end 24 | 25 | return [data, header] 26 | end 27 | 28 | def gather_range(entries, column) 29 | ret = Set.new 30 | 31 | entries.each do |entry| 32 | ret.add entry[column] 33 | end 34 | 35 | return ret 36 | end 37 | 38 | def plot_jacobi(data, header) 39 | entries = data.find_all do |entry| 40 | entry["family"] == "JacobiD3Q7" 41 | end 42 | 43 | dimensions = gather_range(entries, "dimensions") 44 | species = gather_range(entries, "species") 45 | 46 | outfile = "test.png" 47 | x_label = "Grid Size" 48 | y_label = "GLUPS" 49 | plot_specs = [] 50 | datafile = "temp.dat" 51 | 52 | File.open(datafile, "w") do |file| 53 | dimensions.each do |dim| 54 | dim =~ /\((\d+),/ 55 | file.print "#{$1}" 56 | 57 | species.each do |species| 58 | entries.each do |entry| 59 | if (entry["dimensions"] == dim) && (entry["species"] == species) 60 | file.print " #{entry["perf"]}" 61 | end 62 | end 63 | end 64 | 65 | file.puts 66 | end 67 | end 68 | 69 | index = 2 70 | species.each do |s| 71 | plot_specs << [index, s] 72 | index += 1 73 | end 74 | 75 | plots = plot_specs.map do |column, title| 76 | "'#{datafile}' using 1:#{column} title '#{title}' with linespoints" 77 | end 78 | 79 | command = < 9 | #include 10 | #include 11 | 12 | #include "test.hpp" 13 | 14 | class CellWithArrayMember 15 | { 16 | public: 17 | __host__ 18 | __device__ 19 | inline 20 | explicit CellWithArrayMember(int j = 0) : 21 | j(j) 22 | { 23 | i[0] = j + 1; 24 | i[1] = j + 2; 25 | i[2] = j + 3; 26 | 27 | x[0] = j + 0.4; 28 | x[1] = j + 0.5; 29 | } 30 | 31 | __host__ 32 | __device__ 33 | inline 34 | CellWithArrayMember(int newI[3], double newX[2], int j) : 35 | j(j) 36 | { 37 | i[0] = newI[0]; 38 | i[1] = newI[1]; 39 | i[1] = newI[2]; 40 | 41 | x[0] = newX[0]; 42 | x[1] = newX[1]; 43 | } 44 | 45 | int i[3]; 46 | int j; 47 | double x[2]; 48 | }; 49 | 50 | LIBFLATARRAY_REGISTER_SOA(CellWithArrayMember, 51 | ((int)(i)(3)) 52 | ((int)(j)) 53 | ((double)(x)(2)) ) 54 | 55 | 56 | namespace LibFlatArray { 57 | 58 | typedef soa_array soa_array_type; 59 | 60 | __global__ 61 | void test_insert(soa_array_type *array) 62 | { 63 | int size = array->size(); 64 | for (int i = 0; i < size; ++i) { 65 | CellWithArrayMember cell = (*array)[i]; 66 | cell.i[0] += 10000; 67 
| cell.i[1] += 20000; 68 | cell.i[2] += 30000; 69 | 70 | (*array) << cell; 71 | } 72 | } 73 | 74 | __global__ 75 | void test_modify(soa_array_type *array) 76 | { 77 | int index = blockDim.x * blockIdx.x + threadIdx.x; 78 | if (index >= array->size()) { 79 | return; 80 | } 81 | 82 | (*array)[index].i()[0] += index; 83 | (*array)[index].i()[1] -= index; 84 | (*array)[index].i()[2] = 2011 + 2014; 85 | } 86 | 87 | ADD_TEST(TestCUDABasic) 88 | { 89 | soa_array_type host_array; 90 | 91 | for (int i = 0; i < 100; ++i) { 92 | CellWithArrayMember cell; 93 | cell.i[0] = i; 94 | cell.i[1] = i + 1000; 95 | cell.i[2] = i + 2000; 96 | host_array << cell; 97 | } 98 | 99 | soa_array_type *device_array = 0; 100 | cudaMalloc(&device_array, sizeof(soa_array_type)); 101 | cudaMemcpy(device_array, &host_array, sizeof(soa_array_type), cudaMemcpyHostToDevice); 102 | 103 | test_insert<<<1, 1>>>(device_array); 104 | cudaMemcpy(&host_array, device_array, sizeof(soa_array_type), cudaMemcpyDeviceToHost); 105 | 106 | for (int i = 0; i < 100; ++i) { 107 | BOOST_TEST((i + 0) == host_array[i + 0].i()[0]); 108 | BOOST_TEST((i + 1000) == host_array[i + 0].i()[1]); 109 | BOOST_TEST((i + 2000) == host_array[i + 0].i()[2]); 110 | 111 | BOOST_TEST((i + 10000) == host_array[i + 100].i()[0]); 112 | BOOST_TEST((i + 21000) == host_array[i + 100].i()[1]); 113 | BOOST_TEST((i + 32000) == host_array[i + 100].i()[2]); 114 | } 115 | 116 | test_modify<<<7, 32>>>(device_array); 117 | cudaMemcpy(&host_array, device_array, sizeof(soa_array_type), cudaMemcpyDeviceToHost); 118 | 119 | for (int i = 0; i < 100; ++i) { 120 | BOOST_TEST((i + i + 0) == host_array[i + 0].i()[0]); 121 | BOOST_TEST((0 + 1000) == host_array[i + 0].i()[1]); 122 | BOOST_TEST(( 4025) == host_array[i + 0].i()[2]); 123 | 124 | BOOST_TEST((i + i + 10100) == host_array[i + 100].i()[0]); 125 | BOOST_TEST((0 + 20900) == host_array[i + 100].i()[1]); 126 | BOOST_TEST(( 4025) == host_array[i + 100].i()[2]); 127 | } 128 | 129 | cudaFree(device_array); 130 | } 131 | 132 | } 133 | 134 | int main(int argc, char **argv) 135 | { 136 | return 0; 137 | } 138 | -------------------------------------------------------------------------------- /include/libflatarray/estimate_optimum_short_vec_type.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_ESTIMATE_OPTIMUM_SHORT_VEC_TYPE_HPP 9 | #define FLAT_ARRAY_ESTIMATE_OPTIMUM_SHORT_VEC_TYPE_HPP 10 | 11 | #include 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | /** 17 | * This class serves as a type switch to select an appropriate 18 | * short_vec type based on the machine architecture and working set 19 | * size. This is just a heuristic. Users are advised that an 20 | * analytical performance model can yield much better results. 21 | * 22 | * We're primarily concerned with two choices: temporal vs. 23 | * non-temporal stores and the arity of the vector type. Smaller 24 | * working sets should use short_vec if they fit well into the cache, 25 | * larger sets should use streaming_short_vec to benefit from 26 | * streaming stores. 27 | * 28 | * The arity of the vector type should not be smaller than the arity 29 | * of the supported assembly instructions (e.g. >=8 for AVX512 and 30 | * doubles).If the arity is larger then we effectively perform 31 | * loop-unrolling. 
This may be beneficial for architectures that 32 | * struggle with out-of-order execution as if lenghtens the loop body 33 | * and gives them more independent instructions to work on (e.g. Intel 34 | * Core 2). Modern Intel architectures however may suffer from 35 | * unrolling as this might make the loop body exceed the size of the 36 | * loop buffer which holds previously decoded microinstructions. 37 | * 38 | * Arguments should be: 39 | * 40 | * - CARGO: the main machine data type used inside the kernel, e.g. 41 | * float or double. Most kernels will operate on various data 42 | * types, but the vector arity should usually be chosen based on 43 | * that type which is used most as it has the strongest impact on 44 | * register scheduling. 45 | * 46 | * - ACCESSOR: an soa_accessor produced by LibFlatArray that provides 47 | * the number of elements in the working set. We assume the size 48 | * of the working set to be the product of the size of CARGO and 49 | * the number of elements in the set. 50 | * 51 | * - LAST_LEVEL_CACHE_SIZE_ESTIMATE: if available, the user can give 52 | * an estimate of the CPU's cache. Our hard-coded value will 53 | * overestimate that size for most architectures, but that's 54 | * generally fine. The consequence of overestimating is that for 55 | * some medium-sized sets the code will use temporal stores 56 | * instead of non-temporal stores, reulting in a performance hit 57 | * of less than 30% (true for most codes and current 58 | * architectures). Underestimating the cache size will result in 59 | * the use of steaming stores even if the working set would fit 60 | * just fine into the caches, easily resulting in a performance 61 | * hit of 1500% (e.g. 0.4 GLUPS instead of 6 GLUPS for a 3D Jacobi 62 | * on an Intel i7-6700HQ). Bottom line: never underestimate the 63 | * cache size! 64 | */ 65 | template 66 | class estimate_optimum_short_vec_type 67 | { 68 | public: 69 | static const std::size_t ARITY = ilp_to_arity::ARITY; 70 | 71 | // Overflow is fine here, it's an artifact of 32-bit builds: 72 | #ifdef _MSC_BUILD 73 | #pragma warning( push ) 74 | #pragma warning( disable : 4307 ) 75 | #endif 76 | 77 | static const long STREAMING_FLAG = 78 | ACCESSOR::DIM_PROD * sizeof(typename ACCESSOR::element_type) / LAST_LEVEL_CACHE_SIZE_ESTIMATE; 79 | 80 | #ifdef _MSC_BUILD 81 | #pragma warning( pop ) 82 | #endif 83 | 84 | typedef typename detail::flat_array::streaming_short_vec_switch::VALUE VALUE; 85 | }; 86 | 87 | } 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /include/libflatarray/detail/get_instance_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_GET_INSTANCE_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_GET_INSTANCE_FUNCTOR_HPP 11 | 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | namespace detail { 17 | 18 | namespace flat_array { 19 | 20 | /** 21 | * This helper class is used to retrieve objects from the SoA storage 22 | * with the help of an accessor. 
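 *
 * Callers normally do not instantiate this class directly. A sketch of the
 * intended use (assuming the soa_grid facade of this library; "MyCell" is a
 * hypothetical cell type registered via LIBFLATARRAY_REGISTER_SOA):
 *
 *   LibFlatArray::soa_grid<MyCell> grid(128, 64, 32);
 *   MyCell cell = grid.get(10, 20, 30);
 *   // get() invokes get_instance_functor through the grid's callback
 *   // mechanism, gathering the cell's members from their separate arrays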
23 | */ 24 | template 25 | class get_instance_functor 26 | { 27 | public: 28 | get_instance_functor( 29 | CELL *target, 30 | long x, 31 | long y, 32 | long z, 33 | long count) : 34 | target(target), 35 | x(x), 36 | y(y), 37 | z(z), 38 | count(count) 39 | {} 40 | 41 | get_instance_functor( 42 | CELL *target, 43 | std::size_t x, 44 | std::size_t y, 45 | std::size_t z, 46 | std::size_t count) : 47 | target(target), 48 | x(x), 49 | y(y), 50 | z(z), 51 | count(count) 52 | {} 53 | 54 | template 55 | void operator()(soa_accessor& accessor) const 56 | { 57 | typedef soa_accessor accessor_type; 58 | accessor.index() = accessor_type::gen_index(x, y, z); 59 | CELL *cursor = target; 60 | 61 | for (std::size_t i = 0; i < count; ++i) { 62 | accessor >> *cursor; 63 | ++cursor; 64 | ++accessor.index(); 65 | } 66 | } 67 | 68 | private: 69 | CELL *target; 70 | std::size_t x; 71 | std::size_t y; 72 | std::size_t z; 73 | std::size_t count; 74 | }; 75 | 76 | #ifdef LIBFLATARRAY_WITH_CUDA 77 | #ifdef __CUDACC__ 78 | 79 | template 80 | __global__ 81 | void get_kernel(CELL *target, const char *source, long count, long x, long y, long z) 82 | { 83 | long offset = blockDim.x * blockIdx.x + threadIdx.x; 84 | if (offset >= count) { 85 | return; 86 | } 87 | 88 | typedef const_soa_accessor_light accessor_type; 89 | 90 | long index = accessor_type::gen_index(x + offset, y, z); 91 | accessor_type accessor(source, index); 92 | 93 | accessor >> target[offset]; 94 | } 95 | 96 | /** 97 | * Specialization for CUDA 98 | */ 99 | template 100 | class get_instance_functor 101 | { 102 | public: 103 | get_instance_functor( 104 | CELL *target, 105 | long x, 106 | long y, 107 | long z, 108 | long count) : 109 | target(target), 110 | x(x), 111 | y(y), 112 | z(z), 113 | count(count) 114 | {} 115 | 116 | get_instance_functor( 117 | CELL *target, 118 | std::size_t x, 119 | std::size_t y, 120 | std::size_t z, 121 | std::size_t count) : 122 | target(target), 123 | x(x), 124 | y(y), 125 | z(z), 126 | count(count) 127 | {} 128 | 129 | template 130 | void operator()(soa_accessor& accessor) const 131 | { 132 | dim3 grid_dim; 133 | dim3 block_dim; 134 | generate_cuda_launch_config()(&grid_dim, &block_dim, count, 1, 1); 135 | 136 | get_kernel<<>>( 137 | target, 138 | accessor.data(), 139 | count, 140 | x, 141 | y, 142 | z); 143 | } 144 | 145 | private: 146 | CELL *target; 147 | std::size_t x; 148 | std::size_t y; 149 | std::size_t z; 150 | std::size_t count; 151 | 152 | }; 153 | 154 | #endif 155 | #endif 156 | 157 | } 158 | 159 | } 160 | 161 | } 162 | 163 | #endif 164 | -------------------------------------------------------------------------------- /include/libflatarray/detail/save_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_SAVE_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_SAVE_FUNCTOR_HPP 11 | 12 | namespace LibFlatArray { 13 | 14 | namespace detail { 15 | 16 | namespace flat_array { 17 | 18 | #ifdef _MSC_BUILD 19 | #pragma warning( push ) 20 | #pragma warning( disable : 4626 4710 5027 ) 21 | #endif 22 | 23 | /** 24 | * Same as load_functor, but the other way around. 
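 * It reads a streak of cells from the SoA storage and writes their members
 * into a user-supplied buffer that uses the same concatenated, per-member
 * layout described in load_functor.hpp: one tightly packed array per
 * member, arrays stored back to back.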
25 | */ 26 | template 27 | class save_functor 28 | { 29 | public: 30 | save_functor( 31 | const ITERATOR& start, 32 | const ITERATOR& end, 33 | char *target, 34 | std::size_t count) : 35 | start(start), 36 | end(end), 37 | target(target), 38 | count(count) 39 | {} 40 | 41 | template 42 | void operator()(soa_accessor& accessor) const 43 | { 44 | std::size_t offset = 0; 45 | 46 | for (ITERATOR i = start; i != end; ++i) { 47 | accessor.index() = soa_accessor::gen_index( 48 | static_cast(i->origin[0]), 49 | static_cast(i->origin[1]), 50 | static_cast(i->origin[2])); 51 | accessor.save( 52 | target, 53 | static_cast(i->length()), 54 | offset, 55 | count); 56 | 57 | offset += i->length(); 58 | } 59 | } 60 | 61 | private: 62 | ITERATOR start; 63 | ITERATOR end; 64 | char *target; 65 | std::size_t count; 66 | }; 67 | 68 | #ifdef _MSC_BUILD 69 | #pragma warning( pop ) 70 | #endif 71 | 72 | #ifdef LIBFLATARRAY_WITH_CUDA 73 | #ifdef __CUDACC__ 74 | 75 | template 76 | __global__ 77 | void save_kernel(const char *source, char *target, long count, long stride, long x, long y, long z, long offset) 78 | { 79 | long thread_index = blockDim.x * blockIdx.x + threadIdx.x; 80 | if (thread_index >= count) { 81 | return; 82 | } 83 | 84 | typedef const_soa_accessor_light accessor_type; 85 | 86 | long index = accessor_type::gen_index(x, y, z) + thread_index; 87 | accessor_type accessor(source, index); 88 | 89 | accessor.save(target, 1, offset + thread_index, stride); 90 | } 91 | 92 | /** 93 | * Specialization for CUDA 94 | */ 95 | template 96 | class save_functor 97 | { 98 | public: 99 | save_functor( 100 | const ITERATOR& start, 101 | const ITERATOR& end, 102 | char *target, 103 | std::size_t count) : 104 | start(start), 105 | end(end), 106 | target(target), 107 | count(count) 108 | {} 109 | 110 | template 111 | void operator()(soa_accessor& accessor) const 112 | { 113 | std::size_t offset = 0; 114 | 115 | for (ITERATOR i = start; i != end; ++i) { 116 | dim3 grid_dim; 117 | dim3 block_dim; 118 | generate_cuda_launch_config()(&grid_dim, &block_dim, i->length(), 1, 1); 119 | 120 | save_kernel<<>>( 121 | accessor.data(), 122 | target, 123 | i->length(), 124 | count, 125 | i->origin[0], 126 | i->origin[1], 127 | i->origin[2], 128 | offset); 129 | 130 | offset += i->length(); 131 | } 132 | } 133 | 134 | private: 135 | ITERATOR start; 136 | ITERATOR end; 137 | char *target; 138 | std::size_t count; 139 | 140 | }; 141 | 142 | #endif 143 | #endif 144 | 145 | } 146 | 147 | } 148 | 149 | } 150 | 151 | #endif 152 | -------------------------------------------------------------------------------- /include/libflatarray/detail/set_instance_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_SET_INSTANCE_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_SET_INSTANCE_FUNCTOR_HPP 11 | 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | namespace detail { 17 | 18 | namespace flat_array { 19 | 20 | /** 21 | * This helper class uses an accessor to push an object's members into 22 | * the SoA storage. 
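 *
 * It is the write-side counterpart of get_instance_functor: where that
 * functor gathers a cell's members from the separate member arrays, this
 * one scatters them back in. A sketch of the intended use (assuming the
 * soa_grid facade of this library) would be grid.set(10, 20, 30, cell) for
 * a single cell at coordinate (10, 20, 30).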
23 | */ 24 | template 25 | class set_instance_functor 26 | { 27 | public: 28 | set_instance_functor( 29 | const CELL *source, 30 | long x, 31 | long y, 32 | long z, 33 | long count) : 34 | source(source), 35 | x(x), 36 | y(y), 37 | z(z), 38 | count(count) 39 | {} 40 | 41 | set_instance_functor( 42 | const CELL *source, 43 | std::size_t x, 44 | std::size_t y, 45 | std::size_t z, 46 | std::size_t count) : 47 | source(source), 48 | x(x), 49 | y(y), 50 | z(z), 51 | count(count) 52 | {} 53 | 54 | template 55 | void operator()(soa_accessor& accessor) const 56 | { 57 | accessor.index() = long(soa_accessor::gen_index(x, y, z)); 58 | const CELL *cursor = source; 59 | 60 | for (std::size_t i = 0; i < count; ++i) { 61 | accessor << *cursor; 62 | cursor += SOURCE_STRIDE; 63 | ++accessor.index(); 64 | } 65 | } 66 | 67 | private: 68 | const CELL *source; 69 | std::size_t x; 70 | std::size_t y; 71 | std::size_t z; 72 | std::size_t count; 73 | }; 74 | 75 | #ifdef LIBFLATARRAY_WITH_CUDA 76 | #ifdef __CUDACC__ 77 | 78 | template 79 | __global__ 80 | void set_kernel(const CELL *source, char *target, long count, long x, long y, long z) 81 | { 82 | long offset = (blockDim.x * blockIdx.x + threadIdx.x) * SOURCE_STRIDE; 83 | if (offset >= count) { 84 | return; 85 | } 86 | 87 | typedef soa_accessor_light accessor_type; 88 | 89 | long index = accessor_type::gen_index(x + offset, y, z); 90 | accessor_type accessor(target, index); 91 | 92 | accessor << source[offset]; 93 | } 94 | 95 | /** 96 | * Specialization for CUDA 97 | */ 98 | template 99 | class set_instance_functor 100 | { 101 | public: 102 | set_instance_functor( 103 | const CELL *source, 104 | long x, 105 | long y, 106 | long z, 107 | long count) : 108 | source(source), 109 | x(x), 110 | y(y), 111 | z(z), 112 | count(count) 113 | {} 114 | 115 | set_instance_functor( 116 | const CELL *source, 117 | std::size_t x, 118 | std::size_t y, 119 | std::size_t z, 120 | std::size_t count) : 121 | source(source), 122 | x(x), 123 | y(y), 124 | z(z), 125 | count(count) 126 | {} 127 | 128 | template 129 | void operator()(soa_accessor& accessor) const 130 | { 131 | dim3 grid_dim; 132 | dim3 block_dim; 133 | generate_cuda_launch_config()(&grid_dim, &block_dim, count, 1, 1); 134 | 135 | set_kernel<<>>( 136 | source, 137 | accessor.data(), 138 | count, 139 | x, 140 | y, 141 | z); 142 | } 143 | 144 | private: 145 | const CELL *source; 146 | std::size_t x; 147 | std::size_t y; 148 | std::size_t z; 149 | std::size_t count; 150 | 151 | }; 152 | 153 | #endif 154 | #endif 155 | 156 | } 157 | 158 | } 159 | 160 | } 161 | 162 | #endif 163 | -------------------------------------------------------------------------------- /test/test.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2017-2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef TEST_H 10 | #define TEST_H 11 | 12 | // disable certain warnings from system headers when compiling with 13 | // Microsoft Visual Studio: 14 | #ifdef _MSC_BUILD 15 | #pragma warning( push ) 16 | #pragma warning( disable : 4514 4710 4996 ) 17 | #endif 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | #ifdef _MSC_BUILD 24 | #pragma warning( pop ) 25 | #endif 26 | 27 | #include 28 | 29 | #ifndef BOOST_TEST 30 | // Microsoft Visual Studio doesn't define __PRETTY_FUNCTION__: 31 | #ifdef _MSC_VER 32 | #define BOOST_TEST(ARG) if (!(ARG)) { std::cerr << __FILE__ << "(" << __LINE__ << "): test '" << #ARG << "' failed in function '" << __FUNCSIG__ << "'" << std::endl; } 33 | #else 34 | #define BOOST_TEST(ARG) if (!(ARG)) { std::cerr << __FILE__ << "(" << __LINE__ << "): test '" << #ARG << "' failed in function '" << __PRETTY_FUNCTION__ << "'" << std::endl; } 35 | #endif 36 | 37 | #endif 38 | 39 | 40 | #ifndef BOOST_TEST_EQ 41 | #define BOOST_TEST_EQ(A, B) BOOST_TEST((A) == (B)) 42 | #endif 43 | 44 | // Runner and ADD_TEST are some convenience functions to simplify 45 | // definition of new tests. ADD_TEST will add scaffolding that causes 46 | // the following block to be executed once the program starts. 47 | // Advantage: tests have no longer to be manually added to main(). 48 | template 49 | class Runner 50 | { 51 | public: 52 | Runner() 53 | { 54 | TEST()(); 55 | } 56 | }; 57 | 58 | #define ADD_TEST(TEST_NAME) \ 59 | class TEST_NAME \ 60 | { \ 61 | public: \ 62 | LIBFLATARRAY_INLINE \ 63 | void operator()(); \ 64 | \ 65 | private: \ 66 | static Runner runner; \ 67 | }; \ 68 | \ 69 | Runner TEST_NAME::runner; \ 70 | \ 71 | LIBFLATARRAY_INLINE \ 72 | void TEST_NAME::operator()() \ 73 | 74 | 75 | #define TEST_REAL_ACCURACY(A, B, RELATIVE_ERROR_LIMIT) \ 76 | { \ 77 | double a = (A); \ 78 | double b = (B); \ 79 | double delta = std::abs(a - b); \ 80 | double relativeError = delta / std::abs(a); \ 81 | if (relativeError > RELATIVE_ERROR_LIMIT) { \ 82 | std::stringstream buf; \ 83 | buf << "in file " \ 84 | << __FILE__ << ":" \ 85 | << __LINE__ << ": " \ 86 | << "difference exceeds tolerance.\n" \ 87 | << " A: " << a << "\n" \ 88 | << " B: " << b << "\n" \ 89 | << " delta: " << delta << "\n" \ 90 | << " relativeError: " << relativeError << "\n"; \ 91 | throw std::logic_error(buf.str()); \ 92 | } \ 93 | } 94 | 95 | // lazy (read: bad, inexact) test for equality. we can't use stict 96 | // equality (operator==()), as vector units may yield 97 | // non-IEEE-compliannt results. Single-precision accuracy (i.e. ~20 98 | // bits for the mantissa or 6 digits) shall be suffice for functional 99 | // testing. 100 | #define TEST_REAL(A, B) \ 101 | TEST_REAL_ACCURACY(A, B, 0.000001) 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /include/libflatarray/aligned_allocator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012-2017 Andreas Schäfer 3 | * Copyright 2015 Kurt Kanzenbach 4 | * Copyright 2018 Google 5 | * 6 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 7 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 8 | */ 9 | 10 | #ifndef FLAT_ARRAY_ALIGNED_ALLOCATOR_HPP 11 | #define FLAT_ARRAY_ALIGNED_ALLOCATOR_HPP 12 | 13 | // disable certain warnings from system headers when compiling with 14 | // Microsoft Visual Studio: 15 | #ifdef _MSC_BUILD 16 | #pragma warning( push ) 17 | #pragma warning( disable : 4514 4710 ) 18 | #endif 19 | 20 | #include 21 | 22 | #ifdef _MSC_BUILD 23 | #pragma warning( pop ) 24 | #endif 25 | 26 | namespace LibFlatArray { 27 | 28 | template 29 | class aligned_allocator 30 | { 31 | public: 32 | typedef std::ptrdiff_t difference_type; 33 | typedef T* pointer; 34 | typedef const T* const_pointer; 35 | typedef T& reference; 36 | typedef const T& const_reference; 37 | typedef T value_type; 38 | typedef std::size_t size_type; 39 | 40 | template 41 | struct rebind 42 | { 43 | typedef aligned_allocator other; 44 | }; 45 | 46 | inline aligned_allocator() 47 | {} 48 | 49 | template 50 | inline explicit aligned_allocator(const aligned_allocator& /* other */) 51 | {} 52 | 53 | inline pointer address(reference x) const 54 | { 55 | return &x; 56 | } 57 | 58 | inline const_pointer address(const_reference x) const 59 | { 60 | return &x; 61 | } 62 | 63 | pointer allocate(std::size_t n, const void* = 0) 64 | { 65 | // This code would have been a piece of cake if it would have 66 | // worked with posix_memalign -- which it didn't. Instead 67 | // we allocate a larger chunk of memory in which we can 68 | // accomodate an array of the required size, shifted to the 69 | // desired offset. Since we need the original address for the 70 | // deallocation, we store it directly in front of the aligned 71 | // array's start. Ugly, but it works. 72 | char *chunk = std::allocator().allocate(upsize(n)); 73 | if (chunk == 0) { 74 | return reinterpret_cast(chunk); 75 | } 76 | 77 | std::size_t offset = reinterpret_cast(chunk) % ALIGNMENT; 78 | std::size_t correction = ALIGNMENT - offset; 79 | if (correction < sizeof(char*)) { 80 | correction += ALIGNMENT; 81 | } 82 | char *ret = chunk + correction; 83 | *(reinterpret_cast(ret) - 1) = chunk; 84 | return reinterpret_cast(ret); 85 | } 86 | 87 | void deallocate(pointer p, std::size_t n) 88 | { 89 | if (p == 0) { 90 | return; 91 | } 92 | 93 | char *actual; 94 | // retrieve the original pointer which sits in front of its 95 | // aligned brother 96 | actual = *(reinterpret_cast(p) - 1); 97 | std::allocator().deallocate(actual, upsize(n)); 98 | } 99 | 100 | std::size_t max_size() const throw() 101 | { 102 | return std::allocator().max_size(); 103 | } 104 | 105 | void construct(pointer p, const_reference val) 106 | { 107 | std::allocator().construct(p, val); 108 | } 109 | 110 | /** 111 | * Added due to compiling for Intel MIC with CPP14=TRUE 112 | * GCC Bug Report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51626 113 | */ 114 | void construct(pointer p) 115 | { 116 | std::allocator().construct(p, value_type()); 117 | } 118 | 119 | void destroy(pointer p) 120 | { 121 | std::allocator().destroy(p); 122 | } 123 | 124 | bool operator!=(const aligned_allocator& other) const 125 | { 126 | return !(*this == other); 127 | } 128 | 129 | bool operator==(const aligned_allocator& /* other*/) const 130 | { 131 | return true; 132 | } 133 | 134 | private: 135 | std::size_t graceOffset() 136 | { 137 | return ALIGNMENT + sizeof(char*); 138 | } 139 | 140 | std::size_t upsize(std::size_t n) 141 | { 142 | return n * sizeof(T) + graceOffset(); 143 | } 144 | }; 145 | 146 | } 147 | 148 | #endif 149 | 
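// Usage sketch (illustrative): the allocator drops into standard containers
// to obtain over-aligned storage, e.g. for SIMD loads and stores. The second
// template parameter is the alignment in bytes:
//
//   std::vector<double, LibFlatArray::aligned_allocator<double, 64> > v(1000);
//   // &v[0] is now aligned to a 64-byte boundary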
-------------------------------------------------------------------------------- /include/libflatarray/detail/short_vec_helpers.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015 Kurt Kanzenbach 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_HELPERS_HPP 9 | #define FLAT_ARRAY_DETAIL_SHORT_VEC_HELPERS_HPP 10 | 11 | #include 12 | 13 | // disable certain warnings from system headers when compiling with 14 | // Microsoft Visual Studio: 15 | #ifdef _MSC_BUILD 16 | #pragma warning( push ) 17 | #pragma warning( disable : 4514 ) 18 | #endif 19 | 20 | #include 21 | 22 | // uintptr_t is only available through C++11 23 | #ifdef LIBFLATARRAY_WITH_CPP14 24 | #include 25 | #define _SHORTVEC_UINTPTR_T std::uintptr_t 26 | #else 27 | #define _SHORTVEC_UINTPTR_T unsigned long long 28 | #endif 29 | 30 | #ifdef __SSE4_1__ 31 | #include 32 | #endif 33 | 34 | #ifdef _MSC_BUILD 35 | #pragma warning( pop ) 36 | #endif 37 | 38 | /** 39 | * This macro asserts that the pointer is correctly aligned. 40 | * 41 | * @param ptr pointer to check 42 | * @param alignment alignement 43 | */ 44 | #define SHORTVEC_ASSERT_ALIGNED(ptr, alignment) \ 45 | do { \ 46 | assert((reinterpret_cast<_SHORTVEC_UINTPTR_T>(ptr) % (alignment)) == 0); \ 47 | } while (0) 48 | 49 | /** 50 | * For some implementations there is the problem, that the compiler does not 51 | * see, that some variables should be used uninitialized. 52 | * Therefore here are compiler specific macros to disable and enable this warning. 53 | */ 54 | #if defined(__GNUC__) && !defined(__clang__) 55 | #define SHORTVEC_DISABLE_WARNING_UNINITIALIZED \ 56 | _Pragma("GCC diagnostic push") \ 57 | _Pragma("GCC diagnostic ignored \"-Wuninitialized\"") 58 | #define SHORTVEC_ENABLE_WARNING_UNINITIALIZED \ 59 | _Pragma("GCC diagnostic pop") 60 | #endif 61 | 62 | #ifdef __clang__ 63 | #define SHORTVEC_DISABLE_WARNING_UNINITIALIZED \ 64 | _Pragma("clang diagnostic push") \ 65 | _Pragma("clang diagnostic ignored \"-Wuninitialized\"") 66 | #define SHORTVEC_ENABLE_WARNING_UNINITIALIZED \ 67 | _Pragma("clang diagnostic pop") 68 | #endif 69 | 70 | /** 71 | * If compiler is not gcc and not clang, just remove these macros. 72 | */ 73 | #ifndef SHORTVEC_DISABLE_WARNING_UNINITIALIZED 74 | #define SHORTVEC_DISABLE_WARNING_UNINITIALIZED 75 | #endif 76 | #ifndef SHORTVEC_ENABLE_WARNING_UNINITIALIZED 77 | #define SHORTVEC_ENABLE_WARNING_UNINITIALIZED 78 | #endif 79 | 80 | 81 | #ifdef __SSE4_1__ 82 | 83 | /** 84 | * Insertps instruction which allows to insert an memory location 85 | * into a xmm register. 86 | * Instruction: insertps xmm, xmm/m32, imm8 87 | * 88 | * @param a xmm register 89 | * @param base base pointer 90 | * @param offset offset 91 | * @param idx index, has to be a constant number like 0x10, no variable 92 | */ 93 | #define SHORTVEC_INSERT_PS(a, base, offset, idx) \ 94 | do { \ 95 | asm volatile ("insertps %1, (%q2, %q3, 4), %0\n" \ 96 | : "+x" (a) : "N" (idx), "r" (base), "r" (offset) : "memory"); \ 97 | } while (0) 98 | 99 | #endif 100 | 101 | #ifdef __AVX__ 102 | 103 | /** 104 | * Same as above just for AVX. 
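 * (A separate macro is needed here because AVX code should use the
 * VEX-encoded, three-operand form of the instruction; mixing the legacy SSE
 * encoding into AVX code can incur SSE/AVX transition penalties.)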
105 | * Instruction: vinsertps xmm, xmm, xmm/m32, imm8 106 | * 107 | * @param a xmm register 108 | * @param base base pointer 109 | * @param offset offset 110 | * @param idx index, has to be a constant number like 0x10, no variable 111 | */ 112 | #define SHORTVEC_INSERT_PS_AVX(a, base, offset, idx) \ 113 | do { \ 114 | asm volatile ("vinsertps %1, (%q2, %q3, 4), %0, %0\n" \ 115 | : "+x" (a) : "N" (idx), "r" (base), "r" (offset) : "memory"); \ 116 | } while (0) 117 | 118 | #endif 119 | 120 | namespace LibFlatArray { 121 | 122 | namespace ShortVecHelpers { 123 | 124 | #ifdef __SSE4_1__ 125 | 126 | /** 127 | * _mm_extract_ps returns an integer, but we need a float. 128 | * This union can be used to get a float back. 129 | */ 130 | union ExtractResult { 131 | int i; 132 | float f; 133 | }; 134 | 135 | #endif 136 | 137 | } 138 | 139 | } 140 | 141 | #endif 142 | -------------------------------------------------------------------------------- /examples/lbm/util.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBFLATARRAY_EXAMPLES_LBM_UTIL_H 2 | #define LIBFLATARRAY_EXAMPLES_LBM_UTIL_H 3 | 4 | /** 5 | * Copyright 2013-2016 Andreas Schäfer 6 | * 7 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 8 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | void check_cuda_error() 18 | { 19 | cudaError_t error = cudaGetLastError(); 20 | if (error != cudaSuccess) { 21 | std::cerr << "ERROR: " << cudaGetErrorString(error) << "\n"; 22 | throw std::runtime_error("CUDA error"); 23 | } 24 | } 25 | 26 | class benchmark 27 | { 28 | public: 29 | virtual ~benchmark() 30 | {} 31 | 32 | void evaluate() 33 | { 34 | for (int dim = 32; dim <= 160; dim += 4) { 35 | run(dim); 36 | } 37 | } 38 | 39 | void run(int dim) 40 | { 41 | int repeats = 10; 42 | if (dim <= 96) { 43 | repeats *= 10; 44 | } 45 | 46 | double seconds = exec(dim, repeats); 47 | 48 | double updates = 1.0 * gridSize(dim) * repeats; 49 | double glups = 10e-9 * updates / seconds; 50 | 51 | std::cout << std::setiosflags(std::ios::left); 52 | std::cout << std::setw(24) << name() << " ; " 53 | << std::setw( 3) << dim << " ; " 54 | << std::setw( 9) << glups << " GLUPS\n"; 55 | } 56 | 57 | protected: 58 | virtual double exec(int dim, int repeats) = 0; 59 | virtual std::string name() = 0; 60 | virtual size_t gridSize(int dim) = 0; 61 | }; 62 | 63 | class benchmark_lbm_cuda : public benchmark 64 | { 65 | protected: 66 | double exec(int dim, int repeats) 67 | { 68 | dim3 dimBlock; 69 | dim3 dimGrid; 70 | gen_dims(&dimBlock, &dimGrid, dim); 71 | 72 | return cudaExec(dim, dimBlock, dimGrid, repeats); 73 | } 74 | 75 | virtual size_t gridSize(int dim) 76 | { 77 | dim3 dimBlock; 78 | dim3 dimGrid; 79 | gen_dims(&dimBlock, &dimGrid, dim); 80 | 81 | return dimGrid.x * dimBlock.x * dimGrid.y * dimBlock.y * (256 - 4); 82 | } 83 | 84 | virtual double cudaExec(int dim, dim3 dimBlock, dim3 dimGrid, int repeats) = 0; 85 | 86 | void gen_dims(dim3 *dimBlock, dim3 *dimGrid, int dim) 87 | { 88 | int blockWidth = 1; 89 | for (; blockWidth <= dim; blockWidth *= 2) { 90 | } 91 | 92 | using std::min; 93 | blockWidth /= 2; 94 | blockWidth = min(256, blockWidth); 95 | *dimBlock = dim3(blockWidth, 2, 1); 96 | *dimGrid = dim3(dim / dimBlock->x, dim / dimBlock->y, 1); 97 | } 98 | }; 99 | 100 | class benchmark_lbm_cuda_basic : public benchmark_lbm_cuda 101 | { 102 | protected: 103 | virtual 
~benchmark_lbm_cuda_basic() 104 | {} 105 | 106 | virtual double cudaExec(int dim, dim3 dimBlock, dim3 dimGrid, int repeats) 107 | { 108 | int size = dim * dim * (256 + 64) * 20; 109 | int bytesize = size * sizeof(double); 110 | std::vector grid(size, 4711); 111 | 112 | double *devGridOld; 113 | double *devGridNew; 114 | cudaMalloc(&devGridOld, bytesize); 115 | cudaMalloc(&devGridNew, bytesize); 116 | check_cuda_error(); 117 | 118 | cudaMemcpy(devGridOld, &grid[0], bytesize, cudaMemcpyHostToDevice); 119 | cudaMemcpy(devGridNew, &grid[0], bytesize, cudaMemcpyHostToDevice); 120 | check_cuda_error(); 121 | 122 | cudaDeviceSynchronize(); 123 | double t_start = LibFlatArray::benchmark::time(); 124 | 125 | for (int t = 0; t < repeats; ++t) { 126 | update(dimGrid, dimBlock, dim, dim, 256, devGridOld, devGridNew); 127 | std::swap(devGridOld, devGridNew); 128 | } 129 | 130 | cudaDeviceSynchronize(); 131 | double t_end = LibFlatArray::benchmark::time(); 132 | check_cuda_error(); 133 | 134 | cudaMemcpy(&grid[0], devGridNew, bytesize, cudaMemcpyDeviceToHost); 135 | cudaFree(devGridOld); 136 | cudaFree(devGridNew); 137 | check_cuda_error(); 138 | 139 | double time = t_end - t_start; 140 | return time; 141 | } 142 | 143 | virtual void update(dim3 dimGrid, dim3 dimBlock, int dimX, int dimY, int dimZ, double *devGridOld, double *devGridNew) = 0; 144 | 145 | }; 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /include/libflatarray/detail/load_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2016 Andreas Schäfer 3 | * Copyright 2017-2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_LOAD_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_LOAD_FUNCTOR_HPP 11 | 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | namespace detail { 17 | 18 | namespace flat_array { 19 | 20 | #ifdef _MSC_BUILD 21 | #pragma warning( push ) 22 | #pragma warning( disable : 4626 4710 5027 ) 23 | #endif 24 | 25 | /** 26 | * The purpose of this functor is to load a row of cells which are 27 | * already prepackaged (in SoA form) in a raw data segment (i.e. all 28 | * members are stored in a consecutive array of the given length and 29 | * all arrays are concatenated). 
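 *
 * Layout sketch for a streak of n cells whose (hypothetical) type has the
 * members "double pos" and "int id":
 *
 *   source buffer: [pos_0, pos_1, ..., pos_{n-1}][id_0, id_1, ..., id_{n-1}]
 *
 * i.e. one tightly packed array per member, with the member arrays placed
 * back to back; load_functor copies each of them into the corresponding
 * member array of the target SoA storage.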
30 | */ 31 | template 32 | class load_functor 33 | { 34 | public: 35 | load_functor( 36 | const ITERATOR& start, 37 | const ITERATOR& end, 38 | const char *source, 39 | std::size_t count) : 40 | start(start), 41 | end(end), 42 | source(source), 43 | count(count) 44 | {} 45 | 46 | template 47 | void operator()(soa_accessor& accessor) const 48 | { 49 | std::size_t offset = 0; 50 | 51 | for (ITERATOR i = start; i != end; ++i) { 52 | accessor.index() = soa_accessor::gen_index( 53 | static_cast(i->origin[0]), 54 | static_cast(i->origin[1]), 55 | static_cast(i->origin[2])); 56 | accessor.load( 57 | source, 58 | static_cast(i->length()), 59 | offset, 60 | count); 61 | 62 | offset += i->length(); 63 | } 64 | } 65 | 66 | private: 67 | ITERATOR start; 68 | ITERATOR end; 69 | const char *source; 70 | std::size_t count; 71 | }; 72 | 73 | #ifdef _MSC_BUILD 74 | #pragma warning( pop ) 75 | #endif 76 | 77 | #ifdef LIBFLATARRAY_WITH_CUDA 78 | #ifdef __CUDACC__ 79 | 80 | template 81 | __global__ 82 | void load_kernel(const char *source, char *target, long count, long stride, long x, long y, long z, long offset) 83 | { 84 | long thread_index = blockDim.x * blockIdx.x + threadIdx.x; 85 | if (thread_index >= count) { 86 | return; 87 | } 88 | 89 | typedef soa_accessor_light accessor_type; 90 | 91 | long index = accessor_type::gen_index(x, y, z) + thread_index; 92 | accessor_type accessor(target, index); 93 | 94 | accessor.load(source, 1, offset + thread_index, stride); 95 | } 96 | 97 | /** 98 | * Specialization for CUDA 99 | */ 100 | template 101 | class load_functor 102 | { 103 | public: 104 | load_functor( 105 | const ITERATOR& start, 106 | const ITERATOR& end, 107 | const char *source, 108 | std::size_t count) : 109 | start(start), 110 | end(end), 111 | source(source), 112 | count(count) 113 | {} 114 | 115 | template 116 | void operator()(soa_accessor& accessor) const 117 | { 118 | std::size_t offset = 0; 119 | 120 | for (ITERATOR i = start; i != end; ++i) { 121 | dim3 grid_dim; 122 | dim3 block_dim; 123 | generate_cuda_launch_config()(&grid_dim, &block_dim, i->length(), 1, 1); 124 | 125 | load_kernel<<>>( 126 | source, 127 | accessor.data(), 128 | i->length(), 129 | count, 130 | i->origin[0], 131 | i->origin[1], 132 | i->origin[2], 133 | offset); 134 | 135 | offset += i->length(); 136 | } 137 | } 138 | 139 | private: 140 | ITERATOR start; 141 | ITERATOR end; 142 | const char *source; 143 | std::size_t count; 144 | 145 | }; 146 | 147 | #endif 148 | #endif 149 | 150 | } 151 | 152 | } 153 | 154 | } 155 | 156 | #endif 157 | -------------------------------------------------------------------------------- /test/estimate_optimum_short_vec_type_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | // globally disable some warnings with MSVC, that are issued not for a 10 | // specific header, but rather for the interaction of system headers 11 | // and LibFlatArray source: 12 | #ifdef _MSC_BUILD 13 | #pragma warning( disable : 4710 ) 14 | #endif 15 | 16 | #include 17 | 18 | #include "test.hpp" 19 | 20 | namespace LibFlatArray { 21 | 22 | class fake_particle 23 | { 24 | public: 25 | double pos_x; 26 | double pos_y; 27 | double pos_z; 28 | 29 | double vel_x; 30 | double vel_y; 31 | double vel_z; 32 | }; 33 | 34 | class fake_accessor 35 | { 36 | public: 37 | typedef fake_particle element_type; 38 | static const int DIM_PROD = 2000; 39 | }; 40 | 41 | class fake_accessor2 42 | { 43 | public: 44 | typedef fake_particle element_type; 45 | static const int DIM_PROD = 20000000; 46 | }; 47 | 48 | ADD_TEST(TestArity) 49 | { 50 | // expected arities are 2x of the vector-unit's bit width for some 51 | // architectures as we're doing loop-unrolling for those: 52 | 53 | # ifdef __VECTOR4DOUBLE__ 54 | static const int expected_arity_for_double = 8; 55 | static const int expected_arity_for_float = 16; 56 | # endif 57 | 58 | // Dito for ARM NEON: 59 | # ifdef __ARM_NEON__ 60 | // no double-intrinsics for NEON: 61 | static const int expected_arity_for_double = 2; 62 | static const int expected_arity_for_float = 8; 63 | # endif 64 | 65 | // Only the case of the IBM PC is complicated. No thanks to you, 66 | // history! 67 | # ifdef LFA_AVX512_HELPER 68 | static const int expected_arity_for_double = 16; 69 | static const int expected_arity_for_float = 32; 70 | # else 71 | # ifdef __AVX__ 72 | static const int expected_arity_for_double = 8; 73 | static const int expected_arity_for_float = 16; 74 | # else 75 | # ifdef __SSE__ 76 | static const int expected_arity_for_double = 4; 77 | static const int expected_arity_for_float = 8; 78 | # else 79 | static const int expected_arity_for_double = 2; 80 | static const int expected_arity_for_float = 2; 81 | # endif 82 | # endif 83 | # endif 84 | 85 | typedef estimate_optimum_short_vec_type::VALUE selected_double_type; 86 | typedef estimate_optimum_short_vec_type::VALUE selected_float_type; 87 | int actual_double = selected_double_type::ARITY; 88 | int actual_float = selected_float_type::ARITY; 89 | 90 | BOOST_TEST_EQ(expected_arity_for_double, actual_double); 91 | BOOST_TEST_EQ(expected_arity_for_float, actual_float); 92 | }; 93 | 94 | template 95 | class is_streaming_short_vec; 96 | 97 | template 98 | class is_streaming_short_vec > 99 | { 100 | public: 101 | static const bool VALUE = true; 102 | }; 103 | 104 | template 105 | class is_streaming_short_vec > 106 | { 107 | public: 108 | static const bool VALUE = false; 109 | }; 110 | 111 | ADD_TEST(TestStoreImplementation) 112 | { 113 | // Don't warn about const expressions not being flagged as such: we 114 | // don't have a suitable macro for such comparisons. 
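    // For reference, a hedged sketch of how client code typically
    // consumes this trait (`data', `n' and the loop body are made up,
    // while the ::VALUE typedef and the ARITY constant mirror the
    // typedefs below, and `ptr << vec' is the store idiom provided by
    // the short_vec headers):
    //
    //   typedef estimate_optimum_short_vec_type<double, fake_accessor>::VALUE my_vec;
    //   for (std::size_t i = 0; i < n; i += my_vec::ARITY) {
    //       my_vec buf(&data[i]);   // load
    //       buf = buf * 2.0;        // scalar is implicitly converted
    //       &data[i] << buf;        // store (streaming or regular, as selected)
    //   }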
115 | #ifdef _MSC_BUILD 116 | #pragma warning( push ) 117 | #pragma warning( disable : 4127 ) 118 | #endif 119 | 120 | // small problem size should yield normal stores: 121 | typedef estimate_optimum_short_vec_type::VALUE selected_double_type; 122 | typedef estimate_optimum_short_vec_type::VALUE selected_float_type; 123 | 124 | BOOST_TEST_EQ(is_streaming_short_vec::VALUE, false); 125 | BOOST_TEST_EQ(is_streaming_short_vec::VALUE, false); 126 | 127 | // larger problem size should yield streaming stores: 128 | typedef estimate_optimum_short_vec_type::VALUE selected_double_type2; 129 | typedef estimate_optimum_short_vec_type::VALUE selected_float_type2; 130 | 131 | BOOST_TEST_EQ(is_streaming_short_vec::VALUE, true); 132 | BOOST_TEST_EQ(is_streaming_short_vec::VALUE, true); 133 | 134 | #ifdef _MSC_BUILD 135 | #pragma warning( pop ) 136 | #endif 137 | 138 | }; 139 | 140 | } 141 | 142 | int main(int /* argc */, char** /* argv */) 143 | { 144 | return 0; 145 | } 146 | -------------------------------------------------------------------------------- /test/preprocessor_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #include 9 | 10 | // globally disable some warnings with MSVC, that are issued not for a 11 | // specific header, but rather for the interaction of system headers 12 | // and LibFlatArray source: 13 | #ifdef _MSC_BUILD 14 | #pragma warning( disable : 4710 ) 15 | #endif 16 | 17 | // Don't warn about these functions being stripped from an executable 18 | // as they're not being used, that's actually expected behavior. 
19 | #ifdef _MSC_BUILD 20 | #pragma warning( push ) 21 | #pragma warning( disable : 4514 ) 22 | #endif 23 | 24 | #include 25 | #include 26 | 27 | #ifdef _MSC_BUILD 28 | #pragma warning( pop ) 29 | #endif 30 | 31 | #include "test.hpp" 32 | 33 | #define LIST_A 34 | #define LIST_B (10)(20)(30)(40)(50) 35 | #define LIST_C (60) 36 | 37 | #define LIST_D LIBFLATARRAY_DEQUEUE(LIST_B) 38 | #define LIST_E LIBFLATARRAY_DEQUEUE(LIST_C) 39 | 40 | #define LAMBDA(INDEX, STANDARD_ARG, ITERATOR) vec[ITERATOR] = (INDEX + STANDARD_ARG + ITERATOR); 41 | 42 | // Don't warn about the conditional expressions being constant, that's 43 | // intentional here: 44 | #ifdef _MSC_BUILD 45 | #pragma warning( push ) 46 | #pragma warning( disable : 4127 4353 ) 47 | #endif 48 | 49 | ADD_TEST(TestElem) 50 | { 51 | BOOST_TEST(LIBFLATARRAY_ELEM(0, LIST_B) == 10); 52 | BOOST_TEST(LIBFLATARRAY_ELEM(1, LIST_B) == 20); 53 | BOOST_TEST(LIBFLATARRAY_ELEM(2, LIST_B) == 30); 54 | BOOST_TEST(LIBFLATARRAY_ELEM(3, LIST_B) == 40); 55 | BOOST_TEST(LIBFLATARRAY_ELEM(4, LIST_B) == 50); 56 | 57 | BOOST_TEST(LIBFLATARRAY_ELEM(0, LIST_C) == 60); 58 | } 59 | 60 | ADD_TEST(TestSize) 61 | { 62 | BOOST_TEST(LIBFLATARRAY_SIZE(LIST_A) == 0); 63 | BOOST_TEST(LIBFLATARRAY_SIZE(LIST_B) == 5); 64 | BOOST_TEST(LIBFLATARRAY_SIZE(LIST_C) == 1); 65 | } 66 | 67 | ADD_TEST(TestForEach) 68 | { 69 | std::vector vec(60, 0); 70 | LIBFLATARRAY_FOR_EACH(LAMBDA, 100, LIST_B); 71 | 72 | BOOST_TEST(vec[10] == (0 + 10 + 100)); 73 | BOOST_TEST(vec[20] == (1 + 20 + 100)); 74 | BOOST_TEST(vec[30] == (2 + 30 + 100)); 75 | BOOST_TEST(vec[40] == (3 + 40 + 100)); 76 | BOOST_TEST(vec[50] == (4 + 50 + 100)); 77 | } 78 | 79 | ADD_TEST(TestDequeue) 80 | { 81 | BOOST_TEST_EQ(LIBFLATARRAY_SIZE(LIST_D), 4); 82 | BOOST_TEST_EQ(LIBFLATARRAY_ELEM(0, LIST_D), 20); 83 | BOOST_TEST_EQ(LIBFLATARRAY_ELEM(1, LIST_D), 30); 84 | BOOST_TEST_EQ(LIBFLATARRAY_ELEM(2, LIST_D), 40); 85 | BOOST_TEST_EQ(LIBFLATARRAY_ELEM(3, LIST_D), 50); 86 | 87 | BOOST_TEST_EQ(LIBFLATARRAY_SIZE(LIST_E), 0); 88 | } 89 | 90 | ADD_TEST(TestIfShorter) 91 | { 92 | bool a0 = LIBFLATARRAY_IF_SHORTER(LIST_A, 0, false, true); 93 | bool a1 = LIBFLATARRAY_IF_SHORTER(LIST_A, 1, true, false); 94 | bool a2 = LIBFLATARRAY_IF_SHORTER(LIST_A, 2, true, false); 95 | bool a3 = LIBFLATARRAY_IF_SHORTER(LIST_A, 3, true, false); 96 | bool a4 = LIBFLATARRAY_IF_SHORTER(LIST_A, 4, true, false); 97 | 98 | bool b0 = LIBFLATARRAY_IF_SHORTER(LIST_B, 0, false, true); 99 | bool b1 = LIBFLATARRAY_IF_SHORTER(LIST_B, 1, false, true); 100 | bool b2 = LIBFLATARRAY_IF_SHORTER(LIST_B, 2, false, true); 101 | bool b3 = LIBFLATARRAY_IF_SHORTER(LIST_B, 3, false, true); 102 | bool b4 = LIBFLATARRAY_IF_SHORTER(LIST_B, 4, false, true); 103 | bool b5 = LIBFLATARRAY_IF_SHORTER(LIST_B, 5, false, true); 104 | bool b6 = LIBFLATARRAY_IF_SHORTER(LIST_B, 6, true, false); 105 | bool b7 = LIBFLATARRAY_IF_SHORTER(LIST_B, 7, true, false); 106 | bool b8 = LIBFLATARRAY_IF_SHORTER(LIST_B, 8, true, false); 107 | bool b9 = LIBFLATARRAY_IF_SHORTER(LIST_B, 9, true, false); 108 | 109 | bool c0 = LIBFLATARRAY_IF_SHORTER(LIST_C, 0, false, true); 110 | bool c1 = LIBFLATARRAY_IF_SHORTER(LIST_C, 1, false, true); 111 | bool c2 = LIBFLATARRAY_IF_SHORTER(LIST_C, 2, true, false); 112 | bool c3 = LIBFLATARRAY_IF_SHORTER(LIST_C, 3, true, false); 113 | bool c4 = LIBFLATARRAY_IF_SHORTER(LIST_C, 4, true, false); 114 | 115 | BOOST_TEST(a0); 116 | BOOST_TEST(a1); 117 | BOOST_TEST(a2); 118 | BOOST_TEST(a3); 119 | BOOST_TEST(a4); 120 | 121 | BOOST_TEST(b0); 122 | BOOST_TEST(b1); 123 | BOOST_TEST(b2); 
124 | BOOST_TEST(b3); 125 | BOOST_TEST(b4); 126 | BOOST_TEST(b5); 127 | BOOST_TEST(b6); 128 | BOOST_TEST(b7); 129 | BOOST_TEST(b8); 130 | BOOST_TEST(b9); 131 | 132 | BOOST_TEST(c0); 133 | BOOST_TEST(c1); 134 | BOOST_TEST(c2); 135 | BOOST_TEST(c3); 136 | BOOST_TEST(c4); 137 | } 138 | 139 | #ifdef _MSC_BUILD 140 | #pragma warning( pop ) 141 | #endif 142 | 143 | int main(int /* argc */, char** /* argv */) 144 | { 145 | return 0; 146 | } 147 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(WITH_CUDA) 2 | lfa_cuda_add_executable(cuda_allocator_test cuda_allocator_test.cu) 3 | lfa_cuda_add_executable(cuda_array_test cuda_array_test.cu) 4 | lfa_cuda_add_executable(soa_array_cuda_test soa_array_cuda_test.cu) 5 | lfa_cuda_add_executable(soa_grid_cuda_test soa_grid_cuda_test.cu) 6 | endif() 7 | add_executable(aligned_allocator_test aligned_allocator_test.cpp) 8 | add_executable(api_traits_test api_traits_test.cpp) 9 | add_executable(estimate_optimum_short_vec_type_test estimate_optimum_short_vec_type_test.cpp) 10 | add_executable(loop_peeler_test loop_peeler_test.cpp) 11 | add_executable(preprocessor_test preprocessor_test.cpp) 12 | add_executable(short_vec_test short_vec_test.cpp short_vec_additional_test.cpp) 13 | add_executable(streaming_short_vec_test streaming_short_vec_test.cpp) 14 | add_executable(soa_array_test soa_array_test.cpp) 15 | add_executable(soa_grid_test soa_grid_test.cpp) 16 | add_executable(soa_vector_test soa_vector_test.cpp) 17 | 18 | if(WITH_CUDA) 19 | add_custom_target(run_cuda_allocator_test COMMAND cuda_allocator_test) 20 | add_custom_target(run_cuda_array_test COMMAND cuda_array_test) 21 | add_custom_target(run_soa_array_cuda_test COMMAND soa_array_cuda_test) 22 | add_custom_target(run_soa_grid_cuda_test COMMAND soa_grid_cuda_test) 23 | endif() 24 | 25 | add_custom_target(run_aligned_allocator_test COMMAND aligned_allocator_test) 26 | add_custom_target(run_api_traits_test COMMAND api_traits_test) 27 | add_custom_target(run_estimate_optimum_short_vec_type_test COMMAND estimate_optimum_short_vec_type_test) 28 | add_custom_target(run_loop_peeler_test COMMAND loop_peeler_test) 29 | add_custom_target(run_preprocessor_test COMMAND preprocessor_test) 30 | add_custom_target(run_short_vec_test COMMAND short_vec_test) 31 | add_custom_target(run_streaming_short_vec_test COMMAND streaming_short_vec_test) 32 | add_custom_target(run_soa_array_test COMMAND soa_array_test) 33 | add_custom_target(run_soa_grid_test COMMAND soa_grid_test) 34 | add_custom_target(run_soa_vector_test COMMAND soa_vector_test) 35 | 36 | if(WITH_CUDA) 37 | add_dependencies(check run_cuda_allocator_test) 38 | add_dependencies(check run_cuda_array_test) 39 | add_dependencies(check run_soa_array_cuda_test) 40 | add_dependencies(check run_soa_grid_cuda_test) 41 | endif() 42 | add_dependencies(check run_aligned_allocator_test) 43 | add_dependencies(check run_api_traits_test) 44 | add_dependencies(check run_estimate_optimum_short_vec_type_test) 45 | add_dependencies(check run_loop_peeler_test) 46 | add_dependencies(check run_preprocessor_test) 47 | add_dependencies(check run_short_vec_test) 48 | add_dependencies(check run_streaming_short_vec_test) 49 | add_dependencies(check run_soa_array_test) 50 | add_dependencies(check run_soa_grid_test) 51 | add_dependencies(check run_soa_vector_test) 52 | 53 | if(WITH_CUDA) 54 | add_dependencies(run_cuda_allocator_test cuda_allocator_test) 
55 | add_dependencies(run_cuda_array_test cuda_array_test) 56 | add_dependencies(run_soa_array_cuda_test soa_array_cuda_test) 57 | add_dependencies(run_soa_grid_cuda_test soa_grid_cuda_test) 58 | endif() 59 | add_dependencies(run_aligned_allocator_test aligned_allocator_test) 60 | add_dependencies(run_api_traits_test api_traits_test) 61 | add_dependencies(run_estimate_optimum_short_vec_type_test estimate_optimum_short_vec_type_test) 62 | add_dependencies(run_loop_peeler_test loop_peeler_test) 63 | add_dependencies(run_preprocessor_test preprocessor_test) 64 | add_dependencies(run_short_vec_test short_vec_test) 65 | add_dependencies(run_streaming_short_vec_test streaming_short_vec_test) 66 | add_dependencies(run_soa_array_test soa_array_test) 67 | add_dependencies(run_soa_grid_test soa_grid_test) 68 | add_dependencies(run_soa_vector_test soa_vector_test) 69 | 70 | if(WITH_CUDA) 71 | add_dependencies(tests cuda_allocator_test) 72 | add_dependencies(tests cuda_array_test) 73 | add_dependencies(tests soa_array_cuda_test) 74 | add_dependencies(tests soa_grid_cuda_test) 75 | endif() 76 | add_dependencies(tests aligned_allocator_test) 77 | add_dependencies(tests api_traits_test) 78 | add_dependencies(tests estimate_optimum_short_vec_type_test) 79 | add_dependencies(tests loop_peeler_test) 80 | add_dependencies(tests preprocessor_test) 81 | add_dependencies(tests short_vec_test) 82 | add_dependencies(tests streaming_short_vec_test) 83 | add_dependencies(tests soa_array_test) 84 | add_dependencies(tests soa_grid_test) 85 | add_dependencies(tests soa_vector_test) 86 | -------------------------------------------------------------------------------- /include/libflatarray/testbed/evaluate.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2017-2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_TESTBED_EVALUATE_HPP 10 | #define FLAT_ARRAY_TESTBED_EVALUATE_HPP 11 | 12 | #include 13 | 14 | // disable certain warnings from system headers when compiling with 15 | // Microsoft Visual Studio: 16 | #ifdef _MSC_BUILD 17 | #pragma warning( push ) 18 | #pragma warning( disable : 4514 4668 4710 4820 ) 19 | #endif 20 | 21 | #include 22 | #include 23 | #include 24 | #ifdef _WIN32 25 | #include 26 | #else 27 | #include 28 | #endif 29 | 30 | #ifdef _MSC_BUILD 31 | #pragma warning( pop ) 32 | #endif 33 | 34 | namespace LibFlatArray { 35 | 36 | // not inlining is ok: 37 | #ifdef _MSC_BUILD 38 | #pragma warning( push ) 39 | #pragma warning( disable : 4710 ) 40 | #endif 41 | 42 | class evaluate 43 | { 44 | public: 45 | evaluate(const std::string& name, const std::string& revision) : 46 | name(name), 47 | revision(revision) 48 | {} 49 | 50 | void print_header() 51 | { 52 | std::cout << "#rev ; date ; host ; device ; order ; family ; species ; dimensions ; perf ; unit" << std::endl; 53 | } 54 | 55 | template 56 | void operator()(BENCHMARK benchmark, std::vector dim, bool output = true) 57 | { 58 | if (benchmark.family().find(name, 0) == std::string::npos) { 59 | return; 60 | } 61 | 62 | #ifdef _WIN32 63 | // this charade is based on https://msdn.microsoft.com/en-us/library/windows/desktop/ms724928(v=vs.85).aspx 64 | FILETIME fileTime; 65 | GetSystemTimeAsFileTime(&fileTime); 66 | 67 | ULARGE_INTEGER systemTime; 68 | systemTime.LowPart = fileTime.dwLowDateTime; 69 | systemTime.HighPart = fileTime.dwHighDateTime; 70 | 71 | SYSTEMTIME epoch; 72 | epoch.wYear = 1970; 73 | epoch.wMonth = 1; 74 | epoch.wDayOfWeek = 4; 75 | epoch.wDay = 1; 76 | epoch.wHour = 0; 77 | epoch.wMinute = 0; 78 | epoch.wSecond = 1; 79 | epoch.wMilliseconds = 0; 80 | FILETIME epochFileTime; 81 | SystemTimeToFileTime(&epoch, &epochFileTime); 82 | 83 | ULARGE_INTEGER epochULargeInteger; 84 | epochULargeInteger.LowPart = epochFileTime.dwLowDateTime; 85 | epochULargeInteger.HighPart = epochFileTime.dwHighDateTime; 86 | 87 | time_t secondsSinceEpoch = static_cast(systemTime.QuadPart - epochULargeInteger.QuadPart); 88 | #else 89 | timeval t; 90 | gettimeofday(&t, 0); 91 | time_t secondsSinceEpoch = t.tv_sec; 92 | #endif 93 | 94 | tm timeSpec; 95 | #ifdef _WIN32 96 | gmtime_s(&timeSpec, &secondsSinceEpoch); 97 | #else 98 | gmtime_r(&secondsSinceEpoch, &timeSpec); 99 | #endif 100 | char buf[1024]; 101 | strftime(buf, 1024, "%Y-%b-%d %H:%M:%S", &timeSpec); 102 | 103 | std::string now_string = buf; 104 | std::string device = benchmark.device(); 105 | 106 | int hostname_length = 2048; 107 | std::string hostname(static_cast(hostname_length), ' '); 108 | gethostname(&hostname[0], hostname_length); 109 | // cuts string at first 0 byte, required as gethostname returns 0-terminated strings 110 | hostname = std::string(hostname.c_str()); 111 | 112 | double performance = benchmark.performance(dim); 113 | 114 | std::ostringstream pretty_dim; 115 | pretty_dim << "(" << dim[0]; 116 | for (std::size_t i = 1; i < dim.size(); ++i) { 117 | pretty_dim << ", " << dim[i]; 118 | } 119 | pretty_dim << ")"; 120 | 121 | if (output) { 122 | std::cout << std::setiosflags(std::ios::left); 123 | std::cout << std::setw(18) << revision << "; " 124 | << now_string << " ; " 125 | << std::setw(32) << hostname << "; " 126 | << std::setw(48) << device << "; " 127 | << std::setw( 8) << benchmark.order() << "; " 128 | << std::setw(32) << benchmark.family() 
<< "; " 129 | << std::setw( 8) << benchmark.species() << "; " 130 | << std::setw(24) << pretty_dim.str() << "; " 131 | << std::setw(12) << performance << "; " 132 | << std::setw( 8) << benchmark.unit() << std::endl; 133 | } 134 | } 135 | 136 | private: 137 | std::string name; 138 | std::string revision; 139 | }; 140 | 141 | #ifdef _MSC_BUILD 142 | #pragma warning( pop ) 143 | #endif 144 | 145 | } 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /include/libflatarray/testbed/cpu_benchmark.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2018-2020 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_TESTBED_CPU_BENCHMARK_HPP 10 | #define FLAT_ARRAY_TESTBED_CPU_BENCHMARK_HPP 11 | 12 | #include 13 | 14 | // disable certain warnings from system headers when compiling with 15 | // Microsoft Visual Studio. Also disable them for this class. 16 | #ifdef _MSC_BUILD 17 | #pragma warning( push ) 18 | #pragma warning( disable : 4514 4710 ) 19 | #endif 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | namespace LibFlatArray { 26 | 27 | class cpu_benchmark : public benchmark 28 | { 29 | public: 30 | std::string order() 31 | { 32 | return "CPU"; 33 | } 34 | 35 | std::string device() 36 | { 37 | try { 38 | try { 39 | // likwid-topology gives us the best data 40 | return parse_likwid_topology(); 41 | } catch (const std::runtime_error&) { 42 | // ...otherwise we'll fall back to /proc/cpuinfo 43 | return parse_proc_cpu(); 44 | } 45 | } catch (const std::runtime_error&) { 46 | return "unknown CPU"; 47 | } 48 | } 49 | 50 | private: 51 | static std::string parse_proc_cpu() 52 | { 53 | std::ifstream file("/proc/cpuinfo"); 54 | const std::size_t bufferSize = 1 << 12; 55 | char buffer[bufferSize]; 56 | 57 | while (file.getline(&buffer[0], bufferSize)) { 58 | std::vector tokens = tokenize(buffer, ':'); 59 | std::vector fields = tokenize(tokens[0], '\t'); 60 | 61 | if ((fields.size() == 1) && (fields[0] == "cpu")) { 62 | return tokens[1]; 63 | } 64 | 65 | if ((fields.size() == 1) && (fields[0] == "model name")) { 66 | tokens = tokenize(tokens[1], ' '); 67 | std::string buf = join(tokens, " "); 68 | if (buf[buf.size() - 1] == 0) { 69 | buf.resize(buf.size() - 1); 70 | } 71 | 72 | return buf; 73 | } 74 | } 75 | 76 | throw std::runtime_error("could not parse /proc/cpuinfo"); 77 | } 78 | 79 | static std::string parse_likwid_topology() 80 | { 81 | std::string read_buffer(100000, ' '); 82 | #ifdef _WIN32 83 | FILE *file = _popen("likwid-topology -O", "r"); 84 | #else 85 | FILE *file = popen("likwid-topology -O", "r"); 86 | #endif 87 | if (file == NULL) { 88 | throw std::runtime_error("failed to get output from likwid-topology"); 89 | } 90 | 91 | std::string cpu_type; 92 | std::string cpu_name; 93 | 94 | while (fgets(&read_buffer[0], read_buffer.size(), file) != NULL) { 95 | std::vector tokens = tokenize(read_buffer, ','); 96 | for (std::vector::iterator i = tokens.begin(); i != tokens.end(); ++i) { 97 | if (i->find("CPU type") != std::string::npos) { 98 | cpu_type = *(++i); 99 | } 100 | if (i->find("CPU name") != std::string::npos) { 101 | cpu_name = *(++i); 102 | } 103 | } 104 | } 105 | 106 | if (cpu_type.empty() || cpu_name.empty()) { 107 | throw std::runtime_error("failed to parse likwid-topology"); 108 | } 109 | 
std::string ret = cpu_type + ", " + cpu_name; 110 | return ret; 111 | } 112 | 113 | static std::string trim(const std::string& string) 114 | { 115 | if (string.size() == 0) { 116 | return string; 117 | } 118 | 119 | std::size_t start = 0; 120 | while ((string[start] == ' ') && (start < string.size())) { 121 | start += 1; 122 | } 123 | 124 | std::size_t end = string.size() - 1; 125 | while ((string[end] == ' ') && (end > 1)) { 126 | end -= 1; 127 | } 128 | if ((string[end] != ' ') && (end < string.size())) { 129 | end += 1; 130 | } 131 | 132 | return std::string(string, start, end - start); 133 | } 134 | 135 | static std::vector tokenize(const std::string& line, char delimiter = ';') 136 | { 137 | std::vector ret; 138 | 139 | std::stringstream buf(line); 140 | std::string item; 141 | 142 | while (std::getline(buf, item, delimiter)) { 143 | ret.push_back(trim(item)); 144 | } 145 | 146 | return ret; 147 | } 148 | 149 | static std::string join(const std::vector& tokens, const std::string& delimiter) 150 | { 151 | std::stringstream buf; 152 | 153 | for (std::vector::const_iterator i = tokens.begin(); i != tokens.end(); ++i) { 154 | if (i != tokens.begin()) { 155 | buf << delimiter; 156 | } 157 | buf << *i; 158 | } 159 | 160 | return buf.str(); 161 | } 162 | }; 163 | 164 | } 165 | 166 | #ifdef _MSC_BUILD 167 | #pragma warning( pop ) 168 | #endif 169 | 170 | #endif 171 | -------------------------------------------------------------------------------- /include/libflatarray/soa_vector.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_SOA_VECTOR_HPP 9 | #define FLAT_ARRAY_SOA_VECTOR_HPP 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | // disable certain warnings from system headers when compiling with 16 | // Microsoft Visual Studio: 17 | #ifdef _MSC_BUILD 18 | #pragma warning( push ) 19 | #pragma warning( disable : 4514 ) 20 | #endif 21 | 22 | #include 23 | 24 | #ifdef _MSC_BUILD 25 | #pragma warning( pop ) 26 | #endif 27 | 28 | namespace LibFlatArray { 29 | 30 | /** 31 | * This is the runtime resizable counterpart to soa_array. The goal is 32 | * to provide an interface similar to std::vector and simultaneously 33 | * have a callback to expose the struct-of-arrays layout. 34 | */ 35 | template< 36 | typename T, 37 | typename ALLOCATOR = aligned_allocator, 38 | bool USE_CUDA_FUNCTORS = false> 39 | class soa_vector 40 | { 41 | public: 42 | friend class TestResizeAndReserve; 43 | 44 | typedef T value_type; 45 | 46 | inline 47 | __host__ __device__ 48 | explicit soa_vector(std::size_t count = 0) : 49 | grid(count, 1, 1), 50 | count(count) 51 | {} 52 | 53 | inline 54 | __host__ __device__ 55 | explicit soa_vector(std::size_t count, const value_type& value) : 56 | grid(count, 1, 1), 57 | count(count) 58 | { 59 | grid.broadcast(0, 0, 0, value, count); 60 | } 61 | 62 | /** 63 | * Copies an element to the given index. We're intentionally not 64 | * using at() or operator[] to avoid mismatched expectations here: 65 | * we can't yield references to a T here. 66 | */ 67 | LIBFLATARRAY_INLINE 68 | __host__ __device__ 69 | void set(std::size_t index, const T& element) 70 | { 71 | grid.set(index, 0, 0, element); 72 | } 73 | 74 | /** 75 | * Copy out an element. 
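 *
 * A short round-trip sketch of set()/get() (`Particle' stands in for a
 * hypothetical value_type registered via LIBFLATARRAY_REGISTER_SOA):
 *
 *   LibFlatArray::soa_vector<Particle> particles(10);
 *   Particle p = particles.get(5);
 *   particles.set(5, p);
 *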
Again we're not using at() or operator[] 76 | * here to avoid confusion with the API: we can't return 77 | * references from an SoA container. 78 | */ 79 | LIBFLATARRAY_INLINE 80 | __host__ __device__ 81 | T get(std::size_t index) const 82 | { 83 | return grid.get(index, 0, 0); 84 | } 85 | 86 | inline 87 | __host__ __device__ 88 | std::size_t size() const 89 | { 90 | return count; 91 | } 92 | 93 | inline 94 | __host__ __device__ 95 | bool empty() const 96 | { 97 | return count == 0; 98 | } 99 | 100 | inline 101 | __host__ __device__ 102 | void resize(std::size_t new_count) 103 | { 104 | if (new_count > capacity()) { 105 | reserve(new_count); 106 | } 107 | 108 | count = new_count; 109 | } 110 | 111 | inline 112 | __host__ __device__ 113 | void reserve(std::size_t new_count) 114 | { 115 | soa_grid new_grid(new_count, 1, 1); 116 | new_grid.resize(new_grid.extent_x(), 1, 1); 117 | 118 | detail::flat_array::simple_streak iter[2] = { 119 | detail::flat_array::simple_streak(0, 0, 0, count), 120 | detail::flat_array::simple_streak() 121 | }; 122 | 123 | new_grid.load(iter + 0, iter + 1, grid.data(), grid.extent_x()); 124 | swap(new_grid, grid); 125 | } 126 | 127 | inline 128 | __host__ __device__ 129 | std::size_t capacity() const 130 | { 131 | return grid.dim_x(); 132 | } 133 | 134 | inline 135 | __host__ __device__ 136 | void clear() 137 | { 138 | count = 0; 139 | } 140 | 141 | inline 142 | __host__ __device__ 143 | void push_back(const T& element) 144 | { 145 | if (count == grid.extent_x()) { 146 | // fixme: make this configurable 147 | reserve(static_cast(count * 1.2)); 148 | } 149 | set(count, element); 150 | ++count; 151 | } 152 | 153 | inline 154 | __host__ __device__ 155 | void pop_back() 156 | { 157 | --count; 158 | // destroy last element by overwriting with default element: 159 | set(count, T()); 160 | } 161 | 162 | #ifdef LIBFLATARRAY_WITH_CPP14 163 | template 164 | inline 165 | __host__ __device__ 166 | void emplace_back(ARGS&&... args) 167 | { 168 | push_back(T(std::forward(args)...)); 169 | } 170 | #endif 171 | 172 | template 173 | inline 174 | __host__ __device__ 175 | void callback(FUNCTOR functor) 176 | { 177 | grid.callback(functor); 178 | } 179 | 180 | template 181 | inline 182 | __host__ __device__ 183 | void callback(FUNCTOR functor) const 184 | { 185 | grid.callback(functor); 186 | } 187 | 188 | private: 189 | soa_grid grid; 190 | std::size_t count; 191 | 192 | // fixme: retrieval of multiple elements 193 | // fixme: add cuda test 194 | // fixme: add begin/end for range-based loops, dito for soa_array and perhaps for short_vec (alternatively an iterator loop) 195 | }; 196 | 197 | } 198 | 199 | #endif 200 | 201 | -------------------------------------------------------------------------------- /examples/smoothed_particle_hydrodynamics/kernels.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * Copyright 2017 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #include 10 | 11 | #ifdef _MSC_BUILD 12 | #pragma warning( push ) 13 | #pragma warning( disable : 4514 ) 14 | #endif 15 | 16 | #include 17 | 18 | #ifdef _MSC_BUILD 19 | #pragma warning( pop ) 20 | #endif 21 | 22 | #include "kernels.h" 23 | 24 | #ifndef M_PI 25 | #define M_PI 3.14159265358979323846 26 | #endif 27 | 28 | void compute_density(int n, float *restrict rho, float *restrict pos_x, float *restrict pos_y, float h, float mass) 29 | { 30 | float h_squared = h * h; 31 | float h_pow_8 = h_squared * h_squared * h_squared * h_squared; 32 | float C = 4 * mass / M_PI / h_pow_8; 33 | 34 | for (int i = 0; i < n; ++i) { 35 | rho[i] = 4 * mass / M_PI / h_squared; 36 | } 37 | 38 | for (int i = 0; i < n; ++i) { 39 | for (int j = i + 1; j < n; ++j) { 40 | float delta_x = pos_x[i] - pos_x[j]; 41 | float delta_y = pos_y[i] - pos_y[j]; 42 | float dist_squared = delta_x * delta_x + delta_y * delta_y; 43 | float overlap = h_squared - dist_squared; 44 | 45 | if (overlap > 0) { 46 | float rho_ij = C * overlap * overlap * overlap; 47 | rho[i] += rho_ij; 48 | rho[j] += rho_ij; 49 | } 50 | } 51 | } 52 | } 53 | 54 | void compute_accel( 55 | int n, 56 | float *restrict rho, 57 | float *restrict pos_x, 58 | float *restrict pos_y, 59 | float *restrict v_x, 60 | float *restrict v_y, 61 | float *restrict a_x, 62 | float *restrict a_y, 63 | float mass, 64 | float g, 65 | float h, 66 | float k, 67 | float rho0, 68 | float mu) 69 | { 70 | const float h_squared = h * h; 71 | const float C_0 = mass / M_PI / (h_squared * h_squared); 72 | const float C_p = 15 * k; 73 | const float C_v = -40 * mu; 74 | 75 | // gravity: 76 | for (int i = 0; i < n; ++i) { 77 | a_x[i] = 0; 78 | a_y[i] = -g; 79 | } 80 | 81 | // Now compute interaction forces 82 | for (int i = 0; i < n; ++i) { 83 | for (int j = i + 1; j < n; ++j) { 84 | float delta_x = pos_x[i] - pos_x[j]; 85 | float delta_y = pos_y[i] - pos_y[j]; 86 | float dist_squared = delta_x * delta_x + delta_y * delta_y; 87 | 88 | if (dist_squared < h_squared) { 89 | float q = sqrt(dist_squared) / h; 90 | float u = 1 - q; 91 | float w_0 = C_0 * u / rho[i] / rho[j]; 92 | float w_p = w_0 * C_p * (rho[i] + rho[j] - 2 * rho0) * u / q; 93 | float w_v = w_0 * C_v; 94 | float delta_v_x = v_x[i] - v_x[j]; 95 | float delta_v_y = v_y[i] - v_y[j]; 96 | 97 | a_x[i] += (w_p * delta_x + w_v * delta_v_x); 98 | a_y[i] += (w_p * delta_y + w_v * delta_v_y); 99 | a_x[j] -= (w_p * delta_x + w_v * delta_v_x); 100 | a_y[j] -= (w_p * delta_y + w_v * delta_v_y); 101 | } 102 | } 103 | } 104 | } 105 | 106 | void damp_reflect( 107 | int which, 108 | float barrier, 109 | float *pos_x, 110 | float *pos_y, 111 | float *v_x, 112 | float *v_y) 113 | { 114 | float *v_which = (which == 0) ? v_x : v_y; 115 | float *pos_which = (which == 0) ? 
pos_x : pos_y; 116 | 117 | // Coefficient of resitiution 118 | const float DAMP = 0.75; 119 | // Ignore degenerate cases 120 | if (fabs(v_which[0]) <= 1e-3) 121 | return; 122 | 123 | // Scale back the distance traveled based on time from collision 124 | float tbounce = (pos_which[0] - barrier) / v_which[0]; 125 | pos_x[0] -= v_x[0]*(1-DAMP)*tbounce; 126 | pos_y[0] -= v_y[0]*(1-DAMP)*tbounce; 127 | 128 | // Reflect the position and velocity 129 | pos_which[0] = 2 * barrier - pos_which[0]; 130 | v_which[0] = -v_which[0]; 131 | 132 | // Damp the velocities 133 | v_x[0] *= DAMP; 134 | v_y[0] *= DAMP; 135 | } 136 | 137 | void reflect_bc( 138 | int n, 139 | float *restrict pos_x, 140 | float *restrict pos_y, 141 | float *restrict v_x, 142 | float *restrict v_y) 143 | { 144 | // Boundaries of the computational domain 145 | const float XMIN = 0.0; 146 | const float XMAX = 1.0; 147 | const float YMIN = 0.0; 148 | const float YMAX = 1.0; 149 | 150 | for (int i = 0; i < n; ++i) { 151 | if (pos_x[i] < XMIN) { 152 | damp_reflect(0, XMIN, pos_x + i, pos_y + i, v_x + i, v_y + i); 153 | } 154 | if (pos_x[i] > XMAX) { 155 | damp_reflect(0, XMAX, pos_x + i, pos_y + i, v_x + i, v_y + i); 156 | } 157 | if (pos_y[i] < YMIN) { 158 | damp_reflect(1, YMIN, pos_x + i, pos_y + i, v_x + i, v_y + i); 159 | } 160 | if (pos_y[i] > YMAX) { 161 | damp_reflect(1, YMAX, pos_x + i, pos_y + i, v_x + i, v_y + i); 162 | } 163 | } 164 | } 165 | 166 | void leapfrog( 167 | int n, 168 | float *restrict pos_x, 169 | float *restrict pos_y, 170 | float *restrict v_x, 171 | float *restrict v_y, 172 | float *restrict a_x, 173 | float *restrict a_y, 174 | double dt) 175 | { 176 | for (int i = 0; i < n; ++i) { 177 | v_x[i] += a_x[i] * dt; 178 | v_y[i] += a_y[i] * dt; 179 | 180 | pos_x[i] += v_x[i] * dt; 181 | pos_y[i] += v_y[i] * dt; 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /examples/lbm/update_lbm_object_oriented.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_OBJECT_ORIENTED_H 2 | #define LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_OBJECT_ORIENTED_H 3 | 4 | /** 5 | * Copyright 2013 Andreas Schäfer 6 | * 7 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 8 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 9 | */ 10 | 11 | #include "util.h" 12 | 13 | #define GET_COMP(X, Y, Z, DIR) \ 14 | gridOld[(Z) * dimX * dimY + (Y) * dimX + (X)].DIR 15 | 16 | #define SET_COMP(DIR) \ 17 | gridNew[(z) * dimX * dimY + (y) * dimX + (x)].DIR 18 | 19 | __global__ void update_lbm_object_oriented(int dimX, int dimY, int dimZ, CellLBM *gridOld, CellLBM *gridNew) 20 | { 21 | int x = blockIdx.x * blockDim.x + threadIdx.x + 2; 22 | int y = blockIdx.y * blockDim.y + threadIdx.y + 2; 23 | int z = 2; 24 | 25 | #pragma unroll 10 26 | for (; z < (dimZ - 2); z += 1) { 27 | 28 | #define SQR(X) ((X)*(X)) 29 | const double omega = 1.0/1.7; 30 | const double omega_trm = 1.0 - omega; 31 | const double omega_w0 = 3.0 * 1.0 / 3.0 * omega; 32 | const double omega_w1 = 3.0*1.0/18.0*omega; 33 | const double omega_w2 = 3.0*1.0/36.0*omega; 34 | const double one_third = 1.0 / 3.0; 35 | double velX, velY, velZ; 36 | 37 | velX = 38 | GET_COMP(x-1,y,z,E) + GET_COMP(x-1,y-1,z,NE) + 39 | GET_COMP(x-1,y+1,z,SE) + GET_COMP(x-1,y,z-1,TE) + 40 | GET_COMP(x-1,y,z+1,BE); 41 | velY = GET_COMP(x,y-1,z,N) + GET_COMP(x+1,y-1,z,NW) + 42 | GET_COMP(x,y-1,z-1,TN) + GET_COMP(x,y-1,z+1,BN); 43 | velZ = GET_COMP(x,y,z-1,T) + GET_COMP(x,y+1,z-1,TS) + 44 | GET_COMP(x+1,y,z-1,TW); 45 | 46 | const double rho = 47 | GET_COMP(x,y,z,C) + GET_COMP(x,y+1,z,S) + 48 | GET_COMP(x+1,y,z,W) + GET_COMP(x,y,z+1,B) + 49 | GET_COMP(x+1,y+1,z,SW) + GET_COMP(x,y+1,z+1,BS) + 50 | GET_COMP(x+1,y,z+1,BW) + velX + velY + velZ; 51 | velX = velX 52 | - GET_COMP(x+1,y,z,W) - GET_COMP(x+1,y-1,z,NW) 53 | - GET_COMP(x+1,y+1,z,SW) - GET_COMP(x+1,y,z-1,TW) 54 | - GET_COMP(x+1,y,z+1,BW); 55 | velY = velY 56 | + GET_COMP(x-1,y-1,z,NE) - GET_COMP(x,y+1,z,S) 57 | - GET_COMP(x+1,y+1,z,SW) - GET_COMP(x-1,y+1,z,SE) 58 | - GET_COMP(x,y+1,z-1,TS) - GET_COMP(x,y+1,z+1,BS); 59 | velZ = velZ+GET_COMP(x,y-1,z-1,TN) + GET_COMP(x-1,y,z-1,TE) - GET_COMP(x,y,z+1,B) - GET_COMP(x,y-1,z+1,BN) - GET_COMP(x,y+1,z+1,BS) - GET_COMP(x+1,y,z+1,BW) - GET_COMP(x-1,y,z+1,BE); 60 | 61 | // density = rho; 62 | // velocityX = velX; 63 | // velocityY = velY; 64 | // velocityZ = velZ; 65 | 66 | const double dir_indep_trm = one_third*rho - 0.5*( velX*velX + velY*velY + velZ*velZ ); 67 | 68 | SET_COMP(C)=omega_trm * GET_COMP(x,y,z,C) + omega_w0*( dir_indep_trm ); 69 | 70 | SET_COMP(NW)=omega_trm * GET_COMP(x+1,y-1,z,NW) + 71 | omega_w2*( dir_indep_trm - ( velX-velY ) + 1.5*SQR( velX-velY ) ); 72 | SET_COMP(SE)=omega_trm * GET_COMP(x-1,y+1,z,SE) + 73 | omega_w2*( dir_indep_trm + ( velX-velY ) + 1.5*SQR( velX-velY ) ); 74 | SET_COMP(NE)=omega_trm * GET_COMP(x-1,y-1,z,NE) + 75 | omega_w2*( dir_indep_trm + ( velX+velY ) + 1.5*SQR( velX+velY ) ); 76 | SET_COMP(SW)=omega_trm * GET_COMP(x+1,y+1,z,SW) + 77 | omega_w2*( dir_indep_trm - ( velX+velY ) + 1.5*SQR( velX+velY ) ); 78 | 79 | SET_COMP(TW)=omega_trm * GET_COMP(x+1,y,z-1,TW) + omega_w2*( dir_indep_trm - ( velX-velZ ) + 1.5*SQR( velX-velZ ) ); 80 | SET_COMP(BE)=omega_trm * GET_COMP(x-1,y,z+1,BE) + omega_w2*( dir_indep_trm + ( velX-velZ ) + 1.5*SQR( velX-velZ ) ); 81 | SET_COMP(TE)=omega_trm * GET_COMP(x-1,y,z-1,TE) + omega_w2*( dir_indep_trm + ( velX+velZ ) + 1.5*SQR( velX+velZ ) ); 82 | SET_COMP(BW)=omega_trm * GET_COMP(x+1,y,z+1,BW) + omega_w2*( dir_indep_trm - ( velX+velZ ) + 1.5*SQR( velX+velZ ) ); 83 | 84 | SET_COMP(TS)=omega_trm * GET_COMP(x,y+1,z-1,TS) + omega_w2*( dir_indep_trm - ( velY-velZ ) + 1.5*SQR( velY-velZ ) ); 85 | SET_COMP(BN)=omega_trm * GET_COMP(x,y-1,z+1,BN) + 
omega_w2*( dir_indep_trm + ( velY-velZ ) + 1.5*SQR( velY-velZ ) ); 86 | SET_COMP(TN)=omega_trm * GET_COMP(x,y-1,z-1,TN) + omega_w2*( dir_indep_trm + ( velY+velZ ) + 1.5*SQR( velY+velZ ) ); 87 | SET_COMP(BS)=omega_trm * GET_COMP(x,y+1,z+1,BS) + omega_w2*( dir_indep_trm - ( velY+velZ ) + 1.5*SQR( velY+velZ ) ); 88 | 89 | SET_COMP(N)=omega_trm * GET_COMP(x,y-1,z,N) + omega_w1*( dir_indep_trm + velY + 1.5*SQR(velY)); 90 | SET_COMP(S)=omega_trm * GET_COMP(x,y+1,z,S) + omega_w1*( dir_indep_trm - velY + 1.5*SQR(velY)); 91 | SET_COMP(E)=omega_trm * GET_COMP(x-1,y,z,E) + omega_w1*( dir_indep_trm + velX + 1.5*SQR(velX)); 92 | SET_COMP(W)=omega_trm * GET_COMP(x+1,y,z,W) + omega_w1*( dir_indep_trm - velX + 1.5*SQR(velX)); 93 | SET_COMP(T)=omega_trm * GET_COMP(x,y,z-1,T) + omega_w1*( dir_indep_trm + velZ + 1.5*SQR(velZ)); 94 | SET_COMP(B)=omega_trm * GET_COMP(x,y,z+1,B) + omega_w1*( dir_indep_trm - velZ + 1.5*SQR(velZ)); 95 | } 96 | } 97 | 98 | #undef GET_COMP 99 | #undef SET_COMP 100 | 101 | class benchmark_lbm_cuda_object_oriented : public benchmark_lbm_cuda_basic 102 | { 103 | public: 104 | virtual std::string name() 105 | { 106 | return "lbm_cuda_object_oriented"; 107 | } 108 | 109 | protected: 110 | void update(dim3 dimGrid, dim3 dimBlock, int dimX, int dimY, int dimZ, double *devGridOld, double *devGridNew) 111 | { 112 | update_lbm_object_oriented<<>>( 113 | dimX, dimY, dimZ, 114 | reinterpret_cast(devGridOld), 115 | reinterpret_cast(devGridNew)); 116 | } 117 | }; 118 | 119 | #endif 120 | -------------------------------------------------------------------------------- /include/libflatarray/loop_peeler.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_LOOP_PEELER_HPP 10 | #define FLAT_ARRAY_LOOP_PEELER_HPP 11 | 12 | #include 13 | #include 14 | 15 | #ifdef _MSC_BUILD 16 | /** 17 | * This is a shim to ease handling of unaligned or not vectorizable 18 | * iterations at the begin/end of loops. It will invoke FUNCTION with 19 | * a suitable variant of SHORT_VEC (with its arity adjusted) to that 20 | * the main chunk of the iterations will be running with full 21 | * vectorization (as given by SHORT_VEC) and only the initial 22 | * (possibly unaligned) and trailing (less than SHORT_VEC's arity) 23 | * iterations will be done with an arity of 1 (i.e. scalar). 24 | * 25 | * X is expected to be increased by FUNCTION (e.g. by passing it via 26 | * reference). 27 | */ 28 | #define LIBFLATARRAY_LOOP_PEELER(SHORT_VEC_TYPE, COUNTER_TYPE, \ 29 | X, END_X, FUNCTION, ...) \ 30 | __pragma( warning( push ) ) \ 31 | __pragma( warning( disable : 4710 4711 ) ) \ 32 | LIBFLATARRAY_LOOP_PEELER_IMPLEMENTATION( \ 33 | , SHORT_VEC_TYPE, COUNTER_TYPE, X, END_X, FUNCTION, __VA_ARGS__) \ 34 | __pragma( warning( pop ) ) 35 | #else 36 | /** 37 | * This is a shim to ease handling of unaligned or not vectorizable 38 | * iterations at the begin/end of loops. It will invoke FUNCTION with 39 | * a suitable variant of SHORT_VEC (with its arity adjusted) to that 40 | * the main chunk of the iterations will be running with full 41 | * vectorization (as given by SHORT_VEC) and only the initial 42 | * (possibly unaligned) and trailing (less than SHORT_VEC's arity) 43 | * iterations will be done with an arity of 1 (i.e. scalar). 
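 *
 * A minimal usage sketch (my_scale, data and factor are made-up names;
 * the macro arguments follow the parameter list above, and the callback
 * takes the loop counter by reference as described below):
 *
 *   template<typename FLOAT_VEC>
 *   void my_scale(long& i, long end, double *data, double factor)
 *   {
 *       for (; i < end; i += FLOAT_VEC::ARITY) {
 *           FLOAT_VEC buf(&data[i]);
 *           buf = buf * factor;
 *           &data[i] << buf;
 *       }
 *   }
 *
 *   long x = 0;
 *   LIBFLATARRAY_LOOP_PEELER(
 *       LibFlatArray::short_vec<double, 8>, long, x, 1000, my_scale,
 *       data, 2.5);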
44 | * 45 | * X is expected to be increased by FUNCTION (e.g. by passing it via 46 | * reference). 47 | */ 48 | #define LIBFLATARRAY_LOOP_PEELER(SHORT_VEC_TYPE, COUNTER_TYPE, \ 49 | X, END_X, FUNCTION, ...) \ 50 | LIBFLATARRAY_LOOP_PEELER_IMPLEMENTATION( \ 51 | , SHORT_VEC_TYPE, COUNTER_TYPE, X, END_X, FUNCTION, __VA_ARGS__) 52 | #endif 53 | 54 | #ifdef _MSC_BUILD 55 | /** 56 | * Same as LIBFLATARRAY_LOOP_PEELER(), but for use in templates 57 | */ 58 | #define LIBFLATARRAY_LOOP_PEELER_TEMPLATE(SHORT_VEC_TYPE, COUNTER_TYPE, \ 59 | X, END_X, FUNCTION, ...) \ 60 | __pragma( warning( push ) ) \ 61 | __pragma( warning( disable : 4710 4711 ) ) \ 62 | LIBFLATARRAY_LOOP_PEELER_IMPLEMENTATION( \ 63 | typename, SHORT_VEC_TYPE, COUNTER_TYPE, X, END_X, FUNCTION, __VA_ARGS__) \ 64 | __pragma( warning( pop ) ) 65 | #else 66 | /** 67 | * Same as LIBFLATARRAY_LOOP_PEELER(), but for use in templates 68 | */ 69 | #define LIBFLATARRAY_LOOP_PEELER_TEMPLATE(SHORT_VEC_TYPE, COUNTER_TYPE, \ 70 | X, END_X, FUNCTION, ...) \ 71 | LIBFLATARRAY_LOOP_PEELER_IMPLEMENTATION( \ 72 | typename, SHORT_VEC_TYPE, COUNTER_TYPE, X, END_X, FUNCTION, __VA_ARGS__) 73 | #endif 74 | 75 | #define LIBFLATARRAY_LOOP_PEELER_IMPLEMENTATION( \ 76 | TYPENAME, SHORT_VEC_TYPE, COUNTER_TYPE, X, END_X, FUNCTION, ...) \ 77 | { \ 78 | typedef SHORT_VEC_TYPE lfa_local_short_vec; \ 79 | typedef TYPENAME LibFlatArray::detail::flat_array:: \ 80 | sibling_short_vec_switch::VALUE \ 81 | lfa_local_scalar; \ 82 | \ 83 | COUNTER_TYPE remainder = (X) % \ 84 | COUNTER_TYPE(lfa_local_short_vec::ARITY); \ 85 | COUNTER_TYPE next_stop = remainder ? \ 86 | (X) + COUNTER_TYPE(lfa_local_short_vec::ARITY) - remainder : \ 87 | (X); \ 88 | COUNTER_TYPE last_stop = (END_X) - \ 89 | (END_X) % COUNTER_TYPE(lfa_local_short_vec::ARITY); \ 90 | \ 91 | FUNCTION(X, next_stop, __VA_ARGS__); \ 92 | FUNCTION(X, last_stop, __VA_ARGS__); \ 93 | FUNCTION(X, (END_X), __VA_ARGS__); \ 94 | } 95 | 96 | #ifdef LIBFLATARRAY_WITH_CPP14 97 | 98 | namespace LibFlatArray { 99 | 100 | template 101 | void loop_peeler(COUNTER_TYPE1 *counter, const COUNTER_TYPE2& end, const LAMBDA& lambda) 102 | { 103 | typedef SHORT_VEC_TYPE lfa_local_short_vec; 104 | typedef typename detail::flat_array:: 105 | sibling_short_vec_switch::VALUE 106 | lfa_local_scalar; 107 | 108 | COUNTER_TYPE2 remainder = (*counter) % COUNTER_TYPE2(lfa_local_short_vec::ARITY); 109 | COUNTER_TYPE2 next_stop = remainder ? 110 | (*counter) + COUNTER_TYPE2(lfa_local_short_vec::ARITY) - remainder : 111 | (*counter); 112 | COUNTER_TYPE2 last_stop = end - end % COUNTER_TYPE2(lfa_local_short_vec::ARITY); 113 | 114 | lambda(lfa_local_scalar(), counter, next_stop); 115 | lambda(lfa_local_short_vec(), counter, last_stop); 116 | lambda(lfa_local_scalar(), counter, end ); 117 | } 118 | 119 | } 120 | 121 | #endif 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /include/libflatarray/detail/short_vec_mic_double_8.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015 Kurt Kanzenbach 3 | * Copyright 2016-2017 Andreas Schäfer 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_MIC_DOUBLE_8_HPP 10 | #define FLAT_ARRAY_DETAIL_SHORT_VEC_MIC_DOUBLE_8_HPP 11 | 12 | #if LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_MIC 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | // disable certain warnings from system headers when compiling with 19 | // Microsoft Visual Studio: 20 | #ifdef _MSC_BUILD 21 | #pragma warning( push ) 22 | #pragma warning( disable : 4514 ) 23 | #endif 24 | 25 | #include 26 | #ifdef LIBFLATARRAY_WITH_CPP14 27 | #include 28 | #endif 29 | 30 | #ifdef _MSC_BUILD 31 | #pragma warning( pop ) 32 | #endif 33 | 34 | namespace LibFlatArray { 35 | 36 | template 37 | class short_vec; 38 | 39 | #ifdef __ICC 40 | // disabling this warning as implicit type conversion is exactly our goal here: 41 | #pragma warning push 42 | #pragma warning (disable: 2304) 43 | #endif 44 | 45 | template<> 46 | class short_vec 47 | { 48 | public: 49 | static const std::size_t ARITY = 8; 50 | 51 | typedef short_vec_strategy::mic strategy; 52 | 53 | template 54 | friend std::basic_ostream<_CharT, _Traits>& operator<<( 55 | std::basic_ostream<_CharT, _Traits>& __os, 56 | const short_vec& vec); 57 | 58 | inline 59 | short_vec(const double data = 0) : 60 | val(_mm512_set1_pd(data)) 61 | {} 62 | 63 | inline 64 | short_vec(const double *data) 65 | { 66 | load(data); 67 | } 68 | 69 | inline 70 | short_vec(const __m512d& val) : 71 | val(val) 72 | {} 73 | 74 | #ifdef LIBFLATARRAY_WITH_CPP14 75 | inline 76 | short_vec(const std::initializer_list& il) 77 | { 78 | const double *ptr = static_cast(&(*il.begin())); 79 | load(ptr); 80 | } 81 | #endif 82 | 83 | inline 84 | void operator-=(const short_vec& other) 85 | { 86 | val = _mm512_sub_pd(val, other.val); 87 | } 88 | 89 | inline 90 | short_vec operator-(const short_vec& other) const 91 | { 92 | return short_vec( 93 | _mm512_sub_pd(val, other.val)); 94 | } 95 | 96 | inline 97 | void operator+=(const short_vec& other) 98 | { 99 | val = _mm512_add_pd(val, other.val); 100 | } 101 | 102 | inline 103 | short_vec operator+(const short_vec& other) const 104 | { 105 | return short_vec( 106 | _mm512_add_pd(val, other.val)); 107 | } 108 | 109 | inline 110 | void operator*=(const short_vec& other) 111 | { 112 | val = _mm512_mul_pd(val, other.val); 113 | } 114 | 115 | inline 116 | short_vec operator*(const short_vec& other) const 117 | { 118 | return short_vec( 119 | _mm512_mul_pd(val, other.val)); 120 | } 121 | 122 | inline 123 | void operator/=(const short_vec& other) 124 | { 125 | val = _mm512_div_pd(val, other.val); 126 | } 127 | 128 | inline 129 | short_vec operator/(const short_vec& other) const 130 | { 131 | return short_vec( 132 | _mm512_div_pd(val, other.val)); 133 | } 134 | 135 | inline 136 | short_vec sqrt() const 137 | { 138 | return short_vec( 139 | _mm512_sqrt_pd(val)); 140 | } 141 | 142 | inline 143 | void load(const double *data) 144 | { 145 | val = _mm512_loadunpacklo_pd(val, data + 0); 146 | val = _mm512_loadunpackhi_pd(val, data + 8); 147 | } 148 | 149 | inline 150 | void load_aligned(const double *data) 151 | { 152 | SHORTVEC_ASSERT_ALIGNED(data, 64); 153 | val = _mm512_load_pd(data); 154 | } 155 | 156 | inline 157 | void store(double *data) const 158 | { 159 | _mm512_packstorelo_pd(data + 0, val); 160 | _mm512_packstorehi_pd(data + 8, val); 161 | } 162 | 163 | inline 164 | void store_aligned(double *data) const 165 | { 166 | SHORTVEC_ASSERT_ALIGNED(data, 64); 167 | _mm512_store_pd(data, val); 168 | 
} 169 | 170 | inline 171 | void store_nt(double *data) const 172 | { 173 | SHORTVEC_ASSERT_ALIGNED(data, 64); 174 | _mm512_storenr_pd(data, val); 175 | } 176 | 177 | inline 178 | void gather(const double *ptr, const int *offsets) 179 | { 180 | __m512i indices; 181 | indices = _mm512_loadunpacklo_epi32(indices, offsets); 182 | val = _mm512_i32logather_pd(indices, ptr, 8); 183 | } 184 | 185 | inline 186 | void scatter(double *ptr, const int *offsets) const 187 | { 188 | __m512i indices; 189 | indices = _mm512_loadunpacklo_epi32(indices, offsets); 190 | _mm512_i32loscatter_pd(ptr, indices, val, 8); 191 | } 192 | 193 | private: 194 | __m512d val; 195 | }; 196 | 197 | inline 198 | void operator<<(double *data, const short_vec& vec) 199 | { 200 | vec.store(data); 201 | } 202 | 203 | #ifdef __ICC 204 | #pragma warning pop 205 | #endif 206 | 207 | inline 208 | short_vec sqrt(const short_vec& vec) 209 | { 210 | return vec.sqrt(); 211 | } 212 | 213 | template 214 | std::basic_ostream<_CharT, _Traits>& 215 | operator<<(std::basic_ostream<_CharT, _Traits>& __os, 216 | const short_vec& vec) 217 | { 218 | const double *data1 = reinterpret_cast(&vec.val); 219 | 220 | __os << "[" << data1[0] << ", " << data1[1] << ", " << data1[2] << ", " << data1[3] 221 | << ", " << data1[4] << ", " << data1[5] << ", " << data1[6] << ", " << data1[7] 222 | << "]"; 223 | return __os; 224 | } 225 | 226 | } 227 | 228 | #endif 229 | 230 | #endif 231 | -------------------------------------------------------------------------------- /include/libflatarray/detail/short_vec_scalar_int_2.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015 Kurt Kanzenbach 3 | * Copyright 2016-2017 Andreas Schäfer 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_SCALAR_INT_2_HPP 10 | #define FLAT_ARRAY_DETAIL_SHORT_VEC_SCALAR_INT_2_HPP 11 | 12 | #include 13 | #include 14 | 15 | // disable certain warnings from system headers when compiling with 16 | // Microsoft Visual Studio: 17 | #ifdef _MSC_BUILD 18 | #pragma warning( push ) 19 | #pragma warning( disable : 4514 ) 20 | #endif 21 | 22 | #ifdef LIBFLATARRAY_WITH_CPP14 23 | #include 24 | #endif 25 | 26 | #ifdef _MSC_BUILD 27 | #pragma warning( pop ) 28 | #endif 29 | 30 | namespace LibFlatArray { 31 | 32 | template 33 | class short_vec; 34 | 35 | #ifdef __ICC 36 | // disabling this warning as implicit type conversion is exactly our goal here: 37 | #pragma warning push 38 | #pragma warning (disable: 2304) 39 | #endif 40 | 41 | // Don't warn about these functions being stripped from an executable 42 | // as they're not being used, that's actually expected behavior. 
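// A hedged usage sketch that applies to this scalar specialization just
// as to the intrinsics-based variants (the array contents are made up,
// while load-from-pointer, the implicit scalar conversion mentioned
// above and the free operator<< are all defined in this header):
//
//   int data[2] = {3, 4};
//   LibFlatArray::short_vec<int, 2> a(data);    // load from pointer
//   LibFlatArray::short_vec<int, 2> b = a * 5;  // 5 converts implicitly
//   data << b;                                  // store via operator<<
//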
43 | #ifdef _MSC_BUILD 44 | #pragma warning( push ) 45 | #pragma warning( disable : 4514 ) 46 | #endif 47 | 48 | template<> 49 | class short_vec : public short_vec_base 50 | { 51 | public: 52 | static const std::size_t ARITY = 2; 53 | 54 | typedef short_vec_strategy::scalar strategy; 55 | 56 | template 57 | friend std::basic_ostream<_CharT, _Traits>& operator<<( 58 | std::basic_ostream<_CharT, _Traits>& __os, 59 | const short_vec& vec); 60 | 61 | inline 62 | short_vec(const int data = 0) : 63 | val{data, data} 64 | {} 65 | 66 | inline 67 | short_vec(const int *data) 68 | { 69 | load(data); 70 | } 71 | 72 | inline 73 | short_vec(const int val1, const int val2) : 74 | val{val1, 75 | val2} 76 | {} 77 | 78 | #ifdef LIBFLATARRAY_WITH_CPP14 79 | inline 80 | short_vec(const std::initializer_list& il) 81 | { 82 | const int *ptr = static_cast(&(*il.begin())); 83 | load(ptr); 84 | } 85 | #endif 86 | 87 | inline 88 | void operator-=(const short_vec& other) 89 | { 90 | val[ 0] -= other.val[ 0]; 91 | val[ 1] -= other.val[ 1]; 92 | } 93 | 94 | inline 95 | short_vec operator-(const short_vec& other) const 96 | { 97 | return short_vec( 98 | val[ 0] - other.val[ 0], 99 | val[ 1] - other.val[ 1]); 100 | } 101 | 102 | inline 103 | void operator+=(const short_vec& other) 104 | { 105 | val[ 0] += other.val[ 0]; 106 | val[ 1] += other.val[ 1]; 107 | } 108 | 109 | inline 110 | short_vec operator+(const short_vec& other) const 111 | { 112 | return short_vec( 113 | val[ 0] + other.val[ 0], 114 | val[ 1] + other.val[ 1]); 115 | } 116 | 117 | inline 118 | void operator*=(const short_vec& other) 119 | { 120 | val[ 0] *= other.val[ 0]; 121 | val[ 1] *= other.val[ 1]; 122 | } 123 | 124 | inline 125 | short_vec operator*(const short_vec& other) const 126 | { 127 | return short_vec( 128 | val[ 0] * other.val[ 0], 129 | val[ 1] * other.val[ 1]); 130 | } 131 | 132 | inline 133 | void operator/=(const short_vec& other) 134 | { 135 | val[ 0] /= other.val[ 0]; 136 | val[ 1] /= other.val[ 1]; 137 | } 138 | 139 | inline 140 | short_vec operator/(const short_vec& other) const 141 | { 142 | return short_vec( 143 | val[ 0] / other.val[ 0], 144 | val[ 1] / other.val[ 1]); 145 | } 146 | 147 | inline 148 | short_vec sqrt() const 149 | { 150 | return short_vec( 151 | static_cast(std::sqrt(val[ 0])), 152 | static_cast(std::sqrt(val[ 1]))); 153 | } 154 | 155 | inline 156 | void load(const int *data) 157 | { 158 | val[ 0] = data[0]; 159 | val[ 1] = data[1]; 160 | } 161 | 162 | inline 163 | void load_aligned(const int *data) 164 | { 165 | load(data); 166 | } 167 | 168 | inline 169 | void store(int *data) const 170 | { 171 | *(data + 0) = val[ 0]; 172 | *(data + 1) = val[ 1]; 173 | } 174 | 175 | inline 176 | void store_aligned(int *data) const 177 | { 178 | store(data); 179 | } 180 | 181 | inline 182 | void store_nt(int *data) const 183 | { 184 | store(data); 185 | } 186 | 187 | inline 188 | void gather(const int *ptr, const int *offsets) 189 | { 190 | val[ 0] = ptr[offsets[0]]; 191 | val[ 1] = ptr[offsets[1]]; 192 | } 193 | 194 | inline 195 | void scatter(int *ptr, const int *offsets) const 196 | { 197 | ptr[offsets[0]] = val[ 0]; 198 | ptr[offsets[1]] = val[ 1]; 199 | } 200 | 201 | private: 202 | int val[2]; 203 | }; 204 | 205 | inline 206 | void operator<<(int *data, const short_vec& vec) 207 | { 208 | vec.store(data); 209 | } 210 | 211 | #ifdef __ICC 212 | #pragma warning pop 213 | #endif 214 | 215 | inline 216 | short_vec sqrt(const short_vec& vec) 217 | { 218 | return vec.sqrt(); 219 | } 220 | 221 | // not inlining is ok, as is 
inlining: 222 | #ifdef _MSC_BUILD 223 | #pragma warning( push ) 224 | #pragma warning( disable : 4710 4711 ) 225 | #endif 226 | 227 | template 228 | inline 229 | std::basic_ostream<_CharT, _Traits>& 230 | operator<<(std::basic_ostream<_CharT, _Traits>& __os, 231 | const short_vec& vec) 232 | { 233 | __os << "[" << vec.val[ 0] << ", " << vec.val[ 1] 234 | << "]"; 235 | return __os; 236 | } 237 | 238 | #ifdef _MSC_BUILD 239 | #pragma warning( pop ) 240 | #endif 241 | 242 | } 243 | 244 | #ifdef _MSC_BUILD 245 | #pragma warning( pop ) 246 | #endif 247 | 248 | #endif 249 | -------------------------------------------------------------------------------- /examples/lbm/update_lbm_classic.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_CLASSIC_H 2 | #define LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_CLASSIC_H 3 | 4 | /** 5 | * Copyright 2013 Andreas Schäfer 6 | * 7 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 8 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 9 | */ 10 | 11 | #include "util.h" 12 | 13 | #define C 0 14 | #define N 1 15 | #define E 2 16 | #define W 3 17 | #define S 4 18 | #define T 5 19 | #define B 6 20 | 21 | #define NW 7 22 | #define SW 8 23 | #define NE 9 24 | #define SE 10 25 | 26 | #define TW 11 27 | #define BW 12 28 | #define TE 13 29 | #define BE 14 30 | 31 | #define TN 15 32 | #define BN 16 33 | #define TS 17 34 | #define BS 18 35 | 36 | #define GET_COMP(X, Y, Z, DIR) \ 37 | gridOld[(Z) * dimX * dimY + (Y) * dimX + (X) + (DIR) * dimX * dimY * dimZ] 38 | 39 | #define SET_COMP(DIR) \ 40 | gridNew[z * dimX * dimY + y * dimX + x + (DIR) * dimX * dimY * dimZ] 41 | 42 | __global__ void update_lbm_classic(int dimX, int dimY, int dimZ, double *gridOld, double *gridNew) 43 | { 44 | int x = blockIdx.x * blockDim.x + threadIdx.x + 2; 45 | int y = blockIdx.y * blockDim.y + threadIdx.y + 2; 46 | int z = 2; 47 | 48 | #pragma unroll 10 49 | for (; z < (dimZ - 2); z += 1) { 50 | 51 | #define SQR(X) ((X)*(X)) 52 | const double omega = 1.0/1.7; 53 | const double omega_trm = 1.0 - omega; 54 | const double omega_w0 = 3.0 * 1.0 / 3.0 * omega; 55 | const double omega_w1 = 3.0*1.0/18.0*omega; 56 | const double omega_w2 = 3.0*1.0/36.0*omega; 57 | const double one_third = 1.0 / 3.0; 58 | double velX, velY, velZ; 59 | 60 | velX = 61 | GET_COMP(x-1,y,z,E) + GET_COMP(x-1,y-1,z,NE) + 62 | GET_COMP(x-1,y+1,z,SE) + GET_COMP(x-1,y,z-1,TE) + 63 | GET_COMP(x-1,y,z+1,BE); 64 | velY = GET_COMP(x,y-1,z,N) + GET_COMP(x+1,y-1,z,NW) + 65 | GET_COMP(x,y-1,z-1,TN) + GET_COMP(x,y-1,z+1,BN); 66 | velZ = GET_COMP(x,y,z-1,T) + GET_COMP(x,y+1,z-1,TS) + 67 | GET_COMP(x+1,y,z-1,TW); 68 | 69 | const double rho = 70 | GET_COMP(x,y,z,C) + GET_COMP(x,y+1,z,S) + 71 | GET_COMP(x+1,y,z,W) + GET_COMP(x,y,z+1,B) + 72 | GET_COMP(x+1,y+1,z,SW) + GET_COMP(x,y+1,z+1,BS) + 73 | GET_COMP(x+1,y,z+1,BW) + velX + velY + velZ; 74 | velX = velX 75 | - GET_COMP(x+1,y,z,W) - GET_COMP(x+1,y-1,z,NW) 76 | - GET_COMP(x+1,y+1,z,SW) - GET_COMP(x+1,y,z-1,TW) 77 | - GET_COMP(x+1,y,z+1,BW); 78 | velY = velY 79 | + GET_COMP(x-1,y-1,z,NE) - GET_COMP(x,y+1,z,S) 80 | - GET_COMP(x+1,y+1,z,SW) - GET_COMP(x-1,y+1,z,SE) 81 | - GET_COMP(x,y+1,z-1,TS) - GET_COMP(x,y+1,z+1,BS); 82 | velZ = velZ+GET_COMP(x,y-1,z-1,TN) + GET_COMP(x-1,y,z-1,TE) - GET_COMP(x,y,z+1,B) - GET_COMP(x,y-1,z+1,BN) - GET_COMP(x,y+1,z+1,BS) - GET_COMP(x+1,y,z+1,BW) - GET_COMP(x-1,y,z+1,BE); 83 | 84 | // density = rho; 85 | // velocityX = velX; 86 | // velocityY = 
velY; 87 | // velocityZ = velZ; 88 | 89 | const double dir_indep_trm = one_third*rho - 0.5*( velX*velX + velY*velY + velZ*velZ ); 90 | 91 | SET_COMP(C)=omega_trm * GET_COMP(x,y,z,C) + omega_w0*( dir_indep_trm ); 92 | 93 | SET_COMP(NW)=omega_trm * GET_COMP(x+1,y-1,z,NW) + 94 | omega_w2*( dir_indep_trm - ( velX-velY ) + 1.5*SQR( velX-velY ) ); 95 | SET_COMP(SE)=omega_trm * GET_COMP(x-1,y+1,z,SE) + 96 | omega_w2*( dir_indep_trm + ( velX-velY ) + 1.5*SQR( velX-velY ) ); 97 | SET_COMP(NE)=omega_trm * GET_COMP(x-1,y-1,z,NE) + 98 | omega_w2*( dir_indep_trm + ( velX+velY ) + 1.5*SQR( velX+velY ) ); 99 | SET_COMP(SW)=omega_trm * GET_COMP(x+1,y+1,z,SW) + 100 | omega_w2*( dir_indep_trm - ( velX+velY ) + 1.5*SQR( velX+velY ) ); 101 | 102 | SET_COMP(TW)=omega_trm * GET_COMP(x+1,y,z-1,TW) + omega_w2*( dir_indep_trm - ( velX-velZ ) + 1.5*SQR( velX-velZ ) ); 103 | SET_COMP(BE)=omega_trm * GET_COMP(x-1,y,z+1,BE) + omega_w2*( dir_indep_trm + ( velX-velZ ) + 1.5*SQR( velX-velZ ) ); 104 | SET_COMP(TE)=omega_trm * GET_COMP(x-1,y,z-1,TE) + omega_w2*( dir_indep_trm + ( velX+velZ ) + 1.5*SQR( velX+velZ ) ); 105 | SET_COMP(BW)=omega_trm * GET_COMP(x+1,y,z+1,BW) + omega_w2*( dir_indep_trm - ( velX+velZ ) + 1.5*SQR( velX+velZ ) ); 106 | 107 | SET_COMP(TS)=omega_trm * GET_COMP(x,y+1,z-1,TS) + omega_w2*( dir_indep_trm - ( velY-velZ ) + 1.5*SQR( velY-velZ ) ); 108 | SET_COMP(BN)=omega_trm * GET_COMP(x,y-1,z+1,BN) + omega_w2*( dir_indep_trm + ( velY-velZ ) + 1.5*SQR( velY-velZ ) ); 109 | SET_COMP(TN)=omega_trm * GET_COMP(x,y-1,z-1,TN) + omega_w2*( dir_indep_trm + ( velY+velZ ) + 1.5*SQR( velY+velZ ) ); 110 | SET_COMP(BS)=omega_trm * GET_COMP(x,y+1,z+1,BS) + omega_w2*( dir_indep_trm - ( velY+velZ ) + 1.5*SQR( velY+velZ ) ); 111 | 112 | SET_COMP(N)=omega_trm * GET_COMP(x,y-1,z,N) + omega_w1*( dir_indep_trm + velY + 1.5*SQR(velY)); 113 | SET_COMP(S)=omega_trm * GET_COMP(x,y+1,z,S) + omega_w1*( dir_indep_trm - velY + 1.5*SQR(velY)); 114 | SET_COMP(E)=omega_trm * GET_COMP(x-1,y,z,E) + omega_w1*( dir_indep_trm + velX + 1.5*SQR(velX)); 115 | SET_COMP(W)=omega_trm * GET_COMP(x+1,y,z,W) + omega_w1*( dir_indep_trm - velX + 1.5*SQR(velX)); 116 | SET_COMP(T)=omega_trm * GET_COMP(x,y,z-1,T) + omega_w1*( dir_indep_trm + velZ + 1.5*SQR(velZ)); 117 | SET_COMP(B)=omega_trm * GET_COMP(x,y,z+1,B) + omega_w1*( dir_indep_trm - velZ + 1.5*SQR(velZ)); 118 | } 119 | } 120 | 121 | #undef GET_COMP 122 | #undef SET_COMP 123 | #undef SQR 124 | 125 | #undef C 126 | #undef N 127 | #undef E 128 | #undef W 129 | #undef S 130 | #undef T 131 | #undef B 132 | 133 | #undef NW 134 | #undef SW 135 | #undef NE 136 | #undef SE 137 | 138 | #undef TW 139 | #undef BW 140 | #undef TE 141 | #undef BE 142 | 143 | #undef TN 144 | #undef BN 145 | #undef TS 146 | #undef BS 147 | 148 | class benchmark_lbm_cuda_classic : public benchmark_lbm_cuda_basic 149 | { 150 | public: 151 | virtual std::string name() 152 | { 153 | return "lbm_cuda_classic"; 154 | } 155 | 156 | protected: 157 | void update(dim3 dimGrid, dim3 dimBlock, int dimX, int dimY, int dimZ, double *devGridOld, double *devGridNew) 158 | { 159 | update_lbm_classic<<>>(dimX, dimY, dimZ, devGridOld, devGridNew); 160 | } 161 | }; 162 | 163 | #endif 164 | -------------------------------------------------------------------------------- /examples/lbm/flatarray_implementation_0.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cudalineupdatefunctorprototype.h" 3 | 4 | IMPLEMENTATION(CellLBM, 32, 32, 32, 32, 32, 32) 5 | IMPLEMENTATION(CellLBM, 32, 32, 64, 
32, 32, 64) 6 | IMPLEMENTATION(CellLBM, 32, 32, 128, 32, 32, 128) 7 | IMPLEMENTATION(CellLBM, 32, 32, 136, 32, 32, 136) 8 | IMPLEMENTATION(CellLBM, 32, 32, 192, 32, 32, 192) 9 | IMPLEMENTATION(CellLBM, 32, 32, 200, 32, 32, 200) 10 | IMPLEMENTATION(CellLBM, 32, 32, 256, 32, 32, 256) 11 | IMPLEMENTATION(CellLBM, 32, 32, 264, 32, 32, 264) 12 | IMPLEMENTATION(CellLBM, 32, 32, 512, 32, 32, 512) 13 | IMPLEMENTATION(CellLBM, 32, 32, 520, 32, 32, 520) 14 | IMPLEMENTATION(CellLBM, 32, 32, 1032, 32, 32, 1032) 15 | IMPLEMENTATION(CellLBM, 32, 64, 32, 32, 64, 32) 16 | IMPLEMENTATION(CellLBM, 32, 64, 64, 32, 64, 64) 17 | IMPLEMENTATION(CellLBM, 32, 64, 128, 32, 64, 128) 18 | IMPLEMENTATION(CellLBM, 32, 64, 136, 32, 64, 136) 19 | IMPLEMENTATION(CellLBM, 32, 64, 192, 32, 64, 192) 20 | IMPLEMENTATION(CellLBM, 32, 64, 200, 32, 64, 200) 21 | IMPLEMENTATION(CellLBM, 32, 64, 256, 32, 64, 256) 22 | IMPLEMENTATION(CellLBM, 32, 64, 264, 32, 64, 264) 23 | IMPLEMENTATION(CellLBM, 32, 64, 512, 32, 64, 512) 24 | IMPLEMENTATION(CellLBM, 32, 64, 520, 32, 64, 520) 25 | IMPLEMENTATION(CellLBM, 32, 64, 1032, 32, 64, 1032) 26 | IMPLEMENTATION(CellLBM, 32, 128, 32, 32, 128, 32) 27 | IMPLEMENTATION(CellLBM, 32, 128, 64, 32, 128, 64) 28 | IMPLEMENTATION(CellLBM, 32, 128, 128, 32, 128, 128) 29 | IMPLEMENTATION(CellLBM, 32, 128, 136, 32, 128, 136) 30 | IMPLEMENTATION(CellLBM, 32, 128, 192, 32, 128, 192) 31 | IMPLEMENTATION(CellLBM, 32, 128, 200, 32, 128, 200) 32 | IMPLEMENTATION(CellLBM, 32, 128, 256, 32, 128, 256) 33 | IMPLEMENTATION(CellLBM, 32, 128, 264, 32, 128, 264) 34 | IMPLEMENTATION(CellLBM, 32, 128, 512, 32, 128, 512) 35 | IMPLEMENTATION(CellLBM, 32, 128, 520, 32, 128, 520) 36 | IMPLEMENTATION(CellLBM, 32, 128, 1032, 32, 128, 1032) 37 | IMPLEMENTATION(CellLBM, 32, 136, 32, 32, 136, 32) 38 | IMPLEMENTATION(CellLBM, 32, 136, 64, 32, 136, 64) 39 | IMPLEMENTATION(CellLBM, 32, 136, 128, 32, 136, 128) 40 | IMPLEMENTATION(CellLBM, 32, 136, 136, 32, 136, 136) 41 | IMPLEMENTATION(CellLBM, 32, 136, 192, 32, 136, 192) 42 | IMPLEMENTATION(CellLBM, 32, 136, 200, 32, 136, 200) 43 | IMPLEMENTATION(CellLBM, 32, 136, 256, 32, 136, 256) 44 | IMPLEMENTATION(CellLBM, 32, 136, 264, 32, 136, 264) 45 | IMPLEMENTATION(CellLBM, 32, 136, 512, 32, 136, 512) 46 | IMPLEMENTATION(CellLBM, 32, 136, 520, 32, 136, 520) 47 | IMPLEMENTATION(CellLBM, 32, 136, 1032, 32, 136, 1032) 48 | IMPLEMENTATION(CellLBM, 32, 192, 32, 32, 192, 32) 49 | IMPLEMENTATION(CellLBM, 32, 192, 64, 32, 192, 64) 50 | IMPLEMENTATION(CellLBM, 32, 192, 128, 32, 192, 128) 51 | IMPLEMENTATION(CellLBM, 32, 192, 136, 32, 192, 136) 52 | IMPLEMENTATION(CellLBM, 32, 192, 192, 32, 192, 192) 53 | IMPLEMENTATION(CellLBM, 32, 192, 200, 32, 192, 200) 54 | IMPLEMENTATION(CellLBM, 32, 192, 256, 32, 192, 256) 55 | IMPLEMENTATION(CellLBM, 32, 192, 264, 32, 192, 264) 56 | IMPLEMENTATION(CellLBM, 32, 192, 512, 32, 192, 512) 57 | IMPLEMENTATION(CellLBM, 32, 192, 520, 32, 192, 520) 58 | IMPLEMENTATION(CellLBM, 32, 192, 1032, 32, 192, 1032) 59 | IMPLEMENTATION(CellLBM, 32, 200, 32, 32, 200, 32) 60 | IMPLEMENTATION(CellLBM, 32, 200, 64, 32, 200, 64) 61 | IMPLEMENTATION(CellLBM, 32, 200, 128, 32, 200, 128) 62 | IMPLEMENTATION(CellLBM, 32, 200, 136, 32, 200, 136) 63 | IMPLEMENTATION(CellLBM, 32, 200, 192, 32, 200, 192) 64 | IMPLEMENTATION(CellLBM, 32, 200, 200, 32, 200, 200) 65 | IMPLEMENTATION(CellLBM, 32, 200, 256, 32, 200, 256) 66 | IMPLEMENTATION(CellLBM, 32, 200, 264, 32, 200, 264) 67 | IMPLEMENTATION(CellLBM, 32, 200, 512, 32, 200, 512) 68 | IMPLEMENTATION(CellLBM, 32, 200, 520, 32, 200, 520) 69 | 
IMPLEMENTATION(CellLBM, 32, 200, 1032, 32, 200, 1032) 70 | IMPLEMENTATION(CellLBM, 32, 256, 32, 32, 256, 32) 71 | IMPLEMENTATION(CellLBM, 32, 256, 64, 32, 256, 64) 72 | IMPLEMENTATION(CellLBM, 32, 256, 128, 32, 256, 128) 73 | IMPLEMENTATION(CellLBM, 32, 256, 136, 32, 256, 136) 74 | IMPLEMENTATION(CellLBM, 32, 256, 192, 32, 256, 192) 75 | IMPLEMENTATION(CellLBM, 32, 256, 200, 32, 256, 200) 76 | IMPLEMENTATION(CellLBM, 32, 256, 256, 32, 256, 256) 77 | IMPLEMENTATION(CellLBM, 32, 256, 264, 32, 256, 264) 78 | IMPLEMENTATION(CellLBM, 32, 256, 512, 32, 256, 512) 79 | IMPLEMENTATION(CellLBM, 32, 256, 520, 32, 256, 520) 80 | IMPLEMENTATION(CellLBM, 32, 256, 1032, 32, 256, 1032) 81 | IMPLEMENTATION(CellLBM, 32, 264, 32, 32, 264, 32) 82 | IMPLEMENTATION(CellLBM, 32, 264, 64, 32, 264, 64) 83 | IMPLEMENTATION(CellLBM, 32, 264, 128, 32, 264, 128) 84 | IMPLEMENTATION(CellLBM, 32, 264, 136, 32, 264, 136) 85 | IMPLEMENTATION(CellLBM, 32, 264, 192, 32, 264, 192) 86 | IMPLEMENTATION(CellLBM, 32, 264, 200, 32, 264, 200) 87 | IMPLEMENTATION(CellLBM, 32, 264, 256, 32, 264, 256) 88 | IMPLEMENTATION(CellLBM, 32, 264, 264, 32, 264, 264) 89 | IMPLEMENTATION(CellLBM, 32, 264, 512, 32, 264, 512) 90 | IMPLEMENTATION(CellLBM, 32, 264, 520, 32, 264, 520) 91 | IMPLEMENTATION(CellLBM, 32, 264, 1032, 32, 264, 1032) 92 | IMPLEMENTATION(CellLBM, 32, 512, 32, 32, 512, 32) 93 | IMPLEMENTATION(CellLBM, 32, 512, 64, 32, 512, 64) 94 | IMPLEMENTATION(CellLBM, 32, 512, 128, 32, 512, 128) 95 | IMPLEMENTATION(CellLBM, 32, 512, 136, 32, 512, 136) 96 | IMPLEMENTATION(CellLBM, 32, 512, 192, 32, 512, 192) 97 | IMPLEMENTATION(CellLBM, 32, 512, 200, 32, 512, 200) 98 | IMPLEMENTATION(CellLBM, 32, 512, 256, 32, 512, 256) 99 | IMPLEMENTATION(CellLBM, 32, 512, 264, 32, 512, 264) 100 | IMPLEMENTATION(CellLBM, 32, 512, 512, 32, 512, 512) 101 | IMPLEMENTATION(CellLBM, 32, 512, 520, 32, 512, 520) 102 | IMPLEMENTATION(CellLBM, 32, 512, 1032, 32, 512, 1032) 103 | IMPLEMENTATION(CellLBM, 32, 520, 32, 32, 520, 32) 104 | IMPLEMENTATION(CellLBM, 32, 520, 64, 32, 520, 64) 105 | IMPLEMENTATION(CellLBM, 32, 520, 128, 32, 520, 128) 106 | IMPLEMENTATION(CellLBM, 32, 520, 136, 32, 520, 136) 107 | IMPLEMENTATION(CellLBM, 32, 520, 192, 32, 520, 192) 108 | IMPLEMENTATION(CellLBM, 32, 520, 200, 32, 520, 200) 109 | IMPLEMENTATION(CellLBM, 32, 520, 256, 32, 520, 256) 110 | IMPLEMENTATION(CellLBM, 32, 520, 264, 32, 520, 264) 111 | IMPLEMENTATION(CellLBM, 32, 520, 512, 32, 520, 512) 112 | IMPLEMENTATION(CellLBM, 32, 520, 520, 32, 520, 520) 113 | IMPLEMENTATION(CellLBM, 32, 520, 1032, 32, 520, 1032) 114 | IMPLEMENTATION(CellLBM, 32, 1032, 32, 32, 1032, 32) 115 | IMPLEMENTATION(CellLBM, 32, 1032, 64, 32, 1032, 64) 116 | IMPLEMENTATION(CellLBM, 32, 1032, 128, 32, 1032, 128) 117 | IMPLEMENTATION(CellLBM, 32, 1032, 136, 32, 1032, 136) 118 | IMPLEMENTATION(CellLBM, 32, 1032, 192, 32, 1032, 192) 119 | IMPLEMENTATION(CellLBM, 32, 1032, 200, 32, 1032, 200) 120 | IMPLEMENTATION(CellLBM, 32, 1032, 256, 32, 1032, 256) 121 | IMPLEMENTATION(CellLBM, 32, 1032, 264, 32, 1032, 264) 122 | IMPLEMENTATION(CellLBM, 32, 1032, 512, 32, 1032, 512) 123 | IMPLEMENTATION(CellLBM, 32, 1032, 520, 32, 1032, 520) 124 | IMPLEMENTATION(CellLBM, 32, 1032, 1032, 32, 1032, 1032) 125 | -------------------------------------------------------------------------------- /examples/lbm/flatarray_implementation_1.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cudalineupdatefunctorprototype.h" 3 | 4 | IMPLEMENTATION(CellLBM, 64, 32, 32, 64, 32, 
32) 5 | IMPLEMENTATION(CellLBM, 64, 32, 64, 64, 32, 64) 6 | IMPLEMENTATION(CellLBM, 64, 32, 128, 64, 32, 128) 7 | IMPLEMENTATION(CellLBM, 64, 32, 136, 64, 32, 136) 8 | IMPLEMENTATION(CellLBM, 64, 32, 192, 64, 32, 192) 9 | IMPLEMENTATION(CellLBM, 64, 32, 200, 64, 32, 200) 10 | IMPLEMENTATION(CellLBM, 64, 32, 256, 64, 32, 256) 11 | IMPLEMENTATION(CellLBM, 64, 32, 264, 64, 32, 264) 12 | IMPLEMENTATION(CellLBM, 64, 32, 512, 64, 32, 512) 13 | IMPLEMENTATION(CellLBM, 64, 32, 520, 64, 32, 520) 14 | IMPLEMENTATION(CellLBM, 64, 32, 1032, 64, 32, 1032) 15 | IMPLEMENTATION(CellLBM, 64, 64, 32, 64, 64, 32) 16 | IMPLEMENTATION(CellLBM, 64, 64, 64, 64, 64, 64) 17 | IMPLEMENTATION(CellLBM, 64, 64, 128, 64, 64, 128) 18 | IMPLEMENTATION(CellLBM, 64, 64, 136, 64, 64, 136) 19 | IMPLEMENTATION(CellLBM, 64, 64, 192, 64, 64, 192) 20 | IMPLEMENTATION(CellLBM, 64, 64, 200, 64, 64, 200) 21 | IMPLEMENTATION(CellLBM, 64, 64, 256, 64, 64, 256) 22 | IMPLEMENTATION(CellLBM, 64, 64, 264, 64, 64, 264) 23 | IMPLEMENTATION(CellLBM, 64, 64, 512, 64, 64, 512) 24 | IMPLEMENTATION(CellLBM, 64, 64, 520, 64, 64, 520) 25 | IMPLEMENTATION(CellLBM, 64, 64, 1032, 64, 64, 1032) 26 | IMPLEMENTATION(CellLBM, 64, 128, 32, 64, 128, 32) 27 | IMPLEMENTATION(CellLBM, 64, 128, 64, 64, 128, 64) 28 | IMPLEMENTATION(CellLBM, 64, 128, 128, 64, 128, 128) 29 | IMPLEMENTATION(CellLBM, 64, 128, 136, 64, 128, 136) 30 | IMPLEMENTATION(CellLBM, 64, 128, 192, 64, 128, 192) 31 | IMPLEMENTATION(CellLBM, 64, 128, 200, 64, 128, 200) 32 | IMPLEMENTATION(CellLBM, 64, 128, 256, 64, 128, 256) 33 | IMPLEMENTATION(CellLBM, 64, 128, 264, 64, 128, 264) 34 | IMPLEMENTATION(CellLBM, 64, 128, 512, 64, 128, 512) 35 | IMPLEMENTATION(CellLBM, 64, 128, 520, 64, 128, 520) 36 | IMPLEMENTATION(CellLBM, 64, 128, 1032, 64, 128, 1032) 37 | IMPLEMENTATION(CellLBM, 64, 136, 32, 64, 136, 32) 38 | IMPLEMENTATION(CellLBM, 64, 136, 64, 64, 136, 64) 39 | IMPLEMENTATION(CellLBM, 64, 136, 128, 64, 136, 128) 40 | IMPLEMENTATION(CellLBM, 64, 136, 136, 64, 136, 136) 41 | IMPLEMENTATION(CellLBM, 64, 136, 192, 64, 136, 192) 42 | IMPLEMENTATION(CellLBM, 64, 136, 200, 64, 136, 200) 43 | IMPLEMENTATION(CellLBM, 64, 136, 256, 64, 136, 256) 44 | IMPLEMENTATION(CellLBM, 64, 136, 264, 64, 136, 264) 45 | IMPLEMENTATION(CellLBM, 64, 136, 512, 64, 136, 512) 46 | IMPLEMENTATION(CellLBM, 64, 136, 520, 64, 136, 520) 47 | IMPLEMENTATION(CellLBM, 64, 136, 1032, 64, 136, 1032) 48 | IMPLEMENTATION(CellLBM, 64, 192, 32, 64, 192, 32) 49 | IMPLEMENTATION(CellLBM, 64, 192, 64, 64, 192, 64) 50 | IMPLEMENTATION(CellLBM, 64, 192, 128, 64, 192, 128) 51 | IMPLEMENTATION(CellLBM, 64, 192, 136, 64, 192, 136) 52 | IMPLEMENTATION(CellLBM, 64, 192, 192, 64, 192, 192) 53 | IMPLEMENTATION(CellLBM, 64, 192, 200, 64, 192, 200) 54 | IMPLEMENTATION(CellLBM, 64, 192, 256, 64, 192, 256) 55 | IMPLEMENTATION(CellLBM, 64, 192, 264, 64, 192, 264) 56 | IMPLEMENTATION(CellLBM, 64, 192, 512, 64, 192, 512) 57 | IMPLEMENTATION(CellLBM, 64, 192, 520, 64, 192, 520) 58 | IMPLEMENTATION(CellLBM, 64, 192, 1032, 64, 192, 1032) 59 | IMPLEMENTATION(CellLBM, 64, 200, 32, 64, 200, 32) 60 | IMPLEMENTATION(CellLBM, 64, 200, 64, 64, 200, 64) 61 | IMPLEMENTATION(CellLBM, 64, 200, 128, 64, 200, 128) 62 | IMPLEMENTATION(CellLBM, 64, 200, 136, 64, 200, 136) 63 | IMPLEMENTATION(CellLBM, 64, 200, 192, 64, 200, 192) 64 | IMPLEMENTATION(CellLBM, 64, 200, 200, 64, 200, 200) 65 | IMPLEMENTATION(CellLBM, 64, 200, 256, 64, 200, 256) 66 | IMPLEMENTATION(CellLBM, 64, 200, 264, 64, 200, 264) 67 | IMPLEMENTATION(CellLBM, 64, 200, 512, 64, 200, 512) 68 | 
IMPLEMENTATION(CellLBM, 64, 200, 520, 64, 200, 520) 69 | IMPLEMENTATION(CellLBM, 64, 200, 1032, 64, 200, 1032) 70 | IMPLEMENTATION(CellLBM, 64, 256, 32, 64, 256, 32) 71 | IMPLEMENTATION(CellLBM, 64, 256, 64, 64, 256, 64) 72 | IMPLEMENTATION(CellLBM, 64, 256, 128, 64, 256, 128) 73 | IMPLEMENTATION(CellLBM, 64, 256, 136, 64, 256, 136) 74 | IMPLEMENTATION(CellLBM, 64, 256, 192, 64, 256, 192) 75 | IMPLEMENTATION(CellLBM, 64, 256, 200, 64, 256, 200) 76 | IMPLEMENTATION(CellLBM, 64, 256, 256, 64, 256, 256) 77 | IMPLEMENTATION(CellLBM, 64, 256, 264, 64, 256, 264) 78 | IMPLEMENTATION(CellLBM, 64, 256, 512, 64, 256, 512) 79 | IMPLEMENTATION(CellLBM, 64, 256, 520, 64, 256, 520) 80 | IMPLEMENTATION(CellLBM, 64, 256, 1032, 64, 256, 1032) 81 | IMPLEMENTATION(CellLBM, 64, 264, 32, 64, 264, 32) 82 | IMPLEMENTATION(CellLBM, 64, 264, 64, 64, 264, 64) 83 | IMPLEMENTATION(CellLBM, 64, 264, 128, 64, 264, 128) 84 | IMPLEMENTATION(CellLBM, 64, 264, 136, 64, 264, 136) 85 | IMPLEMENTATION(CellLBM, 64, 264, 192, 64, 264, 192) 86 | IMPLEMENTATION(CellLBM, 64, 264, 200, 64, 264, 200) 87 | IMPLEMENTATION(CellLBM, 64, 264, 256, 64, 264, 256) 88 | IMPLEMENTATION(CellLBM, 64, 264, 264, 64, 264, 264) 89 | IMPLEMENTATION(CellLBM, 64, 264, 512, 64, 264, 512) 90 | IMPLEMENTATION(CellLBM, 64, 264, 520, 64, 264, 520) 91 | IMPLEMENTATION(CellLBM, 64, 264, 1032, 64, 264, 1032) 92 | IMPLEMENTATION(CellLBM, 64, 512, 32, 64, 512, 32) 93 | IMPLEMENTATION(CellLBM, 64, 512, 64, 64, 512, 64) 94 | IMPLEMENTATION(CellLBM, 64, 512, 128, 64, 512, 128) 95 | IMPLEMENTATION(CellLBM, 64, 512, 136, 64, 512, 136) 96 | IMPLEMENTATION(CellLBM, 64, 512, 192, 64, 512, 192) 97 | IMPLEMENTATION(CellLBM, 64, 512, 200, 64, 512, 200) 98 | IMPLEMENTATION(CellLBM, 64, 512, 256, 64, 512, 256) 99 | IMPLEMENTATION(CellLBM, 64, 512, 264, 64, 512, 264) 100 | IMPLEMENTATION(CellLBM, 64, 512, 512, 64, 512, 512) 101 | IMPLEMENTATION(CellLBM, 64, 512, 520, 64, 512, 520) 102 | IMPLEMENTATION(CellLBM, 64, 512, 1032, 64, 512, 1032) 103 | IMPLEMENTATION(CellLBM, 64, 520, 32, 64, 520, 32) 104 | IMPLEMENTATION(CellLBM, 64, 520, 64, 64, 520, 64) 105 | IMPLEMENTATION(CellLBM, 64, 520, 128, 64, 520, 128) 106 | IMPLEMENTATION(CellLBM, 64, 520, 136, 64, 520, 136) 107 | IMPLEMENTATION(CellLBM, 64, 520, 192, 64, 520, 192) 108 | IMPLEMENTATION(CellLBM, 64, 520, 200, 64, 520, 200) 109 | IMPLEMENTATION(CellLBM, 64, 520, 256, 64, 520, 256) 110 | IMPLEMENTATION(CellLBM, 64, 520, 264, 64, 520, 264) 111 | IMPLEMENTATION(CellLBM, 64, 520, 512, 64, 520, 512) 112 | IMPLEMENTATION(CellLBM, 64, 520, 520, 64, 520, 520) 113 | IMPLEMENTATION(CellLBM, 64, 520, 1032, 64, 520, 1032) 114 | IMPLEMENTATION(CellLBM, 64, 1032, 32, 64, 1032, 32) 115 | IMPLEMENTATION(CellLBM, 64, 1032, 64, 64, 1032, 64) 116 | IMPLEMENTATION(CellLBM, 64, 1032, 128, 64, 1032, 128) 117 | IMPLEMENTATION(CellLBM, 64, 1032, 136, 64, 1032, 136) 118 | IMPLEMENTATION(CellLBM, 64, 1032, 192, 64, 1032, 192) 119 | IMPLEMENTATION(CellLBM, 64, 1032, 200, 64, 1032, 200) 120 | IMPLEMENTATION(CellLBM, 64, 1032, 256, 64, 1032, 256) 121 | IMPLEMENTATION(CellLBM, 64, 1032, 264, 64, 1032, 264) 122 | IMPLEMENTATION(CellLBM, 64, 1032, 512, 64, 1032, 512) 123 | IMPLEMENTATION(CellLBM, 64, 1032, 520, 64, 1032, 520) 124 | IMPLEMENTATION(CellLBM, 64, 1032, 1032, 64, 1032, 1032) 125 | -------------------------------------------------------------------------------- /examples/lbm/flatarray_implementation_2.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 
"cudalineupdatefunctorprototype.h" 3 | 4 | IMPLEMENTATION(CellLBM, 128, 32, 32, 128, 32, 32) 5 | IMPLEMENTATION(CellLBM, 128, 32, 64, 128, 32, 64) 6 | IMPLEMENTATION(CellLBM, 128, 32, 128, 128, 32, 128) 7 | IMPLEMENTATION(CellLBM, 128, 32, 136, 128, 32, 136) 8 | IMPLEMENTATION(CellLBM, 128, 32, 192, 128, 32, 192) 9 | IMPLEMENTATION(CellLBM, 128, 32, 200, 128, 32, 200) 10 | IMPLEMENTATION(CellLBM, 128, 32, 256, 128, 32, 256) 11 | IMPLEMENTATION(CellLBM, 128, 32, 264, 128, 32, 264) 12 | IMPLEMENTATION(CellLBM, 128, 32, 512, 128, 32, 512) 13 | IMPLEMENTATION(CellLBM, 128, 32, 520, 128, 32, 520) 14 | IMPLEMENTATION(CellLBM, 128, 32, 1032, 128, 32, 1032) 15 | IMPLEMENTATION(CellLBM, 128, 64, 32, 128, 64, 32) 16 | IMPLEMENTATION(CellLBM, 128, 64, 64, 128, 64, 64) 17 | IMPLEMENTATION(CellLBM, 128, 64, 128, 128, 64, 128) 18 | IMPLEMENTATION(CellLBM, 128, 64, 136, 128, 64, 136) 19 | IMPLEMENTATION(CellLBM, 128, 64, 192, 128, 64, 192) 20 | IMPLEMENTATION(CellLBM, 128, 64, 200, 128, 64, 200) 21 | IMPLEMENTATION(CellLBM, 128, 64, 256, 128, 64, 256) 22 | IMPLEMENTATION(CellLBM, 128, 64, 264, 128, 64, 264) 23 | IMPLEMENTATION(CellLBM, 128, 64, 512, 128, 64, 512) 24 | IMPLEMENTATION(CellLBM, 128, 64, 520, 128, 64, 520) 25 | IMPLEMENTATION(CellLBM, 128, 64, 1032, 128, 64, 1032) 26 | IMPLEMENTATION(CellLBM, 128, 128, 32, 128, 128, 32) 27 | IMPLEMENTATION(CellLBM, 128, 128, 64, 128, 128, 64) 28 | IMPLEMENTATION(CellLBM, 128, 128, 128, 128, 128, 128) 29 | IMPLEMENTATION(CellLBM, 128, 128, 136, 128, 128, 136) 30 | IMPLEMENTATION(CellLBM, 128, 128, 192, 128, 128, 192) 31 | IMPLEMENTATION(CellLBM, 128, 128, 200, 128, 128, 200) 32 | IMPLEMENTATION(CellLBM, 128, 128, 256, 128, 128, 256) 33 | IMPLEMENTATION(CellLBM, 128, 128, 264, 128, 128, 264) 34 | IMPLEMENTATION(CellLBM, 128, 128, 512, 128, 128, 512) 35 | IMPLEMENTATION(CellLBM, 128, 128, 520, 128, 128, 520) 36 | IMPLEMENTATION(CellLBM, 128, 128, 1032, 128, 128, 1032) 37 | IMPLEMENTATION(CellLBM, 128, 136, 32, 128, 136, 32) 38 | IMPLEMENTATION(CellLBM, 128, 136, 64, 128, 136, 64) 39 | IMPLEMENTATION(CellLBM, 128, 136, 128, 128, 136, 128) 40 | IMPLEMENTATION(CellLBM, 128, 136, 136, 128, 136, 136) 41 | IMPLEMENTATION(CellLBM, 128, 136, 192, 128, 136, 192) 42 | IMPLEMENTATION(CellLBM, 128, 136, 200, 128, 136, 200) 43 | IMPLEMENTATION(CellLBM, 128, 136, 256, 128, 136, 256) 44 | IMPLEMENTATION(CellLBM, 128, 136, 264, 128, 136, 264) 45 | IMPLEMENTATION(CellLBM, 128, 136, 512, 128, 136, 512) 46 | IMPLEMENTATION(CellLBM, 128, 136, 520, 128, 136, 520) 47 | IMPLEMENTATION(CellLBM, 128, 136, 1032, 128, 136, 1032) 48 | IMPLEMENTATION(CellLBM, 128, 192, 32, 128, 192, 32) 49 | IMPLEMENTATION(CellLBM, 128, 192, 64, 128, 192, 64) 50 | IMPLEMENTATION(CellLBM, 128, 192, 128, 128, 192, 128) 51 | IMPLEMENTATION(CellLBM, 128, 192, 136, 128, 192, 136) 52 | IMPLEMENTATION(CellLBM, 128, 192, 192, 128, 192, 192) 53 | IMPLEMENTATION(CellLBM, 128, 192, 200, 128, 192, 200) 54 | IMPLEMENTATION(CellLBM, 128, 192, 256, 128, 192, 256) 55 | IMPLEMENTATION(CellLBM, 128, 192, 264, 128, 192, 264) 56 | IMPLEMENTATION(CellLBM, 128, 192, 512, 128, 192, 512) 57 | IMPLEMENTATION(CellLBM, 128, 192, 520, 128, 192, 520) 58 | IMPLEMENTATION(CellLBM, 128, 192, 1032, 128, 192, 1032) 59 | IMPLEMENTATION(CellLBM, 128, 200, 32, 128, 200, 32) 60 | IMPLEMENTATION(CellLBM, 128, 200, 64, 128, 200, 64) 61 | IMPLEMENTATION(CellLBM, 128, 200, 128, 128, 200, 128) 62 | IMPLEMENTATION(CellLBM, 128, 200, 136, 128, 200, 136) 63 | IMPLEMENTATION(CellLBM, 128, 200, 192, 128, 200, 192) 64 | IMPLEMENTATION(CellLBM, 128, 200, 
200, 128, 200, 200) 65 | IMPLEMENTATION(CellLBM, 128, 200, 256, 128, 200, 256) 66 | IMPLEMENTATION(CellLBM, 128, 200, 264, 128, 200, 264) 67 | IMPLEMENTATION(CellLBM, 128, 200, 512, 128, 200, 512) 68 | IMPLEMENTATION(CellLBM, 128, 200, 520, 128, 200, 520) 69 | IMPLEMENTATION(CellLBM, 128, 200, 1032, 128, 200, 1032) 70 | IMPLEMENTATION(CellLBM, 128, 256, 32, 128, 256, 32) 71 | IMPLEMENTATION(CellLBM, 128, 256, 64, 128, 256, 64) 72 | IMPLEMENTATION(CellLBM, 128, 256, 128, 128, 256, 128) 73 | IMPLEMENTATION(CellLBM, 128, 256, 136, 128, 256, 136) 74 | IMPLEMENTATION(CellLBM, 128, 256, 192, 128, 256, 192) 75 | IMPLEMENTATION(CellLBM, 128, 256, 200, 128, 256, 200) 76 | IMPLEMENTATION(CellLBM, 128, 256, 256, 128, 256, 256) 77 | IMPLEMENTATION(CellLBM, 128, 256, 264, 128, 256, 264) 78 | IMPLEMENTATION(CellLBM, 128, 256, 512, 128, 256, 512) 79 | IMPLEMENTATION(CellLBM, 128, 256, 520, 128, 256, 520) 80 | IMPLEMENTATION(CellLBM, 128, 256, 1032, 128, 256, 1032) 81 | IMPLEMENTATION(CellLBM, 128, 264, 32, 128, 264, 32) 82 | IMPLEMENTATION(CellLBM, 128, 264, 64, 128, 264, 64) 83 | IMPLEMENTATION(CellLBM, 128, 264, 128, 128, 264, 128) 84 | IMPLEMENTATION(CellLBM, 128, 264, 136, 128, 264, 136) 85 | IMPLEMENTATION(CellLBM, 128, 264, 192, 128, 264, 192) 86 | IMPLEMENTATION(CellLBM, 128, 264, 200, 128, 264, 200) 87 | IMPLEMENTATION(CellLBM, 128, 264, 256, 128, 264, 256) 88 | IMPLEMENTATION(CellLBM, 128, 264, 264, 128, 264, 264) 89 | IMPLEMENTATION(CellLBM, 128, 264, 512, 128, 264, 512) 90 | IMPLEMENTATION(CellLBM, 128, 264, 520, 128, 264, 520) 91 | IMPLEMENTATION(CellLBM, 128, 264, 1032, 128, 264, 1032) 92 | IMPLEMENTATION(CellLBM, 128, 512, 32, 128, 512, 32) 93 | IMPLEMENTATION(CellLBM, 128, 512, 64, 128, 512, 64) 94 | IMPLEMENTATION(CellLBM, 128, 512, 128, 128, 512, 128) 95 | IMPLEMENTATION(CellLBM, 128, 512, 136, 128, 512, 136) 96 | IMPLEMENTATION(CellLBM, 128, 512, 192, 128, 512, 192) 97 | IMPLEMENTATION(CellLBM, 128, 512, 200, 128, 512, 200) 98 | IMPLEMENTATION(CellLBM, 128, 512, 256, 128, 512, 256) 99 | IMPLEMENTATION(CellLBM, 128, 512, 264, 128, 512, 264) 100 | IMPLEMENTATION(CellLBM, 128, 512, 512, 128, 512, 512) 101 | IMPLEMENTATION(CellLBM, 128, 512, 520, 128, 512, 520) 102 | IMPLEMENTATION(CellLBM, 128, 512, 1032, 128, 512, 1032) 103 | IMPLEMENTATION(CellLBM, 128, 520, 32, 128, 520, 32) 104 | IMPLEMENTATION(CellLBM, 128, 520, 64, 128, 520, 64) 105 | IMPLEMENTATION(CellLBM, 128, 520, 128, 128, 520, 128) 106 | IMPLEMENTATION(CellLBM, 128, 520, 136, 128, 520, 136) 107 | IMPLEMENTATION(CellLBM, 128, 520, 192, 128, 520, 192) 108 | IMPLEMENTATION(CellLBM, 128, 520, 200, 128, 520, 200) 109 | IMPLEMENTATION(CellLBM, 128, 520, 256, 128, 520, 256) 110 | IMPLEMENTATION(CellLBM, 128, 520, 264, 128, 520, 264) 111 | IMPLEMENTATION(CellLBM, 128, 520, 512, 128, 520, 512) 112 | IMPLEMENTATION(CellLBM, 128, 520, 520, 128, 520, 520) 113 | IMPLEMENTATION(CellLBM, 128, 520, 1032, 128, 520, 1032) 114 | IMPLEMENTATION(CellLBM, 128, 1032, 32, 128, 1032, 32) 115 | IMPLEMENTATION(CellLBM, 128, 1032, 64, 128, 1032, 64) 116 | IMPLEMENTATION(CellLBM, 128, 1032, 128, 128, 1032, 128) 117 | IMPLEMENTATION(CellLBM, 128, 1032, 136, 128, 1032, 136) 118 | IMPLEMENTATION(CellLBM, 128, 1032, 192, 128, 1032, 192) 119 | IMPLEMENTATION(CellLBM, 128, 1032, 200, 128, 1032, 200) 120 | IMPLEMENTATION(CellLBM, 128, 1032, 256, 128, 1032, 256) 121 | IMPLEMENTATION(CellLBM, 128, 1032, 264, 128, 1032, 264) 122 | IMPLEMENTATION(CellLBM, 128, 1032, 512, 128, 1032, 512) 123 | IMPLEMENTATION(CellLBM, 128, 1032, 520, 128, 1032, 520) 124 | 
IMPLEMENTATION(CellLBM, 128, 1032, 1032, 128, 1032, 1032) 125 | -------------------------------------------------------------------------------- /examples/lbm/flatarray_implementation_3.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cudalineupdatefunctorprototype.h" 3 | 4 | IMPLEMENTATION(CellLBM, 136, 32, 32, 136, 32, 32) 5 | IMPLEMENTATION(CellLBM, 136, 32, 64, 136, 32, 64) 6 | IMPLEMENTATION(CellLBM, 136, 32, 128, 136, 32, 128) 7 | IMPLEMENTATION(CellLBM, 136, 32, 136, 136, 32, 136) 8 | IMPLEMENTATION(CellLBM, 136, 32, 192, 136, 32, 192) 9 | IMPLEMENTATION(CellLBM, 136, 32, 200, 136, 32, 200) 10 | IMPLEMENTATION(CellLBM, 136, 32, 256, 136, 32, 256) 11 | IMPLEMENTATION(CellLBM, 136, 32, 264, 136, 32, 264) 12 | IMPLEMENTATION(CellLBM, 136, 32, 512, 136, 32, 512) 13 | IMPLEMENTATION(CellLBM, 136, 32, 520, 136, 32, 520) 14 | IMPLEMENTATION(CellLBM, 136, 32, 1032, 136, 32, 1032) 15 | IMPLEMENTATION(CellLBM, 136, 64, 32, 136, 64, 32) 16 | IMPLEMENTATION(CellLBM, 136, 64, 64, 136, 64, 64) 17 | IMPLEMENTATION(CellLBM, 136, 64, 128, 136, 64, 128) 18 | IMPLEMENTATION(CellLBM, 136, 64, 136, 136, 64, 136) 19 | IMPLEMENTATION(CellLBM, 136, 64, 192, 136, 64, 192) 20 | IMPLEMENTATION(CellLBM, 136, 64, 200, 136, 64, 200) 21 | IMPLEMENTATION(CellLBM, 136, 64, 256, 136, 64, 256) 22 | IMPLEMENTATION(CellLBM, 136, 64, 264, 136, 64, 264) 23 | IMPLEMENTATION(CellLBM, 136, 64, 512, 136, 64, 512) 24 | IMPLEMENTATION(CellLBM, 136, 64, 520, 136, 64, 520) 25 | IMPLEMENTATION(CellLBM, 136, 64, 1032, 136, 64, 1032) 26 | IMPLEMENTATION(CellLBM, 136, 128, 32, 136, 128, 32) 27 | IMPLEMENTATION(CellLBM, 136, 128, 64, 136, 128, 64) 28 | IMPLEMENTATION(CellLBM, 136, 128, 128, 136, 128, 128) 29 | IMPLEMENTATION(CellLBM, 136, 128, 136, 136, 128, 136) 30 | IMPLEMENTATION(CellLBM, 136, 128, 192, 136, 128, 192) 31 | IMPLEMENTATION(CellLBM, 136, 128, 200, 136, 128, 200) 32 | IMPLEMENTATION(CellLBM, 136, 128, 256, 136, 128, 256) 33 | IMPLEMENTATION(CellLBM, 136, 128, 264, 136, 128, 264) 34 | IMPLEMENTATION(CellLBM, 136, 128, 512, 136, 128, 512) 35 | IMPLEMENTATION(CellLBM, 136, 128, 520, 136, 128, 520) 36 | IMPLEMENTATION(CellLBM, 136, 128, 1032, 136, 128, 1032) 37 | IMPLEMENTATION(CellLBM, 136, 136, 32, 136, 136, 32) 38 | IMPLEMENTATION(CellLBM, 136, 136, 64, 136, 136, 64) 39 | IMPLEMENTATION(CellLBM, 136, 136, 128, 136, 136, 128) 40 | IMPLEMENTATION(CellLBM, 136, 136, 136, 136, 136, 136) 41 | IMPLEMENTATION(CellLBM, 136, 136, 192, 136, 136, 192) 42 | IMPLEMENTATION(CellLBM, 136, 136, 200, 136, 136, 200) 43 | IMPLEMENTATION(CellLBM, 136, 136, 256, 136, 136, 256) 44 | IMPLEMENTATION(CellLBM, 136, 136, 264, 136, 136, 264) 45 | IMPLEMENTATION(CellLBM, 136, 136, 512, 136, 136, 512) 46 | IMPLEMENTATION(CellLBM, 136, 136, 520, 136, 136, 520) 47 | IMPLEMENTATION(CellLBM, 136, 136, 1032, 136, 136, 1032) 48 | IMPLEMENTATION(CellLBM, 136, 192, 32, 136, 192, 32) 49 | IMPLEMENTATION(CellLBM, 136, 192, 64, 136, 192, 64) 50 | IMPLEMENTATION(CellLBM, 136, 192, 128, 136, 192, 128) 51 | IMPLEMENTATION(CellLBM, 136, 192, 136, 136, 192, 136) 52 | IMPLEMENTATION(CellLBM, 136, 192, 192, 136, 192, 192) 53 | IMPLEMENTATION(CellLBM, 136, 192, 200, 136, 192, 200) 54 | IMPLEMENTATION(CellLBM, 136, 192, 256, 136, 192, 256) 55 | IMPLEMENTATION(CellLBM, 136, 192, 264, 136, 192, 264) 56 | IMPLEMENTATION(CellLBM, 136, 192, 512, 136, 192, 512) 57 | IMPLEMENTATION(CellLBM, 136, 192, 520, 136, 192, 520) 58 | IMPLEMENTATION(CellLBM, 136, 192, 1032, 136, 192, 1032) 59 | IMPLEMENTATION(CellLBM, 136, 
200, 32, 136, 200, 32) 60 | IMPLEMENTATION(CellLBM, 136, 200, 64, 136, 200, 64) 61 | IMPLEMENTATION(CellLBM, 136, 200, 128, 136, 200, 128) 62 | IMPLEMENTATION(CellLBM, 136, 200, 136, 136, 200, 136) 63 | IMPLEMENTATION(CellLBM, 136, 200, 192, 136, 200, 192) 64 | IMPLEMENTATION(CellLBM, 136, 200, 200, 136, 200, 200) 65 | IMPLEMENTATION(CellLBM, 136, 200, 256, 136, 200, 256) 66 | IMPLEMENTATION(CellLBM, 136, 200, 264, 136, 200, 264) 67 | IMPLEMENTATION(CellLBM, 136, 200, 512, 136, 200, 512) 68 | IMPLEMENTATION(CellLBM, 136, 200, 520, 136, 200, 520) 69 | IMPLEMENTATION(CellLBM, 136, 200, 1032, 136, 200, 1032) 70 | IMPLEMENTATION(CellLBM, 136, 256, 32, 136, 256, 32) 71 | IMPLEMENTATION(CellLBM, 136, 256, 64, 136, 256, 64) 72 | IMPLEMENTATION(CellLBM, 136, 256, 128, 136, 256, 128) 73 | IMPLEMENTATION(CellLBM, 136, 256, 136, 136, 256, 136) 74 | IMPLEMENTATION(CellLBM, 136, 256, 192, 136, 256, 192) 75 | IMPLEMENTATION(CellLBM, 136, 256, 200, 136, 256, 200) 76 | IMPLEMENTATION(CellLBM, 136, 256, 256, 136, 256, 256) 77 | IMPLEMENTATION(CellLBM, 136, 256, 264, 136, 256, 264) 78 | IMPLEMENTATION(CellLBM, 136, 256, 512, 136, 256, 512) 79 | IMPLEMENTATION(CellLBM, 136, 256, 520, 136, 256, 520) 80 | IMPLEMENTATION(CellLBM, 136, 256, 1032, 136, 256, 1032) 81 | IMPLEMENTATION(CellLBM, 136, 264, 32, 136, 264, 32) 82 | IMPLEMENTATION(CellLBM, 136, 264, 64, 136, 264, 64) 83 | IMPLEMENTATION(CellLBM, 136, 264, 128, 136, 264, 128) 84 | IMPLEMENTATION(CellLBM, 136, 264, 136, 136, 264, 136) 85 | IMPLEMENTATION(CellLBM, 136, 264, 192, 136, 264, 192) 86 | IMPLEMENTATION(CellLBM, 136, 264, 200, 136, 264, 200) 87 | IMPLEMENTATION(CellLBM, 136, 264, 256, 136, 264, 256) 88 | IMPLEMENTATION(CellLBM, 136, 264, 264, 136, 264, 264) 89 | IMPLEMENTATION(CellLBM, 136, 264, 512, 136, 264, 512) 90 | IMPLEMENTATION(CellLBM, 136, 264, 520, 136, 264, 520) 91 | IMPLEMENTATION(CellLBM, 136, 264, 1032, 136, 264, 1032) 92 | IMPLEMENTATION(CellLBM, 136, 512, 32, 136, 512, 32) 93 | IMPLEMENTATION(CellLBM, 136, 512, 64, 136, 512, 64) 94 | IMPLEMENTATION(CellLBM, 136, 512, 128, 136, 512, 128) 95 | IMPLEMENTATION(CellLBM, 136, 512, 136, 136, 512, 136) 96 | IMPLEMENTATION(CellLBM, 136, 512, 192, 136, 512, 192) 97 | IMPLEMENTATION(CellLBM, 136, 512, 200, 136, 512, 200) 98 | IMPLEMENTATION(CellLBM, 136, 512, 256, 136, 512, 256) 99 | IMPLEMENTATION(CellLBM, 136, 512, 264, 136, 512, 264) 100 | IMPLEMENTATION(CellLBM, 136, 512, 512, 136, 512, 512) 101 | IMPLEMENTATION(CellLBM, 136, 512, 520, 136, 512, 520) 102 | IMPLEMENTATION(CellLBM, 136, 512, 1032, 136, 512, 1032) 103 | IMPLEMENTATION(CellLBM, 136, 520, 32, 136, 520, 32) 104 | IMPLEMENTATION(CellLBM, 136, 520, 64, 136, 520, 64) 105 | IMPLEMENTATION(CellLBM, 136, 520, 128, 136, 520, 128) 106 | IMPLEMENTATION(CellLBM, 136, 520, 136, 136, 520, 136) 107 | IMPLEMENTATION(CellLBM, 136, 520, 192, 136, 520, 192) 108 | IMPLEMENTATION(CellLBM, 136, 520, 200, 136, 520, 200) 109 | IMPLEMENTATION(CellLBM, 136, 520, 256, 136, 520, 256) 110 | IMPLEMENTATION(CellLBM, 136, 520, 264, 136, 520, 264) 111 | IMPLEMENTATION(CellLBM, 136, 520, 512, 136, 520, 512) 112 | IMPLEMENTATION(CellLBM, 136, 520, 520, 136, 520, 520) 113 | IMPLEMENTATION(CellLBM, 136, 520, 1032, 136, 520, 1032) 114 | IMPLEMENTATION(CellLBM, 136, 1032, 32, 136, 1032, 32) 115 | IMPLEMENTATION(CellLBM, 136, 1032, 64, 136, 1032, 64) 116 | IMPLEMENTATION(CellLBM, 136, 1032, 128, 136, 1032, 128) 117 | IMPLEMENTATION(CellLBM, 136, 1032, 136, 136, 1032, 136) 118 | IMPLEMENTATION(CellLBM, 136, 1032, 192, 136, 1032, 192) 119 | IMPLEMENTATION(CellLBM, 
136, 1032, 200, 136, 1032, 200) 120 | IMPLEMENTATION(CellLBM, 136, 1032, 256, 136, 1032, 256) 121 | IMPLEMENTATION(CellLBM, 136, 1032, 264, 136, 1032, 264) 122 | IMPLEMENTATION(CellLBM, 136, 1032, 512, 136, 1032, 512) 123 | IMPLEMENTATION(CellLBM, 136, 1032, 520, 136, 1032, 520) 124 | IMPLEMENTATION(CellLBM, 136, 1032, 1032, 136, 1032, 1032) 125 | --------------------------------------------------------------------------------