├── .gitignore ├── examples ├── performance_tests │ ├── CMakeLists.txt │ └── plot ├── smoothed_particle_hydrodynamics │ ├── CMakeLists.txt │ ├── kernels.h │ └── kernels.c ├── CMakeLists.txt ├── gauss │ ├── CMakeLists.txt │ └── filter_c99.c ├── jacobi │ ├── CMakeLists.txt │ └── update_c99.c └── lbm │ ├── CMakeLists.txt │ ├── generator │ ├── main.cu │ ├── update_lbm_cuda_flat_array.h │ ├── cudalineupdatefunctorprototype.h │ ├── util.h │ ├── update_lbm_object_oriented.h │ ├── update_lbm_classic.h │ ├── flatarray_implementation_0.cu │ ├── flatarray_implementation_1.cu │ ├── flatarray_implementation_2.cu │ └── flatarray_implementation_3.cu ├── AUTHORS ├── .appveyor.yml ├── include └── libflatarray │ ├── short_vec_base.hpp │ ├── number_of_members.hpp │ ├── coord.hpp │ ├── aggregated_member_size.hpp │ ├── detail │ ├── init_kernel.hpp │ ├── sqrt_reference.hpp │ ├── sibling_short_vec_switch.hpp │ ├── streaming_short_vec_switch.hpp │ ├── simple_streak.hpp │ ├── offset.hpp │ ├── generate_cuda_launch_config.hpp │ ├── copy_functor.hpp │ ├── generic_destruct.hpp │ ├── set_byte_size_functor.hpp │ ├── staging_buffer.hpp │ ├── construct_functor.hpp │ ├── destroy_functor.hpp │ ├── dual_callback_helper.hpp │ ├── get_instance_functor.hpp │ ├── save_functor.hpp │ ├── set_instance_functor.hpp │ ├── short_vec_helpers.hpp │ ├── load_functor.hpp │ ├── short_vec_mic_double_8.hpp │ └── short_vec_scalar_int_2.hpp │ ├── member_ptr_to_offset.hpp │ ├── alignment.hpp │ ├── testbed │ ├── gpu_benchmark.hpp │ ├── benchmark.hpp │ ├── evaluate.hpp │ └── cpu_benchmark.hpp │ ├── flat_array.hpp │ ├── soa_accessor.hpp │ ├── cuda_allocator.hpp │ ├── preprocessor.hpp │ ├── ilp_to_arity.hpp │ ├── streaming_short_vec.hpp │ ├── estimate_optimum_short_vec_type.hpp │ ├── aligned_allocator.hpp │ ├── soa_vector.hpp │ └── loop_peeler.hpp ├── test ├── short_vec_additional_test.cpp ├── cuda_allocator_test.cu ├── aligned_allocator_test.cpp ├── loop_peeler_test.cpp ├── soa_array_cuda_test.cu ├── test.hpp ├── estimate_optimum_short_vec_type_test.cpp ├── preprocessor_test.cpp └── CMakeLists.txt ├── .circleci └── config.yml ├── LICENSE ├── README └── CMakeModules └── FindSilo.cmake /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | *~ 3 | 4 | -------------------------------------------------------------------------------- /examples/performance_tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(performance_tests main.cpp) 2 | target_link_libraries(performance_tests ${libflatarray_LIBS}) 3 | -------------------------------------------------------------------------------- /examples/smoothed_particle_hydrodynamics/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(WITH_SILO AND WITH_CPP14) 2 | add_executable(sph main.cpp kernels.c) 3 | include_directories(${Silo_INCLUDE_DIR}) 4 | target_link_libraries(sph ${Silo_LIBRARY}) 5 | endif() 6 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(OpenMP) 2 | 3 | if(NOT MSVC) 4 | add_subdirectory(jacobi) 5 | add_subdirectory(gauss) 6 | endif() 7 | add_subdirectory(lbm) 8 | add_subdirectory(performance_tests) 9 | add_subdirectory(smoothed_particle_hydrodynamics) 10 | -------------------------------------------------------------------------------- /examples/gauss/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_executable(gauss main.cpp filter_c99.c) 2 | target_link_libraries(gauss ${libflatarray_LIBS}) 3 | 4 | if(OPENMP_FOUND) 5 | if(CMAKE_VERSION VERSION_GREATER 2.8.11) 6 | target_compile_options(gauss PRIVATE ${OpenMP_CXX_FLAGS}) 7 | endif() 8 | set_target_properties(gauss PROPERTIES LINK_FLAGS ${OpenMP_CXX_FLAGS}) 9 | endif() 10 | -------------------------------------------------------------------------------- /examples/jacobi/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(jacobi main.cpp update_c99.c) 2 | target_link_libraries(jacobi ${libflatarray_LIBS}) 3 | 4 | if(OPENMP_FOUND) 5 | if(CMAKE_VERSION VERSION_GREATER 2.8.11) 6 | target_compile_options(jacobi PRIVATE ${OpenMP_CXX_FLAGS}) 7 | endif() 8 | set_target_properties(jacobi PROPERTIES LINK_FLAGS ${OpenMP_CXX_FLAGS}) 9 | endif() 10 | -------------------------------------------------------------------------------- /examples/lbm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(WITH_CUDA) 2 | lfa_cuda_add_executable(lbm2 main.cu flatarray_implementation_0.cu flatarray_implementation_1.cu flatarray_implementation_2.cu flatarray_implementation_3.cu flatarray_implementation_4.cu flatarray_implementation_5.cu flatarray_implementation_6.cu flatarray_implementation_7.cu flatarray_implementation_8.cu flatarray_implementation_9.cu flatarray_implementation_10.cu) 3 | endif() 4 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Copyright: 2 | 3 | Year(s) Name Affiliation Email 4 | --------- -------------------- ----------- --------------------------------- 5 | 2012-2015 Andreas Schäfer FAU gentryx@gmx.de 6 | 2014-2015 Kurt Kanzenbach FAU kurt@kmk-computers.de 7 | 2015-2015 Di Xiao (Larry) SJTU xiaodi@sjtu.edu.cn 8 | 9 | Affiliation Abbreviations: 10 | -------------------------- 11 | 12 | FAU = Friedrich-Alexander-Universität Erlangen-Nürnberg 13 | SJTU = Shanghai Jiao Tong University 14 | -------------------------------------------------------------------------------- /.appveyor.yml: -------------------------------------------------------------------------------- 1 | version: 1.0.{build} 2 | 3 | shallow_clone: true 4 | 5 | matrix: 6 | fast_finish: true 7 | 8 | environment: 9 | matrix: 10 | - GENERATOR: "Visual Studio 14" 11 | CONFIG: Debug 12 | 13 | - GENERATOR: "Visual Studio 14" 14 | CONFIG: Release 15 | 16 | os: Visual Studio 2015 17 | 18 | build_script: 19 | - cmake "-G%GENERATOR%" -H. -B_builds 20 | - cmake --build _builds --config "%CONFIG%" 21 | - cmake --build _builds --config "%CONFIG%" --target tests 22 | 23 | # - _builds\test\Debug\api_traits_test.exe 24 | -------------------------------------------------------------------------------- /include/libflatarray/short_vec_base.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_SHORT_VEC_BASE_HPP 9 | #define FLAT_ARRAY_SHORT_VEC_BASE_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | template 14 | class short_vec_base 15 | { 16 | public: 17 | static inline 18 | std::size_t size() 19 | { 20 | return ARITY; 21 | } 22 | 23 | }; 24 | 25 | } 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /include/libflatarray/number_of_members.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_NUMBER_OF_MEMBERS_HPP 9 | #define FLAT_ARRAY_NUMBER_OF_MEMBERS_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | /** 14 | * Allow the user to access the number of data members of the SoA type. 15 | * 16 | * Will be instantiated by LIBFLATARRAY_REGISTER_SOA(). 17 | */ 18 | template 19 | class number_of_members; 20 | 21 | } 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /test/short_vec_additional_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | // include here to have another object file and check if linking with 9 | // the original test still works. 10 | #include 11 | 12 | // globally disable some warnings with MSVC, that are issued not for a 13 | // specific header, but rather for the interaction of system headers 14 | // and LibFlatArray source: 15 | #ifdef _MSC_BUILD 16 | #pragma warning( disable : 4710 ) 17 | #endif 18 | -------------------------------------------------------------------------------- /include/libflatarray/coord.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_COORD_HPP 9 | #define FLAT_ARRAY_COORD_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | /** 14 | * A utility class to specify (relative) coordinates. The class is to 15 | * be used with soa_accessor. 16 | * 17 | * Since the coordinates are fixed at compile time, all dependent 18 | * address calculations can be done at compile time. 19 | */ 20 | template 21 | class coord 22 | {}; 23 | 24 | } 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | docker: 5 | - image: circleci/ruby:stretch 6 | steps: 7 | - checkout 8 | - run: mkdir build 9 | - run: sudo apt-get update && sudo apt-get install -y cmake 10 | - run: cd build && cmake .. 11 | - run: cd build && make 12 | test: 13 | docker: 14 | - image: circleci/ruby:stretch 15 | steps: 16 | - checkout 17 | - run: mkdir build 18 | - run: sudo apt-get update && sudo apt-get install -y cmake 19 | - run: cd build && cmake ..
20 | - run: cd build && make check 21 | workflows: 22 | version: 2 23 | build_and_test: 24 | jobs: 25 | - build 26 | - test 27 | -------------------------------------------------------------------------------- /include/libflatarray/aggregated_member_size.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_AGGREGATED_MEMBER_SIZE_HPP 9 | #define FLAT_ARRAY_AGGREGATED_MEMBER_SIZE_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | /** 14 | * Accumulate the sizes of the individual data members. This may be 15 | * lower than sizeof(CELL_TYPE) as structs/objects in C++ may need 16 | * padding. We can avoid the padding of individual members in a SoA 17 | * memory layout. 18 | * 19 | * Will be instantiated by LIBFLATARRAY_REGISTER_SOA(). 20 | */ 21 | template 22 | class aggregated_member_size; 23 | 24 | } 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /examples/lbm/generator: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | intervals = [ 4 | [32, 64, 128, 136, 192, 200, 256, 264, 512, 520, 1032], 5 | [32, 64, 128, 136, 192, 200, 256, 264, 512, 520, 1032], 6 | [32, 64, 128, 136, 192, 200, 256, 264, 512, 520, 1032] 7 | ] 8 | 9 | counter = -1 10 | 11 | intervals[0].size.times do |x1| 12 | counter += 1 13 | File.open("flatarray_implementation_#{counter}.cu", "w") do |f| 14 | f.puts < 16 | #include "cudalineupdatefunctorprototype.h" 17 | 18 | EOF 19 | 20 | xA = intervals[0][x1] 21 | xB = xA 22 | 23 | intervals[1].size.times do |y1| 24 | yA = intervals[1][y1] 25 | yB = yA 26 | 27 | intervals[2].size.times do |z1| 28 | zA = intervals[2][z1] 29 | zB = zA 30 | 31 | f.puts "IMPLEMENTATION(CellLBM, #{xA}, #{yA}, #{zA}, #{xB}, #{yB}, #{zB})" 32 | end 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /include/libflatarray/detail/init_kernel.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2018 Google 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_INIT_KERNEL_HPP 9 | #define FLAT_ARRAY_DETAIL_INIT_KERNEL_HPP 10 | 11 | #include 12 | 13 | namespace LibFlatArray { 14 | 15 | namespace detail { 16 | 17 | namespace flat_array { 18 | 19 | #ifdef LIBFLATARRAY_WITH_CUDA 20 | #ifdef __CUDACC__ 21 | 22 | template 23 | __global__ 24 | void init_kernel(CELL source, CELL *target, long count) 25 | { 26 | long thread_index = blockDim.x * blockIdx.x + threadIdx.x; 27 | if (thread_index >= count) { 28 | return; 29 | } 30 | 31 | target[thread_index] = source; 32 | } 33 | 34 | #endif 35 | #endif 36 | 37 | } 38 | 39 | } 40 | 41 | } 42 | 43 | #endif 44 | 45 | -------------------------------------------------------------------------------- /include/libflatarray/detail/sqrt_reference.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_SQRT_REFERENCE_HPP 9 | #define FLAT_ARRAY_DETAIL_SQRT_REFERENCE_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | template 14 | class sqrt_reference; 15 | 16 | template 17 | short_vec operator/(const sqrt_reference& a, const short_vec& b) 18 | { 19 | return short_vec(a) / b; 20 | } 21 | 22 | template 23 | inline short_vec operator/(const sqrt_reference& a, const CARGO b) 24 | { 25 | return short_vec(a) / b; 26 | } 27 | 28 | } 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /include/libflatarray/detail/sibling_short_vec_switch.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_SIBLING_SHORT_VEC_SWITCH_HPP 9 | #define FLAT_ARRAY_DETAIL_SIBLING_SHORT_VEC_SWITCH_HPP 10 | 11 | namespace LibFlatArray { 12 | namespace detail { 13 | namespace flat_array { 14 | 15 | template 16 | class sibling_short_vec_switch; 17 | 18 | template< 19 | template class SHORT_VEC_TEMPLATE, 20 | typename CARGO, 21 | std::size_t ARITY, 22 | std::size_t TARGET_ARITY> 23 | class sibling_short_vec_switch, TARGET_ARITY> 24 | { 25 | public: 26 | typedef SHORT_VEC_TEMPLATE VALUE; 27 | }; 28 | 29 | } 30 | } 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /include/libflatarray/detail/streaming_short_vec_switch.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_STREAMING_SHORT_VEC_SWITCH_HPP 9 | #define FLAT_ARRAY_DETAIL_STREAMING_SHORT_VEC_SWITCH_HPP 10 | 11 | #include 12 | #include 13 | 14 | namespace LibFlatArray { 15 | namespace detail { 16 | namespace flat_array { 17 | 18 | template 19 | class streaming_short_vec_switch 20 | { 21 | public: 22 | typedef streaming_short_vec VALUE; 23 | }; 24 | 25 | template 26 | class streaming_short_vec_switch 27 | { 28 | public: 29 | typedef short_vec VALUE; 30 | }; 31 | 32 | } 33 | } 34 | } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /examples/lbm/main.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013-2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | #include "cell.h" 12 | #include "util.h" 13 | #include "update_lbm_classic.h" 14 | #include "update_lbm_object_oriented.h" 15 | #include "update_lbm_cuda_flat_array.h" 16 | 17 | int main(int argc, char **argv) 18 | { 19 | if (argc != 2) { 20 | std::cerr << "usage: " << argv[0] << " CUDA_DEVICE\n"; 21 | return 1; 22 | } 23 | 24 | std::stringstream s; 25 | s << argv[1]; 26 | int cudaDevice; 27 | s >> cudaDevice; 28 | cudaSetDevice(cudaDevice); 29 | 30 | std::cout << "# test name ; dim ; performance\n"; 31 | benchmark_lbm_cuda_object_oriented().evaluate(); 32 | benchmark_lbm_cuda_classic().evaluate(); 33 | benchmark_lbm_cuda_flat_array().evaluate(); 34 | 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /include/libflatarray/member_ptr_to_offset.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_MEMBER_PTR_TO_OFFSET_HPP 9 | #define FLAT_ARRAY_MEMBER_PTR_TO_OFFSET_HPP 10 | 11 | #include 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | /** 17 | * Lets user code discover a member's offset in the SoA layout from 18 | * the member pointer of the original cell type. See test 19 | * TestMemberPtrToOffset for an explanation. 20 | * 21 | * Will be instantiated by LIBFLATARRAY_REGISTER_SOA(). 22 | */ 23 | class member_ptr_to_offset 24 | { 25 | public: 26 | template 27 | int operator()(MEMBER_TYPE CELL_TYPE:: *member_ptr) 28 | { 29 | return detail::flat_array::offset< 30 | CELL_TYPE, 31 | number_of_members::VALUE>()(member_ptr); 32 | } 33 | }; 34 | 35 | } 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /include/libflatarray/alignment.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_ALIGNMENT_HPP 9 | #define FLAT_ARRAY_ALIGNMENT_HPP 10 | 11 | #include 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | template 17 | class alignment; 18 | 19 | template 20 | class alignment > 21 | { 22 | public: 23 | typedef typename short_vec::strategy strategy; 24 | typedef typename strategy::template alignment align; 25 | const static std::size_t VALUE = align::ALIGNMENT; 26 | }; 27 | 28 | template 29 | class alignment > 30 | { 31 | public: 32 | typedef typename short_vec::strategy strategy; 33 | typedef typename strategy::template alignment align; 34 | const static std::size_t VALUE = align::ALIGNMENT; 35 | }; 36 | 37 | } 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /examples/smoothed_particle_hydrodynamics/kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBFLATARRAY_EXAMPLES_SMOOTHED_PARTICLE_HYDRODYNAMICS_KERNELS_H 2 | #define LIBFLATARRAY_EXAMPLES_SMOOTHED_PARTICLE_HYDRODYNAMICS_KERNELS_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | void compute_density( 9 | int n, 10 | float *rho, 11 | float *pos_x, 12 | float *pos_y, 13 | float h, 14 | float mass); 15 | 16 | void compute_accel( 17 | int n, 18 | float *rho, 19 | float *pos_x, 20 | float *pos_y, 21 | float *v_x, 22 | float *v_y, 23 | float *a_x, 24 | float *a_y, 25 | float mass, 26 | float g, 27 | float h, 28 | float k, 29 | float rho0, 30 | float mu); 31 | 32 | void leapfrog( 33 | int n, 34 | float *pos_x, 35 | float *pos_y, 36 | float *v_x, 37 | float *v_y, 38 | float *a_x, 39 | float *a_y, 40 | double dt); 41 | 42 | void reflect_bc( 43 | int n, 44 | float *pos_x, 45 | float *pos_y, 46 | float *v_x, 47 | float *v_y); 48 | 49 | #ifdef __cplusplus 50 | } 51 | #endif 52 | 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /include/libflatarray/testbed/gpu_benchmark.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_TESTBED_GPU_BENCHMARK_HPP 9 | #define FLAT_ARRAY_TESTBED_GPU_BENCHMARK_HPP 10 | 11 | #include 12 | 13 | // disable certain warnings from system headers when compiling with 14 | // Microsoft Visual Studio: 15 | #ifdef _MSC_BUILD 16 | #pragma warning( push ) 17 | #pragma warning( disable : 4514 ) 18 | #endif 19 | 20 | #include 21 | 22 | #ifdef _MSC_BUILD 23 | #pragma warning( pop ) 24 | #endif 25 | 26 | namespace LibFlatArray { 27 | 28 | class gpu_benchmark : benchmark 29 | { 30 | public: 31 | std::string order() 32 | { 33 | return "GPU"; 34 | } 35 | 36 | std::string device() 37 | { 38 | int cudaDevice; 39 | cudaGetDevice(&cudaDevice); 40 | cudaDeviceProp properties; 41 | cudaGetDeviceProperties(&properties, cudaDevice); 42 | std::string cudaDeviceID = properties.name; 43 | 44 | return cudaDeviceID; 45 | } 46 | }; 47 | 48 | 49 | } 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /include/libflatarray/flat_array.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012-2016 Andreas Schäfer 3 | * Copyright 2017 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_FLAT_ARRAY_HPP 10 | #define FLAT_ARRAY_FLAT_ARRAY_HPP 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #ifdef __CUDACC__ 17 | #include 18 | #endif 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /include/libflatarray/detail/simple_streak.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_SIMPLE_STREAK_HPP 9 | #define FLAT_ARRAY_DETAIL_SIMPLE_STREAK_HPP 10 | 11 | // Don't warn about these functions being stripped from an executable 12 | // as they're not being used, that's actually expected behavior. 
13 | #ifdef _MSC_BUILD 14 | #pragma warning( push ) 15 | #pragma warning( disable : 4514 ) 16 | #endif 17 | 18 | namespace LibFlatArray { 19 | 20 | namespace detail { 21 | 22 | namespace flat_array { 23 | 24 | class simple_streak { 25 | public: 26 | explicit simple_streak(std::size_t x = 0, std::size_t y = 0, std::size_t z = 0, std::size_t count = 0) : 27 | count(count) 28 | { 29 | origin[0] = x; 30 | origin[1] = y; 31 | origin[2] = z; 32 | } 33 | 34 | std::size_t length() const 35 | { 36 | return count; 37 | } 38 | 39 | std::size_t origin[3]; 40 | std::size_t count; 41 | }; 42 | 43 | } 44 | 45 | } 46 | 47 | } 48 | 49 | #ifdef _MSC_BUILD 50 | #pragma warning( pop ) 51 | #endif 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /examples/jacobi/update_c99.c: -------------------------------------------------------------------------------- 1 | #ifdef __ICC 2 | #include 3 | #endif 4 | 5 | /** 6 | * Recommended reference for multi-dimensional array handling in C99 7 | * by Jeff Hammond: 8 | * 9 | * https://github.com/jeffhammond/HPCInfo/blob/master/c99/array3d.c 10 | */ 11 | void update_c99(double *data_new, const double *data_old, int dim_x, int dim_y, int dim_z) 12 | { 13 | // cast types here to maintain a C++-compatible signature: 14 | double (* const restrict grid_old)[dim_y][dim_x] = (double (* const)[dim_y][dim_x])data_old; 15 | double (* restrict grid_new)[dim_y][dim_x] = (double (* )[dim_y][dim_x])data_new; 16 | 17 | #pragma omp parallel for schedule(static) 18 | for (int z = 1; z < (dim_z - 1); ++z) { 19 | for (int y = 1; y < (dim_y - 1); ++y) { 20 | #ifdef __ICC 21 | #pragma vector always nontemporal 22 | #endif 23 | for (int x = 1; x < (dim_x - 1); ++x) { 24 | grid_new[z][y][x] = 25 | (grid_old[z - 1][y ][x ] + 26 | grid_old[z ][y - 1][x ] + 27 | grid_old[z ][y ][x - 1] + 28 | grid_old[z ][y ][x + 1] + 29 | grid_old[z ][y + 1][x ] + 30 | grid_old[z + 1][y ][x ]) * (1.0 / 6.0); 31 | } 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Boost Software License - Version 1.0 - August 17, 2003 2 | 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | 10 | The copyright notices in the Software and this entire statement, including 11 | the above license grant, this restriction and the following disclaimer, 12 | must be included in all copies of the Software, in whole or in part, and 13 | all derivative works of the Software, unless such copies or derivative 14 | works are solely in the form of machine-executable object code generated by 15 | a source language processor. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT 20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /include/libflatarray/soa_accessor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_SOA_ACCESSOR_HPP 9 | #define FLAT_ARRAY_SOA_ACCESSOR_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | /** 14 | * This class provides an object-oriented view to a "Struct of 15 | * Arrays"-style grid. It requires the user to register the type CELL 16 | * using the macro LIBFLATARRAY_REGISTER_SOA. 17 | * 18 | * All registered members will be available via functions of the same 19 | * name, so if "Cell" had two members "float a" and "char b", then 20 | * these would be accessible via soa_accessor::a() and 21 | * soa_accessor::b(). 22 | * 23 | * soa_accessor<> also provides an operator[] which can be used to 24 | * access neighboring cells. 25 | */ 26 | template 27 | class soa_accessor; 28 | 29 | template 30 | class const_soa_accessor; 31 | 32 | template 33 | class soa_accessor_light; 34 | 35 | template 36 | class const_soa_accessor_light; 37 | 38 | } 39 | 40 | #endif 41 | 42 | -------------------------------------------------------------------------------- /include/libflatarray/detail/offset.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_OFFSET_HPP 10 | #define FLAT_ARRAY_DETAIL_OFFSET_HPP 11 | 12 | // disable certain warnings from system headers when compiling with 13 | // Microsoft Visual Studio: 14 | #ifdef _MSC_BUILD 15 | #pragma warning( push ) 16 | #pragma warning( disable : 4514 4548 4626 4710 4711 4820 4996 5027 ) 17 | #endif 18 | 19 | #include 20 | 21 | #ifdef _MSC_BUILD 22 | #pragma warning( pop ) 23 | #endif 24 | 25 | #ifdef _MSC_BUILD 26 | #pragma warning( push ) 27 | #pragma warning( disable : 4710 4711 ) 28 | #endif 29 | 30 | namespace LibFlatArray { 31 | 32 | namespace detail { 33 | 34 | namespace flat_array { 35 | 36 | template 37 | class offset; 38 | 39 | template 40 | class offset 41 | { 42 | public: 43 | static const long OFFSET = 0; 44 | 45 | template 46 | int operator()(MEMBER_TYPE CELL::* /* member_ptr */) 47 | { 48 | throw std::invalid_argument("member was not registered with LibFlatArray"); 49 | } 50 | }; 51 | 52 | } 53 | 54 | } 55 | 56 | } 57 | 58 | #ifdef _MSC_BUILD 59 | #pragma warning( pop ) 60 | #endif 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /include/libflatarray/cuda_allocator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0.
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_CUDA_ALLOCATOR_HPP 9 | #define FLAT_ARRAY_CUDA_ALLOCATOR_HPP 10 | 11 | #ifdef __CUDACC__ 12 | 13 | #ifdef __ICC 14 | // disabling this warning, as implicit type conversion here is an intended feature for dim3 15 | #pragma warning push 16 | #pragma warning (disable: 2304) 17 | #endif 18 | 19 | // disable certain warnings from system headers when compiling with 20 | // Microsoft Visual Studio: 21 | #ifdef _MSC_BUILD 22 | #pragma warning( push ) 23 | #pragma warning( disable : 4514 ) 24 | #endif 25 | 26 | #include 27 | 28 | #ifdef _MSC_BUILD 29 | #pragma warning( pop ) 30 | #endif 31 | 32 | #ifdef __ICC 33 | #pragma warning pop 34 | #endif 35 | 36 | namespace LibFlatArray { 37 | 38 | template 39 | class cuda_allocator 40 | { 41 | public: 42 | typedef ptrdiff_t difference_type; 43 | typedef T* pointer; 44 | typedef const T* const_pointer; 45 | typedef T& reference; 46 | typedef const T& const_reference; 47 | typedef T value_type; 48 | 49 | pointer allocate(std::size_t n, const void* = 0) 50 | { 51 | pointer ret; 52 | cudaMalloc(&ret, n * sizeof(T)); 53 | return ret; 54 | } 55 | 56 | void deallocate(pointer p, std::size_t) 57 | { 58 | cudaFree(p); 59 | } 60 | }; 61 | 62 | } 63 | 64 | #endif 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /test/cuda_allocator_test.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | #include "test.hpp" 12 | 13 | using namespace LibFlatArray; 14 | 15 | ADD_TEST(basic) 16 | { 17 | cuda_allocator allocator; 18 | 19 | double *devArray1 = allocator.allocate( 50); 20 | double *devArray2 = allocator.allocate(110); 21 | BOOST_TEST(devArray1 != devArray2); 22 | 23 | std::vector hostArray1(120, -1); 24 | std::vector hostArray2(130, -2); 25 | 26 | for (int i = 0; i < 50; ++i) { 27 | hostArray1[i] = i + 0.5; 28 | 29 | BOOST_TEST(hostArray2[i] == -2); 30 | } 31 | 32 | std::size_t byteSize = 50 * sizeof(double); 33 | cudaMemcpy(devArray1, &hostArray1[0], byteSize, cudaMemcpyHostToDevice); 34 | cudaMemcpy(devArray2, devArray1, byteSize, cudaMemcpyDeviceToDevice); 35 | cudaMemcpy(&hostArray2[0], devArray2, byteSize, cudaMemcpyDeviceToHost); 36 | 37 | for (int i = 0; i < 50; ++i) { 38 | double expected = i + 0.5; 39 | BOOST_TEST(hostArray2[i] == expected); 40 | } 41 | } 42 | 43 | ADD_TEST(null_allocation) 44 | { 45 | cuda_allocator allocator; 46 | double *p = allocator.allocate(0); 47 | allocator.deallocate(p, 0); 48 | BOOST_TEST(p == 0); 49 | } 50 | 51 | int main(int argc, char **argv) 52 | { 53 | return 0; 54 | } 55 | -------------------------------------------------------------------------------- /include/libflatarray/detail/generate_cuda_launch_config.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0.
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_GENERATE_CUDA_LAUNCH_CONFIG_HPP 10 | #define FLAT_ARRAY_DETAIL_GENERATE_CUDA_LAUNCH_CONFIG_HPP 11 | 12 | #include 13 | 14 | #ifdef LIBFLATARRAY_WITH_CUDA 15 | #ifdef __CUDACC__ 16 | 17 | namespace LibFlatArray { 18 | 19 | namespace detail { 20 | 21 | namespace flat_array { 22 | 23 | /** 24 | * Returns a somewhat sensible decomposition of the grid into thread 25 | * blocks for launching CUDA kernels. 26 | */ 27 | class generate_cuda_launch_config 28 | { 29 | public: 30 | void operator()(dim3 *grid_dim, dim3 *block_dim, int x, int y, int z) 31 | { 32 | if (y >= 4) { 33 | *block_dim = dim3(128, 4, 1); 34 | } else { 35 | *block_dim = dim3(512, 1, 1); 36 | } 37 | 38 | grid_dim->x = divide_and_round_up(x, block_dim->x); 39 | grid_dim->y = divide_and_round_up(y, block_dim->y); 40 | grid_dim->z = divide_and_round_up(z, block_dim->z); 41 | } 42 | 43 | private: 44 | int divide_and_round_up(int i, int dividend) 45 | { 46 | int ret = i / dividend; 47 | if (i % dividend) { 48 | ret += 1; 49 | } 50 | 51 | return ret; 52 | } 53 | }; 54 | 55 | } 56 | 57 | } 58 | 59 | } 60 | 61 | #endif 62 | #endif 63 | 64 | #endif 65 | 66 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ABOUT 2 | ===== 3 | 4 | LibFlatArray acts as a highly efficient multi-dimensional array of 5 | arbitrary objects (array of structs, AoS), but really uses a struct of 6 | arrays (SoA) memory layout. It's great for writing vectorized code and 7 | its lightning-fast iterators give you access to neighboring elements 8 | with zero address generation overhead. 9 | 10 | Use cases include: 11 | - computer simulations (e.g. stencil codes such as Lattice Boltzmann Methods) 12 | - image processing (e.g. Gaussian filters) 13 | - numerical methods (e.g. multiplication of complex matrices) 14 | 15 | The library is written in C++ and uses templates to shift the burden 16 | of address computation from runtime to compile time. It shares some 17 | infrastructure with its parent project LibGeoDecomp. 18 | 19 | Further information: 20 | - homepage: http://www.libgeodecomp.org/libflatarray.html 21 | - mailing list: http://www.libgeodecomp.org/mailing_lists.html 22 | - source repository: https://bitbucket.org/gentryx/libflatarray 23 | - contributors: see file "AUTHORS" 24 | 25 | DEPENDENCIES 26 | ============ 27 | 28 | - C++ compiler (min. C++98, tested with GCC's g++, Clang's clang++, 29 | and Intel's icpc) 30 | 31 | - CMake (min. 2.8.10) 32 | 33 | - build tool supported by CMake (e.g. make, ninja) 34 | 35 | BUILDING 36 | ======== 37 | 38 | For compiling LibFlatArray you'll need CMake (http://www.cmake.org) 39 | installed. We recommend an out-of-source build: 40 | 41 | BUILD_DIR=build/`uname -ms | sed s/\ /-/g` 42 | mkdir -p $BUILD_DIR 43 | cd $BUILD_DIR 44 | cmake ../../ 45 | make 46 | -------------------------------------------------------------------------------- /include/libflatarray/detail/copy_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_COPY_FUNCTOR_HPP 9 | #define FLAT_ARRAY_DETAIL_COPY_FUNCTOR_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | namespace detail { 14 | 15 | namespace flat_array { 16 | 17 | /** 18 | * Will copy all members of all grid cells by invoking std::copy on 19 | * all member instances. We can't just memcpy() as members may still 20 | * be C++ objects that need to run their copy c-tors for 21 | * allocation/deallocation 22 | */ 23 | template 24 | class copy_functor 25 | { 26 | public: 27 | copy_functor( 28 | std::size_t dim_x, 29 | std::size_t dim_y, 30 | std::size_t dim_z) : 31 | dim_x(dim_x), 32 | dim_y(dim_y), 33 | dim_z(dim_z) 34 | {} 35 | 36 | template 37 | void operator()(ACCESSOR1& source_accessor, ACCESSOR2 target_accessor) const 38 | { 39 | for (std::size_t z = 0; z < dim_z; ++z) { 40 | for (std::size_t y = 0; y < dim_y; ++y) { 41 | target_accessor.index() = ACCESSOR1::gen_index(0, y, z); 42 | source_accessor.index() = target_accessor.index(); 43 | target_accessor.copy_members(source_accessor, dim_x); 44 | } 45 | } 46 | } 47 | 48 | private: 49 | std::size_t dim_x; 50 | std::size_t dim_y; 51 | std::size_t dim_z; 52 | }; 53 | 54 | } 55 | 56 | } 57 | 58 | } 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /include/libflatarray/detail/generic_destruct.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_GENERIC_DESTRUCT_HPP 9 | #define FLAT_ARRAY_DETAIL_GENERIC_DESTRUCT_HPP 10 | 11 | // this fixes compilation for non-cuda builds 12 | #ifndef __host__ 13 | #define __host__ 14 | #endif 15 | 16 | #ifndef __device__ 17 | #define __device__ 18 | #endif 19 | 20 | // Don't warn about these functions being stripped from an executable 21 | // as they're not being used, that's actually expected behavior. 22 | #ifdef _MSC_BUILD 23 | #pragma warning( push ) 24 | #pragma warning( disable : 4514 ) 25 | #endif 26 | 27 | namespace LibFlatArray { 28 | 29 | namespace detail { 30 | 31 | namespace flat_array { 32 | 33 | template 34 | __host__ __device__ 35 | inline void generic_destruct(TYPENAME *member) 36 | { 37 | member->~TYPENAME(); 38 | } 39 | 40 | // primitive types don't have d-tors: 41 | __host__ __device__ 42 | inline void generic_destruct(char *) 43 | {} 44 | 45 | __host__ __device__ 46 | inline void generic_destruct(float *) 47 | {} 48 | 49 | __host__ __device__ 50 | inline void generic_destruct(double *) 51 | {} 52 | 53 | __host__ __device__ 54 | inline void generic_destruct(int *) 55 | {} 56 | 57 | __host__ __device__ 58 | inline void generic_destruct(unsigned *) 59 | {} 60 | 61 | __host__ __device__ 62 | inline void generic_destruct(long *) 63 | {} 64 | 65 | } 66 | 67 | } 68 | 69 | } 70 | 71 | #ifdef _MSC_BUILD 72 | #pragma warning( pop ) 73 | #endif 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /include/libflatarray/testbed/benchmark.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_TESTBED_BENCHMARK_HPP 10 | #define FLAT_ARRAY_TESTBED_BENCHMARK_HPP 11 | 12 | // disable certain warnings from system headers when compiling with 13 | // Microsoft Visual Studio: 14 | #ifdef _MSC_BUILD 15 | #pragma warning( push ) 16 | #pragma warning( disable : 4514 4548 4668 4711 4820 4996 ) 17 | #endif 18 | 19 | #include 20 | #include 21 | 22 | #ifdef _WIN32 23 | #include 24 | #else 25 | #include 26 | #endif 27 | 28 | #ifdef _MSC_BUILD 29 | #pragma warning( pop ) 30 | #endif 31 | 32 | namespace LibFlatArray { 33 | 34 | class benchmark 35 | { 36 | public: 37 | virtual ~benchmark() 38 | {} 39 | 40 | virtual std::string order() = 0; 41 | virtual std::string family() = 0; 42 | virtual std::string species() = 0; 43 | virtual double performance(std::vector dim) = 0; 44 | virtual std::string unit() = 0; 45 | virtual std::string device() = 0; 46 | 47 | static 48 | inline double time() 49 | { 50 | #ifdef _WIN32 51 | LARGE_INTEGER time; 52 | LARGE_INTEGER freq; 53 | QueryPerformanceCounter(&time); 54 | QueryPerformanceFrequency(&freq); 55 | return 1.0 * time.QuadPart / freq.QuadPart; 56 | #else 57 | timeval t; 58 | gettimeofday(&t, 0); 59 | return t.tv_sec + t.tv_usec * 1.0e-6; 60 | #endif 61 | } 62 | 63 | }; 64 | 65 | 66 | } 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /include/libflatarray/preprocessor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer, 3 | * heavily based on the Boost Preprocessor library by Paul Mensonides (copyright 2002) 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_PREPROCESSOR_HPP 10 | #define FLAT_ARRAY_PREPROCESSOR_HPP 11 | 12 | #include 13 | 14 | /** 15 | * Returns the element of LIST at position INDEX. Assumes 0-based 16 | * addressing. 17 | */ 18 | #define LIBFLATARRAY_ELEM(INDEX, LIST) LIBFLATARRAY_ELEM_I(INDEX, LIST) 19 | 20 | /** 21 | * Returns the length of LIST. LIST is assumed to be of the form 22 | * 23 | * (foo)(bar)(goo) 24 | * 25 | * i.e. all elements are enclosed in parentheses. 26 | */ 27 | #define LIBFLATARRAY_SIZE(LIST) LIBFLATARRAY_SIZE_I(LIBFLATARRAY_SIZE_0 LIST) 28 | 29 | // Expands to an empty string, useful for deleting arguments from a 30 | // list. 31 | #define LIBFLATARRAY_NULL(_) 32 | 33 | // Returns a list which is identical to LIST, but with the first 34 | // element removed. Will fail for empty lists. 35 | #define LIBFLATARRAY_DEQUEUE(LIST) LIBFLATARRAY_NULL LIST 36 | 37 | /** 38 | * Will instantiate MACRO for each element of LIST with three parameters: 39 | * 1. an integer index, starting at 0, 40 | * 2. DEFAULT_ARG 41 | * 3. the element of LIST at the given index. 42 | */ 43 | #define LIBFLATARRAY_FOR_EACH(MACRO, DEFAULT_ARG, LIST) LIBFLATARRAY_FOR_EACH_I(MACRO, DEFAULT_ARG, LIBFLATARRAY_DEQUEUE(LIST), LIST) 44 | 45 | /** 46 | * Will expand to A if the size of LIST is less than LENGTH. Will 47 | * expand to B if the number of elements in LIST is equal to or larger 48 | * than LENGTH.
49 | */ 50 | #define LIBFLATARRAY_IF_SHORTER(LIST, LENGTH, A, B) LIBFLATARRAY_IF_SHORTER_I(LIBFLATARRAY_IF_SHORTER_ ## LENGTH, LIST, A, B) 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /include/libflatarray/detail/set_byte_size_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_SET_BYTE_SIZE_FUNCTOR_HPP 9 | #define FLAT_ARRAY_DETAIL_SET_BYTE_SIZE_FUNCTOR_HPP 10 | 11 | #include 12 | 13 | namespace LibFlatArray { 14 | 15 | namespace detail { 16 | 17 | namespace flat_array { 18 | 19 | /** 20 | * This helper class uses the dimension specified in the accessor to 21 | * compute how many bytes a grid needs to allocate in memory. 22 | */ 23 | template 24 | class set_byte_size_functor 25 | { 26 | public: 27 | explicit set_byte_size_functor( 28 | std::size_t *byte_size, 29 | std::size_t *extent_x, 30 | std::size_t *extent_y, 31 | std::size_t *extent_z) : 32 | byte_size(byte_size), 33 | extent_x(extent_x), 34 | extent_y(extent_y), 35 | extent_z(extent_z) 36 | {} 37 | 38 | template 39 | void operator()(const soa_accessor& /* accessor */) const 40 | { 41 | // Overflow is fine here (it's actually to be expected for 42 | // 32-bit builds) as such large grids can't be instantiated at 43 | // runtime anyway. 44 | #ifdef _MSC_BUILD 45 | #pragma warning( push ) 46 | #pragma warning( disable : 4307 ) 47 | #endif 48 | 49 | *byte_size = aggregated_member_size::VALUE * DIM_X * DIM_Y * DIM_Z; 50 | 51 | #ifdef _MSC_BUILD 52 | #pragma warning( pop ) 53 | #endif 54 | 55 | *extent_x = DIM_X; 56 | *extent_y = DIM_Y; 57 | *extent_z = DIM_Z; 58 | } 59 | 60 | private: 61 | std::size_t *byte_size; 62 | std::size_t *extent_x; 63 | std::size_t *extent_y; 64 | std::size_t *extent_z; 65 | }; 66 | 67 | } 68 | 69 | } 70 | 71 | } 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /include/libflatarray/ilp_to_arity.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_ILP_TO_ARITY_HPP 9 | #define FLAT_ARRAY_ILP_TO_ARITY_HPP 10 | 11 | #include 12 | 13 | namespace LibFlatArray { 14 | 15 | /** 16 | * This class allows users to select the arity of a short_vec type by 17 | * specifying the desired degree of instruction level parallelism 18 | * (i.e. loop unrolling factor). For instance, setting ILP to 4 for 19 | * double on an AVX-capable CPU would yield short_vec, but 20 | * for an SSE-only CPU it would return a short_vec. 21 | */ 22 | template 23 | class ilp_to_arity 24 | { 25 | public: 26 | // Revert to scalar values when running on a CUDA device.
The 27 | // vector unit is much wider, but from a programming PoV it's 28 | // scalar: 29 | #ifdef __CUDA_ARCH__ 30 | static const std::size_t ARITY = 1; 31 | #else 32 | // for IBM Blue Gene/Q's QPX, which is mutually exclusive to 33 | // Intel/AMD's AVX/SSE or ARM's NEON ISAs: 34 | # ifdef __VECTOR4DOUBLE__ 35 | static const int BIT_WIDTH = 256; 36 | # endif 37 | 38 | // Dito for ARM NEON: 39 | # ifdef __ARM_NEON__ 40 | static const int BIT_WIDTH = 128; 41 | # endif 42 | 43 | // Only the case of the IBM PC is complicated. No thanks to you, 44 | // history! 45 | # if !defined(__CUDA_ARCH__) && !defined(__ARM_NEON__) && !defined(__MIC__) 46 | # ifdef LFA_AVX512_HELPER 47 | static const int BIT_WIDTH = 512; 48 | # else 49 | # ifdef __AVX__ 50 | static const int BIT_WIDTH = 256; 51 | # else 52 | # ifdef __SSE__ 53 | static const int BIT_WIDTH = 128; 54 | # else 55 | static const int BIT_WIDTH = sizeof(CARGO) * 8; 56 | # endif 57 | # endif 58 | # endif 59 | # endif 60 | static const std::size_t ARITY = ILP * BIT_WIDTH / sizeof(CARGO) / 8; 61 | #endif 62 | 63 | }; 64 | 65 | } 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /examples/lbm/update_lbm_cuda_flat_array.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013-2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_CUDA_FLAT_ARRAY_H 9 | #define LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_CUDA_FLAT_ARRAY_H 10 | 11 | #include 12 | #include 13 | 14 | #include "util.h" 15 | #include "cudalineupdatefunctorprototype.h" 16 | 17 | class benchmark_lbm_cuda_flat_array : public benchmark_lbm_cuda 18 | { 19 | virtual double cudaExec(int dim, dim3 dimBlock, dim3 dimGrid, int repeats) 20 | { 21 | LibFlatArray::soa_grid gridA(dim, dim, 256); 22 | LibFlatArray::soa_grid gridB(dim, dim, 256); 23 | // fixme: init grid? 
24 | 25 | char *dataA = gridA.data(); 26 | char *dataB = gridB.data(); 27 | 28 | char *buf; 29 | cudaMalloc(reinterpret_cast(&buf), gridA.byte_size()); 30 | gridA.set_data(buf); 31 | cudaMalloc(reinterpret_cast(&buf), gridB.byte_size()); 32 | gridB.set_data(buf); 33 | 34 | LibFlatArray::soa_grid *gridOld = &gridA; 35 | LibFlatArray::soa_grid *gridNew = &gridB; 36 | 37 | cudaDeviceSynchronize(); 38 | double t_start = LibFlatArray::benchmark::time(); 39 | 40 | CudaLineUpdateFunctorPrototype updater(dimBlock, dimGrid); 41 | 42 | for (int t = 0; t < repeats; ++t) { 43 | gridOld->callback(gridNew, updater); 44 | std::swap(gridOld, gridNew); 45 | } 46 | 47 | cudaDeviceSynchronize(); 48 | double t_end = LibFlatArray::benchmark::time(); 49 | check_cuda_error(); 50 | 51 | cudaFree(gridA.data()); 52 | cudaFree(gridB.data()); 53 | 54 | gridA.set_data(dataA); 55 | gridB.set_data(dataB); 56 | 57 | return t_end - t_start; 58 | } 59 | 60 | virtual std::string name() 61 | { 62 | return "lbm_cuda_flat_array"; 63 | } 64 | }; 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /examples/lbm/cudalineupdatefunctorprototype.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBFLATARRAY_EXAMPLES_LBM_CUDALINEUPDATEFUNCTORPROTOTYPE_H 2 | #define LIBFLATARRAY_EXAMPLES_LBM_CUDALINEUPDATEFUNCTORPROTOTYPE_H 3 | 4 | #include "cell.h" 5 | 6 | template 7 | __global__ 8 | void update(ACCESSOR1 accessor1, ACCESSOR2 accessor2) 9 | { 10 | ACCESSOR1 accessorOld(accessor1.data(), 0); 11 | ACCESSOR2 accessorNew(accessor2.data(), 0); 12 | 13 | CELL::updateLine( 14 | accessorOld, &accessorOld.index(), 15 | accessorNew, &accessorNew.index(), 2, 256 - 2); 16 | } 17 | 18 | template 19 | class CudaLineUpdateFunctorPrototypeImplementation 20 | { 21 | public: 22 | CudaLineUpdateFunctorPrototypeImplementation(dim3 dim_block, dim3 dim_grid) : 23 | dim_block(dim_block), 24 | dim_grid(dim_grid) 25 | {} 26 | 27 | template 28 | void operator()(ACCESSOR1 accessor1, ACCESSOR2 accessor2) const 29 | { 30 | update<<>>(accessor1, accessor2); 31 | } 32 | 33 | private: 34 | dim3 dim_block; 35 | dim3 dim_grid; 36 | }; 37 | 38 | template 39 | class CudaLineUpdateFunctorPrototype 40 | { 41 | public: 42 | CudaLineUpdateFunctorPrototype(dim3 dim_block, dim3 dim_grid) : 43 | dim_block(dim_block), 44 | dim_grid(dim_grid) 45 | {} 46 | 47 | template 48 | void operator()(ACCESSOR1 accessor1, ACCESSOR2 accessor2) const; 49 | 50 | private: 51 | dim3 dim_block; 52 | dim3 dim_grid; 53 | }; 54 | 55 | #define IMPLEMENTATION(CELL, X1, Y1, Z1, X2, Y2, Z2) \ 56 | template<> \ 57 | template<> \ 58 | void CudaLineUpdateFunctorPrototype::operator()( \ 59 | LibFlatArray::soa_accessor accessor1, \ 60 | LibFlatArray::soa_accessor accessor2) const \ 61 | { \ 62 | CudaLineUpdateFunctorPrototypeImplementation i(dim_block, dim_grid); \ 63 | i(accessor1, accessor2); \ 64 | } 65 | 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /include/libflatarray/detail/staging_buffer.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_STAGING_BUFFER_HPP 9 | #define FLAT_ARRAY_DETAIL_STAGING_BUFFER_HPP 10 | 11 | #include 12 | 13 | namespace LibFlatArray { 14 | 15 | namespace detail { 16 | 17 | namespace flat_array { 18 | 19 | /** 20 | * Dummy class which presents the same interface as cuda_array, 21 | * but won't actually buffer the data. Instead the pointers are 22 | * forwarded directly so no additional copies of the data need to 23 | * be made. 24 | */ 25 | template 26 | class staging_buffer 27 | { 28 | public: 29 | void resize(std::size_t /* unused */) 30 | { 31 | // intentionally left blank 32 | } 33 | 34 | void load(const CELL *new_data) 35 | { 36 | data_pointer = const_cast(new_data); 37 | } 38 | 39 | void save(CELL* /* new_data */) const 40 | { 41 | // intentionally left blank 42 | } 43 | 44 | const CELL *data() const 45 | { 46 | return data_pointer; 47 | } 48 | 49 | CELL *data() 50 | { 51 | return data_pointer; 52 | } 53 | 54 | void prep(CELL *new_data) 55 | { 56 | data_pointer = new_data; 57 | } 58 | private: 59 | CELL *data_pointer; 60 | }; 61 | 62 | #ifdef __CUDACC__ 63 | 64 | template 65 | class staging_buffer 66 | { 67 | public: 68 | void resize(std::size_t n) 69 | { 70 | delegate.resize(n); 71 | } 72 | 73 | void load(const CELL *new_data) 74 | { 75 | delegate.load(new_data); 76 | } 77 | 78 | void save(CELL *new_data) const 79 | { 80 | delegate.save(new_data); 81 | } 82 | 83 | const CELL *data() const 84 | { 85 | return delegate.data(); 86 | } 87 | 88 | CELL *data() 89 | { 90 | return delegate.data(); 91 | } 92 | 93 | void prep(CELL* /* new_data */) 94 | { 95 | // intentionally left blank 96 | } 97 | 98 | private: 99 | cuda_array delegate; 100 | }; 101 | 102 | #endif 103 | 104 | } 105 | 106 | } 107 | 108 | } 109 | 110 | #endif 111 | 112 | -------------------------------------------------------------------------------- /test/aligned_allocator_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | // globally disable some warnings with MSVC, that are issued not for a 9 | // specific header, but rather for the interaction of system headers 10 | // and LibFlatArray source: 11 | #ifdef _MSC_BUILD 12 | #pragma warning( disable : 4710 ) 13 | #endif 14 | 15 | #include 16 | 17 | // disable certain warnings from system headers when compiling with 18 | // Microsoft Visual Studio: 19 | #ifdef _MSC_BUILD 20 | #pragma warning( push ) 21 | #pragma warning( disable : 4514 ) 22 | #endif 23 | 24 | #include 25 | 26 | #ifdef _MSC_BUILD 27 | #pragma warning( pop ) 28 | #endif 29 | 30 | #include "test.hpp" 31 | 32 | using namespace LibFlatArray; 33 | 34 | ADD_TEST(test_alignment_64) 35 | { 36 | int *p = aligned_allocator().allocate(3); 37 | BOOST_TEST(0 == (long(p) % 64)); 38 | aligned_allocator().deallocate(p, 3); 39 | } 40 | 41 | ADD_TEST(test_alignment_128) 42 | { 43 | char *p = aligned_allocator().allocate(199); 44 | BOOST_TEST(0 == (long(p) % 128)); 45 | aligned_allocator().deallocate(p, 199); 46 | } 47 | 48 | ADD_TEST(test_alignment_512) 49 | { 50 | long *p = aligned_allocator().allocate(256); 51 | BOOST_TEST(0 == (long(p) % 512)); 52 | aligned_allocator().deallocate(p, 256); 53 | } 54 | 55 | ADD_TEST(test_usage_with_std_vector) 56 | { 57 | typedef std::vector > vec_type; 58 | vec_type vec(40, -1); 59 | 60 | BOOST_TEST(0 == (std::size_t(&vec[0])) % 64); 61 | 62 | for (vec_type::iterator i = vec.begin(); i != vec.end(); ++i) { 63 | BOOST_TEST(-1 == *i); 64 | } 65 | 66 | vec.resize(80); 67 | for (int i = 0; i < 80; ++i) { 68 | vec[std::size_t(i)] = 4711 + i; 69 | } 70 | for (int i = 0; i < 80; ++i) { 71 | BOOST_TEST((4711 + i) == vec[std::size_t(i)]); 72 | } 73 | 74 | vec.resize(0); 75 | for (int i = 0; i < 90; ++i) { 76 | vec.push_back(23 + i); 77 | } 78 | for (int i = 0; i < 90; ++i) { 79 | BOOST_TEST((23 + i) == vec[std::size_t(i)]); 80 | } 81 | 82 | vec.resize(0); 83 | vec.reserve(95); 84 | for (int i = 0; i < 95; ++i) { 85 | vec.push_back(69 + i); 86 | } 87 | for (int i = 0; i < 95; ++i) { 88 | BOOST_TEST((69 + i) == vec[std::size_t(i)]); 89 | } 90 | 91 | } 92 | 93 | int main(int /* argc */, char** /* argv */) 94 | { 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /include/libflatarray/streaming_short_vec.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_STREAMING_SHORT_VEC_HPP 10 | #define FLAT_ARRAY_STREAMING_SHORT_VEC_HPP 11 | 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | #ifdef __ICC 17 | // disabling this warning as implicit type conversion is exactly our goal here: 18 | #pragma warning push 19 | #pragma warning (disable: 2304) 20 | #endif 21 | 22 | template 23 | class streaming_short_vec; 24 | 25 | template 26 | inline bool any(const streaming_short_vec& vec) 27 | { 28 | return vec.any(); 29 | } 30 | 31 | // Don't warn about these functions being stripped from an executable 32 | // as they're not being used, that's actually expected behavior. 
33 | #ifdef _MSC_BUILD 34 | #pragma warning( push ) 35 | #pragma warning( disable : 4514 ) 36 | #endif 37 | 38 | /** 39 | * Wraps functionality of short_vec, but replaces all stores by 40 | * streaming (i.e. non-temporal) stores. Downside: all store addresses 41 | * must be aligned. 42 | */ 43 | template 44 | class streaming_short_vec : public short_vec 45 | { 46 | public: 47 | 48 | inline 49 | streaming_short_vec(const CARGO val = 0) : short_vec(val) 50 | {} 51 | 52 | inline 53 | streaming_short_vec(const CARGO *data) : short_vec(data) 54 | {} 55 | 56 | inline 57 | streaming_short_vec(short_vec&& val) : short_vec(std::move(val)) 58 | {} 59 | 60 | #ifdef LIBFLATARRAY_WITH_CPP14 61 | inline 62 | streaming_short_vec(const std::initializer_list& list) 63 | { 64 | const CARGO *ptr = static_cast(&(*list.begin())); 65 | load(ptr); 66 | } 67 | #endif 68 | 69 | using short_vec::load; 70 | 71 | inline 72 | void store(CARGO *data) 73 | { 74 | short_vec::store_nt(data); 75 | } 76 | 77 | inline 78 | void store_aligned(CARGO *data) 79 | { 80 | short_vec::store_nt(data); 81 | } 82 | }; 83 | 84 | #ifdef __ICC 85 | #pragma warning pop 86 | #endif 87 | 88 | template 89 | inline 90 | void operator<<(double *data, const streaming_short_vec& vec) 91 | { 92 | vec.store_nt(data); 93 | } 94 | 95 | #ifdef _MSC_BUILD 96 | #pragma warning( pop ) 97 | #endif 98 | 99 | } 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /include/libflatarray/detail/construct_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_CONSTRUCT_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_CONSTRUCT_FUNCTOR_HPP 11 | 12 | #include 13 | #include 14 | 15 | namespace LibFlatArray { 16 | 17 | namespace detail { 18 | 19 | namespace flat_array { 20 | 21 | /** 22 | * Will initialize all grid cells, relies on the SoA (Struct of 23 | * Arrays) accessor to initialize a cell's members individually. 
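 *
 * Usage sketch (illustrative only; "MyCell" stands for any cell type
 * registered via LIBFLATARRAY_REGISTER_SOA, and the owning container is
 * assumed to be the soa_grid facade of this library):
 *
 *   LibFlatArray::soa_grid<MyCell> grid(128, 64, 32);
 *   // while setting up its storage the grid applies construct_functor,
 *   // so each of the 128 * 64 * 32 cells is default-constructed in place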
24 | */ 25 | template 26 | class construct_functor 27 | { 28 | public: 29 | construct_functor( 30 | std::size_t dim_x, 31 | std::size_t dim_y, 32 | std::size_t dim_z) : 33 | dim_x(dim_x), 34 | dim_y(dim_y), 35 | dim_z(dim_z) 36 | {} 37 | 38 | template 39 | void operator()(soa_accessor& accessor) const 40 | { 41 | for (std::size_t z = 0; z < dim_z; ++z) { 42 | for (std::size_t y = 0; y < dim_y; ++y) { 43 | accessor.index() = soa_accessor::gen_index(0, y, z); 44 | 45 | for (std::size_t x = 0; x < dim_x; ++x) { 46 | accessor.construct_members(); 47 | ++accessor; 48 | } 49 | } 50 | } 51 | } 52 | 53 | private: 54 | std::size_t dim_x; 55 | std::size_t dim_y; 56 | std::size_t dim_z; 57 | }; 58 | 59 | #ifdef LIBFLATARRAY_WITH_CUDA 60 | #ifdef __CUDACC__ 61 | 62 | template 63 | __global__ 64 | void construct_kernel(char *data, long dim_x, long dim_y, long dim_z) 65 | { 66 | long x = blockDim.x * blockIdx.x + threadIdx.x; 67 | long y = blockDim.y * blockIdx.y + threadIdx.y; 68 | long z = blockDim.z * blockIdx.z + threadIdx.z; 69 | 70 | if (x >= dim_x) { 71 | return; 72 | } 73 | 74 | if (y >= dim_y) { 75 | return; 76 | } 77 | 78 | if (z >= dim_z) { 79 | return; 80 | } 81 | 82 | typedef soa_accessor_light accessor_type; 83 | 84 | long index = accessor_type::gen_index(x, y, z); 85 | accessor_type accessor(data, index); 86 | accessor.construct_members(); 87 | } 88 | 89 | /** 90 | * Specialization for CUDA 91 | */ 92 | template 93 | class construct_functor 94 | { 95 | public: 96 | construct_functor( 97 | std::size_t dim_x, 98 | std::size_t dim_y, 99 | std::size_t dim_z) : 100 | dim_x(dim_x), 101 | dim_y(dim_y), 102 | dim_z(dim_z) 103 | {} 104 | 105 | template 106 | void operator()(soa_accessor& accessor) const 107 | { 108 | dim3 grid_dim; 109 | dim3 block_dim; 110 | generate_cuda_launch_config()(&grid_dim, &block_dim, dim_x, dim_y, dim_z); 111 | 112 | construct_kernel<<>>(accessor.data(), dim_x, dim_y, dim_z); 113 | } 114 | 115 | private: 116 | std::size_t dim_x; 117 | std::size_t dim_y; 118 | std::size_t dim_z; 119 | }; 120 | 121 | #endif 122 | #endif 123 | 124 | } 125 | 126 | } 127 | 128 | } 129 | 130 | #endif 131 | -------------------------------------------------------------------------------- /include/libflatarray/detail/destroy_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_DESTROY_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_DESTROY_FUNCTOR_HPP 11 | 12 | #include 13 | #include 14 | 15 | namespace LibFlatArray { 16 | 17 | namespace detail { 18 | 19 | namespace flat_array { 20 | 21 | /** 22 | * Will call the destructor on all grid cells, relies on the SoA 23 | * (Struct of Arrays) accessor to destroy a cell's members 24 | * individually. 
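 *
 * This functor is the counterpart to construct_functor: a container that
 * has constructed cells in a region of SoA storage is expected to apply
 * destroy_functor to that same region before releasing or shrinking the
 * storage, so that member destructors run exactly once per cell.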
25 | */ 26 | template 27 | class destroy_functor 28 | { 29 | public: 30 | destroy_functor( 31 | std::size_t dim_x, 32 | std::size_t dim_y, 33 | std::size_t dim_z) : 34 | dim_x(dim_x), 35 | dim_y(dim_y), 36 | dim_z(dim_z) 37 | {} 38 | 39 | template 40 | void operator()(soa_accessor& accessor) const 41 | { 42 | for (std::size_t z = 0; z < dim_z; ++z) { 43 | for (std::size_t y = 0; y < dim_y; ++y) { 44 | accessor.index() = long(soa_accessor::gen_index(0, y, z)); 45 | 46 | for (std::size_t x = 0; x < dim_x; ++x) { 47 | accessor.destroy_members(); 48 | ++accessor; 49 | } 50 | } 51 | } 52 | } 53 | 54 | private: 55 | std::size_t dim_x; 56 | std::size_t dim_y; 57 | std::size_t dim_z; 58 | }; 59 | 60 | #ifdef LIBFLATARRAY_WITH_CUDA 61 | #ifdef __CUDACC__ 62 | 63 | template 64 | __global__ 65 | void destroy_kernel(char *data, long dim_x, long dim_y, long dim_z) 66 | { 67 | long x = blockDim.x * blockIdx.x + threadIdx.x; 68 | long y = blockDim.y * blockIdx.y + threadIdx.y; 69 | long z = blockDim.z * blockIdx.z + threadIdx.z; 70 | 71 | if (x >= dim_x) { 72 | return; 73 | } 74 | 75 | if (y >= dim_y) { 76 | return; 77 | } 78 | 79 | if (z >= dim_z) { 80 | return; 81 | } 82 | 83 | typedef soa_accessor_light accessor_type; 84 | 85 | long index = accessor_type::gen_index(x, y, z); 86 | accessor_type accessor(data, index); 87 | accessor.destroy_members(); 88 | } 89 | 90 | /** 91 | * Specialization for CUDA 92 | */ 93 | template 94 | class destroy_functor 95 | { 96 | public: 97 | destroy_functor( 98 | std::size_t dim_x, 99 | std::size_t dim_y, 100 | std::size_t dim_z) : 101 | dim_x(dim_x), 102 | dim_y(dim_y), 103 | dim_z(dim_z) 104 | {} 105 | 106 | template 107 | void operator()(soa_accessor& accessor) const 108 | { 109 | dim3 grid_dim; 110 | dim3 block_dim; 111 | generate_cuda_launch_config()(&grid_dim, &block_dim, dim_x, dim_y, dim_z); 112 | 113 | destroy_kernel<<>>(accessor.data(), dim_x, dim_y, dim_z); 114 | } 115 | 116 | private: 117 | std::size_t dim_x; 118 | std::size_t dim_y; 119 | std::size_t dim_z; 120 | }; 121 | 122 | #endif 123 | #endif 124 | 125 | } 126 | 127 | } 128 | 129 | } 130 | 131 | #endif 132 | 133 | -------------------------------------------------------------------------------- /include/libflatarray/detail/dual_callback_helper.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_DUAL_CALLBACK_HELPER_HPP 9 | #define FLAT_ARRAY_DETAIL_DUAL_CALLBACK_HELPER_HPP 10 | 11 | namespace LibFlatArray { 12 | 13 | namespace detail { 14 | 15 | namespace flat_array { 16 | 17 | template 18 | class dual_callback_helper2 19 | { 20 | public: 21 | dual_callback_helper2(ACCESSOR1& accessor1, FUNCTOR& functor) : 22 | accessor1(accessor1), 23 | functor(functor) 24 | {} 25 | 26 | template 27 | void operator()(ACCESSOR2& accessor2) 28 | { 29 | functor(accessor1, accessor2); 30 | } 31 | 32 | private: 33 | ACCESSOR1& accessor1; 34 | FUNCTOR& functor; 35 | }; 36 | 37 | template 38 | class dual_callback_helper1 39 | { 40 | public: 41 | dual_callback_helper1(GRID2 *grid2, const FUNCTOR& functor) : 42 | grid2(grid2), 43 | functor(functor) 44 | {} 45 | 46 | template 47 | void operator()(ACCESSOR1& accessor1) const 48 | { 49 | dual_callback_helper2 helper(accessor1, functor); 50 | grid2->callback(helper); 51 | } 52 | 53 | private: 54 | GRID2 *grid2; 55 | FUNCTOR& functor; 56 | }; 57 | 58 | class dual_callback_helper 59 | { 60 | public: 61 | template 62 | void operator()(GRID1 *gridOld, GRID2 *gridNew, FUNCTOR& functor) 63 | { 64 | dual_callback_helper1 helper(gridNew, functor); 65 | gridOld->callback(helper); 66 | } 67 | }; 68 | 69 | template 70 | class dual_callback_helper_symmetric 71 | { 72 | public: 73 | dual_callback_helper_symmetric(GRID_TYPE *other_grid, FUNCTOR& functor) : 74 | other_grid(other_grid), 75 | functor(functor) 76 | {} 77 | 78 | template 79 | void operator()(ACCESSOR& accessor1) const 80 | { 81 | ACCESSOR accessor2(other_grid->data()); 82 | 83 | functor(accessor1, accessor2); 84 | } 85 | 86 | private: 87 | GRID_TYPE *other_grid; 88 | FUNCTOR& functor; 89 | }; 90 | 91 | // Hardwire this warning to off as MSVC would otherwise complain about 92 | // an assignment operator missing -- which is clearly there: 93 | #ifdef _MSC_BUILD 94 | #pragma warning( push ) 95 | #pragma warning( disable : 4626 4710 ) 96 | #endif 97 | 98 | template 99 | class const_dual_callback_helper_symmetric 100 | { 101 | public: 102 | 103 | #ifdef LIBFLATARRAY_WITH_CPP14 104 | inline const_dual_callback_helper_symmetric(const const_dual_callback_helper_symmetric& other) = default; 105 | inline const_dual_callback_helper_symmetric(const_dual_callback_helper_symmetric&& other) = default; 106 | #endif 107 | 108 | const_dual_callback_helper_symmetric(GRID_TYPE *other_grid, const FUNCTOR& functor) : 109 | other_grid(other_grid), 110 | functor(functor) 111 | {} 112 | 113 | template 114 | void operator()(ACCESSOR& accessor1) const 115 | { 116 | ACCESSOR accessor2(other_grid->data(), 0); 117 | 118 | functor(accessor1, accessor2); 119 | } 120 | 121 | private: 122 | GRID_TYPE *other_grid; 123 | const FUNCTOR& functor; 124 | }; 125 | 126 | #ifdef _MSC_BUILD 127 | #pragma warning( pop ) 128 | #endif 129 | 130 | } 131 | 132 | } 133 | 134 | } 135 | 136 | #endif 137 | -------------------------------------------------------------------------------- /CMakeModules/FindSilo.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2008-2012 Sandia Corporation, Kitware Inc. 2 | # Copyright (c) 2014-2014 Andreas Schäfer 3 | # 4 | # Sandia National Laboratories, New Mexico 5 | # PO Box 5800 6 | # Albuquerque, NM 87185 7 | # 8 | # Kitware Inc. 
9 | # 28 Corporate Drive 10 | # Clifton Park, NY 12065 11 | # USA 12 | # 13 | # Andreas Schäfer 14 | # Informatik 3 15 | # Martensstr. 3 16 | # 91058 Erlangen 17 | # Germany 18 | # 19 | # Under the terms of Contract DE-AC04-94AL85000, there is a 20 | # non-exclusive license for use of this work by or on behalf of the 21 | # U.S. Government. 22 | # 23 | # Redistribution and use in source and binary forms, with or without 24 | # modification, are permitted provided that the following conditions are 25 | # met: 26 | # 27 | # * Redistributions of source code must retain the above copyright 28 | # notice, this list of conditions and the following disclaimer. 29 | # 30 | # * Redistributions in binary form must reproduce the above copyright 31 | # notice, this list of conditions and the following disclaimer in the 32 | # documentation and/or other materials provided with the 33 | # distribution. 34 | # 35 | # * Neither the name of Kitware nor the names of any contributors may 36 | # be used to endorse or promote products derived from this software 37 | # without specific prior written permission. 38 | # 39 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 40 | # ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 41 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 42 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR 43 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 44 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 45 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 46 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 47 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 48 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 49 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 50 | # ======================================================================== 51 | # 52 | # Try to find Silo library and headers. Define Silo_ROOT if Silo is 53 | # installed in a non-standard directory. 54 | # 55 | # This file sets the following variables: 56 | # 57 | # Silo_INCLUDE_DIR, where to find silo.h, etc. 58 | # Silo_LIBRARIES, the libraries to link against 59 | # Silo_FOUND, If false, do not try to use Silo. 60 | # 61 | # Also defined, but not for general use are: 62 | # Silo_LIBRARY, the full path to the silo library. 
63 | # Silo_INCLUDE_PATH, for CMake backward compatibility 64 | 65 | FIND_PATH( Silo_INCLUDE_DIR silo.h 66 | PATHS /usr/local/include 67 | /usr/include 68 | ${Silo_ROOT}/include 69 | ) 70 | 71 | FIND_LIBRARY( Silo_LIBRARY NAMES siloh5 silo 72 | PATHS /usr/lib 73 | /usr/lib64 74 | /usr/local/lib 75 | ${Silo_ROOT}/lib 76 | ${Silo_ROOT}/lib64 77 | ) 78 | 79 | SET(Silo_FOUND "NO" ) 80 | IF(Silo_INCLUDE_DIR) 81 | IF(Silo_LIBRARY) 82 | 83 | SET(Silo_LIBRARIES ${Silo_LIBRARY}) 84 | SET(Silo_FOUND "YES" ) 85 | 86 | ELSE(Silo_LIBRARY) 87 | IF(Silo_FIND_REQURIED) 88 | message(SEND_ERROR "Unable to find the requested Silo libraries.") 89 | ENDIF(Silo_FIND_REQURIED) 90 | ENDIF(Silo_LIBRARY) 91 | ENDIF(Silo_INCLUDE_DIR) 92 | 93 | # handle the QUIETLY and REQUIRED arguments and set Silo_FOUND to TRUE if 94 | # all listed variables are TRUE 95 | INCLUDE(FindPackageHandleStandardArgs) 96 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(Silo DEFAULT_MSG Silo_LIBRARY Silo_INCLUDE_DIR) 97 | 98 | MARK_AS_ADVANCED( 99 | Silo_INCLUDE_DIR 100 | Silo_LIBRARY 101 | ) 102 | -------------------------------------------------------------------------------- /test/loop_peeler_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | // globally disable some warnings with MSVC, that are issued not for a 10 | // specific header, but rather for the interaction of system headers 11 | // and LibFlatArray source: 12 | #ifdef _MSC_BUILD 13 | #pragma warning( disable : 4710 ) 14 | #endif 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "test.hpp" 23 | 24 | template 25 | LIBFLATARRAY_INLINE 26 | void scaler(int& i, int endX, double *data, double factor) 27 | { 28 | for (; i < endX; i += SHORT_VEC::ARITY) { 29 | SHORT_VEC vec(data + i); 30 | vec *= factor; 31 | (data + i) << vec; 32 | } 33 | } 34 | 35 | ADD_TEST(TestLoopPeelerFunctionality) 36 | { 37 | std::vector > foo; 38 | for (int i = 0; i < 123; ++i) { 39 | foo.push_back(1000 + i); 40 | } 41 | 42 | int x = 3; 43 | typedef LibFlatArray::short_vec short_vec_type; 44 | LIBFLATARRAY_LOOP_PEELER(short_vec_type, int, x, 113, scaler, &foo[0], 2.5); 45 | 46 | for (std::size_t i = 0; i < 123; ++i) { 47 | double expected = 1000 + i; 48 | if ((i >= 3) && (i < 113)) { 49 | expected *= 2.5; 50 | } 51 | 52 | BOOST_TEST_EQ(expected, foo[i]); 53 | } 54 | } 55 | 56 | ADD_TEST(TestLoopPeelerInteroperabilityWithStreamingShortVecs) 57 | { 58 | std::vector > foo; 59 | for (int i = 0; i < 1234; ++i) { 60 | foo.push_back(1000 + i); 61 | } 62 | 63 | int x = 13; 64 | typedef LibFlatArray::streaming_short_vec short_vec_type; 65 | LIBFLATARRAY_LOOP_PEELER(short_vec_type, int, x, 1113, scaler, &foo[0], 2.5); 66 | 67 | for (std::size_t i = 0; i < 1234; ++i) { 68 | double expected = 1000 + i; 69 | if ((i >= 13) && (i < 1113)) { 70 | expected *= 2.5; 71 | } 72 | 73 | BOOST_TEST_EQ(expected, foo[i]); 74 | } 75 | } 76 | 77 | #ifdef LIBFLATARRAY_WITH_CPP14 78 | #ifndef LIBFLATARRAY_WITH_CUDA 79 | #ifndef LIBFLATARRAY_WITH_FORCED_CPP11 80 | 81 | ADD_TEST(TestCpp14StyleLoopPeeler) 82 | { 83 | unsigned i = 5; 84 | unsigned end = 43; 85 | std::vector > foo(64, 0); 86 | 87 | // Actually MSVC is wrong here to assume we're not referencing 88 | // my_float in the following lamda. 
We're just not referencing its 89 | // value, just the type: 90 | #ifdef _MSC_BUILD 91 | #pragma warning( push ) 92 | #pragma warning( disable : 4100 ) 93 | #endif 94 | 95 | LibFlatArray::loop_peeler >(&i, end, [&foo](auto my_float, unsigned *i, unsigned end) { 96 | typedef decltype(my_float) FLOAT; 97 | for (; *i < end; *i += FLOAT::ARITY) { 98 | &foo[*i] << FLOAT(1.0); 99 | } 100 | }); 101 | 102 | #ifdef _MSC_BUILD 103 | #pragma warning( pop ) 104 | #endif 105 | 106 | 107 | for (std::size_t c = 0; c < 5; ++c) { 108 | BOOST_TEST_EQ(0.0, foo[c]); 109 | } 110 | for (std::size_t c = 5; c < 43; ++c) { 111 | BOOST_TEST_EQ(1.0, foo[c]); 112 | } 113 | for (std::size_t c = 43; c < 64; ++c) { 114 | BOOST_TEST_EQ(0.0, foo[c]); 115 | } 116 | } 117 | 118 | #endif 119 | #endif 120 | #endif 121 | 122 | int main(int /* argc */, char** /* argv */) 123 | { 124 | return 0; 125 | } 126 | -------------------------------------------------------------------------------- /examples/gauss/filter_c99.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * Copyright 2017 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifdef __ICC 10 | #include 11 | #endif 12 | 13 | #ifdef _MSC_BUILD 14 | #pragma warning( push ) 15 | #pragma warning( disable : 4514 ) 16 | #endif 17 | 18 | #include 19 | 20 | #ifdef _MSC_BUILD 21 | #pragma warning( pop ) 22 | #endif 23 | 24 | /** 25 | * Computes a 2D gaussian filter with a 5x5 stencil accross the YZ-plane. 26 | */ 27 | void filter_c99(double *data_new, const double *data_old, int dim_x, int dim_y, int dim_z) 28 | { 29 | // cast types here to maintain a C++-compatible signature: 30 | double (* const restrict grid_old)[dim_y][dim_x] = (double (* const)[dim_y][dim_x])data_old; 31 | double (* restrict grid_new)[dim_y][dim_x] = (double (* )[dim_y][dim_x])data_new; 32 | 33 | double weights[5][5]; 34 | double sum = 0; 35 | 36 | for (int y = 0; y < 5; ++y) { 37 | for (int x = 0; x < 5; ++x) { 38 | double x_component = x - 2; 39 | double y_component = y - 2; 40 | weights[y][x] = exp(-0.5 * (x_component * x_component + 41 | y_component * y_component)) / 2 / 3.14159265358979323846; 42 | sum += weights[y][x]; 43 | } 44 | } 45 | for (int y = 0; y < 5; ++y) { 46 | for (int x = 0; x < 5; ++x) { 47 | weights[y][x] /= sum; 48 | } 49 | } 50 | 51 | // we exploit symmetry to avoid redudant loads of weights: 52 | double weight_00 = weights[2][2]; 53 | double weight_01 = weights[2][1]; 54 | double weight_02 = weights[2][0]; 55 | double weight_11 = weights[1][1]; 56 | double weight_12 = weights[1][0]; 57 | double weight_22 = weights[0][0]; 58 | 59 | #pragma omp parallel for schedule(static) 60 | for (int z = 2; z < (dim_z - 2); ++z) { 61 | for (int y = 2; y < (dim_y - 2); ++y) { 62 | #ifdef __ICC 63 | #pragma vector always nontemporal 64 | #endif 65 | for (int x = 0; x < dim_x; ++x) { 66 | grid_new[z][y][x] = 67 | grid_old[z - 2][y - 2][x] * weight_22 + 68 | grid_old[z - 2][y - 1][x] * weight_12 + 69 | grid_old[z - 2][y + 0][x] * weight_02 + 70 | grid_old[z - 2][y + 1][x] * weight_12 + 71 | grid_old[z - 2][y + 2][x] * weight_22 + 72 | 73 | grid_old[z - 1][y - 2][x] * weight_12 + 74 | grid_old[z - 1][y - 1][x] * weight_11 + 75 | grid_old[z - 1][y + 0][x] * weight_01 + 76 | grid_old[z - 1][y + 1][x] * weight_11 + 77 | grid_old[z - 1][y + 2][x] * weight_12 + 78 | 79 | grid_old[z + 0][y - 2][x] * 
weight_02 + 80 | grid_old[z + 0][y - 1][x] * weight_01 + 81 | grid_old[z + 0][y + 0][x] * weight_00 + 82 | grid_old[z + 0][y + 1][x] * weight_01 + 83 | grid_old[z + 0][y + 2][x] * weight_02 + 84 | 85 | grid_old[z + 1][y - 2][x] * weight_12 + 86 | grid_old[z + 1][y - 1][x] * weight_11 + 87 | grid_old[z + 1][y + 0][x] * weight_01 + 88 | grid_old[z + 1][y + 1][x] * weight_11 + 89 | grid_old[z + 1][y + 2][x] * weight_12 + 90 | 91 | grid_old[z + 2][y - 2][x] * weight_22 + 92 | grid_old[z + 2][y - 1][x] * weight_12 + 93 | grid_old[z + 2][y + 0][x] * weight_02 + 94 | grid_old[z + 2][y + 1][x] * weight_12 + 95 | grid_old[z + 2][y + 2][x] * weight_22; 96 | } 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /examples/performance_tests/plot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | require 'pp' 3 | require 'set' 4 | 5 | def parse_logfile(filename) 6 | data = [] 7 | 8 | lines = File.readlines(filename) 9 | header = lines.shift[1..-1] 10 | header = header.split(";").map do |token| 11 | token.strip 12 | end 13 | 14 | lines.each do |line| 15 | tokens = line.split(";") 16 | entry = {} 17 | 18 | tokens.size.times do |i| 19 | entry[header[i]] = tokens[i].strip 20 | end 21 | 22 | data.push entry 23 | end 24 | 25 | return [data, header] 26 | end 27 | 28 | def gather_range(entries, column) 29 | ret = Set.new 30 | 31 | entries.each do |entry| 32 | ret.add entry[column] 33 | end 34 | 35 | return ret 36 | end 37 | 38 | def plot_jacobi(data, header) 39 | entries = data.find_all do |entry| 40 | entry["family"] == "JacobiD3Q7" 41 | end 42 | 43 | dimensions = gather_range(entries, "dimensions") 44 | species = gather_range(entries, "species") 45 | 46 | outfile = "test.png" 47 | x_label = "Grid Size" 48 | y_label = "GLUPS" 49 | plot_specs = [] 50 | datafile = "temp.dat" 51 | 52 | File.open(datafile, "w") do |file| 53 | dimensions.each do |dim| 54 | dim =~ /\((\d+),/ 55 | file.print "#{$1}" 56 | 57 | species.each do |species| 58 | entries.each do |entry| 59 | if (entry["dimensions"] == dim) && (entry["species"] == species) 60 | file.print " #{entry["perf"]}" 61 | end 62 | end 63 | end 64 | 65 | file.puts 66 | end 67 | end 68 | 69 | index = 2 70 | species.each do |s| 71 | plot_specs << [index, s] 72 | index += 1 73 | end 74 | 75 | plots = plot_specs.map do |column, title| 76 | "'#{datafile}' using 1:#{column} title '#{title}' with linespoints" 77 | end 78 | 79 | command = < 9 | #include 10 | #include 11 | 12 | #include "test.hpp" 13 | 14 | class CellWithArrayMember 15 | { 16 | public: 17 | __host__ 18 | __device__ 19 | inline 20 | explicit CellWithArrayMember(int j = 0) : 21 | j(j) 22 | { 23 | i[0] = j + 1; 24 | i[1] = j + 2; 25 | i[2] = j + 3; 26 | 27 | x[0] = j + 0.4; 28 | x[1] = j + 0.5; 29 | } 30 | 31 | __host__ 32 | __device__ 33 | inline 34 | CellWithArrayMember(int newI[3], double newX[2], int j) : 35 | j(j) 36 | { 37 | i[0] = newI[0]; 38 | i[1] = newI[1]; 39 | i[1] = newI[2]; 40 | 41 | x[0] = newX[0]; 42 | x[1] = newX[1]; 43 | } 44 | 45 | int i[3]; 46 | int j; 47 | double x[2]; 48 | }; 49 | 50 | LIBFLATARRAY_REGISTER_SOA(CellWithArrayMember, 51 | ((int)(i)(3)) 52 | ((int)(j)) 53 | ((double)(x)(2)) ) 54 | 55 | 56 | namespace LibFlatArray { 57 | 58 | typedef soa_array soa_array_type; 59 | 60 | __global__ 61 | void test_insert(soa_array_type *array) 62 | { 63 | int size = array->size(); 64 | for (int i = 0; i < size; ++i) { 65 | CellWithArrayMember cell = (*array)[i]; 66 | cell.i[0] += 10000; 67 
| cell.i[1] += 20000; 68 | cell.i[2] += 30000; 69 | 70 | (*array) << cell; 71 | } 72 | } 73 | 74 | __global__ 75 | void test_modify(soa_array_type *array) 76 | { 77 | int index = blockDim.x * blockIdx.x + threadIdx.x; 78 | if (index >= array->size()) { 79 | return; 80 | } 81 | 82 | (*array)[index].i()[0] += index; 83 | (*array)[index].i()[1] -= index; 84 | (*array)[index].i()[2] = 2011 + 2014; 85 | } 86 | 87 | ADD_TEST(TestCUDABasic) 88 | { 89 | soa_array_type host_array; 90 | 91 | for (int i = 0; i < 100; ++i) { 92 | CellWithArrayMember cell; 93 | cell.i[0] = i; 94 | cell.i[1] = i + 1000; 95 | cell.i[2] = i + 2000; 96 | host_array << cell; 97 | } 98 | 99 | soa_array_type *device_array = 0; 100 | cudaMalloc(&device_array, sizeof(soa_array_type)); 101 | cudaMemcpy(device_array, &host_array, sizeof(soa_array_type), cudaMemcpyHostToDevice); 102 | 103 | test_insert<<<1, 1>>>(device_array); 104 | cudaMemcpy(&host_array, device_array, sizeof(soa_array_type), cudaMemcpyDeviceToHost); 105 | 106 | for (int i = 0; i < 100; ++i) { 107 | BOOST_TEST((i + 0) == host_array[i + 0].i()[0]); 108 | BOOST_TEST((i + 1000) == host_array[i + 0].i()[1]); 109 | BOOST_TEST((i + 2000) == host_array[i + 0].i()[2]); 110 | 111 | BOOST_TEST((i + 10000) == host_array[i + 100].i()[0]); 112 | BOOST_TEST((i + 21000) == host_array[i + 100].i()[1]); 113 | BOOST_TEST((i + 32000) == host_array[i + 100].i()[2]); 114 | } 115 | 116 | test_modify<<<7, 32>>>(device_array); 117 | cudaMemcpy(&host_array, device_array, sizeof(soa_array_type), cudaMemcpyDeviceToHost); 118 | 119 | for (int i = 0; i < 100; ++i) { 120 | BOOST_TEST((i + i + 0) == host_array[i + 0].i()[0]); 121 | BOOST_TEST((0 + 1000) == host_array[i + 0].i()[1]); 122 | BOOST_TEST(( 4025) == host_array[i + 0].i()[2]); 123 | 124 | BOOST_TEST((i + i + 10100) == host_array[i + 100].i()[0]); 125 | BOOST_TEST((0 + 20900) == host_array[i + 100].i()[1]); 126 | BOOST_TEST(( 4025) == host_array[i + 100].i()[2]); 127 | } 128 | 129 | cudaFree(device_array); 130 | } 131 | 132 | } 133 | 134 | int main(int argc, char **argv) 135 | { 136 | return 0; 137 | } 138 | -------------------------------------------------------------------------------- /include/libflatarray/estimate_optimum_short_vec_type.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_ESTIMATE_OPTIMUM_SHORT_VEC_TYPE_HPP 9 | #define FLAT_ARRAY_ESTIMATE_OPTIMUM_SHORT_VEC_TYPE_HPP 10 | 11 | #include 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | /** 17 | * This class serves as a type switch to select an appropriate 18 | * short_vec type based on the machine architecture and working set 19 | * size. This is just a heuristic. Users are advised that an 20 | * analytical performance model can yield much better results. 21 | * 22 | * We're primarily concerned with two choices: temporal vs. 23 | * non-temporal stores and the arity of the vector type. Smaller 24 | * working sets should use short_vec if they fit well into the cache, 25 | * larger sets should use streaming_short_vec to benefit from 26 | * streaming stores. 27 | * 28 | * The arity of the vector type should not be smaller than the arity 29 | * of the supported assembly instructions (e.g. >=8 for AVX512 and 30 | * doubles).If the arity is larger then we effectively perform 31 | * loop-unrolling. 
This may be beneficial for architectures that 32 | * struggle with out-of-order execution as if lenghtens the loop body 33 | * and gives them more independent instructions to work on (e.g. Intel 34 | * Core 2). Modern Intel architectures however may suffer from 35 | * unrolling as this might make the loop body exceed the size of the 36 | * loop buffer which holds previously decoded microinstructions. 37 | * 38 | * Arguments should be: 39 | * 40 | * - CARGO: the main machine data type used inside the kernel, e.g. 41 | * float or double. Most kernels will operate on various data 42 | * types, but the vector arity should usually be chosen based on 43 | * that type which is used most as it has the strongest impact on 44 | * register scheduling. 45 | * 46 | * - ACCESSOR: an soa_accessor produced by LibFlatArray that provides 47 | * the number of elements in the working set. We assume the size 48 | * of the working set to be the product of the size of CARGO and 49 | * the number of elements in the set. 50 | * 51 | * - LAST_LEVEL_CACHE_SIZE_ESTIMATE: if available, the user can give 52 | * an estimate of the CPU's cache. Our hard-coded value will 53 | * overestimate that size for most architectures, but that's 54 | * generally fine. The consequence of overestimating is that for 55 | * some medium-sized sets the code will use temporal stores 56 | * instead of non-temporal stores, reulting in a performance hit 57 | * of less than 30% (true for most codes and current 58 | * architectures). Underestimating the cache size will result in 59 | * the use of steaming stores even if the working set would fit 60 | * just fine into the caches, easily resulting in a performance 61 | * hit of 1500% (e.g. 0.4 GLUPS instead of 6 GLUPS for a 3D Jacobi 62 | * on an Intel i7-6700HQ). Bottom line: never underestimate the 63 | * cache size! 64 | */ 65 | template 66 | class estimate_optimum_short_vec_type 67 | { 68 | public: 69 | static const std::size_t ARITY = ilp_to_arity::ARITY; 70 | 71 | // Overflow is fine here, it's an artifact of 32-bit builds: 72 | #ifdef _MSC_BUILD 73 | #pragma warning( push ) 74 | #pragma warning( disable : 4307 ) 75 | #endif 76 | 77 | static const long STREAMING_FLAG = 78 | ACCESSOR::DIM_PROD * sizeof(typename ACCESSOR::element_type) / LAST_LEVEL_CACHE_SIZE_ESTIMATE; 79 | 80 | #ifdef _MSC_BUILD 81 | #pragma warning( pop ) 82 | #endif 83 | 84 | typedef typename detail::flat_array::streaming_short_vec_switch::VALUE VALUE; 85 | }; 86 | 87 | } 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /include/libflatarray/detail/get_instance_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_GET_INSTANCE_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_GET_INSTANCE_FUNCTOR_HPP 11 | 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | namespace detail { 17 | 18 | namespace flat_array { 19 | 20 | /** 21 | * This helper class is used to retrieve objects from the SoA storage 22 | * with the help of an accessor. 
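 *
 * Callers normally do not instantiate this class directly. A sketch of the
 * intended use (assuming the soa_grid facade of this library; "MyCell" is a
 * hypothetical cell type registered via LIBFLATARRAY_REGISTER_SOA):
 *
 *   LibFlatArray::soa_grid<MyCell> grid(128, 64, 32);
 *   MyCell cell = grid.get(10, 20, 30);
 *   // get() invokes get_instance_functor through the grid's callback
 *   // mechanism, gathering the cell's members from their separate arrays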
23 | */ 24 | template 25 | class get_instance_functor 26 | { 27 | public: 28 | get_instance_functor( 29 | CELL *target, 30 | long x, 31 | long y, 32 | long z, 33 | long count) : 34 | target(target), 35 | x(x), 36 | y(y), 37 | z(z), 38 | count(count) 39 | {} 40 | 41 | get_instance_functor( 42 | CELL *target, 43 | std::size_t x, 44 | std::size_t y, 45 | std::size_t z, 46 | std::size_t count) : 47 | target(target), 48 | x(x), 49 | y(y), 50 | z(z), 51 | count(count) 52 | {} 53 | 54 | template 55 | void operator()(soa_accessor& accessor) const 56 | { 57 | typedef soa_accessor accessor_type; 58 | accessor.index() = accessor_type::gen_index(x, y, z); 59 | CELL *cursor = target; 60 | 61 | for (std::size_t i = 0; i < count; ++i) { 62 | accessor >> *cursor; 63 | ++cursor; 64 | ++accessor.index(); 65 | } 66 | } 67 | 68 | private: 69 | CELL *target; 70 | std::size_t x; 71 | std::size_t y; 72 | std::size_t z; 73 | std::size_t count; 74 | }; 75 | 76 | #ifdef LIBFLATARRAY_WITH_CUDA 77 | #ifdef __CUDACC__ 78 | 79 | template 80 | __global__ 81 | void get_kernel(CELL *target, const char *source, long count, long x, long y, long z) 82 | { 83 | long offset = blockDim.x * blockIdx.x + threadIdx.x; 84 | if (offset >= count) { 85 | return; 86 | } 87 | 88 | typedef const_soa_accessor_light accessor_type; 89 | 90 | long index = accessor_type::gen_index(x + offset, y, z); 91 | accessor_type accessor(source, index); 92 | 93 | accessor >> target[offset]; 94 | } 95 | 96 | /** 97 | * Specialization for CUDA 98 | */ 99 | template 100 | class get_instance_functor 101 | { 102 | public: 103 | get_instance_functor( 104 | CELL *target, 105 | long x, 106 | long y, 107 | long z, 108 | long count) : 109 | target(target), 110 | x(x), 111 | y(y), 112 | z(z), 113 | count(count) 114 | {} 115 | 116 | get_instance_functor( 117 | CELL *target, 118 | std::size_t x, 119 | std::size_t y, 120 | std::size_t z, 121 | std::size_t count) : 122 | target(target), 123 | x(x), 124 | y(y), 125 | z(z), 126 | count(count) 127 | {} 128 | 129 | template 130 | void operator()(soa_accessor& accessor) const 131 | { 132 | dim3 grid_dim; 133 | dim3 block_dim; 134 | generate_cuda_launch_config()(&grid_dim, &block_dim, count, 1, 1); 135 | 136 | get_kernel<<>>( 137 | target, 138 | accessor.data(), 139 | count, 140 | x, 141 | y, 142 | z); 143 | } 144 | 145 | private: 146 | CELL *target; 147 | std::size_t x; 148 | std::size_t y; 149 | std::size_t z; 150 | std::size_t count; 151 | 152 | }; 153 | 154 | #endif 155 | #endif 156 | 157 | } 158 | 159 | } 160 | 161 | } 162 | 163 | #endif 164 | -------------------------------------------------------------------------------- /include/libflatarray/detail/save_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_SAVE_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_SAVE_FUNCTOR_HPP 11 | 12 | namespace LibFlatArray { 13 | 14 | namespace detail { 15 | 16 | namespace flat_array { 17 | 18 | #ifdef _MSC_BUILD 19 | #pragma warning( push ) 20 | #pragma warning( disable : 4626 4710 5027 ) 21 | #endif 22 | 23 | /** 24 | * Same as load_functor, but the other way around. 
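 * It reads a streak of cells from the SoA storage and writes their members
 * into a user-supplied buffer that uses the same concatenated, per-member
 * layout described in load_functor.hpp: one tightly packed array per
 * member, arrays stored back to back.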
25 | */ 26 | template 27 | class save_functor 28 | { 29 | public: 30 | save_functor( 31 | const ITERATOR& start, 32 | const ITERATOR& end, 33 | char *target, 34 | std::size_t count) : 35 | start(start), 36 | end(end), 37 | target(target), 38 | count(count) 39 | {} 40 | 41 | template 42 | void operator()(soa_accessor& accessor) const 43 | { 44 | std::size_t offset = 0; 45 | 46 | for (ITERATOR i = start; i != end; ++i) { 47 | accessor.index() = soa_accessor::gen_index( 48 | static_cast(i->origin[0]), 49 | static_cast(i->origin[1]), 50 | static_cast(i->origin[2])); 51 | accessor.save( 52 | target, 53 | static_cast(i->length()), 54 | offset, 55 | count); 56 | 57 | offset += i->length(); 58 | } 59 | } 60 | 61 | private: 62 | ITERATOR start; 63 | ITERATOR end; 64 | char *target; 65 | std::size_t count; 66 | }; 67 | 68 | #ifdef _MSC_BUILD 69 | #pragma warning( pop ) 70 | #endif 71 | 72 | #ifdef LIBFLATARRAY_WITH_CUDA 73 | #ifdef __CUDACC__ 74 | 75 | template 76 | __global__ 77 | void save_kernel(const char *source, char *target, long count, long stride, long x, long y, long z, long offset) 78 | { 79 | long thread_index = blockDim.x * blockIdx.x + threadIdx.x; 80 | if (thread_index >= count) { 81 | return; 82 | } 83 | 84 | typedef const_soa_accessor_light accessor_type; 85 | 86 | long index = accessor_type::gen_index(x, y, z) + thread_index; 87 | accessor_type accessor(source, index); 88 | 89 | accessor.save(target, 1, offset + thread_index, stride); 90 | } 91 | 92 | /** 93 | * Specialization for CUDA 94 | */ 95 | template 96 | class save_functor 97 | { 98 | public: 99 | save_functor( 100 | const ITERATOR& start, 101 | const ITERATOR& end, 102 | char *target, 103 | std::size_t count) : 104 | start(start), 105 | end(end), 106 | target(target), 107 | count(count) 108 | {} 109 | 110 | template 111 | void operator()(soa_accessor& accessor) const 112 | { 113 | std::size_t offset = 0; 114 | 115 | for (ITERATOR i = start; i != end; ++i) { 116 | dim3 grid_dim; 117 | dim3 block_dim; 118 | generate_cuda_launch_config()(&grid_dim, &block_dim, i->length(), 1, 1); 119 | 120 | save_kernel<<>>( 121 | accessor.data(), 122 | target, 123 | i->length(), 124 | count, 125 | i->origin[0], 126 | i->origin[1], 127 | i->origin[2], 128 | offset); 129 | 130 | offset += i->length(); 131 | } 132 | } 133 | 134 | private: 135 | ITERATOR start; 136 | ITERATOR end; 137 | char *target; 138 | std::size_t count; 139 | 140 | }; 141 | 142 | #endif 143 | #endif 144 | 145 | } 146 | 147 | } 148 | 149 | } 150 | 151 | #endif 152 | -------------------------------------------------------------------------------- /include/libflatarray/detail/set_instance_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_SET_INSTANCE_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_SET_INSTANCE_FUNCTOR_HPP 11 | 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | namespace detail { 17 | 18 | namespace flat_array { 19 | 20 | /** 21 | * This helper class uses an accessor to push an object's members into 22 | * the SoA storage. 
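 *
 * It is the write-side counterpart of get_instance_functor: where that
 * functor gathers a cell's members from the separate member arrays, this
 * one scatters them back in. A sketch of the intended use (assuming the
 * soa_grid facade of this library) would be grid.set(10, 20, 30, cell) for
 * a single cell at coordinate (10, 20, 30).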
23 | */ 24 | template 25 | class set_instance_functor 26 | { 27 | public: 28 | set_instance_functor( 29 | const CELL *source, 30 | long x, 31 | long y, 32 | long z, 33 | long count) : 34 | source(source), 35 | x(x), 36 | y(y), 37 | z(z), 38 | count(count) 39 | {} 40 | 41 | set_instance_functor( 42 | const CELL *source, 43 | std::size_t x, 44 | std::size_t y, 45 | std::size_t z, 46 | std::size_t count) : 47 | source(source), 48 | x(x), 49 | y(y), 50 | z(z), 51 | count(count) 52 | {} 53 | 54 | template 55 | void operator()(soa_accessor& accessor) const 56 | { 57 | accessor.index() = long(soa_accessor::gen_index(x, y, z)); 58 | const CELL *cursor = source; 59 | 60 | for (std::size_t i = 0; i < count; ++i) { 61 | accessor << *cursor; 62 | cursor += SOURCE_STRIDE; 63 | ++accessor.index(); 64 | } 65 | } 66 | 67 | private: 68 | const CELL *source; 69 | std::size_t x; 70 | std::size_t y; 71 | std::size_t z; 72 | std::size_t count; 73 | }; 74 | 75 | #ifdef LIBFLATARRAY_WITH_CUDA 76 | #ifdef __CUDACC__ 77 | 78 | template 79 | __global__ 80 | void set_kernel(const CELL *source, char *target, long count, long x, long y, long z) 81 | { 82 | long offset = (blockDim.x * blockIdx.x + threadIdx.x) * SOURCE_STRIDE; 83 | if (offset >= count) { 84 | return; 85 | } 86 | 87 | typedef soa_accessor_light accessor_type; 88 | 89 | long index = accessor_type::gen_index(x + offset, y, z); 90 | accessor_type accessor(target, index); 91 | 92 | accessor << source[offset]; 93 | } 94 | 95 | /** 96 | * Specialization for CUDA 97 | */ 98 | template 99 | class set_instance_functor 100 | { 101 | public: 102 | set_instance_functor( 103 | const CELL *source, 104 | long x, 105 | long y, 106 | long z, 107 | long count) : 108 | source(source), 109 | x(x), 110 | y(y), 111 | z(z), 112 | count(count) 113 | {} 114 | 115 | set_instance_functor( 116 | const CELL *source, 117 | std::size_t x, 118 | std::size_t y, 119 | std::size_t z, 120 | std::size_t count) : 121 | source(source), 122 | x(x), 123 | y(y), 124 | z(z), 125 | count(count) 126 | {} 127 | 128 | template 129 | void operator()(soa_accessor& accessor) const 130 | { 131 | dim3 grid_dim; 132 | dim3 block_dim; 133 | generate_cuda_launch_config()(&grid_dim, &block_dim, count, 1, 1); 134 | 135 | set_kernel<<>>( 136 | source, 137 | accessor.data(), 138 | count, 139 | x, 140 | y, 141 | z); 142 | } 143 | 144 | private: 145 | const CELL *source; 146 | std::size_t x; 147 | std::size_t y; 148 | std::size_t z; 149 | std::size_t count; 150 | 151 | }; 152 | 153 | #endif 154 | #endif 155 | 156 | } 157 | 158 | } 159 | 160 | } 161 | 162 | #endif 163 | -------------------------------------------------------------------------------- /test/test.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2017-2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef TEST_H 10 | #define TEST_H 11 | 12 | // disable certain warnings from system headers when compiling with 13 | // Microsoft Visual Studio: 14 | #ifdef _MSC_BUILD 15 | #pragma warning( push ) 16 | #pragma warning( disable : 4514 4710 4996 ) 17 | #endif 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | #ifdef _MSC_BUILD 24 | #pragma warning( pop ) 25 | #endif 26 | 27 | #include 28 | 29 | #ifndef BOOST_TEST 30 | // Microsoft Visual Studio doesn't define __PRETTY_FUNCTION__: 31 | #ifdef _MSC_VER 32 | #define BOOST_TEST(ARG) if (!(ARG)) { std::cerr << __FILE__ << "(" << __LINE__ << "): test '" << #ARG << "' failed in function '" << __FUNCSIG__ << "'" << std::endl; } 33 | #else 34 | #define BOOST_TEST(ARG) if (!(ARG)) { std::cerr << __FILE__ << "(" << __LINE__ << "): test '" << #ARG << "' failed in function '" << __PRETTY_FUNCTION__ << "'" << std::endl; } 35 | #endif 36 | 37 | #endif 38 | 39 | 40 | #ifndef BOOST_TEST_EQ 41 | #define BOOST_TEST_EQ(A, B) BOOST_TEST((A) == (B)) 42 | #endif 43 | 44 | // Runner and ADD_TEST are some convenience functions to simplify 45 | // definition of new tests. ADD_TEST will add scaffolding that causes 46 | // the following block to be executed once the program starts. 47 | // Advantage: tests have no longer to be manually added to main(). 48 | template 49 | class Runner 50 | { 51 | public: 52 | Runner() 53 | { 54 | TEST()(); 55 | } 56 | }; 57 | 58 | #define ADD_TEST(TEST_NAME) \ 59 | class TEST_NAME \ 60 | { \ 61 | public: \ 62 | LIBFLATARRAY_INLINE \ 63 | void operator()(); \ 64 | \ 65 | private: \ 66 | static Runner runner; \ 67 | }; \ 68 | \ 69 | Runner TEST_NAME::runner; \ 70 | \ 71 | LIBFLATARRAY_INLINE \ 72 | void TEST_NAME::operator()() \ 73 | 74 | 75 | #define TEST_REAL_ACCURACY(A, B, RELATIVE_ERROR_LIMIT) \ 76 | { \ 77 | double a = (A); \ 78 | double b = (B); \ 79 | double delta = std::abs(a - b); \ 80 | double relativeError = delta / std::abs(a); \ 81 | if (relativeError > RELATIVE_ERROR_LIMIT) { \ 82 | std::stringstream buf; \ 83 | buf << "in file " \ 84 | << __FILE__ << ":" \ 85 | << __LINE__ << ": " \ 86 | << "difference exceeds tolerance.\n" \ 87 | << " A: " << a << "\n" \ 88 | << " B: " << b << "\n" \ 89 | << " delta: " << delta << "\n" \ 90 | << " relativeError: " << relativeError << "\n"; \ 91 | throw std::logic_error(buf.str()); \ 92 | } \ 93 | } 94 | 95 | // lazy (read: bad, inexact) test for equality. we can't use stict 96 | // equality (operator==()), as vector units may yield 97 | // non-IEEE-compliannt results. Single-precision accuracy (i.e. ~20 98 | // bits for the mantissa or 6 digits) shall be suffice for functional 99 | // testing. 100 | #define TEST_REAL(A, B) \ 101 | TEST_REAL_ACCURACY(A, B, 0.000001) 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /include/libflatarray/aligned_allocator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012-2017 Andreas Schäfer 3 | * Copyright 2015 Kurt Kanzenbach 4 | * Copyright 2018 Google 5 | * 6 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 7 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 8 | */ 9 | 10 | #ifndef FLAT_ARRAY_ALIGNED_ALLOCATOR_HPP 11 | #define FLAT_ARRAY_ALIGNED_ALLOCATOR_HPP 12 | 13 | // disable certain warnings from system headers when compiling with 14 | // Microsoft Visual Studio: 15 | #ifdef _MSC_BUILD 16 | #pragma warning( push ) 17 | #pragma warning( disable : 4514 4710 ) 18 | #endif 19 | 20 | #include 21 | 22 | #ifdef _MSC_BUILD 23 | #pragma warning( pop ) 24 | #endif 25 | 26 | namespace LibFlatArray { 27 | 28 | template 29 | class aligned_allocator 30 | { 31 | public: 32 | typedef std::ptrdiff_t difference_type; 33 | typedef T* pointer; 34 | typedef const T* const_pointer; 35 | typedef T& reference; 36 | typedef const T& const_reference; 37 | typedef T value_type; 38 | typedef std::size_t size_type; 39 | 40 | template 41 | struct rebind 42 | { 43 | typedef aligned_allocator other; 44 | }; 45 | 46 | inline aligned_allocator() 47 | {} 48 | 49 | template 50 | inline explicit aligned_allocator(const aligned_allocator& /* other */) 51 | {} 52 | 53 | inline pointer address(reference x) const 54 | { 55 | return &x; 56 | } 57 | 58 | inline const_pointer address(const_reference x) const 59 | { 60 | return &x; 61 | } 62 | 63 | pointer allocate(std::size_t n, const void* = 0) 64 | { 65 | // This code would have been a piece of cake if it would have 66 | // worked with posix_memalign -- which it didn't. Instead 67 | // we allocate a larger chunk of memory in which we can 68 | // accomodate an array of the required size, shifted to the 69 | // desired offset. Since we need the original address for the 70 | // deallocation, we store it directly in front of the aligned 71 | // array's start. Ugly, but it works. 72 | char *chunk = std::allocator().allocate(upsize(n)); 73 | if (chunk == 0) { 74 | return reinterpret_cast(chunk); 75 | } 76 | 77 | std::size_t offset = reinterpret_cast(chunk) % ALIGNMENT; 78 | std::size_t correction = ALIGNMENT - offset; 79 | if (correction < sizeof(char*)) { 80 | correction += ALIGNMENT; 81 | } 82 | char *ret = chunk + correction; 83 | *(reinterpret_cast(ret) - 1) = chunk; 84 | return reinterpret_cast(ret); 85 | } 86 | 87 | void deallocate(pointer p, std::size_t n) 88 | { 89 | if (p == 0) { 90 | return; 91 | } 92 | 93 | char *actual; 94 | // retrieve the original pointer which sits in front of its 95 | // aligned brother 96 | actual = *(reinterpret_cast(p) - 1); 97 | std::allocator().deallocate(actual, upsize(n)); 98 | } 99 | 100 | std::size_t max_size() const throw() 101 | { 102 | return std::allocator().max_size(); 103 | } 104 | 105 | void construct(pointer p, const_reference val) 106 | { 107 | std::allocator().construct(p, val); 108 | } 109 | 110 | /** 111 | * Added due to compiling for Intel MIC with CPP14=TRUE 112 | * GCC Bug Report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51626 113 | */ 114 | void construct(pointer p) 115 | { 116 | std::allocator().construct(p, value_type()); 117 | } 118 | 119 | void destroy(pointer p) 120 | { 121 | std::allocator().destroy(p); 122 | } 123 | 124 | bool operator!=(const aligned_allocator& other) const 125 | { 126 | return !(*this == other); 127 | } 128 | 129 | bool operator==(const aligned_allocator& /* other*/) const 130 | { 131 | return true; 132 | } 133 | 134 | private: 135 | std::size_t graceOffset() 136 | { 137 | return ALIGNMENT + sizeof(char*); 138 | } 139 | 140 | std::size_t upsize(std::size_t n) 141 | { 142 | return n * sizeof(T) + graceOffset(); 143 | } 144 | }; 145 | 146 | } 147 | 148 | #endif 149 | 
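// Usage sketch (illustrative): the allocator drops into standard containers
// to obtain over-aligned storage, e.g. for SIMD loads and stores. The second
// template parameter is the alignment in bytes:
//
//   std::vector<double, LibFlatArray::aligned_allocator<double, 64> > v(1000);
//   // &v[0] is now aligned to a 64-byte boundary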
-------------------------------------------------------------------------------- /include/libflatarray/detail/short_vec_helpers.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015 Kurt Kanzenbach 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_HELPERS_HPP 9 | #define FLAT_ARRAY_DETAIL_SHORT_VEC_HELPERS_HPP 10 | 11 | #include 12 | 13 | // disable certain warnings from system headers when compiling with 14 | // Microsoft Visual Studio: 15 | #ifdef _MSC_BUILD 16 | #pragma warning( push ) 17 | #pragma warning( disable : 4514 ) 18 | #endif 19 | 20 | #include 21 | 22 | // uintptr_t is only available through C++11 23 | #ifdef LIBFLATARRAY_WITH_CPP14 24 | #include 25 | #define _SHORTVEC_UINTPTR_T std::uintptr_t 26 | #else 27 | #define _SHORTVEC_UINTPTR_T unsigned long long 28 | #endif 29 | 30 | #ifdef __SSE4_1__ 31 | #include 32 | #endif 33 | 34 | #ifdef _MSC_BUILD 35 | #pragma warning( pop ) 36 | #endif 37 | 38 | /** 39 | * This macro asserts that the pointer is correctly aligned. 40 | * 41 | * @param ptr pointer to check 42 | * @param alignment alignement 43 | */ 44 | #define SHORTVEC_ASSERT_ALIGNED(ptr, alignment) \ 45 | do { \ 46 | assert((reinterpret_cast<_SHORTVEC_UINTPTR_T>(ptr) % (alignment)) == 0); \ 47 | } while (0) 48 | 49 | /** 50 | * For some implementations there is the problem, that the compiler does not 51 | * see, that some variables should be used uninitialized. 52 | * Therefore here are compiler specific macros to disable and enable this warning. 53 | */ 54 | #if defined(__GNUC__) && !defined(__clang__) 55 | #define SHORTVEC_DISABLE_WARNING_UNINITIALIZED \ 56 | _Pragma("GCC diagnostic push") \ 57 | _Pragma("GCC diagnostic ignored \"-Wuninitialized\"") 58 | #define SHORTVEC_ENABLE_WARNING_UNINITIALIZED \ 59 | _Pragma("GCC diagnostic pop") 60 | #endif 61 | 62 | #ifdef __clang__ 63 | #define SHORTVEC_DISABLE_WARNING_UNINITIALIZED \ 64 | _Pragma("clang diagnostic push") \ 65 | _Pragma("clang diagnostic ignored \"-Wuninitialized\"") 66 | #define SHORTVEC_ENABLE_WARNING_UNINITIALIZED \ 67 | _Pragma("clang diagnostic pop") 68 | #endif 69 | 70 | /** 71 | * If compiler is not gcc and not clang, just remove these macros. 72 | */ 73 | #ifndef SHORTVEC_DISABLE_WARNING_UNINITIALIZED 74 | #define SHORTVEC_DISABLE_WARNING_UNINITIALIZED 75 | #endif 76 | #ifndef SHORTVEC_ENABLE_WARNING_UNINITIALIZED 77 | #define SHORTVEC_ENABLE_WARNING_UNINITIALIZED 78 | #endif 79 | 80 | 81 | #ifdef __SSE4_1__ 82 | 83 | /** 84 | * Insertps instruction which allows to insert an memory location 85 | * into a xmm register. 86 | * Instruction: insertps xmm, xmm/m32, imm8 87 | * 88 | * @param a xmm register 89 | * @param base base pointer 90 | * @param offset offset 91 | * @param idx index, has to be a constant number like 0x10, no variable 92 | */ 93 | #define SHORTVEC_INSERT_PS(a, base, offset, idx) \ 94 | do { \ 95 | asm volatile ("insertps %1, (%q2, %q3, 4), %0\n" \ 96 | : "+x" (a) : "N" (idx), "r" (base), "r" (offset) : "memory"); \ 97 | } while (0) 98 | 99 | #endif 100 | 101 | #ifdef __AVX__ 102 | 103 | /** 104 | * Same as above just for AVX. 
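 * (A separate macro is needed here because AVX code should use the
 * VEX-encoded, three-operand form of the instruction; mixing the legacy SSE
 * encoding into AVX code can incur SSE/AVX transition penalties.)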
105 | * Instruction: vinsertps xmm, xmm, xmm/m32, imm8 106 | * 107 | * @param a xmm register 108 | * @param base base pointer 109 | * @param offset offset 110 | * @param idx index, has to be a constant number like 0x10, no variable 111 | */ 112 | #define SHORTVEC_INSERT_PS_AVX(a, base, offset, idx) \ 113 | do { \ 114 | asm volatile ("vinsertps %1, (%q2, %q3, 4), %0, %0\n" \ 115 | : "+x" (a) : "N" (idx), "r" (base), "r" (offset) : "memory"); \ 116 | } while (0) 117 | 118 | #endif 119 | 120 | namespace LibFlatArray { 121 | 122 | namespace ShortVecHelpers { 123 | 124 | #ifdef __SSE4_1__ 125 | 126 | /** 127 | * _mm_extract_ps returns an integer, but we need a float. 128 | * This union can be used to get a float back. 129 | */ 130 | union ExtractResult { 131 | int i; 132 | float f; 133 | }; 134 | 135 | #endif 136 | 137 | } 138 | 139 | } 140 | 141 | #endif 142 | -------------------------------------------------------------------------------- /examples/lbm/util.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBFLATARRAY_EXAMPLES_LBM_UTIL_H 2 | #define LIBFLATARRAY_EXAMPLES_LBM_UTIL_H 3 | 4 | /** 5 | * Copyright 2013-2016 Andreas Schäfer 6 | * 7 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 8 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | void check_cuda_error() 18 | { 19 | cudaError_t error = cudaGetLastError(); 20 | if (error != cudaSuccess) { 21 | std::cerr << "ERROR: " << cudaGetErrorString(error) << "\n"; 22 | throw std::runtime_error("CUDA error"); 23 | } 24 | } 25 | 26 | class benchmark 27 | { 28 | public: 29 | virtual ~benchmark() 30 | {} 31 | 32 | void evaluate() 33 | { 34 | for (int dim = 32; dim <= 160; dim += 4) { 35 | run(dim); 36 | } 37 | } 38 | 39 | void run(int dim) 40 | { 41 | int repeats = 10; 42 | if (dim <= 96) { 43 | repeats *= 10; 44 | } 45 | 46 | double seconds = exec(dim, repeats); 47 | 48 | double updates = 1.0 * gridSize(dim) * repeats; 49 | double glups = 10e-9 * updates / seconds; 50 | 51 | std::cout << std::setiosflags(std::ios::left); 52 | std::cout << std::setw(24) << name() << " ; " 53 | << std::setw( 3) << dim << " ; " 54 | << std::setw( 9) << glups << " GLUPS\n"; 55 | } 56 | 57 | protected: 58 | virtual double exec(int dim, int repeats) = 0; 59 | virtual std::string name() = 0; 60 | virtual size_t gridSize(int dim) = 0; 61 | }; 62 | 63 | class benchmark_lbm_cuda : public benchmark 64 | { 65 | protected: 66 | double exec(int dim, int repeats) 67 | { 68 | dim3 dimBlock; 69 | dim3 dimGrid; 70 | gen_dims(&dimBlock, &dimGrid, dim); 71 | 72 | return cudaExec(dim, dimBlock, dimGrid, repeats); 73 | } 74 | 75 | virtual size_t gridSize(int dim) 76 | { 77 | dim3 dimBlock; 78 | dim3 dimGrid; 79 | gen_dims(&dimBlock, &dimGrid, dim); 80 | 81 | return dimGrid.x * dimBlock.x * dimGrid.y * dimBlock.y * (256 - 4); 82 | } 83 | 84 | virtual double cudaExec(int dim, dim3 dimBlock, dim3 dimGrid, int repeats) = 0; 85 | 86 | void gen_dims(dim3 *dimBlock, dim3 *dimGrid, int dim) 87 | { 88 | int blockWidth = 1; 89 | for (; blockWidth <= dim; blockWidth *= 2) { 90 | } 91 | 92 | using std::min; 93 | blockWidth /= 2; 94 | blockWidth = min(256, blockWidth); 95 | *dimBlock = dim3(blockWidth, 2, 1); 96 | *dimGrid = dim3(dim / dimBlock->x, dim / dimBlock->y, 1); 97 | } 98 | }; 99 | 100 | class benchmark_lbm_cuda_basic : public benchmark_lbm_cuda 101 | { 102 | protected: 103 | virtual 
~benchmark_lbm_cuda_basic() 104 | {} 105 | 106 | virtual double cudaExec(int dim, dim3 dimBlock, dim3 dimGrid, int repeats) 107 | { 108 | int size = dim * dim * (256 + 64) * 20; 109 | int bytesize = size * sizeof(double); 110 | std::vector grid(size, 4711); 111 | 112 | double *devGridOld; 113 | double *devGridNew; 114 | cudaMalloc(&devGridOld, bytesize); 115 | cudaMalloc(&devGridNew, bytesize); 116 | check_cuda_error(); 117 | 118 | cudaMemcpy(devGridOld, &grid[0], bytesize, cudaMemcpyHostToDevice); 119 | cudaMemcpy(devGridNew, &grid[0], bytesize, cudaMemcpyHostToDevice); 120 | check_cuda_error(); 121 | 122 | cudaDeviceSynchronize(); 123 | double t_start = LibFlatArray::benchmark::time(); 124 | 125 | for (int t = 0; t < repeats; ++t) { 126 | update(dimGrid, dimBlock, dim, dim, 256, devGridOld, devGridNew); 127 | std::swap(devGridOld, devGridNew); 128 | } 129 | 130 | cudaDeviceSynchronize(); 131 | double t_end = LibFlatArray::benchmark::time(); 132 | check_cuda_error(); 133 | 134 | cudaMemcpy(&grid[0], devGridNew, bytesize, cudaMemcpyDeviceToHost); 135 | cudaFree(devGridOld); 136 | cudaFree(devGridNew); 137 | check_cuda_error(); 138 | 139 | double time = t_end - t_start; 140 | return time; 141 | } 142 | 143 | virtual void update(dim3 dimGrid, dim3 dimBlock, int dimX, int dimY, int dimZ, double *devGridOld, double *devGridNew) = 0; 144 | 145 | }; 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /include/libflatarray/detail/load_functor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2016 Andreas Schäfer 3 | * Copyright 2017-2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_LOAD_FUNCTOR_HPP 10 | #define FLAT_ARRAY_DETAIL_LOAD_FUNCTOR_HPP 11 | 12 | #include 13 | 14 | namespace LibFlatArray { 15 | 16 | namespace detail { 17 | 18 | namespace flat_array { 19 | 20 | #ifdef _MSC_BUILD 21 | #pragma warning( push ) 22 | #pragma warning( disable : 4626 4710 5027 ) 23 | #endif 24 | 25 | /** 26 | * The purpose of this functor is to load a row of cells which are 27 | * already prepackaged (in SoA form) in a raw data segment (i.e. all 28 | * members are stored in a consecutive array of the given length and 29 | * all arrays are concatenated). 
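 *
 * Layout sketch for a streak of n cells whose (hypothetical) type has the
 * members "double pos" and "int id":
 *
 *   source buffer: [pos_0, pos_1, ..., pos_{n-1}][id_0, id_1, ..., id_{n-1}]
 *
 * i.e. one tightly packed array per member, with the member arrays placed
 * back to back; load_functor copies each of them into the corresponding
 * member array of the target SoA storage.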
30 | */ 31 | template 32 | class load_functor 33 | { 34 | public: 35 | load_functor( 36 | const ITERATOR& start, 37 | const ITERATOR& end, 38 | const char *source, 39 | std::size_t count) : 40 | start(start), 41 | end(end), 42 | source(source), 43 | count(count) 44 | {} 45 | 46 | template 47 | void operator()(soa_accessor& accessor) const 48 | { 49 | std::size_t offset = 0; 50 | 51 | for (ITERATOR i = start; i != end; ++i) { 52 | accessor.index() = soa_accessor::gen_index( 53 | static_cast(i->origin[0]), 54 | static_cast(i->origin[1]), 55 | static_cast(i->origin[2])); 56 | accessor.load( 57 | source, 58 | static_cast(i->length()), 59 | offset, 60 | count); 61 | 62 | offset += i->length(); 63 | } 64 | } 65 | 66 | private: 67 | ITERATOR start; 68 | ITERATOR end; 69 | const char *source; 70 | std::size_t count; 71 | }; 72 | 73 | #ifdef _MSC_BUILD 74 | #pragma warning( pop ) 75 | #endif 76 | 77 | #ifdef LIBFLATARRAY_WITH_CUDA 78 | #ifdef __CUDACC__ 79 | 80 | template 81 | __global__ 82 | void load_kernel(const char *source, char *target, long count, long stride, long x, long y, long z, long offset) 83 | { 84 | long thread_index = blockDim.x * blockIdx.x + threadIdx.x; 85 | if (thread_index >= count) { 86 | return; 87 | } 88 | 89 | typedef soa_accessor_light accessor_type; 90 | 91 | long index = accessor_type::gen_index(x, y, z) + thread_index; 92 | accessor_type accessor(target, index); 93 | 94 | accessor.load(source, 1, offset + thread_index, stride); 95 | } 96 | 97 | /** 98 | * Specialization for CUDA 99 | */ 100 | template 101 | class load_functor 102 | { 103 | public: 104 | load_functor( 105 | const ITERATOR& start, 106 | const ITERATOR& end, 107 | const char *source, 108 | std::size_t count) : 109 | start(start), 110 | end(end), 111 | source(source), 112 | count(count) 113 | {} 114 | 115 | template 116 | void operator()(soa_accessor& accessor) const 117 | { 118 | std::size_t offset = 0; 119 | 120 | for (ITERATOR i = start; i != end; ++i) { 121 | dim3 grid_dim; 122 | dim3 block_dim; 123 | generate_cuda_launch_config()(&grid_dim, &block_dim, i->length(), 1, 1); 124 | 125 | load_kernel<<>>( 126 | source, 127 | accessor.data(), 128 | i->length(), 129 | count, 130 | i->origin[0], 131 | i->origin[1], 132 | i->origin[2], 133 | offset); 134 | 135 | offset += i->length(); 136 | } 137 | } 138 | 139 | private: 140 | ITERATOR start; 141 | ITERATOR end; 142 | const char *source; 143 | std::size_t count; 144 | 145 | }; 146 | 147 | #endif 148 | #endif 149 | 150 | } 151 | 152 | } 153 | 154 | } 155 | 156 | #endif 157 | -------------------------------------------------------------------------------- /test/estimate_optimum_short_vec_type_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | // globally disable some warnings with MSVC, that are issued not for a 10 | // specific header, but rather for the interaction of system headers 11 | // and LibFlatArray source: 12 | #ifdef _MSC_BUILD 13 | #pragma warning( disable : 4710 ) 14 | #endif 15 | 16 | #include 17 | 18 | #include "test.hpp" 19 | 20 | namespace LibFlatArray { 21 | 22 | class fake_particle 23 | { 24 | public: 25 | double pos_x; 26 | double pos_y; 27 | double pos_z; 28 | 29 | double vel_x; 30 | double vel_y; 31 | double vel_z; 32 | }; 33 | 34 | class fake_accessor 35 | { 36 | public: 37 | typedef fake_particle element_type; 38 | static const int DIM_PROD = 2000; 39 | }; 40 | 41 | class fake_accessor2 42 | { 43 | public: 44 | typedef fake_particle element_type; 45 | static const int DIM_PROD = 20000000; 46 | }; 47 | 48 | ADD_TEST(TestArity) 49 | { 50 | // expected arities are 2x of the vector-unit's bit width for some 51 | // architectures as we're doing loop-unrolling for those: 52 | 53 | # ifdef __VECTOR4DOUBLE__ 54 | static const int expected_arity_for_double = 8; 55 | static const int expected_arity_for_float = 16; 56 | # endif 57 | 58 | // Dito for ARM NEON: 59 | # ifdef __ARM_NEON__ 60 | // no double-intrinsics for NEON: 61 | static const int expected_arity_for_double = 2; 62 | static const int expected_arity_for_float = 8; 63 | # endif 64 | 65 | // Only the case of the IBM PC is complicated. No thanks to you, 66 | // history! 67 | # ifdef LFA_AVX512_HELPER 68 | static const int expected_arity_for_double = 16; 69 | static const int expected_arity_for_float = 32; 70 | # else 71 | # ifdef __AVX__ 72 | static const int expected_arity_for_double = 8; 73 | static const int expected_arity_for_float = 16; 74 | # else 75 | # ifdef __SSE__ 76 | static const int expected_arity_for_double = 4; 77 | static const int expected_arity_for_float = 8; 78 | # else 79 | static const int expected_arity_for_double = 2; 80 | static const int expected_arity_for_float = 2; 81 | # endif 82 | # endif 83 | # endif 84 | 85 | typedef estimate_optimum_short_vec_type::VALUE selected_double_type; 86 | typedef estimate_optimum_short_vec_type::VALUE selected_float_type; 87 | int actual_double = selected_double_type::ARITY; 88 | int actual_float = selected_float_type::ARITY; 89 | 90 | BOOST_TEST_EQ(expected_arity_for_double, actual_double); 91 | BOOST_TEST_EQ(expected_arity_for_float, actual_float); 92 | }; 93 | 94 | template 95 | class is_streaming_short_vec; 96 | 97 | template 98 | class is_streaming_short_vec > 99 | { 100 | public: 101 | static const bool VALUE = true; 102 | }; 103 | 104 | template 105 | class is_streaming_short_vec > 106 | { 107 | public: 108 | static const bool VALUE = false; 109 | }; 110 | 111 | ADD_TEST(TestStoreImplementation) 112 | { 113 | // Don't warn about const expressions not being flagged as such: we 114 | // don't have a suitable macro for such comparisons. 
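    // For reference, a hedged sketch of how client code typically
    // consumes this trait (`data', `n' and the loop body are made up,
    // while the ::VALUE typedef and the ARITY constant mirror the
    // typedefs below, and `ptr << vec' is the store idiom provided by
    // the short_vec headers):
    //
    //   typedef estimate_optimum_short_vec_type<double, fake_accessor>::VALUE my_vec;
    //   for (std::size_t i = 0; i < n; i += my_vec::ARITY) {
    //       my_vec buf(&data[i]);   // load
    //       buf = buf * 2.0;        // scalar is implicitly converted
    //       &data[i] << buf;        // store (streaming or regular, as selected)
    //   }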
115 | #ifdef _MSC_BUILD 116 | #pragma warning( push ) 117 | #pragma warning( disable : 4127 ) 118 | #endif 119 | 120 | // small problem size should yield normal stores: 121 | typedef estimate_optimum_short_vec_type::VALUE selected_double_type; 122 | typedef estimate_optimum_short_vec_type::VALUE selected_float_type; 123 | 124 | BOOST_TEST_EQ(is_streaming_short_vec::VALUE, false); 125 | BOOST_TEST_EQ(is_streaming_short_vec::VALUE, false); 126 | 127 | // larger problem size should yield streaming stores: 128 | typedef estimate_optimum_short_vec_type::VALUE selected_double_type2; 129 | typedef estimate_optimum_short_vec_type::VALUE selected_float_type2; 130 | 131 | BOOST_TEST_EQ(is_streaming_short_vec::VALUE, true); 132 | BOOST_TEST_EQ(is_streaming_short_vec::VALUE, true); 133 | 134 | #ifdef _MSC_BUILD 135 | #pragma warning( pop ) 136 | #endif 137 | 138 | }; 139 | 140 | } 141 | 142 | int main(int /* argc */, char** /* argv */) 143 | { 144 | return 0; 145 | } 146 | -------------------------------------------------------------------------------- /test/preprocessor_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #include 9 | 10 | // globally disable some warnings with MSVC, that are issued not for a 11 | // specific header, but rather for the interaction of system headers 12 | // and LibFlatArray source: 13 | #ifdef _MSC_BUILD 14 | #pragma warning( disable : 4710 ) 15 | #endif 16 | 17 | // Don't warn about these functions being stripped from an executable 18 | // as they're not being used, that's actually expected behavior. 
19 | #ifdef _MSC_BUILD 20 | #pragma warning( push ) 21 | #pragma warning( disable : 4514 ) 22 | #endif 23 | 24 | #include 25 | #include 26 | 27 | #ifdef _MSC_BUILD 28 | #pragma warning( pop ) 29 | #endif 30 | 31 | #include "test.hpp" 32 | 33 | #define LIST_A 34 | #define LIST_B (10)(20)(30)(40)(50) 35 | #define LIST_C (60) 36 | 37 | #define LIST_D LIBFLATARRAY_DEQUEUE(LIST_B) 38 | #define LIST_E LIBFLATARRAY_DEQUEUE(LIST_C) 39 | 40 | #define LAMBDA(INDEX, STANDARD_ARG, ITERATOR) vec[ITERATOR] = (INDEX + STANDARD_ARG + ITERATOR); 41 | 42 | // Don't warn about the conditional expressions being constant, that's 43 | // intentional here: 44 | #ifdef _MSC_BUILD 45 | #pragma warning( push ) 46 | #pragma warning( disable : 4127 4353 ) 47 | #endif 48 | 49 | ADD_TEST(TestElem) 50 | { 51 | BOOST_TEST(LIBFLATARRAY_ELEM(0, LIST_B) == 10); 52 | BOOST_TEST(LIBFLATARRAY_ELEM(1, LIST_B) == 20); 53 | BOOST_TEST(LIBFLATARRAY_ELEM(2, LIST_B) == 30); 54 | BOOST_TEST(LIBFLATARRAY_ELEM(3, LIST_B) == 40); 55 | BOOST_TEST(LIBFLATARRAY_ELEM(4, LIST_B) == 50); 56 | 57 | BOOST_TEST(LIBFLATARRAY_ELEM(0, LIST_C) == 60); 58 | } 59 | 60 | ADD_TEST(TestSize) 61 | { 62 | BOOST_TEST(LIBFLATARRAY_SIZE(LIST_A) == 0); 63 | BOOST_TEST(LIBFLATARRAY_SIZE(LIST_B) == 5); 64 | BOOST_TEST(LIBFLATARRAY_SIZE(LIST_C) == 1); 65 | } 66 | 67 | ADD_TEST(TestForEach) 68 | { 69 | std::vector vec(60, 0); 70 | LIBFLATARRAY_FOR_EACH(LAMBDA, 100, LIST_B); 71 | 72 | BOOST_TEST(vec[10] == (0 + 10 + 100)); 73 | BOOST_TEST(vec[20] == (1 + 20 + 100)); 74 | BOOST_TEST(vec[30] == (2 + 30 + 100)); 75 | BOOST_TEST(vec[40] == (3 + 40 + 100)); 76 | BOOST_TEST(vec[50] == (4 + 50 + 100)); 77 | } 78 | 79 | ADD_TEST(TestDequeue) 80 | { 81 | BOOST_TEST_EQ(LIBFLATARRAY_SIZE(LIST_D), 4); 82 | BOOST_TEST_EQ(LIBFLATARRAY_ELEM(0, LIST_D), 20); 83 | BOOST_TEST_EQ(LIBFLATARRAY_ELEM(1, LIST_D), 30); 84 | BOOST_TEST_EQ(LIBFLATARRAY_ELEM(2, LIST_D), 40); 85 | BOOST_TEST_EQ(LIBFLATARRAY_ELEM(3, LIST_D), 50); 86 | 87 | BOOST_TEST_EQ(LIBFLATARRAY_SIZE(LIST_E), 0); 88 | } 89 | 90 | ADD_TEST(TestIfShorter) 91 | { 92 | bool a0 = LIBFLATARRAY_IF_SHORTER(LIST_A, 0, false, true); 93 | bool a1 = LIBFLATARRAY_IF_SHORTER(LIST_A, 1, true, false); 94 | bool a2 = LIBFLATARRAY_IF_SHORTER(LIST_A, 2, true, false); 95 | bool a3 = LIBFLATARRAY_IF_SHORTER(LIST_A, 3, true, false); 96 | bool a4 = LIBFLATARRAY_IF_SHORTER(LIST_A, 4, true, false); 97 | 98 | bool b0 = LIBFLATARRAY_IF_SHORTER(LIST_B, 0, false, true); 99 | bool b1 = LIBFLATARRAY_IF_SHORTER(LIST_B, 1, false, true); 100 | bool b2 = LIBFLATARRAY_IF_SHORTER(LIST_B, 2, false, true); 101 | bool b3 = LIBFLATARRAY_IF_SHORTER(LIST_B, 3, false, true); 102 | bool b4 = LIBFLATARRAY_IF_SHORTER(LIST_B, 4, false, true); 103 | bool b5 = LIBFLATARRAY_IF_SHORTER(LIST_B, 5, false, true); 104 | bool b6 = LIBFLATARRAY_IF_SHORTER(LIST_B, 6, true, false); 105 | bool b7 = LIBFLATARRAY_IF_SHORTER(LIST_B, 7, true, false); 106 | bool b8 = LIBFLATARRAY_IF_SHORTER(LIST_B, 8, true, false); 107 | bool b9 = LIBFLATARRAY_IF_SHORTER(LIST_B, 9, true, false); 108 | 109 | bool c0 = LIBFLATARRAY_IF_SHORTER(LIST_C, 0, false, true); 110 | bool c1 = LIBFLATARRAY_IF_SHORTER(LIST_C, 1, false, true); 111 | bool c2 = LIBFLATARRAY_IF_SHORTER(LIST_C, 2, true, false); 112 | bool c3 = LIBFLATARRAY_IF_SHORTER(LIST_C, 3, true, false); 113 | bool c4 = LIBFLATARRAY_IF_SHORTER(LIST_C, 4, true, false); 114 | 115 | BOOST_TEST(a0); 116 | BOOST_TEST(a1); 117 | BOOST_TEST(a2); 118 | BOOST_TEST(a3); 119 | BOOST_TEST(a4); 120 | 121 | BOOST_TEST(b0); 122 | BOOST_TEST(b1); 123 | BOOST_TEST(b2); 
124 | BOOST_TEST(b3); 125 | BOOST_TEST(b4); 126 | BOOST_TEST(b5); 127 | BOOST_TEST(b6); 128 | BOOST_TEST(b7); 129 | BOOST_TEST(b8); 130 | BOOST_TEST(b9); 131 | 132 | BOOST_TEST(c0); 133 | BOOST_TEST(c1); 134 | BOOST_TEST(c2); 135 | BOOST_TEST(c3); 136 | BOOST_TEST(c4); 137 | } 138 | 139 | #ifdef _MSC_BUILD 140 | #pragma warning( pop ) 141 | #endif 142 | 143 | int main(int /* argc */, char** /* argv */) 144 | { 145 | return 0; 146 | } 147 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(WITH_CUDA) 2 | lfa_cuda_add_executable(cuda_allocator_test cuda_allocator_test.cu) 3 | lfa_cuda_add_executable(cuda_array_test cuda_array_test.cu) 4 | lfa_cuda_add_executable(soa_array_cuda_test soa_array_cuda_test.cu) 5 | lfa_cuda_add_executable(soa_grid_cuda_test soa_grid_cuda_test.cu) 6 | endif() 7 | add_executable(aligned_allocator_test aligned_allocator_test.cpp) 8 | add_executable(api_traits_test api_traits_test.cpp) 9 | add_executable(estimate_optimum_short_vec_type_test estimate_optimum_short_vec_type_test.cpp) 10 | add_executable(loop_peeler_test loop_peeler_test.cpp) 11 | add_executable(preprocessor_test preprocessor_test.cpp) 12 | add_executable(short_vec_test short_vec_test.cpp short_vec_additional_test.cpp) 13 | add_executable(streaming_short_vec_test streaming_short_vec_test.cpp) 14 | add_executable(soa_array_test soa_array_test.cpp) 15 | add_executable(soa_grid_test soa_grid_test.cpp) 16 | add_executable(soa_vector_test soa_vector_test.cpp) 17 | 18 | if(WITH_CUDA) 19 | add_custom_target(run_cuda_allocator_test COMMAND cuda_allocator_test) 20 | add_custom_target(run_cuda_array_test COMMAND cuda_array_test) 21 | add_custom_target(run_soa_array_cuda_test COMMAND soa_array_cuda_test) 22 | add_custom_target(run_soa_grid_cuda_test COMMAND soa_grid_cuda_test) 23 | endif() 24 | 25 | add_custom_target(run_aligned_allocator_test COMMAND aligned_allocator_test) 26 | add_custom_target(run_api_traits_test COMMAND api_traits_test) 27 | add_custom_target(run_estimate_optimum_short_vec_type_test COMMAND estimate_optimum_short_vec_type_test) 28 | add_custom_target(run_loop_peeler_test COMMAND loop_peeler_test) 29 | add_custom_target(run_preprocessor_test COMMAND preprocessor_test) 30 | add_custom_target(run_short_vec_test COMMAND short_vec_test) 31 | add_custom_target(run_streaming_short_vec_test COMMAND streaming_short_vec_test) 32 | add_custom_target(run_soa_array_test COMMAND soa_array_test) 33 | add_custom_target(run_soa_grid_test COMMAND soa_grid_test) 34 | add_custom_target(run_soa_vector_test COMMAND soa_vector_test) 35 | 36 | if(WITH_CUDA) 37 | add_dependencies(check run_cuda_allocator_test) 38 | add_dependencies(check run_cuda_array_test) 39 | add_dependencies(check run_soa_array_cuda_test) 40 | add_dependencies(check run_soa_grid_cuda_test) 41 | endif() 42 | add_dependencies(check run_aligned_allocator_test) 43 | add_dependencies(check run_api_traits_test) 44 | add_dependencies(check run_estimate_optimum_short_vec_type_test) 45 | add_dependencies(check run_loop_peeler_test) 46 | add_dependencies(check run_preprocessor_test) 47 | add_dependencies(check run_short_vec_test) 48 | add_dependencies(check run_streaming_short_vec_test) 49 | add_dependencies(check run_soa_array_test) 50 | add_dependencies(check run_soa_grid_test) 51 | add_dependencies(check run_soa_vector_test) 52 | 53 | if(WITH_CUDA) 54 | add_dependencies(run_cuda_allocator_test cuda_allocator_test) 
55 | add_dependencies(run_cuda_array_test cuda_array_test) 56 | add_dependencies(run_soa_array_cuda_test soa_array_cuda_test) 57 | add_dependencies(run_soa_grid_cuda_test soa_grid_cuda_test) 58 | endif() 59 | add_dependencies(run_aligned_allocator_test aligned_allocator_test) 60 | add_dependencies(run_api_traits_test api_traits_test) 61 | add_dependencies(run_estimate_optimum_short_vec_type_test estimate_optimum_short_vec_type_test) 62 | add_dependencies(run_loop_peeler_test loop_peeler_test) 63 | add_dependencies(run_preprocessor_test preprocessor_test) 64 | add_dependencies(run_short_vec_test short_vec_test) 65 | add_dependencies(run_streaming_short_vec_test streaming_short_vec_test) 66 | add_dependencies(run_soa_array_test soa_array_test) 67 | add_dependencies(run_soa_grid_test soa_grid_test) 68 | add_dependencies(run_soa_vector_test soa_vector_test) 69 | 70 | if(WITH_CUDA) 71 | add_dependencies(tests cuda_allocator_test) 72 | add_dependencies(tests cuda_array_test) 73 | add_dependencies(tests soa_array_cuda_test) 74 | add_dependencies(tests soa_grid_cuda_test) 75 | endif() 76 | add_dependencies(tests aligned_allocator_test) 77 | add_dependencies(tests api_traits_test) 78 | add_dependencies(tests estimate_optimum_short_vec_type_test) 79 | add_dependencies(tests loop_peeler_test) 80 | add_dependencies(tests preprocessor_test) 81 | add_dependencies(tests short_vec_test) 82 | add_dependencies(tests streaming_short_vec_test) 83 | add_dependencies(tests soa_array_test) 84 | add_dependencies(tests soa_grid_test) 85 | add_dependencies(tests soa_vector_test) 86 | -------------------------------------------------------------------------------- /include/libflatarray/testbed/evaluate.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2017-2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_TESTBED_EVALUATE_HPP 10 | #define FLAT_ARRAY_TESTBED_EVALUATE_HPP 11 | 12 | #include 13 | 14 | // disable certain warnings from system headers when compiling with 15 | // Microsoft Visual Studio: 16 | #ifdef _MSC_BUILD 17 | #pragma warning( push ) 18 | #pragma warning( disable : 4514 4668 4710 4820 ) 19 | #endif 20 | 21 | #include 22 | #include 23 | #include 24 | #ifdef _WIN32 25 | #include 26 | #else 27 | #include 28 | #endif 29 | 30 | #ifdef _MSC_BUILD 31 | #pragma warning( pop ) 32 | #endif 33 | 34 | namespace LibFlatArray { 35 | 36 | // not inlining is ok: 37 | #ifdef _MSC_BUILD 38 | #pragma warning( push ) 39 | #pragma warning( disable : 4710 ) 40 | #endif 41 | 42 | class evaluate 43 | { 44 | public: 45 | evaluate(const std::string& name, const std::string& revision) : 46 | name(name), 47 | revision(revision) 48 | {} 49 | 50 | void print_header() 51 | { 52 | std::cout << "#rev ; date ; host ; device ; order ; family ; species ; dimensions ; perf ; unit" << std::endl; 53 | } 54 | 55 | template 56 | void operator()(BENCHMARK benchmark, std::vector dim, bool output = true) 57 | { 58 | if (benchmark.family().find(name, 0) == std::string::npos) { 59 | return; 60 | } 61 | 62 | #ifdef _WIN32 63 | // this charade is based on https://msdn.microsoft.com/en-us/library/windows/desktop/ms724928(v=vs.85).aspx 64 | FILETIME fileTime; 65 | GetSystemTimeAsFileTime(&fileTime); 66 | 67 | ULARGE_INTEGER systemTime; 68 | systemTime.LowPart = fileTime.dwLowDateTime; 69 | systemTime.HighPart = fileTime.dwHighDateTime; 70 | 71 | SYSTEMTIME epoch; 72 | epoch.wYear = 1970; 73 | epoch.wMonth = 1; 74 | epoch.wDayOfWeek = 4; 75 | epoch.wDay = 1; 76 | epoch.wHour = 0; 77 | epoch.wMinute = 0; 78 | epoch.wSecond = 1; 79 | epoch.wMilliseconds = 0; 80 | FILETIME epochFileTime; 81 | SystemTimeToFileTime(&epoch, &epochFileTime); 82 | 83 | ULARGE_INTEGER epochULargeInteger; 84 | epochULargeInteger.LowPart = epochFileTime.dwLowDateTime; 85 | epochULargeInteger.HighPart = epochFileTime.dwHighDateTime; 86 | 87 | time_t secondsSinceEpoch = static_cast(systemTime.QuadPart - epochULargeInteger.QuadPart); 88 | #else 89 | timeval t; 90 | gettimeofday(&t, 0); 91 | time_t secondsSinceEpoch = t.tv_sec; 92 | #endif 93 | 94 | tm timeSpec; 95 | #ifdef _WIN32 96 | gmtime_s(&timeSpec, &secondsSinceEpoch); 97 | #else 98 | gmtime_r(&secondsSinceEpoch, &timeSpec); 99 | #endif 100 | char buf[1024]; 101 | strftime(buf, 1024, "%Y-%b-%d %H:%M:%S", &timeSpec); 102 | 103 | std::string now_string = buf; 104 | std::string device = benchmark.device(); 105 | 106 | int hostname_length = 2048; 107 | std::string hostname(static_cast(hostname_length), ' '); 108 | gethostname(&hostname[0], hostname_length); 109 | // cuts string at first 0 byte, required as gethostname returns 0-terminated strings 110 | hostname = std::string(hostname.c_str()); 111 | 112 | double performance = benchmark.performance(dim); 113 | 114 | std::ostringstream pretty_dim; 115 | pretty_dim << "(" << dim[0]; 116 | for (std::size_t i = 1; i < dim.size(); ++i) { 117 | pretty_dim << ", " << dim[i]; 118 | } 119 | pretty_dim << ")"; 120 | 121 | if (output) { 122 | std::cout << std::setiosflags(std::ios::left); 123 | std::cout << std::setw(18) << revision << "; " 124 | << now_string << " ; " 125 | << std::setw(32) << hostname << "; " 126 | << std::setw(48) << device << "; " 127 | << std::setw( 8) << benchmark.order() << "; " 128 | << std::setw(32) << benchmark.family() 
<< "; " 129 | << std::setw( 8) << benchmark.species() << "; " 130 | << std::setw(24) << pretty_dim.str() << "; " 131 | << std::setw(12) << performance << "; " 132 | << std::setw( 8) << benchmark.unit() << std::endl; 133 | } 134 | } 135 | 136 | private: 137 | std::string name; 138 | std::string revision; 139 | }; 140 | 141 | #ifdef _MSC_BUILD 142 | #pragma warning( pop ) 143 | #endif 144 | 145 | } 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /include/libflatarray/testbed/cpu_benchmark.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014-2017 Andreas Schäfer 3 | * Copyright 2018-2020 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_TESTBED_CPU_BENCHMARK_HPP 10 | #define FLAT_ARRAY_TESTBED_CPU_BENCHMARK_HPP 11 | 12 | #include 13 | 14 | // disable certain warnings from system headers when compiling with 15 | // Microsoft Visual Studio. Also disable them for this class. 16 | #ifdef _MSC_BUILD 17 | #pragma warning( push ) 18 | #pragma warning( disable : 4514 4710 ) 19 | #endif 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | namespace LibFlatArray { 26 | 27 | class cpu_benchmark : public benchmark 28 | { 29 | public: 30 | std::string order() 31 | { 32 | return "CPU"; 33 | } 34 | 35 | std::string device() 36 | { 37 | try { 38 | try { 39 | // likwid-topology gives us the best data 40 | return parse_likwid_topology(); 41 | } catch (const std::runtime_error&) { 42 | // ...otherwise we'll fall back to /proc/cpuinfo 43 | return parse_proc_cpu(); 44 | } 45 | } catch (const std::runtime_error&) { 46 | return "unknown CPU"; 47 | } 48 | } 49 | 50 | private: 51 | static std::string parse_proc_cpu() 52 | { 53 | std::ifstream file("/proc/cpuinfo"); 54 | const std::size_t bufferSize = 1 << 12; 55 | char buffer[bufferSize]; 56 | 57 | while (file.getline(&buffer[0], bufferSize)) { 58 | std::vector tokens = tokenize(buffer, ':'); 59 | std::vector fields = tokenize(tokens[0], '\t'); 60 | 61 | if ((fields.size() == 1) && (fields[0] == "cpu")) { 62 | return tokens[1]; 63 | } 64 | 65 | if ((fields.size() == 1) && (fields[0] == "model name")) { 66 | tokens = tokenize(tokens[1], ' '); 67 | std::string buf = join(tokens, " "); 68 | if (buf[buf.size() - 1] == 0) { 69 | buf.resize(buf.size() - 1); 70 | } 71 | 72 | return buf; 73 | } 74 | } 75 | 76 | throw std::runtime_error("could not parse /proc/cpuinfo"); 77 | } 78 | 79 | static std::string parse_likwid_topology() 80 | { 81 | std::string read_buffer(100000, ' '); 82 | #ifdef _WIN32 83 | FILE *file = _popen("likwid-topology -O", "r"); 84 | #else 85 | FILE *file = popen("likwid-topology -O", "r"); 86 | #endif 87 | if (file == NULL) { 88 | throw std::runtime_error("failed to get output from likwid-topology"); 89 | } 90 | 91 | std::string cpu_type; 92 | std::string cpu_name; 93 | 94 | while (fgets(&read_buffer[0], read_buffer.size(), file) != NULL) { 95 | std::vector tokens = tokenize(read_buffer, ','); 96 | for (std::vector::iterator i = tokens.begin(); i != tokens.end(); ++i) { 97 | if (i->find("CPU type") != std::string::npos) { 98 | cpu_type = *(++i); 99 | } 100 | if (i->find("CPU name") != std::string::npos) { 101 | cpu_name = *(++i); 102 | } 103 | } 104 | } 105 | 106 | if (cpu_type.empty() || cpu_name.empty()) { 107 | throw std::runtime_error("failed to parse likwid-topology"); 108 | } 109 | 
std::string ret = cpu_type + ", " + cpu_name; 110 | return ret; 111 | } 112 | 113 | static std::string trim(const std::string& string) 114 | { 115 | if (string.size() == 0) { 116 | return string; 117 | } 118 | 119 | std::size_t start = 0; 120 | while ((string[start] == ' ') && (start < string.size())) { 121 | start += 1; 122 | } 123 | 124 | std::size_t end = string.size() - 1; 125 | while ((string[end] == ' ') && (end > 1)) { 126 | end -= 1; 127 | } 128 | if ((string[end] != ' ') && (end < string.size())) { 129 | end += 1; 130 | } 131 | 132 | return std::string(string, start, end - start); 133 | } 134 | 135 | static std::vector tokenize(const std::string& line, char delimiter = ';') 136 | { 137 | std::vector ret; 138 | 139 | std::stringstream buf(line); 140 | std::string item; 141 | 142 | while (std::getline(buf, item, delimiter)) { 143 | ret.push_back(trim(item)); 144 | } 145 | 146 | return ret; 147 | } 148 | 149 | static std::string join(const std::vector& tokens, const std::string& delimiter) 150 | { 151 | std::stringstream buf; 152 | 153 | for (std::vector::const_iterator i = tokens.begin(); i != tokens.end(); ++i) { 154 | if (i != tokens.begin()) { 155 | buf << delimiter; 156 | } 157 | buf << *i; 158 | } 159 | 160 | return buf.str(); 161 | } 162 | }; 163 | 164 | } 165 | 166 | #ifdef _MSC_BUILD 167 | #pragma warning( pop ) 168 | #endif 169 | 170 | #endif 171 | -------------------------------------------------------------------------------- /include/libflatarray/soa_vector.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * 4 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 | */ 7 | 8 | #ifndef FLAT_ARRAY_SOA_VECTOR_HPP 9 | #define FLAT_ARRAY_SOA_VECTOR_HPP 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | // disable certain warnings from system headers when compiling with 16 | // Microsoft Visual Studio: 17 | #ifdef _MSC_BUILD 18 | #pragma warning( push ) 19 | #pragma warning( disable : 4514 ) 20 | #endif 21 | 22 | #include 23 | 24 | #ifdef _MSC_BUILD 25 | #pragma warning( pop ) 26 | #endif 27 | 28 | namespace LibFlatArray { 29 | 30 | /** 31 | * This is the runtime resizable counterpart to soa_array. The goal is 32 | * to provide an interface similar to std::vector and simultaneously 33 | * have a callback to expose the struct-of-arrays layout. 34 | */ 35 | template< 36 | typename T, 37 | typename ALLOCATOR = aligned_allocator, 38 | bool USE_CUDA_FUNCTORS = false> 39 | class soa_vector 40 | { 41 | public: 42 | friend class TestResizeAndReserve; 43 | 44 | typedef T value_type; 45 | 46 | inline 47 | __host__ __device__ 48 | explicit soa_vector(std::size_t count = 0) : 49 | grid(count, 1, 1), 50 | count(count) 51 | {} 52 | 53 | inline 54 | __host__ __device__ 55 | explicit soa_vector(std::size_t count, const value_type& value) : 56 | grid(count, 1, 1), 57 | count(count) 58 | { 59 | grid.broadcast(0, 0, 0, value, count); 60 | } 61 | 62 | /** 63 | * Copies an element to the given index. We're intentionally not 64 | * using at() or operator[] to avoid mismatched expectations here: 65 | * we can't yield references to a T here. 66 | */ 67 | LIBFLATARRAY_INLINE 68 | __host__ __device__ 69 | void set(std::size_t index, const T& element) 70 | { 71 | grid.set(index, 0, 0, element); 72 | } 73 | 74 | /** 75 | * Copy out an element. 
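 *
 * A short round-trip sketch of set()/get() (`Particle' stands in for a
 * hypothetical value_type registered via LIBFLATARRAY_REGISTER_SOA):
 *
 *   LibFlatArray::soa_vector<Particle> particles(10);
 *   Particle p = particles.get(5);
 *   particles.set(5, p);
 *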
Again we're not using at() or operator[] 76 | * here to avoid confusion with the API: we can't return 77 | * references from an SoA container. 78 | */ 79 | LIBFLATARRAY_INLINE 80 | __host__ __device__ 81 | T get(std::size_t index) const 82 | { 83 | return grid.get(index, 0, 0); 84 | } 85 | 86 | inline 87 | __host__ __device__ 88 | std::size_t size() const 89 | { 90 | return count; 91 | } 92 | 93 | inline 94 | __host__ __device__ 95 | bool empty() const 96 | { 97 | return count == 0; 98 | } 99 | 100 | inline 101 | __host__ __device__ 102 | void resize(std::size_t new_count) 103 | { 104 | if (new_count > capacity()) { 105 | reserve(new_count); 106 | } 107 | 108 | count = new_count; 109 | } 110 | 111 | inline 112 | __host__ __device__ 113 | void reserve(std::size_t new_count) 114 | { 115 | soa_grid new_grid(new_count, 1, 1); 116 | new_grid.resize(new_grid.extent_x(), 1, 1); 117 | 118 | detail::flat_array::simple_streak iter[2] = { 119 | detail::flat_array::simple_streak(0, 0, 0, count), 120 | detail::flat_array::simple_streak() 121 | }; 122 | 123 | new_grid.load(iter + 0, iter + 1, grid.data(), grid.extent_x()); 124 | swap(new_grid, grid); 125 | } 126 | 127 | inline 128 | __host__ __device__ 129 | std::size_t capacity() const 130 | { 131 | return grid.dim_x(); 132 | } 133 | 134 | inline 135 | __host__ __device__ 136 | void clear() 137 | { 138 | count = 0; 139 | } 140 | 141 | inline 142 | __host__ __device__ 143 | void push_back(const T& element) 144 | { 145 | if (count == grid.extent_x()) { 146 | // fixme: make this configurable 147 | reserve(static_cast(count * 1.2)); 148 | } 149 | set(count, element); 150 | ++count; 151 | } 152 | 153 | inline 154 | __host__ __device__ 155 | void pop_back() 156 | { 157 | --count; 158 | // destroy last element by overwriting with default element: 159 | set(count, T()); 160 | } 161 | 162 | #ifdef LIBFLATARRAY_WITH_CPP14 163 | template 164 | inline 165 | __host__ __device__ 166 | void emplace_back(ARGS&&... args) 167 | { 168 | push_back(T(std::forward(args)...)); 169 | } 170 | #endif 171 | 172 | template 173 | inline 174 | __host__ __device__ 175 | void callback(FUNCTOR functor) 176 | { 177 | grid.callback(functor); 178 | } 179 | 180 | template 181 | inline 182 | __host__ __device__ 183 | void callback(FUNCTOR functor) const 184 | { 185 | grid.callback(functor); 186 | } 187 | 188 | private: 189 | soa_grid grid; 190 | std::size_t count; 191 | 192 | // fixme: retrieval of multiple elements 193 | // fixme: add cuda test 194 | // fixme: add begin/end for range-based loops, dito for soa_array and perhaps for short_vec (alternatively an iterator loop) 195 | }; 196 | 197 | } 198 | 199 | #endif 200 | 201 | -------------------------------------------------------------------------------- /examples/smoothed_particle_hydrodynamics/kernels.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016-2017 Andreas Schäfer 3 | * Copyright 2017 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #include 10 | 11 | #ifdef _MSC_BUILD 12 | #pragma warning( push ) 13 | #pragma warning( disable : 4514 ) 14 | #endif 15 | 16 | #include 17 | 18 | #ifdef _MSC_BUILD 19 | #pragma warning( pop ) 20 | #endif 21 | 22 | #include "kernels.h" 23 | 24 | #ifndef M_PI 25 | #define M_PI 3.14159265358979323846 26 | #endif 27 | 28 | void compute_density(int n, float *restrict rho, float *restrict pos_x, float *restrict pos_y, float h, float mass) 29 | { 30 | float h_squared = h * h; 31 | float h_pow_8 = h_squared * h_squared * h_squared * h_squared; 32 | float C = 4 * mass / M_PI / h_pow_8; 33 | 34 | for (int i = 0; i < n; ++i) { 35 | rho[i] = 4 * mass / M_PI / h_squared; 36 | } 37 | 38 | for (int i = 0; i < n; ++i) { 39 | for (int j = i + 1; j < n; ++j) { 40 | float delta_x = pos_x[i] - pos_x[j]; 41 | float delta_y = pos_y[i] - pos_y[j]; 42 | float dist_squared = delta_x * delta_x + delta_y * delta_y; 43 | float overlap = h_squared - dist_squared; 44 | 45 | if (overlap > 0) { 46 | float rho_ij = C * overlap * overlap * overlap; 47 | rho[i] += rho_ij; 48 | rho[j] += rho_ij; 49 | } 50 | } 51 | } 52 | } 53 | 54 | void compute_accel( 55 | int n, 56 | float *restrict rho, 57 | float *restrict pos_x, 58 | float *restrict pos_y, 59 | float *restrict v_x, 60 | float *restrict v_y, 61 | float *restrict a_x, 62 | float *restrict a_y, 63 | float mass, 64 | float g, 65 | float h, 66 | float k, 67 | float rho0, 68 | float mu) 69 | { 70 | const float h_squared = h * h; 71 | const float C_0 = mass / M_PI / (h_squared * h_squared); 72 | const float C_p = 15 * k; 73 | const float C_v = -40 * mu; 74 | 75 | // gravity: 76 | for (int i = 0; i < n; ++i) { 77 | a_x[i] = 0; 78 | a_y[i] = -g; 79 | } 80 | 81 | // Now compute interaction forces 82 | for (int i = 0; i < n; ++i) { 83 | for (int j = i + 1; j < n; ++j) { 84 | float delta_x = pos_x[i] - pos_x[j]; 85 | float delta_y = pos_y[i] - pos_y[j]; 86 | float dist_squared = delta_x * delta_x + delta_y * delta_y; 87 | 88 | if (dist_squared < h_squared) { 89 | float q = sqrt(dist_squared) / h; 90 | float u = 1 - q; 91 | float w_0 = C_0 * u / rho[i] / rho[j]; 92 | float w_p = w_0 * C_p * (rho[i] + rho[j] - 2 * rho0) * u / q; 93 | float w_v = w_0 * C_v; 94 | float delta_v_x = v_x[i] - v_x[j]; 95 | float delta_v_y = v_y[i] - v_y[j]; 96 | 97 | a_x[i] += (w_p * delta_x + w_v * delta_v_x); 98 | a_y[i] += (w_p * delta_y + w_v * delta_v_y); 99 | a_x[j] -= (w_p * delta_x + w_v * delta_v_x); 100 | a_y[j] -= (w_p * delta_y + w_v * delta_v_y); 101 | } 102 | } 103 | } 104 | } 105 | 106 | void damp_reflect( 107 | int which, 108 | float barrier, 109 | float *pos_x, 110 | float *pos_y, 111 | float *v_x, 112 | float *v_y) 113 | { 114 | float *v_which = (which == 0) ? v_x : v_y; 115 | float *pos_which = (which == 0) ? 
pos_x : pos_y; 116 | 117 | // Coefficient of resitiution 118 | const float DAMP = 0.75; 119 | // Ignore degenerate cases 120 | if (fabs(v_which[0]) <= 1e-3) 121 | return; 122 | 123 | // Scale back the distance traveled based on time from collision 124 | float tbounce = (pos_which[0] - barrier) / v_which[0]; 125 | pos_x[0] -= v_x[0]*(1-DAMP)*tbounce; 126 | pos_y[0] -= v_y[0]*(1-DAMP)*tbounce; 127 | 128 | // Reflect the position and velocity 129 | pos_which[0] = 2 * barrier - pos_which[0]; 130 | v_which[0] = -v_which[0]; 131 | 132 | // Damp the velocities 133 | v_x[0] *= DAMP; 134 | v_y[0] *= DAMP; 135 | } 136 | 137 | void reflect_bc( 138 | int n, 139 | float *restrict pos_x, 140 | float *restrict pos_y, 141 | float *restrict v_x, 142 | float *restrict v_y) 143 | { 144 | // Boundaries of the computational domain 145 | const float XMIN = 0.0; 146 | const float XMAX = 1.0; 147 | const float YMIN = 0.0; 148 | const float YMAX = 1.0; 149 | 150 | for (int i = 0; i < n; ++i) { 151 | if (pos_x[i] < XMIN) { 152 | damp_reflect(0, XMIN, pos_x + i, pos_y + i, v_x + i, v_y + i); 153 | } 154 | if (pos_x[i] > XMAX) { 155 | damp_reflect(0, XMAX, pos_x + i, pos_y + i, v_x + i, v_y + i); 156 | } 157 | if (pos_y[i] < YMIN) { 158 | damp_reflect(1, YMIN, pos_x + i, pos_y + i, v_x + i, v_y + i); 159 | } 160 | if (pos_y[i] > YMAX) { 161 | damp_reflect(1, YMAX, pos_x + i, pos_y + i, v_x + i, v_y + i); 162 | } 163 | } 164 | } 165 | 166 | void leapfrog( 167 | int n, 168 | float *restrict pos_x, 169 | float *restrict pos_y, 170 | float *restrict v_x, 171 | float *restrict v_y, 172 | float *restrict a_x, 173 | float *restrict a_y, 174 | double dt) 175 | { 176 | for (int i = 0; i < n; ++i) { 177 | v_x[i] += a_x[i] * dt; 178 | v_y[i] += a_y[i] * dt; 179 | 180 | pos_x[i] += v_x[i] * dt; 181 | pos_y[i] += v_y[i] * dt; 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /examples/lbm/update_lbm_object_oriented.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_OBJECT_ORIENTED_H 2 | #define LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_OBJECT_ORIENTED_H 3 | 4 | /** 5 | * Copyright 2013 Andreas Schäfer 6 | * 7 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 8 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 9 | */ 10 | 11 | #include "util.h" 12 | 13 | #define GET_COMP(X, Y, Z, DIR) \ 14 | gridOld[(Z) * dimX * dimY + (Y) * dimX + (X)].DIR 15 | 16 | #define SET_COMP(DIR) \ 17 | gridNew[(z) * dimX * dimY + (y) * dimX + (x)].DIR 18 | 19 | __global__ void update_lbm_object_oriented(int dimX, int dimY, int dimZ, CellLBM *gridOld, CellLBM *gridNew) 20 | { 21 | int x = blockIdx.x * blockDim.x + threadIdx.x + 2; 22 | int y = blockIdx.y * blockDim.y + threadIdx.y + 2; 23 | int z = 2; 24 | 25 | #pragma unroll 10 26 | for (; z < (dimZ - 2); z += 1) { 27 | 28 | #define SQR(X) ((X)*(X)) 29 | const double omega = 1.0/1.7; 30 | const double omega_trm = 1.0 - omega; 31 | const double omega_w0 = 3.0 * 1.0 / 3.0 * omega; 32 | const double omega_w1 = 3.0*1.0/18.0*omega; 33 | const double omega_w2 = 3.0*1.0/36.0*omega; 34 | const double one_third = 1.0 / 3.0; 35 | double velX, velY, velZ; 36 | 37 | velX = 38 | GET_COMP(x-1,y,z,E) + GET_COMP(x-1,y-1,z,NE) + 39 | GET_COMP(x-1,y+1,z,SE) + GET_COMP(x-1,y,z-1,TE) + 40 | GET_COMP(x-1,y,z+1,BE); 41 | velY = GET_COMP(x,y-1,z,N) + GET_COMP(x+1,y-1,z,NW) + 42 | GET_COMP(x,y-1,z-1,TN) + GET_COMP(x,y-1,z+1,BN); 43 | velZ = GET_COMP(x,y,z-1,T) + GET_COMP(x,y+1,z-1,TS) + 44 | GET_COMP(x+1,y,z-1,TW); 45 | 46 | const double rho = 47 | GET_COMP(x,y,z,C) + GET_COMP(x,y+1,z,S) + 48 | GET_COMP(x+1,y,z,W) + GET_COMP(x,y,z+1,B) + 49 | GET_COMP(x+1,y+1,z,SW) + GET_COMP(x,y+1,z+1,BS) + 50 | GET_COMP(x+1,y,z+1,BW) + velX + velY + velZ; 51 | velX = velX 52 | - GET_COMP(x+1,y,z,W) - GET_COMP(x+1,y-1,z,NW) 53 | - GET_COMP(x+1,y+1,z,SW) - GET_COMP(x+1,y,z-1,TW) 54 | - GET_COMP(x+1,y,z+1,BW); 55 | velY = velY 56 | + GET_COMP(x-1,y-1,z,NE) - GET_COMP(x,y+1,z,S) 57 | - GET_COMP(x+1,y+1,z,SW) - GET_COMP(x-1,y+1,z,SE) 58 | - GET_COMP(x,y+1,z-1,TS) - GET_COMP(x,y+1,z+1,BS); 59 | velZ = velZ+GET_COMP(x,y-1,z-1,TN) + GET_COMP(x-1,y,z-1,TE) - GET_COMP(x,y,z+1,B) - GET_COMP(x,y-1,z+1,BN) - GET_COMP(x,y+1,z+1,BS) - GET_COMP(x+1,y,z+1,BW) - GET_COMP(x-1,y,z+1,BE); 60 | 61 | // density = rho; 62 | // velocityX = velX; 63 | // velocityY = velY; 64 | // velocityZ = velZ; 65 | 66 | const double dir_indep_trm = one_third*rho - 0.5*( velX*velX + velY*velY + velZ*velZ ); 67 | 68 | SET_COMP(C)=omega_trm * GET_COMP(x,y,z,C) + omega_w0*( dir_indep_trm ); 69 | 70 | SET_COMP(NW)=omega_trm * GET_COMP(x+1,y-1,z,NW) + 71 | omega_w2*( dir_indep_trm - ( velX-velY ) + 1.5*SQR( velX-velY ) ); 72 | SET_COMP(SE)=omega_trm * GET_COMP(x-1,y+1,z,SE) + 73 | omega_w2*( dir_indep_trm + ( velX-velY ) + 1.5*SQR( velX-velY ) ); 74 | SET_COMP(NE)=omega_trm * GET_COMP(x-1,y-1,z,NE) + 75 | omega_w2*( dir_indep_trm + ( velX+velY ) + 1.5*SQR( velX+velY ) ); 76 | SET_COMP(SW)=omega_trm * GET_COMP(x+1,y+1,z,SW) + 77 | omega_w2*( dir_indep_trm - ( velX+velY ) + 1.5*SQR( velX+velY ) ); 78 | 79 | SET_COMP(TW)=omega_trm * GET_COMP(x+1,y,z-1,TW) + omega_w2*( dir_indep_trm - ( velX-velZ ) + 1.5*SQR( velX-velZ ) ); 80 | SET_COMP(BE)=omega_trm * GET_COMP(x-1,y,z+1,BE) + omega_w2*( dir_indep_trm + ( velX-velZ ) + 1.5*SQR( velX-velZ ) ); 81 | SET_COMP(TE)=omega_trm * GET_COMP(x-1,y,z-1,TE) + omega_w2*( dir_indep_trm + ( velX+velZ ) + 1.5*SQR( velX+velZ ) ); 82 | SET_COMP(BW)=omega_trm * GET_COMP(x+1,y,z+1,BW) + omega_w2*( dir_indep_trm - ( velX+velZ ) + 1.5*SQR( velX+velZ ) ); 83 | 84 | SET_COMP(TS)=omega_trm * GET_COMP(x,y+1,z-1,TS) + omega_w2*( dir_indep_trm - ( velY-velZ ) + 1.5*SQR( velY-velZ ) ); 85 | SET_COMP(BN)=omega_trm * GET_COMP(x,y-1,z+1,BN) + 
omega_w2*( dir_indep_trm + ( velY-velZ ) + 1.5*SQR( velY-velZ ) ); 86 | SET_COMP(TN)=omega_trm * GET_COMP(x,y-1,z-1,TN) + omega_w2*( dir_indep_trm + ( velY+velZ ) + 1.5*SQR( velY+velZ ) ); 87 | SET_COMP(BS)=omega_trm * GET_COMP(x,y+1,z+1,BS) + omega_w2*( dir_indep_trm - ( velY+velZ ) + 1.5*SQR( velY+velZ ) ); 88 | 89 | SET_COMP(N)=omega_trm * GET_COMP(x,y-1,z,N) + omega_w1*( dir_indep_trm + velY + 1.5*SQR(velY)); 90 | SET_COMP(S)=omega_trm * GET_COMP(x,y+1,z,S) + omega_w1*( dir_indep_trm - velY + 1.5*SQR(velY)); 91 | SET_COMP(E)=omega_trm * GET_COMP(x-1,y,z,E) + omega_w1*( dir_indep_trm + velX + 1.5*SQR(velX)); 92 | SET_COMP(W)=omega_trm * GET_COMP(x+1,y,z,W) + omega_w1*( dir_indep_trm - velX + 1.5*SQR(velX)); 93 | SET_COMP(T)=omega_trm * GET_COMP(x,y,z-1,T) + omega_w1*( dir_indep_trm + velZ + 1.5*SQR(velZ)); 94 | SET_COMP(B)=omega_trm * GET_COMP(x,y,z+1,B) + omega_w1*( dir_indep_trm - velZ + 1.5*SQR(velZ)); 95 | } 96 | } 97 | 98 | #undef GET_COMP 99 | #undef SET_COMP 100 | 101 | class benchmark_lbm_cuda_object_oriented : public benchmark_lbm_cuda_basic 102 | { 103 | public: 104 | virtual std::string name() 105 | { 106 | return "lbm_cuda_object_oriented"; 107 | } 108 | 109 | protected: 110 | void update(dim3 dimGrid, dim3 dimBlock, int dimX, int dimY, int dimZ, double *devGridOld, double *devGridNew) 111 | { 112 | update_lbm_object_oriented<<>>( 113 | dimX, dimY, dimZ, 114 | reinterpret_cast(devGridOld), 115 | reinterpret_cast(devGridNew)); 116 | } 117 | }; 118 | 119 | #endif 120 | -------------------------------------------------------------------------------- /include/libflatarray/loop_peeler.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Andreas Schäfer 3 | * Copyright 2018 Google 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_LOOP_PEELER_HPP 10 | #define FLAT_ARRAY_LOOP_PEELER_HPP 11 | 12 | #include 13 | #include 14 | 15 | #ifdef _MSC_BUILD 16 | /** 17 | * This is a shim to ease handling of unaligned or not vectorizable 18 | * iterations at the begin/end of loops. It will invoke FUNCTION with 19 | * a suitable variant of SHORT_VEC (with its arity adjusted) to that 20 | * the main chunk of the iterations will be running with full 21 | * vectorization (as given by SHORT_VEC) and only the initial 22 | * (possibly unaligned) and trailing (less than SHORT_VEC's arity) 23 | * iterations will be done with an arity of 1 (i.e. scalar). 24 | * 25 | * X is expected to be increased by FUNCTION (e.g. by passing it via 26 | * reference). 27 | */ 28 | #define LIBFLATARRAY_LOOP_PEELER(SHORT_VEC_TYPE, COUNTER_TYPE, \ 29 | X, END_X, FUNCTION, ...) \ 30 | __pragma( warning( push ) ) \ 31 | __pragma( warning( disable : 4710 4711 ) ) \ 32 | LIBFLATARRAY_LOOP_PEELER_IMPLEMENTATION( \ 33 | , SHORT_VEC_TYPE, COUNTER_TYPE, X, END_X, FUNCTION, __VA_ARGS__) \ 34 | __pragma( warning( pop ) ) 35 | #else 36 | /** 37 | * This is a shim to ease handling of unaligned or not vectorizable 38 | * iterations at the begin/end of loops. It will invoke FUNCTION with 39 | * a suitable variant of SHORT_VEC (with its arity adjusted) to that 40 | * the main chunk of the iterations will be running with full 41 | * vectorization (as given by SHORT_VEC) and only the initial 42 | * (possibly unaligned) and trailing (less than SHORT_VEC's arity) 43 | * iterations will be done with an arity of 1 (i.e. scalar). 
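 *
 * A minimal usage sketch (my_scale, data and factor are made-up names;
 * the macro arguments follow the parameter list above, and the callback
 * takes the loop counter by reference as described below):
 *
 *   template<typename FLOAT_VEC>
 *   void my_scale(long& i, long end, double *data, double factor)
 *   {
 *       for (; i < end; i += FLOAT_VEC::ARITY) {
 *           FLOAT_VEC buf(&data[i]);
 *           buf = buf * factor;
 *           &data[i] << buf;
 *       }
 *   }
 *
 *   long x = 0;
 *   LIBFLATARRAY_LOOP_PEELER(
 *       LibFlatArray::short_vec<double, 8>, long, x, 1000, my_scale,
 *       data, 2.5);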
44 | * 45 | * X is expected to be increased by FUNCTION (e.g. by passing it via 46 | * reference). 47 | */ 48 | #define LIBFLATARRAY_LOOP_PEELER(SHORT_VEC_TYPE, COUNTER_TYPE, \ 49 | X, END_X, FUNCTION, ...) \ 50 | LIBFLATARRAY_LOOP_PEELER_IMPLEMENTATION( \ 51 | , SHORT_VEC_TYPE, COUNTER_TYPE, X, END_X, FUNCTION, __VA_ARGS__) 52 | #endif 53 | 54 | #ifdef _MSC_BUILD 55 | /** 56 | * Same as LIBFLATARRAY_LOOP_PEELER(), but for use in templates 57 | */ 58 | #define LIBFLATARRAY_LOOP_PEELER_TEMPLATE(SHORT_VEC_TYPE, COUNTER_TYPE, \ 59 | X, END_X, FUNCTION, ...) \ 60 | __pragma( warning( push ) ) \ 61 | __pragma( warning( disable : 4710 4711 ) ) \ 62 | LIBFLATARRAY_LOOP_PEELER_IMPLEMENTATION( \ 63 | typename, SHORT_VEC_TYPE, COUNTER_TYPE, X, END_X, FUNCTION, __VA_ARGS__) \ 64 | __pragma( warning( pop ) ) 65 | #else 66 | /** 67 | * Same as LIBFLATARRAY_LOOP_PEELER(), but for use in templates 68 | */ 69 | #define LIBFLATARRAY_LOOP_PEELER_TEMPLATE(SHORT_VEC_TYPE, COUNTER_TYPE, \ 70 | X, END_X, FUNCTION, ...) \ 71 | LIBFLATARRAY_LOOP_PEELER_IMPLEMENTATION( \ 72 | typename, SHORT_VEC_TYPE, COUNTER_TYPE, X, END_X, FUNCTION, __VA_ARGS__) 73 | #endif 74 | 75 | #define LIBFLATARRAY_LOOP_PEELER_IMPLEMENTATION( \ 76 | TYPENAME, SHORT_VEC_TYPE, COUNTER_TYPE, X, END_X, FUNCTION, ...) \ 77 | { \ 78 | typedef SHORT_VEC_TYPE lfa_local_short_vec; \ 79 | typedef TYPENAME LibFlatArray::detail::flat_array:: \ 80 | sibling_short_vec_switch::VALUE \ 81 | lfa_local_scalar; \ 82 | \ 83 | COUNTER_TYPE remainder = (X) % \ 84 | COUNTER_TYPE(lfa_local_short_vec::ARITY); \ 85 | COUNTER_TYPE next_stop = remainder ? \ 86 | (X) + COUNTER_TYPE(lfa_local_short_vec::ARITY) - remainder : \ 87 | (X); \ 88 | COUNTER_TYPE last_stop = (END_X) - \ 89 | (END_X) % COUNTER_TYPE(lfa_local_short_vec::ARITY); \ 90 | \ 91 | FUNCTION(X, next_stop, __VA_ARGS__); \ 92 | FUNCTION(X, last_stop, __VA_ARGS__); \ 93 | FUNCTION(X, (END_X), __VA_ARGS__); \ 94 | } 95 | 96 | #ifdef LIBFLATARRAY_WITH_CPP14 97 | 98 | namespace LibFlatArray { 99 | 100 | template 101 | void loop_peeler(COUNTER_TYPE1 *counter, const COUNTER_TYPE2& end, const LAMBDA& lambda) 102 | { 103 | typedef SHORT_VEC_TYPE lfa_local_short_vec; 104 | typedef typename detail::flat_array:: 105 | sibling_short_vec_switch::VALUE 106 | lfa_local_scalar; 107 | 108 | COUNTER_TYPE2 remainder = (*counter) % COUNTER_TYPE2(lfa_local_short_vec::ARITY); 109 | COUNTER_TYPE2 next_stop = remainder ? 110 | (*counter) + COUNTER_TYPE2(lfa_local_short_vec::ARITY) - remainder : 111 | (*counter); 112 | COUNTER_TYPE2 last_stop = end - end % COUNTER_TYPE2(lfa_local_short_vec::ARITY); 113 | 114 | lambda(lfa_local_scalar(), counter, next_stop); 115 | lambda(lfa_local_short_vec(), counter, last_stop); 116 | lambda(lfa_local_scalar(), counter, end ); 117 | } 118 | 119 | } 120 | 121 | #endif 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /include/libflatarray/detail/short_vec_mic_double_8.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015 Kurt Kanzenbach 3 | * Copyright 2016-2017 Andreas Schäfer 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. 
(See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_MIC_DOUBLE_8_HPP 10 | #define FLAT_ARRAY_DETAIL_SHORT_VEC_MIC_DOUBLE_8_HPP 11 | 12 | #if LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_MIC 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | // disable certain warnings from system headers when compiling with 19 | // Microsoft Visual Studio: 20 | #ifdef _MSC_BUILD 21 | #pragma warning( push ) 22 | #pragma warning( disable : 4514 ) 23 | #endif 24 | 25 | #include 26 | #ifdef LIBFLATARRAY_WITH_CPP14 27 | #include 28 | #endif 29 | 30 | #ifdef _MSC_BUILD 31 | #pragma warning( pop ) 32 | #endif 33 | 34 | namespace LibFlatArray { 35 | 36 | template 37 | class short_vec; 38 | 39 | #ifdef __ICC 40 | // disabling this warning as implicit type conversion is exactly our goal here: 41 | #pragma warning push 42 | #pragma warning (disable: 2304) 43 | #endif 44 | 45 | template<> 46 | class short_vec 47 | { 48 | public: 49 | static const std::size_t ARITY = 8; 50 | 51 | typedef short_vec_strategy::mic strategy; 52 | 53 | template 54 | friend std::basic_ostream<_CharT, _Traits>& operator<<( 55 | std::basic_ostream<_CharT, _Traits>& __os, 56 | const short_vec& vec); 57 | 58 | inline 59 | short_vec(const double data = 0) : 60 | val(_mm512_set1_pd(data)) 61 | {} 62 | 63 | inline 64 | short_vec(const double *data) 65 | { 66 | load(data); 67 | } 68 | 69 | inline 70 | short_vec(const __m512d& val) : 71 | val(val) 72 | {} 73 | 74 | #ifdef LIBFLATARRAY_WITH_CPP14 75 | inline 76 | short_vec(const std::initializer_list& il) 77 | { 78 | const double *ptr = static_cast(&(*il.begin())); 79 | load(ptr); 80 | } 81 | #endif 82 | 83 | inline 84 | void operator-=(const short_vec& other) 85 | { 86 | val = _mm512_sub_pd(val, other.val); 87 | } 88 | 89 | inline 90 | short_vec operator-(const short_vec& other) const 91 | { 92 | return short_vec( 93 | _mm512_sub_pd(val, other.val)); 94 | } 95 | 96 | inline 97 | void operator+=(const short_vec& other) 98 | { 99 | val = _mm512_add_pd(val, other.val); 100 | } 101 | 102 | inline 103 | short_vec operator+(const short_vec& other) const 104 | { 105 | return short_vec( 106 | _mm512_add_pd(val, other.val)); 107 | } 108 | 109 | inline 110 | void operator*=(const short_vec& other) 111 | { 112 | val = _mm512_mul_pd(val, other.val); 113 | } 114 | 115 | inline 116 | short_vec operator*(const short_vec& other) const 117 | { 118 | return short_vec( 119 | _mm512_mul_pd(val, other.val)); 120 | } 121 | 122 | inline 123 | void operator/=(const short_vec& other) 124 | { 125 | val = _mm512_div_pd(val, other.val); 126 | } 127 | 128 | inline 129 | short_vec operator/(const short_vec& other) const 130 | { 131 | return short_vec( 132 | _mm512_div_pd(val, other.val)); 133 | } 134 | 135 | inline 136 | short_vec sqrt() const 137 | { 138 | return short_vec( 139 | _mm512_sqrt_pd(val)); 140 | } 141 | 142 | inline 143 | void load(const double *data) 144 | { 145 | val = _mm512_loadunpacklo_pd(val, data + 0); 146 | val = _mm512_loadunpackhi_pd(val, data + 8); 147 | } 148 | 149 | inline 150 | void load_aligned(const double *data) 151 | { 152 | SHORTVEC_ASSERT_ALIGNED(data, 64); 153 | val = _mm512_load_pd(data); 154 | } 155 | 156 | inline 157 | void store(double *data) const 158 | { 159 | _mm512_packstorelo_pd(data + 0, val); 160 | _mm512_packstorehi_pd(data + 8, val); 161 | } 162 | 163 | inline 164 | void store_aligned(double *data) const 165 | { 166 | SHORTVEC_ASSERT_ALIGNED(data, 64); 167 | _mm512_store_pd(data, val); 168 | 
} 169 | 170 | inline 171 | void store_nt(double *data) const 172 | { 173 | SHORTVEC_ASSERT_ALIGNED(data, 64); 174 | _mm512_storenr_pd(data, val); 175 | } 176 | 177 | inline 178 | void gather(const double *ptr, const int *offsets) 179 | { 180 | __m512i indices; 181 | indices = _mm512_loadunpacklo_epi32(indices, offsets); 182 | val = _mm512_i32logather_pd(indices, ptr, 8); 183 | } 184 | 185 | inline 186 | void scatter(double *ptr, const int *offsets) const 187 | { 188 | __m512i indices; 189 | indices = _mm512_loadunpacklo_epi32(indices, offsets); 190 | _mm512_i32loscatter_pd(ptr, indices, val, 8); 191 | } 192 | 193 | private: 194 | __m512d val; 195 | }; 196 | 197 | inline 198 | void operator<<(double *data, const short_vec& vec) 199 | { 200 | vec.store(data); 201 | } 202 | 203 | #ifdef __ICC 204 | #pragma warning pop 205 | #endif 206 | 207 | inline 208 | short_vec sqrt(const short_vec& vec) 209 | { 210 | return vec.sqrt(); 211 | } 212 | 213 | template 214 | std::basic_ostream<_CharT, _Traits>& 215 | operator<<(std::basic_ostream<_CharT, _Traits>& __os, 216 | const short_vec& vec) 217 | { 218 | const double *data1 = reinterpret_cast(&vec.val); 219 | 220 | __os << "[" << data1[0] << ", " << data1[1] << ", " << data1[2] << ", " << data1[3] 221 | << ", " << data1[4] << ", " << data1[5] << ", " << data1[6] << ", " << data1[7] 222 | << "]"; 223 | return __os; 224 | } 225 | 226 | } 227 | 228 | #endif 229 | 230 | #endif 231 | -------------------------------------------------------------------------------- /include/libflatarray/detail/short_vec_scalar_int_2.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015 Kurt Kanzenbach 3 | * Copyright 2016-2017 Andreas Schäfer 4 | * 5 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 6 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_SCALAR_INT_2_HPP 10 | #define FLAT_ARRAY_DETAIL_SHORT_VEC_SCALAR_INT_2_HPP 11 | 12 | #include 13 | #include 14 | 15 | // disable certain warnings from system headers when compiling with 16 | // Microsoft Visual Studio: 17 | #ifdef _MSC_BUILD 18 | #pragma warning( push ) 19 | #pragma warning( disable : 4514 ) 20 | #endif 21 | 22 | #ifdef LIBFLATARRAY_WITH_CPP14 23 | #include 24 | #endif 25 | 26 | #ifdef _MSC_BUILD 27 | #pragma warning( pop ) 28 | #endif 29 | 30 | namespace LibFlatArray { 31 | 32 | template 33 | class short_vec; 34 | 35 | #ifdef __ICC 36 | // disabling this warning as implicit type conversion is exactly our goal here: 37 | #pragma warning push 38 | #pragma warning (disable: 2304) 39 | #endif 40 | 41 | // Don't warn about these functions being stripped from an executable 42 | // as they're not being used, that's actually expected behavior. 
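// A hedged usage sketch that applies to this scalar specialization just
// as to the intrinsics-based variants (the array contents are made up,
// while load-from-pointer, the implicit scalar conversion mentioned
// above and the free operator<< are all defined in this header):
//
//   int data[2] = {3, 4};
//   LibFlatArray::short_vec<int, 2> a(data);    // load from pointer
//   LibFlatArray::short_vec<int, 2> b = a * 5;  // 5 converts implicitly
//   data << b;                                  // store via operator<<
//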
43 | #ifdef _MSC_BUILD 44 | #pragma warning( push ) 45 | #pragma warning( disable : 4514 ) 46 | #endif 47 | 48 | template<> 49 | class short_vec : public short_vec_base 50 | { 51 | public: 52 | static const std::size_t ARITY = 2; 53 | 54 | typedef short_vec_strategy::scalar strategy; 55 | 56 | template 57 | friend std::basic_ostream<_CharT, _Traits>& operator<<( 58 | std::basic_ostream<_CharT, _Traits>& __os, 59 | const short_vec& vec); 60 | 61 | inline 62 | short_vec(const int data = 0) : 63 | val{data, data} 64 | {} 65 | 66 | inline 67 | short_vec(const int *data) 68 | { 69 | load(data); 70 | } 71 | 72 | inline 73 | short_vec(const int val1, const int val2) : 74 | val{val1, 75 | val2} 76 | {} 77 | 78 | #ifdef LIBFLATARRAY_WITH_CPP14 79 | inline 80 | short_vec(const std::initializer_list& il) 81 | { 82 | const int *ptr = static_cast(&(*il.begin())); 83 | load(ptr); 84 | } 85 | #endif 86 | 87 | inline 88 | void operator-=(const short_vec& other) 89 | { 90 | val[ 0] -= other.val[ 0]; 91 | val[ 1] -= other.val[ 1]; 92 | } 93 | 94 | inline 95 | short_vec operator-(const short_vec& other) const 96 | { 97 | return short_vec( 98 | val[ 0] - other.val[ 0], 99 | val[ 1] - other.val[ 1]); 100 | } 101 | 102 | inline 103 | void operator+=(const short_vec& other) 104 | { 105 | val[ 0] += other.val[ 0]; 106 | val[ 1] += other.val[ 1]; 107 | } 108 | 109 | inline 110 | short_vec operator+(const short_vec& other) const 111 | { 112 | return short_vec( 113 | val[ 0] + other.val[ 0], 114 | val[ 1] + other.val[ 1]); 115 | } 116 | 117 | inline 118 | void operator*=(const short_vec& other) 119 | { 120 | val[ 0] *= other.val[ 0]; 121 | val[ 1] *= other.val[ 1]; 122 | } 123 | 124 | inline 125 | short_vec operator*(const short_vec& other) const 126 | { 127 | return short_vec( 128 | val[ 0] * other.val[ 0], 129 | val[ 1] * other.val[ 1]); 130 | } 131 | 132 | inline 133 | void operator/=(const short_vec& other) 134 | { 135 | val[ 0] /= other.val[ 0]; 136 | val[ 1] /= other.val[ 1]; 137 | } 138 | 139 | inline 140 | short_vec operator/(const short_vec& other) const 141 | { 142 | return short_vec( 143 | val[ 0] / other.val[ 0], 144 | val[ 1] / other.val[ 1]); 145 | } 146 | 147 | inline 148 | short_vec sqrt() const 149 | { 150 | return short_vec( 151 | static_cast(std::sqrt(val[ 0])), 152 | static_cast(std::sqrt(val[ 1]))); 153 | } 154 | 155 | inline 156 | void load(const int *data) 157 | { 158 | val[ 0] = data[0]; 159 | val[ 1] = data[1]; 160 | } 161 | 162 | inline 163 | void load_aligned(const int *data) 164 | { 165 | load(data); 166 | } 167 | 168 | inline 169 | void store(int *data) const 170 | { 171 | *(data + 0) = val[ 0]; 172 | *(data + 1) = val[ 1]; 173 | } 174 | 175 | inline 176 | void store_aligned(int *data) const 177 | { 178 | store(data); 179 | } 180 | 181 | inline 182 | void store_nt(int *data) const 183 | { 184 | store(data); 185 | } 186 | 187 | inline 188 | void gather(const int *ptr, const int *offsets) 189 | { 190 | val[ 0] = ptr[offsets[0]]; 191 | val[ 1] = ptr[offsets[1]]; 192 | } 193 | 194 | inline 195 | void scatter(int *ptr, const int *offsets) const 196 | { 197 | ptr[offsets[0]] = val[ 0]; 198 | ptr[offsets[1]] = val[ 1]; 199 | } 200 | 201 | private: 202 | int val[2]; 203 | }; 204 | 205 | inline 206 | void operator<<(int *data, const short_vec& vec) 207 | { 208 | vec.store(data); 209 | } 210 | 211 | #ifdef __ICC 212 | #pragma warning pop 213 | #endif 214 | 215 | inline 216 | short_vec sqrt(const short_vec& vec) 217 | { 218 | return vec.sqrt(); 219 | } 220 | 221 | // not inlining is ok, as is 
inlining: 222 | #ifdef _MSC_BUILD 223 | #pragma warning( push ) 224 | #pragma warning( disable : 4710 4711 ) 225 | #endif 226 | 227 | template 228 | inline 229 | std::basic_ostream<_CharT, _Traits>& 230 | operator<<(std::basic_ostream<_CharT, _Traits>& __os, 231 | const short_vec& vec) 232 | { 233 | __os << "[" << vec.val[ 0] << ", " << vec.val[ 1] 234 | << "]"; 235 | return __os; 236 | } 237 | 238 | #ifdef _MSC_BUILD 239 | #pragma warning( pop ) 240 | #endif 241 | 242 | } 243 | 244 | #ifdef _MSC_BUILD 245 | #pragma warning( pop ) 246 | #endif 247 | 248 | #endif 249 | -------------------------------------------------------------------------------- /examples/lbm/update_lbm_classic.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_CLASSIC_H 2 | #define LIBFLATARRAY_EXAMPLES_LBM_UPDATE_LBM_CLASSIC_H 3 | 4 | /** 5 | * Copyright 2013 Andreas Schäfer 6 | * 7 | * Distributed under the Boost Software License, Version 1.0. (See accompanying 8 | * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 9 | */ 10 | 11 | #include "util.h" 12 | 13 | #define C 0 14 | #define N 1 15 | #define E 2 16 | #define W 3 17 | #define S 4 18 | #define T 5 19 | #define B 6 20 | 21 | #define NW 7 22 | #define SW 8 23 | #define NE 9 24 | #define SE 10 25 | 26 | #define TW 11 27 | #define BW 12 28 | #define TE 13 29 | #define BE 14 30 | 31 | #define TN 15 32 | #define BN 16 33 | #define TS 17 34 | #define BS 18 35 | 36 | #define GET_COMP(X, Y, Z, DIR) \ 37 | gridOld[(Z) * dimX * dimY + (Y) * dimX + (X) + (DIR) * dimX * dimY * dimZ] 38 | 39 | #define SET_COMP(DIR) \ 40 | gridNew[z * dimX * dimY + y * dimX + x + (DIR) * dimX * dimY * dimZ] 41 | 42 | __global__ void update_lbm_classic(int dimX, int dimY, int dimZ, double *gridOld, double *gridNew) 43 | { 44 | int x = blockIdx.x * blockDim.x + threadIdx.x + 2; 45 | int y = blockIdx.y * blockDim.y + threadIdx.y + 2; 46 | int z = 2; 47 | 48 | #pragma unroll 10 49 | for (; z < (dimZ - 2); z += 1) { 50 | 51 | #define SQR(X) ((X)*(X)) 52 | const double omega = 1.0/1.7; 53 | const double omega_trm = 1.0 - omega; 54 | const double omega_w0 = 3.0 * 1.0 / 3.0 * omega; 55 | const double omega_w1 = 3.0*1.0/18.0*omega; 56 | const double omega_w2 = 3.0*1.0/36.0*omega; 57 | const double one_third = 1.0 / 3.0; 58 | double velX, velY, velZ; 59 | 60 | velX = 61 | GET_COMP(x-1,y,z,E) + GET_COMP(x-1,y-1,z,NE) + 62 | GET_COMP(x-1,y+1,z,SE) + GET_COMP(x-1,y,z-1,TE) + 63 | GET_COMP(x-1,y,z+1,BE); 64 | velY = GET_COMP(x,y-1,z,N) + GET_COMP(x+1,y-1,z,NW) + 65 | GET_COMP(x,y-1,z-1,TN) + GET_COMP(x,y-1,z+1,BN); 66 | velZ = GET_COMP(x,y,z-1,T) + GET_COMP(x,y+1,z-1,TS) + 67 | GET_COMP(x+1,y,z-1,TW); 68 | 69 | const double rho = 70 | GET_COMP(x,y,z,C) + GET_COMP(x,y+1,z,S) + 71 | GET_COMP(x+1,y,z,W) + GET_COMP(x,y,z+1,B) + 72 | GET_COMP(x+1,y+1,z,SW) + GET_COMP(x,y+1,z+1,BS) + 73 | GET_COMP(x+1,y,z+1,BW) + velX + velY + velZ; 74 | velX = velX 75 | - GET_COMP(x+1,y,z,W) - GET_COMP(x+1,y-1,z,NW) 76 | - GET_COMP(x+1,y+1,z,SW) - GET_COMP(x+1,y,z-1,TW) 77 | - GET_COMP(x+1,y,z+1,BW); 78 | velY = velY 79 | + GET_COMP(x-1,y-1,z,NE) - GET_COMP(x,y+1,z,S) 80 | - GET_COMP(x+1,y+1,z,SW) - GET_COMP(x-1,y+1,z,SE) 81 | - GET_COMP(x,y+1,z-1,TS) - GET_COMP(x,y+1,z+1,BS); 82 | velZ = velZ+GET_COMP(x,y-1,z-1,TN) + GET_COMP(x-1,y,z-1,TE) - GET_COMP(x,y,z+1,B) - GET_COMP(x,y-1,z+1,BN) - GET_COMP(x,y+1,z+1,BS) - GET_COMP(x+1,y,z+1,BW) - GET_COMP(x-1,y,z+1,BE); 83 | 84 | // density = rho; 85 | // velocityX = velX; 86 | // velocityY = 
velY; 87 | // velocityZ = velZ; 88 | 89 | const double dir_indep_trm = one_third*rho - 0.5*( velX*velX + velY*velY + velZ*velZ ); 90 | 91 | SET_COMP(C)=omega_trm * GET_COMP(x,y,z,C) + omega_w0*( dir_indep_trm ); 92 | 93 | SET_COMP(NW)=omega_trm * GET_COMP(x+1,y-1,z,NW) + 94 | omega_w2*( dir_indep_trm - ( velX-velY ) + 1.5*SQR( velX-velY ) ); 95 | SET_COMP(SE)=omega_trm * GET_COMP(x-1,y+1,z,SE) + 96 | omega_w2*( dir_indep_trm + ( velX-velY ) + 1.5*SQR( velX-velY ) ); 97 | SET_COMP(NE)=omega_trm * GET_COMP(x-1,y-1,z,NE) + 98 | omega_w2*( dir_indep_trm + ( velX+velY ) + 1.5*SQR( velX+velY ) ); 99 | SET_COMP(SW)=omega_trm * GET_COMP(x+1,y+1,z,SW) + 100 | omega_w2*( dir_indep_trm - ( velX+velY ) + 1.5*SQR( velX+velY ) ); 101 | 102 | SET_COMP(TW)=omega_trm * GET_COMP(x+1,y,z-1,TW) + omega_w2*( dir_indep_trm - ( velX-velZ ) + 1.5*SQR( velX-velZ ) ); 103 | SET_COMP(BE)=omega_trm * GET_COMP(x-1,y,z+1,BE) + omega_w2*( dir_indep_trm + ( velX-velZ ) + 1.5*SQR( velX-velZ ) ); 104 | SET_COMP(TE)=omega_trm * GET_COMP(x-1,y,z-1,TE) + omega_w2*( dir_indep_trm + ( velX+velZ ) + 1.5*SQR( velX+velZ ) ); 105 | SET_COMP(BW)=omega_trm * GET_COMP(x+1,y,z+1,BW) + omega_w2*( dir_indep_trm - ( velX+velZ ) + 1.5*SQR( velX+velZ ) ); 106 | 107 | SET_COMP(TS)=omega_trm * GET_COMP(x,y+1,z-1,TS) + omega_w2*( dir_indep_trm - ( velY-velZ ) + 1.5*SQR( velY-velZ ) ); 108 | SET_COMP(BN)=omega_trm * GET_COMP(x,y-1,z+1,BN) + omega_w2*( dir_indep_trm + ( velY-velZ ) + 1.5*SQR( velY-velZ ) ); 109 | SET_COMP(TN)=omega_trm * GET_COMP(x,y-1,z-1,TN) + omega_w2*( dir_indep_trm + ( velY+velZ ) + 1.5*SQR( velY+velZ ) ); 110 | SET_COMP(BS)=omega_trm * GET_COMP(x,y+1,z+1,BS) + omega_w2*( dir_indep_trm - ( velY+velZ ) + 1.5*SQR( velY+velZ ) ); 111 | 112 | SET_COMP(N)=omega_trm * GET_COMP(x,y-1,z,N) + omega_w1*( dir_indep_trm + velY + 1.5*SQR(velY)); 113 | SET_COMP(S)=omega_trm * GET_COMP(x,y+1,z,S) + omega_w1*( dir_indep_trm - velY + 1.5*SQR(velY)); 114 | SET_COMP(E)=omega_trm * GET_COMP(x-1,y,z,E) + omega_w1*( dir_indep_trm + velX + 1.5*SQR(velX)); 115 | SET_COMP(W)=omega_trm * GET_COMP(x+1,y,z,W) + omega_w1*( dir_indep_trm - velX + 1.5*SQR(velX)); 116 | SET_COMP(T)=omega_trm * GET_COMP(x,y,z-1,T) + omega_w1*( dir_indep_trm + velZ + 1.5*SQR(velZ)); 117 | SET_COMP(B)=omega_trm * GET_COMP(x,y,z+1,B) + omega_w1*( dir_indep_trm - velZ + 1.5*SQR(velZ)); 118 | } 119 | } 120 | 121 | #undef GET_COMP 122 | #undef SET_COMP 123 | #undef SQR 124 | 125 | #undef C 126 | #undef N 127 | #undef E 128 | #undef W 129 | #undef S 130 | #undef T 131 | #undef B 132 | 133 | #undef NW 134 | #undef SW 135 | #undef NE 136 | #undef SE 137 | 138 | #undef TW 139 | #undef BW 140 | #undef TE 141 | #undef BE 142 | 143 | #undef TN 144 | #undef BN 145 | #undef TS 146 | #undef BS 147 | 148 | class benchmark_lbm_cuda_classic : public benchmark_lbm_cuda_basic 149 | { 150 | public: 151 | virtual std::string name() 152 | { 153 | return "lbm_cuda_classic"; 154 | } 155 | 156 | protected: 157 | void update(dim3 dimGrid, dim3 dimBlock, int dimX, int dimY, int dimZ, double *devGridOld, double *devGridNew) 158 | { 159 | update_lbm_classic<<>>(dimX, dimY, dimZ, devGridOld, devGridNew); 160 | } 161 | }; 162 | 163 | #endif 164 | -------------------------------------------------------------------------------- /examples/lbm/flatarray_implementation_0.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cudalineupdatefunctorprototype.h" 3 | 4 | IMPLEMENTATION(CellLBM, 32, 32, 32, 32, 32, 32) 5 | IMPLEMENTATION(CellLBM, 32, 32, 64, 
32, 32, 64) 6 | IMPLEMENTATION(CellLBM, 32, 32, 128, 32, 32, 128) 7 | IMPLEMENTATION(CellLBM, 32, 32, 136, 32, 32, 136) 8 | IMPLEMENTATION(CellLBM, 32, 32, 192, 32, 32, 192) 9 | IMPLEMENTATION(CellLBM, 32, 32, 200, 32, 32, 200) 10 | IMPLEMENTATION(CellLBM, 32, 32, 256, 32, 32, 256) 11 | IMPLEMENTATION(CellLBM, 32, 32, 264, 32, 32, 264) 12 | IMPLEMENTATION(CellLBM, 32, 32, 512, 32, 32, 512) 13 | IMPLEMENTATION(CellLBM, 32, 32, 520, 32, 32, 520) 14 | IMPLEMENTATION(CellLBM, 32, 32, 1032, 32, 32, 1032) 15 | IMPLEMENTATION(CellLBM, 32, 64, 32, 32, 64, 32) 16 | IMPLEMENTATION(CellLBM, 32, 64, 64, 32, 64, 64) 17 | IMPLEMENTATION(CellLBM, 32, 64, 128, 32, 64, 128) 18 | IMPLEMENTATION(CellLBM, 32, 64, 136, 32, 64, 136) 19 | IMPLEMENTATION(CellLBM, 32, 64, 192, 32, 64, 192) 20 | IMPLEMENTATION(CellLBM, 32, 64, 200, 32, 64, 200) 21 | IMPLEMENTATION(CellLBM, 32, 64, 256, 32, 64, 256) 22 | IMPLEMENTATION(CellLBM, 32, 64, 264, 32, 64, 264) 23 | IMPLEMENTATION(CellLBM, 32, 64, 512, 32, 64, 512) 24 | IMPLEMENTATION(CellLBM, 32, 64, 520, 32, 64, 520) 25 | IMPLEMENTATION(CellLBM, 32, 64, 1032, 32, 64, 1032) 26 | IMPLEMENTATION(CellLBM, 32, 128, 32, 32, 128, 32) 27 | IMPLEMENTATION(CellLBM, 32, 128, 64, 32, 128, 64) 28 | IMPLEMENTATION(CellLBM, 32, 128, 128, 32, 128, 128) 29 | IMPLEMENTATION(CellLBM, 32, 128, 136, 32, 128, 136) 30 | IMPLEMENTATION(CellLBM, 32, 128, 192, 32, 128, 192) 31 | IMPLEMENTATION(CellLBM, 32, 128, 200, 32, 128, 200) 32 | IMPLEMENTATION(CellLBM, 32, 128, 256, 32, 128, 256) 33 | IMPLEMENTATION(CellLBM, 32, 128, 264, 32, 128, 264) 34 | IMPLEMENTATION(CellLBM, 32, 128, 512, 32, 128, 512) 35 | IMPLEMENTATION(CellLBM, 32, 128, 520, 32, 128, 520) 36 | IMPLEMENTATION(CellLBM, 32, 128, 1032, 32, 128, 1032) 37 | IMPLEMENTATION(CellLBM, 32, 136, 32, 32, 136, 32) 38 | IMPLEMENTATION(CellLBM, 32, 136, 64, 32, 136, 64) 39 | IMPLEMENTATION(CellLBM, 32, 136, 128, 32, 136, 128) 40 | IMPLEMENTATION(CellLBM, 32, 136, 136, 32, 136, 136) 41 | IMPLEMENTATION(CellLBM, 32, 136, 192, 32, 136, 192) 42 | IMPLEMENTATION(CellLBM, 32, 136, 200, 32, 136, 200) 43 | IMPLEMENTATION(CellLBM, 32, 136, 256, 32, 136, 256) 44 | IMPLEMENTATION(CellLBM, 32, 136, 264, 32, 136, 264) 45 | IMPLEMENTATION(CellLBM, 32, 136, 512, 32, 136, 512) 46 | IMPLEMENTATION(CellLBM, 32, 136, 520, 32, 136, 520) 47 | IMPLEMENTATION(CellLBM, 32, 136, 1032, 32, 136, 1032) 48 | IMPLEMENTATION(CellLBM, 32, 192, 32, 32, 192, 32) 49 | IMPLEMENTATION(CellLBM, 32, 192, 64, 32, 192, 64) 50 | IMPLEMENTATION(CellLBM, 32, 192, 128, 32, 192, 128) 51 | IMPLEMENTATION(CellLBM, 32, 192, 136, 32, 192, 136) 52 | IMPLEMENTATION(CellLBM, 32, 192, 192, 32, 192, 192) 53 | IMPLEMENTATION(CellLBM, 32, 192, 200, 32, 192, 200) 54 | IMPLEMENTATION(CellLBM, 32, 192, 256, 32, 192, 256) 55 | IMPLEMENTATION(CellLBM, 32, 192, 264, 32, 192, 264) 56 | IMPLEMENTATION(CellLBM, 32, 192, 512, 32, 192, 512) 57 | IMPLEMENTATION(CellLBM, 32, 192, 520, 32, 192, 520) 58 | IMPLEMENTATION(CellLBM, 32, 192, 1032, 32, 192, 1032) 59 | IMPLEMENTATION(CellLBM, 32, 200, 32, 32, 200, 32) 60 | IMPLEMENTATION(CellLBM, 32, 200, 64, 32, 200, 64) 61 | IMPLEMENTATION(CellLBM, 32, 200, 128, 32, 200, 128) 62 | IMPLEMENTATION(CellLBM, 32, 200, 136, 32, 200, 136) 63 | IMPLEMENTATION(CellLBM, 32, 200, 192, 32, 200, 192) 64 | IMPLEMENTATION(CellLBM, 32, 200, 200, 32, 200, 200) 65 | IMPLEMENTATION(CellLBM, 32, 200, 256, 32, 200, 256) 66 | IMPLEMENTATION(CellLBM, 32, 200, 264, 32, 200, 264) 67 | IMPLEMENTATION(CellLBM, 32, 200, 512, 32, 200, 512) 68 | IMPLEMENTATION(CellLBM, 32, 200, 520, 32, 200, 520) 69 | 
IMPLEMENTATION(CellLBM, 32, 200, 1032, 32, 200, 1032) 70 | IMPLEMENTATION(CellLBM, 32, 256, 32, 32, 256, 32) 71 | IMPLEMENTATION(CellLBM, 32, 256, 64, 32, 256, 64) 72 | IMPLEMENTATION(CellLBM, 32, 256, 128, 32, 256, 128) 73 | IMPLEMENTATION(CellLBM, 32, 256, 136, 32, 256, 136) 74 | IMPLEMENTATION(CellLBM, 32, 256, 192, 32, 256, 192) 75 | IMPLEMENTATION(CellLBM, 32, 256, 200, 32, 256, 200) 76 | IMPLEMENTATION(CellLBM, 32, 256, 256, 32, 256, 256) 77 | IMPLEMENTATION(CellLBM, 32, 256, 264, 32, 256, 264) 78 | IMPLEMENTATION(CellLBM, 32, 256, 512, 32, 256, 512) 79 | IMPLEMENTATION(CellLBM, 32, 256, 520, 32, 256, 520) 80 | IMPLEMENTATION(CellLBM, 32, 256, 1032, 32, 256, 1032) 81 | IMPLEMENTATION(CellLBM, 32, 264, 32, 32, 264, 32) 82 | IMPLEMENTATION(CellLBM, 32, 264, 64, 32, 264, 64) 83 | IMPLEMENTATION(CellLBM, 32, 264, 128, 32, 264, 128) 84 | IMPLEMENTATION(CellLBM, 32, 264, 136, 32, 264, 136) 85 | IMPLEMENTATION(CellLBM, 32, 264, 192, 32, 264, 192) 86 | IMPLEMENTATION(CellLBM, 32, 264, 200, 32, 264, 200) 87 | IMPLEMENTATION(CellLBM, 32, 264, 256, 32, 264, 256) 88 | IMPLEMENTATION(CellLBM, 32, 264, 264, 32, 264, 264) 89 | IMPLEMENTATION(CellLBM, 32, 264, 512, 32, 264, 512) 90 | IMPLEMENTATION(CellLBM, 32, 264, 520, 32, 264, 520) 91 | IMPLEMENTATION(CellLBM, 32, 264, 1032, 32, 264, 1032) 92 | IMPLEMENTATION(CellLBM, 32, 512, 32, 32, 512, 32) 93 | IMPLEMENTATION(CellLBM, 32, 512, 64, 32, 512, 64) 94 | IMPLEMENTATION(CellLBM, 32, 512, 128, 32, 512, 128) 95 | IMPLEMENTATION(CellLBM, 32, 512, 136, 32, 512, 136) 96 | IMPLEMENTATION(CellLBM, 32, 512, 192, 32, 512, 192) 97 | IMPLEMENTATION(CellLBM, 32, 512, 200, 32, 512, 200) 98 | IMPLEMENTATION(CellLBM, 32, 512, 256, 32, 512, 256) 99 | IMPLEMENTATION(CellLBM, 32, 512, 264, 32, 512, 264) 100 | IMPLEMENTATION(CellLBM, 32, 512, 512, 32, 512, 512) 101 | IMPLEMENTATION(CellLBM, 32, 512, 520, 32, 512, 520) 102 | IMPLEMENTATION(CellLBM, 32, 512, 1032, 32, 512, 1032) 103 | IMPLEMENTATION(CellLBM, 32, 520, 32, 32, 520, 32) 104 | IMPLEMENTATION(CellLBM, 32, 520, 64, 32, 520, 64) 105 | IMPLEMENTATION(CellLBM, 32, 520, 128, 32, 520, 128) 106 | IMPLEMENTATION(CellLBM, 32, 520, 136, 32, 520, 136) 107 | IMPLEMENTATION(CellLBM, 32, 520, 192, 32, 520, 192) 108 | IMPLEMENTATION(CellLBM, 32, 520, 200, 32, 520, 200) 109 | IMPLEMENTATION(CellLBM, 32, 520, 256, 32, 520, 256) 110 | IMPLEMENTATION(CellLBM, 32, 520, 264, 32, 520, 264) 111 | IMPLEMENTATION(CellLBM, 32, 520, 512, 32, 520, 512) 112 | IMPLEMENTATION(CellLBM, 32, 520, 520, 32, 520, 520) 113 | IMPLEMENTATION(CellLBM, 32, 520, 1032, 32, 520, 1032) 114 | IMPLEMENTATION(CellLBM, 32, 1032, 32, 32, 1032, 32) 115 | IMPLEMENTATION(CellLBM, 32, 1032, 64, 32, 1032, 64) 116 | IMPLEMENTATION(CellLBM, 32, 1032, 128, 32, 1032, 128) 117 | IMPLEMENTATION(CellLBM, 32, 1032, 136, 32, 1032, 136) 118 | IMPLEMENTATION(CellLBM, 32, 1032, 192, 32, 1032, 192) 119 | IMPLEMENTATION(CellLBM, 32, 1032, 200, 32, 1032, 200) 120 | IMPLEMENTATION(CellLBM, 32, 1032, 256, 32, 1032, 256) 121 | IMPLEMENTATION(CellLBM, 32, 1032, 264, 32, 1032, 264) 122 | IMPLEMENTATION(CellLBM, 32, 1032, 512, 32, 1032, 512) 123 | IMPLEMENTATION(CellLBM, 32, 1032, 520, 32, 1032, 520) 124 | IMPLEMENTATION(CellLBM, 32, 1032, 1032, 32, 1032, 1032) 125 | -------------------------------------------------------------------------------- /examples/lbm/flatarray_implementation_1.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cudalineupdatefunctorprototype.h" 3 | 4 | IMPLEMENTATION(CellLBM, 64, 32, 32, 64, 32, 
32) 5 | IMPLEMENTATION(CellLBM, 64, 32, 64, 64, 32, 64) 6 | IMPLEMENTATION(CellLBM, 64, 32, 128, 64, 32, 128) 7 | IMPLEMENTATION(CellLBM, 64, 32, 136, 64, 32, 136) 8 | IMPLEMENTATION(CellLBM, 64, 32, 192, 64, 32, 192) 9 | IMPLEMENTATION(CellLBM, 64, 32, 200, 64, 32, 200) 10 | IMPLEMENTATION(CellLBM, 64, 32, 256, 64, 32, 256) 11 | IMPLEMENTATION(CellLBM, 64, 32, 264, 64, 32, 264) 12 | IMPLEMENTATION(CellLBM, 64, 32, 512, 64, 32, 512) 13 | IMPLEMENTATION(CellLBM, 64, 32, 520, 64, 32, 520) 14 | IMPLEMENTATION(CellLBM, 64, 32, 1032, 64, 32, 1032) 15 | IMPLEMENTATION(CellLBM, 64, 64, 32, 64, 64, 32) 16 | IMPLEMENTATION(CellLBM, 64, 64, 64, 64, 64, 64) 17 | IMPLEMENTATION(CellLBM, 64, 64, 128, 64, 64, 128) 18 | IMPLEMENTATION(CellLBM, 64, 64, 136, 64, 64, 136) 19 | IMPLEMENTATION(CellLBM, 64, 64, 192, 64, 64, 192) 20 | IMPLEMENTATION(CellLBM, 64, 64, 200, 64, 64, 200) 21 | IMPLEMENTATION(CellLBM, 64, 64, 256, 64, 64, 256) 22 | IMPLEMENTATION(CellLBM, 64, 64, 264, 64, 64, 264) 23 | IMPLEMENTATION(CellLBM, 64, 64, 512, 64, 64, 512) 24 | IMPLEMENTATION(CellLBM, 64, 64, 520, 64, 64, 520) 25 | IMPLEMENTATION(CellLBM, 64, 64, 1032, 64, 64, 1032) 26 | IMPLEMENTATION(CellLBM, 64, 128, 32, 64, 128, 32) 27 | IMPLEMENTATION(CellLBM, 64, 128, 64, 64, 128, 64) 28 | IMPLEMENTATION(CellLBM, 64, 128, 128, 64, 128, 128) 29 | IMPLEMENTATION(CellLBM, 64, 128, 136, 64, 128, 136) 30 | IMPLEMENTATION(CellLBM, 64, 128, 192, 64, 128, 192) 31 | IMPLEMENTATION(CellLBM, 64, 128, 200, 64, 128, 200) 32 | IMPLEMENTATION(CellLBM, 64, 128, 256, 64, 128, 256) 33 | IMPLEMENTATION(CellLBM, 64, 128, 264, 64, 128, 264) 34 | IMPLEMENTATION(CellLBM, 64, 128, 512, 64, 128, 512) 35 | IMPLEMENTATION(CellLBM, 64, 128, 520, 64, 128, 520) 36 | IMPLEMENTATION(CellLBM, 64, 128, 1032, 64, 128, 1032) 37 | IMPLEMENTATION(CellLBM, 64, 136, 32, 64, 136, 32) 38 | IMPLEMENTATION(CellLBM, 64, 136, 64, 64, 136, 64) 39 | IMPLEMENTATION(CellLBM, 64, 136, 128, 64, 136, 128) 40 | IMPLEMENTATION(CellLBM, 64, 136, 136, 64, 136, 136) 41 | IMPLEMENTATION(CellLBM, 64, 136, 192, 64, 136, 192) 42 | IMPLEMENTATION(CellLBM, 64, 136, 200, 64, 136, 200) 43 | IMPLEMENTATION(CellLBM, 64, 136, 256, 64, 136, 256) 44 | IMPLEMENTATION(CellLBM, 64, 136, 264, 64, 136, 264) 45 | IMPLEMENTATION(CellLBM, 64, 136, 512, 64, 136, 512) 46 | IMPLEMENTATION(CellLBM, 64, 136, 520, 64, 136, 520) 47 | IMPLEMENTATION(CellLBM, 64, 136, 1032, 64, 136, 1032) 48 | IMPLEMENTATION(CellLBM, 64, 192, 32, 64, 192, 32) 49 | IMPLEMENTATION(CellLBM, 64, 192, 64, 64, 192, 64) 50 | IMPLEMENTATION(CellLBM, 64, 192, 128, 64, 192, 128) 51 | IMPLEMENTATION(CellLBM, 64, 192, 136, 64, 192, 136) 52 | IMPLEMENTATION(CellLBM, 64, 192, 192, 64, 192, 192) 53 | IMPLEMENTATION(CellLBM, 64, 192, 200, 64, 192, 200) 54 | IMPLEMENTATION(CellLBM, 64, 192, 256, 64, 192, 256) 55 | IMPLEMENTATION(CellLBM, 64, 192, 264, 64, 192, 264) 56 | IMPLEMENTATION(CellLBM, 64, 192, 512, 64, 192, 512) 57 | IMPLEMENTATION(CellLBM, 64, 192, 520, 64, 192, 520) 58 | IMPLEMENTATION(CellLBM, 64, 192, 1032, 64, 192, 1032) 59 | IMPLEMENTATION(CellLBM, 64, 200, 32, 64, 200, 32) 60 | IMPLEMENTATION(CellLBM, 64, 200, 64, 64, 200, 64) 61 | IMPLEMENTATION(CellLBM, 64, 200, 128, 64, 200, 128) 62 | IMPLEMENTATION(CellLBM, 64, 200, 136, 64, 200, 136) 63 | IMPLEMENTATION(CellLBM, 64, 200, 192, 64, 200, 192) 64 | IMPLEMENTATION(CellLBM, 64, 200, 200, 64, 200, 200) 65 | IMPLEMENTATION(CellLBM, 64, 200, 256, 64, 200, 256) 66 | IMPLEMENTATION(CellLBM, 64, 200, 264, 64, 200, 264) 67 | IMPLEMENTATION(CellLBM, 64, 200, 512, 64, 200, 512) 68 | 
IMPLEMENTATION(CellLBM, 64, 200, 520, 64, 200, 520) 69 | IMPLEMENTATION(CellLBM, 64, 200, 1032, 64, 200, 1032) 70 | IMPLEMENTATION(CellLBM, 64, 256, 32, 64, 256, 32) 71 | IMPLEMENTATION(CellLBM, 64, 256, 64, 64, 256, 64) 72 | IMPLEMENTATION(CellLBM, 64, 256, 128, 64, 256, 128) 73 | IMPLEMENTATION(CellLBM, 64, 256, 136, 64, 256, 136) 74 | IMPLEMENTATION(CellLBM, 64, 256, 192, 64, 256, 192) 75 | IMPLEMENTATION(CellLBM, 64, 256, 200, 64, 256, 200) 76 | IMPLEMENTATION(CellLBM, 64, 256, 256, 64, 256, 256) 77 | IMPLEMENTATION(CellLBM, 64, 256, 264, 64, 256, 264) 78 | IMPLEMENTATION(CellLBM, 64, 256, 512, 64, 256, 512) 79 | IMPLEMENTATION(CellLBM, 64, 256, 520, 64, 256, 520) 80 | IMPLEMENTATION(CellLBM, 64, 256, 1032, 64, 256, 1032) 81 | IMPLEMENTATION(CellLBM, 64, 264, 32, 64, 264, 32) 82 | IMPLEMENTATION(CellLBM, 64, 264, 64, 64, 264, 64) 83 | IMPLEMENTATION(CellLBM, 64, 264, 128, 64, 264, 128) 84 | IMPLEMENTATION(CellLBM, 64, 264, 136, 64, 264, 136) 85 | IMPLEMENTATION(CellLBM, 64, 264, 192, 64, 264, 192) 86 | IMPLEMENTATION(CellLBM, 64, 264, 200, 64, 264, 200) 87 | IMPLEMENTATION(CellLBM, 64, 264, 256, 64, 264, 256) 88 | IMPLEMENTATION(CellLBM, 64, 264, 264, 64, 264, 264) 89 | IMPLEMENTATION(CellLBM, 64, 264, 512, 64, 264, 512) 90 | IMPLEMENTATION(CellLBM, 64, 264, 520, 64, 264, 520) 91 | IMPLEMENTATION(CellLBM, 64, 264, 1032, 64, 264, 1032) 92 | IMPLEMENTATION(CellLBM, 64, 512, 32, 64, 512, 32) 93 | IMPLEMENTATION(CellLBM, 64, 512, 64, 64, 512, 64) 94 | IMPLEMENTATION(CellLBM, 64, 512, 128, 64, 512, 128) 95 | IMPLEMENTATION(CellLBM, 64, 512, 136, 64, 512, 136) 96 | IMPLEMENTATION(CellLBM, 64, 512, 192, 64, 512, 192) 97 | IMPLEMENTATION(CellLBM, 64, 512, 200, 64, 512, 200) 98 | IMPLEMENTATION(CellLBM, 64, 512, 256, 64, 512, 256) 99 | IMPLEMENTATION(CellLBM, 64, 512, 264, 64, 512, 264) 100 | IMPLEMENTATION(CellLBM, 64, 512, 512, 64, 512, 512) 101 | IMPLEMENTATION(CellLBM, 64, 512, 520, 64, 512, 520) 102 | IMPLEMENTATION(CellLBM, 64, 512, 1032, 64, 512, 1032) 103 | IMPLEMENTATION(CellLBM, 64, 520, 32, 64, 520, 32) 104 | IMPLEMENTATION(CellLBM, 64, 520, 64, 64, 520, 64) 105 | IMPLEMENTATION(CellLBM, 64, 520, 128, 64, 520, 128) 106 | IMPLEMENTATION(CellLBM, 64, 520, 136, 64, 520, 136) 107 | IMPLEMENTATION(CellLBM, 64, 520, 192, 64, 520, 192) 108 | IMPLEMENTATION(CellLBM, 64, 520, 200, 64, 520, 200) 109 | IMPLEMENTATION(CellLBM, 64, 520, 256, 64, 520, 256) 110 | IMPLEMENTATION(CellLBM, 64, 520, 264, 64, 520, 264) 111 | IMPLEMENTATION(CellLBM, 64, 520, 512, 64, 520, 512) 112 | IMPLEMENTATION(CellLBM, 64, 520, 520, 64, 520, 520) 113 | IMPLEMENTATION(CellLBM, 64, 520, 1032, 64, 520, 1032) 114 | IMPLEMENTATION(CellLBM, 64, 1032, 32, 64, 1032, 32) 115 | IMPLEMENTATION(CellLBM, 64, 1032, 64, 64, 1032, 64) 116 | IMPLEMENTATION(CellLBM, 64, 1032, 128, 64, 1032, 128) 117 | IMPLEMENTATION(CellLBM, 64, 1032, 136, 64, 1032, 136) 118 | IMPLEMENTATION(CellLBM, 64, 1032, 192, 64, 1032, 192) 119 | IMPLEMENTATION(CellLBM, 64, 1032, 200, 64, 1032, 200) 120 | IMPLEMENTATION(CellLBM, 64, 1032, 256, 64, 1032, 256) 121 | IMPLEMENTATION(CellLBM, 64, 1032, 264, 64, 1032, 264) 122 | IMPLEMENTATION(CellLBM, 64, 1032, 512, 64, 1032, 512) 123 | IMPLEMENTATION(CellLBM, 64, 1032, 520, 64, 1032, 520) 124 | IMPLEMENTATION(CellLBM, 64, 1032, 1032, 64, 1032, 1032) 125 | -------------------------------------------------------------------------------- /examples/lbm/flatarray_implementation_2.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 
"cudalineupdatefunctorprototype.h" 3 | 4 | IMPLEMENTATION(CellLBM, 128, 32, 32, 128, 32, 32) 5 | IMPLEMENTATION(CellLBM, 128, 32, 64, 128, 32, 64) 6 | IMPLEMENTATION(CellLBM, 128, 32, 128, 128, 32, 128) 7 | IMPLEMENTATION(CellLBM, 128, 32, 136, 128, 32, 136) 8 | IMPLEMENTATION(CellLBM, 128, 32, 192, 128, 32, 192) 9 | IMPLEMENTATION(CellLBM, 128, 32, 200, 128, 32, 200) 10 | IMPLEMENTATION(CellLBM, 128, 32, 256, 128, 32, 256) 11 | IMPLEMENTATION(CellLBM, 128, 32, 264, 128, 32, 264) 12 | IMPLEMENTATION(CellLBM, 128, 32, 512, 128, 32, 512) 13 | IMPLEMENTATION(CellLBM, 128, 32, 520, 128, 32, 520) 14 | IMPLEMENTATION(CellLBM, 128, 32, 1032, 128, 32, 1032) 15 | IMPLEMENTATION(CellLBM, 128, 64, 32, 128, 64, 32) 16 | IMPLEMENTATION(CellLBM, 128, 64, 64, 128, 64, 64) 17 | IMPLEMENTATION(CellLBM, 128, 64, 128, 128, 64, 128) 18 | IMPLEMENTATION(CellLBM, 128, 64, 136, 128, 64, 136) 19 | IMPLEMENTATION(CellLBM, 128, 64, 192, 128, 64, 192) 20 | IMPLEMENTATION(CellLBM, 128, 64, 200, 128, 64, 200) 21 | IMPLEMENTATION(CellLBM, 128, 64, 256, 128, 64, 256) 22 | IMPLEMENTATION(CellLBM, 128, 64, 264, 128, 64, 264) 23 | IMPLEMENTATION(CellLBM, 128, 64, 512, 128, 64, 512) 24 | IMPLEMENTATION(CellLBM, 128, 64, 520, 128, 64, 520) 25 | IMPLEMENTATION(CellLBM, 128, 64, 1032, 128, 64, 1032) 26 | IMPLEMENTATION(CellLBM, 128, 128, 32, 128, 128, 32) 27 | IMPLEMENTATION(CellLBM, 128, 128, 64, 128, 128, 64) 28 | IMPLEMENTATION(CellLBM, 128, 128, 128, 128, 128, 128) 29 | IMPLEMENTATION(CellLBM, 128, 128, 136, 128, 128, 136) 30 | IMPLEMENTATION(CellLBM, 128, 128, 192, 128, 128, 192) 31 | IMPLEMENTATION(CellLBM, 128, 128, 200, 128, 128, 200) 32 | IMPLEMENTATION(CellLBM, 128, 128, 256, 128, 128, 256) 33 | IMPLEMENTATION(CellLBM, 128, 128, 264, 128, 128, 264) 34 | IMPLEMENTATION(CellLBM, 128, 128, 512, 128, 128, 512) 35 | IMPLEMENTATION(CellLBM, 128, 128, 520, 128, 128, 520) 36 | IMPLEMENTATION(CellLBM, 128, 128, 1032, 128, 128, 1032) 37 | IMPLEMENTATION(CellLBM, 128, 136, 32, 128, 136, 32) 38 | IMPLEMENTATION(CellLBM, 128, 136, 64, 128, 136, 64) 39 | IMPLEMENTATION(CellLBM, 128, 136, 128, 128, 136, 128) 40 | IMPLEMENTATION(CellLBM, 128, 136, 136, 128, 136, 136) 41 | IMPLEMENTATION(CellLBM, 128, 136, 192, 128, 136, 192) 42 | IMPLEMENTATION(CellLBM, 128, 136, 200, 128, 136, 200) 43 | IMPLEMENTATION(CellLBM, 128, 136, 256, 128, 136, 256) 44 | IMPLEMENTATION(CellLBM, 128, 136, 264, 128, 136, 264) 45 | IMPLEMENTATION(CellLBM, 128, 136, 512, 128, 136, 512) 46 | IMPLEMENTATION(CellLBM, 128, 136, 520, 128, 136, 520) 47 | IMPLEMENTATION(CellLBM, 128, 136, 1032, 128, 136, 1032) 48 | IMPLEMENTATION(CellLBM, 128, 192, 32, 128, 192, 32) 49 | IMPLEMENTATION(CellLBM, 128, 192, 64, 128, 192, 64) 50 | IMPLEMENTATION(CellLBM, 128, 192, 128, 128, 192, 128) 51 | IMPLEMENTATION(CellLBM, 128, 192, 136, 128, 192, 136) 52 | IMPLEMENTATION(CellLBM, 128, 192, 192, 128, 192, 192) 53 | IMPLEMENTATION(CellLBM, 128, 192, 200, 128, 192, 200) 54 | IMPLEMENTATION(CellLBM, 128, 192, 256, 128, 192, 256) 55 | IMPLEMENTATION(CellLBM, 128, 192, 264, 128, 192, 264) 56 | IMPLEMENTATION(CellLBM, 128, 192, 512, 128, 192, 512) 57 | IMPLEMENTATION(CellLBM, 128, 192, 520, 128, 192, 520) 58 | IMPLEMENTATION(CellLBM, 128, 192, 1032, 128, 192, 1032) 59 | IMPLEMENTATION(CellLBM, 128, 200, 32, 128, 200, 32) 60 | IMPLEMENTATION(CellLBM, 128, 200, 64, 128, 200, 64) 61 | IMPLEMENTATION(CellLBM, 128, 200, 128, 128, 200, 128) 62 | IMPLEMENTATION(CellLBM, 128, 200, 136, 128, 200, 136) 63 | IMPLEMENTATION(CellLBM, 128, 200, 192, 128, 200, 192) 64 | IMPLEMENTATION(CellLBM, 128, 200, 
200, 128, 200, 200) 65 | IMPLEMENTATION(CellLBM, 128, 200, 256, 128, 200, 256) 66 | IMPLEMENTATION(CellLBM, 128, 200, 264, 128, 200, 264) 67 | IMPLEMENTATION(CellLBM, 128, 200, 512, 128, 200, 512) 68 | IMPLEMENTATION(CellLBM, 128, 200, 520, 128, 200, 520) 69 | IMPLEMENTATION(CellLBM, 128, 200, 1032, 128, 200, 1032) 70 | IMPLEMENTATION(CellLBM, 128, 256, 32, 128, 256, 32) 71 | IMPLEMENTATION(CellLBM, 128, 256, 64, 128, 256, 64) 72 | IMPLEMENTATION(CellLBM, 128, 256, 128, 128, 256, 128) 73 | IMPLEMENTATION(CellLBM, 128, 256, 136, 128, 256, 136) 74 | IMPLEMENTATION(CellLBM, 128, 256, 192, 128, 256, 192) 75 | IMPLEMENTATION(CellLBM, 128, 256, 200, 128, 256, 200) 76 | IMPLEMENTATION(CellLBM, 128, 256, 256, 128, 256, 256) 77 | IMPLEMENTATION(CellLBM, 128, 256, 264, 128, 256, 264) 78 | IMPLEMENTATION(CellLBM, 128, 256, 512, 128, 256, 512) 79 | IMPLEMENTATION(CellLBM, 128, 256, 520, 128, 256, 520) 80 | IMPLEMENTATION(CellLBM, 128, 256, 1032, 128, 256, 1032) 81 | IMPLEMENTATION(CellLBM, 128, 264, 32, 128, 264, 32) 82 | IMPLEMENTATION(CellLBM, 128, 264, 64, 128, 264, 64) 83 | IMPLEMENTATION(CellLBM, 128, 264, 128, 128, 264, 128) 84 | IMPLEMENTATION(CellLBM, 128, 264, 136, 128, 264, 136) 85 | IMPLEMENTATION(CellLBM, 128, 264, 192, 128, 264, 192) 86 | IMPLEMENTATION(CellLBM, 128, 264, 200, 128, 264, 200) 87 | IMPLEMENTATION(CellLBM, 128, 264, 256, 128, 264, 256) 88 | IMPLEMENTATION(CellLBM, 128, 264, 264, 128, 264, 264) 89 | IMPLEMENTATION(CellLBM, 128, 264, 512, 128, 264, 512) 90 | IMPLEMENTATION(CellLBM, 128, 264, 520, 128, 264, 520) 91 | IMPLEMENTATION(CellLBM, 128, 264, 1032, 128, 264, 1032) 92 | IMPLEMENTATION(CellLBM, 128, 512, 32, 128, 512, 32) 93 | IMPLEMENTATION(CellLBM, 128, 512, 64, 128, 512, 64) 94 | IMPLEMENTATION(CellLBM, 128, 512, 128, 128, 512, 128) 95 | IMPLEMENTATION(CellLBM, 128, 512, 136, 128, 512, 136) 96 | IMPLEMENTATION(CellLBM, 128, 512, 192, 128, 512, 192) 97 | IMPLEMENTATION(CellLBM, 128, 512, 200, 128, 512, 200) 98 | IMPLEMENTATION(CellLBM, 128, 512, 256, 128, 512, 256) 99 | IMPLEMENTATION(CellLBM, 128, 512, 264, 128, 512, 264) 100 | IMPLEMENTATION(CellLBM, 128, 512, 512, 128, 512, 512) 101 | IMPLEMENTATION(CellLBM, 128, 512, 520, 128, 512, 520) 102 | IMPLEMENTATION(CellLBM, 128, 512, 1032, 128, 512, 1032) 103 | IMPLEMENTATION(CellLBM, 128, 520, 32, 128, 520, 32) 104 | IMPLEMENTATION(CellLBM, 128, 520, 64, 128, 520, 64) 105 | IMPLEMENTATION(CellLBM, 128, 520, 128, 128, 520, 128) 106 | IMPLEMENTATION(CellLBM, 128, 520, 136, 128, 520, 136) 107 | IMPLEMENTATION(CellLBM, 128, 520, 192, 128, 520, 192) 108 | IMPLEMENTATION(CellLBM, 128, 520, 200, 128, 520, 200) 109 | IMPLEMENTATION(CellLBM, 128, 520, 256, 128, 520, 256) 110 | IMPLEMENTATION(CellLBM, 128, 520, 264, 128, 520, 264) 111 | IMPLEMENTATION(CellLBM, 128, 520, 512, 128, 520, 512) 112 | IMPLEMENTATION(CellLBM, 128, 520, 520, 128, 520, 520) 113 | IMPLEMENTATION(CellLBM, 128, 520, 1032, 128, 520, 1032) 114 | IMPLEMENTATION(CellLBM, 128, 1032, 32, 128, 1032, 32) 115 | IMPLEMENTATION(CellLBM, 128, 1032, 64, 128, 1032, 64) 116 | IMPLEMENTATION(CellLBM, 128, 1032, 128, 128, 1032, 128) 117 | IMPLEMENTATION(CellLBM, 128, 1032, 136, 128, 1032, 136) 118 | IMPLEMENTATION(CellLBM, 128, 1032, 192, 128, 1032, 192) 119 | IMPLEMENTATION(CellLBM, 128, 1032, 200, 128, 1032, 200) 120 | IMPLEMENTATION(CellLBM, 128, 1032, 256, 128, 1032, 256) 121 | IMPLEMENTATION(CellLBM, 128, 1032, 264, 128, 1032, 264) 122 | IMPLEMENTATION(CellLBM, 128, 1032, 512, 128, 1032, 512) 123 | IMPLEMENTATION(CellLBM, 128, 1032, 520, 128, 1032, 520) 124 | 
IMPLEMENTATION(CellLBM, 128, 1032, 1032, 128, 1032, 1032) 125 | -------------------------------------------------------------------------------- /examples/lbm/flatarray_implementation_3.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cudalineupdatefunctorprototype.h" 3 | 4 | IMPLEMENTATION(CellLBM, 136, 32, 32, 136, 32, 32) 5 | IMPLEMENTATION(CellLBM, 136, 32, 64, 136, 32, 64) 6 | IMPLEMENTATION(CellLBM, 136, 32, 128, 136, 32, 128) 7 | IMPLEMENTATION(CellLBM, 136, 32, 136, 136, 32, 136) 8 | IMPLEMENTATION(CellLBM, 136, 32, 192, 136, 32, 192) 9 | IMPLEMENTATION(CellLBM, 136, 32, 200, 136, 32, 200) 10 | IMPLEMENTATION(CellLBM, 136, 32, 256, 136, 32, 256) 11 | IMPLEMENTATION(CellLBM, 136, 32, 264, 136, 32, 264) 12 | IMPLEMENTATION(CellLBM, 136, 32, 512, 136, 32, 512) 13 | IMPLEMENTATION(CellLBM, 136, 32, 520, 136, 32, 520) 14 | IMPLEMENTATION(CellLBM, 136, 32, 1032, 136, 32, 1032) 15 | IMPLEMENTATION(CellLBM, 136, 64, 32, 136, 64, 32) 16 | IMPLEMENTATION(CellLBM, 136, 64, 64, 136, 64, 64) 17 | IMPLEMENTATION(CellLBM, 136, 64, 128, 136, 64, 128) 18 | IMPLEMENTATION(CellLBM, 136, 64, 136, 136, 64, 136) 19 | IMPLEMENTATION(CellLBM, 136, 64, 192, 136, 64, 192) 20 | IMPLEMENTATION(CellLBM, 136, 64, 200, 136, 64, 200) 21 | IMPLEMENTATION(CellLBM, 136, 64, 256, 136, 64, 256) 22 | IMPLEMENTATION(CellLBM, 136, 64, 264, 136, 64, 264) 23 | IMPLEMENTATION(CellLBM, 136, 64, 512, 136, 64, 512) 24 | IMPLEMENTATION(CellLBM, 136, 64, 520, 136, 64, 520) 25 | IMPLEMENTATION(CellLBM, 136, 64, 1032, 136, 64, 1032) 26 | IMPLEMENTATION(CellLBM, 136, 128, 32, 136, 128, 32) 27 | IMPLEMENTATION(CellLBM, 136, 128, 64, 136, 128, 64) 28 | IMPLEMENTATION(CellLBM, 136, 128, 128, 136, 128, 128) 29 | IMPLEMENTATION(CellLBM, 136, 128, 136, 136, 128, 136) 30 | IMPLEMENTATION(CellLBM, 136, 128, 192, 136, 128, 192) 31 | IMPLEMENTATION(CellLBM, 136, 128, 200, 136, 128, 200) 32 | IMPLEMENTATION(CellLBM, 136, 128, 256, 136, 128, 256) 33 | IMPLEMENTATION(CellLBM, 136, 128, 264, 136, 128, 264) 34 | IMPLEMENTATION(CellLBM, 136, 128, 512, 136, 128, 512) 35 | IMPLEMENTATION(CellLBM, 136, 128, 520, 136, 128, 520) 36 | IMPLEMENTATION(CellLBM, 136, 128, 1032, 136, 128, 1032) 37 | IMPLEMENTATION(CellLBM, 136, 136, 32, 136, 136, 32) 38 | IMPLEMENTATION(CellLBM, 136, 136, 64, 136, 136, 64) 39 | IMPLEMENTATION(CellLBM, 136, 136, 128, 136, 136, 128) 40 | IMPLEMENTATION(CellLBM, 136, 136, 136, 136, 136, 136) 41 | IMPLEMENTATION(CellLBM, 136, 136, 192, 136, 136, 192) 42 | IMPLEMENTATION(CellLBM, 136, 136, 200, 136, 136, 200) 43 | IMPLEMENTATION(CellLBM, 136, 136, 256, 136, 136, 256) 44 | IMPLEMENTATION(CellLBM, 136, 136, 264, 136, 136, 264) 45 | IMPLEMENTATION(CellLBM, 136, 136, 512, 136, 136, 512) 46 | IMPLEMENTATION(CellLBM, 136, 136, 520, 136, 136, 520) 47 | IMPLEMENTATION(CellLBM, 136, 136, 1032, 136, 136, 1032) 48 | IMPLEMENTATION(CellLBM, 136, 192, 32, 136, 192, 32) 49 | IMPLEMENTATION(CellLBM, 136, 192, 64, 136, 192, 64) 50 | IMPLEMENTATION(CellLBM, 136, 192, 128, 136, 192, 128) 51 | IMPLEMENTATION(CellLBM, 136, 192, 136, 136, 192, 136) 52 | IMPLEMENTATION(CellLBM, 136, 192, 192, 136, 192, 192) 53 | IMPLEMENTATION(CellLBM, 136, 192, 200, 136, 192, 200) 54 | IMPLEMENTATION(CellLBM, 136, 192, 256, 136, 192, 256) 55 | IMPLEMENTATION(CellLBM, 136, 192, 264, 136, 192, 264) 56 | IMPLEMENTATION(CellLBM, 136, 192, 512, 136, 192, 512) 57 | IMPLEMENTATION(CellLBM, 136, 192, 520, 136, 192, 520) 58 | IMPLEMENTATION(CellLBM, 136, 192, 1032, 136, 192, 1032) 59 | IMPLEMENTATION(CellLBM, 136, 
200, 32, 136, 200, 32) 60 | IMPLEMENTATION(CellLBM, 136, 200, 64, 136, 200, 64) 61 | IMPLEMENTATION(CellLBM, 136, 200, 128, 136, 200, 128) 62 | IMPLEMENTATION(CellLBM, 136, 200, 136, 136, 200, 136) 63 | IMPLEMENTATION(CellLBM, 136, 200, 192, 136, 200, 192) 64 | IMPLEMENTATION(CellLBM, 136, 200, 200, 136, 200, 200) 65 | IMPLEMENTATION(CellLBM, 136, 200, 256, 136, 200, 256) 66 | IMPLEMENTATION(CellLBM, 136, 200, 264, 136, 200, 264) 67 | IMPLEMENTATION(CellLBM, 136, 200, 512, 136, 200, 512) 68 | IMPLEMENTATION(CellLBM, 136, 200, 520, 136, 200, 520) 69 | IMPLEMENTATION(CellLBM, 136, 200, 1032, 136, 200, 1032) 70 | IMPLEMENTATION(CellLBM, 136, 256, 32, 136, 256, 32) 71 | IMPLEMENTATION(CellLBM, 136, 256, 64, 136, 256, 64) 72 | IMPLEMENTATION(CellLBM, 136, 256, 128, 136, 256, 128) 73 | IMPLEMENTATION(CellLBM, 136, 256, 136, 136, 256, 136) 74 | IMPLEMENTATION(CellLBM, 136, 256, 192, 136, 256, 192) 75 | IMPLEMENTATION(CellLBM, 136, 256, 200, 136, 256, 200) 76 | IMPLEMENTATION(CellLBM, 136, 256, 256, 136, 256, 256) 77 | IMPLEMENTATION(CellLBM, 136, 256, 264, 136, 256, 264) 78 | IMPLEMENTATION(CellLBM, 136, 256, 512, 136, 256, 512) 79 | IMPLEMENTATION(CellLBM, 136, 256, 520, 136, 256, 520) 80 | IMPLEMENTATION(CellLBM, 136, 256, 1032, 136, 256, 1032) 81 | IMPLEMENTATION(CellLBM, 136, 264, 32, 136, 264, 32) 82 | IMPLEMENTATION(CellLBM, 136, 264, 64, 136, 264, 64) 83 | IMPLEMENTATION(CellLBM, 136, 264, 128, 136, 264, 128) 84 | IMPLEMENTATION(CellLBM, 136, 264, 136, 136, 264, 136) 85 | IMPLEMENTATION(CellLBM, 136, 264, 192, 136, 264, 192) 86 | IMPLEMENTATION(CellLBM, 136, 264, 200, 136, 264, 200) 87 | IMPLEMENTATION(CellLBM, 136, 264, 256, 136, 264, 256) 88 | IMPLEMENTATION(CellLBM, 136, 264, 264, 136, 264, 264) 89 | IMPLEMENTATION(CellLBM, 136, 264, 512, 136, 264, 512) 90 | IMPLEMENTATION(CellLBM, 136, 264, 520, 136, 264, 520) 91 | IMPLEMENTATION(CellLBM, 136, 264, 1032, 136, 264, 1032) 92 | IMPLEMENTATION(CellLBM, 136, 512, 32, 136, 512, 32) 93 | IMPLEMENTATION(CellLBM, 136, 512, 64, 136, 512, 64) 94 | IMPLEMENTATION(CellLBM, 136, 512, 128, 136, 512, 128) 95 | IMPLEMENTATION(CellLBM, 136, 512, 136, 136, 512, 136) 96 | IMPLEMENTATION(CellLBM, 136, 512, 192, 136, 512, 192) 97 | IMPLEMENTATION(CellLBM, 136, 512, 200, 136, 512, 200) 98 | IMPLEMENTATION(CellLBM, 136, 512, 256, 136, 512, 256) 99 | IMPLEMENTATION(CellLBM, 136, 512, 264, 136, 512, 264) 100 | IMPLEMENTATION(CellLBM, 136, 512, 512, 136, 512, 512) 101 | IMPLEMENTATION(CellLBM, 136, 512, 520, 136, 512, 520) 102 | IMPLEMENTATION(CellLBM, 136, 512, 1032, 136, 512, 1032) 103 | IMPLEMENTATION(CellLBM, 136, 520, 32, 136, 520, 32) 104 | IMPLEMENTATION(CellLBM, 136, 520, 64, 136, 520, 64) 105 | IMPLEMENTATION(CellLBM, 136, 520, 128, 136, 520, 128) 106 | IMPLEMENTATION(CellLBM, 136, 520, 136, 136, 520, 136) 107 | IMPLEMENTATION(CellLBM, 136, 520, 192, 136, 520, 192) 108 | IMPLEMENTATION(CellLBM, 136, 520, 200, 136, 520, 200) 109 | IMPLEMENTATION(CellLBM, 136, 520, 256, 136, 520, 256) 110 | IMPLEMENTATION(CellLBM, 136, 520, 264, 136, 520, 264) 111 | IMPLEMENTATION(CellLBM, 136, 520, 512, 136, 520, 512) 112 | IMPLEMENTATION(CellLBM, 136, 520, 520, 136, 520, 520) 113 | IMPLEMENTATION(CellLBM, 136, 520, 1032, 136, 520, 1032) 114 | IMPLEMENTATION(CellLBM, 136, 1032, 32, 136, 1032, 32) 115 | IMPLEMENTATION(CellLBM, 136, 1032, 64, 136, 1032, 64) 116 | IMPLEMENTATION(CellLBM, 136, 1032, 128, 136, 1032, 128) 117 | IMPLEMENTATION(CellLBM, 136, 1032, 136, 136, 1032, 136) 118 | IMPLEMENTATION(CellLBM, 136, 1032, 192, 136, 1032, 192) 119 | IMPLEMENTATION(CellLBM, 
136, 1032, 200, 136, 1032, 200) 120 | IMPLEMENTATION(CellLBM, 136, 1032, 256, 136, 1032, 256) 121 | IMPLEMENTATION(CellLBM, 136, 1032, 264, 136, 1032, 264) 122 | IMPLEMENTATION(CellLBM, 136, 1032, 512, 136, 1032, 512) 123 | IMPLEMENTATION(CellLBM, 136, 1032, 520, 136, 1032, 520) 124 | IMPLEMENTATION(CellLBM, 136, 1032, 1032, 136, 1032, 1032) 125 | --------------------------------------------------------------------------------