├── examples
│   ├── .gitignore
│   └── 01-class
│       ├── CMakeLists.txt
│       └── fib-class.cpp
├── benchmarks
│   ├── .gitignore
│   ├── cilk
│   │   ├── fib
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── dfs
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── blkmul
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── matmul
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   └── mergesort
│   │       ├── CMakeLists.txt
│   │       └── main.cpp
│   ├── openmp
│   │   ├── dfs
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── fib
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── blkmul
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── matmul
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   └── mergesort
│   │       ├── CMakeLists.txt
│   │       └── main.cpp
│   ├── sequential
│   │   ├── dfs
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── fib
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── blkmul
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── matmul
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   └── mergesort
│   │       ├── CMakeLists.txt
│   │       └── main.cpp
│   ├── staccato
│   │   ├── dfs
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── fib
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── blkmul
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── matmul
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   └── mergesort
│   │       ├── CMakeLists.txt
│   │       └── main.cpp
│   ├── tbb
│   │   ├── dfs
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── fib
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── blkmul
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   ├── matmul
│   │   │   ├── CMakeLists.txt
│   │   │   └── main.cpp
│   │   └── mergesort
│   │       ├── CMakeLists.txt
│   │       └── main.cpp
│   ├── plor.r
│   └── run.sh
├── docs
│   ├── llc-misses.png
│   ├── final.dat-dfs.png
│   ├── final.dat-fib.png
│   ├── final.dat-blkmul.png
│   └── final.dat-matmul.png
├── .gitignore
├── tests
│   ├── task_mock.hpp
│   ├── CMakeLists.txt.in
│   ├── lifo_allocator.cpp
│   ├── CMakeLists.txt
│   └── task_deque.cpp
├── LICENSE
├── CMakeLists.txt
├── include
│   ├── debug.hpp
│   ├── task.hpp
│   ├── counter.hpp
│   ├── task_deque.hpp
│   ├── lifo_allocator.hpp
│   ├── utils.hpp
│   ├── scheduler.hpp
│   └── worker.hpp
└── README.md
--------------------------------------------------------------------------------
/examples/.gitignore:
--------------------------------------------------------------------------------
build/
--------------------------------------------------------------------------------
/benchmarks/.gitignore:
--------------------------------------------------------------------------------
*.png
*.dat
--------------------------------------------------------------------------------
/docs/llc-misses.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rkuchumov/staccato/HEAD/docs/llc-misses.png
--------------------------------------------------------------------------------
/docs/final.dat-dfs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rkuchumov/staccato/HEAD/docs/final.dat-dfs.png
--------------------------------------------------------------------------------
/docs/final.dat-fib.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rkuchumov/staccato/HEAD/docs/final.dat-fib.png
--------------------------------------------------------------------------------
/docs/final.dat-blkmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rkuchumov/staccato/HEAD/docs/final.dat-blkmul.png
--------------------------------------------------------------------------------
/docs/final.dat-matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rkuchumov/staccato/HEAD/docs/final.dat-matmul.png -------------------------------------------------------------------------------- /examples/01-class/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | add_executable(fib-class fib-class.cpp) 4 | 5 | target_link_libraries(fib-class pthread) 6 | -------------------------------------------------------------------------------- /benchmarks/cilk/fib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcilkplus -lcilkrts -O3") 4 | 5 | add_executable(fib-cilk main.cpp) 6 | -------------------------------------------------------------------------------- /benchmarks/openmp/dfs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O3 -g") 4 | 5 | add_executable(dfs-omp main.cpp) 6 | 7 | -------------------------------------------------------------------------------- /benchmarks/openmp/fib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O3 -g") 4 | 5 | add_executable(fib-omp main.cpp) 6 | 7 | -------------------------------------------------------------------------------- /benchmarks/openmp/blkmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O3 -g") 4 | 5 | add_executable(blkmul-omp main.cpp) 6 | 7 | -------------------------------------------------------------------------------- /benchmarks/openmp/matmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O3 -g") 4 | 5 | add_executable(matmul-omp main.cpp) 6 | 7 | -------------------------------------------------------------------------------- /benchmarks/cilk/dfs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcilkplus -lcilkrts -g -O3") 4 | 5 | add_executable(dfs-cilk main.cpp) 6 | 7 | -------------------------------------------------------------------------------- /benchmarks/openmp/mergesort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O3 -g") 4 | 5 | add_executable(mergesort-omp main.cpp) 6 | 7 | -------------------------------------------------------------------------------- /benchmarks/cilk/blkmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcilkplus -lcilkrts -g -O3") 4 | 5 | add_executable(blkmul-cilk main.cpp) 6 | 7 | -------------------------------------------------------------------------------- /benchmarks/cilk/matmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 
cmake_minimum_required(VERSION 2.8) 2 | 3 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcilkplus -lcilkrts -g -O3") 4 | 5 | add_executable(matmul-cilk main.cpp) 6 | 7 | -------------------------------------------------------------------------------- /benchmarks/cilk/mergesort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcilkplus -lcilkrts -g -O3") 4 | 5 | add_executable(mergesort-cilk main.cpp) 6 | -------------------------------------------------------------------------------- /benchmarks/sequential/dfs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target dfs-sequential) 4 | 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 6 | 7 | add_executable(${target} main.cpp) 8 | -------------------------------------------------------------------------------- /benchmarks/sequential/fib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target fib-sequential) 4 | 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 6 | 7 | add_executable(${target} main.cpp) 8 | -------------------------------------------------------------------------------- /benchmarks/sequential/blkmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target blkmul-sequential) 4 | 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 6 | 7 | add_executable(${target} main.cpp) 8 | -------------------------------------------------------------------------------- /benchmarks/sequential/matmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target matmul-sequential) 4 | 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 6 | 7 | add_executable(${target} main.cpp) 8 | -------------------------------------------------------------------------------- /benchmarks/sequential/mergesort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target mergesort-sequential) 4 | 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 6 | 7 | add_executable(${target} main.cpp) 8 | -------------------------------------------------------------------------------- /benchmarks/staccato/dfs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target dfs-staccato) 4 | 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 6 | 7 | add_executable(${target} main.cpp) 8 | 9 | find_path(STACCATO_INC staccato) 10 | 11 | target_link_libraries(${target} pthread) 12 | link_directories(${target} "${STACCATO_INC}") 13 | -------------------------------------------------------------------------------- /benchmarks/staccato/fib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target fib-staccato) 4 | 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 6 | 7 | add_executable(${target} main.cpp) 8 | 9 | find_path(STACCATO_INC staccato) 10 | 11 | target_link_libraries(${target} pthread) 12 | link_directories(${target} "${STACCATO_INC}") 13 | 
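Every benchmark main.cpp in this tree follows the same command-line convention: argv[1] is the worker-thread count (0 or omitted meaning std::thread::hardware_concurrency()), and argv[2] (plus argv[3] for dfs) is the problem size. A minimal sketch of that shared convention; the names bench_args and parse_args are illustrative, not part of the repository:

#include <cstdlib>
#include <thread>

struct bench_args {
	std::size_t nthreads;
	std::size_t n;
};

// argv[1] = worker count, argv[2] = problem size, as used by all benchmarks
bench_args parse_args(int argc, char *argv[], std::size_t default_n)
{
	bench_args a{0, default_n};
	if (argc >= 2)
		a.nthreads = std::atoi(argv[1]);
	if (argc >= 3)
		a.n = std::atoi(argv[2]);
	if (a.nthreads == 0)
		a.nthreads = std::thread::hardware_concurrency();
	return a;
}

int main(int argc, char *argv[])
{
	bench_args a = parse_args(argc, argv, 40);
	return (a.nthreads > 0 && a.n > 0) ? 0 : 1;
}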
-------------------------------------------------------------------------------- /benchmarks/staccato/blkmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target blkmul-staccato) 4 | 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 6 | 7 | add_executable(${target} main.cpp) 8 | 9 | find_path(STACCATO_INC staccato) 10 | 11 | target_link_libraries(${target} pthread) 12 | link_directories(${target} "${STACCATO_INC}") 13 | -------------------------------------------------------------------------------- /benchmarks/staccato/matmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target matmul-staccato) 4 | 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 6 | 7 | add_executable(${target} main.cpp) 8 | 9 | find_path(STACCATO_INC staccato) 10 | 11 | target_link_libraries(${target} pthread) 12 | link_directories(${target} "${STACCATO_INC}") 13 | 14 | -------------------------------------------------------------------------------- /benchmarks/staccato/mergesort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target mergesort-staccato) 4 | 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 6 | 7 | add_executable(${target} main.cpp) 8 | 9 | find_path(STACCATO_INC staccato) 10 | 11 | target_link_libraries(${target} pthread) 12 | link_directories(${target} "${STACCATO_INC}") 13 | 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | 19 | # Compiled Static libraries 20 | *.lai 21 | *.la 22 | *.a 23 | *.lib 24 | 25 | # Executables 26 | *.exe 27 | *.out 28 | *.app 29 | 30 | *build* 31 | .ycm_extra_conf* 32 | *.vim 33 | -------------------------------------------------------------------------------- /tests/task_mock.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TASK_MOCK_HPP_FKQ408NU 2 | #define TASK_MOCK_HPP_FKQ408NU 3 | 4 | #include "gtest/gtest.h" 5 | #include "gmock/gmock.h" 6 | 7 | #include "task.hpp" 8 | 9 | using namespace ::testing; 10 | 11 | class task_mock: public staccato::task 12 | { 13 | public: 14 | task_mock(size_t id_) : id(id_) 15 | {}; 16 | 17 | virtual ~task_mock() {}; 18 | 19 | size_t id; 20 | 21 | MOCK_METHOD0(execute, void()); 22 | }; 23 | 24 | #endif /* end of include guard: TASK_MOCK_HPP_FKQ408NU */ 25 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.2) 2 | 3 | project(googletest-download NONE) 4 | 5 | include(ExternalProject) 6 | ExternalProject_Add(googletest 7 | GIT_REPOSITORY https://github.com/google/googletest.git 8 | GIT_TAG master 9 | SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src" 10 | BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build" 11 | CONFIGURE_COMMAND "" 12 | UPDATE_COMMAND "" 13 | BUILD_COMMAND "" 14 | INSTALL_COMMAND "" 15 | TEST_COMMAND "" 16 | ) 17 | 
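The task_mock.hpp above wires gmock into the scheduler's task interface by mocking the pure-virtual execute(). A sketch of how a test could use it, assuming the googletest setup downloaded by CMakeLists.txt.in and a build via the my_add_test() helper from tests/CMakeLists.txt; the test name and expectations are illustrative:

#include "gtest/gtest.h"
#include "gmock/gmock.h"

#include "task_mock.hpp"

using ::testing::Exactly;

TEST(task_mock_usage, execute_is_observable) {
	task_mock t(42);

	// in a real test the scheduler under test would be what calls
	// execute(); here we drive it directly to show the expectation wiring
	EXPECT_CALL(t, execute()).Times(Exactly(1));

	t.execute();

	EXPECT_EQ(t.id, 42u);
}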
-------------------------------------------------------------------------------- /benchmarks/tbb/dfs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target dfs-tbb) 4 | 5 | add_executable(${target} main.cpp) 6 | 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 8 | 9 | find_library(TBB_LIB tbb) 10 | find_path(TBB_INC tbb) 11 | 12 | target_link_libraries(${target} "${TBB_LIB}") 13 | link_directories(${target} "${TBB_INC}") 14 | 15 | find_library(TBBMALLOC_LIB tbbmalloc_proxy) 16 | target_link_libraries(${target} "${TBBMALLOC_LIB}") 17 | 18 | find_path(TBBMALLOC_INC tbbmalloc_proxy) 19 | link_directories(${target} "${TBBMALLOC_INC}") 20 | 21 | -------------------------------------------------------------------------------- /benchmarks/tbb/fib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target fib-tbb) 4 | 5 | add_executable(${target} main.cpp) 6 | 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 8 | 9 | find_library(TBB_LIB tbb) 10 | find_path(TBB_INC tbb) 11 | 12 | target_link_libraries(${target} "${TBB_LIB}") 13 | link_directories(${target} "${TBB_INC}") 14 | 15 | find_library(TBBMALLOC_LIB tbbmalloc_proxy) 16 | target_link_libraries(${target} "${TBBMALLOC_LIB}") 17 | 18 | find_path(TBBMALLOC_INC tbbmalloc_proxy) 19 | link_directories(${target} "${TBBMALLOC_INC}") 20 | 21 | -------------------------------------------------------------------------------- /benchmarks/tbb/blkmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target blkmul-tbb) 4 | 5 | add_executable(${target} main.cpp) 6 | 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 8 | 9 | find_library(TBB_LIB tbb) 10 | find_path(TBB_INC tbb) 11 | 12 | target_link_libraries(${target} "${TBB_LIB}") 13 | link_directories(${target} "${TBB_INC}") 14 | 15 | find_library(TBBMALLOC_LIB tbbmalloc_proxy) 16 | target_link_libraries(${target} "${TBBMALLOC_LIB}") 17 | 18 | find_path(TBBMALLOC_INC tbbmalloc_proxy) 19 | link_directories(${target} "${TBBMALLOC_INC}") 20 | 21 | -------------------------------------------------------------------------------- /benchmarks/tbb/matmul/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target matmul-tbb) 4 | 5 | add_executable(${target} main.cpp) 6 | 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 8 | 9 | find_library(TBB_LIB tbb) 10 | find_path(TBB_INC tbb) 11 | 12 | target_link_libraries(${target} "${TBB_LIB}") 13 | link_directories(${target} "${TBB_INC}") 14 | 15 | find_library(TBBMALLOC_LIB tbbmalloc_proxy) 16 | target_link_libraries(${target} "${TBBMALLOC_LIB}") 17 | 18 | find_path(TBBMALLOC_INC tbbmalloc_proxy) 19 | link_directories(${target} "${TBBMALLOC_INC}") 20 | 21 | -------------------------------------------------------------------------------- /benchmarks/tbb/mergesort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(target mergesort-tbb) 4 | 5 | add_executable(${target} main.cpp) 6 | 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 8 | 9 | find_library(TBB_LIB tbb) 10 | find_path(TBB_INC tbb) 11 | 12 | target_link_libraries(${target} "${TBB_LIB}") 13 | 
link_directories(${target} "${TBB_INC}")

find_library(TBBMALLOC_LIB tbbmalloc_proxy)
target_link_libraries(${target} "${TBBMALLOC_LIB}")

find_path(TBBMALLOC_INC tbbmalloc_proxy)
link_directories(${target} "${TBBMALLOC_INC}")

--------------------------------------------------------------------------------
/tests/lifo_allocator.cpp:
--------------------------------------------------------------------------------
#include <cstddef>
#include <vector>

#include "gtest/gtest.h"
#include "gmock/gmock.h"

#include "lifo_allocator.hpp"

using namespace staccato;
using namespace staccato::internal;

TEST(ctor, creating_and_deleting) {
	auto a = new lifo_allocator(100);
	delete a;
}

TEST(alloc_dealloc, no_resize) {
	size_t n = 5000;
	auto a = new lifo_allocator(2048);

	std::vector<size_t *> ptrs;

	for (size_t i = 1; i <= n; ++i) {
		auto p = new(a->alloc()) size_t;
		*p = i;
		ptrs.push_back(p);
	}

	// walk back down to the very first allocation
	for (size_t i = n; i >= 1; --i) {
		auto p = ptrs.back();
		EXPECT_EQ(*p, i);

		ptrs.pop_back();
	}

	delete a;
}

--------------------------------------------------------------------------------
/benchmarks/sequential/fib/main.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cstdlib>
#include <chrono>

using namespace std;
using namespace chrono;

void fib_seq(size_t n, unsigned long *sum)
{
	if (n <= 2) {
		*sum = 1;
		return;
	}

	unsigned long x;
	fib_seq(n - 1, &x);

	unsigned long y;
	fib_seq(n - 2, &y);

	*sum = x + y;

	return;
}

int main(int argc, char *argv[])
{
	size_t n = 40;
	unsigned long answer;

	if (argc >= 3)
		n = atoi(argv[2]);

	auto start = system_clock::now();

	fib_seq(n, &answer);

	auto stop = system_clock::now();

	cout << "Scheduler: sequential\n";
	cout << "Benchmark: fib\n";
	cout << "Threads: " << 0 << "\n";
	cout << "Time(us): " << duration_cast<microseconds>(stop - start).count() << "\n";
	cout << "Input: " << n << "\n";
	cout << "Output: " << answer << "\n";

	return 0;
}
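fib_seq uses the fib(1) = fib(2) = 1 convention, so its exponentially recursive result can be cross-checked against a linear-time rewrite. A small sketch for validating the "Output:" line; fib_iter is illustrative, not part of the repository:

#include <iostream>

// O(n) Fibonacci with the same fib(1) = fib(2) = 1 convention as fib_seq
unsigned long fib_iter(std::size_t n)
{
	unsigned long a = 1, b = 1;
	for (std::size_t i = 2; i < n; ++i) {
		unsigned long next = a + b;
		a = b;
		b = next;
	}
	return b;
}

int main()
{
	std::cout << fib_iter(40) << "\n"; // expect 102334155, matching the benchmark
}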
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Ruslan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/examples/01-class/fib-class.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cstdlib>
#include <thread>

#include <staccato/task.hpp>
#include <staccato/scheduler.hpp>

using namespace std;
using namespace staccato;

class FibTask: public task<FibTask>
{
public:
	FibTask (int n_, long *sum_): n(n_), sum(sum_)
	{ }

	void execute() {
		if (n <= 2) {
			*sum = 1;
			return;
		}

		long x;
		spawn(new(child()) FibTask(n - 1, &x));

		long y;
		spawn(new(child()) FibTask(n - 2, &y));

		wait();

		*sum = x + y;

		return;
	}

private:
	int n;
	long *sum;
};

int main(int argc, char *argv[])
{
	size_t n = 20;
	long answer;
	size_t nthreads = 0;

	if (argc >= 2)
		nthreads = atoi(argv[1]);
	if (argc >= 3)
		n = atoi(argv[2]);
	if (nthreads == 0)
		nthreads = thread::hardware_concurrency();

	{
		scheduler<FibTask> sh(2, nthreads);
		sh.spawn(new(sh.root()) FibTask(n, &answer));
		sh.wait();
	}

	cout << "fib(" << n << ") = " << answer << "\n";

	return 0;
}

--------------------------------------------------------------------------------
/benchmarks/sequential/dfs/main.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cstdlib>
#include <vector>
#include <chrono>

using namespace std;
using namespace chrono;

void seq_dfs(size_t depth, size_t breadth, unsigned long *sum)
{
	if (depth == 0) {
		*sum = 1;
		return;
	}

	vector<unsigned long> sums(breadth);

	for (size_t i = 0; i < breadth; ++i)
		seq_dfs(depth - 1, breadth, &sums[i]);

	*sum = 0;
	for (size_t i = 0; i < breadth; ++i)
		*sum += sums[i];
}

int main(int argc, char *argv[])
{
	size_t depth = 8;
	size_t breadth = 8;
	unsigned long answer;

	if (argc >= 3)
		depth = atoi(argv[2]);
	if (argc >= 4)
		breadth = atoi(argv[3]);

	auto start = system_clock::now();

	seq_dfs(depth, breadth, &answer);

	auto stop = system_clock::now();

	cout << "Scheduler: sequential\n";
	cout << "Benchmark: dfs\n";
	cout << "Threads: " << 0 << "\n";
	cout << "Time(us): " << duration_cast<microseconds>(stop - start).count() << "\n";
	cout << "Input: " << depth << " " << breadth << "\n";
	cout << "Output: " << answer << "\n";

	return 0;
}

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 2.8)

project("Staccato C++ task scheduler")

option(STACCATO_BUILD_EXAMPLES "Build example programs" OFF)
option(STACCATO_BUILD_TESTS "Build all tests" OFF)

if (NOT CMAKE_BUILD_TYPE)
	set (CMAKE_BUILD_TYPE Release)
endif()

if (CMAKE_BUILD_TYPE MATCHES Debug)
	message("Debug build.")

	set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0")
	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic -Wextra")
17 | 18 | add_definitions(-DSTACCATO_DEBUG=1) 19 | set(CMAKE_VERBOSE_MAKEFILE TRUE) 20 | elseif (CMAKE_BUILD_TYPE MATCHES Release) 21 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 22 | message("Release build.") 23 | endif () 24 | 25 | set(CMAKE_CXX_STANDARD 11) 26 | 27 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) 28 | 29 | set(HEADERS 30 | include/scheduler.hpp 31 | include/task.hpp 32 | include/debug.hpp 33 | include/task_deque.hpp 34 | include/lifo_allocator.hpp 35 | include/worker.hpp 36 | include/utils.hpp 37 | include/counter.hpp 38 | ) 39 | 40 | install( 41 | FILES ${HEADERS} 42 | DESTINATION include/staccato 43 | ) 44 | 45 | # enable_testing() 46 | add_subdirectory(examples/01-class) 47 | # add_subdirectory(tests) 48 | 49 | -------------------------------------------------------------------------------- /include/debug.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEBUG_HPP_0BRUWGOK 2 | #define DEBUG_HPP_0BRUWGOK 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "utils.hpp" 11 | 12 | namespace staccato { 13 | namespace internal { 14 | 15 | #if STACCATO_DEBUG 16 | 17 | class Debug 18 | { 19 | public: 20 | Debug(size_t indent = 0) 21 | : m_printed(false) 22 | , m_indent(indent) 23 | { 24 | m_buffer << "[STACCATO]"; 25 | 26 | m_buffer << "["; 27 | m_buffer << std::setfill('0') << std::setw(5) 28 | << std::hash()(std::this_thread::get_id()) % 100000; 29 | m_buffer << "] "; 30 | 31 | for (size_t i = 0; i < m_indent; ++i) { 32 | m_buffer << " "; 33 | } 34 | } 35 | 36 | ~Debug() 37 | { 38 | if (!m_printed) 39 | print(); 40 | } 41 | 42 | template 43 | Debug & operator<<(const T &value) 44 | { 45 | m_buffer << value; 46 | return *this; 47 | } 48 | 49 | private: 50 | void print() 51 | { 52 | m_buffer << std::endl; 53 | std::cerr << m_buffer.str(); 54 | m_printed = true; 55 | } 56 | 57 | std::ostringstream m_buffer; 58 | bool m_printed; 59 | size_t m_indent; 60 | }; 61 | 62 | #else 63 | 64 | class Debug 65 | { 66 | public: 67 | Debug(size_t n = 0) { } 68 | ~Debug() { } 69 | 70 | template 71 | Debug & operator<<(const T &) { } 72 | }; 73 | 74 | #endif 75 | 76 | } /* internal */ 77 | } /* staccato */ 78 | 79 | #endif /* end of include guard: DEBUG_HPP_0BRUWGOK */ 80 | -------------------------------------------------------------------------------- /include/task.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TASK_META_HPP_34TJDRLS 2 | #define TASK_META_HPP_34TJDRLS 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "task_deque.hpp" 10 | #include "utils.hpp" 11 | 12 | namespace staccato 13 | { 14 | 15 | namespace internal { 16 | template 17 | class worker; 18 | } 19 | 20 | template 21 | class task { 22 | public: 23 | task(); 24 | virtual ~task(); 25 | 26 | virtual void execute() = 0; 27 | 28 | T *child(); 29 | 30 | void spawn(T *t); 31 | 32 | void wait(); 33 | 34 | void process(internal::worker *worker, internal::task_deque *tail); 35 | 36 | private: 37 | internal::worker *m_worker; 38 | 39 | internal::task_deque *m_tail; 40 | }; 41 | 42 | template 43 | task::task() 44 | { } 45 | 46 | template 47 | task::~task() 48 | { } 49 | 50 | template 51 | void task::process(internal::worker *worker, internal::task_deque *tail) 52 | { 53 | m_worker = worker; 54 | m_tail = tail; 55 | 56 | execute(); 57 | } 58 | 59 | template 60 | T *task::child() 61 | { 62 | return m_tail->put_allocate(); 63 | } 64 | 65 | template 66 | void 
task::spawn(T *) 67 | { 68 | m_tail->put_commit(); 69 | } 70 | 71 | template 72 | void task::wait() 73 | { 74 | m_worker->local_loop(m_tail); 75 | 76 | // m_tail->reset(); 77 | } 78 | 79 | } /* staccato */ 80 | 81 | 82 | #endif /* end of include guard: TASK_META_HPP_34TJDRLS */ 83 | -------------------------------------------------------------------------------- /benchmarks/openmp/fib/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | using namespace std; 8 | using namespace std::chrono; 9 | 10 | void fib(int n, unsigned long *sum) 11 | { 12 | if (n <= 2) { 13 | *sum = 1; 14 | return; 15 | } 16 | 17 | unsigned long x; 18 | #pragma omp task shared(n, x) 19 | fib(n - 1, &x); 20 | 21 | unsigned long y; 22 | #pragma omp task shared(n, y) 23 | fib(n - 2, &y); 24 | 25 | #pragma omp taskwait 26 | 27 | *sum = x + y; 28 | } 29 | 30 | void test(int n, unsigned long *sum) 31 | { 32 | #pragma omp task shared(n, sum) 33 | fib(n, sum); 34 | #pragma omp taskwait 35 | } 36 | 37 | int main(int argc, char *argv[]) 38 | { 39 | size_t n = 40; 40 | unsigned long answer; 41 | size_t nthreads = 0; 42 | 43 | if (argc >= 2) 44 | nthreads = atoi(argv[1]); 45 | if (argc >= 3) 46 | n = atoi(argv[2]); 47 | if (nthreads == 0) 48 | nthreads = thread::hardware_concurrency(); 49 | 50 | auto start = system_clock::now(); 51 | 52 | omp_set_dynamic(0); 53 | omp_set_num_threads(nthreads); 54 | 55 | #pragma omp parallel shared(n, answer) 56 | #pragma omp single 57 | test(n, &answer); 58 | 59 | auto stop = system_clock::now(); 60 | 61 | cout << "Scheduler: omp\n"; 62 | cout << "Benchmark: fib\n"; 63 | cout << "Threads: " << nthreads << "\n"; 64 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 65 | cout << "Input: " << n << "\n"; 66 | cout << "Output: " << answer << "\n"; 67 | 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /benchmarks/cilk/fib/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | using namespace std::chrono; 10 | 11 | void fib(int n, unsigned long *sum) 12 | { 13 | if (n <= 2) { 14 | *sum = 1; 15 | return; 16 | } 17 | 18 | unsigned long x; 19 | cilk_spawn fib(n - 1, &x); 20 | 21 | unsigned long y; 22 | cilk_spawn fib(n - 2, &y); 23 | 24 | cilk_sync; 25 | 26 | *sum = x + y; 27 | } 28 | 29 | void test(int n, unsigned long *sum) 30 | { 31 | cilk_spawn fib(n, sum); 32 | cilk_sync; 33 | } 34 | 35 | int main(int argc, char *argv[]) 36 | { 37 | size_t n = 40; 38 | unsigned long answer; 39 | const char *nthreads = nullptr; 40 | 41 | if (argc >= 2) 42 | nthreads = argv[1]; 43 | if (argc >= 3) 44 | n = atoi(argv[2]); 45 | if (nthreads == nullptr) 46 | nthreads = to_string(thread::hardware_concurrency()).c_str(); 47 | 48 | __cilkrts_end_cilk(); 49 | 50 | auto start = system_clock::now(); 51 | 52 | if (__cilkrts_set_param("nworkers", nthreads) != 0) { 53 | cerr << "Failed to set worker count\n"; 54 | exit(EXIT_FAILURE); 55 | } 56 | 57 | __cilkrts_init(); 58 | 59 | test(n, &answer); 60 | 61 | __cilkrts_end_cilk(); 62 | 63 | auto stop = system_clock::now(); 64 | 65 | cout << "Scheduler: cilk\n"; 66 | cout << "Benchmark: fib\n"; 67 | cout << "Threads: " << nthreads << "\n"; 68 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 69 | cout << "Input: " << n << "\n"; 70 | cout << "Output: " 
<< answer << "\n"; 71 | 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /benchmarks/staccato/fib/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | using namespace chrono; 10 | using namespace staccato; 11 | 12 | class FibTask: public task 13 | { 14 | public: 15 | FibTask (int n_, unsigned long *sum_): n(n_), sum(sum_) 16 | { } 17 | 18 | void execute() { 19 | if (n <= 2) { 20 | *sum = 1; 21 | return; 22 | } 23 | 24 | unsigned long x; 25 | spawn(new(child()) FibTask(n - 1, &x)); 26 | 27 | unsigned long y; 28 | spawn(new(child()) FibTask(n - 2, &y)); 29 | 30 | wait(); 31 | 32 | *sum = x + y; 33 | 34 | return; 35 | } 36 | 37 | private: 38 | int n; 39 | unsigned long *sum; 40 | }; 41 | 42 | int main(int argc, char *argv[]) 43 | { 44 | size_t n = 40; 45 | unsigned long answer; 46 | size_t nthreads = 0; 47 | 48 | if (argc >= 2) 49 | nthreads = atoi(argv[1]); 50 | if (argc >= 3) 51 | n = atoi(argv[2]); 52 | if (nthreads == 0) 53 | nthreads = thread::hardware_concurrency(); 54 | 55 | auto start = system_clock::now(); 56 | 57 | { 58 | scheduler sh(2, nthreads); 59 | sh.spawn(new(sh.root()) FibTask(n, &answer)); 60 | sh.wait(); 61 | } 62 | 63 | auto stop = system_clock::now(); 64 | 65 | cout << "Scheduler: staccato\n"; 66 | cout << "Benchmark: fib\n"; 67 | cout << "Threads: " << nthreads << "\n"; 68 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 69 | cout << "Input: " << n << "\n"; 70 | cout << "Output: " << answer << "\n"; 71 | 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /benchmarks/openmp/dfs/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | using namespace std; 9 | using namespace chrono; 10 | 11 | void dfs(size_t depth, size_t breadth, unsigned long *sum) { 12 | if (depth == 0) { 13 | *sum = 1; 14 | return; 15 | } 16 | 17 | vector sums(breadth); 18 | 19 | for (size_t i = 0; i < breadth; ++i) 20 | { 21 | auto s = &sums[i]; 22 | #pragma omp task shared(depth, breadth, s) 23 | dfs(depth - 1, breadth, s); 24 | } 25 | 26 | #pragma omp taskwait 27 | 28 | *sum = 0; 29 | for (size_t i = 0; i < breadth; ++i) 30 | *sum += sums[i]; 31 | } 32 | 33 | void test(size_t depth, size_t breadth, unsigned long *sum) 34 | { 35 | #pragma omp task shared(depth, breadth, sum) 36 | dfs(depth, breadth, sum); 37 | #pragma omp taskwait 38 | } 39 | 40 | int main(int argc, char *argv[]) 41 | { 42 | size_t depth = 8; 43 | size_t breadth = 8; 44 | unsigned long answer; 45 | size_t nthreads = 0; 46 | 47 | if (argc >= 2) 48 | nthreads = atoi(argv[1]); 49 | if (argc >= 3) 50 | depth = atoi(argv[2]); 51 | if (argc >= 4) 52 | breadth = atoi(argv[3]); 53 | if (nthreads == 0) 54 | nthreads = thread::hardware_concurrency(); 55 | 56 | auto start = system_clock::now(); 57 | 58 | omp_set_dynamic(0); 59 | omp_set_num_threads(nthreads); 60 | 61 | #pragma omp parallel shared(depth, breadth, answer) 62 | #pragma omp single 63 | test(depth, breadth, &answer); 64 | 65 | auto stop = system_clock::now(); 66 | 67 | cout << "Scheduler: omp\n"; 68 | cout << "Benchmark: dfs\n"; 69 | cout << "Threads: " << nthreads << "\n"; 70 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 71 | cout << "Input: " << depth << " " << breadth << 
"\n"; 72 | cout << "Output: " << answer << "\n"; 73 | 74 | return 0; 75 | } 76 | -------------------------------------------------------------------------------- /benchmarks/cilk/dfs/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | using namespace chrono; 11 | 12 | void dfs(size_t depth, size_t breadth, unsigned long *sum) { 13 | if (depth == 0) { 14 | *sum = 1; 15 | return; 16 | } 17 | 18 | vector sums(breadth); 19 | 20 | for (size_t i = 0; i < breadth; ++i) 21 | cilk_spawn dfs(depth - 1, breadth, &sums[i]); 22 | 23 | cilk_sync; 24 | 25 | *sum = 0; 26 | for (size_t i = 0; i < breadth; ++i) 27 | *sum += sums[i]; 28 | } 29 | 30 | void test(size_t depth, size_t breadth, unsigned long *sum) 31 | { 32 | cilk_spawn dfs(depth, breadth, sum); 33 | cilk_sync; 34 | } 35 | 36 | int main(int argc, char *argv[]) 37 | { 38 | size_t depth = 8; 39 | size_t breadth = 8; 40 | unsigned long answer; 41 | const char *nthreads = nullptr; 42 | 43 | if (argc >= 2) 44 | nthreads = argv[1]; 45 | if (argc >= 3) 46 | depth = atoi(argv[2]); 47 | if (argc >= 4) 48 | breadth = atoi(argv[3]); 49 | if (nthreads == nullptr) 50 | nthreads = to_string(thread::hardware_concurrency()).c_str(); 51 | 52 | __cilkrts_end_cilk(); 53 | 54 | auto start = system_clock::now(); 55 | 56 | if (__cilkrts_set_param("nworkers", nthreads) != 0) { 57 | cerr << "Failed to set worker count\n"; 58 | exit(EXIT_FAILURE); 59 | } 60 | 61 | __cilkrts_init(); 62 | 63 | test(depth, breadth, &answer); 64 | 65 | __cilkrts_end_cilk(); 66 | 67 | auto stop = system_clock::now(); 68 | 69 | cout << "Scheduler: cilk\n"; 70 | cout << "Benchmark: dfs\n"; 71 | cout << "Threads: " << nthreads << "\n"; 72 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 73 | cout << "Input: " << depth << " " << breadth << "\n"; 74 | cout << "Output: " << answer << "\n"; 75 | 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /benchmarks/staccato/dfs/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | using namespace chrono; 11 | using namespace staccato; 12 | 13 | class DFSTask: public task 14 | { 15 | public: 16 | DFSTask (size_t depth_, size_t breadth_, unsigned long *sum_) 17 | : depth(depth_) 18 | , breadth(breadth_) 19 | , sum(sum_) 20 | { } 21 | 22 | void execute() { 23 | if (depth == 0) { 24 | *sum = 1; 25 | return; 26 | } 27 | 28 | vector sums(breadth); 29 | 30 | for (size_t i = 0; i < breadth; ++i) 31 | spawn(new(child()) DFSTask(depth - 1, breadth, &sums[i])); 32 | 33 | wait(); 34 | 35 | *sum = 0; 36 | for (size_t i = 0; i < breadth; ++i) 37 | *sum += sums[i]; 38 | 39 | return; 40 | } 41 | 42 | private: 43 | size_t depth; 44 | size_t breadth; 45 | unsigned long *sum; 46 | }; 47 | 48 | int main(int argc, char *argv[]) 49 | { 50 | size_t depth = 8; 51 | size_t breadth = 8; 52 | unsigned long answer; 53 | size_t nthreads = 0; 54 | 55 | if (argc >= 2) 56 | nthreads = atoi(argv[1]); 57 | if (argc >= 3) 58 | depth = atoi(argv[2]); 59 | if (argc >= 4) 60 | breadth = atoi(argv[3]); 61 | if (nthreads == 0) 62 | nthreads = thread::hardware_concurrency(); 63 | 64 | auto start = system_clock::now(); 65 | 66 | { 67 | scheduler sh(breadth, nthreads); 68 | sh.spawn(new(sh.root()) DFSTask(depth, 
breadth, &answer)); 69 | sh.wait(); 70 | } 71 | 72 | auto stop = system_clock::now(); 73 | 74 | cout << "Scheduler: staccato\n"; 75 | cout << "Benchmark: dfs\n"; 76 | cout << "Threads: " << nthreads << "\n"; 77 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 78 | cout << "Input: " << depth << " " << breadth << "\n"; 79 | cout << "Output: " << answer << "\n"; 80 | 81 | return 0; 82 | } 83 | -------------------------------------------------------------------------------- /benchmarks/plor.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library('ggplot2') 4 | 5 | read_input <- function (path) { 6 | data.raw <- read.table(input, header=TRUE) 7 | 8 | sdev <- function (x) sqrt(var(x)) 9 | data.mean <- aggregate(time ~ sched + name + threads, data.raw, mean) 10 | data.sdev <- aggregate(time ~ sched + name + threads, data.raw, sdev) 11 | data <- cbind(data.mean, sdev = data.sdev$time) 12 | return(data) 13 | } 14 | 15 | my_plot <- function (data, name, title = 'atata') { 16 | d <- data[data$name==name,] 17 | 18 | # t <- log(time) 19 | p <- ggplot(d, aes(x = threads, y = log(time), group = sched, color = sched)) + 20 | geom_line() + 21 | geom_point() + 22 | # geom_errorbar( 23 | # aes(ymin = t - sdev, ymax = t + sdev), 24 | # width = .2, 25 | # position = position_dodge(0.05) 26 | # ) + 27 | labs( 28 | title = title, 29 | x = 'Number of threads', 30 | y = 'Execution time (us), log scale' 31 | ) + 32 | scale_color_manual(values=c('#999999','#E69F00', '#4286f4')) 33 | 34 | return(p) 35 | } 36 | 37 | process <- function (data, benchmark, title, output) { 38 | p <- my_plot(data, benchmark, title) 39 | 40 | path <- paste(output, '-', benchmark, '.png', sep='') 41 | ggsave(path, p, width = 10, height = 6) 42 | } 43 | 44 | 45 | args = commandArgs(trailingOnly=TRUE) 46 | if (length(args) == 0) { 47 | stop("Usage: ./plot.r ", call.=FALSE) 48 | } 49 | 50 | input <- args[1] 51 | 52 | data <- read_input(input) 53 | 54 | process(data, 'fib', 'Fibonacci Number (43)', input) 55 | process(data, 'dfs', 'Depth First Search (9^10 vertices)', input) 56 | process(data, 'matmul', 'Matrix Multiplication (3500x3500)', input) 57 | process(data, 'mergesort', 'Merge Sort (10^9 of 4 byte integers)', input) 58 | process(data, 'blkmul', 'Block Matrix Multiplication (4096*4096)', input) 59 | 60 | 61 | -------------------------------------------------------------------------------- /benchmarks/tbb/dfs/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | using namespace chrono; 11 | using namespace tbb; 12 | 13 | class DFSTask: public task 14 | { 15 | public: 16 | DFSTask (size_t depth_, size_t breadth_, unsigned long *sum_) 17 | : depth(depth_) 18 | , breadth(breadth_) 19 | , sum(sum_) 20 | { } 21 | 22 | task *execute() { 23 | if (depth == 0) { 24 | *sum = 1; 25 | return nullptr; 26 | } 27 | 28 | vector sums(breadth); 29 | 30 | set_ref_count(breadth + 1); 31 | 32 | for (size_t i = 0; i < breadth; ++i) 33 | spawn(*new(allocate_child()) DFSTask(depth - 1, breadth, &sums[i])); 34 | 35 | wait_for_all(); 36 | 37 | *sum = 0; 38 | for (size_t i = 0; i < breadth; ++i) 39 | *sum += sums[i]; 40 | 41 | return nullptr; 42 | } 43 | 44 | private: 45 | size_t depth; 46 | size_t breadth; 47 | unsigned long *sum; 48 | }; 49 | 50 | int main(int argc, char *argv[]) 51 | { 52 | size_t depth = 8; 53 | size_t 
breadth = 8; 54 | unsigned long answer; 55 | size_t nthreads = 0; 56 | 57 | if (argc >= 2) 58 | nthreads = atoi(argv[1]); 59 | if (argc >= 3) 60 | depth = atoi(argv[2]); 61 | if (argc >= 4) 62 | breadth = atoi(argv[3]); 63 | if (nthreads == 0) 64 | nthreads = thread::hardware_concurrency(); 65 | 66 | auto start = system_clock::now(); 67 | 68 | task_scheduler_init scheduler(nthreads); 69 | 70 | auto root = new(task::allocate_root()) DFSTask(depth, breadth, &answer); 71 | 72 | task::spawn_root_and_wait(*root); 73 | 74 | scheduler.terminate(); 75 | 76 | auto stop = system_clock::now(); 77 | 78 | cout << "Scheduler: tbb\n"; 79 | cout << "Benchmark: dfs\n"; 80 | cout << "Threads: " << nthreads << "\n"; 81 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 82 | cout << "Input: " << depth << " " << breadth << "\n"; 83 | cout << "Output: " << answer << "\n"; 84 | 85 | return 0; 86 | } 87 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | project(StaccatoUnitTests) 3 | 4 | option(STACCATO_MEMCHECK "Enable memory checks with valgrind" ON) 5 | 6 | set(CMAKE_CXX_STANDARD 11) 7 | set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") 8 | 9 | set(GTEST_DOWNLOAD_DIR ${CMAKE_BINARY_DIR}/gtest_download) 10 | 11 | # Download and unpack googletest at configure time 12 | configure_file(CMakeLists.txt.in ${GTEST_DOWNLOAD_DIR}/CMakeLists.txt) 13 | 14 | execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . 15 | RESULT_VARIABLE result 16 | WORKING_DIRECTORY ${GTEST_DOWNLOAD_DIR}) 17 | if(result) 18 | message(FATAL_ERROR "CMake step for googletest failed: ${result}") 19 | endif() 20 | 21 | execute_process(COMMAND ${CMAKE_COMMAND} --build . 22 | RESULT_VARIABLE result 23 | WORKING_DIRECTORY ${GTEST_DOWNLOAD_DIR}) 24 | if(result) 25 | message(FATAL_ERROR "Build step for googletest failed: ${result}") 26 | endif() 27 | 28 | # Prevent overriding the parent project's compiler/linker 29 | # settings on Windows 30 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) 31 | 32 | # Add googletest directly to our build. This defines 33 | # the gtest and gtest_main targets. 34 | add_subdirectory( 35 | ${CMAKE_BINARY_DIR}/googletest-src 36 | ${CMAKE_BINARY_DIR}/googletest-build 37 | ) 38 | 39 | # The gtest/gtest_main targets carry header search path 40 | # dependencies automatically when using CMake 2.8.11 or 41 | # later. Otherwise we have to add them here ourselves. 
if (CMAKE_VERSION VERSION_LESS 2.8.11)
	include_directories("${gtest_SOURCE_DIR}/include")
endif()

enable_testing()

include_directories(${CMAKE_CURRENT_SOURCE_DIR})

function(my_add_test name sources)
	add_executable(${name} ${sources})

	target_link_libraries(${name} pthread)

	target_link_libraries(${name} gtest_main gtest gmock)

	add_test(NAME ${name} COMMAND ${name})

	find_program(VALGRIND "valgrind")
	if (STACCATO_MEMCHECK AND VALGRIND)
		add_test(NAME ${name}_memcheck COMMAND ${VALGRIND} ${valgrind_args} ./${name})
	endif()

endfunction()

my_add_test(test_task_deque task_deque.cpp)
my_add_test(test_lifo_allocator lifo_allocator.cpp)

--------------------------------------------------------------------------------
/benchmarks/sequential/mergesort/main.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <chrono>

#include <staccato/task.hpp>
#include <staccato/scheduler.hpp>

using namespace std;
using namespace chrono;
using namespace staccato;

typedef int elem_t;

size_t length = 0;
elem_t *data = nullptr;
elem_t *data_tmp = nullptr;
long sum_before = 0;

inline uint32_t xorshift_rand() {
	static uint32_t x = 2463534242;
	x ^= x >> 13;
	x ^= x << 17;
	x ^= x >> 5;
	return x;
}

void generate_data(size_t n) {
	length = n;
	data = new elem_t[n];
	data_tmp = new elem_t[n];
	sum_before = 0;
	for (size_t i = 0; i < n; ++i) {
		data[i] = xorshift_rand() % (n / 2);
		sum_before += data[i];
	}
}

bool check() {
	long s = data[0];
	for (size_t i = 1; i < length; ++i) {
		s += data[i];
		if (data[i - 1] > data[i])
			return false;
	}

	return sum_before == s;
}

void seq_mergesort(size_t left, size_t right)
{
	if (right - left <= 1)
		return;

	size_t mid = (left + right) / 2;
	size_t l = left;
	size_t r = mid;

	seq_mergesort(left, mid);
	seq_mergesort(mid, right);

	for (size_t i = left; i < right; i++) {
		if ((l < mid && r < right && data[l] < data[r]) || r == right) {
			data_tmp[i] = data[l];
			l++;
		} else if ((l < mid && r < right) || l == mid) {
			data_tmp[i] = data[r];
			r++;
		}
	}

	memcpy(data + left, data_tmp + left, (right - left) * sizeof(elem_t));
}

int main(int argc, char *argv[])
{
	size_t n = 8e7;

	if (argc >= 3)
		n = atoi(argv[2]);

	generate_data(n);

	auto start = system_clock::now();

	seq_mergesort(0, n);

	auto stop = system_clock::now();

	cout << "Scheduler: sequential\n";
	cout << "Benchmark: mergesort\n";
	cout << "Threads: " << 0 << "\n";
	cout << "Time(us): " << duration_cast<microseconds>(stop - start).count() << "\n";
	cout << "Input: " << n << "\n";
	cout << "Output: " << check() << "\n";

	return 0;
}

--------------------------------------------------------------------------------
/include/counter.hpp:
--------------------------------------------------------------------------------
#ifndef COUNTER_HPP_6COIEFOP
#define COUNTER_HPP_6COIEFOP

#include <cstddef>
#include <cstdio>
#include <cstring>

#include "utils.hpp"

#define COUNT(e) m_counter.count(counter::e)

namespace staccato
{
namespace internal
{

class counter
{
public: 20 | counter(); 21 | 22 | enum event_e { 23 | take = 0, 24 | take_empty = 1, 25 | take_stolen = 2, 26 | steal = 3, 27 | steal_race = 4, 28 | steal_empty = 5, 29 | steal2 = 6, 30 | steal2_race = 7, 31 | steal2_empty = 8, 32 | dbg1 = 9, 33 | dbg2 = 10, 34 | }; 35 | 36 | void count(event_e e); 37 | 38 | static void print_header(); 39 | 40 | void print(size_t id) const; 41 | 42 | private: 43 | static const size_t m_nconsters = 11; 44 | static const int m_cell_width = 9; 45 | 46 | static const constexpr char* const m_events[] = { 47 | "take", 48 | "take!e", 49 | "take!s", 50 | "steal", 51 | "steal!r", 52 | "steal!e", 53 | "steal2", 54 | "steal2!r", 55 | "steal2!e", 56 | "dbg1", 57 | "dbg2" 58 | }; 59 | 60 | unsigned long m_counters[m_nconsters]; 61 | }; 62 | 63 | const constexpr char* const counter::m_events[]; 64 | 65 | counter::counter() 66 | { 67 | memset(m_counters, 0, m_nconsters * sizeof(m_counters[0])); 68 | } 69 | 70 | void counter::count(event_e e) 71 | { 72 | auto i = static_cast(e); 73 | m_counters[i]++; 74 | } 75 | 76 | void counter::print_header() 77 | { 78 | FILE *fp = stdout; 79 | 80 | fprintf(fp, "[STACCATO]"); 81 | fprintf(fp, " w# |"); 82 | 83 | auto n = sizeof(m_events) / sizeof(m_events[0]); 84 | for (size_t i = 0; i < n; ++i) 85 | fprintf(fp, "%*s |", m_cell_width, m_events[i]); 86 | 87 | fprintf(fp, "\n"); 88 | } 89 | 90 | void counter::print(size_t id) const 91 | { 92 | FILE *fp = stdout; 93 | 94 | fprintf(fp, "[STACCATO]"); 95 | fprintf(fp, "%3lu |", id); 96 | 97 | for (size_t i = 0; i < m_nconsters; ++i) 98 | fprintf(fp, "%*lu |", m_cell_width, m_counters[i]); 99 | 100 | fprintf(fp, "\n"); 101 | } 102 | 103 | } /* internal */ 104 | } /* staccato */ 105 | 106 | #endif /* end of include guard: COUNTER_HPP_6COIEFOP */ 107 | -------------------------------------------------------------------------------- /benchmarks/tbb/fib/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | using namespace chrono; 10 | using namespace tbb; 11 | 12 | class FibTask: public task 13 | { 14 | public: 15 | FibTask (int n_, unsigned long *sum_): n(n_), sum(sum_) 16 | { } 17 | 18 | task *execute() { 19 | if (n <= 2) { 20 | *sum = 1; 21 | return nullptr; 22 | } 23 | 24 | unsigned long x, y; 25 | FibTask &a = *new(allocate_child()) FibTask(n - 1, &x); 26 | FibTask &b = *new(allocate_child()) FibTask(n - 2, &y); 27 | 28 | set_ref_count(3); 29 | 30 | spawn(a); 31 | spawn(b); 32 | 33 | wait_for_all(); 34 | 35 | *sum = x + y; 36 | 37 | return nullptr; 38 | } 39 | 40 | private: 41 | int n; 42 | unsigned long *sum; 43 | }; 44 | 45 | void fib_seq(int n, unsigned long *sum) { 46 | if (n <= 2) { 47 | *sum = 1; 48 | return; 49 | } 50 | 51 | unsigned long x; 52 | fib_seq(n - 1, &x); 53 | 54 | unsigned long y; 55 | fib_seq(n - 2, &y); 56 | 57 | *sum = x + y; 58 | } 59 | 60 | int main(int argc, char *argv[]) 61 | { 62 | size_t n = 40; 63 | unsigned long answer; 64 | size_t nthreads = 0; 65 | 66 | if (argc >= 2) 67 | nthreads = atoi(argv[1]); 68 | if (argc >= 3) 69 | n = atoi(argv[2]); 70 | if (nthreads == 0) 71 | nthreads = thread::hardware_concurrency(); 72 | 73 | auto start_noshed = system_clock::now(); 74 | 75 | if (nthreads == 1) 76 | fib_seq(n, &answer); 77 | 78 | auto stop_noshed = system_clock::now(); 79 | auto noshed_time = duration_cast(stop_noshed - start_noshed).count(); 80 | cout << "Seq time(us): " << noshed_time << "\n"; 81 | 82 | auto start = 
system_clock::now();

	task_scheduler_init scheduler(nthreads);

	auto root = new(task::allocate_root()) FibTask(n, &answer);

	task::spawn_root_and_wait(*root);

	scheduler.terminate();

	auto stop = system_clock::now();

	auto shed_time = duration_cast<microseconds>(stop - start).count();

	cout << "Scheduler: tbb\n";
	cout << "Benchmark: fib\n";
	cout << "Threads: " << nthreads << "\n";
	cout << "Time(us): " << duration_cast<microseconds>(stop - start).count() << "\n";
	cout << "Input: " << n << "\n";
	cout << "Output: " << answer << "\n";

	if (nthreads == 1)
		cout << "Overhead: " << static_cast<double>(shed_time) / noshed_time << "\n";

	return 0;
}

--------------------------------------------------------------------------------
/benchmarks/openmp/mergesort/main.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <chrono>
#include <thread>

#include <omp.h>

using namespace std;
using namespace chrono;

typedef int elem_t;

size_t length = 0;
elem_t *data = nullptr;
elem_t *data_tmp = nullptr;
long sum_before = 0;

inline uint32_t xorshift_rand() {
	static uint32_t x = 2463534242;
	x ^= x >> 13;
	x ^= x << 17;
	x ^= x >> 5;
	return x;
}

void generate_data(size_t n) {
	length = n;
	data = new elem_t[n];
	data_tmp = new elem_t[n];
	sum_before = 0;
	for (size_t i = 0; i < n; ++i) {
		data[i] = xorshift_rand() % (n / 2);
		sum_before += data[i];
	}
}

bool check() {
	long s = data[0];
	for (size_t i = 1; i < length; ++i) {
		s += data[i];
		if (data[i - 1] > data[i])
			return false;
	}

	return sum_before == s;
}

void mergesort(size_t left, size_t right) {
	if (right - left <= 1)
		return;

	size_t mid = (left + right) / 2;
	size_t l = left;
	size_t r = mid;

#pragma omp task
	mergesort(left, mid);
#pragma omp task
	mergesort(mid, right);

#pragma omp taskwait

	for (size_t i = left; i < right; i++) {
		if ((l < mid && r < right && data[l] < data[r]) || r == right) {
			data_tmp[i] = data[l];
			l++;
		} else if ((l < mid && r < right) || l == mid) {
			data_tmp[i] = data[r];
			r++;
		}
	}

	// TODO: prefetch?
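	// data_tmp[left, right) now holds the two halves merged in order; copy
	// it back so that the caller, one merge level up, always reads sorted
	// data from `data` itself. The copy below is sized in bytes, hence
	// (right - left) * sizeof(elem_t).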
	memcpy(data + left, data_tmp + left, (right - left) * sizeof(elem_t));
}

void test(size_t left, size_t right)
{
#pragma omp task
	mergesort(left, right);
#pragma omp taskwait
}

int main(int argc, char *argv[])
{
	size_t n = 8e7;
	size_t nthreads = 0;

	if (argc >= 2)
		nthreads = atoi(argv[1]);
	if (argc >= 3)
		n = atoi(argv[2]);
	if (nthreads == 0)
		nthreads = thread::hardware_concurrency();

	generate_data(n);

	auto start = system_clock::now();

	omp_set_dynamic(0);
	omp_set_num_threads(nthreads);

#pragma omp parallel shared(n)
#pragma omp single
	test(0, n);

	auto stop = system_clock::now();

	cout << "Scheduler: omp\n";
	cout << "Benchmark: mergesort\n";
	cout << "Threads: " << nthreads << "\n";
	cout << "Time(us): " << duration_cast<microseconds>(stop - start).count() << "\n";
	cout << "Input: " << n << "\n";
	cout << "Output: " << check() << "\n";

	return 0;
}

--------------------------------------------------------------------------------
/benchmarks/cilk/mergesort/main.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <string>
#include <chrono>
#include <thread>

#include <cilk/cilk.h>
#include <cilk/cilk_api.h>

using namespace std;
using namespace chrono;

typedef int elem_t;

size_t length = 0;
elem_t *data = nullptr;
elem_t *data_tmp = nullptr;
long sum_before = 0;

inline uint32_t xorshift_rand() {
	static uint32_t x = 2463534242;
	x ^= x >> 13;
	x ^= x << 17;
	x ^= x >> 5;
	return x;
}

void generate_data(size_t n) {
	length = n;
	data = new elem_t[n];
	data_tmp = new elem_t[n];
	sum_before = 0;
	for (size_t i = 0; i < n; ++i) {
		data[i] = xorshift_rand() % (n / 2);
		sum_before += data[i];
	}
}

bool check() {
	long s = data[0];
	for (size_t i = 1; i < length; ++i) {
		s += data[i];
		if (data[i - 1] > data[i])
			return false;
	}

	return sum_before == s;
}

const size_t cutoff = 8192;

int qsort_cmp(const void* a, const void* b)
{
	elem_t arg1 = *(const elem_t*)a;
	elem_t arg2 = *(const elem_t*)b;
	return (arg1 > arg2) - (arg1 < arg2);
}

void mergesort(size_t left, size_t right) {
	if (right - left <= cutoff) {
		qsort(data + left, right - left, sizeof(elem_t), qsort_cmp);
		return;
	}

	if (right - left <= 1)
		return;

	size_t mid = (left + right) / 2;
	size_t l = left;
	size_t r = mid;

	cilk_spawn mergesort(left, mid);
	cilk_spawn mergesort(mid, right);

	cilk_sync;

	for (size_t i = left; i < right; i++) {
		if ((l < mid && r < right && data[l] < data[r]) || r == right) {
			data_tmp[i] = data[l];
			l++;
		} else if ((l < mid && r < right) || l == mid) {
			data_tmp[i] = data[r];
			r++;
		}
	}

	// TODO: prefetch?
87 | memcpy(data + left, data_tmp + left, (right - left) * sizeof(elem_t)); 88 | 89 | } 90 | void test(size_t left, size_t right) 91 | { 92 | cilk_spawn mergesort(left, right); 93 | cilk_sync; 94 | } 95 | 96 | int main(int argc, char *argv[]) 97 | { 98 | size_t n = 8e7; 99 | const char *nthreads = nullptr; 100 | string nthreads_str; 101 | if (argc >= 2) 102 | nthreads = argv[1]; 103 | if (argc >= 3) 104 | n = atoi(argv[2]); 105 | if (nthreads == nullptr) { 106 | // Keep the string alive for __cilkrts_set_param() below; c_str() on a temporary would dangle 107 | nthreads_str = to_string(thread::hardware_concurrency()); nthreads = nthreads_str.c_str(); } 108 | 109 | generate_data(n); 110 | __cilkrts_end_cilk(); 111 | 112 | auto start = system_clock::now(); 113 | 114 | if (__cilkrts_set_param("nworkers", nthreads) != 0) { 115 | cerr << "Failed to set worker count\n"; 116 | exit(EXIT_FAILURE); 117 | } 118 | 119 | __cilkrts_init(); 120 | 121 | test(0, n); 122 | 123 | __cilkrts_end_cilk(); 124 | 125 | auto stop = system_clock::now(); 126 | 127 | cout << "Scheduler: cilk\n"; 128 | cout << "Benchmark: mergesort\n"; 129 | cout << "Threads: " << nthreads << "\n"; 130 | cout << "Time(us): " << duration_cast<microseconds>(stop - start).count() << "\n"; 131 | cout << "Input: " << n << "\n"; 132 | cout << "Output: " << check() << "\n"; 133 | 134 | return 0; 135 | } 136 | -------------------------------------------------------------------------------- /benchmarks/staccato/mergesort/main.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdlib> 2 | #include <cstring> 3 | #include <iostream> 4 | #include <chrono> 5 | #include <thread> 6 | #include <staccato/scheduler.hpp> 7 | 8 | using namespace std; 9 | using namespace chrono; 10 | using namespace staccato; 11 | 12 | typedef int elem_t; 13 | 14 | size_t length = 0; 15 | elem_t *data = nullptr; 16 | elem_t *data_tmp = nullptr; 17 | long sum_before = 0; 18 | 19 | inline uint32_t xorshift_rand() { 20 | static uint32_t x = 2463534242; 21 | x ^= x >> 13; 22 | x ^= x << 17; 23 | x ^= x >> 5; 24 | return x; 25 | } 26 | 27 | void generate_data(size_t n) { 28 | length = n; 29 | data = new elem_t[n]; 30 | data_tmp = new elem_t[n]; 31 | sum_before = 0; 32 | for (size_t i = 0; i < n; ++i) { 33 | data[i] = xorshift_rand() % (n / 2); 34 | sum_before += data[i]; 35 | } 36 | } 37 | 38 | bool check() { 39 | long s = data[0]; 40 | for (size_t i = 1; i < length; ++i) { 41 | s += data[i]; 42 | if (data[i - 1] > data[i]) 43 | return false; 44 | } 45 | 46 | return sum_before == s; 47 | } 48 | 49 | class SortTask: public task<SortTask> 50 | { 51 | public: 52 | SortTask (size_t left, size_t right) 53 | : m_left(left) 54 | , m_right(right) 55 | { } 56 | 57 | static const size_t cutoff = 8192; 58 | 59 | static int qsort_cmp(const void* a, const void* b) 60 | { 61 | elem_t arg1 = *(const elem_t*)a; 62 | elem_t arg2 = *(const elem_t*)b; 63 | return (arg1 > arg2) - (arg1 < arg2); 64 | } 65 | 66 | void execute() { 67 | if (m_right - m_left <= cutoff) { 68 | qsort(data + m_left, m_right - m_left, sizeof(elem_t), qsort_cmp); 69 | return; 70 | } 71 | 72 | if (m_right - m_left <= 1) 73 | return; 74 | 75 | size_t mid = (m_left + m_right) / 2; 76 | size_t l = m_left; 77 | size_t r = mid; 78 | 79 | spawn(new(child()) SortTask(m_left, mid)); 80 | spawn(new(child()) SortTask(mid, m_right)); 81 | 82 | wait(); 83 | 84 | for (size_t i = m_left; i < m_right; i++) { 85 | if ((l < mid && r < m_right && data[l] < data[r]) || r == m_right) { 86 | data_tmp[i] = data[l]; 87 | l++; 88 | } else if ((l < mid && r < m_right) || l == mid) { 89 | data_tmp[i] = data[r]; 90 | r++; 91 | } 92 | } 93 | 94 | memcpy(data + m_left, data_tmp + m_left, (m_right - m_left) * sizeof(elem_t)); 95 | } 96 | 97 | private: 98 | size_t
m_left; 99 | size_t m_right; 100 | }; 101 | 102 | int main(int argc, char *argv[]) 103 | { 104 | size_t n = 8e7; 105 | size_t nthreads = 0; 106 | 107 | if (argc >= 2) 108 | nthreads = atoi(argv[1]); 109 | if (argc >= 3) 110 | n = atoi(argv[2]); 111 | if (nthreads == 0) 112 | nthreads = thread::hardware_concurrency(); 113 | 114 | generate_data(n); 115 | 116 | auto start = system_clock::now(); 117 | 118 | { 119 | scheduler<SortTask> sh(2, nthreads); 120 | sh.spawn(new(sh.root()) SortTask(0, n)); 121 | sh.wait(); 122 | } 123 | 124 | auto stop = system_clock::now(); 125 | 126 | cout << "Scheduler: staccato\n"; 127 | cout << "Benchmark: mergesort\n"; 128 | cout << "Threads: " << nthreads << "\n"; 129 | cout << "Time(us): " << duration_cast<microseconds>(stop - start).count() << "\n"; 130 | cout << "Input: " << n << "\n"; 131 | cout << "Output: " << check() << "\n"; 132 | 133 | return 0; 134 | } 135 | -------------------------------------------------------------------------------- /benchmarks/tbb/mergesort/main.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdlib> 2 | #include <cstring> 3 | #include <iostream> 4 | #include <chrono> 5 | #include <thread> 6 | #include <tbb/task.h> 7 | #include <tbb/task_scheduler_init.h> 8 | 9 | using namespace std; 10 | using namespace chrono; 11 | using namespace tbb; 12 | 13 | typedef int elem_t; 14 | 15 | size_t length = 0; 16 | elem_t *data = nullptr; 17 | elem_t *data_tmp = nullptr; 18 | long sum_before = 0; 19 | 20 | inline uint32_t xorshift_rand() { 21 | static uint32_t x = 2463534242; 22 | x ^= x >> 13; 23 | x ^= x << 17; 24 | x ^= x >> 5; 25 | return x; 26 | } 27 | 28 | void generate_data(size_t n) { 29 | length = n; 30 | data = new elem_t[n]; 31 | data_tmp = new elem_t[n]; 32 | sum_before = 0; 33 | for (size_t i = 0; i < n; ++i) { 34 | data[i] = xorshift_rand() % (n / 2); 35 | sum_before += data[i]; 36 | } 37 | } 38 | 39 | bool check() { 40 | long s = data[0]; 41 | for (size_t i = 1; i < length; ++i) { 42 | s += data[i]; 43 | if (data[i - 1] > data[i]) 44 | return false; 45 | } 46 | 47 | return sum_before == s; 48 | } 49 | 50 | class SortTask: public task 51 | { 52 | public: 53 | SortTask (size_t left, size_t right) 54 | : m_left(left) 55 | , m_right(right) 56 | { } 57 | 58 | static const size_t cutoff = 8192; 59 | 60 | static int qsort_cmp(const void* a, const void* b) 61 | { 62 | elem_t arg1 = *(const elem_t*)a; 63 | elem_t arg2 = *(const elem_t*)b; 64 | return (arg1 > arg2) - (arg1 < arg2); 65 | } 66 | 67 | 68 | task *execute() { 69 | if (m_right - m_left <= cutoff) { 70 | qsort(data + m_left, m_right - m_left, sizeof(elem_t), qsort_cmp); 71 | return nullptr; 72 | } 73 | 74 | if (m_right - m_left <= 1) 75 | return nullptr; 76 | 77 | size_t mid = (m_left + m_right) / 2; 78 | size_t l = m_left; 79 | size_t r = mid; 80 | 81 | SortTask &a = *new(allocate_child()) SortTask(m_left, mid); 82 | SortTask &b = *new(allocate_child()) SortTask(mid, m_right); 83 | 84 | set_ref_count(3); // two children plus one for the wait_for_all() below 85 | 86 | spawn(a); 87 | spawn(b); 88 | 89 | wait_for_all(); 90 | 91 | for (size_t i = m_left; i < m_right; i++) { 92 | if ((l < mid && r < m_right && data[l] < data[r]) || r == m_right) { 93 | data_tmp[i] = data[l]; 94 | l++; 95 | } else if ((l < mid && r < m_right) || l == mid) { 96 | data_tmp[i] = data[r]; 97 | r++; 98 | } 99 | } 100 | 101 | memcpy(data + m_left, data_tmp + m_left, (m_right - m_left) * sizeof(elem_t)); 102 | 103 | return nullptr; 104 | } 105 | 106 | private: 107 | size_t m_left; 108 | size_t m_right; 109 | }; 110 | 111 | int main(int argc, char *argv[]) 112 | { 113 | size_t n = 8e7; 114 | size_t nthreads = 0; 115 | 116 | if (argc
>= 2) 117 | nthreads = atoi(argv[1]); 118 | if (argc >= 3) 119 | n = atoi(argv[2]); 120 | if (nthreads == 0) 121 | nthreads = thread::hardware_concurrency(); 122 | 123 | generate_data(n); 124 | 125 | auto start = system_clock::now(); 126 | 127 | task_scheduler_init scheduler(nthreads); 128 | 129 | auto root = new(task::allocate_root()) SortTask(0, n); 130 | 131 | task::spawn_root_and_wait(*root); 132 | 133 | scheduler.terminate(); 134 | 135 | auto stop = system_clock::now(); 136 | 137 | cout << "Scheduler: tbb\n"; 138 | cout << "Benchmark: mergesort\n"; 139 | cout << "Threads: " << nthreads << "\n"; 140 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 141 | cout << "Input: " << n << "\n"; 142 | cout << "Output: " << check() << "\n"; 143 | 144 | return 0; 145 | } 146 | -------------------------------------------------------------------------------- /benchmarks/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | runs=6 4 | threads=({1..56}) 5 | 6 | args_fib="42" 7 | args_dfs="9 9" 8 | args_mergesort="50000000" 9 | args_matmul="3000" 10 | args_blkmul="8" 11 | 12 | runs=2 13 | threads=(3 4) 14 | 15 | args_fib="42" 16 | args_dfs="9 9" 17 | args_mergesort="100000" 18 | args_matmul="800" 19 | args_blkmul="6" 20 | 21 | benchmarks=( 22 | "staccato fib _threads_ $args_fib" 23 | # "staccato dfs _threads_ $args_dfs" 24 | # "staccato mergesort _threads_ $args_mergesort" 25 | # "staccato matmul _threads_ $args_matmul" 26 | # "staccato blkmul _threads_ $args_blkmul" 27 | # "cilk fib _threads_ $args_fib" 28 | # "cilk dfs _threads_ $args_dfs" 29 | # "cilk mergesort _threads_ $args_mergesort" 30 | # "cilk matmul _threads_ $args_matmul" 31 | # "cilk blkmul _threads_ $args_blkmul" 32 | # "tbb fib _threads_ $args_fib" 33 | # "tbb dfs _threads_ $args_dfs" 34 | # "tbb mergesort _threads_ $args_mergesort" 35 | # "tbb matmul _threads_ $args_matmul" 36 | # "tbb blkmul _threads_ $args_blkmul" 37 | ) 38 | 39 | # export CXXFLAGS=-I\ ~/.local/include/\ -DSTACCATO_DEBUG=1 40 | export CXXFLAGS=-I\ ~/.local/include/ 41 | 42 | function get_integer() { 43 | echo "$1" | grep "$2" | grep -o "[0-9].*" 44 | } 45 | 46 | function get_string() { 47 | echo "$1" | grep "$2" | cut -f2 -d':' 48 | } 49 | 50 | function print_header() { 51 | echo sched name threads time input output 52 | } 53 | 54 | function show_results() { 55 | output=$(cat /dev/stdin) 56 | 57 | sched=$(get_string "$output" "Scheduler") 58 | name=$(get_string "$output" "Benchmark") 59 | threads=$(get_integer "$output" "Threads") 60 | time=$(get_integer "$output" "Time(us)") 61 | input=$(get_integer "$output" "Input") 62 | output=$(get_integer "$output" "Output") 63 | 64 | echo $sched $name $threads $time \"$input\" \"$output\" 65 | } 66 | 67 | function clean() { 68 | dir=$1/$2/build 69 | 70 | if [ -d $dir ]; then 71 | rm -rf $dir 72 | fi 73 | } 74 | 75 | function build() { 76 | dir=$1/$2 77 | 78 | pushd . 79 | 80 | cd $dir 81 | 82 | if [ -d build ]; then 83 | cd build 84 | else 85 | mkdir build 86 | cd build 87 | fi 88 | 89 | cmake $cmake_args .. 90 | make 91 | 92 | popd 93 | } 94 | 95 | function bench() { 96 | runs=$1 97 | dir=$2/$3/build/ 98 | bin=${3}-${2} 99 | shift; shift; shift 100 | args=$@ 101 | 102 | pushd . 
>/dev/null 103 | 104 | cd $dir 105 | 106 | for (( i = 0; i < $runs; i++ )); do 107 | ./$bin $args | show_results 108 | done 109 | 110 | popd >/dev/null 111 | } 112 | 113 | function clean_all() { 114 | for b in "${benchmarks[@]}" ; do 115 | clean $b 116 | done 117 | } 118 | 119 | function build_all() { 120 | for b in "${benchmarks[@]}" ; do 121 | build $b 122 | done 123 | } 124 | 125 | function bench_all() { 126 | print_header 127 | for id in "${!benchmarks[@]}" ; do 128 | benchmark="${benchmarks[$id]}" 129 | 130 | for t in ${threads[@]} ; do 131 | b=${benchmark/_threads_/$t} 132 | bench $runs $b 133 | done 134 | done 135 | } 136 | 137 | function install_staccato() { 138 | pushd . 139 | cd .. 140 | 141 | rm -rf build || true 142 | mkdir build 143 | cd build 144 | cmake -DCMAKE_INSTALL_PREFIX=~/.local/ .. 145 | make 146 | make install 147 | 148 | popd 149 | } 150 | 151 | mode=$1 152 | if [[ z$mode == "zupdate" ]]; then 153 | install_staccato 154 | clean_all 155 | build_all 156 | elif [[ z$mode == "zrebuild" ]]; then 157 | clean_all 158 | build_all 159 | elif [[ z$mode == "zbuild" ]]; then 160 | build_all 161 | elif [[ z$mode == "zclean" ]]; then 162 | clean_all 163 | else 164 | bench_all 165 | fi 166 | 167 | -------------------------------------------------------------------------------- /include/task_deque.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TASK_DEQUE_HPP_1ZDWEADG 2 | #define TASK_DEQUE_HPP_1ZDWEADG 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "utils.hpp" 9 | #include "debug.hpp" 10 | #include "lifo_allocator.hpp" 11 | 12 | namespace staccato 13 | { 14 | 15 | namespace internal 16 | { 17 | 18 | template 19 | class task_deque 20 | { 21 | public: 22 | task_deque(size_t size, T *mem); 23 | ~task_deque(); 24 | 25 | void set_prev(task_deque *d); 26 | void set_next(task_deque *d); 27 | void set_victim(task_deque *d); 28 | 29 | task_deque *get_next(); 30 | 31 | void return_stolen(); 32 | 33 | T *put_allocate(); 34 | void put_commit(); 35 | 36 | T *take(size_t *); 37 | T *steal(bool *was_empty); 38 | 39 | private: 40 | const size_t m_mask; 41 | 42 | // TODO: make this array a part of this class 43 | T * m_array; 44 | 45 | task_deque *m_next; 46 | 47 | STACCATO_ALIGN std::atomic_size_t m_nstolen; 48 | STACCATO_ALIGN std::atomic_size_t m_top; 49 | STACCATO_ALIGN std::atomic_size_t m_bottom; 50 | }; 51 | 52 | template 53 | task_deque::task_deque(size_t size, T *mem) 54 | : m_mask(size - 1) 55 | , m_array(mem) 56 | , m_next(nullptr) 57 | , m_nstolen(0) 58 | , m_top(1) 59 | , m_bottom(1) 60 | { 61 | STACCATO_ASSERT(is_pow2(size), "Deque size is not power of 2"); 62 | } 63 | 64 | template 65 | task_deque::~task_deque() 66 | { } 67 | 68 | template 69 | void task_deque::set_next(task_deque *d) 70 | { 71 | m_next = d; 72 | } 73 | 74 | template 75 | task_deque *task_deque::get_next() 76 | { 77 | return m_next; 78 | } 79 | 80 | template 81 | T *task_deque::put_allocate() 82 | { 83 | auto b = load_relaxed(m_bottom); 84 | return &m_array[b & m_mask]; 85 | } 86 | 87 | template 88 | void task_deque::put_commit() 89 | { 90 | auto b = load_relaxed(m_bottom); 91 | atomic_fence_release(); 92 | store_relaxed(m_bottom, b + 1); 93 | } 94 | 95 | template 96 | T *task_deque::take(size_t *nstolen) 97 | { 98 | auto b = dec_relaxed(m_bottom) - 1; 99 | auto t = load_relaxed(m_top); 100 | auto n = load_relaxed(m_nstolen); 101 | 102 | // Check whether the deque was empty 103 | if (t > b) { 104 | // Restoring to empty state 105 | 
store_relaxed(m_bottom, b + 1); 106 | *nstolen = n; 107 | return nullptr; 108 | } 109 | 110 | // Check if the task can be stolen 111 | if (t == b) { 112 | // Check if it's not stolen 113 | if (!cas_strong(m_top, t, t + 1)) { 114 | // It was stolen, restoring to previous state 115 | m_bottom = b + 1; 116 | *nstolen = n + 1; 117 | return nullptr; 118 | } 119 | 120 | // Wasn't stolen, but we icnremented top index 121 | m_bottom = b + 1; 122 | return &m_array[b & m_mask]; 123 | } 124 | 125 | // The task can't be stolen, no need for CAS 126 | return &m_array[b & m_mask]; 127 | } 128 | 129 | template 130 | T *task_deque::steal(bool *was_empty) 131 | { 132 | auto t = load_acquire(m_top); 133 | atomic_fence_seq_cst(); 134 | auto b = load_acquire(m_bottom); 135 | 136 | // Check if deque was empty 137 | if (t >= b) { 138 | *was_empty = true; 139 | return nullptr; 140 | } 141 | 142 | auto r = &m_array[t & m_mask]; 143 | 144 | inc_relaxed(m_nstolen); 145 | 146 | // Check if loaded task is not stolen 147 | if (!cas_weak(m_top, t, t + 1)) { 148 | dec_relaxed(m_nstolen); 149 | return nullptr; 150 | } 151 | 152 | return r; 153 | } 154 | 155 | template 156 | void task_deque::return_stolen() 157 | { 158 | STACCATO_ASSERT(m_nstolen > 0, "Decrementing stolen count when there are no stolen tasks"); 159 | dec_relaxed(m_nstolen); 160 | } 161 | 162 | } // namespace internal 163 | } // namespace stacccato 164 | 165 | #endif /* end of include guard: TASK_DEQUE_HPP_1ZDWEADG */ 166 | -------------------------------------------------------------------------------- /include/lifo_allocator.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LIFO_ALLOCATOR_HPP_PXMFXMQN 2 | #define LIFO_ALLOCATOR_HPP_PXMFXMQN 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "utils.hpp" 10 | 11 | namespace staccato 12 | { 13 | namespace internal 14 | { 15 | 16 | class lifo_allocator 17 | { 18 | public: 19 | lifo_allocator(size_t page_size); 20 | 21 | virtual ~lifo_allocator(); 22 | 23 | template 24 | T *alloc(); 25 | 26 | template 27 | T *alloc_array(size_t lenght); 28 | 29 | static inline size_t round_align(size_t to, size_t x) { 30 | return (x + (to - 1)) & ~(to - 1); 31 | } 32 | 33 | private: 34 | class page { 35 | public: 36 | page(void *mem, size_t size); 37 | 38 | ~page(); 39 | 40 | void *alloc(size_t alignment, size_t size); 41 | 42 | void set_next(page *p); 43 | 44 | page *get_next() const; 45 | 46 | static page *allocate_page(size_t alignment, size_t size); 47 | 48 | private: 49 | page *m_next; 50 | size_t m_size_left; 51 | void *m_stack; 52 | void *m_base; 53 | }; 54 | 55 | void inc_tail(size_t required_size); 56 | 57 | void *alloc(size_t alignment, size_t size); 58 | 59 | static const size_t m_page_alignment = 4 * (1 << 10); 60 | 61 | const size_t m_page_size; 62 | 63 | page *m_head; 64 | page *m_tail; 65 | }; 66 | 67 | lifo_allocator::page::page(void *mem, size_t size) 68 | : m_next(nullptr) 69 | , m_size_left(size) 70 | , m_stack(mem) 71 | , m_base(reinterpret_cast(mem) + sizeof(page)) 72 | { } 73 | 74 | lifo_allocator::page::~page() 75 | { 76 | } 77 | 78 | void lifo_allocator::page::set_next(lifo_allocator::page *p) 79 | { 80 | m_next = p; 81 | } 82 | 83 | lifo_allocator::page *lifo_allocator::page::get_next() const 84 | { 85 | return m_next; 86 | } 87 | 88 | void *lifo_allocator::page::alloc(size_t alignment, size_t size) 89 | { 90 | auto p = std::align(alignment, size, m_base, m_size_left); 91 | 92 | if (!p) 93 | return nullptr; 94 | 95 | m_base = 
reinterpret_cast(m_base) + size; 96 | m_size_left -= size; 97 | 98 | return p; 99 | } 100 | 101 | lifo_allocator::page *lifo_allocator::page::allocate_page( 102 | size_t alignment, 103 | size_t size 104 | ) { 105 | auto sz = round_align(alignment, size); 106 | auto p = aligned_alloc(alignment, sz); 107 | 108 | new(p) page(p, sz - sizeof(page)); 109 | 110 | return reinterpret_cast(p); 111 | } 112 | 113 | lifo_allocator::lifo_allocator(size_t page_size) 114 | : m_page_size(page_size) 115 | { 116 | m_head = page::allocate_page(m_page_alignment, m_page_size); 117 | m_tail = m_head; 118 | } 119 | 120 | lifo_allocator::~lifo_allocator() 121 | { 122 | auto n = m_head; 123 | 124 | while (n) { 125 | auto p = n; 126 | n = n->get_next(); 127 | std::free(p); 128 | } 129 | } 130 | 131 | template 132 | T *lifo_allocator::alloc() 133 | { 134 | auto p = alloc(alignof(T), sizeof(T)); 135 | return reinterpret_cast(p); 136 | } 137 | 138 | template 139 | T *lifo_allocator::alloc_array(size_t lenght) 140 | { 141 | auto p = alloc(alignof(T), sizeof(T) * lenght); 142 | return reinterpret_cast(p); 143 | } 144 | 145 | void *lifo_allocator::alloc(size_t alignment, size_t size) 146 | { 147 | void *ptr = m_tail->alloc(alignment, size); 148 | if (ptr) 149 | return ptr; 150 | 151 | inc_tail(size); 152 | 153 | ptr = m_tail->alloc(alignment, size); 154 | if (ptr) 155 | return ptr; 156 | 157 | throw std::bad_alloc(); 158 | } 159 | 160 | void lifo_allocator::inc_tail(size_t required_size) 161 | { 162 | auto sz = m_page_size; 163 | if (required_size > sz) 164 | sz = required_size; 165 | 166 | auto p = page::allocate_page(m_page_alignment, sz); 167 | 168 | m_tail->set_next(p); 169 | m_tail = p; 170 | } 171 | 172 | } /* internal */ 173 | } /* staccato */ 174 | 175 | #endif /* end of include guard: LIFO_ALLOCATOR_HPP_PXMFXMQN */ 176 | -------------------------------------------------------------------------------- /benchmarks/sequential/matmul/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Rectangular matrix multiplication. 3 | * 4 | * Adapted from Cilk 5.4.3 example 5 | * 6 | * https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk; 7 | * See the paper ``Cache-Oblivious Algorithms'', by 8 | * Matteo Frigo, Charles E. Leiserson, Harald Prokop, and 9 | * Sridhar Ramachandran, FOCS 1999, for an explanation of 10 | * why this algorithm is good for caches. 
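`lifo_allocator` above is a bump-pointer arena: each page keeps a cursor (`m_base`) and a remaining-byte counter, `std::align` carves aligned blocks out of the current page, and a fresh page is chained in when the current one cannot satisfy a request. A stripped-down sketch of one such page (single fixed-size buffer, no chaining; names are illustrative, not the library's API):

```c++
#include <cstdlib>
#include <memory>
#include <new>

// Minimal bump-pointer arena in the spirit of lifo_allocator::page:
// std::align moves the cursor up to the requested alignment, then the
// cursor advances by the allocation size. Individual allocations are
// never freed; everything is released when the arena is destroyed.
class arena {
public:
    explicit arena(std::size_t size)
        : m_buf(std::malloc(size)), m_cur(m_buf), m_left(size)
    {
        if (!m_buf)
            throw std::bad_alloc();
    }
    ~arena() { std::free(m_buf); }

    void *alloc(std::size_t alignment, std::size_t size)
    {
        // std::align adjusts m_cur and m_left for the alignment padding
        if (!std::align(alignment, size, m_cur, m_left))
            throw std::bad_alloc();
        void *p = m_cur;
        m_cur = static_cast<char *>(m_cur) + size;
        m_left -= size;
        return p;
    }

private:
    void *m_buf;
    void *m_cur;
    std::size_t m_left;
};

int main()
{
    arena a(1 << 12);
    int *x = static_cast<int *>(a.alloc(alignof(int), sizeof(int)));
    *x = 42;
    return *x == 42 ? 0 : 1;
}
```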
11 | * 12 | */ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | using namespace std; 20 | using namespace chrono; 21 | 22 | typedef float elem_t; 23 | 24 | inline uint32_t xorshift_rand() { 25 | static uint32_t x = 2463534242; 26 | x ^= x >> 13; 27 | x ^= x << 17; 28 | x ^= x >> 5; 29 | return x; 30 | } 31 | 32 | void zero(elem_t *A, size_t n) 33 | { 34 | for (size_t i = 0; i < n; ++i) 35 | for (size_t j = 0; j < n; ++j) 36 | A[i * n + j] = 0.0; 37 | } 38 | 39 | void fill(elem_t *A, size_t n) 40 | { 41 | for (size_t i = 0; i < n; ++i) 42 | for (size_t j = 0; j < n; ++j) 43 | A[i * n + j] = xorshift_rand() % n; 44 | } 45 | 46 | double maxerror(elem_t *A, elem_t *B, size_t n) 47 | { 48 | size_t i, j; 49 | double error = 0.0; 50 | 51 | for (i = 0; i < n; i++) { 52 | for (j = 0; j < n; j++) { 53 | double diff = (A[i * n + j] - B[i * n + j]) / A[i * n + j]; 54 | if (diff < 0) 55 | diff = -diff; 56 | if (diff > error) 57 | error = diff; 58 | } 59 | } 60 | 61 | return error; 62 | } 63 | 64 | bool check(elem_t *A, elem_t *B, elem_t *C, size_t n) 65 | { 66 | elem_t tr_C = 0; 67 | elem_t tr_AB = 0; 68 | for (size_t i = 0; i < n; ++i) { 69 | for (size_t j = 0; j < n; ++j) 70 | tr_AB += A[i * n + j] * B[j * n + i]; 71 | tr_C += C[i * n + i]; 72 | } 73 | 74 | return fabs(tr_AB - tr_C) < 1e-3; 75 | } 76 | 77 | void seq_matmul( 78 | elem_t *A, 79 | elem_t *B, 80 | elem_t *C, 81 | size_t m, 82 | size_t n, 83 | size_t p, 84 | size_t ld, 85 | bool add 86 | ) { 87 | if ((m + n + p) <= 64) { 88 | if (add) { 89 | for (size_t i = 0; i < m; ++i) { 90 | for (size_t k = 0; k < p; ++k) { 91 | elem_t c = 0.0; 92 | for (size_t j = 0; j < n; ++j) 93 | c += A[i * ld + j] * B[j * ld + k]; 94 | C[i * ld + k] += c; 95 | } 96 | } 97 | } else { 98 | for (size_t i = 0; i < m; ++i) { 99 | for (size_t k = 0; k < p; ++k) { 100 | elem_t c = 0.0; 101 | for (size_t j = 0; j < n; ++j) 102 | c += A[i * ld + j] * B[j * ld + k]; 103 | C[i * ld + k] = c; 104 | } 105 | } 106 | } 107 | return; 108 | } 109 | 110 | if (m >= n && n >= p) { 111 | size_t m1 = m >> 1; 112 | seq_matmul(A, B, C, m1, n, p, ld, add); 113 | seq_matmul(A + m1 * ld, B, C + m1 * ld, m - m1, n, p, ld, add); 114 | } else if (n >= m && n >= p) { 115 | size_t n1 = n >> 1; 116 | seq_matmul(A, B, C, m, n1, p, ld, add); 117 | seq_matmul(A + n1, B + n1 * ld, C, m, n - n1, p, ld, true); 118 | } else { 119 | size_t p1 = p >> 1; 120 | seq_matmul(A, B, C, m, n, p1, ld, add); 121 | seq_matmul(A, B + p1, C + p1, m, n, p - p1, ld, add); 122 | } 123 | } 124 | 125 | int main(int argc, char *argv[]) { 126 | elem_t *A, *B, *C; 127 | size_t n = 3000; 128 | 129 | if (argc >= 3) 130 | n = atoi(argv[2]); 131 | 132 | A = new elem_t[n * n]; 133 | B = new elem_t[n * n]; 134 | C = new elem_t[n * n]; 135 | 136 | fill(A, n); 137 | fill(B, n); 138 | zero(C, n); 139 | 140 | auto start = system_clock::now(); 141 | 142 | seq_matmul(A, B, C, n, n, n, n, 0); 143 | 144 | auto stop = system_clock::now(); 145 | 146 | cout << "Scheduler: sequential\n"; 147 | cout << "Benchmark: matmul\n"; 148 | cout << "Threads: " << 0 << "\n"; 149 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 150 | cout << "Input: " << n << "\n"; 151 | cout << "Output: " << check(A, B, C, n) << "\n"; 152 | 153 | delete []C; 154 | delete []B; 155 | delete []A; 156 | return 0; 157 | } 158 | -------------------------------------------------------------------------------- /include/utils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef 
UTILS_HPP_CSPTFG9B 2 | #define UTILS_HPP_CSPTFG9B 3 | 4 | #include <atomic> 5 | #include <cstdint> 6 | 7 | #ifndef STACCATO_DEBUG 8 | # define STACCATO_DEBUG 0 9 | #endif // STACCATO_DEBUG 10 | 11 | #if !defined(LEVEL1_DCACHE_LINESIZE) || LEVEL1_DCACHE_LINESIZE == 0 12 | # define STACCATO_CACHE_SIZE 64 13 | #else 14 | # define STACCATO_CACHE_SIZE LEVEL1_DCACHE_LINESIZE 15 | #endif // LEVEL1_DCACHE_LINESIZE 16 | 17 | // XXX: not tested 18 | #if __cplusplus > 199711L 19 | # define STACCATO_TLS thread_local 20 | #elif __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__ 21 | # define STACCATO_TLS _Thread_local 22 | #elif defined _WIN32 && ( \ 23 | defined _MSC_VER || \ 24 | defined __ICL || \ 25 | defined __DMC__ || \ 26 | defined __BORLANDC__ ) 27 | # define STACCATO_TLS __declspec(thread) 28 | #elif defined __GNUC__ || \ 29 | defined __SUNPRO_C || \ 30 | defined __xlC__ 31 | # define STACCATO_TLS __thread 32 | #else 33 | # define STACCATO_TLS thread_local 34 | # warning "Cannot define thread_local" 35 | #endif 36 | 37 | // XXX: not tested 38 | #if __cplusplus > 199711L 39 | # define STACCATO_ALIGN alignas(STACCATO_CACHE_SIZE) 40 | #elif defined _WIN32 && ( \ 41 | defined _MSC_VER || \ 42 | defined __ICL || \ 43 | defined __DMC__ || \ 44 | defined __BORLANDC__ ) 45 | # define STACCATO_ALIGN __declspec(align(STACCATO_CACHE_SIZE)) 46 | #elif defined __GNUC__ || \ 47 | defined __SUNPRO_C || \ 48 | defined __xlC__ 49 | # define STACCATO_ALIGN __attribute__((aligned(STACCATO_CACHE_SIZE))) 50 | #else 51 | # define STACCATO_ALIGN alignas(STACCATO_CACHE_SIZE) 52 | # warning "Cannot define alignas()" 53 | #endif 54 | 55 | #if STACCATO_DEBUG 56 | # define STACCATO_ASSERT(condition, message) \ 57 | do { \ 58 | if (!(condition)) { \ 59 | std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \ 60 | << ":" << __LINE__ << "\n" << message << "\n" << std::endl; \ 61 | std::exit(EXIT_FAILURE); \ 62 | } \ 63 | } while (false) 64 | #else 65 | # define STACCATO_ASSERT(condition, message) do { } while (false) 66 | #endif 67 | 68 | // TODO: define inline functions 69 | #define load_relaxed(var) (var).load(std::memory_order_relaxed) 70 | #define load_acquire(var) (var).load(std::memory_order_acquire) 71 | #define load_consume(var) (var).load(std::memory_order_consume) 72 | 73 | #define store_relaxed(var, x) (var).store((x), std::memory_order_relaxed) 74 | #define store_release(var, x) (var).store((x), std::memory_order_release) 75 | 76 | #define atomic_fence_seq_cst() std::atomic_thread_fence(std::memory_order_seq_cst) 77 | #define atomic_fence_release() std::atomic_thread_fence(std::memory_order_release) 78 | 79 | #define cas_strong(var, expected, desired) (var).compare_exchange_strong((expected), (desired), std::memory_order_seq_cst, std::memory_order_relaxed) 80 | #define cas_weak(var, expected, desired) (var).compare_exchange_weak((expected), (desired), std::memory_order_seq_cst, std::memory_order_relaxed) 81 | 82 | #define dec_relaxed(var) (var).fetch_sub(1, std::memory_order_relaxed) 83 | #define inc_relaxed(var) (var).fetch_add(1, std::memory_order_relaxed) 84 | 85 | #define dec_relaxed_p(var) (var)->fetch_sub(1, std::memory_order_relaxed) 86 | #define inc_relaxed_p(var) (var)->fetch_add(1, std::memory_order_relaxed) 87 | 88 | namespace staccato 89 | { 90 | namespace internal 91 | { 92 | 93 | inline uint32_t xorshift_rand() { 94 | STACCATO_TLS static uint32_t x = 2463534242; 95 | 96 | x ^= x >> 13; 97 | x ^= x << 17; 98 | x ^= x >> 5; 99 | return x; 100 | } 101 | 102 | inline bool is_pow2(uint64_t x) { 103 | return x && !(x & 
(x - 1)); 104 | } 105 | 106 | inline uint64_t next_pow2(uint64_t x) 107 | { 108 | x--; 109 | x |= (x >> 1); 110 | x |= (x >> 2); 111 | x |= (x >> 4); 112 | x |= (x >> 8); 113 | x |= (x >> 16); 114 | x |= (x >> 32); 115 | return x + 1; 116 | } 117 | 118 | } /* internal */ 119 | } /* staccato */ 120 | 121 | #endif /* end of include guard: UTILS_HPP_CSPTFG9B */ 122 | -------------------------------------------------------------------------------- /include/scheduler.hpp: -------------------------------------------------------------------------------- 1 | #ifndef STACCATO_SCEDULER_H 2 | #define STACCATO_SCEDULER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "utils.hpp" 12 | #include "debug.hpp" 13 | 14 | #include "worker.hpp" 15 | #include "lifo_allocator.hpp" 16 | #include "counter.hpp" 17 | 18 | namespace staccato 19 | { 20 | 21 | template 22 | class task; 23 | 24 | template 25 | class scheduler 26 | { 27 | public: 28 | 29 | scheduler ( 30 | size_t taskgraph_degree, 31 | size_t nworkers = 0, 32 | size_t taskgraph_height = 1 33 | ); 34 | 35 | ~scheduler(); 36 | 37 | T *root(); 38 | void spawn(T *t); 39 | void wait(); 40 | 41 | private: 42 | struct worker_t { 43 | std::thread *thr; 44 | internal::lifo_allocator *alloc; 45 | internal::worker * wkr; 46 | std::atomic_bool ready; 47 | }; 48 | 49 | inline size_t predict_page_size() const; 50 | 51 | void create_workers(); 52 | void create_worker(size_t id); 53 | 54 | const size_t m_taskgraph_degree; 55 | const size_t m_taskgraph_height; 56 | 57 | size_t m_nworkers; 58 | worker_t *m_workers; 59 | internal::worker *m_master; 60 | }; 61 | 62 | template 63 | scheduler::scheduler( 64 | size_t taskgraph_degree, 65 | size_t nworkers, 66 | size_t taskgraph_height 67 | ) 68 | : m_taskgraph_degree(internal::next_pow2(taskgraph_degree)) 69 | , m_taskgraph_height(taskgraph_height) 70 | , m_nworkers(nworkers) 71 | { 72 | internal::Debug() << "Scheduler is working in debug mode"; 73 | 74 | if (m_nworkers == 0) 75 | m_nworkers = std::thread::hardware_concurrency(); 76 | 77 | create_workers(); 78 | } 79 | 80 | template 81 | void scheduler::create_workers() 82 | { 83 | using namespace internal; 84 | 85 | m_workers = new worker_t[m_nworkers]; 86 | for (size_t i = 0; i < m_nworkers; ++i) 87 | m_workers[i].ready = false; 88 | 89 | create_worker(0); 90 | m_workers[0].thr = nullptr; 91 | m_master = m_workers[0].wkr; 92 | 93 | for (size_t i = 1; i < m_nworkers; ++i) { 94 | m_workers[i].thr = new std::thread([=] { 95 | create_worker(i); 96 | m_workers[i].wkr->steal_loop(); 97 | }); 98 | } 99 | 100 | for (size_t i = 0; i < m_nworkers; ++i) 101 | while (!m_workers[i].ready) 102 | std::this_thread::yield(); 103 | 104 | for (size_t i = 0; i < m_nworkers; ++i) { 105 | for (size_t j = 0; j < m_nworkers; ++j) { 106 | if (i == j) 107 | continue; 108 | m_workers[i].wkr->cache_victim(m_workers[j].wkr); 109 | } 110 | } 111 | } 112 | 113 | template 114 | void scheduler::create_worker(size_t id) 115 | { 116 | using namespace internal; 117 | 118 | Debug() << "Init worker #" << id; 119 | 120 | auto alloc = new lifo_allocator(predict_page_size()); 121 | 122 | auto wkr = alloc->alloc>(); 123 | new(wkr) 124 | worker(id, alloc, m_nworkers, m_taskgraph_degree, m_taskgraph_height); 125 | 126 | m_workers[id].alloc = alloc; 127 | m_workers[id].wkr = wkr; 128 | m_workers[id].ready = true; 129 | } 130 | 131 | template 132 | inline size_t scheduler::predict_page_size() const 133 | { 134 | using namespace internal; 135 | 136 | size_t s = 0; 137 
| s += alignof(task_deque) + sizeof(task_deque); 138 | s += alignof(T) + sizeof(T) * m_taskgraph_degree; 139 | s *= m_taskgraph_height; 140 | return s; 141 | } 142 | 143 | template 144 | scheduler::~scheduler() 145 | { 146 | for (size_t i = 1; i < m_nworkers; ++i) { 147 | while (!m_workers[i].ready) 148 | std::this_thread::yield(); 149 | 150 | m_workers[i].wkr->stop(); 151 | } 152 | 153 | #if STACCATO_DEBUG 154 | internal::counter::print_header(); 155 | for (size_t i = 0; i < m_nworkers; ++i) 156 | m_workers[i].wkr->print_counters(); 157 | #endif 158 | 159 | for (size_t i = 1; i < m_nworkers; ++i) 160 | m_workers[i].thr->join(); 161 | 162 | for (size_t i = 0; i < m_nworkers; ++i) { 163 | delete m_workers[i].alloc; 164 | delete m_workers[i].thr; 165 | } 166 | 167 | delete []m_workers; 168 | } 169 | 170 | template 171 | T *scheduler::root() 172 | { 173 | return m_master->root_allocate(); 174 | } 175 | 176 | template 177 | void scheduler::spawn(T *) 178 | { 179 | m_master->root_commit(); 180 | } 181 | 182 | template 183 | void scheduler::wait() 184 | { 185 | m_master->root_wait(); 186 | } 187 | 188 | } /* namespace:staccato */ 189 | 190 | #endif /* end of include guard: STACCATO_SCEDULER_H */ 191 | 192 | -------------------------------------------------------------------------------- /benchmarks/cilk/matmul/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Rectangular matrix multiplication. 3 | * 4 | * Adapted from Cilk 5.4.3 example 5 | * 6 | * https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk; 7 | * See the paper ``Cache-Oblivious Algorithms'', by 8 | * Matteo Frigo, Charles E. Leiserson, Harald Prokop, and 9 | * Sridhar Ramachandran, FOCS 1999, for an explanation of 10 | * why this algorithm is good for caches. 
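The matmul ports that follow all inherit the same divide rule from the Cilk example: halve whichever of m, n, p is largest. The m- and p-splits produce disjoint blocks of C, so both halves keep the caller's `add` flag, while the inner-dimension n-split makes both halves target the same C block: with A split into column blocks [A1 A2] and B into row blocks B1 stacked over B2, A·B = A1·B1 + A2·B2, which is why the second recursive call always passes `add = true`. A tiny numeric check of that identity:

```c++
#include <cstdio>

// Illustration of the n-split used by the matmul benchmarks: a 1x2 row
// times a 2x1 column, split at n1 = 1, accumulates the second half.
int main()
{
    const double A[2] = {2.0, 3.0};
    const double B[2] = {5.0, 7.0};
    double C = A[0] * B[0]; // first half, add = false: C  = A1*B1
    C += A[1] * B[1];       // second half, add = true:  C += A2*B2
    std::printf("C = %g (expect 31)\n", C);
    return C == 31.0 ? 0 : 1;
}
```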
11 | * 12 | */ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | 22 | using namespace std; 23 | using namespace chrono; 24 | 25 | typedef float elem_t; 26 | 27 | inline uint32_t xorshift_rand() { 28 | static uint32_t x = 2463534242; 29 | x ^= x >> 13; 30 | x ^= x << 17; 31 | x ^= x >> 5; 32 | return x; 33 | } 34 | 35 | void zero(elem_t *A, size_t n) 36 | { 37 | for (size_t i = 0; i < n; ++i) 38 | for (size_t j = 0; j < n; ++j) 39 | A[i * n + j] = 0.0; 40 | } 41 | 42 | void fill(elem_t *A, size_t n) 43 | { 44 | for (size_t i = 0; i < n; ++i) 45 | for (size_t j = 0; j < n; ++j) 46 | A[i * n + j] = xorshift_rand() % n; 47 | } 48 | 49 | double maxerror(elem_t *A, elem_t *B, size_t n) 50 | { 51 | size_t i, j; 52 | double error = 0.0; 53 | 54 | for (i = 0; i < n; i++) { 55 | for (j = 0; j < n; j++) { 56 | double diff = (A[i * n + j] - B[i * n + j]) / A[i * n + j]; 57 | if (diff < 0) 58 | diff = -diff; 59 | if (diff > error) 60 | error = diff; 61 | } 62 | } 63 | 64 | return error; 65 | } 66 | 67 | bool check(elem_t *A, elem_t *B, elem_t *C, size_t n) 68 | { 69 | elem_t tr_C = 0; 70 | elem_t tr_AB = 0; 71 | for (size_t i = 0; i < n; ++i) { 72 | for (size_t j = 0; j < n; ++j) 73 | tr_AB += A[i * n + j] * B[j * n + i]; 74 | tr_C += C[i * n + i]; 75 | } 76 | 77 | return fabs(tr_AB - tr_C) < 1e-3; 78 | } 79 | 80 | 81 | void matmul( 82 | elem_t *A, 83 | elem_t *B, 84 | elem_t *C, 85 | size_t m, 86 | size_t n, 87 | size_t p, 88 | size_t ld, 89 | bool add 90 | ) { 91 | if ((m + n + p) <= 64) { 92 | if (add) { 93 | for (size_t i = 0; i < m; ++i) { 94 | for (size_t k = 0; k < p; ++k) { 95 | elem_t c = 0.0; 96 | for (size_t j = 0; j < n; ++j) 97 | c += A[i * ld + j] * B[j * ld + k]; 98 | C[i * ld + k] += c; 99 | } 100 | } 101 | } else { 102 | for (size_t i = 0; i < m; ++i) { 103 | for (size_t k = 0; k < p; ++k) { 104 | elem_t c = 0.0; 105 | for (size_t j = 0; j < n; ++j) 106 | c += A[i * ld + j] * B[j * ld + k]; 107 | C[i * ld + k] = c; 108 | } 109 | } 110 | } 111 | 112 | return; 113 | } 114 | 115 | if (m >= n && n >= p) { 116 | size_t m1 = m >> 1; 117 | cilk_spawn matmul(A, B, C, m1, n, p, ld, add); 118 | cilk_spawn matmul(A + m1 * ld, B, C + m1 * ld, m - m1, n, p, ld, add); 119 | } else if (n >= m && n >= p) { 120 | size_t n1 = n >> 1; 121 | cilk_spawn matmul(A, B, C, m, n1, p, ld, add); 122 | cilk_spawn matmul(A + n1, B + n1 * ld, C, m, n - n1, p, ld, true); 123 | } else { 124 | size_t p1 = p >> 1; 125 | cilk_spawn matmul(A, B, C, m, n, p1, ld, add); 126 | cilk_spawn matmul(A, B + p1, C + p1, m, n, p - p1, ld, add); 127 | } 128 | 129 | cilk_sync; 130 | } 131 | 132 | void test( 133 | elem_t *A, 134 | elem_t *B, 135 | elem_t *C, 136 | size_t n 137 | ) { 138 | cilk_spawn matmul(A, B, C, n, n, n, n, 0); 139 | cilk_sync; 140 | } 141 | 142 | int main(int argc, char *argv[]) { 143 | elem_t *A, *B, *C; 144 | size_t n = 3000; 145 | const char *nthreads = nullptr; 146 | 147 | if (argc >= 2) 148 | nthreads = argv[1]; 149 | if (argc >= 3) 150 | n = atoi(argv[2]); 151 | if (nthreads == 0) 152 | nthreads = to_string(thread::hardware_concurrency()).c_str(); 153 | 154 | A = new elem_t[n * n]; 155 | B = new elem_t[n * n]; 156 | C = new elem_t[n * n]; 157 | 158 | fill(A, n); 159 | fill(B, n); 160 | zero(C, n); 161 | 162 | __cilkrts_end_cilk(); 163 | 164 | auto start = system_clock::now(); 165 | 166 | if (__cilkrts_set_param("nworkers", nthreads) != 0) { 167 | cerr << "Failed to set worker count\n"; 168 | exit(EXIT_FAILURE); 169 | } 170 | 171 | __cilkrts_init(); 172 | 173 | 
test(A, B, C, n); 174 | 175 | __cilkrts_end_cilk(); 176 | 177 | auto stop = system_clock::now(); 178 | 179 | cout << "Scheduler: cilk\n"; 180 | cout << "Benchmark: matmul\n"; 181 | cout << "Threads: " << nthreads << "\n"; 182 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 183 | cout << "Input: " << n << "\n"; 184 | cout << "Output: " << check(A, B, C, n) << "\n"; 185 | 186 | delete []C; 187 | delete []B; 188 | delete []A; 189 | return 0; 190 | } 191 | -------------------------------------------------------------------------------- /benchmarks/openmp/matmul/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Rectangular matrix multiplication. 3 | * 4 | * Adapted from Cilk 5.4.3 example 5 | * 6 | * https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk; 7 | * See the paper ``Cache-Oblivious Algorithms'', by 8 | * Matteo Frigo, Charles E. Leiserson, Harald Prokop, and 9 | * Sridhar Ramachandran, FOCS 1999, for an explanation of 10 | * why this algorithm is good for caches. 11 | * 12 | */ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | using namespace std; 22 | using namespace chrono; 23 | 24 | typedef float elem_t; 25 | 26 | inline uint32_t xorshift_rand() { 27 | static uint32_t x = 2463534242; 28 | x ^= x >> 13; 29 | x ^= x << 17; 30 | x ^= x >> 5; 31 | return x; 32 | } 33 | 34 | void zero(elem_t *A, size_t n) 35 | { 36 | for (size_t i = 0; i < n; ++i) 37 | for (size_t j = 0; j < n; ++j) 38 | A[i * n + j] = 0.0; 39 | } 40 | 41 | void fill(elem_t *A, size_t n) 42 | { 43 | for (size_t i = 0; i < n; ++i) 44 | for (size_t j = 0; j < n; ++j) 45 | A[i * n + j] = xorshift_rand() % n; 46 | } 47 | 48 | double maxerror(elem_t *A, elem_t *B, size_t n) 49 | { 50 | size_t i, j; 51 | double error = 0.0; 52 | 53 | for (i = 0; i < n; i++) { 54 | for (j = 0; j < n; j++) { 55 | double diff = (A[i * n + j] - B[i * n + j]) / A[i * n + j]; 56 | if (diff < 0) 57 | diff = -diff; 58 | if (diff > error) 59 | error = diff; 60 | } 61 | } 62 | 63 | return error; 64 | } 65 | 66 | bool check(elem_t *A, elem_t *B, elem_t *C, size_t n) 67 | { 68 | elem_t tr_C = 0; 69 | elem_t tr_AB = 0; 70 | for (size_t i = 0; i < n; ++i) { 71 | for (size_t j = 0; j < n; ++j) 72 | tr_AB += A[i * n + j] * B[j * n + i]; 73 | tr_C += C[i * n + i]; 74 | } 75 | 76 | return fabs(tr_AB - tr_C) < 1e-3; 77 | } 78 | 79 | 80 | void matmul( 81 | elem_t *A, 82 | elem_t *B, 83 | elem_t *C, 84 | size_t m, 85 | size_t n, 86 | size_t p, 87 | size_t ld, 88 | bool add 89 | ) { 90 | if ((m + n + p) <= 64) { 91 | if (add) { 92 | for (size_t i = 0; i < m; ++i) { 93 | for (size_t k = 0; k < p; ++k) { 94 | elem_t c = 0.0; 95 | for (size_t j = 0; j < n; ++j) 96 | c += A[i * ld + j] * B[j * ld + k]; 97 | C[i * ld + k] += c; 98 | } 99 | } 100 | } else { 101 | for (size_t i = 0; i < m; ++i) { 102 | for (size_t k = 0; k < p; ++k) { 103 | elem_t c = 0.0; 104 | for (size_t j = 0; j < n; ++j) 105 | c += A[i * ld + j] * B[j * ld + k]; 106 | C[i * ld + k] = c; 107 | } 108 | } 109 | } 110 | 111 | return; 112 | } 113 | 114 | if (m >= n && n >= p) { 115 | size_t m1 = m >> 1; 116 | #pragma omp task shared(A, B, C) 117 | matmul(A, B, C, m1, n, p, ld, add); 118 | #pragma omp task shared(A, B, C) 119 | matmul(A + m1 * ld, B, C + m1 * ld, m - m1, n, p, ld, add); 120 | } else if (n >= m && n >= p) { 121 | size_t n1 = n >> 1; 122 | #pragma omp task shared(A, B, C) 123 | matmul(A, B, C, m, n1, p, ld, add); 124 | #pragma omp task shared(A, B, C) 125 | 
matmul(A + n1, B + n1 * ld, C, m, n - n1, p, ld, true); 126 | } else { 127 | size_t p1 = p >> 1; 128 | #pragma omp task shared(A, B, C) 129 | matmul(A, B, C, m, n, p1, ld, add); 130 | #pragma omp task shared(A, B, C) 131 | matmul(A, B + p1, C + p1, m, n, p - p1, ld, add); 132 | } 133 | 134 | #pragma omp taskwait 135 | } 136 | 137 | void test( 138 | elem_t *A, 139 | elem_t *B, 140 | elem_t *C, 141 | size_t n 142 | ) { 143 | #pragma omp task shared(A, B, C, n) 144 | matmul(A, B, C, n, n, n, n, 0); 145 | #pragma omp taskwait 146 | } 147 | 148 | int main(int argc, char *argv[]) 149 | { 150 | elem_t *A, *B, *C; 151 | size_t n = 3000; 152 | size_t nthreads = 0; 153 | 154 | if (argc >= 2) 155 | nthreads = atoi(argv[1]); 156 | if (argc >= 3) 157 | n = atoi(argv[2]); 158 | if (nthreads == 0) 159 | nthreads = thread::hardware_concurrency(); 160 | 161 | A = new elem_t[n * n]; 162 | B = new elem_t[n * n]; 163 | C = new elem_t[n * n]; 164 | 165 | fill(A, n); 166 | fill(B, n); 167 | zero(C, n); 168 | 169 | auto start = system_clock::now(); 170 | 171 | omp_set_dynamic(0); 172 | omp_set_num_threads(nthreads); 173 | 174 | #pragma omp parallel shared(A, B, C, n) 175 | #pragma omp single 176 | test(A, B, C, n); 177 | 178 | auto stop = system_clock::now(); 179 | 180 | cout << "Scheduler: openmp\n"; 181 | cout << "Benchmark: matmul\n"; 182 | cout << "Threads: " << nthreads << "\n"; 183 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 184 | cout << "Input: " << n << "\n"; 185 | cout << "Output: " << check(A, B, C, n) << "\n"; 186 | 187 | delete []C; 188 | delete []B; 189 | delete []A; 190 | return 0; 191 | } 192 | -------------------------------------------------------------------------------- /benchmarks/staccato/matmul/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Rectangular matrix multiplication. 3 | * 4 | * Adapted from Cilk 5.4.3 example 5 | * 6 | * https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk; 7 | * See the paper ``Cache-Oblivious Algorithms'', by 8 | * Matteo Frigo, Charles E. Leiserson, Harald Prokop, and 9 | * Sridhar Ramachandran, FOCS 1999, for an explanation of 10 | * why this algorithm is good for caches. 
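The `check()` shared by the matmul ports never computes a reference product: it compares the trace of the computed C with tr(A·B) = Σᵢ Σⱼ A[i][j]·B[j][i], which needs only O(n²) work because the trace touches just the diagonal of the product. It cannot certify every entry of C, but a wrong block will almost always perturb the diagonal sum. The same test in isolation:

```c++
#include <cmath>
#include <cstddef>

// O(n^2) plausibility check: if C = A*B (square, row-major), then
// tr(C) must equal tr(A*B) = sum_i sum_j A[i][j] * B[j][i].
bool trace_check(const float *A, const float *B, const float *C, std::size_t n)
{
    double tr_AB = 0.0, tr_C = 0.0;
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = 0; j < n; ++j)
            tr_AB += static_cast<double>(A[i * n + j]) * B[j * n + i];
        tr_C += C[i * n + i];
    }
    return std::fabs(tr_AB - tr_C) < 1e-3;
}

int main()
{
    const float A[4] = {1, 2, 3, 4}, B[4] = {5, 6, 7, 8};
    const float C[4] = {19, 22, 43, 50}; // A*B computed by hand
    return trace_check(A, B, C, 2) ? 0 : 1;
}
```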
11 | * 12 | */ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | 22 | using namespace std; 23 | using namespace chrono; 24 | using namespace staccato; 25 | 26 | typedef float elem_t; 27 | 28 | inline uint32_t xorshift_rand() { 29 | static uint32_t x = 2463534242; 30 | x ^= x >> 13; 31 | x ^= x << 17; 32 | x ^= x >> 5; 33 | return x; 34 | } 35 | 36 | void zero(elem_t *A, size_t n) 37 | { 38 | for (size_t i = 0; i < n; ++i) 39 | for (size_t j = 0; j < n; ++j) 40 | A[i * n + j] = 0.0; 41 | } 42 | 43 | void fill(elem_t *A, size_t n) 44 | { 45 | for (size_t i = 0; i < n; ++i) 46 | for (size_t j = 0; j < n; ++j) 47 | A[i * n + j] = xorshift_rand() % n; 48 | } 49 | 50 | double maxerror(elem_t *A, elem_t *B, size_t n) 51 | { 52 | size_t i, j; 53 | double error = 0.0; 54 | 55 | for (i = 0; i < n; i++) { 56 | for (j = 0; j < n; j++) { 57 | double diff = (A[i * n + j] - B[i * n + j]) / A[i * n + j]; 58 | if (diff < 0) 59 | diff = -diff; 60 | if (diff > error) 61 | error = diff; 62 | } 63 | } 64 | 65 | return error; 66 | } 67 | 68 | bool check(elem_t *A, elem_t *B, elem_t *C, size_t n) 69 | { 70 | elem_t tr_C = 0; 71 | elem_t tr_AB = 0; 72 | for (size_t i = 0; i < n; ++i) { 73 | for (size_t j = 0; j < n; ++j) 74 | tr_AB += A[i * n + j] * B[j * n + i]; 75 | tr_C += C[i * n + i]; 76 | } 77 | 78 | return fabs(tr_AB - tr_C) < 1e-3; 79 | } 80 | 81 | class MultTask: public task 82 | { 83 | public: 84 | MultTask( 85 | elem_t *A, 86 | elem_t *B, 87 | elem_t *C, 88 | size_t m, 89 | size_t n, 90 | size_t p, 91 | size_t ld, 92 | bool add) 93 | : A(A) 94 | , B(B) 95 | , C(C) 96 | , m(m) 97 | , n(n) 98 | , p(p) 99 | , ld(ld) 100 | , add(add) 101 | { }; 102 | 103 | void execute() { 104 | if ((m + n + p) <= 64) { 105 | if (add) { 106 | for (size_t i = 0; i < m; ++i) { 107 | for (size_t k = 0; k < p; ++k) { 108 | elem_t c = 0.0; 109 | for (size_t j = 0; j < n; ++j) 110 | c += A[i * ld + j] * B[j * ld + k]; 111 | C[i * ld + k] += c; 112 | } 113 | } 114 | } else { 115 | for (size_t i = 0; i < m; ++i) { 116 | for (size_t k = 0; k < p; ++k) { 117 | elem_t c = 0.0; 118 | for (size_t j = 0; j < n; ++j) 119 | c += A[i * ld + j] * B[j * ld + k]; 120 | C[i * ld + k] = c; 121 | } 122 | } 123 | } 124 | 125 | return; 126 | } 127 | 128 | if (m >= n && n >= p) { 129 | size_t m1 = m >> 1; 130 | spawn(new(child()) 131 | MultTask(A, B, C, m1, n, p, ld, add)); 132 | spawn(new(child()) 133 | MultTask(A + m1 * ld, B, C + m1 * ld, m - m1, n, p, ld, add)); 134 | } else if (n >= m && n >= p) { 135 | size_t n1 = n >> 1; 136 | spawn(new(child()) 137 | MultTask(A, B, C, m, n1, p, ld, add)); 138 | spawn(new(child()) 139 | MultTask(A + n1, B + n1 * ld, C, m, n - n1, p, ld, true)); 140 | } else { 141 | size_t p1 = p >> 1; 142 | spawn(new(child()) 143 | MultTask(A, B, C, m, n, p1, ld, add)); 144 | spawn(new(child()) 145 | MultTask(A, B + p1, C + p1, m, n, p - p1, ld, add)); 146 | } 147 | 148 | wait(); 149 | } 150 | 151 | private: 152 | elem_t *A; 153 | elem_t *B; 154 | elem_t *C; 155 | size_t m; 156 | size_t n; 157 | size_t p; 158 | size_t ld; 159 | bool add; 160 | }; 161 | 162 | int main(int argc, char *argv[]) { 163 | elem_t *A, *B, *C; 164 | size_t n = 3000; 165 | size_t nthreads = 0; 166 | 167 | if (argc >= 2) 168 | nthreads = atoi(argv[1]); 169 | if (argc >= 3) 170 | n = atoi(argv[2]); 171 | if (nthreads == 0) 172 | nthreads = thread::hardware_concurrency(); 173 | 174 | A = new elem_t[n * n]; 175 | B = new elem_t[n * n]; 176 | C = new elem_t[n * n]; 177 | 178 | fill(A, n); 179 | fill(B, n); 
180 | zero(C, n); 181 | 182 | auto start = system_clock::now(); 183 | 184 | { 185 | scheduler sh(2, nthreads); 186 | sh.spawn(new(sh.root()) MultTask(A, B, C, n, n, n, n, 0)); 187 | sh.wait(); 188 | } 189 | 190 | auto stop = system_clock::now(); 191 | 192 | cout << "Scheduler: staccato\n"; 193 | cout << "Benchmark: matmul\n"; 194 | cout << "Threads: " << nthreads << "\n"; 195 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 196 | cout << "Input: " << n << "\n"; 197 | cout << "Output: " << check(A, B, C, n) << "\n"; 198 | 199 | delete []C; 200 | delete []B; 201 | delete []A; 202 | return 0; 203 | } 204 | -------------------------------------------------------------------------------- /benchmarks/tbb/matmul/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Rectangular matrix multiplication. 3 | * 4 | * Adapted from Cilk 5.4.3 example 5 | * 6 | * https://bradley.csail.mit.edu/svn/repos/cilk/5.4.3/examples/matmul.cilk; 7 | * See the paper ``Cache-Oblivious Algorithms'', by 8 | * Matteo Frigo, Charles E. Leiserson, Harald Prokop, and 9 | * Sridhar Ramachandran, FOCS 1999, for an explanation of 10 | * why this algorithm is good for caches. 11 | * 12 | */ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | 22 | using namespace std; 23 | using namespace chrono; 24 | using namespace tbb; 25 | 26 | typedef float elem_t; 27 | 28 | inline uint32_t xorshift_rand() { 29 | static uint32_t x = 2463534242; 30 | x ^= x >> 13; 31 | x ^= x << 17; 32 | x ^= x >> 5; 33 | return x; 34 | } 35 | 36 | void zero(elem_t *A, size_t n) 37 | { 38 | for (size_t i = 0; i < n; ++i) 39 | for (size_t j = 0; j < n; ++j) 40 | A[i * n + j] = 0.0; 41 | } 42 | 43 | void fill(elem_t *A, size_t n) 44 | { 45 | for (size_t i = 0; i < n; ++i) 46 | for (size_t j = 0; j < n; ++j) 47 | A[i * n + j] = xorshift_rand() % n; 48 | } 49 | 50 | bool check(elem_t *A, elem_t *B, elem_t *C, size_t n) 51 | { 52 | elem_t tr_C = 0; 53 | elem_t tr_AB = 0; 54 | for (size_t i = 0; i < n; ++i) { 55 | for (size_t j = 0; j < n; ++j) 56 | tr_AB += A[i * n + j] * B[j * n + i]; 57 | tr_C += C[i * n + i]; 58 | } 59 | 60 | return fabs(tr_AB - tr_C) < 1e-3; 61 | } 62 | 63 | class MultTask: public task 64 | { 65 | public: 66 | MultTask( 67 | elem_t *A, 68 | elem_t *B, 69 | elem_t *C, 70 | size_t m, 71 | size_t n, 72 | size_t p, 73 | size_t ld, 74 | bool add) 75 | : A(A) 76 | , B(B) 77 | , C(C) 78 | , m(m) 79 | , n(n) 80 | , p(p) 81 | , ld(ld) 82 | , add(add) 83 | { }; 84 | 85 | task *execute() { 86 | if ((m + n + p) <= 64) { 87 | if (add) { 88 | for (size_t i = 0; i < m; ++i) { 89 | for (size_t k = 0; k < p; ++k) { 90 | elem_t c = 0.0; 91 | for (size_t j = 0; j < n; ++j) 92 | c += A[i * ld + j] * B[j * ld + k]; 93 | C[i * ld + k] += c; 94 | } 95 | } 96 | } else { 97 | for (size_t i = 0; i < m; ++i) { 98 | for (size_t k = 0; k < p; ++k) { 99 | elem_t c = 0.0; 100 | for (size_t j = 0; j < n; ++j) 101 | c += A[i * ld + j] * B[j * ld + k]; 102 | C[i * ld + k] = c; 103 | } 104 | } 105 | } 106 | 107 | return nullptr; 108 | } 109 | 110 | if (m >= n && n >= p) { 111 | size_t m1 = m >> 1; 112 | MultTask &a = *new(allocate_child()) 113 | MultTask(A, B, C, m1, n, p, ld, add); 114 | MultTask &b = *new(allocate_child()) 115 | MultTask(A + m1 * ld, B, C + m1 * ld, m - m1, n, p, ld, add); 116 | 117 | set_ref_count(3); 118 | 119 | spawn(a); 120 | spawn(b); 121 | } else if (n >= m && n >= p) { 122 | size_t n1 = n >> 1; 123 | MultTask &a = 
*new(allocate_child()) 124 | MultTask(A, B, C, m, n1, p, ld, add); 125 | MultTask &b = *new(allocate_child()) 126 | MultTask(A + n1, B + n1 * ld, C, m, n - n1, p, ld, true); 127 | 128 | set_ref_count(3); 129 | 130 | spawn(a); 131 | spawn(b); 132 | } else { 133 | size_t p1 = p >> 1; 134 | MultTask &a = *new(allocate_child()) 135 | MultTask(A, B, C, m, n, p1, ld, add); 136 | MultTask &b = *new(allocate_child()) 137 | MultTask(A, B + p1, C + p1, m, n, p - p1, ld, add); 138 | 139 | set_ref_count(3); 140 | 141 | spawn(a); 142 | spawn(b); 143 | } 144 | 145 | wait_for_all(); 146 | 147 | return nullptr; 148 | } 149 | 150 | private: 151 | elem_t *A; 152 | elem_t *B; 153 | elem_t *C; 154 | size_t m; 155 | size_t n; 156 | size_t p; 157 | size_t ld; 158 | bool add; 159 | }; 160 | 161 | int main(int argc, char *argv[]) { 162 | elem_t *A, *B, *C; 163 | size_t n = 3000; 164 | size_t nthreads = 0; 165 | 166 | if (argc >= 2) 167 | nthreads = atoi(argv[1]); 168 | if (argc >= 3) 169 | n = atoi(argv[2]); 170 | if (nthreads == 0) 171 | nthreads = thread::hardware_concurrency(); 172 | 173 | A = new elem_t[n * n]; 174 | B = new elem_t[n * n]; 175 | C = new elem_t[n * n]; 176 | 177 | fill(A, n); 178 | fill(B, n); 179 | zero(C, n); 180 | 181 | auto start = system_clock::now(); 182 | 183 | task_scheduler_init scheduler(nthreads); 184 | 185 | auto root = new(task::allocate_root()) MultTask(A, B, C, n, n, n, n, 0); 186 | 187 | task::spawn_root_and_wait(*root); 188 | 189 | scheduler.terminate(); 190 | 191 | auto stop = system_clock::now(); 192 | 193 | cout << "Scheduler: tbb\n"; 194 | cout << "Benchmark: matmul\n"; 195 | cout << "Threads: " << nthreads << "\n"; 196 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 197 | cout << "Input: " << n << "\n"; 198 | cout << "Output: " << check(A, B, C, n) << "\n"; 199 | 200 | delete []C; 201 | delete []B; 202 | delete []A; 203 | return 0; 204 | } 205 | -------------------------------------------------------------------------------- /include/worker.hpp: -------------------------------------------------------------------------------- 1 | #ifndef WORKER_H_MIRBQTTK 2 | #define WORKER_H_MIRBQTTK 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "utils.hpp" 9 | 10 | #include "task_deque.hpp" 11 | #include "lifo_allocator.hpp" 12 | #include "task.hpp" 13 | #include "counter.hpp" 14 | 15 | namespace staccato 16 | { 17 | namespace internal 18 | { 19 | 20 | template 21 | class worker 22 | { 23 | public: 24 | worker( 25 | size_t id, 26 | lifo_allocator *alloc, 27 | size_t nvictims, 28 | size_t taskgraph_degree, 29 | size_t taskgraph_height 30 | ); 31 | 32 | ~worker(); 33 | 34 | void cache_victim(worker *victim); 35 | 36 | void stop(); 37 | 38 | void local_loop(task_deque *tail); 39 | 40 | void steal_loop(); 41 | 42 | T *root_allocate(); 43 | void root_commit(); 44 | void root_wait(); 45 | 46 | #if STACCATO_DEBUG 47 | void print_counters(); 48 | #endif 49 | 50 | private: 51 | void init(size_t core_id, worker *victim); 52 | 53 | void grow_tail(task_deque *tail); 54 | 55 | task_deque *get_victim(); 56 | 57 | task *steal_task(task_deque *tail, task_deque **victim); 58 | 59 | const size_t m_id; 60 | const size_t m_taskgraph_degree; 61 | const size_t m_taskgraph_height; 62 | lifo_allocator *m_allocator; 63 | 64 | #if STACCATO_DEBUG 65 | counter m_counter; 66 | #endif 67 | 68 | std::atomic_bool m_stopped; 69 | 70 | std::atomic_size_t m_nvictims; 71 | 72 | task_deque **m_victims_heads; 73 | 74 | task_deque *m_head_deque; 75 | }; 76 | 77 | template 78 | 
worker::worker( 79 | size_t id, 80 | lifo_allocator *alloc, 81 | size_t nvictims, 82 | size_t taskgraph_degree, 83 | size_t taskgraph_height 84 | ) 85 | : m_id(id) 86 | , m_taskgraph_degree(taskgraph_degree) 87 | , m_taskgraph_height(taskgraph_height) 88 | , m_allocator(alloc) 89 | , m_stopped(false) 90 | , m_nvictims(0) 91 | , m_victims_heads(nullptr) 92 | , m_head_deque(nullptr) 93 | { 94 | m_victims_heads = m_allocator->alloc_array *>(nvictims); 95 | 96 | auto d = m_allocator->alloc>(); 97 | auto t = m_allocator->alloc_array(m_taskgraph_degree); 98 | new(d) task_deque(m_taskgraph_degree, t); 99 | 100 | m_head_deque = d; 101 | 102 | for (size_t i = 1; i < m_taskgraph_height + 1; ++i) { 103 | auto n = m_allocator->alloc>(); 104 | auto t = m_allocator->alloc_array(m_taskgraph_degree); 105 | new(n) task_deque(m_taskgraph_degree, t); 106 | 107 | d->set_next(n); 108 | d = n; 109 | } 110 | } 111 | 112 | template 113 | worker::~worker() 114 | { 115 | delete m_allocator; 116 | } 117 | 118 | template 119 | void worker::cache_victim(worker *victim) 120 | { 121 | m_victims_heads[m_nvictims] = victim->m_head_deque; 122 | m_nvictims++; 123 | } 124 | 125 | template 126 | void worker::stop() 127 | { 128 | m_stopped = true; 129 | } 130 | 131 | template 132 | T *worker::root_allocate() 133 | { 134 | return m_head_deque->put_allocate(); 135 | } 136 | 137 | template 138 | void worker::root_commit() 139 | { 140 | m_head_deque->put_commit(); 141 | } 142 | 143 | template 144 | void worker::root_wait() 145 | { 146 | local_loop(m_head_deque); 147 | } 148 | 149 | template 150 | void worker::grow_tail(task_deque *tail) 151 | { 152 | if (tail->get_next()) 153 | return; 154 | 155 | auto d = m_allocator->alloc>(); 156 | auto t = m_allocator->alloc_array(m_taskgraph_degree); 157 | new(d) task_deque(m_taskgraph_degree, t); 158 | 159 | tail->set_next(d); 160 | 161 | // if (!m_victim_tail) 162 | // return; 163 | // 164 | // auto v = m_victim_tail->get_next(); 165 | // if (!v) 166 | // return; 167 | // 168 | // d->set_victim(v); 169 | // m_victim_tail = v; 170 | } 171 | 172 | template 173 | task_deque *worker::get_victim() 174 | { 175 | auto i = xorshift_rand() % m_nvictims; 176 | return m_victims_heads[i]; 177 | } 178 | 179 | template 180 | void worker::steal_loop() 181 | { 182 | while (m_nvictims == 0) 183 | std::this_thread::yield(); 184 | 185 | auto vhead = get_victim(); 186 | auto vtail = vhead; 187 | size_t now_stolen = 0; 188 | 189 | while (!load_relaxed(m_stopped)) { 190 | if (now_stolen >= m_taskgraph_degree - 1) { 191 | if (vtail->get_next()) { 192 | vtail = vtail->get_next(); 193 | now_stolen = 0; 194 | } 195 | } 196 | 197 | bool was_empty = false; 198 | auto t = vtail->steal(&was_empty); 199 | 200 | #if STACCATO_DEBUG 201 | if (t) 202 | COUNT(steal); 203 | else if (was_empty) 204 | COUNT(steal_empty); 205 | else 206 | COUNT(steal_race); 207 | #endif 208 | 209 | if (t) { 210 | t->process(this, m_head_deque); 211 | vtail->return_stolen(); 212 | 213 | vtail = get_victim(); 214 | now_stolen = 0; 215 | 216 | continue; 217 | } 218 | 219 | if (!was_empty) { 220 | now_stolen++; 221 | continue; 222 | } 223 | 224 | if (vtail->get_next()) 225 | vtail = vtail->get_next(); 226 | else 227 | vtail = get_victim(); 228 | 229 | now_stolen = 0; 230 | } 231 | } 232 | 233 | template 234 | void worker::local_loop(task_deque *tail) 235 | { 236 | task *t = nullptr; 237 | task_deque *victim = nullptr; 238 | 239 | while (true) { // Local tasks loop 240 | if (t) { 241 | grow_tail(tail); 242 | 243 | t->process(this, tail->get_next()); 
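The branch that follows hinges on `take()`'s `nstolen` out-parameter: a null result with `nstolen == 0` means this deque level is truly drained and `local_loop` may return (completing the parent's `wait()`), while a null result with thieves still holding tasks from this level means the worker must not return yet, so it helps by stealing elsewhere. The decision, restated as a sketch (not the actual interface):

```c++
#include <cstddef>

// Sketch of the choice local_loop makes after each take(): a deque level
// may only be abandoned when it is empty AND no task taken from it is
// still being executed by a thief; otherwise wait() would return early.
enum class wait_step { run_local, help_steal, done };

wait_step next_step(bool took_task, std::size_t nstolen)
{
    if (took_task)
        return wait_step::run_local; // execute a task from the own deque
    if (nstolen == 0)
        return wait_step::done;      // drained, and no thief holds a task
    return wait_step::help_steal;    // stolen tasks in flight: go steal
}

int main()
{
    return next_step(false, 2) == wait_step::help_steal ? 0 : 1;
}
```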
244 | 245 | if (victim) { 246 | victim->return_stolen(); 247 | victim = nullptr; 248 | } 249 | } 250 | 251 | size_t nstolen = 0; 252 | 253 | t = tail->take(&nstolen); 254 | 255 | #if STACCATO_DEBUG 256 | if (t) 257 | COUNT(take); 258 | else if (nstolen == 0) 259 | COUNT(take_empty); 260 | else 261 | COUNT(take_stolen); 262 | #endif 263 | 264 | if (t) 265 | continue; 266 | 267 | if (nstolen == 0) 268 | return; 269 | 270 | t = steal_task(tail, &victim); 271 | 272 | if (!t) 273 | std::this_thread::yield(); 274 | } 275 | } 276 | 277 | template <typename T> 278 | task<T> *worker<T>::steal_task(task_deque<T> *, task_deque<T> **victim) 279 | { 280 | if (load_relaxed(m_nvictims) == 0) 281 | return nullptr; 282 | 283 | auto vhead = get_victim(); 284 | auto vtail = vhead; 285 | size_t now_stolen = 0; 286 | 287 | while (true) { 288 | if (now_stolen >= m_taskgraph_degree - 1) { 289 | if (vtail->get_next()) { 290 | vtail = vtail->get_next(); 291 | now_stolen = 0; 292 | } 293 | } 294 | 295 | bool was_empty = false; 296 | auto t = vtail->steal(&was_empty); 297 | 298 | #if STACCATO_DEBUG 299 | if (t) 300 | COUNT(steal2); 301 | else if (was_empty) 302 | COUNT(steal2_empty); 303 | else 304 | COUNT(steal2_race); 305 | #endif 306 | 307 | if (t) { 308 | *victim = vtail; 309 | return t; 310 | } 311 | 312 | if (!was_empty) { 313 | now_stolen++; 314 | continue; 315 | } 316 | 317 | if (vtail->get_next()) 318 | vtail = vtail->get_next(); 319 | else 320 | return nullptr; 321 | 322 | now_stolen = 0; 323 | } 324 | } 325 | 326 | #if STACCATO_DEBUG 327 | 328 | template <typename T> 329 | void worker<T>::print_counters() 330 | { 331 | m_counter.print(m_id); 332 | } 333 | #endif 334 | 335 | } // namespace internal 336 | } // namespace staccato 337 | 338 | #endif /* end of include guard: WORKER_H_MIRBQTTK */ 339 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Work-Stealing Task Scheduler 2 | 3 | This is my C++11 implementation of a concurrent work-stealing task scheduler. It provides only the basic scheduling primitives (`spawn` and `wait`) and supports weak memory models. 4 | 5 | ## How does work-stealing work? 6 | A task consists of a set of instructions that are executed sequentially. During its execution a task can create subtasks that may run in parallel, so the whole computation can be described by a directed acyclic graph. 7 | 8 | The scheduler owns a set of execution threads, each with a private queue of tasks. When a new subtask is created (by a `spawn` call), it is placed in the thread's private queue. When a task has to wait for its subtasks to finish (i.e. `wait` is called), the thread executes tasks from its own queue; if its own queue is empty, it steals tasks from the queues of other threads. 9 | 10 | ## What's special about this implementation? 11 | 12 | Internal data structures of most work-stealing schedulers are designed under the assumption that they store pointers to task objects. Despite its low overhead, this approach has the following problems: 13 | 14 | 1. For each task the memory allocator is accessed twice: once to allocate the task object and once to free it when the task finishes. This adds allocator-related overhead. 15 | 2. There is no guarantee that task objects end up in adjacent memory regions, so task-graph execution suffers cache-miss overhead. 16 | 17 | In my implementation these problems are addressed by: 18 | 19 | 1. 

1. Designing a deque-based data structure that stores the task objects themselves for the duration of their execution. When a task completes, its memory is reused by the tasks that follow, which eliminates the need to access the memory manager. Besides, this implementation has lower lock contention than traditional work-stealing deques.
2. Using a dedicated memory manager. Since a deque's owner executes tasks in LIFO order, its memory accesses follow the same pattern. This allows using a LIFO-based memory manager that does not have to provide fragmentation handling or thread safety, so it has the lowest overhead possible and stores objects in consecutive memory (a minimal sketch of this idea appears at the end of this README).

A drawback of this approach is that you have to specify the maximum number of subtasks each task can have. For example, for the classical task of calculating Fibonacci numbers it's equal to 2. For the majority of tasks this is not a problem, as the number is usually known at development time, but I plan to bypass this limitation in a future version.

## How does it compare to other schedulers?

For benchmarking I've used the following tests:

|Name|Dimensions|Type|Description|
|--|--|--|--|
|fib| 42 | CPU-bound | Fibonacci number calculation by the recurrence formula |
|dfs| 9^9 | CPU-bound | Depth-first search of a balanced tree |
|matmul| 3500x3500 | Memory-bound | Cache-aware square matrix multiplication |
|blkmul| 4096x4096 | Memory-bound | Square block matrix multiplication |

The *fib* and *dfs* benchmarks have relatively small tasks, so the majority of CPU time is spent executing scheduler code. By contrast, the *blkmul* and *matmul* benchmarks require more time to execute their tasks than the scheduler code. This can be seen by comparing the execution times of the sequential versions with the parallel ones: *fib* and *dfs* are far more efficient without a scheduler. This, in turn, means that *fib* and *dfs* are better suited for comparing the overheads of different schedulers.

For benchmarking I've used a server with two Intel Xeon E5 v4 processors. Each processor has 32 KB L1i and L1d caches, 256 KB L2 and a 35 MB L3 cache, with 14 cores and 2 threads per core (56 hardware threads in total). Each scheduler was compiled with the same compiler (g++ 7.2.1) and the same optimization level (-O3) under Linux 4.13 (Fedora 27).

The comparison with Intel TBB (v4.3), Intel Cilk Plus (7.3) and the sequential version shows the following results:

[figure: llc_misses]

[figure: fib_benchmark]

[figure: dfs_benchmark]

[figure: matmul_benchmark]

[figure: blkmul_benchmark]
As this implementation attempts to reduce the overhead of internal data structures, the difference is most noticeable in CPU-bound tasks, while memory-bound tasks are left almost unaffected.

## Usage

It's a header-only library, so there is no need to compile or link it. C++11 with thread support is the only requirement.

Include the `staccato/scheduler.hpp` file to use the scheduler in your program.

### Define a task class

Define a class derived from `staccato::task` for your task with a `void execute()` method. This method will be executed by the scheduler.

You can spawn subtasks during the execution of `execute()`. For that:
1. Create a new object of your task class in the memory returned by `child()`
2. Call `void spawn(task *t)` to place the new task in the thread's queue
3. Call `void wait()` to wait for all created subtasks to finish

### Create scheduler object

Create a `scheduler` object with the specified maximum number of subtasks per task and the number of threads:

```c++
scheduler<FibTask> sh(2, nthreads);
```

The specified number of execution threads (`nthreads`) will be created. These threads are removed when the destructor is called.

### Submit root task for execution

Create an object for the root task in the memory returned by `sh.root()` and submit it for execution with `sh.spawn()`. To wait until the task is finished, call `sh.wait()`:

```c++
sh.spawn(new(sh.root()) FibTask(n, &answer));
sh.wait();
```

## Example

```c++
#include <iostream>
#include <thread>

#include <staccato/scheduler.hpp>

using namespace std;
using namespace staccato;

class FibTask: public task<FibTask>
{
public:
	FibTask (int n_, long *sum_): n(n_), sum(sum_)
	{ }

	void execute() {
		if (n <= 2) {
			*sum = 1;
			return;
		}

		long x;
		spawn(new(child()) FibTask(n - 1, &x));
		long y;
		spawn(new(child()) FibTask(n - 2, &y));

		wait();

		*sum = x + y;

		return;
	}

private:
	int n;
	long *sum;
};

int main(int argc, char *argv[])
{
	size_t n = 20;
	long answer;
	size_t nthreads = 0;

	if (argc >= 2)
		nthreads = atoi(argv[1]);
	if (argc >= 3)
		n = atoi(argv[2]);
	if (nthreads == 0)
		nthreads = thread::hardware_concurrency();

	{
		scheduler<FibTask> sh(2, nthreads);
		sh.spawn(new(sh.root()) FibTask(n, &answer));
		sh.wait();
	}

	cout << "fib(" << n << ") = " << answer << "\n";

	return 0;
}
```

This program calculates the n-th Fibonacci number (the 20th by default) using the recurrence formula. The root task computes its number by spawning two subtasks for the (n-1)-th and (n-2)-th numbers. They, in turn, spawn two subtasks each, and so on down to the first numbers. When the subtasks finish, their sum is calculated and stored in the parent task's memory.

For more examples, check the `benchmarks/staccato/` directory.
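
## Appendix: why LIFO allocation is cheap

To make the LIFO memory-manager idea from the "What's special" section concrete, here is a minimal sketch of a stack-like arena allocator. This is **not** the library's `lifo_allocator` (see `include/lifo_allocator.hpp` for the real interface); the `lifo_arena` class and its method names below are invented for the illustration.

```c++
#include <cstddef>
#include <cstdlib>
#include <new>

// Toy LIFO arena: allocation bumps a pointer, deallocation rewinds it.
// Because the owner frees objects in reverse allocation order, no free
// lists or fragmentation handling are needed, and consecutive
// allocations land in consecutive memory (alignment is ignored here
// for brevity).
class lifo_arena {
public:
	explicit lifo_arena(std::size_t capacity)
		: m_begin(static_cast<char *>(std::malloc(capacity)))
		, m_top(m_begin)
		, m_end(m_begin + capacity)
	{ }

	~lifo_arena() { std::free(m_begin); }

	void *allocate(std::size_t size) {
		if (m_top + size > m_end)
			throw std::bad_alloc(); // arena exhausted
		void *p = m_top;
		m_top += size; // the only bookkeeping an allocation needs
		return p;
	}

	// Valid only for the most recently allocated live object.
	void deallocate(void *p) {
		m_top = static_cast<char *>(p); // rewind
	}

private:
	char *m_begin;
	char *m_top;
	char *m_end;
};
```

A single worker owns such an arena, so no synchronization is required: freeing the most recent task is one pointer assignment, and tasks allocated back to back sit next to each other in memory, which is exactly the cache behavior the scheduler exploits.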
165 | -------------------------------------------------------------------------------- /benchmarks/sequential/blkmul/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Blocked matrix multiply is done as follows: 3 | * Adapted from Cilk 5.4.3 example 4 | */ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | using namespace chrono; 14 | 15 | inline uint32_t xorshift_rand() { 16 | static uint32_t x = 2463534242; 17 | x ^= x >> 13; 18 | x ^= x << 17; 19 | x ^= x >> 5; 20 | return x; 21 | } 22 | 23 | static const uint32_t rand_max = 1e3; 24 | 25 | typedef uint32_t elem_t; 26 | 27 | class Block 28 | { 29 | public: 30 | Block(); 31 | 32 | void fill(); 33 | 34 | void add(Block *a, Block *b); 35 | 36 | void mul(Block *a, Block *b, bool add = false); 37 | 38 | elem_t trace(); 39 | 40 | static elem_t dotpord(Block *a, Block *b); 41 | 42 | static const size_t size = 256; 43 | 44 | private: 45 | elem_t m_data[size]; 46 | }; 47 | 48 | Block::Block() 49 | { 50 | memset(m_data, 0, size); 51 | } 52 | 53 | void Block::fill() 54 | { 55 | for (size_t i = 0; i < size; i += 4) { 56 | m_data[i + 0] += xorshift_rand() % rand_max; 57 | m_data[i + 1] += xorshift_rand() % rand_max; 58 | m_data[i + 2] += xorshift_rand() % rand_max; 59 | m_data[i + 3] += xorshift_rand() % rand_max; 60 | } 61 | } 62 | 63 | void Block::add(Block *a, Block *b) 64 | { 65 | for (size_t i = 0; i < size; i += 4) { 66 | m_data[i + 0] = a->m_data[i + 0] + b->m_data[i + 0]; 67 | m_data[i + 1] = a->m_data[i + 1] + b->m_data[i + 1]; 68 | m_data[i + 2] = a->m_data[i + 2] + b->m_data[i + 2]; 69 | m_data[i + 3] = a->m_data[i + 3] + b->m_data[i + 3]; 70 | } 71 | } 72 | 73 | void Block::mul(Block *a, Block *b, bool add) 74 | { 75 | for (size_t j = 0; j < 16; j += 2) { 76 | elem_t *bp = &b->m_data[j]; 77 | for (size_t i = 0; i < 16; i += 2) { 78 | elem_t *ap = &a->m_data [i * 16]; 79 | elem_t *rp = &m_data[i * 16 + j]; 80 | 81 | elem_t s0_0 = ap[0] * bp[0]; 82 | elem_t s0_1 = ap[0] * bp[1]; 83 | elem_t s1_0 = ap[16] * bp[0]; 84 | elem_t s1_1 = ap[16] * bp[1]; 85 | s0_0 += ap[1] * bp[16]; 86 | s0_1 += ap[1] * bp[17]; 87 | s1_0 += ap[17] * bp[16]; 88 | s1_1 += ap[17] * bp[17]; 89 | s0_0 += ap[2] * bp[32]; 90 | s0_1 += ap[2] * bp[33]; 91 | s1_0 += ap[18] * bp[32]; 92 | s1_1 += ap[18] * bp[33]; 93 | s0_0 += ap[3] * bp[48]; 94 | s0_1 += ap[3] * bp[49]; 95 | s1_0 += ap[19] * bp[48]; 96 | s1_1 += ap[19] * bp[49]; 97 | s0_0 += ap[4] * bp[64]; 98 | s0_1 += ap[4] * bp[65]; 99 | s1_0 += ap[20] * bp[64]; 100 | s1_1 += ap[20] * bp[65]; 101 | s0_0 += ap[5] * bp[80]; 102 | s0_1 += ap[5] * bp[81]; 103 | s1_0 += ap[21] * bp[80]; 104 | s1_1 += ap[21] * bp[81]; 105 | s0_0 += ap[6] * bp[96]; 106 | s0_1 += ap[6] * bp[97]; 107 | s1_0 += ap[22] * bp[96]; 108 | s1_1 += ap[22] * bp[97]; 109 | s0_0 += ap[7] * bp[112]; 110 | s0_1 += ap[7] * bp[113]; 111 | s1_0 += ap[23] * bp[112]; 112 | s1_1 += ap[23] * bp[113]; 113 | s0_0 += ap[8] * bp[128]; 114 | s0_1 += ap[8] * bp[129]; 115 | s1_0 += ap[24] * bp[128]; 116 | s1_1 += ap[24] * bp[129]; 117 | s0_0 += ap[9] * bp[144]; 118 | s0_1 += ap[9] * bp[145]; 119 | s1_0 += ap[25] * bp[144]; 120 | s1_1 += ap[25] * bp[145]; 121 | s0_0 += ap[10] * bp[160]; 122 | s0_1 += ap[10] * bp[161]; 123 | s1_0 += ap[26] * bp[160]; 124 | s1_1 += ap[26] * bp[161]; 125 | s0_0 += ap[11] * bp[176]; 126 | s0_1 += ap[11] * bp[177]; 127 | s1_0 += ap[27] * bp[176]; 128 | s1_1 += ap[27] * bp[177]; 129 | s0_0 += ap[12] * bp[192]; 130 | s0_1 += ap[12] * bp[193]; 
131 | s1_0 += ap[28] * bp[192]; 132 | s1_1 += ap[28] * bp[193]; 133 | s0_0 += ap[13] * bp[208]; 134 | s0_1 += ap[13] * bp[209]; 135 | s1_0 += ap[29] * bp[208]; 136 | s1_1 += ap[29] * bp[209]; 137 | s0_0 += ap[14] * bp[224]; 138 | s0_1 += ap[14] * bp[225]; 139 | s1_0 += ap[30] * bp[224]; 140 | s1_1 += ap[30] * bp[225]; 141 | s0_0 += ap[15] * bp[240]; 142 | s0_1 += ap[15] * bp[241]; 143 | s1_0 += ap[31] * bp[240]; 144 | s1_1 += ap[31] * bp[241]; 145 | 146 | if (add) { 147 | rp[0] += s0_0; 148 | rp[1] += s0_1; 149 | rp[16] += s1_0; 150 | rp[17] += s1_1; 151 | } else { 152 | rp[0] = s0_0; 153 | rp[1] = s0_1; 154 | rp[16] = s1_0; 155 | rp[17] = s1_1; 156 | } 157 | } 158 | } 159 | } 160 | 161 | elem_t Block::trace() 162 | { 163 | elem_t s = 0; 164 | for (size_t i = 0; i < 16; ++i) 165 | s += m_data[i * 16 + i]; 166 | return s; 167 | } 168 | 169 | // r = sum(transpose(A) o B) 170 | elem_t Block::dotpord(Block *a, Block *b) 171 | { 172 | elem_t s = 0; 173 | for (size_t i = 0; i < 16; ++i) 174 | for (size_t j = 0; j < 16; ++j) 175 | s += a->m_data[16*j + i] * b->m_data[16*i + j]; 176 | 177 | return s; 178 | } 179 | 180 | void fill(Block *A, size_t n) 181 | { 182 | for (size_t i = 0; i < n; ++i) 183 | A[i].fill(); 184 | } 185 | 186 | elem_t trace(Block *A, size_t n) 187 | { 188 | if (n <= 1) 189 | return A->trace(); 190 | 191 | auto q = n / 4; 192 | return trace(A, q) + trace(A + 3*q, q); 193 | } 194 | 195 | elem_t dotpord(Block *A, Block *B, size_t n) 196 | { 197 | if (n <= 1) 198 | return Block::dotpord(A, B); 199 | 200 | auto q = n / 4; 201 | elem_t s = 0; 202 | s += dotpord(A+0*q, B+0*q, q); 203 | s += dotpord(A+2*q, B+1*q, q); 204 | s += dotpord(A+1*q, B+2*q, q); 205 | s += dotpord(A+3*q, B+3*q, q); 206 | 207 | return s; 208 | } 209 | 210 | bool check(Block *A, Block *B, Block *C, size_t n) 211 | { 212 | return fabs(trace(C, n) - dotpord(A, B, n)) < 1e-3; 213 | } 214 | 215 | void seq_blkmul(Block *A, Block *B, Block *R, size_t n, bool do_mul = true); 216 | void seq_add(Block *A, Block *B, Block *R, size_t n); 217 | 218 | void seq_mul(Block *A, Block *B, Block *R, size_t n) 219 | { 220 | if (n <= 1) { 221 | R->mul(A, B); 222 | return; 223 | } 224 | 225 | auto q = n / 4; 226 | 227 | auto l = new Block[n]; 228 | 229 | seq_blkmul(A+0*q, B+0*q, l+0*q, q); 230 | seq_blkmul(A+0*q, B+1*q, l+1*q, q); 231 | seq_blkmul(A+2*q, B+0*q, l+2*q, q); 232 | seq_blkmul(A+2*q, B+1*q, l+3*q, q); 233 | 234 | auto r = new Block[n]; 235 | 236 | seq_blkmul(A+1*q, B+2*q, r+0*q, q); 237 | seq_blkmul(A+1*q, B+3*q, r+1*q, q); 238 | seq_blkmul(A+3*q, B+2*q, r+2*q, q); 239 | seq_blkmul(A+3*q, B+3*q, r+3*q, q); 240 | 241 | A = l; 242 | B = r; 243 | seq_add(A, B, R, n); 244 | 245 | delete []r; 246 | delete []l; 247 | } 248 | 249 | void seq_add(Block *A, Block *B, Block *R, size_t n) 250 | { 251 | if (n <= 1) { 252 | R->add(A, B); 253 | return; 254 | } 255 | 256 | auto q = n / 4; 257 | 258 | seq_blkmul(A+0*q, B+0*q, R+0*q, q, false); 259 | seq_blkmul(A+1*q, B+1*q, R+1*q, q, false); 260 | seq_blkmul(A+2*q, B+2*q, R+2*q, q, false); 261 | seq_blkmul(A+3*q, B+3*q, R+3*q, q, false); 262 | } 263 | 264 | void seq_blkmul(Block *A, Block *B, Block *R, size_t n, bool do_mul) { 265 | if (do_mul) 266 | seq_mul(A, B, R, n); 267 | else 268 | seq_add(A, B, R, n); 269 | } 270 | 271 | int main(int argc, char *argv[]) 272 | { 273 | size_t log_n = 4; 274 | 275 | if (argc >= 3) 276 | log_n = atoi(argv[2]); 277 | 278 | auto n = 1 << log_n; 279 | 280 | cout << "Matrix dim: " << n * 16 << "\n"; 281 | auto nblocks = n * n; 282 | 283 | cout << "Data 
size: " << 3 * nblocks * sizeof(Block) / 1024 << "Kb\n"; 284 | 285 | auto A = new Block[nblocks]; 286 | auto B = new Block[nblocks]; 287 | auto R = new Block[nblocks]; 288 | 289 | fill(A, nblocks); 290 | fill(B, nblocks); 291 | 292 | auto start = system_clock::now(); 293 | 294 | seq_blkmul(A, B, R, nblocks); 295 | 296 | auto stop = system_clock::now(); 297 | 298 | cout << "Scheduler: sequential\n"; 299 | cout << "Benchmark: blkmul\n"; 300 | cout << "Threads: " << 0 << "\n"; 301 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 302 | cout << "Input: " << log_n << "\n"; 303 | cout << "Output: " << check(A, B, R, nblocks) << "\n"; 304 | 305 | delete []A; 306 | delete []B; 307 | delete []R; 308 | return 0; 309 | } 310 | 311 | -------------------------------------------------------------------------------- /tests/task_deque.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "gtest/gtest.h" 8 | #include "gmock/gmock.h" 9 | 10 | #include "task_mock.hpp" 11 | #include "task_deque.hpp" 12 | 13 | using namespace staccato; 14 | using namespace staccato::internal; 15 | 16 | static const size_t test_timeout = 2; 17 | static const size_t nthreads = 4; 18 | 19 | TEST(ctor, creating_and_deleteing) { 20 | auto m = static_cast(malloc(sizeof(task_mock) * 8)); 21 | auto d = new task_deque(3, m); 22 | delete d; 23 | free(m); 24 | } 25 | 26 | TEST(take, single) { 27 | size_t ntasks = 8; 28 | auto m = static_cast(malloc(sizeof(task_mock) * ntasks)); 29 | auto set = new task_deque(3, m); 30 | 31 | for (size_t i = 1; i <= ntasks; ++i) { 32 | new(set->put_allocate()) task_mock(i); 33 | set->put_commit(); 34 | } 35 | 36 | for (size_t i = ntasks; i >= 1; --i) { 37 | size_t nstolen = 0; 38 | auto r = set->take(&nstolen); 39 | auto t = reinterpret_cast(r); 40 | 41 | EXPECT_EQ(t->id, i); 42 | } 43 | 44 | EXPECT_TRUE(set->null()); 45 | 46 | delete set; 47 | free(m); 48 | } 49 | 50 | TEST(steal, single) { 51 | size_t ntasks = 8; 52 | auto m = static_cast(malloc(sizeof(task_mock) * ntasks)); 53 | auto set = new task_deque(3, m); 54 | 55 | set->set_null(false); 56 | 57 | for (size_t i = 1; i <= ntasks; ++i) { 58 | new(set->put_allocate()) task_mock(i); 59 | set->put_commit(); 60 | } 61 | 62 | 63 | for (size_t i = 1; i <= ntasks; ++i) { 64 | bool was_null = false; 65 | bool was_empty = false; 66 | auto r = set->steal(&was_empty, &was_null); 67 | auto t = reinterpret_cast(r); 68 | 69 | EXPECT_FALSE(was_empty); 70 | EXPECT_EQ(t->id, i); 71 | } 72 | 73 | bool was_empty = false; 74 | bool was_null = false; 75 | auto r = set->steal(&was_empty, &was_null); 76 | EXPECT_EQ(r, nullptr); 77 | EXPECT_TRUE(was_empty); 78 | 79 | // EXPECT_TRUE(set->empty()); 80 | 81 | delete set; 82 | free(m); 83 | } 84 | 85 | TEST(steal, concurrent_steals) { 86 | size_t ntasks = 1 << 13; 87 | 88 | std::vector tasks; 89 | 90 | auto m = static_cast(malloc(sizeof(task_mock) * ntasks)); 91 | auto set = new task_deque(13, m); 92 | 93 | for (size_t i = 1; i <= ntasks; ++i) { 94 | new(set->put_allocate()) task_mock(i); 95 | set->put_commit(); 96 | tasks.push_back(i); 97 | } 98 | 99 | std::atomic_size_t nready(0); 100 | std::atomic_bool stop(false); 101 | std::vector stolen[nthreads]; 102 | std::thread threads[nthreads]; 103 | 104 | auto steal = [&](size_t id) { 105 | nready++; 106 | 107 | while (nready != nthreads) 108 | std::this_thread::yield(); 109 | 110 | while (!stop) { 111 | bool was_empty = false; 112 | bool was_null 
= false; 113 | auto r = set->steal(&was_empty, &was_null); 114 | if (r) { 115 | auto t = reinterpret_cast(r); 116 | stolen[id].push_back(t->id); 117 | } 118 | } 119 | }; 120 | 121 | for (size_t i = 0; i < nthreads; ++i) { 122 | threads[i] = std::thread(steal, i); 123 | } 124 | 125 | std::this_thread::sleep_for(std::chrono::seconds(test_timeout)); 126 | stop = true; 127 | 128 | std::cerr << "total tasks: " << ntasks << "\n"; 129 | for (size_t i = 0; i < nthreads; ++i) { 130 | threads[i].join(); 131 | std::cerr << "thread #" << i << " tasks: " << stolen[i].size() << "\n"; 132 | } 133 | 134 | for (size_t i = 0; i < nthreads; ++i) { 135 | for (auto t : stolen[i]) { 136 | auto found = std::find(tasks.begin(), tasks.end(), t); 137 | ASSERT_TRUE(found != tasks.end()); 138 | tasks.erase(found); 139 | } 140 | } 141 | 142 | EXPECT_TRUE(tasks.empty()); 143 | 144 | delete set; 145 | free(m); 146 | } 147 | 148 | TEST(steal_take, concurrent_steal_and_take) { 149 | size_t ntasks = 1 << 13; 150 | 151 | auto m = static_cast(malloc(sizeof(task_mock) * ntasks)); 152 | auto set = new task_deque(13, m); 153 | 154 | std::vector tasks; 155 | 156 | for (size_t i = 1; i <= ntasks; ++i) { 157 | new(set->put_allocate()) task_mock(i); 158 | set->put_commit(); 159 | tasks.push_back(i); 160 | } 161 | 162 | std::atomic_size_t nready(0); 163 | std::atomic_bool stop(false); 164 | std::vector stolen[nthreads]; 165 | std::thread threads[nthreads]; 166 | 167 | auto steal = [&](size_t id, bool steal) { 168 | nready++; 169 | 170 | while (nready != nthreads) 171 | std::this_thread::yield(); 172 | 173 | while (!stop) { 174 | bool was_empty = false; 175 | bool was_null = false; 176 | size_t nstolen = 0; 177 | 178 | auto r = steal ? set->steal(&was_empty, &was_null) : set->take(&nstolen); 179 | if (r) { 180 | auto t = reinterpret_cast(r); 181 | stolen[id].push_back(t->id); 182 | } 183 | } 184 | }; 185 | 186 | for (size_t i = 0; i < nthreads; ++i) { 187 | threads[i] = std::thread(steal, i, i != 0); 188 | } 189 | 190 | std::this_thread::sleep_for(std::chrono::seconds(test_timeout)); 191 | stop = true; 192 | 193 | std::cerr << "total tasks: " << ntasks << "\n"; 194 | for (size_t i = 0; i < nthreads; ++i) { 195 | threads[i].join(); 196 | std::cerr << "thread #" << i << " tasks: " << stolen[i].size() << "\n"; 197 | } 198 | 199 | for (size_t i = 0; i < nthreads; ++i) { 200 | for (auto t : stolen[i]) { 201 | auto found = std::find(tasks.begin(), tasks.end(), t); 202 | ASSERT_TRUE(found != tasks.end()); 203 | tasks.erase(found); 204 | } 205 | } 206 | 207 | // EXPECT_TRUE(tasks.empty()); 208 | 209 | delete set; 210 | free(m); 211 | } 212 | 213 | 214 | TEST(steal_take_put, concurrent_with_reset) { 215 | size_t iter = 1000; 216 | size_t ntasks = 8; 217 | 218 | auto m = static_cast(malloc(sizeof(task_mock) * ntasks)); 219 | auto set = new task_deque(3, m); 220 | 221 | std::vector tasks; 222 | 223 | std::atomic_size_t nready(0); 224 | std::atomic_bool stop(false); 225 | std::vector taken[nthreads]; 226 | std::thread threads[nthreads]; 227 | 228 | auto owner = [&]() { 229 | nready++; 230 | while (nready != nthreads) 231 | std::this_thread::yield(); 232 | 233 | for (size_t i = 0; i < iter; ++i) { 234 | 235 | for (size_t j = 0; j < ntasks; ++j) { 236 | auto t = new(set->put_allocate()) task_mock(i * ntasks + j); 237 | set->put_commit(); 238 | tasks.push_back(t->id); 239 | } 240 | 241 | while (true) { 242 | size_t nstolen = 0; 243 | auto t = set->take(&nstolen); 244 | 245 | if (t) { 246 | taken[0].push_back(t->id); 247 | continue; 248 | } 249 | 
250 | if (nstolen) 251 | continue; 252 | 253 | break; 254 | } 255 | } 256 | 257 | stop = true; 258 | }; 259 | 260 | auto thief = [&](size_t id) { 261 | nready++; 262 | while (nready != nthreads) 263 | std::this_thread::yield(); 264 | 265 | while (!stop) { 266 | bool was_empty = false; 267 | bool was_null = false; 268 | 269 | auto t = set->steal(&was_empty, &was_null); 270 | 271 | if (t) { 272 | taken[id].push_back(t->id); 273 | set->return_stolen(); 274 | continue; 275 | } 276 | } 277 | }; 278 | 279 | threads[0] = std::thread(owner); 280 | for (size_t i = 1; i < nthreads; ++i) { 281 | threads[i] = std::thread(thief, i); 282 | } 283 | 284 | for (int i = 0; i < 5; ++i) { 285 | std::this_thread::sleep_for(std::chrono::seconds(1)); 286 | if (stop) 287 | break; 288 | } 289 | std::this_thread::sleep_for(std::chrono::seconds(1)); 290 | stop = true; 291 | 292 | std::cerr << "total tasks: " << iter * ntasks << "\n"; 293 | for (size_t i = 0; i < nthreads; ++i) { 294 | threads[i].join(); 295 | } 296 | 297 | for (size_t i = 0; i < nthreads; ++i) { 298 | std::cerr << "thread #" << i << " tasks: " << taken[i].size() << "\n"; 299 | for (auto t : taken[i]) { 300 | auto found = std::find(tasks.begin(), tasks.end(), t); 301 | 302 | if (found == tasks.end()) 303 | std::cout << "Taks #" << t << " is taken twice" << std::endl; 304 | 305 | ASSERT_TRUE(found != tasks.end()); 306 | tasks.erase(found); 307 | } 308 | } 309 | 310 | delete set; 311 | free(m); 312 | 313 | } 314 | -------------------------------------------------------------------------------- /benchmarks/cilk/blkmul/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Blocked matrix multiply is done as follows: 3 | * Adapted from Cilk 5.4.3 example 4 | */ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | using namespace chrono; 17 | 18 | inline uint32_t xorshift_rand() { 19 | static uint32_t x = 2463534242; 20 | x ^= x >> 13; 21 | x ^= x << 17; 22 | x ^= x >> 5; 23 | return x; 24 | } 25 | 26 | static const uint32_t rand_max = 1e3; 27 | 28 | typedef uint32_t elem_t; 29 | 30 | class Block 31 | { 32 | public: 33 | Block(); 34 | 35 | void fill(); 36 | 37 | void add(Block *a, Block *b); 38 | 39 | void mul(Block *a, Block *b, bool add = false); 40 | 41 | elem_t trace(); 42 | 43 | static elem_t dotpord(Block *a, Block *b); 44 | 45 | static const size_t size = 256; 46 | 47 | private: 48 | elem_t m_data[size]; 49 | }; 50 | 51 | Block::Block() 52 | { 53 | memset(m_data, 0, size); 54 | } 55 | 56 | void Block::fill() 57 | { 58 | for (size_t i = 0; i < size; i += 4) { 59 | m_data[i + 0] += xorshift_rand() % rand_max; 60 | m_data[i + 1] += xorshift_rand() % rand_max; 61 | m_data[i + 2] += xorshift_rand() % rand_max; 62 | m_data[i + 3] += xorshift_rand() % rand_max; 63 | } 64 | } 65 | 66 | void Block::add(Block *a, Block *b) 67 | { 68 | for (size_t i = 0; i < size; i += 4) { 69 | m_data[i + 0] = a->m_data[i + 0] + b->m_data[i + 0]; 70 | m_data[i + 1] = a->m_data[i + 1] + b->m_data[i + 1]; 71 | m_data[i + 2] = a->m_data[i + 2] + b->m_data[i + 2]; 72 | m_data[i + 3] = a->m_data[i + 3] + b->m_data[i + 3]; 73 | } 74 | } 75 | 76 | void Block::mul(Block *a, Block *b, bool add) 77 | { 78 | for (size_t j = 0; j < 16; j += 2) { 79 | elem_t *bp = &b->m_data[j]; 80 | for (size_t i = 0; i < 16; i += 2) { 81 | elem_t *ap = &a->m_data [i * 16]; 82 | elem_t *rp = &m_data[i * 16 + j]; 83 | 84 | elem_t s0_0 = ap[0] * bp[0]; 85 | elem_t s0_1 = 
ap[0] * bp[1]; 86 | elem_t s1_0 = ap[16] * bp[0]; 87 | elem_t s1_1 = ap[16] * bp[1]; 88 | s0_0 += ap[1] * bp[16]; 89 | s0_1 += ap[1] * bp[17]; 90 | s1_0 += ap[17] * bp[16]; 91 | s1_1 += ap[17] * bp[17]; 92 | s0_0 += ap[2] * bp[32]; 93 | s0_1 += ap[2] * bp[33]; 94 | s1_0 += ap[18] * bp[32]; 95 | s1_1 += ap[18] * bp[33]; 96 | s0_0 += ap[3] * bp[48]; 97 | s0_1 += ap[3] * bp[49]; 98 | s1_0 += ap[19] * bp[48]; 99 | s1_1 += ap[19] * bp[49]; 100 | s0_0 += ap[4] * bp[64]; 101 | s0_1 += ap[4] * bp[65]; 102 | s1_0 += ap[20] * bp[64]; 103 | s1_1 += ap[20] * bp[65]; 104 | s0_0 += ap[5] * bp[80]; 105 | s0_1 += ap[5] * bp[81]; 106 | s1_0 += ap[21] * bp[80]; 107 | s1_1 += ap[21] * bp[81]; 108 | s0_0 += ap[6] * bp[96]; 109 | s0_1 += ap[6] * bp[97]; 110 | s1_0 += ap[22] * bp[96]; 111 | s1_1 += ap[22] * bp[97]; 112 | s0_0 += ap[7] * bp[112]; 113 | s0_1 += ap[7] * bp[113]; 114 | s1_0 += ap[23] * bp[112]; 115 | s1_1 += ap[23] * bp[113]; 116 | s0_0 += ap[8] * bp[128]; 117 | s0_1 += ap[8] * bp[129]; 118 | s1_0 += ap[24] * bp[128]; 119 | s1_1 += ap[24] * bp[129]; 120 | s0_0 += ap[9] * bp[144]; 121 | s0_1 += ap[9] * bp[145]; 122 | s1_0 += ap[25] * bp[144]; 123 | s1_1 += ap[25] * bp[145]; 124 | s0_0 += ap[10] * bp[160]; 125 | s0_1 += ap[10] * bp[161]; 126 | s1_0 += ap[26] * bp[160]; 127 | s1_1 += ap[26] * bp[161]; 128 | s0_0 += ap[11] * bp[176]; 129 | s0_1 += ap[11] * bp[177]; 130 | s1_0 += ap[27] * bp[176]; 131 | s1_1 += ap[27] * bp[177]; 132 | s0_0 += ap[12] * bp[192]; 133 | s0_1 += ap[12] * bp[193]; 134 | s1_0 += ap[28] * bp[192]; 135 | s1_1 += ap[28] * bp[193]; 136 | s0_0 += ap[13] * bp[208]; 137 | s0_1 += ap[13] * bp[209]; 138 | s1_0 += ap[29] * bp[208]; 139 | s1_1 += ap[29] * bp[209]; 140 | s0_0 += ap[14] * bp[224]; 141 | s0_1 += ap[14] * bp[225]; 142 | s1_0 += ap[30] * bp[224]; 143 | s1_1 += ap[30] * bp[225]; 144 | s0_0 += ap[15] * bp[240]; 145 | s0_1 += ap[15] * bp[241]; 146 | s1_0 += ap[31] * bp[240]; 147 | s1_1 += ap[31] * bp[241]; 148 | 149 | if (add) { 150 | rp[0] += s0_0; 151 | rp[1] += s0_1; 152 | rp[16] += s1_0; 153 | rp[17] += s1_1; 154 | } else { 155 | rp[0] = s0_0; 156 | rp[1] = s0_1; 157 | rp[16] = s1_0; 158 | rp[17] = s1_1; 159 | } 160 | } 161 | } 162 | } 163 | 164 | elem_t Block::trace() 165 | { 166 | elem_t s = 0; 167 | for (size_t i = 0; i < 16; ++i) 168 | s += m_data[i * 16 + i]; 169 | return s; 170 | } 171 | 172 | // r = sum(transpose(A) o B) 173 | elem_t Block::dotpord(Block *a, Block *b) 174 | { 175 | elem_t s = 0; 176 | for (size_t i = 0; i < 16; ++i) 177 | for (size_t j = 0; j < 16; ++j) 178 | s += a->m_data[16*j + i] * b->m_data[16*i + j]; 179 | 180 | return s; 181 | } 182 | 183 | void add(Block *A, Block *B, Block *R, size_t n); 184 | void mul(Block *A, Block *B, Block *R, size_t n); 185 | 186 | void operationtask( 187 | Block *A, 188 | Block *B, 189 | Block *R, 190 | size_t n, 191 | bool do_mul = true 192 | ) { 193 | if (do_mul) 194 | mul(A, B, R, n); 195 | else 196 | add(A, B, R, n); 197 | } 198 | 199 | void add(Block *A, Block *B, Block *R, size_t n) 200 | { 201 | if (n <= 1) { 202 | R->add(A, B); 203 | return; 204 | } 205 | 206 | auto q = n / 4; 207 | 208 | cilk_spawn operationtask(A+0*q, B+0*q, R+0*q, q, false); 209 | cilk_spawn operationtask(A+1*q, B+1*q, R+1*q, q, false); 210 | cilk_spawn operationtask(A+2*q, B+2*q, R+2*q, q, false); 211 | cilk_spawn operationtask(A+3*q, B+3*q, R+3*q, q, false); 212 | 213 | cilk_sync; 214 | } 215 | 216 | /* A x B = R = l + r 217 | * | 0 | 1 | | 0 | 1 | | 00+12 | 01+13 | | 00 | 01 | | 12 | 13 | 218 | * |---+---| x |---+---| = |-------+-------| 
= |----+----| + |----+----| 219 | * | 2 | 3 | | 2 | 3 | | 20+32 | 21+33 | | 20 | 21 | | 32 | 33 | 220 | */ 221 | void mul(Block *A, Block *B, Block *R, size_t n) { 222 | if (n <= 1) { 223 | R->mul(A, B); 224 | return; 225 | } 226 | 227 | auto q = n / 4; 228 | 229 | auto l = new Block[n]; 230 | 231 | cilk_spawn operationtask(A+0*q, B+0*q, l+0*q, q); 232 | cilk_spawn operationtask(A+0*q, B+1*q, l+1*q, q); 233 | cilk_spawn operationtask(A+2*q, B+0*q, l+2*q, q); 234 | cilk_spawn operationtask(A+2*q, B+1*q, l+3*q, q); 235 | 236 | auto r = new Block[n]; 237 | 238 | cilk_spawn operationtask(A+1*q, B+2*q, r+0*q, q); 239 | cilk_spawn operationtask(A+1*q, B+3*q, r+1*q, q); 240 | cilk_spawn operationtask(A+3*q, B+2*q, r+2*q, q); 241 | cilk_spawn operationtask(A+3*q, B+3*q, r+3*q, q); 242 | 243 | cilk_sync; 244 | 245 | A = l; 246 | B = r; 247 | add(A, B, R, n); 248 | 249 | delete []r; 250 | delete []l; 251 | } 252 | 253 | 254 | void fill(Block *A, size_t n) 255 | { 256 | for (size_t i = 0; i < n; ++i) 257 | A[i].fill(); 258 | } 259 | 260 | elem_t trace(Block *A, size_t n) 261 | { 262 | if (n <= 1) 263 | return A->trace(); 264 | 265 | auto q = n / 4; 266 | return trace(A, q) + trace(A + 3*q, q); 267 | } 268 | 269 | elem_t dotpord(Block *A, Block *B, size_t n) 270 | { 271 | if (n <= 1) 272 | return Block::dotpord(A, B); 273 | 274 | auto q = n / 4; 275 | elem_t s = 0; 276 | s += dotpord(A+0*q, B+0*q, q); 277 | s += dotpord(A+2*q, B+1*q, q); 278 | s += dotpord(A+1*q, B+2*q, q); 279 | s += dotpord(A+3*q, B+3*q, q); 280 | 281 | return s; 282 | } 283 | 284 | bool check(Block *A, Block *B, Block *C, size_t n) 285 | { 286 | return fabs(trace(C, n) - dotpord(A, B, n)) < 1e-3; 287 | } 288 | 289 | void test(Block *A, Block *B, Block *C, size_t n) 290 | { 291 | cilk_spawn operationtask(A, B, C, n); 292 | cilk_sync; 293 | } 294 | 295 | int main(int argc, char *argv[]) 296 | { 297 | size_t log_n = 4; 298 | const char *nthreads = nullptr; 299 | 300 | if (argc >= 2) 301 | nthreads = argv[1]; 302 | if (argc >= 3) 303 | log_n = atoi(argv[2]); 304 | if (nthreads == nullptr) 305 | nthreads = to_string(thread::hardware_concurrency()).c_str(); 306 | 307 | auto n = 1 << log_n; 308 | 309 | cout << "Matrix dim: " << n * 16 << "\n"; 310 | auto nblocks = n * n; 311 | 312 | cout << "Data size: " << 3 * nblocks * sizeof(Block) / 1024 << "Kb\n"; 313 | 314 | auto A = new Block[nblocks]; 315 | auto B = new Block[nblocks]; 316 | auto R = new Block[nblocks]; 317 | 318 | fill(A, nblocks); 319 | fill(B, nblocks); 320 | 321 | __cilkrts_end_cilk(); 322 | 323 | auto start = system_clock::now(); 324 | 325 | if (__cilkrts_set_param("nworkers", nthreads) != 0) { 326 | cerr << "Failed to set worker count\n"; 327 | exit(EXIT_FAILURE); 328 | } 329 | 330 | __cilkrts_init(); 331 | 332 | test(A, B, R, nblocks); 333 | 334 | __cilkrts_end_cilk(); 335 | 336 | auto stop = system_clock::now(); 337 | 338 | cout << "Scheduler: cilk\n"; 339 | cout << "Benchmark: blkmul\n"; 340 | cout << "Threads: " << nthreads << "\n"; 341 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 342 | cout << "Input: " << log_n << "\n"; 343 | cout << "Output: " << check(A, B, R, nblocks) << "\n"; 344 | 345 | delete []A; 346 | delete []B; 347 | delete []R; 348 | return 0; 349 | } 350 | 351 | -------------------------------------------------------------------------------- /benchmarks/openmp/blkmul/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | 
#include 9 | 10 | using namespace std; 11 | using namespace std::chrono; 12 | 13 | inline uint32_t xorshift_rand() { 14 | static uint32_t x = 2463534242; 15 | x ^= x >> 13; 16 | x ^= x << 17; 17 | x ^= x >> 5; 18 | return x; 19 | } 20 | 21 | static const uint32_t rand_max = 1e3; 22 | 23 | typedef uint32_t elem_t; 24 | 25 | class Block 26 | { 27 | public: 28 | Block(); 29 | 30 | void fill(); 31 | 32 | void add(Block *a, Block *b); 33 | 34 | void mul(Block *a, Block *b, bool add = false); 35 | 36 | elem_t trace(); 37 | 38 | static elem_t dotpord(Block *a, Block *b); 39 | 40 | static const size_t size = 256; 41 | 42 | private: 43 | elem_t m_data[size]; 44 | }; 45 | 46 | Block::Block() 47 | { 48 | memset(m_data, 0, size); 49 | } 50 | 51 | void Block::fill() 52 | { 53 | for (size_t i = 0; i < size; i += 4) { 54 | m_data[i + 0] += xorshift_rand() % rand_max; 55 | m_data[i + 1] += xorshift_rand() % rand_max; 56 | m_data[i + 2] += xorshift_rand() % rand_max; 57 | m_data[i + 3] += xorshift_rand() % rand_max; 58 | } 59 | } 60 | 61 | void Block::add(Block *a, Block *b) 62 | { 63 | for (size_t i = 0; i < size; i += 4) { 64 | m_data[i + 0] = a->m_data[i + 0] + b->m_data[i + 0]; 65 | m_data[i + 1] = a->m_data[i + 1] + b->m_data[i + 1]; 66 | m_data[i + 2] = a->m_data[i + 2] + b->m_data[i + 2]; 67 | m_data[i + 3] = a->m_data[i + 3] + b->m_data[i + 3]; 68 | } 69 | } 70 | 71 | void Block::mul(Block *a, Block *b, bool add) 72 | { 73 | for (size_t j = 0; j < 16; j += 2) { 74 | elem_t *bp = &b->m_data[j]; 75 | for (size_t i = 0; i < 16; i += 2) { 76 | elem_t *ap = &a->m_data [i * 16]; 77 | elem_t *rp = &m_data[i * 16 + j]; 78 | 79 | elem_t s0_0 = ap[0] * bp[0]; 80 | elem_t s0_1 = ap[0] * bp[1]; 81 | elem_t s1_0 = ap[16] * bp[0]; 82 | elem_t s1_1 = ap[16] * bp[1]; 83 | s0_0 += ap[1] * bp[16]; 84 | s0_1 += ap[1] * bp[17]; 85 | s1_0 += ap[17] * bp[16]; 86 | s1_1 += ap[17] * bp[17]; 87 | s0_0 += ap[2] * bp[32]; 88 | s0_1 += ap[2] * bp[33]; 89 | s1_0 += ap[18] * bp[32]; 90 | s1_1 += ap[18] * bp[33]; 91 | s0_0 += ap[3] * bp[48]; 92 | s0_1 += ap[3] * bp[49]; 93 | s1_0 += ap[19] * bp[48]; 94 | s1_1 += ap[19] * bp[49]; 95 | s0_0 += ap[4] * bp[64]; 96 | s0_1 += ap[4] * bp[65]; 97 | s1_0 += ap[20] * bp[64]; 98 | s1_1 += ap[20] * bp[65]; 99 | s0_0 += ap[5] * bp[80]; 100 | s0_1 += ap[5] * bp[81]; 101 | s1_0 += ap[21] * bp[80]; 102 | s1_1 += ap[21] * bp[81]; 103 | s0_0 += ap[6] * bp[96]; 104 | s0_1 += ap[6] * bp[97]; 105 | s1_0 += ap[22] * bp[96]; 106 | s1_1 += ap[22] * bp[97]; 107 | s0_0 += ap[7] * bp[112]; 108 | s0_1 += ap[7] * bp[113]; 109 | s1_0 += ap[23] * bp[112]; 110 | s1_1 += ap[23] * bp[113]; 111 | s0_0 += ap[8] * bp[128]; 112 | s0_1 += ap[8] * bp[129]; 113 | s1_0 += ap[24] * bp[128]; 114 | s1_1 += ap[24] * bp[129]; 115 | s0_0 += ap[9] * bp[144]; 116 | s0_1 += ap[9] * bp[145]; 117 | s1_0 += ap[25] * bp[144]; 118 | s1_1 += ap[25] * bp[145]; 119 | s0_0 += ap[10] * bp[160]; 120 | s0_1 += ap[10] * bp[161]; 121 | s1_0 += ap[26] * bp[160]; 122 | s1_1 += ap[26] * bp[161]; 123 | s0_0 += ap[11] * bp[176]; 124 | s0_1 += ap[11] * bp[177]; 125 | s1_0 += ap[27] * bp[176]; 126 | s1_1 += ap[27] * bp[177]; 127 | s0_0 += ap[12] * bp[192]; 128 | s0_1 += ap[12] * bp[193]; 129 | s1_0 += ap[28] * bp[192]; 130 | s1_1 += ap[28] * bp[193]; 131 | s0_0 += ap[13] * bp[208]; 132 | s0_1 += ap[13] * bp[209]; 133 | s1_0 += ap[29] * bp[208]; 134 | s1_1 += ap[29] * bp[209]; 135 | s0_0 += ap[14] * bp[224]; 136 | s0_1 += ap[14] * bp[225]; 137 | s1_0 += ap[30] * bp[224]; 138 | s1_1 += ap[30] * bp[225]; 139 | s0_0 += ap[15] * bp[240]; 140 | s0_1 += 
ap[15] * bp[241]; 141 | s1_0 += ap[31] * bp[240]; 142 | s1_1 += ap[31] * bp[241]; 143 | 144 | if (add) { 145 | rp[0] += s0_0; 146 | rp[1] += s0_1; 147 | rp[16] += s1_0; 148 | rp[17] += s1_1; 149 | } else { 150 | rp[0] = s0_0; 151 | rp[1] = s0_1; 152 | rp[16] = s1_0; 153 | rp[17] = s1_1; 154 | } 155 | } 156 | } 157 | } 158 | 159 | elem_t Block::trace() 160 | { 161 | elem_t s = 0; 162 | for (size_t i = 0; i < 16; ++i) 163 | s += m_data[i * 16 + i]; 164 | return s; 165 | } 166 | 167 | // r = sum(transpose(A) o B) 168 | elem_t Block::dotpord(Block *a, Block *b) 169 | { 170 | elem_t s = 0; 171 | for (size_t i = 0; i < 16; ++i) 172 | for (size_t j = 0; j < 16; ++j) 173 | s += a->m_data[16*j + i] * b->m_data[16*i + j]; 174 | 175 | return s; 176 | } 177 | 178 | void add(Block *A, Block *B, Block *R, size_t n); 179 | void mul(Block *A, Block *B, Block *R, size_t n); 180 | 181 | void operationtask( 182 | Block *A, 183 | Block *B, 184 | Block *R, 185 | size_t n, 186 | bool do_mul = true 187 | ) { 188 | if (do_mul) 189 | mul(A, B, R, n); 190 | else 191 | add(A, B, R, n); 192 | } 193 | 194 | 195 | void add(Block *A, Block *B, Block *R, size_t n) 196 | { 197 | if (n <= 1) { 198 | R->add(A, B); 199 | return; 200 | } 201 | 202 | auto q = n / 4; 203 | 204 | #pragma omp task shared(A, B, R, q) 205 | operationtask(A+0*q, B+0*q, R+0*q, q, false); 206 | #pragma omp task shared(A, B, R, q) 207 | operationtask(A+1*q, B+1*q, R+1*q, q, false); 208 | #pragma omp task shared(A, B, R, q) 209 | operationtask(A+2*q, B+2*q, R+2*q, q, false); 210 | #pragma omp task shared(A, B, R, q) 211 | operationtask(A+3*q, B+3*q, R+3*q, q, false); 212 | 213 | #pragma omp taskwait 214 | } 215 | 216 | /* A x B = R = l + r 217 | * | 0 | 1 | | 0 | 1 | | 00+12 | 01+13 | | 00 | 01 | | 12 | 13 | 218 | * |---+---| x |---+---| = |-------+-------| = |----+----| + |----+----| 219 | * | 2 | 3 | | 2 | 3 | | 20+32 | 21+33 | | 20 | 21 | | 32 | 33 | 220 | */ 221 | void mul(Block *A, Block *B, Block *R, size_t n) { 222 | if (n <= 1) { 223 | R->mul(A, B); 224 | return; 225 | } 226 | 227 | auto q = n / 4; 228 | 229 | auto l = new Block[n]; 230 | 231 | #pragma omp task shared(A, B, l, q) 232 | operationtask(A+0*q, B+0*q, l+0*q, q); 233 | #pragma omp task shared(A, B, l, q) 234 | operationtask(A+0*q, B+1*q, l+1*q, q); 235 | #pragma omp task shared(A, B, l, q) 236 | operationtask(A+2*q, B+0*q, l+2*q, q); 237 | #pragma omp task shared(A, B, l, q) 238 | operationtask(A+2*q, B+1*q, l+3*q, q); 239 | 240 | auto r = new Block[n]; 241 | 242 | #pragma omp task shared(A, B, r, q) 243 | operationtask(A+1*q, B+2*q, r+0*q, q); 244 | #pragma omp task shared(A, B, r, q) 245 | operationtask(A+1*q, B+3*q, r+1*q, q); 246 | #pragma omp task shared(A, B, r, q) 247 | operationtask(A+3*q, B+2*q, r+2*q, q); 248 | #pragma omp task shared(A, B, r, q) 249 | operationtask(A+3*q, B+3*q, r+3*q, q); 250 | 251 | #pragma omp taskwait 252 | 253 | A = l; 254 | B = r; 255 | add(A, B, R, n); 256 | 257 | delete []r; 258 | delete []l; 259 | } 260 | 261 | 262 | void fill(Block *A, size_t n) 263 | { 264 | for (size_t i = 0; i < n; ++i) 265 | A[i].fill(); 266 | } 267 | 268 | elem_t trace(Block *A, size_t n) 269 | { 270 | if (n <= 1) 271 | return A->trace(); 272 | 273 | auto q = n / 4; 274 | return trace(A, q) + trace(A + 3*q, q); 275 | } 276 | 277 | elem_t dotpord(Block *A, Block *B, size_t n) 278 | { 279 | if (n <= 1) 280 | return Block::dotpord(A, B); 281 | 282 | auto q = n / 4; 283 | elem_t s = 0; 284 | s += dotpord(A+0*q, B+0*q, q); 285 | s += dotpord(A+2*q, B+1*q, q); 286 | s += 
dotpord(A+1*q, B+2*q, q); 287 | s += dotpord(A+3*q, B+3*q, q); 288 | 289 | return s; 290 | } 291 | 292 | bool check(Block *A, Block *B, Block *C, size_t n) 293 | { 294 | return fabs(trace(C, n) - dotpord(A, B, n)) < 1e-3; 295 | } 296 | 297 | void test(Block *A, Block *B, Block *C, size_t n) 298 | { 299 | #pragma omp task shared(A, B, C, n) 300 | operationtask(A, B, C, n); 301 | #pragma omp taskwait 302 | } 303 | 304 | int main(int argc, char *argv[]) 305 | { 306 | size_t log_n = 4; 307 | size_t nthreads = 0; 308 | 309 | if (argc >= 2) 310 | nthreads = atoi(argv[1]); 311 | if (argc >= 3) 312 | log_n = atoi(argv[2]); 313 | if (nthreads == 0) 314 | nthreads = thread::hardware_concurrency(); 315 | 316 | auto n = 1 << log_n; 317 | 318 | cout << "Matrix dim: " << n * 16 << "\n"; 319 | auto nblocks = n * n; 320 | 321 | cout << "Data size: " << 3 * nblocks * sizeof(Block) / 1024 << "Kb\n"; 322 | 323 | auto A = new Block[nblocks]; 324 | auto B = new Block[nblocks]; 325 | auto R = new Block[nblocks]; 326 | 327 | fill(A, nblocks); 328 | fill(B, nblocks); 329 | 330 | auto start = system_clock::now(); 331 | 332 | omp_set_dynamic(0); 333 | omp_set_num_threads(nthreads); 334 | 335 | #pragma omp parallel shared(A, B, R, nblocks) 336 | #pragma omp single 337 | test(A, B, R, nblocks); 338 | 339 | auto stop = system_clock::now(); 340 | 341 | cout << "Scheduler: openmp\n"; 342 | cout << "Benchmark: blkmul\n"; 343 | cout << "Threads: " << nthreads << "\n"; 344 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 345 | cout << "Input: " << log_n << "\n"; 346 | cout << "Output: " << check(A, B, R, nblocks) << "\n"; 347 | 348 | delete []A; 349 | delete []B; 350 | delete []R; 351 | return 0; 352 | } 353 | -------------------------------------------------------------------------------- /benchmarks/staccato/blkmul/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Blocked matrix multiply is done as follows: 3 | * Adapted from Cilk 5.4.3 example 4 | */ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | using namespace chrono; 17 | using namespace staccato; 18 | 19 | inline uint32_t xorshift_rand() { 20 | static uint32_t x = 2463534242; 21 | x ^= x >> 13; 22 | x ^= x << 17; 23 | x ^= x >> 5; 24 | return x; 25 | } 26 | 27 | static const uint32_t rand_max = 1e3; 28 | 29 | typedef uint32_t elem_t; 30 | 31 | class Block 32 | { 33 | public: 34 | Block(); 35 | 36 | void fill(); 37 | 38 | void add(Block *a, Block *b); 39 | 40 | void mul(Block *a, Block *b, bool add = false); 41 | 42 | elem_t trace(); 43 | 44 | static elem_t dotpord(Block *a, Block *b); 45 | 46 | static const size_t size = 256; 47 | 48 | private: 49 | elem_t m_data[size]; 50 | }; 51 | 52 | Block::Block() 53 | { 54 | memset(m_data, 0, size); 55 | } 56 | 57 | void Block::fill() 58 | { 59 | for (size_t i = 0; i < size; i += 4) { 60 | m_data[i + 0] += xorshift_rand() % rand_max; 61 | m_data[i + 1] += xorshift_rand() % rand_max; 62 | m_data[i + 2] += xorshift_rand() % rand_max; 63 | m_data[i + 3] += xorshift_rand() % rand_max; 64 | } 65 | } 66 | 67 | void Block::add(Block *a, Block *b) 68 | { 69 | for (size_t i = 0; i < size; i += 4) { 70 | m_data[i + 0] = a->m_data[i + 0] + b->m_data[i + 0]; 71 | m_data[i + 1] = a->m_data[i + 1] + b->m_data[i + 1]; 72 | m_data[i + 2] = a->m_data[i + 2] + b->m_data[i + 2]; 73 | m_data[i + 3] = a->m_data[i + 3] + b->m_data[i + 3]; 74 | } 75 | } 76 | 77 | void Block::mul(Block 
*a, Block *b, bool add) 78 | { 79 | for (size_t j = 0; j < 16; j += 2) { 80 | elem_t *bp = &b->m_data[j]; 81 | for (size_t i = 0; i < 16; i += 2) { 82 | elem_t *ap = &a->m_data [i * 16]; 83 | elem_t *rp = &m_data[i * 16 + j]; 84 | 85 | elem_t s0_0 = ap[0] * bp[0]; 86 | elem_t s0_1 = ap[0] * bp[1]; 87 | elem_t s1_0 = ap[16] * bp[0]; 88 | elem_t s1_1 = ap[16] * bp[1]; 89 | s0_0 += ap[1] * bp[16]; 90 | s0_1 += ap[1] * bp[17]; 91 | s1_0 += ap[17] * bp[16]; 92 | s1_1 += ap[17] * bp[17]; 93 | s0_0 += ap[2] * bp[32]; 94 | s0_1 += ap[2] * bp[33]; 95 | s1_0 += ap[18] * bp[32]; 96 | s1_1 += ap[18] * bp[33]; 97 | s0_0 += ap[3] * bp[48]; 98 | s0_1 += ap[3] * bp[49]; 99 | s1_0 += ap[19] * bp[48]; 100 | s1_1 += ap[19] * bp[49]; 101 | s0_0 += ap[4] * bp[64]; 102 | s0_1 += ap[4] * bp[65]; 103 | s1_0 += ap[20] * bp[64]; 104 | s1_1 += ap[20] * bp[65]; 105 | s0_0 += ap[5] * bp[80]; 106 | s0_1 += ap[5] * bp[81]; 107 | s1_0 += ap[21] * bp[80]; 108 | s1_1 += ap[21] * bp[81]; 109 | s0_0 += ap[6] * bp[96]; 110 | s0_1 += ap[6] * bp[97]; 111 | s1_0 += ap[22] * bp[96]; 112 | s1_1 += ap[22] * bp[97]; 113 | s0_0 += ap[7] * bp[112]; 114 | s0_1 += ap[7] * bp[113]; 115 | s1_0 += ap[23] * bp[112]; 116 | s1_1 += ap[23] * bp[113]; 117 | s0_0 += ap[8] * bp[128]; 118 | s0_1 += ap[8] * bp[129]; 119 | s1_0 += ap[24] * bp[128]; 120 | s1_1 += ap[24] * bp[129]; 121 | s0_0 += ap[9] * bp[144]; 122 | s0_1 += ap[9] * bp[145]; 123 | s1_0 += ap[25] * bp[144]; 124 | s1_1 += ap[25] * bp[145]; 125 | s0_0 += ap[10] * bp[160]; 126 | s0_1 += ap[10] * bp[161]; 127 | s1_0 += ap[26] * bp[160]; 128 | s1_1 += ap[26] * bp[161]; 129 | s0_0 += ap[11] * bp[176]; 130 | s0_1 += ap[11] * bp[177]; 131 | s1_0 += ap[27] * bp[176]; 132 | s1_1 += ap[27] * bp[177]; 133 | s0_0 += ap[12] * bp[192]; 134 | s0_1 += ap[12] * bp[193]; 135 | s1_0 += ap[28] * bp[192]; 136 | s1_1 += ap[28] * bp[193]; 137 | s0_0 += ap[13] * bp[208]; 138 | s0_1 += ap[13] * bp[209]; 139 | s1_0 += ap[29] * bp[208]; 140 | s1_1 += ap[29] * bp[209]; 141 | s0_0 += ap[14] * bp[224]; 142 | s0_1 += ap[14] * bp[225]; 143 | s1_0 += ap[30] * bp[224]; 144 | s1_1 += ap[30] * bp[225]; 145 | s0_0 += ap[15] * bp[240]; 146 | s0_1 += ap[15] * bp[241]; 147 | s1_0 += ap[31] * bp[240]; 148 | s1_1 += ap[31] * bp[241]; 149 | 150 | if (add) { 151 | rp[0] += s0_0; 152 | rp[1] += s0_1; 153 | rp[16] += s1_0; 154 | rp[17] += s1_1; 155 | } else { 156 | rp[0] = s0_0; 157 | rp[1] = s0_1; 158 | rp[16] = s1_0; 159 | rp[17] = s1_1; 160 | } 161 | } 162 | } 163 | } 164 | 165 | elem_t Block::trace() 166 | { 167 | elem_t s = 0; 168 | for (size_t i = 0; i < 16; ++i) 169 | s += m_data[i * 16 + i]; 170 | return s; 171 | } 172 | 173 | // r = sum(transpose(A) o B) 174 | elem_t Block::dotpord(Block *a, Block *b) 175 | { 176 | elem_t s = 0; 177 | for (size_t i = 0; i < 16; ++i) 178 | for (size_t j = 0; j < 16; ++j) 179 | s += a->m_data[16*j + i] * b->m_data[16*i + j]; 180 | 181 | return s; 182 | } 183 | 184 | class OperationTask: public task 185 | { 186 | public: 187 | OperationTask( 188 | Block *A, 189 | Block *B, 190 | Block *R, 191 | size_t n, 192 | bool do_mul = true 193 | ); 194 | 195 | void execute(); 196 | 197 | private: 198 | void add(); 199 | void mul(); 200 | 201 | bool do_mul; 202 | 203 | Block *A; 204 | Block *B; 205 | Block *R; 206 | 207 | size_t n; 208 | }; 209 | 210 | OperationTask::OperationTask( 211 | Block *A, 212 | Block *B, 213 | Block *R, 214 | size_t n, 215 | bool do_mul 216 | ) 217 | : do_mul(do_mul) 218 | , A(A) 219 | , B(B) 220 | , R(R) 221 | , n(n) 222 | { } 223 | 224 | void OperationTask::add() 225 | { 226 
| if (n <= 1) { 227 | R->add(A, B); 228 | return; 229 | } 230 | 231 | auto q = n / 4; 232 | 233 | spawn(new(child()) OperationTask(A+0*q, B+0*q, R+0*q, q, false)); 234 | spawn(new(child()) OperationTask(A+1*q, B+1*q, R+1*q, q, false)); 235 | spawn(new(child()) OperationTask(A+2*q, B+2*q, R+2*q, q, false)); 236 | spawn(new(child()) OperationTask(A+3*q, B+3*q, R+3*q, q, false)); 237 | 238 | wait(); 239 | } 240 | 241 | /* A x B = R = l + r 242 | * | 0 | 1 | | 0 | 1 | | 00+12 | 01+13 | | 00 | 01 | | 12 | 13 | 243 | * |---+---| x |---+---| = |-------+-------| = |----+----| + |----+----| 244 | * | 2 | 3 | | 2 | 3 | | 20+32 | 21+33 | | 20 | 21 | | 32 | 33 | 245 | */ 246 | void OperationTask::mul() { 247 | if (n <= 1) { 248 | R->mul(A, B); 249 | return; 250 | } 251 | 252 | auto q = n / 4; 253 | 254 | auto l = new Block[n]; 255 | 256 | spawn(new(child()) OperationTask(A+0*q, B+0*q, l+0*q, q)); 257 | spawn(new(child()) OperationTask(A+0*q, B+1*q, l+1*q, q)); 258 | spawn(new(child()) OperationTask(A+2*q, B+0*q, l+2*q, q)); 259 | spawn(new(child()) OperationTask(A+2*q, B+1*q, l+3*q, q)); 260 | 261 | auto r = new Block[n]; 262 | 263 | spawn(new(child()) OperationTask(A+1*q, B+2*q, r+0*q, q)); 264 | spawn(new(child()) OperationTask(A+1*q, B+3*q, r+1*q, q)); 265 | spawn(new(child()) OperationTask(A+3*q, B+2*q, r+2*q, q)); 266 | spawn(new(child()) OperationTask(A+3*q, B+3*q, r+3*q, q)); 267 | 268 | wait(); 269 | 270 | A = l; 271 | B = r; 272 | add(); 273 | 274 | delete []r; 275 | delete []l; 276 | } 277 | 278 | void OperationTask::execute() 279 | { 280 | if (do_mul) 281 | mul(); 282 | else 283 | add(); 284 | } 285 | 286 | void fill(Block *A, size_t n) 287 | { 288 | for (size_t i = 0; i < n; ++i) 289 | A[i].fill(); 290 | } 291 | 292 | elem_t trace(Block *A, size_t n) 293 | { 294 | if (n <= 1) 295 | return A->trace(); 296 | 297 | auto q = n / 4; 298 | return trace(A, q) + trace(A + 3*q, q); 299 | } 300 | 301 | elem_t dotpord(Block *A, Block *B, size_t n) 302 | { 303 | if (n <= 1) 304 | return Block::dotpord(A, B); 305 | 306 | auto q = n / 4; 307 | elem_t s = 0; 308 | s += dotpord(A+0*q, B+0*q, q); 309 | s += dotpord(A+2*q, B+1*q, q); 310 | s += dotpord(A+1*q, B+2*q, q); 311 | s += dotpord(A+3*q, B+3*q, q); 312 | 313 | return s; 314 | } 315 | 316 | bool check(Block *A, Block *B, Block *C, size_t n) 317 | { 318 | return fabs(trace(C, n) - dotpord(A, B, n)) < 1e-3; 319 | } 320 | 321 | int main(int argc, char *argv[]) 322 | { 323 | size_t log_n = 4; 324 | size_t nthreads = 0; 325 | 326 | if (argc >= 2) 327 | nthreads = atoi(argv[1]); 328 | if (argc >= 3) 329 | log_n = atoi(argv[2]); 330 | if (nthreads == 0) 331 | nthreads = thread::hardware_concurrency(); 332 | 333 | auto n = 1 << log_n; 334 | 335 | cout << "Matrix dim: " << n * 16 << "\n"; 336 | auto nblocks = n * n; 337 | 338 | cout << "Data size: " << 3 * nblocks * sizeof(Block) / 1024 << "Kb\n"; 339 | 340 | auto A = new Block[nblocks]; 341 | auto B = new Block[nblocks]; 342 | auto R = new Block[nblocks]; 343 | 344 | fill(A, nblocks); 345 | fill(B, nblocks); 346 | 347 | auto start = system_clock::now(); 348 | 349 | { 350 | scheduler sh(8, nthreads); 351 | sh.spawn(new(sh.root()) OperationTask(A, B, R, nblocks)); 352 | sh.wait(); 353 | } 354 | 355 | auto stop = system_clock::now(); 356 | 357 | cout << "Scheduler: staccato\n"; 358 | cout << "Benchmark: blkmul\n"; 359 | cout << "Threads: " << nthreads << "\n"; 360 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 361 | cout << "Input: " << log_n << "\n"; 362 | cout << "Output: " << check(A, 
B, R, nblocks) << "\n"; 363 | 364 | delete []A; 365 | delete []B; 366 | delete []R; 367 | return 0; 368 | } 369 | 370 | -------------------------------------------------------------------------------- /benchmarks/tbb/blkmul/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Blocked matrix multiply is done as follows: 3 | * Adapted from Cilk 5.4.3 example 4 | */ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | using namespace chrono; 17 | using namespace tbb; 18 | 19 | inline uint32_t xorshift_rand() { 20 | static uint32_t x = 2463534242; 21 | x ^= x >> 13; 22 | x ^= x << 17; 23 | x ^= x >> 5; 24 | return x; 25 | } 26 | 27 | static const uint32_t rand_max = 1e3; 28 | 29 | typedef uint32_t elem_t; 30 | 31 | class Block 32 | { 33 | public: 34 | Block(); 35 | 36 | void fill(); 37 | 38 | void add(Block *a, Block *b); 39 | 40 | void mul(Block *a, Block *b, bool add = false); 41 | 42 | elem_t trace(); 43 | 44 | static elem_t dotpord(Block *a, Block *b); 45 | 46 | static const size_t size = 256; 47 | 48 | private: 49 | elem_t m_data[size]; 50 | }; 51 | 52 | Block::Block() 53 | { 54 | memset(m_data, 0, size); 55 | } 56 | 57 | void Block::fill() 58 | { 59 | for (size_t i = 0; i < size; i += 4) { 60 | m_data[i + 0] += xorshift_rand() % rand_max; 61 | m_data[i + 1] += xorshift_rand() % rand_max; 62 | m_data[i + 2] += xorshift_rand() % rand_max; 63 | m_data[i + 3] += xorshift_rand() % rand_max; 64 | } 65 | } 66 | 67 | void Block::add(Block *a, Block *b) 68 | { 69 | for (size_t i = 0; i < size; i += 4) { 70 | m_data[i + 0] = a->m_data[i + 0] + b->m_data[i + 0]; 71 | m_data[i + 1] = a->m_data[i + 1] + b->m_data[i + 1]; 72 | m_data[i + 2] = a->m_data[i + 2] + b->m_data[i + 2]; 73 | m_data[i + 3] = a->m_data[i + 3] + b->m_data[i + 3]; 74 | } 75 | } 76 | 77 | void Block::mul(Block *a, Block *b, bool add) 78 | { 79 | for (size_t j = 0; j < 16; j += 2) { 80 | elem_t *bp = &b->m_data[j]; 81 | for (size_t i = 0; i < 16; i += 2) { 82 | elem_t *ap = &a->m_data [i * 16]; 83 | elem_t *rp = &m_data[i * 16 + j]; 84 | 85 | elem_t s0_0 = ap[0] * bp[0]; 86 | elem_t s0_1 = ap[0] * bp[1]; 87 | elem_t s1_0 = ap[16] * bp[0]; 88 | elem_t s1_1 = ap[16] * bp[1]; 89 | s0_0 += ap[1] * bp[16]; 90 | s0_1 += ap[1] * bp[17]; 91 | s1_0 += ap[17] * bp[16]; 92 | s1_1 += ap[17] * bp[17]; 93 | s0_0 += ap[2] * bp[32]; 94 | s0_1 += ap[2] * bp[33]; 95 | s1_0 += ap[18] * bp[32]; 96 | s1_1 += ap[18] * bp[33]; 97 | s0_0 += ap[3] * bp[48]; 98 | s0_1 += ap[3] * bp[49]; 99 | s1_0 += ap[19] * bp[48]; 100 | s1_1 += ap[19] * bp[49]; 101 | s0_0 += ap[4] * bp[64]; 102 | s0_1 += ap[4] * bp[65]; 103 | s1_0 += ap[20] * bp[64]; 104 | s1_1 += ap[20] * bp[65]; 105 | s0_0 += ap[5] * bp[80]; 106 | s0_1 += ap[5] * bp[81]; 107 | s1_0 += ap[21] * bp[80]; 108 | s1_1 += ap[21] * bp[81]; 109 | s0_0 += ap[6] * bp[96]; 110 | s0_1 += ap[6] * bp[97]; 111 | s1_0 += ap[22] * bp[96]; 112 | s1_1 += ap[22] * bp[97]; 113 | s0_0 += ap[7] * bp[112]; 114 | s0_1 += ap[7] * bp[113]; 115 | s1_0 += ap[23] * bp[112]; 116 | s1_1 += ap[23] * bp[113]; 117 | s0_0 += ap[8] * bp[128]; 118 | s0_1 += ap[8] * bp[129]; 119 | s1_0 += ap[24] * bp[128]; 120 | s1_1 += ap[24] * bp[129]; 121 | s0_0 += ap[9] * bp[144]; 122 | s0_1 += ap[9] * bp[145]; 123 | s1_0 += ap[25] * bp[144]; 124 | s1_1 += ap[25] * bp[145]; 125 | s0_0 += ap[10] * bp[160]; 126 | s0_1 += ap[10] * bp[161]; 127 | s1_0 += ap[26] * bp[160]; 128 | s1_1 += ap[26] * bp[161]; 129 | s0_0 += ap[11] 
* bp[176]; 130 | s0_1 += ap[11] * bp[177]; 131 | s1_0 += ap[27] * bp[176]; 132 | s1_1 += ap[27] * bp[177]; 133 | s0_0 += ap[12] * bp[192]; 134 | s0_1 += ap[12] * bp[193]; 135 | s1_0 += ap[28] * bp[192]; 136 | s1_1 += ap[28] * bp[193]; 137 | s0_0 += ap[13] * bp[208]; 138 | s0_1 += ap[13] * bp[209]; 139 | s1_0 += ap[29] * bp[208]; 140 | s1_1 += ap[29] * bp[209]; 141 | s0_0 += ap[14] * bp[224]; 142 | s0_1 += ap[14] * bp[225]; 143 | s1_0 += ap[30] * bp[224]; 144 | s1_1 += ap[30] * bp[225]; 145 | s0_0 += ap[15] * bp[240]; 146 | s0_1 += ap[15] * bp[241]; 147 | s1_0 += ap[31] * bp[240]; 148 | s1_1 += ap[31] * bp[241]; 149 | 150 | if (add) { 151 | rp[0] += s0_0; 152 | rp[1] += s0_1; 153 | rp[16] += s1_0; 154 | rp[17] += s1_1; 155 | } else { 156 | rp[0] = s0_0; 157 | rp[1] = s0_1; 158 | rp[16] = s1_0; 159 | rp[17] = s1_1; 160 | } 161 | } 162 | } 163 | } 164 | 165 | elem_t Block::trace() 166 | { 167 | elem_t s = 0; 168 | for (size_t i = 0; i < 16; ++i) 169 | s += m_data[i * 16 + i]; 170 | return s; 171 | } 172 | 173 | // r = sum(transpose(A) o B) 174 | elem_t Block::dotpord(Block *a, Block *b) 175 | { 176 | elem_t s = 0; 177 | for (size_t i = 0; i < 16; ++i) 178 | for (size_t j = 0; j < 16; ++j) 179 | s += a->m_data[16*j + i] * b->m_data[16*i + j]; 180 | 181 | return s; 182 | } 183 | 184 | class OperationTask: public task 185 | { 186 | public: 187 | OperationTask( 188 | Block *A, 189 | Block *B, 190 | Block *R, 191 | size_t n, 192 | bool do_mul = true 193 | ); 194 | 195 | task *execute(); 196 | 197 | private: 198 | void add(); 199 | void mul(); 200 | 201 | bool do_mul; 202 | 203 | Block *A; 204 | Block *B; 205 | Block *R; 206 | 207 | size_t n; 208 | }; 209 | 210 | OperationTask::OperationTask( 211 | Block *A, 212 | Block *B, 213 | Block *R, 214 | size_t n, 215 | bool do_mul 216 | ) 217 | : do_mul(do_mul) 218 | , A(A) 219 | , B(B) 220 | , R(R) 221 | , n(n) 222 | { } 223 | 224 | void OperationTask::add() 225 | { 226 | if (n <= 1) { 227 | R->add(A, B); 228 | return; 229 | } 230 | 231 | auto q = n / 4; 232 | 233 | set_ref_count(5); 234 | 235 | spawn(*new(allocate_child()) OperationTask(A+0*q, B+0*q, R+0*q, q, false)); 236 | spawn(*new(allocate_child()) OperationTask(A+1*q, B+1*q, R+1*q, q, false)); 237 | spawn(*new(allocate_child()) OperationTask(A+2*q, B+2*q, R+2*q, q, false)); 238 | spawn(*new(allocate_child()) OperationTask(A+3*q, B+3*q, R+3*q, q, false)); 239 | 240 | wait_for_all(); 241 | } 242 | 243 | /* A x B = R = l + r 244 | * | 0 | 1 | | 0 | 1 | | 00+12 | 01+13 | | 00 | 01 | | 12 | 13 | 245 | * |---+---| x |---+---| = |-------+-------| = |----+----| + |----+----| 246 | * | 2 | 3 | | 2 | 3 | | 20+32 | 21+33 | | 20 | 21 | | 32 | 33 | 247 | */ 248 | void OperationTask::mul() { 249 | if (n <= 1) { 250 | R->mul(A, B); 251 | return; 252 | } 253 | 254 | auto q = n / 4; 255 | 256 | auto l = new Block[n]; 257 | 258 | set_ref_count(9); 259 | 260 | spawn(*new(allocate_child()) OperationTask(A+0*q, B+0*q, l+0*q, q)); 261 | spawn(*new(allocate_child()) OperationTask(A+0*q, B+1*q, l+1*q, q)); 262 | spawn(*new(allocate_child()) OperationTask(A+2*q, B+0*q, l+2*q, q)); 263 | spawn(*new(allocate_child()) OperationTask(A+2*q, B+1*q, l+3*q, q)); 264 | 265 | auto r = new Block[n]; 266 | 267 | spawn(*new(allocate_child()) OperationTask(A+1*q, B+2*q, r+0*q, q)); 268 | spawn(*new(allocate_child()) OperationTask(A+1*q, B+3*q, r+1*q, q)); 269 | spawn(*new(allocate_child()) OperationTask(A+3*q, B+2*q, r+2*q, q)); 270 | spawn(*new(allocate_child()) OperationTask(A+3*q, B+3*q, r+3*q, q)); 271 | 272 | wait_for_all(); 273 | 
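// both halves (l and r) of the block product are ready at this point;
// the recursive add() below reduces them into R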
274 | A = l; 275 | B = r; 276 | add(); 277 | 278 | delete []r; 279 | delete []l; 280 | } 281 | 282 | task *OperationTask::execute() 283 | { 284 | if (do_mul) 285 | mul(); 286 | else 287 | add(); 288 | 289 | return nullptr; 290 | } 291 | 292 | void fill(Block *A, size_t n) 293 | { 294 | for (size_t i = 0; i < n; ++i) 295 | A[i].fill(); 296 | } 297 | 298 | elem_t trace(Block *A, size_t n) 299 | { 300 | if (n <= 1) 301 | return A->trace(); 302 | 303 | auto q = n / 4; 304 | return trace(A, q) + trace(A + 3*q, q); 305 | } 306 | 307 | elem_t dotpord(Block *A, Block *B, size_t n) 308 | { 309 | if (n <= 1) 310 | return Block::dotpord(A, B); 311 | 312 | auto q = n / 4; 313 | elem_t s = 0; 314 | s += dotpord(A+0*q, B+0*q, q); 315 | s += dotpord(A+2*q, B+1*q, q); 316 | s += dotpord(A+1*q, B+2*q, q); 317 | s += dotpord(A+3*q, B+3*q, q); 318 | 319 | return s; 320 | } 321 | 322 | bool check(Block *A, Block *B, Block *C, size_t n) 323 | { 324 | return fabs(trace(C, n) - dotpord(A, B, n)) < 1e-3; 325 | } 326 | 327 | int main(int argc, char *argv[]) 328 | { 329 | size_t log_n = 4; 330 | size_t nthreads = 0; 331 | 332 | if (argc >= 2) 333 | nthreads = atoi(argv[1]); 334 | if (argc >= 3) 335 | log_n = atoi(argv[2]); 336 | if (nthreads == 0) 337 | nthreads = thread::hardware_concurrency(); 338 | 339 | auto n = 1 << log_n; 340 | 341 | cout << "Matrix dim: " << n * 16 << "\n"; 342 | auto nblocks = n * n; 343 | 344 | cout << "Data size: " << 3 * nblocks * sizeof(Block) / 1024 << "Kb\n"; 345 | 346 | auto A = new Block[nblocks]; 347 | auto B = new Block[nblocks]; 348 | auto R = new Block[nblocks]; 349 | 350 | fill(A, nblocks); 351 | fill(B, nblocks); 352 | 353 | auto start = system_clock::now(); 354 | 355 | task_scheduler_init scheduler(nthreads); 356 | 357 | auto root = new(task::allocate_root()) OperationTask(A, B, R, nblocks); 358 | 359 | task::spawn_root_and_wait(*root); 360 | 361 | scheduler.terminate(); 362 | 363 | auto stop = system_clock::now(); 364 | 365 | cout << "Scheduler: tbb\n"; 366 | cout << "Benchmark: blkmul\n"; 367 | cout << "Threads: " << nthreads << "\n"; 368 | cout << "Time(us): " << duration_cast(stop - start).count() << "\n"; 369 | cout << "Input: " << log_n << "\n"; 370 | cout << "Output: " << check(A, B, R, nblocks) << "\n"; 371 | 372 | delete []A; 373 | delete []B; 374 | delete []R; 375 | return 0; 376 | } 377 | 378 | --------------------------------------------------------------------------------