├── CODE_OF_CONDUCT.md ├── rseq ├── internal │ ├── NumCpus.cpp │ ├── Likely.h │ ├── SwitchToCpu.h │ ├── AsymmetricThreadFence.h │ ├── rseq_c_inlines.c │ ├── SwitchToCpuTest.cpp │ ├── OsMem.h │ ├── Dummy.cpp │ ├── Errors.cpp │ ├── NumCpus.h │ ├── SwitchToCpu.cpp │ ├── CleanUpOnThreadDeath.h │ ├── CpuLocalTest.cpp │ ├── Code.h │ ├── Mutex.cpp │ ├── rseq_c.h │ ├── rseq_c.cpp │ ├── CachelinePadded.h │ ├── Rseq.h │ ├── CpuLocal.h │ ├── CleanUpOnThreadDeath.cpp │ ├── OsMem.cpp │ ├── IntrusiveLinkedList.h │ ├── MutexTest.cpp │ ├── AsymmetricThreadFence.cpp │ ├── CachelinePaddedTest.cpp │ ├── IntrusiveLinkedListTest.cpp │ ├── IdAllocator.h │ ├── ThreadControl.h │ ├── AsymmetricThreadFenceTest.cpp │ ├── Errors.h │ ├── ErrorsTest.cpp │ ├── Mutex.h │ ├── CMakeLists.txt │ ├── OsMemTest.cpp │ ├── ThreadControlTest.cpp │ ├── IdAllocatorTest.cpp │ ├── CodeTest.cpp │ ├── CleanUpOnThreadDeathTest.cpp │ ├── Code.cpp │ ├── Rseq.cpp │ └── ThreadControl.cpp ├── CMakeLists.txt ├── RseqCTest.cpp ├── rseq_c.h ├── RseqTest.cpp └── Rseq.h ├── CMakeLists.txt ├── LICENSE ├── CONTRIBUTING.md ├── PATENTS ├── README.md ├── Rseq.md └── RseqBenchmark.cpp /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /rseq/internal/NumCpus.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/NumCpus.h" 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | namespace detail { 16 | mutex::OnceFlag numCpusOnceFlag; 17 | } // namespace detail 18 | 19 | } // namespace internal 20 | } // namespace rseq 21 | -------------------------------------------------------------------------------- /rseq/internal/Likely.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #if defined(__GNUC__) && __GNUC__ >= 4 13 | #define RSEQ_LIKELY(x) (__builtin_expect((x), 1)) 14 | #define RSEQ_UNLIKELY(x) (__builtin_expect((x), 0)) 15 | #else 16 | #define RSEQ_LIKELY(x) (x) 17 | #define RSEQ_UNLIKELY(x) (x) 18 | #endif 19 | -------------------------------------------------------------------------------- /rseq/internal/SwitchToCpu.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | // Switch to the given CPU. Throws a std::runtime_error if it couldn't do so 16 | // successfully. 17 | void switchToCpu(int cpu); 18 | 19 | } // namespace internal 20 | } // namespace rseq 21 | -------------------------------------------------------------------------------- /rseq/internal/AsymmetricThreadFence.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | inline void asymmetricThreadFenceLight() { 16 | asm volatile("" : : : "memory"); 17 | } 18 | 19 | // Throws std::runtime_error on failure. 20 | void asymmetricThreadFenceHeavy(); 21 | 22 | } // namespace internal 23 | } // namespace rseq 24 | -------------------------------------------------------------------------------- /rseq/internal/rseq_c_inlines.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/rseq_c.h" 11 | 12 | extern inline int rseq_begin(); 13 | extern inline int rseq_load(rseq_value_t *dst, rseq_repr_t *src); 14 | extern inline int rseq_store(rseq_repr_t *dst, rseq_value_t val); 15 | extern inline int rseq_store_fence(rseq_repr_t *dst, rseq_value_t val); 16 | extern inline int rseq_validate(); 17 | -------------------------------------------------------------------------------- /rseq/internal/SwitchToCpuTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/SwitchToCpu.h" 11 | 12 | #include 13 | 14 | #include 15 | 16 | #include "rseq/internal/NumCpus.h" 17 | 18 | using namespace rseq::internal; 19 | 20 | TEST(SwitchToCpu, SwitchesCpus) { 21 | for (int i = 0; i < numCpus(); ++i) { 22 | switchToCpu(i); 23 | EXPECT_EQ(i, sched_getcpu()); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /rseq/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 
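# Note: the rseq_gtest() helper invoked below is defined in the top-level
# CMakeLists.txt; it only builds and registers the *_runner test executables
# when the project is configured with -Dtest=ON.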
7 | 8 | add_subdirectory(internal) 9 | # internal/CMakeLists.txt populates the all_sources variable. 10 | add_library(rseq ${all_sources}) 11 | 12 | rseq_gtest( 13 | rseq_test 14 | RseqTest.cpp 15 | rseq 16 | cpu_local 17 | num_cpus 18 | switch_to_cpu 19 | ) 20 | 21 | rseq_gtest( 22 | rseq_c_test 23 | RseqCTest.cpp 24 | rseq 25 | cpu_local 26 | num_cpus 27 | switch_to_cpu 28 | ) 29 | 30 | install (TARGETS rseq DESTINATION lib) 31 | -------------------------------------------------------------------------------- /rseq/internal/OsMem.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | 14 | namespace rseq { 15 | namespace internal { 16 | namespace os_mem { 17 | 18 | // Allocation functions throw a std::runtime_exception on failure. 19 | void* allocate(std::size_t bytes); 20 | void* allocateExecutable(std::size_t bytes); 21 | void free(void* ptr, std::size_t bytes); 22 | 23 | 24 | } // namespace os_mem 25 | } // namespace internal 26 | } // namespace rseq 27 | -------------------------------------------------------------------------------- /rseq/internal/Dummy.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | // This file only exists because CMake will complain about libraries with no 11 | // .cpp files (i.e. header-only libraries). Explicitly listing such libraries in 12 | // a CMakeLists.txt file isn't strictly necessary (header-only libraries should 13 | // "just work"), but it helps make library inter-dependencies clear. 14 | 15 | namespace rseq { 16 | namespace internal { 17 | namespace dummy { 18 | inline void dummy() {} 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /rseq/internal/Errors.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/Errors.h" 11 | 12 | namespace rseq { 13 | namespace internal { 14 | namespace errors { 15 | 16 | static __thread FatalErrorHandler curHandler; 17 | 18 | void setFatalErrorHandler(FatalErrorHandler handler) { 19 | curHandler = handler; 20 | } 21 | 22 | FatalErrorHandler getFatalErrorHandler() { 23 | return curHandler; 24 | } 25 | 26 | void fatalError(const char* message) { 27 | curHandler(message); 28 | } 29 | 30 | } // namespace errors 31 | } // namespace internal 32 | } // namespace rseq 33 | -------------------------------------------------------------------------------- /rseq/internal/NumCpus.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | 14 | #include "rseq/internal/Mutex.h" 15 | 16 | namespace rseq { 17 | namespace internal { 18 | 19 | namespace detail { 20 | extern mutex::OnceFlag numCpusOnceFlag; 21 | } // namespace detail 22 | 23 | // std::thread::hardware_concurrency() is surprisingly slow. This just caches 24 | // the result. 25 | inline int numCpus() { 26 | static int result; 27 | mutex::callOnce(detail::numCpusOnceFlag, []() { 28 | result = sysconf(_SC_NPROCESSORS_ONLN); 29 | }); 30 | return result; 31 | } 32 | 33 | } // namespace internal 34 | } // namespace rseq 35 | -------------------------------------------------------------------------------- /rseq/internal/SwitchToCpu.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/SwitchToCpu.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | 20 | #include "rseq/internal/Errors.h" 21 | 22 | namespace rseq { 23 | namespace internal { 24 | 25 | void switchToCpu(int cpu) { 26 | pid_t tid = syscall(__NR_gettid); 27 | cpu_set_t set; 28 | CPU_ZERO(&set); 29 | CPU_SET(cpu, &set); 30 | int err = sched_setaffinity(tid, sizeof(cpu_set_t), &set); 31 | if (err != 0) { 32 | errors::fatalError("Couldn't switch cpus"); 33 | } 34 | } 35 | 36 | } // namespace internal 37 | } // namespace rseq 38 | -------------------------------------------------------------------------------- /rseq/internal/CleanUpOnThreadDeath.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #pragma once 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | // We want to centralize all the thread death logic for a couple reasons: 16 | // - The order of execution matters; we do rseq cleanup before thread control 17 | // cleanup. 18 | // - That's how jemalloc does it, so porting will be easier if we decide to. 19 | // - The logic is actually a little bit subtle (there are ODR issues involved). 20 | // We have to wrap up the calls behind a layer of indirection to avoid a 21 | // circular dependency. 22 | void setRseqCleanup(void (*)()); 23 | void setThreadControlCleanup(void (*)()); 24 | 25 | } // namespace internal 26 | } // namespace rseq 27 | -------------------------------------------------------------------------------- /rseq/internal/CpuLocalTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/CpuLocal.h" 11 | 12 | #include 13 | 14 | #include "rseq/internal/NumCpus.h" 15 | #include "rseq/internal/SwitchToCpu.h" 16 | 17 | using namespace rseq::internal; 18 | 19 | TEST(CpuLocal, DataIsPerCpu) { 20 | CpuLocal data; 21 | for (int i = 0; i < numCpus(); ++i) { 22 | switchToCpu(i); 23 | *data.forCpu(i) = i; 24 | } 25 | 26 | for (int i = 0; i < numCpus(); ++i) { 27 | switchToCpu(i); 28 | EXPECT_EQ(i, *data.forCpu(i)); 29 | } 30 | } 31 | 32 | TEST(CpuLocal, CanAccessAnotherCpusData) { 33 | CpuLocal data; 34 | switchToCpu(0); 35 | for (int i = 0; i < numCpus(); ++i) { 36 | *data.forCpu(i) = i; 37 | } 38 | for (int i = 0; i < numCpus(); ++i) { 39 | switchToCpu(i); 40 | EXPECT_EQ(i, *data.forCpu(i)); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /rseq/internal/Code.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | 15 | namespace rseq { 16 | namespace internal { 17 | 18 | class Code { 19 | public: 20 | // The rseq load and store functions return 1 if there was an interruption, 21 | // and 0 otherwise. 22 | typedef int (*RseqLoadFunc)(unsigned long* dst, unsigned long* src); 23 | typedef int (*RseqStoreFunc)(unsigned long* dst, unsigned long val); 24 | 25 | static Code* initForId(std::uint32_t id, std::atomic* threadCachedCpu); 26 | 27 | RseqLoadFunc rseqLoadFunc(); 28 | RseqStoreFunc rseqStoreFunc(); 29 | RseqStoreFunc rseqStoreFenceFunc(); 30 | 31 | void blockRseqOps(); 32 | void unblockRseqOps(); 33 | 34 | private: 35 | unsigned char code_[54]; // See Code.cpp to see where 54 comes from. 
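  //
  // Rough sketch of the mechanism (see Code.cpp, ThreadControl, and Rseq.md for
  // the authoritative details): each thread gets its own Code object in
  // executable memory, and code_ holds the generated instructions behind the
  // rseq*Func() pointers above. blockRseqOps() patches those instructions so
  // that the owning thread's in-flight rseq operations report failure;
  // unblockRseqOps() restores them.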
36 | }; 37 | 38 | } // namespace internal 39 | } // namespace rseq 40 | -------------------------------------------------------------------------------- /rseq/internal/Mutex.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/Mutex.h" 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | namespace rseq { 17 | namespace internal { 18 | namespace mutex { 19 | 20 | void Mutex::futexWait(std::uint32_t val) { 21 | // We ignore errors here; it just means we'll spin a little extra. 22 | syscall( 23 | __NR_futex, 24 | &state_, 25 | FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 26 | val, 27 | nullptr, 28 | nullptr, 29 | 0); 30 | } 31 | 32 | void Mutex::futexWake(int num) { 33 | // Ignore errors here, too; it probably means a destructor race. 34 | syscall( 35 | __NR_futex, 36 | &state_, 37 | FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 38 | num, 39 | nullptr, 40 | nullptr, 41 | 0); 42 | } 43 | 44 | } // namespace mutex 45 | } // namespace internal 46 | } // namespace rseq 47 | -------------------------------------------------------------------------------- /rseq/internal/rseq_c.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | /* We want this in C-land so that C users can get the fast inlined versions too. 17 | * It turns out to be slightly faster to have these return false on success and 18 | * true on failure, so we invert the result in the wrapper functions, hoping 19 | * that the compiler can use its visibility into them to avoid having to do its 20 | * own inversion. */ 21 | extern __thread int (*rseq_load_trampoline)( 22 | unsigned long* dst, unsigned long* src); 23 | extern __thread int (*rseq_store_trampoline)( 24 | unsigned long* dst, unsigned long val); 25 | extern __thread int (*rseq_store_fence_trampoline)( 26 | unsigned long* dst, unsigned long val); 27 | extern __thread volatile int rseq_thread_cached_cpu; 28 | 29 | int rseq_begin_slow_path(); 30 | 31 | #ifdef __cplusplus 32 | } /* extern "C" */ 33 | #endif 34 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 7 | 8 | cmake_minimum_required(VERSION 2.8) 9 | 10 | project("Userspace restartable sequences") 11 | 12 | option(test "Build all tests." 
OFF) 13 | if (test) 14 | find_package(GTest REQUIRED) 15 | include_directories(${GTEST_INCLUDE_DIRS}) 16 | 17 | enable_testing() 18 | endif () 19 | function(rseq_gtest name src_file) 20 | if (test) 21 | add_executable( 22 | "${name}_runner" 23 | ${src_file} 24 | ) 25 | target_link_libraries( 26 | "${name}_runner" 27 | ${GTEST_BOTH_LIBRARIES} 28 | ${ARGN} 29 | ) 30 | add_test( 31 | NAME ${name} 32 | COMMAND "${name}_runner" 33 | ) 34 | endif () 35 | endfunction(rseq_gtest) 36 | 37 | set (CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "-std=c++11 -pthread -fno-exceptions") 38 | 39 | include_directories(PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 40 | 41 | add_subdirectory(rseq) 42 | 43 | add_executable(rseq_benchmark RseqBenchmark.cpp) 44 | target_link_libraries(rseq_benchmark rseq) 45 | 46 | install(DIRECTORY rseq DESTINATION include FILES_MATCHING PATTERN "*.h") 47 | -------------------------------------------------------------------------------- /rseq/internal/rseq_c.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | #include "rseq/internal/Errors.h" 18 | #include "rseq/internal/Rseq.h" 19 | 20 | extern "C" { 21 | 22 | __thread int (*rseq_load_trampoline)(unsigned long* dst, unsigned long* src); 23 | __thread int (*rseq_store_trampoline)(unsigned long* dst, unsigned long val); 24 | __thread int (*rseq_store_fence_trampoline)( 25 | unsigned long* dst, unsigned long val); 26 | __thread volatile int rseq_thread_cached_cpu = -1; 27 | 28 | int rseq_begin_slow_path() { 29 | rseq::internal::errors::AbortOnError aoe; 30 | return rseq::internal::beginSlowPath(); 31 | } 32 | 33 | void rseq_end() { 34 | rseq::internal::errors::AbortOnError aoe; 35 | rseq::internal::end(); 36 | } 37 | 38 | void rseq_fence_with(int shard) { 39 | rseq::internal::errors::AbortOnError aoe; 40 | rseq::internal::fenceWith(shard); 41 | } 42 | 43 | void rseq_fence() { 44 | rseq::internal::errors::AbortOnError aoe; 45 | rseq::internal::fence(); 46 | } 47 | 48 | } /* extern "C" */ 49 | -------------------------------------------------------------------------------- /rseq/internal/CachelinePadded.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | constexpr int kCachelineSize = 64; 16 | 17 | template 18 | struct CachelinePaddedImpl; 19 | template 20 | struct CachelinePaddedImpl { 21 | T item; 22 | }; 23 | template 24 | struct CachelinePaddedImpl { 25 | T item; 26 | char padding[kCachelineSize - sizeof(T) % kCachelineSize]; 27 | }; 28 | 29 | template 30 | struct CachelinePadded { 31 | // Casting from the return value of get() back to a CachelinePadded is 32 | // guaranteed to work if T is standard-layout. 
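  // Illustrative sketch of that round trip (this is what CachelinePaddedTest
  // verifies):
  //   CachelinePadded<T> padded;
  //   T* p = padded.get();
  //   assert(reinterpret_cast<CachelinePadded<T>*>(p) == &padded);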
33 | T* get() { 34 | return &paddedItem.item; 35 | } 36 | 37 | // Note: can't be private; this struct must remain standard-layout to get the 38 | // guarantee that we can cast back and forth between the item and this struct 39 | // (in particular, we need this for Code objects). 40 | CachelinePaddedImpl paddedItem; 41 | }; 42 | 43 | } // namespace internal 44 | } // namespace rseq 45 | -------------------------------------------------------------------------------- /rseq/internal/Rseq.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | 14 | #include "rseq/internal/Errors.h" 15 | #include "rseq/internal/rseq_c.h" 16 | 17 | namespace rseq { 18 | namespace internal { 19 | 20 | // Internal equivalents of public functions. 21 | // We put the error wrappers in the header file so that no exception logic lives 22 | // in librseq. 23 | 24 | 25 | int beginSlowPath(); 26 | void end(); 27 | void fenceWith(int shard); 28 | void fence(); 29 | 30 | inline int beginSlowPathWrapper() { 31 | errors::ThrowOnError thrower; 32 | return beginSlowPath(); 33 | }; 34 | 35 | inline void endWrapper() { 36 | errors::ThrowOnError thrower; 37 | end(); 38 | } 39 | 40 | inline void fenceWithWrapper(int shard) { 41 | errors::ThrowOnError thrower; 42 | fenceWith(shard); 43 | } 44 | 45 | inline void fenceWrapper() { 46 | errors::ThrowOnError thrower; 47 | fence(); 48 | } 49 | 50 | inline std::atomic* threadCachedCpu() { 51 | return reinterpret_cast*>( 52 | const_cast(&rseq_thread_cached_cpu)); 53 | } 54 | 55 | } // namespace internal 56 | } // namespace rseq 57 | -------------------------------------------------------------------------------- /rseq/internal/CpuLocal.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | 14 | #include "rseq/internal/CachelinePadded.h" 15 | #include "rseq/internal/NumCpus.h" 16 | #include "rseq/internal/OsMem.h" 17 | 18 | 19 | namespace rseq { 20 | namespace internal { 21 | 22 | template 23 | class CpuLocal { 24 | public: 25 | CpuLocal() { 26 | void* mem = os_mem::allocate(sizeof(ElemType) * numCpus()); 27 | elements_ = static_cast(mem); 28 | for (int i = 0; i < numCpus(); ++i) { 29 | new (&elements_[i]) ElemType; 30 | } 31 | } 32 | 33 | ~CpuLocal() { 34 | for (int i = 0; i < numCpus(); ++i) { 35 | elements_[i].~ElemType(); 36 | } 37 | os_mem::free(elements_, sizeof(ElemType) * numCpus()); 38 | } 39 | 40 | T* forCpu(int i) { 41 | return elements_[i].get(); 42 | } 43 | 44 | private: 45 | // This saves us some typing, and is needed for explicit destructor invocation 46 | // (which doesn't parse with template types). 
47 | typedef CachelinePadded ElemType; 48 | ElemType* elements_; 49 | }; 50 | 51 | } // namespace internal 52 | } // namespace rseq 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For rseq software 4 | 5 | Copyright (c) 2016-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Rseq 2 | We want to make contributing to this project as easy and transparent as 3 | possible. We anticipate only bug-fixes, but functionality improvements might be 4 | possible as well. 5 | 6 | ## Our Development Process 7 | We intend github to be the source of truth for this project, with future 8 | development happening entirely "in the open". 9 | 10 | ## Pull Requests 11 | We actively welcome your pull requests. 12 | 13 | 1. Fork the repo and create your branch from `master`. 14 | 2. If you've added code that should be tested, add tests. 15 | 3. If you've changed APIs, update the documentation. 16 | 4. Ensure the test suite passes. 17 | 5. Make sure your code lints. 18 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 19 | 20 | ## Contributor License Agreement ("CLA") 21 | In order to accept your pull request, we need you to submit a CLA. You only need 22 | to do this once to work on any of Facebook's open source projects. 23 | 24 | Complete your CLA here: 25 | 26 | ## Issues 27 | We use GitHub issues to track public bugs. Please ensure your description is 28 | clear and has sufficient instructions to be able to reproduce the issue. 29 | 30 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 31 | disclosure of security bugs. 
In those cases, please go through the process 32 | outlined on that page and do not file a public issue. 33 | 34 | ## Coding Style 35 | * 2 spaces for indentation rather than tabs 36 | * 80 character line length 37 | * Broadly, we follow the Google C++ style guide. 38 | 39 | ## License 40 | By contributing to Rseq, you agree that your contributions will be licensed 41 | under the LICENSE file in the root directory of this source tree. 42 | -------------------------------------------------------------------------------- /rseq/RseqCTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/rseq_c.h" 11 | 12 | #include 13 | 14 | #include 15 | 16 | // We don't put this through the ringer the same way we do the C++ interface. So 17 | // long as it compiles and runs, we assume things are correct. 18 | TEST(RseqC, SanityChecks) { 19 | rseq_repr_t rseqItem; 20 | reinterpret_cast*>(&rseqItem)->store(1); 21 | rseq_value_t rseqValue; 22 | 23 | /* int cpu = */ rseq_begin(); 24 | 25 | // Starts at 1 26 | EXPECT_TRUE(rseq_load(&rseqValue, &rseqItem)); 27 | EXPECT_EQ(1, rseqValue); 28 | 29 | // Store 2, then load 30 | EXPECT_TRUE(rseq_store(&rseqItem, 2)); 31 | EXPECT_TRUE(rseq_load(&rseqValue, &rseqItem)); 32 | EXPECT_EQ(2, rseqValue); 33 | 34 | // Store-fence 3, then load 35 | EXPECT_TRUE(rseq_store_fence(&rseqItem, 3)); 36 | EXPECT_TRUE(rseq_load(&rseqValue, &rseqItem)); 37 | EXPECT_EQ(3, rseqValue); 38 | 39 | // Fence 40 | rseq_fence(); 41 | 42 | // Store should fail then. 43 | EXPECT_FALSE(rseq_store(&rseqItem, 4)); 44 | EXPECT_EQ( 45 | 3, reinterpret_cast*>(&rseqItem)->load()); 46 | 47 | // Start up again 48 | /* int cpu = */ rseq_begin(); 49 | 50 | // End 51 | rseq_end(); 52 | 53 | // Start up yet again. 54 | /* int cpu = */ rseq_begin(); 55 | 56 | // And things should work, even after ending. 57 | EXPECT_TRUE(rseq_store(&rseqItem, 5)); 58 | EXPECT_TRUE(rseq_load(&rseqValue, &rseqItem)); 59 | EXPECT_EQ(5, rseqValue); 60 | } 61 | -------------------------------------------------------------------------------- /rseq/internal/CleanUpOnThreadDeath.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | #include "rseq/internal/CleanUpOnThreadDeath.h" 10 | 11 | #include 12 | 13 | #include "rseq/internal/Mutex.h" 14 | #include "rseq/internal/Errors.h" 15 | 16 | namespace rseq { 17 | namespace internal { 18 | 19 | static __thread void (*cleanUpRseq)(); 20 | static __thread void (*cleanUpThreadControl)(); 21 | 22 | static __thread bool myDestructorScheduled; 23 | static pthread_key_t pthreadOnceKey; 24 | static mutex::OnceFlag destructorScheduledOnceFlag; 25 | 26 | static void destructor(void* /* ignored */) { 27 | // If someone does an rseq operation *within* a pthread destructor, we'll 28 | // re-initialize our data. 
29 | myDestructorScheduled = false; 30 | if (cleanUpRseq != nullptr) { 31 | cleanUpRseq(); 32 | } 33 | if (cleanUpThreadControl != nullptr) { 34 | cleanUpThreadControl(); 35 | } 36 | cleanUpRseq = nullptr; 37 | cleanUpThreadControl = nullptr; 38 | } 39 | 40 | static void ensureDestructorScheduled() { 41 | mutex::callOnce(destructorScheduledOnceFlag, []() { 42 | int err = pthread_key_create(&pthreadOnceKey, &destructor); 43 | if (err != 0) { 44 | errors::fatalError("Couldn't schedule thread death destructor"); 45 | } 46 | }); 47 | if (!myDestructorScheduled) { 48 | // Exists purely to schedule the destructor. 49 | pthread_setspecific(pthreadOnceKey, reinterpret_cast(1)); 50 | } 51 | } 52 | 53 | void setRseqCleanup(void (*func)()) { 54 | cleanUpRseq = func; 55 | ensureDestructorScheduled(); 56 | } 57 | 58 | void setThreadControlCleanup(void (*func)()) { 59 | cleanUpThreadControl = func; 60 | ensureDestructorScheduled(); 61 | } 62 | 63 | } // namespace internal 64 | } // namespace rseq 65 | -------------------------------------------------------------------------------- /PATENTS: -------------------------------------------------------------------------------- 1 | Additional Grant of Patent Rights Version 2 2 | 3 | "Software" means the rseq software distributed by Facebook, Inc. 4 | 5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software 6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable 7 | (subject to the termination provision below) license under any Necessary 8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise 9 | transfer the Software. For avoidance of doubt, no license is granted under 10 | Facebook’s rights in any patent claims that are infringed by (i) modifications 11 | to the Software made by you or any third party or (ii) the Software in 12 | combination with any software or other technology. 13 | 14 | The license granted hereunder will terminate, automatically and without notice, 15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate 16 | directly or indirectly, or take a direct financial interest in, any Patent 17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate 18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or 19 | in part from any software, technology, product or service of Facebook or any of 20 | its subsidiaries or corporate affiliates, or (iii) against any party relating 21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its 22 | subsidiaries or corporate affiliates files a lawsuit alleging patent 23 | infringement against you in the first instance, and you respond by filing a 24 | patent infringement counterclaim in that lawsuit against that party that is 25 | unrelated to the Software, the license granted hereunder will not terminate 26 | under section (i) of this paragraph due to such counterclaim. 27 | 28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is 29 | necessarily infringed by the Software standing alone. 30 | 31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, 32 | or contributory infringement or inducement to infringe any patent, including a 33 | cross-claim or counterclaim. 34 | -------------------------------------------------------------------------------- /rseq/internal/OsMem.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 
3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | #include "rseq/internal/Errors.h" 16 | 17 | namespace rseq { 18 | namespace internal { 19 | namespace os_mem { 20 | 21 | static void* mmapWithPermissions(std::size_t bytes, int prot) { 22 | // If we die in this method, it'd be helpful to know the arguments; make sure 23 | // they're available in the debugger. 24 | volatile int bytesCopy = bytes; 25 | volatile int protCopy = prot; 26 | 27 | void* alloc = mmap( 28 | nullptr, 29 | bytes, 30 | prot, 31 | MAP_PRIVATE | MAP_ANONYMOUS, 32 | -1, 33 | 0); 34 | if (alloc == MAP_FAILED) { 35 | errors::fatalError("mmap failed."); 36 | } 37 | return alloc; 38 | } 39 | 40 | void* allocate(std::size_t bytes) { 41 | return mmapWithPermissions(bytes, PROT_READ | PROT_WRITE); 42 | } 43 | 44 | void* allocateExecutable(std::size_t bytes) { 45 | return mmapWithPermissions(bytes, PROT_READ | PROT_WRITE | PROT_EXEC); 46 | } 47 | 48 | void free(void* ptr, std::size_t bytes) { 49 | // Note that we may throw, even though this is on a deallocation path. So if 50 | // we get called with an invalid argument during exception unwinding, we'll 51 | // crash the process. This is an acceptable penalty for passing invalid 52 | // pointers to your memory allocator. 53 | const int kPageSize = 4096; 54 | std::uintptr_t ptrInt = reinterpret_cast(ptr); 55 | if (ptrInt & (kPageSize - 1)) { 56 | errors::fatalError("Improperly aligned pointer"); 57 | } 58 | int err = munmap(ptr, bytes); 59 | if (err != 0) { 60 | errors::fatalError("munmap failed"); 61 | } 62 | } 63 | 64 | } // namespace os_mem 65 | } // namespace internal 66 | } // namespace rseq 67 | -------------------------------------------------------------------------------- /rseq/internal/IntrusiveLinkedList.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | // Intrusive linked list, using the CRTP. Does not take ownership of its 16 | // elements. 17 | 18 | // Supports the bare minimum interface necessary for its only use, in 19 | // ThreadControl. 
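//
// Minimal usage sketch (the node type here is hypothetical; it mirrors the
// unit test in IntrusiveLinkedListTest.cpp):
//
//   struct Node : IntrusiveLinkedListNode<Node> { int data; };
//
//   IntrusiveLinkedList<Node> list;
//   Node n;
//   list.link(&n);                    // the list does not own n
//   for (Node& item : list) { /* visit item */ }
//   list.unlink(&n);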
20 | 21 | template 22 | class IntrusiveLinkedList; 23 | 24 | template 25 | class IntrusiveLinkedListNode { 26 | private: 27 | friend class IntrusiveLinkedList; 28 | 29 | IntrusiveLinkedListNode* next; 30 | IntrusiveLinkedListNode* prev; 31 | }; 32 | 33 | template 34 | class IntrusiveLinkedList { 35 | public: 36 | IntrusiveLinkedList() { 37 | dummyHead_.next = &dummyTail_; 38 | dummyTail_.prev = &dummyHead_; 39 | } 40 | 41 | void link(IntrusiveLinkedListNode* node) { 42 | node->next = &dummyTail_; 43 | node->prev = dummyTail_.prev; 44 | 45 | node->next->prev = node; 46 | node->prev->next = node; 47 | } 48 | 49 | void unlink(IntrusiveLinkedListNode* node) { 50 | node->next->prev = node->prev; 51 | node->prev->next = node->next; 52 | } 53 | 54 | 55 | 56 | // We don't need real iterator support, just enough for a range-based for 57 | // loop. 58 | struct Iterator { 59 | IntrusiveLinkedListNode* item; 60 | void operator++() { 61 | item = item->next; 62 | } 63 | T& operator*() { 64 | return *static_cast(item); 65 | } 66 | bool operator!=(const Iterator& other) { 67 | return item != other.item; 68 | } 69 | }; 70 | Iterator begin() { 71 | return { dummyHead_.next }; 72 | } 73 | Iterator end() { 74 | return { &dummyTail_ }; 75 | } 76 | 77 | IntrusiveLinkedListNode dummyHead_; 78 | IntrusiveLinkedListNode dummyTail_; 79 | }; 80 | 81 | } // namespace internal 82 | } // namespace rseq 83 | -------------------------------------------------------------------------------- /rseq/internal/MutexTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/Mutex.h" 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | using namespace rseq::internal::mutex; 18 | 19 | TEST(Mutex, ProvidesExclusion) { 20 | const int kNumThreads = 10; 21 | const int kIncrementsPerThread = 1000000; 22 | 23 | Mutex mu; 24 | mu.init(); 25 | int x = 0; 26 | int y = 0; 27 | std::vector threads(kNumThreads); 28 | for (int i = 0; i < kNumThreads; ++i) { 29 | threads[i] = std::thread([&]() { 30 | for (int j = 0; j < kIncrementsPerThread; ++j) { 31 | LockGuard lg(mu); 32 | EXPECT_TRUE(x == y); 33 | ++x; 34 | ++y; 35 | } 36 | }); 37 | } 38 | for (int i = 0; i < kNumThreads; ++i) { 39 | threads[i].join(); 40 | } 41 | EXPECT_TRUE(x == y); 42 | EXPECT_EQ(kNumThreads * kIncrementsPerThread, x); 43 | } 44 | 45 | TEST(CallOnce, SimpleCase) { 46 | int x = 0; 47 | OnceFlag once; 48 | once.init(); 49 | callOnce(once, [&]() { 50 | ++x; 51 | }); 52 | callOnce(once, [&]() { 53 | ++x; 54 | }); 55 | EXPECT_EQ(1, x); 56 | } 57 | 58 | TEST(CallOnce, Racy) { 59 | const int kNumTrials = 10000; 60 | const int kNumThreads = 10; 61 | for (int i = 0; i < kNumTrials; ++i) { 62 | std::vector threads(kNumThreads); 63 | std::atomic ready(false); 64 | int x = 0; 65 | OnceFlag once; 66 | once.init(); 67 | for (int j = 0; j < kNumThreads; ++j) { 68 | threads[j] = std::thread([&]() { 69 | while (!ready.load()) { 70 | // Spin until all threads have a chance to win the race. 
71 | } 72 | callOnce(once, [&]() { 73 | ++x; 74 | }); 75 | EXPECT_EQ(1, x); 76 | }); 77 | } 78 | ready.store(true); 79 | for (int j = 0; j < kNumThreads; ++j) { 80 | threads[j].join(); 81 | } 82 | EXPECT_EQ(1, x); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /rseq/rseq_c.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include "rseq/internal/Likely.h" 13 | #include "rseq/internal/rseq_c.h" 14 | 15 | #ifdef __cplusplus 16 | extern "C" { 17 | #endif 18 | 19 | /* A 64-bit type; this is what "inhabits" rseq slots. */ 20 | typedef unsigned long rseq_value_t; 21 | 22 | /* Rseq slots to which you can do rseq-protected loads and stores. */ 23 | typedef struct { 24 | /* This union tricks gcc into not complaining about the strict-aliasing 25 | * violation here. I don't think it actually fixes the undefined behavior, but 26 | * as a practical matter the utility of being able to cast to atomic types is 27 | * more important. */ 28 | union { 29 | volatile rseq_value_t item; 30 | volatile char aliasing_goo[sizeof(rseq_value_t)]; 31 | }; 32 | } rseq_repr_t; 33 | 34 | 35 | inline int rseq_begin() { 36 | int ret = rseq_thread_cached_cpu; 37 | if (RSEQ_UNLIKELY(ret < 0)) { 38 | ret = rseq_begin_slow_path(); 39 | } 40 | /* Good enough for an acquire barrier on x86. */ 41 | __asm__ volatile("" : : : "memory"); 42 | return ret; 43 | } 44 | 45 | inline int rseq_load(rseq_value_t *dst, rseq_repr_t *src) { 46 | /* Note: this goes through dynamically generated code, which will prevent 47 | compiler reordering. */ 48 | return RSEQ_LIKELY(!rseq_load_trampoline(dst, (unsigned long*)src)); 49 | } 50 | 51 | inline int rseq_store(rseq_repr_t *dst, rseq_value_t val) { 52 | /* Same here. */ 53 | return RSEQ_LIKELY(!rseq_store_trampoline((unsigned long*)dst, val)); 54 | } 55 | 56 | inline int rseq_store_fence(rseq_repr_t *dst, rseq_value_t val) { 57 | /* And here. */ 58 | return RSEQ_LIKELY(!rseq_store_fence_trampoline((unsigned long*)dst, val)); 59 | } 60 | 61 | inline int rseq_validate() { 62 | rseq_repr_t dummy; 63 | return rseq_store(&dummy, 0); 64 | } 65 | 66 | void rseq_end(); 67 | void rseq_fence_with(int shard); 68 | void rseq_fence(); 69 | 70 | 71 | #ifdef __cplusplus 72 | } /* extern "C" */ 73 | #endif 74 | -------------------------------------------------------------------------------- /rseq/internal/AsymmetricThreadFence.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/AsymmetricThreadFence.h" 11 | 12 | #include 13 | 14 | #include 15 | 16 | #include "rseq/internal/Errors.h" 17 | #include "rseq/internal/Mutex.h" 18 | 19 | namespace rseq { 20 | namespace internal { 21 | 22 | // TODO: There's a lot we can do to speed this up if we like, with varying 23 | // time/space tradeoffs: 24 | // - Can make this lock-free, or allocate from a pool of threads. 25 | // - Give each thread its own page for mprotect operations. This lets us cycle 26 | // it through: "R/W/X" -> "R/W" -> "R" -> "None", doing 4 mprotects for 3 27 | // heavy fences (instead of the 6 mprotects we need for the current 28 | // mechanism). 29 | // - Give each thread a range of pages to operate on; allocate N pages instead 30 | // of 1. Permissions are lowered one at a time, and raised in batch after 31 | // we've exhausted all of the lowerings we can. 32 | // - We can delegate the permission raising to a helper thread. Wouldn't save 33 | // much time, but could save pages. 34 | 35 | static mutex::Mutex mu; 36 | void asymmetricThreadFenceHeavy() { 37 | static char page[8192]; 38 | 39 | std::uintptr_t pageInt = reinterpret_cast(page); 40 | std::uintptr_t alignedInt = (pageInt + 4096 - 1) & ~(4096 - 1); 41 | char* aligned = reinterpret_cast(alignedInt); 42 | 43 | mutex::LockGuard lg(mu); 44 | 45 | // Make this volatile so that we know the debugger can see it if we die (for 46 | // simplicity, we don't include it in the error message) 47 | volatile int err = mprotect(aligned, 4096, PROT_READ | PROT_WRITE); 48 | if (err) { 49 | errors::fatalError( 50 | "First mprotect in asymmetricThreadFenceHeavy failed.\n"); 51 | } 52 | 53 | // Page must be dirty to trigger the IPI. 54 | *static_cast(aligned) = 0; 55 | 56 | err = mprotect(aligned, 4096, PROT_READ); 57 | if (err) { 58 | errors::fatalError( 59 | "Second mprotect in asymmetricThreadFenceHeavy failed.\n"); 60 | } 61 | } 62 | 63 | } // namespace internal 64 | } // namespace rseq 65 | -------------------------------------------------------------------------------- /rseq/internal/CachelinePaddedTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/CachelinePadded.h" 11 | 12 | #include 13 | 14 | using namespace rseq::internal; 15 | 16 | template 17 | struct SizedData { 18 | SizedData() { 19 | for (unsigned i = 0; i < dataSize; ++i) { 20 | data[i] = i; 21 | } 22 | } 23 | 24 | void doModifications() { 25 | for (unsigned i = 0; i < dataSize; ++i) { 26 | EXPECT_EQ(static_cast(i), data[i]); 27 | ++data[i]; 28 | } 29 | } 30 | 31 | ~SizedData() { 32 | for (unsigned i = 0; i < dataSize; ++i) { 33 | EXPECT_EQ(static_cast(i + 1), data[i]); 34 | } 35 | } 36 | 37 | unsigned char data[dataSize]; 38 | }; 39 | 40 | using ExactlyCachelineSized = SizedData; 41 | using DoubleCachelineSized = SizedData<2 * kCachelineSize>; 42 | using BelowCachelineSized = SizedData; 43 | using AboveCachelineSized = SizedData; 44 | 45 | TEST(CachelinePadded, Exact) { 46 | EXPECT_EQ(kCachelineSize, sizeof(CachelinePadded)); 47 | CachelinePadded item; 48 | item.get()->doModifications(); 49 | EXPECT_TRUE(reinterpret_cast*>( 50 | item.get()) == &item); 51 | } 52 | 53 | TEST(CachelinePadded, Double) { 54 | EXPECT_EQ(2 * kCachelineSize, sizeof(CachelinePadded)); 55 | CachelinePadded item; 56 | item.get()->doModifications(); 57 | EXPECT_TRUE(reinterpret_cast*>( 58 | item.get()) == &item); 59 | } 60 | 61 | TEST(CachelinePadded, Below) { 62 | EXPECT_EQ(kCachelineSize, sizeof(CachelinePadded)); 63 | CachelinePadded item; 64 | item.get()->doModifications(); 65 | EXPECT_TRUE(reinterpret_cast*>( 66 | item.get()) == &item); 67 | } 68 | 69 | TEST(CachelinePadded, Above) { 70 | EXPECT_EQ(2 * kCachelineSize, sizeof(CachelinePadded)); 71 | CachelinePadded item; 72 | item.get()->doModifications(); 73 | EXPECT_TRUE(reinterpret_cast*>( 74 | item.get()) == &item); 75 | } 76 | -------------------------------------------------------------------------------- /rseq/internal/IntrusiveLinkedListTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/IntrusiveLinkedList.h" 11 | 12 | #include 13 | 14 | using namespace rseq::internal; 15 | 16 | struct LLInt : IntrusiveLinkedListNode { 17 | unsigned data; 18 | }; 19 | 20 | struct DiesNoisily : IntrusiveLinkedListNode { 21 | DiesNoisily() : noisy(true) {} 22 | ~DiesNoisily() { 23 | EXPECT_FALSE(noisy); 24 | } 25 | 26 | bool noisy; 27 | }; 28 | 29 | TEST(IntrusiveLinkedList, ConstructsEmpty) { 30 | IntrusiveLinkedList list; 31 | int count = 0; 32 | for (auto& item : list) { 33 | ++count; 34 | } 35 | EXPECT_EQ(0, count); 36 | } 37 | 38 | TEST(IntrusiveLinkedList, DoesListOperations) { 39 | const int kNumItems = 10; 40 | const unsigned kItemSetMask = ((1 << kNumItems) - 1); 41 | 42 | LLInt itemsArr[kNumItems]; 43 | for (int i = 0; i < kNumItems; ++i) { 44 | itemsArr[i].data = (1 << i); 45 | } 46 | 47 | // Add all the even indices 48 | IntrusiveLinkedList itemsList; 49 | for (int i = 0; i < kNumItems; ++i) { 50 | if (i % 2 == 0) { 51 | itemsList.link(&itemsArr[i]); 52 | } 53 | } 54 | 55 | // Make sure only the even bit positions are set. 
56 | unsigned itemSet = 0; 57 | for (auto& item : itemsList) { 58 | itemSet |= item.data; 59 | } 60 | EXPECT_EQ(0x55555555U & kItemSetMask, itemSet); 61 | 62 | // Add the odds, too 63 | for (int i = 0; i < kNumItems; ++i) { 64 | if (i % 2 == 1) { 65 | itemsList.link(&itemsArr[i]); 66 | } 67 | } 68 | 69 | // Make sure *all* bits are set. 70 | itemSet = 0; 71 | for (auto& item : itemsList) { 72 | itemSet |= item.data; 73 | } 74 | EXPECT_EQ(kItemSetMask, itemSet); 75 | 76 | // Remove the items divisible by 4 77 | for (int i = 0; i < kNumItems; ++i) { 78 | if (i % 4 == 0) { 79 | itemsList.unlink(&itemsArr[i]); 80 | } 81 | } 82 | 83 | // Make sure every fourth bit is unset. 84 | itemSet = 0; 85 | for (auto& item : itemsList) { 86 | itemSet |= item.data; 87 | } 88 | EXPECT_EQ(0xEEEEEEEEU & kItemSetMask, itemSet); 89 | } 90 | 91 | TEST(IntrusiveLinkedList, DoesNotTakeOwnership) { 92 | DiesNoisily item; 93 | { 94 | IntrusiveLinkedList list; 95 | list.link(&item); 96 | // Destructor runs here. 97 | } 98 | item.noisy = false; 99 | } 100 | -------------------------------------------------------------------------------- /rseq/internal/IdAllocator.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | 15 | #include "rseq/internal/Mutex.h" 16 | #include "rseq/internal/OsMem.h" 17 | 18 | namespace rseq { 19 | namespace internal { 20 | 21 | // This can block when acquiring or releasing an Id, but doing an Id->owner 22 | // lookup is lock-free and fast. 23 | // This never returns an Id of 0; you can use that as "null". 24 | // This is guaranteed to return either an Id that has been acquired and then 25 | // released, or, if no such Id exists, the smallest positive uint32_t that has 26 | // not already been allocated. 27 | // TODO: if we ever start using this more than in a handful of places, we should 28 | // type-erase everything; the identity of T doesn't matter. 29 | template 30 | class IdAllocator { 31 | public: 32 | // maxElements should include the null element. If you need 10 items + null, 33 | // maxElement should be 11. 34 | explicit IdAllocator(std::uint32_t maxElements) : maxElements_(maxElements) { 35 | mu_.init(); 36 | 37 | freeListHead_ = 0; 38 | // We never allocate id 0, so we can use it as null in the linked list of 39 | // free ids. 
40 | firstUntouchedId_ = 1; 41 | 42 | void* mem = os_mem::allocate(maxElements_ * sizeof(FreeNodeOrItem)); 43 | items_ = static_cast(mem); 44 | } 45 | 46 | ~IdAllocator() { 47 | os_mem::free(items_, maxElements_ * sizeof(FreeNodeOrItem)); 48 | } 49 | 50 | std::uint32_t allocate(T* owner) { 51 | mutex::LockGuard lg(mu_); 52 | 53 | std::uint32_t result; 54 | if (freeListHead_ != 0) { 55 | result = freeListHead_; 56 | freeListHead_ = items_[freeListHead_].next; 57 | } else { 58 | result = firstUntouchedId_++; 59 | } 60 | items_[result].owner = owner; 61 | return result; 62 | } 63 | 64 | void free(std::uint32_t id) { 65 | mutex::LockGuard lg(mu_); 66 | items_[id].next = freeListHead_; 67 | freeListHead_ = id; 68 | } 69 | 70 | T* lookupOwner(std::uint32_t id) { 71 | return items_[id].owner; 72 | } 73 | 74 | private: 75 | union FreeNodeOrItem { 76 | std::uint32_t next; 77 | T* owner; 78 | }; 79 | 80 | mutex::Mutex mu_; 81 | FreeNodeOrItem* items_; 82 | std::uint32_t freeListHead_; 83 | std::uint32_t firstUntouchedId_; 84 | std::uint32_t maxElements_; 85 | }; 86 | 87 | } // namespace internal 88 | } // namespace rseq 89 | -------------------------------------------------------------------------------- /rseq/internal/ThreadControl.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | 15 | #include "rseq/internal/IntrusiveLinkedList.h" 16 | 17 | namespace rseq { 18 | namespace internal { 19 | 20 | class Code; 21 | 22 | class ThreadControl : public IntrusiveLinkedListNode { 23 | public: 24 | // Get the calling thread's ThreadControl. 25 | static ThreadControl* get(std::atomic* threadCachedCpu); 26 | 27 | // Get the ThreadControl with the given id 28 | static ThreadControl* forId(std::uint32_t id); 29 | 30 | // Each living thread has a distinct id. 31 | std::uint32_t id() { 32 | return id_; 33 | } 34 | 35 | Code* code() { 36 | return code_; 37 | } 38 | 39 | // Block or unblock this thread's rseq operations. 40 | // This doesn't do any memory model trickery; it's up to callers to ensure 41 | // that this method's actions are visible to the victim before knowing that 42 | // no more rseq operations will happen / will succeed. 43 | void blockRseqOps(); 44 | void unblockRseqOps(); 45 | 46 | // Try to get the associated thread's current CPU (if its running), or else 47 | // the next CPU it will run on. May fail and return -1. 48 | // Memory ordering is tricky here. Everything is best effort, with the 49 | // exception of one memory ordering guarantee: a thread that observes itself 50 | // to be running on cpu N, and subsequently observes another thread to be 51 | // running on cpu N using curCpu, then the effect is that of an 52 | // asymmetricThreadFenceHeavy() that pairs only with an 53 | // asymmetricThreadFenceLight() in the other thread. 54 | int curCpu(); 55 | 56 | // A ThreadControl object remains valid (and the corresponding thread alive) 57 | // whenever some other thread's accessing field contains its id, and when the 58 | // store happens-before the execution of die() below (which is executed when 59 | // the owning thread terminates). 
60 | std::atomic<std::uint32_t>* accessing() { 61 | return &accessing_; 62 | } 63 | 64 | private: 65 | // We don't want users making their own ThreadControls; the semantics and 66 | // cleanup code require each thread to have at most one ThreadControl. 67 | explicit ThreadControl(std::atomic<int>* threadCachedCpu); 68 | ~ThreadControl(); 69 | 70 | Code* code_; 71 | int tid_; 72 | std::uint32_t id_; 73 | std::atomic<int>* threadCachedCpu_; 74 | std::atomic<std::uint32_t> accessing_; 75 | 76 | ThreadControl* next_; 77 | ThreadControl* prev_; 78 | }; 79 | 80 | } // namespace internal 81 | } // namespace rseq 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rseq 2 | Rseq is a userspace take on the proposed kernel restartable sequences API, and 3 | provides a mechanism to perform efficient per-cpu operations. 4 | 5 | This isn't intended to be a long-running project. Instead, its goal is to allow 6 | userland experiments with rseq (without having to recompile the kernel), to 7 | collect data as to how useful it would be. 8 | 9 | ## Example 10 | Here is a simple demonstration of how to use rseq and why it might be useful. A 11 | more thorough explanation (together with history and some implementation notes) 12 | can be found in `Rseq.md`. 13 | 14 | rseq::Value<int> counterByCpu[kNumCpus]; 15 | void bumpCounter() { 16 | while (true) { 17 | int cpu = rseq::begin(); 18 | // A plain atomic load. rseq::Value types are API-compatible with 19 | // std::atomic. 20 | int curValue = counterByCpu[cpu].load(); 21 | // rseq::store takes no action and returns false if another thread might 22 | // have run on the same CPU after the call to rseq::begin(). Otherwise, the 23 | // store happens and the call returns true. 24 | bool success = rseq::store(&counterByCpu[cpu], curValue + 1); 25 | if (success) { 26 | break; 27 | } 28 | } 29 | } 30 | 31 | ## Requirements 32 | Rseq only works on Linux and x86-64. Building requires CMake and a recent 33 | version of clang or g++. Building the tests requires a gtest installation. 34 | 35 | ## Building Rseq 36 | In this directory, run: 37 | 38 | mkdir build 39 | cd build 40 | # Include the former option to produce an optimized build, and the latter to 41 | # enable running tests. 42 | cmake [-DCMAKE_BUILD_TYPE=Release] [-Dtest=ON] [-DCMAKE_INSTALL_PREFIX=<prefix>] ../ 43 | make 44 | 45 | # Now we can take some of our binaries for a test drive 46 | 47 | # If you passed -Dtest=ON above, this will run all tests. 48 | make test 49 | # Run a benchmark of a variety of mechanisms for incrementing a set of 50 | # counters. 51 | ./rseq_benchmark all 8 10000000 52 | 53 | ## Installing Rseq 54 | For the common case, you probably want: 55 | 56 | mkdir build && cd build 57 | cmake -DCMAKE_BUILD_TYPE=Release ../ 58 | sudo make install 59 | 60 | You can then compile programs that `#include "rseq/Rseq.h"` with 61 | `g++ myProgram.cpp -lrseq`. 62 | 63 | 64 | ## How Rseq works 65 | See `Rseq.md` for a more thorough description. Essentially, each thread gets its 66 | own copy of the code that does an rseq operation. When one thread wants to evict 67 | another from ownership of a CPU, it patches that thread's copy of the function 68 | to jump to a failure path instead of doing the operation. 69 | 70 | ## Full documentation 71 | [`Rseq.md`](Rseq.md) contains a more thorough description. Reading the comments in 72 | [`rseq/Rseq.h`](rseq/Rseq.h) should give a working understanding of the API.
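## Reading the per-cpu counters
A hedged sketch of the read side of the counter example above (it reuses `counterByCpu` and `kNumCpus` from that example and is not part of the library itself). Because `rseq::Value` is API-compatible with `std::atomic`, plain loads are enough to compute an approximate total:

    // Sum the per-CPU counters. The result is approximate, since other
    // threads may bump counters concurrently while we read.
    int readCounterTotal() {
      int total = 0;
      for (int cpu = 0; cpu < kNumCpus; ++cpu) {
        total += counterByCpu[cpu].load();
      }
      return total;
    }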
73 | 74 | ## License 75 | Rseq is BSD-licensed. We also provide an additional patent grant. 76 | -------------------------------------------------------------------------------- /rseq/internal/AsymmetricThreadFenceTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/AsymmetricThreadFence.h" 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | #include "rseq/internal/NumCpus.h" 19 | 20 | using namespace rseq::internal; 21 | 22 | class BiasedLock { 23 | public: 24 | BiasedLock() 25 | : fastTurn(true), 26 | fastInterested(false), 27 | slowInterested(false), 28 | slowMu(false) { 29 | } 30 | 31 | void lockFast() { 32 | fastInterested.store(true, std::memory_order_relaxed); 33 | fastTurn.store(true, std::memory_order_release); 34 | asymmetricThreadFenceLight(); 35 | while (slowInterested.load() && fastTurn.load()) { 36 | } 37 | } 38 | 39 | void unlockFast() { 40 | fastInterested.store(false, std::memory_order_release); 41 | } 42 | 43 | void lockSlow() { 44 | bool expected; 45 | do { 46 | expected = false; 47 | } while (!slowMu.compare_exchange_weak(expected, true)); 48 | slowInterested.store(true, std::memory_order_relaxed); 49 | fastTurn.store(false, std::memory_order_release); 50 | asymmetricThreadFenceHeavy(); 51 | while(fastInterested.load() && !fastTurn.load()) { 52 | } 53 | } 54 | 55 | void unlockSlow() { 56 | slowInterested.store(false, std::memory_order_release); 57 | slowMu.store(false, std::memory_order_release); 58 | } 59 | 60 | private: 61 | std::atomic fastTurn; 62 | std::atomic fastInterested; 63 | std::atomic slowInterested; 64 | std::atomic slowMu; 65 | }; 66 | 67 | TEST(AsymmetricThreadFence, BiasedLocking) { 68 | const std::uint64_t kFastIters = 3000000; 69 | const std::uint64_t kSlowIters = 10000; 70 | 71 | BiasedLock lock; 72 | std::uint64_t counter = 0; 73 | 74 | int numSlowThreads = numCpus() - 1; 75 | 76 | std::thread fastThread; 77 | std::vector slowThreads(numSlowThreads); 78 | 79 | // Start the slow threads incrementing the counter 80 | for (int i = 0; i < numSlowThreads; ++i) { 81 | slowThreads[i] = std::thread([&]() { 82 | for (int j = 0; j < kSlowIters; ++j) { 83 | lock.lockSlow(); 84 | ++counter; 85 | lock.unlockSlow(); 86 | } 87 | }); 88 | } 89 | // Start the fast thread incrementing the counter 90 | fastThread = std::thread([&]() { 91 | for (int j = 0; j < kFastIters; ++j) { 92 | lock.lockFast(); 93 | ++counter; 94 | lock.unlockFast(); 95 | } 96 | }); 97 | 98 | // Wait for the threads to finish. 99 | fastThread.join(); 100 | for (int i = 0; i < numSlowThreads; ++i) { 101 | slowThreads[i].join(); 102 | } 103 | EXPECT_EQ(kFastIters + numSlowThreads * kSlowIters, counter); 104 | } 105 | -------------------------------------------------------------------------------- /rseq/internal/Errors.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | namespace rseq { 22 | namespace internal { 23 | namespace errors { 24 | 25 | namespace detail { 26 | inline void abortWithMessage(const char* message) { 27 | // We ignore the error code; we can't do anything in case of a "real" failure, 28 | // and handling e.g. signal logic is more complicated than we need. 29 | write(STDERR_FILENO, message, std::strlen(message)); 30 | std::abort(); 31 | } 32 | 33 | inline void throwRuntimeException(const char* message) { 34 | #if __EXCEPTIONS 35 | throw std::runtime_error(message); 36 | #endif // __EXCEPTIONS 37 | } 38 | } // namespace detail 39 | 40 | // This should not return; it should either terminate the program or throw an 41 | // exception. 42 | typedef void (*FatalErrorHandler)(const char* message); 43 | 44 | // Error handlers are thread-local. The default one throws an 45 | // std::runtime_exception 46 | void setFatalErrorHandler(FatalErrorHandler handler); 47 | FatalErrorHandler getFatalErrorHandler(); 48 | 49 | void fatalError(const char* message); 50 | 51 | // While one of these is in scope, rseq failures will call abort(), and 52 | // destruction via a thrown exception causes abort. 53 | // Having the abort call happen implicitly in a destructor (as opposed to 54 | // writing "catch(...) { abort(); }") is advantageous because it means that core 55 | // dumps will show the stack trace of the function that threw the exception, not 56 | // the one that caught it. 57 | class AbortOnError { 58 | public: 59 | AbortOnError() { 60 | previousHandler_ = getFatalErrorHandler(); 61 | setFatalErrorHandler(&detail::abortWithMessage); 62 | } 63 | ~AbortOnError() { 64 | #if __EXCEPTIONS 65 | if (std::uncaught_exception()) { 66 | // Being destroyed as part of exception unwinding; abort. 67 | detail::abortWithMessage("Exception thrown into top-level C function.\n"); 68 | } 69 | #endif // __EXCEPTIONS 70 | setFatalErrorHandler(previousHandler_); 71 | } 72 | AbortOnError(const AbortOnError&) = delete; 73 | AbortOnError& operator=(const AbortOnError&) = delete; 74 | private: 75 | FatalErrorHandler previousHandler_; 76 | }; 77 | 78 | #ifdef __EXCEPTIONS 79 | class ThrowOnError { 80 | public: 81 | ThrowOnError() { 82 | previousHandler_ = getFatalErrorHandler(); 83 | setFatalErrorHandler(&detail::throwRuntimeException); 84 | } 85 | ~ThrowOnError() { 86 | setFatalErrorHandler(previousHandler_); 87 | } 88 | private: 89 | FatalErrorHandler previousHandler_; 90 | }; 91 | #else // __EXCEPTIONS 92 | typedef AbortOnError ThrowOnError; 93 | #endif // __EXCEPTIONS 94 | 95 | 96 | } // namespace errors 97 | } // namespace internal 98 | } // namespace rseq 99 | -------------------------------------------------------------------------------- /rseq/internal/ErrorsTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/Errors.h" 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | using namespace rseq::internal::errors; 18 | 19 | TEST(Errors, AbortOnErrorAborts) { 20 | AbortOnError aoe; 21 | ASSERT_DEATH(fatalError("ThisIsAnErrorString"), "ThisIsAnErrorString"); 22 | } 23 | 24 | #if __EXCEPTIONS 25 | 26 | TEST(Errors, DefaultThrows) { 27 | ThrowOnError thrower; 28 | std::string msg = "Some error message"; 29 | bool exceptionCaught = false; 30 | try { 31 | fatalError(msg.c_str()); 32 | } catch (const std::runtime_error& exception) { 33 | EXPECT_EQ(msg, exception.what()); 34 | exceptionCaught = true; 35 | } 36 | EXPECT_TRUE(exceptionCaught); 37 | } 38 | 39 | TEST(Errors, AllowsChangingHandler) { 40 | ThrowOnError thrower; 41 | // Get a copy of the old handler. 42 | FatalErrorHandler oldHandler = getFatalErrorHandler(); 43 | 44 | // Install a custom handler that throws a custom type. 45 | struct MyException { 46 | }; 47 | FatalErrorHandler myHandler = +[](const char* /* message */) { 48 | throw MyException(); 49 | }; 50 | setFatalErrorHandler(myHandler); 51 | 52 | // Make sure the custom handler is called. 53 | bool exceptionCaught = false; 54 | try { 55 | fatalError("this gets ignored"); 56 | } catch (const MyException& /* exception */) { 57 | exceptionCaught = true; 58 | } 59 | EXPECT_TRUE(exceptionCaught); 60 | 61 | // Make sure we can reinstall the old handler, and that it's the right one. 62 | setFatalErrorHandler(oldHandler); 63 | exceptionCaught = false; 64 | try { 65 | fatalError("this gets ignored too"); 66 | } catch (const std::runtime_error& /* exception */) { 67 | exceptionCaught = true; 68 | } 69 | EXPECT_TRUE(exceptionCaught); 70 | } 71 | 72 | static void throwException() { 73 | throw std::runtime_error("Runtime error"); 74 | } 75 | 76 | static void abortAfterCallingThrowException() { 77 | AbortOnError aoe; 78 | throwException(); 79 | } 80 | 81 | static void tryCatchException() { 82 | try { 83 | abortAfterCallingThrowException(); 84 | } catch (...) { 85 | } 86 | } 87 | 88 | TEST(Errors, AbortOnErrorAbortsAfterExceptions) { 89 | ASSERT_DEATH(tryCatchException(), ""); 90 | } 91 | 92 | TEST(Errors, AbortOnErrorIsntPermanent) { 93 | ThrowOnError thrower; 94 | { 95 | AbortOnError aoe; 96 | } 97 | bool exceptionCaught = false; 98 | try { 99 | fatalError("blah blah blah"); 100 | } catch (const std::runtime_error& /* exception */) { 101 | exceptionCaught = true; 102 | } 103 | EXPECT_TRUE(exceptionCaught); 104 | } 105 | 106 | #else // __EXCEPTIONS 107 | 108 | TEST(Errors, DefaultAborts) { 109 | ASSERT_DEATH(fatalError(""), ""); 110 | } 111 | 112 | #endif // __EXCEPTIONS 113 | -------------------------------------------------------------------------------- /rseq/internal/Mutex.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include "rseq/internal/Likely.h" 17 | 18 | // A simple clone of parts of . This lets us avoid depending on C++ 19 | // static constructors and linking against libstdc++ (which would stop people 20 | // from linking with plain-C binaries). 
21 | // We only acquire mutexes down slow paths, so we don't bother doing anything 22 | // fancy (like adaptive spinning, anything to avoid wasted wakeup attempts, 23 | // etc.). 24 | 25 | // These classes don't have constructors or destructors, so that they can live 26 | // safely in static memory without any C++ runtime support. If they do in fact 27 | // live in static memory, no initialization is needed. Otherwise, you have to 28 | // call init() on them explicitly. 29 | 30 | namespace rseq { 31 | namespace internal { 32 | namespace mutex { 33 | 34 | template 35 | class LockGuard { 36 | public: 37 | explicit LockGuard(Lock& lock) : lock_(lock) { 38 | lock_.lock(); 39 | } 40 | ~LockGuard() { 41 | lock_.unlock(); 42 | } 43 | private: 44 | Lock& lock_; 45 | }; 46 | 47 | class Mutex { 48 | public: 49 | void init() { 50 | state_.store(0, std::memory_order_relaxed); 51 | } 52 | 53 | void lock() { 54 | std::uint32_t oldState = state_.exchange(kHeldNoWaiter); 55 | if (oldState == kFree) { 56 | return; 57 | } 58 | oldState = state_.exchange(kHeldPossibleWaiter); 59 | while (oldState != kFree) { 60 | futexWait(kHeldPossibleWaiter); 61 | oldState = state_.exchange(kHeldPossibleWaiter); 62 | } 63 | } 64 | 65 | void unlock() { 66 | std::uint32_t oldState = state_.exchange(0); 67 | if (oldState == kHeldPossibleWaiter) { 68 | futexWake(1); 69 | } 70 | } 71 | 72 | private: 73 | constexpr static std::uint32_t kFree = 0; 74 | constexpr static std::uint32_t kHeldNoWaiter = 1; 75 | constexpr static std::uint32_t kHeldPossibleWaiter = 2; 76 | 77 | void futexWait(std::uint32_t val); 78 | void futexWake(int num); 79 | 80 | std::atomic state_; 81 | }; 82 | 83 | 84 | class OnceFlag; 85 | template 86 | void callOnce(OnceFlag&, Func&&, Args&&...); 87 | 88 | class OnceFlag { 89 | public: 90 | void init() { 91 | initialized_.store(false, std::memory_order_relaxed); 92 | mu_.init(); 93 | } 94 | private: 95 | template 96 | friend void ::rseq::internal::mutex::callOnce(OnceFlag&, Func&&, Args&&...); 97 | 98 | std::atomic initialized_; 99 | Mutex mu_; 100 | }; 101 | 102 | template 103 | void callOnce(OnceFlag& flag, Func&& func, Args&&... args) { 104 | if (RSEQ_LIKELY(flag.initialized_.load(std::memory_order_acquire))) { 105 | return; 106 | } 107 | LockGuard lg(flag.mu_); 108 | if (RSEQ_LIKELY(flag.initialized_.load(std::memory_order_relaxed))) { 109 | return; 110 | } 111 | func(std::forward(args)...); 112 | flag.initialized_.store(true, std::memory_order_release); 113 | } 114 | 115 | } // namespace mutex 116 | } // namespace internal 117 | } // namespace rseq 118 | -------------------------------------------------------------------------------- /rseq/internal/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 
7 | 8 | add_library(asymmetric_thread_fence AsymmetricThreadFence.cpp) 9 | target_link_libraries( 10 | asymmetric_thread_fence 11 | mutex 12 | errors 13 | ) 14 | list(APPEND all_sources internal/AsymmetricThreadFence.cpp) 15 | 16 | rseq_gtest( 17 | asymmetric_thread_fence_test 18 | AsymmetricThreadFenceTest.cpp 19 | asymmetric_thread_fence 20 | num_cpus 21 | ) 22 | 23 | 24 | add_library(cacheline_padded Dummy.cpp) 25 | 26 | rseq_gtest( 27 | cacheline_padded_test 28 | CachelinePaddedTest.cpp 29 | cacheline_padded 30 | ) 31 | 32 | 33 | add_library(clean_up_on_thread_death CleanUpOnThreadDeath.cpp) 34 | target_link_libraries( 35 | clean_up_on_thread_death 36 | errors 37 | mutex 38 | ) 39 | list(APPEND all_sources internal/CleanUpOnThreadDeath.cpp) 40 | 41 | rseq_gtest( 42 | clean_up_on_thread_death_test 43 | CleanUpOnThreadDeathTest.cpp 44 | clean_up_on_thread_death 45 | ) 46 | 47 | 48 | add_library(code Code.cpp) 49 | target_link_libraries(code cacheline_padded mutex os_mem) 50 | list(APPEND all_sources internal/Code.cpp) 51 | 52 | rseq_gtest( 53 | code_test 54 | CodeTest.cpp 55 | code 56 | ) 57 | 58 | 59 | add_library(cpu_local Dummy.cpp) 60 | target_link_libraries(cpu_local cacheline_padded num_cpus os_mem) 61 | 62 | rseq_gtest( 63 | cpu_local_test 64 | CpuLocalTest.cpp 65 | cpu_local 66 | num_cpus 67 | switch_to_cpu 68 | ) 69 | 70 | 71 | add_library(id_allocator Dummy.cpp) 72 | target_link_libraries(id_allocator mutex os_mem) 73 | 74 | rseq_gtest( 75 | id_allocator 76 | IdAllocatorTest.cpp 77 | id_allocator 78 | ) 79 | 80 | add_library(errors Errors.cpp) 81 | list(APPEND all_sources internal/Errors.cpp) 82 | 83 | rseq_gtest( 84 | errors_test 85 | ErrorsTest.cpp 86 | errors 87 | ) 88 | 89 | 90 | add_library(intrusive_linked_list Dummy.cpp) 91 | 92 | rseq_gtest( 93 | intrusive_linked_list_test 94 | IntrusiveLinkedListTest.cpp 95 | intrusive_linked_list 96 | ) 97 | 98 | 99 | add_library(likely Dummy.cpp) 100 | # LIKELY and UNLIKELY macros not tested 101 | 102 | 103 | add_library(mutex Mutex.cpp) 104 | target_link_libraries(mutex likely) 105 | list(APPEND all_sources internal/Mutex.cpp) 106 | 107 | rseq_gtest( 108 | mutex_test 109 | MutexTest.cpp 110 | mutex 111 | ) 112 | 113 | 114 | add_library(num_cpus NumCpus.cpp) 115 | list(APPEND all_sources internal/NumCpus.cpp) 116 | target_link_libraries(num_cpus mutex) 117 | # numCpus() not tested 118 | 119 | 120 | add_library(os_mem OsMem.cpp) 121 | target_link_libraries( 122 | os_mem 123 | errors 124 | ) 125 | list(APPEND all_sources internal/OsMem.cpp) 126 | 127 | rseq_gtest( 128 | os_mem_test 129 | OsMemTest.cpp 130 | os_mem 131 | errors 132 | ) 133 | 134 | 135 | add_library(internal_rseq Rseq.cpp rseq_c.cpp rseq_c_inlines.c) 136 | target_link_libraries( 137 | internal_rseq 138 | asymmetric_thread_fence 139 | code 140 | cpu_local 141 | errors 142 | mutex 143 | num_cpus 144 | thread_control 145 | ) 146 | list( 147 | APPEND 148 | all_sources 149 | internal/Rseq.cpp 150 | internal/rseq_c.cpp 151 | internal/rseq_c_inlines.c 152 | ) 153 | # rseq is tested through the public interface; no rseq_gtest here. 
154 | 155 | 156 | add_library(switch_to_cpu SwitchToCpu.cpp) 157 | target_link_libraries( 158 | switch_to_cpu 159 | errors 160 | ) 161 | # SwitchToCpu.cpp is test-only; we don't include it in all_sources 162 | rseq_gtest( 163 | switch_to_cpu_test 164 | SwitchToCpuTest.cpp 165 | num_cpus 166 | switch_to_cpu 167 | ) 168 | 169 | 170 | add_library(thread_control ThreadControl.cpp) 171 | target_link_libraries( 172 | thread_control 173 | clean_up_on_thread_death 174 | code 175 | id_allocator 176 | intrusive_linked_list 177 | mutex 178 | ) 179 | list(APPEND all_sources internal/ThreadControl.cpp) 180 | 181 | rseq_gtest( 182 | thread_control_test 183 | ThreadControlTest.cpp 184 | num_cpus 185 | switch_to_cpu 186 | thread_control 187 | ) 188 | 189 | set (all_sources ${all_sources} PARENT_SCOPE) 190 | -------------------------------------------------------------------------------- /rseq/internal/OsMemTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/OsMem.h" 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | #include "rseq/internal/Errors.h" 21 | 22 | using namespace rseq::internal; 23 | using namespace rseq::internal::os_mem; 24 | 25 | TEST(OsMem, SanityCheck) { 26 | const int kAllocSize1 = 123456; 27 | const int kAllocSize2 = 12345; 28 | 29 | void* alloc1 = allocate(kAllocSize1); 30 | void* alloc2 = allocate(kAllocSize2); 31 | 32 | unsigned char* arr1 = reinterpret_cast(alloc1); 33 | unsigned char* arr2 = reinterpret_cast(alloc2); 34 | 35 | for (int i = 0; i < kAllocSize1; ++i) { 36 | arr1[i] = 111; 37 | } 38 | for (int i = 0; i < kAllocSize2; ++i) { 39 | arr2[i] = 222; 40 | } 41 | 42 | for (int i = 0; i < kAllocSize1; ++i) { 43 | EXPECT_EQ(111, arr1[i]); 44 | } 45 | free(alloc1, kAllocSize1); 46 | 47 | for (int i = 0; i < kAllocSize2; ++i) { 48 | EXPECT_EQ(222, arr2[i]); 49 | } 50 | free(alloc2, kAllocSize2); 51 | } 52 | 53 | #if __EXCEPTIONS 54 | TEST(OsMem, ThrowsOnFailure) { 55 | errors::ThrowOnError thrower; 56 | 57 | bool failed = false; 58 | const std::size_t kTooBig = 1ULL << 48; 59 | try { 60 | allocate(kTooBig); 61 | } catch (...) { 62 | failed = true; 63 | } 64 | EXPECT_TRUE(failed); 65 | failed = false; 66 | try { 67 | allocateExecutable(kTooBig); 68 | } catch (...) { 69 | failed = true; 70 | } 71 | EXPECT_TRUE(failed); 72 | } 73 | #endif // __EXCEPTIONS 74 | 75 | TEST(OsMem, AllocatesExecutable) { 76 | const unsigned char return12345Template[] = { 77 | // mov $12345, %eax ; (12345 = 0x3039) 78 | 0xb8, 0x39, 0x30, 0x00, 0x00, 79 | // retq 80 | 0xc3, 81 | }; 82 | 83 | void* code = allocateExecutable(sizeof(return12345Template)); 84 | std::memcpy(code, return12345Template, sizeof(return12345Template)); 85 | int (*fn)() = reinterpret_cast(code); 86 | EXPECT_EQ(12345, fn()); 87 | free(code, sizeof(return12345Template)); 88 | } 89 | 90 | TEST(OsMem, Frees) { 91 | // These can't be stack variables, since we need to know their address 92 | // (without being told) in the signal handler. We make them thread-local to 93 | // avoid any parallel testing trickiness. 
94 | static __thread void* volatile alloc; 95 | static __thread volatile bool segfaulted; 96 | static __thread jmp_buf returnFromSegfault; 97 | 98 | alloc = nullptr; 99 | segfaulted = false; 100 | 101 | struct sigaction oldHandler; 102 | struct sigaction newHandler; 103 | 104 | void (*segfaultHandler)(int, siginfo_t*, void*) 105 | = +[](int signo, siginfo_t* info, void* ucontext) { 106 | EXPECT_EQ(SIGSEGV, signo); 107 | // EXPECT_EQ is a little screwy with regards to volatile pointer (note: 108 | // not pointer *to* volatile) arguments. We copy its argument into a 109 | // non-volatile pointer to help it out. 110 | void* allocCopy = alloc; 111 | EXPECT_EQ(allocCopy, info->si_addr); 112 | segfaulted = true; 113 | // We setjmp(returnFromSegfault) before triggering the segfault. 114 | longjmp(returnFromSegfault, 1); 115 | }; 116 | 117 | std::memset(&newHandler, 0, sizeof(newHandler)); 118 | newHandler.sa_sigaction = segfaultHandler; 119 | newHandler.sa_flags = SA_SIGINFO; 120 | int err = sigaction(SIGSEGV, &newHandler, &oldHandler); 121 | ASSERT_EQ(0, err); 122 | 123 | alloc = allocate(1); 124 | volatile char* c = static_cast(alloc); 125 | *c = 123; 126 | 127 | free(alloc, 1); 128 | EXPECT_FALSE(segfaulted); 129 | 130 | // BEGIN MAGIC 131 | if (!setjmp(returnFromSegfault)) { 132 | // Not returning from the segfault handler; cause a segfault. 133 | *c; 134 | } else { 135 | // Returning from the segfault handler. 136 | EXPECT_TRUE(segfaulted); 137 | } 138 | // END MAGIC 139 | 140 | // Go back to the previous signal handler (probably crashing). 141 | sigaction(SIGSEGV, &oldHandler, nullptr); 142 | } 143 | -------------------------------------------------------------------------------- /rseq/internal/ThreadControlTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/ThreadControl.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | #include "rseq/internal/Code.h" 22 | #include "rseq/internal/NumCpus.h" 23 | #include "rseq/internal/SwitchToCpu.h" 24 | 25 | using namespace rseq::internal; 26 | 27 | class ThreadControlFixture : public ::testing::Test { 28 | protected: 29 | void SetUp() override { 30 | me = ThreadControl::get(&myThreadCachedCpu); 31 | 32 | childDead = false; 33 | childShouldDie = false; 34 | child = std::thread([&]() { 35 | std::unique_lock ul(mu); 36 | while (true) { 37 | if (childShouldDie) { 38 | return; 39 | } 40 | if (func != nullptr) { 41 | func(); 42 | func = nullptr; 43 | } 44 | cond.notify_all(); 45 | cond.wait(ul, [&]() { 46 | return childShouldDie || func != nullptr; 47 | }); 48 | } 49 | }); 50 | 51 | childCpu = numCpus() > 1 ? 1 : 0; 52 | 53 | switchToCpu(0); 54 | // To check thread transition logic, we want the child to get its 55 | // ThreadControl on one CPU, and have it manipulated while it's on another. 
56 | runOnChild([&]() { 57 | switchToCpu(0); 58 | childThreadControl = ThreadControl::get(&childThreadCachedCpu); 59 | switchToCpu(childCpu); 60 | }); 61 | } 62 | 63 | void TearDown() override { 64 | if (!childDead) { 65 | killChild(); 66 | } 67 | child.join(); 68 | } 69 | 70 | // Only starts the child's death; doesn't join() it. 71 | void killChild() { 72 | std::lock_guard lg(mu); 73 | childShouldDie = true; 74 | cond.notify_all(); 75 | } 76 | 77 | void runOnChild(std::function f) { 78 | func = f; 79 | std::unique_lock ul(mu); 80 | cond.notify_all(); 81 | cond.wait(ul, [&]() { 82 | return func == nullptr; 83 | }); 84 | } 85 | 86 | std::atomic myThreadCachedCpu; 87 | ThreadControl* me; 88 | 89 | int childCpu; 90 | std::atomic childThreadCachedCpu; 91 | ThreadControl* childThreadControl; 92 | 93 | bool childDead; 94 | bool childShouldDie; 95 | std::thread child; 96 | std::mutex mu; 97 | std::condition_variable cond; 98 | std::function func; 99 | }; 100 | 101 | TEST_F(ThreadControlFixture, IdManipulation) { 102 | std::uint32_t myId = me->id(); 103 | std::uint32_t childId = childThreadControl->id(); 104 | EXPECT_EQ(me, ThreadControl::forId(myId)); 105 | EXPECT_EQ(childThreadControl, ThreadControl::forId(childId)); 106 | } 107 | 108 | TEST_F(ThreadControlFixture, Code) { 109 | Code* code = me->code(); 110 | EXPECT_NE(nullptr, code->rseqLoadFunc()); 111 | EXPECT_NE(nullptr, code->rseqStoreFunc()); 112 | EXPECT_NE(nullptr, code->rseqStoreFenceFunc()); 113 | } 114 | 115 | TEST_F(ThreadControlFixture, RseqManipulation) { 116 | std::uintptr_t dst = 0; 117 | runOnChild([&]() { 118 | EXPECT_FALSE(childThreadControl->code()->rseqStoreFunc()(&dst, 1)); 119 | }); 120 | EXPECT_EQ(1, dst); 121 | childThreadControl->blockRseqOps(); 122 | childThreadCachedCpu.store(0); 123 | runOnChild([&]() { 124 | EXPECT_TRUE(childThreadControl->code()->rseqStoreFunc()(&dst, 2)); 125 | }); 126 | EXPECT_LT(childThreadCachedCpu.load(), 0); 127 | EXPECT_EQ(1, dst); 128 | childThreadControl->unblockRseqOps(); 129 | runOnChild([&]() { 130 | EXPECT_FALSE(childThreadControl->code()->rseqStoreFunc()(&dst, 2)); 131 | }); 132 | EXPECT_EQ(2, dst); 133 | } 134 | 135 | TEST_F(ThreadControlFixture, CurCpu) { 136 | EXPECT_EQ(childCpu, childThreadControl->curCpu()); 137 | runOnChild([&]() { 138 | switchToCpu(0); 139 | }); 140 | EXPECT_EQ(0, childThreadControl->curCpu()); 141 | } 142 | 143 | TEST_F(ThreadControlFixture, LivesWhileBeingAccessed) { 144 | me->accessing()->store(childThreadControl->id()); 145 | killChild(); 146 | /* sleep override */ 147 | // Give it a bit to die on its own, if it's going to. 148 | std::this_thread::sleep_for(std::chrono::milliseconds(100)); 149 | EXPECT_EQ(childCpu, childThreadControl->curCpu()); 150 | me->accessing()->store(0); 151 | } 152 | 153 | TEST_F(ThreadControlFixture, DiesWhenNotAccessed) { 154 | killChild(); 155 | // If the child doesn't die, then we'll time out when the subsequent join() 156 | // call fails. 157 | } 158 | -------------------------------------------------------------------------------- /rseq/internal/IdAllocatorTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/IdAllocator.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | using namespace rseq::internal; 21 | 22 | struct IdOwner { 23 | std::uint32_t id; 24 | }; 25 | 26 | TEST(IdAllocator, SingleThreaded) { 27 | const int kNumOwners = 100000; 28 | 29 | // Remember that [] dereferencing automatically inserts the key with the value 30 | // 0. 31 | std::unordered_map countForId; 32 | 33 | IdAllocator idAllocator(kNumOwners + 1); 34 | 35 | std::vector owners(kNumOwners); 36 | 37 | // Allocate a bunch of ids 38 | for (int i = 0; i < kNumOwners; ++i) { 39 | owners[i].id = idAllocator.allocate(&owners[i]); 40 | EXPECT_NE(0, owners[i].id); 41 | EXPECT_EQ(i + 1, owners[i].id); 42 | EXPECT_EQ(1, ++countForId[owners[i].id]); 43 | } 44 | 45 | // Check that the owners match up 46 | for (int i = 0; i < kNumOwners; ++i) { 47 | EXPECT_EQ(&owners[i], idAllocator.lookupOwner(owners[i].id)); 48 | } 49 | 50 | // OK, now we mix things up a little 51 | 52 | // Free index i if i % 3 == 0 53 | for (int i = 0; i < kNumOwners; i += 3) { 54 | idAllocator.free(owners[i].id); 55 | EXPECT_EQ(0, --countForId[owners[i].id]); 56 | } 57 | 58 | // Free index i if i % 3 == 1 59 | for (int i = 1; i < kNumOwners; i += 3) { 60 | idAllocator.free(owners[i].id); 61 | EXPECT_EQ(0, --countForId[owners[i].id]); 62 | } 63 | 64 | // Allocate for index i if i % 3 == 0 or i % 3 == 1 65 | for (int i = 0; i < kNumOwners; ++i) { 66 | if (i % 3 == 0 || i % 3 == 1) { 67 | owners[i].id = idAllocator.allocate(&owners[i]); 68 | EXPECT_EQ(1, ++countForId[owners[i].id]); 69 | EXPECT_NE(0, owners[i].id); 70 | } 71 | } 72 | 73 | // Check that things still match 74 | for (int i = 0; i < kNumOwners; ++i) { 75 | EXPECT_EQ(&owners[i], idAllocator.lookupOwner(owners[i].id)); 76 | } 77 | 78 | // At any given time, we had <= kNumOwners allocated Ids. Now we go to 79 | // kNumOwners + 1. 
80 | IdOwner newOwner; 81 | newOwner.id = idAllocator.allocate(&newOwner); 82 | EXPECT_EQ(newOwner.id, kNumOwners + 1); 83 | EXPECT_EQ(1, ++countForId[newOwner.id]); 84 | } 85 | 86 | void updateMax(std::atomic* max, std::uint32_t atLeast) { 87 | std::uint32_t curMax = max->load(); 88 | while (curMax < atLeast) { 89 | if (max->compare_exchange_strong(curMax, atLeast)) { 90 | break; 91 | } 92 | } 93 | } 94 | 95 | TEST(IdAllocator, MultiThreaded) { 96 | const int kNumThreads = 10; 97 | const int kAllocationsPerThread = 100000; 98 | 99 | std::atomic highestIdAllocated(0); 100 | 101 | std::vector> ownersByThread(kNumThreads); 102 | for (int i = 0; i < kNumThreads; ++i) { 103 | ownersByThread[i] = std::vector(kAllocationsPerThread); 104 | } 105 | 106 | IdAllocator idAllocator(kNumThreads * kAllocationsPerThread + 1); 107 | 108 | std::vector threads(kNumThreads); 109 | 110 | // Spawn many threads, doing many allocations and frees 111 | for (int i = 0; i < kNumThreads; ++i) { 112 | threads[i] = std::thread([&, i]() { 113 | // Allocate everything 114 | for (int j = 0; j < kAllocationsPerThread; ++j) { 115 | ownersByThread[i][j].id = idAllocator.allocate(&ownersByThread[i][j]); 116 | EXPECT_NE(0, ownersByThread[i][j].id); 117 | } 118 | // Free the evens 119 | for (int j = 0; j < kAllocationsPerThread; j += 2) { 120 | idAllocator.free(ownersByThread[i][j].id); 121 | } 122 | // Reallocate them 123 | for (int j = 0; j < kAllocationsPerThread; j += 2) { 124 | ownersByThread[i][j].id = idAllocator.allocate(&ownersByThread[i][j]); 125 | EXPECT_NE(0, ownersByThread[i][j].id); 126 | } 127 | }); 128 | } 129 | for (int i = 0; i < kNumThreads; ++i) { 130 | threads[i].join(); 131 | } 132 | 133 | std::unordered_map countForId; 134 | for (int i = 0; i < kNumThreads; ++i) { 135 | for (int j = 0; j < kAllocationsPerThread; ++j) { 136 | EXPECT_NE(0, ownersByThread[i][j].id); 137 | EXPECT_EQ( 138 | &ownersByThread[i][j], 139 | idAllocator.lookupOwner(ownersByThread[i][j].id)); 140 | } 141 | } 142 | IdOwner newOwner; 143 | newOwner.id = idAllocator.allocate(&newOwner); 144 | EXPECT_EQ(kNumThreads * kAllocationsPerThread + 1, newOwner.id); 145 | } 146 | -------------------------------------------------------------------------------- /rseq/internal/CodeTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/Code.h" 11 | 12 | #include 13 | 14 | using namespace rseq::internal; 15 | 16 | TEST(Code, Allocation) { 17 | // Note: we assume that this is divisible by 4 later. 18 | const int kNumAllocations = 10000; 19 | std::atomic threadCachedCpu[kNumAllocations]; 20 | Code* code[kNumAllocations]; 21 | for (int i = 0; i < kNumAllocations; ++i) { 22 | code[i] = Code::initForId(i, &threadCachedCpu[i]); 23 | } 24 | // Make sure they all work 25 | for (int i = 0; i < kNumAllocations; ++i) { 26 | std::uint64_t val = 12345; 27 | std::uint64_t dst = 0; 28 | EXPECT_FALSE(code[i]->rseqLoadFunc()(&dst, &val)); 29 | EXPECT_EQ(12345, dst); 30 | } 31 | // Block the even ones 32 | for (int i = 0; i < kNumAllocations; i += 2) { 33 | code[i]->blockRseqOps(); 34 | } 35 | // Make sure the evens don't work and the odds do. 
36 | for (int i = 0; i < kNumAllocations; ++i) { 37 | std::uint64_t val = 12345; 38 | std::uint64_t dst = 0; 39 | EXPECT_EQ(i % 2 == 0, code[i]->rseqLoadFunc()(&dst, &val)); 40 | EXPECT_EQ(12345 * (i % 2), dst); 41 | } 42 | // Block the odds 43 | for (int i = 1; i < kNumAllocations; i += 2) { 44 | code[i]->blockRseqOps(); 45 | } 46 | // Reallocate the evens, but with a different mapping between threadCachedCpus 47 | // and Codes. 48 | for (int i = 0; i < kNumAllocations; ++i) { 49 | if (i % 4 == 0) { 50 | code[i] = Code::initForId(i / 2, &threadCachedCpu[i]); 51 | } 52 | if (i % 4 == 2) { 53 | // Here we use the knowledge that kNumAllocations is divisible by 4 54 | // (kNumAllocations / 2 is even). 55 | code[i] = Code::initForId( 56 | i / 2 + kNumAllocations / 2, &threadCachedCpu[i]); 57 | } 58 | } 59 | // Make sure the evens work and the odds dont. 60 | for (int i = 0; i < kNumAllocations; ++i) { 61 | std::uint64_t val = 12345; 62 | std::uint64_t dst = 0; 63 | 64 | bool failed = code[i]->rseqLoadFunc()(&dst, &val); 65 | if (i % 2 == 0) { 66 | EXPECT_FALSE(failed); 67 | EXPECT_EQ(12345, dst); 68 | } 69 | } 70 | } 71 | 72 | class CodeFixture : public ::testing::Test { 73 | protected: 74 | void SetUp() override { 75 | code = Code::initForId(1, &threadCachedCpu); 76 | threadCachedCpu.store(0); 77 | } 78 | 79 | Code* code; 80 | std::atomic threadCachedCpu; 81 | }; 82 | 83 | TEST_F(CodeFixture, LoadsCorrectly) { 84 | std::uint64_t val = 12345; 85 | std::uint64_t dst = 0; 86 | EXPECT_FALSE(code->rseqLoadFunc()(&dst, &val)); 87 | EXPECT_EQ(12345, dst); 88 | EXPECT_GE(0, threadCachedCpu.load()); 89 | } 90 | 91 | TEST_F(CodeFixture, StoresCorrectly) { 92 | std::uint64_t dst = 0; 93 | EXPECT_FALSE(code->rseqStoreFunc()(&dst, 12345)); 94 | EXPECT_EQ(12345, dst); 95 | EXPECT_GE(0, threadCachedCpu.load()); 96 | } 97 | 98 | TEST_F(CodeFixture, StoreFencesCorrectly) { 99 | std::uint64_t dst = 0; 100 | EXPECT_FALSE(code->rseqStoreFenceFunc()(&dst, 12345)); 101 | EXPECT_EQ(12345, dst); 102 | EXPECT_GE(0, threadCachedCpu.load()); 103 | } 104 | 105 | TEST_F(CodeFixture, BlocksLoads) { 106 | std::uint64_t val = 12345; 107 | std::uint64_t dst = 0; 108 | code->blockRseqOps(); 109 | EXPECT_TRUE(code->rseqLoadFunc()(&dst, &val)); 110 | EXPECT_LT(threadCachedCpu.load(), 0); 111 | EXPECT_EQ(0, dst); 112 | } 113 | 114 | TEST_F(CodeFixture, BlocksStores) { 115 | std::uint64_t dst = 0; 116 | code->blockRseqOps(); 117 | EXPECT_TRUE(code->rseqStoreFunc()(&dst, 12345)); 118 | EXPECT_LT(threadCachedCpu.load(), 0); 119 | EXPECT_EQ(0, dst); 120 | } 121 | 122 | TEST_F(CodeFixture, BlocksStoreFences) { 123 | std::uint64_t dst = 0; 124 | code->blockRseqOps(); 125 | EXPECT_TRUE(code->rseqStoreFenceFunc()(&dst, 12345)); 126 | EXPECT_LT(threadCachedCpu.load(), 0); 127 | EXPECT_EQ(0, dst); 128 | } 129 | 130 | TEST_F(CodeFixture, UnblocksLoads) { 131 | std::uint64_t val = 12345; 132 | std::uint64_t dst = 0; 133 | code->blockRseqOps(); 134 | code->unblockRseqOps(); 135 | EXPECT_FALSE(code->rseqLoadFunc()(&dst, &val)); 136 | EXPECT_EQ(12345, dst); 137 | } 138 | 139 | TEST_F(CodeFixture, UnblocksStores) { 140 | std::uint64_t dst = 0; 141 | code->blockRseqOps(); 142 | code->unblockRseqOps(); 143 | EXPECT_FALSE(code->rseqStoreFunc()(&dst, 12345)); 144 | EXPECT_EQ(dst, 12345); 145 | } 146 | 147 | TEST_F(CodeFixture, UnblocksStoreFences) { 148 | std::uint64_t dst = 0; 149 | code->blockRseqOps(); 150 | code->unblockRseqOps(); 151 | EXPECT_FALSE(code->rseqStoreFenceFunc()(&dst, 12345)); 152 | EXPECT_EQ(dst, 12345); 153 | } 154 | 
-------------------------------------------------------------------------------- /rseq/internal/CleanUpOnThreadDeathTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/CleanUpOnThreadDeath.h" 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | using namespace rseq::internal; 21 | 22 | int rseqVal; 23 | int threadControlVal; 24 | bool rseqValSetDuringThreadControlCleanup; 25 | 26 | void rseqCleanupFunc() { 27 | rseqVal = 1; 28 | } 29 | 30 | void threadControlCleanupFunc() { 31 | rseqValSetDuringThreadControlCleanup = (rseqVal == 1); 32 | threadControlVal = 1; 33 | } 34 | 35 | TEST(CleanUpOnThreadDeath, CallsRseq) { 36 | rseqValSetDuringThreadControlCleanup = false; 37 | rseqVal = threadControlVal = 0; 38 | std::thread t([]() { 39 | setRseqCleanup(rseqCleanupFunc); 40 | }); 41 | t.join(); 42 | EXPECT_EQ(1, rseqVal); 43 | } 44 | 45 | TEST(CleanUpOnThreadDeath, CallsThreadControl) { 46 | rseqValSetDuringThreadControlCleanup = false; 47 | rseqVal = threadControlVal = 0; 48 | 49 | std::thread t([]() { 50 | setThreadControlCleanup(threadControlCleanupFunc); 51 | }); 52 | t.join(); 53 | EXPECT_EQ(1, threadControlVal); 54 | } 55 | 56 | 57 | TEST(CleanUpOnThreadDeath, OrdersCallsCorrectlyWhenAddedInOrder) { 58 | rseqValSetDuringThreadControlCleanup = false; 59 | rseqVal = threadControlVal = 0; 60 | std::thread t([]() { 61 | setRseqCleanup(rseqCleanupFunc); 62 | setThreadControlCleanup(threadControlCleanupFunc); 63 | }); 64 | t.join(); 65 | EXPECT_TRUE(rseqValSetDuringThreadControlCleanup); 66 | } 67 | 68 | TEST(CleanUpOnThreadDeath, OrdersCallsCorrectlyWhenNotAddedInOrder) { 69 | rseqValSetDuringThreadControlCleanup = false; 70 | rseqVal = threadControlVal = 0; 71 | std::thread t([]() { 72 | setThreadControlCleanup(threadControlCleanupFunc); 73 | setRseqCleanup(rseqCleanupFunc); 74 | }); 75 | t.join(); 76 | EXPECT_TRUE(rseqValSetDuringThreadControlCleanup); 77 | } 78 | 79 | TEST(CleanUpOnThreadDeath, OutlivesThreadLocals) { 80 | static __thread int deathCount; 81 | deathCount = 0; 82 | 83 | static void (*bumpAndCheckDeathCount)() = []() { 84 | EXPECT_EQ(0, deathCount); 85 | ++deathCount; 86 | }; 87 | 88 | struct SetsCleanup { 89 | ~SetsCleanup() { 90 | setRseqCleanup(bumpAndCheckDeathCount); 91 | } 92 | }; 93 | thread_local SetsCleanup setsCleanup1; 94 | thread_local SetsCleanup setsCleanup2; 95 | void* volatile odrUse = &setsCleanup1; 96 | odrUse = &setsCleanup2; 97 | setRseqCleanup(bumpAndCheckDeathCount); 98 | } 99 | 100 | TEST(CleanUpOnThreadDeath, SupportsReinitialization) { 101 | static pthread_key_t key1; 102 | static pthread_key_t key2; 103 | static pthread_key_t key3; 104 | 105 | struct TestInfo { 106 | TestInfo() 107 | : rseqInitialized(false), 108 | numRseqInitializations(0), 109 | numRseqDestructions(0) {} 110 | bool rseqInitialized; 111 | int numRseqInitializations; 112 | int numRseqDestructions; 113 | }; 114 | // Note: only used by the child 115 | static __thread TestInfo* myTestInfo; 116 | std::unique_ptr childTestInfo; 117 | 118 | static auto initializeRseq = []() { 119 | if (!myTestInfo->rseqInitialized) { 120 | myTestInfo->rseqInitialized = 
true; 121 | ++myTestInfo->numRseqInitializations; 122 | setRseqCleanup([]() { 123 | ++myTestInfo->numRseqDestructions; 124 | myTestInfo->rseqInitialized = false; 125 | }); 126 | } 127 | }; 128 | 129 | static void (*destructor3)(void*) = [](void*) { 130 | initializeRseq(); 131 | }; 132 | static void (*destructor1)(void*) = [](void*) { 133 | initializeRseq(); 134 | pthread_setspecific(key3, reinterpret_cast(3)); 135 | }; 136 | static void (*destructor2)(void*) = [](void*) { 137 | initializeRseq(); 138 | }; 139 | static std::once_flag once; 140 | std::call_once(once, []() { 141 | pthread_key_create(&key1, destructor1); 142 | pthread_key_create(&key2, destructor2); 143 | pthread_key_create(&key3, destructor3); 144 | }); 145 | 146 | 147 | std::thread t([&]() { 148 | // Easiest way to tell the pthread destructors where to find the TestInfo is 149 | // a threadlocal. 150 | myTestInfo = new TestInfo; 151 | childTestInfo.reset(myTestInfo); 152 | pthread_setspecific(key1, reinterpret_cast(1)); 153 | initializeRseq(); 154 | pthread_setspecific(key2, reinterpret_cast(2)); 155 | }); 156 | t.join(); 157 | EXPECT_TRUE( 158 | childTestInfo->numRseqInitializations 159 | == childTestInfo->numRseqDestructions); 160 | EXPECT_FALSE( 161 | childTestInfo->rseqInitialized); 162 | } 163 | -------------------------------------------------------------------------------- /rseq/internal/Code.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/Code.h" 11 | 12 | #include 13 | 14 | #include "rseq/internal/CachelinePadded.h" 15 | #include "rseq/internal/Mutex.h" 16 | #include "rseq/internal/OsMem.h" 17 | 18 | namespace rseq { 19 | namespace internal { 20 | 21 | static const unsigned char codeTemplate[] = { 22 | // 8-byte load code. Prototype is: 23 | // int (*)(unsigned long* dst, unsigned long* src); 24 | 25 | // Do the load 26 | // mov (%rsi), %rax 27 | /* offset 0: */ 0x48, 0x8b, 0x06, 28 | 29 | // Store it into *dst 30 | // mov %rax, (%rdi) 31 | /* offset 3: */ 0x48, 0x89, 0x07, 32 | 33 | // Return success! (i.e. 0) 34 | // xor %eax, %eax 35 | /* offset 6: */ 0x31, 0xc0, 36 | // retq 37 | /* offset 8: */ 0xc3, 38 | 39 | // Padding bytes 40 | /* offset 9: */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 41 | 42 | // 8-byte store code. Prototype is: 43 | // int (*)(unsigned long* dst, unsigned long val); 44 | 45 | // Do the store. 46 | // mov %rsi, (%rdi) 47 | /* offset 16: */ 0x48, 0x89, 0x37, 48 | 49 | // Return success! (i.e. 0) 50 | // xor %eax, %eax 51 | /* offset 19: */ 0x31, 0xc0, 52 | // retq 53 | /* offset 21: */ 0xc3, 54 | 55 | 56 | // Padding bytes 57 | /* offset 22: */ 0x00, 0x00, 58 | 59 | 60 | // 8-byte store-fence code. Prototype is: 61 | // int (*)(unsigned long* dst, unsigned long val); 62 | 63 | // Do the store (via xchg). 64 | // xchg %rsi, (%rdi) 65 | /* offset 24: */ 0x48, 0x87, 0x37, 66 | 67 | // Return success! (i.e. 0) 68 | // xor %eax, %eax 69 | /* offset 27: */ 0x31, 0xc0, 70 | // retq 71 | /* offset 29: */ 0xc3, 72 | 73 | 74 | // Padding bytes 75 | /* offset 30: */ 0x00, 0x00, 76 | 77 | 78 | // Failure path. 79 | // This code is shared by all the load and store paths above. 
80 | // The initial instruction of each path is patched to be a jump to here. 81 | 82 | // Store -1 into the threadCachedCpu variable. 83 | // The 42s get replaced with a pointer to the owning thread's threadCachedCpu 84 | // variable. 85 | // movabs $0x4242424242424242, %rax 86 | /* offset 32: */ 0x48, 0xb8, 87 | /* offset 34: */ 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 88 | // movl $-1, (%rax) 89 | /* offset 42: */ 0xc7, 0x00, 0xff, 0xff, 0xff, 0xff, 90 | 91 | // Return failure :( (i.e. 1). 92 | // mov $1, %eax 93 | /* offset 48: */ 0xb8, 0x01, 0x00, 0x00, 0x00, 94 | // retq 95 | /* offset 53: */ 0xc3 96 | }; 97 | 98 | 99 | const static int kLoadOffset = 0; 100 | const static int kStoreOffset = 16; 101 | const static int kStoreFenceOffset = 24; 102 | const static int kReturnFailureOffset = 32; 103 | const static int kThreadCachedCpuOffset = 34; 104 | 105 | 106 | const static int kJmpInstructionSize = 2; 107 | 108 | const int kLoadToFailureJmpSize 109 | = kReturnFailureOffset - kLoadOffset - kJmpInstructionSize; 110 | const int kStoreToFailureJmpSize 111 | = kReturnFailureOffset - kStoreOffset - kJmpInstructionSize; 112 | const int kStoreFenceToFailureJmpSize 113 | = kReturnFailureOffset - kStoreFenceOffset - kJmpInstructionSize; 114 | 115 | 116 | const std::uint16_t kJmpBytecode = 0xeb; 117 | const std::uint16_t kLoadReplacement 118 | = kJmpBytecode | (kLoadToFailureJmpSize << 8); 119 | const std::uint16_t kStoreReplacement 120 | = kJmpBytecode | (kStoreToFailureJmpSize << 8); 121 | const std::uint16_t kStoreFenceReplacement 122 | = kJmpBytecode | (kStoreFenceToFailureJmpSize << 8); 123 | 124 | 125 | static mutex::OnceFlag codePagesOnceFlag; 126 | static CachelinePadded* codePages; 127 | 128 | // static 129 | Code* Code::initForId(std::uint32_t id, std::atomic* threadCachedCpu) { 130 | static_assert( 131 | sizeof(codeTemplate) == sizeof(Code::code_), 132 | "codeTemplate and code_ storage size must match."); 133 | 134 | mutex::callOnce(codePagesOnceFlag, []() { 135 | // We get kMaxGlobalThreads from the kernel limit. This reserves 256MB of 136 | // address space, but pages are lazily allocated, so the actual cost is much 137 | // smaller. 
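// (Arithmetic, assuming CachelinePadded<Code> rounds each Code up to one
// 64-byte cache line: 2^22 slots * 64 bytes = 256MB of reserved address space.)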
138 | const int kMaxGlobalThreads = 1 << 22; 139 | const int kMemToReserve = kMaxGlobalThreads * sizeof(CachelinePadded); 140 | 141 | void* alloc = os_mem::allocateExecutable(kMemToReserve); 142 | codePages = static_cast*>(alloc); 143 | }); 144 | Code* code = codePages[id].get(); 145 | std::memcpy(code->code_, codeTemplate, sizeof(codeTemplate)); 146 | std::memcpy( 147 | &code->code_[kThreadCachedCpuOffset], 148 | &threadCachedCpu, 149 | sizeof(threadCachedCpu)); 150 | return code; 151 | } 152 | 153 | Code::RseqLoadFunc Code::rseqLoadFunc() { 154 | return reinterpret_cast(&code_[kLoadOffset]); 155 | } 156 | 157 | Code::RseqStoreFunc Code::rseqStoreFunc() { 158 | return reinterpret_cast(&code_[kStoreOffset]); 159 | } 160 | 161 | Code::RseqStoreFunc Code::rseqStoreFenceFunc() { 162 | return reinterpret_cast(&code_[kStoreFenceOffset]); 163 | } 164 | 165 | void Code::blockRseqOps() { 166 | std::atomic* load = 167 | reinterpret_cast*>(&code_[kLoadOffset]); 168 | std::atomic* store = 169 | reinterpret_cast*>(&code_[kStoreOffset]); 170 | std::atomic* storeFence = 171 | reinterpret_cast*>(&code_[kStoreFenceOffset]); 172 | load->store(kLoadReplacement, std::memory_order_relaxed); 173 | store->store(kStoreReplacement, std::memory_order_relaxed); 174 | storeFence->store(kStoreFenceReplacement, std::memory_order_relaxed); 175 | } 176 | 177 | void Code::unblockRseqOps() { 178 | const std::uint16_t kLoadBytes = 0x8b48; 179 | const std::uint16_t kStoreBytes = 0x8948; 180 | const std::uint16_t kStoreFenceBytes = 0x8748; 181 | 182 | std::atomic* load = 183 | reinterpret_cast*>(&code_[kLoadOffset]); 184 | std::atomic* store = 185 | reinterpret_cast*>(&code_[kStoreOffset]); 186 | std::atomic* storeFence = 187 | reinterpret_cast*>(&code_[kStoreFenceOffset]); 188 | 189 | load->store(kLoadBytes, std::memory_order_relaxed); 190 | store->store(kStoreBytes, std::memory_order_relaxed); 191 | storeFence->store(kStoreFenceBytes, std::memory_order_relaxed); 192 | } 193 | } // namespace internal 194 | } // namespace rseq 195 | -------------------------------------------------------------------------------- /rseq/internal/Rseq.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/Rseq.h" 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | #include "rseq/internal/AsymmetricThreadFence.h" 18 | #include "rseq/internal/Code.h" 19 | #include "rseq/internal/CleanUpOnThreadDeath.h" 20 | #include "rseq/internal/CpuLocal.h" 21 | #include "rseq/internal/Mutex.h" 22 | #include "rseq/internal/NumCpus.h" 23 | #include "rseq/internal/ThreadControl.h" 24 | 25 | namespace rseq { 26 | namespace internal { 27 | 28 | static __thread int lastCpu; 29 | 30 | static __thread ThreadControl* me; 31 | 32 | // In at least some environments, alignof(std::atomic) == 4 if 33 | // alignof(T) == 4, even if sizeof(T) == 8; this won't work for us. 34 | // We could alignas this struct, but I think the gold standard 35 | // for "this is lock-free regardless of compiler and standard library choices" 36 | // is still using an integral type. So instead of using 37 | // std::atomic, we use AtomicOwnerAndEvictor below. 
38 | struct OwnerAndEvictor { 39 | std::uint32_t ownerId; 40 | std::uint32_t evictorId; 41 | }; 42 | 43 | struct AtomicOwnerAndEvictor { 44 | AtomicOwnerAndEvictor() : repr(0) { 45 | } 46 | 47 | OwnerAndEvictor load() { 48 | OwnerAndEvictor result; 49 | std::uint64_t view = repr.load(); 50 | result.ownerId = view >> 32; 51 | result.evictorId = view & 0xFFFFFFFFU; 52 | return result; 53 | } 54 | 55 | bool cas(OwnerAndEvictor expected, OwnerAndEvictor desired) { 56 | std::uint64_t expectedRepr 57 | = (static_cast(expected.ownerId) << 32) 58 | | expected.evictorId; 59 | std::uint64_t desiredRepr 60 | = (static_cast(desired.ownerId) << 32) 61 | | desired.evictorId; 62 | return repr.compare_exchange_strong(expectedRepr, desiredRepr); 63 | } 64 | 65 | std::atomic repr; 66 | }; 67 | 68 | // Initialized in ensureMyThreadControlInitialized below. 69 | // PodWrapper since there are shutdown order issues that mean this can't ever be 70 | // safely destroyed (dying threads access it). 71 | static CpuLocal* ownerAndEvictor; 72 | static char ownerAndEvictorStorage alignas(CpuLocal) [ 73 | sizeof(*ownerAndEvictor)]; 74 | 75 | static int acquireCpuOwnership() { 76 | while (true) { 77 | lastCpu = sched_getcpu(); 78 | threadCachedCpu()->store(lastCpu, std::memory_order_relaxed); 79 | 80 | OwnerAndEvictor curOwnerAndEvictor 81 | = ownerAndEvictor->forCpu(lastCpu)->load(); 82 | if (curOwnerAndEvictor.ownerId == 0) { 83 | if (ownerAndEvictor->forCpu(lastCpu)->cas( 84 | curOwnerAndEvictor, { me->id(), 0 } )) { 85 | return lastCpu; 86 | } else { 87 | continue; 88 | } 89 | } 90 | 91 | me->accessing()->store( 92 | curOwnerAndEvictor.ownerId, std::memory_order_relaxed); 93 | if (!ownerAndEvictor->forCpu(lastCpu)->cas( 94 | curOwnerAndEvictor, { curOwnerAndEvictor.ownerId, me->id() })) { 95 | me->accessing()->store(0, std::memory_order_relaxed); 96 | continue; 97 | } 98 | // The CAS succeeded, so we installed ourself as the evictor. 99 | curOwnerAndEvictor.evictorId = me->id(); 100 | 101 | ThreadControl* victim = ThreadControl::forId(curOwnerAndEvictor.ownerId); 102 | victim->blockRseqOps(); // A 103 | 104 | if (lastCpu != sched_getcpu()) { // B 105 | me->accessing()->store(0, std::memory_order_relaxed); 106 | continue; 107 | } 108 | 109 | // This is a little bit tricky; why don't we *always* need to do the 110 | // asymmetricThreadFencyHeavy()? 111 | // We did the stores blocking the victim's rseq ops above (A), and then 112 | // viewed ourselves to be running on CPU lastCpu (B). So the blocking stores 113 | // will be visible to all threads that run on CPU lastCpu in the future. If 114 | // we observe victim->curCpu() == lastCpu below, we know that the victim is 115 | // such a thread. So either the victim ran in between the blocking stores 116 | // and now (in which case it did a CAS to lastCpu's OwnerEvictor from 117 | // to , so we'll retry below), or the victim hasn't 118 | // run yet, in which case we don't need the heavy fence. 119 | // This relies on the memory ordering guarantee of ThreadControl::curCpu() 120 | // (which itself relies on the way the kernel handles thread migrations). 
121 | if (victim->curCpu() != lastCpu) { 122 | asymmetricThreadFenceHeavy(); 123 | } 124 | 125 | me->accessing()->store(0, std::memory_order_relaxed); 126 | 127 | if (ownerAndEvictor->forCpu(lastCpu)->cas( 128 | curOwnerAndEvictor, { me->id(), 0 })) { 129 | return lastCpu; 130 | } 131 | } 132 | } 133 | 134 | static mutex::OnceFlag ownerAndEvictorOnceFlag; 135 | 136 | static void ensureMyThreadControlInitialized() { 137 | if (me == nullptr) { 138 | me = ThreadControl::get(threadCachedCpu()); 139 | rseq_load_trampoline = me->code()->rseqLoadFunc(); 140 | rseq_store_trampoline = me->code()->rseqStoreFunc(); 141 | rseq_store_fence_trampoline = me->code()->rseqStoreFenceFunc(); 142 | setRseqCleanup([]() { 143 | end(); 144 | // If rseq is shut-down at thread-death, then resurrected at thread-death, 145 | // we need to make sure we re-initialize our data structures. 146 | me = nullptr; 147 | }); 148 | 149 | mutex::callOnce(ownerAndEvictorOnceFlag, []() { 150 | ownerAndEvictor 151 | = new (ownerAndEvictorStorage) CpuLocal; 152 | }); 153 | } 154 | } 155 | 156 | int beginSlowPath() { 157 | ensureMyThreadControlInitialized(); 158 | end(); 159 | me->unblockRseqOps(); 160 | return acquireCpuOwnership(); 161 | } 162 | 163 | void end() { 164 | threadCachedCpu()->store(-1, std::memory_order_relaxed); 165 | while (true) { 166 | OwnerAndEvictor curOwnerAndEvictor 167 | = ownerAndEvictor->forCpu(lastCpu)->load(); 168 | if (curOwnerAndEvictor.ownerId != me->id()) { 169 | break; 170 | } 171 | if (ownerAndEvictor->forCpu(lastCpu)->cas(curOwnerAndEvictor, { 0, 0 })) { 172 | break; 173 | } 174 | } 175 | } 176 | 177 | static void evictOwner(int shard) { 178 | OwnerAndEvictor curOwnerAndEvictor = ownerAndEvictor->forCpu(shard)->load(); 179 | if (curOwnerAndEvictor.ownerId == 0) { 180 | return; 181 | } 182 | 183 | me->accessing()->store(curOwnerAndEvictor.ownerId); 184 | if (ownerAndEvictor->forCpu(shard)->load().ownerId 185 | != curOwnerAndEvictor.ownerId) { 186 | me->accessing()->store(0, std::memory_order_relaxed); 187 | return; 188 | } 189 | 190 | ThreadControl* victim = ThreadControl::forId(curOwnerAndEvictor.ownerId); 191 | victim->blockRseqOps(); 192 | 193 | me->accessing()->store(0, std::memory_order_relaxed); 194 | } 195 | 196 | void fenceWith(int shard) { 197 | std::atomic_thread_fence(std::memory_order_seq_cst); 198 | ensureMyThreadControlInitialized(); 199 | evictOwner(shard); 200 | asymmetricThreadFenceHeavy(); 201 | } 202 | 203 | void fence() { 204 | std::atomic_thread_fence(std::memory_order_seq_cst); 205 | ensureMyThreadControlInitialized(); 206 | for (int i = 0; i < numCpus(); ++i) { 207 | evictOwner(i); 208 | } 209 | asymmetricThreadFenceHeavy(); 210 | } 211 | 212 | } // namespace internal 213 | } // namespace rseq 214 | -------------------------------------------------------------------------------- /rseq/internal/ThreadControl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/ThreadControl.h" 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | #include "rseq/internal/CleanUpOnThreadDeath.h" 22 | #include "rseq/internal/Code.h" 23 | #include "rseq/internal/IdAllocator.h" 24 | #include "rseq/internal/IntrusiveLinkedList.h" 25 | #include "rseq/internal/Mutex.h" 26 | 27 | namespace rseq { 28 | namespace internal { 29 | 30 | // ThreadControls are all kept in a global linked list. The list, including all 31 | // additions to and removals from the list, are protected by the mutex. 32 | // Theoretically we ought to worry about destructor order issues during 33 | // shutdown, but as a practical matter everything works fine for these types. 34 | static mutex::Mutex allThreadControlsMu; 35 | static IntrusiveLinkedList allThreadControls; 36 | 37 | // Initialized in ThreadControl::get below. 38 | // Here we *do* care about destructors running during shutdown. 39 | static mutex::OnceFlag idAllocatorOnceFlag; 40 | static IdAllocator* idAllocator; 41 | static char idAllocatorStorage alignas(IdAllocator) [ 42 | sizeof(*idAllocator)]; 43 | 44 | // We get this from the kernel limit. 45 | // TODO: Code.cpp has the same constant. We ought to move it into someplace 46 | // common. 47 | constexpr static int kMaxGlobalThreads = 1 << 22; 48 | 49 | // The ThreadControl for the current thread. The rules around __thread variables 50 | // in gcc are weird; putting ThreadControl directly in thread depends on a lot 51 | // of finicky details. It's easier to do this lazy initialization hack. 52 | static __thread ThreadControl* me; 53 | static __thread char meStorage alignas(ThreadControl) [sizeof(*me)]; 54 | 55 | // static 56 | ThreadControl* ThreadControl::get(std::atomic* threadCachedCpu) { 57 | if (me != nullptr) { 58 | return me; 59 | } 60 | 61 | mutex::callOnce(idAllocatorOnceFlag, []() { 62 | idAllocator = 63 | new (idAllocatorStorage) IdAllocator(kMaxGlobalThreads); 64 | }); 65 | 66 | me = new (meStorage) ThreadControl(threadCachedCpu); 67 | return me; 68 | } 69 | 70 | // static 71 | ThreadControl* ThreadControl::forId(std::uint32_t id) { 72 | return idAllocator->lookupOwner(id); 73 | } 74 | 75 | ThreadControl::ThreadControl(std::atomic* threadCachedCpu) { 76 | // Get our id. 77 | id_ = idAllocator->allocate(this); 78 | 79 | // Fill in the data about our process 80 | threadCachedCpu_ = threadCachedCpu; 81 | code_ = Code::initForId(id_, threadCachedCpu); 82 | tid_ = syscall(SYS_gettid); 83 | 84 | // Insert the ThreadControl into the global list 85 | { 86 | mutex::LockGuard lg(allThreadControlsMu); 87 | allThreadControls.link(this); 88 | } 89 | setThreadControlCleanup([]() { 90 | me->~ThreadControl(); 91 | // If we're reinitialized during thread death, we need to *know* it, and 92 | // reinitialize our data structures. 93 | me = nullptr; 94 | }); 95 | } 96 | 97 | ThreadControl::~ThreadControl() { 98 | // Remove ourselves from the list. 99 | { 100 | mutex::LockGuard lg(allThreadControlsMu); 101 | allThreadControls.unlink(this); 102 | } 103 | 104 | // Wait until no one's trying to evict us. 
105 | bool beingAccessed = true; 106 | int numYields = 0; 107 | while (beingAccessed) { 108 | beingAccessed = false; 109 | { 110 | mutex::LockGuard lg(allThreadControlsMu); 111 | for (ThreadControl& thread : allThreadControls) { 112 | if (thread.accessing()->load() == id_) { 113 | beingAccessed = true; 114 | break; 115 | } 116 | } 117 | } 118 | if (!beingAccessed) { 119 | break; 120 | } 121 | // We yield for the first 100 attempts at dying. After that, we sleep. 122 | if (numYields < 100) { 123 | ++numYields; 124 | sched_yield(); 125 | } else { 126 | /* sleep override */ 127 | sleep(1); 128 | } 129 | } 130 | idAllocator->free(id_); 131 | } 132 | 133 | void ThreadControl::blockRseqOps() { 134 | threadCachedCpu_->store(-1, std::memory_order_relaxed); 135 | code_->blockRseqOps(); 136 | } 137 | 138 | void ThreadControl::unblockRseqOps() { 139 | // threadCachedCpu is set at the point of the sched_getcpu() call. 140 | code_->unblockRseqOps(); 141 | } 142 | 143 | // Returns -1 on error. 144 | static int tryParseCpu(char* procFileContents, ssize_t length) { 145 | if (length < 0) { 146 | return -1; 147 | } 148 | 149 | int indexOfLastRParen = -1; 150 | for (int i = 0; i < length; ++i) { 151 | if (procFileContents[i] == ')') { 152 | indexOfLastRParen = i; 153 | } 154 | } 155 | if (indexOfLastRParen == -1) { 156 | return -1; 157 | } 158 | 159 | // Command is field 39, command is field 2. 160 | const int kSpacesBeforeCpu = 38; 161 | int pos = 0; 162 | for ( 163 | int numSpacesEncountered = 0; 164 | pos < length && numSpacesEncountered < kSpacesBeforeCpu; 165 | ++pos) { 166 | if (procFileContents[pos] == ' ') { 167 | ++numSpacesEncountered; 168 | } 169 | } 170 | int cpu = 0; 171 | for (; pos < length; ++pos) { 172 | char charAtPos = procFileContents[pos]; 173 | if (charAtPos == ' ') { 174 | return cpu; 175 | } else if ('0' <= charAtPos && charAtPos <= '9') { 176 | cpu *= 10; 177 | cpu += charAtPos - '0'; 178 | } else { 179 | return -1; 180 | } 181 | } 182 | return -1; 183 | } 184 | 185 | // Returns a pointer to the first character after the integer output. 186 | static char* rseqItoa(int i, char* a) { 187 | char* cur = a; 188 | if (i == 0) { 189 | *cur++ = '0'; 190 | } 191 | while (i != 0) { 192 | *cur++ = '0' + i % 10; 193 | i /= 10; 194 | } 195 | // We printed the string least-significant digit first; we have to reverse it. 196 | for (char* left = a, *right = cur - 1; left < right; ++left, --right) { 197 | char temp = *right; 198 | *right = *left; 199 | *left = temp; 200 | } 201 | return cur; 202 | } 203 | 204 | int ThreadControl::curCpu() { 205 | // We want to construct "/proc/self/task//stat". 206 | // "/proc/self/task/" is 16 characters, tid is a positive int, so it's at most 207 | // 10 characters. "/stat" is 5 characters, and we need 1 terminating null 208 | // character. Adding all these together, we get 32 characters. 209 | const int procFileNameSize = 32; 210 | // We know the types of all the fields in /proc/self//stat, and can bound 211 | // their length to get the maximum buffer size we need, much the same way as 212 | // above. See P56392714 for the arithmetic. 213 | const int procFileContentsSize = 968; 214 | 215 | char filename[procFileNameSize]; 216 | // What we want here is: 217 | // snprintf(filename, sizeof(filename), "/prof/self/task/%d/stat", tid_); 218 | // But there are snprintf paths that can call malloc. Rather than try to 219 | // reason about the conditions under which this happens, we'll do our own 220 | // string printing. 
221 | const char* filenamePrefix = "/proc/self/task/"; 222 | const char* filenameSuffix = "/stat"; 223 | std::strcpy(filename, filenamePrefix); 224 | char* tidStart = filename + std::strlen(filenamePrefix); 225 | char* suffixStart = rseqItoa(tid_, tidStart); 226 | std::strcpy(suffixStart, filenameSuffix); 227 | 228 | char procFileContents[procFileContentsSize]; 229 | 230 | int fd = open(filename, O_RDONLY); 231 | if (fd == -1) { 232 | return -1; 233 | } 234 | // To get atomicity, we want to read the whole file (well, the part of it that 235 | // we care about anyway) in a single read() call. We retry in case a signal 236 | // causes a length of -1. 237 | ssize_t length = -1; 238 | for (int i = 0; i < 10 && length == -1; ++i) { 239 | length = read(fd, procFileContents, procFileContentsSize); 240 | } 241 | int cpu = tryParseCpu(procFileContents, length); 242 | close(fd); 243 | return cpu; 244 | } 245 | 246 | } // namespace internal 247 | } // namespace rseq 248 | -------------------------------------------------------------------------------- /Rseq.md: -------------------------------------------------------------------------------- 1 | # `Rseq.h` 2 | -------- 3 | 4 | ## Overview 5 | *** 6 | 7 | This is a userspace take on the kernel restartable-sequences API. This allows 8 | efficient per-cpu atomic operations that don't use barriers. A thread can 9 | begin a restartable sequence (henceforth, "rseq"), and do rseq-load's and 10 | rseq-stores. These are just like normal loads and stores (they're efficient 11 | and don't come with any built-in barriers), which one exception: if another 12 | thread has begun an rseq on the same CPU, then the load / store doesn't take 13 | place, and returns an error code instead. 14 | 15 | ## History 16 | *** 17 | 18 | This idea originated with "Fast mutual exclusion for uniprocessors" 19 | (http://dl.acm.org/citation.cfm?id=143523), though similar ideas go back at 20 | least to the 1980s, with "Concurrency Features for the Trellis/Owl Language" 21 | (http://link.springer.com/chapter/10.1007%2F3-540-47891-4_16). "Mostly lock-free 22 | malloc" (http://dl.acm.org/citation.cfm?id=512451) showed some impressive 23 | performance wins by using essentially the same scheme. There has been a recent 24 | resurgence in interest prompted by work done by Google 25 | (http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf), 26 | resulting in a number of attempts to provide support in the Linux kernel, the 27 | most recent of which is at https://lkml.org/lkml/2016/8/19/699 . 28 | 29 | 30 | ## Usage example 31 | *** 32 | 33 | To see why this is useful, let's consider a hypothetical malloc 34 | implementation. At its core is a global data structure that keeps track of 35 | chunks of free memory of various size classes, each size class organized into 36 | a linked list. 37 | 38 | Adding and removing elements from the centralized linked lists will be 39 | expensive because of the synchronization overhead (lots of threads trying to 40 | pull an element off the same linked list will get expensive). So in addition 41 | to the centralized free-lists, we keep a per-thread cache. 42 | 43 | Here's how the fast path alloc/free from a size-class might look then. 44 | ThreadLocalSizeClassCache::head is the head of a linked-list based stack of 45 | free memory. 
46 | 
47 |     void free(void* memory) {
48 |       ThreadLocalSizeClassCache* cache = myTLD()->sizeClassCacheForPtr(memory);
49 |       *(void**) memory = cache->head;
50 |       cache->head = memory;
51 |     }
52 | 
53 |     void* alloc(size_t size) {
54 |       ThreadLocalSizeClassCache* cache = myTLD()->sizeClassCacheForSize(size);
55 |       if (cache->head == nullptr) {
56 |         return getMemoryFromCentralFreeList(cache->sizeClass);
57 |       }
58 |       void* result = cache->head;
59 |       cache->head = *(void**)cache->head;
60 |       return result;
61 |     }
62 | 
63 | But this approach has some problems. One big one is memory usage; to avoid
64 | the locking overhead of the central free-lists, we need caches to be big. But
65 | an N-byte cache per thread for T threads means we need N * T bytes reserved
66 | in caches. It wouldn't be unrealistic for N to be on the order of millions
67 | and T on the order of thousands. That's gigabytes of memory just sitting
68 | around waiting to be used.
69 | 
70 | To save memory, we'll try per-CPU caching: make the linked-list stack where
71 | we keep freed memory a per-cpu data structure instead of a per-thread one.
72 | Since there can be tens or even hundreds of threads per CPU, we may hope for
73 | a dramatic reduction in memory sitting around unused in caches.
74 | 
75 |     void free(void* ptr) {
76 |       CpuLocalSizeClassCache* cache = myCLD()->sizeClassCacheForPtr(ptr);
77 |       void* head;
78 |       do {
79 |         head = cache->head;
80 |         *(void**) ptr = head;
81 |       } while (!compareAndSwap(&cache->head, head, ptr));
82 |     }
83 | 
84 |     void* alloc(size_t size) {
85 |       while (true) {
86 |         CpuLocalSizeClassCache* cache = myCLD()->sizeClassCacheForSize(size);
87 |         void* result = cache->head;
88 |         if (result == nullptr) {
89 |           return getMemoryFromCentralFreeList(cache->sizeClass);
90 |         }
91 |         void* newHead = *(void**) result;
92 |         if (compareAndSwap(&cache->head, result, newHead)) {
93 |           return result;
94 |         }
95 |       }
96 |     }
97 | 
98 | There are two problems here:
99 | 1. We have a compare-and-swap on the fast paths for both allocation and free.
100 |    Even assuming cache hits, this is expensive.
101 | 2. There is an ABA problem in alloc. Resolving it involves strategies that
102 |    are complicated, error-prone, and slow.
103 | 
104 | Both of these problems are caused by the fact that a thread doesn't have any
105 | way of knowing if another thread will run between the loading of cache->head
106 | and the subsequent modification of it. This is exactly the problem that rseq
107 | can solve.
108 | 
109 | Here's how it looks:
110 | 
111 |     void free(void* ptr) {
112 |       while (true) {
113 |         int cpu = rseq::begin();
114 |         CpuLocalSizeClassCache* cache = cldFor(cpu)->sizeClassCacheForPtr(ptr);
115 |         *(void**) ptr = cache->head;
116 |         if (rseq::store(&cache->head, ptr)) {
117 |           return;
118 |         }
119 |       }
120 |     }
121 | 
122 |     void* alloc(size_t size) {
123 |       while (true) {
124 |         int cpu = rseq::begin();
125 |         CpuLocalSizeClassCache* cache = cldFor(cpu)->sizeClassCacheForSize(size);
126 | 
127 |         void* result = cache->head;
128 |         if (result == nullptr) {
129 |           return getMemoryFromCentralFreeList(cache->sizeClass);
130 |         }
131 |         void* newHead = *(void**) result;
132 |         if (rseq::store(&cache->head, newHead)) {
133 |           return result;
134 |         }
135 |       }
136 |     }
137 | 
138 | This is efficient (an rseq store has very little overhead over a plain
139 | store), and correct (the store to cache->head will fail if another
140 | thread touched the cpu-local data after the call to rseq::begin(), avoiding
141 | the ABA-problem).
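
One wrinkle worth spelling out (this sketch is ours and is not part of the
original example; it reuses the same made-up names, and plays as loose with
the real rseq::load/rseq::store signatures as the code above does): in alloc,
the line "void* newHead = *(void**) result;" reads through memory that another
thread may already have popped off the list and handed out if our rseq has
ended. The following rseq::store will fail and we will retry, so the final
list state stays correct, but the stray read itself can touch freed memory.
Routing that read through rseq::load (see the pointer-chasing warning in
Rseq.h) means the read only happens while the rseq is still live:

    void* alloc(size_t size) {
      while (true) {
        int cpu = rseq::begin();
        CpuLocalSizeClassCache* cache = cldFor(cpu)->sizeClassCacheForSize(size);

        void* result = cache->head;
        if (result == nullptr) {
          return getMemoryFromCentralFreeList(cache->sizeClass);
        }
        // Chase the pointer through rseq::load: if our rseq has already
        // ended, the load fails and we retry instead of reading memory that
        // may have been reused.
        void* newHead;
        if (!rseq::load(&newHead, (void**) result)) {
          continue;
        }
        if (rseq::store(&cache->head, newHead)) {
          return result;
        }
      }
    }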
142 | 
143 | 
144 | ## Implementation
145 | ***
146 | 
147 | We'll cover rseq::store only; the other functions are similar. Each thread
148 | gets its own copy of the following function:
149 | 
150 |     bool storeImpl(uint64_t* dst, uint64_t val) {
151 |      do_store:
152 |       *dst = val;
153 |      success_path:
154 |       return success;
155 |      failure_path:
156 |       return failure;
157 |     }
158 | 
159 | That is to say, we dynamically generate the assembly for storeImpl once per
160 | thread. Note that failure_path is unreachable as written.
161 | 
162 | Additionally, there is a global cache that maps cpu -> thread owning that
163 | cpu, and a thread-local int that indicates the CPU a thread thinks it's
164 | running on. In rseq::begin(), we see if globalCpuOwner[myCachedCpu] == me,
165 | and if so, return myCachedCpu.
166 | 
167 | The interesting case is when that check fails. If that happens, we
168 | update myCachedCpu, and look at globalCpuOwner[myCachedCpu] with the new
169 | value of myCachedCpu. We're going to block that thread's stores. We do so by
170 | patching the victim thread's copy of storeImpl to instead look like:
171 | 
172 |     bool storeImpl(uint64_t* dst, uint64_t val) {
173 |      do_store:
174 |       goto failure_path; // Store instruction has been overwritten with a jump!
175 |      success_path:
176 |       return success;
177 |      failure_path:
178 |       return failure;
179 |     }
180 | 
181 | After this patch becomes visible to the victim, we know that any victim rseqs
182 | are done, and we may proceed; we CAS ourselves into becoming the owner of the
183 | CPU and are done.
184 | 
185 | The implementation is slightly more complicated; we need an
186 | asymmetricThreadFenceHeavy() to make sure the victim thread has made its
187 | operations visible and seen the blocking of its operations. By looking at
188 | /proc/self/task/<tid>/stat, we can usually tell if the other thread
189 | has been migrated or simply descheduled, and thereby usually avoid the fence.
190 | As described, we have an ABA issue when a victim thread has its operations
191 | blocked and re-enables them and runs again on the same CPU. We fix this by
192 | having globalCpuOwner[n] store an (owner, evictor) pair rather than just
193 | the owner.
194 | 
195 | Note that if we aren't able to prove that the previous thread running on this
196 | CPU has been descheduled (say, because thread migrations are very frequent),
197 | then we have to take a slow path involving an IPI (triggered by an mprotect)
198 | more often. This can cause overheads on the order of microseconds per scheduling
199 | quantum.
200 | 
201 | 
202 | ## Dangers
203 | ***
204 | 
205 | We break a few rules at several layers of the stack. These are described below.
206 | To increase our confidence that breaking these rules won't bite us in practice,
207 | we include some stress tests (see `README.md` for more information on how to
208 | build and run tests).
209 | 
210 | ### The CPU
211 | Our approach (patching a store to a jump without synchronization) is officially
212 | disallowed by the Intel architecture manuals
213 | (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
214 | section 8.1.3, "Handling Self- and Cross-Modifying Code"). As a practical
215 | matter, no problems have appeared under our stress testing. The AMD manuals
216 | (http://support.amd.com/TechDocs/24593.pdf section 7.6.1) guarantee that it
217 | works.
Reading between the lines a little bit, I think this is likely safe
218 | (we're patching a single-micro-op instruction to another single-micro-op
219 | instruction at a word-aligned boundary that is only ever jumped to). Windows
220 | hot-patching points do something similar (patching a NOP to a jump instead of a
221 | store), so hopefully Intel will stay conservative about breaking this sort of behavior.
222 | 
223 | An alternative approach would be to abuse the breakpoint mechanism (int3). We
224 | install a sigtrap handler that checks if the breakpoint we hit was on our copy
225 | of the store function, and if so moves the pc to the failure path. The evicting
226 | thread sets the breakpoint on the victim's copy of store and does an
227 | asymmetricThreadFenceHeavy(). This assumes that cross-modifying breakpoint
228 | insertion is allowed. This isn't stated explicitly, but the assumption is used
229 | in the Linux kernel, so Intel will have a harder time breaking it. A fancier
230 | variant is the following:
231 | - Insert the breakpoint on the store.
232 | - asymmetricThreadFenceHeavy()
233 | - Change the rest of the bytes in the store to a jump.
234 | - asymmetricThreadFenceHeavy()
235 | - Change the breakpoint to the first byte of the jump.
236 | This makes it far less likely that the victim will have to hit the breakpoint.
237 | In either case, we can try to use the /proc/self/task/<tid>/stat check mentioned
238 | above to avoid the asymmetricThreadFenceHeavy()s.
239 | 
240 | A completely safe but slower approach is to put each thread's copies of its
241 | functions on a page specific to that thread. An evicting thread removes the
242 | execute permissions of the victim thread's page to stop it, and the victim fixes
243 | things up in a sigsegv handler.
244 | 
245 | The advantages of the current approach over the others are speed (no cross-core
246 | activity on the fast path) and the fact that it does not need to steal a signal.
247 | 
248 | 
249 | ### The kernel
250 | 
251 | There are two issues here.
252 | 
253 | #### The mprotect hack
254 | We assume that our asymmetricThreadFenceHeavy() call gets the effect of a
255 | sys_membarrier() for cheap (i.e. without descheduling the calling thread). This
256 | works for now, about which Linus says "I'd be a bit leery about it"
257 | (https://lists.lttng.org/pipermail/lttng-dev/2015-March/024269.html).
258 | 
259 | #### Trusting `/proc/self/task/<tid>/stat`
260 | To avoid the cost of the asymmetricThreadFenceHeavy() down the fast path where
261 | the victim has been descheduled rather than changed CPUs, we read its CPU out of
262 | /proc and check that it's still assigned to our CPU; we then know that it will
263 | see the eviction. This works because the task's CPU is updated on the old CPU
264 | before it changes CPUs and begins running. If the kernel changes this, we'll break.
265 | 
266 | 
267 | ### The compiler
268 | 
269 | We have a few bits of undefined behavior:
270 | 
271 | - We manipulate pointers via a uintptr_t, and reinterpret the manipulated
272 |   address as a pointer.
273 | - There are a few instances of what I think are strict aliasing violations (the
274 |   code patching, rseq_repr_t, maybe elsewhere).
275 | - We use volatile as a stand-in for real atomics in places where we need C99
276 |   compatibility, and use heuristic arguments about compiler reorderings and the
277 |   fact that we're only concerned with x86 (see the sketch below).
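
To make that last bullet concrete, here is a minimal illustrative sketch
(ours, not code from this library; the names are invented) of the kind of
volatile-plus-compiler-barrier pattern it describes. The asm statement only
stops the compiler from reordering; the claim that no hardware fence is needed
rests entirely on x86's strong (TSO) memory model:

    /* C99-compatible stand-in for a relaxed atomic flag. */
    static volatile int ops_blocked;

    static void block_ops(void) {
      ops_blocked = 1;
      __asm__ __volatile__("" ::: "memory"); /* compiler barrier only */
    }

    static int ops_are_blocked(void) {
      __asm__ __volatile__("" ::: "memory");
      return ops_blocked;
    }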
278 | -------------------------------------------------------------------------------- /rseq/RseqTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/Rseq.h" 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | #include "rseq/internal/CpuLocal.h" 25 | #include "rseq/internal/SwitchToCpu.h" 26 | 27 | TEST(RseqMemberAddr, GetsAddresses) { 28 | struct Type { 29 | int field1; 30 | char field2; 31 | float arrayField[17]; 32 | double trailingField; 33 | }; 34 | Type* t = new Type; 35 | EXPECT_EQ(&t->field1, RSEQ_MEMBER_ADDR(t, field1)); 36 | EXPECT_EQ(&t->field2, RSEQ_MEMBER_ADDR(t, field2)); 37 | EXPECT_EQ(&t->arrayField[0], RSEQ_MEMBER_ADDR(t, arrayField)); 38 | EXPECT_EQ(&t->arrayField[0], &RSEQ_MEMBER_ADDR(t, arrayField)[0]); 39 | EXPECT_EQ(&t->arrayField[11], &RSEQ_MEMBER_ADDR(t, arrayField)[11]); 40 | EXPECT_EQ(&t->arrayField[11], RSEQ_MEMBER_ADDR(t, arrayField) + 11); 41 | EXPECT_EQ(&t->trailingField, RSEQ_MEMBER_ADDR(t, trailingField)); 42 | delete t; 43 | // t is deleted; if sanitiizers are going to complain about these, they'll do 44 | // it now. 45 | void* volatile ignored; 46 | ignored = RSEQ_MEMBER_ADDR(t, field1); 47 | ignored = RSEQ_MEMBER_ADDR(t, field2); 48 | ignored = RSEQ_MEMBER_ADDR(t, arrayField) + 11; 49 | ignored = RSEQ_MEMBER_ADDR(t, trailingField); 50 | 51 | // Make sure that it even works with null. 52 | t = nullptr; 53 | ignored = RSEQ_MEMBER_ADDR(t, field2); 54 | 55 | // Silence warnings about an unused variable. 
56 | (void) ignored; 57 | } 58 | 59 | TEST(RseqMemberAddr, PreservesQualifiers) { 60 | enum Qualification { 61 | kInvalid, 62 | kUnqualified, 63 | kConst, 64 | kVolatile, 65 | kConstVolatile, 66 | }; 67 | struct DoesStores { 68 | void doStore(Qualification* qualification) { 69 | *qualification = kUnqualified; 70 | } 71 | void doStore(Qualification* qualification) const { 72 | *qualification = kConst; 73 | } 74 | void doStore(Qualification* qualification) volatile { 75 | *qualification = kVolatile; 76 | } 77 | void doStore(Qualification* qualification) const volatile { 78 | *qualification = kConstVolatile; 79 | } 80 | }; 81 | 82 | struct Holder { 83 | DoesStores doesStores; 84 | }; 85 | 86 | Holder holder; 87 | Holder* unqualifiedHolder = &holder; 88 | const Holder* constHolder = &holder; 89 | volatile Holder* volatileHolder = &holder; 90 | const volatile Holder* constVolatileHolder = &holder; 91 | 92 | Qualification qualification = kInvalid; 93 | 94 | RSEQ_MEMBER_ADDR(unqualifiedHolder, doesStores)->doStore(&qualification); 95 | EXPECT_EQ(kUnqualified, qualification); 96 | 97 | RSEQ_MEMBER_ADDR(constHolder, doesStores)->doStore(&qualification); 98 | EXPECT_EQ(kConst, qualification); 99 | 100 | RSEQ_MEMBER_ADDR(volatileHolder, doesStores)->doStore(&qualification); 101 | EXPECT_EQ(kVolatile, qualification); 102 | 103 | RSEQ_MEMBER_ADDR(constVolatileHolder, doesStores)->doStore(&qualification); 104 | EXPECT_EQ(kConstVolatile, qualification); 105 | } 106 | 107 | // It's hard to verify that the atomics actually act atomic; we just make sure 108 | // the things that ought to compile do. 109 | TEST(RseqValue, ActsLikeAtomic) { 110 | rseq::Value i0; 111 | rseq::Value i1(1); 112 | rseq::Value i2{1}; 113 | rseq::Value d; 114 | 115 | rseq::Value s; 116 | short s2 = s = 1; 117 | s.store(1); 118 | s.store(1, std::memory_order_relaxed); 119 | EXPECT_EQ(1, s.load()); 120 | EXPECT_EQ(1, s.load(std::memory_order_acquire)); 121 | EXPECT_EQ(1, s.exchange(2)); 122 | EXPECT_EQ(2, s.load()); 123 | EXPECT_EQ(2, s.exchange(2, std::memory_order_relaxed)); 124 | short expected = 1; 125 | EXPECT_FALSE(s.compare_exchange_weak(expected, 3)); 126 | EXPECT_EQ(2, expected); 127 | EXPECT_TRUE(s.compare_exchange_weak(expected, 3)); 128 | s.compare_exchange_weak(expected, 0, std::memory_order_relaxed); 129 | s.compare_exchange_weak( 130 | expected, 0, std::memory_order_relaxed, std::memory_order_relaxed); 131 | expected = 1; 132 | s.store(2); 133 | EXPECT_FALSE(s.compare_exchange_strong(expected, 3)); 134 | EXPECT_EQ(2, expected); 135 | EXPECT_TRUE(s.compare_exchange_strong(expected, 3)); 136 | s.compare_exchange_strong(expected, 0, std::memory_order_relaxed); 137 | s.compare_exchange_strong( 138 | expected, 0, std::memory_order_relaxed, std::memory_order_relaxed); 139 | } 140 | 141 | TEST(Rseq, StoresCorrectly) { 142 | std::uint64_t threadsPerCore = 200; 143 | std::uint64_t incrementsPerThread = 1000000; 144 | std::uint64_t numCores = rseq::internal::numCpus(); 145 | std::uint64_t numThreads = threadsPerCore * numCores; 146 | 147 | rseq::internal::CpuLocal> counters; 148 | for (int i = 0; i < numCores; ++i) { 149 | *counters.forCpu(i) = 0; 150 | } 151 | std::vector threads(numThreads); 152 | for (int i = 0; i < numThreads; ++i) { 153 | threads[i] = std::thread([&]() { 154 | for (int j = 0; j < incrementsPerThread; ++j) { 155 | while (true) { 156 | int cpu = rseq::begin(); 157 | rseq::Value* target = counters.forCpu(cpu); 158 | if (rseq::store(target, target->load() + 1)) { 159 | break; 160 | } 161 | } 162 | } 163 | }); 164 
| } 165 | for (int i = 0; i < numThreads; ++i) { 166 | threads[i].join(); 167 | } 168 | std::uint64_t sum = 0; 169 | for (int i = 0; i < numCores; ++i) { 170 | sum += *counters.forCpu(i); 171 | } 172 | EXPECT_EQ(numThreads * incrementsPerThread, sum); 173 | } 174 | 175 | TEST(Rseq, StoreFencesCorrectly) { 176 | // First test that it does a store. 177 | rseq::Value dst(0); 178 | /* int cpu = */ rseq::begin(); 179 | EXPECT_TRUE(rseq::store(&dst, 1)); 180 | EXPECT_EQ(1, dst.load()); 181 | 182 | // Can't test fencing with only one processor. 183 | if (rseq::internal::numCpus() < 2) { 184 | return; 185 | } 186 | // We test fencing with dekker locking. The protected data is the counter 187 | // below. 188 | const int kIncrementsPerThread = 10000000; 189 | std::uint64_t counter1 = 0; 190 | std::uint64_t counter2 = 0; 191 | alignas(64) rseq::Value turn; 192 | alignas(64) std::atomic interested0; 193 | alignas(64) std::atomic interested1; 194 | std::atomic* interested[] = {&interested0, &interested1}; 195 | 196 | std::thread threads[2]; 197 | for (int i = 0; i < 2; ++i) { 198 | threads[i] = std::thread([&, i]() { 199 | rseq::internal::switchToCpu(i); 200 | for (int j = 0; j < kIncrementsPerThread; ++j) { 201 | EXPECT_EQ(i, rseq::begin()); 202 | interested[i]->store(true, std::memory_order_relaxed); 203 | EXPECT_TRUE(rseq::storeFence(&turn, 1 - i)); 204 | while (interested[1 - i]->load() && turn.load() != i) { 205 | // spin 206 | } 207 | EXPECT_TRUE(counter1 == counter2); 208 | ++counter1; 209 | ++counter2; 210 | interested[i]->store(false, std::memory_order_release); 211 | } 212 | }); 213 | } 214 | for (int i = 0; i < 2; ++i) { 215 | threads[i].join(); 216 | } 217 | EXPECT_EQ(2 * kIncrementsPerThread, counter1); 218 | EXPECT_EQ(2 * kIncrementsPerThread, counter2); 219 | } 220 | 221 | TEST(Rseq, LoadsCorrectly) { 222 | int numThreads = 10; 223 | int rseqsPerThread = 100; 224 | 225 | rseq::Value value(0); 226 | std::atomic numThreadsAlive(numThreads); 227 | std::vector threads(numThreads); 228 | for (int i = 0; i < numThreads; ++i) { 229 | threads[i] = std::thread([&, i]() { 230 | rseq::internal::switchToCpu(0); 231 | for (int j = 0; j < rseqsPerThread; ++j) { 232 | int cpu = rseq::begin(); 233 | EXPECT_EQ(0, cpu); 234 | if (!rseq::store(&value, i)) { 235 | continue; 236 | } 237 | while (true) { 238 | if (numThreadsAlive.load() == 1) { 239 | break; 240 | } 241 | std::uint64_t loadedValue = numThreads + 1; 242 | if (!rseq::load(&loadedValue, &value)) { 243 | EXPECT_EQ(numThreads + 1, loadedValue); 244 | break; 245 | } 246 | EXPECT_EQ(i, loadedValue); 247 | } 248 | } 249 | numThreadsAlive.fetch_sub(1); 250 | }); 251 | } 252 | for (int i = 0; i < numThreads; ++i) { 253 | threads[i].join(); 254 | } 255 | } 256 | 257 | TEST(Rseq, EndsCorrectly) { 258 | // A call to end() has no observable behavior; we test to make sure that it 259 | // won't cause crashes, but not much else. 
260 | int numThreads = 100; 261 | int incrementsPerRseq = 100; 262 | int numRseqs = 10000; 263 | std::vector threads(numThreads); 264 | 265 | rseq::Value counter(0); 266 | std::atomic atomicCounter(0); 267 | 268 | for (int i = 0; i < numThreads; ++i) { 269 | threads[i] = std::thread([&]() { 270 | std::uint64_t localCounter = 0; 271 | rseq::internal::switchToCpu(0); 272 | for (int j = 0; j < numRseqs; ++j) { 273 | int cpu = rseq::begin(); 274 | EXPECT_EQ(0, cpu); 275 | for (int k = 0; k < incrementsPerRseq; ++k) { 276 | std::uint64_t view = counter.load(); 277 | bool success = rseq::store(&counter, view + 1); 278 | if (!success) { 279 | break; 280 | } 281 | ++localCounter; 282 | } 283 | rseq::end(); 284 | } 285 | atomicCounter.fetch_add(localCounter); 286 | }); 287 | } 288 | for (int i = 0; i < numThreads; ++i) { 289 | threads[i].join(); 290 | } 291 | EXPECT_EQ(atomicCounter.load(), counter.load()); 292 | } 293 | 294 | // Very dumb implementation based on spinning, but its enough to test the 295 | // fencing primitives. 296 | class RWLock { 297 | public: 298 | // If fenceWith is positive, we fence with that cpu. If it's -1, we fence with 299 | // *all* CPUs. 300 | explicit RWLock(int fenceWith) 301 | : readersMayBegin_(true), 302 | fenceWith_(fenceWith) { 303 | for (int i = 0; i < rseq::internal::numCpus(); ++i) { 304 | readerCounts_.forCpu(i)->store(0); 305 | } 306 | } 307 | 308 | void lock() { 309 | while (!readersMayBegin_.exchange(false)) { 310 | } 311 | if (fenceWith_ == -1) { 312 | rseq::fence(); 313 | } else { 314 | rseq::fenceWith(fenceWith_); 315 | } 316 | std::int64_t sum; 317 | do { 318 | sum = 0; 319 | for (int i = 0; i < rseq::internal::numCpus(); ++i) { 320 | sum += readerCounts_.forCpu(i)->load(); 321 | } 322 | } while (sum != 0); 323 | } 324 | 325 | void unlock() { 326 | readersMayBegin_.store(true); 327 | } 328 | 329 | void lock_shared() { 330 | while (true) { 331 | int cpu = rseq::begin(); 332 | if (!readersMayBegin_.load()) { 333 | continue; 334 | } 335 | std::int64_t curCount = readerCounts_.forCpu(cpu)->load(); 336 | if (rseq::store(readerCounts_.forCpu(cpu), curCount + 1)) { 337 | break; 338 | } 339 | } 340 | } 341 | 342 | void unlock_shared() { 343 | while (true) { 344 | int cpu = rseq::begin(); 345 | std::int64_t curCount = readerCounts_.forCpu(cpu)->load(); 346 | if (rseq::store(readerCounts_.forCpu(cpu), curCount - 1)) { 347 | break; 348 | } 349 | } 350 | } 351 | 352 | private: 353 | std::atomic readersMayBegin_; 354 | rseq::internal::CpuLocal> readerCounts_; 355 | int fenceWith_; 356 | }; 357 | 358 | void runFenceTest( 359 | int numReaders, 360 | int numReadLocks, 361 | int numWriteLocks, 362 | bool tieReadersToSameCpu) { 363 | rseq::internal::switchToCpu(0); 364 | int fenceWith; 365 | if (tieReadersToSameCpu) { 366 | fenceWith = rseq::internal::numCpus() > 1 ? 
1 : 0; 367 | } else { 368 | fenceWith = -1; 369 | } 370 | 371 | RWLock lock(fenceWith); 372 | std::uint64_t val1 = 0; 373 | std::uint64_t val2 = 0; 374 | 375 | std::vector threads(numReaders); 376 | 377 | for (int i = 0; i < numReaders; ++i) { 378 | threads[i] = std::thread([&, i]() { 379 | if (tieReadersToSameCpu) { 380 | rseq::internal::switchToCpu(fenceWith); 381 | } else { 382 | rseq::internal::switchToCpu(i % rseq::internal::numCpus()); 383 | } 384 | 385 | for (int j = 0; j < numReadLocks; ++j) { 386 | lock.lock_shared(); 387 | EXPECT_TRUE(val1 == val2); 388 | lock.unlock_shared(); 389 | } 390 | }); 391 | } 392 | for (int i = 0; i < numWriteLocks; ++i) { 393 | lock.lock(); 394 | EXPECT_TRUE(val1 == val2); 395 | ++val1; 396 | ++val2; 397 | lock.unlock(); 398 | } 399 | for (int i = 0; i < numReaders; ++i) { 400 | threads[i].join(); 401 | } 402 | } 403 | 404 | TEST(Rseq, FenceWithsCorrectly) { 405 | runFenceTest(10, 100000, 10000000, true); 406 | } 407 | 408 | TEST(Rseq, FencesCorrectly) { 409 | runFenceTest(40, 10000, 100000, false); 410 | } 411 | 412 | TEST(Rseq, ReinitializesCorrectly) { 413 | static pthread_key_t key1; 414 | static pthread_key_t key2; 415 | static pthread_key_t key3; 416 | static std::once_flag once; 417 | static void (*destructor3)(void*) = [](void*) { 418 | rseq::begin(); 419 | }; 420 | static void (*destructor1)(void*) = [](void*) { 421 | rseq::begin(); 422 | pthread_setspecific(key3, reinterpret_cast(3)); 423 | }; 424 | static void (*destructor2)(void*) = [](void*) { 425 | rseq::begin(); 426 | }; 427 | 428 | std::call_once(once, []() { 429 | pthread_key_create(&key1, destructor1); 430 | pthread_key_create(&key2, destructor2); 431 | pthread_key_create(&key3, destructor3); 432 | }); 433 | std::thread t([&]() { 434 | pthread_setspecific(key1, reinterpret_cast(1)); 435 | rseq::begin(); 436 | pthread_setspecific(key2, reinterpret_cast(2)); 437 | }); 438 | t.join(); 439 | } 440 | -------------------------------------------------------------------------------- /rseq/Rseq.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include "rseq/internal/Likely.h" 17 | #include "rseq/internal/Rseq.h" 18 | #include "rseq/internal/rseq_c.h" 19 | 20 | namespace rseq { 21 | 22 | template 23 | class Value; 24 | 25 | template 26 | bool load(T* dst, const Value* src); 27 | 28 | template 29 | bool store(Value* dst, U&& val); 30 | 31 | template 32 | bool storeFence(Value* dst, U&& val); 33 | 34 | // Overview 35 | // 36 | // This is a userspace take on the kernel restartable-sequences API. This allows 37 | // efficient per-cpu atomic operations that don't use barriers. A thread can 38 | // begin a restartable sequence (henceforth, "rseq"), and do rseq-load's and 39 | // rseq-stores. These are just like normal loads and stores (they're efficient 40 | // and don't come with any built-in barriers), which one exception: if another 41 | // thread has begun an rseq on the same CPU, then the load / store doesn't take 42 | // place, and returns an error code instead. 43 | // 44 | // See Rseq.md for a more thorough overview. 
45 | 46 | // Example 47 | // 48 | // It's well known that using CAS, one can implement an arbitrary fetch-and-phi 49 | // operation (where 'phi' is any function from X -> X). When we want to do these 50 | // operations per-cpu, rseq can result in dramatic speed-ups. 51 | // 52 | // Without rseq: 53 | // std::atomic data[kNumCpus]; 54 | // 55 | // int fetchAndSquare() { 56 | // while (true) { 57 | // int cpu = sched_getcpu(); 58 | // int cur = data[cpu].load(std::memory_order_relaxed); 59 | // if (data[cpu].compare_exchange_strong(cur, cur * cur)) { 60 | // return cur; 61 | // } 62 | // } 63 | // } 64 | // 65 | // With rseq: 66 | // rseq::Value data[kNumCpus]; 67 | // 68 | // int fetchAndSquare() { 69 | // while (true) { 70 | // int cpu = rseq::begin(); 71 | // int cur = data[cpu].load(std::memory_order_relaxed); 72 | // if (rseq::store(&data[cpu], cur * cur)) { 73 | // return cur; 74 | // } 75 | // } 76 | // } 77 | // 78 | // This does the same operation, and has about the same complexity, but the rseq 79 | // version is significantly faster; it does a plain store instead of an 80 | // expensive atomic operation. 81 | // 82 | // Rseq can also solve the other tricky issue with concurrent data structures 83 | // built around CAS: the ABA problem. See Rseq.md for a more complete example. 84 | 85 | // Caveats 86 | // 1. The current implementation assumes x86-64 / TSO semantics (this isn't 87 | // fundamental, but is something to keep in mind before trying to port to 88 | // another architecture). 89 | // 90 | // 2. We only support types <= 8 bytes. 91 | // 92 | // 3. Down a slow-path, we may do an operation taking O(microseconds) (at most 93 | // once a scheduling quantum). We try to avoid it, but can't make any 94 | // guarantees. 95 | 96 | // API and memory model specifics 97 | // 98 | // An rseq is started by a call to rseq::begin(). This returns an integer in 99 | // [0, numCpus - 1], intended to be used as an index into per-cpu sharded data; 100 | // the integer tells us which cpu's data we should use). 101 | // The rseq lasts for an unspecified amount of time after the call. It might 102 | // even terminate immediately after beginning; the length of an rseq is a QoI 103 | // issue, not an API guarantee (we try very hard to ensure that an rseq lasts 104 | // at least until the initiating thread gets descheduled). 105 | // 106 | // Rseqs started with the same rseq::begin() return value are totally ordered; 107 | // the stores done in or visible to an rseq with shard index N are always 108 | // visible to subsequent rseqs with shard index N. An rseq may end at any time 109 | // (even spuriously; an rseq may end even if no other thread has begun an rseq 110 | // since this one began). Therefore, a thread that reads some sharded data 111 | // within an rseq should almost always ensure that the view it got was 112 | // consistent, by checking that the rseq is still ongoing at some point after 113 | // the reads are done. 114 | // 115 | // A warning on pointer-chasing: 116 | // Rseqs have seqlock-like semantics. The data you read might not be consistent; 117 | // the only way to be sure you saw a consistent view of things is if you find 118 | // that the rseq is ongoing at some point after you read some data. Following a 119 | // pointer is dangerous unless you're sure that the pointed-to data will still 120 | // be alive even if you rseq has ended at the time of the read. This is done 121 | // most easily by reading any unsafe data through rseq::load(). 
Note that you
122 | // probably want to use RSEQ_MEMBER_ADDR if you do this.
123 | //
124 | // rseq::Value objects are API-compatible with std::atomics, including the use
125 | // of std::memory_orders (with the same semantics).
126 | //
127 | // RSEQ_MEMBER_ADDR macro:
128 | // In general, we use rseq::load because we want to load a member from a struct
129 | // whose existence we aren't sure about. But if we have a SomeType* someTypePtr,
130 | // it's undefined behavior to do *anything at all* with it unless we know that
131 | // the pointed-to memory has not been freed. This macro doesn't fix that, but it
132 | // attempts to obscure the fact well enough to ensure that we don't actually let
133 | // the compiler break us, and doesn't trigger asan/ubsan/msan warnings. In
134 | // particular, it never dereferences its argument, even purely syntactically.
135 | //
136 | // The macro pre-decays array fields. This is almost always what you want. Example:
137 | //
138 | //   struct Foo {
139 | //     float justSomeRandomData;
140 | //     bool someOtherPieceOfData;
141 | //     int arr[22];
142 | //   };
143 | //   Foo* foo;
144 | //   auto ptr = RSEQ_MEMBER_ADDR(foo, arr);
145 | //
146 | // Then "ptr" is of type int*, not int (*)[22]. Writing
147 | // "&RSEQ_MEMBER_ADDR(foo, arr)[7]" gives a pointer to the 7th element of the
148 | // arr field of the object foo points to. Note that we shouldn't ever write that
149 | // though, since if we know foo is safe to dereference, we don't need this macro
150 | // at all. Instead we want "RSEQ_MEMBER_ADDR(foo, arr) + 7". This gives
151 | // a pointer to the element without dereferencing it.
152 | //
153 | // The macro preserves const and volatile qualifiers.
154 | //
155 | // "ptr" and "member" should be plain identifier names; advanced syntactic
156 | // constructs like commas are not supported.
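// A small illustrative sketch (ours; "Node", "head", and "next" are made-up
// names, and the retry control flow is elided) combining rseq::load with
// RSEQ_MEMBER_ADDR to chase a pointer into memory that may already have been
// freed, without ever syntactically dereferencing it:
//
//   struct Node {
//     rseq::Value<Node*> next;
//   };
//   rseq::Value<Node*>* head = /* some per-cpu list head */;
//
//   int cpu = rseq::begin();
//   Node* first;
//   if (!rseq::load(&first, head)) { /* rseq over; start again */ }
//   // &first->next, but without dereferencing first:
//   Node* second;
//   if (!rseq::load(&second, RSEQ_MEMBER_ADDR(first, next))) { /* start again */ }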
157 | template 158 | struct ReferenceRemoveExtent; 159 | template 160 | struct ReferenceRemoveExtent { 161 | typedef typename std::remove_extent::type& type; 162 | }; 163 | #define RSEQ_MEMBER_ADDR(ptr, member) \ 164 | (&reinterpret_cast< \ 165 | rseq::ReferenceRemoveExtentmember))>::type>( \ 166 | *const_cast( \ 167 | reinterpret_cast(ptr) \ 168 | + offsetof( \ 169 | std::remove_reference::type, member)))) 170 | 171 | 172 | template 173 | class Value { 174 | public: 175 | static_assert(sizeof(std::atomic) <= 8, 176 | "Can only have a Value when T is <= 8 bytes and can be atomic!"); 177 | 178 | Value() = default; 179 | explicit constexpr Value(T t) : repr_(toRepr(t)) {} 180 | Value(const Value&) = delete; 181 | 182 | Value& operator=(const Value&) = delete; 183 | 184 | T operator=(T t) { 185 | repr_ = toRepr(t); 186 | return t; 187 | } 188 | 189 | bool is_lock_free() const { 190 | return true; 191 | } 192 | 193 | static constexpr bool is_always_lock_free() { 194 | return true; 195 | } 196 | 197 | void store(T val, std::memory_order order = std::memory_order_seq_cst) { 198 | repr_.store(toRepr(val), order); 199 | } 200 | 201 | T load(std::memory_order order = std::memory_order_seq_cst) const { 202 | return fromRepr(repr_.load(order)); 203 | } 204 | 205 | /* implicit */ operator T() const { 206 | return load(); 207 | } 208 | 209 | T exchange(T desired, std::memory_order order = std::memory_order_seq_cst) { 210 | return fromRepr(repr_.exchange(toRepr(desired), order)); 211 | } 212 | 213 | bool compare_exchange_weak( 214 | T& expected, T desired, 215 | std::memory_order successOrder, std::memory_order failureOrder) { 216 | unsigned long expectedRepr = toRepr(expected); 217 | unsigned long desiredRepr = toRepr(desired); 218 | bool result = repr_.compare_exchange_weak( 219 | expectedRepr, desiredRepr, successOrder, failureOrder); 220 | expected = fromRepr(expectedRepr); 221 | return result; 222 | } 223 | 224 | bool compare_exchange_weak( 225 | T& expected, T desired, 226 | std::memory_order order = std::memory_order_seq_cst) { 227 | unsigned long expectedRepr = toRepr(expected); 228 | unsigned long desiredRepr = toRepr(desired); 229 | bool result = repr_.compare_exchange_weak(expectedRepr, desiredRepr, order); 230 | expected = fromRepr(expectedRepr); 231 | return result; 232 | } 233 | 234 | bool compare_exchange_strong( 235 | T& expected, T desired, 236 | std::memory_order successOrder, std::memory_order failureOrder) { 237 | unsigned long expectedRepr = toRepr(expected); 238 | unsigned long desiredRepr = toRepr(desired); 239 | bool result = repr_.compare_exchange_strong( 240 | expectedRepr, desiredRepr, successOrder, failureOrder); 241 | expected = fromRepr(expectedRepr); 242 | return result; 243 | } 244 | 245 | bool compare_exchange_strong( 246 | T& expected, T desired, 247 | std::memory_order order = std::memory_order_seq_cst) { 248 | unsigned long expectedRepr = toRepr(expected); 249 | unsigned long desiredRepr = toRepr(desired); 250 | bool result = repr_.compare_exchange_strong( 251 | expectedRepr, desiredRepr, order); 252 | expected = fromRepr(expectedRepr); 253 | return result; 254 | } 255 | 256 | // We don't implement the numeric operations. I think we could, but I'm not 257 | // knowledgeable enough about the numeric conversion rules to be sure (it's 258 | // tricky, because we would need to e.g. implement Value::fetch_add in 259 | // terms of atomic::fetch_add). 
260 | // If you actually have a use case for them, we can figure it out then (I'm 261 | // already on the fence about allowing values of size other than 8, so that 262 | // would tip the scales). 263 | 264 | private: 265 | friend bool ::rseq::load(T* dst, const Value* src); 266 | // Can't do partial specialization of friend declarations; we just make store 267 | // with *any* types a friend. 268 | template 269 | friend bool ::rseq::store(Value* dst, V&& val); 270 | template 271 | friend bool ::rseq::storeFence(Value* dst, V&& val); 272 | 273 | // toRepr and fromRepr let us dodge aliasing violations and avoid dealing with 274 | // sizes. 275 | // Note that we static_assert using an std::atomic above, so we know that T 276 | // is trivially copyable. 277 | static unsigned long toRepr(T t) { 278 | unsigned long result = 0; 279 | std::memcpy(&result, &t, sizeof(T)); 280 | return result; 281 | } 282 | 283 | static T fromRepr(unsigned long repr) { 284 | T result; 285 | std::memcpy(&result, &repr, sizeof(T)); 286 | return result; 287 | } 288 | 289 | unsigned long* raw() const { 290 | return reinterpret_cast( 291 | const_cast*>(&repr_)); 292 | } 293 | 294 | std::atomic repr_; 295 | }; 296 | 297 | // Returns a shard index. Ensures that any rseqs on other threads that received 298 | // the same shard index are over before returning. 299 | inline int begin() { 300 | int ret = internal::threadCachedCpu()->load(); 301 | if (RSEQ_UNLIKELY(ret < 0)) { 302 | ret = internal::beginSlowPathWrapper(); 303 | } 304 | return ret; 305 | } 306 | 307 | // Tries to do "*dst = *src;" in the rseq last started by this thread, with 308 | // memory_order_seq_cst semantics. 309 | // If this returns true, then the load was successful and the rseq was not yet 310 | // over at the time of the load. (Note: the store to dst may take place after 311 | // the rseq is over). 312 | // If it returns false, then the rseq ended at some point prior to the call, and 313 | // no load or store occurred. 314 | // May only be called after begin(). 315 | // This is slighly slower than regular atomic loads, so those should be used 316 | // unless the load being part of the rseq is required for correctness (e.g. 317 | // pointer-chasing through dynamically allocated memory). 318 | template 319 | bool load(T* dst, const Value* src) { 320 | // An asymmetricThreadFenceLight() belongs after the load, but we omit it to 321 | // avoid namespace pollution. Invoking the generated code accomplishes the 322 | // same thing. 323 | if (sizeof(T) == 8) { 324 | unsigned long* realDst = reinterpret_cast(dst); 325 | return RSEQ_LIKELY(!rseq_load_trampoline(realDst, src->raw())); 326 | } else { 327 | unsigned long realDst; 328 | bool result = RSEQ_LIKELY(!rseq_load_trampoline(&realDst, src->raw())); 329 | if (result) { 330 | *dst = Value::fromRepr(realDst); 331 | } 332 | return result; 333 | } 334 | } 335 | 336 | // Tries to do "*dst = val;" in the rseq last started by this thread, with 337 | // memory_order_release semantics. 338 | // If this function returns true, then the store was performed, and the rseq was 339 | // not yet over at the time of the store. 340 | // If it returns false, then the rseq ended at some point prior to the call, and 341 | // no store occurred. 342 | // May only be called after begin(). 343 | template 344 | bool store(Value* dst, U&& val) { 345 | // Here as above we omit the asymmetricThreadFenceLight(). 
346 | return RSEQ_LIKELY( 347 | !rseq_store_trampoline( 348 | dst->raw(), 349 | Value::toRepr(static_cast(val)))); 350 | } 351 | 352 | // Tries to do "*dst = val;" in the rseq last started by this thread, with 353 | // memory_order_seq_cst semantics. 354 | // If this function returns true, then the store was performed, and the rseq was 355 | // not yet over at the time of the store. 356 | // If it returns false, then the rseq ended at some point prior to the call, and 357 | // no store occurred. 358 | // May only be called after begin(). 359 | template 360 | bool storeFence(Value* dst, U&& val) { 361 | // Here as above we omit the asymmetricThreadFenceLight(). 362 | return RSEQ_LIKELY( 363 | !rseq_store_fence_trampoline( 364 | dst->raw(), 365 | Value::toRepr(static_cast(val)))); 366 | } 367 | 368 | // If this returns true, then the rseq last started by this thread has not yet 369 | // ended (and therefore, no other thread has called begin() and gotten back the 370 | // same shard index as the calling thread after the calling thread). 371 | inline bool validate() { 372 | Value dummy; 373 | return store(&dummy, 0); 374 | } 375 | 376 | // Ends the current rseq. 377 | // This does an atomic operation; in general it's better to just not do anything 378 | // and wait until you hit a failure in an rseq operation. 379 | // If you know you're likely to get descheduled soon (e.g. you're about to 380 | // sleep), or that a thread on another CPU will try to acquire ownership of the 381 | // current CPU (presumably while you do something else), then calling this first 382 | // can speed up that thread's call to begin(). 383 | inline void end() { 384 | internal::endWrapper(); 385 | } 386 | 387 | // Inserts a synchronization point in the rseq ordering of shard (ending the 388 | // rseq prior to that point). Stores visible to rseqs on that shard before the 389 | // point are visble to this thread after this function returns. Stores visible 390 | // to this thread are visible to rseqs that occur after the point. 391 | // 392 | // This isn't really any faster that fence() in most cases. However: 393 | // - Include fenceWith() makes the description of the memory model effects of 394 | // fence() simpler. 395 | // - There are some optimizations we can apply that will make fenceWith() faster 396 | // than a plain fence(). 397 | inline void fenceWith(int shard) { 398 | internal::fenceWithWrapper(shard); 399 | } 400 | 401 | // Equivalent to, but faster than, a call to fenceWith each each possible 402 | // argument. 403 | inline void fence() { 404 | internal::fenceWrapper(); 405 | } 406 | 407 | } // namespace rseq 408 | -------------------------------------------------------------------------------- /RseqBenchmark.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | /* 11 | 12 | `./rseq_benchmark` for usage. 
13 | 14 | The output of lscpu on my machine: 15 | Architecture: x86_64 16 | CPU op-mode(s): 32-bit, 64-bit 17 | Byte Order: Little Endian 18 | CPU(s): 32 19 | On-line CPU(s) list: 0-31 20 | Thread(s) per core: 2 21 | Core(s) per socket: 8 22 | Socket(s): 2 23 | NUMA node(s): 2 24 | Vendor ID: GenuineIntel 25 | CPU family: 6 26 | Model: 45 27 | Model name: Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz 28 | Stepping: 6 29 | CPU MHz: 2201.000 30 | CPU max MHz: 2201.0000 31 | CPU min MHz: 1200.0000 32 | BogoMIPS: 4405.46 33 | Virtualization: VT-x 34 | L1d cache: 32K 35 | L1i cache: 32K 36 | L2 cache: 256K 37 | L3 cache: 20480K 38 | NUMA node0 CPU(s): 0-7,16-23 39 | NUMA node1 CPU(s): 8-15,24-31 40 | Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid xsaveopt 41 | 42 | 43 | According to some rough benchmarks: 44 | When there are lots of threads 45 | - Counter increments using rseq stores are about 36% slower than ones using 46 | stack variables. 47 | - Counter increments using rseq stores are about 4.2x faster than ones using 48 | per-cpu atomics. 49 | When there is only one thread: 50 | - Counter increments using rseq stores are about 9.8% slower than ones using 51 | stack variables. 52 | - Counter increments using rseq stores are about 5.3x faster than ones using 53 | per-cpu atomics. 54 | 55 | 56 | The output of `./rseq_benchmark threadLocal,rseq,atomicsCachedCpu 256 100000000`: 57 | =========================================================== 58 | Benchmarking Thread-local operations only (no sharing) 59 | Increments: 25600000000 60 | Seconds: 2.739452 61 | TSC ticks: 6026707360 62 | Single-CPU TSC ticks per increment: 0.235418 63 | Global TSC ticks per increment: 7.533384 64 | =========================================================== 65 | =========================================================== 66 | Benchmarking Per-cpu restartable sequences 67 | Increments: 25600000000 68 | Seconds: 3.732481 69 | TSC ticks: 8211339968 70 | Single-CPU TSC ticks per increment: 0.320755 71 | Global TSC ticks per increment: 10.264175 72 | =========================================================== 73 | =========================================================== 74 | Benchmarking Per-cpu atomics (with cached sched_getcpu calls) 75 | Increments: 25600000000 76 | Seconds: 15.678768 77 | TSC ticks: 34492797698 78 | Single-CPU TSC ticks per increment: 1.347375 79 | Global TSC ticks per increment: 43.115997 80 | =========================================================== 81 | 82 | 83 | The output of `./rseq_benchmark threadLocal,rseq,atomicsCachedCpu 1 100000000`: 84 | =========================================================== 85 | Benchmarking Thread-local operations only (no sharing) 86 | Increments: 100000000 87 | Seconds: 0.255986 88 | TSC ticks: 563156988 89 | Single-CPU TSC ticks per increment: 5.631570 90 | Global TSC ticks per increment: 180.210236 91 | =========================================================== 92 | =========================================================== 93 | Benchmarking Per-cpu restartable sequences 94 | Increments: 100000000 95 | Seconds: 0.281085 96 | TSC ticks: 618375013 97 | 
Single-CPU TSC ticks per increment: 6.183750 98 | Global TSC ticks per increment: 197.880004 99 | =========================================================== 100 | =========================================================== 101 | Benchmarking Per-cpu atomics (with cached sched_getcpu calls) 102 | Increments: 100000000 103 | Seconds: 1.478343 104 | TSC ticks: 3252272957 105 | Single-CPU TSC ticks per increment: 32.522730 106 | Global TSC ticks per increment: 1040.727346 107 | =========================================================== 108 | */ 109 | 110 | #include 111 | #include 112 | #include 113 | #include 114 | #include 115 | #include 116 | #include 117 | #include 118 | #include 119 | #include 120 | 121 | #include "rseq/Rseq.h" 122 | #include "rseq/internal/NumCpus.h" 123 | 124 | constexpr int kCachelineSize = 128; 125 | 126 | struct PercpuCounter { 127 | std::atomic atomicCounter; 128 | rseq::Value rseqCounter; 129 | std::mutex mu; 130 | char padding[ 131 | kCachelineSize 132 | - sizeof(atomicCounter) 133 | - sizeof(rseqCounter) 134 | - sizeof(mu)]; 135 | }; 136 | 137 | std::vector counterByCpu; 138 | char padding1[kCachelineSize - sizeof(counterByCpu)]; 139 | 140 | std::mutex contendedMu; 141 | char padding2[kCachelineSize - sizeof(contendedMu)]; 142 | 143 | std::atomic contendedCounter; 144 | 145 | enum TestType { 146 | kLongCriticalSection, 147 | kContendedAtomics, 148 | kContendedLocks, 149 | kRseq, 150 | kAtomics, 151 | kAtomicsCachedCpu, 152 | kLocks, 153 | kLocksCachedCpu, 154 | kThreadLocal, 155 | kTestTypeEnd, 156 | }; 157 | 158 | const char* testTypeString(TestType testType) { 159 | switch (testType) { 160 | case kLongCriticalSection: 161 | return "Long critical section"; 162 | case kContendedAtomics: 163 | return "Contended atomics"; 164 | case kContendedLocks: 165 | return "Contended locks"; 166 | case kRseq: 167 | return "Per-cpu restartable sequences"; 168 | case kAtomics: 169 | return "Per-cpu atomics"; 170 | case kAtomicsCachedCpu: 171 | return "Per-cpu atomics (with cached sched_getcpu calls)"; 172 | case kLocks: 173 | return "Per-cpu locks"; 174 | case kLocksCachedCpu: 175 | return "Per-cpu locks (with cached sched_getcpu calls)"; 176 | case kThreadLocal: 177 | return "Thread-local operations only (no sharing)"; 178 | case kTestTypeEnd: 179 | /* should never happen */ 180 | return nullptr; 181 | } 182 | return nullptr; 183 | } 184 | 185 | void doIncrementsLongCriticalSection(std::uint64_t numIncrements) { 186 | std::lock_guard lg(contendedMu); 187 | for (std::uint64_t i = 0; i < numIncrements; ++i) { 188 | contendedCounter.store(contendedCounter.load(std::memory_order_relaxed) + 1, 189 | std::memory_order_relaxed); 190 | } 191 | } 192 | 193 | void doIncrementsContendedAtomics(std::uint64_t numIncrements) { 194 | for (std::uint64_t i = 0; i < numIncrements; ++i) { 195 | std::uint64_t old = contendedCounter.load(); 196 | while (!contendedCounter.compare_exchange_weak(old, old + 1)) { 197 | } 198 | } 199 | } 200 | 201 | void doIncrementsContendedLocks(std::uint64_t numIncrements) { 202 | for (std::uint64_t i = 0; i < numIncrements; ++i) { 203 | std::lock_guard lg(contendedMu); 204 | contendedCounter.store(contendedCounter.load(std::memory_order_relaxed) + 1, 205 | std::memory_order_relaxed); 206 | } 207 | } 208 | 209 | void doIncrementsRseq(std::uint64_t numIncrements) { 210 | for (std::uint64_t i = 0; i < numIncrements; ++i) { 211 | bool success = false; 212 | do { 213 | int cpu = rseq::begin(); 214 | std::uint64_t curVal = counterByCpu[cpu].rseqCounter.load(); 215 | 
void doIncrementsRseq(std::uint64_t numIncrements) {
  for (std::uint64_t i = 0; i < numIncrements; ++i) {
    bool success = false;
    do {
      int cpu = rseq::begin();
      std::uint64_t curVal = counterByCpu[cpu].rseqCounter.load();
      success = rseq::store(&counterByCpu[cpu].rseqCounter, curVal + 1);
    } while (!success);
  }
}

void doIncrementsAtomics(std::uint64_t numIncrements) {
  for (std::uint64_t i = 0; i < numIncrements; ++i) {
    std::uint64_t old;
    int cpu;
    do {
      cpu = sched_getcpu();
      old = counterByCpu[cpu].atomicCounter.load();
    } while (!counterByCpu[cpu].atomicCounter.compare_exchange_weak(
        old, old + 1));
  }
}

void doIncrementsAtomicsCachedCpu(std::uint64_t numIncrements) {
  for (std::uint64_t i = 0; i < numIncrements;) {
    int cpu = sched_getcpu();
    for (int j = 0; j < 100 && i < numIncrements; ++i, ++j) {
      std::uint64_t old = counterByCpu[cpu].atomicCounter.load();
      if (!counterByCpu[cpu].atomicCounter.compare_exchange_weak(
              old, old + 1)) {
        break;
      }
    }
  }
}

void doIncrementsLocks(std::uint64_t numIncrements) {
  for (std::uint64_t i = 0; i < numIncrements; ++i) {
    int cpu = sched_getcpu();
    std::lock_guard<std::mutex> lg(counterByCpu[cpu].mu);
    counterByCpu[cpu].atomicCounter.store(
        counterByCpu[cpu].atomicCounter.load(std::memory_order_relaxed) + 1,
        std::memory_order_relaxed);
  }
}

void doIncrementsLocksCachedCpu(std::uint64_t numIncrements) {
  for (std::uint64_t i = 0; i < numIncrements;) {
    int cpu = sched_getcpu();
    for (int j = 0; j < 100 && i < numIncrements; ++i, ++j) {
      std::lock_guard<std::mutex> lg(counterByCpu[cpu].mu);
      counterByCpu[cpu].atomicCounter.store(
          counterByCpu[cpu].atomicCounter.load(std::memory_order_relaxed) + 1,
          std::memory_order_relaxed);
    }
  }
}

void doIncrementsThreadLocal(std::uint64_t numIncrements) {
  volatile std::uint64_t counter = 0;
  for (std::uint64_t i = 0; i < numIncrements; ++i) {
    std::uint64_t oldVal = counter;
    counter = oldVal + 1;
  }
  counterByCpu[0].atomicCounter.fetch_add(counter);
}

void printErrorIfNotEqual(std::uint64_t expected, std::uint64_t actual) {
  if (expected != actual) {
    std::printf(
        "Error: actual increment count %lu "
        "does not match expected increment count %lu.\n",
        actual,
        expected);
  }
}

std::uint64_t rdtscp() {
  std::uint32_t ecx;
  std::uint64_t rax, rdx;
  asm volatile ( "rdtscp\n" : "=a" (rax), "=d" (rdx), "=c" (ecx) : : );
  return (rdx << 32) + rax;
}

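// Runs a single benchmark: spawns numThreads threads, each performing
// numIncrements increments, then reports wall-clock seconds and TSC ticks.
// "Single-CPU TSC ticks per increment" is the elapsed ticks divided by the
// total number of increments; "Global TSC ticks per increment" multiplies
// that by the number of cpus to approximate the machine-wide cost of each
// increment.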
void runTest(
    TestType testType,
    std::uint64_t numThreads,
    std::uint64_t numIncrements) {
  contendedCounter.store(0);
  for (unsigned i = 0; i < counterByCpu.size(); ++i) {
    counterByCpu[i].atomicCounter.store(0);
    counterByCpu[i].rseqCounter.store(0);
  }
  void (*benchmarkThreadFunc)(std::uint64_t) =
      testType == kLongCriticalSection ? doIncrementsLongCriticalSection :
      testType == kContendedAtomics ? doIncrementsContendedAtomics :
      testType == kContendedLocks ? doIncrementsContendedLocks :
      testType == kRseq ? doIncrementsRseq :
      testType == kAtomics ? doIncrementsAtomics :
      testType == kAtomicsCachedCpu ? doIncrementsAtomicsCachedCpu :
      testType == kLocks ? doIncrementsLocks :
      testType == kLocksCachedCpu ? doIncrementsLocksCachedCpu :
      testType == kThreadLocal ? doIncrementsThreadLocal :
      nullptr;
  std::printf("===========================================================\n");
  std::printf("Benchmarking %s\n", testTypeString(testType));
  auto beginTime = std::chrono::high_resolution_clock::now();
  std::uint64_t beginCycles = rdtscp();
  std::vector<std::thread> threads(numThreads);
  for (unsigned i = 0; i < numThreads; ++i) {
    threads[i] = std::thread(benchmarkThreadFunc, numIncrements);
  }
  for (unsigned i = 0; i < numThreads; ++i) {
    threads[i].join();
  }
  std::uint64_t endCycles = rdtscp();
  auto endTime = std::chrono::high_resolution_clock::now();
  std::uint64_t expectedIncrements = numThreads * numIncrements;
  std::uint64_t actualIncrements = contendedCounter.load();
  for (std::uint64_t i = 0; i < rseq::internal::numCpus(); ++i) {
    actualIncrements += counterByCpu[i].atomicCounter.load();
    actualIncrements += counterByCpu[i].rseqCounter.load();
  }
  printErrorIfNotEqual(expectedIncrements, actualIncrements);
  std::chrono::nanoseconds duration = endTime - beginTime;
  std::uint64_t ns = duration.count();
  std::uint64_t cycles = endCycles - beginCycles;
  double seconds = static_cast<double>(ns) / 1000000000.0;
  std::printf("Increments: %lu \n", actualIncrements);
  std::printf("Seconds: %f\n", seconds);
  std::printf("TSC ticks: %lu \n", cycles);
  double myCycles = static_cast<double>(cycles) / actualIncrements;
  std::printf("Single-CPU TSC ticks per increment: %f\n", myCycles);
  std::printf("Global TSC ticks per increment: %f\n",
              rseq::internal::numCpus() * myCycles);
  std::printf("===========================================================\n");
}

const char* usage = R"(Usage: %s benchmarks num_threads increments_per_thread
Where 'benchmarks' is either 'all', or a comma-separated list containing the
benchmarks to run:
  longCriticalSection: Each thread acquires a single shared lock, does all
                       its increments, and releases the lock.

  contendedAtomics:    Each thread updates a global counter with a CAS.

  contendedLocks:      Each thread acquires and releases a global lock for
                       each counter increment.

  rseq:                Threads increment cpu-local counters using restartable
                       sequences.

  atomics:             Threads increment cpu-local counters using CASs.

  atomicsCachedCpu:    Threads increment cpu-local counters using CASs, but
                       only call sched_getcpu once every 100 increments (or
                       until contention is detected).

  locks:               Threads increment cpu-local counters, protecting their
                       increments with locks.

  locksCachedCpu:      Threads increment cpu-local counters, protecting their
                       increments with locks, but only call sched_getcpu once
                       every 100 increments.

  threadLocal:         Threads increment thread-local counters, with no
                       synchronization.
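// Parses the 'benchmarks' argument: either "all" or a comma-separated list of
// the names described in the usage string above; exits on an unknown name.
// Example invocations (the runs shown in the header comment):
//   ./rseq_benchmark threadLocal,rseq,atomicsCachedCpu 256 100000000
//   ./rseq_benchmark threadLocal,rseq,atomicsCachedCpu 1 100000000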
)";

std::vector<TestType> parseBenchmarks(const char* benchmarks) {
  if (!strcmp(benchmarks, "all")) {
    return {
      kLongCriticalSection,
      kContendedAtomics,
      kContendedLocks,
      kRseq,
      kAtomics,
      kAtomicsCachedCpu,
      kLocks,
      kLocksCachedCpu,
      kThreadLocal
    };
  }

  std::vector<TestType> result;

  const char* benchmarksEnd = benchmarks + strlen(benchmarks);

  const char* tokBegin = benchmarks;
  while (true) {
    const char* tokEnd = std::strpbrk(tokBegin, ",");
    if (tokEnd == nullptr) {
      tokEnd = benchmarksEnd;
    }

    // Require the token to match the full benchmark name, not just a prefix
    // of it.
    auto matches = [&](const char* str) {
      return std::equal(tokBegin, tokEnd, str) &&
          str[tokEnd - tokBegin] == '\0';
    };

    TestType testType =
        matches("longCriticalSection") ? kLongCriticalSection :
        matches("contendedAtomics") ? kContendedAtomics :
        matches("contendedLocks") ? kContendedLocks :
        matches("rseq") ? kRseq :
        matches("atomics") ? kAtomics :
        matches("atomicsCachedCpu") ? kAtomicsCachedCpu :
        matches("locks") ? kLocks :
        matches("locksCachedCpu") ? kLocksCachedCpu :
        matches("threadLocal") ? kThreadLocal :
        kTestTypeEnd;

    if (testType == kTestTypeEnd) {
      std::printf(
          "Error: unknown benchmark type at the beginning of \"%s\"\n",
          tokBegin);
      std::exit(1);
    }
    result.push_back(testType);

    if (tokEnd == benchmarksEnd) {
      break;
    }
    tokBegin = tokEnd + 1;
  }
  return result;
}

int main(int argc, char** argv) {
  if (argc != 4) {
    std::printf(usage, argv[0]);
    std::exit(1);
  }

  std::uint64_t numThreads;
  std::uint64_t numIncrements;

  std::vector<TestType> benchmarks = parseBenchmarks(argv[1]);

  numThreads = atol(argv[2]);
  numIncrements = atol(argv[3]);

  if (numThreads == 0 || numIncrements == 0) {
    std::printf("Error: invalid value for threads or increments\n");
    std::exit(1);
  }

  // PercpuCounter objects aren't moveable, so we construct a vector then swap
  // it with the global one.
  std::vector<PercpuCounter> p(rseq::internal::numCpus());
  counterByCpu.swap(p);

  for (TestType benchmark : benchmarks) {
    runTest(benchmark, numThreads, numIncrements);
  }

  return 0;
}
--------------------------------------------------------------------------------