├── CODE_OF_CONDUCT.md ├── rseq ├── internal │ ├── NumCpus.cpp │ ├── Likely.h │ ├── SwitchToCpu.h │ ├── AsymmetricThreadFence.h │ ├── rseq_c_inlines.c │ ├── SwitchToCpuTest.cpp │ ├── OsMem.h │ ├── Dummy.cpp │ ├── Errors.cpp │ ├── NumCpus.h │ ├── SwitchToCpu.cpp │ ├── CleanUpOnThreadDeath.h │ ├── CpuLocalTest.cpp │ ├── Code.h │ ├── Mutex.cpp │ ├── rseq_c.h │ ├── rseq_c.cpp │ ├── CachelinePadded.h │ ├── Rseq.h │ ├── CpuLocal.h │ ├── CleanUpOnThreadDeath.cpp │ ├── OsMem.cpp │ ├── IntrusiveLinkedList.h │ ├── MutexTest.cpp │ ├── AsymmetricThreadFence.cpp │ ├── CachelinePaddedTest.cpp │ ├── IntrusiveLinkedListTest.cpp │ ├── IdAllocator.h │ ├── ThreadControl.h │ ├── AsymmetricThreadFenceTest.cpp │ ├── Errors.h │ ├── ErrorsTest.cpp │ ├── Mutex.h │ ├── CMakeLists.txt │ ├── OsMemTest.cpp │ ├── ThreadControlTest.cpp │ ├── IdAllocatorTest.cpp │ ├── CodeTest.cpp │ ├── CleanUpOnThreadDeathTest.cpp │ ├── Code.cpp │ ├── Rseq.cpp │ └── ThreadControl.cpp ├── CMakeLists.txt ├── RseqCTest.cpp ├── rseq_c.h ├── RseqTest.cpp └── Rseq.h ├── CMakeLists.txt ├── LICENSE ├── CONTRIBUTING.md ├── PATENTS ├── README.md ├── Rseq.md └── RseqBenchmark.cpp /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /rseq/internal/NumCpus.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/NumCpus.h" 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | namespace detail { 16 | mutex::OnceFlag numCpusOnceFlag; 17 | } // namespace detail 18 | 19 | } // namespace internal 20 | } // namespace rseq 21 | -------------------------------------------------------------------------------- /rseq/internal/Likely.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #if defined(__GNUC__) && __GNUC__ >= 4 13 | #define RSEQ_LIKELY(x) (__builtin_expect((x), 1)) 14 | #define RSEQ_UNLIKELY(x) (__builtin_expect((x), 0)) 15 | #else 16 | #define RSEQ_LIKELY(x) (x) 17 | #define RSEQ_UNLIKELY(x) (x) 18 | #endif 19 | -------------------------------------------------------------------------------- /rseq/internal/SwitchToCpu.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | // Switch to the given CPU. Throws a std::runtime_error if it couldn't do so 16 | // successfully. 17 | void switchToCpu(int cpu); 18 | 19 | } // namespace internal 20 | } // namespace rseq 21 | -------------------------------------------------------------------------------- /rseq/internal/AsymmetricThreadFence.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | inline void asymmetricThreadFenceLight() { 16 | asm volatile("" : : : "memory"); 17 | } 18 | 19 | // Throws std::runtime_error on failure. 20 | void asymmetricThreadFenceHeavy(); 21 | 22 | } // namespace internal 23 | } // namespace rseq 24 | -------------------------------------------------------------------------------- /rseq/internal/rseq_c_inlines.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/rseq_c.h" 11 | 12 | extern inline int rseq_begin(); 13 | extern inline int rseq_load(rseq_value_t *dst, rseq_repr_t *src); 14 | extern inline int rseq_store(rseq_repr_t *dst, rseq_value_t val); 15 | extern inline int rseq_store_fence(rseq_repr_t *dst, rseq_value_t val); 16 | extern inline int rseq_validate(); 17 | -------------------------------------------------------------------------------- /rseq/internal/SwitchToCpuTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/SwitchToCpu.h" 11 | 12 | #include 13 | 14 | #include 15 | 16 | #include "rseq/internal/NumCpus.h" 17 | 18 | using namespace rseq::internal; 19 | 20 | TEST(SwitchToCpu, SwitchesCpus) { 21 | for (int i = 0; i < numCpus(); ++i) { 22 | switchToCpu(i); 23 | EXPECT_EQ(i, sched_getcpu()); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /rseq/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 
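# Note: the rseq_gtest() helper invoked below is defined in the top-level
# CMakeLists.txt; it only builds and registers the *_runner test executables
# when the project is configured with -Dtest=ON.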
7 | 8 | add_subdirectory(internal) 9 | # internal/CMakeLists.txt populates the all_sources variable. 10 | add_library(rseq ${all_sources}) 11 | 12 | rseq_gtest( 13 | rseq_test 14 | RseqTest.cpp 15 | rseq 16 | cpu_local 17 | num_cpus 18 | switch_to_cpu 19 | ) 20 | 21 | rseq_gtest( 22 | rseq_c_test 23 | RseqCTest.cpp 24 | rseq 25 | cpu_local 26 | num_cpus 27 | switch_to_cpu 28 | ) 29 | 30 | install (TARGETS rseq DESTINATION lib) 31 | -------------------------------------------------------------------------------- /rseq/internal/OsMem.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | 14 | namespace rseq { 15 | namespace internal { 16 | namespace os_mem { 17 | 18 | // Allocation functions throw a std::runtime_exception on failure. 19 | void* allocate(std::size_t bytes); 20 | void* allocateExecutable(std::size_t bytes); 21 | void free(void* ptr, std::size_t bytes); 22 | 23 | 24 | } // namespace os_mem 25 | } // namespace internal 26 | } // namespace rseq 27 | -------------------------------------------------------------------------------- /rseq/internal/Dummy.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | // This file only exists because CMake will complain about libraries with no 11 | // .cpp files (i.e. header-only libraries). Explicitly listing such libraries in 12 | // a CMakeLists.txt file isn't strictly necessary (header-only libraries should 13 | // "just work"), but it helps make library inter-dependencies clear. 14 | 15 | namespace rseq { 16 | namespace internal { 17 | namespace dummy { 18 | inline void dummy() {} 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /rseq/internal/Errors.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/Errors.h" 11 | 12 | namespace rseq { 13 | namespace internal { 14 | namespace errors { 15 | 16 | static __thread FatalErrorHandler curHandler; 17 | 18 | void setFatalErrorHandler(FatalErrorHandler handler) { 19 | curHandler = handler; 20 | } 21 | 22 | FatalErrorHandler getFatalErrorHandler() { 23 | return curHandler; 24 | } 25 | 26 | void fatalError(const char* message) { 27 | curHandler(message); 28 | } 29 | 30 | } // namespace errors 31 | } // namespace internal 32 | } // namespace rseq 33 | -------------------------------------------------------------------------------- /rseq/internal/NumCpus.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | 14 | #include "rseq/internal/Mutex.h" 15 | 16 | namespace rseq { 17 | namespace internal { 18 | 19 | namespace detail { 20 | extern mutex::OnceFlag numCpusOnceFlag; 21 | } // namespace detail 22 | 23 | // std::thread::hardware_concurrency() is surprisingly slow. This just caches 24 | // the result. 25 | inline int numCpus() { 26 | static int result; 27 | mutex::callOnce(detail::numCpusOnceFlag, []() { 28 | result = sysconf(_SC_NPROCESSORS_ONLN); 29 | }); 30 | return result; 31 | } 32 | 33 | } // namespace internal 34 | } // namespace rseq 35 | -------------------------------------------------------------------------------- /rseq/internal/SwitchToCpu.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/SwitchToCpu.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | 20 | #include "rseq/internal/Errors.h" 21 | 22 | namespace rseq { 23 | namespace internal { 24 | 25 | void switchToCpu(int cpu) { 26 | pid_t tid = syscall(__NR_gettid); 27 | cpu_set_t set; 28 | CPU_ZERO(&set); 29 | CPU_SET(cpu, &set); 30 | int err = sched_setaffinity(tid, sizeof(cpu_set_t), &set); 31 | if (err != 0) { 32 | errors::fatalError("Couldn't switch cpus"); 33 | } 34 | } 35 | 36 | } // namespace internal 37 | } // namespace rseq 38 | -------------------------------------------------------------------------------- /rseq/internal/CleanUpOnThreadDeath.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #pragma once 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | // We want to centralize all the thread death logic for a couple reasons: 16 | // - The order of execution matters; we do rseq cleanup before thread control 17 | // cleanup. 18 | // - That's how jemalloc does it, so porting will be easier if we decide to. 19 | // - The logic is actually a little bit subtle (there are ODR issues involved). 20 | // We have to wrap up the calls behind a layer of indirection to avoid a 21 | // circular dependency. 22 | void setRseqCleanup(void (*)()); 23 | void setThreadControlCleanup(void (*)()); 24 | 25 | } // namespace internal 26 | } // namespace rseq 27 | -------------------------------------------------------------------------------- /rseq/internal/CpuLocalTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/CpuLocal.h" 11 | 12 | #include 13 | 14 | #include "rseq/internal/NumCpus.h" 15 | #include "rseq/internal/SwitchToCpu.h" 16 | 17 | using namespace rseq::internal; 18 | 19 | TEST(CpuLocal, DataIsPerCpu) { 20 | CpuLocal data; 21 | for (int i = 0; i < numCpus(); ++i) { 22 | switchToCpu(i); 23 | *data.forCpu(i) = i; 24 | } 25 | 26 | for (int i = 0; i < numCpus(); ++i) { 27 | switchToCpu(i); 28 | EXPECT_EQ(i, *data.forCpu(i)); 29 | } 30 | } 31 | 32 | TEST(CpuLocal, CanAccessAnotherCpusData) { 33 | CpuLocal data; 34 | switchToCpu(0); 35 | for (int i = 0; i < numCpus(); ++i) { 36 | *data.forCpu(i) = i; 37 | } 38 | for (int i = 0; i < numCpus(); ++i) { 39 | switchToCpu(i); 40 | EXPECT_EQ(i, *data.forCpu(i)); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /rseq/internal/Code.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | 15 | namespace rseq { 16 | namespace internal { 17 | 18 | class Code { 19 | public: 20 | // The rseq load and store functions return 1 if there was an interruption, 21 | // and 0 otherwise. 22 | typedef int (*RseqLoadFunc)(unsigned long* dst, unsigned long* src); 23 | typedef int (*RseqStoreFunc)(unsigned long* dst, unsigned long val); 24 | 25 | static Code* initForId(std::uint32_t id, std::atomic* threadCachedCpu); 26 | 27 | RseqLoadFunc rseqLoadFunc(); 28 | RseqStoreFunc rseqStoreFunc(); 29 | RseqStoreFunc rseqStoreFenceFunc(); 30 | 31 | void blockRseqOps(); 32 | void unblockRseqOps(); 33 | 34 | private: 35 | unsigned char code_[54]; // See Code.cpp to see where 54 comes from. 
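  //
  // Rough sketch of the mechanism (see Code.cpp, ThreadControl, and Rseq.md for
  // the authoritative details): each thread gets its own Code object in
  // executable memory, and code_ holds the generated instructions behind the
  // rseq*Func() pointers above. blockRseqOps() patches those instructions so
  // that the owning thread's in-flight rseq operations report failure;
  // unblockRseqOps() restores them.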
36 | }; 37 | 38 | } // namespace internal 39 | } // namespace rseq 40 | -------------------------------------------------------------------------------- /rseq/internal/Mutex.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/Mutex.h" 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | namespace rseq { 17 | namespace internal { 18 | namespace mutex { 19 | 20 | void Mutex::futexWait(std::uint32_t val) { 21 | // We ignore errors here; it just means we'll spin a little extra. 22 | syscall( 23 | __NR_futex, 24 | &state_, 25 | FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 26 | val, 27 | nullptr, 28 | nullptr, 29 | 0); 30 | } 31 | 32 | void Mutex::futexWake(int num) { 33 | // Ignore errors here, too; it probably means a destructor race. 34 | syscall( 35 | __NR_futex, 36 | &state_, 37 | FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 38 | num, 39 | nullptr, 40 | nullptr, 41 | 0); 42 | } 43 | 44 | } // namespace mutex 45 | } // namespace internal 46 | } // namespace rseq 47 | -------------------------------------------------------------------------------- /rseq/internal/rseq_c.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | /* We want this in C-land so that C users can get the fast inlined versions too. 17 | * It turns out to be slightly faster to have these return false on success and 18 | * true on failure, so we invert the result in the wrapper functions, hoping 19 | * that the compiler can use its visibility into them to avoid having to do its 20 | * own inversion. */ 21 | extern __thread int (*rseq_load_trampoline)( 22 | unsigned long* dst, unsigned long* src); 23 | extern __thread int (*rseq_store_trampoline)( 24 | unsigned long* dst, unsigned long val); 25 | extern __thread int (*rseq_store_fence_trampoline)( 26 | unsigned long* dst, unsigned long val); 27 | extern __thread volatile int rseq_thread_cached_cpu; 28 | 29 | int rseq_begin_slow_path(); 30 | 31 | #ifdef __cplusplus 32 | } /* extern "C" */ 33 | #endif 34 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 7 | 8 | cmake_minimum_required(VERSION 2.8) 9 | 10 | project("Userspace restartable sequences") 11 | 12 | option(test "Build all tests." 
OFF) 13 | if (test) 14 | find_package(GTest REQUIRED) 15 | include_directories(${GTEST_INCLUDE_DIRS}) 16 | 17 | enable_testing() 18 | endif () 19 | function(rseq_gtest name src_file) 20 | if (test) 21 | add_executable( 22 | "${name}_runner" 23 | ${src_file} 24 | ) 25 | target_link_libraries( 26 | "${name}_runner" 27 | ${GTEST_BOTH_LIBRARIES} 28 | ${ARGN} 29 | ) 30 | add_test( 31 | NAME ${name} 32 | COMMAND "${name}_runner" 33 | ) 34 | endif () 35 | endfunction(rseq_gtest) 36 | 37 | set (CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "-std=c++11 -pthread -fno-exceptions") 38 | 39 | include_directories(PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 40 | 41 | add_subdirectory(rseq) 42 | 43 | add_executable(rseq_benchmark RseqBenchmark.cpp) 44 | target_link_libraries(rseq_benchmark rseq) 45 | 46 | install(DIRECTORY rseq DESTINATION include FILES_MATCHING PATTERN "*.h") 47 | -------------------------------------------------------------------------------- /rseq/internal/rseq_c.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | #include "rseq/internal/Errors.h" 18 | #include "rseq/internal/Rseq.h" 19 | 20 | extern "C" { 21 | 22 | __thread int (*rseq_load_trampoline)(unsigned long* dst, unsigned long* src); 23 | __thread int (*rseq_store_trampoline)(unsigned long* dst, unsigned long val); 24 | __thread int (*rseq_store_fence_trampoline)( 25 | unsigned long* dst, unsigned long val); 26 | __thread volatile int rseq_thread_cached_cpu = -1; 27 | 28 | int rseq_begin_slow_path() { 29 | rseq::internal::errors::AbortOnError aoe; 30 | return rseq::internal::beginSlowPath(); 31 | } 32 | 33 | void rseq_end() { 34 | rseq::internal::errors::AbortOnError aoe; 35 | rseq::internal::end(); 36 | } 37 | 38 | void rseq_fence_with(int shard) { 39 | rseq::internal::errors::AbortOnError aoe; 40 | rseq::internal::fenceWith(shard); 41 | } 42 | 43 | void rseq_fence() { 44 | rseq::internal::errors::AbortOnError aoe; 45 | rseq::internal::fence(); 46 | } 47 | 48 | } /* extern "C" */ 49 | -------------------------------------------------------------------------------- /rseq/internal/CachelinePadded.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | constexpr int kCachelineSize = 64; 16 | 17 | template 18 | struct CachelinePaddedImpl; 19 | template 20 | struct CachelinePaddedImpl { 21 | T item; 22 | }; 23 | template 24 | struct CachelinePaddedImpl { 25 | T item; 26 | char padding[kCachelineSize - sizeof(T) % kCachelineSize]; 27 | }; 28 | 29 | template 30 | struct CachelinePadded { 31 | // Casting from the return value of get() back to a CachelinePadded is 32 | // guaranteed to work if T is standard-layout. 
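  // Illustrative sketch of that round trip (this is what CachelinePaddedTest
  // verifies):
  //   CachelinePadded<T> padded;
  //   T* p = padded.get();
  //   assert(reinterpret_cast<CachelinePadded<T>*>(p) == &padded);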
33 | T* get() { 34 | return &paddedItem.item; 35 | } 36 | 37 | // Note: can't be private; this struct must remain standard-layout to get the 38 | // guarantee that we can cast back and forth between the item and this struct 39 | // (in particular, we need this for Code objects). 40 | CachelinePaddedImpl paddedItem; 41 | }; 42 | 43 | } // namespace internal 44 | } // namespace rseq 45 | -------------------------------------------------------------------------------- /rseq/internal/Rseq.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | 14 | #include "rseq/internal/Errors.h" 15 | #include "rseq/internal/rseq_c.h" 16 | 17 | namespace rseq { 18 | namespace internal { 19 | 20 | // Internal equivalents of public functions. 21 | // We put the error wrappers in the header file so that no exception logic lives 22 | // in librseq. 23 | 24 | 25 | int beginSlowPath(); 26 | void end(); 27 | void fenceWith(int shard); 28 | void fence(); 29 | 30 | inline int beginSlowPathWrapper() { 31 | errors::ThrowOnError thrower; 32 | return beginSlowPath(); 33 | }; 34 | 35 | inline void endWrapper() { 36 | errors::ThrowOnError thrower; 37 | end(); 38 | } 39 | 40 | inline void fenceWithWrapper(int shard) { 41 | errors::ThrowOnError thrower; 42 | fenceWith(shard); 43 | } 44 | 45 | inline void fenceWrapper() { 46 | errors::ThrowOnError thrower; 47 | fence(); 48 | } 49 | 50 | inline std::atomic* threadCachedCpu() { 51 | return reinterpret_cast*>( 52 | const_cast(&rseq_thread_cached_cpu)); 53 | } 54 | 55 | } // namespace internal 56 | } // namespace rseq 57 | -------------------------------------------------------------------------------- /rseq/internal/CpuLocal.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | 14 | #include "rseq/internal/CachelinePadded.h" 15 | #include "rseq/internal/NumCpus.h" 16 | #include "rseq/internal/OsMem.h" 17 | 18 | 19 | namespace rseq { 20 | namespace internal { 21 | 22 | template 23 | class CpuLocal { 24 | public: 25 | CpuLocal() { 26 | void* mem = os_mem::allocate(sizeof(ElemType) * numCpus()); 27 | elements_ = static_cast(mem); 28 | for (int i = 0; i < numCpus(); ++i) { 29 | new (&elements_[i]) ElemType; 30 | } 31 | } 32 | 33 | ~CpuLocal() { 34 | for (int i = 0; i < numCpus(); ++i) { 35 | elements_[i].~ElemType(); 36 | } 37 | os_mem::free(elements_, sizeof(ElemType) * numCpus()); 38 | } 39 | 40 | T* forCpu(int i) { 41 | return elements_[i].get(); 42 | } 43 | 44 | private: 45 | // This saves us some typing, and is needed for explicit destructor invocation 46 | // (which doesn't parse with template types). 
47 | typedef CachelinePadded ElemType; 48 | ElemType* elements_; 49 | }; 50 | 51 | } // namespace internal 52 | } // namespace rseq 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For rseq software 4 | 5 | Copyright (c) 2016-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Rseq 2 | We want to make contributing to this project as easy and transparent as 3 | possible. We anticipate only bug-fixes, but functionality improvements might be 4 | possible as well. 5 | 6 | ## Our Development Process 7 | We intend github to be the source of truth for this project, with future 8 | development happening entirely "in the open". 9 | 10 | ## Pull Requests 11 | We actively welcome your pull requests. 12 | 13 | 1. Fork the repo and create your branch from `master`. 14 | 2. If you've added code that should be tested, add tests. 15 | 3. If you've changed APIs, update the documentation. 16 | 4. Ensure the test suite passes. 17 | 5. Make sure your code lints. 18 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 19 | 20 | ## Contributor License Agreement ("CLA") 21 | In order to accept your pull request, we need you to submit a CLA. You only need 22 | to do this once to work on any of Facebook's open source projects. 23 | 24 | Complete your CLA here: 25 | 26 | ## Issues 27 | We use GitHub issues to track public bugs. Please ensure your description is 28 | clear and has sufficient instructions to be able to reproduce the issue. 29 | 30 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 31 | disclosure of security bugs. 
In those cases, please go through the process 32 | outlined on that page and do not file a public issue. 33 | 34 | ## Coding Style 35 | * 2 spaces for indentation rather than tabs 36 | * 80 character line length 37 | * Broadly, we follow the Google C++ style guide. 38 | 39 | ## License 40 | By contributing to Rseq, you agree that your contributions will be licensed 41 | under the LICENSE file in the root directory of this source tree. 42 | -------------------------------------------------------------------------------- /rseq/RseqCTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/rseq_c.h" 11 | 12 | #include 13 | 14 | #include 15 | 16 | // We don't put this through the ringer the same way we do the C++ interface. So 17 | // long as it compiles and runs, we assume things are correct. 18 | TEST(RseqC, SanityChecks) { 19 | rseq_repr_t rseqItem; 20 | reinterpret_cast*>(&rseqItem)->store(1); 21 | rseq_value_t rseqValue; 22 | 23 | /* int cpu = */ rseq_begin(); 24 | 25 | // Starts at 1 26 | EXPECT_TRUE(rseq_load(&rseqValue, &rseqItem)); 27 | EXPECT_EQ(1, rseqValue); 28 | 29 | // Store 2, then load 30 | EXPECT_TRUE(rseq_store(&rseqItem, 2)); 31 | EXPECT_TRUE(rseq_load(&rseqValue, &rseqItem)); 32 | EXPECT_EQ(2, rseqValue); 33 | 34 | // Store-fence 3, then load 35 | EXPECT_TRUE(rseq_store_fence(&rseqItem, 3)); 36 | EXPECT_TRUE(rseq_load(&rseqValue, &rseqItem)); 37 | EXPECT_EQ(3, rseqValue); 38 | 39 | // Fence 40 | rseq_fence(); 41 | 42 | // Store should fail then. 43 | EXPECT_FALSE(rseq_store(&rseqItem, 4)); 44 | EXPECT_EQ( 45 | 3, reinterpret_cast*>(&rseqItem)->load()); 46 | 47 | // Start up again 48 | /* int cpu = */ rseq_begin(); 49 | 50 | // End 51 | rseq_end(); 52 | 53 | // Start up yet again. 54 | /* int cpu = */ rseq_begin(); 55 | 56 | // And things should work, even after ending. 57 | EXPECT_TRUE(rseq_store(&rseqItem, 5)); 58 | EXPECT_TRUE(rseq_load(&rseqValue, &rseqItem)); 59 | EXPECT_EQ(5, rseqValue); 60 | } 61 | -------------------------------------------------------------------------------- /rseq/internal/CleanUpOnThreadDeath.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | #include "rseq/internal/CleanUpOnThreadDeath.h" 10 | 11 | #include 12 | 13 | #include "rseq/internal/Mutex.h" 14 | #include "rseq/internal/Errors.h" 15 | 16 | namespace rseq { 17 | namespace internal { 18 | 19 | static __thread void (*cleanUpRseq)(); 20 | static __thread void (*cleanUpThreadControl)(); 21 | 22 | static __thread bool myDestructorScheduled; 23 | static pthread_key_t pthreadOnceKey; 24 | static mutex::OnceFlag destructorScheduledOnceFlag; 25 | 26 | static void destructor(void* /* ignored */) { 27 | // If someone does an rseq operation *within* a pthread destructor, we'll 28 | // re-initialize our data. 
29 | myDestructorScheduled = false; 30 | if (cleanUpRseq != nullptr) { 31 | cleanUpRseq(); 32 | } 33 | if (cleanUpThreadControl != nullptr) { 34 | cleanUpThreadControl(); 35 | } 36 | cleanUpRseq = nullptr; 37 | cleanUpThreadControl = nullptr; 38 | } 39 | 40 | static void ensureDestructorScheduled() { 41 | mutex::callOnce(destructorScheduledOnceFlag, []() { 42 | int err = pthread_key_create(&pthreadOnceKey, &destructor); 43 | if (err != 0) { 44 | errors::fatalError("Couldn't schedule thread death destructor"); 45 | } 46 | }); 47 | if (!myDestructorScheduled) { 48 | // Exists purely to schedule the destructor. 49 | pthread_setspecific(pthreadOnceKey, reinterpret_cast(1)); 50 | } 51 | } 52 | 53 | void setRseqCleanup(void (*func)()) { 54 | cleanUpRseq = func; 55 | ensureDestructorScheduled(); 56 | } 57 | 58 | void setThreadControlCleanup(void (*func)()) { 59 | cleanUpThreadControl = func; 60 | ensureDestructorScheduled(); 61 | } 62 | 63 | } // namespace internal 64 | } // namespace rseq 65 | -------------------------------------------------------------------------------- /PATENTS: -------------------------------------------------------------------------------- 1 | Additional Grant of Patent Rights Version 2 2 | 3 | "Software" means the rseq software distributed by Facebook, Inc. 4 | 5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software 6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable 7 | (subject to the termination provision below) license under any Necessary 8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise 9 | transfer the Software. For avoidance of doubt, no license is granted under 10 | Facebook’s rights in any patent claims that are infringed by (i) modifications 11 | to the Software made by you or any third party or (ii) the Software in 12 | combination with any software or other technology. 13 | 14 | The license granted hereunder will terminate, automatically and without notice, 15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate 16 | directly or indirectly, or take a direct financial interest in, any Patent 17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate 18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or 19 | in part from any software, technology, product or service of Facebook or any of 20 | its subsidiaries or corporate affiliates, or (iii) against any party relating 21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its 22 | subsidiaries or corporate affiliates files a lawsuit alleging patent 23 | infringement against you in the first instance, and you respond by filing a 24 | patent infringement counterclaim in that lawsuit against that party that is 25 | unrelated to the Software, the license granted hereunder will not terminate 26 | under section (i) of this paragraph due to such counterclaim. 27 | 28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is 29 | necessarily infringed by the Software standing alone. 30 | 31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, 32 | or contributory infringement or inducement to infringe any patent, including a 33 | cross-claim or counterclaim. 34 | -------------------------------------------------------------------------------- /rseq/internal/OsMem.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 
3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | #include "rseq/internal/Errors.h" 16 | 17 | namespace rseq { 18 | namespace internal { 19 | namespace os_mem { 20 | 21 | static void* mmapWithPermissions(std::size_t bytes, int prot) { 22 | // If we die in this method, it'd be helpful to know the arguments; make sure 23 | // they're available in the debugger. 24 | volatile int bytesCopy = bytes; 25 | volatile int protCopy = prot; 26 | 27 | void* alloc = mmap( 28 | nullptr, 29 | bytes, 30 | prot, 31 | MAP_PRIVATE | MAP_ANONYMOUS, 32 | -1, 33 | 0); 34 | if (alloc == MAP_FAILED) { 35 | errors::fatalError("mmap failed."); 36 | } 37 | return alloc; 38 | } 39 | 40 | void* allocate(std::size_t bytes) { 41 | return mmapWithPermissions(bytes, PROT_READ | PROT_WRITE); 42 | } 43 | 44 | void* allocateExecutable(std::size_t bytes) { 45 | return mmapWithPermissions(bytes, PROT_READ | PROT_WRITE | PROT_EXEC); 46 | } 47 | 48 | void free(void* ptr, std::size_t bytes) { 49 | // Note that we may throw, even though this is on a deallocation path. So if 50 | // we get called with an invalid argument during exception unwinding, we'll 51 | // crash the process. This is an acceptable penalty for passing invalid 52 | // pointers to your memory allocator. 53 | const int kPageSize = 4096; 54 | std::uintptr_t ptrInt = reinterpret_cast(ptr); 55 | if (ptrInt & (kPageSize - 1)) { 56 | errors::fatalError("Improperly aligned pointer"); 57 | } 58 | int err = munmap(ptr, bytes); 59 | if (err != 0) { 60 | errors::fatalError("munmap failed"); 61 | } 62 | } 63 | 64 | } // namespace os_mem 65 | } // namespace internal 66 | } // namespace rseq 67 | -------------------------------------------------------------------------------- /rseq/internal/IntrusiveLinkedList.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | namespace rseq { 13 | namespace internal { 14 | 15 | // Intrusive linked list, using the CRTP. Does not take ownership of its 16 | // elements. 17 | 18 | // Supports the bare minimum interface necessary for its only use, in 19 | // ThreadControl. 
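//
// Minimal usage sketch (the node type here is hypothetical; it mirrors the
// unit test in IntrusiveLinkedListTest.cpp):
//
//   struct Node : IntrusiveLinkedListNode<Node> { int data; };
//
//   IntrusiveLinkedList<Node> list;
//   Node n;
//   list.link(&n);                    // the list does not own n
//   for (Node& item : list) { /* visit item */ }
//   list.unlink(&n);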
20 | 21 | template 22 | class IntrusiveLinkedList; 23 | 24 | template 25 | class IntrusiveLinkedListNode { 26 | private: 27 | friend class IntrusiveLinkedList; 28 | 29 | IntrusiveLinkedListNode* next; 30 | IntrusiveLinkedListNode* prev; 31 | }; 32 | 33 | template 34 | class IntrusiveLinkedList { 35 | public: 36 | IntrusiveLinkedList() { 37 | dummyHead_.next = &dummyTail_; 38 | dummyTail_.prev = &dummyHead_; 39 | } 40 | 41 | void link(IntrusiveLinkedListNode* node) { 42 | node->next = &dummyTail_; 43 | node->prev = dummyTail_.prev; 44 | 45 | node->next->prev = node; 46 | node->prev->next = node; 47 | } 48 | 49 | void unlink(IntrusiveLinkedListNode* node) { 50 | node->next->prev = node->prev; 51 | node->prev->next = node->next; 52 | } 53 | 54 | 55 | 56 | // We don't need real iterator support, just enough for a range-based for 57 | // loop. 58 | struct Iterator { 59 | IntrusiveLinkedListNode* item; 60 | void operator++() { 61 | item = item->next; 62 | } 63 | T& operator*() { 64 | return *static_cast(item); 65 | } 66 | bool operator!=(const Iterator& other) { 67 | return item != other.item; 68 | } 69 | }; 70 | Iterator begin() { 71 | return { dummyHead_.next }; 72 | } 73 | Iterator end() { 74 | return { &dummyTail_ }; 75 | } 76 | 77 | IntrusiveLinkedListNode dummyHead_; 78 | IntrusiveLinkedListNode dummyTail_; 79 | }; 80 | 81 | } // namespace internal 82 | } // namespace rseq 83 | -------------------------------------------------------------------------------- /rseq/internal/MutexTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/Mutex.h" 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | using namespace rseq::internal::mutex; 18 | 19 | TEST(Mutex, ProvidesExclusion) { 20 | const int kNumThreads = 10; 21 | const int kIncrementsPerThread = 1000000; 22 | 23 | Mutex mu; 24 | mu.init(); 25 | int x = 0; 26 | int y = 0; 27 | std::vector threads(kNumThreads); 28 | for (int i = 0; i < kNumThreads; ++i) { 29 | threads[i] = std::thread([&]() { 30 | for (int j = 0; j < kIncrementsPerThread; ++j) { 31 | LockGuard lg(mu); 32 | EXPECT_TRUE(x == y); 33 | ++x; 34 | ++y; 35 | } 36 | }); 37 | } 38 | for (int i = 0; i < kNumThreads; ++i) { 39 | threads[i].join(); 40 | } 41 | EXPECT_TRUE(x == y); 42 | EXPECT_EQ(kNumThreads * kIncrementsPerThread, x); 43 | } 44 | 45 | TEST(CallOnce, SimpleCase) { 46 | int x = 0; 47 | OnceFlag once; 48 | once.init(); 49 | callOnce(once, [&]() { 50 | ++x; 51 | }); 52 | callOnce(once, [&]() { 53 | ++x; 54 | }); 55 | EXPECT_EQ(1, x); 56 | } 57 | 58 | TEST(CallOnce, Racy) { 59 | const int kNumTrials = 10000; 60 | const int kNumThreads = 10; 61 | for (int i = 0; i < kNumTrials; ++i) { 62 | std::vector threads(kNumThreads); 63 | std::atomic ready(false); 64 | int x = 0; 65 | OnceFlag once; 66 | once.init(); 67 | for (int j = 0; j < kNumThreads; ++j) { 68 | threads[j] = std::thread([&]() { 69 | while (!ready.load()) { 70 | // Spin until all threads have a chance to win the race. 
71 | } 72 | callOnce(once, [&]() { 73 | ++x; 74 | }); 75 | EXPECT_EQ(1, x); 76 | }); 77 | } 78 | ready.store(true); 79 | for (int j = 0; j < kNumThreads; ++j) { 80 | threads[j].join(); 81 | } 82 | EXPECT_EQ(1, x); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /rseq/rseq_c.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include "rseq/internal/Likely.h" 13 | #include "rseq/internal/rseq_c.h" 14 | 15 | #ifdef __cplusplus 16 | extern "C" { 17 | #endif 18 | 19 | /* A 64-bit type; this is what "inhabits" rseq slots. */ 20 | typedef unsigned long rseq_value_t; 21 | 22 | /* Rseq slots to which you can do rseq-protected loads and stores. */ 23 | typedef struct { 24 | /* This union tricks gcc into not complaining about the strict-aliasing 25 | * violation here. I don't think it actually fixes the undefined behavior, but 26 | * as a practical matter the utility of being able to cast to atomic types is 27 | * more important. */ 28 | union { 29 | volatile rseq_value_t item; 30 | volatile char aliasing_goo[sizeof(rseq_value_t)]; 31 | }; 32 | } rseq_repr_t; 33 | 34 | 35 | inline int rseq_begin() { 36 | int ret = rseq_thread_cached_cpu; 37 | if (RSEQ_UNLIKELY(ret < 0)) { 38 | ret = rseq_begin_slow_path(); 39 | } 40 | /* Good enough for an acquire barrier on x86. */ 41 | __asm__ volatile("" : : : "memory"); 42 | return ret; 43 | } 44 | 45 | inline int rseq_load(rseq_value_t *dst, rseq_repr_t *src) { 46 | /* Note: this goes through dynamically generated code, which will prevent 47 | compiler reordering. */ 48 | return RSEQ_LIKELY(!rseq_load_trampoline(dst, (unsigned long*)src)); 49 | } 50 | 51 | inline int rseq_store(rseq_repr_t *dst, rseq_value_t val) { 52 | /* Same here. */ 53 | return RSEQ_LIKELY(!rseq_store_trampoline((unsigned long*)dst, val)); 54 | } 55 | 56 | inline int rseq_store_fence(rseq_repr_t *dst, rseq_value_t val) { 57 | /* And here. */ 58 | return RSEQ_LIKELY(!rseq_store_fence_trampoline((unsigned long*)dst, val)); 59 | } 60 | 61 | inline int rseq_validate() { 62 | rseq_repr_t dummy; 63 | return rseq_store(&dummy, 0); 64 | } 65 | 66 | void rseq_end(); 67 | void rseq_fence_with(int shard); 68 | void rseq_fence(); 69 | 70 | 71 | #ifdef __cplusplus 72 | } /* extern "C" */ 73 | #endif 74 | -------------------------------------------------------------------------------- /rseq/internal/AsymmetricThreadFence.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/AsymmetricThreadFence.h" 11 | 12 | #include 13 | 14 | #include 15 | 16 | #include "rseq/internal/Errors.h" 17 | #include "rseq/internal/Mutex.h" 18 | 19 | namespace rseq { 20 | namespace internal { 21 | 22 | // TODO: There's a lot we can do to speed this up if we like, with varying 23 | // time/space tradeoffs: 24 | // - Can make this lock-free, or allocate from a pool of threads. 25 | // - Give each thread its own page for mprotect operations. This lets us cycle 26 | // it through: "R/W/X" -> "R/W" -> "R" -> "None", doing 4 mprotects for 3 27 | // heavy fences (instead of the 6 mprotects we need for the current 28 | // mechanism). 29 | // - Give each thread a range of pages to operate on; allocate N pages instead 30 | // of 1. Permissions are lowered one at a time, and raised in batch after 31 | // we've exhausted all of the lowerings we can. 32 | // - We can delegate the permission raising to a helper thread. Wouldn't save 33 | // much time, but could save pages. 34 | 35 | static mutex::Mutex mu; 36 | void asymmetricThreadFenceHeavy() { 37 | static char page[8192]; 38 | 39 | std::uintptr_t pageInt = reinterpret_cast(page); 40 | std::uintptr_t alignedInt = (pageInt + 4096 - 1) & ~(4096 - 1); 41 | char* aligned = reinterpret_cast(alignedInt); 42 | 43 | mutex::LockGuard lg(mu); 44 | 45 | // Make this volatile so that we know the debugger can see it if we die (for 46 | // simplicity, we don't include it in the error message) 47 | volatile int err = mprotect(aligned, 4096, PROT_READ | PROT_WRITE); 48 | if (err) { 49 | errors::fatalError( 50 | "First mprotect in asymmetricThreadFenceHeavy failed.\n"); 51 | } 52 | 53 | // Page must be dirty to trigger the IPI. 54 | *static_cast(aligned) = 0; 55 | 56 | err = mprotect(aligned, 4096, PROT_READ); 57 | if (err) { 58 | errors::fatalError( 59 | "Second mprotect in asymmetricThreadFenceHeavy failed.\n"); 60 | } 61 | } 62 | 63 | } // namespace internal 64 | } // namespace rseq 65 | -------------------------------------------------------------------------------- /rseq/internal/CachelinePaddedTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/CachelinePadded.h" 11 | 12 | #include 13 | 14 | using namespace rseq::internal; 15 | 16 | template 17 | struct SizedData { 18 | SizedData() { 19 | for (unsigned i = 0; i < dataSize; ++i) { 20 | data[i] = i; 21 | } 22 | } 23 | 24 | void doModifications() { 25 | for (unsigned i = 0; i < dataSize; ++i) { 26 | EXPECT_EQ(static_cast(i), data[i]); 27 | ++data[i]; 28 | } 29 | } 30 | 31 | ~SizedData() { 32 | for (unsigned i = 0; i < dataSize; ++i) { 33 | EXPECT_EQ(static_cast(i + 1), data[i]); 34 | } 35 | } 36 | 37 | unsigned char data[dataSize]; 38 | }; 39 | 40 | using ExactlyCachelineSized = SizedData; 41 | using DoubleCachelineSized = SizedData<2 * kCachelineSize>; 42 | using BelowCachelineSized = SizedData; 43 | using AboveCachelineSized = SizedData; 44 | 45 | TEST(CachelinePadded, Exact) { 46 | EXPECT_EQ(kCachelineSize, sizeof(CachelinePadded)); 47 | CachelinePadded item; 48 | item.get()->doModifications(); 49 | EXPECT_TRUE(reinterpret_cast*>( 50 | item.get()) == &item); 51 | } 52 | 53 | TEST(CachelinePadded, Double) { 54 | EXPECT_EQ(2 * kCachelineSize, sizeof(CachelinePadded)); 55 | CachelinePadded item; 56 | item.get()->doModifications(); 57 | EXPECT_TRUE(reinterpret_cast*>( 58 | item.get()) == &item); 59 | } 60 | 61 | TEST(CachelinePadded, Below) { 62 | EXPECT_EQ(kCachelineSize, sizeof(CachelinePadded)); 63 | CachelinePadded item; 64 | item.get()->doModifications(); 65 | EXPECT_TRUE(reinterpret_cast*>( 66 | item.get()) == &item); 67 | } 68 | 69 | TEST(CachelinePadded, Above) { 70 | EXPECT_EQ(2 * kCachelineSize, sizeof(CachelinePadded)); 71 | CachelinePadded item; 72 | item.get()->doModifications(); 73 | EXPECT_TRUE(reinterpret_cast*>( 74 | item.get()) == &item); 75 | } 76 | -------------------------------------------------------------------------------- /rseq/internal/IntrusiveLinkedListTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/IntrusiveLinkedList.h" 11 | 12 | #include 13 | 14 | using namespace rseq::internal; 15 | 16 | struct LLInt : IntrusiveLinkedListNode { 17 | unsigned data; 18 | }; 19 | 20 | struct DiesNoisily : IntrusiveLinkedListNode { 21 | DiesNoisily() : noisy(true) {} 22 | ~DiesNoisily() { 23 | EXPECT_FALSE(noisy); 24 | } 25 | 26 | bool noisy; 27 | }; 28 | 29 | TEST(IntrusiveLinkedList, ConstructsEmpty) { 30 | IntrusiveLinkedList list; 31 | int count = 0; 32 | for (auto& item : list) { 33 | ++count; 34 | } 35 | EXPECT_EQ(0, count); 36 | } 37 | 38 | TEST(IntrusiveLinkedList, DoesListOperations) { 39 | const int kNumItems = 10; 40 | const unsigned kItemSetMask = ((1 << kNumItems) - 1); 41 | 42 | LLInt itemsArr[kNumItems]; 43 | for (int i = 0; i < kNumItems; ++i) { 44 | itemsArr[i].data = (1 << i); 45 | } 46 | 47 | // Add all the even indices 48 | IntrusiveLinkedList itemsList; 49 | for (int i = 0; i < kNumItems; ++i) { 50 | if (i % 2 == 0) { 51 | itemsList.link(&itemsArr[i]); 52 | } 53 | } 54 | 55 | // Make sure only the even bit positions are set. 
56 | unsigned itemSet = 0; 57 | for (auto& item : itemsList) { 58 | itemSet |= item.data; 59 | } 60 | EXPECT_EQ(0x55555555U & kItemSetMask, itemSet); 61 | 62 | // Add the odds, too 63 | for (int i = 0; i < kNumItems; ++i) { 64 | if (i % 2 == 1) { 65 | itemsList.link(&itemsArr[i]); 66 | } 67 | } 68 | 69 | // Make sure *all* bits are set. 70 | itemSet = 0; 71 | for (auto& item : itemsList) { 72 | itemSet |= item.data; 73 | } 74 | EXPECT_EQ(kItemSetMask, itemSet); 75 | 76 | // Remove the items divisible by 4 77 | for (int i = 0; i < kNumItems; ++i) { 78 | if (i % 4 == 0) { 79 | itemsList.unlink(&itemsArr[i]); 80 | } 81 | } 82 | 83 | // Make sure every fourth bit is unset. 84 | itemSet = 0; 85 | for (auto& item : itemsList) { 86 | itemSet |= item.data; 87 | } 88 | EXPECT_EQ(0xEEEEEEEEU & kItemSetMask, itemSet); 89 | } 90 | 91 | TEST(IntrusiveLinkedList, DoesNotTakeOwnership) { 92 | DiesNoisily item; 93 | { 94 | IntrusiveLinkedList list; 95 | list.link(&item); 96 | // Destructor runs here. 97 | } 98 | item.noisy = false; 99 | } 100 | -------------------------------------------------------------------------------- /rseq/internal/IdAllocator.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | 15 | #include "rseq/internal/Mutex.h" 16 | #include "rseq/internal/OsMem.h" 17 | 18 | namespace rseq { 19 | namespace internal { 20 | 21 | // This can block when acquiring or releasing an Id, but doing an Id->owner 22 | // lookup is lock-free and fast. 23 | // This never returns an Id of 0; you can use that as "null". 24 | // This is guaranteed to return either an Id that has been acquired and then 25 | // released, or, if no such Id exists, the smallest positive uint32_t that has 26 | // not already been allocated. 27 | // TODO: if we ever start using this more than in a handful of places, we should 28 | // type-erase everything; the identity of T doesn't matter. 29 | template 30 | class IdAllocator { 31 | public: 32 | // maxElements should include the null element. If you need 10 items + null, 33 | // maxElement should be 11. 34 | explicit IdAllocator(std::uint32_t maxElements) : maxElements_(maxElements) { 35 | mu_.init(); 36 | 37 | freeListHead_ = 0; 38 | // We never allocate id 0, so we can use it as null in the linked list of 39 | // free ids. 
40 | firstUntouchedId_ = 1; 41 | 42 | void* mem = os_mem::allocate(maxElements_ * sizeof(FreeNodeOrItem)); 43 | items_ = static_cast(mem); 44 | } 45 | 46 | ~IdAllocator() { 47 | os_mem::free(items_, maxElements_ * sizeof(FreeNodeOrItem)); 48 | } 49 | 50 | std::uint32_t allocate(T* owner) { 51 | mutex::LockGuard lg(mu_); 52 | 53 | std::uint32_t result; 54 | if (freeListHead_ != 0) { 55 | result = freeListHead_; 56 | freeListHead_ = items_[freeListHead_].next; 57 | } else { 58 | result = firstUntouchedId_++; 59 | } 60 | items_[result].owner = owner; 61 | return result; 62 | } 63 | 64 | void free(std::uint32_t id) { 65 | mutex::LockGuard lg(mu_); 66 | items_[id].next = freeListHead_; 67 | freeListHead_ = id; 68 | } 69 | 70 | T* lookupOwner(std::uint32_t id) { 71 | return items_[id].owner; 72 | } 73 | 74 | private: 75 | union FreeNodeOrItem { 76 | std::uint32_t next; 77 | T* owner; 78 | }; 79 | 80 | mutex::Mutex mu_; 81 | FreeNodeOrItem* items_; 82 | std::uint32_t freeListHead_; 83 | std::uint32_t firstUntouchedId_; 84 | std::uint32_t maxElements_; 85 | }; 86 | 87 | } // namespace internal 88 | } // namespace rseq 89 | -------------------------------------------------------------------------------- /rseq/internal/ThreadControl.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | 15 | #include "rseq/internal/IntrusiveLinkedList.h" 16 | 17 | namespace rseq { 18 | namespace internal { 19 | 20 | class Code; 21 | 22 | class ThreadControl : public IntrusiveLinkedListNode { 23 | public: 24 | // Get the calling thread's ThreadControl. 25 | static ThreadControl* get(std::atomic* threadCachedCpu); 26 | 27 | // Get the ThreadControl with the given id 28 | static ThreadControl* forId(std::uint32_t id); 29 | 30 | // Each living thread has a distinct id. 31 | std::uint32_t id() { 32 | return id_; 33 | } 34 | 35 | Code* code() { 36 | return code_; 37 | } 38 | 39 | // Block or unblock this thread's rseq operations. 40 | // This doesn't do any memory model trickery; it's up to callers to ensure 41 | // that this method's actions are visible to the victim before knowing that 42 | // no more rseq operations will happen / will succeed. 43 | void blockRseqOps(); 44 | void unblockRseqOps(); 45 | 46 | // Try to get the associated thread's current CPU (if its running), or else 47 | // the next CPU it will run on. May fail and return -1. 48 | // Memory ordering is tricky here. Everything is best effort, with the 49 | // exception of one memory ordering guarantee: a thread that observes itself 50 | // to be running on cpu N, and subsequently observes another thread to be 51 | // running on cpu N using curCpu, then the effect is that of an 52 | // asymmetricThreadFenceHeavy() that pairs only with an 53 | // asymmetricThreadFenceLight() in the other thread. 54 | int curCpu(); 55 | 56 | // A ThreadControl object remains valid (and the corresponding thread alive) 57 | // whenever some other thread's accessing field contains its id, and when the 58 | // store happens-before the execution of die() below (which is executed when 59 | // the owning thread terminates). 
60 | std::atomic<std::uint32_t>* accessing() { 61 | return &accessing_; 62 | } 63 | 64 | private: 65 | // We don't want users making their own ThreadControls; the semantics and 66 | // cleanup code require each thread to have at most one ThreadControl. 67 | explicit ThreadControl(std::atomic<int>* threadCachedCpu); 68 | ~ThreadControl(); 69 | 70 | Code* code_; 71 | int tid_; 72 | std::uint32_t id_; 73 | std::atomic<int>* threadCachedCpu_; 74 | std::atomic<std::uint32_t> accessing_; 75 | 76 | ThreadControl* next_; 77 | ThreadControl* prev_; 78 | }; 79 | 80 | } // namespace internal 81 | } // namespace rseq 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rseq 2 | Rseq is a userspace take on the proposed kernel restartable sequences API, and 3 | provides a mechanism to perform efficient per-cpu operations. 4 | 5 | This isn't intended to be a long-running project. Instead, its goal is to allow 6 | userland experiments with rseq (without having to recompile the kernel), to 7 | collect data as to how useful it would be. 8 | 9 | ## Example 10 | Here is a simple demonstration of how to use rseq and why it might be useful. A 11 | more thorough explanation (together with history and some implementation notes) 12 | can be found in `Rseq.md`. 13 | 14 | rseq::Value<int> counterByCpu[kNumCpus]; 15 | void bumpCounter() { 16 | while (true) { 17 | int cpu = rseq::begin(); 18 | // A plain atomic load. rseq::Value types are API-compatible with 19 | // std::atomic. 20 | int curValue = counterByCpu[cpu].load(); 21 | // rseq::store takes no action and returns false if another thread might 22 | // have run on the same CPU after the call to rseq::begin(). Otherwise, the 23 | // store happens and the call returns true. 24 | bool success = rseq::store(&counterByCpu[cpu], curValue + 1); 25 | if (success) { 26 | break; 27 | } 28 | } 29 | } 30 | 31 | ## Requirements 32 | Rseq only works on Linux and x86-64. Building requires CMake and a recent 33 | version of clang or g++. Building the tests requires a gtest installation. 34 | 35 | ## Building Rseq 36 | In this directory, run: 37 | 38 | mkdir build 39 | cd build 40 | # Include the former option to produce an optimized build, and the latter to 41 | # enable running tests. 42 | cmake [-DCMAKE_BUILD_TYPE=Release] [-Dtest=ON] [-DCMAKE_INSTALL_PREFIX=<prefix>] ../ 43 | make 44 | 45 | # Now we can take some of our binaries for a test drive 46 | 47 | # If you passed -Dtest=ON above, this will run all tests. 48 | make test 49 | # Run a benchmark of a variety of mechanisms for incrementing a set of 50 | # counters. 51 | ./rseq_benchmark all 8 10000000 52 | 53 | ## Installing Rseq 54 | For the common case, you probably want: 55 | 56 | mkdir build && cd build 57 | cmake -DCMAKE_BUILD_TYPE=Release ../ 58 | sudo make install 59 | 60 | You can then compile programs that `#include "rseq/Rseq.h"` with 61 | `g++ myProgram.cpp -lrseq`. 62 | 63 | 64 | ## How Rseq works 65 | See `Rseq.md` for a more thorough description. Essentially, each thread gets its 66 | own copy of the code that does an rseq operation. When one thread wants to evict 67 | another from ownership of a CPU, it patches that thread's copy of the function 68 | to jump to a failure path instead of doing the operation. 69 | 70 | ## Full documentation 71 | [`Rseq.md`](Rseq.md) contains a more thorough description. Reading the comments in 72 | [`rseq/Rseq.h`](rseq/Rseq.h) should give a working understanding of the API.
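## Reading the per-cpu counters
A hedged sketch of the read side of the counter example above (it reuses `counterByCpu` and `kNumCpus` from that example and is not part of the library itself). Because `rseq::Value` is API-compatible with `std::atomic`, plain loads are enough to compute an approximate total:

    // Sum the per-CPU counters. The result is approximate, since other
    // threads may bump counters concurrently while we read.
    int readCounterTotal() {
      int total = 0;
      for (int cpu = 0; cpu < kNumCpus; ++cpu) {
        total += counterByCpu[cpu].load();
      }
      return total;
    }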
73 | 74 | ## License 75 | Rseq is BSD-licensed. We also provide an additional patent grant. 76 | -------------------------------------------------------------------------------- /rseq/internal/AsymmetricThreadFenceTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/AsymmetricThreadFence.h" 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | #include "rseq/internal/NumCpus.h" 19 | 20 | using namespace rseq::internal; 21 | 22 | class BiasedLock { 23 | public: 24 | BiasedLock() 25 | : fastTurn(true), 26 | fastInterested(false), 27 | slowInterested(false), 28 | slowMu(false) { 29 | } 30 | 31 | void lockFast() { 32 | fastInterested.store(true, std::memory_order_relaxed); 33 | fastTurn.store(true, std::memory_order_release); 34 | asymmetricThreadFenceLight(); 35 | while (slowInterested.load() && fastTurn.load()) { 36 | } 37 | } 38 | 39 | void unlockFast() { 40 | fastInterested.store(false, std::memory_order_release); 41 | } 42 | 43 | void lockSlow() { 44 | bool expected; 45 | do { 46 | expected = false; 47 | } while (!slowMu.compare_exchange_weak(expected, true)); 48 | slowInterested.store(true, std::memory_order_relaxed); 49 | fastTurn.store(false, std::memory_order_release); 50 | asymmetricThreadFenceHeavy(); 51 | while(fastInterested.load() && !fastTurn.load()) { 52 | } 53 | } 54 | 55 | void unlockSlow() { 56 | slowInterested.store(false, std::memory_order_release); 57 | slowMu.store(false, std::memory_order_release); 58 | } 59 | 60 | private: 61 | std::atomic fastTurn; 62 | std::atomic fastInterested; 63 | std::atomic slowInterested; 64 | std::atomic slowMu; 65 | }; 66 | 67 | TEST(AsymmetricThreadFence, BiasedLocking) { 68 | const std::uint64_t kFastIters = 3000000; 69 | const std::uint64_t kSlowIters = 10000; 70 | 71 | BiasedLock lock; 72 | std::uint64_t counter = 0; 73 | 74 | int numSlowThreads = numCpus() - 1; 75 | 76 | std::thread fastThread; 77 | std::vector slowThreads(numSlowThreads); 78 | 79 | // Start the slow threads incrementing the counter 80 | for (int i = 0; i < numSlowThreads; ++i) { 81 | slowThreads[i] = std::thread([&]() { 82 | for (int j = 0; j < kSlowIters; ++j) { 83 | lock.lockSlow(); 84 | ++counter; 85 | lock.unlockSlow(); 86 | } 87 | }); 88 | } 89 | // Start the fast thread incrementing the counter 90 | fastThread = std::thread([&]() { 91 | for (int j = 0; j < kFastIters; ++j) { 92 | lock.lockFast(); 93 | ++counter; 94 | lock.unlockFast(); 95 | } 96 | }); 97 | 98 | // Wait for the threads to finish. 99 | fastThread.join(); 100 | for (int i = 0; i < numSlowThreads; ++i) { 101 | slowThreads[i].join(); 102 | } 103 | EXPECT_EQ(kFastIters + numSlowThreads * kSlowIters, counter); 104 | } 105 | -------------------------------------------------------------------------------- /rseq/internal/Errors.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | namespace rseq { 22 | namespace internal { 23 | namespace errors { 24 | 25 | namespace detail { 26 | inline void abortWithMessage(const char* message) { 27 | // We ignore the error code; we can't do anything in case of a "real" failure, 28 | // and handling e.g. signal logic is more complicated than we need. 29 | write(STDERR_FILENO, message, std::strlen(message)); 30 | std::abort(); 31 | } 32 | 33 | inline void throwRuntimeException(const char* message) { 34 | #if __EXCEPTIONS 35 | throw std::runtime_error(message); 36 | #endif // __EXCEPTIONS 37 | } 38 | } // namespace detail 39 | 40 | // This should not return; it should either terminate the program or throw an 41 | // exception. 42 | typedef void (*FatalErrorHandler)(const char* message); 43 | 44 | // Error handlers are thread-local. The default one throws an 45 | // std::runtime_exception 46 | void setFatalErrorHandler(FatalErrorHandler handler); 47 | FatalErrorHandler getFatalErrorHandler(); 48 | 49 | void fatalError(const char* message); 50 | 51 | // While one of these is in scope, rseq failures will call abort(), and 52 | // destruction via a thrown exception causes abort. 53 | // Having the abort call happen implicitly in a destructor (as opposed to 54 | // writing "catch(...) { abort(); }") is advantageous because it means that core 55 | // dumps will show the stack trace of the function that threw the exception, not 56 | // the one that caught it. 57 | class AbortOnError { 58 | public: 59 | AbortOnError() { 60 | previousHandler_ = getFatalErrorHandler(); 61 | setFatalErrorHandler(&detail::abortWithMessage); 62 | } 63 | ~AbortOnError() { 64 | #if __EXCEPTIONS 65 | if (std::uncaught_exception()) { 66 | // Being destroyed as part of exception unwinding; abort. 67 | detail::abortWithMessage("Exception thrown into top-level C function.\n"); 68 | } 69 | #endif // __EXCEPTIONS 70 | setFatalErrorHandler(previousHandler_); 71 | } 72 | AbortOnError(const AbortOnError&) = delete; 73 | AbortOnError& operator=(const AbortOnError&) = delete; 74 | private: 75 | FatalErrorHandler previousHandler_; 76 | }; 77 | 78 | #ifdef __EXCEPTIONS 79 | class ThrowOnError { 80 | public: 81 | ThrowOnError() { 82 | previousHandler_ = getFatalErrorHandler(); 83 | setFatalErrorHandler(&detail::throwRuntimeException); 84 | } 85 | ~ThrowOnError() { 86 | setFatalErrorHandler(previousHandler_); 87 | } 88 | private: 89 | FatalErrorHandler previousHandler_; 90 | }; 91 | #else // __EXCEPTIONS 92 | typedef AbortOnError ThrowOnError; 93 | #endif // __EXCEPTIONS 94 | 95 | 96 | } // namespace errors 97 | } // namespace internal 98 | } // namespace rseq 99 | -------------------------------------------------------------------------------- /rseq/internal/ErrorsTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/Errors.h" 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | using namespace rseq::internal::errors; 18 | 19 | TEST(Errors, AbortOnErrorAborts) { 20 | AbortOnError aoe; 21 | ASSERT_DEATH(fatalError("ThisIsAnErrorString"), "ThisIsAnErrorString"); 22 | } 23 | 24 | #if __EXCEPTIONS 25 | 26 | TEST(Errors, DefaultThrows) { 27 | ThrowOnError thrower; 28 | std::string msg = "Some error message"; 29 | bool exceptionCaught = false; 30 | try { 31 | fatalError(msg.c_str()); 32 | } catch (const std::runtime_error& exception) { 33 | EXPECT_EQ(msg, exception.what()); 34 | exceptionCaught = true; 35 | } 36 | EXPECT_TRUE(exceptionCaught); 37 | } 38 | 39 | TEST(Errors, AllowsChangingHandler) { 40 | ThrowOnError thrower; 41 | // Get a copy of the old handler. 42 | FatalErrorHandler oldHandler = getFatalErrorHandler(); 43 | 44 | // Install a custom handler that throws a custom type. 45 | struct MyException { 46 | }; 47 | FatalErrorHandler myHandler = +[](const char* /* message */) { 48 | throw MyException(); 49 | }; 50 | setFatalErrorHandler(myHandler); 51 | 52 | // Make sure the custom handler is called. 53 | bool exceptionCaught = false; 54 | try { 55 | fatalError("this gets ignored"); 56 | } catch (const MyException& /* exception */) { 57 | exceptionCaught = true; 58 | } 59 | EXPECT_TRUE(exceptionCaught); 60 | 61 | // Make sure we can reinstall the old handler, and that it's the right one. 62 | setFatalErrorHandler(oldHandler); 63 | exceptionCaught = false; 64 | try { 65 | fatalError("this gets ignored too"); 66 | } catch (const std::runtime_error& /* exception */) { 67 | exceptionCaught = true; 68 | } 69 | EXPECT_TRUE(exceptionCaught); 70 | } 71 | 72 | static void throwException() { 73 | throw std::runtime_error("Runtime error"); 74 | } 75 | 76 | static void abortAfterCallingThrowException() { 77 | AbortOnError aoe; 78 | throwException(); 79 | } 80 | 81 | static void tryCatchException() { 82 | try { 83 | abortAfterCallingThrowException(); 84 | } catch (...) { 85 | } 86 | } 87 | 88 | TEST(Errors, AbortOnErrorAbortsAfterExceptions) { 89 | ASSERT_DEATH(tryCatchException(), ""); 90 | } 91 | 92 | TEST(Errors, AbortOnErrorIsntPermanent) { 93 | ThrowOnError thrower; 94 | { 95 | AbortOnError aoe; 96 | } 97 | bool exceptionCaught = false; 98 | try { 99 | fatalError("blah blah blah"); 100 | } catch (const std::runtime_error& /* exception */) { 101 | exceptionCaught = true; 102 | } 103 | EXPECT_TRUE(exceptionCaught); 104 | } 105 | 106 | #else // __EXCEPTIONS 107 | 108 | TEST(Errors, DefaultAborts) { 109 | ASSERT_DEATH(fatalError(""), ""); 110 | } 111 | 112 | #endif // __EXCEPTIONS 113 | -------------------------------------------------------------------------------- /rseq/internal/Mutex.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include "rseq/internal/Likely.h" 17 | 18 | // A simple clone of parts of . This lets us avoid depending on C++ 19 | // static constructors and linking against libstdc++ (which would stop people 20 | // from linking with plain-C binaries). 
21 | // We only acquire mutexes down slow paths, so we don't bother doing anything 22 | // fancy (like adaptive spinning, anything to avoid wasted wakeup attempts, 23 | // etc.). 24 | 25 | // These classes don't have constructors or destructors, so that they can live 26 | // safely in static memory without any C++ runtime support. If they do in fact 27 | // live in static memory, no initialization is needed. Otherwise, you have to 28 | // call init() on them explicitly. 29 | 30 | namespace rseq { 31 | namespace internal { 32 | namespace mutex { 33 | 34 | template 35 | class LockGuard { 36 | public: 37 | explicit LockGuard(Lock& lock) : lock_(lock) { 38 | lock_.lock(); 39 | } 40 | ~LockGuard() { 41 | lock_.unlock(); 42 | } 43 | private: 44 | Lock& lock_; 45 | }; 46 | 47 | class Mutex { 48 | public: 49 | void init() { 50 | state_.store(0, std::memory_order_relaxed); 51 | } 52 | 53 | void lock() { 54 | std::uint32_t oldState = state_.exchange(kHeldNoWaiter); 55 | if (oldState == kFree) { 56 | return; 57 | } 58 | oldState = state_.exchange(kHeldPossibleWaiter); 59 | while (oldState != kFree) { 60 | futexWait(kHeldPossibleWaiter); 61 | oldState = state_.exchange(kHeldPossibleWaiter); 62 | } 63 | } 64 | 65 | void unlock() { 66 | std::uint32_t oldState = state_.exchange(0); 67 | if (oldState == kHeldPossibleWaiter) { 68 | futexWake(1); 69 | } 70 | } 71 | 72 | private: 73 | constexpr static std::uint32_t kFree = 0; 74 | constexpr static std::uint32_t kHeldNoWaiter = 1; 75 | constexpr static std::uint32_t kHeldPossibleWaiter = 2; 76 | 77 | void futexWait(std::uint32_t val); 78 | void futexWake(int num); 79 | 80 | std::atomic state_; 81 | }; 82 | 83 | 84 | class OnceFlag; 85 | template 86 | void callOnce(OnceFlag&, Func&&, Args&&...); 87 | 88 | class OnceFlag { 89 | public: 90 | void init() { 91 | initialized_.store(false, std::memory_order_relaxed); 92 | mu_.init(); 93 | } 94 | private: 95 | template 96 | friend void ::rseq::internal::mutex::callOnce(OnceFlag&, Func&&, Args&&...); 97 | 98 | std::atomic initialized_; 99 | Mutex mu_; 100 | }; 101 | 102 | template 103 | void callOnce(OnceFlag& flag, Func&& func, Args&&... args) { 104 | if (RSEQ_LIKELY(flag.initialized_.load(std::memory_order_acquire))) { 105 | return; 106 | } 107 | LockGuard lg(flag.mu_); 108 | if (RSEQ_LIKELY(flag.initialized_.load(std::memory_order_relaxed))) { 109 | return; 110 | } 111 | func(std::forward(args)...); 112 | flag.initialized_.store(true, std::memory_order_release); 113 | } 114 | 115 | } // namespace mutex 116 | } // namespace internal 117 | } // namespace rseq 118 | -------------------------------------------------------------------------------- /rseq/internal/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 
7 | 8 | add_library(asymmetric_thread_fence AsymmetricThreadFence.cpp) 9 | target_link_libraries( 10 | asymmetric_thread_fence 11 | mutex 12 | errors 13 | ) 14 | list(APPEND all_sources internal/AsymmetricThreadFence.cpp) 15 | 16 | rseq_gtest( 17 | asymmetric_thread_fence_test 18 | AsymmetricThreadFenceTest.cpp 19 | asymmetric_thread_fence 20 | num_cpus 21 | ) 22 | 23 | 24 | add_library(cacheline_padded Dummy.cpp) 25 | 26 | rseq_gtest( 27 | cacheline_padded_test 28 | CachelinePaddedTest.cpp 29 | cacheline_padded 30 | ) 31 | 32 | 33 | add_library(clean_up_on_thread_death CleanUpOnThreadDeath.cpp) 34 | target_link_libraries( 35 | clean_up_on_thread_death 36 | errors 37 | mutex 38 | ) 39 | list(APPEND all_sources internal/CleanUpOnThreadDeath.cpp) 40 | 41 | rseq_gtest( 42 | clean_up_on_thread_death_test 43 | CleanUpOnThreadDeathTest.cpp 44 | clean_up_on_thread_death 45 | ) 46 | 47 | 48 | add_library(code Code.cpp) 49 | target_link_libraries(code cacheline_padded mutex os_mem) 50 | list(APPEND all_sources internal/Code.cpp) 51 | 52 | rseq_gtest( 53 | code_test 54 | CodeTest.cpp 55 | code 56 | ) 57 | 58 | 59 | add_library(cpu_local Dummy.cpp) 60 | target_link_libraries(cpu_local cacheline_padded num_cpus os_mem) 61 | 62 | rseq_gtest( 63 | cpu_local_test 64 | CpuLocalTest.cpp 65 | cpu_local 66 | num_cpus 67 | switch_to_cpu 68 | ) 69 | 70 | 71 | add_library(id_allocator Dummy.cpp) 72 | target_link_libraries(id_allocator mutex os_mem) 73 | 74 | rseq_gtest( 75 | id_allocator 76 | IdAllocatorTest.cpp 77 | id_allocator 78 | ) 79 | 80 | add_library(errors Errors.cpp) 81 | list(APPEND all_sources internal/Errors.cpp) 82 | 83 | rseq_gtest( 84 | errors_test 85 | ErrorsTest.cpp 86 | errors 87 | ) 88 | 89 | 90 | add_library(intrusive_linked_list Dummy.cpp) 91 | 92 | rseq_gtest( 93 | intrusive_linked_list_test 94 | IntrusiveLinkedListTest.cpp 95 | intrusive_linked_list 96 | ) 97 | 98 | 99 | add_library(likely Dummy.cpp) 100 | # LIKELY and UNLIKELY macros not tested 101 | 102 | 103 | add_library(mutex Mutex.cpp) 104 | target_link_libraries(mutex likely) 105 | list(APPEND all_sources internal/Mutex.cpp) 106 | 107 | rseq_gtest( 108 | mutex_test 109 | MutexTest.cpp 110 | mutex 111 | ) 112 | 113 | 114 | add_library(num_cpus NumCpus.cpp) 115 | list(APPEND all_sources internal/NumCpus.cpp) 116 | target_link_libraries(num_cpus mutex) 117 | # numCpus() not tested 118 | 119 | 120 | add_library(os_mem OsMem.cpp) 121 | target_link_libraries( 122 | os_mem 123 | errors 124 | ) 125 | list(APPEND all_sources internal/OsMem.cpp) 126 | 127 | rseq_gtest( 128 | os_mem_test 129 | OsMemTest.cpp 130 | os_mem 131 | errors 132 | ) 133 | 134 | 135 | add_library(internal_rseq Rseq.cpp rseq_c.cpp rseq_c_inlines.c) 136 | target_link_libraries( 137 | internal_rseq 138 | asymmetric_thread_fence 139 | code 140 | cpu_local 141 | errors 142 | mutex 143 | num_cpus 144 | thread_control 145 | ) 146 | list( 147 | APPEND 148 | all_sources 149 | internal/Rseq.cpp 150 | internal/rseq_c.cpp 151 | internal/rseq_c_inlines.c 152 | ) 153 | # rseq is tested through the public interface; no rseq_gtest here. 
154 | 155 | 156 | add_library(switch_to_cpu SwitchToCpu.cpp) 157 | target_link_libraries( 158 | switch_to_cpu 159 | errors 160 | ) 161 | # SwitchToCpu.cpp is test-only; we don't include it in all_sources 162 | rseq_gtest( 163 | switch_to_cpu_test 164 | SwitchToCpuTest.cpp 165 | num_cpus 166 | switch_to_cpu 167 | ) 168 | 169 | 170 | add_library(thread_control ThreadControl.cpp) 171 | target_link_libraries( 172 | thread_control 173 | clean_up_on_thread_death 174 | code 175 | id_allocator 176 | intrusive_linked_list 177 | mutex 178 | ) 179 | list(APPEND all_sources internal/ThreadControl.cpp) 180 | 181 | rseq_gtest( 182 | thread_control_test 183 | ThreadControlTest.cpp 184 | num_cpus 185 | switch_to_cpu 186 | thread_control 187 | ) 188 | 189 | set (all_sources ${all_sources} PARENT_SCOPE) 190 | -------------------------------------------------------------------------------- /rseq/internal/OsMemTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/OsMem.h" 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | #include "rseq/internal/Errors.h" 21 | 22 | using namespace rseq::internal; 23 | using namespace rseq::internal::os_mem; 24 | 25 | TEST(OsMem, SanityCheck) { 26 | const int kAllocSize1 = 123456; 27 | const int kAllocSize2 = 12345; 28 | 29 | void* alloc1 = allocate(kAllocSize1); 30 | void* alloc2 = allocate(kAllocSize2); 31 | 32 | unsigned char* arr1 = reinterpret_cast(alloc1); 33 | unsigned char* arr2 = reinterpret_cast(alloc2); 34 | 35 | for (int i = 0; i < kAllocSize1; ++i) { 36 | arr1[i] = 111; 37 | } 38 | for (int i = 0; i < kAllocSize2; ++i) { 39 | arr2[i] = 222; 40 | } 41 | 42 | for (int i = 0; i < kAllocSize1; ++i) { 43 | EXPECT_EQ(111, arr1[i]); 44 | } 45 | free(alloc1, kAllocSize1); 46 | 47 | for (int i = 0; i < kAllocSize2; ++i) { 48 | EXPECT_EQ(222, arr2[i]); 49 | } 50 | free(alloc2, kAllocSize2); 51 | } 52 | 53 | #if __EXCEPTIONS 54 | TEST(OsMem, ThrowsOnFailure) { 55 | errors::ThrowOnError thrower; 56 | 57 | bool failed = false; 58 | const std::size_t kTooBig = 1ULL << 48; 59 | try { 60 | allocate(kTooBig); 61 | } catch (...) { 62 | failed = true; 63 | } 64 | EXPECT_TRUE(failed); 65 | failed = false; 66 | try { 67 | allocateExecutable(kTooBig); 68 | } catch (...) { 69 | failed = true; 70 | } 71 | EXPECT_TRUE(failed); 72 | } 73 | #endif // __EXCEPTIONS 74 | 75 | TEST(OsMem, AllocatesExecutable) { 76 | const unsigned char return12345Template[] = { 77 | // mov $12345, %eax ; (12345 = 0x3039) 78 | 0xb8, 0x39, 0x30, 0x00, 0x00, 79 | // retq 80 | 0xc3, 81 | }; 82 | 83 | void* code = allocateExecutable(sizeof(return12345Template)); 84 | std::memcpy(code, return12345Template, sizeof(return12345Template)); 85 | int (*fn)() = reinterpret_cast(code); 86 | EXPECT_EQ(12345, fn()); 87 | free(code, sizeof(return12345Template)); 88 | } 89 | 90 | TEST(OsMem, Frees) { 91 | // These can't be stack variables, since we need to know their address 92 | // (without being told) in the signal handler. We make them thread-local to 93 | // avoid any parallel testing trickiness. 
94 | static __thread void* volatile alloc; 95 | static __thread volatile bool segfaulted; 96 | static __thread jmp_buf returnFromSegfault; 97 | 98 | alloc = nullptr; 99 | segfaulted = false; 100 | 101 | struct sigaction oldHandler; 102 | struct sigaction newHandler; 103 | 104 | void (*segfaultHandler)(int, siginfo_t*, void*) 105 | = +[](int signo, siginfo_t* info, void* ucontext) { 106 | EXPECT_EQ(SIGSEGV, signo); 107 | // EXPECT_EQ is a little screwy with regards to volatile pointer (note: 108 | // not pointer *to* volatile) arguments. We copy its argument into a 109 | // non-volatile pointer to help it out. 110 | void* allocCopy = alloc; 111 | EXPECT_EQ(allocCopy, info->si_addr); 112 | segfaulted = true; 113 | // We setjmp(returnFromSegfault) before triggering the segfault. 114 | longjmp(returnFromSegfault, 1); 115 | }; 116 | 117 | std::memset(&newHandler, 0, sizeof(newHandler)); 118 | newHandler.sa_sigaction = segfaultHandler; 119 | newHandler.sa_flags = SA_SIGINFO; 120 | int err = sigaction(SIGSEGV, &newHandler, &oldHandler); 121 | ASSERT_EQ(0, err); 122 | 123 | alloc = allocate(1); 124 | volatile char* c = static_cast(alloc); 125 | *c = 123; 126 | 127 | free(alloc, 1); 128 | EXPECT_FALSE(segfaulted); 129 | 130 | // BEGIN MAGIC 131 | if (!setjmp(returnFromSegfault)) { 132 | // Not returning from the segfault handler; cause a segfault. 133 | *c; 134 | } else { 135 | // Returning from the segfault handler. 136 | EXPECT_TRUE(segfaulted); 137 | } 138 | // END MAGIC 139 | 140 | // Go back to the previous signal handler (probably crashing). 141 | sigaction(SIGSEGV, &oldHandler, nullptr); 142 | } 143 | -------------------------------------------------------------------------------- /rseq/internal/ThreadControlTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/ThreadControl.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | #include "rseq/internal/Code.h" 22 | #include "rseq/internal/NumCpus.h" 23 | #include "rseq/internal/SwitchToCpu.h" 24 | 25 | using namespace rseq::internal; 26 | 27 | class ThreadControlFixture : public ::testing::Test { 28 | protected: 29 | void SetUp() override { 30 | me = ThreadControl::get(&myThreadCachedCpu); 31 | 32 | childDead = false; 33 | childShouldDie = false; 34 | child = std::thread([&]() { 35 | std::unique_lock ul(mu); 36 | while (true) { 37 | if (childShouldDie) { 38 | return; 39 | } 40 | if (func != nullptr) { 41 | func(); 42 | func = nullptr; 43 | } 44 | cond.notify_all(); 45 | cond.wait(ul, [&]() { 46 | return childShouldDie || func != nullptr; 47 | }); 48 | } 49 | }); 50 | 51 | childCpu = numCpus() > 1 ? 1 : 0; 52 | 53 | switchToCpu(0); 54 | // To check thread transition logic, we want the child to get its 55 | // ThreadControl on one CPU, and have it manipulated while it's on another. 
56 | runOnChild([&]() { 57 | switchToCpu(0); 58 | childThreadControl = ThreadControl::get(&childThreadCachedCpu); 59 | switchToCpu(childCpu); 60 | }); 61 | } 62 | 63 | void TearDown() override { 64 | if (!childDead) { 65 | killChild(); 66 | } 67 | child.join(); 68 | } 69 | 70 | // Only starts the child's death; doesn't join() it. 71 | void killChild() { 72 | std::lock_guard lg(mu); 73 | childShouldDie = true; 74 | cond.notify_all(); 75 | } 76 | 77 | void runOnChild(std::function f) { 78 | func = f; 79 | std::unique_lock ul(mu); 80 | cond.notify_all(); 81 | cond.wait(ul, [&]() { 82 | return func == nullptr; 83 | }); 84 | } 85 | 86 | std::atomic myThreadCachedCpu; 87 | ThreadControl* me; 88 | 89 | int childCpu; 90 | std::atomic childThreadCachedCpu; 91 | ThreadControl* childThreadControl; 92 | 93 | bool childDead; 94 | bool childShouldDie; 95 | std::thread child; 96 | std::mutex mu; 97 | std::condition_variable cond; 98 | std::function func; 99 | }; 100 | 101 | TEST_F(ThreadControlFixture, IdManipulation) { 102 | std::uint32_t myId = me->id(); 103 | std::uint32_t childId = childThreadControl->id(); 104 | EXPECT_EQ(me, ThreadControl::forId(myId)); 105 | EXPECT_EQ(childThreadControl, ThreadControl::forId(childId)); 106 | } 107 | 108 | TEST_F(ThreadControlFixture, Code) { 109 | Code* code = me->code(); 110 | EXPECT_NE(nullptr, code->rseqLoadFunc()); 111 | EXPECT_NE(nullptr, code->rseqStoreFunc()); 112 | EXPECT_NE(nullptr, code->rseqStoreFenceFunc()); 113 | } 114 | 115 | TEST_F(ThreadControlFixture, RseqManipulation) { 116 | std::uintptr_t dst = 0; 117 | runOnChild([&]() { 118 | EXPECT_FALSE(childThreadControl->code()->rseqStoreFunc()(&dst, 1)); 119 | }); 120 | EXPECT_EQ(1, dst); 121 | childThreadControl->blockRseqOps(); 122 | childThreadCachedCpu.store(0); 123 | runOnChild([&]() { 124 | EXPECT_TRUE(childThreadControl->code()->rseqStoreFunc()(&dst, 2)); 125 | }); 126 | EXPECT_LT(childThreadCachedCpu.load(), 0); 127 | EXPECT_EQ(1, dst); 128 | childThreadControl->unblockRseqOps(); 129 | runOnChild([&]() { 130 | EXPECT_FALSE(childThreadControl->code()->rseqStoreFunc()(&dst, 2)); 131 | }); 132 | EXPECT_EQ(2, dst); 133 | } 134 | 135 | TEST_F(ThreadControlFixture, CurCpu) { 136 | EXPECT_EQ(childCpu, childThreadControl->curCpu()); 137 | runOnChild([&]() { 138 | switchToCpu(0); 139 | }); 140 | EXPECT_EQ(0, childThreadControl->curCpu()); 141 | } 142 | 143 | TEST_F(ThreadControlFixture, LivesWhileBeingAccessed) { 144 | me->accessing()->store(childThreadControl->id()); 145 | killChild(); 146 | /* sleep override */ 147 | // Give it a bit to die on its own, if it's going to. 148 | std::this_thread::sleep_for(std::chrono::milliseconds(100)); 149 | EXPECT_EQ(childCpu, childThreadControl->curCpu()); 150 | me->accessing()->store(0); 151 | } 152 | 153 | TEST_F(ThreadControlFixture, DiesWhenNotAccessed) { 154 | killChild(); 155 | // If the child doesn't die, then we'll time out when the subsequent join() 156 | // call fails. 157 | } 158 | -------------------------------------------------------------------------------- /rseq/internal/IdAllocatorTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/IdAllocator.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | using namespace rseq::internal; 21 | 22 | struct IdOwner { 23 | std::uint32_t id; 24 | }; 25 | 26 | TEST(IdAllocator, SingleThreaded) { 27 | const int kNumOwners = 100000; 28 | 29 | // Remember that [] dereferencing automatically inserts the key with the value 30 | // 0. 31 | std::unordered_map countForId; 32 | 33 | IdAllocator idAllocator(kNumOwners + 1); 34 | 35 | std::vector owners(kNumOwners); 36 | 37 | // Allocate a bunch of ids 38 | for (int i = 0; i < kNumOwners; ++i) { 39 | owners[i].id = idAllocator.allocate(&owners[i]); 40 | EXPECT_NE(0, owners[i].id); 41 | EXPECT_EQ(i + 1, owners[i].id); 42 | EXPECT_EQ(1, ++countForId[owners[i].id]); 43 | } 44 | 45 | // Check that the owners match up 46 | for (int i = 0; i < kNumOwners; ++i) { 47 | EXPECT_EQ(&owners[i], idAllocator.lookupOwner(owners[i].id)); 48 | } 49 | 50 | // OK, now we mix things up a little 51 | 52 | // Free index i if i % 3 == 0 53 | for (int i = 0; i < kNumOwners; i += 3) { 54 | idAllocator.free(owners[i].id); 55 | EXPECT_EQ(0, --countForId[owners[i].id]); 56 | } 57 | 58 | // Free index i if i % 3 == 1 59 | for (int i = 1; i < kNumOwners; i += 3) { 60 | idAllocator.free(owners[i].id); 61 | EXPECT_EQ(0, --countForId[owners[i].id]); 62 | } 63 | 64 | // Allocate for index i if i % 3 == 0 or i % 3 == 1 65 | for (int i = 0; i < kNumOwners; ++i) { 66 | if (i % 3 == 0 || i % 3 == 1) { 67 | owners[i].id = idAllocator.allocate(&owners[i]); 68 | EXPECT_EQ(1, ++countForId[owners[i].id]); 69 | EXPECT_NE(0, owners[i].id); 70 | } 71 | } 72 | 73 | // Check that things still match 74 | for (int i = 0; i < kNumOwners; ++i) { 75 | EXPECT_EQ(&owners[i], idAllocator.lookupOwner(owners[i].id)); 76 | } 77 | 78 | // At any given time, we had <= kNumOwners allocated Ids. Now we go to 79 | // kNumOwners + 1. 
80 | IdOwner newOwner; 81 | newOwner.id = idAllocator.allocate(&newOwner); 82 | EXPECT_EQ(newOwner.id, kNumOwners + 1); 83 | EXPECT_EQ(1, ++countForId[newOwner.id]); 84 | } 85 | 86 | void updateMax(std::atomic* max, std::uint32_t atLeast) { 87 | std::uint32_t curMax = max->load(); 88 | while (curMax < atLeast) { 89 | if (max->compare_exchange_strong(curMax, atLeast)) { 90 | break; 91 | } 92 | } 93 | } 94 | 95 | TEST(IdAllocator, MultiThreaded) { 96 | const int kNumThreads = 10; 97 | const int kAllocationsPerThread = 100000; 98 | 99 | std::atomic highestIdAllocated(0); 100 | 101 | std::vector> ownersByThread(kNumThreads); 102 | for (int i = 0; i < kNumThreads; ++i) { 103 | ownersByThread[i] = std::vector(kAllocationsPerThread); 104 | } 105 | 106 | IdAllocator idAllocator(kNumThreads * kAllocationsPerThread + 1); 107 | 108 | std::vector threads(kNumThreads); 109 | 110 | // Spawn many threads, doing many allocations and frees 111 | for (int i = 0; i < kNumThreads; ++i) { 112 | threads[i] = std::thread([&, i]() { 113 | // Allocate everything 114 | for (int j = 0; j < kAllocationsPerThread; ++j) { 115 | ownersByThread[i][j].id = idAllocator.allocate(&ownersByThread[i][j]); 116 | EXPECT_NE(0, ownersByThread[i][j].id); 117 | } 118 | // Free the evens 119 | for (int j = 0; j < kAllocationsPerThread; j += 2) { 120 | idAllocator.free(ownersByThread[i][j].id); 121 | } 122 | // Reallocate them 123 | for (int j = 0; j < kAllocationsPerThread; j += 2) { 124 | ownersByThread[i][j].id = idAllocator.allocate(&ownersByThread[i][j]); 125 | EXPECT_NE(0, ownersByThread[i][j].id); 126 | } 127 | }); 128 | } 129 | for (int i = 0; i < kNumThreads; ++i) { 130 | threads[i].join(); 131 | } 132 | 133 | std::unordered_map countForId; 134 | for (int i = 0; i < kNumThreads; ++i) { 135 | for (int j = 0; j < kAllocationsPerThread; ++j) { 136 | EXPECT_NE(0, ownersByThread[i][j].id); 137 | EXPECT_EQ( 138 | &ownersByThread[i][j], 139 | idAllocator.lookupOwner(ownersByThread[i][j].id)); 140 | } 141 | } 142 | IdOwner newOwner; 143 | newOwner.id = idAllocator.allocate(&newOwner); 144 | EXPECT_EQ(kNumThreads * kAllocationsPerThread + 1, newOwner.id); 145 | } 146 | -------------------------------------------------------------------------------- /rseq/internal/CodeTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/Code.h" 11 | 12 | #include 13 | 14 | using namespace rseq::internal; 15 | 16 | TEST(Code, Allocation) { 17 | // Note: we assume that this is divisible by 4 later. 18 | const int kNumAllocations = 10000; 19 | std::atomic threadCachedCpu[kNumAllocations]; 20 | Code* code[kNumAllocations]; 21 | for (int i = 0; i < kNumAllocations; ++i) { 22 | code[i] = Code::initForId(i, &threadCachedCpu[i]); 23 | } 24 | // Make sure they all work 25 | for (int i = 0; i < kNumAllocations; ++i) { 26 | std::uint64_t val = 12345; 27 | std::uint64_t dst = 0; 28 | EXPECT_FALSE(code[i]->rseqLoadFunc()(&dst, &val)); 29 | EXPECT_EQ(12345, dst); 30 | } 31 | // Block the even ones 32 | for (int i = 0; i < kNumAllocations; i += 2) { 33 | code[i]->blockRseqOps(); 34 | } 35 | // Make sure the evens don't work and the odds do. 
36 | for (int i = 0; i < kNumAllocations; ++i) { 37 | std::uint64_t val = 12345; 38 | std::uint64_t dst = 0; 39 | EXPECT_EQ(i % 2 == 0, code[i]->rseqLoadFunc()(&dst, &val)); 40 | EXPECT_EQ(12345 * (i % 2), dst); 41 | } 42 | // Block the odds 43 | for (int i = 1; i < kNumAllocations; i += 2) { 44 | code[i]->blockRseqOps(); 45 | } 46 | // Reallocate the evens, but with a different mapping between threadCachedCpus 47 | // and Codes. 48 | for (int i = 0; i < kNumAllocations; ++i) { 49 | if (i % 4 == 0) { 50 | code[i] = Code::initForId(i / 2, &threadCachedCpu[i]); 51 | } 52 | if (i % 4 == 2) { 53 | // Here we use the knowledge that kNumAllocations is divisible by 4 54 | // (kNumAllocations / 2 is even). 55 | code[i] = Code::initForId( 56 | i / 2 + kNumAllocations / 2, &threadCachedCpu[i]); 57 | } 58 | } 59 | // Make sure the evens work and the odds dont. 60 | for (int i = 0; i < kNumAllocations; ++i) { 61 | std::uint64_t val = 12345; 62 | std::uint64_t dst = 0; 63 | 64 | bool failed = code[i]->rseqLoadFunc()(&dst, &val); 65 | if (i % 2 == 0) { 66 | EXPECT_FALSE(failed); 67 | EXPECT_EQ(12345, dst); 68 | } 69 | } 70 | } 71 | 72 | class CodeFixture : public ::testing::Test { 73 | protected: 74 | void SetUp() override { 75 | code = Code::initForId(1, &threadCachedCpu); 76 | threadCachedCpu.store(0); 77 | } 78 | 79 | Code* code; 80 | std::atomic threadCachedCpu; 81 | }; 82 | 83 | TEST_F(CodeFixture, LoadsCorrectly) { 84 | std::uint64_t val = 12345; 85 | std::uint64_t dst = 0; 86 | EXPECT_FALSE(code->rseqLoadFunc()(&dst, &val)); 87 | EXPECT_EQ(12345, dst); 88 | EXPECT_GE(0, threadCachedCpu.load()); 89 | } 90 | 91 | TEST_F(CodeFixture, StoresCorrectly) { 92 | std::uint64_t dst = 0; 93 | EXPECT_FALSE(code->rseqStoreFunc()(&dst, 12345)); 94 | EXPECT_EQ(12345, dst); 95 | EXPECT_GE(0, threadCachedCpu.load()); 96 | } 97 | 98 | TEST_F(CodeFixture, StoreFencesCorrectly) { 99 | std::uint64_t dst = 0; 100 | EXPECT_FALSE(code->rseqStoreFenceFunc()(&dst, 12345)); 101 | EXPECT_EQ(12345, dst); 102 | EXPECT_GE(0, threadCachedCpu.load()); 103 | } 104 | 105 | TEST_F(CodeFixture, BlocksLoads) { 106 | std::uint64_t val = 12345; 107 | std::uint64_t dst = 0; 108 | code->blockRseqOps(); 109 | EXPECT_TRUE(code->rseqLoadFunc()(&dst, &val)); 110 | EXPECT_LT(threadCachedCpu.load(), 0); 111 | EXPECT_EQ(0, dst); 112 | } 113 | 114 | TEST_F(CodeFixture, BlocksStores) { 115 | std::uint64_t dst = 0; 116 | code->blockRseqOps(); 117 | EXPECT_TRUE(code->rseqStoreFunc()(&dst, 12345)); 118 | EXPECT_LT(threadCachedCpu.load(), 0); 119 | EXPECT_EQ(0, dst); 120 | } 121 | 122 | TEST_F(CodeFixture, BlocksStoreFences) { 123 | std::uint64_t dst = 0; 124 | code->blockRseqOps(); 125 | EXPECT_TRUE(code->rseqStoreFenceFunc()(&dst, 12345)); 126 | EXPECT_LT(threadCachedCpu.load(), 0); 127 | EXPECT_EQ(0, dst); 128 | } 129 | 130 | TEST_F(CodeFixture, UnblocksLoads) { 131 | std::uint64_t val = 12345; 132 | std::uint64_t dst = 0; 133 | code->blockRseqOps(); 134 | code->unblockRseqOps(); 135 | EXPECT_FALSE(code->rseqLoadFunc()(&dst, &val)); 136 | EXPECT_EQ(12345, dst); 137 | } 138 | 139 | TEST_F(CodeFixture, UnblocksStores) { 140 | std::uint64_t dst = 0; 141 | code->blockRseqOps(); 142 | code->unblockRseqOps(); 143 | EXPECT_FALSE(code->rseqStoreFunc()(&dst, 12345)); 144 | EXPECT_EQ(dst, 12345); 145 | } 146 | 147 | TEST_F(CodeFixture, UnblocksStoreFences) { 148 | std::uint64_t dst = 0; 149 | code->blockRseqOps(); 150 | code->unblockRseqOps(); 151 | EXPECT_FALSE(code->rseqStoreFenceFunc()(&dst, 12345)); 152 | EXPECT_EQ(dst, 12345); 153 | } 154 | 
-------------------------------------------------------------------------------- /rseq/internal/CleanUpOnThreadDeathTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/CleanUpOnThreadDeath.h" 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | using namespace rseq::internal; 21 | 22 | int rseqVal; 23 | int threadControlVal; 24 | bool rseqValSetDuringThreadControlCleanup; 25 | 26 | void rseqCleanupFunc() { 27 | rseqVal = 1; 28 | } 29 | 30 | void threadControlCleanupFunc() { 31 | rseqValSetDuringThreadControlCleanup = (rseqVal == 1); 32 | threadControlVal = 1; 33 | } 34 | 35 | TEST(CleanUpOnThreadDeath, CallsRseq) { 36 | rseqValSetDuringThreadControlCleanup = false; 37 | rseqVal = threadControlVal = 0; 38 | std::thread t([]() { 39 | setRseqCleanup(rseqCleanupFunc); 40 | }); 41 | t.join(); 42 | EXPECT_EQ(1, rseqVal); 43 | } 44 | 45 | TEST(CleanUpOnThreadDeath, CallsThreadControl) { 46 | rseqValSetDuringThreadControlCleanup = false; 47 | rseqVal = threadControlVal = 0; 48 | 49 | std::thread t([]() { 50 | setThreadControlCleanup(threadControlCleanupFunc); 51 | }); 52 | t.join(); 53 | EXPECT_EQ(1, threadControlVal); 54 | } 55 | 56 | 57 | TEST(CleanUpOnThreadDeath, OrdersCallsCorrectlyWhenAddedInOrder) { 58 | rseqValSetDuringThreadControlCleanup = false; 59 | rseqVal = threadControlVal = 0; 60 | std::thread t([]() { 61 | setRseqCleanup(rseqCleanupFunc); 62 | setThreadControlCleanup(threadControlCleanupFunc); 63 | }); 64 | t.join(); 65 | EXPECT_TRUE(rseqValSetDuringThreadControlCleanup); 66 | } 67 | 68 | TEST(CleanUpOnThreadDeath, OrdersCallsCorrectlyWhenNotAddedInOrder) { 69 | rseqValSetDuringThreadControlCleanup = false; 70 | rseqVal = threadControlVal = 0; 71 | std::thread t([]() { 72 | setThreadControlCleanup(threadControlCleanupFunc); 73 | setRseqCleanup(rseqCleanupFunc); 74 | }); 75 | t.join(); 76 | EXPECT_TRUE(rseqValSetDuringThreadControlCleanup); 77 | } 78 | 79 | TEST(CleanUpOnThreadDeath, OutlivesThreadLocals) { 80 | static __thread int deathCount; 81 | deathCount = 0; 82 | 83 | static void (*bumpAndCheckDeathCount)() = []() { 84 | EXPECT_EQ(0, deathCount); 85 | ++deathCount; 86 | }; 87 | 88 | struct SetsCleanup { 89 | ~SetsCleanup() { 90 | setRseqCleanup(bumpAndCheckDeathCount); 91 | } 92 | }; 93 | thread_local SetsCleanup setsCleanup1; 94 | thread_local SetsCleanup setsCleanup2; 95 | void* volatile odrUse = &setsCleanup1; 96 | odrUse = &setsCleanup2; 97 | setRseqCleanup(bumpAndCheckDeathCount); 98 | } 99 | 100 | TEST(CleanUpOnThreadDeath, SupportsReinitialization) { 101 | static pthread_key_t key1; 102 | static pthread_key_t key2; 103 | static pthread_key_t key3; 104 | 105 | struct TestInfo { 106 | TestInfo() 107 | : rseqInitialized(false), 108 | numRseqInitializations(0), 109 | numRseqDestructions(0) {} 110 | bool rseqInitialized; 111 | int numRseqInitializations; 112 | int numRseqDestructions; 113 | }; 114 | // Note: only used by the child 115 | static __thread TestInfo* myTestInfo; 116 | std::unique_ptr childTestInfo; 117 | 118 | static auto initializeRseq = []() { 119 | if (!myTestInfo->rseqInitialized) { 120 | myTestInfo->rseqInitialized = 
true; 121 | ++myTestInfo->numRseqInitializations; 122 | setRseqCleanup([]() { 123 | ++myTestInfo->numRseqDestructions; 124 | myTestInfo->rseqInitialized = false; 125 | }); 126 | } 127 | }; 128 | 129 | static void (*destructor3)(void*) = [](void*) { 130 | initializeRseq(); 131 | }; 132 | static void (*destructor1)(void*) = [](void*) { 133 | initializeRseq(); 134 | pthread_setspecific(key3, reinterpret_cast(3)); 135 | }; 136 | static void (*destructor2)(void*) = [](void*) { 137 | initializeRseq(); 138 | }; 139 | static std::once_flag once; 140 | std::call_once(once, []() { 141 | pthread_key_create(&key1, destructor1); 142 | pthread_key_create(&key2, destructor2); 143 | pthread_key_create(&key3, destructor3); 144 | }); 145 | 146 | 147 | std::thread t([&]() { 148 | // Easiest way to tell the pthread destructors where to find the TestInfo is 149 | // a threadlocal. 150 | myTestInfo = new TestInfo; 151 | childTestInfo.reset(myTestInfo); 152 | pthread_setspecific(key1, reinterpret_cast(1)); 153 | initializeRseq(); 154 | pthread_setspecific(key2, reinterpret_cast(2)); 155 | }); 156 | t.join(); 157 | EXPECT_TRUE( 158 | childTestInfo->numRseqInitializations 159 | == childTestInfo->numRseqDestructions); 160 | EXPECT_FALSE( 161 | childTestInfo->rseqInitialized); 162 | } 163 | -------------------------------------------------------------------------------- /rseq/internal/Code.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/Code.h" 11 | 12 | #include 13 | 14 | #include "rseq/internal/CachelinePadded.h" 15 | #include "rseq/internal/Mutex.h" 16 | #include "rseq/internal/OsMem.h" 17 | 18 | namespace rseq { 19 | namespace internal { 20 | 21 | static const unsigned char codeTemplate[] = { 22 | // 8-byte load code. Prototype is: 23 | // int (*)(unsigned long* dst, unsigned long* src); 24 | 25 | // Do the load 26 | // mov (%rsi), %rax 27 | /* offset 0: */ 0x48, 0x8b, 0x06, 28 | 29 | // Store it into *dst 30 | // mov %rax, (%rdi) 31 | /* offset 3: */ 0x48, 0x89, 0x07, 32 | 33 | // Return success! (i.e. 0) 34 | // xor %eax, %eax 35 | /* offset 6: */ 0x31, 0xc0, 36 | // retq 37 | /* offset 8: */ 0xc3, 38 | 39 | // Padding bytes 40 | /* offset 9: */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 41 | 42 | // 8-byte store code. Prototype is: 43 | // int (*)(unsigned long* dst, unsigned long val); 44 | 45 | // Do the store. 46 | // mov %rsi, (%rdi) 47 | /* offset 16: */ 0x48, 0x89, 0x37, 48 | 49 | // Return success! (i.e. 0) 50 | // xor %eax, %eax 51 | /* offset 19: */ 0x31, 0xc0, 52 | // retq 53 | /* offset 21: */ 0xc3, 54 | 55 | 56 | // Padding bytes 57 | /* offset 22: */ 0x00, 0x00, 58 | 59 | 60 | // 8-byte store-fence code. Prototype is: 61 | // int (*)(unsigned long* dst, unsigned long val); 62 | 63 | // Do the store (via xchg). 64 | // xchg %rsi, (%rdi) 65 | /* offset 24: */ 0x48, 0x87, 0x37, 66 | 67 | // Return success! (i.e. 0) 68 | // xor %eax, %eax 69 | /* offset 27: */ 0x31, 0xc0, 70 | // retq 71 | /* offset 29: */ 0xc3, 72 | 73 | 74 | // Padding bytes 75 | /* offset 30: */ 0x00, 0x00, 76 | 77 | 78 | // Failure path. 79 | // This code is shared by all the load and store paths above. 
80 | // The initial instruction of each path is patched to be a jump to here. 81 | 82 | // Store -1 into the threadCachedCpu variable. 83 | // The 42s get replaced with a pointer to the owning thread's threadCachedCpu 84 | // variable. 85 | // movabs $0x4242424242424242, %rax 86 | /* offset 32: */ 0x48, 0xb8, 87 | /* offset 34: */ 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 88 | // movl $-1, (%rax) 89 | /* offset 42: */ 0xc7, 0x00, 0xff, 0xff, 0xff, 0xff, 90 | 91 | // Return failure :( (i.e. 1). 92 | // mov $1, %eax 93 | /* offset 48: */ 0xb8, 0x01, 0x00, 0x00, 0x00, 94 | // retq 95 | /* offset 53: */ 0xc3 96 | }; 97 | 98 | 99 | const static int kLoadOffset = 0; 100 | const static int kStoreOffset = 16; 101 | const static int kStoreFenceOffset = 24; 102 | const static int kReturnFailureOffset = 32; 103 | const static int kThreadCachedCpuOffset = 34; 104 | 105 | 106 | const static int kJmpInstructionSize = 2; 107 | 108 | const int kLoadToFailureJmpSize 109 | = kReturnFailureOffset - kLoadOffset - kJmpInstructionSize; 110 | const int kStoreToFailureJmpSize 111 | = kReturnFailureOffset - kStoreOffset - kJmpInstructionSize; 112 | const int kStoreFenceToFailureJmpSize 113 | = kReturnFailureOffset - kStoreFenceOffset - kJmpInstructionSize; 114 | 115 | 116 | const std::uint16_t kJmpBytecode = 0xeb; 117 | const std::uint16_t kLoadReplacement 118 | = kJmpBytecode | (kLoadToFailureJmpSize << 8); 119 | const std::uint16_t kStoreReplacement 120 | = kJmpBytecode | (kStoreToFailureJmpSize << 8); 121 | const std::uint16_t kStoreFenceReplacement 122 | = kJmpBytecode | (kStoreFenceToFailureJmpSize << 8); 123 | 124 | 125 | static mutex::OnceFlag codePagesOnceFlag; 126 | static CachelinePadded* codePages; 127 | 128 | // static 129 | Code* Code::initForId(std::uint32_t id, std::atomic* threadCachedCpu) { 130 | static_assert( 131 | sizeof(codeTemplate) == sizeof(Code::code_), 132 | "codeTemplate and code_ storage size must match."); 133 | 134 | mutex::callOnce(codePagesOnceFlag, []() { 135 | // We get kMaxGlobalThreads from the kernel limit. This reserves 256MB of 136 | // address space, but pages are lazily allocated, so the actual cost is much 137 | // smaller. 
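// (Arithmetic, assuming CachelinePadded<Code> rounds each Code up to one
// 64-byte cache line: 2^22 slots * 64 bytes = 256MB of reserved address space.)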
138 | const int kMaxGlobalThreads = 1 << 22; 139 | const int kMemToReserve = kMaxGlobalThreads * sizeof(CachelinePadded); 140 | 141 | void* alloc = os_mem::allocateExecutable(kMemToReserve); 142 | codePages = static_cast*>(alloc); 143 | }); 144 | Code* code = codePages[id].get(); 145 | std::memcpy(code->code_, codeTemplate, sizeof(codeTemplate)); 146 | std::memcpy( 147 | &code->code_[kThreadCachedCpuOffset], 148 | &threadCachedCpu, 149 | sizeof(threadCachedCpu)); 150 | return code; 151 | } 152 | 153 | Code::RseqLoadFunc Code::rseqLoadFunc() { 154 | return reinterpret_cast(&code_[kLoadOffset]); 155 | } 156 | 157 | Code::RseqStoreFunc Code::rseqStoreFunc() { 158 | return reinterpret_cast(&code_[kStoreOffset]); 159 | } 160 | 161 | Code::RseqStoreFunc Code::rseqStoreFenceFunc() { 162 | return reinterpret_cast(&code_[kStoreFenceOffset]); 163 | } 164 | 165 | void Code::blockRseqOps() { 166 | std::atomic* load = 167 | reinterpret_cast*>(&code_[kLoadOffset]); 168 | std::atomic* store = 169 | reinterpret_cast*>(&code_[kStoreOffset]); 170 | std::atomic* storeFence = 171 | reinterpret_cast*>(&code_[kStoreFenceOffset]); 172 | load->store(kLoadReplacement, std::memory_order_relaxed); 173 | store->store(kStoreReplacement, std::memory_order_relaxed); 174 | storeFence->store(kStoreFenceReplacement, std::memory_order_relaxed); 175 | } 176 | 177 | void Code::unblockRseqOps() { 178 | const std::uint16_t kLoadBytes = 0x8b48; 179 | const std::uint16_t kStoreBytes = 0x8948; 180 | const std::uint16_t kStoreFenceBytes = 0x8748; 181 | 182 | std::atomic* load = 183 | reinterpret_cast*>(&code_[kLoadOffset]); 184 | std::atomic* store = 185 | reinterpret_cast*>(&code_[kStoreOffset]); 186 | std::atomic* storeFence = 187 | reinterpret_cast*>(&code_[kStoreFenceOffset]); 188 | 189 | load->store(kLoadBytes, std::memory_order_relaxed); 190 | store->store(kStoreBytes, std::memory_order_relaxed); 191 | storeFence->store(kStoreFenceBytes, std::memory_order_relaxed); 192 | } 193 | } // namespace internal 194 | } // namespace rseq 195 | -------------------------------------------------------------------------------- /rseq/internal/Rseq.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/internal/Rseq.h" 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | #include "rseq/internal/AsymmetricThreadFence.h" 18 | #include "rseq/internal/Code.h" 19 | #include "rseq/internal/CleanUpOnThreadDeath.h" 20 | #include "rseq/internal/CpuLocal.h" 21 | #include "rseq/internal/Mutex.h" 22 | #include "rseq/internal/NumCpus.h" 23 | #include "rseq/internal/ThreadControl.h" 24 | 25 | namespace rseq { 26 | namespace internal { 27 | 28 | static __thread int lastCpu; 29 | 30 | static __thread ThreadControl* me; 31 | 32 | // In at least some environments, alignof(std::atomic) == 4 if 33 | // alignof(T) == 4, even if sizeof(T) == 8; this won't work for us. 34 | // We could alignas this struct, but I think the gold standard 35 | // for "this is lock-free regardless of compiler and standard library choices" 36 | // is still using an integral type. So instead of using 37 | // std::atomic, we use AtomicOwnerAndEvictor below. 
38 | struct OwnerAndEvictor { 39 | std::uint32_t ownerId; 40 | std::uint32_t evictorId; 41 | }; 42 | 43 | struct AtomicOwnerAndEvictor { 44 | AtomicOwnerAndEvictor() : repr(0) { 45 | } 46 | 47 | OwnerAndEvictor load() { 48 | OwnerAndEvictor result; 49 | std::uint64_t view = repr.load(); 50 | result.ownerId = view >> 32; 51 | result.evictorId = view & 0xFFFFFFFFU; 52 | return result; 53 | } 54 | 55 | bool cas(OwnerAndEvictor expected, OwnerAndEvictor desired) { 56 | std::uint64_t expectedRepr 57 | = (static_cast(expected.ownerId) << 32) 58 | | expected.evictorId; 59 | std::uint64_t desiredRepr 60 | = (static_cast(desired.ownerId) << 32) 61 | | desired.evictorId; 62 | return repr.compare_exchange_strong(expectedRepr, desiredRepr); 63 | } 64 | 65 | std::atomic repr; 66 | }; 67 | 68 | // Initialized in ensureMyThreadControlInitialized below. 69 | // PodWrapper since there are shutdown order issues that mean this can't ever be 70 | // safely destroyed (dying threads access it). 71 | static CpuLocal* ownerAndEvictor; 72 | static char ownerAndEvictorStorage alignas(CpuLocal) [ 73 | sizeof(*ownerAndEvictor)]; 74 | 75 | static int acquireCpuOwnership() { 76 | while (true) { 77 | lastCpu = sched_getcpu(); 78 | threadCachedCpu()->store(lastCpu, std::memory_order_relaxed); 79 | 80 | OwnerAndEvictor curOwnerAndEvictor 81 | = ownerAndEvictor->forCpu(lastCpu)->load(); 82 | if (curOwnerAndEvictor.ownerId == 0) { 83 | if (ownerAndEvictor->forCpu(lastCpu)->cas( 84 | curOwnerAndEvictor, { me->id(), 0 } )) { 85 | return lastCpu; 86 | } else { 87 | continue; 88 | } 89 | } 90 | 91 | me->accessing()->store( 92 | curOwnerAndEvictor.ownerId, std::memory_order_relaxed); 93 | if (!ownerAndEvictor->forCpu(lastCpu)->cas( 94 | curOwnerAndEvictor, { curOwnerAndEvictor.ownerId, me->id() })) { 95 | me->accessing()->store(0, std::memory_order_relaxed); 96 | continue; 97 | } 98 | // The CAS succeeded, so we installed ourself as the evictor. 99 | curOwnerAndEvictor.evictorId = me->id(); 100 | 101 | ThreadControl* victim = ThreadControl::forId(curOwnerAndEvictor.ownerId); 102 | victim->blockRseqOps(); // A 103 | 104 | if (lastCpu != sched_getcpu()) { // B 105 | me->accessing()->store(0, std::memory_order_relaxed); 106 | continue; 107 | } 108 | 109 | // This is a little bit tricky; why don't we *always* need to do the 110 | // asymmetricThreadFencyHeavy()? 111 | // We did the stores blocking the victim's rseq ops above (A), and then 112 | // viewed ourselves to be running on CPU lastCpu (B). So the blocking stores 113 | // will be visible to all threads that run on CPU lastCpu in the future. If 114 | // we observe victim->curCpu() == lastCpu below, we know that the victim is 115 | // such a thread. So either the victim ran in between the blocking stores 116 | // and now (in which case it did a CAS to lastCpu's OwnerEvictor from 117 | // to , so we'll retry below), or the victim hasn't 118 | // run yet, in which case we don't need the heavy fence. 119 | // This relies on the memory ordering guarantee of ThreadControl::curCpu() 120 | // (which itself relies on the way the kernel handles thread migrations). 
121 | if (victim->curCpu() != lastCpu) { 122 | asymmetricThreadFenceHeavy(); 123 | } 124 | 125 | me->accessing()->store(0, std::memory_order_relaxed); 126 | 127 | if (ownerAndEvictor->forCpu(lastCpu)->cas( 128 | curOwnerAndEvictor, { me->id(), 0 })) { 129 | return lastCpu; 130 | } 131 | } 132 | } 133 | 134 | static mutex::OnceFlag ownerAndEvictorOnceFlag; 135 | 136 | static void ensureMyThreadControlInitialized() { 137 | if (me == nullptr) { 138 | me = ThreadControl::get(threadCachedCpu()); 139 | rseq_load_trampoline = me->code()->rseqLoadFunc(); 140 | rseq_store_trampoline = me->code()->rseqStoreFunc(); 141 | rseq_store_fence_trampoline = me->code()->rseqStoreFenceFunc(); 142 | setRseqCleanup([]() { 143 | end(); 144 | // If rseq is shut-down at thread-death, then resurrected at thread-death, 145 | // we need to make sure we re-initialize our data structures. 146 | me = nullptr; 147 | }); 148 | 149 | mutex::callOnce(ownerAndEvictorOnceFlag, []() { 150 | ownerAndEvictor 151 | = new (ownerAndEvictorStorage) CpuLocal; 152 | }); 153 | } 154 | } 155 | 156 | int beginSlowPath() { 157 | ensureMyThreadControlInitialized(); 158 | end(); 159 | me->unblockRseqOps(); 160 | return acquireCpuOwnership(); 161 | } 162 | 163 | void end() { 164 | threadCachedCpu()->store(-1, std::memory_order_relaxed); 165 | while (true) { 166 | OwnerAndEvictor curOwnerAndEvictor 167 | = ownerAndEvictor->forCpu(lastCpu)->load(); 168 | if (curOwnerAndEvictor.ownerId != me->id()) { 169 | break; 170 | } 171 | if (ownerAndEvictor->forCpu(lastCpu)->cas(curOwnerAndEvictor, { 0, 0 })) { 172 | break; 173 | } 174 | } 175 | } 176 | 177 | static void evictOwner(int shard) { 178 | OwnerAndEvictor curOwnerAndEvictor = ownerAndEvictor->forCpu(shard)->load(); 179 | if (curOwnerAndEvictor.ownerId == 0) { 180 | return; 181 | } 182 | 183 | me->accessing()->store(curOwnerAndEvictor.ownerId); 184 | if (ownerAndEvictor->forCpu(shard)->load().ownerId 185 | != curOwnerAndEvictor.ownerId) { 186 | me->accessing()->store(0, std::memory_order_relaxed); 187 | return; 188 | } 189 | 190 | ThreadControl* victim = ThreadControl::forId(curOwnerAndEvictor.ownerId); 191 | victim->blockRseqOps(); 192 | 193 | me->accessing()->store(0, std::memory_order_relaxed); 194 | } 195 | 196 | void fenceWith(int shard) { 197 | std::atomic_thread_fence(std::memory_order_seq_cst); 198 | ensureMyThreadControlInitialized(); 199 | evictOwner(shard); 200 | asymmetricThreadFenceHeavy(); 201 | } 202 | 203 | void fence() { 204 | std::atomic_thread_fence(std::memory_order_seq_cst); 205 | ensureMyThreadControlInitialized(); 206 | for (int i = 0; i < numCpus(); ++i) { 207 | evictOwner(i); 208 | } 209 | asymmetricThreadFenceHeavy(); 210 | } 211 | 212 | } // namespace internal 213 | } // namespace rseq 214 | -------------------------------------------------------------------------------- /rseq/internal/ThreadControl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 
8 | */ 9 | 10 | #include "rseq/internal/ThreadControl.h" 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | #include "rseq/internal/CleanUpOnThreadDeath.h" 22 | #include "rseq/internal/Code.h" 23 | #include "rseq/internal/IdAllocator.h" 24 | #include "rseq/internal/IntrusiveLinkedList.h" 25 | #include "rseq/internal/Mutex.h" 26 | 27 | namespace rseq { 28 | namespace internal { 29 | 30 | // ThreadControls are all kept in a global linked list. The list, including all 31 | // additions to and removals from the list, are protected by the mutex. 32 | // Theoretically we ought to worry about destructor order issues during 33 | // shutdown, but as a practical matter everything works fine for these types. 34 | static mutex::Mutex allThreadControlsMu; 35 | static IntrusiveLinkedList allThreadControls; 36 | 37 | // Initialized in ThreadControl::get below. 38 | // Here we *do* care about destructors running during shutdown. 39 | static mutex::OnceFlag idAllocatorOnceFlag; 40 | static IdAllocator* idAllocator; 41 | static char idAllocatorStorage alignas(IdAllocator) [ 42 | sizeof(*idAllocator)]; 43 | 44 | // We get this from the kernel limit. 45 | // TODO: Code.cpp has the same constant. We ought to move it into someplace 46 | // common. 47 | constexpr static int kMaxGlobalThreads = 1 << 22; 48 | 49 | // The ThreadControl for the current thread. The rules around __thread variables 50 | // in gcc are weird; putting ThreadControl directly in thread depends on a lot 51 | // of finicky details. It's easier to do this lazy initialization hack. 52 | static __thread ThreadControl* me; 53 | static __thread char meStorage alignas(ThreadControl) [sizeof(*me)]; 54 | 55 | // static 56 | ThreadControl* ThreadControl::get(std::atomic* threadCachedCpu) { 57 | if (me != nullptr) { 58 | return me; 59 | } 60 | 61 | mutex::callOnce(idAllocatorOnceFlag, []() { 62 | idAllocator = 63 | new (idAllocatorStorage) IdAllocator(kMaxGlobalThreads); 64 | }); 65 | 66 | me = new (meStorage) ThreadControl(threadCachedCpu); 67 | return me; 68 | } 69 | 70 | // static 71 | ThreadControl* ThreadControl::forId(std::uint32_t id) { 72 | return idAllocator->lookupOwner(id); 73 | } 74 | 75 | ThreadControl::ThreadControl(std::atomic* threadCachedCpu) { 76 | // Get our id. 77 | id_ = idAllocator->allocate(this); 78 | 79 | // Fill in the data about our process 80 | threadCachedCpu_ = threadCachedCpu; 81 | code_ = Code::initForId(id_, threadCachedCpu); 82 | tid_ = syscall(SYS_gettid); 83 | 84 | // Insert the ThreadControl into the global list 85 | { 86 | mutex::LockGuard lg(allThreadControlsMu); 87 | allThreadControls.link(this); 88 | } 89 | setThreadControlCleanup([]() { 90 | me->~ThreadControl(); 91 | // If we're reinitialized during thread death, we need to *know* it, and 92 | // reinitialize our data structures. 93 | me = nullptr; 94 | }); 95 | } 96 | 97 | ThreadControl::~ThreadControl() { 98 | // Remove ourselves from the list. 99 | { 100 | mutex::LockGuard lg(allThreadControlsMu); 101 | allThreadControls.unlink(this); 102 | } 103 | 104 | // Wait until no one's trying to evict us. 
105 | bool beingAccessed = true; 106 | int numYields = 0; 107 | while (beingAccessed) { 108 | beingAccessed = false; 109 | { 110 | mutex::LockGuard lg(allThreadControlsMu); 111 | for (ThreadControl& thread : allThreadControls) { 112 | if (thread.accessing()->load() == id_) { 113 | beingAccessed = true; 114 | break; 115 | } 116 | } 117 | } 118 | if (!beingAccessed) { 119 | break; 120 | } 121 | // We yield for the first 100 attempts at dying. After that, we sleep. 122 | if (numYields < 100) { 123 | ++numYields; 124 | sched_yield(); 125 | } else { 126 | /* sleep override */ 127 | sleep(1); 128 | } 129 | } 130 | idAllocator->free(id_); 131 | } 132 | 133 | void ThreadControl::blockRseqOps() { 134 | threadCachedCpu_->store(-1, std::memory_order_relaxed); 135 | code_->blockRseqOps(); 136 | } 137 | 138 | void ThreadControl::unblockRseqOps() { 139 | // threadCachedCpu is set at the point of the sched_getcpu() call. 140 | code_->unblockRseqOps(); 141 | } 142 | 143 | // Returns -1 on error. 144 | static int tryParseCpu(char* procFileContents, ssize_t length) { 145 | if (length < 0) { 146 | return -1; 147 | } 148 | 149 | int indexOfLastRParen = -1; 150 | for (int i = 0; i < length; ++i) { 151 | if (procFileContents[i] == ')') { 152 | indexOfLastRParen = i; 153 | } 154 | } 155 | if (indexOfLastRParen == -1) { 156 | return -1; 157 | } 158 | 159 | // Command is field 39, command is field 2. 160 | const int kSpacesBeforeCpu = 38; 161 | int pos = 0; 162 | for ( 163 | int numSpacesEncountered = 0; 164 | pos < length && numSpacesEncountered < kSpacesBeforeCpu; 165 | ++pos) { 166 | if (procFileContents[pos] == ' ') { 167 | ++numSpacesEncountered; 168 | } 169 | } 170 | int cpu = 0; 171 | for (; pos < length; ++pos) { 172 | char charAtPos = procFileContents[pos]; 173 | if (charAtPos == ' ') { 174 | return cpu; 175 | } else if ('0' <= charAtPos && charAtPos <= '9') { 176 | cpu *= 10; 177 | cpu += charAtPos - '0'; 178 | } else { 179 | return -1; 180 | } 181 | } 182 | return -1; 183 | } 184 | 185 | // Returns a pointer to the first character after the integer output. 186 | static char* rseqItoa(int i, char* a) { 187 | char* cur = a; 188 | if (i == 0) { 189 | *cur++ = '0'; 190 | } 191 | while (i != 0) { 192 | *cur++ = '0' + i % 10; 193 | i /= 10; 194 | } 195 | // We printed the string least-significant digit first; we have to reverse it. 196 | for (char* left = a, *right = cur - 1; left < right; ++left, --right) { 197 | char temp = *right; 198 | *right = *left; 199 | *left = temp; 200 | } 201 | return cur; 202 | } 203 | 204 | int ThreadControl::curCpu() { 205 | // We want to construct "/proc/self/task//stat". 206 | // "/proc/self/task/" is 16 characters, tid is a positive int, so it's at most 207 | // 10 characters. "/stat" is 5 characters, and we need 1 terminating null 208 | // character. Adding all these together, we get 32 characters. 209 | const int procFileNameSize = 32; 210 | // We know the types of all the fields in /proc/self//stat, and can bound 211 | // their length to get the maximum buffer size we need, much the same way as 212 | // above. See P56392714 for the arithmetic. 213 | const int procFileContentsSize = 968; 214 | 215 | char filename[procFileNameSize]; 216 | // What we want here is: 217 | // snprintf(filename, sizeof(filename), "/prof/self/task/%d/stat", tid_); 218 | // But there are snprintf paths that can call malloc. Rather than try to 219 | // reason about the conditions under which this happens, we'll do our own 220 | // string printing. 
221 | const char* filenamePrefix = "/proc/self/task/"; 222 | const char* filenameSuffix = "/stat"; 223 | std::strcpy(filename, filenamePrefix); 224 | char* tidStart = filename + std::strlen(filenamePrefix); 225 | char* suffixStart = rseqItoa(tid_, tidStart); 226 | std::strcpy(suffixStart, filenameSuffix); 227 | 228 | char procFileContents[procFileContentsSize]; 229 | 230 | int fd = open(filename, O_RDONLY); 231 | if (fd == -1) { 232 | return -1; 233 | } 234 | // To get atomicity, we want to read the whole file (well, the part of it that 235 | // we care about anyway) in a single read() call. We retry in case a signal 236 | // causes a length of -1. 237 | ssize_t length = -1; 238 | for (int i = 0; i < 10 && length == -1; ++i) { 239 | length = read(fd, procFileContents, procFileContentsSize); 240 | } 241 | int cpu = tryParseCpu(procFileContents, length); 242 | close(fd); 243 | return cpu; 244 | } 245 | 246 | } // namespace internal 247 | } // namespace rseq 248 | -------------------------------------------------------------------------------- /Rseq.md: -------------------------------------------------------------------------------- 1 | # `Rseq.h` 2 | -------- 3 | 4 | ## Overview 5 | *** 6 | 7 | This is a userspace take on the kernel restartable-sequences API. This allows 8 | efficient per-cpu atomic operations that don't use barriers. A thread can 9 | begin a restartable sequence (henceforth, "rseq"), and do rseq-load's and 10 | rseq-stores. These are just like normal loads and stores (they're efficient 11 | and don't come with any built-in barriers), which one exception: if another 12 | thread has begun an rseq on the same CPU, then the load / store doesn't take 13 | place, and returns an error code instead. 14 | 15 | ## History 16 | *** 17 | 18 | This idea originated with "Fast mutual exclusion for uniprocessors" 19 | (http://dl.acm.org/citation.cfm?id=143523), though similar ideas go back at 20 | least to the 1980s, with "Concurrency Features for the Trellis/Owl Language" 21 | (http://link.springer.com/chapter/10.1007%2F3-540-47891-4_16). "Mostly lock-free 22 | malloc" (http://dl.acm.org/citation.cfm?id=512451) showed some impressive 23 | performance wins by using essentially the same scheme. There has been a recent 24 | resurgence in interest prompted by work done by Google 25 | (http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf), 26 | resulting in a number of attempts to provide support in the Linux kernel, the 27 | most recent of which is at https://lkml.org/lkml/2016/8/19/699 . 28 | 29 | 30 | ## Usage example 31 | *** 32 | 33 | To see why this is useful, let's consider a hypothetical malloc 34 | implementation. At its core is a global data structure that keeps track of 35 | chunks of free memory of various size classes, each size class organized into 36 | a linked list. 37 | 38 | Adding and removing elements from the centralized linked lists will be 39 | expensive because of the synchronization overhead (lots of threads trying to 40 | pull an element off the same linked list will get expensive). So in addition 41 | to the centralized free-lists, we keep a per-thread cache. 42 | 43 | Here's how the fast path alloc/free from a size-class might look then. 44 | ThreadLocalSizeClassCache::head is the head of a linked-list based stack of 45 | free memory. 
46 | 
47 |     void free(void* memory) {
48 |       ThreadLocalSizeClassCache* cache = myTLD()->sizeClassCacheForPtr(memory);
49 |       *(void**) memory = cache->head;
50 |       cache->head = memory;
51 |     }
52 | 
53 |     void* alloc(size_t size) {
54 |       ThreadLocalSizeClassCache* cache = myTLD()->sizeClassCacheForSize(size);
55 |       if (cache->head == nullptr) {
56 |         return getMemoryFromCentralFreeList(cache->sizeClass);
57 |       }
58 |       void* result = cache->head;
59 |       cache->head = *(void**)cache->head;
60 |       return result;
61 |     }
62 | 
63 | But this approach has some problems. One big one is memory usage; to avoid
64 | the locking overhead of the central free-lists, we need caches to be big. But
65 | an N-byte cache per thread for T threads means we need N * T bytes reserved
66 | in caches. It wouldn't be unrealistic for N to be on the order of millions
67 | and T on the order of thousands. That's gigabytes of memory just sitting
68 | around waiting to be used.
69 | 
70 | To save memory, we'll try per-CPU caching: make the linked-list stack where
71 | we keep freed memory a per-cpu data structure instead of a per-thread one.
72 | Since there can be tens or even hundreds of threads per CPU, we may hope for
73 | a dramatic reduction in memory sitting around unused in caches.
74 | 
75 |     void free(void* ptr) {
76 |       CpuLocalSizeClassCache* cache = myCLD()->sizeClassCacheForPtr(ptr);
77 |       void* head;
78 |       do {
79 |         head = cache->head;
80 |         *(void**) ptr = head;
81 |       } while (!compareAndSwap(&cache->head, head, ptr));
82 |     }
83 | 
84 |     void* alloc(size_t size) {
85 |       while (true) {
86 |         CpuLocalSizeClassCache* cache = myCLD()->sizeClassCacheForSize(size);
87 |         void* result = cache->head;
88 |         if (result == nullptr) {
89 |           return getMemoryFromCentralFreeList(cache->sizeClass);
90 |         }
91 |         void* newHead = *(void**) result;
92 |         if (compareAndSwap(&cache->head, result, newHead)) {
93 |           return result;
94 |         }
95 |       }
96 |     }
97 | 
98 | There are two problems here:
99 | 1. We have a compare-and-swap on the fast paths for both allocation and free.
100 |    Even assuming cache hits, this is expensive.
101 | 2. There is an ABA problem in alloc. Resolving it involves strategies that
102 |    are complicated, error-prone, and slow.
103 | 
104 | Both of these problems are caused by the fact that a thread doesn't have any
105 | way of knowing if another thread will run between the loading of cache->head
106 | and the subsequent modification of it. This is exactly the problem that rseq
107 | can solve.
108 | 
109 | Here's how it looks:
110 | 
111 |     void free(void* ptr) {
112 |       while (true) {
113 |         int cpu = rseq::begin();
114 |         CpuLocalSizeClassCache* cache = cldFor(cpu)->sizeClassCacheForPtr(ptr);
115 |         *(void**) ptr = cache->head;
116 |         if (rseq::store(&cache->head, ptr)) {
117 |           return;
118 |         }
119 |       }
120 |     }
121 | 
122 |     void* alloc(size_t size) {
123 |       while (true) {
124 |         int cpu = rseq::begin();
125 |         CpuLocalSizeClassCache* cache = cldFor(cpu)->sizeClassCacheForSize(size);
126 | 
127 |         void* result = cache->head;
128 |         if (result == nullptr) {
129 |           return getMemoryFromCentralFreeList(cache->sizeClass);
130 |         }
131 |         void* newHead = *(void**) result;
132 |         if (rseq::store(&cache->head, newHead)) {
133 |           return result;
134 |         }
135 |       }
136 |     }
137 | 
138 | This is efficient (an rseq store has very little overhead over a plain
139 | store), and correct (the store to cache->head will fail if another
140 | thread touched the cpu-local data after the call to rseq::begin(), avoiding
141 | the ABA-problem).
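
One wrinkle worth spelling out (this sketch is ours and is not part of the
original example; it reuses the same made-up names, and plays as loose with
the real rseq::load/rseq::store signatures as the code above does): in alloc,
the line "void* newHead = *(void**) result;" reads through memory that another
thread may already have popped off the list and handed out if our rseq has
ended. The following rseq::store will fail and we will retry, so the final
list state stays correct, but the stray read itself can touch freed memory.
Routing that read through rseq::load (see the pointer-chasing warning in
Rseq.h) means the read only happens while the rseq is still live:

    void* alloc(size_t size) {
      while (true) {
        int cpu = rseq::begin();
        CpuLocalSizeClassCache* cache = cldFor(cpu)->sizeClassCacheForSize(size);

        void* result = cache->head;
        if (result == nullptr) {
          return getMemoryFromCentralFreeList(cache->sizeClass);
        }
        // Chase the pointer through rseq::load: if our rseq has already
        // ended, the load fails and we retry instead of reading memory that
        // may have been reused.
        void* newHead;
        if (!rseq::load(&newHead, (void**) result)) {
          continue;
        }
        if (rseq::store(&cache->head, newHead)) {
          return result;
        }
      }
    }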
142 | 
143 | 
144 | ## Implementation
145 | ***
146 | 
147 | We'll cover rseq::store only; the other functions are similar. Each thread
148 | gets its own copy of the following function:
149 | 
150 |     bool storeImpl(uint64_t* dst, uint64_t val) {
151 |      do_store:
152 |       *dst = val;
153 |      success_path:
154 |       return success;
155 |      failure_path:
156 |       return failure;
157 |     }
158 | 
159 | That is to say, we dynamically generate the assembly for storeImpl once per
160 | thread. Note that failure_path is unreachable as written.
161 | 
162 | Additionally, there is a global cache that maps cpu -> thread owning that
163 | cpu, and a thread-local int that indicates the CPU a thread thinks it's
164 | running on. In rseq::begin(), we see if globalCpuOwner[myCachedCpu] == me,
165 | and if so, return myCachedCpu.
166 | 
167 | The interesting case is when that check fails. If that happens, we
168 | update myCachedCpu, and look at globalCpuOwner[myCachedCpu] with the new
169 | value of myCachedCpu. We're going to block that thread's stores. We do so by
170 | patching the victim thread's copy of storeImpl to instead look like:
171 | 
172 |     bool storeImpl(uint64_t* dst, uint64_t val) {
173 |      do_store:
174 |       goto failure_path; // Store instruction has been overwritten with a jump!
175 |      success_path:
176 |       return success;
177 |      failure_path:
178 |       return failure;
179 |     }
180 | 
181 | After this patch becomes visible to the victim, we know that any victim rseqs
182 | are done, and we may proceed; we CAS ourselves into becoming the owner of the
183 | CPU and are done.
184 | 
185 | The implementation is slightly more complicated; we need an
186 | asymmetricThreadFenceHeavy() to make sure the victim thread has made its
187 | operations visible and seen the blocking of its operations. By looking at
188 | /proc/self/task/<tid>/stat, we can usually tell if the other thread
189 | has been migrated or simply descheduled, and thereby usually avoid the fence.
190 | As described, we have an ABA issue when a victim thread has its operations
191 | blocked and re-enables them and runs again on the same CPU. We fix this by
192 | having globalCpuOwner[n] store an (owner, evictor) pair rather than just
193 | the owner.
194 | 
195 | Note that if we aren't able to prove that the previous thread running on this
196 | CPU has been descheduled (say, because thread migrations are very frequent),
197 | then we have to take a slow path involving an IPI (triggered by an mprotect)
198 | more often. This can cause overheads on the order of microseconds per scheduling
199 | quantum.
200 | 
201 | 
202 | ## Dangers
203 | ***
204 | 
205 | We break a few rules at several layers of the stack. These are described below.
206 | To increase our confidence that breaking these rules won't bite us in practice,
207 | we include some stress tests (see `README.md` for more information on how to
208 | build and run tests).
209 | 
210 | ### The CPU
211 | Our approach (patching a store to a jump without synchronization) is officially
212 | disallowed by the Intel architecture manuals
213 | (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
214 | section 8.1.3, "Handling Self- and Cross-Modifying Code"). As a practical
215 | matter, no problems have appeared under our stress testing. The AMD manuals
216 | (http://support.amd.com/TechDocs/24593.pdf section 7.6.1) guarantee that it
217 | works.
Reading between the lines a little bit, I think this is likely safe
218 | (we're patching a single-micro-op instruction to another single-micro-op
219 | instruction at a word-aligned boundary that is only ever jumped to). Windows
220 | hot-patching points do something similar (patching a NOP to a jump instead of a
221 | store), so hopefully Intel will stay conservative about breaking this sort of behavior.
222 | 
223 | An alternative approach would be to abuse the breakpoint mechanism (int3). We
224 | install a sigtrap handler that checks if the breakpoint we hit was on our copy
225 | of the store function, and if so moves the pc to the failure path. The evicting
226 | thread sets the breakpoint on the victim's copy of store and does an
227 | asymmetricThreadFenceHeavy(). This assumes that cross-modifying breakpoint
228 | insertion is allowed. This isn't stated explicitly, but the assumption is used
229 | in the Linux kernel, so Intel will have a harder time breaking it. A fancier
230 | variant is the following:
231 | - Insert the breakpoint on the store.
232 | - asymmetricThreadFenceHeavy()
233 | - Change the rest of the bytes in the store to a jump.
234 | - asymmetricThreadFenceHeavy()
235 | - Change the breakpoint to the first byte of the jump.
236 | This makes it far less likely that the victim will have to hit the breakpoint.
237 | In either case, we can try to use the /proc/self/task/<tid>/stat check mentioned
238 | above to avoid the asymmetricThreadFenceHeavy()s.
239 | 
240 | A completely safe but slower approach is to put each thread's copies of its
241 | functions on a page specific to that thread. An evicting thread removes the
242 | execute permissions of the victim thread's page to stop it, and the victim fixes
243 | things up in a sigsegv handler.
244 | 
245 | The advantages of the current approach over the others are speed (no cross-core
246 | activity on the fast path) and the fact that it does not need to steal a signal.
247 | 
248 | 
249 | ### The kernel
250 | 
251 | There are two issues here.
252 | 
253 | #### The mprotect hack
254 | We assume that our asymmetricThreadFenceHeavy() call gets the effect of a
255 | sys_membarrier() for cheap (i.e. without descheduling the calling thread). This
256 | works for now, about which Linus says "I'd be a bit leery about it"
257 | (https://lists.lttng.org/pipermail/lttng-dev/2015-March/024269.html).
258 | 
259 | #### Trusting `/proc/self/task/<tid>/stat`
260 | To avoid the cost of the asymmetricThreadFenceHeavy() down the fast path where
261 | the victim has been descheduled rather than changed CPUs, we read its CPU out of
262 | /proc and check that it's still assigned to our CPU; we then know that it will
263 | see the eviction. This works because the task's CPU is updated on the old CPU
264 | before it changes CPUs and begins running. If the kernel changes this, we'll break.
265 | 
266 | 
267 | ### The compiler
268 | 
269 | We have a few bits of undefined behavior:
270 | 
271 | - We manipulate pointers via a uintptr_t, and reinterpret the manipulated
272 |   address as a pointer.
273 | - There are a few instances of what I think are strict aliasing violations (the
274 |   code patching, rseq_repr_t, maybe elsewhere).
275 | - We use volatile as a stand-in for real atomics in places where we need C99
276 |   compatibility, and use heuristic arguments about compiler reorderings and the
277 |   fact that we're only concerned with x86 (see the sketch below).
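
To make that last bullet concrete, here is a minimal illustrative sketch
(ours, not code from this library; the names are invented) of the kind of
volatile-plus-compiler-barrier pattern it describes. The asm statement only
stops the compiler from reordering; the claim that no hardware fence is needed
rests entirely on x86's strong (TSO) memory model:

    /* C99-compatible stand-in for a relaxed atomic flag. */
    static volatile int ops_blocked;

    static void block_ops(void) {
      ops_blocked = 1;
      __asm__ __volatile__("" ::: "memory"); /* compiler barrier only */
    }

    static int ops_are_blocked(void) {
      __asm__ __volatile__("" ::: "memory");
      return ops_blocked;
    }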
278 | -------------------------------------------------------------------------------- /rseq/RseqTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #include "rseq/Rseq.h" 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | #include "rseq/internal/CpuLocal.h" 25 | #include "rseq/internal/SwitchToCpu.h" 26 | 27 | TEST(RseqMemberAddr, GetsAddresses) { 28 | struct Type { 29 | int field1; 30 | char field2; 31 | float arrayField[17]; 32 | double trailingField; 33 | }; 34 | Type* t = new Type; 35 | EXPECT_EQ(&t->field1, RSEQ_MEMBER_ADDR(t, field1)); 36 | EXPECT_EQ(&t->field2, RSEQ_MEMBER_ADDR(t, field2)); 37 | EXPECT_EQ(&t->arrayField[0], RSEQ_MEMBER_ADDR(t, arrayField)); 38 | EXPECT_EQ(&t->arrayField[0], &RSEQ_MEMBER_ADDR(t, arrayField)[0]); 39 | EXPECT_EQ(&t->arrayField[11], &RSEQ_MEMBER_ADDR(t, arrayField)[11]); 40 | EXPECT_EQ(&t->arrayField[11], RSEQ_MEMBER_ADDR(t, arrayField) + 11); 41 | EXPECT_EQ(&t->trailingField, RSEQ_MEMBER_ADDR(t, trailingField)); 42 | delete t; 43 | // t is deleted; if sanitiizers are going to complain about these, they'll do 44 | // it now. 45 | void* volatile ignored; 46 | ignored = RSEQ_MEMBER_ADDR(t, field1); 47 | ignored = RSEQ_MEMBER_ADDR(t, field2); 48 | ignored = RSEQ_MEMBER_ADDR(t, arrayField) + 11; 49 | ignored = RSEQ_MEMBER_ADDR(t, trailingField); 50 | 51 | // Make sure that it even works with null. 52 | t = nullptr; 53 | ignored = RSEQ_MEMBER_ADDR(t, field2); 54 | 55 | // Silence warnings about an unused variable. 
56 | (void) ignored; 57 | } 58 | 59 | TEST(RseqMemberAddr, PreservesQualifiers) { 60 | enum Qualification { 61 | kInvalid, 62 | kUnqualified, 63 | kConst, 64 | kVolatile, 65 | kConstVolatile, 66 | }; 67 | struct DoesStores { 68 | void doStore(Qualification* qualification) { 69 | *qualification = kUnqualified; 70 | } 71 | void doStore(Qualification* qualification) const { 72 | *qualification = kConst; 73 | } 74 | void doStore(Qualification* qualification) volatile { 75 | *qualification = kVolatile; 76 | } 77 | void doStore(Qualification* qualification) const volatile { 78 | *qualification = kConstVolatile; 79 | } 80 | }; 81 | 82 | struct Holder { 83 | DoesStores doesStores; 84 | }; 85 | 86 | Holder holder; 87 | Holder* unqualifiedHolder = &holder; 88 | const Holder* constHolder = &holder; 89 | volatile Holder* volatileHolder = &holder; 90 | const volatile Holder* constVolatileHolder = &holder; 91 | 92 | Qualification qualification = kInvalid; 93 | 94 | RSEQ_MEMBER_ADDR(unqualifiedHolder, doesStores)->doStore(&qualification); 95 | EXPECT_EQ(kUnqualified, qualification); 96 | 97 | RSEQ_MEMBER_ADDR(constHolder, doesStores)->doStore(&qualification); 98 | EXPECT_EQ(kConst, qualification); 99 | 100 | RSEQ_MEMBER_ADDR(volatileHolder, doesStores)->doStore(&qualification); 101 | EXPECT_EQ(kVolatile, qualification); 102 | 103 | RSEQ_MEMBER_ADDR(constVolatileHolder, doesStores)->doStore(&qualification); 104 | EXPECT_EQ(kConstVolatile, qualification); 105 | } 106 | 107 | // It's hard to verify that the atomics actually act atomic; we just make sure 108 | // the things that ought to compile do. 109 | TEST(RseqValue, ActsLikeAtomic) { 110 | rseq::Value i0; 111 | rseq::Value i1(1); 112 | rseq::Value i2{1}; 113 | rseq::Value d; 114 | 115 | rseq::Value s; 116 | short s2 = s = 1; 117 | s.store(1); 118 | s.store(1, std::memory_order_relaxed); 119 | EXPECT_EQ(1, s.load()); 120 | EXPECT_EQ(1, s.load(std::memory_order_acquire)); 121 | EXPECT_EQ(1, s.exchange(2)); 122 | EXPECT_EQ(2, s.load()); 123 | EXPECT_EQ(2, s.exchange(2, std::memory_order_relaxed)); 124 | short expected = 1; 125 | EXPECT_FALSE(s.compare_exchange_weak(expected, 3)); 126 | EXPECT_EQ(2, expected); 127 | EXPECT_TRUE(s.compare_exchange_weak(expected, 3)); 128 | s.compare_exchange_weak(expected, 0, std::memory_order_relaxed); 129 | s.compare_exchange_weak( 130 | expected, 0, std::memory_order_relaxed, std::memory_order_relaxed); 131 | expected = 1; 132 | s.store(2); 133 | EXPECT_FALSE(s.compare_exchange_strong(expected, 3)); 134 | EXPECT_EQ(2, expected); 135 | EXPECT_TRUE(s.compare_exchange_strong(expected, 3)); 136 | s.compare_exchange_strong(expected, 0, std::memory_order_relaxed); 137 | s.compare_exchange_strong( 138 | expected, 0, std::memory_order_relaxed, std::memory_order_relaxed); 139 | } 140 | 141 | TEST(Rseq, StoresCorrectly) { 142 | std::uint64_t threadsPerCore = 200; 143 | std::uint64_t incrementsPerThread = 1000000; 144 | std::uint64_t numCores = rseq::internal::numCpus(); 145 | std::uint64_t numThreads = threadsPerCore * numCores; 146 | 147 | rseq::internal::CpuLocal> counters; 148 | for (int i = 0; i < numCores; ++i) { 149 | *counters.forCpu(i) = 0; 150 | } 151 | std::vector threads(numThreads); 152 | for (int i = 0; i < numThreads; ++i) { 153 | threads[i] = std::thread([&]() { 154 | for (int j = 0; j < incrementsPerThread; ++j) { 155 | while (true) { 156 | int cpu = rseq::begin(); 157 | rseq::Value* target = counters.forCpu(cpu); 158 | if (rseq::store(target, target->load() + 1)) { 159 | break; 160 | } 161 | } 162 | } 163 | }); 164 
| } 165 | for (int i = 0; i < numThreads; ++i) { 166 | threads[i].join(); 167 | } 168 | std::uint64_t sum = 0; 169 | for (int i = 0; i < numCores; ++i) { 170 | sum += *counters.forCpu(i); 171 | } 172 | EXPECT_EQ(numThreads * incrementsPerThread, sum); 173 | } 174 | 175 | TEST(Rseq, StoreFencesCorrectly) { 176 | // First test that it does a store. 177 | rseq::Value dst(0); 178 | /* int cpu = */ rseq::begin(); 179 | EXPECT_TRUE(rseq::store(&dst, 1)); 180 | EXPECT_EQ(1, dst.load()); 181 | 182 | // Can't test fencing with only one processor. 183 | if (rseq::internal::numCpus() < 2) { 184 | return; 185 | } 186 | // We test fencing with dekker locking. The protected data is the counter 187 | // below. 188 | const int kIncrementsPerThread = 10000000; 189 | std::uint64_t counter1 = 0; 190 | std::uint64_t counter2 = 0; 191 | alignas(64) rseq::Value turn; 192 | alignas(64) std::atomic interested0; 193 | alignas(64) std::atomic interested1; 194 | std::atomic* interested[] = {&interested0, &interested1}; 195 | 196 | std::thread threads[2]; 197 | for (int i = 0; i < 2; ++i) { 198 | threads[i] = std::thread([&, i]() { 199 | rseq::internal::switchToCpu(i); 200 | for (int j = 0; j < kIncrementsPerThread; ++j) { 201 | EXPECT_EQ(i, rseq::begin()); 202 | interested[i]->store(true, std::memory_order_relaxed); 203 | EXPECT_TRUE(rseq::storeFence(&turn, 1 - i)); 204 | while (interested[1 - i]->load() && turn.load() != i) { 205 | // spin 206 | } 207 | EXPECT_TRUE(counter1 == counter2); 208 | ++counter1; 209 | ++counter2; 210 | interested[i]->store(false, std::memory_order_release); 211 | } 212 | }); 213 | } 214 | for (int i = 0; i < 2; ++i) { 215 | threads[i].join(); 216 | } 217 | EXPECT_EQ(2 * kIncrementsPerThread, counter1); 218 | EXPECT_EQ(2 * kIncrementsPerThread, counter2); 219 | } 220 | 221 | TEST(Rseq, LoadsCorrectly) { 222 | int numThreads = 10; 223 | int rseqsPerThread = 100; 224 | 225 | rseq::Value value(0); 226 | std::atomic numThreadsAlive(numThreads); 227 | std::vector threads(numThreads); 228 | for (int i = 0; i < numThreads; ++i) { 229 | threads[i] = std::thread([&, i]() { 230 | rseq::internal::switchToCpu(0); 231 | for (int j = 0; j < rseqsPerThread; ++j) { 232 | int cpu = rseq::begin(); 233 | EXPECT_EQ(0, cpu); 234 | if (!rseq::store(&value, i)) { 235 | continue; 236 | } 237 | while (true) { 238 | if (numThreadsAlive.load() == 1) { 239 | break; 240 | } 241 | std::uint64_t loadedValue = numThreads + 1; 242 | if (!rseq::load(&loadedValue, &value)) { 243 | EXPECT_EQ(numThreads + 1, loadedValue); 244 | break; 245 | } 246 | EXPECT_EQ(i, loadedValue); 247 | } 248 | } 249 | numThreadsAlive.fetch_sub(1); 250 | }); 251 | } 252 | for (int i = 0; i < numThreads; ++i) { 253 | threads[i].join(); 254 | } 255 | } 256 | 257 | TEST(Rseq, EndsCorrectly) { 258 | // A call to end() has no observable behavior; we test to make sure that it 259 | // won't cause crashes, but not much else. 
260 | int numThreads = 100; 261 | int incrementsPerRseq = 100; 262 | int numRseqs = 10000; 263 | std::vector threads(numThreads); 264 | 265 | rseq::Value counter(0); 266 | std::atomic atomicCounter(0); 267 | 268 | for (int i = 0; i < numThreads; ++i) { 269 | threads[i] = std::thread([&]() { 270 | std::uint64_t localCounter = 0; 271 | rseq::internal::switchToCpu(0); 272 | for (int j = 0; j < numRseqs; ++j) { 273 | int cpu = rseq::begin(); 274 | EXPECT_EQ(0, cpu); 275 | for (int k = 0; k < incrementsPerRseq; ++k) { 276 | std::uint64_t view = counter.load(); 277 | bool success = rseq::store(&counter, view + 1); 278 | if (!success) { 279 | break; 280 | } 281 | ++localCounter; 282 | } 283 | rseq::end(); 284 | } 285 | atomicCounter.fetch_add(localCounter); 286 | }); 287 | } 288 | for (int i = 0; i < numThreads; ++i) { 289 | threads[i].join(); 290 | } 291 | EXPECT_EQ(atomicCounter.load(), counter.load()); 292 | } 293 | 294 | // Very dumb implementation based on spinning, but its enough to test the 295 | // fencing primitives. 296 | class RWLock { 297 | public: 298 | // If fenceWith is positive, we fence with that cpu. If it's -1, we fence with 299 | // *all* CPUs. 300 | explicit RWLock(int fenceWith) 301 | : readersMayBegin_(true), 302 | fenceWith_(fenceWith) { 303 | for (int i = 0; i < rseq::internal::numCpus(); ++i) { 304 | readerCounts_.forCpu(i)->store(0); 305 | } 306 | } 307 | 308 | void lock() { 309 | while (!readersMayBegin_.exchange(false)) { 310 | } 311 | if (fenceWith_ == -1) { 312 | rseq::fence(); 313 | } else { 314 | rseq::fenceWith(fenceWith_); 315 | } 316 | std::int64_t sum; 317 | do { 318 | sum = 0; 319 | for (int i = 0; i < rseq::internal::numCpus(); ++i) { 320 | sum += readerCounts_.forCpu(i)->load(); 321 | } 322 | } while (sum != 0); 323 | } 324 | 325 | void unlock() { 326 | readersMayBegin_.store(true); 327 | } 328 | 329 | void lock_shared() { 330 | while (true) { 331 | int cpu = rseq::begin(); 332 | if (!readersMayBegin_.load()) { 333 | continue; 334 | } 335 | std::int64_t curCount = readerCounts_.forCpu(cpu)->load(); 336 | if (rseq::store(readerCounts_.forCpu(cpu), curCount + 1)) { 337 | break; 338 | } 339 | } 340 | } 341 | 342 | void unlock_shared() { 343 | while (true) { 344 | int cpu = rseq::begin(); 345 | std::int64_t curCount = readerCounts_.forCpu(cpu)->load(); 346 | if (rseq::store(readerCounts_.forCpu(cpu), curCount - 1)) { 347 | break; 348 | } 349 | } 350 | } 351 | 352 | private: 353 | std::atomic readersMayBegin_; 354 | rseq::internal::CpuLocal> readerCounts_; 355 | int fenceWith_; 356 | }; 357 | 358 | void runFenceTest( 359 | int numReaders, 360 | int numReadLocks, 361 | int numWriteLocks, 362 | bool tieReadersToSameCpu) { 363 | rseq::internal::switchToCpu(0); 364 | int fenceWith; 365 | if (tieReadersToSameCpu) { 366 | fenceWith = rseq::internal::numCpus() > 1 ? 
1 : 0; 367 | } else { 368 | fenceWith = -1; 369 | } 370 | 371 | RWLock lock(fenceWith); 372 | std::uint64_t val1 = 0; 373 | std::uint64_t val2 = 0; 374 | 375 | std::vector threads(numReaders); 376 | 377 | for (int i = 0; i < numReaders; ++i) { 378 | threads[i] = std::thread([&, i]() { 379 | if (tieReadersToSameCpu) { 380 | rseq::internal::switchToCpu(fenceWith); 381 | } else { 382 | rseq::internal::switchToCpu(i % rseq::internal::numCpus()); 383 | } 384 | 385 | for (int j = 0; j < numReadLocks; ++j) { 386 | lock.lock_shared(); 387 | EXPECT_TRUE(val1 == val2); 388 | lock.unlock_shared(); 389 | } 390 | }); 391 | } 392 | for (int i = 0; i < numWriteLocks; ++i) { 393 | lock.lock(); 394 | EXPECT_TRUE(val1 == val2); 395 | ++val1; 396 | ++val2; 397 | lock.unlock(); 398 | } 399 | for (int i = 0; i < numReaders; ++i) { 400 | threads[i].join(); 401 | } 402 | } 403 | 404 | TEST(Rseq, FenceWithsCorrectly) { 405 | runFenceTest(10, 100000, 10000000, true); 406 | } 407 | 408 | TEST(Rseq, FencesCorrectly) { 409 | runFenceTest(40, 10000, 100000, false); 410 | } 411 | 412 | TEST(Rseq, ReinitializesCorrectly) { 413 | static pthread_key_t key1; 414 | static pthread_key_t key2; 415 | static pthread_key_t key3; 416 | static std::once_flag once; 417 | static void (*destructor3)(void*) = [](void*) { 418 | rseq::begin(); 419 | }; 420 | static void (*destructor1)(void*) = [](void*) { 421 | rseq::begin(); 422 | pthread_setspecific(key3, reinterpret_cast(3)); 423 | }; 424 | static void (*destructor2)(void*) = [](void*) { 425 | rseq::begin(); 426 | }; 427 | 428 | std::call_once(once, []() { 429 | pthread_key_create(&key1, destructor1); 430 | pthread_key_create(&key2, destructor2); 431 | pthread_key_create(&key3, destructor3); 432 | }); 433 | std::thread t([&]() { 434 | pthread_setspecific(key1, reinterpret_cast(1)); 435 | rseq::begin(); 436 | pthread_setspecific(key2, reinterpret_cast(2)); 437 | }); 438 | t.join(); 439 | } 440 | -------------------------------------------------------------------------------- /rseq/Rseq.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include "rseq/internal/Likely.h" 17 | #include "rseq/internal/Rseq.h" 18 | #include "rseq/internal/rseq_c.h" 19 | 20 | namespace rseq { 21 | 22 | template 23 | class Value; 24 | 25 | template 26 | bool load(T* dst, const Value* src); 27 | 28 | template 29 | bool store(Value* dst, U&& val); 30 | 31 | template 32 | bool storeFence(Value* dst, U&& val); 33 | 34 | // Overview 35 | // 36 | // This is a userspace take on the kernel restartable-sequences API. This allows 37 | // efficient per-cpu atomic operations that don't use barriers. A thread can 38 | // begin a restartable sequence (henceforth, "rseq"), and do rseq-load's and 39 | // rseq-stores. These are just like normal loads and stores (they're efficient 40 | // and don't come with any built-in barriers), which one exception: if another 41 | // thread has begun an rseq on the same CPU, then the load / store doesn't take 42 | // place, and returns an error code instead. 43 | // 44 | // See Rseq.md for a more thorough overview. 
45 | 46 | // Example 47 | // 48 | // It's well known that using CAS, one can implement an arbitrary fetch-and-phi 49 | // operation (where 'phi' is any function from X -> X). When we want to do these 50 | // operations per-cpu, rseq can result in dramatic speed-ups. 51 | // 52 | // Without rseq: 53 | // std::atomic data[kNumCpus]; 54 | // 55 | // int fetchAndSquare() { 56 | // while (true) { 57 | // int cpu = sched_getcpu(); 58 | // int cur = data[cpu].load(std::memory_order_relaxed); 59 | // if (data[cpu].compare_exchange_strong(cur, cur * cur)) { 60 | // return cur; 61 | // } 62 | // } 63 | // } 64 | // 65 | // With rseq: 66 | // rseq::Value data[kNumCpus]; 67 | // 68 | // int fetchAndSquare() { 69 | // while (true) { 70 | // int cpu = rseq::begin(); 71 | // int cur = data[cpu].load(std::memory_order_relaxed); 72 | // if (rseq::store(&data[cpu], cur * cur)) { 73 | // return cur; 74 | // } 75 | // } 76 | // } 77 | // 78 | // This does the same operation, and has about the same complexity, but the rseq 79 | // version is significantly faster; it does a plain store instead of an 80 | // expensive atomic operation. 81 | // 82 | // Rseq can also solve the other tricky issue with concurrent data structures 83 | // built around CAS: the ABA problem. See Rseq.md for a more complete example. 84 | 85 | // Caveats 86 | // 1. The current implementation assumes x86-64 / TSO semantics (this isn't 87 | // fundamental, but is something to keep in mind before trying to port to 88 | // another architecture). 89 | // 90 | // 2. We only support types <= 8 bytes. 91 | // 92 | // 3. Down a slow-path, we may do an operation taking O(microseconds) (at most 93 | // once a scheduling quantum). We try to avoid it, but can't make any 94 | // guarantees. 95 | 96 | // API and memory model specifics 97 | // 98 | // An rseq is started by a call to rseq::begin(). This returns an integer in 99 | // [0, numCpus - 1], intended to be used as an index into per-cpu sharded data; 100 | // the integer tells us which cpu's data we should use). 101 | // The rseq lasts for an unspecified amount of time after the call. It might 102 | // even terminate immediately after beginning; the length of an rseq is a QoI 103 | // issue, not an API guarantee (we try very hard to ensure that an rseq lasts 104 | // at least until the initiating thread gets descheduled). 105 | // 106 | // Rseqs started with the same rseq::begin() return value are totally ordered; 107 | // the stores done in or visible to an rseq with shard index N are always 108 | // visible to subsequent rseqs with shard index N. An rseq may end at any time 109 | // (even spuriously; an rseq may end even if no other thread has begun an rseq 110 | // since this one began). Therefore, a thread that reads some sharded data 111 | // within an rseq should almost always ensure that the view it got was 112 | // consistent, by checking that the rseq is still ongoing at some point after 113 | // the reads are done. 114 | // 115 | // A warning on pointer-chasing: 116 | // Rseqs have seqlock-like semantics. The data you read might not be consistent; 117 | // the only way to be sure you saw a consistent view of things is if you find 118 | // that the rseq is ongoing at some point after you read some data. Following a 119 | // pointer is dangerous unless you're sure that the pointed-to data will still 120 | // be alive even if you rseq has ended at the time of the read. This is done 121 | // most easily by reading any unsafe data through rseq::load(). 
Note that you
122 | // probably want to use RSEQ_MEMBER_ADDR if you do this.
123 | //
124 | // rseq::Value objects are API-compatible with std::atomics, including the use
125 | // of std::memory_orders (with the same semantics).
126 | //
127 | // RSEQ_MEMBER_ADDR macro:
128 | // In general, we use rseq::load because we want to load a member from a struct
129 | // whose existence we aren't sure about. But if we have a SomeType* someTypePtr,
130 | // it's undefined behavior to do *anything at all* with it unless we know that
131 | // the pointed-to memory has not been freed. This macro doesn't fix that, but it
132 | // attempts to obscure the fact well enough to ensure that we don't actually let
133 | // the compiler break us, and doesn't trigger asan/ubsan/msan warnings. In
134 | // particular, it never dereferences its argument, even purely syntactically.
135 | //
136 | // The macro pre-decays array fields. This is almost always what you want. Example:
137 | //
138 | //   struct Foo {
139 | //     float justSomeRandomData;
140 | //     bool someOtherPieceOfData;
141 | //     int arr[22];
142 | //   };
143 | //   Foo* foo;
144 | //   auto ptr = RSEQ_MEMBER_ADDR(foo, arr);
145 | //
146 | // Then "ptr" is of type int*, not int (*)[22]. Writing
147 | // "&RSEQ_MEMBER_ADDR(foo, arr)[7]" gives a pointer to the 7th element of the
148 | // arr field of the object foo points to. Note that we shouldn't ever write that
149 | // though, since if we know foo is safe to dereference, we don't need this macro
150 | // at all. Instead we want "RSEQ_MEMBER_ADDR(foo, arr) + 7". This gives
151 | // a pointer to the element without dereferencing it.
152 | //
153 | // The macro preserves const and volatile qualifiers.
154 | //
155 | // "ptr" and "member" should be plain identifier names; advanced syntactic
156 | // constructs like commas are not supported.
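// A small illustrative sketch (ours; "Node", "head", and "next" are made-up
// names, and the retry control flow is elided) combining rseq::load with
// RSEQ_MEMBER_ADDR to chase a pointer into memory that may already have been
// freed, without ever syntactically dereferencing it:
//
//   struct Node {
//     rseq::Value<Node*> next;
//   };
//   rseq::Value<Node*>* head = /* some per-cpu list head */;
//
//   int cpu = rseq::begin();
//   Node* first;
//   if (!rseq::load(&first, head)) { /* rseq over; start again */ }
//   // &first->next, but without dereferencing first:
//   Node* second;
//   if (!rseq::load(&second, RSEQ_MEMBER_ADDR(first, next))) { /* start again */ }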
157 | template 158 | struct ReferenceRemoveExtent; 159 | template 160 | struct ReferenceRemoveExtent { 161 | typedef typename std::remove_extent::type& type; 162 | }; 163 | #define RSEQ_MEMBER_ADDR(ptr, member) \ 164 | (&reinterpret_cast< \ 165 | rseq::ReferenceRemoveExtentmember))>::type>( \ 166 | *const_cast( \ 167 | reinterpret_cast(ptr) \ 168 | + offsetof( \ 169 | std::remove_reference::type, member)))) 170 | 171 | 172 | template 173 | class Value { 174 | public: 175 | static_assert(sizeof(std::atomic) <= 8, 176 | "Can only have a Value when T is <= 8 bytes and can be atomic!"); 177 | 178 | Value() = default; 179 | explicit constexpr Value(T t) : repr_(toRepr(t)) {} 180 | Value(const Value&) = delete; 181 | 182 | Value& operator=(const Value&) = delete; 183 | 184 | T operator=(T t) { 185 | repr_ = toRepr(t); 186 | return t; 187 | } 188 | 189 | bool is_lock_free() const { 190 | return true; 191 | } 192 | 193 | static constexpr bool is_always_lock_free() { 194 | return true; 195 | } 196 | 197 | void store(T val, std::memory_order order = std::memory_order_seq_cst) { 198 | repr_.store(toRepr(val), order); 199 | } 200 | 201 | T load(std::memory_order order = std::memory_order_seq_cst) const { 202 | return fromRepr(repr_.load(order)); 203 | } 204 | 205 | /* implicit */ operator T() const { 206 | return load(); 207 | } 208 | 209 | T exchange(T desired, std::memory_order order = std::memory_order_seq_cst) { 210 | return fromRepr(repr_.exchange(toRepr(desired), order)); 211 | } 212 | 213 | bool compare_exchange_weak( 214 | T& expected, T desired, 215 | std::memory_order successOrder, std::memory_order failureOrder) { 216 | unsigned long expectedRepr = toRepr(expected); 217 | unsigned long desiredRepr = toRepr(desired); 218 | bool result = repr_.compare_exchange_weak( 219 | expectedRepr, desiredRepr, successOrder, failureOrder); 220 | expected = fromRepr(expectedRepr); 221 | return result; 222 | } 223 | 224 | bool compare_exchange_weak( 225 | T& expected, T desired, 226 | std::memory_order order = std::memory_order_seq_cst) { 227 | unsigned long expectedRepr = toRepr(expected); 228 | unsigned long desiredRepr = toRepr(desired); 229 | bool result = repr_.compare_exchange_weak(expectedRepr, desiredRepr, order); 230 | expected = fromRepr(expectedRepr); 231 | return result; 232 | } 233 | 234 | bool compare_exchange_strong( 235 | T& expected, T desired, 236 | std::memory_order successOrder, std::memory_order failureOrder) { 237 | unsigned long expectedRepr = toRepr(expected); 238 | unsigned long desiredRepr = toRepr(desired); 239 | bool result = repr_.compare_exchange_strong( 240 | expectedRepr, desiredRepr, successOrder, failureOrder); 241 | expected = fromRepr(expectedRepr); 242 | return result; 243 | } 244 | 245 | bool compare_exchange_strong( 246 | T& expected, T desired, 247 | std::memory_order order = std::memory_order_seq_cst) { 248 | unsigned long expectedRepr = toRepr(expected); 249 | unsigned long desiredRepr = toRepr(desired); 250 | bool result = repr_.compare_exchange_strong( 251 | expectedRepr, desiredRepr, order); 252 | expected = fromRepr(expectedRepr); 253 | return result; 254 | } 255 | 256 | // We don't implement the numeric operations. I think we could, but I'm not 257 | // knowledgeable enough about the numeric conversion rules to be sure (it's 258 | // tricky, because we would need to e.g. implement Value::fetch_add in 259 | // terms of atomic::fetch_add). 
260 | // If you actually have a use case for them, we can figure it out then (I'm 261 | // already on the fence about allowing values of size other than 8, so that 262 | // would tip the scales). 263 | 264 | private: 265 | friend bool ::rseq::load(T* dst, const Value* src); 266 | // Can't do partial specialization of friend declarations; we just make store 267 | // with *any* types a friend. 268 | template 269 | friend bool ::rseq::store(Value* dst, V&& val); 270 | template 271 | friend bool ::rseq::storeFence(Value* dst, V&& val); 272 | 273 | // toRepr and fromRepr let us dodge aliasing violations and avoid dealing with 274 | // sizes. 275 | // Note that we static_assert using an std::atomic above, so we know that T 276 | // is trivially copyable. 277 | static unsigned long toRepr(T t) { 278 | unsigned long result = 0; 279 | std::memcpy(&result, &t, sizeof(T)); 280 | return result; 281 | } 282 | 283 | static T fromRepr(unsigned long repr) { 284 | T result; 285 | std::memcpy(&result, &repr, sizeof(T)); 286 | return result; 287 | } 288 | 289 | unsigned long* raw() const { 290 | return reinterpret_cast( 291 | const_cast*>(&repr_)); 292 | } 293 | 294 | std::atomic repr_; 295 | }; 296 | 297 | // Returns a shard index. Ensures that any rseqs on other threads that received 298 | // the same shard index are over before returning. 299 | inline int begin() { 300 | int ret = internal::threadCachedCpu()->load(); 301 | if (RSEQ_UNLIKELY(ret < 0)) { 302 | ret = internal::beginSlowPathWrapper(); 303 | } 304 | return ret; 305 | } 306 | 307 | // Tries to do "*dst = *src;" in the rseq last started by this thread, with 308 | // memory_order_seq_cst semantics. 309 | // If this returns true, then the load was successful and the rseq was not yet 310 | // over at the time of the load. (Note: the store to dst may take place after 311 | // the rseq is over). 312 | // If it returns false, then the rseq ended at some point prior to the call, and 313 | // no load or store occurred. 314 | // May only be called after begin(). 315 | // This is slighly slower than regular atomic loads, so those should be used 316 | // unless the load being part of the rseq is required for correctness (e.g. 317 | // pointer-chasing through dynamically allocated memory). 318 | template 319 | bool load(T* dst, const Value* src) { 320 | // An asymmetricThreadFenceLight() belongs after the load, but we omit it to 321 | // avoid namespace pollution. Invoking the generated code accomplishes the 322 | // same thing. 323 | if (sizeof(T) == 8) { 324 | unsigned long* realDst = reinterpret_cast(dst); 325 | return RSEQ_LIKELY(!rseq_load_trampoline(realDst, src->raw())); 326 | } else { 327 | unsigned long realDst; 328 | bool result = RSEQ_LIKELY(!rseq_load_trampoline(&realDst, src->raw())); 329 | if (result) { 330 | *dst = Value::fromRepr(realDst); 331 | } 332 | return result; 333 | } 334 | } 335 | 336 | // Tries to do "*dst = val;" in the rseq last started by this thread, with 337 | // memory_order_release semantics. 338 | // If this function returns true, then the store was performed, and the rseq was 339 | // not yet over at the time of the store. 340 | // If it returns false, then the rseq ended at some point prior to the call, and 341 | // no store occurred. 342 | // May only be called after begin(). 343 | template 344 | bool store(Value* dst, U&& val) { 345 | // Here as above we omit the asymmetricThreadFenceLight(). 
346 | return RSEQ_LIKELY( 347 | !rseq_store_trampoline( 348 | dst->raw(), 349 | Value::toRepr(static_cast(val)))); 350 | } 351 | 352 | // Tries to do "*dst = val;" in the rseq last started by this thread, with 353 | // memory_order_seq_cst semantics. 354 | // If this function returns true, then the store was performed, and the rseq was 355 | // not yet over at the time of the store. 356 | // If it returns false, then the rseq ended at some point prior to the call, and 357 | // no store occurred. 358 | // May only be called after begin(). 359 | template 360 | bool storeFence(Value* dst, U&& val) { 361 | // Here as above we omit the asymmetricThreadFenceLight(). 362 | return RSEQ_LIKELY( 363 | !rseq_store_fence_trampoline( 364 | dst->raw(), 365 | Value::toRepr(static_cast(val)))); 366 | } 367 | 368 | // If this returns true, then the rseq last started by this thread has not yet 369 | // ended (and therefore, no other thread has called begin() and gotten back the 370 | // same shard index as the calling thread after the calling thread). 371 | inline bool validate() { 372 | Value dummy; 373 | return store(&dummy, 0); 374 | } 375 | 376 | // Ends the current rseq. 377 | // This does an atomic operation; in general it's better to just not do anything 378 | // and wait until you hit a failure in an rseq operation. 379 | // If you know you're likely to get descheduled soon (e.g. you're about to 380 | // sleep), or that a thread on another CPU will try to acquire ownership of the 381 | // current CPU (presumably while you do something else), then calling this first 382 | // can speed up that thread's call to begin(). 383 | inline void end() { 384 | internal::endWrapper(); 385 | } 386 | 387 | // Inserts a synchronization point in the rseq ordering of shard (ending the 388 | // rseq prior to that point). Stores visible to rseqs on that shard before the 389 | // point are visble to this thread after this function returns. Stores visible 390 | // to this thread are visible to rseqs that occur after the point. 391 | // 392 | // This isn't really any faster that fence() in most cases. However: 393 | // - Include fenceWith() makes the description of the memory model effects of 394 | // fence() simpler. 395 | // - There are some optimizations we can apply that will make fenceWith() faster 396 | // than a plain fence(). 397 | inline void fenceWith(int shard) { 398 | internal::fenceWithWrapper(shard); 399 | } 400 | 401 | // Equivalent to, but faster than, a call to fenceWith each each possible 402 | // argument. 403 | inline void fence() { 404 | internal::fenceWrapper(); 405 | } 406 | 407 | } // namespace rseq 408 | -------------------------------------------------------------------------------- /RseqBenchmark.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. An additional grant 7 | * of patent rights can be found in the PATENTS file in the same directory. 8 | */ 9 | 10 | /* 11 | 12 | `./rseq_benchmark` for usage. 
13 | 14 | The output of lscpu on my machine: 15 | Architecture: x86_64 16 | CPU op-mode(s): 32-bit, 64-bit 17 | Byte Order: Little Endian 18 | CPU(s): 32 19 | On-line CPU(s) list: 0-31 20 | Thread(s) per core: 2 21 | Core(s) per socket: 8 22 | Socket(s): 2 23 | NUMA node(s): 2 24 | Vendor ID: GenuineIntel 25 | CPU family: 6 26 | Model: 45 27 | Model name: Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz 28 | Stepping: 6 29 | CPU MHz: 2201.000 30 | CPU max MHz: 2201.0000 31 | CPU min MHz: 1200.0000 32 | BogoMIPS: 4405.46 33 | Virtualization: VT-x 34 | L1d cache: 32K 35 | L1i cache: 32K 36 | L2 cache: 256K 37 | L3 cache: 20480K 38 | NUMA node0 CPU(s): 0-7,16-23 39 | NUMA node1 CPU(s): 8-15,24-31 40 | Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid xsaveopt 41 | 42 | 43 | According to some rough benchmarks: 44 | When there are lots of threads 45 | - Counter increments using rseq stores are about 36% slower than ones using 46 | stack variables. 47 | - Counter increments using rseq stores are about 4.2x faster than ones using 48 | per-cpu atomics. 49 | When there is only one thread: 50 | - Counter increments using rseq stores are about 9.8% slower than ones using 51 | stack variables. 52 | - Counter increments using rseq stores are about 5.3x faster than ones using 53 | per-cpu atomics. 54 | 55 | 56 | The output of `./rseq_benchmark threadLocal,rseq,atomicsCachedCpu 256 100000000`: 57 | =========================================================== 58 | Benchmarking Thread-local operations only (no sharing) 59 | Increments: 25600000000 60 | Seconds: 2.739452 61 | TSC ticks: 6026707360 62 | Single-CPU TSC ticks per increment: 0.235418 63 | Global TSC ticks per increment: 7.533384 64 | =========================================================== 65 | =========================================================== 66 | Benchmarking Per-cpu restartable sequences 67 | Increments: 25600000000 68 | Seconds: 3.732481 69 | TSC ticks: 8211339968 70 | Single-CPU TSC ticks per increment: 0.320755 71 | Global TSC ticks per increment: 10.264175 72 | =========================================================== 73 | =========================================================== 74 | Benchmarking Per-cpu atomics (with cached sched_getcpu calls) 75 | Increments: 25600000000 76 | Seconds: 15.678768 77 | TSC ticks: 34492797698 78 | Single-CPU TSC ticks per increment: 1.347375 79 | Global TSC ticks per increment: 43.115997 80 | =========================================================== 81 | 82 | 83 | The output of `./rseq_benchmark threadLocal,rseq,atomicsCachedCpu 1 100000000`: 84 | =========================================================== 85 | Benchmarking Thread-local operations only (no sharing) 86 | Increments: 100000000 87 | Seconds: 0.255986 88 | TSC ticks: 563156988 89 | Single-CPU TSC ticks per increment: 5.631570 90 | Global TSC ticks per increment: 180.210236 91 | =========================================================== 92 | =========================================================== 93 | Benchmarking Per-cpu restartable sequences 94 | Increments: 100000000 95 | Seconds: 0.281085 96 | TSC ticks: 618375013 97 | 
Single-CPU TSC ticks per increment: 6.183750 98 | Global TSC ticks per increment: 197.880004 99 | =========================================================== 100 | =========================================================== 101 | Benchmarking Per-cpu atomics (with cached sched_getcpu calls) 102 | Increments: 100000000 103 | Seconds: 1.478343 104 | TSC ticks: 3252272957 105 | Single-CPU TSC ticks per increment: 32.522730 106 | Global TSC ticks per increment: 1040.727346 107 | =========================================================== 108 | */ 109 | 110 | #include 111 | #include 112 | #include 113 | #include 114 | #include 115 | #include 116 | #include 117 | #include 118 | #include 119 | #include 120 | 121 | #include "rseq/Rseq.h" 122 | #include "rseq/internal/NumCpus.h" 123 | 124 | constexpr int kCachelineSize = 128; 125 | 126 | struct PercpuCounter { 127 | std::atomic atomicCounter; 128 | rseq::Value rseqCounter; 129 | std::mutex mu; 130 | char padding[ 131 | kCachelineSize 132 | - sizeof(atomicCounter) 133 | - sizeof(rseqCounter) 134 | - sizeof(mu)]; 135 | }; 136 | 137 | std::vector counterByCpu; 138 | char padding1[kCachelineSize - sizeof(counterByCpu)]; 139 | 140 | std::mutex contendedMu; 141 | char padding2[kCachelineSize - sizeof(contendedMu)]; 142 | 143 | std::atomic contendedCounter; 144 | 145 | enum TestType { 146 | kLongCriticalSection, 147 | kContendedAtomics, 148 | kContendedLocks, 149 | kRseq, 150 | kAtomics, 151 | kAtomicsCachedCpu, 152 | kLocks, 153 | kLocksCachedCpu, 154 | kThreadLocal, 155 | kTestTypeEnd, 156 | }; 157 | 158 | const char* testTypeString(TestType testType) { 159 | switch (testType) { 160 | case kLongCriticalSection: 161 | return "Long critical section"; 162 | case kContendedAtomics: 163 | return "Contended atomics"; 164 | case kContendedLocks: 165 | return "Contended locks"; 166 | case kRseq: 167 | return "Per-cpu restartable sequences"; 168 | case kAtomics: 169 | return "Per-cpu atomics"; 170 | case kAtomicsCachedCpu: 171 | return "Per-cpu atomics (with cached sched_getcpu calls)"; 172 | case kLocks: 173 | return "Per-cpu locks"; 174 | case kLocksCachedCpu: 175 | return "Per-cpu locks (with cached sched_getcpu calls)"; 176 | case kThreadLocal: 177 | return "Thread-local operations only (no sharing)"; 178 | case kTestTypeEnd: 179 | /* should never happen */ 180 | return nullptr; 181 | } 182 | return nullptr; 183 | } 184 | 185 | void doIncrementsLongCriticalSection(std::uint64_t numIncrements) { 186 | std::lock_guard lg(contendedMu); 187 | for (std::uint64_t i = 0; i < numIncrements; ++i) { 188 | contendedCounter.store(contendedCounter.load(std::memory_order_relaxed) + 1, 189 | std::memory_order_relaxed); 190 | } 191 | } 192 | 193 | void doIncrementsContendedAtomics(std::uint64_t numIncrements) { 194 | for (std::uint64_t i = 0; i < numIncrements; ++i) { 195 | std::uint64_t old = contendedCounter.load(); 196 | while (!contendedCounter.compare_exchange_weak(old, old + 1)) { 197 | } 198 | } 199 | } 200 | 201 | void doIncrementsContendedLocks(std::uint64_t numIncrements) { 202 | for (std::uint64_t i = 0; i < numIncrements; ++i) { 203 | std::lock_guard lg(contendedMu); 204 | contendedCounter.store(contendedCounter.load(std::memory_order_relaxed) + 1, 205 | std::memory_order_relaxed); 206 | } 207 | } 208 | 209 | void doIncrementsRseq(std::uint64_t numIncrements) { 210 | for (std::uint64_t i = 0; i < numIncrements; ++i) { 211 | bool success = false; 212 | do { 213 | int cpu = rseq::begin(); 214 | std::uint64_t curVal = counterByCpu[cpu].rseqCounter.load(); 215 | 
void doIncrementsRseq(std::uint64_t numIncrements) {
  for (std::uint64_t i = 0; i < numIncrements; ++i) {
    bool success = false;
    do {
      int cpu = rseq::begin();
      std::uint64_t curVal = counterByCpu[cpu].rseqCounter.load();
      success = rseq::store(&counterByCpu[cpu].rseqCounter, curVal + 1);
    } while (!success);
  }
}

void doIncrementsAtomics(std::uint64_t numIncrements) {
  for (std::uint64_t i = 0; i < numIncrements; ++i) {
    std::uint64_t old;
    int cpu;
    do {
      cpu = sched_getcpu();
      old = counterByCpu[cpu].atomicCounter.load();
    } while (!counterByCpu[cpu].atomicCounter.compare_exchange_weak(
        old, old + 1));
  }
}

void doIncrementsAtomicsCachedCpu(std::uint64_t numIncrements) {
  for (std::uint64_t i = 0; i < numIncrements;) {
    int cpu = sched_getcpu();
    for (int j = 0; j < 100 && i < numIncrements; ++i, ++j) {
      std::uint64_t old = counterByCpu[cpu].atomicCounter.load();
      if (!counterByCpu[cpu].atomicCounter.compare_exchange_weak(
              old, old + 1)) {
        break;
      }
    }
  }
}

void doIncrementsLocks(std::uint64_t numIncrements) {
  for (std::uint64_t i = 0; i < numIncrements; ++i) {
    int cpu = sched_getcpu();
    std::lock_guard<std::mutex> lg(counterByCpu[cpu].mu);
    counterByCpu[cpu].atomicCounter.store(
        counterByCpu[cpu].atomicCounter.load(std::memory_order_relaxed) + 1,
        std::memory_order_relaxed);
  }
}

void doIncrementsLocksCachedCpu(std::uint64_t numIncrements) {
  for (std::uint64_t i = 0; i < numIncrements;) {
    int cpu = sched_getcpu();
    for (int j = 0; j < 100 && i < numIncrements; ++i, ++j) {
      std::lock_guard<std::mutex> lg(counterByCpu[cpu].mu);
      counterByCpu[cpu].atomicCounter.store(
          counterByCpu[cpu].atomicCounter.load(std::memory_order_relaxed) + 1,
          std::memory_order_relaxed);
    }
  }
}

void doIncrementsThreadLocal(std::uint64_t numIncrements) {
  volatile std::uint64_t counter = 0;
  for (std::uint64_t i = 0; i < numIncrements; ++i) {
    std::uint64_t oldVal = counter;
    counter = oldVal + 1;
  }
  counterByCpu[0].atomicCounter.fetch_add(counter);
}

void printErrorIfNotEqual(std::uint64_t expected, std::uint64_t actual) {
  if (expected != actual) {
    std::printf(
        "Error: actual increment count %lu "
        "does not match expected increment count %lu.\n",
        actual,
        expected);
  }
}

std::uint64_t rdtscp() {
  std::uint32_t ecx;
  std::uint64_t rax, rdx;
  asm volatile ( "rdtscp\n" : "=a" (rax), "=d" (rdx), "=c" (ecx) : : );
  return (rdx << 32) + rax;
}

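// Runs a single benchmark: spawns numThreads threads, each performing
// numIncrements increments, then reports wall-clock seconds and TSC ticks.
// "Single-CPU TSC ticks per increment" is the elapsed ticks divided by the
// total number of increments; "Global TSC ticks per increment" multiplies
// that by the number of cpus to approximate the machine-wide cost of each
// increment.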
void runTest(
    TestType testType,
    std::uint64_t numThreads,
    std::uint64_t numIncrements) {
  contendedCounter.store(0);
  for (unsigned i = 0; i < counterByCpu.size(); ++i) {
    counterByCpu[i].atomicCounter.store(0);
    counterByCpu[i].rseqCounter.store(0);
  }
  void (*benchmarkThreadFunc)(std::uint64_t) =
      testType == kLongCriticalSection ? doIncrementsLongCriticalSection :
      testType == kContendedAtomics ? doIncrementsContendedAtomics :
      testType == kContendedLocks ? doIncrementsContendedLocks :
      testType == kRseq ? doIncrementsRseq :
      testType == kAtomics ? doIncrementsAtomics :
      testType == kAtomicsCachedCpu ? doIncrementsAtomicsCachedCpu :
      testType == kLocks ? doIncrementsLocks :
      testType == kLocksCachedCpu ? doIncrementsLocksCachedCpu :
      testType == kThreadLocal ? doIncrementsThreadLocal :
      nullptr;
  std::printf("===========================================================\n");
  std::printf("Benchmarking %s\n", testTypeString(testType));
  auto beginTime = std::chrono::high_resolution_clock::now();
  std::uint64_t beginCycles = rdtscp();
  std::vector<std::thread> threads(numThreads);
  for (unsigned i = 0; i < numThreads; ++i) {
    threads[i] = std::thread(benchmarkThreadFunc, numIncrements);
  }
  for (unsigned i = 0; i < numThreads; ++i) {
    threads[i].join();
  }
  std::uint64_t endCycles = rdtscp();
  auto endTime = std::chrono::high_resolution_clock::now();
  std::uint64_t expectedIncrements = numThreads * numIncrements;
  std::uint64_t actualIncrements = contendedCounter.load();
  for (std::uint64_t i = 0; i < rseq::internal::numCpus(); ++i) {
    actualIncrements += counterByCpu[i].atomicCounter.load();
    actualIncrements += counterByCpu[i].rseqCounter.load();
  }
  printErrorIfNotEqual(expectedIncrements, actualIncrements);
  std::chrono::nanoseconds duration = endTime - beginTime;
  std::uint64_t ns = duration.count();
  std::uint64_t cycles = endCycles - beginCycles;
  double seconds = static_cast<double>(ns) / 1000000000.0;
  std::printf("Increments: %lu \n", actualIncrements);
  std::printf("Seconds: %f\n", seconds);
  std::printf("TSC ticks: %lu \n", cycles);
  double myCycles = static_cast<double>(cycles) / actualIncrements;
  std::printf("Single-CPU TSC ticks per increment: %f\n", myCycles);
  std::printf("Global TSC ticks per increment: %f\n",
              rseq::internal::numCpus() * myCycles);
  std::printf("===========================================================\n");
}

const char* usage = R"(Usage: %s benchmarks num_threads increments_per_thread
Where 'benchmarks' is either 'all', or a comma-separated list containing the
benchmarks to run:
  longCriticalSection: Each thread acquires a single shared lock, does all
                       its increments, and releases the lock.

  contendedAtomics:    Each thread updates a global counter with a CAS.

  contendedLocks:      Each thread acquires and releases a global lock for
                       each counter increment.

  rseq:                Threads increment cpu-local counters using restartable
                       sequences.

  atomics:             Threads increment cpu-local counters using CASs.

  atomicsCachedCpu:    Threads increment cpu-local counters using CASs, but
                       only call sched_getcpu once every 100 increments (or
                       until contention is detected).

  locks:               Threads increment cpu-local counters, protecting their
                       increments with locks.

  locksCachedCpu:      Threads increment cpu-local counters, protecting their
                       increments with locks, but only call sched_getcpu once
                       every 100 increments.

  threadLocal:         Threads increment thread-local counters, with no
                       synchronization.
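// Parses the 'benchmarks' argument: either "all" or a comma-separated list of
// the names described in the usage string above; exits on an unknown name.
// Example invocations (the runs shown in the header comment):
//   ./rseq_benchmark threadLocal,rseq,atomicsCachedCpu 256 100000000
//   ./rseq_benchmark threadLocal,rseq,atomicsCachedCpu 1 100000000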
)";

std::vector<TestType> parseBenchmarks(const char* benchmarks) {
  if (!strcmp(benchmarks, "all")) {
    return {
      kLongCriticalSection,
      kContendedAtomics,
      kContendedLocks,
      kRseq,
      kAtomics,
      kAtomicsCachedCpu,
      kLocks,
      kLocksCachedCpu,
      kThreadLocal
    };
  }

  std::vector<TestType> result;

  const char* benchmarksEnd = benchmarks + strlen(benchmarks);

  const char* tokBegin = benchmarks;
  while (true) {
    const char* tokEnd = std::strpbrk(tokBegin, ",");
    if (tokEnd == nullptr) {
      tokEnd = benchmarksEnd;
    }

    // Require the token to match the full benchmark name, not just a prefix
    // of it.
    auto matches = [&](const char* str) {
      return std::equal(tokBegin, tokEnd, str) &&
          str[tokEnd - tokBegin] == '\0';
    };

    TestType testType =
        matches("longCriticalSection") ? kLongCriticalSection :
        matches("contendedAtomics") ? kContendedAtomics :
        matches("contendedLocks") ? kContendedLocks :
        matches("rseq") ? kRseq :
        matches("atomics") ? kAtomics :
        matches("atomicsCachedCpu") ? kAtomicsCachedCpu :
        matches("locks") ? kLocks :
        matches("locksCachedCpu") ? kLocksCachedCpu :
        matches("threadLocal") ? kThreadLocal :
        kTestTypeEnd;

    if (testType == kTestTypeEnd) {
      std::printf(
          "Error: unknown benchmark type at the beginning of \"%s\"\n",
          tokBegin);
      std::exit(1);
    }
    result.push_back(testType);

    if (tokEnd == benchmarksEnd) {
      break;
    }
    tokBegin = tokEnd + 1;
  }
  return result;
}

int main(int argc, char** argv) {
  if (argc != 4) {
    std::printf(usage, argv[0]);
    std::exit(1);
  }

  std::uint64_t numThreads;
  std::uint64_t numIncrements;

  std::vector<TestType> benchmarks = parseBenchmarks(argv[1]);

  numThreads = atol(argv[2]);
  numIncrements = atol(argv[3]);

  if (numThreads == 0 || numIncrements == 0) {
    std::printf("Error: invalid value for threads or increments\n");
    std::exit(1);
  }

  // PercpuCounter objects aren't moveable, so we construct a vector then swap
  // it with the global one.
  std::vector<PercpuCounter> p(rseq::internal::numCpus());
  counterByCpu.swap(p);

  for (TestType benchmark : benchmarks) {
    runTest(benchmark, numThreads, numIncrements);
  }

  return 0;
}
--------------------------------------------------------------------------------