├── .gitignore ├── Jenkinsfile ├── LICENSE ├── Makefile ├── README.rst ├── benchmarks └── lockhammer │ ├── LICENSE │ ├── Makefile │ ├── README.rst │ ├── TODO │ ├── cpuorders │ └── schema.txt │ ├── graphs │ ├── github_lockhammer_all_common_20181106_cas_event_mutex_200ns_1000ns.png │ ├── github_lockhammer_all_common_20181106_cas_lockref_200ns_1000ns.png │ ├── github_lockhammer_all_common_20181106_cas_rw_lock_200ns_1000ns.png │ ├── github_lockhammer_all_common_20181106_empty_200ns_1000ns.png │ ├── github_lockhammer_all_common_20181106_event_mutex_200ns_1000ns.png │ ├── github_lockhammer_all_common_20181106_incdec_refcount_200ns_1000ns.png │ ├── github_lockhammer_all_common_20181106_jvm_objectmonitor_200ns_1000ns.png │ ├── github_lockhammer_all_common_20181106_osq_lock_200ns_1000ns.png │ ├── github_lockhammer_all_common_20181106_queued_spinlock_200ns_1000ns.png │ ├── github_lockhammer_all_common_20181106_swap_mutex_200ns_1000ns.png │ ├── github_lockhammer_all_common_20181106_tbb_spin_rw_mutex_200ns_1000ns.png │ └── github_lockhammer_all_common_20181106_ticket_spinlock_200ns_1000ns.png │ ├── include │ ├── alloc.h │ ├── args.h │ ├── atomics.h │ ├── cpu_relax.h │ ├── lockhammer.h │ ├── perf_timer.h │ └── verbose.h │ ├── scripts │ ├── lh_sweepdelay_cfg.yaml │ ├── lh_sweeptest_cfg.yaml │ ├── lh_unittest_cfg.yaml │ ├── lockhammer-all.csv.xz │ ├── lockhammer-jupyter-notebook.ipynb │ ├── run-tests.sh │ ├── run_sweep_delay.sh │ ├── runall.sh │ ├── show-per-thread-lock-acquires.sh │ ├── sweep.sh │ ├── test_lockhammer.py │ └── view-results-json.sh │ ├── src │ ├── alloc.c │ ├── args.c │ ├── cpufreq-scaling-detect.c │ ├── lockhammer.c │ ├── measure.c │ └── report.c │ └── tests │ ├── cas_lockref.h │ ├── cas_rw_lock.h │ ├── empty.h │ ├── incdec_refcount.h │ └── swap_mutex.h ├── contributing.rst ├── ext ├── jvm │ └── jvm_objectmonitor.h ├── linux │ ├── hybrid_spinlock.h │ ├── hybrid_spinlock_fastdequeue.h │ ├── hybrid_spinlock_old_fastdequeue.h │ ├── include │ │ ├── lk_atomics.h │ │ ├── lk_barrier.h │ │ └── lk_cmpxchg.h │ ├── osq_lock.h │ ├── queued_spinlock.h │ └── ticket_spinlock.h ├── mysql │ ├── cas_event_mutex.h │ ├── event_mutex.h │ └── include │ │ └── ut_atomics.h ├── pagemap │ └── include │ │ └── pagemap.h ├── sms │ ├── base │ │ ├── build_config.h │ │ ├── cpu.h │ │ └── llsc.h │ └── clh_spinlock.h └── tbb │ ├── include │ └── tbb.h │ └── tbb_spin_rw_mutex.h ├── hooks └── commit-msg └── tools └── .gitignore /.gitignore: -------------------------------------------------------------------------------- 1 | benchmarks/lockhammer/build/ 2 | benchmarks/lockhammer/build.*/ 3 | benchmarks/lockhammer/*.json 4 | *.log 5 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | import static groovy.io.FileType.FILES 2 | import static groovy.io.FileType.DIRECTORIES 3 | 4 | def getRepoURL() { 5 | sh "git config --get remote.origin.url > .git/remote-url" 6 | return readFile(".git/remote-url").trim() 7 | } 8 | 9 | void setBuildStatus(String message, String state, String context) { 10 | repoUrl = getRepoURL(); 11 | step([ 12 | $class: "GitHubCommitStatusSetter", 13 | reposSource: [$class: "ManuallyEnteredRepositorySource", url: repoUrl], 14 | contextSource: [$class: "ManuallyEnteredCommitContextSource", context: context], 15 | errorHandlers: [[$class: "ChangingBuildStatusErrorHandler", result: "UNSTABLE"]], 16 | statusResultSource: [ $class: "ConditionalStatusResultSource", results: [[$class: 
"AnyBuildResult", message: message, state: state]] ], 17 | statusBackrefSource: [ $class: "ManuallyEnteredBackrefSource", backref: ""] 18 | ]); 19 | } 20 | 21 | node { 22 | stage('checkout') { 23 | checkout scm 24 | } 25 | 26 | stage('Build') { 27 | setBuildStatus("Building code", 'PENDING', 'Build'); 28 | def fails = [] 29 | def dir = new File("${env.WORKSPACE}/benchmarks/"); 30 | dir.traverse(type: DIRECTORIES, maxDepth: 0) { 31 | //build 32 | try { 33 | sh "make -C ${it}" 34 | } 35 | catch (exc) { 36 | fails.add(it.toString().substring(env.WORKSPACE.length())) 37 | } 38 | } 39 | if (fails) { 40 | setBuildStatus("${fails} failed to build", 'FAILURE', 'Build'); 41 | error "${fails} failed to build" 42 | } 43 | setBuildStatus("Build Successful!", 'SUCCESS', 'Build'); 44 | } 45 | 46 | stage('Test') { 47 | setBuildStatus("Testing code", 'PENDING', 'Test'); 48 | 49 | fails = [] 50 | dir = new File("${env.WORKSPACE}/benchmarks/"); 51 | // Run all scripts starting with the prefix "test" 52 | dir.traverse(type: DIRECTORIES, maxDepth: 0) { 53 | def scr = new File("${it}/scripts/") 54 | scr.traverse(type: FILES, filter: ~/.*\/test[^\/]*/) { 55 | try { 56 | sh "${it}" 57 | } 58 | catch (exc) { 59 | fails.add(it.toString().substring(env.WORKSPACE.length())) 60 | } 61 | } 62 | } 63 | if (fails) { 64 | setBuildStatus("Tests scripts: ${fails} Failed", 'FAILURE', 'Test'); 65 | error "Tests scripts: ${fails} Failed" 66 | } 67 | setBuildStatus("Tests Passed!", 'SUCCESS', 'Test'); 68 | } 69 | 70 | } 71 | 72 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018, ARM Limited. All rights reserved. 2 | 3 | SPDX-License-Identifier: BSD-3-Clause 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | Redistributions in binary form must reproduce the above copyright notice, this 12 | list of conditions and the following disclaimer in the documentation and/or 13 | other materials provided with the distribution. 14 | 15 | Neither the name of ARM Limited nor the names of its contributors may be used 16 | to endorse or promote products derived from this software without specific 17 | prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 27 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | # SPDX-FileCopyrightText: Copyright 2019-2025 Arm Limited and/or its affiliates 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | 6 | 7 | .PHONY: help 8 | 9 | LOCKHAMMER_DIR=benchmarks/lockhammer 10 | 11 | help: 12 | @echo 13 | @echo "This Makefile passes targets through to $(LOCKHAMMER_DIR)/Makefile" 14 | @echo 15 | @echo "try:" 16 | @echo 17 | @echo " make -j 8 allvariants" 18 | @echo 19 | 20 | %:: 21 | $(MAKE) -C $(LOCKHAMMER_DIR) $(MAKEFLAGS) $(MAKECMDGOALS) 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Synchronization Benchmarks 2 | ========================== 3 | 4 | This is a micro-benchmark suite for evaluating the scalability and code overhead of synchronization 5 | primitives used primarily in data-center applications and system software. It contains synchronization 6 | primitives that are both independently developed and extracted from real software applications. 7 | 8 | License 9 | ------- 10 | 11 | The software is provided under a BSD-3-Clause `license`_. Contributions to this 12 | project are accepted under the same license with developer sign-off as 13 | described in the `Contributing Guidelines`_. 14 | 15 | This project contains code from other projects, the license information for which 16 | can be found in the relevant directories or files. Any contributions to third party 17 | open source projects are under the relevant license for that project or file. 18 | 19 | Repository Contents 20 | =================== 21 | 22 | The synchronization-benchmarks repository is divided into multiple directories with the following semantics: 23 | 24 | - tools/ -- Contains support tools for the micro-benchmarks contained in benchmarks/ such as application profilers or code 25 | analyzers. In general, support code that applies to multiple benchmarks should go here. 26 | - benchmarks/ -- Broken up into sub-directories, one for each micro-benchmark. Each sub-directory should generally be structured 27 | as: 28 | 29 | - <benchmark>/ -- The root of the directory should contain a README with build instructions, and a detailed 30 | description of the test: what it tests, how it tests it, and how to interpret the results. The root 31 | directory should also contain the build system files. 32 | - src/ 33 | - include/ 34 | - scripts/ -- Automation scripts for running and parsing the output of your micro-benchmark 35 | 36 | - ext/ -- This is a directory for third party code taken from other projects if, for instance, your micro-benchmark is 37 | meant to test example synchronization primitives from various sources. For each third party source, a sub-directory 38 | should be created that is descriptive of the origin of the imported code, and the imported code placed in that sub-directory. 39 | All imported code needs to retain the original license and copyright information from the source location. 40 | For more detail on how to include third party code, please consult the `Contributing Guidelines`_. 41 | 42 | Getting Started 43 | =============== 44 | 45 | Clone this repository and add the commit-msg hook from the hooks/ directory into your .git/hooks directory.
To build 46 | the microbenchmarks, follow the build and run instructions in the individual test sub-directories contained 47 | in benchmarks/. 48 | 49 | Feedback and support 50 | -------------------- 51 | 52 | Arm welcomes any feedback on this benchmark suite. If you find that this suite lacks important 53 | tests, please use the `GitHub issue tracker`_ to log the issue and initiate a pull request with your fixes as outlined in 54 | the `Contributing Guidelines`_. 55 | 56 | -------------- 57 | 58 | *Copyright (c) 2018, ARM Limited and Contributors. All rights reserved.* 59 | 60 | .. _GitHub: https://www.github.com/ARM-software/synchronization-benchmarks 61 | .. _GitHub issue tracker: https://github.com/ARM-software/synchronization-benchmarks/issues 62 | .. _license: ./LICENSE 63 | .. _Contributing Guidelines: ./contributing.rst 64 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018, The Linux Foundation. All rights reserved. 2 | 3 | SPDX-License-Identifier: BSD-3-Clause 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above 11 | copyright notice, this list of conditions and the following 12 | disclaimer in the documentation and/or other materials provided 13 | with the distribution. 14 | * Neither the name of The Linux Foundation nor the names of its 15 | contributors may be used to endorse or promote products derived 16 | from this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 19 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 20 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS 22 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 25 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 26 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 27 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 28 | IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/TODO: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Physical address accessing strategy 5 | - While a specified physical address can be obtained by mmap'ing /dev/mem with 6 | CONFIG_STRICT_DEVMEM=n (and nopat on x86), we cannot guarantee that the memory 7 | location is freely available for use. This may be OK for simulation, but not 8 | OK on a real system. 9 | 10 | To get around this problem, find a physical address in a persistent hugepage. 11 | This means not transparent hugepages, but HugeTLB pages. Using a persistent 12 | hugepage lets us access a physical memory location that persists after the 13 | program ends, so that runs are repeatable.
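(Illustrative sketch only, not part of the lockhammer sources; the harness's own translation helper is declared as get_phys_addr() in include/alloc.h and pagemap parsing support lives under ext/pagemap/. This standalone C program assumes a 2 MB default hugepage size, at least one page reserved in /proc/sys/vm/nr_hugepages, and root privileges, since unprivileged reads of /proc/self/pagemap return a zeroed PFN.)

    /* Hypothetical example: map an anonymous HugeTLB page and translate its
     * virtual address to a physical address via /proc/self/pagemap. */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static uintptr_t virt_to_phys(void * vaddr)      /* hypothetical helper */
    {
        long page_size = sysconf(_SC_PAGESIZE);      /* pagemap is indexed by base pages */
        uint64_t entry;
        int fd = open("/proc/self/pagemap", O_RDONLY);
        if (fd < 0) { perror("open pagemap"); exit(1); }
        off_t offset = ((uintptr_t) vaddr / page_size) * sizeof(entry);
        if (pread(fd, &entry, sizeof(entry), offset) != sizeof(entry)) {
            perror("pread pagemap"); exit(1);
        }
        close(fd);
        if (!(entry & (1ULL << 63))) {               /* bit 63 = page present */
            fprintf(stderr, "page not present (or not privileged)\n"); exit(1);
        }
        uint64_t pfn = entry & ((1ULL << 55) - 1);   /* bits 0-54 = page frame number */
        return pfn * page_size + (uintptr_t) vaddr % page_size;
    }

    int main(void)
    {
        size_t length = 2 * 1024 * 1024;             /* assumes 2 MB default hugepage size */
        void * p = mmap(NULL, length, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (p == MAP_FAILED) { perror("mmap MAP_HUGETLB"); return 1; }
        memset(p, 0, length);                        /* touch it so the page is faulted in */
        printf("virtual %p -> physical 0x%lx\n", p, (unsigned long) virt_to_phys(p));
        return 0;
    }

Requesting the same physical page again on a later run (cf. the --hugepage-physaddr flag in the items below) is what makes experiments repeatable.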
14 | 15 | TODO: check that N=1 hugepages works with multiple NUMA domains (yes it 16 | does, but since hugepages are by default round-robin distributed ("interleaved") 17 | across NUMA domains, N=1 will place only one hugepage in the first NUMA domain. 18 | If other domains are to be tested, use --hugepage-physaddr to request the 19 | hugepage with that physical address in that NUMA domain.) 20 | TODO: use get_mempolicy() to determine the NUMA domain of a hugepage. 21 | TODO: use fewer hugepages (done) 22 | TODO: add a flag to specify the hugepage physical address, and to try remapping 23 | hugepages until it is obtained again. (done) 24 | TODO: use set_mempolicy(MPOL_BIND) to place a hugepage on a node instead of the above. 25 | TODO: respect hugepage size in bytes and in kilobytes by name; it only accepts the abbreviated form right now 26 | 27 | 28 | Update SpinPause() in ext/jvm/jvm_objectmonitor.h 29 | - The SpinPause() function returns 0. However, this is only the case 30 | in now-very-old versions of OpenJDK. Modern versions use pause 31 | on 64-bit x86 (amd64) and a parameterized choice of one or more 32 | ISBs, or NOP, on aarch64. 33 | 34 | 35 | ext/linux/hybrid_spinlock* 36 | - use the lockhammer lock pointer instead of malloc'ing mcs_pool for better reproducibility 37 | 38 | queued_spinlock 39 | - queued_spinlock uses the lock pointer as well as mcs_pool, so we need a way to have both be reproducible. 40 | 41 | tbb_spin_rw_mutex 42 | - instead of doing operations on the state variable, use the test harness lock pointer for better reproducibility 43 | 44 | clh_spinlock 45 | - instead of operating on global_clh_lock, use the test harness lock pointer for better reproducibility 46 | 47 | ticket_spinlock 48 | - Modify so that USE_RELAX is effective 49 | 50 | 51 | cpufreq check: 52 | - for the intel_pstate driver, warn if no_turbo is set to 0 53 | 54 | 55 | 56 | 57 | add a memory update in the critical section 58 | - optionally store update on the same cacheline as the lock 59 | - expect a lot of kernel locks to have this 60 | - optionally store update somewhere else than the lock cache line (GUPS) 61 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/cpuorders/schema.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | naming schema 4 | 5 | # cloud instance, should be a symlink to a system cpuorder 6 | ..cpuorder 7 | 8 | # system cpuorder 9 | ..cpuorder 10 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_cas_event_mutex_200ns_1000ns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_cas_event_mutex_200ns_1000ns.png -------------------------------------------------------------------------------- /benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_cas_lockref_200ns_1000ns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_cas_lockref_200ns_1000ns.png --------------------------------------------------------------------------------
/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_cas_rw_lock_200ns_1000ns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_cas_rw_lock_200ns_1000ns.png -------------------------------------------------------------------------------- /benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_empty_200ns_1000ns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_empty_200ns_1000ns.png -------------------------------------------------------------------------------- /benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_event_mutex_200ns_1000ns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_event_mutex_200ns_1000ns.png -------------------------------------------------------------------------------- /benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_incdec_refcount_200ns_1000ns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_incdec_refcount_200ns_1000ns.png -------------------------------------------------------------------------------- /benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_jvm_objectmonitor_200ns_1000ns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_jvm_objectmonitor_200ns_1000ns.png -------------------------------------------------------------------------------- /benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_osq_lock_200ns_1000ns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_osq_lock_200ns_1000ns.png -------------------------------------------------------------------------------- /benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_queued_spinlock_200ns_1000ns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_queued_spinlock_200ns_1000ns.png -------------------------------------------------------------------------------- /benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_swap_mutex_200ns_1000ns.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_swap_mutex_200ns_1000ns.png -------------------------------------------------------------------------------- /benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_tbb_spin_rw_mutex_200ns_1000ns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_tbb_spin_rw_mutex_200ns_1000ns.png -------------------------------------------------------------------------------- /benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_ticket_spinlock_200ns_1000ns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/graphs/github_lockhammer_all_common_20181106_ticket_spinlock_200ns_1000ns.png -------------------------------------------------------------------------------- /benchmarks/lockhammer/include/alloc.h: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Copyright (c) 2017-2025, The Linux Foundation. All rights reserved. 4 | * 5 | * SPDX-License-Identifier: BSD-3-Clause 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are 9 | * met: 10 | * * Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * * Redistributions in binary form must reproduce the above 13 | * copyright notice, this list of conditions and the following 14 | * disclaimer in the documentation and/or other materials provided 15 | * with the distribution. 16 | * * Neither the name of The Linux Foundation nor the names of its 17 | * contributors may be used to endorse or promote products derived 18 | * from this software without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 21 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 22 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT 23 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS 24 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 27 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 28 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 29 | * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 30 | * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | 34 | #ifndef ALLOC_H 35 | #define ALLOC_H 36 | 37 | enum { 38 | HUGEPAGES_NONE, 39 | HUGEPAGES_DEFAULT, 40 | HUGEPAGES_64K, 41 | HUGEPAGES_2M, 42 | HUGEPAGES_32M, 43 | HUGEPAGES_512M, 44 | HUGEPAGES_1G, 45 | HUGEPAGES_16G, 46 | HUGEPAGES_MAX_ENUM 47 | }; 48 | 49 | 50 | void * do_hugepage_alloc(int use_hugepages, size_t hugepage_req_physaddr, int verbose); 51 | void * do_alloc(size_t length, int use_hugepages, size_t nonhuge_alignment, size_t hugepage_req_physaddr, int verbose); 52 | void print_hugepage_physaddr_and_exit(void * mmap_ret); 53 | 54 | // hugepage flag parameter parsing 55 | int parse_hugepage_parameter(const char * optarg); 56 | const char * hugepage_map (int enum_param_value); 57 | 58 | // function prototypes used by osq_lock 59 | uintptr_t get_phys_addr(uintptr_t vaddr); 60 | 61 | #endif 62 | 63 | /* vim: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ 64 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/include/args.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2025, The Linux Foundation. All rights reserved. 3 | * 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are 8 | * met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above 12 | * copyright notice, this list of conditions and the following 13 | * disclaimer in the documentation and/or other materials provided 14 | * with the distribution. 15 | * * Neither the name of The Linux Foundation nor the names of its 16 | * contributors may be used to endorse or promote products derived 17 | * from this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 20 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 21 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 28 | * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 29 | * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #ifndef ARGS_H 33 | #define ARGS_H 34 | 35 | #include "lockhammer.h" 36 | 37 | int parse_args(int argc, char ** argv, test_args_t * pargs, const system_info_t * psysinfo); 38 | int init_sysinfo(system_info_t * psysinfo); 39 | void print_test_args(const test_args_t * p); 40 | 41 | #endif 42 | 43 | /* vim: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ 44 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/include/cpu_relax.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2025, The Linux Foundation. All rights reserved. 
3 | * 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are 8 | * met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above 12 | * copyright notice, this list of conditions and the following 13 | * disclaimer in the documentation and/or other materials provided 14 | * with the distribution. 15 | * * Neither the name of The Linux Foundation nor the names of its 16 | * contributors may be used to endorse or promote products derived 17 | * from this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 20 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 21 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 28 | * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 29 | * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #ifndef CPU_RELAX_H 33 | #define CPU_RELAX_H 34 | 35 | 36 | #ifndef CPU_RELAX_ITERATIONS 37 | #define CPU_RELAX_ITERATIONS 1 38 | #endif 39 | 40 | static inline void __cpu_relax(void) { 41 | for (unsigned long i = 0; i < CPU_RELAX_ITERATIONS; i++) { 42 | #ifdef __aarch64__ 43 | #if defined(RELAX_IS_ISB) 44 | asm volatile ("isb" : : : "memory" ); 45 | #elif defined(RELAX_IS_NOP) 46 | asm volatile ("nop" : : : "memory"); 47 | #elif defined(RELAX_IS_EMPTY) 48 | asm volatile ("" : : : "memory"); 49 | #elif defined(RELAX_IS_NOTHING) 50 | 51 | #endif 52 | #endif // __aarch64__ 53 | 54 | #ifdef __x86_64__ 55 | 56 | #if defined(RELAX_IS_PAUSE) 57 | // RELAX_IS_PAUSE is the implementation for x86 in jdk-9 58 | asm volatile ("rep; nop"); // aka pause 59 | #elif defined(RELAX_IS_EMPTY) 60 | asm volatile ("" : : : "memory"); 61 | #elif defined(RELAX_IS_NOTHING) 62 | 63 | #endif 64 | #endif // __x86_64__ 65 | 66 | } 67 | } 68 | 69 | #endif // CPU_RELAX_H 70 | 71 | /* vim: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ 72 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/include/lockhammer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2025, The Linux Foundation. All rights reserved. 3 | * 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are 8 | * met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above 12 | * copyright notice, this list of conditions and the following 13 | * disclaimer in the documentation and/or other materials provided 14 | * with the distribution. 
15 | * * Neither the name of The Linux Foundation nor the names of its 16 | * contributors may be used to endorse or promote products derived 17 | * from this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 20 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 21 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 28 | * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 29 | * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #ifndef __LOCKHAMMER_H__ 33 | #define __LOCKHAMMER_H__ 34 | 35 | 36 | // PROGRESS_TICK_PROFILE - prints each thread's timer value at lock_acquires milestones to show thread concurrency 37 | #define PROGRESS_TICK_PROFILE 38 | 39 | enum units { NS, 40 | INSTS, NOT_SET }; 41 | typedef enum units Units; 42 | 43 | #define _stringify(x) #x 44 | #define stringify(x) _stringify(x) 45 | 46 | // per_thread_results_t - each thread returns its results in this struct (inside thread_args_t) 47 | typedef struct { 48 | unsigned long cpu_affined; // which CPU this was pinned on. 49 | 50 | unsigned long lock_acquires; // number of locks acquired-and-released per thread 51 | unsigned long cputime_ns; // this thread's CPU time in nanoseconds 52 | unsigned long walltime_ns; // this thread's wall clock time in nanoseconds 53 | unsigned long hmrdepth; // depth=lock-specific notion of contention 54 | 55 | unsigned long hwtimer_start; // timer value at start of measurement loop 56 | unsigned long hwtimer_end; // "" at end 57 | 58 | unsigned long hwtimer_10p; // timer value at 10% of work completion 59 | unsigned long hwtimer_25p; // "" at 25% 60 | unsigned long hwtimer_50p; // "" at 50% 61 | unsigned long hwtimer_75p; // "" at 75% 62 | unsigned long hwtimer_90p; // "" at 90% 63 | 64 | // hold/post durations from calibrate_timer() 65 | double hold_ns, post_ns; 66 | 67 | // metrics only for osq_lock 68 | unsigned long osq_lock_wait_next_spins; 69 | unsigned long osq_unlock_wait_next_spins; 70 | unsigned long osq_lock_locked_spins; 71 | unsigned long osq_lock_unqueue_spins; 72 | unsigned long osq_lock_acquire_backoffs; 73 | 74 | } per_thread_results_t; 75 | 76 | 77 | // thread_args_t -- pointer to an instance of this is passed to each thread 78 | typedef struct { 79 | unsigned long thread_num; // thread number, ordinal 0 80 | unsigned long num_threads; // number of worker threads in total for experiment 81 | unsigned long num_acquires; // -a flag, aka nacqrs, aka number of acquires per thread to do 82 | unsigned long *lock; // pointer to the lock variable 83 | 84 | unsigned long *p_start_ns; // marshal thread's monotonic start time, in ns, for computing wall_elapsed_ns; only the marshal thread sets this 85 | unsigned long hold, post; // ncrit, nparallel 86 | Units hold_unit, post_unit; // NS or INSTS, hold_unit = ncrit_units, post_unit = nparallel_units 87 | unsigned long hold_count; 88 | unsigned long post_count; 89 | 90 | double tickspns; // timer ticks per nanosecond 91 | 92 | unsigned long
run_on_this_cpu; // logical CPU on which a worker thread is to run 93 | 94 | unsigned long run_limit_ticks; // if non-zero, the number of timer ticks to run for when using --run-limit-ticks or --run-limit-seconds 95 | unsigned long run_limit_inner_loop_iters; // the number of lock acquire/release sequences to run before checking the hwtimer when using --run-limit-ticks or --run-limit-seconds 96 | unsigned long hwtimer_frequency; 97 | 98 | int verbose; 99 | unsigned long blackhole_numtries; 100 | 101 | per_thread_results_t results; // output data structure 102 | 103 | } thread_args_t; 104 | 105 | // pinorder_t - describes a set of CPUs on which to run worker threads 106 | typedef struct { 107 | int * cpu_list; // pointer to an array of int. index into this array is the thread number, each element is the logical CPU on which that thread is to run. 108 | size_t num_threads; // number of threads defined for this pinorder (i.e. the number of valid entries in the pinorder array). 109 | } pinorder_t; 110 | 111 | 112 | typedef struct { 113 | unsigned long t; // duration time, either in nanoseconds or iterations 114 | Units unit; // duration unit, either NS or INSTS 115 | } duration_t; 116 | 117 | // test_args_t - mostly command line parameters 118 | typedef struct { 119 | unsigned long num_acquires; // -a number of acquires (not documented?) 120 | duration_t * crits; // -c, --cn=, --ci= critical duration 121 | duration_t * pars; // -p, --pn=, --pi= parallel duration 122 | size_t num_crits; 123 | size_t num_pars; 124 | unsigned long ileave; // -i interleave value for SMT pinning 125 | int scheduling_policy; // -S use explicit scheduling policy 126 | size_t num_pinorders; 127 | pinorder_t * pinorders; // -o CPU pinning order 128 | unsigned long timeout_usec; // -A timeout_usec 129 | 130 | int hugepagesz; 131 | int use_mmap; 132 | int mmap_hugepage_offset_exists; 133 | int print_hugepage_physaddr; 134 | size_t mmap_hugepage_offset; 135 | size_t mmap_hugepage_physaddr; 136 | unsigned long hwtimer_frequency; 137 | unsigned long probed_hwtimer_frequency; 138 | long estimate_hwtimer_freq_cpu; 139 | 140 | double run_limit_seconds; 141 | unsigned long run_limit_ticks; 142 | unsigned long run_limit_inner_loop_iters; 143 | int ignore_unknown_scaling_governor; 144 | int suppress_cpu_frequency_warnings; 145 | const char * cpuorder_filename; 146 | #ifdef JSON_OUTPUT 147 | const char * json_output_filename; 148 | #endif 149 | #ifdef __aarch64__ 150 | char disable_outline_atomics_lse; 151 | #endif 152 | int verbose; 153 | size_t iterations; 154 | size_t blackhole_numtries; 155 | } test_args_t; 156 | 157 | // system_info_t - system configuration data 158 | typedef struct { 159 | unsigned long num_cores; // number of processors configured by the operating system 160 | size_t page_size_bytes; // page size in bytes 161 | size_t erg_bytes; // number of bytes per exclusive reservation granule (e.g.
cache line/block) 162 | 163 | cpu_set_t avail_cores; // cores that the CPU affinity mask allows us to run on 164 | size_t num_avail_cores; // number of cores that the CPU affinity mask allows us to run on 165 | size_t num_online_cores; // the number of cores that getconf _NPROCESSORS_ONLN returns 166 | 167 | // num_online_cores can be less than num_cores because some may be offline or not permitted by affinity mask 168 | // num_avail_cores may be less than num_online_cores because some online cores may be isolated 169 | } system_info_t; 170 | 171 | // locks_t -- pointers to the actual locks to be used 172 | typedef struct { 173 | unsigned long * p_test_lock; // address of main lock 174 | unsigned long * p_ready_lock; // lock to synchronize all threads' entry into hmr() 175 | unsigned long * p_sync_lock; // lock to synchronize before blackhole calibration 176 | unsigned long * p_calibrate_lock; // lock to synchronize after blackhole calibration 177 | } locks_t; 178 | 179 | // calibrate_blackhole -- (used in osq_lock) 180 | unsigned long calibrate_blackhole(unsigned long target, unsigned long tokens_low, unsigned long tokens_high, unsigned long core_id, unsigned long NUMTRIES); 181 | 182 | // evaluate_blackhole -- returns the average duration over NUMTRIES runs 183 | int64_t evaluate_blackhole( const unsigned long tokens_mid, const unsigned long NUMTRIES); 184 | 185 | // blackhole() -- runs a small loop to consume time (also used in osq_lock) 186 | void blackhole(unsigned long iters); 187 | 188 | // measure_setup_initialize_lock() -- calls lock-specific setup routine if it exists 189 | void measure_setup_initialize_lock(locks_t * p_locks, pinorder_t * pinorder); 190 | 191 | // measure_setup_parse_test_args() -- calls lock-specific parsing routine if it exists 192 | void measure_setup_parse_test_args(test_args_t * p_test_args, int argc, char ** argv); 193 | 194 | // convert the struct timespec to only nanoseconds 195 | unsigned long timespec_to_ns (struct timespec * ts); 196 | 197 | // selectively disable LSE instructions in outline atomics/libgcc; in measure.c 198 | void handle_disable_outline_atomics_lse(void); 199 | 200 | #if defined(__clang__) 201 | #define NOINLINE __attribute__((noinline)) 202 | #elif defined(__GNUC__) 203 | #define NOINLINE __attribute__((noinline)) 204 | #else 205 | #define NOINLINE 206 | #endif 207 | 208 | #if defined(__clang__) 209 | #define NO_UNROLL_LOOP _Pragma("clang loop unroll(disable)") 210 | #elif defined(__GNUC__) 211 | #define NO_UNROLL_LOOP _Pragma("GCC unroll 0") 212 | #else 213 | #define NO_UNROLL_LOOP 214 | #endif 215 | 216 | 217 | #endif 218 | 219 | /* vim: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ 220 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/include/perf_timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018, ARM Limited. All rights reserved. 3 | * 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 9 | * Redistributions of source code must retain the above copyright notice, this 10 | * list of conditions and the following disclaimer. 11 | * 12 | * Redistributions in binary form must reproduce the above copyright notice, this 13 | * list of conditions and the following disclaimer in the documentation and/or 14 | * other materials provided with the distribution.
15 | * 16 | * Neither the name of ARM Limited nor the names of its contributors may be used 17 | * to endorse or promote products derived from this software without specific 18 | * prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 28 | * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | * 31 | * Authors: Rob Golshan, 32 | * James Yang (James.Yang@arm.com), 33 | * Geoffrey Blake (Geoffrey.Blake@arm.com) 34 | */ 35 | 36 | /* 37 | * perf_timer.h 38 | * Functions to read hardware timers and query timer frequency. 39 | * Supports x86 and AArch64 platforms 40 | * 41 | * Define DEBUG in makefile or here if you desire debug output, 42 | * define DDEBUG if you require detailed debug output. 43 | */ 44 | 45 | #ifndef __PERF_TIMER_H_ 46 | #define __PERF_TIMER_H_ 47 | 48 | #include <stdio.h> 49 | #include <stdint.h> 50 | 51 | #include <errno.h> 52 | #include <math.h> 53 | #include <unistd.h> /* for access() */ 54 | #include <string.h> 55 | 56 | #include "atomics.h" 57 | 58 | extern __thread uint64_t prev_tsc; 59 | 60 | #define MAX(x, y) (((x) > (y)) ? (x) : (y)) 61 | #define MIN(x, y) (((x) < (y)) ? (x) : (y)) 62 | 63 | /* Cautionary note about using the invariant TSC on x86: 64 | Depending upon the model of CPU, TSC may 65 | not count cycles representing the current 66 | operating frequency. It may, for example, 67 | count cycles at the maximum frequency of the 68 | device, even if the CPU core is running at a 69 | lower frequency, or it may count at a frequency 70 | unrelated to the operating frequency. Use 71 | the --estimate-hwtimer-frequency flag to measure 72 | the frequency and the --hwtimer-frequency flag to 73 | override the value detected by the code below.
74 | */ 75 | #ifdef __x86_64__ 76 | static inline uint64_t __attribute__((always_inline)) 77 | rdtsc(void) 78 | { 79 | union { 80 | uint64_t tsc_64; 81 | struct { 82 | uint32_t lo_32; 83 | uint32_t hi_32; 84 | }; 85 | } tsc; 86 | 87 | asm volatile("rdtsc" : 88 | "=a" (tsc.lo_32), 89 | "=d" (tsc.hi_32)); 90 | 91 | return tsc.tsc_64; 92 | } 93 | 94 | // rdtscp is serializing; rdtsc is not 95 | // NOTE: rdtscp cannot guarantee subsequent instructions do not begin execution 96 | // before the timer is read 97 | static inline uint64_t __attribute__((always_inline)) 98 | rdtscp(void) 99 | { 100 | union { 101 | uint64_t tsc_64; 102 | struct { 103 | uint32_t lo_32; 104 | uint32_t hi_32; 105 | }; 106 | } tsc; 107 | 108 | asm volatile("rdtscp" : 109 | "=a" (tsc.lo_32), 110 | "=d" (tsc.hi_32)); 111 | 112 | return tsc.tsc_64; 113 | } 114 | 115 | static inline void __attribute__((always_inline)) 116 | cpuid(void) 117 | { 118 | uint32_t a, b, c, d; 119 | asm volatile("CPUID": 120 | "=a" (a), 121 | "=b" (b), 122 | "=c" (c), 123 | "=d" (d)); 124 | } 125 | 126 | /* CPUID creates a barrier to avoid out of order execution before rdtsc 127 | */ 128 | static inline uint64_t __attribute__((always_inline)) 129 | rdtscp_start(void) 130 | { 131 | union { 132 | uint64_t tsc_64; 133 | struct { 134 | uint32_t lo_32; 135 | uint32_t hi_32; 136 | }; 137 | } tsc; 138 | 139 | asm volatile("CPUID\n\t" /* serialize */ 140 | "RDTSC\n\t" /* read clock */ 141 | "mov %%edx, %0\n\t" 142 | "mov %%eax, %1\n\t": 143 | "=r" (tsc.hi_32), 144 | "=r" (tsc.lo_32) 145 | ::"eax", "ebx", "ecx", "edx"); 146 | 147 | return tsc.tsc_64; 148 | } 149 | 150 | /* "RDTSCP instruction waits until all previous instructions have been executed 151 | * before reading the counter. However, subsequent instructions may begin execution 152 | * before the read operation is performed." 153 | * CPUID creates a barrier to avoid out of order execution 154 | */ 155 | static inline uint64_t __attribute__((always_inline)) 156 | rdtscp_end(void) 157 | { 158 | union { 159 | uint64_t tsc_64; 160 | struct { 161 | uint32_t lo_32; 162 | uint32_t hi_32; 163 | }; 164 | } tsc; 165 | 166 | asm volatile("RDTSCP\n\t" 167 | "mov %%edx, %0\n\t" 168 | "mov %%eax, %1\n\t" 169 | "CPUID\n\t": 170 | "=r" (tsc.hi_32), 171 | "=r" (tsc.lo_32) 172 | ::"eax", "ebx", "ecx", "edx"); 173 | 174 | return tsc.tsc_64; 175 | 176 | } 177 | 178 | 179 | static inline uint64_t __attribute__((always_inline)) 180 | get_raw_counter(void) { 181 | return rdtsc(); 182 | } 183 | #endif 184 | 185 | 186 | #ifdef __aarch64__ 187 | static inline uint64_t __attribute__((always_inline)) 188 | get_cntvct_el0(void) { 189 | uint64_t t; 190 | asm volatile ("ISB; mrs %0, cntvct_el0" : "=r" (t)); 191 | return t; 192 | } 193 | 194 | 195 | static inline uint64_t __attribute__((always_inline)) 196 | get_raw_counter(void) { 197 | return get_cntvct_el0(); 198 | } 199 | #endif 200 | 201 | 202 | static inline void __attribute__((always_inline)) 203 | timer_reset_counter() 204 | { 205 | #ifdef __aarch64__ 206 | __asm__ __volatile__ ("isb; mrs %0, cntvct_el0" : "=r" (prev_tsc)); 207 | #elif __x86_64__ 208 | prev_tsc = rdtscp(); 209 | #endif 210 | } 211 | 212 | 213 | /* Standard timer read functions */ 214 | static inline uint64_t __attribute__((always_inline)) 215 | timer_get_counter() 216 | { 217 | /* this returns the counter value from a constant-rate timer */ 218 | #ifdef __aarch64__ 219 | uint64_t counter_value; 220 | __asm__ __volatile__ ("isb; mrs %0, cntvct_el0" : "=r" (counter_value)); 221 | #elif __x86_64__ 222 |
uint64_t counter_value = rdtscp(); // assume constant_tsc 223 | #endif 224 | return counter_value; 225 | } 226 | 227 | /* Timer read for when at start of timing block 228 | */ 229 | static inline uint64_t __attribute__((always_inline)) 230 | timer_get_counter_start() 231 | { 232 | /* this returns the counter value from a constant-rate timer */ 233 | #ifdef __aarch64__ 234 | uint64_t counter_value; 235 | __asm__ __volatile__ ("dsb ish; isb; mrs %0, cntvct_el0" : "=r" (counter_value)); 236 | #elif __x86_64__ 237 | uint64_t counter_value = rdtscp_start(); // assume constant_tsc 238 | #endif 239 | return counter_value; 240 | } 241 | 242 | 243 | /* Timer read for when at end of timing block 244 | */ 245 | static inline uint64_t __attribute__((always_inline)) 246 | timer_get_counter_end() 247 | { 248 | /* this returns the counter value from a constant-rate timer */ 249 | #ifdef __aarch64__ 250 | uint64_t counter_value; 251 | __asm__ __volatile__ ("isb; mrs %0, cntvct_el0; isb" : "=r" (counter_value)); 252 | #elif __x86_64__ 253 | uint64_t counter_value = rdtscp_end(); // assume constant_tsc 254 | #endif 255 | return counter_value; 256 | } 257 | 258 | static inline void __attribute__((always_inline)) 259 | timer_reset_all() 260 | { 261 | timer_reset_counter(); 262 | } 263 | 264 | static inline void __attribute__((always_inline)) 265 | timer_init() { 266 | } 267 | 268 | static inline uint64_t __attribute__((always_inline)) 269 | timer_get_timer_freq(void) 270 | { 271 | extern unsigned long hwtimer_frequency; 272 | if (hwtimer_frequency) { return hwtimer_frequency; } 273 | 274 | uint64_t cnt_freq; 275 | #ifdef __aarch64__ 276 | __asm__ __volatile__ ("isb; mrs %0, cntfrq_el0" : "=r" (cnt_freq)); 277 | #elif __x86_64__ 278 | // This code attempts to get the TSC frequency. The assumption made 279 | // is TSC frequency equals the CPUFreq cpuinfo_max_freq attribute 280 | // value, which is the maximum operating frequency of the processor. 281 | // However, this equality is not always true, even less so in newer CPUs. 282 | // Also, the actual TSC frequency may not exactly match any nominal 283 | // frequency attribute value provided by CPUFreq, so the chances of 284 | // this returning the correct frequency have diminished. 285 | 286 | // If the CPUFreq cpuinfo_max_freq attribute is not available, this code 287 | // then tries to quickly measure it. 288 | 289 | // Use the --hwtimer-frequency flag to override the frequency value. 290 | // Use --estimate-hwtimer-frequency to explicitly measure it. 291 | 292 | char buf[100]; 293 | FILE * f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", "r"); 294 | if (f == NULL) { 295 | printf("Failed to open cpuinfo_max_freq, error %s\n", 296 | strerror(errno)); 297 | uint64_t iterations = 2; 298 | uint64_t time = 0; 299 | for (uint64_t i = 0; i < iterations; i++) { 300 | uint64_t start = rdtscp_start(); 301 | sleep(1); 302 | uint64_t end = rdtscp_end(); 303 | time += end - start; 304 | } 305 | 306 | // round down cycles 307 | uint64_t tmp = (time/iterations); 308 | unsigned long len = log10(tmp); 309 | double div = pow(10, len-2); 310 | return floor(tmp/div)*div; 311 | } 312 | while (! feof(f) && ! ferror(f)) { 313 | size_t end = fread(buf, 1, sizeof(buf) - 1, f); 314 | buf[end] = 0; 315 | } 316 | fclose(f); 317 | 318 | /* The ACPI cpufreq driver reports 'base' (aka non-turbo) frequency 319 | in cpuinfo_max_freq while the intel_pstate driver reports the 320 | turbo frequency. Warn if ACPI cpufreq is not found.
*/ 321 | if (access("/sys/devices/system/cpu/cpufreq", F_OK)) { 322 | printf("cpuinfo_max_freq is not from ACPI cpufreq driver! TSC frequency is probably turbo frequency.\n"); 323 | } 324 | 325 | cnt_freq = strtoul(buf, NULL, 0); 326 | cnt_freq = ((cnt_freq + 5000) / 10000) * 10000; /* round to nearest 10000 kHz */ 327 | cnt_freq *= 1000; /* convert kHz to Hz */ 328 | #endif 329 | return cnt_freq; 330 | } 331 | 332 | #define TOKENS_MAX_HIGH 1000000 /* good for ~41500 cntvct cycles */ 333 | #define THRESHOLD 1.05 // if the ratio of cycles for the total eval loop to the sum of the individual 334 | // calls exceeds this (e.g. due to a context switch), rerun 335 | 336 | 337 | #endif 338 | 339 | /* vim: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ 340 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/include/verbose.h: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | */ 6 | 7 | #ifndef VERBOSE_H 8 | #define VERBOSE_H 9 | 10 | enum { 11 | VERBOSE_MORE=3, 12 | VERBOSE_YES=2, 13 | VERBOSE_LOW=1, // default 14 | VERBOSE_NONE=0 // to-be-implemented 15 | }; 16 | 17 | #endif 18 | 19 | /* vim: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ 20 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/scripts/lh_sweepdelay_cfg.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, ARM Limited. All rights reserved. 2 | # 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 11 | # Redistributions in binary form must reproduce the above copyright notice, this 12 | # list of conditions and the following disclaimer in the documentation and/or 13 | # other materials provided with the distribution. 14 | # 15 | # Neither the name of ARM Limited nor the names of its contributors may be used 16 | # to endorse or promote products derived from this software without specific 17 | # prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 27 | # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | # 30 | # The views and conclusions contained in the software and documentation are those 31 | # of the authors and should not be interpreted as representing official policies, 32 | # either expressed or implied, of this project.
33 | 34 | 35 | ## Global Settings 36 | globalcfg: 37 | execdir: ../build 38 | logfile: lockhammer.csv 39 | 40 | 41 | ## Sweep Test Settings 42 | # 43 | # Common assumptions for sweep delay test: 44 | # The system should be able to handle CAS workload at any delay. 45 | # Using eBPF, we find that Nginx lockref inter-arrival delay is less than 32us, 46 | # and lockref_get to lockref_put_return delay is less than 2us. Therefore we 47 | # create this script to sweep the <32us delay range using the -p parameter and keep 48 | # the -c parameter at 1us because eBPF intrinsic overhead is about 1us. We also find 49 | # the ratio between -c and -p is relatively fixed, e.g. 1:8 ~ 1:16. We choose 50 | # 1:9 for the last case (-c=3000ns, -p=28000ns). 51 | # 52 | ## 53 | sweeptest: 54 | enabled: True 55 | safemode: True 56 | cmd: 57 | - lh_cas_lockref 58 | cmd_aarch64: 59 | cmd_x86_64: 60 | repeat: 9 61 | sweepargu: t 62 | argumax: 0 63 | skipsince: 48 64 | skipstep: 8 65 | argulist: 66 | - a: 5000 67 | c: 0ns 68 | p: 0ns 69 | o: lstopo 70 | - a: 5000 71 | c: 200ns 72 | p: 0ns 73 | o: lstopo 74 | - a: 5000 75 | c: 1000ns 76 | p: 0ns 77 | o: lstopo 78 | - a: 5000 79 | c: 200ns 80 | p: 1000ns 81 | o: lstopo 82 | - a: 5000 83 | c: 1000ns 84 | p: 1000ns 85 | o: lstopo 86 | - a: 5000 87 | c: 1000ns 88 | p: 2000ns 89 | o: lstopo 90 | - a: 5000 91 | c: 1000ns 92 | p: 3000ns 93 | o: lstopo 94 | - a: 5000 95 | c: 1000ns 96 | p: 4000ns 97 | o: lstopo 98 | - a: 5000 99 | c: 1000ns 100 | p: 5000ns 101 | o: lstopo 102 | - a: 5000 103 | c: 1000ns 104 | p: 6000ns 105 | o: lstopo 106 | - a: 5000 107 | c: 1000ns 108 | p: 7000ns 109 | o: lstopo 110 | - a: 5000 111 | c: 1000ns 112 | p: 8000ns 113 | o: lstopo 114 | - a: 5000 115 | c: 1000ns 116 | p: 9000ns 117 | o: lstopo 118 | - a: 5000 119 | c: 1000ns 120 | p: 10000ns 121 | o: lstopo 122 | - a: 5000 123 | c: 1000ns 124 | p: 11000ns 125 | o: lstopo 126 | - a: 5000 127 | c: 1000ns 128 | p: 12000ns 129 | o: lstopo 130 | - a: 5000 131 | c: 1000ns 132 | p: 13000ns 133 | o: lstopo 134 | - a: 5000 135 | c: 1000ns 136 | p: 14000ns 137 | o: lstopo 138 | - a: 5000 139 | c: 1000ns 140 | p: 15000ns 141 | o: lstopo 142 | - a: 5000 143 | c: 1000ns 144 | p: 16000ns 145 | o: lstopo 146 | - a: 5000 147 | c: 2000ns 148 | p: 20000ns 149 | o: lstopo 150 | - a: 5000 151 | c: 2000ns 152 | p: 24000ns 153 | o: lstopo 154 | - a: 5000 155 | c: 3000ns 156 | p: 28000ns 157 | o: lstopo 158 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/scripts/lh_sweeptest_cfg.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, ARM Limited. All rights reserved. 2 | # 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 11 | # Redistributions in binary form must reproduce the above copyright notice, this 12 | # list of conditions and the following disclaimer in the documentation and/or 13 | # other materials provided with the distribution. 14 | # 15 | # Neither the name of ARM Limited nor the names of its contributors may be used 16 | # to endorse or promote products derived from this software without specific 17 | # prior written permission.
18 | #
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
27 | # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # The views and conclusions contained in the software and documentation are those
31 | # of the authors and should not be interpreted as representing official policies,
32 | # either expressed or implied, of this project.
33 | 
34 | 
35 | ## Global Settings
36 | globalcfg:
37 |     execdir: ../build
38 |     logfile: lockhammer.csv
39 | 
40 | 
41 | ## Sweep Test Settings
42 | #
43 | # Common assumptions for sweeptest:
44 | # CPU Frequency = 2GHz
45 | # CPU Cycle = 0.5ns
46 | # Remote DRAM (x86_64 or aarch64, NUMA) = 100ns ~ 300ns
47 | # Page Fault (4KB, x86_64) = 2000 cycles = 1000ns
48 | # Therefore we set the critical section delay (-c) to 0ns, 200ns and 1000ns.
49 | # We also set the post-critical-section delay (-p) to 5x the (-c) value.
50 | # By default, sweeptest sweeps sweepargu (-t) from 1 to the max core count.
51 | #
52 | ##
53 | sweeptest:
54 |     enabled: True
55 |     safemode: True
56 |     cmd:
57 |         - lh_cas_event_mutex
58 |         - lh_cas_lockref
59 |         - lh_cas_rw_lock
60 |         - lh_empty
61 |         - lh_event_mutex
62 |         - lh_incdec_refcount
63 |         - lh_jvm_objectmonitor
64 |         - lh_osq_lock
65 |         - lh_queued_spinlock
66 |         - lh_swap_mutex
67 |         - lh_tbb_spin_rw_mutex
68 |         - lh_ticket_spinlock
69 |         - lh_clh_spinlock
70 |     cmd_aarch64: [lh_hybrid_spinlock, lh_hybrid_spinlock_fastdequeue]
71 |     cmd_x86_64:
72 |     repeat: 9
73 |     sweepargu: t
74 |     argumax: 0
75 |     skipsince: 48
76 |     skipstep: 8
77 |     argulist:
78 |         - a: 5000
79 |           c: 0ns
80 |           p: 0ns
81 |           o: lstopo
82 |         - a: 5000
83 |           c: 200ns
84 |           p: 0ns
85 |           o: lstopo
86 |         - a: 5000
87 |           c: 1000ns
88 |           p: 0ns
89 |           o: lstopo
90 |         - a: 5000
91 |           c: 200ns
92 |           p: 1000ns
93 |           o: lstopo
94 |         - a: 5000
95 |           c: 1000ns
96 |           p: 5000ns
97 |           o: lstopo
98 | 
--------------------------------------------------------------------------------
/benchmarks/lockhammer/scripts/lh_unittest_cfg.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2018, ARM Limited. All rights reserved.
2 | #
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | #
5 | # Redistribution and use in source and binary forms, with or without
6 | # modification, are permitted provided that the following conditions are met:
7 | #
8 | # Redistributions of source code must retain the above copyright notice, this
9 | # list of conditions and the following disclaimer.
10 | #
11 | # Redistributions in binary form must reproduce the above copyright notice, this
12 | # list of conditions and the following disclaimer in the documentation and/or
13 | # other materials provided with the distribution.
14 | #
15 | # Neither the name of ARM Limited nor the names of its contributors may be used
16 | # to endorse or promote products derived from this software without specific
17 | # prior written permission.
18 | #
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
27 | # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # The views and conclusions contained in the software and documentation are those
31 | # of the authors and should not be interpreted as representing official policies,
32 | # either expressed or implied, of this project.
33 | 
34 | 
35 | ## Global Settings
36 | globalcfg:
37 |     execdir: ../build
38 |     logfile: lockhammer.csv
39 | 
40 | 
41 | ## Unittest Settings
42 | #
43 | # Common assumptions for unittest:
44 | # Only cover functional correctness, using as little time as possible.
45 | # Normal runtime should be around 1 minute.
46 | # t=0 means the maximum core count.
47 | # o=lstopo means using the lstopo output as the preferred thread pinning order.
48 | #
49 | ##
50 | unittest:
51 |     enabled: True
52 |     safemode: True
53 |     testcase:
54 |         - cmd:
55 |             - lh_cas_event_mutex
56 |             - lh_cas_lockref
57 |             - lh_cas_rw_lock
58 |             - lh_empty
59 |             - lh_event_mutex
60 |             - lh_incdec_refcount
61 |             - lh_jvm_objectmonitor
62 |             - lh_osq_lock
63 |             - lh_queued_spinlock
64 |             - lh_swap_mutex
65 |             - lh_tbb_spin_rw_mutex
66 |             - lh_ticket_spinlock
67 |           cmd_aarch64: [lh_hybrid_spinlock, lh_hybrid_spinlock_fastdequeue]
68 |           cmd_x86_64:
69 |           t: [1, 0]
70 |           a: 100
71 |           c: [0ns, 50ns]
72 |           p: [0ns, 50ns]
73 | 
74 |         - cmd: lh_osq_lock
75 |           t: [1, 0]
76 |           a: 100
77 |           c: 50ns
78 |           p: 0ns
79 |           o: lstopo
80 |           extra:
81 |               u: 10
82 |               s: 2
83 | 
84 |         - cmd: lh_tbb_spin_rw_mutex
85 |           t: [1, 0]
86 |           a: 100
87 |           c: 50ns
88 |           p: 0ns
89 |           i: 1
90 |           o: '0:1:2:3'
91 |           extra:
92 |               r: 4
93 |               m: 1
94 | 
--------------------------------------------------------------------------------
/benchmarks/lockhammer/scripts/lockhammer-all.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/benchmarks/lockhammer/scripts/lockhammer-all.csv.xz
--------------------------------------------------------------------------------
/benchmarks/lockhammer/scripts/lockhammer-jupyter-notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# License"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "Copyright (c) 2018, ARM Limited.
All rights reserved.\n", 15 | "\n", 16 | "SPDX-License-Identifier: BSD-3-Clause\n", 17 | "\n", 18 | "Redistribution and use in source and binary forms, with or without\n", 19 | "modification, are permitted provided that the following conditions are met:\n", 20 | "\n", 21 | "Redistributions of source code must retain the above copyright notice, this\n", 22 | "list of conditions and the following disclaimer.\n", 23 | "\n", 24 | "Redistributions in binary form must reproduce the above copyright notice, this\n", 25 | "list of conditions and the following disclaimer in the documentation and/or\n", 26 | "other materials provided with the distribution.\n", 27 | "\n", 28 | "Neither the name of ARM Limited nor the names of its contributors may be used\n", 29 | "to endorse or promote products derived from this software without specific\n", 30 | "prior written permission.\n", 31 | "\n", 32 | "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n", 33 | "AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n", 34 | "IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n", 35 | "DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n", 36 | "FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n", 37 | "DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n", 38 | "SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n", 39 | "CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR\n", 40 | "TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n", 41 | "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "# Prerequisite Libraries" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "- ## python3\n", 56 | "`apt install python3 python3-pip`\n", 57 | "- ## jupyter-notebook\n", 58 | "`apt install jupyter-notebook`\n", 59 | "- ## matplotlib seaborn pandas numpy\n", 60 | "`pip3 install matplotlib seaborn pandas numpy`" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "# Matplotlib and Seaborn Settings" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "%matplotlib inline\n", 79 | "import matplotlib\n", 80 | "import matplotlib.pyplot as plt\n", 81 | "import seaborn as sns\n", 82 | "import pandas as pd\n", 83 | "import numpy as np\n", 84 | "import warnings\n", 85 | "warnings.filterwarnings('ignore')\n", 86 | "sns.set()\n", 87 | "\n", 88 | "# default 12 colors and markers\n", 89 | "default_palette = [\n", 90 | " '#765f97', #Purple\n", 91 | " '#1b9e77', #Dark Green\n", 92 | " '#8c5c20', #Brown\n", 93 | " '#0038bd', #Blue\n", 94 | " '#cf364a', #Red\n", 95 | " '#343434', #Jet Black\n", 96 | " '#878681', #Titanium Gray\n", 97 | " '#f561dd', #Magenta\n", 98 | " '#a6cee3', #Calico Blue\n", 99 | " '#dea0dd', #Plum\n", 100 | " '#7fc97f', #Grass Green\n", 101 | " '#fdc086', #Pale Yellow\n", 102 | " ]\n", 103 | "\n", 104 | "default_markers=['^', '*', 'd', 'x', 'D', 'o', 'v', 's', 'p', '>', '<', '.']\n", 105 | "default_marker_size = 100\n", 106 | "\n", 107 | "# seaborn settings\n", 108 | "sns.set(context=\"notebook\", style=\"darkgrid\", font_scale=2, rc={\"lines.linewidth\": 3, \"xtick.major.size\": 
4, \"ytick.major.size\": 4})\n", 109 | "sns.set_palette(default_palette)\n", 110 | "sns.palplot(sns.color_palette())" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "# Lockhammer Common Settings" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "# common variables\n", 129 | "lock_workloads = [\"cas_event_mutex\", \"cas_lockref\", \"cas_rw_lock\", \"incdec_refcount\", \"osq_lock\", \"queued_spinlock\", \"ticket_spinlock\", \"jvm_objectmonitor\", \"swap_mutex\", \"tbb_spin_rw_mutex\", \"event_mutex\", \"empty\"]\n", 130 | "lock_hosts = [\"x86-1\", \"x86-2\", \"x86-3\", \"x86-4\"]\n", 131 | "lock_parameters = [[\"-c\", \"200ns\", \"-p\", \"1000ns\"]]\n", 132 | "exectx_count_max = 88\n", 133 | "exectx_count_gap = 4\n", 134 | "lh_csv_result_header = [\"num_threads\", \"avg_exectx\", \"scheduled_time_per_access\", \"real_time_per_access\", \"access_rate\", \"avg_lock_depth\",\n", 135 | " \"date\", \"fqdn\", \"exec\", \"t\", \"tv\", \"a\", \"av\", \"c\", \"cv\", \"p\", \"pv\", \"o\", \"ov\"]\n", 136 | "\n", 137 | "param_name = \"contended_system_latency (ns)\"\n", 138 | "lock_yaxis_name = param_name\n", 139 | "\n", 140 | "# lockhammer-all.csv.xz is a xz-compressed aggregated file of different machines' raw csv result\n", 141 | "raw_csv_filename = \"lockhammer-all.csv.xz\"\n", 142 | "raw_df = pd.read_csv(raw_csv_filename, sep=', ', header=None, names=lh_csv_result_header, engine='python')\n", 143 | "\n", 144 | "# common functions\n", 145 | "def plot_lines_only(dataf):\n", 146 | " # each test repeat 9 times, but we only plot the median latency (access_rate)\n", 147 | " median_list = []\n", 148 | " for hst, grp0 in dataf.groupby(\"host\"):\n", 149 | " for nth, grp1 in grp0.groupby(\"num_threads\"):\n", 150 | " median_list.append({\"host\": hst, \"num_threads\": nth, param_name: grp1.median()[param_name]})\n", 151 | " median_df = pd.DataFrame(median_list)\n", 152 | " \n", 153 | " from matplotlib.colors import ListedColormap\n", 154 | " cmap = ListedColormap(sns.color_palette(default_palette).as_hex())\n", 155 | " for i, (hst, grp2) in enumerate(median_df.groupby(\"host\")):\n", 156 | " plt.plot(\"num_threads\", param_name, data=grp2, color=cmap(i))" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "# Lockhammer all workloads, raw contended system latency, 2018.11.06." 
164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false, 171 | "scrolled": false 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "# use lmplot (not catplot) to plot raw system latencies\n", 176 | "for param in lock_parameters:\n", 177 | " for workload in lock_workloads:\n", 178 | " tidy_df = pd.DataFrame()\n", 179 | " for sut in sorted(lock_hosts):\n", 180 | " host_df = raw_df.loc[(raw_df['fqdn'].str.startswith(sut) & raw_df['exec'].str.endswith(workload))]\n", 181 | " test_df = host_df.loc[(host_df['cv'] == param[1]) & (host_df['pv'] == param[3])]\n", 182 | " copy_df = test_df.copy()\n", 183 | " copy_df['host'] = sut\n", 184 | " all_df = pd.melt(copy_df, id_vars=['host', 'num_threads'], value_vars=['access_rate'], value_name=param_name)\n", 185 | " tidy_df = pd.concat([tidy_df, all_df])\n", 186 | " \n", 187 | " # because lmplot doesn't plot lines, we have to use plot_lines_only to plot them\n", 188 | " sns.lmplot(x=\"num_threads\", y=param_name, hue=\"host\", data=tidy_df, x_estimator=np.median, x_ci=50,\n", 189 | " height=10, aspect=2, fit_reg=False, markers=default_markers[:len(lock_hosts)], scatter_kws={\"s\": default_marker_size})\n", 190 | " \n", 191 | " # plot lines which connect lmplot dots\n", 192 | " plot_lines_only(tidy_df)\n", 193 | " \n", 194 | " # change title / axis and save the figure\n", 195 | " plt.title(\"lockhammer workload: {}, critical_time: {}, parallel_time: {}\".format(workload, param[1], param[3]))\n", 196 | " plt.xlim(0, exectx_count_max)\n", 197 | " plt.xticks(np.arange(0, exectx_count_max+1, exectx_count_gap))\n", 198 | " plt.savefig(\"github_lockhammer_all_common_20181106_{}_{}_{}.png\".format(workload, param[1], param[3]))" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "Python 3", 214 | "language": "python", 215 | "name": "python3" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": { 219 | "name": "ipython", 220 | "version": 3 221 | }, 222 | "file_extension": ".py", 223 | "mimetype": "text/x-python", 224 | "name": "python", 225 | "nbconvert_exporter": "python", 226 | "pygments_lexer": "ipython3", 227 | "version": "3.6.3" 228 | } 229 | }, 230 | "nbformat": 4, 231 | "nbformat_minor": 2 232 | } 233 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/scripts/run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-FileCopyrightText: Copyright 2019-2025 Arm Limited and/or its affiliates 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | 6 | # This script invokes lockhammer tests. 7 | # This script is meant to be edited for customizing which tests to run. 8 | # Edit the *_LIST variables below to choose the configuration combinations. 9 | # A json file will be made for each variant and test combination. 
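# One illustrative way to run it (a sketch, not text from the script itself):
# the hugepage check further down requires a free 1GB hugepage, and the reserve
# command below mirrors the hint the script prints when none is available,
# using the path implied by HUGEPAGE_SIZE_KB=$((1024*1024)).
#
#   # reserve one more 1GB hugepage for the benchmark runs
#   echo $(( $(cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages) + 1 )) | sudo tee -a /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
#
#   # then run the selected variant/test/delay combinations
#   ./run-tests.sh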
10 | 
11 | set -e
12 | 
13 | usage() {
14 | cat<
47 | :) >&2 echo "ERROR: flag -$OPTARG required an argument, but none was given"
48 |    usage
49 |    ;;
50 | *) echo name=$name, OPTARG=$OPTARG
51 |    usage
52 |    ;;
53 | esac
54 | done
55 | 
56 | shift $((OPTIND-1))
57 | 
58 | # make the list of build variants to run; use # to comment out a variant
59 | 
60 | # TODO: update lists by arch
61 | 
62 | VARIANT_LIST=$(grep -v -E '\#|^$' <<'EOF1'
63 | builtin.cond_load.relax_empty
64 | builtin.cond_load.relax_nothing
65 | builtin.cond_load.relax_pause
66 | builtin.relax_empty
67 | builtin.relax_nothing
68 | builtin.relax_pause
69 | cond_load.relax_empty
70 | cond_load.relax_nothing
71 | cond_load.relax_pause
72 | relax_empty
73 | relax_nothing
74 | relax_pause
75 | 
76 | 
77 | #builtin.relax_nothing
78 | #builtin.relax_isb
79 | #builtin.cond_load.relax_nothing
80 | #builtin.cond_load.relax_isb
81 | 
82 | #lse.builtin.relax_nothing
83 | #lse.builtin.relax_isb
84 | #lse.builtin.cond_load.relax_nothing
85 | #lse.builtin.cond_load.relax_isb
86 | EOF1
87 | )
88 | 
89 | # make the list of tests to run; use # to comment out a test
90 | 
91 | TEST_LIST=$(grep -v -E '\#|^$' <<'EOF2'
92 | lh_cas_event_mutex
93 | lh_cas_lockref
94 | lh_cas_rw_lock
95 | #lh_clh_spinlock
96 | #lh_empty
97 | #lh_event_mutex
98 | #lh_hybrid_spinlock
99 | #lh_hybrid_spinlock_fastdequeue
100 | lh_incdec_refcount
101 | lh_jvm_objectmonitor
102 | lh_osq_lock
103 | #lh_queued_spinlock
104 | #lh_swap_mutex
105 | #lh_tbb_spin_rw_mutex
106 | lh_ticket_spinlock
107 | EOF2
108 | )
109 | 
110 | CRIT_NS_LIST=$(grep -v -E '\#|^$' <<'EOF_CRIT_NS'
111 | 0
112 | 500
113 | 1000
114 | EOF_CRIT_NS
115 | )
116 | 
117 | PAR_NS_LIST=$(grep -v -E '\#|^$' <<'EOF_PAR_NS'
118 | 0
119 | 500
120 | 1000
121 | 2000
122 | 4000
123 | EOF_PAR_NS
124 | )
125 | 
126 | PAR=""
127 | for a in $PAR_NS_LIST; do PAR+="-p${a}ns "; done
128 | 
129 | CRIT=""
130 | for a in $CRIT_NS_LIST; do CRIT+="-c${a}ns "; done
131 | 
132 | 
133 | # check that a hugepage is available
134 | #HUGEPAGE_SIZE=32MB
135 | #HUGEPAGE_SIZE_KB=$((32*1024))
136 | HUGEPAGE_SIZE=1GB
137 | HUGEPAGE_SIZE_KB=$((1024*1024))
138 | HUGEPAGES_DIR=/sys/kernel/mm/hugepages/hugepages-${HUGEPAGE_SIZE_KB}kB
139 | FREE_HUGEPAGES_FILE=$HUGEPAGES_DIR/free_hugepages
140 | NR_HUGEPAGES_FILE=$HUGEPAGES_DIR/nr_hugepages
141 | NR_HUGEPAGES=$(cat "$NR_HUGEPAGES_FILE")
142 | NR_HUGEPAGES_PLUS_ONE=$((NR_HUGEPAGES+1))
143 | if [ ! -e "$FREE_HUGEPAGES_FILE" ] || [ $(cat "$FREE_HUGEPAGES_FILE") -eq 0 ]; then
144 |     echo "ERROR: no free $HUGEPAGE_SIZE hugepages. Perhaps try running:"
145 |     echo "echo $NR_HUGEPAGES_PLUS_ONE | sudo tee -a $NR_HUGEPAGES_FILE"
146 |     exit -1
147 | fi
148 | HUGEPAGE_FLAGS="--hugepage-size $HUGEPAGE_SIZE"
149 | 
150 | 
151 | # determine cpuorder file to use based on hostname.
152 | HOSTNAME_S=$(hostname -s)
153 | CPUORDER_FLAGS=
154 | if [ -e hostname_to_cpuorder_type.sh ]; then
155 |     . hostname_to_cpuorder_type.sh
156 | 
157 |     CPUORDER_TYPE=$(hostname_to_cpuorder_type $HOSTNAME_S)
158 |     CPUORDER=cpuorders/$CPUORDER_TYPE.cpuorder
159 | 
160 |     if [ ! -e "$CPUORDER" ]; then
161 |         echo "ERROR: $CPUORDER does not exist!"
162 |         exit -1
163 |     fi
164 | 
165 |     CPUORDER_FLAGS="-C $CPUORDER"
166 | fi
167 | 
168 | # compute the number of threads using the number of available processors
169 | NPROC=$(nproc)
170 | TLIST=
171 | for num_threads in 2 4 $(eval echo "{8..$NPROC..$CPU_SKIP}")
172 | do
173 |     if [ $num_threads -gt $NPROC ]; then
174 |         break
175 |     fi
176 | 
177 |     TLIST+="-t $num_threads "
178 | done
179 | 
180 | 
181 | # compute the number of tests and variants
182 | NUM_TESTS=$(echo $TEST_LIST | wc -w)
183 | NUM_VARIANTS=$(echo $VARIANT_LIST | wc -w)
184 | #echo NUM_VARIANTS=$NUM_VARIANTS NUM_TESTS=$NUM_TESTS
185 | NUM_TEST_AND_VARIANTS=$((NUM_TESTS*NUM_VARIANTS))
186 | TEST_AND_VARIANT_COUNT=0
187 | #echo NUM_TEST_AND_VARIANTS=$NUM_TEST_AND_VARIANTS
188 | 
189 | #exit 0
190 | 
191 | # change newline to space for the summary
192 | TEST_LIST=${TEST_LIST//$'\n'/ }
193 | VARIANT_LIST=${VARIANT_LIST//$'\n'/ }
194 | PAR_NS_LIST=${PAR_NS_LIST//$'\n'/ }
195 | CRIT_NS_LIST=${CRIT_NS_LIST//$'\n'/ }
196 | 
197 | cat<
--------------------------------------------------------------------------------
/benchmarks/lockhammer/scripts/show-per-thread-lock-acquires.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # SPDX-FileCopyrightText: Copyright 2019-2025 Arm Limited and/or its affiliates
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | 
6 | # This script shows the per-thread fairness of lock acquires in the result json(s).
7 | 
8 | # show-per-thread-lock-acquires.sh result1.json [result2.json ...]
9 | 
10 | # add this to filter only one set of crit/par
11 | #.results[]|select(.nominal_critical==0 and .nominal_parallel==0)|
12 | 
13 | read -r -d '' CMD <<'EOF'
14 | .results[]|"\(.nominal_critical)\t\(.nominal_parallel)\t\(.num_threads)\t\(.full_concurrency_fraction*10000|round/10000)\t\(.lock_acquires_mean | round )\t\(.lock_acquires_stddev_over_mean * 10000 | round / 10000)\t\(.per_thread_stats | map(.lock_acquires) | sort | join(","))"
15 | EOF
16 | 
17 | #echo "$CMD"
18 | #exit
19 | 
20 | (
21 | echo -e "crit\tpar\tnthrds\tfcf\tlock_acquires_mean\tlock_acquires_stddev/mean\tlock_acquires_each_thread\n"
22 | jq -r "$CMD" "$@"
23 | ) | column -t
24 | 
--------------------------------------------------------------------------------
/benchmarks/lockhammer/scripts/sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Copyright (c) 2017, The Linux Foundation. All rights reserved.
4 | #
5 | # Redistribution and use in source and binary forms, with or without
6 | # modification, are permitted provided that the following conditions are
7 | # met:
8 | #     * Redistributions of source code must retain the above copyright
9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following
12 | #       disclaimer in the documentation and/or other materials provided
13 | #       with the distribution.
14 | #     * Neither the name of The Linux Foundation nor the names of its
15 | #       contributors may be used to endorse or promote products derived
16 | #       from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
19 | # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
21 | # ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS 22 | # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 25 | # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 26 | # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 27 | # OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 28 | # IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | cores=$(grep -c "^processor" /proc/cpuinfo) 31 | cores_q1=$(($cores / 4)) 32 | cores_q2=$(($cores / 2)) 33 | cores_q3=$(($cores_q1 + $cores_q2)) 34 | cores_all="`seq 48` `seq 8 8 $(($cores))` $cores_q1 $cores_q2 $cores_q3 $cores" 35 | cores_sort=$(echo $cores_all | tr ' ' '\n' | sort -nu) 36 | for c in $cores_sort 37 | do 38 | if (( $c <= $cores )) 39 | then 40 | acquires=50000 41 | if (( $c > 8 )) 42 | then 43 | acquires=$((${acquires}*8/$c)) 44 | if (( $acquires < 1000 )) 45 | then 46 | acquires=1000 47 | fi 48 | fi 49 | 50 | echo Test: ${1} CPU: exectx=$c Date: `date` 1>&2 51 | sudo ../build/lh_${1} -t $c -a ${acquires} -c ${2} -p ${3} 52 | sleep 5s 53 | fi 54 | done 55 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/scripts/view-results-json.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-FileCopyrightText: Copyright 2019-2025 Arm Limited and/or its affiliates 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | 6 | # This script displays the values from one or more lockhammer json files in a table format using jq. 7 | 8 | # XXX: can't differentiate between ns vs. inst for crit/par; please select only using the same units! 9 | 10 | SORT_STRING='.num_threads' 11 | REVERSE=0 12 | DUMP_DATA=0 13 | 14 | declare -a CRIT 15 | declare -a PAR 16 | declare -a NUM_THREADS 17 | declare -a VARIANT_NAMES 18 | 19 | usage() { 20 | cat<<"USAGE" 21 | 22 | ./view-results-json.sh [options] json [json ...] 
23 | 
24 | select options:
25 |     -c crit             nominal critical time/inst parameter (repeatable)
26 |     -p par              nominal parallel time/inst parameter (repeatable)
27 |     -t num_threads      number of threads (repeatable)
28 |     -v variant_name     variant name (repeatable)
29 | 
30 | sort options:
31 |     -s sort_string      sort string (default is by '.num_threads')
32 |     -s help             print short header to .key mapping
33 |     -r                  reverse the sort
34 | 
35 | output options:
36 |     -D                  dump the records in a json array
37 | 
38 |     -h                  print this usage help message
39 | 
40 | 
41 | Example:
42 | 
43 | # list all data with threads=8, parallel=1000 or parallel=500, and critical=0
44 | # from files *osq_lock*.json, sort by overhead %
45 | 
46 | ./view-results-json.sh -s overhead_% -t 8 -p 1000 -p 500 -c 0 *osq_lock*.json
47 | 
48 | USAGE
49 | exit 1
50 | }
51 | 
52 | 
53 | shopt -s extglob
54 | 
55 | while getopts ":c:p:t:v:s:rDh" name; do
56 |     case "${name}" in
57 |         c) CRIT+=(${OPTARG})
58 |            ;;
59 |         p) PAR+=(${OPTARG})
60 |            ;;
61 |         t) NUM_THREADS+=(${OPTARG})
62 |            ;;
63 |         v) VARIANT_NAMES+=(${OPTARG})
64 |            ;;
65 |         s) SORT_STRING=${OPTARG}
66 |            ;;
67 |         r) REVERSE=1
68 |            ;;
69 |         D) DUMP_DATA=1
70 |            ;;
71 |         h) usage
72 |            ;;
73 |         :) >&2 echo "ERROR: flag -$OPTARG required an argument, but none was given"
74 |            usage
75 |            ;;
76 |         *) echo "ERROR: unknown flag name=$name, OPTARG=$OPTARG"
77 |            usage
78 |            ;;
79 |     esac
80 | done
81 | 
82 | shift $((OPTIND-1))
83 | 
84 | FILES="$@"
85 | 
86 | if [ -z "$FILES" ]; then
87 |     echo "no json files given; run with -h for usage help"
88 |     exit -1
89 | fi
90 | 
91 | # -----------------------------------------------------------------------------
92 | # jq filter stages. Write as separate single-quoted strings so that escapes are not needed (i.e., do not use escapes!).
93 | #
94 | # reducer - puts data from all the json into an array with some modifications
95 | # selector - selects the data from the array that match the command line criteria
96 | # sorter - sort the selected data by the sorting criteria
97 | # filter - convert the sorted data into formatted output
98 | 
99 | # ----------------------------------
100 | # Reducer gets the .results[] array from each json, and, for each results
101 | # element/object, deletes the pinorder and per_thread_stats, and adds an
102 | # .input_filename to the object. The output is a single array of results
103 | # elements.
104 | 
105 | REDUCER='reduce inputs as $s ([]; . += [$s.results[] | del(.pinorder) | del(.per_thread_stats) | . += {"input_filename":input_filename}])'
+= {"input_filename":input_filename}])' 106 | 107 | 108 | # ---------------------------------- 109 | # Select the records with the requested element values 110 | 111 | make_selector() { 112 | local NAME="$1" 113 | shift 114 | local AS_STRING=0 115 | if [ "$1" = "as_string" ]; then 116 | AS_STRING=1 117 | shift 118 | elif [ "$1" = "as_number" ]; then 119 | AS_STRING=0 120 | shift 121 | fi 122 | 123 | local ARRAY=("$@") 124 | local ARRAY_SELECTOR= 125 | 126 | if [ ${#ARRAY[@]} -eq 0 ]; then 127 | return 128 | fi 129 | 130 | for a in ${ARRAY[@]}; do 131 | if [ -n "$ARRAY_SELECTOR" ]; then ARRAY_SELECTOR+=" or "; fi 132 | if [ $AS_STRING -eq 1 ]; then 133 | ARRAY_SELECTOR+=".${NAME}==\"${a}\"" 134 | else 135 | ARRAY_SELECTOR+=".${NAME}==${a}" 136 | fi 137 | done 138 | 139 | echo " and ($ARRAY_SELECTOR)" 140 | } 141 | 142 | SELECTOR_ARGLIST="true" 143 | SELECTOR_ARGLIST+=$(make_selector nominal_parallel "${PAR[@]}") 144 | SELECTOR_ARGLIST+=$(make_selector nominal_critical "${CRIT[@]}") 145 | SELECTOR_ARGLIST+=$(make_selector num_threads "${NUM_THREADS[@]}") 146 | SELECTOR_ARGLIST+=$(make_selector variant_name as_string "${VARIANT_NAMES[@]}") 147 | 148 | SELECTOR=' [.[] | select('$SELECTOR_ARGLIST')] ' 149 | 150 | 151 | # ---------------------------------- 152 | # Sort; output is an array 153 | 154 | # for -s sort_string flag, map it to these fields. TODO: reverse SPECIAL_HEADER array instead of hard-coding 155 | declare -A SHORT_HEADER 156 | SHORT_HEADER[cputime_ns/lock]=".cputime_ns_per_lock_acquire" 157 | SHORT_HEADER[cpu_ns/lock]=".cputime_ns_per_lock_acquire" 158 | SHORT_HEADER[wall_ns/lock]=".wall_elapsed_ns_per_lock_acquire" 159 | SHORT_HEADER[fcf]=".full_concurrency_fraction" 160 | SHORT_HEADER[nom_par]=".nominal_parallel" 161 | SHORT_HEADER[nom_crit]=".nominal_critical" 162 | SHORT_HEADER[par_ns]=".avg_parallel_ns_per_loop" 163 | SHORT_HEADER[crit_ns]=".avg_critical_ns_per_loop" 164 | SHORT_HEADER[overhead_ns]=".avg_lock_overhead_cputime_ns" 165 | SHORT_HEADER[overhead_%]=".lock_overhead_cputime_percent" 166 | SHORT_HEADER[locks/wall_sec]=".total_lock_acquires_per_second" 167 | SHORT_HEADER[num_threads]=".num_threads" 168 | SHORT_HEADER[json]=".input_filename" 169 | SHORT_HEADER[host]=".hostname" 170 | SHORT_HEADER[lasom]=".lock_acquires_stddev_over_mean" 171 | 172 | # print SHORT_HEADER as a table 173 | if [[ $SORT_STRING == "help" ]]; then 174 | (echo "sort_key sort_string"; 175 | for key in "${!SHORT_HEADER[@]}" ; do 176 | echo "$key ${SHORT_HEADER[$key]}" 177 | done) | column -t 178 | exit -1 179 | fi 180 | 181 | if [[ -v SHORT_HEADER[$SORT_STRING] ]]; then 182 | SORT_STRING="${SHORT_HEADER[$SORT_STRING]}" 183 | elif [[ ! $SORT_STRING =~ ^\. ]]; then 184 | # we check for this to allow for complex multikey comma-separated sort string to be passed in as an argument. 185 | echo "ERROR: SORT_STRING does not being with a . and is not one of the SHORT_HEADER keys, so it's probably not referring to a results variable." 186 | exit -1 187 | fi 188 | 189 | #SORTER='sort_by(.cputime_ns_per_lock_acquire) ' 190 | #SORTER='sort_by(.num_threads) ' 191 | SORTER='sort_by('$SORT_STRING')' 192 | if [ $REVERSE -eq 1 ]; then 193 | SORTER+=' | reverse' 194 | fi 195 | 196 | # json output from jq 197 | if [ $DUMP_DATA -eq 1 ]; then 198 | exec jq -n -r "$REDUCER | $SELECTOR | $SORTER | . " $FILES 199 | fi 200 | 201 | 202 | # the rest of this is for the tabulated output 203 | 204 | # ---------------------------------- 205 | # Construct KEY_LIST, an array defining the order of the columns. 
206 | # These are typically keynames from entries in the .results[] of a json or, if there's a corresponding entry in SPECIAL_HEADER or SPECIAL_FILTER, what to show instead.
207 | # If the row begins with #, the metric is omitted.
208 | read -r -d '' -a KEY_LIST <<'EOF_KEY_LIST'
209 | test_name
210 | variant_name
211 | num_threads
212 | nominal_critical
213 | nominal_parallel
214 | cputime_ns_per_lock_acquire
215 | avg_critical_ns_per_loop
216 | avg_parallel_ns_per_loop
217 | avg_lock_overhead_cputime_ns
218 | lock_overhead_cputime_percent
219 | full_concurrency_fraction
220 | lock_acquires_stddev_over_mean
221 | host
222 | #json
223 | wall_elapsed_ns_per_lock_acquire
224 | total_lock_acquires_per_second
225 | EOF_KEY_LIST
226 | 
227 | # SPECIAL_HEADER is what to print in the header for a key name. If the key does not exist, then the key name is used as the header.
228 | declare -A SPECIAL_HEADER
229 | SPECIAL_HEADER[cputime_ns_per_lock_acquire]="cpu_ns/lock"
230 | SPECIAL_HEADER[wall_elapsed_ns_per_lock_acquire]="wall_ns/lock"
231 | SPECIAL_HEADER[full_concurrency_fraction]="fcf"
232 | SPECIAL_HEADER[avg_parallel_ns_per_loop]="par_ns"
233 | SPECIAL_HEADER[avg_critical_ns_per_loop]="crit_ns"
234 | SPECIAL_HEADER[avg_lock_overhead_cputime_ns]="overhead_ns"
235 | SPECIAL_HEADER[lock_overhead_cputime_percent]="overhead_%"
236 | SPECIAL_HEADER[total_lock_acquires_per_second]="locks/wall_sec"
237 | SPECIAL_HEADER[lock_acquires_stddev_over_mean]="lasom"
238 | SPECIAL_HEADER[nominal_critical]="nom_crit"
239 | SPECIAL_HEADER[nominal_parallel]="nom_par"
240 | 
241 | # SPECIAL_FILTER is how to have jq format the element. If the key does not exist, then .key is used for the filter.
242 | declare -A SPECIAL_FILTER
243 | SPECIAL_FILTER[cputime_ns_per_lock_acquire]='\(.cputime_ns_per_lock_acquire|round)'
244 | SPECIAL_FILTER[wall_elapsed_ns_per_lock_acquire]='\(.wall_elapsed_ns_per_lock_acquire|round)'
245 | SPECIAL_FILTER[full_concurrency_fraction]='\(.full_concurrency_fraction * 100 | round / 100)'
246 | SPECIAL_FILTER[host]='\(.hostname | split(".") | .[0])'
247 | SPECIAL_FILTER[json]='\(.input_filename | split(".") | .[:-1] | join("."))'
248 | SPECIAL_FILTER[avg_critical_ns_per_loop]='\(.avg_critical_ns_per_loop | round)'
249 | SPECIAL_FILTER[avg_parallel_ns_per_loop]='\(.avg_parallel_ns_per_loop | round)'
250 | SPECIAL_FILTER[avg_lock_overhead_cputime_ns]='\(.avg_lock_overhead_cputime_ns | round)'
251 | SPECIAL_FILTER[lock_overhead_cputime_percent]='\(.lock_overhead_cputime_percent | round)'
252 | SPECIAL_FILTER[total_lock_acquires_per_second]='\(.total_lock_acquires_per_second|round)'
253 | SPECIAL_FILTER[lock_acquires_stddev_over_mean]='\(.lock_acquires_stddev_over_mean*10000|round/10000)'
254 | 
255 | # constructs the header or filter
256 | make_special() {
257 |     local -n pointer="$1"   # name reference to associative array, needs bash 4.3 or later
258 |     local normal_format_pre_eval=$2
259 |     local normal_format
260 |     local key
261 |     local list=
262 |     for key in "${KEY_LIST[@]}"
263 |     do
264 |         if [[ $key =~ ^\# ]]; then
265 |             continue
266 |         fi
267 |         if [ -n "$list" ]; then
268 |             list="$list\t"
269 |         fi
270 | 
271 |         normal_format=$(eval "echo \"$normal_format_pre_eval\"")
272 | 
273 |         if [[ -v pointer[$key] ]]; then
274 |             list+="${pointer[$key]}"
275 |         else
276 |             list+=$normal_format
277 |         fi
278 |     done
279 |     echo "$list"
280 | }
281 | 
282 | HEADER=$(make_special SPECIAL_HEADER '$key')
283 | FILTER=$(make_special SPECIAL_FILTER '\(.${key})')
284 | 
285 | # ----------------------------------
286 | # finally invoke jq for tabulated output using 'column' to pretty print.
287 | (
288 | echo -e "$HEADER"
289 | jq -n -r "$REDUCER | $SELECTOR | $SORTER | .[] | \"$FILTER\" " $FILES
290 | ) | column -t -o " "
291 | 
--------------------------------------------------------------------------------
/benchmarks/lockhammer/tests/cas_lockref.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017, The Linux Foundation. All rights reserved.
3 |  *
4 |  * SPDX-License-Identifier: BSD-3-Clause
5 |  *
6 |  * Redistribution and use in source and binary forms, with or without
7 |  * modification, are permitted provided that the following conditions are
8 |  * met:
9 |  *    * Redistributions of source code must retain the above copyright
10 |  *      notice, this list of conditions and the following disclaimer.
11 |  *    * Redistributions in binary form must reproduce the above
12 |  *      copyright notice, this list of conditions and the following
13 |  *      disclaimer in the documentation and/or other materials provided
14 |  *      with the distribution.
15 |  *    * Neither the name of The Linux Foundation nor the names of its
16 |  *      contributors may be used to endorse or promote products derived
17 |  *      from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
20 |  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
21 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
22 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
23 |  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 |  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28 |  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
29 |  * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  */
31 | 
32 | #include "atomics.h"
33 | #include "cpu_relax.h"
34 | 
35 | static inline unsigned long lock_acquire (uint64_t *lock, unsigned long threadnum) {
36 | 	unsigned long val, old;
37 | 
38 | 	do {
39 | 		old = *(volatile unsigned long *) lock;		/* lower 32 bits: lock flag; upper 32 bits: reference count */
40 | 		val = old + 0x100000000;			/* increment the reference count */
41 | 
42 | 		while ((old & 0xFFFFFFFF) && ((val >> 32) <= 32)) {	/* reread while the lock flag is held and the new count stays within the cap of 32 */
43 | 			old = *(volatile unsigned long *) lock;
44 | 			val = old + 0x100000000;
45 | 		}
46 | 
47 | 		val = cas64(lock, val, old);	/* returns the previous value, which equals old on success */
48 | 		if (val == old) {
49 | 			break;
50 | 		}
51 | 		__cpu_relax();
52 | 	} while (1);
53 | 
54 | 	return val >> 32;	/* the reference count observed just before the increment */
55 | }
56 | 
57 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) {
58 | 	unsigned long val, old;
59 | 
60 | 	do {
61 | 		old = *(volatile unsigned long *) lock;
62 | 		val = old - 0x100000000;	/* decrement the reference count */
63 | 
64 | 		while ((old & 0xFFFFFFFF) && ((val >> 32) > 0)) {
65 | 			old = *(volatile unsigned long *) lock;
66 | 			val = old - 0x100000000;
67 | 		}
68 | 
69 | 		val = cas64(lock, val, old);
70 | 		if (val == old) {
71 | 			return;
72 | 		}
73 | 		__cpu_relax();
74 | 	} while (1);
75 | }
76 | 
77 | /* vim: set tabstop=8 shiftwidth=8 softtabstop=8 noexpandtab: */
78 | 
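The lock_acquire/lock_release pair above emulates the Linux lockref scheme on a
single 64-bit word. A minimal standalone sketch of the acquire side, assuming
that GCC's __sync_val_compare_and_swap stands in for cas64() (both return the
value the word held before the operation); the held-lock spin and the
32-reference cap from the test are omitted for brevity:

#include <stdint.h>
#include <stdio.h>

/* Sketch of the lockref layout used above: the lower 32 bits hold a lock
 * flag, the upper 32 bits hold the reference count. */
static uint64_t lockref_get(uint64_t *lock)
{
	for (;;) {
		uint64_t old = *(volatile uint64_t *) lock;
		uint64_t new = old + 0x100000000ULL;	/* count + 1 */
		/* __sync_val_compare_and_swap returns the prior value of *lock */
		if (__sync_val_compare_and_swap(lock, old, new) == old)
			return new >> 32;	/* the post-increment count */
	}
}

int main(void)
{
	uint64_t lock = 0;
	printf("count after get: %lu\n", (unsigned long) lockref_get(&lock));	/* prints 1 */
	return 0;
}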
--------------------------------------------------------------------------------
/benchmarks/lockhammer/tests/cas_rw_lock.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017, The Linux Foundation. All rights reserved.
3 |  *
4 |  * SPDX-License-Identifier: BSD-3-Clause
5 |  *
6 |  * Redistribution and use in source and binary forms, with or without
7 |  * modification, are permitted provided that the following conditions are
8 |  * met:
9 |  *    * Redistributions of source code must retain the above copyright
10 |  *      notice, this list of conditions and the following disclaimer.
11 |  *    * Redistributions in binary form must reproduce the above
12 |  *      copyright notice, this list of conditions and the following
13 |  *      disclaimer in the documentation and/or other materials provided
14 |  *      with the distribution.
15 |  *    * Neither the name of The Linux Foundation nor the names of its
16 |  *      contributors may be used to endorse or promote products derived
17 |  *      from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
20 |  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
21 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
22 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
23 |  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 |  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 |  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28 |  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
29 |  * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  */
31 | 
32 | #ifdef initialize_lock
33 | #undef initialize_lock
34 | #endif
35 | 
36 | #define initialize_lock(lock, pinorder, threads) cas_rw_lock_init(lock, threads)
37 | #define CAS_RW_INIT_VAL 0x20000000
38 | #define CAS_RW_THRESHOLD 0
39 | 
40 | #include "atomics.h"
41 | 
42 | void cas_rw_lock_init(uint64_t *lock, uint64_t threads) {
43 | 	*lock = CAS_RW_INIT_VAL;
44 | }
45 | 
46 | static inline unsigned long lock_acquire (uint64_t *lock, unsigned long threadnum) {
47 | 	unsigned long val, old;
48 | 
49 | 	old = *(volatile unsigned long *) lock;
50 | 	val = old - 1;	/* taking a read lock decrements the counter */
51 | 
52 | 	while (*((long *) &old) > CAS_RW_THRESHOLD) {	/* a positive count means no writer holds the lock */
53 | 		old = *(volatile unsigned long *) lock;
54 | 		val = old - 1;
55 | 		val = cas64_acquire(lock, val, old);	/* returns the previous value; success when it equals old */
56 | 
57 | 		if (val == old) {
58 | 			return CAS_RW_INIT_VAL - val;	/* depth: how far the counter sits below its initial value */
59 | 		}
60 | 	}
61 | 
62 | 	/* exclusive lock is held (should never actually happen in this test) */
63 | 	return 0;
64 | }
65 | 
66 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) {
67 | 	fetchadd64_release(lock, 1);	/* a reader leaves: increment the counter back */
68 | }
69 | 
70 | /* vim: set tabstop=8 shiftwidth=8 softtabstop=8 noexpandtab: */
71 | 
--------------------------------------------------------------------------------
/benchmarks/lockhammer/tests/empty.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017, The Linux Foundation. All rights reserved.
3 |  *
4 |  * SPDX-License-Identifier: BSD-3-Clause
5 |  *
6 |  * Redistribution and use in source and binary forms, with or without
7 |  * modification, are permitted provided that the following conditions are
8 |  * met:
9 |  *    * Redistributions of source code must retain the above copyright
10 |  *      notice, this list of conditions and the following disclaimer.
11 | * * Redistributions in binary form must reproduce the above 12 | * copyright notice, this list of conditions and the following 13 | * disclaimer in the documentation and/or other materials provided 14 | * with the distribution. 15 | * * Neither the name of The Linux Foundation nor the names of its 16 | * contributors may be used to endorse or promote products derived 17 | * from this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 20 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 21 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 28 | * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 29 | * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #include "atomics.h" 33 | 34 | static inline unsigned long lock_acquire (uint64_t *lock, unsigned long threadnum) { 35 | return 0; 36 | } 37 | 38 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) { 39 | return; 40 | } 41 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/tests/incdec_refcount.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017, The Linux Foundation. All rights reserved. 3 | * 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are 8 | * met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above 12 | * copyright notice, this list of conditions and the following 13 | * disclaimer in the documentation and/or other materials provided 14 | * with the distribution. 15 | * * Neither the name of The Linux Foundation nor the names of its 16 | * contributors may be used to endorse or promote products derived 17 | * from this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 20 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 21 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 28 | * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 29 | * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | */ 31 | 32 | #include "atomics.h" 33 | 34 | static inline unsigned long lock_acquire (uint64_t *lock, unsigned long threadnum) { 35 | return fetchadd64(lock, 1ul); 36 | } 37 | 38 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) { 39 | fetchsub64(lock, 1ul); 40 | } 41 | 42 | /* vim: set tabstop=8 shiftwidth=8 softtabstop=8 noexpandtab: */ 43 | -------------------------------------------------------------------------------- /benchmarks/lockhammer/tests/swap_mutex.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017, The Linux Foundation. All rights reserved. 3 | * 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are 8 | * met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above 12 | * copyright notice, this list of conditions and the following 13 | * disclaimer in the documentation and/or other materials provided 14 | * with the distribution. 15 | * * Neither the name of The Linux Foundation nor the names of its 16 | * contributors may be used to endorse or promote products derived 17 | * from this software without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 20 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 21 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 28 | * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 29 | * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #include "atomics.h" 33 | 34 | static inline unsigned long lock_acquire (uint64_t *lock, unsigned long threadnum) { 35 | unsigned long val = 1; 36 | 37 | while (val) { 38 | val = swap64 (lock, 1); // uses acquire-release semantics 39 | } 40 | 41 | return 0; 42 | } 43 | 44 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) { 45 | __atomic_store_n(lock, 0, __ATOMIC_RELEASE); 46 | } 47 | 48 | /* vim: set tabstop=8 shiftwidth=8 softtabstop=8 noexpandtab: */ 49 | -------------------------------------------------------------------------------- /contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing to Synchronization-Benchmarks 2 | ========================================== 3 | 4 | Getting Started 5 | --------------- 6 | 7 | - Make sure you have a `GitHub account`_. 8 | - Create an `issue`_ for your work if one does not already exist. This gives 9 | everyone visibility of whether others are working on something similar. 10 | 11 | - If you intend to include Third Party IP in your contribution, please 12 | raise a separate `issue`_ for this and ensure that the changes that 13 | include Third Party IP are made on a separate topic branch. 
14 | 15 | - `Fork`_ `synchronization-benchmarks`_ on GitHub. 16 | - Clone the fork to your own machine. 17 | - Create a local topic branch based on the `synchronization-benchmarks`_ ``master`` 18 | branch. 19 | - Make sure you have placed the hooks/commit-msg hook into your .git/hooks directory 20 | to append change IDs to your commits. 21 | 22 | Making Changes 23 | -------------- 24 | 25 | - Make commits of logical units. See these general `Git guidelines`_ for 26 | contributing to a project. 27 | - Keep the commits on topic. If you need to fix another bug or make another 28 | enhancement, please create a separate `issue`_ and address it on a separate 29 | topic branch. 30 | - Avoid long commit series. If you do have a long series, consider whether 31 | some commits should be squashed together or addressed in a separate topic. 32 | - Make sure your commit messages are in the proper format. If a commit fixes 33 | a GitHub `issue`_, include a reference; this ensures the `issue`_ is 34 | `automatically closed`_ when merged into the `synchronization-benchmarks`_ ``master`` 35 | branch. 36 | - Where appropriate, please update the documentation and license of files. 37 | 38 | - Ensure that each changed file has the correct copyright and license 39 | information. Files that entirely consist of contributions to this 40 | project should have the copyright notice and BSD-3-Clause SPDX license 41 | identifier as shown in `license.rst`_. Files that contain 42 | changes to imported Third Party IP should contain a notice as follows, 43 | with the original copyright and license text retained: 44 | 45 | :: 46 | 47 | Portions copyright (c) [XXXX-]YYYY, ARM Limited and Contributors. All rights reserved. 48 | 49 | where XXXX is the year of first contribution (if different to YYYY) and 50 | YYYY is the year of most recent contribution. 51 | - For topics with multiple commits, you should make all documentation 52 | changes (and nothing else) in the last commit of the series. Otherwise, 53 | include the documentation changes within the single commit. 54 | 55 | Submitting Changes 56 | ------------------ 57 | 58 | - We prefer that each commit in the series has at least one ``Signed-off-by:`` 59 | line, using your real name and email address, but it is not required. 60 | - Push your local changes to your fork of the repository. 61 | - Submit a `pull request`_ to the `synchronization-benchmarks`_ ``integration`` branch. 62 | 63 | - The changes in the `pull request`_ will then undergo further review. 64 | Any review comments will be made as comments on the `pull request`_. 65 | This may require you to do some rework. 66 | 67 | - When the changes are accepted, the maintainer of the repository will integrate them. 68 | 69 | - Typically, the Maintainers will merge the `pull request`_ into the 70 | ``integration`` branch within the GitHub UI, creating a merge commit. 71 | - Please avoid creating merge commits in the `pull request`_ itself. 72 | - If the `pull request`_ is not based on a recent commit, the Maintainers 73 | may rebase it onto the ``master`` branch first, or ask you to do this. 74 | - If the `pull request`_ cannot be automatically merged, the Maintainers 75 | will ask you to rebase it onto the ``master`` branch. 76 | - After final integration testing, the Maintainers will push your merge 77 | commit to the ``master`` branch. 
If a problem is found during integration, 78 | the merge commit will be removed from the ``integration`` branch and the 79 | Maintainers will ask you to create a new pull request to resolve the 80 | problem. 81 | - Please do not delete your topic branch until it is safely merged into 82 | the ``master`` branch. 83 | 84 | -------------- 85 | 86 | *Copyright (c) 2018, ARM Limited and Contributors. All rights reserved.* 87 | 88 | .. _GitHub account: https://github.com/signup/free 89 | .. _issue: https://github.com/ARM-software/synchronization-benchmarks/issues 90 | .. _Fork: https://help.github.com/articles/fork-a-repo 91 | .. _synchronization-benchmarks: https://github.com/ARM-software/synchronization-benchmarks 92 | .. _Git guidelines: http://git-scm.com/book/ch5-2.html 93 | .. _automatically closed: https://help.github.com/articles/closing-issues-via-commit-messages 94 | .. _license.rst: ./license.rst 95 | .. _pull request: https://help.github.com/articles/using-pull-requests 96 | -------------------------------------------------------------------------------- /ext/linux/hybrid_spinlock.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 ARM Ltd. 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License version 2 as 6 | * published by the Free Software Foundation. 7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with this program. If not, see . 15 | */ 16 | 17 | #ifdef initialize_lock 18 | #undef initialize_lock 19 | #endif 20 | 21 | #define initialize_lock(lock, pinorder, threads) mcs_init_locks(lock, threads) 22 | 23 | #include "atomics.h" 24 | #include "lk_atomics.h" 25 | 26 | #define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ 27 | << _Q_ ## type ## _OFFSET) 28 | 29 | #define _Q_TAIL_IDX_OFFSET 0 30 | #define _Q_TAIL_IDX_BITS 2 31 | #define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) 32 | 33 | #define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) 34 | #define _Q_TAIL_CPU_BITS (16 - _Q_TAIL_CPU_OFFSET) 35 | #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) 36 | #define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET 37 | 38 | #define _Q_TAIL_MASK (_Q_TAIL_CPU_MASK | _Q_TAIL_IDX_MASK) 39 | 40 | #define _Q_THRESHOLD 4 41 | 42 | struct mcs_spinlock { 43 | struct mcs_spinlock *next; 44 | int locked; 45 | int count; 46 | }; 47 | 48 | struct mcs_spinlock *mcs_pool; 49 | 50 | void mcs_init_locks (uint64_t *lock, unsigned long cores) 51 | { 52 | size_t n = 4 * cores * sizeof(struct mcs_spinlock); 53 | if (mcs_pool) { free(mcs_pool); } 54 | mcs_pool = (struct mcs_spinlock *) malloc(n); 55 | if (! 
mcs_pool) { fprintf(stderr, "malloc failed in " __FILE__ " %s\n", __func__); exit(-1); } 56 | memset(mcs_pool, 0, n); 57 | } 58 | 59 | static inline unsigned ticket_depth (unsigned ticketval) 60 | { 61 | return (((ticketval & 0xff000000) >> 24) - ((ticketval & 0x00ff0000) >> 16)) & 0xff; 62 | } 63 | 64 | static inline __attribute((pure)) u32 encode_tail(int cpu, int idx) 65 | { 66 | u32 tail; 67 | 68 | #ifdef CONFIG_DEBUG_SPINLOCK 69 | BUG_ON(idx > 3); 70 | #endif 71 | tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; 72 | tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ 73 | 74 | return tail; 75 | } 76 | 77 | static inline __attribute((pure)) struct mcs_spinlock *decode_tail(u32 tail) 78 | { 79 | int cpu = ((tail & _Q_TAIL_CPU_MASK) >> _Q_TAIL_CPU_OFFSET) - 1; 80 | int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 81 | 82 | return &mcs_pool[4 * cpu + idx]; 83 | } 84 | 85 | static __always_inline u32 xchg_tail(uint64_t *lock, u32 tail) 86 | { 87 | /* 88 | * Use release semantics to make sure that the MCS node is properly 89 | * initialized before changing the tail code. 90 | */ 91 | return (u32)xchg_release16((uint16_t *) lock, 92 | tail & _Q_TAIL_MASK); 93 | } 94 | 95 | unsigned long hybrid_spinlock_slowpath(uint64_t *lock, unsigned long threadnum) 96 | { 97 | unsigned long depth = 0; 98 | struct mcs_spinlock *prev, *next, *node; 99 | 100 | u32 /* new, */ old, tail, val, ticketval; 101 | 102 | int idx; 103 | 104 | node = &mcs_pool[4 * threadnum]; 105 | idx = node->count++; 106 | 107 | tail = encode_tail(threadnum, idx); 108 | 109 | node += idx; 110 | node->locked = 0; 111 | node->next = NULL; 112 | 113 | old = xchg_tail(lock, tail); 114 | next = NULL; 115 | 116 | if (old & _Q_TAIL_MASK) { 117 | prev = decode_tail(old); 118 | smp_read_barrier_depends(); 119 | 120 | WRITE_ONCE(prev->next, node); 121 | 122 | arch_mcs_spin_lock_contended(&node->locked); 123 | 124 | next = READ_ONCE(node->next); 125 | if (next) 126 | prefetchw(next); 127 | } 128 | 129 | /* do ticket spin */ 130 | #if defined(__aarch64__) 131 | unsigned /* tmp, */ tmp2, tmp3; 132 | asm volatile ( 133 | "5: ldaxr %w[ticket], %[lock]\n" 134 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 135 | " stxr %w[tmp3], %w[tmp2], %[lock]\n" 136 | " cbnz %w[tmp3], 5b\n" 137 | : [ticket] "=&r" (ticketval), [tmp2] "=&r" (tmp2), 138 | [tmp3] "=&r" (tmp3), [lock] "+Q" (*lock) 139 | : [ticket_inc] "r" (0x01000000) 140 | : ); 141 | // printf("%d enqueued on %d behind %d (serving %d)\n", ticketval >> 24, tail >> 2, old >> 2, (ticketval >> 16) & 0xFF); 142 | 143 | depth = ticket_depth(ticketval); 144 | 145 | asm volatile ( 146 | " sevl\n" 147 | "7: wfe\n" 148 | " ldaxrb %w[tmp3], %[serving]\n" 149 | " eor %w[tmp2], %w[tmp], %w[tmp3]\n" 150 | " cbnz %w[tmp2], 7b\n" 151 | : [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), 152 | [serving] "+Q" (*(((unsigned char *) lock) + 2)) 153 | : [tmp] "r" (ticketval >> 24) 154 | : ); 155 | #else 156 | #endif 157 | 158 | val = READ_ONCE(*lock); 159 | 160 | /* If we're the list tail then destroy the queue */ 161 | while ((val & _Q_TAIL_MASK) == tail) { 162 | old = atomic_cmpxchg_relaxed32((u32 *) lock, val, val & ~_Q_TAIL_MASK); 163 | 164 | if (old == val) 165 | goto release; 166 | 167 | val = old; 168 | } 169 | 170 | if (!next) { 171 | while (!(next = READ_ONCE(node->next))) 172 | cpu_relax(); 173 | } 174 | 175 | arch_mcs_spin_unlock_contended(&next->locked); 176 | 177 | release: 178 | 179 | mcs_pool[4 * threadnum].count--; 180 | 181 | return depth; 182 | } 183 | 184 | unsigned long __attribute__((noinline)) 
lock_acquire (uint64_t *lock, unsigned long threadnum) { 185 | unsigned long depth = 0; 186 | 187 | u32 ticketval; 188 | 189 | unsigned enqueue; 190 | 191 | #if defined(__aarch64__) 192 | unsigned /* tmp, */ tmp2, tmp3; 193 | asm volatile ( 194 | "1: ldaxr %w[ticket], %[lock]\n" 195 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 196 | " rev16 %w[enqueue], %w[ticket]\n" 197 | " eor %w[enqueue], %w[enqueue], %w[ticket]\n" 198 | " cbnz %w[enqueue], 2f\n" 199 | " stxr %w[enqueue], %w[tmp2], %[lock]\n" 200 | " cbnz %w[enqueue], 1b\n" 201 | "2:\n" 202 | : [ticket] "=&r" (ticketval), [tmp2] "=&r" (tmp2), 203 | [enqueue] "=&r" (enqueue), [lock] "+Q" (*lock) 204 | : [ticket_inc] "r" (0x01000000), [qthresh] "r" (_Q_THRESHOLD << 24) 205 | : ); 206 | if (!enqueue) 207 | return 0; /* Ticket acquired immediately */ 208 | 209 | #else 210 | /* TODO: Generic C implementation of fastpath */ 211 | val = READ_ONCE(*lock); 212 | 213 | enqueue = val & _Q_TAIL_MASK; 214 | 215 | if (!enqueue) 216 | { 217 | } 218 | #endif 219 | 220 | #if defined (__aarch64__) 221 | asm volatile ( 222 | " mov %w[enqueue], #1\n" 223 | " sub %w[tmp3], %w[ticket], %w[qthresh]\n" 224 | " rev16 %w[tmp2], %w[tmp3]\n" 225 | " eor %w[tmp3], %w[tmp2], %w[tmp3]\n" 226 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 227 | " cbz %w[tmp3], 4f\n" 228 | " and %w[tmp3], %w[ticket], %w[qtailmask]\n" 229 | " cbnz %w[tmp3], 4f\n" 230 | "3: ldaxr %w[ticket], %[lock]\n" 231 | " sub %w[tmp3], %w[ticket], %w[qthresh]\n" 232 | " rev16 %w[tmp2], %w[tmp3]\n" 233 | " eor %w[tmp3], %w[tmp2], %w[tmp3]\n" 234 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 235 | " cbz %w[tmp3], 4f\n" 236 | " and %w[tmp3], %w[ticket], %w[qtailmask]\n" 237 | " cbnz %w[tmp3], 4f\n" 238 | " stxr %w[enqueue], %w[tmp2], %[lock]\n" 239 | " cbnz %w[enqueue], 3b\n" 240 | "4:\n" 241 | : [ticket] "+&r" (ticketval), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), 242 | [enqueue] "=&r" (enqueue), [lock] "+Q" (*lock) 243 | : [ticket_inc] "r" (0x01000000), [qthresh] "r" (_Q_THRESHOLD << 24), 244 | [qtailmask] "i" (_Q_TAIL_MASK) 245 | : ); 246 | #else 247 | #endif 248 | 249 | if (enqueue) 250 | { 251 | depth = hybrid_spinlock_slowpath(lock, threadnum); 252 | } 253 | else 254 | { 255 | depth = ticket_depth(ticketval); 256 | #if defined(__aarch64__) 257 | asm volatile ( 258 | " sevl\n" 259 | "9: wfe\n" 260 | " ldaxrb %w[tmp3], %[serving]\n" 261 | " eor %w[tmp2], %w[tmp], %w[tmp3]\n" 262 | " cbnz %w[tmp2], 9b\n" 263 | : [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), 264 | [serving] "+Q" (*(((unsigned char *) lock) + 2)) 265 | : [tmp] "r" (ticketval >> 24) 266 | : ); 267 | #else 268 | #endif 269 | } 270 | 271 | return depth; 272 | } 273 | 274 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) { 275 | #if defined(__x86_64__) 276 | asm volatile ( 277 | " addw $0x2,%[lock]\n" 278 | : [lock] "+m" (*lock) 279 | : 280 | : "cc" ); 281 | #elif defined(__aarch64__) 282 | unsigned long tmp; 283 | asm volatile ( 284 | " ldrb %w[tmp], %[lock]\n" 285 | " add %w[tmp], %w[tmp], #0x1\n" 286 | " stlrb %w[tmp], %[lock]\n" 287 | : [tmp] "=&r" (tmp), [lock] "+Q" (*(((unsigned char *) lock) + 2)) 288 | : 289 | : ); 290 | 291 | #endif 292 | } 293 | 294 | /* vim: set tabstop=8 shiftwidth=8 softtabstop=8 noexpandtab : */ 295 | -------------------------------------------------------------------------------- /ext/linux/hybrid_spinlock_fastdequeue.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 ARM Ltd. 
3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License version 2 as 6 | * published by the Free Software Foundation. 7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 | */ 16 | 17 | #ifdef initialize_lock 18 | #undef initialize_lock 19 | #endif 20 | 21 | #define initialize_lock(lock, pinorder, threads) mcs_init_locks(lock, threads) 22 | 23 | #include "atomics.h" 24 | #include "lk_atomics.h" 25 | 26 | #define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ 27 | << _Q_ ## type ## _OFFSET) 28 | 29 | #define _Q_TAIL_IDX_OFFSET 0 30 | #define _Q_TAIL_IDX_BITS 2 31 | #define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) 32 | 33 | #define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) 34 | #define _Q_TAIL_CPU_BITS (16 - _Q_TAIL_CPU_OFFSET) 35 | #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) 36 | #define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET 37 | 38 | #define _Q_TAIL_MASK (_Q_TAIL_CPU_MASK | _Q_TAIL_IDX_MASK) 39 | 40 | /* Number of ticket waiters required before a queue is established */ 41 | #define _Q_THRESHOLD 6 42 | /* Maximum number of queued waiters allowed to exit queue early */ 43 | #define _Q_DEQUEUE_THRESHOLD 2 44 | 45 | struct mcs_spinlock { 46 | struct mcs_spinlock *next; 47 | int locked; 48 | int count; 49 | }; 50 | 51 | struct mcs_spinlock *mcs_pool; 52 | 53 | void mcs_init_locks (uint64_t *lock, unsigned long cores) 54 | { 55 | size_t n = 4 * cores * sizeof(struct mcs_spinlock); 56 | if (mcs_pool) { free(mcs_pool); } 57 | mcs_pool = (struct mcs_spinlock *) malloc(n); 58 | if (! mcs_pool) { fprintf(stderr, "malloc failed in " __FILE__ " %s\n", __func__); exit(-1); } 59 | memset(mcs_pool, 0, n); 60 | } 61 | 62 | static inline unsigned ticket_depth (unsigned ticketval) 63 | { 64 | return (((ticketval & 0xff000000) >> 24) - ((ticketval & 0x00ff0000) >> 16)) & 0xff; 65 | } 66 | 67 | static inline __attribute((pure)) u32 encode_tail(int cpu, int idx) 68 | { 69 | u32 tail; 70 | 71 | #ifdef CONFIG_DEBUG_SPINLOCK 72 | BUG_ON(idx > 3); 73 | #endif 74 | tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; 75 | tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ 76 | 77 | return tail; 78 | } 79 | 80 | static inline __attribute((pure)) struct mcs_spinlock *decode_tail(u32 tail) 81 | { 82 | int cpu = ((tail & _Q_TAIL_CPU_MASK) >> _Q_TAIL_CPU_OFFSET) - 1; 83 | int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 84 | 85 | return &mcs_pool[4 * cpu + idx]; 86 | } 87 | 88 | static __always_inline u32 xchg_tail(uint64_t *lock, u32 tail) 89 | { 90 | /* 91 | * Use release semantics to make sure that the MCS node is properly 92 | * initialized before changing the tail code.
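 * Only the low 16 bits (the queue tail halfword) are swapped here; the now-serving byte (bits 16-23) and the next-ticket byte (bits 24-31, see ticket_depth() above) stay untouched, so queue updates cannot corrupt the ticket state.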
93 | */ 94 | return (u32)xchg_release16((uint16_t *) lock, 95 | tail & _Q_TAIL_MASK); 96 | } 97 | 98 | unsigned long hybrid_spinlock_slowpath(uint64_t *lock, unsigned long threadnum) 99 | { 100 | unsigned long depth = 0; 101 | struct mcs_spinlock *prev, *next, *node; 102 | 103 | u32 /* new, */ old, tail, val, ticketval; 104 | 105 | int idx; 106 | 107 | node = &mcs_pool[4 * threadnum]; 108 | idx = node->count++; 109 | 110 | tail = encode_tail(threadnum, idx); 111 | 112 | node += idx; 113 | node->locked = 0; 114 | node->next = NULL; 115 | 116 | old = xchg_tail(lock, tail); 117 | next = NULL; 118 | 119 | if (old & _Q_TAIL_MASK) { 120 | prev = decode_tail(old); 121 | smp_read_barrier_depends(); 122 | 123 | WRITE_ONCE(prev->next, node); 124 | 125 | arch_mcs_spin_lock_contended(&node->locked); 126 | 127 | next = READ_ONCE(node->next); 128 | if (next) 129 | prefetchw(next); 130 | } 131 | 132 | /* do ticket spin */ 133 | #if defined(__aarch64__) 134 | unsigned /* tmp, */ tmp2, tmp3; 135 | #if _Q_DEQUEUE_THRESHOLD 136 | asm volatile ( 137 | " sevl\n" 138 | "44: wfe\n" 139 | "5: ldaxr %w[ticket], %[lock]\n" 140 | " sub %w[tmp3], %w[ticket], %w[ticket], lsl #8\n" 141 | " and %w[tmp3], %w[tmp3], #0xFF000000\n" 142 | " cmp %w[tmp3], %w[qthresh]\n" 143 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 144 | " bgt 44b\n" 145 | " stxr %w[tmp3], %w[tmp2], %[lock]\n" 146 | " cbnz %w[tmp3], 5b\n" 147 | : [ticket] "=&r" (ticketval), [tmp2] "=&r" (tmp2), 148 | [tmp3] "=&r" (tmp3), [lock] "+Q" (*lock) 149 | : [ticket_inc] "r" (0x01000000), [qthresh] "r" ((_Q_DEQUEUE_THRESHOLD) << 24) 150 | : "cc" ); 151 | #else 152 | asm volatile ( 153 | "5: ldaxr %w[ticket], %[lock]\n" 154 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 155 | " stxr %w[tmp3], %w[tmp2], %[lock]\n" 156 | " cbnz %w[tmp3], 5b\n" 157 | : [ticket] "=&r" (ticketval), [tmp2] "=&r" (tmp2), 158 | [tmp3] "=&r" (tmp3), [lock] "+Q" (*lock) 159 | : [ticket_inc] "r" (0x01000000), [qthresh] "r" (_Q_THRESHOLD << 24) 160 | : ); 161 | asm volatile ( 162 | " sevl\n" 163 | "7: wfe\n" 164 | " ldaxrb %w[tmp3], %[serving]\n" 165 | " eor %w[tmp2], %w[tmp], %w[tmp3]\n" 166 | " cbnz %w[tmp2], 7b\n" 167 | : [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), 168 | [serving] "+Q" (*(((unsigned char *) lock) + 2)) 169 | : [tmp] "r" (ticketval >> 24) 170 | : ); 171 | #endif 172 | 173 | depth = ticket_depth(ticketval); 174 | val = READ_ONCE(*lock); 175 | 176 | /* If we're the list tail then destroy the queue */ 177 | while ((val & _Q_TAIL_MASK) == tail) { 178 | old = atomic_cmpxchg_relaxed32((u32 *) lock, val, val & ~_Q_TAIL_MASK); 179 | 180 | if (old == val) 181 | goto release; 182 | 183 | val = old; 184 | } 185 | 186 | if (!next) { 187 | while (!(next = READ_ONCE(node->next))) 188 | cpu_relax(); 189 | } 190 | 191 | arch_mcs_spin_unlock_contended(&next->locked); 192 | 193 | release: 194 | 195 | mcs_pool[4 * threadnum].count--; 196 | 197 | #if _Q_DEQUEUE_THRESHOLD 198 | asm volatile ( 199 | " sevl\n" 200 | "7: wfe\n" 201 | " ldaxrb %w[tmp3], %[serving]\n" 202 | " eor %w[tmp2], %w[tmp], %w[tmp3]\n" 203 | " cbnz %w[tmp2], 7b\n" 204 | : [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), 205 | [serving] "+Q" (*(((unsigned char *) lock) + 2)) 206 | : [tmp] "r" (ticketval >> 24) 207 | : ); 208 | #endif 209 | 210 | #else 211 | #endif 212 | 213 | return depth; 214 | 215 | } 216 | 217 | unsigned long __attribute__((noinline)) lock_acquire (uint64_t *lock, unsigned long threadnum) { 218 | unsigned long depth = 0; 219 | 220 | u32 ticketval; 221 | 222 | unsigned enqueue; 223 | 224 | #if defined(__aarch64__) 
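	/* Fastpath sketch: a single LDAXR/STXR pass below tries to take a ticket and acquire in one step. The rev16/eor pair compares each byte of the lock word with its halfword neighbour, so 'enqueue' ends up zero only when next-ticket == now-serving and the two tail bytes match (in practice, an empty queue); in that case the store publishes the incremented next-ticket byte (+0x01000000) and the lock is held with no waiting. */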
225 | unsigned /* tmp, */ tmp2, tmp3; 226 | asm volatile ( 227 | "1: ldaxr %w[ticket], %[lock]\n" 228 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 229 | " rev16 %w[enqueue], %w[ticket]\n" 230 | " eor %w[enqueue], %w[enqueue], %w[ticket]\n" 231 | " cbnz %w[enqueue], 2f\n" 232 | " stxr %w[enqueue], %w[tmp2], %[lock]\n" 233 | " cbnz %w[enqueue], 1b\n" 234 | "2:\n" 235 | : [ticket] "=&r" (ticketval), [tmp2] "=&r" (tmp2), 236 | [enqueue] "=&r" (enqueue), [lock] "+Q" (*lock) 237 | : [ticket_inc] "r" (0x01000000), [qthresh] "r" (_Q_THRESHOLD << 24) 238 | : ); 239 | if (!enqueue) 240 | return 0; /* Ticket acquired immediately */ 241 | 242 | #else 243 | /* TODO: Generic C implementation of fastpath */ 244 | val = READ_ONCE(*lock); 245 | 246 | enqueue = val & _Q_TAIL_MASK; 247 | 248 | if (!enqueue) 249 | { 250 | } 251 | #endif 252 | 253 | #if defined (__aarch64__) 254 | asm volatile ( 255 | " mov %w[enqueue], #1\n" 256 | " sub %w[tmp3], %w[ticket], %w[qthresh]\n" 257 | " rev16 %w[tmp2], %w[tmp3]\n" 258 | " eor %w[tmp3], %w[tmp2], %w[tmp3]\n" 259 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 260 | " cbz %w[tmp3], 4f\n" 261 | " and %w[tmp3], %w[ticket], %w[qtailmask]\n" 262 | " cbnz %w[tmp3], 4f\n" 263 | "3: ldaxr %w[ticket], %[lock]\n" 264 | " sub %w[tmp3], %w[ticket], %w[qthresh]\n" 265 | " rev16 %w[tmp2], %w[tmp3]\n" 266 | " eor %w[tmp3], %w[tmp2], %w[tmp3]\n" 267 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 268 | " cbz %w[tmp3], 4f\n" 269 | " and %w[tmp3], %w[ticket], %w[qtailmask]\n" 270 | " cbnz %w[tmp3], 4f\n" 271 | " stxr %w[enqueue], %w[tmp2], %[lock]\n" 272 | " cbnz %w[enqueue], 3b\n" 273 | "4:\n" 274 | : [ticket] "+&r" (ticketval), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), 275 | [enqueue] "=&r" (enqueue), [lock] "+Q" (*lock) 276 | : [ticket_inc] "r" (0x01000000), [qthresh] "r" (_Q_THRESHOLD << 24), 277 | [qtailmask] "i" (_Q_TAIL_MASK) 278 | : ); 279 | #else 280 | #endif 281 | 282 | if (enqueue) 283 | { 284 | depth = hybrid_spinlock_slowpath(lock, threadnum); 285 | } 286 | else 287 | { 288 | depth = ticket_depth(ticketval); 289 | #if defined(__aarch64__) 290 | asm volatile ( 291 | " sevl\n" 292 | "9: wfe\n" 293 | " ldaxrb %w[tmp3], %[serving]\n" 294 | " eor %w[tmp2], %w[tmp], %w[tmp3]\n" 295 | " cbnz %w[tmp2], 9b\n" 296 | : [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), 297 | [serving] "+Q" (*(((unsigned char *) lock) + 2)) 298 | : [tmp] "r" (ticketval >> 24) 299 | : ); 300 | #else 301 | #endif 302 | } 303 | 304 | return depth; 305 | } 306 | 307 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) { 308 | #if defined(__x86_64__) 309 | asm volatile ( 310 | " addw $0x2,%[lock]\n" 311 | : [lock] "+m" (*lock) 312 | : 313 | : "cc" ); 314 | #elif defined(__aarch64__) 315 | unsigned long tmp; 316 | asm volatile ( 317 | " ldrb %w[tmp], %[lock]\n" 318 | " add %w[tmp], %w[tmp], #0x1\n" 319 | " stlrb %w[tmp], %[lock]\n" 320 | : [tmp] "=&r" (tmp), [lock] "+Q" (*(((unsigned char *) lock) + 2)) 321 | : 322 | : ); 323 | 324 | #endif 325 | } 326 | 327 | /* vim: set tabstop=8 shiftwidth=8 softtabstop=8 noexpandtab : */ 328 | -------------------------------------------------------------------------------- /ext/linux/hybrid_spinlock_old_fastdequeue.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 ARM Ltd. 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License version 2 as 6 | * published by the Free Software Foundation. 
7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 | */ 16 | 17 | // 18 | // NOTE: This file is currently unused 19 | // 20 | 21 | 22 | #ifdef initialize_lock 23 | #undef initialize_lock 24 | #endif 25 | 26 | #define initialize_lock(lock, threads) mcs_init_locks(lock, threads) 27 | 28 | #include "atomics.h" 29 | #include "lk_atomics.h" 30 | 31 | #define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ 32 | << _Q_ ## type ## _OFFSET) 33 | 34 | #define _Q_TAIL_IDX_OFFSET 0 35 | #define _Q_TAIL_IDX_BITS 2 36 | #define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) 37 | 38 | #define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) 39 | #define _Q_TAIL_CPU_BITS (16 - _Q_TAIL_CPU_OFFSET) 40 | #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) 41 | #define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET 42 | 43 | #define _Q_TAIL_MASK (_Q_TAIL_CPU_MASK | _Q_TAIL_IDX_MASK) 44 | 45 | #define _Q_THRESHOLD 4 46 | 47 | struct mcs_spinlock { 48 | struct mcs_spinlock *next; 49 | int locked; 50 | int count; 51 | }; 52 | 53 | struct mcs_spinlock *mcs_pool; 54 | 55 | void mcs_init_locks (uint64_t *lock, unsigned long cores) 56 | { 57 | size_t n = 4 * cores * sizeof(struct mcs_spinlock); 58 | if (mcs_pool) { free(mcs_pool); } 59 | mcs_pool = (struct mcs_spinlock *) malloc(n); 60 | if (! mcs_pool) { fprintf(stderr, "malloc failed in " __FILE__ " %s\n", __func__); exit(-1); } 61 | memset(mcs_pool, 0, n); 62 | } 63 | 64 | static inline __attribute((pure)) u32 encode_tail(int cpu, int idx) 65 | { 66 | u32 tail; 67 | 68 | #ifdef CONFIG_DEBUG_SPINLOCK 69 | BUG_ON(idx > 3); 70 | #endif 71 | tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; 72 | tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ 73 | 74 | return tail; 75 | } 76 | 77 | static inline __attribute((pure)) struct mcs_spinlock *decode_tail(u32 tail) 78 | { 79 | int cpu = ((tail & _Q_TAIL_CPU_MASK) >> _Q_TAIL_CPU_OFFSET) - 1; 80 | int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 81 | 82 | return &mcs_pool[4 * cpu + idx]; 83 | } 84 | 85 | static __always_inline u32 xchg_tail(uint64_t *lock, u32 tail) 86 | { 87 | /* 88 | * Use release semantics to make sure that the MCS node is properly 89 | * initialized before changing the tail code.
90 | */ 91 | return (u32)xchg_release16((uint16_t *) lock, 92 | tail & _Q_TAIL_MASK); 93 | } 94 | 95 | void hybrid_spinlock_slowpath(uint64_t *lock, unsigned long threadnum) 96 | { 97 | unsigned long depth = 0; 98 | struct mcs_spinlock *prev, *next, *node; 99 | 100 | u32 new, old, tail, val, ticketval; 101 | 102 | int idx; 103 | 104 | node = &mcs_pool[4 * threadnum]; 105 | idx = node->count++; 106 | 107 | tail = encode_tail(threadnum, idx); 108 | 109 | node += idx; 110 | node->locked = 0; 111 | node->next = NULL; 112 | 113 | old = xchg_tail(lock, tail); 114 | next = NULL; 115 | 116 | if (old & _Q_TAIL_MASK) { 117 | prev = decode_tail(old); 118 | smp_read_barrier_depends(); 119 | 120 | WRITE_ONCE(prev->next, node); 121 | 122 | arch_mcs_spin_lock_contended(&node->locked); 123 | 124 | next = READ_ONCE(node->next); 125 | if (next) 126 | prefetchw(next); 127 | } 128 | 129 | /* do ticket spin */ 130 | #if defined(__aarch64__) 131 | unsigned tmp, tmp2, tmp3; 132 | asm volatile ( 133 | " sevl\n" 134 | "44: wfe\n" 135 | "5: ldaxr %w[ticket], %[lock]\n" 136 | " sub %w[tmp3], %w[ticket], %w[qthresh]\n" 137 | " rev16 %w[tmp2], %w[tmp3]\n" 138 | " eor %w[tmp3], %w[tmp2], %w[tmp3]\n" 139 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 140 | " cbz %w[tmp3], 44b\n" 141 | " stxr %w[tmp3], %w[tmp2], %[lock]\n" 142 | " cbnz %w[tmp3], 5b\n" 143 | : [ticket] "=&r" (ticketval), [tmp2] "=&r" (tmp2), 144 | [tmp3] "=&r" (tmp3), [lock] "+Q" (*lock) 145 | : [ticket_inc] "r" (0x01000000), [qthresh] "r" (_Q_THRESHOLD << 24) 146 | : ); 147 | // printf("%d enqueued on %d behind %d (serving %d)\n", ticketval >> 24, tail >> 2, old >> 2, (ticketval >> 16) & 0xFF); 148 | val = READ_ONCE(*lock); 149 | 150 | /* If we're the list tail then destroy the queue */ 151 | while ((val & _Q_TAIL_MASK) == tail) { 152 | old = atomic_cmpxchg_relaxed32((u32 *) lock, val, val & ~_Q_TAIL_MASK); 153 | 154 | if (old == val) 155 | goto release; 156 | 157 | val = old; 158 | } 159 | 160 | if (!next) { 161 | while (!(next = READ_ONCE(node->next))) 162 | cpu_relax(); 163 | } 164 | 165 | arch_mcs_spin_unlock_contended(&next->locked); 166 | 167 | release: 168 | 169 | mcs_pool[4 * threadnum].count--; 170 | asm volatile ( 171 | " sevl\n" 172 | "7: wfe\n" 173 | " ldaxrb %w[tmp3], %[serving]\n" 174 | " eor %w[tmp2], %w[tmp], %w[tmp3]\n" 175 | " cbnz %w[tmp2], 7b\n" 176 | : [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), 177 | [serving] "+Q" (*(((unsigned char *) lock) + 2)) 178 | : [tmp] "r" (ticketval >> 24) 179 | : ); 180 | #else 181 | #endif 182 | 183 | } 184 | 185 | unsigned long __attribute__((noinline)) lock_acquire (uint64_t *lock, unsigned long threadnum) { 186 | unsigned long depth = 0; 187 | 188 | u32 ticketval; 189 | 190 | unsigned enqueue; 191 | 192 | #if defined(__aarch64__) 193 | unsigned tmp, tmp2, tmp3; 194 | asm volatile ( 195 | "1: ldaxr %w[ticket], %[lock]\n" 196 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 197 | " rev16 %w[enqueue], %w[ticket]\n" 198 | " eor %w[enqueue], %w[enqueue], %w[ticket]\n" 199 | " cbnz %w[enqueue], 2f\n" 200 | " stxr %w[enqueue], %w[tmp2], %[lock]\n" 201 | " cbnz %w[enqueue], 1b\n" 202 | "2:\n" 203 | : [ticket] "=&r" (ticketval), [tmp2] "=&r" (tmp2), 204 | [enqueue] "=&r" (enqueue), [lock] "+Q" (*lock) 205 | : [ticket_inc] "r" (0x01000000), [qthresh] "r" (_Q_THRESHOLD << 24) 206 | : ); 207 | if (!enqueue) 208 | return 0; /* Ticket acquired immediately */ 209 | 210 | depth = ((ticketval >> 24) - (ticketval >> 16)) & 0xFF; 211 | #else 212 | /* TODO: Generic C implementation of fastpath */ 213 | val = 
READ_ONCE(*lock); 214 | 215 | enqueue = val & _Q_TAIL_MASK; 216 | 217 | if (!(val & _Q_TAIL_MASK)) 218 | { 219 | } 220 | #endif 221 | 222 | #if defined (__aarch64__) 223 | asm volatile ( 224 | " mov %w[enqueue], #1\n" 225 | " sub %w[tmp3], %w[ticket], %w[qthresh]\n" 226 | " rev16 %w[tmp2], %w[tmp3]\n" 227 | " eor %w[tmp3], %w[tmp2], %w[tmp3]\n" 228 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 229 | " cbz %w[tmp3], 4f\n" 230 | " and %w[tmp3], %w[ticket], %w[qtailmask]\n" 231 | " cbnz %w[tmp3], 4f\n" 232 | "3: ldaxr %w[ticket], %[lock]\n" 233 | " sub %w[tmp3], %w[ticket], %w[qthresh]\n" 234 | " rev16 %w[tmp2], %w[tmp3]\n" 235 | " eor %w[tmp3], %w[tmp2], %w[tmp3]\n" 236 | " add %w[tmp2], %w[ticket], %w[ticket_inc]\n" 237 | " cbz %w[tmp3], 4f\n" 238 | " and %w[tmp3], %w[ticket], %w[qtailmask]\n" 239 | " cbnz %w[tmp3], 4f\n" 240 | " stxr %w[enqueue], %w[tmp2], %[lock]\n" 241 | " cbnz %w[enqueue], 3b\n" 242 | "4:\n" 243 | : [ticket] "+&r" (ticketval), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), 244 | [enqueue] "=&r" (enqueue), [lock] "+Q" (*lock) 245 | : [ticket_inc] "r" (0x01000000), [qthresh] "r" (_Q_THRESHOLD << 24), 246 | [qtailmask] "i" (_Q_TAIL_MASK) 247 | : ); 248 | #else 249 | #endif 250 | 251 | if (enqueue) 252 | { 253 | hybrid_spinlock_slowpath(lock, threadnum); 254 | } 255 | else 256 | { 257 | depth = 0; 258 | #if defined(__aarch64__) 259 | asm volatile ( 260 | " sevl\n" 261 | "9: wfe\n" 262 | " ldaxrb %w[tmp3], %[serving]\n" 263 | " eor %w[tmp2], %w[tmp], %w[tmp3]\n" 264 | " cbnz %w[tmp2], 9b\n" 265 | : [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), 266 | [serving] "+Q" (*(((unsigned char *) lock) + 2)) 267 | : [tmp] "r" (ticketval >> 24) 268 | : ); 269 | #else 270 | #endif 271 | } 272 | 273 | return depth; 274 | } 275 | 276 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) { 277 | #if defined(__x86_64__) 278 | asm volatile ( 279 | " addw $0x2,%[lock]\n" 280 | : [lock] "+m" (*lock) 281 | : 282 | : "cc" ); 283 | #elif defined(__aarch64__) 284 | unsigned long tmp; 285 | asm volatile ( 286 | " ldrb %w[tmp], %[lock]\n" 287 | " add %w[tmp], %w[tmp], #0x1\n" 288 | " stlrb %w[tmp], %[lock]\n" 289 | : [tmp] "=&r" (tmp), [lock] "+Q" (*(((unsigned char *) lock) + 2)) 290 | : 291 | : ); 292 | 293 | #endif 294 | } 295 | 296 | /* vim: set tabstop=8 shiftwidth=8 softtabstop=8 noexpandtab : */ 297 | -------------------------------------------------------------------------------- /ext/linux/include/lk_barrier.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 */ 2 | 3 | /* Based on Linux kernel 4.16.10 4 | * arch/arm64/include/asm/barrier.h 5 | * arch/x86/include/asm/barrier.h 6 | * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?h=v4.16.10&id=b3fdf8284efbc5020dfbd0a28150637189076115 7 | */ 8 | 9 | #ifndef __ASM_BARRIER_H 10 | #define __ASM_BARRIER_H 11 | 12 | #include "lk_cmpxchg.h" 13 | 14 | #if defined(__x86_64__) 15 | 16 | #define mb() asm volatile("mfence":::"memory") 17 | #define rmb() asm volatile("lfence":::"memory") 18 | #define wmb() asm volatile("sfence" ::: "memory") 19 | #define dma_rmb() barrier() 20 | #define dma_wmb() barrier() 21 | #define smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc") 22 | #define smp_rmb() dma_rmb() 23 | #define smp_wmb() barrier() 24 | #define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) 25 | 26 | 27 | /* Atomic operations are already serializing on x86 */ 28 | #define __smp_mb__before_atomic()
barrier() 29 | #define __smp_mb__after_atomic() barrier() 30 | 31 | 32 | #elif defined(__aarch64__) 33 | 34 | #define isb() asm volatile("isb" : : : "memory") 35 | #define dmb(opt) asm volatile("dmb " #opt : : : "memory") 36 | #define dsb(opt) asm volatile("dsb " #opt : : : "memory") 37 | #define psb_csync() asm volatile("hint #17" : : : "memory") 38 | #define csdb() asm volatile("hint #20" : : : "memory") 39 | #define mb() dsb(sy) 40 | #define rmb() dsb(ld) 41 | #define wmb() dsb(st) 42 | #define dma_rmb() dmb(oshld) 43 | #define dma_wmb() dmb(oshst) 44 | #define smp_mb() dmb(ish) 45 | #define smp_rmb() dmb(ishld) 46 | #define smp_wmb() dmb(ishst) 47 | 48 | #else /* No Arch */ 49 | /* TODO: No Arch Default */ 50 | #endif /* __x86_64__ */ 51 | 52 | #endif /* __ASM_BARRIER_H */ 53 | -------------------------------------------------------------------------------- /ext/linux/ticket_spinlock.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 ARM Ltd. 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License version 2 as 6 | * published by the Free Software Foundation. 7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | * 13 | * You should have received a copy of the GNU General Public License 14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 | */ 16 | 17 | /* aarch64 version is based on Linux 3.13 */ 18 | 19 | #include "atomics.h" 20 | 21 | unsigned long __attribute__((noinline)) lock_acquire (uint64_t *lock, unsigned long threadnum) { 22 | unsigned long depth = 0; 23 | #if defined(__x86_64__) 24 | asm volatile ( 25 | " movw $0,%[depth]\n" 26 | " nop\n" 27 | " nop\n" 28 | " nop\n" 29 | " mov $0x20000,%%eax\n" 30 | " lock xadd %%eax,%[lock]\n" 31 | " mov %%eax,%%edx\n" 32 | " mov %%eax,%[depth]\n" 33 | " shr $0x10,%%edx\n" 34 | " cmp %%ax,%%dx\n" 35 | " jne 2f\n" 36 | "1: nop\n" 37 | " jmp 4f\n" 38 | "2: movzwl %[lock],%%eax\n" 39 | " mov %%edx,%%ecx\n" 40 | " cmp %%dx,%%ax\n" 41 | " je 1b\n" 42 | "3: pause\n" 43 | " movzwl %[lock],%%eax\n" 44 | " cmp %%cx,%%ax\n" 45 | " jne 3b\n" 46 | "4:\n" 47 | : [lock] "+m" (*lock), [depth] "=m" (depth) 48 | : 49 | : "cc", "eax", "ecx", "edx", "ax", "cx", "dx" ); 50 | depth = (((depth >> 16) - (depth & 0xFFFF)) & 0xFFFF) >> 2; 51 | #elif defined(__aarch64__) 52 | unsigned tmp, tmp2, tmp3; 53 | asm volatile ( 54 | " mov %w[depth], #0\n" 55 | #if defined(USE_LSE) 56 | " mov %w[tmp3], #0x10000\n" 57 | " ldadda %w[tmp3], %w[tmp], %[lock]\n" 58 | " nop\n" 59 | " nop\n" 60 | #else 61 | "1: ldaxr %w[tmp], %[lock]\n" 62 | " add %w[tmp2], %w[tmp], #0x10, lsl #12\n" 63 | " stxr %w[tmp3], %w[tmp2], %[lock]\n" 64 | " cbnz %w[tmp3], 1b\n" 65 | #endif 66 | " eor %w[tmp2], %w[tmp], %w[tmp], ror #16\n" 67 | " cbz %w[tmp2], 3f\n" 68 | " and %w[tmp3], %w[tmp], #0xFFFF\n" 69 | " lsr %w[depth], %w[tmp], #16\n" 70 | " sub %w[depth], %w[depth], %w[tmp3]\n" 71 | " and %w[depth], %w[depth], #0xFFFF\n" 72 | " sevl\n" 73 | "2: wfe\n" 74 | " ldaxrh %w[tmp3], %[lock]\n" 75 | " eor %w[tmp2], %w[tmp3], %w[tmp], lsr #16\n" 76 | " cbnz %w[tmp2], 2b\n" 77 | "3:\n" 78 | : [tmp] "=&r" (tmp), [tmp2] "=&r" (tmp2), 79 | [tmp3] "=&r" (tmp3), [lock] "+Q" (*lock), 80 | [depth] "=&r" (depth) 81 | : 82 | : ); 83 | #endif 84 | 85 | return
depth; 86 | } 87 | 88 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) { 89 | #if defined(__x86_64__) 90 | asm volatile ( 91 | " addw $0x2,%[lock]\n" 92 | : [lock] "+m" (*lock) 93 | : 94 | : "cc" ); 95 | #elif defined(__aarch64__) 96 | unsigned long tmp; 97 | asm volatile ( 98 | #if defined(USE_LSE) 99 | " mov %w[tmp], #1\n" 100 | " staddlh %w[tmp], %[lock]\n" 101 | " nop\n" 102 | #else 103 | " ldrh %w[tmp], %[lock]\n" 104 | " add %w[tmp], %w[tmp], #0x1\n" 105 | " stlrh %w[tmp], %[lock]\n" 106 | #endif 107 | : [tmp] "=&r" (tmp), [lock] "+Q" (*lock) 108 | : 109 | : ); 110 | #endif 111 | } 112 | 113 | /* vim: set tabstop=8 shiftwidth=8 softtabstop=8 noexpandtab: */ 114 | -------------------------------------------------------------------------------- /ext/mysql/cas_event_mutex.h: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | 3 | Copyright (c) 1994, 2017, Oracle and/or its affiliates. All Rights Reserved. 4 | Copyright (c) 2017, The Linux Foundation. All rights reserved. 5 | 6 | This program is free software; you can redistribute it and/or modify it under 7 | the terms of the GNU General Public License as published by the Free Software 8 | Foundation; version 2 of the License. 9 | 10 | This program is distributed in the hope that it will be useful, but WITHOUT 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License along with 15 | this program; if not, write to the Free Software Foundation, Inc., 16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA 17 | 18 | *****************************************************************************/ 19 | 20 | /* Based on MySQL 5.7 */ 21 | #ifdef initialize_lock 22 | #undef initialize_lock 23 | #endif 24 | 25 | #define initialize_lock(lock, pinorder, threads) event_mutex_init(lock, threads) 26 | 27 | #include "atomics.h" 28 | #include "ut_atomics.h" 29 | 30 | unsigned long ev_generation = 0; 31 | 32 | typedef unsigned long ulint; 33 | 34 | /** Mutex states. */ 35 | enum mutex_state_t { 36 | /** Mutex is free */ 37 | MUTEX_STATE_UNLOCKED = 0, 38 | 39 | /** Mutex is acquired by some thread. */ 40 | MUTEX_STATE_LOCKED = 1, 41 | 42 | /** Mutex is contended and there are threads waiting on the lock. */ 43 | MUTEX_STATE_WAITERS = 2 44 | }; 45 | 46 | #define UT_RND1 151117737 // 901DFA9 47 | #define UT_RND2 119785373 // 723C79D 48 | #define UT_RND3 85689495 // 51B8497 49 | #define UT_RND4 76595339 // 490C08B 50 | #define UT_SUM_RND2 98781234 // 5E34832 51 | #define UT_SUM_RND3 126792457 // 78EB309 52 | #define UT_SUM_RND4 63498502 // 3C8E906 53 | #define UT_XOR_RND1 187678878 // B2FC09E 54 | #define UT_XOR_RND2 143537923 // 88E3703 55 | 56 | /** Seed value of ut_rnd_gen_ulint() */ 57 | ulint ut_rnd_ulint_counter = 65654363; 58 | 59 | /** Wakeup any waiting thread(s). */ 60 | 61 | void lock_signal(void) 62 | { 63 | unsigned long version = *((volatile unsigned long *) &ev_generation); 64 | 65 | 66 | *((volatile unsigned long *) &ev_generation) = (version + 1); 67 | } 68 | 69 | /** Try and acquire the lock using TestAndSet. 
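On aarch64 (unless USE_BUILTIN is defined) the swap is open-coded below as an LDAXR/STXR pair, so a held lock is detected right after the load without issuing the store; other builds fall back to a plain 64-bit atomic swap (swap64).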
70 | @return true if lock succeeded */ 71 | int tas_lock(uint64_t *lock) 72 | { 73 | #if defined(__aarch64__) && !defined(USE_BUILTIN) 74 | 75 | uint64_t lockValue; 76 | 77 | __asm__ __volatile__ ("ldaxr %[lockValue],[%[lockAddr]]" 78 | : [lockValue] "=r" (lockValue) 79 | : [lockAddr] "r" (lock) 80 | : "memory"); 81 | if (lockValue != MUTEX_STATE_UNLOCKED) 82 | return 0; 83 | 84 | uint32_t exResult; 85 | 86 | __asm__ __volatile__ ("stxr %w[exResult], %[lockValue], [%[lockAddr]]" 87 | : [exResult] "=&r" (exResult) 88 | : [lockAddr] "r" (lock), [lockValue] "r" ((long) MUTEX_STATE_LOCKED) 89 | : "memory"); 90 | 91 | return exResult == 0; 92 | #else 93 | return(swap64(lock, MUTEX_STATE_LOCKED) 94 | == MUTEX_STATE_UNLOCKED); 95 | #endif 96 | } 97 | 98 | /** In theory __sync_lock_release should be used to release the lock. 99 | Unfortunately, it does not work properly alone. The workaround is 100 | that more conservative __sync_lock_test_and_set is used instead. */ 101 | void tas_unlock(uint64_t *lock) 102 | { 103 | #if defined(__aarch64__) && !defined(USE_BUILTIN) 104 | __asm__ __volatile__ ("stlr %[lockValue],[%[lockAddr]]" 105 | : 106 | : [lockAddr] "r" (lock), [lockValue] "r" ((long) MUTEX_STATE_UNLOCKED) 107 | : "memory"); 108 | os_wmb; 109 | #else 110 | swap64(lock, MUTEX_STATE_UNLOCKED); 111 | #endif 112 | } 113 | 114 | 115 | 116 | /********************************************************//** 117 | The following function generates a series of 'random' ulint integers. 118 | @return the next 'random' number */ 119 | static inline 120 | ulint 121 | ut_rnd_gen_next_ulint( 122 | /*==================*/ 123 | ulint rnd) /*!< in: the previous random number value */ 124 | { 125 | ulint n_bits; 126 | 127 | n_bits = 8 * sizeof(ulint); 128 | 129 | rnd = UT_RND2 * rnd + UT_SUM_RND3; 130 | rnd = UT_XOR_RND1 ^ rnd; 131 | rnd = (rnd << 20) + (rnd >> (n_bits - 20)); 132 | rnd = UT_RND3 * rnd + UT_SUM_RND4; 133 | rnd = UT_XOR_RND2 ^ rnd; 134 | rnd = (rnd << 20) + (rnd >> (n_bits - 20)); 135 | rnd = UT_RND1 * rnd + UT_SUM_RND2; 136 | 137 | return(rnd); 138 | } 139 | 140 | /********************************************************//** 141 | The following function generates 'random' ulint integers which 142 | enumerate the value space of ulint integers in a pseudo random 143 | fashion. Note that the same integer is repeated always after 144 | 2 to power 32 calls to the generator (if ulint is 32-bit). 145 | @return the 'random' number */ 146 | static inline ulint 147 | ut_rnd_gen_ulint(void) 148 | /*==================*/ 149 | { 150 | ulint rnd; 151 | 152 | ut_rnd_ulint_counter = UT_RND1 * ut_rnd_ulint_counter + UT_RND2; 153 | 154 | rnd = ut_rnd_gen_next_ulint(ut_rnd_ulint_counter); 155 | 156 | return(rnd); 157 | } 158 | 159 | /********************************************************//** 160 | Generates a random integer from a given interval. 
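Note that low + (rnd % (high - low)) below never evaluates to high itself; high can only come back through the low == high early return.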
161 | @return the 'random' number */ 162 | ulint 163 | ut_rnd_interval( 164 | /*============*/ 165 | ulint low, /*!< in: low limit; can generate also this value */ 166 | ulint high) /*!< in: high limit; can generate also this value */ 167 | { 168 | ulint rnd; 169 | 170 | if (low == high) { 171 | 172 | return(low); 173 | } 174 | 175 | rnd = ut_rnd_gen_ulint(); 176 | 177 | return(low + (rnd % (high - low))); 178 | } 179 | 180 | ulint 181 | ut_delay( 182 | /*=====*/ 183 | ulint delay) /*!< in: delay in microseconds on 100 MHz Pentium */ 184 | { 185 | ulint i, j; 186 | 187 | j = 0; 188 | 189 | for (i = 0; i < delay * 50; i++) { 190 | j += i; 191 | UT_RELAX_CPU(); 192 | } 193 | 194 | return(j); 195 | } 196 | 197 | /** @return true if locked by some thread */ 198 | int is_locked(uint64_t *lock) 199 | { 200 | return(*lock != MUTEX_STATE_UNLOCKED); 201 | } 202 | 203 | /** Spin and wait for the mutex to become free. 204 | @param[in] max_spins max spins 205 | @param[in] max_delay max delay per spin 206 | @param[in,out] n_spins spin start index 207 | @return true if unlocked */ 208 | int is_free( 209 | uint64_t *lock, 210 | uint32_t max_spins, 211 | uint32_t max_delay, 212 | uint32_t *n_spins) 213 | { 214 | /* Spin waiting for the lock word to become zero. Note 215 | that we do not have to assume that the read access to 216 | the lock word is atomic, as the actual locking is always 217 | committed with atomic test-and-set. In reality, however, 218 | all processors probably have an atomic read of a memory word. */ 219 | 220 | do { 221 | if (!is_locked(lock)) { 222 | return(1); 223 | } 224 | 225 | ut_delay(ut_rnd_interval(0, max_delay)); 226 | 227 | ++(*n_spins); 228 | 229 | } while (*n_spins < max_spins); 230 | 231 | return(0); 232 | } 233 | 234 | void event_mutex_init(uint64_t *lock, uint64_t threads) { 235 | *lock = MUTEX_STATE_UNLOCKED; 236 | } 237 | 238 | /** Try and lock the mutex. Note: POSIX returns 0 on success. 239 | @return true on success */ 240 | int try_lock(uint64_t *lock) 241 | { 242 | return(tas_lock(lock)); 243 | } 244 | 245 | /** Release the mutex. */ 246 | void lock_exit(uint64_t *lock) 247 | { 248 | /* A problem: we assume that mutex_reset_lock word 249 | is a memory barrier, that is when we read the waiters 250 | field next, the read must be serialized in memory 251 | after the reset. A speculative processor might 252 | perform the read first, which could leave a waiting 253 | thread hanging indefinitely. 254 | 255 | Our current solution call every second 256 | sync_arr_wake_threads_if_sema_free() 257 | to wake up possible hanging threads if they are missed 258 | in mutex_signal_object. */ 259 | 260 | tas_unlock(lock); 261 | 262 | lock_signal(); 263 | } 264 | 265 | /** Spin while trying to acquire the mutex 266 | @param[in] max_spins max number of spins 267 | @param[in] max_delay max delay per spin 268 | @param[in] filename from where called 269 | @param[in] line within filename */ 270 | unsigned long spin_and_try_lock( 271 | uint64_t *lock, 272 | uint32_t max_spins, 273 | uint32_t max_delay) 274 | { 275 | uint32_t n_spins = 0; 276 | uint32_t n_waits = 0; 277 | const uint32_t step = max_spins; 278 | unsigned long wait_state; 279 | 280 | os_rmb; 281 | 282 | for (;;) { 283 | 284 | /* If the lock was free then try and acquire it. 
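Otherwise widen the spin budget (max_spins = n_spins + step) and fall back on the ev_generation counter: record its value, try the lock once more to close the race with a releaser, then spin until lock_exit() advances the generation.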
*/ 285 | 286 | if (is_free(lock, max_spins, max_delay, &n_spins)) { 287 | 288 | if (try_lock(lock)) { 289 | 290 | break; 291 | } else { 292 | 293 | continue; 294 | } 295 | 296 | } else { 297 | max_spins = n_spins + step; 298 | } 299 | 300 | ++n_waits; 301 | 302 | wait_state = *((volatile unsigned long *) &ev_generation); 303 | 304 | // Try lock one last time to avoid race with releaser 305 | if (try_lock(lock)) { 306 | break; 307 | } 308 | 309 | // Spin until generation changes 310 | while(*((volatile unsigned long *) &ev_generation) == wait_state); 311 | } 312 | 313 | return n_spins; 314 | } 315 | 316 | 317 | /** Acquire the mutex. 318 | @param[in] max_spins max number of spins 319 | @param[in] max_delay max delay per spin 320 | @param[in] filename from where called 321 | @param[in] line within filename */ 322 | unsigned long lock_enter(uint64_t *lock, 323 | uint32_t max_spins, 324 | uint32_t max_delay) 325 | { 326 | if (!try_lock(lock)) { 327 | return spin_and_try_lock(lock, max_spins, max_delay); 328 | } 329 | 330 | return 0; 331 | } 332 | 333 | 334 | static inline unsigned long lock_acquire (uint64_t *lock, unsigned long threadnum) { 335 | return lock_enter(lock, 30, 200); 336 | } 337 | 338 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) { 339 | lock_exit(lock); 340 | } 341 | 342 | /* vim: set tabstop=8 shiftwidth=8 softtabstop=8 noexpandtab: */ 343 | -------------------------------------------------------------------------------- /ext/mysql/event_mutex.h: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | 3 | Copyright (c) 2013, 2017, Oracle and/or its affiliates. All Rights Reserved. 4 | Copyright (c) 2017, The Linux Foundation. All rights reserved. 5 | 6 | This program is free software; you can redistribute it and/or modify it under 7 | the terms of the GNU General Public License as published by the Free Software 8 | Foundation; version 2 of the License. 9 | 10 | This program is distributed in the hope that it will be useful, but WITHOUT 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License along with 15 | this program; if not, write to the Free Software Foundation, Inc., 16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA 17 | 18 | *****************************************************************************/ 19 | 20 | /* Based on MySQL 5.7 */ 21 | #ifdef initialize_lock 22 | #undef initialize_lock 23 | #endif 24 | 25 | #define initialize_lock(lock, pinorder, threads) event_mutex_init(lock, threads) 26 | 27 | #include "atomics.h" 28 | #include "ut_atomics.h" 29 | 30 | unsigned long ev_generation = 0; 31 | 32 | typedef unsigned long ulint; 33 | 34 | /** Mutex states. */ 35 | enum mutex_state_t { 36 | /** Mutex is free */ 37 | MUTEX_STATE_UNLOCKED = 0, 38 | 39 | /** Mutex is acquired by some thread. */ 40 | MUTEX_STATE_LOCKED = 1, 41 | 42 | /** Mutex is contended and there are threads waiting on the lock. 
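(Defined for completeness: the tas_lock()/tas_unlock() paths in this file only ever install MUTEX_STATE_LOCKED and MUTEX_STATE_UNLOCKED.)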
*/ 43 | MUTEX_STATE_WAITERS = 2 44 | }; 45 | 46 | #define UT_RND1 151117737 // 901DFA9 47 | #define UT_RND2 119785373 // 723C79D 48 | #define UT_RND3 85689495 // 51B8497 49 | #define UT_RND4 76595339 // 490C08B 50 | #define UT_SUM_RND2 98781234 // 5E34832 51 | #define UT_SUM_RND3 126792457 // 78EB309 52 | #define UT_SUM_RND4 63498502 // 3C8E906 53 | #define UT_XOR_RND1 187678878 // B2FC09E 54 | #define UT_XOR_RND2 143537923 // 88E3703 55 | 56 | /** Seed value of ut_rnd_gen_ulint() */ 57 | ulint ut_rnd_ulint_counter = 65654363; 58 | 59 | /** Wakeup any waiting thread(s). */ 60 | 61 | void lock_signal(void) 62 | { 63 | unsigned long version = *((volatile unsigned long *) &ev_generation); 64 | 65 | 66 | *((volatile unsigned long *) &ev_generation) = (version + 1); 67 | } 68 | 69 | /** Try and acquire the lock using TestAndSet. 70 | @return true if lock succeeded */ 71 | int tas_lock(uint64_t *lock) 72 | { 73 | return(swap64(lock, MUTEX_STATE_LOCKED) 74 | == MUTEX_STATE_UNLOCKED); 75 | } 76 | 77 | /** In theory __sync_lock_release should be used to release the lock. 78 | Unfortunately, it does not work properly alone. The workaround is 79 | that more conservative __sync_lock_test_and_set is used instead. */ 80 | void tas_unlock(uint64_t *lock) 81 | { 82 | swap64(lock, MUTEX_STATE_UNLOCKED); 83 | } 84 | 85 | 86 | 87 | /********************************************************//** 88 | The following function generates a series of 'random' ulint integers. 89 | @return the next 'random' number */ 90 | static inline 91 | ulint 92 | ut_rnd_gen_next_ulint( 93 | /*==================*/ 94 | ulint rnd) /*!< in: the previous random number value */ 95 | { 96 | ulint n_bits; 97 | 98 | n_bits = 8 * sizeof(ulint); 99 | 100 | rnd = UT_RND2 * rnd + UT_SUM_RND3; 101 | rnd = UT_XOR_RND1 ^ rnd; 102 | rnd = (rnd << 20) + (rnd >> (n_bits - 20)); 103 | rnd = UT_RND3 * rnd + UT_SUM_RND4; 104 | rnd = UT_XOR_RND2 ^ rnd; 105 | rnd = (rnd << 20) + (rnd >> (n_bits - 20)); 106 | rnd = UT_RND1 * rnd + UT_SUM_RND2; 107 | 108 | return(rnd); 109 | } 110 | 111 | /********************************************************//** 112 | The following function generates 'random' ulint integers which 113 | enumerate the value space of ulint integers in a pseudo random 114 | fashion. Note that the same integer is repeated always after 115 | 2 to power 32 calls to the generator (if ulint is 32-bit). 116 | @return the 'random' number */ 117 | static inline ulint 118 | ut_rnd_gen_ulint(void) 119 | /*==================*/ 120 | { 121 | ulint rnd; 122 | 123 | ut_rnd_ulint_counter = UT_RND1 * ut_rnd_ulint_counter + UT_RND2; 124 | 125 | rnd = ut_rnd_gen_next_ulint(ut_rnd_ulint_counter); 126 | 127 | return(rnd); 128 | } 129 | 130 | /********************************************************//** 131 | Generates a random integer from a given interval. 
132 | @return the 'random' number */ 133 | ulint 134 | ut_rnd_interval( 135 | /*============*/ 136 | ulint low, /*!< in: low limit; can generate also this value */ 137 | ulint high) /*!< in: high limit; can generate also this value */ 138 | { 139 | ulint rnd; 140 | 141 | if (low == high) { 142 | 143 | return(low); 144 | } 145 | 146 | rnd = ut_rnd_gen_ulint(); 147 | 148 | return(low + (rnd % (high - low))); 149 | } 150 | 151 | ulint 152 | ut_delay( 153 | /*=====*/ 154 | ulint delay) /*!< in: delay in microseconds on 100 MHz Pentium */ 155 | { 156 | ulint i, j; 157 | 158 | j = 0; 159 | 160 | for (i = 0; i < delay * 50; i++) { 161 | j += i; 162 | UT_RELAX_CPU(); 163 | } 164 | 165 | return(j); 166 | } 167 | 168 | /** @return true if locked by some thread */ 169 | int is_locked(uint64_t *lock) 170 | { 171 | return(*lock != MUTEX_STATE_UNLOCKED); 172 | } 173 | 174 | /** Spin and wait for the mutex to become free. 175 | @param[in] max_spins max spins 176 | @param[in] max_delay max delay per spin 177 | @param[in,out] n_spins spin start index 178 | @return true if unlocked */ 179 | int is_free( 180 | uint64_t *lock, 181 | uint32_t max_spins, 182 | uint32_t max_delay, 183 | uint32_t *n_spins) 184 | { 185 | /* Spin waiting for the lock word to become zero. Note 186 | that we do not have to assume that the read access to 187 | the lock word is atomic, as the actual locking is always 188 | committed with atomic test-and-set. In reality, however, 189 | all processors probably have an atomic read of a memory word. */ 190 | 191 | do { 192 | if (!is_locked(lock)) { 193 | return(1); 194 | } 195 | 196 | ut_delay(ut_rnd_interval(0, max_delay)); 197 | 198 | ++(*n_spins); 199 | 200 | } while (*n_spins < max_spins); 201 | 202 | return(0); 203 | } 204 | 205 | void event_mutex_init(uint64_t *lock, uint64_t threads) { 206 | *lock = MUTEX_STATE_UNLOCKED; 207 | } 208 | 209 | /** Try and lock the mutex. Note: POSIX returns 0 on success. 210 | @return true on success */ 211 | int try_lock(uint64_t *lock) 212 | { 213 | return(tas_lock(lock)); 214 | } 215 | 216 | /** Release the mutex. */ 217 | void lock_exit(uint64_t *lock) 218 | { 219 | /* A problem: we assume that mutex_reset_lock word 220 | is a memory barrier, that is when we read the waiters 221 | field next, the read must be serialized in memory 222 | after the reset. A speculative processor might 223 | perform the read first, which could leave a waiting 224 | thread hanging indefinitely. 225 | 226 | Our current solution call every second 227 | sync_arr_wake_threads_if_sema_free() 228 | to wake up possible hanging threads if they are missed 229 | in mutex_signal_object. */ 230 | 231 | tas_unlock(lock); 232 | 233 | lock_signal(); 234 | } 235 | 236 | /** Spin while trying to acquire the mutex 237 | @param[in] max_spins max number of spins 238 | @param[in] max_delay max delay per spin 239 | @param[in] filename from where called 240 | @param[in] line within filename */ 241 | unsigned long spin_and_try_lock( 242 | uint64_t *lock, 243 | uint32_t max_spins, 244 | uint32_t max_delay) 245 | { 246 | uint32_t n_spins = 0; 247 | uint32_t n_waits = 0; 248 | const uint32_t step = max_spins; 249 | unsigned long wait_state; 250 | 251 | os_rmb; 252 | 253 | for (;;) { 254 | 255 | /* If the lock was free then try and acquire it. 
*/ 256 | 257 | if (is_free(lock, max_spins, max_delay, &n_spins)) { 258 | 259 | if (try_lock(lock)) { 260 | 261 | break; 262 | } else { 263 | 264 | continue; 265 | } 266 | 267 | } else { 268 | max_spins = n_spins + step; 269 | } 270 | 271 | ++n_waits; 272 | 273 | wait_state = *((volatile unsigned long *) &ev_generation); 274 | 275 | // Try lock one last time to avoid race with releaser 276 | if (try_lock(lock)) { 277 | break; 278 | } 279 | 280 | // Spin until generation changes 281 | while(*((volatile unsigned long *) &ev_generation) == wait_state); 282 | } 283 | 284 | return n_spins; 285 | } 286 | 287 | 288 | /** Acquire the mutex. 289 | @param[in] max_spins max number of spins 290 | @param[in] max_delay max delay per spin 291 | @param[in] filename from where called 292 | @param[in] line within filename */ 293 | unsigned long lock_enter(uint64_t *lock, 294 | uint32_t max_spins, 295 | uint32_t max_delay) 296 | { 297 | if (!try_lock(lock)) { 298 | return spin_and_try_lock(lock, max_spins, max_delay); 299 | } 300 | 301 | return 0; 302 | } 303 | 304 | 305 | static inline unsigned long lock_acquire (uint64_t *lock, unsigned long threadnum) { 306 | return lock_enter(lock, 30, 600); 307 | } 308 | 309 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) { 310 | lock_exit(lock); 311 | } 312 | -------------------------------------------------------------------------------- /ext/mysql/include/ut_atomics.h: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | 3 | Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. 4 | 5 | This program is free software; you can redistribute it and/or modify it under 6 | the terms of the GNU General Public License as published by the Free Software 7 | Foundation; version 2 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, but WITHOUT 10 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 11 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 12 | 13 | You should have received a copy of the GNU General Public License along with 14 | this program; if not, write to the Free Software Foundation, Inc., 15 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA 16 | 17 | *****************************************************************************/ 18 | 19 | #define os_rmb __atomic_thread_fence(__ATOMIC_ACQUIRE) 20 | #define os_wmb __atomic_thread_fence(__ATOMIC_RELEASE) 21 | 22 | #include "cpu_relax.h" 23 | 24 | #define UT_RELAX_CPU() __cpu_relax() 25 | -------------------------------------------------------------------------------- /ext/pagemap/include/pagemap.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: BSD-3-Clause 3 | * SPDX-FileCopyrightText: Copyright 2020 Ciro Santilli 4 | * 5 | * pagemap.h is retrieved from 6 | * https://raw.githubusercontent.com/cirosantilli/linux-kernel-module-cheat/master/lkmc/pagemap.h 7 | */ 8 | 9 | /* https://cirosantilli.com/linux-kernel-module-cheat#userland-physical-address-experiments 10 | * https://cirosantilli.com/linux-kernel-module-cheat#pagemap-dump-out 11 | * 12 | * This file is dual licensed as both 3-Clause BSD and GPLv3. 
13 | */ 14 | 15 | #ifndef LKMC_PAGEMAP_H 16 | #define LKMC_PAGEMAP_H 17 | 18 | #define _XOPEN_SOURCE 700 19 | #include <fcntl.h> /* open */ 20 | #include <stdint.h> /* uint64_t */ 21 | #include <stdio.h> /* snprintf */ 22 | #include <sys/types.h> 23 | #include <unistd.h> /* pread, sysconf */ 24 | 25 | /* Format documented at: 26 | * https://github.com/torvalds/linux/blob/v4.9/Documentation/vm/pagemap.txt 27 | */ 28 | typedef struct { 29 | uint64_t pfn : 55; 30 | unsigned int soft_dirty : 1; 31 | unsigned int file_page : 1; 32 | unsigned int swapped : 1; 33 | unsigned int present : 1; 34 | } LkmcPagemapEntry; 35 | 36 | /* Parse the pagemap entry for the given virtual address. 37 | * 38 | * @param[out] entry the parsed entry 39 | * @param[in] pagemap_fd file descriptor to an open /proc/pid/pagemap file 40 | * @param[in] vaddr virtual address to get entry for 41 | * @return 0 for success, 1 for failure 42 | */ 43 | int lkmc_pagemap_get_entry(LkmcPagemapEntry *entry, int pagemap_fd, uintptr_t vaddr) { 44 | size_t nread; 45 | ssize_t ret; 46 | uint64_t data; 47 | uintptr_t vpn; 48 | 49 | vpn = vaddr / sysconf(_SC_PAGE_SIZE); 50 | nread = 0; 51 | while (nread < sizeof(data)) { 52 | ret = pread( 53 | pagemap_fd, 54 | ((uint8_t*)&data) + nread, 55 | sizeof(data) - nread, 56 | vpn * sizeof(data) + nread 57 | ); 58 | nread += ret; 59 | if (ret <= 0) { 60 | return 1; 61 | } 62 | } 63 | entry->pfn = data & (((uint64_t)1 << 55) - 1); 64 | entry->soft_dirty = (data >> 55) & 1; 65 | entry->file_page = (data >> 61) & 1; 66 | entry->swapped = (data >> 62) & 1; 67 | entry->present = (data >> 63) & 1; 68 | return 0; 69 | } 70 | 71 | /* Convert the given virtual address to physical using /proc/PID/pagemap. 72 | * 73 | * @param[out] paddr physical address 74 | * @param[in] pid process to convert for 75 | * @param[in] vaddr virtual address to get entry for 76 | * @return 0 for success, 1 for failure 77 | */ 78 | int lkmc_pagemap_virt_to_phys_user(uintptr_t *paddr, pid_t pid, uintptr_t vaddr) { 79 | char pagemap_file[BUFSIZ]; 80 | int pagemap_fd; 81 | 82 | snprintf(pagemap_file, sizeof(pagemap_file), "/proc/%ju/pagemap", (uintmax_t)pid); 83 | pagemap_fd = open(pagemap_file, O_RDONLY); 84 | if (pagemap_fd < 0) { 85 | return 1; 86 | } 87 | LkmcPagemapEntry entry; 88 | if (lkmc_pagemap_get_entry(&entry, pagemap_fd, vaddr)) { 89 | return 1; 90 | } 91 | close(pagemap_fd); 92 | *paddr = (entry.pfn * sysconf(_SC_PAGE_SIZE)) + (vaddr % sysconf(_SC_PAGE_SIZE)); 93 | return 0; 94 | } 95 | 96 | #endif 97 | -------------------------------------------------------------------------------- /ext/sms/base/build_config.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017 ARM Limited. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | 4 | #pragma once 5 | 6 | // Architecture detection is inferred from the toolchain. This relies on 7 | // the C compiler's system-specific macros.
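// A quick way to check which of these predefined macros a toolchain sets, assuming a GCC/Clang-compatible driver: echo | cc -dM -E - | grep -E '__aarch64__|__arm__|__x86_64__|__i386__'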
#if defined(__aarch64__) 9 | #define CONFIG_ARCH_ARM_V8 10 | #define CONFIG_ARCH_64BIT 11 | #elif defined(__arm__) 12 | #define CONFIG_ARCH_ARM_V7 13 | #define CONFIG_ARCH_32BIT 14 | #elif defined(__x86_64__) 15 | #define CONFIG_ARCH_X86_64 16 | #define CONFIG_ARCH_64BIT 17 | #elif defined(__i386__) 18 | #define CONFIG_ARCH_X86 19 | #define CONFIG_ARCH_32BIT 20 | #endif 21 | 22 | #if !defined(CONFIG_ARCH_64BIT) && !defined(CONFIG_ARCH_32BIT) 23 | #error Please add support for N-bit computing to build_config.h 24 | // If you experience this C pre-processor error, take a look at the place 25 | // in this file where CONFIG_ARCH_64/32BIT are defined. If there are no issues 26 | // there and you are needing to add support for a new N-bit processor, please 27 | // search the source code for all occurrences of CONFIG_ARCH_64BIT and 28 | // CONFIG_ARCH_32BIT to check whether further modification is necessary. 29 | // These places will not necessarily #error for unsupported N-bit computing. 30 | #endif 31 | 32 | // OS detection is also inferred from the toolchain. 33 | #if defined(__APPLE__) 34 | #define OS_MACOSX 1 35 | #elif defined(__linux__) 36 | #define OS_LINUX 1 37 | #elif defined(__FreeBSD__) 38 | #define OS_FREEBSD 1 39 | #endif 40 | 41 | #if defined(OS_MACOSX) || defined(OS_LINUX) || defined(OS_FREEBSD) 42 | #define OS_POSIX 1 43 | #endif 44 | 45 | #define MAX_THREADS 32 46 | 47 | //Use LL/SC atomic primitives instead of __atomic_compare_exchange built-ins 48 | //This seems to be the most performant option on ARM but may violate 49 | //recommendations by the ARM architecture (e.g. no memory accesses between 50 | //LL and SC) 51 | //USE_LLSC overrides the use of __atomic_compare_exchange 52 | #ifdef __ARM_ARCH 53 | #define USE_LLSC 54 | #endif 55 | 56 | //Use barrier + relaxed store (DMB;STR) instead of store-release (STRL) 57 | //This is more performant on Cortex-A57 and possibly also on Cortex-A53 58 | #if defined(__aarch64__) 59 | #define USE_DMB 60 | #endif 61 | 62 | #if defined(USE_DMB) && defined(__arm__) 63 | #error USE_DMB optimization only applies to select ARMv8 processors 64 | #endif 65 | 66 | //Use ARM wait-for-event mechanism when busy polling 67 | //This will minimise interconnect transactions and often increase system-wide 68 | //performance 69 | #if defined __ARM_ARCH 70 | #define USE_WFE 71 | #if defined(__arm__) 72 | //TODO: WFE on ARMv7 73 | #undef USE_WFE 74 | #endif 75 | #endif 76 | -------------------------------------------------------------------------------- /ext/sms/base/cpu.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017 ARM Limited. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | 4 | #pragma once 5 | 6 | #ifndef CACHE_LINE 7 | // Default CPU cache line size 8 | #define CACHE_LINE 128 9 | #endif 10 | 11 | #include "cpu_relax.h" 12 | 13 | static inline void doze(void) 14 | { 15 | __cpu_relax(); 16 | } 17 | 18 | int num_cpus(void); 19 | 20 | unsigned long cpu_hz(void); 21 | -------------------------------------------------------------------------------- /ext/sms/base/llsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017 ARM Limited. All rights reserved.
2 | // SPDX-License-Identifier: BSD-3-Clause 3 | 4 | #pragma once 5 | 6 | #include "build_config.h" 7 | #include "cpu.h" 8 | 9 | #include <stdint.h> 10 | #include <stdlib.h> 11 | 12 | /****************************************************************************** 13 | * LL/SC primitives 14 | *****************************************************************************/ 15 | 16 | #if __ARM_ARCH == 7 || (__ARM_ARCH == 8 && __ARM_64BIT_STATE == 0) 17 | 18 | static inline void dmb() 19 | { 20 | __asm volatile("dmb" : : : "memory"); 21 | } 22 | 23 | static inline uint8_t ll8(uint8_t *var, int mm) 24 | { 25 | uint8_t old; 26 | __asm volatile("ldrexb %0, [%1]" 27 | : "=&r" (old) 28 | : "r" (var) 29 | : ); 30 | if (mm == __ATOMIC_ACQUIRE) 31 | dmb(); 32 | return old; 33 | } 34 | 35 | static inline uint32_t ll(uint32_t *var, int mm) 36 | { 37 | uint32_t old; 38 | __asm volatile("ldrex %0, [%1]" 39 | : "=&r" (old) 40 | : "r" (var) 41 | : ); 42 | //Barrier after an acquiring load 43 | if (mm == __ATOMIC_ACQUIRE) 44 | dmb(); 45 | return old; 46 | } 47 | #define ll32(a, b) ll((a), (b)) 48 | 49 | //Return 0 on success, 1 on failure 50 | static inline uint32_t sc(uint32_t *var, uint32_t neu, int mm) 51 | { 52 | uint32_t ret; 53 | //Barrier before a releasing store 54 | if (mm == __ATOMIC_RELEASE) 55 | dmb(); 56 | __asm volatile("strex %0, %1, [%2]" 57 | : "=&r" (ret) 58 | : "r" (neu), "r" (var) 59 | : ); 60 | return ret; 61 | } 62 | #define sc32(a, b, c) sc((a), (b), (c)) 63 | 64 | static inline uint64_t lld(uint64_t *var, int mm) 65 | { 66 | uint64_t old; 67 | __asm volatile("ldrexd %0, %H0, [%1]" 68 | : "=&r" (old) 69 | : "r" (var) 70 | : ); 71 | //Barrier after an acquiring load 72 | if (mm == __ATOMIC_ACQUIRE) 73 | dmb(); 74 | return old; 75 | } 76 | #define ll64(a, b) lld((a), (b)) 77 | 78 | //Return 0 on success, 1 on failure 79 | static inline uint32_t scd(uint64_t *var, uint64_t neu, int mm) 80 | { 81 | uint32_t ret; 82 | //Barrier before a releasing store 83 | if (mm == __ATOMIC_RELEASE) 84 | dmb(); 85 | __asm volatile("strexd %0, %1, %H1, [%2]" 86 | : "=&r" (ret) 87 | : "r" (neu), "r" (var) 88 | : ); 89 | return ret; 90 | } 91 | #define sc64(a, b, c) scd((a), (b), (c)) 92 | 93 | #endif 94 | 95 | #if __ARM_ARCH == 8 && __ARM_64BIT_STATE == 1 96 | 97 | static inline uint8_t ll8(uint8_t *var, int mm) 98 | { 99 | uint8_t old; 100 | if (mm == __ATOMIC_ACQUIRE) 101 | __asm volatile("ldaxrb %w0, [%1]" 102 | : "=&r" (old) 103 | : "r" (var) 104 | : "memory"); 105 | else if (mm == __ATOMIC_RELAXED) 106 | __asm volatile("ldxrb %w0, [%1]" 107 | : "=&r" (old) 108 | : "r" (var) 109 | : ); 110 | else 111 | abort(); 112 | return old; 113 | } 114 | 115 | static inline uint16_t ll16(uint16_t *var, int mm) 116 | { 117 | uint16_t old; 118 | if (mm == __ATOMIC_ACQUIRE) 119 | __asm volatile("ldaxrh %w0, [%1]" 120 | : "=&r" (old) 121 | : "r" (var) 122 | : "memory"); 123 | else if (mm == __ATOMIC_RELAXED) 124 | __asm volatile("ldxrh %w0, [%1]" 125 | : "=&r" (old) 126 | : "r" (var) 127 | : ); 128 | else 129 | abort(); 130 | return old; 131 | } 132 | 133 | static inline uint32_t ll32(uint32_t *var, int mm) 134 | { 135 | uint32_t old; 136 | if (mm == __ATOMIC_ACQUIRE) 137 | __asm volatile("ldaxr %w0, [%1]" 138 | : "=&r" (old) 139 | : "r" (var) 140 | : "memory"); 141 | else if (mm == __ATOMIC_RELAXED) 142 | __asm volatile("ldxr %w0, [%1]" 143 | : "=&r" (old) 144 | : "r" (var) 145 | : ); 146 | else 147 | abort(); 148 | return old; 149 | } 150 | 151 | //Return 0 on success, 1 on failure 152 | static inline uint8_t sc8(uint8_t *var, uint8_t neu,
int mm) 153 | { 154 | uint8_t ret; 155 | if (mm == __ATOMIC_RELEASE) 156 | __asm volatile("stlxrb %w0, %w1, [%2]" 157 | : "=&r" (ret) 158 | : "r" (neu), "r" (var) 159 | : "memory"); 160 | else if (mm == __ATOMIC_RELAXED) 161 | __asm volatile("stxrb %w0, %w1, [%2]" 162 | : "=&r" (ret) 163 | : "r" (neu), "r" (var) 164 | : ); 165 | else 166 | abort(); 167 | return ret; 168 | } 169 | 170 | //Return 0 on success, 1 on failure 171 | static inline uint32_t sc32(uint32_t *var, uint32_t neu, int mm) 172 | { 173 | uint32_t ret; 174 | if (mm == __ATOMIC_RELEASE) 175 | __asm volatile("stlxr %w0, %w1, [%2]" 176 | : "=&r" (ret) 177 | : "r" (neu), "r" (var) 178 | : "memory"); 179 | else if (mm == __ATOMIC_RELAXED) 180 | __asm volatile("stxr %w0, %w1, [%2]" 181 | : "=&r" (ret) 182 | : "r" (neu), "r" (var) 183 | : ); 184 | else 185 | abort(); 186 | return ret; 187 | } 188 | 189 | static inline uint64_t ll(uint64_t *var, int mm) 190 | { 191 | uint64_t old; 192 | if (mm == __ATOMIC_ACQUIRE) 193 | __asm volatile("ldaxr %0, [%1]" 194 | : "=&r" (old) 195 | : "r" (var) 196 | : "memory"); 197 | else if (mm == __ATOMIC_RELAXED) 198 | __asm volatile("ldxr %0, [%1]" 199 | : "=&r" (old) 200 | : "r" (var) 201 | : ); 202 | else 203 | abort(); 204 | return old; 205 | } 206 | #define ll64(a, b) ll((a), (b)) 207 | 208 | //Return 0 on success, 1 on failure 209 | static inline uint32_t sc(uint64_t *var, uint64_t neu, int mm) 210 | { 211 | uint32_t ret; 212 | if (mm == __ATOMIC_RELEASE) 213 | __asm volatile("stlxr %w0, %1, [%2]" 214 | : "=&r" (ret) 215 | : "r" (neu), "r" (var) 216 | : "memory"); 217 | else if (mm == __ATOMIC_RELAXED) 218 | __asm volatile("stxr %w0, %1, [%2]" 219 | : "=&r" (ret) 220 | : "r" (neu), "r" (var) 221 | : ); 222 | else 223 | abort(); 224 | return ret; 225 | } 226 | #define sc64(a, b, c) sc((a), (b), (c)) 227 | 228 | #if defined(__clang__) 229 | union i128 230 | { 231 | __int128 i128; 232 | int64_t i64[2]; 233 | }; 234 | #endif 235 | 236 | static inline __int128 lld(__int128 *var, int mm) 237 | { 238 | #if defined(__clang__) 239 | union i128 old; 240 | if (mm == __ATOMIC_ACQUIRE) 241 | __asm volatile("ldaxp %0, %1, [%2]" 242 | : "=&r" (old.i64[0]), "=&r" (old.i64[1]) 243 | : "r" (var) 244 | : "memory"); 245 | else if (mm == __ATOMIC_RELAXED) 246 | __asm volatile("ldxp %0, %1, [%2]" 247 | : "=&r" (old.i64[0]), "=&r" (old.i64[1]) 248 | : "r" (var) 249 | : ); 250 | else 251 | abort(); 252 | return old.i128; 253 | #else 254 | __int128 old; 255 | if (mm == __ATOMIC_ACQUIRE) 256 | __asm volatile("ldaxp %0, %H0, [%1]" 257 | : "=&r" (old) 258 | : "r" (var) 259 | : "memory"); 260 | else if (mm == __ATOMIC_RELAXED) 261 | __asm volatile("ldxp %0, %H0, [%1]" 262 | : "=&r" (old) 263 | : "r" (var) 264 | : ); 265 | else 266 | abort(); 267 | return old; 268 | #endif 269 | } 270 | 271 | //Return 0 on success, 1 on failure 272 | static inline uint32_t scd(__int128 *var, __int128 neu, int mm) 273 | { 274 | #if defined(__clang__) 275 | uint32_t ret; 276 | if (mm == __ATOMIC_RELEASE) 277 | __asm volatile("stlxp %w0, %1, %2, [%3]" 278 | : "=&r" (ret) 279 | : "r" (((union i128)neu).i64[0]), 280 | "r" (((union i128)neu).i64[1]), 281 | "r" (var) 282 | : "memory"); 283 | else if (mm == __ATOMIC_RELAXED) 284 | __asm volatile("stxp %w0, %1, %2, [%3]" 285 | : "=&r" (ret) 286 | : "r" (((union i128)neu).i64[0]), 287 | "r" (((union i128)neu).i64[1]), 288 | "r" (var) 289 | : ); 290 | else 291 | abort(); 292 | return ret; 293 | #else 294 | uint32_t ret; 295 | if (mm == __ATOMIC_RELEASE) 296 | __asm volatile("stlxp %w0, %1, %H1, [%2]" 
297 | : "=&r" (ret) 298 | : "r" (neu), "r" (var) 299 | : "memory"); 300 | else if (mm == __ATOMIC_RELAXED) 301 | __asm volatile("stxp %w0, %1, %H1, [%2]" 302 | : "=&r" (ret) 303 | : "r" (neu), "r" (var) 304 | : ); 305 | else 306 | abort(); 307 | return ret; 308 | #endif 309 | } 310 | #endif 311 | 312 | static inline void sevl(void) 313 | { 314 | #if defined __ARM_ARCH 315 | __asm volatile("sevl" : : : ); 316 | #endif 317 | } 318 | 319 | static inline void sev(void) 320 | { 321 | #if defined __ARM_ARCH 322 | __asm volatile("sev" : : : "memory"); 323 | #endif 324 | } 325 | 326 | static inline int wfe(void) 327 | { 328 | #if defined __ARM_ARCH 329 | __asm volatile("wfe" : : : "memory"); 330 | #endif 331 | return 1; 332 | } 333 | 334 | #ifdef USE_WFE 335 | #define SEVL() sevl() 336 | #define WFE() wfe() 337 | #define SEV() do { __asm volatile ("dsb ish" ::: "memory"); sev(); } while(0) 338 | #if __ARM_ARCH == 8 && __ARM_64BIT_STATE == 1 339 | #define LDXR128(addr, mo) lld((addr), (mo)) 340 | #endif 341 | #define LDXR64(addr, mo) ll64((addr), (mo)) 342 | #define LDXR32(addr, mo) ll32((addr), (mo)) 343 | #define LDXR16(addr, mo) ll16((addr), (mo)) 344 | #define LDXR8(addr, mo) ll8((addr), (mo)) 345 | #define LDXR(addr, mo) ll((addr), (mo)) 346 | //When using WFE we should not stall the pipeline using other means 347 | #define DOZE() (void)0 348 | #else 349 | #define SEVL() (void)0 350 | #define WFE() 1 351 | #define SEV() (void)0 352 | #define LDXR128(addr, mo) __atomic_load_n((addr), (mo)) 353 | #define LDXR64(addr, mo) __atomic_load_n((addr), (mo)) 354 | #define LDXR32(addr, mo) __atomic_load_n((addr), (mo)) 355 | #define LDXR16(addr, mo) __atomic_load_n((addr), (mo)) 356 | #define LDXR8(addr, mo) __atomic_load_n((addr), (mo)) 357 | #define LDXR(addr, mo) __atomic_load_n((addr), (mo)) 358 | #define DOZE() doze() 359 | #endif 360 | -------------------------------------------------------------------------------- /ext/sms/clh_spinlock.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 ARM Limited. All rights reserved. 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * Redistributions in binary form must reproduce the above copyright notice, this 12 | * list of conditions and the following disclaimer in the documentation and/or 13 | * other materials provided with the distribution. 14 | * 15 | * Neither the name of ARM Limited nor the names of its contributors may be used 16 | * to endorse or promote products derived from this software without specific 17 | * prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 27 | * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | /* 32 | * Arm Shared Memory Synchronization Benchmark (SMS) 33 | * commit: 85a4b2456f1c84e2235a527d8b2b69be99621e94 34 | * August 6 2018 35 | * 36 | * Description: 37 | * The CLH (Craig, Landin, and Hagersten) spinlock is a queue-based spinlock in 38 | * which each node spins on the previous node's wait status. The CLH spinlock is 39 | * starvation-free and has FCFS (first come, first served) order. Because each 40 | * thread spins on the previous node, which was created by another thread, CLH 41 | * performance may be worse than that of the MCS spinlock, which spins only on 42 | * local memory. However, this should not be a problem, because modern 43 | * architectures typically implement ccNUMA (cache coherent non-uniform memory 44 | * architecture), which coherently caches remote memory in a local cache line. 45 | * The remote memory may not be updated at all, and the changed status is 46 | * implicitly transferred to the spinning core by the interconnect's cache 47 | * coherence protocol. The CLH data structure is an implicit linked list; the 48 | * global_clh_lock only contains a cache-line aligned tail pointer and an initial 49 | * dummy clh_node. The main disadvantages of the CLH spinlock compared to the MCS 50 | * spinlock are: 1) it is slower than MCS on cacheless NUMA, and 2) wait-free back-off / time-out / abortable / hierarchical variants are hard to implement. 51 | * 52 | * Changes compared to the official CLH spinlock: 53 | * The official CLH spinlock reuses the previously released queue node. We use 54 | * per-thread pointers to indicate each thread's current local node. Therefore a 55 | * thread may spin on a queue node owned by another thread, and ccNUMA coherence 56 | * protocols will cache the remote DRAM in a local cache line. Overall 57 | * performance should be similar to the MCS spinlock. 58 | * 59 | * Internals: 60 | * The only LSE instruction is SWPAL, which exchanges the current node with the 61 | * lock tail. There is a tunable parameter -w which can be used to disable WFE. 62 | * All variables are cache-line aligned. Queue nodes are kept in a shared node 63 | * pool rather than in __thread storage (see the note above clh_nodeptr below); 64 | * clh_thread_local_init() initializes each thread's queue node. clh_lock() and 65 | * clh_unlock() strictly follow the original CLH algorithm. The global uint64_t lock pointer is unused. 66 | * 67 | * Workings: 68 | * clh_spinlock works similarly to osq_lock and queued_spinlock. 69 | * 70 | * Tuning Parameters: 71 | * 72 | * Optional without_wfe disables the wfe instruction and uses empty spin loops instead.
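 *
 * Hypothetical invocation (the binary name and thread-count flag are
 * illustrative; only the '-- -w' extension is defined by this file):
 *   $ ./lh_clh_spinlock -t 8 -- -w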
73 | * 74 | * [-- [-w]]: disable sevl and wfe 75 | * 76 | */ 77 | 78 | #pragma once 79 | 80 | #include "llsc.h" 81 | 82 | #include <stdbool.h> 83 | #include <stdlib.h> 84 | #include <errno.h> 85 | 86 | #ifdef initialize_lock 87 | #undef initialize_lock 88 | #endif 89 | 90 | #ifdef parse_test_args 91 | #undef parse_test_args 92 | #endif 93 | 94 | #ifdef thread_local_init 95 | #undef thread_local_init 96 | #endif 97 | 98 | #define initialize_lock(lock, pinorder, threads) clh_lock_init(lock, threads) 99 | #define parse_test_args(args, argc, argv) clh_parse_args(args, argc, argv) 100 | #define thread_local_init(smtid) clh_thread_local_init(smtid) 101 | 102 | 103 | struct clh_node 104 | { 105 | struct clh_node *prev; 106 | unsigned long wait; 107 | } __attribute__ ((aligned (CACHE_LINE))); 108 | 109 | struct clh_node_pointer 110 | { 111 | struct clh_node *ptr; 112 | } __attribute__ ((aligned (CACHE_LINE))); 113 | 114 | struct clh_lock 115 | { 116 | struct clh_node node; 117 | unsigned long num_cores; 118 | struct clh_node *tail __attribute__ ((aligned(CACHE_LINE))); 119 | }; 120 | 121 | static bool without_wfe; 122 | static struct clh_lock global_clh_lock; // clh lock queue 123 | /* 124 | * We cannot use __thread thread-local storage because some threads 125 | * may be joined early while their nodes are still referenced by 126 | * other threads, which would cause a memory access violation. We 127 | * therefore allocate from the main thread's heap and share common C 128 | * arrays. Two arrays are used here: one is a pointer array with a 129 | * fixed slot for each thread; the other is a node pool whose nodes 130 | * are initially assigned to threads by thread id. Then, following 131 | * the CLH algorithm, the current node reuses its previous node as 132 | * the next available node, and we update the fixed pointer array to 133 | * reflect this change. That is, each thread retrieves its next 134 | * available node from the fixed pointer array at its thread-id 135 | * offset, but the pointer value may point to any node in the 136 | * CLH node pool.
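 *
 * Illustrative round with two threads A and B (dummy is the initial
 * tail): A's clh_lock() swaps the tail, sees the dummy's wait == 0 and
 * enters; B's clh_lock() swaps the tail and spins on A's node; A's
 * clh_unlock() clears its node's wait flag, and A's pointer-array slot
 * is repointed at the dummy node, which A reuses on its next acquisition.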
137 | */ 138 | static struct clh_node_pointer *clh_nodeptr; // clh node pointer array 139 | static struct clh_node *clh_nodepool; // clh node struct array 140 | 141 | /* additional parameter to enable WFE (default) or disable WFE */ 142 | static void clh_parse_args(test_args_t * unused, int argc, char** argv) { 143 | int i = 0; 144 | #if defined(__aarch64__) 145 | without_wfe = false; 146 | #else 147 | /* only aarch64 supports WFE */ 148 | without_wfe = true; 149 | #endif 150 | 151 | /* extended options retrieved after '--' operator */ 152 | while ((i = getopt(argc, argv, "w")) != -1) 153 | { 154 | switch (i) { 155 | case 'w': 156 | without_wfe = true; 157 | break; 158 | 159 | default: 160 | fprintf(stderr, 161 | "clh_spinlock additional options after --:\n" 162 | "\t[-h print this msg]\n" 163 | "\t[-w without_wfe, aarch64 default is false, non-aarch64 default is true]\n"); 164 | exit(2); 165 | } 166 | } 167 | } 168 | 169 | static inline void clh_lock_init(uint64_t *u64_lock, unsigned long num_cores) 170 | { 171 | /* default tail node should be set to 0 */ 172 | global_clh_lock.node.prev = NULL; 173 | global_clh_lock.node.wait = 0; 174 | global_clh_lock.num_cores = num_cores; 175 | global_clh_lock.tail = &global_clh_lock.node; 176 | 177 | /* save the clh_lock pointer to the global uint64_t */ 178 | *u64_lock = (uint64_t)&global_clh_lock; 179 | 180 | /* calloc will initialize all memory to zero automatically */ 181 | if (clh_nodeptr) free(clh_nodeptr); 182 | clh_nodeptr = calloc(num_cores, sizeof(struct clh_node_pointer)); 183 | if (clh_nodeptr == NULL) exit(errno); 184 | 185 | 186 | if (clh_nodepool) free(clh_nodepool); 187 | clh_nodepool = calloc(num_cores, sizeof(struct clh_node)); 188 | if (clh_nodepool == NULL) exit(errno); 189 | 190 | #ifdef DDEBUG 191 | printf("CLH: global_clh_lock=%llx\n", (long long unsigned int) &global_clh_lock); 192 | #endif 193 | } 194 | 195 | static inline void clh_thread_local_init(unsigned long smtid) 196 | { 197 | /* initialize clh node pointer array individually */ 198 | clh_nodepool[smtid].wait = 1; 199 | clh_nodeptr[smtid].ptr = &clh_nodepool[smtid]; 200 | } 201 | 202 | static inline void clh_lock(struct clh_lock *lock, struct clh_node *node, bool use_wfe, unsigned long tid) 203 | { 204 | /* must set wait to 1 first, otherwise the next node behind the new tail will not spin */ 205 | node->wait = 1; 206 | struct clh_node *prev = node->prev = __atomic_exchange_n(&lock->tail, node, __ATOMIC_ACQ_REL); 207 | #ifdef DDEBUG 208 | printf("T%lu LOCK: prev<-node: %llx<-%llx\n", tid, (long long unsigned int)prev, (long long unsigned int)node); 209 | #endif 210 | 211 | /* CLH spinlock: spin on the previous node's wait status */ 212 | if (use_wfe) 213 | { 214 | if (__atomic_load_n(&prev->wait, __ATOMIC_ACQUIRE)) 215 | { 216 | SEVL(); 217 | while (WFE() && LDXR(&prev->wait, __ATOMIC_ACQUIRE)) 218 | { 219 | DOZE(); 220 | } 221 | } 222 | } 223 | else 224 | { 225 | while (__atomic_load_n(&prev->wait, __ATOMIC_ACQUIRE)) 226 | { 227 | ; 228 | } 229 | } 230 | } 231 | 232 | /* return the previous node as the reused node for the next clh_lock() */ 233 | static inline void clh_unlock(struct clh_node *node, unsigned long tid) 234 | { 235 | #ifdef DDEBUG 236 | printf("T%lu UNLOCK: node: %llx\n", tid, (long long unsigned int)node); 237 | #endif 238 | /* CLH spinlock: release the current node by resetting its wait status */ 239 | #ifdef USE_DMB 240 | __atomic_thread_fence(__ATOMIC_RELEASE); 241 | __atomic_store_n(&node->wait, 0, __ATOMIC_RELAXED); 242 | #else 243 | __atomic_store_n(&node->wait, 0,
__ATOMIC_RELEASE); 244 | #endif 245 | } 246 | 247 | /* standard lockhammer lock_acquire and lock_release interfaces */ 248 | static unsigned long __attribute__((noinline)) 249 | lock_acquire (uint64_t *lock, unsigned long threadnum) 250 | { 251 | clh_lock(&global_clh_lock, clh_nodeptr[threadnum].ptr, !without_wfe, threadnum); 252 | return 1; 253 | } 254 | 255 | static inline void lock_release (uint64_t *lock, unsigned long threadnum) 256 | { 257 | /* 258 | * Have to save prev first: once clh_unlock() is called, node->prev might 259 | * be overwritten by another thread, causing two threads to use the same 260 | * nodepool clh_node and thus producing a circular linked list after 261 | * another round of lock acquisition. 262 | */ 263 | struct clh_node* prev = clh_nodeptr[threadnum].ptr->prev; 264 | clh_unlock(clh_nodeptr[threadnum].ptr, threadnum); 265 | clh_nodeptr[threadnum].ptr = prev; 266 | } 267 | 268 | /* vim: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ 269 | -------------------------------------------------------------------------------- /ext/tbb/include/tbb.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2005-2018 Intel Corporation 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | 17 | 18 | 19 | */ 20 | 21 | /* 22 | * Based on: 23 | * 24 | * project: github.com/01org/tbb, files: 25 | * tbb/include/tbb/machine/gcc_generic.h, 26 | * tbb/include/tbb/machine/linux_intel64.h 27 | * 28 | * __TBB mappings: 29 | * 30 | * Only the logic needed for spin_rw_mutex was added - only 64b wide data 31 | * (wordsize == 8) and Linux are supported 32 | * 33 | * for Aarch64: default is GCC built-ins (based on gcc version), 34 | * alternative: lockhammer local atomics via USE_LOCAL, with or w/o USE_LSE 35 | * 36 | * for x86-64: default is lockhammer local atomics (which should be same 37 | * as machine/linux_intel64.h), alternative: GCC built-ins via 38 | * USE_GCC_BUILTINS (based on gcc version) 39 | * 40 | * For both ISAs, USE_LOCAL has higher priority than USE_GCC_BUILTINS if used 41 | * together. 42 | */ 43 | 44 | 45 | #ifndef __TBB_H 46 | #define __TBB_H 47 | 48 | #define _GNU_SOURCE 49 | 50 | #include "atomics.h" 51 | #include "cpu_relax.h" 52 | 53 | /* Non default configurations */ 54 | // #define USE_LOCAL 55 | // #define USE_LSE 56 | // #define USE_GCC_BUILTINS 57 | 58 | 59 | #ifndef NDEBUG 60 | #pragma message("Using debug build!!") 61 | #define DBG(fmt,...) \ 62 | do { fprintf(stderr, "tbb>%s:%d " fmt, \ 63 | __func__, __LINE__, ##__VA_ARGS__); } while (0); 64 | 65 | #define __TBB_ASSERT(b, msg) \ 66 | do { if (!(b)) { DBG("Assert: %s\n", msg); exit (1); } } while(0); 67 | 68 | #else /* NDEBUG */ 69 | 70 | #define DBG(fmt, ...)
do {} while (0); 71 | #define __TBB_ASSERT(b, msg) do { } while(0); 72 | 73 | #endif /* NDEBUG */ 74 | 75 | /* 76 | * spin; do not yield 77 | */ 78 | static inline void machine_pause (int32_t delay) { 79 | while(delay>0) { 80 | __cpu_relax(); 81 | delay--; 82 | } 83 | } 84 | 85 | #if defined(USE_LOCAL) || (defined(__x86_64__) && !defined(USE_GCC_BUILTINS)) 86 | #ifndef NDEBUG 87 | #pragma message("Using lockhammer atomics library!!") 88 | #endif /* NDEBUG */ 89 | 90 | /* 91 | * this really needs to be fetchadd64_release; however, we want to match 92 | * how intel-tbb uses the gcc built-ins. 93 | * 94 | * atomics.h is aware of the USE_LSE configuration, 95 | * so no need to do anything here. 96 | */ 97 | #define __TBB_machine_cmpswp8(P,V,C) cas64_acquire_release((unsigned long *) P,V,C) 98 | #define __TBB_machine_fetchadd8(P,V) fetchadd64_acquire_release((unsigned long *) P,V) 99 | #define __TBB_machine_fetchadd8release(P,V) fetchadd64_acquire_release((unsigned long *) P,V) 100 | 101 | static inline void __TBB_machine_or(volatile void* operand, uint64_t addend) { 102 | #if defined(__x86_64__) 103 | asm volatile( 104 | "lock\norq %1,%0" 105 | : "=m"(*(volatile uint64_t*)operand) 106 | : "r"(addend), "m"(*(volatile uint64_t*)operand) 107 | : "memory"); 108 | #elif defined(__aarch64__) 109 | #ifndef USE_LSE 110 | unsigned long old, newval, tmp; 111 | asm volatile( 112 | "1: ldaxr %[old], %[ptr]\n" 113 | " orr %[newval], %[old], %[val]\n" 114 | " stlxr %w[tmp], %[newval], %[ptr]\n" 115 | " cbnz %w[tmp], 1b\n" 116 | : [tmp] "=&r" (tmp), [old] "=&r" (old), [newval] "=&r" (newval), 117 | [ptr] "+Q" (*(unsigned long *)operand) 118 | : [val] "Lr" (addend) 119 | : ); 120 | #else /* USE_LSE */ 121 | // clobbering addend - to match gcc 122 | asm volatile( 123 | "ldsetal %[val], %[val], %[ptr]\n" 124 | : [val] "+&r" (addend), [ptr] "+Q" (*(unsigned long *)operand) 125 | : ); 126 | #endif /* USE_LSE */ 127 | #else 128 | /* Arch independent implementation */ 129 | for(;;) { 130 | uintptr_t tmp = *(volatile uintptr_t *)operand; 131 | uintptr_t result = __TBB_machine_cmpswp8(operand, tmp|addend, tmp); 132 | if( result==tmp ) break; 133 | } 134 | #endif /* ARCH */ 135 | } 136 | 137 | static inline void __TBB_machine_and(volatile void* operand, uint64_t addend) { 138 | #if defined(__x86_64__) 139 | asm volatile( 140 | "lock\nandq %1,%0" 141 | : "=m"(*(volatile uint64_t*)operand) 142 | : "r"(addend), "m"(*(volatile uint64_t*)operand) 143 | : "memory"); 144 | #elif defined(__aarch64__) 145 | #ifndef USE_LSE 146 | unsigned long old, newval, tmp; 147 | asm volatile( 148 | "1: ldaxr %[old], %[ptr]\n" 149 | " and %[newval], %[old], %[val]\n" 150 | " stlxr %w[tmp], %[newval], %[ptr]\n" 151 | " cbnz %w[tmp], 1b\n" 152 | : [tmp] "=&r" (tmp), [old] "=&r" (old), [newval] "=&r" (newval), 153 | [ptr] "+Q" (*(unsigned long *)operand) 154 | : [val] "Lr" (addend) 155 | : ); 156 | #else /* USE_LSE */ 157 | // clobbering addend - to match gcc 158 | asm volatile( 159 | "mvn %[val], %[val]\n" 160 | "ldclral %[val], %[val], %[ptr]\n" 161 | : [val] "+&r" (addend), [ptr] "+Q" (*(unsigned long *)operand) 162 | : ); 163 | #endif /* USE_LSE */ 164 | #else 165 | /* Arch independent implementation */ 166 | for(;;) { 167 | uintptr_t tmp = *(volatile uintptr_t *)operand; 168 | uintptr_t result = __TBB_machine_cmpswp8(operand, tmp&addend, tmp); 169 | if( result==tmp ) break; 170 | } 171 | #endif /* ARCH */ 172 | } 173 | 174 | #else /* GCC Built-ins */ 175 | 176 | #define __GCC_VERSION \ 177 | (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +
__GNUC_PATCHLEVEL__) 178 | 179 | #if __GCC_VERSION < 40700 /* use __sync* built-ins */ 180 | #ifndef NDEBUG 181 | #pragma message("Using old gcc (<4.7.0) built-ins!!") 182 | #endif /* NDEBUG */ 183 | 184 | #define __TBB_MACHINE_DEFINE_ATOMICS(S,T) \ 185 | inline T __TBB_machine_cmpswp##S( volatile void *ptr, T value, T comparand ) { \ 186 | return __sync_val_compare_and_swap((volatile T *)ptr,comparand,value); \ 187 | } \ 188 | inline T __TBB_machine_fetchadd##S( volatile void *ptr, T value ) { \ 189 | return __sync_fetch_and_add((volatile T *)ptr,value); \ 190 | } 191 | 192 | static inline void __TBB_machine_or( volatile void *ptr, uintptr_t addend ) { 193 | __sync_fetch_and_or((volatile uintptr_t *)ptr,addend); 194 | } 195 | 196 | static inline void __TBB_machine_and( volatile void *ptr, uintptr_t addend ) { 197 | __sync_fetch_and_and((volatile uintptr_t *)ptr,addend); 198 | } 199 | 200 | #else /* __GCC_VERSION >= 40700; use __atomic* built-ins */ 201 | #ifndef NDEBUG 202 | #pragma message("Using new gcc (>=4.7.0) built-ins!!") 203 | #endif /* NDEBUG */ 204 | 205 | #define __TBB_MACHINE_DEFINE_ATOMICS(S,T) \ 206 | inline T __TBB_machine_cmpswp##S( volatile void *ptr, T value, T comparand ) { \ 207 | (void)__atomic_compare_exchange_n((volatile T *)ptr, &comparand, value, \ 208 | 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \ 209 | return comparand; \ 210 | } \ 211 | inline T __TBB_machine_fetchadd##S( volatile void *ptr, T value ) { \ 212 | return __atomic_fetch_add((volatile T *)ptr, value, __ATOMIC_SEQ_CST); \ 213 | } 214 | 215 | static inline void __TBB_machine_or( volatile void *ptr, uintptr_t addend ) { 216 | __atomic_fetch_or((volatile uintptr_t *)ptr,addend,__ATOMIC_SEQ_CST); 217 | } 218 | 219 | static inline void __TBB_machine_and( volatile void *ptr, uintptr_t addend ) { 220 | __atomic_fetch_and((volatile uintptr_t *)ptr,addend,__ATOMIC_SEQ_CST); 221 | } 222 | 223 | #endif /* __GCC_VERSION */ 224 | 225 | /* only intptr_t for now */ 226 | __TBB_MACHINE_DEFINE_ATOMICS(8, intptr_t) 227 | 228 | /* 229 | * func: fetchaddNrelease 230 | * Scope for optimization on the AArch64 side: we may not need acquire semantics? 231 | */ 232 | #define __TBB_machine_fetchadd8release(P,V) __TBB_machine_fetchadd8(P,V) 233 | 234 | #endif /* USE_LOCAL, __x86_64__ && !USE_GCC_BUILTINS */ 235 | 236 | 237 | /* 238 | * Top level abstraction 239 | */ 240 | #define __TBB_machine_pause(C) machine_pause(C) 241 | #define __TBB_Yield() sched_yield() 242 | #define __TBB_Pause(C) __TBB_machine_pause(C) 243 | #define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswp8(P,V,C) 244 | #define __TBB_FetchAndAddW(P,V) __TBB_machine_fetchadd8(P,V) 245 | #define __TBB_FetchAndAddWrelease(P,V) __TBB_machine_fetchadd8release(P,V) 246 | #define __TBB_AtomicOR(P,V) __TBB_machine_or(P,V) 247 | #define __TBB_AtomicAND(P,V) __TBB_machine_and(P,V) 248 | 249 | /* TBB helper routines */ 250 | 251 | /* 252 | * From: class atomic_backoff : no_copy 253 | * 254 | * //! Class that implements exponential backoff. 255 | * 16 is approximately how many x86 'pause' instructions a context switch 256 | * takes; not changing it for now - do we need to change it? 257 | */ 258 | #define LOOPS_BEFORE_YIELD 16 259 | 260 | static inline void atomic_backoff__pause(int32_t *count) { 261 | if( *count<=LOOPS_BEFORE_YIELD ) { 262 | __TBB_Pause(*count); 263 | // Pause twice as long the next time. 264 | *count*=2; 265 | } else { 266 | // Pause is so long that we might as well yield CPU to scheduler.
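        // Illustrative progression: a caller that starts at count == 1
        // pauses for 1, 2, 4, 8, then 16 iterations (LOOPS_BEFORE_YIELD),
        // after which every further backoff falls through to this yield.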
267 | __TBB_Yield(); 268 | } 269 | } 270 | 271 | /* 272 | * Generic versions of helper functions if not defined by now 273 | */ 274 | #ifndef __TBB_AtomicOR 275 | #ifndef NDEBUG 276 | #pragma message("Using backoff based AtomicOR!!") 277 | #endif /* NDEBUG */ 278 | static inline void __TBB_AtomicOR(void* operand, uintmax_t addend) { 279 | int32_t count; 280 | for(count = 1;;atomic_backoff__pause(&count)) { 281 | uintptr_t tmp = *(volatile uintptr_t *)operand; 282 | uintptr_t result = __TBB_CompareAndSwapW(operand, tmp|addend, tmp); 283 | if( result==tmp ) break; 284 | } 285 | } 286 | #endif /* __TBB_AtomicOR */ 287 | 288 | #ifndef __TBB_AtomicAND 289 | #ifndef NDEBUG 290 | #pragma message("Using backoff based AtomicAND!!") 291 | #endif /* NDEBUG */ 292 | static inline void __TBB_AtomicAND(void* operand, uintptr_t addend) { 293 | int32_t count; 294 | for(count = 1;;atomic_backoff__pause(&count)) { 295 | uintptr_t tmp = *(volatile uintptr_t *)operand; 296 | uintptr_t result = __TBB_CompareAndSwapW(operand, tmp&addend, tmp); 297 | if( result==tmp ) break; 298 | } 299 | } 300 | #endif /* __TBB_AtomicAND */ 301 | #endif /* __TBB_H */ 302 | 303 | 304 | /* vim: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ 305 | -------------------------------------------------------------------------------- /ext/tbb/tbb_spin_rw_mutex.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2005-2018 Intel Corporation 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | 17 | 18 | 19 | */ 20 | 21 | /* 22 | * Based on: 23 | * 24 | * Project: github.com/01org/tbb, File: tbb/include/tbb/spin_rw_mutex.h 25 | * Tag: 2018_U3-0-g633b01a 26 | * 27 | * Description: 28 | * 29 | * This file implements a 'Fast, unfair, spinning reader-writer lock with 30 | * back-off and writer-preference'. The algorithm is based on 31 | * 'spin_rw_mutex' from the Intel TBB library. 32 | * 33 | * Internals: 34 | * 35 | * - Cutting through layers of abstraction in the original source code, I 36 | * made things not as clean as they were. However, during the porting 37 | * process, I tried to keep things as similar as possible to the setup in 38 | * the Intel TBB library. I ported only what is required for this 39 | * synchronization scheme to work. 40 | * 41 | * - The lockhammer/tbb.h file tries to provide __TBB-level abstractions 42 | * similar to tbb/include/tbb/tbb_machine.h, but it is primitive and 43 | * has only the definitions needed for this particular scheme. 44 | * 45 | * - The underlying atomic primitives come from GCC built-ins, as configured 46 | * in the gcc_generic.h file of the tbb project for Aarch64. For x86-64 they 47 | * are derived from the tbb/include/tbb/machine/linux_intel64.h file. The 48 | * expected ISA is either x86-64 (no TSX) or Aarch64, 64-bit only, and the 49 | * OS is Linux (for sched_yield). 50 | * 51 | * - For Aarch64, TBB uses the GCC generic atomic built-ins as a base. It 52 | * does not assume anything about the memory model or ISA.
So, the 53 | * implementation could be suboptimal. We inherit those traits here as 54 | * well. 55 | 56 | * - In lockhammer/tbb.h, there are several macros which allow you to 57 | * select which variant of atomics to use. For Aarch64, the default is GCC 58 | * built-ins, and for x86-64, the default is the local atomics supplied by 59 | * the file. These default choices are similar to the TBB setup. 60 | * 61 | * Changes from TBB: 62 | * 63 | * - One main change is in the definition of 'machine_pause()'. Here, it 64 | * first spins and then calls sched_yield(), unlike the default in TBB, 65 | * where it calls sched_yield() immediately (at least for Aarch64). 66 | * 67 | * - Does not implement the upgrade() or downgrade() methods 68 | * 69 | * - Not using C++ because it is difficult given this benchmark framework, 70 | * as well as the other complexities which come from pulling out a set of 71 | * classes from a class tree in tbb. 72 | * 73 | * Workings: 74 | * 75 | * This implements a classical reader-writer lock, which means the lock can 76 | * be held by a single writer or by a group of readers at the same time, 77 | * but not both. 78 | * 79 | * From tbb docs: " Mutual exclusion is necessary when at least one thread 80 | * writes to a shared variable. But it does no harm to permit multiple 81 | * readers into a protected region. The reader-writer variants of the 82 | * mutexes [...] enable multiple readers by distinguishing reader locks 83 | * from writer locks. There can be more than one reader lock on a given 84 | * mutex." 85 | * 86 | * When a writer first tries to acquire the lock, it will succeed if no 87 | * readers already hold it; otherwise, in the presence of readers, it sets 88 | * the writer-pending bit if that bit is not already set. After setting the 89 | * bit (or finding it already set), the writer starts backing off, 90 | * eventually yielding the CPU, until it obtains the lock. 91 | * 92 | * In the case of readers, more than one of them can be in the protected 93 | * section simultaneously. If no writer holds the lock and no writers are 94 | * pending, a reader can acquire the lock even in the presence of other 95 | * readers. While a writer holds the lock, a reader backs off and 96 | * eventually yields the CPU until the lock becomes available again. 97 | * 98 | * Readers/Writers ratio (-r) and Pure readers (-m): 99 | * 100 | * - The 'rw_mask' variable defines the ratio between readers and writers 101 | * per thread. It is controlled using the log2_ratio variable, cmdline arg -r. 102 | * 103 | * - Given the ratio, a thread will perform that many 'read_acquire' and 104 | * 'read_release' calls and then one 'write_acquire' and 'write_release'; 105 | * if more work remains, it repeats. 106 | * 107 | * For a thread: 108 | * 109 | * num readers 110 | * ----------- = 2^(log2_ratio) - 1; 111 | * num writers 112 | * 113 | * > log2_ratio of 0 means all writers 114 | * > log2_ratio of ~0 means all readers 115 | * > default log2_ratio is 6, i.e. 63 reads per write. 116 | * 117 | * - Pure readers are CPUs which will never perform a write acq/rel. The 118 | * cmdline arg is a bit mask, e.g. 0x8 will make the 4th cpu (cpu id: 0x3) 119 | * a pure reader. Default is 0x0, i.e. no pure readers.
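 *
 * Illustrative settings: '-r 3' gives 2^3 - 1 = 7 reads per write on
 * each thread; '-m 0x6' makes cpu ids 0x1 and 0x2 pure readers.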
120 | * 121 | */ 122 | 123 | #ifndef __TBB_spin_mutex_H 124 | #define __TBB_spin_mutex_H 125 | 126 | #ifdef initialize_lock 127 | #undef initialize_lock 128 | #endif 129 | 130 | #ifdef parse_test_args 131 | #undef parse_test_args 132 | #endif 133 | 134 | #define initialize_lock(lock, pinorder, threads) tbb_init_locks(lock, threads) 135 | #define parse_test_args(args, argc, argv) tbb_parse_args(args, argc, argv) 136 | 137 | #include "tbb.h" 138 | 139 | #define WRITER 1 140 | #define WRITER_PENDING 2 141 | #define READERS ~(WRITER | WRITER_PENDING) 142 | #define ONE_READER 4 143 | #define BUSY (WRITER | READERS) 144 | 145 | unsigned long log2_ratio = 0; 146 | unsigned long rw_mask = 0; 147 | unsigned long reader_cpu_mask = 0; 148 | 149 | typedef struct { 150 | unsigned long c; 151 | uint8_t pure_reader; 152 | } __attribute__((aligned(64))) rw_count_t; 153 | 154 | rw_count_t *rw_counts; 155 | 156 | inline uint8_t is_writer(unsigned long i, uint8_t val) { 157 | if (rw_counts[i].pure_reader) 158 | return 0; 159 | rw_counts[i].c += val; 160 | return !(rw_counts[i].c & rw_mask); 161 | } 162 | 163 | void tbb_print_usage() { 164 | fprintf(stderr, "tbb_spin_rw_mutex additional options:\n"); 165 | fprintf(stderr, "\t[-h print this msg]\n"); 166 | fprintf(stderr, "\t[-r reader/writer log ratio, default: 6 (2^(6)-1 readers per writer)]\n"); 167 | fprintf(stderr, "\t[-m pure reader cpu mask, default: 0x0 (no pure readers)]\n"); 168 | } 169 | 170 | void tbb_check_strtoul(unsigned long rval, char* endptr) { 171 | if ((errno == ERANGE && (rval == ULONG_MAX)) 172 | || (errno != 0 && rval == 0) || endptr == optarg) { 173 | fprintf(stderr, "tbb_spin_rw_mutex: value unsuitable for 'unsigned long'\n\n"); 174 | tbb_print_usage(); 175 | exit(1); 176 | } 177 | } 178 | 179 | void tbb_parse_args(test_args_t * unused, int argc, char** argv) { 180 | int i = 0; 181 | char *endptr; 182 | 183 | log2_ratio = 6; 184 | reader_cpu_mask = 0x0; 185 | 186 | while ((i = getopt(argc, argv, "hr:m:")) != -1) 187 | { 188 | switch (i) { 189 | case 'r': 190 | errno = 0; 191 | log2_ratio = strtoul(optarg, &endptr, 10); 192 | tbb_check_strtoul(log2_ratio, endptr); 193 | if (log2_ratio >= 64) { 194 | fprintf(stderr, "tbb_spin_rw_mutex: -r cannot be >= 64\n"); 195 | exit(1); 196 | } 197 | break; 198 | case 'm': 199 | errno = 0; 200 | if (!strncmp(optarg, "0x", 2)) 201 | reader_cpu_mask = strtoul(optarg, &endptr, 16); 202 | else 203 | reader_cpu_mask = strtoul(optarg, &endptr, 10); 204 | 205 | tbb_check_strtoul(reader_cpu_mask, endptr); 206 | break; 207 | case 'h': 208 | tbb_print_usage(); 209 | exit(0); 210 | case '?': 211 | default: 212 | tbb_print_usage(); 213 | exit(3); 214 | } 215 | } 216 | } 217 | 218 | void tbb_init_locks (unsigned long *lock, unsigned long cores) { 219 | unsigned i; 220 | rw_mask = ((1UL</dev/null 55 | then 56 | return 57 | fi 58 | 59 | id=`_gen_ChangeId` 60 | T="$MSG.tmp.$$" 61 | AWK=awk 62 | if [ -x /usr/xpg4/bin/awk ]; then 63 | # Solaris AWK is just too broken 64 | AWK=/usr/xpg4/bin/awk 65 | fi 66 | 67 | # Get core.commentChar from git config or use default symbol 68 | commentChar=`git config --get core.commentChar` 69 | commentChar=${commentChar:-#} 70 | 71 | # How this works: 72 | # - parse the commit message as (textLine+ blankLine*)* 73 | # - assume textLine+ to be a footer until proven otherwise 74 | # - exception: the first block is not footer (as it is the title) 75 | # - read textLine+ into a variable 76 | # - then count blankLines 77 | # - once the next textLine appears, print textLine+ blankLine* as these 78 |
# aren't footer 79 | # - in END, the last textLine+ block is available for footer parsing 80 | $AWK ' 81 | BEGIN { 82 | # while we start with the assumption that textLine+ 83 | # is a footer, the first block is not. 84 | isFooter = 0 85 | footerComment = 0 86 | blankLines = 0 87 | } 88 | 89 | # Skip lines starting with commentChar without any spaces before it. 90 | /^'"$commentChar"'/ { next } 91 | 92 | # Skip the line starting with the diff command and everything after it, 93 | # up to the end of the file, assuming it is only patch data. 94 | # If more than one line before the diff was empty, strip all but one. 95 | /^diff --git / { 96 | blankLines = 0 97 | while (getline) { } 98 | next 99 | } 100 | 101 | # Count blank lines outside footer comments 102 | /^$/ && (footerComment == 0) { 103 | blankLines++ 104 | next 105 | } 106 | 107 | # Catch footer comment 108 | /^\[[a-zA-Z0-9-]+:/ && (isFooter == 1) { 109 | footerComment = 1 110 | } 111 | 112 | /]$/ && (footerComment == 1) { 113 | footerComment = 2 114 | } 115 | 116 | # We have a non-blank line after blank lines. Handle this. 117 | (blankLines > 0) { 118 | print lines 119 | for (i = 0; i < blankLines; i++) { 120 | print "" 121 | } 122 | 123 | lines = "" 124 | blankLines = 0 125 | isFooter = 1 126 | footerComment = 0 127 | } 128 | 129 | # Detect that the current block is not the footer 130 | (footerComment == 0) && (!/^\[?[a-zA-Z0-9-]+:/ || /^[a-zA-Z0-9-]+:\/\//) { 131 | isFooter = 0 132 | } 133 | 134 | { 135 | # We need this information about the current last comment line 136 | if (footerComment == 2) { 137 | footerComment = 0 138 | } 139 | if (lines != "") { 140 | lines = lines "\n"; 141 | } 142 | lines = lines $0 143 | } 144 | 145 | # Footer handling: 146 | # If the last block is considered a footer, splice in the Change-Id at the 147 | # right place. 148 | # Look for the right place to inject Change-Id by considering 149 | # CHANGE_ID_AFTER. Keys listed in it (case insensitive) come first, 150 | # then Change-Id, then everything else (eg. Signed-off-by:). 151 | # 152 | # Otherwise just print the last block, a new line and the Change-Id as a 153 | # block of its own. 
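# Illustrative example (assuming CHANGE_ID_AFTER matches "Bug"): a footer
#   Bug: 42
#   Signed-off-by: A. Developer <a@example.com>
# becomes
#   Bug: 42
#   Change-Id: I<commit-hash>
#   Signed-off-by: A. Developer <a@example.com>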
154 | END { 155 | unprinted = 1 156 | if (isFooter == 0) { 157 | print lines "\n" 158 | lines = "" 159 | } 160 | changeIdAfter = "^(" tolower("'"$CHANGE_ID_AFTER"'") "):" 161 | numlines = split(lines, footer, "\n") 162 | for (line = 1; line <= numlines; line++) { 163 | if (unprinted && match(tolower(footer[line]), changeIdAfter) != 1) { 164 | unprinted = 0 165 | print "Change-Id: I'"$id"'" 166 | } 167 | print footer[line] 168 | } 169 | if (unprinted) { 170 | print "Change-Id: I'"$id"'" 171 | } 172 | }' "$MSG" > "$T" && mv "$T" "$MSG" || rm -f "$T" 173 | } 174 | _gen_ChangeIdInput() { 175 | echo "tree `git write-tree`" 176 | if parent=`git rev-parse "HEAD^0" 2>/dev/null` 177 | then 178 | echo "parent $parent" 179 | fi 180 | echo "author `git var GIT_AUTHOR_IDENT`" 181 | echo "committer `git var GIT_COMMITTER_IDENT`" 182 | echo 183 | printf '%s' "$clean_message" 184 | } 185 | _gen_ChangeId() { 186 | _gen_ChangeIdInput | 187 | git hash-object -t commit --stdin 188 | } 189 | 190 | 191 | add_ChangeId 192 | -------------------------------------------------------------------------------- /tools/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ARM-software/synchronization-benchmarks/9cc9fb6b5a5ddad855ead6aab88180c870d94a0d/tools/.gitignore --------------------------------------------------------------------------------