├── .bazelrc ├── AUTHORS.md ├── BUILD ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── WORKSPACE ├── abi.bzl ├── abi ├── 84 │ └── kernel │ │ └── ghost.h ├── 90 │ └── kernel │ │ └── ghost.h └── latest │ └── kernel │ └── ghost.h ├── bpf ├── bpf.bzl └── user │ ├── agent.c │ ├── agent.h │ ├── ghost_shared.h │ ├── schedclasstop.c │ ├── schedfair.c │ ├── schedghostidle.c │ ├── schedlat.c │ ├── schedlat_shared.h │ ├── schedrun.c │ └── schedrun_shared.h ├── docs └── ghostfs.md ├── experiments ├── antagonist │ ├── cfs_orchestrator.cc │ ├── cfs_orchestrator.h │ ├── ghost_orchestrator.cc │ ├── ghost_orchestrator.h │ ├── main.cc │ ├── options_test.cc │ ├── orchestrator.cc │ ├── orchestrator.h │ ├── orchestrator_test.cc │ ├── results.cc │ ├── results.h │ └── results_test.cc ├── microbenchmarks │ ├── global_scalability.cc │ └── ioctl_test.cc ├── rocksdb │ ├── cfs_orchestrator.cc │ ├── cfs_orchestrator.h │ ├── clock.h │ ├── database.cc │ ├── database.h │ ├── database_test.cc │ ├── ghost_orchestrator.cc │ ├── ghost_orchestrator.h │ ├── ingress.cc │ ├── ingress.h │ ├── latency.cc │ ├── latency.h │ ├── latency_test.cc │ ├── main.cc │ ├── options_test.cc │ ├── orchestrator.cc │ ├── orchestrator.h │ ├── orchestrator_test.cc │ ├── request.h │ └── synthetic_network_test.cc ├── scripts │ ├── BUILD │ ├── __init__.py │ ├── centralized_queuing.py │ ├── options.py │ ├── run.py │ ├── setup.py │ ├── shenango.py │ ├── shinjuku.py │ └── shinjuku_shenango.py └── shared │ ├── prio_table_helper.cc │ ├── prio_table_helper.h │ ├── thread_pool.cc │ ├── thread_pool.h │ ├── thread_pool_test.cc │ ├── thread_wait.cc │ └── thread_wait.h ├── kernel ├── ghost_uapi.h └── vmlinux_ghost_5_11.h ├── lib ├── agent.cc ├── agent.h ├── arr_structs.bpf.h ├── avl.bpf.h ├── base.cc ├── base.h ├── channel.cc ├── channel.h ├── enclave.cc ├── enclave.h ├── flux.h ├── ghost.cc ├── ghost.h ├── ghost_uapi.cc ├── ghost_uapi.h ├── logging.h ├── queue.bpf.h ├── scheduler.h ├── topology.cc ├── topology.h ├── 
trivial_status.cc └── trivial_status.h ├── requirements.txt ├── schedulers ├── biff │ ├── agent_biff.cc │ ├── biff_scheduler.cc │ └── biff_scheduler.h ├── cfs │ ├── README.md │ ├── cfs_agent.cc │ ├── cfs_scheduler.cc │ └── cfs_scheduler.h ├── cfs_bpf │ ├── agent_cfs.cc │ ├── cfs_scheduler.cc │ └── cfs_scheduler.h ├── edf │ ├── agent_exp.cc │ ├── edf_scheduler.cc │ ├── edf_scheduler.h │ ├── orchestrator.cc │ └── orchestrator.h ├── fifo │ ├── centralized │ │ ├── fifo_agent.cc │ │ ├── fifo_scheduler.cc │ │ └── fifo_scheduler.h │ └── per_cpu │ │ ├── fifo_agent.cc │ │ ├── fifo_scheduler.cc │ │ └── fifo_scheduler.h ├── flux │ ├── agent_flux.cc │ ├── flux_scheduler.cc │ └── flux_scheduler.h ├── shinjuku │ ├── agent_shinjuku.cc │ ├── shinjuku_orchestrator.cc │ ├── shinjuku_orchestrator.h │ ├── shinjuku_scheduler.cc │ └── shinjuku_scheduler.h └── sol │ ├── agent_sol.cc │ ├── sol_scheduler.cc │ └── sol_scheduler.h ├── shared ├── fd_server.cc ├── fd_server.h ├── prio_table.cc ├── prio_table.h ├── shmem.cc └── shmem.h ├── tests ├── agent_test.cc ├── api_test.cc ├── base_test.cc ├── biff_test.cc ├── bpf_avl_test.cc ├── bpf_queue_test.cc ├── capabilities_test.cc ├── capabilities_test.h ├── cfs_bpf_test.cc ├── cfs_test.cc ├── channel_test.cc ├── edf_test.cc ├── enclave_test.cc ├── fd_server_test.cc ├── flux_test.cc ├── prio_table_test.cc ├── simple_cfs.cc ├── simple_edf.cc ├── simple_exp.cc ├── sol_test.cc └── topology_test.cc ├── third_party ├── BUILD.bazel ├── bpf │ ├── BUILD │ ├── LICENSE │ ├── biff.bpf.c │ ├── biff_bpf.h │ ├── biff_flux.bpf.c │ ├── biff_flux_bpf.h │ ├── cfs.bpf.c │ ├── cfs_bpf.h │ ├── common.bpf.h │ ├── edf.bpf.c │ ├── edf.h │ ├── flux.bpf.c │ ├── flux_api.bpf.c │ ├── flux_bpf.h │ ├── flux_dispatch.bpf.c │ ├── flux_header_bpf.h │ ├── ghost_shared_bpf.h │ ├── idle_flux.bpf.c │ ├── idle_flux_bpf.h │ ├── pntring.bpf.h │ ├── pntring_funcs.bpf.h │ ├── prov_flux.bpf.c │ ├── prov_flux_bpf.h │ ├── roci_flux.bpf.c │ ├── roci_flux_bpf.h │ ├── schedclasstop.bpf.c │ ├── 
schedfair.bpf.c │ ├── schedfair.h │ ├── schedghostidle.bpf.c │ ├── schedlat.bpf.c │ ├── schedlat.h │ ├── schedlat_shared_bpf.h │ ├── schedrun.bpf.c │ ├── schedrun.h │ ├── schedrun_shared_bpf.h │ ├── test.bpf.c │ ├── topology.bpf.h │ └── vmlinux_ghost.h ├── iovisor_bcc │ ├── LICENSE │ ├── bits.bpf.h │ └── trace_helpers.h ├── linux.BUILD ├── rocksdb.BUILD └── util │ ├── LICENSE │ └── util.h └── util ├── cgroup_scraper.sh ├── enclave_watcher.cc ├── fdcat.cc ├── fdsrv.cc └── pushtosched.cc /.bazelrc: -------------------------------------------------------------------------------- 1 | build --cxxopt='-std=c++2a' 2 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | The people below have contributed to ghOSt. The list is ordered by when each 2 | person first joined the project. If you contribute to ghOSt, feel free to add 3 | your name to the end. 4 | 5 |
    6 |
  1. Paul Turner
  2. 7 |
  3. Neel Natu
  4. 8 |
  5. Ashwin Chaugule
  6. 9 |
  7. Oleg Rombakh
  8. 10 |
  9. Jack Humphries
  10. 11 |
  11. Christos Kozyrakis
  12. 12 |
  13. Luigi Rizzo
  14. 13 |
  15. Barret Rhoden
  16. 14 |
  17. Josh Don
  18. 15 |
  19. Ofir Weisse
  20. 16 |
  21. Hannah Pan
  22. 17 |
  23. Sourav Panda
  24. 18 |
  25. Andrew Delgadillo
  26. 19 |
  27. Dohyun Kim
  28. 20 |
  29. Stanko Novakovic
  30. 21 |
22 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement (CLA). You (or your employer) retain the copyright to your 10 | contribution; this simply gives us permission to use and redistribute your 11 | contributions as part of the project. Head over to 12 | <https://cla.developers.google.com/> to see your current agreements on file or 13 | to sign a new one. 14 | 15 | You generally only need to submit a CLA once, so if you've already submitted one 16 | (even if it was for a different project), you probably don't need to do it 17 | again. 18 | 19 | ## Code Reviews 20 | 21 | All submissions, including submissions by project members, require review. We 22 | use GitHub pull requests for this purpose. Consult 23 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 24 | information on using pull requests. 25 | 26 | ## Community Guidelines 27 | 28 | This project follows 29 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/). 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 Google LLC 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 
9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google LLC nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | To report a security issue, please email 2 | [kernel-ghost@google.com](mailto:kernel-ghost@google.com) with a description of 3 | the issue, the steps you took to create the issue, affected versions, and, if 4 | known, mitigations for the issue. Our vulnerability management team will respond 5 | within 3 working days of your email. If the issue is confirmed as a 6 | vulnerability, we will open a Security Advisory. This project follows a 90 day 7 | disclosure timeline. 
8 | -------------------------------------------------------------------------------- /bpf/bpf.bzl: -------------------------------------------------------------------------------- 1 | """The open source build rules for eBPF programs and skeleton headers.""" 2 | 3 | def bpf_program(name, src, hdrs, bpf_object, macros = [], **kwargs): 4 | """Generates an eBPF object file from .c source code. 5 | 6 | Args: 7 | name: target name for eBPF program. 8 | src: eBPF program source code in C. 9 | hdrs: list of header files depended on by src. 10 | bpf_object: name of generated eBPF object file. 11 | macros: additional macros that will be passed to clang. 12 | **kwargs: additional arguments. 13 | """ 14 | native.genrule( 15 | name = name, 16 | srcs = ["@linux//:libbpf"] + [src] + hdrs, 17 | outs = [bpf_object], 18 | cmd = ( 19 | "clang-12 -g -O2 -target bpf -D__TARGET_ARCH_x86 -D__x86_64__ " + 20 | # The `.` directory is the project root, so we pass it with the `-I` 21 | # flag so that #includes work in the source files. 22 | # 23 | # `$(BINDIR)/external/linux` contains the outputs of the targets in 24 | # linux.BUILD. Thus, the headers for libbpf are within that 25 | # directory at libbpf/include/* 26 | # (i.e., $(BINDIR)/external/linux/libbpf/include/*). 27 | # 28 | # `$@` is the location to write the eBPF object file. 29 | "-I . -I /usr/include/x86_64-linux-gnu " + 30 | "-I $(BINDIR)/external/linux/libbpf/include " + 31 | "-c $(location " + src + ") -o $@ " + 32 | "".join([" -D%s" % m for m in macros]) + " && llvm-strip -g $@" 33 | ), 34 | **kwargs 35 | ) 36 | 37 | def bpf_skeleton(name, bpf_object, skel_hdr, **kwargs): 38 | """Generates eBPF skeleton from object file to .c source code. 39 | 40 | Args: 41 | name: target name for eBPF program. 42 | bpf_object: built eBPF program. 43 | skel_hdr: name of generated skeleton header file. 44 | **kwargs: additional arguments. 
45 | """ 46 | native.genrule( 47 | name = name, 48 | # bpftool does not seem to be compiled when I include it in the `tools` 49 | # attribute list instead. 50 | srcs = ["@linux//:bpftool", bpf_object], 51 | outs = [skel_hdr], 52 | cmd = ( 53 | "$(BINDIR)/external/linux/bpftool/bin/bpftool gen skeleton $(location " + bpf_object + ") > $@ && " + 54 | # The libbpf headers are located in `libbpf` rather than `bpf`. 55 | "sed -i 's/#include 11 | 12 | #include "libbpf/bpf.h" 13 | #include "libbpf/libbpf.h" 14 | 15 | #include "lib/ghost_uapi.h" // for GHOST_VERSION. 16 | 17 | // See e.g. smp_store_release(). We can't check when we compile the BPF 18 | // programs, which are built with clang -target bpf, but all agents that load 19 | // bpf programs include this header. 20 | #ifndef __x86_64__ 21 | #error "BPF shared memory sync only works on x86" 22 | #endif 23 | 24 | #ifdef __cplusplus 25 | extern "C" { 26 | #endif 27 | 28 | #ifndef GHOST_BPF 29 | // The definitions below are needed when the userspace code is compiled on a 30 | // machine that is *not* running the ghOSt kernel and therefore does not have 31 | // the ghOSt declarations below in the bpf.h UAPI header. 32 | 33 | // From include/uapi/linux/bpf.h for the ghost kernel. 
34 | 35 | enum { 36 | BPF_PROG_TYPE_GHOST_SCHED = 1000, 37 | BPF_PROG_TYPE_GHOST_MSG, 38 | #if GHOST_VERSION >= 83 39 | BPF_PROG_TYPE_GHOST_SELECT_RQ, 40 | #endif 41 | #if GHOST_VERSION >= 84 42 | BPF_PROG_TYPE_GHOST_HALT_POLL, 43 | #endif 44 | 45 | BPF_GHOST_SCHED_PNT = 2000, 46 | BPF_GHOST_MSG_SEND, 47 | #if GHOST_VERSION >= 83 48 | BPF_GHOST_SELECT_RQ, 49 | #endif 50 | #if GHOST_VERSION >= 84 51 | BPF_GHOST_HALT_POLL, 52 | #endif 53 | __MAX_BPF_GHOST_ATTACH_TYPE 54 | }; 55 | 56 | // end include/uapi/linux/bpf.h 57 | 58 | #endif 59 | 60 | // Generic BPF helpers 61 | 62 | size_t bpf_map__mmap_sz(struct bpf_map *map); 63 | void *bpf_map__mmap(struct bpf_map *map); 64 | int bpf_map__munmap(struct bpf_map *map, void *addr); 65 | void bpf_program__set_types(struct bpf_program *prog, int prog_type, 66 | int expected_attach_type); 67 | 68 | // Common BPF initialization 69 | // 70 | // Returns 0 on success, -1 with errno set on failure. 71 | int agent_bpf_init(void); 72 | 73 | // Registers `prog` to be inserted at attach point `eat` during 74 | // agent_bpf_insert_registered(). You must load the programs before calling 75 | // insert. You may call this repeatedly, and it will only insert each program 76 | // once. In particular, you may temporarily get EBUSY during an agent handoff. 77 | // 78 | // Returns 0 on success, -1 with errno set on failure. 79 | int agent_bpf_register(struct bpf_program *prog, int eat); 80 | 81 | // Inserts the programs you previously registered and loaded. 82 | // 83 | // Returns 0 on success, -1 with errno set on failure. Any programs inserted 84 | // are not removed on error; call agent_bpf_destroy() or just exit your process. 85 | int agent_bpf_insert_registered(int ctl_fd); 86 | 87 | // Gracefully unlinks and unloads the BPF programs. When agents call this, they 88 | // explicitly close (and thus unlink/detach) BPF programs from the enclave, 89 | // which will speed up agent upgrade/handoff. 
90 | void agent_bpf_destroy(void); 91 | 92 | enum { 93 | AGENT_BPF_TRACE_SCHEDGHOSTIDLE, 94 | MAX_AGENT_BPF_TRACE, 95 | }; 96 | 97 | int agent_bpf_trace_init(unsigned int type); 98 | void agent_bpf_trace_output(FILE *to, unsigned int type); 99 | void agent_bpf_trace_reset(unsigned int type); 100 | 101 | #ifdef __cplusplus 102 | } /* extern "C" */ 103 | #endif 104 | 105 | #endif // GHOST_BPF_USER_AGENT_H_ 106 | -------------------------------------------------------------------------------- /bpf/user/ghost_shared.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef GHOST_LIB_BPF_GHOST_SHARED_H_ 18 | #define GHOST_LIB_BPF_GHOST_SHARED_H_ 19 | 20 | // Keep this file's structs in sync with bpf/ghost_shared_bpf.h. 21 | // We need different headers for BPF and C programs due to various Google3 22 | // reasons. 
23 | 24 | #include 25 | 26 | struct ghost_per_cpu_data { 27 | uint8_t want_tick; 28 | } __attribute__((aligned(64))); 29 | 30 | #endif // GHOST_LIB_BPF_GHOST_SHARED_H_ 31 | -------------------------------------------------------------------------------- /bpf/user/schedghostidle.c: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "bpf/user/bpf_schedghostidle.skel.h" 17 | #include "third_party/iovisor_bcc/trace_helpers.h" 18 | #include "libbpf/bpf.h" 19 | #include "libbpf/libbpf.h" 20 | 21 | #define handle_error(msg) \ 22 | do { perror(msg); exit(-1); } while (0) 23 | 24 | /* Keep this in sync with schedghostidle.bpf.c. 
*/ 25 | #define NR_SLOTS 25 26 | 27 | static uint64_t start_time_ns, print_time_ns; 28 | 29 | static void print_hist(int fd) 30 | { 31 | unsigned int nr_cpus = libbpf_num_possible_cpus(); 32 | unsigned int hist[NR_SLOTS] = {0}; 33 | uint64_t *count; 34 | uint64_t total = 0; 35 | float total_sec; 36 | 37 | count = calloc(nr_cpus, sizeof(*count)); 38 | if (!count) 39 | handle_error("calloc"); 40 | 41 | for (int i = 0; i < NR_SLOTS; i++) { 42 | if (bpf_map_lookup_elem(fd, &i, count)) 43 | handle_error("lookup"); 44 | hist[i] = 0; 45 | for (int c = 0; c < nr_cpus; c++) { 46 | hist[i] += count[c]; 47 | total += count[c]; 48 | } 49 | } 50 | free(count); 51 | 52 | printf("\n"); 53 | printf("Latency of a CPU going Idle until a task is Latched:\n"); 54 | printf("----------------------------------------------------\n"); 55 | print_log2_hist(hist, NR_SLOTS, "usec"); 56 | 57 | total_sec = 1.0 * (print_time_ns - start_time_ns) / NSEC_PER_SEC; 58 | printf("\nTotal: %lu events over %f seconds (%f / sec) on %u cpus\n\n", 59 | total, total_sec, total / total_sec, nr_cpus); 60 | } 61 | 62 | static volatile bool exiting; 63 | 64 | static void sig_hand(int signr) 65 | { 66 | exiting = true; 67 | } 68 | 69 | static struct sigaction sigact = {.sa_handler = sig_hand}; 70 | 71 | int main(int argc, char **argv) 72 | { 73 | struct bpf_schedghostidle_bpf *obj; 74 | int err; 75 | 76 | sigaction(SIGINT, &sigact, 0); 77 | err = bump_memlock_rlimit(); 78 | if (err) { 79 | fprintf(stderr, "failed to increase rlimit: %d\n", err); 80 | return -1; 81 | } 82 | 83 | obj = bpf_schedghostidle_bpf__open_and_load(); 84 | if (!obj) { 85 | fprintf(stderr, "failed to open BPF object\n"); 86 | return -1; 87 | } 88 | 89 | err = bpf_schedghostidle_bpf__attach(obj); 90 | if (err) { 91 | fprintf(stderr, "failed to attach BPF programs\n"); 92 | goto cleanup; 93 | } 94 | 95 | start_time_ns = get_ktime_ns(); 96 | 97 | printf("Ctrl-c to exit\n"); 98 | 99 | while (!exiting) 100 | sleep(9999999); 101 | 102 | 
print_time_ns = get_ktime_ns(); 103 | print_hist(bpf_map__fd(obj->maps.hist)); 104 | 105 | printf("Total latches: %lu, bpf_latches %lu (%f), idle_to_bpf_latches %lu (%f)\n\n", 106 | obj->bss->nr_latches, 107 | obj->bss->nr_bpf_latches, 108 | 100.0 * obj->bss->nr_bpf_latches / obj->bss->nr_latches, 109 | obj->bss->nr_idle_to_bpf_latches, 110 | 100.0 * obj->bss->nr_idle_to_bpf_latches / obj->bss->nr_latches); 111 | 112 | cleanup: 113 | bpf_schedghostidle_bpf__destroy(obj); 114 | 115 | return 0; 116 | } 117 | -------------------------------------------------------------------------------- /bpf/user/schedlat.c: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "third_party/bpf/schedlat.h" 16 | #include "bpf/user/schedlat_bpf.skel.h" 17 | #include "third_party/iovisor_bcc/trace_helpers.h" 18 | #include "libbpf/bpf.h" 19 | #include "libbpf/libbpf.h" 20 | 21 | #define handle_error(msg) \ 22 | do { perror(msg); exit(-1); } while (0) 23 | 24 | static const char *titles[] = { 25 | [RUNNABLE_TO_LATCHED] = "Latency from Runnable to Latched", 26 | [LATCHED_TO_RUN] = "Latency from Latched to Run", 27 | [RUNNABLE_TO_RUN] = "Latency from Runnable to Run", 28 | }; 29 | 30 | static void print_hists(int fd) 31 | { 32 | unsigned int nr_cpus = libbpf_num_possible_cpus(); 33 | struct hist *hist; 34 | uint32_t total[MAX_NR_HIST_SLOTS]; 35 | 36 | /* 37 | * There are NR_HISTS members of the PERCPU_ARRAY. Each one we read is 38 | * an *array[nr_cpus]* of the struct hist, one for each cpu. This 39 | * differs from accessing an element from within a BPF program, where 40 | * we only get the percpu element. 
41 | */ 42 | hist = calloc(nr_cpus, sizeof(struct hist)); 43 | if (!hist) 44 | handle_error("calloc"); 45 | 46 | for (int i = 0; i < NR_HISTS; i++) { 47 | if (bpf_map_lookup_elem(fd, &i, hist)) 48 | handle_error("lookup"); 49 | memset(total, 0, sizeof(total)); 50 | for (int c = 0; c < nr_cpus; c++) { 51 | for (int s = 0; s < MAX_NR_HIST_SLOTS; s++) 52 | total[s] += hist[c].slots[s]; 53 | } 54 | printf("\n%s:\n----------\n", titles[i]); 55 | print_log2_hist(total, MAX_NR_HIST_SLOTS, "usec"); 56 | } 57 | 58 | free(hist); 59 | } 60 | 61 | static volatile bool exiting; 62 | 63 | static void sig_hand(int signr) 64 | { 65 | exiting = true; 66 | } 67 | 68 | static struct sigaction sigact = {.sa_handler = sig_hand}; 69 | 70 | int main(int argc, char **argv) 71 | { 72 | struct schedlat_bpf *obj; 73 | int err; 74 | 75 | sigaction(SIGINT, &sigact, 0); 76 | err = bump_memlock_rlimit(); 77 | if (err) { 78 | fprintf(stderr, "failed to increase rlimit: %d\n", err); 79 | return -1; 80 | } 81 | 82 | obj = schedlat_bpf__open_and_load(); 83 | if (!obj) { 84 | fprintf(stderr, "failed to open BPF object\n"); 85 | return -1; 86 | } 87 | 88 | err = schedlat_bpf__attach(obj); 89 | if (err) { 90 | fprintf(stderr, "failed to attach BPF programs\n"); 91 | goto cleanup; 92 | } 93 | 94 | printf("Ctrl-c to exit\n"); 95 | while (!exiting) 96 | sleep(9999999); 97 | 98 | print_hists(bpf_map__fd(obj->maps.hists)); 99 | 100 | printf("Exiting\n"); 101 | 102 | cleanup: 103 | schedlat_bpf__destroy(obj); 104 | 105 | return 0; 106 | } 107 | -------------------------------------------------------------------------------- /bpf/user/schedlat_shared.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef GHOST_LIB_BPF_SCHEDLAT_SHARED_H_ 18 | #define GHOST_LIB_BPF_SCHEDLAT_SHARED_H_ 19 | 20 | // Keep this file's structs in sync with bpf/schedlat_shared_bpf.h. 21 | // We need different headers for BPF and C programs due to various Google3 22 | // reasons. 23 | 24 | #include 25 | 26 | #define MAX_PIDS 102400 27 | #define MAX_NR_HIST_SLOTS 25 28 | 29 | struct task_stat { 30 | uint64_t runnable_at; 31 | uint64_t latched_at; 32 | uint64_t ran_at; 33 | }; 34 | 35 | /* 36 | * Power of 2 histogram, <=1 us, 2us, 4us, etc. This struct must be at least 37 | * 8-byte aligned, since it is a value for a BPF map. 
38 | */ 39 | struct hist { 40 | uint32_t slots[MAX_NR_HIST_SLOTS]; 41 | } __attribute__((aligned(64))); 42 | 43 | enum { 44 | RUNNABLE_TO_LATCHED, 45 | LATCHED_TO_RUN, 46 | RUNNABLE_TO_RUN, 47 | NR_HISTS, 48 | }; 49 | 50 | #endif // GHOST_LIB_BPF_SCHEDLAT_SHARED_H_ 51 | -------------------------------------------------------------------------------- /bpf/user/schedrun.c: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "third_party/bpf/schedrun.h" 16 | #include "bpf/user/schedrun_bpf.skel.h" 17 | #include "third_party/iovisor_bcc/trace_helpers.h" 18 | #include "libbpf/bpf.h" 19 | #include "libbpf/libbpf.h" 20 | 21 | #define error_exit(msg) do { \ 22 | perror(msg); \ 23 | exit(EXIT_FAILURE); \ 24 | } while (0) 25 | 26 | static bool ghost_only = false; 27 | static pid_t pid = 0; 28 | 29 | static const char *titles[] = { 30 | [RUNTIMES_PREEMPTED_YIELDED] = "Runtimes of preempted/yielded tasks", 31 | [RUNTIMES_BLOCKED] = "Runtimes of tasks that blocked", 32 | [RUNTIMES_ALL] = "All task runtimes", 33 | }; 34 | 35 | // TODO: refactor (copied from schedlat.c). 36 | static void print_hists(int fd) 37 | { 38 | unsigned int nr_cpus = libbpf_num_possible_cpus(); 39 | struct hist *hist; 40 | uint32_t total[MAX_NR_HIST_SLOTS]; 41 | 42 | /* 43 | * There are NR_HISTS members of the PERCPU_ARRAY. Each one we read is 44 | * an *array[nr_cpus]* of the struct hist, one for each cpu. This 45 | * differs from accessing an element from within a BPF program, where 46 | * we only get the percpu element. 
47 | */ 48 | hist = calloc(nr_cpus, sizeof(struct hist)); 49 | if (!hist) 50 | error_exit("calloc"); 51 | 52 | for (int i = 0; i < NR_HISTS; i++) { 53 | if (bpf_map_lookup_elem(fd, &i, hist)) 54 | error_exit("bpf_map_lookup_elem"); 55 | memset(total, 0, sizeof(total)); 56 | for (int c = 0; c < nr_cpus; c++) { 57 | for (int s = 0; s < MAX_NR_HIST_SLOTS; s++) 58 | total[s] += hist[c].slots[s]; 59 | } 60 | printf("\n%s:\n----------\n", titles[i]); 61 | print_log2_hist(total, MAX_NR_HIST_SLOTS, "usec"); 62 | } 63 | } 64 | 65 | int main(int argc, char **argv) 66 | { 67 | sigset_t set; 68 | int opt, err, sig; 69 | struct schedrun_bpf *skel; 70 | 71 | if (sigemptyset(&set)) 72 | error_exit("sigemptyset"); 73 | if (sigaddset(&set, SIGINT)) 74 | error_exit("sigaddset"); 75 | if (sigprocmask(SIG_BLOCK, &set, NULL)) 76 | error_exit("sigprocmask"); 77 | 78 | while ((opt = getopt(argc, argv, "gp:")) != -1) { 79 | switch (opt) { 80 | case 'g': 81 | ghost_only = true; 82 | break; 83 | case 'p': 84 | errno = 0; 85 | pid = strtol(optarg, NULL, 10); 86 | if (errno) 87 | error_exit("strtol"); 88 | if (pid <= 0) { 89 | fprintf(stderr, "Invalid pid: %s\n", optarg); 90 | return 1; 91 | } 92 | break; 93 | default: 94 | fprintf(stderr, "Usage: %s [-p pid | -g]\n", argv[0]); 95 | return 1; 96 | } 97 | } 98 | 99 | if (ghost_only && pid) { 100 | fprintf(stderr, "-g and -p options are mutually exclusive\n"); 101 | return 1; 102 | } 103 | 104 | if (bump_memlock_rlimit()) 105 | error_exit("bump_memlock_rlimit"); 106 | 107 | skel = schedrun_bpf__open(); 108 | if (!skel) { 109 | fprintf(stderr, "Failed to open BPF skeleton\n"); 110 | return 1; 111 | } 112 | 113 | skel->rodata->ghost_only = ghost_only; 114 | skel->rodata->targ_tgid = pid; 115 | 116 | err = schedrun_bpf__load(skel); 117 | if (err) { 118 | fprintf(stderr, "Failed to load BPF skeleton\n"); 119 | return 1; 120 | } 121 | 122 | err = schedrun_bpf__attach(skel); 123 | if (err) { 124 | fprintf(stderr, "Failed to attach BPF skeleton\n"); 
125 | goto cleanup; 126 | } 127 | 128 | printf("Ctrl-c to exit\n"); 129 | 130 | if (sigwait(&set, &sig)) 131 | error_exit("sigwait"); 132 | 133 | print_hists(bpf_map__fd(skel->maps.hists)); 134 | printf("Exiting\n"); 135 | 136 | cleanup: 137 | schedrun_bpf__destroy(skel); 138 | return -err; 139 | } 140 | -------------------------------------------------------------------------------- /bpf/user/schedrun_shared.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef GHOST_LIB_BPF_SCHEDRUN_SHARED_H_ 18 | #define GHOST_LIB_BPF_SCHEDRUN_SHARED_H_ 19 | 20 | // Keep this file's structs in sync with bpf/schedrun_shared_bpf.h. 21 | // We need different headers for BPF and C programs due to various Google3 22 | // reasons. 23 | 24 | #include 25 | 26 | #define MAX_PIDS 102400 27 | #define MAX_NR_HIST_SLOTS 25 28 | 29 | /* 30 | * Power of 2 histogram, <=1 us, 2us, 4us, etc. This struct must be at least 31 | * 8-byte aligned, since it is a value for a BPF map. 
32 | */ 33 | struct hist { 34 | uint32_t slots[MAX_NR_HIST_SLOTS]; 35 | } __attribute__((aligned(64))); 36 | 37 | enum { 38 | RUNTIMES_PREEMPTED_YIELDED, 39 | RUNTIMES_BLOCKED, 40 | RUNTIMES_ALL, 41 | NR_HISTS, 42 | }; 43 | 44 | #endif // GHOST_LIB_BPF_SCHEDRUN_SHARED_H_ 45 | -------------------------------------------------------------------------------- /experiments/antagonist/cfs_orchestrator.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "experiments/antagonist/cfs_orchestrator.h" 8 | 9 | #include "absl/functional/bind_front.h" 10 | 11 | namespace ghost_test { 12 | 13 | void CfsOrchestrator::InitThreadPool() { 14 | std::vector kernel_schedulers( 15 | options().num_threads, ghost::GhostThread::KernelScheduler::kCfs); 16 | std::vector> thread_work( 17 | options().num_threads, absl::bind_front(&CfsOrchestrator::Worker, this)); 18 | 19 | CHECK_EQ(kernel_schedulers.size(), options().num_threads); 20 | CHECK_EQ(kernel_schedulers.size(), thread_work.size()); 21 | thread_pool().Init(kernel_schedulers, thread_work); 22 | } 23 | 24 | CfsOrchestrator::CfsOrchestrator(Orchestrator::Options opts) 25 | : Orchestrator(std::move(opts)), threads_ready_(options().num_threads + 1) { 26 | CHECK_EQ(options().num_threads, options().cpus.Size()); 27 | 28 | InitThreadPool(); 29 | threads_ready_.Block(); 30 | set_start(absl::Now()); 31 | } 32 | 33 | void CfsOrchestrator::Worker(uint32_t sid) { 34 | if (!thread_triggers().Triggered(sid)) { 35 | thread_triggers().Trigger(sid); 36 | const ghost::Cpu cpu = options().cpus.GetNthCpu(sid); 37 | CHECK_EQ( 38 | ghost::GhostHelper()->SchedSetAffinity( 39 | ghost::Gtid::Current(), ghost::MachineTopology()->ToCpuList({cpu})), 40 | 0); 41 | printf("Worker (SID %u, TID: %ld, 
affined to CPU %u)\n", sid, 42 | syscall(SYS_gettid), cpu.id()); 43 | threads_ready_.Block(); 44 | } 45 | 46 | Soak(sid); 47 | } 48 | 49 | } // namespace ghost_test 50 | -------------------------------------------------------------------------------- /experiments/antagonist/cfs_orchestrator.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_EXPERIMENTS_ANTAGONIST_CFS_ORCHESTRATOR_H_ 8 | #define GHOST_EXPERIMENTS_ANTAGONIST_CFS_ORCHESTRATOR_H_ 9 | 10 | #include "absl/synchronization/barrier.h" 11 | #include "experiments/antagonist/orchestrator.h" 12 | 13 | namespace ghost_test { 14 | 15 | // This is the orchestrator for the CFS (Linux Completely Fair Scheduler) 16 | // experiments. All threads are scheduled by CFS. 17 | // 18 | // Example: 19 | // Orchestrator::Options options; 20 | // ... Fill in the options. 21 | // CfsOrchestrator orchestrator(options); 22 | // (Constructs orchestrator with options.) 23 | // ... 24 | // orchestrator.Terminate(); 25 | // (Tells orchestrator to stop the experiment and print the results.) 26 | class CfsOrchestrator : public Orchestrator { 27 | public: 28 | explicit CfsOrchestrator(Orchestrator::Options opts); 29 | ~CfsOrchestrator() final {} 30 | 31 | private: 32 | // Initializes the thread pool. 33 | void InitThreadPool(); 34 | 35 | void Worker(uint32_t sid) final; 36 | 37 | // Used so that the main thread does not start the timer (and workers do not 38 | // start spinning) until the worker threads have initialized. 
39 | absl::Barrier threads_ready_; 40 | }; 41 | 42 | } // namespace ghost_test 43 | 44 | #endif // GHOST_EXPERIMENTS_ANTAGONIST_CFS_ORCHESTRATOR_H_ 45 | -------------------------------------------------------------------------------- /experiments/antagonist/ghost_orchestrator.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "experiments/antagonist/ghost_orchestrator.h" 8 | 9 | #include "absl/functional/bind_front.h" 10 | 11 | namespace ghost_test { 12 | namespace { 13 | // We do not need a different class of service (e.g., different expected 14 | // runtimes, different QoS (Quality-of-Service) classes, etc.) across workers in 15 | // our experiments. Furthermore, all workers are ghOSt one-shots. Thus, put all 16 | // worker sched items in the same work class. 
17 | static constexpr uint32_t kWorkClassIdentifier = 0; 18 | } // namespace 19 | 20 | void GhostOrchestrator::InitThreadPool() { 21 | std::vector kernel_schedulers( 22 | options().num_threads, ghost::GhostThread::KernelScheduler::kGhost); 23 | std::vector> thread_work( 24 | options().num_threads, 25 | absl::bind_front(&GhostOrchestrator::Worker, this)); 26 | 27 | CHECK_EQ(kernel_schedulers.size(), options().num_threads); 28 | CHECK_EQ(kernel_schedulers.size(), thread_work.size()); 29 | thread_pool().Init(kernel_schedulers, thread_work); 30 | } 31 | 32 | void GhostOrchestrator::InitGhost() { 33 | const std::vector gtids = thread_pool().GetGtids(); 34 | CHECK_EQ(gtids.size(), options().num_threads); 35 | 36 | ghost::work_class wc; 37 | prio_table_helper_.GetWorkClass(kWorkClassIdentifier, wc); 38 | wc.id = kWorkClassIdentifier; 39 | wc.flags = WORK_CLASS_ONESHOT; 40 | wc.qos = options().ghost_qos; 41 | // Write the max unsigned 64-bit integer as the deadline just in case we want 42 | // to run the experiment with the ghOSt EDF (Earliest-Deadline-First) 43 | // scheduler. 44 | wc.exectime = std::numeric_limits::max(); 45 | // 'period' is irrelevant because all threads scheduled by ghOSt are 46 | // one-shots. 
47 | wc.period = 0; 48 | prio_table_helper_.SetWorkClass(kWorkClassIdentifier, wc); 49 | 50 | for (size_t i = 0; i < gtids.size(); ++i) { 51 | ghost::sched_item si; 52 | prio_table_helper_.GetSchedItem(/*sid=*/i, si); 53 | si.sid = i; 54 | si.wcid = kWorkClassIdentifier; 55 | si.gpid = gtids[i].id(); 56 | si.flags = SCHED_ITEM_RUNNABLE; 57 | si.deadline = 0; 58 | prio_table_helper_.SetSchedItem(/*sid=*/i, si); 59 | } 60 | } 61 | 62 | GhostOrchestrator::GhostOrchestrator(Orchestrator::Options opts) 63 | : Orchestrator(std::move(opts)), 64 | prio_table_helper_(/*num_sched_items=*/options().num_threads, 65 | /*num_work_classes=*/1) { 66 | CHECK(options().cpus.Empty()); 67 | 68 | InitThreadPool(); 69 | // This must be called after 'InitThreadPool' since it accesses the GTIDs of 70 | // the threads in the thread pool. 71 | InitGhost(); 72 | set_start(absl::Now()); 73 | } 74 | 75 | void GhostOrchestrator::Worker(uint32_t sid) { 76 | if (!thread_triggers().Triggered(sid)) { 77 | thread_triggers().Trigger(sid); 78 | printf("Worker (SID %u, TID: %ld, not affined to any CPU)\n", sid, 79 | syscall(SYS_gettid)); 80 | } 81 | 82 | Soak(sid); 83 | } 84 | 85 | } // namespace ghost_test 86 | -------------------------------------------------------------------------------- /experiments/antagonist/ghost_orchestrator.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_EXPERIMENTS_ANTAGONIST_GHOST_ORCHESTRATOR_H_ 8 | #define GHOST_EXPERIMENTS_ANTAGONIST_GHOST_ORCHESTRATOR_H_ 9 | 10 | #include "experiments/antagonist/orchestrator.h" 11 | #include "experiments/shared/prio_table_helper.h" 12 | 13 | namespace ghost_test { 14 | 15 | // This is the orchestrator for the ghOSt experiments. 
All threads are scheduled 16 | // by ghOSt. 17 | // 18 | // Example: 19 | // Orchestrator::Options options; 20 | // ... Fill in the options. 21 | // GhostOrchestrator orchestrator(options); 22 | // (Constructs orchestrator with options.) 23 | // ... 24 | // orchestrator.Terminate(); 25 | // (Tells orchestrator to stop the experiment and print the results.) 26 | class GhostOrchestrator : public Orchestrator { 27 | public: 28 | explicit GhostOrchestrator(Orchestrator::Options opts); 29 | ~GhostOrchestrator() final {} 30 | 31 | private: 32 | // Initializes the thread pool. 33 | void InitThreadPool(); 34 | 35 | // Initializes the ghOSt PrioTable. 36 | void InitGhost(); 37 | 38 | void Worker(uint32_t sid) final; 39 | 40 | // Manages communication with ghOSt via the shared PrioTable. 41 | PrioTableHelper prio_table_helper_; 42 | }; 43 | 44 | } // namespace ghost_test 45 | 46 | #endif // GHOST_EXPERIMENTS_ANTAGONIST_GHOST_ORCHESTRATOR_H_ 47 | -------------------------------------------------------------------------------- /experiments/antagonist/options_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "gmock/gmock.h" 8 | #include "gtest/gtest.h" 9 | #include "experiments/antagonist/orchestrator.h" 10 | 11 | // These tests check that the application prints options and parses command line 12 | // flags properly. 13 | 14 | namespace ghost_test { 15 | namespace { 16 | 17 | using ::testing::Eq; 18 | 19 | // Returns orchestrator options suitable for the tests. 
20 | Orchestrator::Options GetOptions() { 21 | Orchestrator::Options options; 22 | 23 | options.print_options.pretty = true; 24 | options.work_share = 0.9; 25 | options.num_threads = 4; 26 | options.cpus = 27 | ghost::MachineTopology()->ToCpuList(std::vector{1, 2, 3, 4}); 28 | options.experiment_duration = absl::Seconds(15); 29 | options.scheduler = ghost::GhostThread::KernelScheduler::kCfs; 30 | options.ghost_qos = 2; 31 | 32 | return options; 33 | } 34 | 35 | // This tests that the '<<' operator prints all options and their values in 36 | // alphabetical order by option name. 37 | TEST(OptionsTest, PrintOptions) { 38 | Orchestrator::Options options = GetOptions(); 39 | std::ostringstream os; 40 | 41 | os << options; 42 | std::string expected = R"(cpus: 1 2 3 4 43 | experiment_duration: 15s 44 | ghost_qos: 2 45 | num_threads: 4 46 | print_format: pretty 47 | scheduler: cfs 48 | work_share: 0.9)"; 49 | EXPECT_THAT(os.str(), Eq(expected)); 50 | } 51 | 52 | } // namespace 53 | } // namespace ghost_test 54 | -------------------------------------------------------------------------------- /experiments/antagonist/results.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "experiments/antagonist/results.h" 8 | 9 | #include 10 | #include 11 | 12 | #include "lib/base.h" 13 | 14 | namespace ghost_test { 15 | 16 | namespace { 17 | constexpr size_t kWorkerLen = 8; 18 | constexpr size_t kDurationLen = 20; 19 | constexpr size_t kShareLen = 12; 20 | // Add 2 to the end to account for the space between each column in the results. 21 | constexpr size_t kNumDashes = kWorkerLen + kDurationLen + kShareLen + 2; 22 | 23 | // Prints the results in human-readable form. 
24 | template 25 | void PrintLinePretty(std::ostream& os, const std::string& worker, 26 | T run_duration, U work_share, bool dashes) { 27 | os << std::left; 28 | os << std::setw(kWorkerLen) << worker << " "; 29 | os << std::setw(kDurationLen) << run_duration << " "; 30 | os << std::setw(kShareLen) << work_share << " "; 31 | os << std::endl; 32 | if (dashes) { 33 | os << std::string(kNumDashes, '-') << std::endl; 34 | } 35 | } 36 | 37 | // Prints the results in CSV form. 38 | template 39 | void PrintLineCsv(std::ostream& os, const std::string& worker, T run_duration, 40 | U work_share) { 41 | os << std::left; 42 | os << worker << ","; 43 | os << run_duration << ","; 44 | os << work_share; 45 | os << std::endl; 46 | } 47 | 48 | // Prints the preface to the results if pretty mode is set. 49 | void PrintPrettyPreface(PrintOptions options) { 50 | CHECK(options.pretty); 51 | 52 | PrintLinePretty(*options.os, "Worker", "Run Duration (ns)", "Work Share", 53 | /*dashes=*/true); 54 | } 55 | 56 | // Adds/averages all results and prints out the summary. 57 | void PrintTotal(const std::vector& run_durations, 58 | absl::Duration runtime, PrintOptions options) { 59 | absl::Duration run_duration; 60 | for (const absl::Duration& r : run_durations) { 61 | run_duration += r; 62 | } 63 | const double work_share = 64 | absl::ToDoubleMilliseconds(run_duration) / 65 | (run_durations.size() * absl::ToDoubleMilliseconds(runtime)); 66 | 67 | if (options.pretty) { 68 | PrintLinePretty(*options.os, "Total", 69 | absl::ToInt64Nanoseconds(run_duration), work_share, 70 | /*dashes=*/false); 71 | } else { 72 | PrintLineCsv(*options.os, "Total", absl::ToInt64Nanoseconds(run_duration), 73 | work_share); 74 | } 75 | } 76 | } // namespace 77 | 78 | // Prints all results. 
79 | void Print(const std::vector& run_durations, 80 | absl::Duration runtime, const PrintOptions& options) { 81 | CHECK_NE(options.os, nullptr); 82 | 83 | if (options.pretty) { 84 | PrintPrettyPreface(options); 85 | } 86 | 87 | for (size_t i = 0; i < run_durations.size(); i++) { 88 | const double work_share = absl::ToDoubleMilliseconds(run_durations[i]) / 89 | absl::ToDoubleMilliseconds(runtime); 90 | const int64_t run_duration = absl::ToInt64Nanoseconds(run_durations[i]); 91 | if (options.pretty) { 92 | PrintLinePretty(*options.os, std::to_string(i), run_duration, work_share, 93 | /*dashes=*/false); 94 | } else { 95 | PrintLineCsv(*options.os, std::to_string(i), run_duration, work_share); 96 | } 97 | } 98 | PrintTotal(run_durations, runtime, options); 99 | } 100 | 101 | } // namespace ghost_test 102 | -------------------------------------------------------------------------------- /experiments/antagonist/results.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_EXPERIMENTS_ANTAGONIST_RESULTS_H_ 8 | #define GHOST_EXPERIMENTS_ANTAGONIST_RESULTS_H_ 9 | 10 | #include "absl/time/clock.h" 11 | 12 | namespace ghost_test { 13 | 14 | struct PrintOptions { 15 | // If true, prints the results in human-readable form. Otherwise, prints the 16 | // results in CSV form. 17 | bool pretty; 18 | // The output stream to send the results to. We make 'os' a pointer rather 19 | // than a reference since a reference cannot be reassigned. 20 | // 21 | // 'os' is owned by whoever instantiated this struct. 22 | std::ostream* os; 23 | }; 24 | 25 | // Prints the results for the workers. 
26 | void Print(const std::vector& run_durations, 27 | absl::Duration runtime, const PrintOptions& options); 28 | 29 | } // namespace ghost_test 30 | 31 | #endif // GHOST_EXPERIMENTS_ANTAGONIST_RESULTS_H_ 32 | -------------------------------------------------------------------------------- /experiments/microbenchmarks/ioctl_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | 9 | #include "benchmark/benchmark.h" 10 | #include "lib/enclave.h" 11 | #include "lib/ghost.h" 12 | #include "lib/topology.h" 13 | 14 | namespace ghost { 15 | 16 | void BM_ghost_null_ioctl(benchmark::State& state) { 17 | GhostHelper()->InitCore(); 18 | Topology* topology = MachineTopology(); 19 | LocalEnclave enclave(AgentConfig(topology, CpuList(*topology))); 20 | int ctl = GhostHelper()->GetGlobalEnclaveCtlFd(); 21 | 22 | for (auto _ : state) { 23 | CHECK_EQ(ioctl(ctl, GHOST_IOC_NULL), 0); 24 | } 25 | } 26 | BENCHMARK(BM_ghost_null_ioctl); 27 | 28 | void BM_getpid(benchmark::State& state) { 29 | for (auto _ : state) { 30 | CHECK_GT(syscall(SYS_getpid), 0); 31 | } 32 | } 33 | BENCHMARK(BM_getpid); 34 | 35 | } // namespace ghost 36 | 37 | // Entry point: parses the benchmark library's command line flags and then runs 38 | // the benchmarks registered above with BENCHMARK(). 39 | int main(int argc, char** argv) { 40 | // Without this call the --benchmark_* flags (e.g. --benchmark_filter) are 41 | // never parsed, so they would have no effect. 42 | ::benchmark::Initialize(&argc, argv); 43 | ::benchmark::RunSpecifiedBenchmarks(); 44 | return 0; 45 | } 46 | -------------------------------------------------------------------------------- /experiments/rocksdb/cfs_orchestrator.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_EXPERIMENTS_ROCKSDB_CFS_ORCHESTRATOR_H_ 8 | #define 
GHOST_EXPERIMENTS_ROCKSDB_CFS_ORCHESTRATOR_H_ 9 | 10 | #include "absl/synchronization/barrier.h" 11 | #include "experiments/rocksdb/latency.h" 12 | #include "experiments/rocksdb/orchestrator.h" 13 | #include "experiments/rocksdb/request.h" 14 | #include "experiments/shared/thread_wait.h" 15 | 16 | namespace ghost_test { 17 | 18 | // This is the orchestrator for the CFS (Linux Completely Fair Scheduler) 19 | // experiments. All threads are scheduled by CFS. The worker threads may either 20 | // (1) spin when waiting for more work to be assigned to them or (2) sleep on a 21 | // futex until more work is assigned to them. 22 | // 23 | // Example: 24 | // Options options; 25 | // ... Fill in the options. 26 | // CfsOrchestrator orchestrator_(options); 27 | // (Constructs orchestrator with options.) 28 | // ... 29 | // orchestrator_.Terminate(); 30 | // (Tells orchestrator to stop the experiment and print the results.) 31 | class CfsOrchestrator final : public Orchestrator { 32 | public: 33 | explicit CfsOrchestrator(Options opts); 34 | ~CfsOrchestrator() final {} 35 | 36 | void Terminate() final; 37 | 38 | protected: 39 | // For CFS, the load generator passes requests to the dispatcher. 40 | void LoadGenerator(uint32_t sid) final; 41 | 42 | void Dispatcher(uint32_t sid) final; 43 | 44 | void Worker(uint32_t sid) final; 45 | 46 | private: 47 | // Initializes the thread pool. 48 | void InitThreadPool(); 49 | 50 | // The dispatcher calls this method to receive requests sent to it by the load 51 | // generator. 52 | void HandleLoadGenerator(uint32_t sid); 53 | 54 | // The dispatcher calls this method to populate 'idle_sids_' with a list of 55 | // the SIDs of idle workers. Note that this method clears 'idle_sids_' before 56 | // filling it in. 57 | void GetIdleWorkerSIDs(uint32_t sid); 58 | 59 | // Allows runnable threads to run and keeps idle threads either spinning or 60 | // sleeping on a futex until they are marked runnable again. 
61 | ThreadWait thread_wait_; 62 | 63 | // Each thread (the load generator, the dispatcher, and the workers) 64 | // decrements this once they have initialized themselves. This barrier is used 65 | // to block the load generator until all threads have been initialized so that 66 | // it does not generate load while the system is initializing. If it generated 67 | // load while the system is initializing, the experiment results would be bad 68 | // solely due to initialization costs rather than any deficiency in the 69 | // system. The initialization costs are irrelevant to the experiment. 70 | absl::Barrier threads_ready_; 71 | 72 | // The max number of requests that the load generator will send at a time to 73 | // the dispatcher. 74 | static constexpr size_t kLoadGeneratorBatchSize = 100; 75 | 76 | // The dispatchers' queues to hold waiting requests that will later be 77 | // assigned to workers. 78 | std::vector> dispatcher_queue_; 79 | 80 | // The dispatchers use this to store idle SIDs. We make this a class member 81 | // rather than a local variable in the 'Dispatcher' method to avoid repeatedly 82 | // allocating memory for the list backing in the dispatchers' common case, 83 | // which is expensive. 
84 | std::vector> idle_sids_; 85 | }; 86 | 87 | } // namespace ghost_test 88 | 89 | #endif // GHOST_EXPERIMENTS_ROCKSDB_CFS_ORCHESTRATOR_H_ 90 | -------------------------------------------------------------------------------- /experiments/rocksdb/clock.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_EXPERIMENTS_ROCKSDB_CLOCK_H_ 8 | #define GHOST_EXPERIMENTS_ROCKSDB_CLOCK_H_ 9 | 10 | #include "absl/time/clock.h" 11 | #include "lib/base.h" 12 | 13 | // This is a pure virtual parent class that represents a clock. 14 | class Clock { 15 | public: 16 | virtual ~Clock() = 0; 17 | 18 | // Returns the current clock time. 19 | virtual absl::Time TimeNow() const = 0; 20 | }; 21 | 22 | inline Clock::~Clock() {} 23 | 24 | // This represents a real clock that returns the current time from 25 | // `ghost::MonotonicNow()`. 26 | // 27 | // Example: 28 | // RealClock clock; 29 | // absl::Time now = clock.TimeNow(); 30 | class RealClock final : public Clock { 31 | public: 32 | // Returns the current time (from `ghost::MonotonicNow()`). 33 | absl::Time TimeNow() const final { return ghost::MonotonicNow(); } 34 | }; 35 | 36 | // This represents a simulated clock whose time can be arbitrarily changed. This 37 | // is mainly useful for testing code that depends on time, such as the `Ingress` 38 | // class. 39 | // 40 | // Example: 41 | // SimulatedClock clock; 42 | // clock.SetTime(ghost::MonotonicNow()); 43 | // clock.AdvanceTime(absl::Minutes(10)); 44 | // absl::Time time = clock.TimeNow(); 45 | // (`time` is equal to the time about 10 minutes from now.) 46 | class SimulatedClock final : public Clock { 47 | public: 48 | absl::Time TimeNow() const final { return time_; } // The simulated time. 49 | 50 | // Set the clock to `time`. 
51 | void SetTime(absl::Time time) { time_ = time; } 52 | 53 | // Change the time on the clock by `duration`. 54 | void AdvanceTime(absl::Duration duration) { time_ += duration; } 55 | 56 | private: 57 | // The current time for this clock. 58 | absl::Time time_; 59 | }; 60 | 61 | #endif // GHOST_EXPERIMENTS_ROCKSDB_CLOCK_H_ 62 | -------------------------------------------------------------------------------- /experiments/rocksdb/database.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "experiments/rocksdb/database.h" 8 | 9 | #include 10 | 11 | #include "rocksdb/table.h" 12 | 13 | namespace ghost_test { 14 | 15 | bool Database::OpenDatabase(const std::filesystem::path& path) { 16 | rocksdb::Options options; 17 | options.create_if_missing = true; 18 | options.allow_mmap_reads = true; 19 | options.allow_mmap_writes = true; 20 | options.error_if_exists = false; 21 | 22 | rocksdb::BlockBasedTableOptions table_options; 23 | // Use a ClockCache as the default LRU cache requires locking a per-shard 24 | // mutex, even on lookups. Using a ClockCache improves lookup throughput as a 25 | // mutex is only acquired on inserts. 26 | table_options.block_cache = rocksdb::NewClockCache(kCacheSize, 0); 27 | CHECK_NE(table_options.block_cache, nullptr); 28 | options.table_factory.reset( 29 | rocksdb::NewBlockBasedTableFactory(table_options)); 30 | 31 | options.compression = rocksdb::kNoCompression; 32 | options.OptimizeLevelStyleCompaction(); 33 | rocksdb::Status status = rocksdb::DB::Open(options, path.string(), &db_); 34 | return status.ok(); 35 | } 36 | 37 | Database::Database(const std::filesystem::path& path) { 38 | if (!OpenDatabase(path)) { 39 | // The database is corrupted. 
40 | CHECK(std::filesystem::exists(path)); 41 | CHECK_GT(std::filesystem::remove_all(path), 0); 42 | CHECK(OpenDatabase(path)); 43 | } 44 | CHECK(Fill()); 45 | PrepopulateCache(); 46 | } 47 | 48 | Database::~Database() { delete db_; } 49 | 50 | bool Database::Fill() { 51 | for (uint32_t i = 0; i < kNumEntries; i++) { 52 | rocksdb::Status status = 53 | db_->Put(rocksdb::WriteOptions(), Key(i), Value(i)); 54 | if (!status.ok()) { 55 | return false; 56 | } 57 | } 58 | return true; 59 | } 60 | // Reads every entry once via 'Get', CHECK-failing if any lookup fails. 61 | void Database::PrepopulateCache() const { 62 | std::string value; 63 | for (uint32_t i = 0; i < kNumEntries; i++) { 64 | CHECK(Get(i, value)); 65 | } 66 | } 67 | 68 | bool Database::Get(uint32_t entry, std::string& value) const { 69 | rocksdb::Status status = db_->Get(rocksdb::ReadOptions(), Key(entry), &value); 70 | if (status.ok()) { 71 | CHECK_EQ(value, Value(entry)); 72 | return true; 73 | } 74 | return false; 75 | } 76 | 77 | bool Database::RangeQuery(uint32_t start_entry, uint32_t range_size, 78 | std::string& value) const { 79 | std::stringstream ss; 80 | std::unique_ptr it( 81 | db_->NewIterator(rocksdb::ReadOptions())); 82 | it->Seek(Key(start_entry)); 83 | 84 | for (uint32_t i = 0; i < range_size; i++) { 85 | if (!it->Valid()) { 86 | return false; 87 | } 88 | CHECK_EQ(it->value().ToString(), Value(start_entry + i)); 89 | ss << it->value().ToString(); 90 | if (i < range_size - 1) { 91 | ss << ","; 92 | } 93 | it->Next(); 94 | } 95 | value = ss.str(); 96 | return true; 97 | } 98 | 99 | } // namespace ghost_test 100 | -------------------------------------------------------------------------------- /experiments/rocksdb/ingress.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "experiments/rocksdb/ingress.h" 
8 | 9 | #include "experiments/rocksdb/database.h" 10 | 11 | namespace ghost_test { 12 | 13 | SyntheticNetwork::SyntheticNetwork(double throughput, double range_query_ratio, 14 | Clock& clock) 15 | : ingress_(throughput, clock), range_query_ratio_(range_query_ratio) { 16 | CHECK_GE(range_query_ratio, 0.0); 17 | CHECK_LE(range_query_ratio, 1.0); 18 | } 19 | 20 | void SyntheticNetwork::Start() { 21 | CHECK(!start_.HasBeenNotified()); 22 | 23 | ingress_.Start(); 24 | start_.Notify(); 25 | } 26 | 27 | bool SyntheticNetwork::Poll(Request& request) { 28 | CHECK(start_.HasBeenNotified()); 29 | 30 | const auto [arrived, arrival_time] = ingress_.HasNewArrival(); 31 | if (!arrived) { 32 | return false; 33 | } 34 | // A request is in the ingress queue 35 | absl::Time received = ghost::MonotonicNow(); 36 | bool get = absl::Bernoulli(gen_, 1.0 - range_query_ratio_); 37 | if (get) { 38 | // Get request 39 | request.work = Request::Get{ 40 | .entry = absl::Uniform(gen_, 0, Database::kNumEntries)}; 41 | } else { 42 | // Range query 43 | request.work = Request::Range{ 44 | .start_entry = absl::Uniform( 45 | gen_, 0, Database::kNumEntries - kRangeQuerySize + 1), 46 | .size = kRangeQuerySize}; 47 | } 48 | request.request_generated = arrival_time; 49 | request.request_received = received; 50 | return true; 51 | } 52 | 53 | } // namespace ghost_test 54 | -------------------------------------------------------------------------------- /experiments/rocksdb/latency.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_EXPERIMENTS_ROCKSDB_LATENCY_H_ 8 | #define GHOST_EXPERIMENTS_ROCKSDB_LATENCY_H_ 9 | 10 | #include "absl/time/clock.h" 11 | #include "experiments/rocksdb/request.h" 12 | 13 | namespace ghost_test { 14 | 15 | 
namespace latency { 16 | 17 | struct PrintOptions { 18 | // If true, prints the results in human-readable form. Otherwise, prints the 19 | // results in CSV form. 20 | bool pretty; 21 | // If true, only prints the end-to-end results, rather than the results for 22 | // each stage. 23 | bool print_last; 24 | // If true, prints the entire distribution. 25 | bool distribution; 26 | // If true, prints the latencies in units of nanoseconds. If false, prints the 27 | // latencies in units of microseconds. 28 | bool ns; 29 | // The output stream to send the results to. We make 'os' a pointer rather 30 | // than a reference since a reference cannot be reassigned. 31 | std::ostream* os; 32 | }; 33 | 34 | void Print(const std::vector& requests, absl::Duration runtime, 35 | PrintOptions options); 36 | 37 | // We put these in the header rather than in latency.cc since latency_test needs 38 | // these in order to generate the correct number of dashes for the pretty print 39 | // prefix. 40 | constexpr size_t kStageLen = 28; 41 | constexpr size_t kTotalRequestsLen = 18; 42 | constexpr size_t kThroughputLen = 22; 43 | constexpr size_t kResultLen = 12; 44 | // Add 8 to the end to account for the space between each column in the results. 
45 | constexpr size_t kNumDashes = 46 | kStageLen + kTotalRequestsLen + kThroughputLen + (6 * kResultLen) + 8; 47 | 48 | } // namespace latency 49 | 50 | } // namespace ghost_test 51 | 52 | #endif // GHOST_EXPERIMENTS_ROCKSDB_LATENCY_H_ 53 | -------------------------------------------------------------------------------- /experiments/rocksdb/options_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "gmock/gmock.h" 8 | #include "gtest/gtest.h" 9 | #include "experiments/rocksdb/orchestrator.h" 10 | 11 | // These tests check that the application prints options and parses command line 12 | // flags properly. 13 | 14 | namespace ghost_test { 15 | namespace { 16 | 17 | using ::testing::Eq; 18 | 19 | // Returns orchestrator options suitable for the tests. 
20 | Options GetOptions() { 21 | Options options; 22 | 23 | options.print_options.pretty = true; 24 | options.print_options.distribution = false; 25 | options.print_options.ns = false; 26 | options.print_options.os = &std::cout; 27 | options.print_get = true; 28 | options.print_range = false; 29 | options.rocksdb_db_path = "/tmp/orch_db"; 30 | options.throughput = 20'000.0; 31 | options.range_query_ratio = 0.005; 32 | options.load_generator_cpus = 33 | ghost::MachineTopology()->ToCpuList(std::vector{1}); 34 | options.cfs_dispatcher_cpus = 35 | ghost::MachineTopology()->ToCpuList(std::vector{2}); 36 | options.num_workers = 2; 37 | options.cfs_wait_type = ThreadWait::WaitType::kSpin; 38 | options.worker_cpus = 39 | ghost::MachineTopology()->ToCpuList(std::vector{3, 4}); 40 | options.ghost_wait_type = GhostWaitType::kFutex; 41 | options.get_duration = absl::Microseconds(10); 42 | options.range_duration = absl::Milliseconds(5); 43 | options.get_exponential_mean = absl::ZeroDuration(); 44 | options.batch = 1; 45 | options.experiment_duration = absl::Seconds(15); 46 | options.discard_duration = absl::Seconds(2); 47 | options.scheduler = ghost::GhostThread::KernelScheduler::kCfs; 48 | options.ghost_qos = 2; 49 | 50 | return options; 51 | } 52 | 53 | // The '<<' operator for 'Options' should print all options and 54 | // their values in alphabetical order by option name. 
55 | std::string GetExpectedOutput() { 56 | return R"(batch: 1 57 | cfs_dispatcher_cpus: 2 58 | cfs_wait_type: spin 59 | discard_duration: 2s 60 | experiment_duration: 15s 61 | get_duration: 10us 62 | get_exponential_mean: 0 63 | ghost_qos: 2 64 | ghost_wait_type: futex 65 | load_generator_cpus: 1 66 | num_workers: 2 67 | print_distribution: false 68 | print_format: pretty 69 | print_get: true 70 | print_ns: false 71 | print_range: false 72 | range_duration: 5ms 73 | range_query_ratio: 0.005000 74 | rocksdb_db_path: /tmp/orch_db 75 | scheduler: cfs 76 | throughput: 20000.000000 77 | worker_cpus: 3 4)"; 78 | } 79 | 80 | // This tests that the '<<' operator prints all options and their values in 81 | // alphabetical order by option name. 82 | TEST(OptionsTest, PrintOptions) { 83 | Options options = GetOptions(); 84 | std::ostringstream os; 85 | 86 | os << options; 87 | EXPECT_THAT(os.str(), Eq(GetExpectedOutput())); 88 | } 89 | 90 | } // namespace 91 | } // namespace ghost_test 92 | -------------------------------------------------------------------------------- /experiments/rocksdb/request.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_EXPERIMENTS_ROCKSDB_REQUEST_H_ 8 | #define GHOST_EXPERIMENTS_ROCKSDB_REQUEST_H_ 9 | 10 | #include "absl/random/random.h" 11 | #include "absl/time/clock.h" 12 | #include "lib/base.h" 13 | 14 | namespace ghost_test { 15 | 16 | // A synthetic request for RocksDB generated by 'Ingress'. 17 | struct Request { 18 | struct Get { 19 | // The entry to access for the Get request. 20 | uint32_t entry; 21 | }; 22 | 23 | struct Range { 24 | // The accessed range is [start_entry, start_entry + size). 25 | 26 | // The first entry in the range. 
27 | uint32_t start_entry; 28 | // The range size. 29 | uint32_t size; 30 | }; 31 | 32 | // Returns a sample duration from an exponential distribution with a mean 33 | // duration of 'mean'. 34 | // This is used to generate a request service time from an exponential 35 | // distribution (so the request service times follow a lightly-tailed 36 | // distribution). 37 | static absl::Duration GetExponentialHandleTime(absl::BitGen& gen, 38 | absl::Duration mean) { 39 | int64_t mean_ns = absl::ToInt64Nanoseconds(mean); 40 | // In the exponential distribution Exp('lambda'), the expected value (i.e., 41 | // the mean) is equal to '1 / lambda'. Thus, we need to pass '1 / mean_ns' 42 | // as 'lambda' to the exponential distribution to have a mean sample value 43 | // of 'mean_ns'. 44 | double handle_ns = absl::Exponential(gen, 1.0 / mean_ns); 45 | return absl::Nanoseconds(handle_ns); 46 | } 47 | 48 | // Returns true if this is a Get request. Returns false otherwise (i.e., this 49 | // is a Range query). 50 | bool IsGet() const { return work.index() == 0; } 51 | 52 | // Returns true if this is a Range query. Returns false otherwise (i.e., this 53 | // is a Get request). 54 | bool IsRange() const { return work.index() == 1; } 55 | 56 | // Unique request identifier. 57 | uint64_t id; 58 | 59 | // When the request was generated. 60 | absl::Time request_generated; 61 | // When the request was picked up by the app. 62 | absl::Time request_received; 63 | // When the request was assigned to a worker. 64 | absl::Time request_assigned; 65 | // When the request started to be handled by a worker. 66 | absl::Time request_start; 67 | // When the worker finished handling the request. 68 | absl::Time request_finished; 69 | 70 | // The work to do. The request is either a Get request or a Range query. 
71 | std::variant work; 72 | }; 73 | 74 | } // namespace ghost_test 75 | 76 | #endif // GHOST_EXPERIMENTS_ROCKSDB_REQUEST_H_ 77 | -------------------------------------------------------------------------------- /experiments/scripts/BUILD: -------------------------------------------------------------------------------- 1 | # Note: If you modify this BUILD file, please contact jhumphri@ first to ensure 2 | # that you are not breaking the Copybara script. 3 | 4 | # Runs the RocksDB and Antagonist experiments on ghOSt and CFS (Linux Completely Fair Scheduler). 5 | 6 | package(default_applicable_licenses = ["//:license"]) 7 | 8 | licenses(["notice"]) 9 | 10 | load("@subpar//:subpar.bzl", "par_binary") 11 | load("@my_deps//:requirements.bzl", "requirement") 12 | 13 | # This library supports all experiments. 14 | py_library( 15 | name = "experiments", 16 | srcs = [ 17 | "options.py", 18 | "run.py", 19 | "setup.py", 20 | ], 21 | data = [ 22 | "//:agent_shinjuku", 23 | "//:antagonist", 24 | "//:rocksdb", 25 | ], 26 | ) 27 | 28 | # Runs the centralized queuing experiments. 29 | par_binary( 30 | name = "centralized_queuing", 31 | srcs = [ 32 | "centralized_queuing.py", 33 | ], 34 | python_version = "PY3", 35 | deps = [ 36 | ":experiments", 37 | requirement("absl-py"), 38 | ], 39 | ) 40 | 41 | # Runs the Shinjuku experiments. 42 | par_binary( 43 | name = "shinjuku", 44 | srcs = [ 45 | "shinjuku.py", 46 | ], 47 | python_version = "PY3", 48 | deps = [ 49 | ":experiments", 50 | requirement("absl-py"), 51 | ], 52 | ) 53 | 54 | # Runs the Shenango experiments. 55 | par_binary( 56 | name = "shenango", 57 | srcs = [ 58 | "shenango.py", 59 | ], 60 | python_version = "PY3", 61 | deps = [ 62 | ":experiments", 63 | requirement("absl-py"), 64 | ], 65 | ) 66 | 67 | # Runs the Shinjuku+Shenango experiments. 
68 | par_binary( 69 | name = "shinjuku_shenango", 70 | srcs = [ 71 | "shinjuku_shenango.py", 72 | ], 73 | python_version = "PY3", 74 | deps = [ 75 | ":experiments", 76 | requirement("absl-py"), 77 | ], 78 | ) 79 | -------------------------------------------------------------------------------- /experiments/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/ghost-userspace/9ca0a1fb6ed88f0c4b0b40a5a35502938efa567f/experiments/scripts/__init__.py -------------------------------------------------------------------------------- /experiments/scripts/centralized_queuing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Use of this source code is governed by a BSD-style 4 | # license that can be found in the LICENSE file or at 5 | # https://developers.google.com/open-source/licenses/bsd 6 | """Runs the RocksDB centralized-queuing experiments. 7 | 8 | This script runs the centralized-queuing RocksDB experiments on ghOSt and on 9 | CFS. In these experiments, there is a centralized queue maintained for RocksDB 10 | requests and the requests are not reordered or preempted. This script should be 11 | run on a machine with an Intel Xeon Platinum 8173M as that is what we used in 12 | the paper. If another CPU is used, the throughput ranges below should be 13 | adjusted. 
def RunCfs():
  """Runs the centralized-queuing experiment on CFS.

  Sweeps RocksDB throughput in coarse steps first and in fine steps near the
  top of the range. Neither an Antagonist nor ghOSt is configured.
  """
  # Coarse sweep: 10000, 20000, 30000, ..., 440000.
  coarse_sweep = range(10000, 441000, 10000)
  # Fine sweep toward the end: 450000, 451000, 452000, ..., 480000.
  fine_sweep = range(450000, 481000, 1000)

  experiment: Experiment = Experiment()
  experiment.throughputs = list(coarse_sweep) + list(fine_sweep)
  experiment.rocksdb = GetRocksDBOptions(Scheduler.CFS, _NUM_CPUS,
                                         _NUM_CFS_WORKERS)
  experiment.rocksdb.get_exponential_mean = '1us'
  experiment.antagonist = None
  experiment.ghost = None

  Run(experiment)
def main(argv: Sequence[str]):
  """Runs the experiment for each scheduler named on the command line.

  Args:
    argv: The program name followed by up to two scheduler names.

  Raises:
    app.UsageError: If no scheduler or more than two schedulers are given.
    ValueError: If a scheduler name is rejected by `CheckSchedulers` or is
      neither CFS nor ghOSt.
  """
  num_args = len(argv)
  if num_args > 3:
    raise app.UsageError('Too many command-line arguments.')
  if num_args == 1:
    raise app.UsageError(
        'No experiment specified. Pass `cfs` and/or `ghost` as arguments.')

  # Validate every requested scheduler before running anything.
  requested = argv[1:]
  if not CheckSchedulers(requested):
    raise ValueError('Invalid scheduler specified.')

  # Run the experiments in the order they were requested.
  for name in requested:
    scheduler = Scheduler(name)
    if scheduler == Scheduler.CFS:
      RunCfs()
      continue
    if scheduler != Scheduler.GHOST:
      raise ValueError(f'Unknown scheduler {scheduler}.')
    RunGhost()
def RunCfs():
  """Runs the CFS (Linux Completely Fair Scheduler) experiment.

  RocksDB is configured with futex-waiting workers and an Antagonist is
  configured on `_NUM_ANTAGONIST_CPUS` CPUs. ghOSt is not used.
  """
  e: Experiment = Experiment()
  # Run throughputs 10000, 20000, 30000, ..., 60000. The stop value is 61000
  # (not 600000) so that the coarse sweep ends before the fine sweep below
  # begins, instead of running all the way up to 590000.
  e.throughputs = list(range(10000, 61000, 10000))
  # Toward the end, run throughputs 70000, 71000, 72000, ..., 120000.
  e.throughputs.extend(range(70000, 121000, 1000))
  e.rocksdb = GetRocksDBOptions(Scheduler.CFS, _NUM_CPUS, _NUM_CFS_WORKERS)
  e.rocksdb.cfs_wait_type = CfsWaitType.FUTEX
  e.rocksdb.get_exponential_mean = '1us'
  e.antagonist = GetAntagonistOptions(Scheduler.CFS, _NUM_ANTAGONIST_CPUS)
  e.ghost = None

  Run(e)
def main(argv: Sequence[str]):
  """Dispatches to `RunCfs` and/or `RunGhost` based on the arguments.

  Args:
    argv: The program name followed by one or two scheduler names.

  Raises:
    app.UsageError: If the number of arguments is wrong.
    ValueError: If a scheduler name is invalid or unknown.
  """
  arg_count = len(argv)
  if arg_count > 3:
    raise app.UsageError('Too many command-line arguments.')
  if arg_count == 1:
    raise app.UsageError(
        'No experiment specified. Pass `cfs` and/or `ghost` as arguments.')

  # Reject the whole command line up front if any scheduler name is invalid.
  scheduler_names = argv[1:]
  if not CheckSchedulers(scheduler_names):
    raise ValueError('Invalid scheduler specified.')

  # Run one experiment per scheduler name, in command-line order.
  for scheduler_name in scheduler_names:
    scheduler = Scheduler(scheduler_name)
    if scheduler == Scheduler.CFS:
      RunCfs()
    elif scheduler != Scheduler.GHOST:
      raise ValueError(f'Unknown scheduler {scheduler}.')
    else:
      RunGhost()
10 | For ghOSt, long requests that exceed their time slice are preempted so that they 11 | do not prevent short requests from running (i.e., ghOSt prevents head-of-line 12 | blocking). The preempted requests are added to the back of the FIFO. For CFS, 13 | requests are run to completion. 14 | """ 15 | 16 | from typing import Sequence 17 | from absl import app 18 | from experiments.scripts.options import CheckSchedulers 19 | from experiments.scripts.options import GetGhostOptions 20 | from experiments.scripts.options import GetRocksDBOptions 21 | from experiments.scripts.options import Scheduler 22 | from experiments.scripts.run import Experiment 23 | from experiments.scripts.run import Run 24 | 25 | _NUM_CPUS = 8 26 | _NUM_CFS_WORKERS = _NUM_CPUS - 2 27 | _NUM_GHOST_WORKERS = 200 28 | 29 | 30 | def RunCfs(): 31 | """Runs the CFS (Linux Completely Fair Scheduler) experiment.""" 32 | e: Experiment = Experiment() 33 | # Run throughputs 10000, 20000, 30000, and 40000. 34 | e.throughputs = list(i for i in range(10000, 50000, 10000)) 35 | # Toward the end, run throughputs 50000, 51000, 52000, ..., 80000. 36 | e.throughputs.extend(list(i for i in range(50000, 81000, 1000))) 37 | e.rocksdb = GetRocksDBOptions(Scheduler.CFS, _NUM_CPUS, _NUM_CFS_WORKERS) 38 | e.rocksdb.range_query_ratio = 0.005 39 | e.antagonist = None 40 | e.ghost = None 41 | 42 | Run(e) 43 | 44 | 45 | def RunGhost(): 46 | """Runs the ghOSt experiment.""" 47 | e: Experiment = Experiment() 48 | # Run throughputs 10000, 20000, 30000, ..., 130000. 49 | e.throughputs = list(i for i in range(10000, 140000, 10000)) 50 | # Toward the end, run throughputs 140000, 141000, 142000, ..., 150000.
51 | e.throughputs.extend(list(i for i in range(140000, 151000, 1000))) 52 | e.rocksdb = GetRocksDBOptions(Scheduler.GHOST, _NUM_CPUS, _NUM_GHOST_WORKERS) 53 | e.rocksdb.range_query_ratio = 0.005 54 | e.antagonist = None 55 | e.ghost = GetGhostOptions(_NUM_CPUS) 56 | e.ghost.preemption_time_slice = '30us' 57 | 58 | Run(e) 59 | 60 | 61 | def main(argv: Sequence[str]): 62 | if len(argv) > 3: 63 | raise app.UsageError('Too many command-line arguments.') 64 | elif len(argv) == 1: 65 | raise app.UsageError( 66 | 'No experiment specified. Pass `cfs` and/or `ghost` as arguments.') 67 | 68 | # First check that all of the command line arguments are valid. 69 | if not CheckSchedulers(argv[1:]): 70 | raise ValueError('Invalid scheduler specified.') 71 | 72 | # Run the experiments. 73 | for i in range(1, len(argv)): 74 | scheduler = Scheduler(argv[i]) 75 | if scheduler == Scheduler.CFS: 76 | RunCfs() 77 | else: 78 | if scheduler != Scheduler.GHOST: 79 | raise ValueError(f'Unknown scheduler {scheduler}.') 80 | RunGhost() 81 | 82 | 83 | if __name__ == '__main__': 84 | app.run(main) 85 | -------------------------------------------------------------------------------- /experiments/scripts/shinjuku_shenango.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Use of this source code is governed by a BSD-style 4 | # license that can be found in the LICENSE file or at 5 | # https://developers.google.com/open-source/licenses/bsd 6 | """Runs the RocksDB Shinjuku+Shenango experiments. 7 | 8 | This script runs the RocksDB Shinjuku+Shenango experiments on ghOSt and CFS. The 9 | experiments contain a mix of short and long requests (Shinjuku) and the RocksDB 10 | threads are co-located with Antagonist threads (Shenango).
def RunGhost():
  """Runs the Shinjuku+Shenango experiment on ghOSt.

  RocksDB (QoS 2) is co-located with an Antagonist (QoS 1), and ghOSt is
  configured with a 30us preemption time slice.
  """
  # Coarse sweep: 10000, 20000, 30000, ..., 130000.
  coarse_sweep = range(10000, 140000, 10000)
  # Fine sweep toward the end: 140000, 141000, 142000, ..., 150000.
  fine_sweep = range(140000, 151000, 1000)

  experiment: Experiment = Experiment()
  experiment.throughputs = list(coarse_sweep) + list(fine_sweep)

  experiment.rocksdb = GetRocksDBOptions(Scheduler.GHOST, _NUM_CPUS,
                                         _NUM_GHOST_WORKERS)
  experiment.rocksdb.range_query_ratio = 0.005
  experiment.rocksdb.ghost_qos = 2

  experiment.antagonist = GetAntagonistOptions(Scheduler.GHOST,
                                               _NUM_ANTAGONIST_CPUS)
  experiment.antagonist.ghost_qos = 1

  experiment.ghost = GetGhostOptions(_NUM_CPUS)
  experiment.ghost.preemption_time_slice = '30us'

  Run(experiment)
ghost::work_class& wc) { 20 | CHECK_EQ(wcid, wc.id); 21 | CheckWorkClassInRange(wcid); 22 | 23 | *table_.work_class(wcid) = wc; 24 | } 25 | 26 | void PrioTableHelper::CopySchedItem(ghost::sched_item& dst, 27 | const ghost::sched_item& src) const { 28 | dst.sid = src.sid; 29 | dst.wcid = src.wcid; 30 | dst.gpid = src.gpid; 31 | dst.flags = src.flags; 32 | dst.deadline = src.deadline; 33 | } 34 | 35 | void PrioTableHelper::GetSchedItem(uint32_t sid, ghost::sched_item& si) const { 36 | CheckSchedItemInRange(sid); 37 | 38 | CopySchedItem(si, *table_.sched_item(sid)); 39 | } 40 | 41 | void PrioTableHelper::SetSchedItem(uint32_t sid, const ghost::sched_item& si) { 42 | CHECK_EQ(sid, si.sid); 43 | CheckSchedItemInRange(si.sid); 44 | CheckWorkClassInRange(si.wcid); 45 | 46 | ghost::sched_item* curr = table_.sched_item(sid); 47 | uint32_t begin = curr->seqcount.write_begin(); 48 | CopySchedItem(*curr, si); 49 | curr->seqcount.write_end(begin); 50 | MarkUpdatedTableIndex(curr->sid); 51 | } 52 | 53 | PrioTableHelper::PrioTableHelper(uint32_t num_sched_items, 54 | uint32_t num_work_classes) 55 | : table_(num_sched_items, num_work_classes, 56 | ghost::PrioTable::StreamCapacity::kStreamCapacity83) { 57 | CHECK(num_sched_items == 0 || num_work_classes >= 1); 58 | } 59 | 60 | void PrioTableHelper::MarkRunnability(uint32_t sid, bool runnable) { 61 | CheckSchedItemInRange(sid); 62 | 63 | ghost::sched_item* si = table_.sched_item(sid); 64 | uint32_t begin = si->seqcount.write_begin(); 65 | if (runnable) { 66 | si->flags |= SCHED_ITEM_RUNNABLE; 67 | } else { 68 | si->flags &= ~SCHED_ITEM_RUNNABLE; 69 | } 70 | si->seqcount.write_end(begin); 71 | MarkUpdatedTableIndex(si->sid); 72 | } 73 | 74 | void PrioTableHelper::MarkRunnable(uint32_t sid) { 75 | MarkRunnability(sid, /*runnable=*/true); 76 | } 77 | 78 | void PrioTableHelper::MarkIdle(uint32_t sid) { 79 | MarkRunnability(sid, /*runnable=*/false); 80 | } 81 | 82 | void PrioTableHelper::WaitUntilRunnable(uint32_t sid) const { 83 | 
CheckSchedItemInRange(sid); 84 | 85 | ghost::sched_item* si = table_.sched_item(sid); 86 | std::atomic* flags = 87 | reinterpret_cast*>(&si->flags); 88 | while ((flags->load(std::memory_order_acquire) & SCHED_ITEM_RUNNABLE) == 0) { 89 | ghost::Pause(); 90 | } 91 | } 92 | 93 | } // namespace ghost_test 94 | -------------------------------------------------------------------------------- /experiments/shared/thread_pool.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "experiments/shared/thread_pool.h" 8 | 9 | namespace ghost_test { 10 | 11 | ExperimentThreadPool::~ExperimentThreadPool() { 12 | // Check that all threads have been joined. 13 | CHECK(absl::c_all_of(threads_, 14 | [](const std::unique_ptr& thread) { 15 | return !thread->Joinable(); 16 | })); 17 | } 18 | 19 | void ExperimentThreadPool::Init( 20 | const std::vector& ksched, 21 | const std::vector>& thread_work) { 22 | CHECK_EQ(ksched.size(), num_threads_); 23 | CHECK_EQ(ksched.size(), thread_work.size()); 24 | 25 | threads_.reserve(num_threads_); 26 | for (uint32_t i = 0; i < num_threads_; i++) { 27 | threads_.push_back(std::make_unique( 28 | ksched[i], 29 | std::bind(&ExperimentThreadPool::ThreadMain, this, i, thread_work[i]))); 30 | } 31 | } 32 | 33 | void ExperimentThreadPool::MarkExit(uint32_t sid) { 34 | thread_triggers_.Trigger(sid); 35 | } 36 | 37 | void ExperimentThreadPool::ThreadMain( 38 | uint32_t i, std::function thread_work) { 39 | while (!ShouldExit(i)) { 40 | thread_work(i); 41 | } 42 | num_exited_.fetch_add(1, std::memory_order_release); 43 | } 44 | 45 | void ExperimentThreadPool::Join() { 46 | // Check that all threads have already been notified to exit. 
If not, the call 47 | // to `Join` below will hang on one the threads because that thread will not 48 | // exit. 49 | for (uint32_t i = 0; i < num_threads_; i++) { 50 | CHECK(thread_triggers_.Triggered(/*sid=*/i)); 51 | } 52 | for (std::unique_ptr& thread : threads_) { 53 | // Check that `thread` is joinable. `thread` will not be joinable if it has 54 | // already been joined. 55 | CHECK(thread->Joinable()); 56 | thread->Join(); 57 | } 58 | } 59 | 60 | } // namespace ghost_test 61 | -------------------------------------------------------------------------------- /experiments/shared/thread_wait.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "experiments/shared/thread_wait.h" 8 | 9 | #include "lib/base.h" 10 | 11 | namespace ghost_test { 12 | 13 | ThreadWait::ThreadWait(uint32_t num_threads, WaitType wait_type) 14 | : num_threads_(num_threads), wait_type_(wait_type) { 15 | runnability_.reserve(num_threads); 16 | for (uint32_t i = 0; i < num_threads_; i++) { 17 | runnability_.push_back(std::make_unique>(0)); 18 | } 19 | } 20 | 21 | void ThreadWait::MarkRunnable(uint32_t sid) { 22 | CHECK_LT(sid, num_threads_); 23 | 24 | runnability_[sid]->store(1, std::memory_order_release); 25 | if (wait_type_ == WaitType::kFutex) { 26 | ghost::Futex::Wake(runnability_[sid].get(), 1); 27 | } 28 | } 29 | 30 | void ThreadWait::MarkIdle(uint32_t sid) { 31 | CHECK_LT(sid, num_threads_); 32 | 33 | runnability_[sid]->store(0, std::memory_order_release); 34 | } 35 | 36 | void ThreadWait::WaitUntilRunnable(uint32_t sid) const { 37 | CHECK_LT(sid, num_threads_); 38 | 39 | const std::unique_ptr>& r = runnability_[sid]; 40 | if (wait_type_ == WaitType::kSpin) { 41 | while (r->load(std::memory_order_acquire) == 0) { 42 | 
ghost::Pause(); 43 | } 44 | } else { 45 | CHECK_EQ(wait_type_, WaitType::kFutex); 46 | 47 | ghost::Futex::Wait(r.get(), 0); 48 | } 49 | } 50 | 51 | } // namespace ghost_test 52 | -------------------------------------------------------------------------------- /experiments/shared/thread_wait.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_EXPERIMENTS_SHARED_THREAD_WAIT_H_ 8 | #define GHOST_EXPERIMENTS_SHARED_THREAD_WAIT_H_ 9 | 10 | #include 11 | 12 | #include "lib/base.h" 13 | 14 | namespace ghost_test { 15 | 16 | // Support class for test apps that run experiments with threads that need to 17 | // wait. This class allows threads to be marked as idle/runnable and lets them 18 | // wait if they are idle until they are marked runnable again either by spinning 19 | // or sleeping on a futex. 20 | // 21 | // Example: 22 | // ThreadWait thread_wait_; 23 | // (Initialize with the number of threads you are using and the wait type.) 24 | // ... 25 | // Main Thread: thread_wait_.MarkIdle(/*sid=*/2); 26 | // ... 27 | // Thread 2: thread_wait_.WaitUntilRunnable(/*sid=*/2); 28 | // (Thread 2 now waits.) 29 | // ... 30 | // Thread 1: thread_wait_.MarkRunnable(/*sid=*/2); 31 | // (Thread 2 now returns from 'WaitUntilRunnable()' and does other work.) 32 | class ThreadWait { 33 | public: 34 | // When 'WaitUntilRunnable' is called, there are different ways to wait. Each 35 | // way affects performance differently. 36 | enum class WaitType { 37 | // Wait by spinning. Threads will return from 'WaitUntilRunnable' more 38 | // quickly when marked runnable but will burn up their CPU while waiting. 39 | kSpin, 40 | // Wait by sleeping on a futex. 
Threads will not burn up their CPU while 41 | // waiting but will return from 'WaitUntilRunnable' more slowly when marked 42 | // runnable. 43 | kFutex, 44 | }; 45 | 46 | ThreadWait(uint32_t num_threads, WaitType wait_type); 47 | 48 | // Marks 'sid' as runnable. 49 | void MarkRunnable(uint32_t sid); 50 | // Marks 'sid' as idle. 51 | void MarkIdle(uint32_t sid); 52 | // Waits until 'sid' is runnable. 53 | void WaitUntilRunnable(uint32_t sid) const; 54 | 55 | private: 56 | const uint32_t num_threads_; 57 | const WaitType wait_type_; 58 | std::vector>> runnability_; 59 | }; 60 | 61 | inline std::ostream& operator<<(std::ostream& os, 62 | ThreadWait::WaitType wait_type) { 63 | switch (wait_type) { 64 | case ThreadWait::WaitType::kSpin: 65 | return os << "Spin"; 66 | case ThreadWait::WaitType::kFutex: 67 | return os << "Futex"; 68 | } 69 | } 70 | 71 | } // namespace ghost_test 72 | 73 | #endif // GHOST_EXPERIMENTS_SHARED_THREAD_WAIT_H_ 74 | -------------------------------------------------------------------------------- /lib/agent.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "agent.h" 8 | 9 | #include 10 | 11 | #include "lib/scheduler.h" 12 | 13 | namespace ghost { 14 | 15 | Agent::~Agent() { 16 | enclave_->DetachAgent(this); 17 | CHECK(!thread_.joinable()); 18 | } 19 | 20 | void Agent::StartBegin() { thread_ = std::thread(&Agent::ThreadBody, this); } 21 | 22 | void Agent::StartComplete() { ready_.WaitForNotification(); } 23 | 24 | void LocalAgent::ThreadBody() { 25 | int queue_fd; 26 | Scheduler* s = AgentScheduler(); 27 | if (!s) { 28 | // Some tests don't have a scheduler. Those that don't need to set a 29 | // default channel before starting the agents, which the kernel will use. 
// Begins terminating the agent. The three steps run in a fixed order:
// 'finished_' is notified first, the agent is then pinged, and only after
// that is 'do_exit_' notified.
// NOTE(review): presumably 'do_exit_' is what the agent thread blocks on in
// WaitForExitNotification() - confirm against the header.
void Agent::TerminateBegin() {
  finished_.Notify();

  // Ensure that we return control to agent to observe finished.
  Ping();

  do_exit_.Notify();
}
85 | // 86 | // Since agent state transitions don't produce task messages we use 87 | // the GHOST_SW_F_CANFREE bit to check whether the kernel has invoked 88 | // the 'task_dead' callback. 89 | while (!status_word().can_free()) { 90 | absl::SleepFor(absl::Milliseconds(1)); 91 | } 92 | } 93 | 94 | // static 95 | const bool Agent::kVersionCheck = Ghost::CheckVersion(); 96 | 97 | } // namespace ghost 98 | -------------------------------------------------------------------------------- /lib/arr_structs.bpf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * Use of this source code is governed by a BSD-style 5 | * license that can be found in the LICENSE file or at 6 | * https://developers.google.com/open-source/licenses/bsd 7 | * 8 | * Helpers for building structures like linked lists where the elements are 9 | * indexes in an array instead of pointers. 10 | */ 11 | 12 | #ifndef GHOST_LIB_ARR_STRUCTS_BPF_H_ 13 | #define GHOST_LIB_ARR_STRUCTS_BPF_H_ 14 | 15 | #ifdef __BPF__ 16 | #include "third_party/bpf/common.bpf.h" 17 | #else 18 | #define BOUNDED_ARRAY_IDX(arr, arr_sz, idx) &(arr)[(idx)] 19 | #endif 20 | 21 | /* For older gcc, typeof may be undefined. */ 22 | #ifndef typeof 23 | #define typeof(x) __typeof__(x) 24 | #endif 25 | 26 | /* Helper to prevent the compiler from optimizing bounds check on x. */ 27 | #ifndef BPF_MUST_CHECK 28 | #define BPF_MUST_CHECK(x) ({ asm volatile ("" : "+r"(x)); x; }) 29 | #endif 30 | 31 | /* 32 | * Lookup the elem for an id. Returns a pointer to the elem or NULL. 33 | */ 34 | #define __id_to_elem(arr, arr_sz, id) ({ \ 35 | size_t ___id = id; \ 36 | ___id ? BOUNDED_ARRAY_IDX(arr, arr_sz, ___id - 1) : NULL; \ 37 | }) 38 | 39 | /* 40 | * Lookup the id for an elem. elem must be in arr. 41 | * 42 | * The manual pointer arithmetic avoids signed division, which is not allowed in 43 | * BPF. (The difference of pointers is signed). 
/*
 * Given 'ptr', a pointer to the field 'member' embedded in a 'type', returns a
 * pointer to the enclosing 'type'.
 *
 * 'ptr' is parenthesized before the cast so that compound arguments such as
 * 'cond ? a : b' or 'base + i' are cast as a whole; without the parentheses
 * the '(char*)' cast would bind only to the first operand.
 */
#ifndef container_of
#define container_of(ptr, type, member) ({				\
	(type*)((char*)(ptr) - offsetof(type, member));			\
})
#endif
#define FluxCheckMaps(bpf_obj) ({ \ 39 | CHECK_EQ(bpf_map__value_size(bpf_obj->maps.cpu_data), \ 40 | FLUX_MAX_CPUS * sizeof(struct flux_cpu)); \ 41 | CHECK_EQ(bpf_map__value_size(bpf_obj->maps.thread_data), \ 42 | FLUX_MAX_GTIDS * sizeof(struct flux_thread)); \ 43 | }) 44 | 45 | #endif // GHOST_LIB_FLUX_H_ 46 | -------------------------------------------------------------------------------- /lib/ghost_uapi.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "lib/ghost_uapi.h" 8 | 9 | // The global symbol in combination with alwayslink=1 ensures that only 10 | // a single instance of the ghost_uapi library is depended on by a cc_binary 11 | // target. 12 | // 13 | // Without this we could have some intermediate libraries or the binary 14 | // itself compile against one ABI while other libraries are compiled 15 | // against a different ABI. 
16 | // 17 | // Now if a binary inadvertently takes a dependency on 'ghost_uapi' 18 | // _and_ 'ghost_uapi_75' then the linker will complain as follows: 19 | // ld: error: duplicate symbol: did_you_take_an_unintended_ghost_uapi_dependency 20 | 21 | int did_you_take_an_unintended_ghost_uapi_dependency = GHOST_VERSION; 22 | -------------------------------------------------------------------------------- /lib/ghost_uapi.h: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_LIB_GHOST_UAPI_H_ 8 | #define GHOST_LIB_GHOST_UAPI_H_ 9 | 10 | #ifndef GHOST_SELECT_ABI 11 | #include "abi/latest/kernel/ghost.h" 12 | #elif GHOST_SELECT_ABI == 84 13 | #include "abi/84/kernel/ghost.h" 14 | #elif GHOST_SELECT_ABI == 90 15 | #include "abi/90/kernel/ghost.h" 16 | #else 17 | #error "missing an abi?" 
18 | #endif 19 | 20 | #endif // GHOST_LIB_GHOST_UAPI_H_ 21 | -------------------------------------------------------------------------------- /lib/logging.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_LIB_LOGGING_H_ 8 | #define GHOST_LIB_LOGGING_H_ 9 | 10 | #include 11 | 12 | #include "absl/log/check.h" 13 | #include "absl/log/log.h" 14 | #include "absl/strings/str_format.h" 15 | #include "third_party/util/util.h" 16 | 17 | #ifndef GHOST_DEBUG 18 | #ifdef NDEBUG 19 | #define GHOST_DEBUG 0 20 | #else 21 | #define GHOST_DEBUG 1 22 | #endif // !NDEBUG 23 | #endif // !GHOST_DEBUG 24 | // Fallback VLOG: emits only when verbose() is at least `level`, mirroring GHOST_DPRINT below. 25 | #ifndef VLOG 26 | #ifdef NDEBUG 27 | #define VLOG(level) LOG_IF(INFO, false) 28 | #else 29 | #define VLOG(level) LOG_IF(INFO, verbose() >= (level)) 30 | #endif // !NDEBUG 31 | #endif // !VLOG 32 | 33 | // TODO: Consider deprecating GHOST_DPRINT once we migrate to VLOG. 34 | #define GHOST_DPRINT(level, target, fmt, ...) \ 35 | do { \ 36 | if (verbose() < level) break; \ 37 | absl::FPrintF(target, fmt "\n", ##__VA_ARGS__); \ 38 | } while (0) 39 | 40 | #define GHOST_ERROR(fmt, ...) 
\ 41 | do { \ 42 | LOG(FATAL) << "(" << ghost::GetTID() << ") " \ 43 | << absl::StrFormat(fmt, ##__VA_ARGS__); \ 44 | } while (0) 45 | 46 | #define GHOST_I_AM_HERE \ 47 | do { \ 48 | LOG(INFO) << "GHOST_I_AM_HERE: PID " << getpid() << " " \ 49 | << ghost::Gtid::Current().describe() << " at " << __func__; \ 50 | } while (0) 51 | 52 | #endif // GHOST_LIB_LOGGING_H_ 53 | -------------------------------------------------------------------------------- /lib/trivial_status.cc: -------------------------------------------------------------------------------- 1 | #include "lib/trivial_status.h" 2 | #include "absl/strings/str_format.h" 3 | 4 | namespace ghost { 5 | 6 | namespace { 7 | 8 | template 9 | void CopyString(std::array& dest, absl::string_view s) { 10 | static_assert(ArraySize > 0); 11 | const size_t chars_to_copy = std::min(ArraySize - 1, s.size()); 12 | if (chars_to_copy < s.size()) { 13 | absl::FPrintF(stderr, 14 | "Source string too large to fit in TrivialStatus: %zu, vs " 15 | "max_size %zu\n", 16 | s.size(), chars_to_copy); 17 | } 18 | std::copy_n(s.begin(), chars_to_copy, dest.begin()); 19 | dest[chars_to_copy] = '\0'; 20 | } 21 | 22 | } // namespace 23 | 24 | TrivialStatus::TrivialStatus(const absl::Status& s) { 25 | code_ = s.code(); 26 | 27 | CopyString(error_message_, s.message()); 28 | } 29 | 30 | TrivialStatusOrString::TrivialStatusOrString( 31 | const absl::StatusOr& s) 32 | : status_(TrivialStatus(s.status())) { 33 | if (s.ok()) { 34 | string_length_ = std::min(s.value().size(), str_.size() - 1); // Clamp: CopyString() keeps at most str_.size() - 1 chars, so ToStatusOr() must not read more. 35 | CopyString(str_, s.value()); 36 | } 37 | } 38 | 39 | absl::StatusOr TrivialStatusOrString::ToStatusOr() const 40 | { 41 | absl::Status s = status_.ToStatus(); 42 | if (s.ok()) { 43 | return std::string(str_.data(), string_length_); 44 | } 45 | return s; 46 | } 47 | 48 | } // namespace ghost 49 | -------------------------------------------------------------------------------- /lib/trivial_status.h: -------------------------------------------------------------------------------- 1 | 
#ifndef GHOST_LIB_TRIVIAL_STATUS_H_ 2 | #define GHOST_LIB_TRIVIAL_STATUS_H_ 3 | 4 | #include "absl/log/check.h" 5 | #include "absl/status/status.h" 6 | #include "absl/status/statusor.h" 7 | 8 | namespace ghost { 9 | 10 | // This is a trivially copyable version of absl::Status. This is useful 11 | // because it can be serialized across the shared memory AgentRpcBuffer. 12 | class TrivialStatus { 13 | public: 14 | explicit TrivialStatus() : TrivialStatus(absl::OkStatus()) {} 15 | explicit TrivialStatus(const absl::Status& s); 16 | 17 | // Returns the absl::Status version of this object. 18 | absl::Status ToStatus() const { 19 | return absl::Status(code_, std::string(error_message_.data())); 20 | } 21 | 22 | bool ok() const { return code_ == absl::StatusCode::kOk; } 23 | 24 | private: 25 | static constexpr size_t kMaxErrorMessageSize = 1000; 26 | 27 | absl::StatusCode code_; 28 | 29 | // Sized large enough to handle most error messages. Must fit in 30 | // AgentRpcBuffer BufferBytes. 31 | std::array error_message_; 32 | }; 33 | 34 | // This is a trivially copyable version of absl::StatusOr. This is useful 35 | // because it can be serialized across the shared memory AgentRpcBuffer. 36 | template 37 | class TrivialStatusOr { 38 | public: 39 | explicit TrivialStatusOr() : status_(TrivialStatus(absl::OkStatus())) {} 40 | 41 | // Constructs a TrivialStatusOr from an error status. 42 | explicit TrivialStatusOr(const absl::Status& s) : status_(TrivialStatus(s)) { 43 | CHECK(!s.ok()); 44 | } 45 | 46 | explicit TrivialStatusOr(const T& val) 47 | : status_(TrivialStatus(absl::OkStatus())) { 48 | value_ = val; 49 | } 50 | 51 | explicit TrivialStatusOr(const absl::StatusOr& s) 52 | : status_(TrivialStatus(s.status())) { 53 | if (s.ok()) { 54 | value_ = s.value(); 55 | } 56 | } 57 | 58 | // Returns the absl::StatusOr version of this object. 
59 | absl::StatusOr ToStatusOr() const { 60 | absl::Status s = status_.ToStatus(); 61 | if (s.ok()) { 62 | return value_; 63 | } 64 | return s; 65 | } 66 | 67 | bool ok() const { return status_.ok(); } 68 | 69 | private: 70 | TrivialStatus status_; 71 | 72 | // If the status is OK, this stores the contained value. 73 | T value_; 74 | }; 75 | 76 | // This is a trivially copyable version of absl::StatusOr. This is 77 | // useful because it can be serialized across the shared memory AgentRpcBuffer. 78 | class TrivialStatusOrString { 79 | public: 80 | explicit TrivialStatusOrString() 81 | : status_(TrivialStatus(absl::OkStatus())) {} 82 | 83 | explicit TrivialStatusOrString(const absl::StatusOr& s); 84 | 85 | // Returns the absl::StatusOr version of this object. 86 | absl::StatusOr ToStatusOr() const; 87 | 88 | bool ok() const { return status_.ok(); } 89 | 90 | private: 91 | static constexpr size_t kMaxStringSize = 30000; 92 | 93 | TrivialStatus status_; 94 | 95 | // If the status is OK, this stores the contained string. 96 | // Must fit in AgentRpcBuffer BufferBytes. 97 | std::array str_; 98 | 99 | // Not all strings will use null terminators, so we must track the original 100 | // size of the std::string. 
101 | size_t string_length_ = 0; 102 | }; 103 | 104 | } // namespace ghost 105 | 106 | #endif // GHOST_LIB_TRIVIAL_STATUS_H_ 107 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | -------------------------------------------------------------------------------- /schedulers/biff/agent_biff.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include "absl/debugging/symbolize.h" 12 | #include "absl/flags/parse.h" 13 | #include "lib/agent.h" 14 | #include "lib/channel.h" 15 | #include "lib/enclave.h" 16 | #include "lib/topology.h" 17 | #include "schedulers/biff/biff_scheduler.h" 18 | 19 | ABSL_FLAG(std::string, enclave, "", "Connect to preexisting enclave directory"); 20 | 21 | int main(int argc, char* argv[]) { 22 | absl::InitializeSymbolizer(argv[0]); 23 | absl::ParseCommandLine(argc, argv); 24 | 25 | ghost::Topology* t = ghost::MachineTopology(); 26 | ghost::AgentConfig config(t, t->all_cpus()); 27 | std::string enclave = absl::GetFlag(FLAGS_enclave); 28 | if (!enclave.empty()) { 29 | int fd = open(enclave.c_str(), O_PATH); 30 | CHECK_GE(fd, 0); 31 | config.enclave_fd_ = fd; 32 | } 33 | 34 | auto uap = new ghost::AgentProcess, 35 | ghost::AgentConfig>(config); 36 | 37 | ghost::GhostHelper()->InitCore(); 38 | 39 | printf("Initialization complete, ghOSt active.\n"); 40 | fflush(stdout); 41 | 42 | ghost::Notification exit; 43 | static bool first = true; 44 | ghost::GhostSignals::AddHandler(SIGINT, [&exit](int) { 45 | if (first) { 46 | exit.Notify(); 47 | first = false; 48 | return false; 49 | } 50 | return true; 51 | }); 52 
| 53 | exit.WaitForNotification(); 54 | 55 | delete uap; 56 | 57 | printf("\nDone!\n"); 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /schedulers/biff/biff_scheduler.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "schedulers/biff/biff_scheduler.h" 8 | 9 | #include "absl/strings/str_format.h" 10 | #include "third_party/bpf/topology.bpf.h" 11 | #include "bpf/user/agent.h" 12 | 13 | namespace ghost { 14 | 15 | BiffScheduler::BiffScheduler(Enclave* enclave, CpuList cpulist, 16 | const AgentConfig& config) 17 | : Scheduler(enclave, std::move(cpulist)), 18 | unused_channel_(GHOST_MAX_QUEUE_ELEMS, /*node=*/0) { 19 | 20 | bpf_obj_ = biff_bpf__open(); 21 | CHECK_NE(bpf_obj_, nullptr); 22 | 23 | bpf_program__set_types(bpf_obj_->progs.biff_pnt, 24 | BPF_PROG_TYPE_GHOST_SCHED, BPF_GHOST_SCHED_PNT); 25 | bpf_program__set_types(bpf_obj_->progs.biff_msg_send, BPF_PROG_TYPE_GHOST_MSG, 26 | BPF_GHOST_MSG_SEND); 27 | bpf_program__set_types(bpf_obj_->progs.biff_select_rq, 28 | BPF_PROG_TYPE_GHOST_SELECT_RQ, BPF_GHOST_SELECT_RQ); 29 | 30 | bpf_obj_->rodata->enable_bpf_printd = CapHas(CAP_PERFMON); 31 | SetBpfTopologyVars(bpf_obj_->rodata, MachineTopology()); 32 | 33 | CHECK_EQ(biff_bpf__load(bpf_obj_), 0); 34 | 35 | CHECK_EQ(agent_bpf_register(bpf_obj_->progs.biff_pnt, BPF_GHOST_SCHED_PNT), 36 | 0); 37 | CHECK_EQ(agent_bpf_register(bpf_obj_->progs.biff_msg_send, 38 | BPF_GHOST_MSG_SEND), 0); 39 | CHECK_EQ(agent_bpf_register(bpf_obj_->progs.biff_select_rq, 40 | BPF_GHOST_SELECT_RQ), 0); 41 | 42 | bpf_cpu_data_ = static_cast( 43 | bpf_map__mmap(bpf_obj_->maps.cpu_data)); 44 | CHECK_NE(bpf_cpu_data_, MAP_FAILED); 45 | 46 | bpf_sw_data_ = static_cast( 47 | 
bpf_map__mmap(bpf_obj_->maps.sw_data)); 48 | CHECK_NE(bpf_sw_data_, MAP_FAILED); 49 | } 50 | 51 | BiffScheduler::~BiffScheduler() { 52 | bpf_map__munmap(bpf_obj_->maps.cpu_data, bpf_cpu_data_); 53 | bpf_map__munmap(bpf_obj_->maps.sw_data, bpf_sw_data_); 54 | biff_bpf__destroy(bpf_obj_); 55 | } 56 | 57 | void BiffScheduler::EnclaveReady() { 58 | enclave()->SetDeliverTicks(true); 59 | enclave()->SetDeliverCpuAvailability(true); 60 | WRITE_ONCE(bpf_obj_->bss->initialized, true); 61 | } 62 | 63 | void BiffScheduler::DiscoverTasks() { 64 | enclave()->DiscoverTasks(); 65 | } 66 | 67 | void BiffAgentTask::AgentThread() { 68 | gtid().assign_name("Agent:" + std::to_string(cpu().id())); 69 | 70 | SignalReady(); 71 | WaitForEnclaveReady(); 72 | 73 | while (!Finished()) { 74 | RunRequest* req = enclave()->GetRunRequest(cpu()); 75 | req->LocalYield(status_word().barrier(), /*flags=*/0); 76 | } 77 | } 78 | 79 | } // namespace ghost 80 | -------------------------------------------------------------------------------- /schedulers/biff/biff_scheduler.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_SCHEDULERS_BIFF_BIFF_SCHEDULER_H_ 8 | #define GHOST_SCHEDULERS_BIFF_BIFF_SCHEDULER_H_ 9 | 10 | #include 11 | #include 12 | 13 | #include "third_party/bpf/biff_bpf.h" 14 | #include "lib/agent.h" 15 | #include "lib/scheduler.h" 16 | #include "schedulers/biff/biff_bpf.skel.h" 17 | 18 | namespace ghost { 19 | 20 | class BiffScheduler : public Scheduler { 21 | public: 22 | explicit BiffScheduler(Enclave* enclave, CpuList cpulist, 23 | const AgentConfig& config); 24 | ~BiffScheduler() final; 25 | 26 | void EnclaveReady() final; 27 | void DiscoverTasks() final; 28 | Channel& GetDefaultChannel() final { return 
unused_channel_; }; 29 | 30 | private: 31 | LocalChannel unused_channel_; 32 | struct biff_bpf* bpf_obj_; 33 | struct biff_bpf_cpu_data* bpf_cpu_data_; 34 | struct biff_bpf_sw_data* bpf_sw_data_; 35 | }; 36 | 37 | class BiffAgentTask : public LocalAgent { 38 | public: 39 | BiffAgentTask(Enclave* enclave, Cpu cpu, BiffScheduler* biff_sched) 40 | : LocalAgent(enclave, cpu), biff_sched_(biff_sched) {} 41 | 42 | void AgentThread() override; 43 | Scheduler* AgentScheduler() const override { return biff_sched_; } 44 | 45 | private: 46 | BiffScheduler* biff_sched_; 47 | }; 48 | 49 | template 50 | class FullBiffAgent : public FullAgent { 51 | public: 52 | explicit FullBiffAgent(AgentConfig config) 53 | : FullAgent(config) { 54 | biff_sched_ = std::make_unique( 55 | &this->enclave_, *this->enclave_.cpus(), config); 56 | this->StartAgentTasks(); 57 | this->enclave_.Ready(); 58 | } 59 | 60 | ~FullBiffAgent() override { 61 | this->enclave_.SetDeliverCpuAvailability(false); 62 | this->TerminateAgentTasks(); 63 | } 64 | 65 | std::unique_ptr MakeAgent(const Cpu& cpu) override { 66 | return std::make_unique(&this->enclave_, cpu, 67 | biff_sched_.get()); 68 | } 69 | 70 | void RpcHandler(int64_t req, const AgentRpcArgs& args, 71 | AgentRpcResponse& response) override { 72 | switch (req) { 73 | default: 74 | response.response_code = -1; 75 | return; 76 | } 77 | } 78 | 79 | private: 80 | std::unique_ptr biff_sched_; 81 | }; 82 | 83 | } // namespace ghost 84 | 85 | #endif // GHOST_SCHEDULERS_BIFF_BIFF_SCHEDULER_H_ 86 | -------------------------------------------------------------------------------- /schedulers/cfs/README.md: -------------------------------------------------------------------------------- 1 | # ghOSt CFS Agent 2 | 3 | CFS is the default scheduler in the Linux kernel. The CFS agent is a (currently 4 | incomplete) implementation of this scheduling policy as a ghost userspace 5 | agent. Currently it assigns new tasks in a round-robin fashion to CPUs. 
Each CPU 6 | has a runqueue; when ghost receives a message to schedule a ghost task on a cpu, 7 | it simply plucks the one with the lowest vruntime. 8 | 9 | To bring this agent to parity with CFS in the kernel, some items left to 10 | implement are: 11 | 12 | - load balancing 13 | 14 | - nice values 15 | 16 | - work stealing 17 | 18 | - group scheduling 19 | 20 | Once at feature parity, this agent can be used to deduce the "ghost" tax and 21 | be used to quickly iterate on parameter tuning. 22 | -------------------------------------------------------------------------------- /schedulers/cfs/cfs_agent.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "absl/debugging/symbolize.h" 13 | #include "absl/flags/parse.h" 14 | #include "lib/agent.h" 15 | #include "lib/enclave.h" 16 | #include "schedulers/cfs/cfs_scheduler.h" 17 | 18 | ABSL_FLAG(std::string, ghost_cpus, "1-5", "cpulist"); 19 | ABSL_FLAG(std::string, enclave, "", "Connect to preexisting enclave directory"); 20 | 21 | // Scheduling tuneables 22 | ABSL_FLAG( 23 | absl::Duration, min_granularity, absl::Milliseconds(1), 24 | "The minimum time a task will run before being preempted by another task"); 25 | ABSL_FLAG(absl::Duration, latency, absl::Milliseconds(10), 26 | "The target time period in which all tasks will run at least once"); 27 | 28 | namespace ghost { 29 | 30 | static void ParseAgentConfig(CfsConfig* config) { 31 | CpuList ghost_cpus = 32 | MachineTopology()->ParseCpuStr(absl::GetFlag(FLAGS_ghost_cpus)); 33 | CHECK(!ghost_cpus.Empty()); 34 | 35 | Topology* topology = MachineTopology(); 36 | config->topology_ = topology; 37 | config->cpus_ = ghost_cpus; 38 | std::string enclave = 
absl::GetFlag(FLAGS_enclave); 39 | if (!enclave.empty()) { 40 | int fd = open(enclave.c_str(), O_PATH); 41 | CHECK_GE(fd, 0); 42 | config->enclave_fd_ = fd; 43 | } 44 | 45 | config->min_granularity_ = absl::GetFlag(FLAGS_min_granularity); 46 | config->latency_ = absl::GetFlag(FLAGS_latency); 47 | } 48 | 49 | } // namespace ghost 50 | 51 | int main(int argc, char* argv[]) { 52 | absl::InitializeSymbolizer(argv[0]); 53 | absl::ParseCommandLine(argc, argv); 54 | 55 | ghost::CfsConfig config; 56 | ghost::ParseAgentConfig(&config); 57 | 58 | printf("Initializing...\n"); 59 | 60 | // Using new so we can destruct the object before printing Done 61 | auto uap = new ghost::AgentProcess, 62 | ghost::CfsConfig>(config); 63 | 64 | ghost::GhostHelper()->InitCore(); 65 | printf("Initialization complete, ghOSt active.\n"); 66 | // When `stdout` is directed to a terminal, it is newline-buffered. When 67 | // `stdout` is directed to a non-interactive device (e.g., a Python subprocess 68 | // pipe), it is fully buffered. Thus, in order for the Python script to read 69 | // the initialization message as soon as it is passed to `printf`, we need to 70 | // manually flush `stdout`. 71 | fflush(stdout); 72 | 73 | ghost::Notification exit; 74 | ghost::GhostSignals::AddHandler(SIGINT, [&exit](int) { 75 | static bool first = true; // Only the first SIGINT requests shutdown via `exit`. 76 | 77 | if (first) { 78 | exit.Notify(); 79 | first = false; 80 | return false; // Subsequent SIGINTs take the `return true` path. 
81 | } 82 | return true; 83 | }); 84 | 85 | // TODO: this is racy - uap could be deleted already 86 | ghost::GhostSignals::AddHandler(SIGUSR1, [uap](int) { 87 | uap->Rpc(ghost::CfsScheduler::kDebugRunqueue); 88 | return false; 89 | }); 90 | 91 | exit.WaitForNotification(); 92 | 93 | delete uap; 94 | 95 | printf("\nDone!\n"); 96 | 97 | return 0; 98 | } 99 | -------------------------------------------------------------------------------- /schedulers/cfs_bpf/agent_cfs.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include "absl/debugging/symbolize.h" 12 | #include "absl/flags/parse.h" 13 | #include "lib/agent.h" 14 | #include "lib/channel.h" 15 | #include "lib/enclave.h" 16 | #include "lib/topology.h" 17 | #include "schedulers/cfs_bpf/cfs_scheduler.h" 18 | 19 | ABSL_FLAG(std::string, enclave, "", "Connect to preexisting enclave directory"); 20 | 21 | int main(int argc, char* argv[]) { 22 | absl::InitializeSymbolizer(argv[0]); 23 | absl::ParseCommandLine(argc, argv); 24 | 25 | ghost::Topology* t = ghost::MachineTopology(); 26 | ghost::AgentConfig config(t, t->all_cpus()); 27 | std::string enclave = absl::GetFlag(FLAGS_enclave); 28 | if (!enclave.empty()) { 29 | int fd = open(enclave.c_str(), O_PATH); 30 | CHECK_GE(fd, 0); 31 | config.enclave_fd_ = fd; 32 | } 33 | 34 | 35 | auto uap = new ghost::AgentProcess, 36 | ghost::AgentConfig>(config); 37 | 38 | ghost::GhostHelper()->InitCore(); 39 | 40 | printf("Initialization complete, ghOSt active.\n"); 41 | fflush(stdout); 42 | 43 | ghost::Notification exit; 44 | static bool first = true; 45 | ghost::GhostSignals::AddHandler(SIGINT, [&exit](int) { 46 | if (first) { 47 | exit.Notify(); 48 | first = false; 49 | return 
false; 50 | } 51 | return true; 52 | }); 53 | 54 | exit.WaitForNotification(); 55 | 56 | delete uap; 57 | 58 | printf("\nDone!\n"); 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /schedulers/cfs_bpf/cfs_scheduler.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "schedulers/cfs_bpf/cfs_scheduler.h" 8 | 9 | #include "absl/strings/str_format.h" 10 | #include "bpf/user/agent.h" 11 | 12 | namespace ghost { 13 | 14 | CfsScheduler::CfsScheduler(Enclave* enclave, CpuList cpulist, 15 | const AgentConfig& config) 16 | : Scheduler(enclave, std::move(cpulist)), 17 | unused_channel_(1, /*node=*/0) { 18 | 19 | bpf_obj_ = cfs_bpf__open(); 20 | CHECK_NE(bpf_obj_, nullptr); 21 | 22 | 23 | bpf_program__set_types(bpf_obj_->progs.cfs_pnt, 24 | BPF_PROG_TYPE_GHOST_SCHED, BPF_GHOST_SCHED_PNT); 25 | bpf_program__set_types(bpf_obj_->progs.cfs_msg_send, BPF_PROG_TYPE_GHOST_MSG, 26 | BPF_GHOST_MSG_SEND); 27 | 28 | CHECK_EQ(cfs_bpf__load(bpf_obj_), 0); 29 | 30 | CHECK_EQ(agent_bpf_register(bpf_obj_->progs.cfs_pnt, BPF_GHOST_SCHED_PNT), 31 | 0); 32 | CHECK_EQ(agent_bpf_register(bpf_obj_->progs.cfs_msg_send, 33 | BPF_GHOST_MSG_SEND), 0); 34 | 35 | bpf_cpu_data_ = static_cast( 36 | bpf_map__mmap(bpf_obj_->maps.cpu_data)); 37 | CHECK_NE(bpf_cpu_data_, MAP_FAILED); 38 | 39 | bpf_thread_data_ = static_cast( 40 | bpf_map__mmap(bpf_obj_->maps.thread_data)); 41 | CHECK_NE(bpf_thread_data_, MAP_FAILED); 42 | 43 | enclave->SetDeliverCpuAvailability(false); 44 | } 45 | 46 | CfsScheduler::~CfsScheduler() { 47 | bpf_map__munmap(bpf_obj_->maps.cpu_data, bpf_cpu_data_); 48 | bpf_map__munmap(bpf_obj_->maps.thread_data, bpf_thread_data_); 49 | cfs_bpf__destroy(bpf_obj_); 50 | } 51 | 
52 | void CfsScheduler::EnclaveReady() { 53 | enclave()->SetWakeOnWakerCpu(false); 54 | enclave()->SetDeliverTicks(true); 55 | WRITE_ONCE(bpf_obj_->bss->initialized, true); 56 | } 57 | 58 | void CfsScheduler::DiscoverTasks() { 59 | enclave()->DiscoverTasks(); 60 | } 61 | 62 | void CfsAgentTask::AgentThread() { 63 | gtid().assign_name("Agent:" + std::to_string(cpu().id())); 64 | 65 | SignalReady(); 66 | WaitForEnclaveReady(); 67 | 68 | while (!Finished()) { 69 | RunRequest* req = enclave()->GetRunRequest(cpu()); 70 | req->LocalYield(status_word().barrier(), /*flags=*/0); 71 | } 72 | } 73 | 74 | } // namespace ghost 75 | -------------------------------------------------------------------------------- /schedulers/cfs_bpf/cfs_scheduler.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_SCHEDULERS_CFS_BPF_CFS_SCHEDULER_H_ 8 | #define GHOST_SCHEDULERS_CFS_BPF_CFS_SCHEDULER_H_ 9 | 10 | #include 11 | 12 | #include "third_party/bpf/cfs_bpf.h" 13 | #include "lib/agent.h" 14 | #include "lib/scheduler.h" 15 | #include "schedulers/cfs_bpf/cfs_bpf.skel.h" 16 | 17 | namespace ghost { 18 | 19 | class CfsScheduler : public Scheduler { 20 | public: 21 | explicit CfsScheduler(Enclave* enclave, CpuList cpulist, 22 | const AgentConfig& config); 23 | ~CfsScheduler() final; 24 | 25 | void EnclaveReady() final; 26 | void DiscoverTasks() final; 27 | Channel& GetDefaultChannel() final { return unused_channel_; }; 28 | 29 | private: 30 | LocalChannel unused_channel_; 31 | struct cfs_bpf* bpf_obj_; 32 | struct cfs_bpf_cpu_data* bpf_cpu_data_; 33 | struct cfs_bpf_thread* bpf_thread_data_; 34 | }; 35 | 36 | class CfsAgentTask : public LocalAgent { 37 | public: 38 | CfsAgentTask(Enclave* enclave, Cpu cpu, CfsScheduler* cfs_sched) 39 | : 
LocalAgent(enclave, cpu), cfs_sched_(cfs_sched) {} 40 | 41 | void AgentThread() override; 42 | Scheduler* AgentScheduler() const override { return cfs_sched_; } 43 | 44 | private: 45 | CfsScheduler* cfs_sched_; 46 | }; 47 | 48 | template 49 | class FullCfsAgent : public FullAgent { 50 | public: 51 | explicit FullCfsAgent(AgentConfig config) 52 | : FullAgent(config) { 53 | cfs_sched_ = absl::make_unique( 54 | &this->enclave_, *this->enclave_.cpus(), config); 55 | this->StartAgentTasks(); 56 | this->enclave_.Ready(); 57 | } 58 | 59 | ~FullCfsAgent() override { 60 | this->TerminateAgentTasks(); 61 | } 62 | 63 | std::unique_ptr MakeAgent(const Cpu& cpu) override { 64 | return absl::make_unique(&this->enclave_, cpu, 65 | cfs_sched_.get()); 66 | } 67 | 68 | void RpcHandler(int64_t req, const AgentRpcArgs& args, 69 | AgentRpcResponse& response) override { 70 | switch (req) { 71 | default: 72 | response.response_code = -1; 73 | return; 74 | } 75 | } 76 | 77 | private: 78 | std::unique_ptr cfs_sched_; 79 | }; 80 | 81 | } // namespace ghost 82 | 83 | #endif // GHOST_SCHEDULERS_CFS_BPF_CFS_SCHEDULER_H_ 84 | -------------------------------------------------------------------------------- /schedulers/edf/agent_exp.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | #include 9 | 10 | #include "absl/debugging/symbolize.h" 11 | #include "absl/flags/parse.h" 12 | #include "lib/agent.h" 13 | #include "lib/channel.h" 14 | #include "lib/enclave.h" 15 | #include "lib/topology.h" 16 | #include "schedulers/edf/edf_scheduler.h" 17 | 18 | ABSL_FLAG(std::string, ghost_cpus, "1-5", "cpulist"); 19 | ABSL_FLAG( 20 | int32_t, globalcpu, -1, 21 | "Global cpu. 
If -1, then defaults to the lowest CPU in )"); 22 | ABSL_FLAG(bool, ticks, false, "Generate cpu tick messages"); 23 | ABSL_FLAG(std::string, enclave, "", "Connect to preexisting enclave directory"); 24 | 25 | namespace ghost { 26 | 27 | void ParseGlobalConfig(GlobalConfig* config) { 28 | CpuList ghost_cpus = 29 | MachineTopology()->ParseCpuStr(absl::GetFlag(FLAGS_ghost_cpus)); 30 | // One CPU for the spinning global agent and at least one other for running 31 | // scheduled ghOSt tasks. 32 | CHECK_GE(ghost_cpus.Size(), 2); 33 | 34 | int globalcpu = absl::GetFlag(FLAGS_globalcpu); 35 | if (globalcpu < 0) { 36 | CHECK_EQ(globalcpu, -1); 37 | globalcpu = ghost_cpus.Front().id(); 38 | } 39 | CHECK(ghost_cpus.IsSet(globalcpu)); 40 | 41 | Topology* topology = MachineTopology(); 42 | config->topology_ = topology; 43 | config->cpus_ = ghost_cpus; 44 | config->global_cpu_ = topology->cpu(globalcpu); 45 | config->edf_ticks_ = absl::GetFlag(FLAGS_ticks) ? CpuTickConfig::kAllTicks 46 | : CpuTickConfig::kNoTicks; 47 | 48 | std::string enclave = absl::GetFlag(FLAGS_enclave); 49 | if (!enclave.empty()) { 50 | int fd = open(enclave.c_str(), O_PATH); 51 | CHECK_GE(fd, 0); 52 | config->enclave_fd_ = fd; 53 | } 54 | } 55 | 56 | } // namespace ghost 57 | 58 | int main(int argc, char* argv[]) { 59 | absl::InitializeSymbolizer(argv[0]); 60 | absl::ParseCommandLine(argc, argv); 61 | 62 | ghost::GlobalConfig config; 63 | ghost::ParseGlobalConfig(&config); 64 | 65 | printf("Core map\n"); 66 | 67 | int n = 0; 68 | for (const ghost::Cpu& c : config.topology_->all_cores()) { 69 | printf("( "); 70 | for (const ghost::Cpu& s : c.siblings()) printf("%2d ", s.id()); 71 | printf(")%c", ++n % 8 == 0 ? 
'\n' : '\t'); 72 | } 73 | printf("\n"); 74 | 75 | printf("Initializing...\n"); 76 | 77 | // Using new so we can destruct the object before printing Done 78 | auto uap = new ghost::AgentProcess, 79 | ghost::GlobalConfig>(config); 80 | 81 | ghost::GhostHelper()->InitCore(); 82 | 83 | printf("Initialization complete, ghOSt active.\n"); 84 | // When `stdout` is directed to a terminal, it is newline-buffered. When 85 | // `stdout` is directed to a non-interactive device (e.g, a Python subprocess 86 | // pipe), it is fully buffered. Thus, in order for the Python script to read 87 | // the initialization message as soon as it is passed to `printf`, we need to 88 | // manually flush `stdout`. 89 | fflush(stdout); 90 | 91 | ghost::Notification exit; 92 | static bool first = true; 93 | ghost::GhostSignals::AddHandler(SIGINT, [&exit](int) { 94 | if (first) { 95 | exit.Notify(); 96 | first = false; 97 | return false; // We'll exit on subsequent signals. 98 | } 99 | return true; 100 | }); 101 | ghost::GhostSignals::AddHandler(SIGTERM, [&exit](int) { 102 | if (first) { 103 | exit.Notify(); 104 | first = false; 105 | return false; // We'll exit on subsequent signals. 
106 | } 107 | return true; 108 | }); 109 | 110 | // TODO: this is racy - uap could be deleted already 111 | ghost::GhostSignals::AddHandler(SIGUSR1, [uap](int) { 112 | uap->Rpc(ghost::EdfScheduler::kDebugRunqueue); 113 | return false; 114 | }); 115 | 116 | exit.WaitForNotification(); 117 | 118 | delete uap; 119 | 120 | printf("\nDone!\n"); 121 | return 0; 122 | } 123 | -------------------------------------------------------------------------------- /schedulers/fifo/centralized/fifo_agent.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #include "absl/debugging/symbolize.h" 14 | #include "absl/flags/parse.h" 15 | #include "lib/agent.h" 16 | #include "lib/channel.h" 17 | #include "lib/enclave.h" 18 | #include "lib/topology.h" 19 | #include "schedulers/fifo/centralized/fifo_scheduler.h" 20 | 21 | ABSL_FLAG(std::string, ghost_cpus, "1-5", "cpulist"); 22 | ABSL_FLAG(int32_t, globalcpu, -1, 23 | "Global cpu. If -1, then defaults to the first cpu in "); 24 | ABSL_FLAG(absl::Duration, preemption_time_slice, absl::InfiniteDuration(), 25 | "A task is preempted after running for this time slice (default = " 26 | "infinite time slice)"); 27 | 28 | namespace ghost { 29 | 30 | void ParseFifoConfig(FifoConfig* config) { 31 | CpuList ghost_cpus = 32 | MachineTopology()->ParseCpuStr(absl::GetFlag(FLAGS_ghost_cpus)); 33 | // One CPU for the spinning global agent and at least one other for running 34 | // scheduled ghOSt tasks. 
35 | CHECK_GE(ghost_cpus.Size(), 2); 36 | 37 | int globalcpu = absl::GetFlag(FLAGS_globalcpu); 38 | if (globalcpu < 0) { 39 | CHECK_EQ(globalcpu, -1); 40 | globalcpu = ghost_cpus.Front().id(); 41 | absl::SetFlag(&FLAGS_globalcpu, globalcpu); 42 | } 43 | CHECK(ghost_cpus.IsSet(globalcpu)); 44 | 45 | Topology* topology = MachineTopology(); 46 | config->topology_ = topology; 47 | config->cpus_ = ghost_cpus; 48 | config->global_cpu_ = topology->cpu(globalcpu); 49 | config->preemption_time_slice_ = absl::GetFlag(FLAGS_preemption_time_slice); 50 | } 51 | 52 | } // namespace ghost 53 | 54 | int main(int argc, char* argv[]) { 55 | absl::InitializeSymbolizer(argv[0]); 56 | 57 | absl::ParseCommandLine(argc, argv); 58 | 59 | ghost::FifoConfig config; 60 | ghost::ParseFifoConfig(&config); 61 | 62 | printf("Core map\n"); 63 | 64 | int n = 0; 65 | for (const ghost::Cpu& c : config.topology_->all_cores()) { 66 | printf("( "); 67 | for (const ghost::Cpu& s : c.siblings()) printf("%2d ", s.id()); 68 | printf(")%c", ++n % 8 == 0 ? '\n' : '\t'); 69 | } 70 | printf("\n"); 71 | 72 | printf("Initializing...\n"); 73 | 74 | // Using new so we can destruct the object before printing Done 75 | auto uap = new ghost::AgentProcess, 76 | ghost::FifoConfig>(config); 77 | 78 | ghost::GhostHelper()->InitCore(); 79 | 80 | printf("Initialization complete, ghOSt active.\n"); 81 | 82 | // When `stdout` is directed to a terminal, it is newline-buffered. When 83 | // `stdout` is directed to a non-interactive device (e.g, a Python subprocess 84 | // pipe), it is fully buffered. Thus, in order for the Python script to read 85 | // the initialization message as soon as it is passed to `printf`, we need to 86 | // manually flush `stdout`. 87 | fflush(stdout); 88 | 89 | ghost::Notification exit; 90 | ghost::GhostSignals::AddHandler(SIGINT, [&exit](int) { 91 | static bool first = true; // We only modify the first SIGINT. 
92 | 93 | if (first) { 94 | exit.Notify(); 95 | first = false; 96 | return false; // We'll exit on subsequent SIGINTs. 97 | } 98 | return true; 99 | }); 100 | 101 | // TODO: this is racy - uap could be deleted already 102 | ghost::GhostSignals::AddHandler(SIGUSR1, [uap](int) { 103 | uap->Rpc(ghost::FifoScheduler::kDebugRunqueue); 104 | return false; 105 | }); 106 | 107 | exit.WaitForNotification(); 108 | 109 | delete uap; 110 | 111 | printf("Done!\n"); 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /schedulers/fifo/per_cpu/fifo_agent.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include "absl/debugging/symbolize.h" 12 | #include "absl/flags/parse.h" 13 | #include "lib/agent.h" 14 | #include "lib/enclave.h" 15 | #include "schedulers/fifo/per_cpu/fifo_scheduler.h" 16 | 17 | ABSL_FLAG(std::string, ghost_cpus, "1-5", "cpulist"); 18 | ABSL_FLAG(std::string, enclave, "", "Connect to preexisting enclave directory"); 19 | 20 | namespace ghost { 21 | 22 | static void ParseAgentConfig(AgentConfig* config) { 23 | CpuList ghost_cpus = 24 | MachineTopology()->ParseCpuStr(absl::GetFlag(FLAGS_ghost_cpus)); 25 | CHECK(!ghost_cpus.Empty()); 26 | 27 | Topology* topology = MachineTopology(); 28 | config->topology_ = topology; 29 | config->cpus_ = ghost_cpus; 30 | std::string enclave = absl::GetFlag(FLAGS_enclave); 31 | if (!enclave.empty()) { 32 | int fd = open(enclave.c_str(), O_PATH); 33 | CHECK_GE(fd, 0); 34 | config->enclave_fd_ = fd; 35 | } 36 | } 37 | 38 | } // namespace ghost 39 | 40 | int main(int argc, char* argv[]) { 41 | absl::InitializeSymbolizer(argv[0]); 42 | absl::ParseCommandLine(argc, argv); 43 | 
44 | ghost::AgentConfig config; 45 | ghost::ParseAgentConfig(&config); 46 | 47 | printf("Initializing...\n"); 48 | 49 | // Using new so we can destruct the object before printing Done 50 | auto uap = new ghost::AgentProcess, 51 | ghost::AgentConfig>(config); 52 | 53 | ghost::GhostHelper()->InitCore(); 54 | printf("Initialization complete, ghOSt active.\n"); 55 | // When `stdout` is directed to a terminal, it is newline-buffered. When 56 | // `stdout` is directed to a non-interactive device (e.g., a Python subprocess 57 | // pipe), it is fully buffered. Thus, in order for the Python script to read 58 | // the initialization message as soon as it is passed to `printf`, we need to 59 | // manually flush `stdout`. 60 | fflush(stdout); 61 | 62 | ghost::Notification exit; 63 | ghost::GhostSignals::AddHandler(SIGINT, [&exit](int) { 64 | static bool first = true; // We only modify the first SIGINT. 65 | 66 | if (first) { 67 | exit.Notify(); 68 | first = false; 69 | return false; // We'll exit on subsequent SIGINTs. 
70 | } 71 | return true; 72 | }); 73 | 74 | // TODO: this is racy - uap could be deleted already 75 | ghost::GhostSignals::AddHandler(SIGUSR1, [uap](int) { 76 | uap->Rpc(ghost::FifoScheduler::kDebugRunqueue); 77 | return false; 78 | }); 79 | 80 | exit.WaitForNotification(); 81 | 82 | delete uap; 83 | 84 | printf("\nDone!\n"); 85 | 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /schedulers/flux/agent_flux.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include "absl/debugging/symbolize.h" 12 | #include "absl/flags/parse.h" 13 | #include "lib/agent.h" 14 | #include "lib/channel.h" 15 | #include "lib/enclave.h" 16 | #include "lib/topology.h" 17 | #include "schedulers/flux/flux_scheduler.h" 18 | 19 | ABSL_FLAG(std::string, enclave, "", "Connect to preexisting enclave directory"); 20 | 21 | int main(int argc, char* argv[]) { 22 | absl::InitializeSymbolizer(argv[0]); 23 | absl::ParseCommandLine(argc, argv); 24 | 25 | ghost::Topology* t = ghost::MachineTopology(); 26 | ghost::AgentConfig config(t, t->all_cpus()); 27 | std::string enclave = absl::GetFlag(FLAGS_enclave); 28 | if (!enclave.empty()) { 29 | int fd = open(enclave.c_str(), O_PATH); 30 | CHECK_GE(fd, 0); 31 | config.enclave_fd_ = fd; 32 | } 33 | 34 | auto uap = new ghost::AgentProcess, 35 | ghost::AgentConfig>(config); 36 | 37 | ghost::GhostHelper()->InitCore(); 38 | 39 | printf("Initialization complete, ghOSt active.\n"); 40 | fflush(stdout); 41 | 42 | ghost::Notification exit; 43 | ghost::GhostSignals::AddHandler(SIGINT, [&exit](int) { 44 | static bool first = true; 45 | if (first) { 46 | exit.Notify(); 47 | first = false; 48 | return false; 49 | 
} 50 | return true; 51 | }); 52 | 53 | exit.WaitForNotification(); 54 | 55 | delete uap; 56 | 57 | printf("\nDone!\n"); 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /schedulers/flux/flux_scheduler.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | 9 | #include "schedulers/flux/flux_scheduler.h" 10 | 11 | #include "absl/strings/str_format.h" 12 | #include "lib/flux.h" 13 | 14 | namespace ghost { 15 | 16 | // We only have one scheduler of each type, so id-to-type is pretty basic. 17 | int IdToType(int id) 18 | { 19 | switch (id) { 20 | case FLUX_SCHED_NONE: 21 | return FLUX_SCHED_TYPE_NONE; 22 | case FLUX_SCHED_ROCI: 23 | return FLUX_SCHED_TYPE_ROCI; 24 | case FLUX_SCHED_BIFF: 25 | return FLUX_SCHED_TYPE_BIFF; 26 | case FLUX_SCHED_IDLE: 27 | return FLUX_SCHED_TYPE_IDLE; 28 | default: 29 | return FLUX_SCHED_TYPE_NONE; 30 | } 31 | } 32 | 33 | FluxScheduler::FluxScheduler(Enclave* enclave, CpuList cpulist, 34 | const AgentConfig& config) 35 | : Scheduler(enclave, std::move(cpulist)), 36 | unused_channel_(1, /*node=*/0) { 37 | 38 | bpf_obj_ = flux_bpf__open(); 39 | CHECK_NE(bpf_obj_, nullptr); 40 | 41 | FluxCheckMaps(bpf_obj_); 42 | FluxSetProgTypes(bpf_obj_); 43 | FluxSetGlobals(bpf_obj_); 44 | 45 | CHECK_EQ(flux_bpf__load(bpf_obj_), 0); 46 | 47 | FluxRegisterProgs(bpf_obj_); 48 | 49 | cpu_data_ = static_cast(bpf_map__mmap(bpf_obj_->maps.cpu_data)); 50 | CHECK_NE(cpu_data_, MAP_FAILED); 51 | for (int i = 0; i < FLUX_MAX_CPUS; ++i) { 52 | cpu_data_[i].f.id = i; 53 | } 54 | struct flux_sched s; 55 | for (int i = 0; i < FLUX_NR_SCHEDS; i++) { 56 | memset(&s, 0, sizeof(struct flux_sched)); 57 | s.f.id = i; 58 | s.f.type = IdToType(i); 59 | // 
All idle *sched types* should have their nr_cpus_wanted set 60 | if (s.f.type == FLUX_SCHED_TYPE_IDLE) { 61 | s.f.nr_cpus_wanted = MachineTopology()->num_cpus(); 62 | } 63 | if (s.f.id == FLUX_SCHED_ROCI) { 64 | s.roci.primary_id = FLUX_SCHED_BIFF; 65 | s.roci.idle_id = FLUX_SCHED_IDLE; 66 | } 67 | CHECK_EQ(bpf_map_update_elem(bpf_map__fd(bpf_obj_->maps.schedulers), 68 | &i, &s, BPF_ANY), 0); 69 | } 70 | thread_data_ = static_cast( 71 | bpf_map__mmap(bpf_obj_->maps.thread_data)); 72 | CHECK_NE(thread_data_, MAP_FAILED); 73 | } 74 | 75 | FluxScheduler::~FluxScheduler() { 76 | bpf_map__munmap(bpf_obj_->maps.cpu_data, cpu_data_); 77 | bpf_map__munmap(bpf_obj_->maps.thread_data, thread_data_); 78 | flux_bpf__destroy(bpf_obj_); 79 | } 80 | 81 | void FluxScheduler::EnclaveReady() { 82 | enclave()->SetDeliverTicks(true); 83 | enclave()->SetDeliverCpuAvailability(true); 84 | // We learn about cpu availability via a message. Some cpus may currently be 85 | // available and idle, but will not generate a message until CFS runs on them. 86 | // Poke each cpu to speed up the process. 87 | // 88 | // Running a CFS task on a cpu will eventually result in an 89 | // unavailable->available edge when that cpu runs out of CFS tasks, and that 90 | // edge will generate a MSG_CPU_AVAILABLE. 91 | std::thread thread([this] { 92 | for (const Cpu& cpu : *enclave()->cpus()) { 93 | // Ignore errors. It's possible the agent is in a cgroup that doesn't 94 | // include all of the enclave cpus. The cpus we skip will eventually run 95 | // a CFS task, just not right away. 
96 | (void) GhostHelper()->SchedSetAffinity(Gtid::Current(), 97 | MachineTopology()->ToCpuList({cpu})); 98 | } 99 | }); 100 | thread.join(); 101 | 102 | WRITE_ONCE(bpf_obj_->bss->user_initialized, true); 103 | } 104 | 105 | void FluxScheduler::DiscoverTasks() { 106 | enclave()->DiscoverTasks(); 107 | } 108 | 109 | void FluxAgentTask::AgentThread() { 110 | gtid().assign_name("Agent:" + std::to_string(cpu().id())); 111 | 112 | SignalReady(); 113 | WaitForEnclaveReady(); 114 | 115 | while (!Finished()) { 116 | RunRequest* req = enclave()->GetRunRequest(cpu()); 117 | req->LocalYield(status_word().barrier(), /*flags=*/0); 118 | } 119 | } 120 | 121 | } // namespace ghost 122 | -------------------------------------------------------------------------------- /schedulers/flux/flux_scheduler.h: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_SCHEDULERS_FLUX_FLUX_SCHEDULER_H_ 8 | #define GHOST_SCHEDULERS_FLUX_FLUX_SCHEDULER_H_ 9 | 10 | #include 11 | #include 12 | 13 | #include "third_party/bpf/flux_bpf.h" 14 | #include "lib/agent.h" 15 | #include "lib/scheduler.h" 16 | #include "schedulers/flux/flux_bpf.skel.h" 17 | 18 | namespace ghost { 19 | 20 | class FluxScheduler : public Scheduler { 21 | public: 22 | explicit FluxScheduler(Enclave* enclave, CpuList cpulist, 23 | const AgentConfig& config); 24 | ~FluxScheduler() final; 25 | 26 | void EnclaveReady() final; 27 | void DiscoverTasks() final; 28 | Channel& GetDefaultChannel() final { return unused_channel_; }; 29 | 30 | private: 31 | LocalChannel unused_channel_; 32 | flux_bpf* bpf_obj_; 33 | flux_cpu* cpu_data_; 34 | flux_thread* thread_data_; 35 | }; 36 | 37 | class FluxAgentTask : public LocalAgent { 38 | public: 39 | FluxAgentTask(Enclave* enclave, Cpu 
cpu, FluxScheduler* flux_sched) 40 | : LocalAgent(enclave, cpu), flux_sched_(flux_sched) {} 41 | 42 | void AgentThread() override; 43 | Scheduler* AgentScheduler() const override { return flux_sched_; } 44 | 45 | private: 46 | FluxScheduler* flux_sched_; 47 | }; 48 | 49 | template 50 | class FullFluxAgent : public FullAgent { 51 | public: 52 | explicit FullFluxAgent(AgentConfig config) 53 | : FullAgent(config) { 54 | flux_sched_ = std::make_unique( 55 | &this->enclave_, *this->enclave_.cpus(), config); 56 | this->StartAgentTasks(); 57 | this->enclave_.Ready(); 58 | } 59 | 60 | ~FullFluxAgent() override { 61 | // Turn off the availability messages before fully tearing down. Once the 62 | // BPF program is removed, we won't filter the messages anymore, and we'll 63 | // get a few MSG_CPU_AVAILABLE/BUSY sent to userspace. That will overflow 64 | // our channel. On older kernels, that'd trigger a WARN_ON_ONCE. 65 | this->enclave_.SetDeliverCpuAvailability(false); 66 | this->TerminateAgentTasks(); 67 | } 68 | 69 | std::unique_ptr MakeAgent(const Cpu& cpu) override { 70 | return std::make_unique(&this->enclave_, cpu, 71 | flux_sched_.get()); 72 | } 73 | 74 | void RpcHandler(int64_t req, const AgentRpcArgs& args, 75 | AgentRpcResponse& response) override { 76 | switch (req) { 77 | default: 78 | response.response_code = -1; 79 | return; 80 | } 81 | } 82 | 83 | private: 84 | std::unique_ptr flux_sched_; 85 | }; 86 | 87 | } // namespace ghost 88 | 89 | #endif // GHOST_SCHEDULERS_FLUX_FLUX_SCHEDULER_H_ 90 | -------------------------------------------------------------------------------- /schedulers/sol/agent_sol.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 
13 | #include "absl/debugging/symbolize.h" 14 | #include "absl/flags/parse.h" 15 | #include "lib/agent.h" 16 | #include "lib/channel.h" 17 | #include "lib/enclave.h" 18 | #include "lib/topology.h" 19 | #include "schedulers/sol/sol_scheduler.h" 20 | 21 | ABSL_FLAG(std::string, ghost_cpus, "1-5", "cpulist"); 22 | ABSL_FLAG(int32_t, globalcpu, -1, 23 | "Global cpu. If -1, then defaults to the first cpu in "); 24 | ABSL_FLAG(absl::Duration, preemption_time_slice, absl::InfiniteDuration(), 25 | "A task is preempted after running for this time slice (default = " 26 | "infinite time slice)"); 27 | 28 | namespace ghost { 29 | // Fills `config` from the --ghost_cpus, --globalcpu and --preemption_time_slice flags. 30 | void ParseSolConfig(SolConfig* config) { 31 | int globalcpu = absl::GetFlag(FLAGS_globalcpu); 32 | CpuList ghost_cpus = 33 | MachineTopology()->ParseCpuStr(absl::GetFlag(FLAGS_ghost_cpus)); 34 | 35 | CHECK_GT(ghost_cpus.Size(), 1); 36 | // A negative --globalcpu must be exactly -1; it selects the first cpu in --ghost_cpus. 37 | if (globalcpu < 0) { 38 | CHECK_EQ(globalcpu, -1); 39 | globalcpu = ghost_cpus.Front().id(); 40 | absl::SetFlag(&FLAGS_globalcpu, globalcpu); 41 | } 42 | CHECK(ghost_cpus.IsSet(globalcpu)); // The global cpu must be one of the ghOSt cpus. 43 | Topology* topology = MachineTopology(); 44 | config->topology_ = topology; 45 | config->cpus_ = ghost_cpus; 46 | config->global_cpu_ = topology->cpu(globalcpu); 47 | config->numa_node_ = ghost_cpus.Front().numa_node(); 48 | config->preemption_time_slice_ = absl::GetFlag(FLAGS_preemption_time_slice); 49 | } 50 | 51 | } // namespace ghost 52 | 53 | int main(int argc, char* argv[]) { 54 | absl::InitializeSymbolizer(argv[0]); 55 | 56 | absl::ParseCommandLine(argc, argv); 57 | 58 | ghost::SolConfig config; 59 | ghost::ParseSolConfig(&config); 60 | 61 | printf("Core map\n"); 62 | 63 | int n = 0; 64 | for (const ghost::Cpu& c : config.topology_->all_cores()) { 65 | printf("( "); 66 | for (const ghost::Cpu& s : c.siblings()) printf("%2d ", s.id()); 67 | printf(")%c", ++n % 8 == 0 ? 
'\n' : '\t'); 68 | } 69 | printf("\n"); 70 | 71 | printf("Initializing...\n"); 72 | 73 | // Using new so we can destruct the object before printing Done 74 | auto uap = new ghost::AgentProcess, 75 | ghost::SolConfig>(config); 76 | 77 | ghost::GhostHelper()->InitCore(); 78 | 79 | printf("Initialization complete, ghOSt active.\n"); 80 | 81 | // When `stdout` is directed to a terminal, it is newline-buffered. When 82 | // `stdout` is directed to a non-interactive device (e.g., a Python subprocess 83 | // pipe), it is fully buffered. Thus, in order for the Python script to read 84 | // the initialization message as soon as it is passed to `printf`, we need to 85 | // manually flush `stdout`. 86 | fflush(stdout); 87 | 88 | ghost::Notification exit; 89 | ghost::GhostSignals::AddHandler(SIGINT, [&exit](int) { 90 | static bool first = true; // We only modify the first SIGINT. 91 | 92 | if (first) { 93 | exit.Notify(); 94 | first = false; 95 | return false; // We'll exit on subsequent SIGINTs. 96 | } 97 | return true; 98 | }); 99 | 100 | // TODO: this is racy - uap could be deleted already 101 | ghost::GhostSignals::AddHandler(SIGUSR1, [uap](int) { 102 | uap->Rpc(ghost::SolScheduler::kDebugRunqueue); 103 | uap->Rpc(ghost::SolScheduler::kDumpStats); 104 | return false; 105 | }); 106 | 107 | exit.WaitForNotification(); 108 | 109 | printf("%ld nsecs\n", uap->Rpc(ghost::SolScheduler::kGetSchedOverhead)); 110 | 111 | delete uap; 112 | 113 | printf("Done!\n"); 114 | return 0; 115 | } 116 | -------------------------------------------------------------------------------- /shared/shmem.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | // Provides an abstraction for constructing shared memory mappings between two 8 | // (or
more) processes. Mappings are huge-page backed, with synchronization for 9 | // versioning, and client initialization. 10 | // 11 | // Currently, a process can host an arbitrary number of shmem regions, but they 12 | // must each have a unique name. There is no limit on how many clients may 13 | // connect to a process's region. 14 | // 15 | // Connecting clients must have the ability to examine open file descriptors of 16 | // the remote process. Generally speaking, for the ghost use-case, this is not 17 | // a particular impingement as we expect processes to host shared memory with 18 | // their scheduling requirements and privileged agents to be the connecting 19 | // clients. 20 | #ifndef GHOST_SHARED_SHMEM_H 21 | #define GHOST_SHARED_SHMEM_H 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | #include "lib/base.h" 30 | 31 | namespace ghost { 32 | 33 | class GhostShmem { 34 | public: 35 | GhostShmem() {} 36 | // Constructs a new named shared memory region hosted by the current process. 37 | // It is guaranteed that the useful size will be at least "size". 38 | // REQUIRES: "name" must uniquely identify this region. 39 | GhostShmem(int64_t client_version, const char* name, size_t size); 40 | ~GhostShmem(); 41 | 42 | // Connects to the region identified by "name", hosted by the process "pid". 43 | // REQUIRES: "pid" hosting "name" must exist. 44 | bool Attach(int64_t client_version, const char* name, pid_t pid); 45 | 46 | // Called by clients when they are ready for remote connections to proceed. 47 | // REQUIRES: Must be called. 48 | void MarkReady(); 49 | 50 | // A raw byte mapping into the hosted shared memory region. 51 | inline char* bytes() { return static_cast<char*>(data_); } 52 | 53 | // This is the client usable bytes addressable via bytes(). It will be at 54 | // least as large as requested at time of construction. 55 | size_t size(); 56 | 57 | // This includes internal overheads and roundings on the mapping. 
58 | size_t absolute_size() const { return map_size_; } 59 | inline const void* absolute_start() const { return shmem_; } 60 | 61 | // The process that owns the shmem region. 62 | pid_t Owner() const; 63 | 64 | // Internal overheads that clients may optimize passed mapping sizes against. 65 | // This is useful as it represents the padding that should be considered if 66 | // trying to optimally pack against the huge-page backing. 67 | static size_t OverHeadbytes() { return kHeaderReservedBytes; } 68 | 69 | GhostShmem(const GhostShmem&) = delete; 70 | GhostShmem(GhostShmem&&) = delete; 71 | 72 | static GhostShmem* GetShmemBlob(size_t size); 73 | 74 | private: 75 | struct InternalHeader; 76 | 77 | void WaitForReady(); 78 | 79 | static int memfd_create(const char* name, unsigned int flags) { 80 | return syscall(__NR_memfd_create, name, flags); 81 | } 82 | void CreateShmem(int64_t client_version, const char* suffix, size_t size); 83 | bool ConnectShmem(int64_t client_version, const char* suffix, pid_t pid); 84 | 85 | // These members describe the shared memory area. 86 | void* shmem_ = nullptr; 87 | size_t map_size_ = 0; 88 | int memfd_ = -1; 89 | // These members map into the shared memory area. 
90 | InternalHeader* hdr_ = nullptr; 91 | void* data_ = nullptr; 92 | 93 | static int OpenGhostShmemFd(const char* suffix, pid_t pid); 94 | static constexpr int kHeaderReservedBytes = 4096; // PAGE_SIZE 95 | }; 96 | 97 | } // namespace ghost 98 | 99 | #endif // GHOST_SHARED_SHMEM_H 100 | -------------------------------------------------------------------------------- /tests/biff_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "gmock/gmock.h" 8 | #include "gtest/gtest.h" 9 | #include "schedulers/biff/biff_scheduler.h" 10 | 11 | namespace ghost { 12 | namespace { 13 | 14 | class BiffTest : public testing::Test { 15 | protected: 16 | static void SetUpTestSuite() { 17 | Topology* t = MachineTopology(); 18 | AgentConfig cfg(t, t->all_cpus()); 19 | 20 | uap_ = new AgentProcess, AgentConfig>(cfg); 21 | } 22 | 23 | static void TearDownTestSuite() { 24 | delete uap_; 25 | uap_ = nullptr; 26 | } 27 | 28 | static AgentProcess, AgentConfig>* uap_; 29 | }; 30 | 31 | AgentProcess, AgentConfig>* BiffTest::uap_; 32 | 33 | TEST_F(BiffTest, Simple) { 34 | RemoteThreadTester(/*num_threads=*/1).Run( 35 | [] { 36 | absl::SleepFor(absl::Milliseconds(10)); 37 | sched_yield(); 38 | absl::SleepFor(absl::Milliseconds(10)); 39 | } 40 | ); 41 | } 42 | 43 | TEST_F(BiffTest, SimpleMany) { 44 | RemoteThreadTester().Run( 45 | [] { 46 | absl::SleepFor(absl::Milliseconds(10)); 47 | sched_yield(); 48 | absl::SleepFor(absl::Milliseconds(10)); 49 | } 50 | ); 51 | } 52 | 53 | TEST_F(BiffTest, BusyRunFor) { 54 | RemoteThreadTester(/*num_threads=*/100).Run( 55 | [] { 56 | SpinFor(absl::Milliseconds(10)); 57 | } 58 | ); 59 | } 60 | 61 | } // namespace 62 | } // namespace ghost 63 | 64 | int main(int argc, char **argv) { 65 | 
testing::InitGoogleMock(&argc, argv); 66 | 67 | return RUN_ALL_TESTS(); 68 | } 69 | -------------------------------------------------------------------------------- /tests/capabilities_test.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #ifndef GHOST_TESTS_CAPABILITIES_TEST_H_ 8 | #define GHOST_TESTS_CAPABILITIES_TEST_H_ 9 | 10 | #include "gmock/gmock.h" 11 | #include "gtest/gtest.h" 12 | #include 13 | 14 | using ::testing::Eq; 15 | using ::testing::IsFalse; 16 | using ::testing::IsTrue; 17 | using ::testing::NotNull; 18 | 19 | // Privileged ghOSt syscalls may only be used by threads with the `CAP_SYS_NICE` 20 | // capability. 21 | constexpr cap_value_t kGhostCapability = CAP_SYS_NICE; 22 | 23 | // Sets `is_set` to true if the calling thread has the `CAP_SYS_NICE` capability 24 | // in its effective set. Sets `is_set` to false otherwise. 25 | // 26 | // Note that we pass `is_set` by reference rather than return a boolean so that 27 | // we can use `ASSERT_THAT` and `EXPECT_THAT` macros in this function. These 28 | // macros only work in functions with void return types. 29 | void NiceCapabilitySet(bool& is_set) { 30 | cap_t current = cap_get_proc(); 31 | ASSERT_THAT(current, NotNull()); 32 | cap_flag_value_t flag_value; 33 | ASSERT_THAT( 34 | cap_get_flag(current, kGhostCapability, CAP_EFFECTIVE, &flag_value), 35 | Eq(0)); 36 | EXPECT_THAT(cap_free(current), Eq(0)); 37 | is_set = (flag_value == CAP_SET); 38 | } 39 | 40 | // Asserts that the `CAP_SYS_NICE` capability is set. 41 | void AssertNiceCapabilitySet() { 42 | bool is_set = false; 43 | NiceCapabilitySet(is_set); 44 | ASSERT_THAT(is_set, IsTrue()); 45 | } 46 | 47 | // Asserts that the `CAP_SYS_NICE` capability is not set. 
48 | void AssertNiceCapabilityNotSet() { 49 | bool is_set = true; 50 | NiceCapabilitySet(is_set); 51 | ASSERT_THAT(is_set, IsFalse()); 52 | } 53 | 54 | // Drops the `CAP_SYS_NICE` capability from the calling thread's effective set. 55 | // Note that the calling thread must already hold the `CAP_SYS_NICE` capability 56 | // when it calls this function. 57 | void DropNiceCapability() { 58 | AssertNiceCapabilitySet(); 59 | 60 | cap_t current = cap_get_proc(); 61 | ASSERT_THAT(current, NotNull()); 62 | const cap_value_t cap_array[] = {kGhostCapability}; 63 | ASSERT_THAT( 64 | cap_set_flag(current, CAP_EFFECTIVE, /*ncaps=*/1, cap_array, CAP_CLEAR), 65 | Eq(0)); 66 | ASSERT_THAT(cap_set_proc(current), Eq(0)); 67 | EXPECT_THAT(cap_free(current), Eq(0)); 68 | 69 | AssertNiceCapabilityNotSet(); 70 | } 71 | 72 | #endif // GHOST_TESTS_CAPABILITIES_TEST_H_ 73 | -------------------------------------------------------------------------------- /tests/cfs_bpf_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include "gmock/gmock.h" 8 | #include "gtest/gtest.h" 9 | #include "schedulers/cfs_bpf/cfs_scheduler.h" 10 | 11 | namespace ghost { 12 | namespace { 13 | 14 | class CfsTest : public testing::Test { 15 | protected: 16 | static void SetUpTestSuite() { 17 | Topology* t = MachineTopology(); 18 | AgentConfig cfg(t, t->all_cpus()); 19 | 20 | uap_ = new AgentProcess, AgentConfig>(cfg); 21 | } 22 | 23 | static void TearDownTestSuite() { 24 | delete uap_; 25 | uap_ = nullptr; 26 | } 27 | 28 | static AgentProcess, AgentConfig>* uap_; 29 | }; 30 | 31 | AgentProcess, AgentConfig>* CfsTest::uap_; 32 | 33 | TEST_F(CfsTest, Simple) { 34 | GhostThread t(GhostThread::KernelScheduler::kGhost, [] { 35 | 
absl::SleepFor(absl::Milliseconds(10)); 36 | sched_yield(); 37 | }); 38 | 39 | t.Join(); 40 | } 41 | 42 | TEST_F(CfsTest, SimpleMany) { 43 | constexpr int kNumThreads = 1000; 44 | std::vector> threads; 45 | threads.reserve(kNumThreads); 46 | 47 | for (int i = 0; i < kNumThreads; ++i) { 48 | threads.push_back( 49 | std::make_unique 50 | (GhostThread::KernelScheduler::kGhost, [] { 51 | absl::SleepFor(absl::Milliseconds(10)); 52 | sched_yield(); 53 | absl::SleepFor(absl::Milliseconds(10)); 54 | })); 55 | } 56 | 57 | for (std::unique_ptr& t : threads) { 58 | t->Join(); 59 | } 60 | 61 | } 62 | 63 | TEST_F(CfsTest, BusyRunFor) { 64 | 65 | constexpr int kNumThreads = 1000; 66 | const absl::Duration d = absl::Milliseconds(10); 67 | 68 | std::vector> threads; 69 | threads.reserve(kNumThreads); 70 | 71 | for (int i = 0; i < kNumThreads; ++i) { 72 | threads.push_back( 73 | std::make_unique 74 | (GhostThread::KernelScheduler::kGhost, [&] { 75 | SpinFor(d); 76 | })); 77 | } 78 | 79 | for (std::unique_ptr& t : threads) { 80 | t->Join(); 81 | } 82 | 83 | 84 | } 85 | 86 | } // namespace 87 | } // namespace ghost 88 | 89 | int main(int argc, char **argv) { 90 | testing::InitGoogleMock(&argc, argv); 91 | 92 | return RUN_ALL_TESTS(); 93 | } 94 | -------------------------------------------------------------------------------- /tests/flux_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Use of this source code is governed by a BSD-style 4 | // license that can be found in the LICENSE file or at 5 | // https://developers.google.com/open-source/licenses/bsd 6 | 7 | #include 8 | 9 | #include "gmock/gmock.h" 10 | #include "gtest/gtest.h" 11 | #include "schedulers/flux/flux_scheduler.h" 12 | 13 | namespace ghost { 14 | namespace { 15 | 16 | class FluxTest : public testing::Test { 17 | protected: 18 | static void SetUpTestSuite() { 19 | Topology* t = MachineTopology(); 20 | AgentConfig cfg(t, 
t->all_cpus()); 21 | 22 | uap_ = new AgentProcess, AgentConfig>(cfg); 23 | } 24 | 25 | static void TearDownTestSuite() { 26 | delete uap_; 27 | uap_ = nullptr; 28 | } 29 | 30 | static AgentProcess, AgentConfig>* uap_; 31 | }; 32 | 33 | AgentProcess, AgentConfig>* FluxTest::uap_; 34 | 35 | TEST_F(FluxTest, Simple) { 36 | RemoteThreadTester(/*num_threads=*/1).Run( 37 | [] { 38 | absl::SleepFor(absl::Milliseconds(10)); 39 | sched_yield(); 40 | absl::SleepFor(absl::Milliseconds(10)); 41 | } 42 | ); 43 | } 44 | 45 | TEST_F(FluxTest, SimpleMany) { 46 | RemoteThreadTester().Run( 47 | [] { 48 | absl::SleepFor(absl::Milliseconds(10)); 49 | sched_yield(); 50 | absl::SleepFor(absl::Milliseconds(10)); 51 | } 52 | ); 53 | } 54 | 55 | TEST_F(FluxTest, BusyRunFor) { 56 | RemoteThreadTester(/*num_threads=*/100).Run( 57 | [] { 58 | SpinFor(absl::Milliseconds(10)); 59 | } 60 | ); 61 | } 62 | 63 | TEST_F(FluxTest, PrioChangeSelf) { 64 | RemoteThreadTester().Run( 65 | [] { 66 | EXPECT_EQ(setpriority(PRIO_PROCESS, 0, 5), 0); 67 | absl::SleepFor(absl::Milliseconds(10)); 68 | EXPECT_EQ(setpriority(PRIO_PROCESS, 0, 10), 0); 69 | sched_yield(); 70 | EXPECT_EQ(setpriority(PRIO_PROCESS, 0, 5), 0); 71 | } 72 | ); 73 | } 74 | 75 | TEST_F(FluxTest, PrioChangeRemote) { 76 | RemoteThreadTester().Run( 77 | [] { // ghost threads 78 | SpinFor(absl::Milliseconds(5)); 79 | sched_yield(); 80 | absl::SleepFor(absl::Milliseconds(5)); 81 | }, 82 | [](GhostThread* t) { // remote, per-thread work 83 | EXPECT_EQ(setpriority(PRIO_PROCESS, t->tid(), 5), 0); 84 | EXPECT_EQ(setpriority(PRIO_PROCESS, t->tid(), 10), 0); 85 | } 86 | ); 87 | } 88 | 89 | TEST_F(FluxTest, DepartedSelf) { 90 | RemoteThreadTester().Run( 91 | [] { // ghost threads 92 | absl::SleepFor(absl::Milliseconds(10)); 93 | const sched_param param{}; 94 | EXPECT_EQ(sched_setscheduler(/*pid=*/0, SCHED_OTHER, &param), 0); 95 | EXPECT_EQ(sched_getscheduler(/*pid=*/0), SCHED_OTHER); 96 | }, 97 | [](GhostThread* t) { // remote, per-thread work 98 | } 99 | 
); 100 | } 101 | 102 | TEST_F(FluxTest, DepartedRemote) { 103 | RemoteThreadTester().Run( 104 | [] { // ghost threads 105 | SpinFor(absl::Milliseconds(5)); 106 | sched_yield(); 107 | absl::SleepFor(absl::Milliseconds(5)); 108 | }, 109 | [](GhostThread* t) { // remote, per-thread work 110 | const sched_param param{}; 111 | EXPECT_EQ(sched_setscheduler(t->tid(), SCHED_OTHER, &param), 0); 112 | } 113 | ); 114 | } 115 | 116 | // Originally, I thought this was triggering a bug. Turns out it just takes a 117 | // long time with 1000 threads (~30 seconds on CONFIG=dbg in virtme). 118 | TEST_F(FluxTest, DepartedRemoteShortSleep) { 119 | RemoteThreadTester(/*num_threads=*/100).Run( 120 | [] { // ghost threads 121 | absl::SleepFor(absl::Nanoseconds(1)); 122 | }, 123 | [](GhostThread* t) { // remote, per-thread work 124 | const sched_param param{}; 125 | EXPECT_EQ(sched_setscheduler(t->tid(), SCHED_OTHER, &param), 0); 126 | } 127 | ); 128 | } 129 | 130 | } // namespace 131 | } // namespace ghost 132 | 133 | int main(int argc, char **argv) { 134 | testing::InitGoogleMock(&argc, argv); 135 | 136 | return RUN_ALL_TESTS(); 137 | } 138 | -------------------------------------------------------------------------------- /third_party/BUILD.bazel: -------------------------------------------------------------------------------- 1 | package(default_visibility = ["//visibility:public"]) 2 | 3 | # This BUILD file is necessary so that `//third_party` is a package that the 4 | # WORKSPACE file can reference. Without this BUILD file, Bazel will be unable to 5 | # pull in the project dependencies and compile the project. 
6 | 7 | exports_files([ 8 | "iovisor_bcc/bits.bpf.h", 9 | "iovisor_bcc/trace_helpers.h", 10 | "util/util.h", 11 | ]) 12 | -------------------------------------------------------------------------------- /third_party/bpf/BUILD: -------------------------------------------------------------------------------- 1 | # Note: If you modify this BUILD file, please contact jhumphri@ first to ensure 2 | # that you are not breaking the Copybara script. 3 | 4 | load("//:bpf/bpf.bzl", "bpf_program") 5 | 6 | package( 7 | default_applicable_licenses = ["//:license"], 8 | default_visibility = [ 9 | "//:__subpackages__", 10 | ], 11 | ) 12 | 13 | # We use the GPLv2 license for the eBPF code so that we can access kernel 14 | # functionality restricted to eBPF programs that are licensed under GPLv2. That 15 | # being said, keep in mind that all of this eBPF code is authored and owned by 16 | # Google. 17 | licenses(["restricted"]) 18 | 19 | exports_files( 20 | [ 21 | "biff_bpf.h", 22 | "cfs_bpf.h", 23 | "common.bpf.h", 24 | "edf.h", 25 | "flux_bpf.h", 26 | "pntring.bpf.h", 27 | "pntring_funcs.bpf.h", 28 | "schedfair.h", 29 | "schedlat.h", 30 | "schedrun.h", 31 | "topology.bpf.h", 32 | "schedghostidle.bpf.c", 33 | ], 34 | ) 35 | 36 | filegroup( 37 | name = "flux_infra", 38 | srcs = [ 39 | "flux_api.bpf.c", 40 | "flux_dispatch.bpf.c", 41 | "flux_header_bpf.h", 42 | ], 43 | ) 44 | 45 | filegroup( 46 | name = "flux_scheds", 47 | srcs = [ 48 | "biff_flux.bpf.c", 49 | "biff_flux_bpf.h", 50 | "idle_flux.bpf.c", 51 | "idle_flux_bpf.h", 52 | "prov_flux.bpf.c", 53 | "prov_flux_bpf.h", 54 | "roci_flux.bpf.c", 55 | "roci_flux_bpf.h", 56 | ], 57 | ) 58 | 59 | bpf_program( 60 | name = "biff_bpf", 61 | src = "biff.bpf.c", 62 | hdrs = [ 63 | "biff_bpf.h", 64 | "common.bpf.h", 65 | "topology.bpf.h", 66 | "//:abi/latest/kernel/ghost.h", 67 | "//:lib/ghost_uapi.h", 68 | ], 69 | bpf_object = "biff_bpf.o", 70 | ) 71 | 72 | bpf_program( 73 | name = "cfs_bpf", 74 | src = "cfs.bpf.c", 75 | hdrs = [ 76 | 
"cfs_bpf.h", 77 | "common.bpf.h", 78 | "//:abi/latest/kernel/ghost.h", 79 | "//:arr_structs", 80 | "//:lib/ghost_uapi.h", 81 | ], 82 | bpf_object = "cfs_bpf.o", 83 | ) 84 | 85 | bpf_program( 86 | name = "edf_bpf", 87 | src = "edf.bpf.c", 88 | hdrs = [ 89 | "common.bpf.h", 90 | "edf.h", 91 | "//:abi/latest/kernel/ghost.h", 92 | "//:lib/ghost_uapi.h", 93 | ], 94 | bpf_object = "edf_bpf.o", 95 | ) 96 | 97 | bpf_program( 98 | name = "flux_bpf", 99 | src = "flux.bpf.c", 100 | hdrs = [ 101 | "common.bpf.h", 102 | "flux_bpf.h", 103 | ":flux_infra", 104 | ":flux_scheds", 105 | "//:abi/latest/kernel/ghost.h", 106 | "//:arr_structs", 107 | "//:lib/ghost_uapi.h", 108 | ], 109 | bpf_object = "flux_bpf.o", 110 | ) 111 | 112 | bpf_program( 113 | name = "schedclasstop_bpf", 114 | src = "schedclasstop.bpf.c", 115 | hdrs = [ 116 | "common.bpf.h", 117 | ], 118 | bpf_object = "schedclasstop_bpf.o", 119 | ) 120 | 121 | bpf_program( 122 | name = "schedfair_bpf", 123 | src = "schedfair.bpf.c", 124 | hdrs = [ 125 | "common.bpf.h", 126 | "schedfair.h", 127 | "//third_party:iovisor_bcc/bits.bpf.h", 128 | ], 129 | bpf_object = "schedfair_bpf.o", 130 | ) 131 | 132 | bpf_program( 133 | name = "schedlat_bpf", 134 | src = "schedlat.bpf.c", 135 | hdrs = [ 136 | "common.bpf.h", 137 | "schedlat.h", 138 | "//third_party:iovisor_bcc/bits.bpf.h", 139 | ], 140 | bpf_object = "schedlat_bpf.o", 141 | ) 142 | 143 | bpf_program( 144 | name = "schedrun_bpf", 145 | src = "schedrun.bpf.c", 146 | hdrs = [ 147 | "common.bpf.h", 148 | "schedrun.h", 149 | "//third_party:iovisor_bcc/bits.bpf.h", 150 | ], 151 | bpf_object = "schedrun_bpf.o", 152 | ) 153 | 154 | bpf_program( 155 | name = "test_bpf", 156 | src = "test.bpf.c", 157 | hdrs = [ 158 | "common.bpf.h", 159 | "//:abi/latest/kernel/ghost.h", 160 | "//:lib/ghost_uapi.h", 161 | ], 162 | bpf_object = "test_bpf.o", 163 | ) 164 | -------------------------------------------------------------------------------- /third_party/bpf/biff_bpf.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License 6 | * version 2 as published by the Free Software Foundation. 7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | */ 13 | 14 | #ifndef GHOST_LIB_BPF_BPF_BIFF_BPF_H_ 15 | #define GHOST_LIB_BPF_BPF_BIFF_BPF_H_ 16 | 17 | #ifndef __BPF__ 18 | #include 19 | #endif 20 | 21 | #define BIFF_MAX_CPUS 1024 22 | #define BIFF_MAX_GTIDS 65536 23 | 24 | /* 25 | * The array map of these, called `cpu_data`, can be mmapped by userspace. 26 | */ 27 | struct biff_bpf_cpu_data { 28 | uint64_t current; 29 | uint64_t cpu_seqnum; 30 | bool available; 31 | } __attribute__((aligned(64))); 32 | 33 | /* 34 | * bpf can quickly access hash maps, but userspace can't. Ghost already gives 35 | * us the "status_word region" (SWR), which is an mmappable file in ghostfs that 36 | * exports read-only data from the kernel. Every task has a status word ID 37 | * (identifier for SWR) and the index within the SWR. 38 | * 39 | * The sw_data is logically an extension of the status word. It is read-write 40 | * by userspace and bpf. 41 | * 42 | * For each SW region (and there is 1, with BIFF_MAX_GTIDS slots), there is a 43 | * corresponding bpf array map, called `sw_data`, with the same number of 44 | * "words", such that given a task's sw index, we can find its sw_data. In bpf, 45 | * the index is stored in struct task_sw_info and is maintained by bpf-msg. 46 | * 47 | * Since userspace doesn't receive messages, it will have to scan the SWR to 48 | * discover tasks and their SW {id, index} pairs. 
(You can start scanning from 49 | * the last-new spot, since the kernel allocates linearly, with wrapping.) 50 | * 51 | * This may seem like an extra level of indirection and pointer chasing, but bpf 52 | * autogenerates the array map access code, so even if we don't use sw_data from 53 | * userspace yet, it's not hard to have it ready. 54 | * 55 | * aligned(8) since this is a bpf map value. 56 | */ 57 | struct biff_bpf_sw_data { 58 | uint64_t ran_at; 59 | uint64_t ran_until; 60 | uint64_t runnable_at; 61 | uint64_t parent; 62 | } __attribute__((aligned(8))); 63 | 64 | 65 | #endif // GHOST_LIB_BPF_BPF_BIFF_BPF_H_ 66 | -------------------------------------------------------------------------------- /third_party/bpf/biff_flux_bpf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License 6 | * version 2 as published by the Free Software Foundation. 7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 
12 | */ 13 | 14 | #ifndef GHOST_LIB_BPF_BPF_BIFF_FLUX_BPF_H_ 15 | #define GHOST_LIB_BPF_BPF_BIFF_FLUX_BPF_H_ 16 | 17 | #ifndef __BPF__ 18 | #include 19 | #endif 20 | 21 | #include "lib/queue.bpf.h" 22 | 23 | struct biff_flux_sched { 24 | struct arr_list rq; 25 | }; 26 | 27 | struct biff_flux_cpu { 28 | uint64_t current; 29 | }; 30 | 31 | struct biff_flux_thread { 32 | uint64_t ran_at; 33 | uint64_t ran_until; 34 | uint64_t runnable_at; 35 | struct arr_list_entry link; 36 | bool enqueued; 37 | bool times_up; 38 | int cpu; 39 | }; 40 | 41 | #endif // GHOST_LIB_BPF_BPF_BIFF_FLUX_BPF_H_ 42 | -------------------------------------------------------------------------------- /third_party/bpf/cfs_bpf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License 6 | * version 2 as published by the Free Software Foundation. 7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | */ 13 | 14 | #ifndef GHOST_BPF_BPF_CFS_BPF_H_ 15 | #define GHOST_BPF_BPF_CFS_BPF_H_ 16 | 17 | #ifndef __BPF__ 18 | #include 19 | #endif 20 | 21 | #include "lib/queue.bpf.h" 22 | 23 | #define CFS_MAX_CPUS 1024 24 | #define CFS_MAX_GTIDS 65536 25 | 26 | /* 27 | * The array map of these, called `cpu_data`, can be mmapped by userspace. 28 | */ 29 | struct cfs_bpf_cpu_data { 30 | uint64_t current; 31 | uint64_t cpu_seqnum; 32 | bool available; 33 | } __attribute__((aligned(64))); 34 | 35 | /* 36 | * Per-cpu runqueue for CFS using Linked list. 
37 | */ 38 | struct cfs_bpf_rq { 39 | uint64_t current; 40 | uint64_t weight; 41 | uint64_t nr_running; 42 | uint64_t min_vruntime; 43 | struct arr_list rq_root; 44 | #ifdef __BPF__ 45 | struct bpf_spin_lock lock; 46 | #else 47 | uint32_t lock; 48 | #endif 49 | }__attribute__((aligned(64))); 50 | 51 | /* 52 | * Thread struct to store the values required for cfs tasks. Think of this as 53 | * the same as a task struct for cfs. It brings its own memory for the runqueue 54 | * (LL). 55 | * aligned(8) since this is a bpf map value. 56 | */ 57 | struct cfs_bpf_thread { 58 | uint64_t gtid; 59 | uint64_t task_barrier; 60 | uint64_t ran_at; 61 | uint64_t ran_until; 62 | uint64_t runnable_at; 63 | uint64_t weight; 64 | uint64_t real_time; 65 | uint64_t sum_exec_runtime; 66 | uint64_t prev_sum_exec_runtime; 67 | uint64_t vruntime; 68 | uint64_t on_rq; 69 | struct arr_list_entry next_task; 70 | } __attribute__((aligned(8))); 71 | 72 | 73 | 74 | 75 | 76 | #endif // GHOST_BPF_BPF_CFS_BPF_H_ 77 | -------------------------------------------------------------------------------- /third_party/bpf/edf.bpf.c: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // This program is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU General Public License 5 | // version 2 as published by the Free Software Foundation. 6 | // 7 | // This program is distributed in the hope that it will be useful, 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | // GNU General Public License for more details. 
11 | 12 | #include 13 | 14 | // clang-format off 15 | #include 16 | #include "libbpf/bpf_helpers.h" 17 | #include "libbpf/bpf_tracing.h" 18 | // clang-format on 19 | 20 | #include "lib/ghost_uapi.h" 21 | #include "third_party/bpf/common.bpf.h" 22 | #include "third_party/bpf/edf.h" 23 | 24 | bool skip_tick = false; 25 | 26 | /* max_entries is patched at runtime to num_possible_cpus */ 27 | struct { 28 | __uint(type, BPF_MAP_TYPE_ARRAY); 29 | __uint(max_entries, 1024); 30 | __type(key, u32); 31 | __type(value, struct edf_bpf_per_cpu_data); 32 | __uint(map_flags, BPF_F_MMAPABLE); 33 | } cpu_data SEC(".maps"); 34 | 35 | SEC("ghost_sched/pnt") 36 | int edf_pnt(struct bpf_ghost_sched *ctx) 37 | { 38 | return 0; 39 | } 40 | 41 | /* 42 | * You have to play games to get the compiler to not modify the context pointer 43 | * (msg). You can load X bytes off a ctx, but if you add to ctx, then load, 44 | * you'll get the dreaded: "dereference of modified ctx ptr" error. 45 | * 46 | * You can also sprinkle asm volatile ("" ::: "memory") to help reduce compiler 47 | * optimizations on the context. 48 | */ 49 | static void __attribute__((noinline)) handle_yield(struct bpf_ghost_msg *msg) 50 | { 51 | struct ghost_msg_payload_task_yield *yield = &msg->yield; 52 | 53 | yield->agent_data = 1; 54 | } 55 | 56 | static void __attribute__((noinline)) handle_wakeup(struct bpf_ghost_msg *msg) 57 | { 58 | struct ghost_msg_payload_task_wakeup *wakeup = &msg->wakeup; 59 | 60 | wakeup->agent_data = 1; 61 | } 62 | 63 | SEC("ghost_msg/msg_send") 64 | int edf_msg_send(struct bpf_ghost_msg *msg) 65 | { 66 | switch (msg->type) { 67 | case MSG_TASK_WAKEUP: 68 | handle_wakeup(msg); 69 | break; 70 | case MSG_TASK_YIELD: 71 | handle_yield(msg); 72 | break; 73 | case MSG_CPU_TICK: 74 | if (skip_tick) 75 | return 1; 76 | break; 77 | case MSG_CPU_AGENT_BLOCKED: 78 | case MSG_CPU_AGENT_WAKEUP: 79 | /* 80 | * Suppress these messages. 
Having this in BPF ensures that 81 | * our vmlinux.h knows about these message types. 82 | */ 83 | return 1; 84 | } 85 | 86 | return 0; 87 | } 88 | 89 | char LICENSE[] SEC("license") = "GPL"; 90 | -------------------------------------------------------------------------------- /third_party/bpf/edf.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Google LLC 2 | * 3 | * This program is free software; you can redistribute it and/or 4 | * modify it under the terms of the GNU General Public License 5 | * version 2 as published by the Free Software Foundation. 6 | * 7 | * This program is distributed in the hope that it will be useful, 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | * GNU General Public License for more details. 11 | */ 12 | 13 | #ifndef GHOST_LIB_BPF_BPF_EDF_H_ 14 | #define GHOST_LIB_BPF_BPF_EDF_H_ 15 | 16 | #ifndef __BPF__ 17 | #include 18 | #endif 19 | 20 | struct edf_bpf_per_cpu_data { 21 | uint8_t example_bool; 22 | } __attribute__((aligned(64))); 23 | 24 | #endif // GHOST_LIB_BPF_BPF_EDF_H_ 25 | -------------------------------------------------------------------------------- /third_party/bpf/flux.bpf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License 6 | * version 2 or later as published by the Free Software Foundation. 7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 
12 | */ 13 | 14 | #include 15 | 16 | // clang-format off 17 | #include 18 | #include "libbpf/bpf_helpers.h" 19 | #include "libbpf/bpf_tracing.h" 20 | // clang-format on 21 | 22 | #include "lib/ghost_uapi.h" 23 | #include "third_party/bpf/common.bpf.h" 24 | #include "third_party/bpf/flux_bpf.h" 25 | 26 | #include 27 | 28 | struct { 29 | __uint(type, BPF_MAP_TYPE_ARRAY); 30 | __uint(max_entries, FLUX_NR_SCHEDS); 31 | __type(key, u32); 32 | __type(value, struct flux_sched); 33 | } schedulers SEC(".maps"); 34 | 35 | static inline struct flux_sched *get_sched(int id) 36 | { 37 | return bpf_map_lookup_elem(&schedulers, &id); 38 | } 39 | 40 | static inline int get_parent_id(struct flux_sched *s) 41 | { 42 | if (s->f.id == FLUX_SCHED_ROCI) 43 | return FLUX_SCHED_NONE; 44 | return FLUX_SCHED_ROCI; 45 | } 46 | 47 | /* 48 | * The 'tier' is where a scheduler is in the hierarchy of schedulers. Since 49 | * we're in BPF, this is hardcoded: Roci is at top, with biff and idle below. 50 | * 51 | * Schedulers can preempt their cpus, and you can have preemptions at every tier 52 | * concurrently. e.g. 53 | * - biff can preempt its own cpu to kick a thread off cpu (tier = 2) 54 | * - roci can preempt that cpu to kick biff off (tier = 1) 55 | * - the kernel can preempt the cpu completely (availability change, tier = 0) 56 | * 57 | * When preempt_to is 3 (FLUX_MAX_NR_TIERS, aka FLUX_TIER_NO_PREEMPT), there are 58 | * no preemption requests. 59 | * 60 | * Keep in mind that it's always OK for us to preempt a cpu. If there's some 61 | * corner case where we accidentally preempt a cpu unintentionally, that's fine. 62 | * The schedulers will just reallocate it. 63 | * 64 | * Quick example: roci on cpu A wants to preempt cpu B. It does its 65 | * bookkeeping, plans to preempt, then calls flux_preempt_cpu. At that point, 66 | * the kernel preempts the cpu, then reallocates it, and the cpu is roci's 67 | * again. Then cpu A writes preempt_to and sends the IPI. 
Next time we run 68 | * PNT, we'll preempt that cpu up to roci, which can then hand it back to 69 | * biff/idle/whoever. 70 | */ 71 | 72 | #define FLUX_MAX_NR_TIERS 3 73 | 74 | static inline int sched_id_to_tier(int id) 75 | { 76 | switch (id) { 77 | case FLUX_SCHED_NONE: 78 | return 0; 79 | case FLUX_SCHED_ROCI: 80 | return 1; 81 | case FLUX_SCHED_BIFF: 82 | case FLUX_SCHED_IDLE: 83 | return 2; 84 | }; 85 | return 0; 86 | } 87 | 88 | static int new_thread_sched_id(struct ghost_msg_payload_task_new *new) 89 | { 90 | return FLUX_SCHED_BIFF; 91 | } 92 | 93 | static int top_tier_sched_id(void) 94 | { 95 | return FLUX_SCHED_ROCI; 96 | } 97 | 98 | #define __gen_thread_op_cases(op_type, op, sched, ...) \ 99 | case FLUX_SCHED_TYPE_BIFF: \ 100 | op_type(biff, op)(sched, __VA_ARGS__); \ 101 | break; \ 102 | 103 | #define __gen_cpu_op_cases(op_type, op, sched, ...) \ 104 | case FLUX_SCHED_TYPE_ROCI: \ 105 | op_type(roci, op)(sched, __VA_ARGS__); \ 106 | break; \ 107 | case FLUX_SCHED_TYPE_BIFF: \ 108 | op_type(biff, op)(sched, __VA_ARGS__); \ 109 | break; \ 110 | case FLUX_SCHED_TYPE_IDLE: \ 111 | op_type(idle, op)(sched, __VA_ARGS__); \ 112 | break; \ 113 | 114 | 115 | #include "third_party/bpf/flux_dispatch.bpf.c" 116 | 117 | /********************* SCHED OPS *********************/ 118 | 119 | #include "third_party/bpf/roci_flux.bpf.c" 120 | #include "third_party/bpf/biff_flux.bpf.c" 121 | #include "third_party/bpf/idle_flux.bpf.c" 122 | 123 | #include "third_party/bpf/flux_api.bpf.c" 124 | -------------------------------------------------------------------------------- /third_party/bpf/flux_bpf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License 6 | * version 2 as published by the Free Software Foundation. 
7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | */ 13 | 14 | #ifndef GHOST_LIB_BPF_BPF_FLUX_BPF_H_ 15 | #define GHOST_LIB_BPF_BPF_FLUX_BPF_H_ 16 | 17 | #include "third_party/bpf/biff_flux_bpf.h" 18 | #include "third_party/bpf/flux_header_bpf.h" 19 | #include "third_party/bpf/idle_flux_bpf.h" 20 | #include "third_party/bpf/roci_flux_bpf.h" 21 | 22 | struct flux_sched { 23 | struct __flux_sched f; 24 | 25 | #ifdef __BPF__ 26 | /* 27 | * bpf_spin_lock is not available in userspace. 28 | * The sizeof == 32 is UAPI and statically asserted in flux_pnt. 29 | */ 30 | struct bpf_spin_lock lock; 31 | #else 32 | uint32_t lock; 33 | #endif 34 | union { 35 | struct roci_flux_sched roci; 36 | struct biff_flux_sched biff; 37 | struct idle_flux_sched idle; 38 | }; 39 | } __attribute__((aligned(8))); 40 | /* aligned(8) since this is a bpf map value. */ 41 | 42 | enum { 43 | FLUX_SCHED_NONE, 44 | FLUX_SCHED_ROCI, 45 | FLUX_SCHED_BIFF, 46 | FLUX_SCHED_IDLE, 47 | FLUX_NR_SCHEDS, 48 | }; 49 | 50 | enum { 51 | FLUX_SCHED_TYPE_NONE, 52 | FLUX_SCHED_TYPE_ROCI, 53 | FLUX_SCHED_TYPE_BIFF, 54 | FLUX_SCHED_TYPE_IDLE, 55 | FLUX_NR_SCHED_TYPES, 56 | }; 57 | 58 | struct flux_cpu { 59 | struct __flux_cpu f; 60 | 61 | /* 62 | * A cpu can be used by many schedulers concurrently, i.e. roci and biff 63 | * can both use cpu fields, since roci allocs the cpu to biff. 64 | * 65 | * Additionally, there could be multiple instances of biff. Even if you 66 | * try to ensure no scheduler has a descendent of the same type, you 67 | * still have a problem: schedulers might use the cpu struct even if 68 | * the cpu is not allocated to them. 
69 | * 70 | * It seems like a simple rule: "don't use your blob in the cpu struct 71 | * if you no longer have it", however remember that in ghost, certain 72 | * messages happen after a context switch! e.g. by the time we run 73 | * flux_thread_preempted() (which resolves to biff_thread_preempted()), 74 | * the cpu was already taken away from that instance of biff, and 75 | * possibly allocated to another instance of biff, which is also using 76 | * the biff fields! 77 | * 78 | * The fix is to have an array, indexed by sched_id, which is unique for 79 | * multiple instances of a scheduler. i.e. each biff gets their own 80 | * sched_id and thus their own struct. That way, every scheduler can 81 | * touch *their part* of the cpu, even if they no longer have the cpu 82 | * allocated. 83 | * 84 | * However, the rule remains that schedulers cannot touch the __flux_cpu 85 | * unless they own the cpu. 86 | */ 87 | union { 88 | struct roci_flux_cpu roci; 89 | struct biff_flux_cpu biff; 90 | struct idle_flux_cpu idle; 91 | } __s[FLUX_NR_SCHEDS]; 92 | } __attribute__((aligned(64))); 93 | /* aligned(64) for per-cpu caching */ 94 | 95 | struct flux_thread { 96 | struct __flux_thread f; 97 | 98 | /* A thread belongs to a single scheduler at a time. */ 99 | union { 100 | struct biff_flux_thread biff; 101 | }; 102 | } __attribute__((aligned(8))); 103 | /* aligned(8) since this is a bpf map value. */ 104 | 105 | #endif // GHOST_LIB_BPF_BPF_FLUX_BPF_H_ 106 | -------------------------------------------------------------------------------- /third_party/bpf/ghost_shared_bpf.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // This program is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU General Public License 5 | // version 2 as published by the Free Software Foundation. 
6 | // 7 | // This program is distributed in the hope that it will be useful, 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | // GNU General Public License for more details. 11 | 12 | #ifndef GHOST_LIB_BPF_GHOST_SHARED_BPF_H_ 13 | #define GHOST_LIB_BPF_GHOST_SHARED_BPF_H_ 14 | 15 | // Keep this file's structs in sync with bpf/ghost_shared.h. 16 | // We need different headers for BPF and C programs due to various Google3 17 | // reasons. 18 | 19 | struct ghost_per_cpu_data { 20 | __u8 want_tick; 21 | } __attribute__((aligned(64))); 22 | 23 | #endif // GHOST_LIB_BPF_GHOST_SHARED_BPF_H_ 24 | -------------------------------------------------------------------------------- /third_party/bpf/idle_flux.bpf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License 6 | * version 2 or later as published by the Free Software Foundation. 7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | */ 13 | 14 | /* Idle scheduler implementation for Flux. */ 15 | 16 | static void idle_request_for_cpus(struct flux_sched *i, int child_id, 17 | int nr_cpus, int *ret) 18 | { 19 | /* Never called, we have no children. */ 20 | } 21 | 22 | static void idle_cpu_allocated(struct flux_sched *i, struct flux_cpu *cpu) 23 | { 24 | /* Don't care. */ 25 | } 26 | 27 | static void idle_cpu_returned(struct flux_sched *i, int child_id, 28 | struct flux_cpu *cpu) 29 | { 30 | /* Never called, we have no children. 
*/ 31 | } 32 | 33 | static void idle_cpu_preempted(struct flux_sched *i, int child_id, 34 | struct flux_cpu *cpu) 35 | { 36 | /* Don't care. */ 37 | } 38 | 39 | static void idle_cpu_preemption_completed(struct flux_sched *i, int child_id, 40 | struct flux_cpu *cpu) 41 | { 42 | /* Don't care. */ 43 | } 44 | 45 | static void idle_cpu_ticked(struct flux_sched *i, int child_id, 46 | struct flux_cpu *cpu) 47 | { 48 | /* Don't care. */ 49 | } 50 | 51 | static void idle_pick_next_task(struct flux_sched *i, struct flux_cpu *cpu, 52 | struct bpf_ghost_sched *ctx) 53 | { 54 | flux_run_idle(cpu, ctx); 55 | } 56 | -------------------------------------------------------------------------------- /third_party/bpf/idle_flux_bpf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License 6 | * version 2 as published by the Free Software Foundation. 7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | */ 13 | 14 | #ifndef GHOST_LIB_BPF_BPF_IDLE_FLUX_BPF_H_ 15 | #define GHOST_LIB_BPF_BPF_IDLE_FLUX_BPF_H_ 16 | 17 | #ifndef __BPF__ 18 | #include 19 | #endif 20 | 21 | struct idle_flux_sched { 22 | uint8_t thanks_cplusplus; /* no zero-length structs... */ 23 | }; 24 | 25 | struct idle_flux_cpu { 26 | uint8_t thanks_cplusplus; /* no zero-length structs... 
*/ 27 | }; 28 | 29 | #endif // GHOST_LIB_BPF_BPF_IDLE_FLUX_BPF_H_ 30 | -------------------------------------------------------------------------------- /third_party/bpf/prov_flux_bpf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License 6 | * version 2 as published by the Free Software Foundation. 7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 12 | */ 13 | 14 | #ifndef GHOST_LIB_BPF_BPF_PROV_FLUX_BPF_H_ 15 | #define GHOST_LIB_BPF_BPF_PROV_FLUX_BPF_H_ 16 | 17 | #ifndef __BPF__ 18 | #include 19 | #endif 20 | 21 | #include "lib/queue.bpf.h" 22 | 23 | /* 24 | * Prov: a provisioning scheduler. 25 | * 26 | * We have three children: prio, next, and last. 27 | * 28 | * The policy is to give prio up to max_nr_prio_cpus, preferring to pick cpus 29 | * flagged "priority". Else take from last, then next. Next takes from last. 30 | * 31 | * max_nr_prio_cpus and the per-cpu priority fields are configured by userspace. 32 | * You can do things like set max_nr_prio_cpus = 10, and pick your ten favorite 33 | * cpus (e.g. sharing a LLC). Prio will get those 10, assuming they are 34 | * available to us at all (kernel CFS or our parent could have them instead). 35 | * If those chosen 10 aren't available, we'll find non-priority cpus to give you 36 | * instead. 37 | * 38 | * If prio has a cpu that isn't priority and there are available priority cpus, 39 | * we'll preempt prio (on timer tick) and move it to its desired cpu. It's a 40 | * tradeoff - if you don't do that, prio will get scattered around the machine. 
41 | * Note that if no cpus are marked priority, prio will just get any 42 | * max_nr_prio_cpus. 43 | * 44 | * max_nr_prio_cpus could be changed at runtime, or we can make it a function of 45 | * our cpus (future work). Don't change cpu->priority at runtime without adding 46 | * some other state tracking bools. 47 | * 48 | * In the original version of Prov, priority was an int and these were stored in 49 | * a tree. However, the AVL code is expensive in terms of instructions, and it 50 | * was really easy to blow out of our 1 million instruction budget... 51 | */ 52 | 53 | struct prov_poke_tracker { 54 | uint64_t threshold; /* how many usec between pokes */ 55 | uint64_t poked_at; /* last time we poked, in usec */ 56 | }; 57 | 58 | struct prov_flux_sched { 59 | unsigned int prio_id; 60 | unsigned int next_id; 61 | unsigned int last_id; 62 | 63 | unsigned int max_nr_prio_cpus; 64 | 65 | /* 66 | * The first place prio looks for a victim. These cpus are next's and 67 | * last's cpus. 68 | * 69 | * There is a window of time when a priority cpu is granted to us 70 | * (prov), but not granted to any child scheduler yet. It won't be on 71 | * this list. If prio has an outstanding nr_cpus_wanted, when we get to 72 | * PNT, we'll hand out this cpu. (Recall that cpu_grant happens in 73 | * PNT). It's possible that there is a concurrent request on another 74 | * cpu that won't see this newly-granted cpu, and we may give out a 75 | * non-priority cpu to prio when this cpu would have been better. I'm 76 | * fine with that. 
77 | */ 78 | struct arr_list priority_cpus; 79 | 80 | struct arr_list nexts_cpus; 81 | struct arr_list lasts_cpus; 82 | 83 | struct prov_poke_tracker prio_poke; 84 | struct prov_poke_tracker next_poke; 85 | 86 | /* debug stats, disabled at load time if prov_debug_stats is false */ 87 | uint64_t prio_grants; 88 | uint64_t next_grants; 89 | uint64_t last_grants; 90 | 91 | uint64_t prio_self_preempts; 92 | uint64_t next_self_preempts; 93 | uint64_t last_self_preempts; 94 | 95 | uint64_t prio_ipi_preempts; 96 | uint64_t next_ipi_preempts; 97 | uint64_t last_ipi_preempts; 98 | }; 99 | 100 | struct prov_flux_cpu { 101 | struct arr_list_entry prio_link; 102 | struct arr_list_entry child_link; 103 | bool priority; 104 | /* 105 | * preempt_pending is an earmark/signal that we already removed the cpu 106 | * from the appropriate child list(s). 107 | */ 108 | bool preempt_pending; 109 | unsigned int owning_child; 110 | }; 111 | 112 | #endif // GHOST_LIB_BPF_BPF_PROV_FLUX_BPF_H_ 113 | -------------------------------------------------------------------------------- /third_party/bpf/roci_flux_bpf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License 6 | * version 2 as published by the Free Software Foundation. 7 | * 8 | * This program is distributed in the hope that it will be useful, 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | * GNU General Public License for more details. 
12 | */ 13 | 14 | #ifndef GHOST_LIB_BPF_BPF_ROCI_FLUX_BPF_H_ 15 | #define GHOST_LIB_BPF_BPF_ROCI_FLUX_BPF_H_ 16 | 17 | #ifndef __BPF__ 18 | #include 19 | #endif 20 | 21 | #include "lib/queue.bpf.h" 22 | 23 | /* 24 | * TODO: ROCI assumes it is the top of the hierarchy and that "idle_id" is 25 | * actually the idle scheduler. There are a few assumptions baked in here: 26 | * - the idle_id (secondary child) always wants a cpu. So we never yield in 27 | * PNT. 28 | * - We never ask our parent for cpus, since we assume there is no parent to 29 | * ask. 30 | * - We're extremely aggressive about taking cpus from idle. This is fine if it 31 | * is actually idle, but can get excessive. Specifically, we look at 32 | * nr_cpus_needed, not nr_cpus (which was the request from that call). If you 33 | * have two cpus making requests at the same time, ROCI might double-up and 34 | * preempt 2x the cpus needed. 35 | */ 36 | struct roci_flux_sched { 37 | struct arr_list primary_cpus; 38 | struct arr_list idle_cpus; 39 | unsigned int primary_id; 40 | unsigned int idle_id; 41 | }; 42 | 43 | struct roci_flux_cpu { 44 | struct arr_list_entry link; 45 | bool preempt_pending; 46 | }; 47 | 48 | #endif // GHOST_LIB_BPF_BPF_ROCI_FLUX_BPF_H_ 49 | -------------------------------------------------------------------------------- /third_party/bpf/schedclasstop.bpf.c: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // This program is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU General Public License 5 | // version 2 as published by the Free Software Foundation. 6 | // 7 | // This program is distributed in the hope that it will be useful, 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | // GNU General Public License for more details. 
11 | 12 | #include 13 | 14 | // clang-format off 15 | #include 16 | #include "libbpf/bpf_core_read.h" 17 | #include "libbpf/bpf_helpers.h" 18 | #include "libbpf/bpf_tracing.h" 19 | // clang-format on 20 | 21 | #include "third_party/bpf/common.bpf.h" 22 | 23 | #define SCHED_GHOST 18 24 | #define SCHED_AGENT 19 /* Not a real sched class */ 25 | #define MAX_SCHED_CLASS (SCHED_AGENT + 1) 26 | 27 | /* Using this map as a per-cpu u64 */ 28 | struct { 29 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 30 | __uint(max_entries, 1); 31 | __type(key, u32); 32 | __type(value, u64); 33 | } start_times SEC(".maps"); 34 | 35 | struct { 36 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 37 | __uint(max_entries, MAX_SCHED_CLASS); 38 | __type(key, u32); 39 | __type(value, u64); 40 | } class_times SEC(".maps"); 41 | 42 | static int task_sched_policy(struct task_struct *p) 43 | { 44 | #define PF_IDLE 0x2 /* linux/sched.h */ 45 | u32 flags = BPF_CORE_READ(p, flags); 46 | 47 | /* 48 | * SCHED_IDLE isn't the idle thread, but we do want to track idle 49 | * separately. We reuse SCHED_ISO (4), which is probably the least 50 | * likely value to be used. 51 | */ 52 | if (flags & PF_IDLE) 53 | return 4; 54 | if (task_has_ghost_policy(p)) { 55 | if (is_agent(p)) 56 | return SCHED_AGENT; 57 | else 58 | return SCHED_GHOST; 59 | 60 | } 61 | return BPF_CORE_READ(p, policy); 62 | } 63 | 64 | SEC("tp_btf/sched_switch") 65 | int BPF_PROG(sched_switch, bool preempt, struct task_struct *prev, 66 | struct task_struct *next) 67 | { 68 | u64 *start_time, *class_time; 69 | u32 prev_policy; 70 | u32 zero = 0; 71 | u64 now; 72 | 73 | prev_policy = task_sched_policy(prev); 74 | 75 | start_time = bpf_map_lookup_elem(&start_times, &zero); 76 | /* This lookup always succeeds, but the verifier needs proof. 
*/ 77 | if (!start_time) 78 | return 0; 79 | 80 | now = bpf_ktime_get_ns(); 81 | if (*start_time) { 82 | class_time = bpf_map_lookup_elem(&class_times, &prev_policy); 83 | if (class_time) 84 | *class_time += now - *start_time; 85 | } 86 | *start_time = now; 87 | 88 | return 0; 89 | } 90 | 91 | char LICENSE[] SEC("license") = "GPL"; 92 | -------------------------------------------------------------------------------- /third_party/bpf/schedfair.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Google LLC 2 | * 3 | * This program is free software; you can redistribute it and/or 4 | * modify it under the terms of the GNU General Public License 5 | * version 2 as published by the Free Software Foundation. 6 | * 7 | * This program is distributed in the hope that it will be useful, 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | * GNU General Public License for more details. 
11 | */ 12 | 13 | #ifndef GHOST_LIB_BPF_BPF_SCHEDFAIR_H_ 14 | #define GHOST_LIB_BPF_BPF_SCHEDFAIR_H_ 15 | 16 | #include 17 | 18 | #define MAX_PIDS 102400 19 | 20 | struct task_info { 21 | /* state tracking */ 22 | uint8_t load_tracked; 23 | int user_prio; 24 | 25 | /* intermediate variables */ 26 | uint64_t share_at_wake; 27 | uint64_t ran_at; 28 | uint64_t cpu_runtime_since_wake; 29 | 30 | /* output for userspace */ 31 | uint64_t total_cpu_runtime; 32 | uint64_t total_cpu_share; 33 | }; 34 | 35 | #endif // GHOST_LIB_BPF_BPF_SCHEDFAIR_H_ 36 | -------------------------------------------------------------------------------- /third_party/bpf/schedghostidle.bpf.c: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // This program is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU General Public License 5 | // version 2 as published by the Free Software Foundation. 6 | // 7 | // This program is distributed in the hope that it will be useful, 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | // GNU General Public License for more details. 11 | 12 | #include 13 | 14 | // clang-format off 15 | #include 16 | #include "libbpf/bpf_core_read.h" 17 | #include "libbpf/bpf_helpers.h" 18 | #include "libbpf/bpf_tracing.h" 19 | // clang-format on 20 | 21 | #include "lib/ghost_uapi.h" 22 | #include "third_party/bpf/common.bpf.h" 23 | #include "third_party/iovisor_bcc/bits.bpf.h" 24 | 25 | #define MAX_CPUS 512 26 | /* Keep this in sync with schedghostidle.c and bpf/user/agent.c */ 27 | #define NR_SLOTS 25 28 | 29 | uint64_t nr_latches = 0; 30 | uint64_t nr_bpf_latches = 0; 31 | uint64_t nr_idle_to_bpf_latches = 0; 32 | 33 | /* 34 | * This array map is racy, but it's fine. Both the latcher and sched_switch 35 | * tracepoints hold the RQ lock.
We want to access a cpu's data from another 36 | * cpu, since the latcher may not be on a particular cpu. 37 | */ 38 | struct cpu_info { 39 | bool is_idle; 40 | u64 idle_start; 41 | }; 42 | 43 | struct { 44 | __uint(type, BPF_MAP_TYPE_ARRAY); 45 | __uint(max_entries, MAX_CPUS); 46 | __type(key, u32); 47 | __type(value, struct cpu_info); 48 | } cpu_info SEC(".maps"); 49 | 50 | /* key: hist slot idx. value: count */ 51 | struct { 52 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 53 | __uint(max_entries, NR_SLOTS); 54 | __type(key, u32); 55 | __type(value, u64); 56 | } hist SEC(".maps"); 57 | 58 | static bool task_is_idle(struct task_struct *p) 59 | { 60 | #define PF_IDLE 0x2 /* linux/sched.h */ 61 | u32 flags = BPF_CORE_READ(p, flags); 62 | 63 | return flags & PF_IDLE; 64 | } 65 | 66 | SEC("tp_btf/sched_switch") 67 | int BPF_PROG(sched_switch, bool preempt, struct task_struct *prev, 68 | struct task_struct *next) 69 | { 70 | u32 cpu = bpf_get_smp_processor_id(); 71 | struct cpu_info *ci = bpf_map_lookup_elem(&cpu_info, &cpu); 72 | 73 | if (!ci) 74 | return 0; 75 | 76 | if (task_is_idle(next)) { 77 | ci->is_idle = true; 78 | ci->idle_start = bpf_ktime_get_ns(); 79 | } else { 80 | ci->is_idle = false; 81 | } 82 | 83 | return 0; 84 | } 85 | 86 | static int task_cpu(struct task_struct *p) 87 | { 88 | return BPF_CORE_READ(p, cpu); 89 | } 90 | 91 | static void update_hist(u64 nsec) 92 | { 93 | u32 slot; u64 *count; /* slot is u32: it is passed by address as the key of hist, whose key type is u32 */ 94 | 95 | slot = log2l(nsec / 1000); 96 | if (slot >= NR_SLOTS) 97 | slot = NR_SLOTS - 1; 98 | count = bpf_map_lookup_elem(&hist, &slot); 99 | if (!count) 100 | return; 101 | *count += 1; 102 | } 103 | 104 | SEC("tp_btf/sched_ghost_latched") 105 | int BPF_PROG(sched_ghost_latched, struct task_struct *old, 106 | struct task_struct *new, int run_flags) 107 | { 108 | u32 cpu = task_cpu(new); 109 | struct cpu_info *ci = bpf_map_lookup_elem(&cpu_info, &cpu); 110 | 111 | __sync_fetch_and_add(&nr_latches, 1); 112 | /* BPF-PNT is the only one who uses SEND_TASK_ON_CPU.
*/ 113 | if (run_flags & SEND_TASK_ON_CPU) 114 | __sync_fetch_and_add(&nr_bpf_latches, 1); 115 | 116 | if (!ci || !ci->is_idle) { 117 | /* 118 | * When BPF-PNT latches a task, the cpu might not go idle. 119 | * However, we'd like to measure those events. 120 | */ 121 | if (run_flags & SEND_TASK_ON_CPU) 122 | update_hist(0); 123 | return 0; 124 | } 125 | __sync_fetch_and_add(&nr_idle_to_bpf_latches, 1); 126 | 127 | update_hist(bpf_ktime_get_ns() - ci->idle_start); 128 | /* 129 | * Technically, the cpu is still idle, and our latch may get aborted or 130 | * otherwise fail. But the agent has noticed the previous idling (as 131 | * shown by it trying to latch), so we do not want to count as idle for 132 | * any other latchings that happen before the next sched_switch. 133 | */ 134 | ci->is_idle = false; 135 | 136 | return 0; 137 | } 138 | 139 | char LICENSE[] SEC("license") = "GPL"; 140 | -------------------------------------------------------------------------------- /third_party/bpf/schedlat.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Google LLC 2 | * 3 | * This program is free software; you can redistribute it and/or 4 | * modify it under the terms of the GNU General Public License 5 | * version 2 as published by the Free Software Foundation. 6 | * 7 | * This program is distributed in the hope that it will be useful, 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | * GNU General Public License for more details. 11 | */ 12 | 13 | #ifndef GHOST_LIB_BPF_BPF_SCHEDLAT_H_ 14 | #define GHOST_LIB_BPF_BPF_SCHEDLAT_H_ 15 | 16 | #include 17 | 18 | #define MAX_PIDS 102400 19 | #define MAX_NR_HIST_SLOTS 25 20 | 21 | struct task_stat { 22 | uint64_t runnable_at; 23 | uint64_t latched_at; 24 | uint64_t ran_at; 25 | }; 26 | 27 | /* 28 | * Power of 2 histogram, <=1 us, 2us, 4us, etc. 
This struct must be at least 29 | * 8-byte aligned, since it is a value for a BPF map. The kernel will round up 30 | * the size of any map value to a multiple of 8 bytes internally. If we have an array of 31 | * these objects, the kernel will think each object is 8-byte aligned. 32 | * When we read the per-cpu map in schedlat.c, we get an array of struct hist. 33 | * The compiler needs to agree with the kernel on the size of the objects, or 34 | * you'll corrupt your stats. 35 | */ 36 | struct hist { 37 | uint32_t slots[MAX_NR_HIST_SLOTS]; 38 | } __attribute__((aligned(8))); 39 | 40 | enum { 41 | RUNNABLE_TO_LATCHED, 42 | LATCHED_TO_RUN, 43 | RUNNABLE_TO_RUN, 44 | NR_HISTS, 45 | }; 46 | 47 | #endif // GHOST_LIB_BPF_BPF_SCHEDLAT_H_ 48 | -------------------------------------------------------------------------------- /third_party/bpf/schedlat_shared_bpf.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // This program is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU General Public License 5 | // version 2 as published by the Free Software Foundation. 6 | // 7 | // This program is distributed in the hope that it will be useful, 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | // GNU General Public License for more details. 11 | 12 | #ifndef GHOST_LIB_BPF_SCHEDLAT_SHARED_BPF_H_ 13 | #define GHOST_LIB_BPF_SCHEDLAT_SHARED_BPF_H_ 14 | 15 | // Keep this file's structs in sync with bpf/user/schedlat_shared.h. 16 | // We need different headers for BPF and C programs due to various Google3 17 | // reasons. 18 | 19 | #define MAX_PIDS 102400 20 | #define MAX_NR_HIST_SLOTS 25 21 | 22 | struct task_stat { 23 | __u64 runnable_at; 24 | __u64 latched_at; 25 | __u64 ran_at; 26 | }; 27 | 28 | /* 29 | * Power of 2 histogram, <=1 us, 2us, 4us, etc.
This struct must be at least 30 | * 8-byte aligned, since it is a value for a BPF map. 31 | */ 32 | struct hist { 33 | u32 slots[MAX_NR_HIST_SLOTS]; 34 | } __attribute__((aligned(64))); 35 | 36 | enum { 37 | RUNNABLE_TO_LATCHED, 38 | LATCHED_TO_RUN, 39 | RUNNABLE_TO_RUN, 40 | NR_HISTS, 41 | }; 42 | 43 | #endif // GHOST_LIB_BPF_SCHEDLAT_SHARED_BPF_H_ 44 | -------------------------------------------------------------------------------- /third_party/bpf/schedrun.bpf.c: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // This program is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU General Public License 5 | // version 2 as published by the Free Software Foundation. 6 | // 7 | // This program is distributed in the hope that it will be useful, 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | // GNU General Public License for more details. 11 | 12 | #include 13 | 14 | // clang-format off 15 | #include 16 | #include "libbpf/bpf_core_read.h" 17 | #include "libbpf/bpf_helpers.h" 18 | #include "libbpf/bpf_tracing.h" 19 | 20 | // common.bpf.h comes before bits.bpf.h for u32/s32/u64/s64 in OSS. 21 | #include "third_party/bpf/common.bpf.h" 22 | #include "third_party/iovisor_bcc/bits.bpf.h" 23 | #include "third_party/bpf/schedrun.h" 24 | // clang-format on 25 | 26 | const volatile pid_t targ_tgid = 0; 27 | const volatile bool ghost_only = false; 28 | 29 | // Map each task's pid to the timestamp it started running. 
30 | struct { 31 | __uint(type, BPF_MAP_TYPE_HASH); 32 | __uint(max_entries, MAX_PIDS); 33 | __type(key, u32); 34 | __type(value, u64); 35 | } task_start_times SEC(".maps"); 36 | 37 | struct { 38 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 39 | __uint(max_entries, NR_HISTS); 40 | __type(key, u32); 41 | __type(value, struct hist); 42 | } hists SEC(".maps"); 43 | 44 | // TODO: refactor (copied from schedlat.bpf.c). 45 | static void update_hist(u32 hist_id, u64 value) 46 | { 47 | u64 slot; /* Gotta love BPF. slot needs to be a u64, not a u32. */ 48 | struct hist *hist; 49 | 50 | hist = bpf_map_lookup_elem(&hists, &hist_id); 51 | if (!hist) 52 | return; 53 | slot = log2l(value); 54 | if (slot >= MAX_NR_HIST_SLOTS) 55 | slot = MAX_NR_HIST_SLOTS - 1; 56 | hist->slots[slot]++; 57 | } 58 | 59 | static void task_stop(struct task_struct *p) 60 | { 61 | u32 pid = BPF_CORE_READ(p, pid); 62 | u64 stop = bpf_ktime_get_us(); 63 | u64 *start = bpf_map_lookup_elem(&task_start_times, &pid); 64 | 65 | if (start) { 66 | u64 diff = stop - *start; 67 | update_hist(RUNTIMES_ALL, diff); 68 | 69 | long state = BPF_CORE_READ(p, state); 70 | if (state == TASK_RUNNING) // prev yielded or was preempted. 71 | update_hist(RUNTIMES_PREEMPTED_YIELDED, diff); 72 | else // prev blocked. 
73 | update_hist(RUNTIMES_BLOCKED, diff); 74 | 75 | if (state == TASK_DEAD) 76 | bpf_map_delete_elem(&task_start_times, &pid); 77 | } 78 | } 79 | 80 | static void task_run(struct task_struct *p) 81 | { 82 | u32 pid = BPF_CORE_READ(p, pid); 83 | u64 start = bpf_ktime_get_us(); 84 | 85 | bpf_map_update_elem(&task_start_times, &pid, &start, BPF_ANY); 86 | } 87 | 88 | static bool is_traced(struct task_struct *p) 89 | { 90 | if (targ_tgid) 91 | return BPF_CORE_READ(p, tgid) == targ_tgid; 92 | 93 | if (ghost_only) 94 | return is_traced_ghost(p); 95 | 96 | return true; 97 | } 98 | 99 | SEC("tp_btf/sched_switch") 100 | int BPF_PROG(sched_switch, bool preempt, struct task_struct *prev, 101 | struct task_struct *next) 102 | { 103 | if (is_traced(prev)) 104 | task_stop(prev); 105 | 106 | if (is_traced(next)) 107 | task_run(next); 108 | 109 | return 0; 110 | } 111 | 112 | char LICENSE[] SEC("license") = "GPL"; 113 | -------------------------------------------------------------------------------- /third_party/bpf/schedrun.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Google LLC 2 | * 3 | * This program is free software; you can redistribute it and/or 4 | * modify it under the terms of the GNU General Public License 5 | * version 2 as published by the Free Software Foundation. 6 | * 7 | * This program is distributed in the hope that it will be useful, 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | * GNU General Public License for more details. 11 | */ 12 | 13 | #ifndef GHOST_LIB_BPF_BPF_SCHEDRUN_H_ 14 | #define GHOST_LIB_BPF_BPF_SCHEDRUN_H_ 15 | 16 | #include 17 | 18 | #define MAX_PIDS 102400 19 | #define MAX_NR_HIST_SLOTS 25 20 | 21 | /* 22 | * Power of 2 histogram, <=1 us, 2us, 4us, etc. This struct must be at least 23 | * 8-byte aligned, since it is a value for a BPF map. 
24 | */ 25 | struct hist { 26 | uint32_t slots[MAX_NR_HIST_SLOTS]; 27 | } __attribute__((aligned(64))); 28 | 29 | enum { 30 | RUNTIMES_PREEMPTED_YIELDED, 31 | RUNTIMES_BLOCKED, 32 | RUNTIMES_ALL, 33 | NR_HISTS, 34 | }; 35 | 36 | #endif // GHOST_LIB_BPF_BPF_SCHEDRUN_H_ 37 | -------------------------------------------------------------------------------- /third_party/bpf/schedrun_shared_bpf.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Google LLC 2 | // 3 | // This program is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU General Public License 5 | // version 2 as published by the Free Software Foundation. 6 | // 7 | // This program is distributed in the hope that it will be useful, 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | // GNU General Public License for more details. 11 | 12 | #ifndef GHOST_LIB_BPF_SCHEDRUN_SHARED_BPF_H_ 13 | #define GHOST_LIB_BPF_SCHEDRUN_SHARED_BPF_H_ 14 | 15 | // Keep this file's structs in sync with bpf/user/schedrun_shared.h. 16 | // We need different headers for BPF and C programs due to various Google3 17 | // reasons. 18 | 19 | #define MAX_PIDS 102400 20 | #define MAX_NR_HIST_SLOTS 25 21 | 22 | /* 23 | * Power of 2 histogram, <=1 us, 2us, 4us, etc. This struct must be at least 24 | * 8-byte aligned, since it is a value for a BPF map.
25 | */ 26 | struct hist { 27 | u32 slots[MAX_NR_HIST_SLOTS]; 28 | } __attribute__((aligned(64))); 29 | 30 | enum { 31 | RUNTIMES_PREEMPTED_YIELDED, 32 | RUNTIMES_BLOCKED, 33 | RUNTIMES_ALL, 34 | NR_HISTS, 35 | }; 36 | 37 | #endif // GHOST_LIB_BPF_SCHEDRUN_SHARED_BPF_H_ 38 | -------------------------------------------------------------------------------- /third_party/bpf/test.bpf.c: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // This program is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU General Public License 5 | // version 2 as published by the Free Software Foundation. 6 | // 7 | // This program is distributed in the hope that it will be useful, 8 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | // GNU General Public License for more details. 11 | 12 | #include 13 | 14 | // clang-format off 15 | #include "libbpf/bpf_helpers.h" 16 | #include "libbpf/bpf_tracing.h" 17 | // clang-format on 18 | 19 | #include "lib/ghost_uapi.h" 20 | #include "third_party/bpf/common.bpf.h" 21 | 22 | SEC("ghost_sched/pnt") 23 | int test_pnt(struct bpf_ghost_sched *ctx) 24 | { 25 | return 0; 26 | } 27 | 28 | char LICENSE[] SEC("license") = "GPL"; 29 | -------------------------------------------------------------------------------- /third_party/iovisor_bcc/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020 Wenbo Zhang 2 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 3 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 4 | 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 5 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 6 | -------------------------------------------------------------------------------- /third_party/iovisor_bcc/bits.bpf.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 | /* 3 | * From iovisor's bcc/libbpf-tools/bits.bpf.h. 4 | * 5 | * These are very small, and it's not worth getting a dependency on 6 | * third_party/bcc/. The tools including this header are similar in style to 7 | * libbpf-tools, which are intended to be built from within the 8 | * bcc/libbpf-tools/ directory. 
9 | */ 10 | 11 | #ifndef GHOST_LIB_BPF_BITS_BPF_H_ 12 | #define GHOST_LIB_BPF_BITS_BPF_H_ 13 | 14 | static __always_inline u64 log2(u32 v) 15 | { 16 | u32 shift, r; 17 | 18 | r = (v > 0xFFFF) << 4; v >>= r; 19 | shift = (v > 0xFF) << 3; v >>= shift; r |= shift; 20 | shift = (v > 0xF) << 2; v >>= shift; r |= shift; 21 | shift = (v > 0x3) << 1; v >>= shift; r |= shift; 22 | r |= (v >> 1); 23 | 24 | return r; 25 | } 26 | 27 | static __always_inline u64 log2l(u64 v) 28 | { 29 | u32 hi = v >> 32; 30 | 31 | if (hi) 32 | return log2(hi) + 32; 33 | else 34 | return log2(v); 35 | } 36 | 37 | #endif // GHOST_LIB_BPF_BITS_BPF_H_ 38 | -------------------------------------------------------------------------------- /third_party/linux.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_foreign_cc//foreign_cc:defs.bzl", "make") 2 | 3 | # The libbpf source code. This is encompassed by the `:source` filegroup, but 4 | # the `make` rule below wants just the library source passed via the 5 | # `lib_source` parameter. 6 | filegroup( 7 | name = "libbpf_source", 8 | srcs = glob(["tools/lib/bpf/**"]), 9 | visibility = ["//visibility:private"], 10 | ) 11 | 12 | # The bpftool source code. This is encompassed by the `:source` filegroup, but 13 | # the `make` rule below wants just the library source passed via the 14 | # `lib_source` parameter. 15 | filegroup( 16 | name = "bpftool_source", 17 | srcs = glob(["tools/bpf/bpftool/**"]), 18 | visibility = ["//visibility:private"], 19 | ) 20 | 21 | # The Linux source code. 22 | filegroup( 23 | name = "source", 24 | srcs = glob(["**"]), 25 | visibility = ["//visibility:private"], 26 | ) 27 | 28 | # Compiles the libbpf static library. 29 | make( 30 | name = "libbpf", 31 | # This is the library source. This filegroup includes the Makefile. 
32 | lib_source = ":libbpf_source", 33 | # The Makefile uses other files in the Linux kernel tree outside of its 34 | # directory during the build process (e.g., 35 | # `tools/scripts/Makefile.include`). 36 | build_data = [":source"], 37 | # This is the target passed to `make` (i.e., `make libbpf.a`). 38 | targets = ["libbpf.a"], 39 | # This copy should be done automatically by the rules_foreign_cc tool, yet 40 | # it is not. This may happen because the libbpf library is not at the root 41 | # of the Linux kernel tree. Perhaps the rules_foreign_cc tool makes an 42 | # assumption that the library source is at the root of the kernel tree, 43 | # which causes its copy of libbpf.a to fail since it cannot find the static 44 | # library at the root of the kernel tree. 45 | # 46 | # Note: The values of the environment variables below are written to 47 | # GNUMake.log, so look at that file to inspect them. You can also look at 48 | # that log to see which other environment variables exist. 49 | postfix_script = "cp $EXT_BUILD_ROOT/external/linux/tools/lib/bpf/libbpf.a $INSTALLDIR/lib/libbpf.a; " + 50 | # By making the `libbpf` directory and copying the libbpf header files into 51 | # it, we can have the #include paths in the project prefixed by `libbpf`. In 52 | # other words, we can do `#include "libbpf/header.h"` instead of 53 | # `#include "header.h"`. With the latter, it is more confusing to figure out 54 | # where the header file is and could cause conflicts if a header file in the 55 | # project has the same name as a header file in libbpf. 56 | "mkdir $INSTALLDIR/include/libbpf; " + 57 | "cp $EXT_BUILD_ROOT/external/linux/tools/lib/bpf/*.h $INSTALLDIR/include/libbpf", 58 | visibility = ["//visibility:public"], 59 | ) 60 | 61 | # Compiles the bpftool binary. 62 | make( 63 | name = "bpftool", 64 | lib_source = ":bpftool_source", 65 | # This attribute specifies that the output is a binary. 
Otherwise, the 66 | # rules_foreign_cc tool expects to find a static library (i.e., `bpftool.a`) 67 | # and fails when the static library is not produced. 68 | out_binaries = ["bpftool"], 69 | build_data = [":source"], 70 | # The default targets are `` and `install`, but we do not want the `install` 71 | # target. Thus, specify that the only target is `` (i.e., just `make`). 72 | targets = [""], 73 | # See the comment in the `:libbpf` target for an explanation of why this 74 | # copy is necessary. 75 | postfix_script = "cp $EXT_BUILD_ROOT/external/linux/tools/bpf/bpftool/bpftool $INSTALLDIR/bin/bpftool", 76 | visibility = ["//visibility:public"], 77 | ) 78 | -------------------------------------------------------------------------------- /third_party/util/LICENSE: -------------------------------------------------------------------------------- 1 | MIT license 2 | 3 | Copyright 2022 Google LLC 4 | 5 | Permission is hereby granted, free of charge, to any 6 | person obtaining a copy of this software and associated 7 | documentation files (the "Software"), to deal in the 8 | Software without restriction, including without 9 | limitation the rights to use, copy, modify, merge, 10 | publish, distribute, sublicense, and/or sell copies of 11 | the Software, and to permit persons to whom the Software 12 | is furnished to do so, subject to the following 13 | conditions: 14 | 15 | The above copyright notice and this permission notice 16 | shall be included in all copies or substantial portions 17 | of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 20 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 21 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 22 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 23 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 24 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 26 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 27 | DEALINGS IN THE SOFTWARE. 28 | -------------------------------------------------------------------------------- /third_party/util/util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Use of this source code is governed by an MIT-style 4 | // license that can be found in the LICENSE file or at 5 | // https://opensource.org/licenses/MIT. 6 | 7 | #ifndef GHOST_THIRD_PARTY_UTIL_UTIL_H_ 8 | #define GHOST_THIRD_PARTY_UTIL_UTIL_H_ 9 | 10 | #include 11 | 12 | // The code below is derived from 13 | // https://stackoverflow.com/questions/34672441/stdis-base-of-for-template-classes. 14 | template