├── assets ├── demo.gif └── design.png ├── .gitmodules ├── .gitignore ├── util ├── endian.go ├── topology.go ├── emun_import.go └── emun.go ├── go.mod ├── goland_core ├── scheduler.go ├── uei.go ├── bss.go ├── task.go ├── rodata.go └── obj.go ├── .github ├── workflows │ └── go.yaml └── actions │ └── build-dependencies │ └── action.yaml ├── scripts └── test_scheduler.sh ├── go.sum ├── wrapper.h ├── intf.h ├── README.md ├── Makefile ├── wrapper.c ├── main.go ├── LICENSE └── main.bpf.c /assets/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gthulhu/qumun/HEAD/assets/demo.gif -------------------------------------------------------------------------------- /assets/design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gthulhu/qumun/HEAD/assets/design.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "scx"] 2 | path = scx 3 | url = https://github.com/sched-ext/scx.git 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | libbpf 3 | libbpfgo 4 | output 5 | *.ll 6 | *.o 7 | *.skeleton.h 8 | main 9 | scx 10 | libwrapper.a 11 | -------------------------------------------------------------------------------- /util/endian.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "encoding/binary" 5 | "unsafe" 6 | ) 7 | 8 | func Endian() binary.ByteOrder { 9 | var i int32 = 0x01020304 10 | u := unsafe.Pointer(&i) 11 | pb := (*byte)(u) 12 | b := *pb 13 | if b == 0x04 { 14 | return binary.LittleEndian 15 | } 16 | 17 | return binary.BigEndian 18 | } 19 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/Gthulhu/qumun 2 | 3 | go 1.24.0 4 | 5 | toolchain go1.24.2 6 | 7 | require ( 8 | github.com/Gthulhu/plugin v1.0.1 9 | github.com/aquasecurity/libbpfgo v0.8.0-libbpf-1.5 10 | golang.org/x/sys v0.37.0 11 | ) 12 | 13 | require github.com/cilium/ebpf v0.20.0 14 | 15 | replace github.com/aquasecurity/libbpfgo => ./libbpfgo 16 | -------------------------------------------------------------------------------- /goland_core/scheduler.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import ( 4 | "github.com/Gthulhu/plugin/models" 5 | ) 6 | 7 | func (s *Sched) DrainQueuedTask() int { 8 | if s.plugin != nil { 9 | return s.plugin.DrainQueuedTask(s) 10 | } 11 | return 0 12 | } 13 | 14 | func (s *Sched) SelectQueuedTask() *models.QueuedTask { 15 | if s.plugin != nil { 16 | return s.plugin.SelectQueuedTask(s) 17 | } 18 | return nil 19 | } 20 | 21 | func (s *Sched) SelectCPU(t *models.QueuedTask) (error, int32) { 22 | if s.plugin != nil { 23 | return s.plugin.SelectCPU(s, t) 24 | } 25 | return s.selectCPU(t) 26 | } 27 | 28 | func (s *Sched) DetermineTimeSlice(t *models.QueuedTask) uint64 { 29 | if s.plugin != nil { 30 | return s.plugin.DetermineTimeSlice(s, t) 31 | } 32 | return 0 33 | } 34 | 35 | func (s *Sched) GetPoolCount() uint64 { 36 | if s.plugin != nil { 37 | return s.plugin.GetPoolCount() 38 | } 39 | return 0 40 | } 41 | 
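Note on the plugin hooks in goland_core/scheduler.go above: every method follows the same pattern, deferring to the registered plugin when one is set and otherwise falling back to a built-in default (or a zero value). The actual plugin.CustomScheduler interface lives in the external github.com/Gthulhu/plugin module and is not shown in this repository, so the following is only a minimal sketch of what a plugin could look like; the method signatures are inferred from the delegations above, and FifoPlugin with its FIFO policy is hypothetical, not part of qumun:

```go
package fifoplugin

import (
	"github.com/Gthulhu/plugin/models"
	core "github.com/Gthulhu/qumun/goland_core"
)

// FifoPlugin dispatches tasks in plain arrival order (illustrative only).
type FifoPlugin struct {
	pool []*models.QueuedTask
}

// DrainQueuedTask moves every task queued by the BPF side into the pool.
func (p *FifoPlugin) DrainQueuedTask(s *core.Sched) int {
	n := 0
	for {
		var t models.QueuedTask
		s.DequeueTask(&t)
		if t.Pid == -1 { // sentinel: nothing left to dequeue
			return n
		}
		p.pool = append(p.pool, &t)
		n++
	}
}

// SelectQueuedTask pops the oldest task, or returns nil when idle.
func (p *FifoPlugin) SelectQueuedTask(s *core.Sched) *models.QueuedTask {
	if len(p.pool) == 0 {
		return nil
	}
	t := p.pool[0]
	p.pool = p.pool[1:]
	return t
}

// SelectCPU defers to the BPF-side idle-CPU picker exposed by the core.
func (p *FifoPlugin) SelectCPU(s *core.Sched, t *models.QueuedTask) (error, int32) {
	return s.DefaultSelectCPU(t)
}

// DetermineTimeSlice grants a fixed 5 ms slice; returning 0 would request
// the default slice instead.
func (p *FifoPlugin) DetermineTimeSlice(s *core.Sched, t *models.QueuedTask) uint64 {
	return 5_000_000
}

// GetPoolCount reports how many tasks are still waiting to be dispatched.
func (p *FifoPlugin) GetPoolCount() uint64 {
	return uint64(len(p.pool))
}
```

A plugin built this way would be registered with `sched.SetPlugin(&fifoplugin.FifoPlugin{})` before calling `Start()` (see `SetPlugin` in goland_core/obj.go).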
-------------------------------------------------------------------------------- /.github/workflows/go.yaml: -------------------------------------------------------------------------------- 1 | name: Go 2 | on: 3 | push: 4 | branches: [ main ] 5 | pull_request: 6 | branches: 7 | - main 8 | workflow_call: 9 | jobs: 10 | self-tests: 11 | name: Selftests 12 | runs-on: ubuntu-24.04 13 | strategy: 14 | matrix: 15 | go-version: [ 'stable' ] 16 | steps: 17 | - name: Checkout Code 18 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 19 | - name: Install Dependencies 20 | uses: ./.github/actions/build-dependencies 21 | with: 22 | go-version: ${{ matrix.go-version }} 23 | - name: Static Selftests 24 | run: | 25 | make dep 26 | git submodule init 27 | git submodule sync 28 | git submodule update 29 | cd scx 30 | meson setup build --prefix ~ 31 | meson compile -C build 32 | cd .. 33 | cd libbpfgo 34 | make 35 | cd .. 36 | make build 37 | make test 38 | shell: bash -------------------------------------------------------------------------------- /.github/actions/build-dependencies/action.yaml: -------------------------------------------------------------------------------- 1 | name: Build Dependencies 2 | description: | 3 | Install build dependencies to test and compile qumun artifacts 4 | inputs: 5 | go-version: 6 | description: go version 7 | default: "1.21" 8 | runs: 9 | using: composite 10 | steps: 11 | - name: Setup Go 12 | uses: actions/setup-go@cdcb36043654635271a94b9a6d1392de5bb323a7 # v5.0.1 13 | with: 14 | go-version: "${{ inputs.go-version }}" 15 | - name: Install Compilers & Formatters 16 | run: | 17 | sudo apt-get update 18 | sudo apt-get install --yes bsdutils 19 | sudo apt-get install --yes build-essential 20 | sudo apt-get install --yes pkgconf 21 | sudo apt-get install --yes llvm-17 clang-17 clang-format-17 22 | sudo apt-get install --yes libbpf-dev libelf-dev libzstd-dev zlib1g-dev 23 | sudo apt-get install --yes virtme-ng 24 | sudo apt-get install --yes gcc-multilib 25 | sudo apt-get install --yes systemtap-sdt-dev 26 | sudo apt-get install --yes python3 python3-pip ninja-build 27 | sudo apt-get install --yes libseccomp-dev protobuf-compiler 28 | pip3 install --user meson 29 | for tool in "clang" "clang-format" "llc" "llvm-strip" 30 | do 31 | sudo rm -f /usr/bin/$tool 32 | sudo ln -s /usr/bin/$tool-17 /usr/bin/$tool 33 | done 34 | shell: bash -------------------------------------------------------------------------------- /scripts/test_scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Scheduler test script 3 | # This script runs the scheduler and verifies it starts successfully 4 | 5 | set -e 6 | 7 | LOGFILE="/tmp/scheduler_test.log" 8 | TIMEOUT_DURATION=60 9 | WARMUP_TIME=15 10 | 11 | echo "Starting scheduler test..." 12 | 13 | # Run scheduler in background 14 | timeout ${TIMEOUT_DURATION} ./main > "${LOGFILE}" 2>&1 & 15 | SCHED_PID=$! 16 | 17 | echo "Scheduler PID: ${SCHED_PID}" 18 | 19 | # Wait for scheduler to initialize 20 | sleep ${WARMUP_TIME} 21 | 22 | # Check if scheduler is still running 23 | if ! 
ps -p ${SCHED_PID} > /dev/null 2>&1; then 24 | echo "✗ Scheduler crashed during initialization" 25 | echo "Log output:" 26 | cat "${LOGFILE}" 27 | exit 1 28 | fi 29 | 30 | echo "✓ Scheduler is running" 31 | 32 | # Check if scheduler started successfully 33 | if grep -q "scheduler started" "${LOGFILE}"; then 34 | echo "✓ Scheduler started successfully" 35 | else 36 | echo "✗ Scheduler did not start properly" 37 | echo "Log output:" 38 | cat "${LOGFILE}" 39 | kill ${SCHED_PID} 2>/dev/null || true 40 | exit 1 41 | fi 42 | 43 | # Let it run for a few more seconds 44 | sleep 20 45 | 46 | # Check final stats 47 | if grep -q "bss data" "${LOGFILE}"; then 48 | echo "✓ Scheduler produced stats" 49 | fi 50 | 51 | # Clean shutdown 52 | echo "Stopping scheduler..." 53 | kill ${SCHED_PID} 2>/dev/null || true 54 | wait ${SCHED_PID} 2>/dev/null || true 55 | 56 | echo "✓ Test completed successfully" 57 | exit 0 58 | -------------------------------------------------------------------------------- /goland_core/uei.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import "C" 4 | 5 | import ( 6 | "bytes" 7 | "encoding/binary" 8 | "fmt" 9 | "log" 10 | "unsafe" 11 | 12 | bpf "github.com/aquasecurity/libbpfgo" 13 | ) 14 | 15 | const ( 16 | UEI_REASON_LEN = 128 17 | UEI_MSG_LEN = 1024 18 | ) 19 | 20 | type UserExitInfo struct { 21 | Kind int32 22 | Pad0 uint32 // padding: exit_code is 8-byte aligned in the C user_exit_info struct 23 | ExitCode int64 24 | Reason [UEI_REASON_LEN]C.char 25 | Message [UEI_MSG_LEN]C.char 26 | } 27 | 28 | type UeiMap struct { 29 | *bpf.BPFMap 30 | } 31 | 32 | func (s *Sched) Stopped() bool { 33 | uei, err := s.GetUeiData() 34 | if err != nil { 35 | log.Printf("uei: %v", err) 36 | return true 37 | } 38 | if uei.Kind != 0 || uei.ExitCode != 0 { 39 | log.Printf("uei.kind %v, uei.ExitCode: %v", uei.Kind, uei.ExitCode) 40 | return true 41 | } 42 | return false 43 | } 44 | 45 | func (s *Sched) GetUeiData() (UserExitInfo, error) { 46 | if s.uei == nil { 47 | return UserExitInfo{}, fmt.Errorf("UeiMap is nil") 48 | } 49 | i := 0 50 | b, err := s.uei.BPFMap.GetValue(unsafe.Pointer(&i)) 51 | if err != nil { 52 | return UserExitInfo{}, err 53 | } 54 | var uei UserExitInfo 55 | buff := bytes.NewBuffer(b) 56 | err = binary.Read(buff, binary.LittleEndian, &uei) 57 | if err != nil { 58 | return UserExitInfo{}, err 59 | } 60 | return uei, nil 61 | } 62 | 63 | func (uei *UserExitInfo) GetReason() string { 64 | return C.GoString(&uei.Reason[0]) 65 | } 66 | 67 | func (uei *UserExitInfo) GetMessage() string { 68 | return C.GoString(&uei.Message[0]) 69 | } 70 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/Gthulhu/plugin v1.0.1 h1:RC76Xah9D6IsvSa1N/H3kTTVgHRuRyMM9+y4B1veilM= 2 | github.com/Gthulhu/plugin v1.0.1/go.mod h1:PJn7yc+XAtSD8peMRyyNN/kznESJSnfQaaTX68yKBDo= 3 | github.com/cilium/ebpf v0.20.0 h1:atwWj9d3NffHyPZzVlx3hmw1on5CLe9eljR8VuHTwhM= 4 | github.com/cilium/ebpf v0.20.0/go.mod h1:pzLjFymM+uZPLk/IXZUL63xdx5VXEo+enTzxkZXdycw= 5 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 6 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 7 | github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6 h1:teYtXy9B7y5lHTp8V9KPxpYRAVA7dozigQcMiBust1s= 8 | github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6/go.mod h1:p4lGIVX+8Wa6ZPNDvqcxq36XpUDLh42FLetFU7odllI= 9 | github.com/google/go-cmp 
v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 10 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 11 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 12 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 13 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 14 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 15 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 16 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 17 | github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= 18 | github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= 19 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 20 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 21 | golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= 22 | golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= 23 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 24 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 25 | kernel.org/pub/linux/libs/security/libcap/cap v1.2.76 h1:mrdLPj8ujM6eIKGtd1PkkuCIodpFFDM42Cfm0YODkIM= 26 | kernel.org/pub/linux/libs/security/libcap/cap v1.2.76/go.mod h1:7V2BQeHnVAQwhCnCPJ977giCeGDiywVewWF+8vkpPlc= 27 | kernel.org/pub/linux/libs/security/libcap/psx v1.2.76 h1:3DyzQ30OHt3wiOZVL1se2g1PAPJIU7+tMUyvfMUj1dY= 28 | kernel.org/pub/linux/libs/security/libcap/psx v1.2.76/go.mod h1:+l6Ee2F59XiJ2I6WR5ObpC1utCQJZ/VLsEbQCD8RG24= 29 | -------------------------------------------------------------------------------- /wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef WRAPPER_H__ 2 | #define WRAPPER_H__ 3 | typedef unsigned int __u32; 4 | 5 | typedef __u32 u32; 6 | 7 | typedef signed char __s8; 8 | 9 | typedef unsigned char __u8; 10 | 11 | typedef short unsigned int __u16; 12 | 13 | typedef int __s32; 14 | 15 | typedef long long int __s64; 16 | 17 | typedef long long unsigned int __u64; 18 | 19 | typedef __s8 s8; 20 | 21 | typedef __u8 u8; 22 | 23 | typedef __u16 u16; 24 | 25 | typedef __s32 s32; 26 | 27 | typedef __s64 s64; 28 | 29 | typedef __u64 u64; 30 | 31 | enum uei_sizes { 32 | UEI_REASON_LEN = 128, 33 | UEI_MSG_LEN = 1024, 34 | UEI_DUMP_DFL_LEN = 32768, 35 | }; 36 | 37 | struct user_exit_info { 38 | int kind; 39 | s64 exit_code; 40 | char reason[UEI_REASON_LEN]; 41 | char msg[UEI_MSG_LEN]; 42 | }; 43 | #include "main.skeleton.h" 44 | 45 | void *open_skel(); 46 | 47 | u32 get_usersched_pid(); 48 | 49 | void set_usersched_pid(u32 id); 50 | 51 | void set_kugepagepid(u32 id); 52 | 53 | void set_debug(bool enabled); 54 | 55 | void set_builtin_idle(bool enabled); 56 | 57 | void set_early_processing(bool enabled); 58 | 59 | void set_default_slice(u64 t); 60 | 61 | u64 get_nr_scheduled(); 62 | 63 | u64 get_nr_queued(); 64 | 65 | void notify_complete(u64 nr_pending); 66 | 67 | void sub_nr_queued(); 68 | 69 | void dec_nr_queued(u64 num); 70 | 71 | void destroy_skel(void *); 72 | 73 | void set_scx_enums( 74 | u64 SCX_OPS_NAME_LEN, 75 | u64 SCX_SLICE_DFL, 76 | u64 SCX_SLICE_INF, 77 | u64 SCX_RQ_ONLINE, 78 | u64 SCX_RQ_CAN_STOP_TICK, 79 | u64 SCX_RQ_BAL_PENDING, 80 | u64 SCX_RQ_BAL_KEEP, 81 | 
u64 SCX_RQ_BYPASSING, 82 | u64 SCX_RQ_CLK_VALID, 83 | u64 SCX_RQ_IN_WAKEUP, 84 | u64 SCX_RQ_IN_BALANCE, 85 | u64 SCX_DSQ_FLAG_BUILTIN, 86 | u64 SCX_DSQ_FLAG_LOCAL_ON, 87 | u64 SCX_DSQ_INVALID, 88 | u64 SCX_DSQ_GLOBAL, 89 | u64 SCX_DSQ_LOCAL, 90 | u64 SCX_DSQ_LOCAL_ON, 91 | u64 SCX_DSQ_LOCAL_CPU_MASK, 92 | u64 SCX_TASK_QUEUED, 93 | u64 SCX_TASK_RESET_RUNNABLE_AT, 94 | u64 SCX_TASK_DEQD_FOR_SLEEP, 95 | u64 SCX_TASK_STATE_SHIFT, 96 | u64 SCX_TASK_STATE_BITS, 97 | u64 SCX_TASK_STATE_MASK, 98 | u64 SCX_TASK_CURSOR, 99 | u64 SCX_TASK_NONE, 100 | u64 SCX_TASK_INIT, 101 | u64 SCX_TASK_READY, 102 | u64 SCX_TASK_ENABLED, 103 | u64 SCX_TASK_NR_STATES, 104 | u64 SCX_TASK_DSQ_ON_PRIQ, 105 | u64 SCX_KICK_IDLE, 106 | u64 SCX_KICK_PREEMPT, 107 | u64 SCX_KICK_WAIT, 108 | u64 SCX_ENQ_WAKEUP, 109 | u64 SCX_ENQ_HEAD, 110 | u64 SCX_ENQ_PREEMPT, 111 | u64 SCX_ENQ_REENQ, 112 | u64 SCX_ENQ_LAST, 113 | u64 SCX_ENQ_CLEAR_OPSS, 114 | u64 SCX_ENQ_DSQ_PRIQ 115 | ); 116 | 117 | #endif -------------------------------------------------------------------------------- /goland_core/bss.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "fmt" 7 | "unsafe" 8 | 9 | bpf "github.com/aquasecurity/libbpfgo" 10 | ) 11 | 12 | /* 13 | #include "wrapper.h" 14 | */ 15 | import "C" 16 | 17 | type BssData struct { 18 | Nr_running uint64 `json:"nr_running"` // Number of tasks currently running in the userspace scheduler 19 | Nr_queued uint64 `json:"nr_queued"` // Number of tasks queued in the userspace scheduler 20 | Nr_scheduled uint64 `json:"nr_scheduled"` // Number of tasks scheduled by the userspace scheduler 21 | Nr_online_cpus uint64 `json:"nr_online_cpus"` // Number of online CPUs in the system 22 | Usersched_last_run_at uint64 `json:"usersched_last_run_at"` // Timestamp of the last time the userspace scheduler ran 23 | Nr_user_dispatches uint64 `json:"nr_user_dispatches"` // Number of user-space dispatches 24 | Nr_kernel_dispatches uint64 `json:"nr_kernel_dispatches"` // Number of kernel-space dispatches 25 | Nr_cancel_dispatches uint64 `json:"nr_cancel_dispatches"` // Number of cancelled dispatches 26 | Nr_bounce_dispatches uint64 `json:"nr_bounce_dispatches"` // Number of bounce dispatches 27 | Nr_failed_dispatches uint64 `json:"nr_failed_dispatches"` // Number of failed dispatches 28 | Nr_sched_congested uint64 `json:"nr_sched_congested"` // Number of times the scheduler was congested 29 | } 30 | 31 | func (data BssData) String() string { 32 | return fmt.Sprintf("Usersched_last_run_at: %v, Nr_queued: %v ", data.Usersched_last_run_at, data.Nr_queued) + 33 | fmt.Sprintf("Nr_scheduled: %v, Nr_running: %v ", data.Nr_scheduled, data.Nr_running) + 34 | fmt.Sprintf("Nr_online_cpus: %v, Nr_user_dispatches: %v ", data.Nr_online_cpus, data.Nr_user_dispatches) + 35 | fmt.Sprintf("Nr_kernel_dispatches: %v, Nr_cancel_dispatches: %v ", data.Nr_kernel_dispatches, data.Nr_cancel_dispatches) + 36 | fmt.Sprintf("Nr_bounce_dispatches: %v, Nr_failed_dispatches: %v ", data.Nr_bounce_dispatches, data.Nr_failed_dispatches) + 37 | fmt.Sprintf("Nr_sched_congested: %v", data.Nr_sched_congested) 38 | } 39 | 40 | func LoadSkel() unsafe.Pointer { 41 | return C.open_skel() 42 | } 43 | 44 | func GetUserSchedPid() int { 45 | return int(C.get_usersched_pid()) 46 | } 47 | 48 | func GetNrQueued() uint64 { 49 | return uint64(C.get_nr_queued()) 50 | } 51 | func GetNrScheduled() uint64 { 52 | return uint64(C.get_nr_scheduled()) 53 | } 54 | 55 | func NotifyComplete(nr_pending 
uint64) error { 56 | C.notify_complete(C.u64(nr_pending)) 57 | return nil 58 | } 59 | 60 | func (s *Sched) SubNrQueued() error { 61 | C.sub_nr_queued() 62 | return nil 63 | } 64 | 65 | func (s *Sched) DecNrQueued(num int) error { 66 | C.dec_nr_queued(C.u64(num)) 67 | return nil 68 | } 69 | 70 | type BssMap struct { 71 | *bpf.BPFMap 72 | } 73 | 74 | func (s *Sched) GetBssData() (BssData, error) { 75 | if s.bss == nil { 76 | return BssData{}, fmt.Errorf("BssMap is nil") 77 | } 78 | i := 0 79 | b, err := s.bss.BPFMap.GetValue(unsafe.Pointer(&i)) 80 | if err != nil { 81 | return BssData{}, err 82 | } 83 | var bss BssData 84 | buff := bytes.NewBuffer(b) 85 | err = binary.Read(buff, binary.LittleEndian, &bss) 86 | if err != nil { 87 | return BssData{}, err 88 | } 89 | return bss, nil 90 | } 91 | -------------------------------------------------------------------------------- /intf.h: -------------------------------------------------------------------------------- 1 | // This software may be used and distributed according to the terms of the 2 | // GNU General Public License version 2. 3 | 4 | #ifndef __INTF_H 5 | #define __INTF_H 6 | 7 | #define MAX(x, y) ((x) > (y) ? (x) : (y)) 8 | #define MIN(x, y) ((x) < (y) ? (x) : (y)) 9 | 10 | #define NSEC_PER_SEC 1000000000L 11 | #define CLOCK_BOOTTIME 7 12 | 13 | #include <stdbool.h> 14 | #ifndef __kptr 15 | #ifdef __KERNEL__ 16 | #error "__kptr_ref not defined in the kernel" 17 | #endif 18 | #define __kptr 19 | #endif 20 | 21 | #ifndef __VMLINUX_H__ 22 | typedef unsigned char u8; 23 | typedef unsigned short u16; 24 | typedef unsigned int u32; 25 | typedef unsigned long u64; 26 | 27 | typedef signed char s8; 28 | typedef signed short s16; 29 | typedef signed int s32; 30 | typedef signed long s64; 31 | 32 | typedef int pid_t; 33 | #endif /* __VMLINUX_H__ */ 34 | 35 | /* Check a condition at build time */ 36 | #define BUILD_BUG_ON(expr) \ 37 | do { \ 38 | extern char __build_assert__[(expr) ? -1 : 1] \ 39 | __attribute__((unused)); \ 40 | } while(0) 41 | 42 | /* 43 | * Maximum amount of CPUs supported by this scheduler (this defines the size of 44 | * cpu_map that is used to store the idle state and CPU ownership). 45 | */ 46 | #define MAX_CPUS 1024 47 | 48 | /* Special dispatch flags */ 49 | enum { 50 | /* 51 | * Do not assign any specific CPU to the task. 52 | * 53 | * The task will be dispatched to the global shared DSQ and it will run 54 | * on the first CPU available. 55 | */ 56 | RL_CPU_ANY = 1 << 20, 57 | }; 58 | 59 | /* 60 | * Specify a target CPU for a specific PID. 61 | */ 62 | struct task_cpu_arg { 63 | pid_t pid; 64 | s32 cpu; 65 | u64 flags; 66 | }; 67 | 68 | struct preempt_cpu_arg { 69 | s32 cpu_id; 70 | }; 71 | 72 | /* 73 | * Specify a sibling CPU relationship for a specific scheduling domain. 74 | */ 75 | struct domain_arg { 76 | s32 lvl_id; 77 | s32 cpu_id; 78 | s32 sibling_cpu_id; 79 | }; 80 | 81 | /* 82 | * Task sent to the user-space scheduler by the BPF dispatcher. 83 | * 84 | * All attributes are collected from the kernel by the BPF component. 
85 | */ 86 | struct queued_task_ctx { 87 | s32 pid; 88 | s32 cpu; /* CPU where the task is running */ 89 | u64 nr_cpus_allowed; /* Number of CPUs that the task can use */ 90 | u64 flags; /* Task enqueue flags */ 91 | u64 start_ts; /* Timestamp since last time the task ran on a CPU */ 92 | u64 stop_ts; /* Timestamp since last time the task released a CPU */ 93 | u64 exec_runtime; /* Total cpu time since last sleep */ 94 | u64 weight; /* Task static priority */ 95 | u64 vtime; /* Current task's vruntime */ 96 | s32 tgid; 97 | }; 98 | 99 | /* 100 | * Task sent to the BPF dispatcher by the user-space scheduler. 101 | * 102 | * This struct can be easily extended to send more information to the 103 | * dispatcher (i.e., a target CPU, a variable time slice, etc.). 104 | */ 105 | struct dispatched_task_ctx { 106 | s32 pid; 107 | s32 cpu; /* CPU where the task should be dispatched */ 108 | u64 flags; /* task enqueue flags */ 109 | u64 slice_ns; /* time slice assigned to the task (0=default) */ 110 | u64 vtime; /* task deadline / vruntime */ 111 | u64 enq_cnt; 112 | }; 113 | 114 | #endif /* __INTF_H */ 115 | -------------------------------------------------------------------------------- /util/topology.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "fmt" 5 | "io/fs" 6 | "os" 7 | "path/filepath" 8 | "strconv" 9 | "strings" 10 | 11 | core "github.com/Gthulhu/qumun/goland_core" 12 | ) 13 | 14 | func parseCPUs(cpuList string) ([]int, error) { 15 | var result []int 16 | segments := strings.Split(cpuList, ",") 17 | 18 | for _, segment := range segments { 19 | segment = strings.TrimSpace(segment) 20 | if strings.Contains(segment, "-") { 21 | bounds := strings.Split(segment, "-") 22 | if len(bounds) != 2 { 23 | return nil, fmt.Errorf("invalid range: %s", segment) 24 | } 25 | 26 | start, err := strconv.Atoi(bounds[0]) 27 | if err != nil { 28 | return nil, fmt.Errorf("invalid start of range: %s", bounds[0]) 29 | } 30 | 31 | end, err := strconv.Atoi(bounds[1]) 32 | if err != nil { 33 | return nil, fmt.Errorf("invalid end of range: %s", bounds[1]) 34 | } 35 | 36 | if start > end { 37 | return nil, fmt.Errorf("start greater than end in range: %s", segment) 38 | } 39 | for i := start; i <= end; i++ { 40 | result = append(result, i) 41 | } 42 | } else { 43 | num, err := strconv.Atoi(segment) 44 | if err != nil { 45 | return nil, fmt.Errorf("invalid number: %s", segment) 46 | } 47 | result = append(result, num) 48 | } 49 | } 50 | 51 | return result, nil 52 | } 53 | 54 | func GetTopology() (map[string]map[string][]int, error) { 55 | cacheDir := "/sys/devices/system/cpu/" 56 | cacheMap := map[string]map[string][]int{ 57 | "L2": {}, 58 | "L3": {}, 59 | } 60 | 61 | err := filepath.Walk(cacheDir, func(path string, info fs.FileInfo, err error) error { 62 | if err != nil { 63 | return err 64 | } 65 | var content []byte 66 | var key string 67 | if strings.HasSuffix(path, "shared_cpu_list") { 68 | if strings.Contains(path, "/cache/index2/") { 69 | content, err = os.ReadFile(path) 70 | if err != nil { 71 | return err 72 | } 73 | key = "L2" 74 | 75 | } else if strings.Contains(path, "/cache/index3/") { 76 | content, err = os.ReadFile(path) 77 | if err != nil { 78 | return err 79 | } 80 | key = "L3" 81 | } 82 | cpuIdList, err := parseCPUs(strings.TrimSpace(string(content))) 83 | if err != nil { 84 | return nil 85 | } 86 | cacheMap[key][strings.TrimSpace(string(content))] = cpuIdList 87 | } 88 | return nil 89 | }) 90 | 91 | if err != nil { 92 | return 
cacheMap, err 93 | } 94 | 95 | return cacheMap, nil 96 | } 97 | 98 | func initCacheDomains(bpfModule *core.Sched, level int32) error { 99 | topo, err := GetTopology() 100 | if err != nil { 101 | return err 102 | } 103 | l := "L2" 104 | if level == 3 { 105 | l = "L3" 106 | } 107 | for _, cpuIdList := range topo[l] { 108 | for _, cpuId := range cpuIdList { 109 | for _, sibCpuId := range cpuIdList { 110 | err = bpfModule.EnableSiblingCpu(level, int32(cpuId), int32(sibCpuId)) 111 | if err != nil { 112 | return fmt.Errorf("EnableSiblingCpu failed: lvl %v cpuId %v sibCpuId %v", level, cpuId, sibCpuId) 113 | } 114 | } 115 | } 116 | } 117 | return nil 118 | } 119 | 120 | func InitCacheDomains(bpfModule *core.Sched) error { 121 | err := initCacheDomains(bpfModule, 2) 122 | if err != nil { 123 | return err 124 | } 125 | err = initCacheDomains(bpfModule, 3) 126 | if err != nil { 127 | return err 128 | } 129 | return nil 130 | } 131 | -------------------------------------------------------------------------------- /util/emun_import.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | /* 4 | #include "wrapper.h" 5 | */ 6 | import "C" 7 | import ( 8 | "fmt" 9 | ) 10 | 11 | func defaultScxEnums() *ScxEnums { 12 | return &ScxEnums{ 13 | SCX_OPS_NAME_LEN: 128, 14 | SCX_SLICE_DFL: 20000000, 15 | SCX_SLICE_INF: 18446744073709551615, 16 | SCX_RQ_ONLINE: 1, 17 | SCX_RQ_CAN_STOP_TICK: 2, 18 | SCX_RQ_BAL_PENDING: 4, 19 | SCX_RQ_BAL_KEEP: 8, 20 | SCX_RQ_BYPASSING: 16, 21 | SCX_RQ_CLK_VALID: 32, 22 | SCX_RQ_IN_WAKEUP: 65536, 23 | SCX_RQ_IN_BALANCE: 131072, 24 | SCX_DSQ_FLAG_BUILTIN: 9223372036854775808, 25 | SCX_DSQ_FLAG_LOCAL_ON: 4611686018427387904, 26 | SCX_DSQ_INVALID: 9223372036854775808, 27 | SCX_DSQ_GLOBAL: 9223372036854775809, 28 | SCX_DSQ_LOCAL: 9223372036854775810, 29 | SCX_DSQ_LOCAL_ON: 13835058055282163712, 30 | SCX_DSQ_LOCAL_CPU_MASK: 4294967295, 31 | SCX_TASK_QUEUED: 1, 32 | SCX_TASK_RESET_RUNNABLE_AT: 4, 33 | SCX_TASK_DEQD_FOR_SLEEP: 8, 34 | SCX_TASK_STATE_SHIFT: 8, 35 | SCX_TASK_STATE_BITS: 2, 36 | SCX_TASK_STATE_MASK: 768, 37 | SCX_TASK_CURSOR: 18446744071562067968, // -2147483648 as uint64 38 | SCX_TASK_NONE: 0, 39 | SCX_TASK_INIT: 1, 40 | SCX_TASK_READY: 2, 41 | SCX_TASK_ENABLED: 3, 42 | SCX_TASK_NR_STATES: 4, 43 | SCX_TASK_DSQ_ON_PRIQ: 1, 44 | SCX_KICK_IDLE: 1, 45 | SCX_KICK_PREEMPT: 2, 46 | SCX_KICK_WAIT: 4, 47 | SCX_ENQ_WAKEUP: 1, 48 | SCX_ENQ_HEAD: 16, 49 | SCX_ENQ_PREEMPT: 4294967296, 50 | SCX_ENQ_REENQ: 1099511627776, 51 | SCX_ENQ_LAST: 2199023255552, 52 | SCX_ENQ_CLEAR_OPSS: 72057594037927936, 53 | SCX_ENQ_DSQ_PRIQ: 144115188075855872, 54 | } 55 | } 56 | 57 | func ImportScxEnums() error { 58 | e, err := GetScxEnums() 59 | if err != nil { 60 | e = defaultScxEnums() 61 | } 62 | if e == nil { 63 | return fmt.Errorf("ScxEnums instance is nil") 64 | } 65 | C.set_scx_enums( 66 | (C.u64)(e.SCX_OPS_NAME_LEN), 67 | (C.u64)(e.SCX_SLICE_DFL), 68 | (C.u64)(e.SCX_SLICE_INF), 69 | (C.u64)(e.SCX_RQ_ONLINE), 70 | (C.u64)(e.SCX_RQ_CAN_STOP_TICK), 71 | (C.u64)(e.SCX_RQ_BAL_PENDING), 72 | (C.u64)(e.SCX_RQ_BAL_KEEP), 73 | (C.u64)(e.SCX_RQ_BYPASSING), 74 | (C.u64)(e.SCX_RQ_CLK_VALID), 75 | (C.u64)(e.SCX_RQ_IN_WAKEUP), 76 | (C.u64)(e.SCX_RQ_IN_BALANCE), 77 | (C.u64)(e.SCX_DSQ_FLAG_BUILTIN), 78 | (C.u64)(e.SCX_DSQ_FLAG_LOCAL_ON), 79 | (C.u64)(e.SCX_DSQ_INVALID), 80 | (C.u64)(e.SCX_DSQ_GLOBAL), 81 | (C.u64)(e.SCX_DSQ_LOCAL), 82 | (C.u64)(e.SCX_DSQ_LOCAL_ON), 83 | (C.u64)(e.SCX_DSQ_LOCAL_CPU_MASK), 84 | (C.u64)(e.SCX_TASK_QUEUED), 85 | 
(C.u64)(e.SCX_TASK_RESET_RUNNABLE_AT), 86 | (C.u64)(e.SCX_TASK_DEQD_FOR_SLEEP), 87 | (C.u64)(e.SCX_TASK_STATE_SHIFT), 88 | (C.u64)(e.SCX_TASK_STATE_BITS), 89 | (C.u64)(e.SCX_TASK_STATE_MASK), 90 | (C.u64)(e.SCX_TASK_CURSOR), 91 | (C.u64)(e.SCX_TASK_NONE), 92 | (C.u64)(e.SCX_TASK_INIT), 93 | (C.u64)(e.SCX_TASK_READY), 94 | (C.u64)(e.SCX_TASK_ENABLED), 95 | (C.u64)(e.SCX_TASK_NR_STATES), 96 | (C.u64)(e.SCX_TASK_DSQ_ON_PRIQ), 97 | (C.u64)(e.SCX_KICK_IDLE), 98 | (C.u64)(e.SCX_KICK_PREEMPT), 99 | (C.u64)(e.SCX_KICK_WAIT), 100 | (C.u64)(e.SCX_ENQ_WAKEUP), 101 | (C.u64)(e.SCX_ENQ_HEAD), 102 | (C.u64)(e.SCX_ENQ_PREEMPT), 103 | (C.u64)(e.SCX_ENQ_REENQ), 104 | (C.u64)(e.SCX_ENQ_LAST), 105 | (C.u64)(e.SCX_ENQ_CLEAR_OPSS), 106 | (C.u64)(e.SCX_ENQ_DSQ_PRIQ), 107 | ) 108 | return nil 109 | } 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # qumun - A Custom Linux Scheduler Framework using eBPF and Go 2 | 3 | **qumun** is a scheduler development framework that empowers developers to build custom Linux schedulers using Golang and eBPF. 4 | 5 | The name qumun comes from the Bunun language (an Indigenous people of Taiwan), where it means “heart.” 6 | Just as the heart powers and sustains the body, a scheduler is the core heartbeat of the operating system — orchestrating tasks, balancing workloads, and keeping everything alive and running. 7 | 8 | By choosing this name, we hope to highlight both the technical role of a scheduler and also share a piece of Taiwan’s Indigenous culture with the global open-source community. 9 | 10 | ## DEMO 11 | 12 | ![](./assets/demo.gif) 13 | 14 | ## Overview 15 | 16 | ![](./assets/design.png) 17 | 18 | This scheduler is designed to prioritize interactive workloads over background CPU-intensive tasks. It's particularly suitable for: 19 | 20 | - Low-latency interactive applications 21 | - Gaming 22 | - Video conferencing 23 | - Live streaming 24 | 25 | The scheduler consists of two main components: 26 | 1. A BPF component that implements low-level sched-ext functionalities 27 | 2. A user-space scheduler written in Go with scx_goland_core that implements the actual scheduling policy 28 | 29 | ## Key Features 30 | 31 | - Virtual runtime (vruntime) based scheduling 32 | - Latency-sensitive task prioritization 33 | - Dynamic time slice adjustment 34 | - CPU topology aware task placement 35 | - Automatic idle CPU selection 36 | 37 | ## How It Works 38 | 39 | The scheduling policy is based on virtual runtime: 40 | - Each task receives a time slice of execution (slice_ns) 41 | - The actual execution time is adjusted based on the task's static priority (weight) 42 | - Tasks are dispatched from lowest to highest vruntime 43 | - Latency-sensitive tasks receive a priority boost based on voluntary context switches 44 | 45 | ## Building 46 | 47 | Prerequisites: 48 | - Go 1.24+ (per go.mod) 49 | - LLVM/Clang 17+ 50 | - libbpf 51 | - Linux kernel 6.12+ with sched_ext support 52 | 53 | ## Usage 54 | 55 | ### Setting Up Dependencies 56 | 57 | First, clone the required dependencies: 58 | 59 | ```bash 60 | make dep 61 | git submodule init 62 | git submodule sync 63 | git submodule update 64 | cd scx 65 | meson setup build --prefix ~ 66 | meson compile -C build 67 | ``` 68 | 69 | This will clone libbpf and the custom libbpfgo fork needed for the project. 
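Before building, it is worth confirming that the running kernel actually exposes sched_ext (see the prerequisites above). The paths below may vary by distribution, so treat this as a suggested sanity check rather than part of the official build flow:

```bash
# The kernel must be built with CONFIG_SCHED_CLASS_EXT=y
grep CONFIG_SCHED_CLASS_EXT /boot/config-$(uname -r)

# Kernels with sched_ext expose a state file; it reads "disabled"
# until a scheduler such as qumun is attached
cat /sys/kernel/sched_ext/state
```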
70 | 71 | ### Building the Scheduler 72 | 73 | Build the scheduler with: 74 | 75 | ```bash 76 | make build 77 | ``` 78 | 79 | This compiles the BPF program, builds libbpf, generates the skeleton, and builds the Go application. 80 | 81 | ### Testing the Scheduler 82 | 83 | To test the scheduler in a virtual environment using kernel v6.12.2: 84 | 85 | ```bash 86 | make test 87 | ``` 88 | 89 | This uses `vng` (virtme-ng's virtual kernel playground) to run the scheduler with the appropriate kernel version. 90 | 91 | ### Running in Production 92 | 93 | To run the scheduler on your system: 94 | 95 | ```bash 96 | sudo ./main 97 | ``` 98 | 99 | The scheduler will run until terminated with Ctrl+C (SIGINT) or SIGTERM. 100 | 101 | ### Debugging 102 | 103 | If you need to inspect the BPF components, you can use: 104 | 105 | ```bash 106 | sudo bpftool prog list # List loaded BPF programs 107 | sudo bpftool map list # List BPF maps 108 | sudo cat /sys/kernel/debug/tracing/trace_pipe # View BPF trace output 109 | ``` 110 | 111 | ### Stress Testing with `stress-ng` 112 | 113 | ```bash 114 | stress-ng -c 20 --timeout 20s --metrics-brief 115 | ``` 116 | 117 | ## License 118 | 119 | This software is distributed under the terms of the GNU General Public License version 2. 120 | 121 | ## Contributing 122 | 123 | Contributions are welcome! Please feel free to submit pull requests or open issues for bugs and feature requests. -------------------------------------------------------------------------------- /goland_core/task.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import ( 4 | "context" 5 | "encoding/binary" 6 | "fmt" 7 | "os" 8 | "runtime" 9 | "strconv" 10 | "strings" 11 | "unsafe" 12 | 13 | "github.com/Gthulhu/plugin/models" 14 | ) 15 | 16 | func (s *Sched) BlockTilReadyForDequeue(ctx context.Context) { 17 | for { 18 | select { 19 | case t, ok := <-s.queue: 20 | if !ok { 21 | runtime.Gosched() 22 | continue 23 | } 24 | s.queue <- t 25 | return 26 | case <-ctx.Done(): 27 | return 28 | } 29 | } 30 | } 31 | 32 | func (s *Sched) ReadyForDequeue() bool { 33 | select { 34 | case t, ok := <-s.queue: 35 | if !ok { 36 | return false 37 | } 38 | s.queue <- t 39 | return true 40 | default: 41 | return false 42 | } 43 | } 44 | 45 | func (s *Sched) DequeueTask(task *models.QueuedTask) { 46 | select { 47 | case t := <-s.queue: 48 | err := fastDecode(t, task) 49 | if err != nil { 50 | task.Pid = -1 51 | return 52 | } 53 | return 54 | default: 55 | task.Pid = -1 56 | return 57 | } 58 | } 59 | 60 | // Task queued for dispatching to the BPF component (see bpf_intf::dispatched_task_ctx). 61 | type DispatchedTask struct { 62 | Pid int32 // pid that uniquely identifies a task 63 | Cpu int32 // target CPU selected by the scheduler 64 | Flags uint64 // special dispatch flags 65 | SliceNs uint64 // time slice assigned to the task (0 = default) 66 | Vtime uint64 // task deadline / vruntime 67 | CpuMaskCnt uint64 // cpumask generation counter (private) 68 | } 69 | 70 | // NewDispatchedTask creates a DispatchedTask from a QueuedTask. 
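// A SliceNs of zero asks the BPF side to apply its default time slice, and
// Vtime carries the task's deadline/vruntime (see dispatched_task_ctx in intf.h).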
71 | func NewDispatchedTask(task *models.QueuedTask) *DispatchedTask { 72 | return &DispatchedTask{ 73 | Pid: task.Pid, 74 | Cpu: task.Cpu, 75 | Flags: task.Flags, 76 | SliceNs: 0, // use default time slice 77 | Vtime: 0, 78 | } 79 | } 80 | 81 | // func (s *Sched) DispatchTask(t *DispatchedTask) error { 82 | // if err := s.urb.Error(); err != nil { 83 | // return err 84 | // } 85 | // s.dispatch <- fastEncode(t) 86 | // return nil 87 | // } 88 | 89 | func (s *Sched) DispatchTask(t *DispatchedTask) error { 90 | return s.urb.Submit(fastEncode(t)) 91 | } 92 | 93 | func fastDecode(data []byte, task *models.QueuedTask) error { 94 | if len(data) < int(unsafe.Sizeof(models.QueuedTask{})) { 95 | return fmt.Errorf("data length is less than QueuedTask size") 96 | } 97 | task.Pid = int32(binary.LittleEndian.Uint32(data[0:4])) 98 | task.Cpu = int32(binary.LittleEndian.Uint32(data[4:8])) 99 | task.NrCpusAllowed = binary.LittleEndian.Uint64(data[8:16]) 100 | task.Flags = binary.LittleEndian.Uint64(data[16:24]) 101 | task.StartTs = binary.LittleEndian.Uint64(data[24:32]) 102 | task.StopTs = binary.LittleEndian.Uint64(data[32:40]) 103 | task.SumExecRuntime = binary.LittleEndian.Uint64(data[40:48]) 104 | task.Weight = binary.LittleEndian.Uint64(data[48:56]) 105 | task.Vtime = binary.LittleEndian.Uint64(data[56:64]) 106 | task.Tgid = int32(binary.LittleEndian.Uint32(data[64:68])) 107 | 108 | return nil 109 | } 110 | 111 | func fastEncode(t *DispatchedTask) []byte { 112 | data := make([]byte, 8*8) // 64 bytes 113 | 114 | binary.LittleEndian.PutUint32(data[0:4], uint32(t.Pid)) 115 | binary.LittleEndian.PutUint32(data[4:8], uint32(t.Cpu)) 116 | binary.LittleEndian.PutUint64(data[8:16], t.Flags) 117 | binary.LittleEndian.PutUint64(data[16:24], t.SliceNs) 118 | binary.LittleEndian.PutUint64(data[24:32], t.Vtime) 119 | binary.LittleEndian.PutUint64(data[32:40], t.CpuMaskCnt) 120 | 121 | return data 122 | } 123 | 124 | func IsSMTActive() (bool, error) { 125 | data, err := os.ReadFile("/sys/devices/system/cpu/smt/active") 126 | if err != nil { 127 | return false, err 128 | } 129 | 130 | contents := strings.TrimSpace(string(data)) 131 | smtActive, err := strconv.Atoi(contents) 132 | if err != nil { 133 | return false, err 134 | } 135 | 136 | return smtActive == 1, nil 137 | } 138 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Architecture configuration (default: x86_64, can override with ARCH=arm64) 2 | # 3 | # Usage: 4 | # make build # Build for x86_64 (default) 5 | # make build ARCH=arm64 # Build for ARM64 (requires cross-compilation tools) 6 | # 7 | # ARM64 cross-compilation requirements: 8 | # 1. Install cross-compiler: sudo apt-get install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu 9 | # 2. Install ARM64 dev libraries: sudo apt-get install libelf-dev:arm64 zlib1g-dev:arm64 libzstd-dev:arm64 10 | # 3. 
May need to configure dpkg for multi-arch: sudo dpkg --add-architecture arm64 11 | # 12 | ARCH ?= x86_64 13 | 14 | # Architecture-specific settings 15 | ifeq ($(ARCH),arm64) 16 | ARCH_DEFINE = -D__TARGET_ARCH_arm64 17 | ARCH_CPU_FLAGS = -mcpu=v3 18 | ARCH_SCHED_INCLUDE = -I scx/scheds/include/arch/aarch64 19 | ARCH_INCLUDE_DIR = aarch64-linux-gnu 20 | GOARCH_ENV = CGO_ENABLED=1 GOARCH=arm64 21 | CGO_CC = aarch64-linux-gnu-gcc 22 | LIBBPF_CC = aarch64-linux-gnu-gcc 23 | else 24 | ARCH_DEFINE = -D__TARGET_ARCH_x86 25 | ARCH_CPU_FLAGS = -mcpu=v3 26 | ARCH_SCHED_INCLUDE = -I scx/scheds/include/arch/x86 27 | ARCH_INCLUDE_DIR = x86_64-linux-gnu 28 | GOARCH_ENV = 29 | CGO_CC = clang 30 | LIBBPF_CC = gcc 31 | endif 32 | 33 | OUTPUT = output 34 | LIBBPF_SRC = $(abspath libbpf/src) 35 | LIBBPF_OBJ = $(abspath $(OUTPUT)/libbpf.a) 36 | LIBBPF_OBJDIR = $(abspath ./$(OUTPUT)/libbpf) 37 | LIBBPF_DESTDIR = $(abspath ./$(OUTPUT)) 38 | 39 | 40 | TARGET = main 41 | BPF_TARGET = ${TARGET:=.bpf} 42 | BPF_C = ${BPF_TARGET:=.c} 43 | BPF_OBJ = ${BPF_C:.c=.o} 44 | 45 | BASEDIR = $(abspath .) 46 | OUTPUT = output 47 | LIBBPF_INCLUDE_UAPI = $(abspath ./libbpf/include/uapi) 48 | LIBBPF_OBJ = $(abspath $(OUTPUT)/libbpf.a) 49 | LIBBPF_OBJDIR = $(abspath ./$(OUTPUT)/libbpf) 50 | LIBBPF_DESTDIR = $(abspath ./$(OUTPUT)) 51 | CLANG_BPF_SYS_INCLUDES := $(shell $(CLANG) -v -E - </dev/null 2>&1 | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') 52 | CGOFLAG = $(GOARCH_ENV) CC=$(CGO_CC) CGO_CFLAGS="-I$(BASEDIR) -I$(BASEDIR)/$(OUTPUT)" CGO_LDFLAGS="-lelf -lz $(LIBBPF_OBJ) -lzstd $(BASEDIR)/libwrapper.a" 53 | 54 | .PHONY: build 55 | build: clean $(BPF_OBJ) libbpf libbpf-uapi wrapper 56 | $(CGOFLAG) go build -ldflags "-w -s" main.go 57 | 58 | test: build 59 | @echo "Running scheduler test..." 
60 | @chmod +x scripts/test_scheduler.sh 61 | @vng -r v6.12.2 -- bash scripts/test_scheduler.sh 62 | 63 | .PHONY: libbpf-uapi 64 | libbpf-uapi: $(LIBBPF_SRC) 65 | UAPIDIR=$(LIBBPF_DESTDIR) \ 66 | $(MAKE) -C $(LIBBPF_SRC) install_uapi_headers 67 | 68 | .PHONY: libbpf 69 | libbpf: $(LIBBPF_SRC) $(wildcard $(LIBBPF_SRC)/*.[ch]) 70 | $(MAKE) -C $(LIBBPF_SRC) clean 71 | CC="$(LIBBPF_CC)" CFLAGS="-g -O2 -Wall -fpie" \ 72 | $(MAKE) -C $(LIBBPF_SRC) \ 73 | BUILD_STATIC_ONLY=1 \ 74 | OBJDIR=$(LIBBPF_OBJDIR) \ 75 | DESTDIR=$(LIBBPF_DESTDIR) \ 76 | INCLUDEDIR= LIBDIR= UAPIDIR= install 77 | $(eval STATIC=-extldflags -static) 78 | 79 | dep: 80 | git clone https://github.com/libbpf/libbpf.git && \ 81 | cd libbpf/src && \ 82 | git checkout 09b9e83 && \ 83 | make && \ 84 | sudo make install && \ 85 | cd - && \ 86 | git clone -b feat/skel https://github.com/Gthulhu/libbpfgo.git 87 | 88 | $(BPF_OBJ): %.o: %.c 89 | clang-17 \ 90 | -O2 -g -Wall -target bpf \ 91 | $(ARCH_DEFINE) $(ARCH_CPU_FLAGS) -mlittle-endian \ 92 | -idirafter /usr/lib/llvm-17/lib/clang/17/include -idirafter /usr/local/include -idirafter /usr/include/$(ARCH_INCLUDE_DIR) -idirafter /usr/include \ 93 | -I scx/scheds/vmlinux -I scx/build/libbpf/src/usr/include -I scx/build/libbpf/include/uapi -I scx/scheds/include $(ARCH_SCHED_INCLUDE) -I scx/scheds/include/bpf-compat -I scx/scheds/include/lib \ 94 | -Wno-compare-distinct-pointer-types \ 95 | -c $< -o $@ 96 | 97 | wrapper: 98 | bpftool gen skeleton main.bpf.o > main.skeleton.h 99 | $(CGO_CC) -g -O2 -Wall -fPIC -I scx/build/libbpf/src/usr/include -I scx/build/libbpf/include/uapi -I scx/scheds/include $(ARCH_SCHED_INCLUDE) -I scx/scheds/include/bpf-compat -I scx/scheds/include/lib -c wrapper.c -o wrapper.o 100 | ar rcs libwrapper.a wrapper.o 101 | 102 | clean: 103 | rm libwrapper.a || true 104 | rm *.skeleton.h || true 105 | rm *.ll *.o || true 106 | rm main || true -------------------------------------------------------------------------------- /wrapper.c: -------------------------------------------------------------------------------- 1 | #include "wrapper.h" 2 | 3 | struct main_bpf *global_obj; 4 | 5 | void *open_skel() { 6 | struct main_bpf *obj = NULL; 7 | obj = main_bpf__open(); 8 | main_bpf__create_skeleton(obj); 9 | global_obj = obj; 10 | return obj->obj; 11 | } 12 | 13 | u32 get_usersched_pid() { 14 | return global_obj->rodata->usersched_pid; 15 | } 16 | 17 | void set_usersched_pid(u32 id) { 18 | global_obj->rodata->usersched_pid = id; 19 | } 20 | 21 | void set_kugepagepid(u32 id) { 22 | global_obj->rodata->khugepaged_pid = id; 23 | } 24 | 25 | void set_early_processing(bool enabled) { 26 | global_obj->rodata->early_processing = enabled; 27 | } 28 | 29 | void set_default_slice(u64 t) { 30 | global_obj->rodata->default_slice = t; 31 | } 32 | 33 | void set_debug(bool enabled) { 34 | global_obj->rodata->debug = enabled; 35 | } 36 | 37 | void set_builtin_idle(bool enabled) { 38 | global_obj->rodata->builtin_idle = enabled; 39 | } 40 | 41 | u64 get_nr_scheduled() { 42 | return global_obj->bss->nr_scheduled; 43 | } 44 | 45 | u64 get_nr_queued() { 46 | return global_obj->bss->nr_queued; 47 | } 48 | 49 | void notify_complete(u64 nr_pending) { 50 | global_obj->bss->nr_scheduled = nr_pending; 51 | } 52 | 53 | void sub_nr_queued() { 54 | if (global_obj->bss->nr_queued){ 55 | global_obj->bss->nr_queued--; 56 | } 57 | } 58 | 59 | void dec_nr_queued(u64 num) { 60 | if (global_obj->bss->nr_queued){ 61 | global_obj->bss->nr_queued-=num; 62 | } 63 | } 64 | 65 | void destroy_skel(void*skel) { 66 | 
main_bpf__destroy(skel); 67 | } 68 | 69 | void set_scx_enums( 70 | u64 SCX_OPS_NAME_LEN, 71 | u64 SCX_SLICE_DFL, 72 | u64 SCX_SLICE_INF, 73 | u64 SCX_RQ_ONLINE, 74 | u64 SCX_RQ_CAN_STOP_TICK, 75 | u64 SCX_RQ_BAL_PENDING, 76 | u64 SCX_RQ_BAL_KEEP, 77 | u64 SCX_RQ_BYPASSING, 78 | u64 SCX_RQ_CLK_VALID, 79 | u64 SCX_RQ_IN_WAKEUP, 80 | u64 SCX_RQ_IN_BALANCE, 81 | u64 SCX_DSQ_FLAG_BUILTIN, 82 | u64 SCX_DSQ_FLAG_LOCAL_ON, 83 | u64 SCX_DSQ_INVALID, 84 | u64 SCX_DSQ_GLOBAL, 85 | u64 SCX_DSQ_LOCAL, 86 | u64 SCX_DSQ_LOCAL_ON, 87 | u64 SCX_DSQ_LOCAL_CPU_MASK, 88 | u64 SCX_TASK_QUEUED, 89 | u64 SCX_TASK_RESET_RUNNABLE_AT, 90 | u64 SCX_TASK_DEQD_FOR_SLEEP, 91 | u64 SCX_TASK_STATE_SHIFT, 92 | u64 SCX_TASK_STATE_BITS, 93 | u64 SCX_TASK_STATE_MASK, 94 | u64 SCX_TASK_CURSOR, 95 | u64 SCX_TASK_NONE, 96 | u64 SCX_TASK_INIT, 97 | u64 SCX_TASK_READY, 98 | u64 SCX_TASK_ENABLED, 99 | u64 SCX_TASK_NR_STATES, 100 | u64 SCX_TASK_DSQ_ON_PRIQ, 101 | u64 SCX_KICK_IDLE, 102 | u64 SCX_KICK_PREEMPT, 103 | u64 SCX_KICK_WAIT, 104 | u64 SCX_ENQ_WAKEUP, 105 | u64 SCX_ENQ_HEAD, 106 | u64 SCX_ENQ_PREEMPT, 107 | u64 SCX_ENQ_REENQ, 108 | u64 SCX_ENQ_LAST, 109 | u64 SCX_ENQ_CLEAR_OPSS, 110 | u64 SCX_ENQ_DSQ_PRIQ 111 | ) { 112 | if (!global_obj || !global_obj->rodata) return; 113 | global_obj->rodata->__SCX_OPS_NAME_LEN = SCX_OPS_NAME_LEN; 114 | global_obj->rodata->__SCX_SLICE_DFL = SCX_SLICE_DFL; 115 | global_obj->rodata->__SCX_SLICE_INF = SCX_SLICE_INF; 116 | global_obj->rodata->__SCX_RQ_ONLINE = SCX_RQ_ONLINE; 117 | global_obj->rodata->__SCX_RQ_CAN_STOP_TICK = SCX_RQ_CAN_STOP_TICK; 118 | global_obj->rodata->__SCX_RQ_BAL_PENDING = SCX_RQ_BAL_PENDING; 119 | global_obj->rodata->__SCX_RQ_BAL_KEEP = SCX_RQ_BAL_KEEP; 120 | global_obj->rodata->__SCX_RQ_BYPASSING = SCX_RQ_BYPASSING; 121 | global_obj->rodata->__SCX_RQ_CLK_VALID = SCX_RQ_CLK_VALID; 122 | global_obj->rodata->__SCX_RQ_IN_WAKEUP = SCX_RQ_IN_WAKEUP; 123 | global_obj->rodata->__SCX_RQ_IN_BALANCE = SCX_RQ_IN_BALANCE; 124 | global_obj->rodata->__SCX_DSQ_FLAG_BUILTIN = SCX_DSQ_FLAG_BUILTIN; 125 | global_obj->rodata->__SCX_DSQ_FLAG_LOCAL_ON = SCX_DSQ_FLAG_LOCAL_ON; 126 | global_obj->rodata->__SCX_DSQ_INVALID = SCX_DSQ_INVALID; 127 | global_obj->rodata->__SCX_DSQ_GLOBAL = SCX_DSQ_GLOBAL; 128 | global_obj->rodata->__SCX_DSQ_LOCAL = SCX_DSQ_LOCAL; 129 | global_obj->rodata->__SCX_DSQ_LOCAL_ON = SCX_DSQ_LOCAL_ON; 130 | global_obj->rodata->__SCX_DSQ_LOCAL_CPU_MASK = SCX_DSQ_LOCAL_CPU_MASK; 131 | global_obj->rodata->__SCX_TASK_QUEUED = SCX_TASK_QUEUED; 132 | global_obj->rodata->__SCX_TASK_RESET_RUNNABLE_AT = SCX_TASK_RESET_RUNNABLE_AT; 133 | global_obj->rodata->__SCX_TASK_DEQD_FOR_SLEEP = SCX_TASK_DEQD_FOR_SLEEP; 134 | global_obj->rodata->__SCX_TASK_STATE_SHIFT = SCX_TASK_STATE_SHIFT; 135 | global_obj->rodata->__SCX_TASK_STATE_BITS = SCX_TASK_STATE_BITS; 136 | global_obj->rodata->__SCX_TASK_STATE_MASK = SCX_TASK_STATE_MASK; 137 | global_obj->rodata->__SCX_TASK_CURSOR = SCX_TASK_CURSOR; 138 | global_obj->rodata->__SCX_TASK_NONE = SCX_TASK_NONE; 139 | global_obj->rodata->__SCX_TASK_INIT = SCX_TASK_INIT; 140 | global_obj->rodata->__SCX_TASK_READY = SCX_TASK_READY; 141 | global_obj->rodata->__SCX_TASK_ENABLED = SCX_TASK_ENABLED; 142 | global_obj->rodata->__SCX_TASK_NR_STATES = SCX_TASK_NR_STATES; 143 | global_obj->rodata->__SCX_TASK_DSQ_ON_PRIQ = SCX_TASK_DSQ_ON_PRIQ; 144 | global_obj->rodata->__SCX_KICK_IDLE = SCX_KICK_IDLE; 145 | global_obj->rodata->__SCX_KICK_PREEMPT = SCX_KICK_PREEMPT; 146 | global_obj->rodata->__SCX_KICK_WAIT = SCX_KICK_WAIT; 147 | global_obj->rodata->__SCX_ENQ_WAKEUP = 
SCX_ENQ_WAKEUP; 148 | global_obj->rodata->__SCX_ENQ_HEAD = SCX_ENQ_HEAD; 149 | global_obj->rodata->__SCX_ENQ_PREEMPT = SCX_ENQ_PREEMPT; 150 | global_obj->rodata->__SCX_ENQ_REENQ = SCX_ENQ_REENQ; 151 | global_obj->rodata->__SCX_ENQ_LAST = SCX_ENQ_LAST; 152 | global_obj->rodata->__SCX_ENQ_CLEAR_OPSS = SCX_ENQ_CLEAR_OPSS; 153 | global_obj->rodata->__SCX_ENQ_DSQ_PRIQ = SCX_ENQ_DSQ_PRIQ; 154 | } -------------------------------------------------------------------------------- /goland_core/rodata.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | /* 4 | #include "wrapper.h" 5 | */ 6 | import "C" 7 | 8 | import ( 9 | "bytes" 10 | "encoding/binary" 11 | "fmt" 12 | "os" 13 | "path/filepath" 14 | "strconv" 15 | "strings" 16 | "unsafe" 17 | 18 | bpf "github.com/aquasecurity/libbpfgo" 19 | ) 20 | 21 | type RodataMap struct { 22 | *bpf.BPFMap 23 | } 24 | 25 | type Rodata struct { 26 | DefaultSlice uint64 `json:"default_slice"` 27 | SmtEnabled bool `json:"smt_enabled"` 28 | Debug bool `json:"debug"` 29 | Pad0 [54]byte `json:"-"` // __pad0[54] 30 | SCXOpsNameLen uint64 `json:"scx_ops_name_len"` 31 | SCXSliceDfl uint64 `json:"scx_slice_dfl"` 32 | SCXSliceInf uint64 `json:"scx_slice_inf"` 33 | SCXRqOnline uint64 `json:"scx_rq_online"` 34 | SCXRqCanStopTick uint64 `json:"scx_rq_can_stop_tick"` 35 | SCXRqBalPending uint64 `json:"scx_rq_bal_pending"` 36 | SCXRqBalKeep uint64 `json:"scx_rq_bal_keep"` 37 | SCXRqBypassing uint64 `json:"scx_rq_bypassing"` 38 | SCXRqClkValid uint64 `json:"scx_rq_clk_valid"` 39 | SCXRqInWakeup uint64 `json:"scx_rq_in_wakeup"` 40 | SCXRqInBalance uint64 `json:"scx_rq_in_balance"` 41 | SCXDsqFlagBuiltin uint64 `json:"scx_dsq_flag_builtin"` 42 | SCXDsqFlagLocalOn uint64 `json:"scx_dsq_flag_local_on"` 43 | SCXDsqInvalid uint64 `json:"scx_dsq_invalid"` 44 | SCXDsqGlobal uint64 `json:"scx_dsq_global"` 45 | SCXDsqLocal uint64 `json:"scx_dsq_local"` 46 | SCXDsqLocalOn uint64 `json:"scx_dsq_local_on"` 47 | SCXDsqLocalCpuMask uint64 `json:"scx_dsq_local_cpu_mask"` 48 | SCXTaskQueued uint64 `json:"scx_task_queued"` 49 | SCXTaskResetRunnableAt uint64 `json:"scx_task_reset_runnable_at"` 50 | SCXTaskDeqdForSleep uint64 `json:"scx_task_deqd_for_sleep"` 51 | SCXTaskStateShift uint64 `json:"scx_task_state_shift"` 52 | SCXTaskStateBits uint64 `json:"scx_task_state_bits"` 53 | SCXTaskStateMask uint64 `json:"scx_task_state_mask"` 54 | SCXTaskCursor uint64 `json:"scx_task_cursor"` 55 | SCXTaskNone uint64 `json:"scx_task_none"` 56 | SCXTaskInit uint64 `json:"scx_task_init"` 57 | SCXTaskReady uint64 `json:"scx_task_ready"` 58 | SCXTaskEnabled uint64 `json:"scx_task_enabled"` 59 | SCXTaskNrStates uint64 `json:"scx_task_nr_states"` 60 | SCXTaskDsqOnPriq uint64 `json:"scx_task_dsq_on_priq"` 61 | SCXKickIdle uint64 `json:"scx_kick_idle"` 62 | SCXKickPreempt uint64 `json:"scx_kick_preempt"` 63 | SCXKickWait uint64 `json:"scx_kick_wait"` 64 | SCXEnqWakeup uint64 `json:"scx_enq_wakeup"` 65 | SCXEnqHead uint64 `json:"scx_enq_head"` 66 | SCXEnqPreempt uint64 `json:"scx_enq_preempt"` 67 | SCXEnqReenq uint64 `json:"scx_enq_reenq"` 68 | SCXEnqLast uint64 `json:"scx_enq_last"` 69 | SCXEnqClearOpss uint64 `json:"scx_enq_clear_opss"` 70 | SCXEnqDsqPriq uint64 `json:"scx_enq_dsq_priq"` 71 | UeiDumpLen uint32 `json:"uei_dump_len"` 72 | UserschedPid uint32 `json:"usersched_pid"` 73 | KhugepagePid uint32 `json:"khugepage_pid"` 74 | SwitchPartial bool `json:"switch_partial"` 75 | EarlyProcessing bool `json:"early_processing"` 76 | BuiltinIdle bool 
`json:"builtin_idle"` 77 | } 78 | 79 | func (s *Sched) GetRoData() (Rodata, error) { 80 | if s.rodata == nil { 81 | return Rodata{}, fmt.Errorf("BssMap is nil") 82 | } 83 | i := 0 84 | b, err := s.rodata.BPFMap.GetValue(unsafe.Pointer(&i)) 85 | if err != nil { 86 | return Rodata{}, err 87 | } 88 | var ro Rodata 89 | buff := bytes.NewBuffer(b) 90 | err = binary.Read(buff, binary.LittleEndian, &ro) 91 | if err != nil { 92 | return Rodata{}, err 93 | } 94 | return ro, nil 95 | } 96 | 97 | func (s *Sched) AssignUserSchedPid(pid int) error { 98 | C.set_kugepagepid(C.u32(KhugepagePid())) 99 | C.set_usersched_pid(C.u32(pid)) 100 | return nil 101 | } 102 | 103 | func (s *Sched) SetDebug(enabled bool) { 104 | C.set_debug(C.bool(enabled)) 105 | } 106 | 107 | func (s *Sched) SetBuiltinIdle(enabled bool) { 108 | C.set_builtin_idle(C.bool(enabled)) 109 | } 110 | 111 | func (s *Sched) SetEarlyProcessing(enabled bool) { 112 | C.set_early_processing(C.bool(enabled)) 113 | } 114 | 115 | func (s *Sched) SetDefaultSlice(t uint64) { 116 | C.set_default_slice(C.u64(t)) 117 | } 118 | 119 | // KhugepagePid finds and returns the PID of the khugepaged process 120 | func KhugepagePid() uint32 { 121 | procDir := "/proc" 122 | 123 | // Read all entries in /proc 124 | entries, err := os.ReadDir(procDir) 125 | if err != nil { 126 | return 0 127 | } 128 | 129 | for _, entry := range entries { 130 | // Skip non-directories and non-numeric directories 131 | if !entry.IsDir() { 132 | continue 133 | } 134 | 135 | pidStr := entry.Name() 136 | // Check if directory name is numeric (PID) 137 | if _, err := strconv.Atoi(pidStr); err != nil { 138 | continue 139 | } 140 | 141 | // Read the comm file to get process name 142 | commPath := filepath.Join(procDir, pidStr, "comm") 143 | commData, err := os.ReadFile(commPath) 144 | if err != nil { 145 | continue 146 | } 147 | 148 | comm := strings.TrimSpace(string(commData)) 149 | if comm != "khugepaged" { 150 | continue 151 | } 152 | 153 | // Check if exe symlink exists (should not exist for kernel threads like khugepaged) 154 | exePath := filepath.Join(procDir, pidStr, "exe") 155 | if _, err := os.Readlink(exePath); err == nil { 156 | // exe symlink exists, this is not a kernel thread 157 | continue 158 | } 159 | 160 | // Convert PID string to uint32 161 | if pid, err := strconv.ParseUint(pidStr, 10, 32); err == nil { 162 | return uint32(pid) 163 | } 164 | } 165 | 166 | return 0 167 | } 168 | -------------------------------------------------------------------------------- /goland_core/obj.go: -------------------------------------------------------------------------------- 1 | package core 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "syscall" 7 | "unsafe" 8 | 9 | "github.com/Gthulhu/plugin/models" 10 | "github.com/Gthulhu/plugin/plugin" 11 | bpf "github.com/aquasecurity/libbpfgo" 12 | "github.com/cilium/ebpf" 13 | "golang.org/x/sys/unix" 14 | ) 15 | 16 | const ( 17 | RL_CPU_ANY = 1 << 20 18 | ) 19 | 20 | type Sched struct { 21 | mod *bpf.Module 22 | plugin plugin.CustomScheduler 23 | bss *BssMap 24 | uei *UeiMap 25 | rodata *RodataMap 26 | structOps *bpf.BPFMap 27 | urb *bpf.UserRingBuffer 28 | queue chan []byte // The map containing tasks that are queued to user space from the kernel. 
29 | dispatch chan []byte 30 | preemptCpu *ebpf.Program 31 | siblingCpu *ebpf.Program 32 | selectCpuPrg *ebpf.Program // Cilium eBPF program for syscall-based invocation 33 | } 34 | 35 | func init() { 36 | unix.Mlockall(syscall.MCL_CURRENT | syscall.MCL_FUTURE) 37 | } 38 | 39 | func LoadSched(objPath string) *Sched { 40 | obj := LoadSkel() 41 | bpfModule, err := bpf.NewModuleFromFileArgs(bpf.NewModuleArgs{ 42 | BPFObjPath: "", 43 | KernelLogLevel: 0, 44 | }) 45 | if err != nil { 46 | panic(err) 47 | } 48 | if err := bpfModule.BPFReplaceExistedObject(obj); err != nil { 49 | panic(err) 50 | } 51 | 52 | s := &Sched{ 53 | mod: bpfModule, 54 | } 55 | 56 | return s 57 | } 58 | 59 | func (s *Sched) SetPlugin(p plugin.CustomScheduler) { 60 | s.plugin = p 61 | } 62 | 63 | func (s *Sched) Start() { 64 | var err error 65 | bpfModule := s.mod 66 | bpfModule.BPFLoadObject() 67 | iters := bpfModule.Iterator() 68 | for { 69 | prog := iters.NextProgram() 70 | if prog == nil { 71 | break 72 | } 73 | if prog.Name() == "kprobe_handle_mm_fault" { 74 | log.Println("attach kprobe_handle_mm_fault") 75 | _, err := prog.AttachGeneric() 76 | if err != nil { 77 | log.Panicf("attach kprobe_handle_mm_fault failed: %v", err) 78 | } 79 | continue 80 | } 81 | if prog.Name() == "kretprobe_handle_mm_fault" { 82 | log.Println("attach kretprobe_handle_mm_fault") 83 | _, err := prog.AttachGeneric() 84 | if err != nil { 85 | log.Panicf("attach kretprobe_handle_mm_fault failed: %v", err) 86 | } 87 | continue 88 | } 89 | } 90 | iters = bpfModule.Iterator() 91 | for { 92 | m := iters.NextMap() 93 | if m == nil { 94 | break 95 | } 96 | fmt.Printf("map: %s, type: %s, fd: %d\n", m.Name(), m.Type().String(), m.FileDescriptor()) 97 | if m.Name() == "main_bpf.bss" { 98 | s.bss = &BssMap{m} 99 | } else if m.Name() == "main_bpf.data" { 100 | s.uei = &UeiMap{m} 101 | } else if m.Name() == "main_bpf.rodata" { 102 | s.rodata = &RodataMap{m} 103 | } else if m.Name() == "queued" { 104 | s.queue = make(chan []byte, 128) 105 | rb, err := s.mod.InitRingBuf("queued", s.queue) 106 | if err != nil { 107 | panic(err) 108 | } 109 | rb.Poll(10) 110 | } else if m.Name() == "dispatched" { 111 | s.dispatch = make(chan []byte, 128) 112 | s.urb, err = s.mod.InitUserRingBuf("dispatched", s.dispatch) 113 | if err != nil { 114 | panic(err) 115 | } 116 | // s.urb.Start() 117 | } 118 | if m.Type().String() == "BPF_MAP_TYPE_STRUCT_OPS" { 119 | s.structOps = m 120 | } 121 | } 122 | 123 | iters = bpfModule.Iterator() 124 | for { 125 | prog := iters.NextProgram() 126 | if prog == nil { 127 | break 128 | } 129 | 130 | if prog.Name() == "rs_select_cpu" { 131 | if ciliumProg, err := ebpf.NewProgramFromFD(prog.FileDescriptor()); err == nil { 132 | s.selectCpuPrg = ciliumProg 133 | } 134 | } 135 | 136 | if prog.Name() == "enable_sibling_cpu" { 137 | if ciliumProg, err := ebpf.NewProgramFromFD(prog.FileDescriptor()); err == nil { 138 | s.siblingCpu = ciliumProg 139 | } 140 | } 141 | 142 | if prog.Name() == "do_preempt" { 143 | if ciliumProg, err := ebpf.NewProgramFromFD(prog.FileDescriptor()); err == nil { 144 | s.preemptCpu = ciliumProg 145 | } 146 | } 147 | } 148 | } 149 | 150 | type task_cpu_arg struct { 151 | pid int32 152 | cpu int32 153 | flags uint64 154 | } 155 | 156 | var selectFailed error = fmt.Errorf("prog (selectCpu) not found") 157 | 158 | func (s *Sched) DefaultSelectCPU(t *models.QueuedTask) (error, int32) { 159 | return s.selectCPU(t) 160 | } 161 | 162 | func (s *Sched) selectCPU(t *models.QueuedTask) (error, int32) { 163 | if s.selectCpuPrg == nil { 164 
| return selectFailed, 0 165 | } 166 | 167 | arg := task_cpu_arg{ 168 | pid: t.Pid, 169 | cpu: t.Cpu, 170 | flags: t.Flags, 171 | } 172 | 173 | data := (*[16]byte)(unsafe.Pointer(&arg))[:] 174 | 175 | ret, err := s.selectCpuPrg.Run(&ebpf.RunOptions{ 176 | Context: data[:], 177 | }) 178 | if err != nil { 179 | return err, 0 180 | } 181 | 182 | retVal := int32(ret) 183 | if ret > 2147483647 { // the BPF program returned a negative s32: no specific CPU was picked 184 | return nil, RL_CPU_ANY 185 | } 186 | return nil, retVal 187 | } 188 | 189 | type preempt_arg struct { 190 | cpuId int32 191 | } 192 | 193 | type domain_arg struct { 194 | lvlId int32 195 | cpuId int32 196 | siblingCpuId int32 197 | } 198 | 199 | func (s *Sched) PreemptCpu(cpuId int32) error { 200 | if s.preemptCpu == nil { 201 | return fmt.Errorf("prog (preemptCpu) not found") 202 | } 203 | 204 | arg := preempt_arg{ 205 | cpuId: cpuId, 206 | } 207 | data := (*[4]byte)(unsafe.Pointer(&arg))[:] 208 | 209 | ret, err := s.preemptCpu.Run(&ebpf.RunOptions{ 210 | Context: data[:], 211 | }) 212 | if err != nil { 213 | return err 214 | } 215 | if ret != 0 { 216 | return fmt.Errorf("retVal: %v", ret) 217 | } 218 | return nil 219 | } 220 | 221 | func (s *Sched) EnableSiblingCpu(lvlId, cpuId, siblingCpuId int32) error { 222 | if s.siblingCpu == nil { 223 | return fmt.Errorf("prog (siblingCpu) not found") 224 | } 225 | 226 | arg := domain_arg{ 227 | lvlId: lvlId, 228 | cpuId: cpuId, 229 | siblingCpuId: siblingCpuId, 230 | } 231 | data := (*[12]byte)(unsafe.Pointer(&arg))[:] 232 | 233 | ret, err := s.siblingCpu.Run(&ebpf.RunOptions{ 234 | Context: data[:], 235 | }) 236 | if err != nil { 237 | return err 238 | } 239 | if ret != 0 { 240 | return fmt.Errorf("retVal: %v", ret) 241 | } 242 | return nil 243 | } 244 | 245 | func (s *Sched) Attach() error { 246 | _, err := s.structOps.AttachStructOps() 247 | return err 248 | } 249 | 250 | func (s *Sched) Close() { 251 | if s.selectCpuPrg != nil { 252 | s.selectCpuPrg.Close() 253 | } 254 | if s.siblingCpu != nil { 255 | s.siblingCpu.Close() 256 | } 257 | if s.preemptCpu != nil { 258 | s.preemptCpu.Close() 259 | } 260 | if s.urb != nil { s.urb.Close() } 261 | s.mod.Close() 262 | } 263 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "log" 7 | "os" 8 | "os/signal" 9 | "syscall" 10 | "time" 11 | 12 | "github.com/Gthulhu/plugin/models" 13 | core "github.com/Gthulhu/qumun/goland_core" 14 | "github.com/Gthulhu/qumun/util" 15 | ) 16 | 17 | const ( 18 | MAX_LATENCY_WEIGHT = 1000 19 | SLICE_NS_DEFAULT = 5000 * 1000 // 5ms 20 | SLICE_NS_MIN = 500 * 1000 21 | SCX_ENQ_WAKEUP = 1 22 | NSEC_PER_SEC = 1000000000 // 1 second in nanoseconds 23 | PF_WQ_WORKER = 0x00000020 24 | ) 25 | 26 | const taskPoolSize = 4096 27 | 28 | var taskPool = make([]Task, taskPoolSize) 29 | var taskPoolCount = 0 30 | var taskPoolHead, taskPoolTail int 31 | 32 | func DrainQueuedTask(s *core.Sched) int { 33 | var count int 34 | for (taskPoolTail+1)%taskPoolSize != taskPoolHead { 35 | var newQueuedTask models.QueuedTask 36 | s.DequeueTask(&newQueuedTask) 37 | if newQueuedTask.Pid == -1 { 38 | s.DecNrQueued(count) 39 | return count 40 | } 41 | deadline := updatedEnqueueTask(s, &newQueuedTask) 42 | t := Task{ 43 | QueuedTask: &newQueuedTask, 44 | Deadline: deadline, 45 | } 46 | InsertTaskToPool(t) 47 | count++ 48 | } 49 | s.DecNrQueued(count); return count // pool is full: report how many tasks were drained 50 | } 51 | 52 | var timeout = uint64(3 * NSEC_PER_SEC) 53 | 54 | func updatedEnqueueTask(s *core.Sched, t 
*models.QueuedTask) uint64 { 55 | if minVruntime < t.Vtime { 56 | minVruntime = t.Vtime 57 | } 58 | minVruntimeLocal := saturating_sub(minVruntime, SLICE_NS_DEFAULT) 59 | if t.Vtime == 0 { 60 | t.Vtime = minVruntimeLocal + (SLICE_NS_DEFAULT * 100 / t.Weight) 61 | } else if t.Vtime < minVruntimeLocal { 62 | t.Vtime = minVruntimeLocal 63 | } 64 | t.Vtime += (t.StopTs - t.StartTs) * t.Weight / 100 65 | 66 | return t.Vtime + min(t.SumExecRuntime, SLICE_NS_DEFAULT*100) 67 | } 68 | 69 | func GetTaskFromPool() *models.QueuedTask { 70 | if taskPoolHead == taskPoolTail { 71 | return nil 72 | } 73 | t := &taskPool[taskPoolHead] 74 | taskPoolHead = (taskPoolHead + 1) % taskPoolSize 75 | taskPoolCount-- 76 | return t.QueuedTask 77 | } 78 | 79 | var minVruntime uint64 = 0 // global vruntime 80 | 81 | func now() uint64 { 82 | return uint64(time.Now().UnixNano()) 83 | } 84 | 85 | func calcAvg(oldVal uint64, newVal uint64) uint64 { 86 | return (oldVal - (oldVal >> 2)) + (newVal >> 2) 87 | } 88 | 89 | func saturating_sub(a, b uint64) uint64 { 90 | if a > b { 91 | return a - b 92 | } 93 | return 0 94 | } 95 | 96 | type Task struct { 97 | *models.QueuedTask 98 | Deadline uint64 99 | Timestamp uint64 100 | } 101 | 102 | func LessQueuedTask( 103 | a, b *Task, 104 | ) bool { 105 | if a.Deadline != b.Deadline { 106 | return a.Deadline < b.Deadline 107 | } 108 | if a.Timestamp != b.Timestamp { 109 | return a.Timestamp < b.Timestamp 110 | } 111 | return a.Pid < b.Pid 112 | } 113 | 114 | func InsertTaskToPool( 115 | newTask Task, 116 | ) bool { 117 | if taskPoolCount >= taskPoolSize-1 { 118 | return false 119 | } 120 | insertIdx := taskPoolTail 121 | for i := 0; i < taskPoolCount; i++ { 122 | idx := (taskPoolHead + i) % taskPoolSize 123 | if LessQueuedTask( 124 | &newTask, 125 | &taskPool[idx], 126 | ) { 127 | insertIdx = idx 128 | break 129 | } 130 | } 131 | 132 | cur := taskPoolTail 133 | for cur != insertIdx { 134 | next := (cur - 1 + taskPoolSize) % taskPoolSize 135 | taskPool[cur] = taskPool[next] 136 | cur = next 137 | } 138 | taskPool[insertIdx] = newTask 139 | taskPoolTail = (taskPoolTail + 1) % taskPoolSize 140 | taskPoolCount++ 141 | return true 142 | } 143 | 144 | func main() { 145 | bpfModule := core.LoadSched("main.bpf.o") 146 | defer bpfModule.Close() 147 | pid := os.Getpid() 148 | err := bpfModule.AssignUserSchedPid(pid) 149 | if err != nil { 150 | log.Printf("AssignUserSchedPid failed: %v", err) 151 | } 152 | 153 | err = util.ImportScxEnums() 154 | if err != nil { 155 | log.Panicf("ImportScxEnums failed: %v", err) 156 | } 157 | 158 | bpfModule.SetDebug(true) 159 | bpfModule.SetBuiltinIdle(true) 160 | bpfModule.Start() 161 | 162 | err = util.InitCacheDomains(bpfModule) 163 | if err != nil { 164 | log.Panicf("InitCacheDomains failed: %v", err) 165 | } 166 | 167 | if err := bpfModule.Attach(); err != nil { 168 | log.Panicf("bpfModule attach failed: %v", err) 169 | } 170 | 171 | log.Printf("UserSched's Pid: %v", core.GetUserSchedPid()) 172 | log.Printf("scheduler started") 173 | 174 | signalChan := make(chan os.Signal, 1) 175 | signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) 176 | cont := true 177 | timer := time.NewTicker(1 * time.Second) 178 | notifyCount := 0 179 | 180 | ctx, cancel := context.WithCancel(context.Background()) 181 | 182 | go func() { 183 | var t *models.QueuedTask 184 | var task *core.DispatchedTask 185 | var err error 186 | var cpu int32 187 | 188 | for true { 189 | select { 190 | case <-ctx.Done(): 191 | return 192 | default: 193 | } 194 | t = GetTaskFromPool() 195 | if t 
== nil { 196 | bpfModule.BlockTilReadyForDequeue(ctx) 197 | DrainQueuedTask(bpfModule) 198 | } else if t.Pid != -1 { 199 | task = core.NewDispatchedTask(t) 200 | err, cpu = bpfModule.SelectCPU(t) 201 | if err != nil { 202 | log.Printf("SelectCPU failed: %v", err) 203 | return 204 | } 205 | 206 | // Evaluate used task time slice. 207 | nrWaiting := core.GetNrQueued() + core.GetNrScheduled() + 1 208 | task.Vtime = t.Vtime 209 | task.SliceNs = max(SLICE_NS_DEFAULT/nrWaiting, SLICE_NS_MIN) 210 | task.Cpu = cpu 211 | 212 | err = bpfModule.DispatchTask(task) 213 | if err != nil { 214 | log.Printf("DispatchTask failed: %v", err) 215 | return 216 | } 217 | 218 | err = core.NotifyComplete(uint64(taskPoolCount)) 219 | if err != nil { 220 | log.Printf("NotifyComplete failed: %v", err) 221 | return 222 | } 223 | } 224 | } 225 | }() 226 | 227 | for cont { 228 | select { 229 | case <-signalChan: 230 | log.Println("receive os signal") 231 | cancel() 232 | cont = false 233 | case <-timer.C: 234 | notifyCount++ 235 | if notifyCount%10 == 0 { 236 | bss, err := bpfModule.GetBssData() 237 | if err != nil { 238 | log.Println("GetBssData failed", "error", err) 239 | } else { 240 | b, err := json.Marshal(bss) 241 | if err != nil { 242 | log.Println("json.Marshal failed", "error", err) 243 | } else { 244 | log.Println("bss data", "data", string(b)) 245 | } 246 | } 247 | } 248 | if bpfModule.Stopped() { 249 | log.Println("bpfModule stopped") 250 | uei, err := bpfModule.GetUeiData() 251 | if err == nil { 252 | log.Println("uei", "kind", uei.Kind, "exitCode", uei.ExitCode, "reason", uei.GetReason(), "message", uei.GetMessage()) 253 | } else { 254 | log.Println("GetUeiData failed", "error", err) 255 | } 256 | cont = false 257 | } 258 | } 259 | } 260 | timer.Stop() 261 | log.Println("scheduler exit") 262 | } 263 | -------------------------------------------------------------------------------- /util/emun.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "os" 7 | "sync" 8 | 9 | "github.com/cilium/ebpf/btf" 10 | ) 11 | 12 | // ScxEnums mirrors the Rust Enums struct, holding values read from BTF enums 13 | // in vmlinux. Missing symbols are left as zero. 
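//
// A minimal usage sketch (assumes the default vmlinux BTF path is readable):
//
//	enums, err := GetScxEnums()
//	if err != nil {
//		log.Fatalf("load SCX enums: %v", err)
//	}
//	enqFlags := enums.SCX_ENQ_WAKEUP | enums.SCX_ENQ_PREEMPT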
14 | type ScxEnums struct { 15 | SCX_OPS_NAME_LEN uint64 16 | SCX_SLICE_DFL uint64 17 | SCX_SLICE_INF uint64 18 | SCX_RQ_ONLINE uint64 19 | SCX_RQ_CAN_STOP_TICK uint64 20 | SCX_RQ_BAL_PENDING uint64 21 | SCX_RQ_BAL_KEEP uint64 22 | SCX_RQ_BYPASSING uint64 23 | SCX_RQ_CLK_VALID uint64 24 | SCX_RQ_IN_WAKEUP uint64 25 | SCX_RQ_IN_BALANCE uint64 26 | SCX_DSQ_FLAG_BUILTIN uint64 27 | SCX_DSQ_FLAG_LOCAL_ON uint64 28 | SCX_DSQ_INVALID uint64 29 | SCX_DSQ_GLOBAL uint64 30 | SCX_DSQ_LOCAL uint64 31 | SCX_DSQ_LOCAL_ON uint64 32 | SCX_DSQ_LOCAL_CPU_MASK uint64 33 | SCX_TASK_QUEUED uint64 34 | SCX_TASK_RESET_RUNNABLE_AT uint64 35 | SCX_TASK_DEQD_FOR_SLEEP uint64 36 | SCX_TASK_STATE_SHIFT uint64 37 | SCX_TASK_STATE_BITS uint64 38 | SCX_TASK_STATE_MASK uint64 39 | SCX_TASK_CURSOR uint64 40 | SCX_TASK_NONE uint64 41 | SCX_TASK_INIT uint64 42 | SCX_TASK_READY uint64 43 | SCX_TASK_ENABLED uint64 44 | SCX_TASK_NR_STATES uint64 45 | SCX_TASK_DSQ_ON_PRIQ uint64 46 | SCX_KICK_IDLE uint64 47 | SCX_KICK_PREEMPT uint64 48 | SCX_KICK_WAIT uint64 49 | SCX_ENQ_WAKEUP uint64 50 | SCX_ENQ_HEAD uint64 51 | SCX_ENQ_PREEMPT uint64 52 | SCX_ENQ_REENQ uint64 53 | SCX_ENQ_LAST uint64 54 | SCX_ENQ_CLEAR_OPSS uint64 55 | SCX_ENQ_DSQ_PRIQ uint64 56 | } 57 | 58 | var ( 59 | loadOnce sync.Once 60 | enumsInst *ScxEnums 61 | loadErr error 62 | ) 63 | 64 | // VmlinuxBTFPathEnv allows overriding the BTF vmlinux path. 65 | const VmlinuxBTFPathEnv = "QUMUN_VMLINUX_BTF" 66 | 67 | // Default vmlinux BTF path. 68 | const defaultVmlinuxBTF = "/sys/kernel/btf/vmlinux" 69 | 70 | // GetScxEnums returns the loaded enumeration values, performing a lazy load on first call. 71 | func GetScxEnums() (*ScxEnums, error) { 72 | loadOnce.Do(func() { 73 | enumsInst, loadErr = loadFromBTF() 74 | }) 75 | return enumsInst, loadErr 76 | } 77 | 78 | // loadFromBTF performs the actual parsing of BTF enums from vmlinux. 79 | func loadFromBTF() (*ScxEnums, error) { 80 | path := os.Getenv(VmlinuxBTFPathEnv) 81 | if path == "" { 82 | path = defaultVmlinuxBTF 83 | } 84 | spec, err := btf.LoadSpec(path) 85 | if err != nil { 86 | return nil, fmt.Errorf("load BTF spec from %s: %w", path, err) 87 | } 88 | 89 | enumCache := map[string]*btf.Enum{} 90 | // Build a lookup map for required enum type names. 
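	// (A single pass over spec.All() below fills enumCache from this set, so
	// the read() helper never has to re-walk the whole BTF spec per symbol.)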
91 | needed := map[string]struct{}{ 92 | "scx_public_consts": {}, 93 | "scx_rq_flags": {}, 94 | "scx_dsq_id_flags": {}, 95 | "scx_ent_flags": {}, 96 | "scx_task_state": {}, 97 | "scx_ent_dsq_flags": {}, 98 | "scx_kick_flags": {}, 99 | "scx_enq_flags": {}, 100 | } 101 | 102 | for t, err := range spec.All() { 103 | if err != nil { 104 | return nil, fmt.Errorf("iterate BTF types: %w", err) 105 | } 106 | if e, ok := t.(*btf.Enum); ok { 107 | if _, wanted := needed[e.Name]; wanted { 108 | enumCache[e.Name] = e 109 | } 110 | } 111 | } 112 | if len(enumCache) == 0 { 113 | return nil, errors.New("no required SCX enum types found in BTF") 114 | } 115 | 116 | read := func(enumType, name string) uint64 { 117 | e := enumCache[enumType] 118 | if e == nil { 119 | return 0 120 | } 121 | for _, v := range e.Values { 122 | if v.Name == name { 123 | return uint64(v.Value) 124 | } 125 | } 126 | return 0 127 | } 128 | 129 | scx := &ScxEnums{ 130 | SCX_OPS_NAME_LEN: read("scx_public_consts", "SCX_OPS_NAME_LEN"), 131 | SCX_SLICE_DFL: read("scx_public_consts", "SCX_SLICE_DFL"), 132 | SCX_SLICE_INF: read("scx_public_consts", "SCX_SLICE_INF"), 133 | SCX_RQ_ONLINE: read("scx_rq_flags", "SCX_RQ_ONLINE"), 134 | SCX_RQ_CAN_STOP_TICK: read("scx_rq_flags", "SCX_RQ_CAN_STOP_TICK"), 135 | SCX_RQ_BAL_PENDING: read("scx_rq_flags", "SCX_RQ_BAL_PENDING"), 136 | SCX_RQ_BAL_KEEP: read("scx_rq_flags", "SCX_RQ_BAL_KEEP"), 137 | SCX_RQ_BYPASSING: read("scx_rq_flags", "SCX_RQ_BYPASSING"), 138 | SCX_RQ_CLK_VALID: read("scx_rq_flags", "SCX_RQ_CLK_VALID"), 139 | SCX_RQ_IN_WAKEUP: read("scx_rq_flags", "SCX_RQ_IN_WAKEUP"), 140 | SCX_RQ_IN_BALANCE: read("scx_rq_flags", "SCX_RQ_IN_BALANCE"), 141 | SCX_DSQ_FLAG_BUILTIN: read("scx_dsq_id_flags", "SCX_DSQ_FLAG_BUILTIN"), 142 | SCX_DSQ_FLAG_LOCAL_ON: read("scx_dsq_id_flags", "SCX_DSQ_FLAG_LOCAL_ON"), 143 | SCX_DSQ_INVALID: read("scx_dsq_id_flags", "SCX_DSQ_INVALID"), 144 | SCX_DSQ_GLOBAL: read("scx_dsq_id_flags", "SCX_DSQ_GLOBAL"), 145 | SCX_DSQ_LOCAL: read("scx_dsq_id_flags", "SCX_DSQ_LOCAL"), 146 | SCX_DSQ_LOCAL_ON: read("scx_dsq_id_flags", "SCX_DSQ_LOCAL_ON"), 147 | SCX_DSQ_LOCAL_CPU_MASK: read("scx_dsq_id_flags", "SCX_DSQ_LOCAL_CPU_MASK"), 148 | SCX_TASK_QUEUED: read("scx_ent_flags", "SCX_TASK_QUEUED"), 149 | SCX_TASK_RESET_RUNNABLE_AT: read("scx_ent_flags", "SCX_TASK_RESET_RUNNABLE_AT"), 150 | SCX_TASK_DEQD_FOR_SLEEP: read("scx_ent_flags", "SCX_TASK_DEQD_FOR_SLEEP"), 151 | SCX_TASK_STATE_SHIFT: read("scx_ent_flags", "SCX_TASK_STATE_SHIFT"), 152 | SCX_TASK_STATE_BITS: read("scx_ent_flags", "SCX_TASK_STATE_BITS"), 153 | SCX_TASK_STATE_MASK: read("scx_ent_flags", "SCX_TASK_STATE_MASK"), 154 | SCX_TASK_CURSOR: read("scx_ent_flags", "SCX_TASK_CURSOR"), 155 | SCX_TASK_NONE: read("scx_task_state", "SCX_TASK_NONE"), 156 | SCX_TASK_INIT: read("scx_task_state", "SCX_TASK_INIT"), 157 | SCX_TASK_READY: read("scx_task_state", "SCX_TASK_READY"), 158 | SCX_TASK_ENABLED: read("scx_task_state", "SCX_TASK_ENABLED"), 159 | SCX_TASK_NR_STATES: read("scx_task_state", "SCX_TASK_NR_STATES"), 160 | SCX_TASK_DSQ_ON_PRIQ: read("scx_ent_dsq_flags", "SCX_TASK_DSQ_ON_PRIQ"), 161 | SCX_KICK_IDLE: read("scx_kick_flags", "SCX_KICK_IDLE"), 162 | SCX_KICK_PREEMPT: read("scx_kick_flags", "SCX_KICK_PREEMPT"), 163 | SCX_KICK_WAIT: read("scx_kick_flags", "SCX_KICK_WAIT"), 164 | SCX_ENQ_WAKEUP: read("scx_enq_flags", "SCX_ENQ_WAKEUP"), 165 | SCX_ENQ_HEAD: read("scx_enq_flags", "SCX_ENQ_HEAD"), 166 | SCX_ENQ_PREEMPT: read("scx_enq_flags", "SCX_ENQ_PREEMPT"), 167 | SCX_ENQ_REENQ: read("scx_enq_flags", "SCX_ENQ_REENQ"), 168 | 
SCX_ENQ_LAST: read("scx_enq_flags", "SCX_ENQ_LAST"), 169 | SCX_ENQ_CLEAR_OPSS: read("scx_enq_flags", "SCX_ENQ_CLEAR_OPSS"), 170 | SCX_ENQ_DSQ_PRIQ: read("scx_enq_flags", "SCX_ENQ_DSQ_PRIQ"), 171 | } 172 | 173 | return scx, nil 174 | } 175 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 
58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. -------------------------------------------------------------------------------- /main.bpf.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Ian Chen */ 2 | /* The scx_goland is based on scx_rustland_core */ 3 | /* Copyright (c) Andrea Righi */ 4 | /* 5 | * scx_rustland_core: BPF backend for schedulers running in user-space. 6 | * 7 | * This BPF backend implements the low level sched-ext functionalities for a 8 | * user-space counterpart, that implements the actual scheduling policy. 9 | * 10 | * The BPF part collects total cputime and weight from the tasks that need to 11 | * run, then it sends all details to the user-space scheduler that decides the 12 | * best order of execution of the tasks (based on the collected metrics). 
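 *
 * A sketch of the message flow (the @queued and @dispatched maps are
 * declared further below):
 *
 *   .enqueue() -> @queued ringbuf -> user-space scheduler
 *   user-space scheduler -> @dispatched user ringbuf -> .dispatch()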
13 |  *
14 |  * The user-space scheduler then returns to the BPF component the list of tasks
15 |  * to be dispatched in the proper order.
16 |  *
17 |  * Messages between the BPF component and the user-space scheduler are passed
18 |  * using BPF_MAP_TYPE_RINGBUF / BPF_MAP_TYPE_USER_RINGBUF maps: @queued for
19 |  * the messages sent by the BPF dispatcher to the user-space scheduler and
20 |  * @dispatched for the messages sent by the user-space scheduler to the BPF
21 |  * dispatcher.
22 |  *
23 |  * The BPF dispatcher is completely agnostic of the particular scheduling
24 |  * policy implemented in user-space. For this reason developers that are
25 |  * willing to use this scheduler to experiment with scheduling policies should
26 |  * be able to simply modify the user-space component, without having to deal
27 |  * with any internal kernel / BPF details.
28 |  *
29 |  * This software may be used and distributed according to the terms of the
30 |  * GNU General Public License version 2.
31 |  */
32 | #ifdef LSP
33 | #define __bpf__
34 | #include "../../../../scheds/include/scx/common.bpf.h"
35 | #else
36 | #include <scx/common.bpf.h>
37 | #endif
38 | 
39 | #include 
40 | #include "intf.h"
41 | #include 
42 | 
43 | /* Compatibility fallbacks for kernel flag macros that may not be defined
44 |  * in older build environments or trimmed header sets used during BPF
45 |  * compilation. Define them as 0 if missing so bitwise checks become no-ops.
46 |  */
47 | #ifndef PF_KSWAPD
48 | #define PF_KSWAPD 0
49 | #endif
50 | #ifndef PF_KCOMPACTD
51 | #define PF_KCOMPACTD 0
52 | #endif
53 | 
54 | char _license[] SEC("license") = "GPL";
55 | 
56 | UEI_DEFINE(uei);
57 | 
58 | /*
59 |  * Introduce a custom DSQ shared across all the CPUs, where we can dispatch
60 |  * tasks that will be executed on the first CPU available.
61 |  *
62 |  * Per-CPU DSQs are also provided, to allow the scheduler to run a task on a
63 |  * specific CPU (see dsq_init()).
64 |  */
65 | #define SHARED_DSQ MAX_CPUS
66 | 
67 | /*
68 |  * The user-space scheduler itself is dispatched using a separate DSQ that is
69 |  * consumed after all other DSQs.
70 |  *
71 |  * This makes the scheduler work in bursts: tasks are queued, then the user-space
72 |  * scheduler runs and dispatches them. Once all these tasks exhaust their
73 |  * time slices, the scheduler is invoked again, repeating the cycle.
74 |  */
75 | #define SCHED_DSQ (MAX_CPUS + 1)
76 | 
77 | /*
78 |  * Safety cap on the number of usersched threads dispatched per invocation.
79 |  */
80 | #define MAX_USERSCHED_DISPATCH 64
81 | 
82 | /*
83 |  * Scheduler attributes and statistics.
84 |  */
85 | const volatile u32 usersched_pid; /* User-space scheduler PID */
86 | const volatile u32 khugepaged_pid; /* khugepaged PID */
87 | u64 usersched_last_run_at; /* Timestamp of the last user-space scheduler execution */
88 | static u64 nr_cpu_ids; /* Maximum possible CPU number */
89 | 
90 | /*
91 |  * Number of tasks that are queued for scheduling.
92 |  *
93 |  * This number is incremented by the BPF component when a task is queued to the
94 |  * user-space scheduler and it must be decremented by the user-space scheduler
95 |  * when a task is consumed.
96 |  */
97 | volatile u64 nr_queued;
98 | 
99 | /*
100 |  * Number of tasks that are waiting for scheduling.
101 |  *
102 |  * This number must be updated by the user-space scheduler to keep track of
103 |  * whether there is still some scheduling work to do.
104 |  */
105 | volatile u64 nr_scheduled;
106 | 
107 | /*
108 |  * Number of currently running tasks.
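 *
 * (These counters live in the object's .bss/.data sections, which the Go
 * side captures as the "main_bpf.bss" / "main_bpf.data" maps; e.g. main.go
 * sizes time slices from core.GetNrQueued() + core.GetNrScheduled() + 1.)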
109 |  */
110 | volatile u64 nr_running, nr_online_cpus;
111 | 
112 | /* Dispatch statistics */
113 | volatile u64 nr_user_dispatches, nr_kernel_dispatches,
114 | 	     nr_cancel_dispatches, nr_bounce_dispatches;
115 | 
116 | /* Failure statistics */
117 | volatile u64 nr_failed_dispatches, nr_sched_congested;
118 | 
119 | /* Report additional debugging information */
120 | const volatile bool debug;
121 | 
122 | const volatile bool early_processing;
123 | 
124 | const volatile u64 default_slice = 20000000ULL;
125 | 
126 | /* Rely on the in-kernel idle CPU selection policy */
127 | const volatile bool builtin_idle;
128 | 
129 | /* Allow bpf_printk() output only when @debug is set */
130 | #define dbg_msg(_fmt, ...) do {				\
131 | 	if (debug)						\
132 | 		bpf_printk(_fmt, ##__VA_ARGS__);		\
133 | } while(0)
134 | 
135 | /*
136 |  * Set when the CPUs in the system have SMT enabled.
137 |  */
138 | const volatile bool smt_enabled = true;
139 | 
140 | /*
141 |  * Allocate/re-allocate a new cpumask.
142 |  */
143 | static int calloc_cpumask(struct bpf_cpumask **p_cpumask)
144 | {
145 | 	struct bpf_cpumask *cpumask;
146 | 
147 | 	cpumask = bpf_cpumask_create();
148 | 	if (!cpumask)
149 | 		return -ENOMEM;
150 | 
151 | 	cpumask = bpf_kptr_xchg(p_cpumask, cpumask);
152 | 	if (cpumask)
153 | 		bpf_cpumask_release(cpumask);
154 | 
155 | 	return 0;
156 | }
157 | 
158 | /*
159 |  * Maximum number of tasks queued between the kernel and user space at any
160 |  * given time.
161 |  *
162 |  * The @queued and @dispatched lists are used in a producer/consumer fashion
163 |  * between the BPF part and the user-space part.
164 |  */
165 | #define MAX_ENQUEUED_TASKS 4096
166 | 
167 | /*
168 |  * Maximum number of slots reserved for tasks dispatched via the shared queue.
169 |  */
170 | #define MAX_DISPATCH_SLOT (MAX_ENQUEUED_TASKS / 8)
171 | 
172 | /*
173 |  * The map containing tasks that are queued to user space from the kernel.
174 |  *
175 |  * This map is drained by the user-space scheduler.
176 |  */
177 | struct {
178 | 	__uint(type, BPF_MAP_TYPE_RINGBUF);
179 | 	__uint(max_entries, MAX_ENQUEUED_TASKS *
180 | 			    sizeof(struct queued_task_ctx));
181 | } queued SEC(".maps");
182 | 
183 | /*
184 |  * The user ring buffer containing tasks that are dispatched from user space
185 |  * to the kernel.
186 |  *
187 |  * Drained by the kernel in .dispatch().
188 |  */
189 | struct {
190 | 	__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
191 | 	__uint(max_entries, MAX_ENQUEUED_TASKS *
192 | 			    sizeof(struct dispatched_task_ctx));
193 | } dispatched SEC(".maps");
194 | 
195 | /*
196 |  * Map to track PIDs with vtime==0 (priority tasks).
197 |  *
198 |  * This hashmap is keyed by PID and stores the requested time slice for
199 |  * tasks that have vtime set to 0, indicating they are high-priority tasks.
200 |  */
201 | struct {
202 | 	__uint(type, BPF_MAP_TYPE_HASH);
203 | 	__type(key, u32); /* PID */
204 | 	__type(value, u64); /* time slice */
205 | 	__uint(max_entries, MAX_ENQUEUED_TASKS);
206 | } priority_tasks SEC(".maps");
207 | 
208 | struct {
209 | 	__uint(type, BPF_MAP_TYPE_HASH);
210 | 	__type(key, s32); /* CPU */
211 | 	__type(value, u32); /* PID */
212 | 	__uint(max_entries, MAX_CPUS);
213 | } running_task SEC(".maps");
214 | 
215 | /*
216 |  * Per-CPU context.
217 |  */
218 | struct cpu_ctx {
219 | 	struct bpf_cpumask __kptr *l2_cpumask;
220 | 	struct bpf_cpumask __kptr *l3_cpumask;
221 | };
222 | 
223 | struct {
224 | 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
225 | 	__type(key, u32);
226 | 	__type(value, struct cpu_ctx);
227 | 	__uint(max_entries, 1);
228 | } cpu_ctx_stor SEC(".maps");
229 | 
230 | /*
231 |  * Return a CPU context.
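 *
 * Usage sketch; callers must tolerate a NULL result:
 *
 *	struct cpu_ctx *cctx = try_lookup_cpu_ctx(cpu);
 *	if (!cctx)
 *		return; /* the lookup can fail for an out-of-range CPU */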
232 | */ 233 | struct cpu_ctx *try_lookup_cpu_ctx(s32 cpu) 234 | { 235 | const u32 idx = 0; 236 | return bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &idx, cpu); 237 | } 238 | 239 | /* 240 | * Per-task local storage. 241 | * 242 | * This contain all the per-task information used internally by the BPF code. 243 | */ 244 | struct task_ctx { 245 | /* 246 | * Temporary cpumask for calculating scheduling domains. 247 | */ 248 | struct bpf_cpumask __kptr *l2_cpumask; 249 | struct bpf_cpumask __kptr *l3_cpumask; 250 | 251 | /* 252 | * Timestamp since last time the task ran on a CPU. 253 | */ 254 | u64 start_ts; 255 | 256 | /* 257 | * Timestamp since last time the task released a CPU. 258 | */ 259 | u64 stop_ts; 260 | 261 | /* 262 | * Execution time (in nanoseconds) since the last sleep event. 263 | */ 264 | u64 exec_runtime; 265 | }; 266 | 267 | /* Map that contains task-local storage. */ 268 | struct { 269 | __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 270 | __uint(map_flags, BPF_F_NO_PREALLOC); 271 | __type(key, int); 272 | __type(value, struct task_ctx); 273 | } task_ctx_stor SEC(".maps"); 274 | 275 | /* 276 | * Return a local task context from a generic task or NULL if the context 277 | * doesn't exist. 278 | */ 279 | struct task_ctx *try_lookup_task_ctx(const struct task_struct *p) 280 | { 281 | struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, 282 | (struct task_struct *)p, 0, 0); 283 | if (!tctx) 284 | dbg_msg("warning: failed to get task context for pid=%d (%s)", 285 | p->pid, p->comm); 286 | return tctx; 287 | } 288 | 289 | /* 290 | * Heartbeat timer used to periodically trigger the check to run the user-space 291 | * scheduler. 292 | * 293 | * Without this timer we may starve the scheduler if the system is completely 294 | * idle and hit the watchdog that would auto-kill this scheduler. 295 | */ 296 | struct usersched_timer { 297 | struct bpf_timer timer; 298 | }; 299 | 300 | struct { 301 | __uint(type, BPF_MAP_TYPE_ARRAY); 302 | __uint(max_entries, 1); 303 | __type(key, u32); 304 | __type(value, struct usersched_timer); 305 | } usersched_timer SEC(".maps"); 306 | 307 | /* 308 | * Time period of the scheduler heartbeat, used to periodically kick the 309 | * user-space scheduler and check if there is any pending activity. 310 | */ 311 | #define USERSCHED_TIMER_NS (NSEC_PER_SEC / 10) 312 | 313 | /* 314 | * Return true if the target task @p is the user-space scheduler. 315 | */ 316 | static inline bool is_usersched_task(const struct task_struct *p) 317 | { 318 | return p->pid == usersched_pid; 319 | } 320 | 321 | /* 322 | * Return true if the target task @p belongs to the user-space scheduler. 323 | */ 324 | static inline bool is_belong_usersched_task(const struct task_struct *p) 325 | { 326 | return p->tgid == usersched_pid; 327 | } 328 | 329 | /* 330 | * Return true if the target task @p is a kernel thread. 331 | */ 332 | static inline bool is_kthread(const struct task_struct *p) 333 | { 334 | return p->flags & PF_KTHREAD; 335 | } 336 | 337 | /* 338 | * Return true if the target task @p is a kworker thread. 339 | */ 340 | static inline bool is_kworker(const struct task_struct *p) 341 | { 342 | return p->flags & PF_WQ_WORKER; 343 | } 344 | 345 | /* 346 | * Return true if the target task @p is kswapd. 347 | */ 348 | static inline bool is_kswapd(const struct task_struct *p) 349 | { 350 | return p->flags & (PF_KSWAPD | PF_KCOMPACTD); 351 | } 352 | 353 | /* 354 | * Return true if the target task @p is khugepaged, false otherwise. 
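 *
 * (khugepaged_pid is const volatile, i.e. filled in from user space before
 * the BPF object is loaded; leaving it zero simply disables this check.)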
355 | */ 356 | static inline bool is_khugepaged(const struct task_struct *p) 357 | { 358 | return khugepaged_pid && p->pid == khugepaged_pid; 359 | } 360 | 361 | /* 362 | * Return true if @p still wants to run, false otherwise. 363 | */ 364 | static bool is_queued(const struct task_struct *p) 365 | { 366 | return p->scx.flags & SCX_TASK_QUEUED; 367 | } 368 | 369 | /* 370 | * Flag used to wake-up the user-space scheduler. 371 | */ 372 | static volatile u32 usersched_needed; 373 | 374 | /* 375 | * Set user-space scheduler wake-up flag (equivalent to an atomic release 376 | * operation). 377 | */ 378 | static void set_usersched_needed(void) 379 | { 380 | __sync_fetch_and_or(&usersched_needed, 1); 381 | } 382 | 383 | /* 384 | * Check and clear user-space scheduler wake-up flag (equivalent to an atomic 385 | * acquire operation). 386 | */ 387 | static bool test_and_clear_usersched_needed(void) 388 | { 389 | return __sync_fetch_and_and(&usersched_needed, 0) == 1; 390 | } 391 | 392 | /* 393 | * Return true if there's any pending activity to do for the scheduler, false 394 | * otherwise. 395 | * 396 | * NOTE: a task is sent to the user-space scheduler using the "queued" 397 | * ringbuffer, then the scheduler drains the queued tasks and adds them to 398 | * its internal data structures / state; at this point tasks become 399 | * "scheduled" and the user-space scheduler will take care of updating 400 | * nr_scheduled accordingly; lastly tasks will be dispatched and the 401 | * user-space scheduler will update nr_scheduled again. 402 | * 403 | * Checking nr_scheduled and the available data in the ringbuffer allows to 404 | * determine if there is still some pending work to do for the scheduler: 405 | * new tasks have been queued since last check, or there are still tasks 406 | * "queued" or "scheduled" since the previous user-space scheduler run. 407 | * 408 | * If there's no pending action, it is pointless to wake-up the scheduler 409 | * (even if a CPU becomes idle), because there is nothing to do. 410 | * 411 | * Also keep in mind that we don't need any protection here since this code 412 | * doesn't run concurrently with the user-space scheduler (that is single 413 | * threaded), therefore this check is also safe from a concurrency perspective. 414 | */ 415 | static bool usersched_has_pending_tasks(void) 416 | { 417 | if (usersched_needed) 418 | return true; 419 | 420 | if (nr_queued || nr_scheduled) 421 | return true; 422 | 423 | return bpf_ringbuf_query(&queued, BPF_RB_AVAIL_DATA) > 0; 424 | } 425 | 426 | /* 427 | * Return the DSQ ID associated to a CPU, or SHARED_DSQ if the CPU is not 428 | * valid. 429 | */ 430 | static u64 cpu_to_dsq(s32 cpu) 431 | { 432 | if (cpu < 0 || cpu >= MAX_CPUS) { 433 | scx_bpf_error("Invalid cpu: %d", cpu); 434 | return SHARED_DSQ; 435 | } 436 | return (u64)cpu; 437 | } 438 | 439 | /* 440 | * Helper function to update priority tasks map based on vtime. 441 | * If vtime == 0, add PID to map. If vtime != 0, remove PID from map. 442 | */ 443 | static void update_priority_task_map(u32 pid, u64 vtime, u64 slice) 444 | { 445 | if (vtime == 0) { 446 | bpf_map_update_elem(&priority_tasks, &pid, &slice, BPF_ANY); 447 | } else { 448 | bpf_map_delete_elem(&priority_tasks, &pid); 449 | } 450 | } 451 | 452 | /* 453 | * Return true if @this_cpu and @that_cpu are in the same LLC, false 454 | * otherwise. 
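 *
 * (Both CPUs are compared via cpu_llc_id(); pick_idle_cpu() relies on this
 * to avoid migrating a wakee away from a fully idle SMT core in its LLC.)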
455 | */ 456 | static inline bool cpus_share_cache(s32 this_cpu, s32 that_cpu) 457 | { 458 | if (this_cpu == that_cpu) 459 | return true; 460 | 461 | return cpu_llc_id(this_cpu) == cpu_llc_id(that_cpu); 462 | } 463 | 464 | /* 465 | * Return true if @this_cpu is faster than @that_cpu, false otherwise. 466 | */ 467 | static inline bool is_cpu_faster(s32 this_cpu, s32 that_cpu) 468 | { 469 | if (this_cpu == that_cpu) 470 | return false; 471 | 472 | return cpu_priority(this_cpu) > cpu_priority(that_cpu); 473 | } 474 | 475 | /* 476 | * Return true if @cpu is a fully-idle SMT core, false otherwise. 477 | */ 478 | static inline bool is_smt_idle(s32 cpu) 479 | { 480 | const struct cpumask *idle_smtmask; 481 | bool is_idle; 482 | 483 | if (!smt_enabled) 484 | return true; 485 | 486 | idle_smtmask = scx_bpf_get_idle_smtmask(); 487 | is_idle = bpf_cpumask_test_cpu(cpu, idle_smtmask); 488 | scx_bpf_put_cpumask(idle_smtmask); 489 | 490 | return is_idle; 491 | } 492 | 493 | /* 494 | * Return true on a wake-up event, false otherwise. 495 | */ 496 | static inline bool is_wakeup(u64 wake_flags) 497 | { 498 | return wake_flags & SCX_WAKE_TTWU; 499 | } 500 | 501 | /* 502 | * Find an idle CPU in the system for the task. 503 | * 504 | * NOTE: the idle CPU selection doesn't need to be formally perfect, it is 505 | * totally fine to accept racy conditions and potentially make mistakes, by 506 | * picking CPUs that are not idle or even offline, the logic has been designed 507 | * to handle these mistakes in favor of a more efficient response and a reduced 508 | * scheduling overhead. 509 | */ 510 | static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) 511 | { 512 | s32 cpu, this_cpu = bpf_get_smp_processor_id(); 513 | bool is_this_cpu_allowed = bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr); 514 | 515 | /* 516 | * For tasks that can run only on a single CPU, we can simply verify if 517 | * their only allowed CPU is still idle. 518 | */ 519 | if (p->nr_cpus_allowed == 1) { 520 | if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) 521 | return prev_cpu; 522 | 523 | return -EBUSY; 524 | } 525 | 526 | /* 527 | * On wakeup if the waker's CPU is faster than the wakee's CPU, try 528 | * to move the wakee closer to the waker. 529 | * 530 | * In presence of hybrid cores this helps to naturally migrate 531 | * tasks over to the faster cores. 532 | */ 533 | if (is_wakeup(wake_flags) && 534 | is_cpu_faster(this_cpu, prev_cpu) && is_this_cpu_allowed) { 535 | /* 536 | * If both the waker's CPU and the wakee's CPU are in the 537 | * same LLC and the wakee's CPU is a fully idle SMT core, 538 | * don't migrate. 539 | */ 540 | if (cpus_share_cache(this_cpu, prev_cpu) && 541 | is_smt_idle(prev_cpu) && 542 | scx_bpf_test_and_clear_cpu_idle(prev_cpu)) 543 | return prev_cpu; 544 | 545 | prev_cpu = this_cpu; 546 | } 547 | 548 | /* 549 | * Fallback to the old API if the kernel doesn't support 550 | * scx_bpf_select_cpu_and(). 551 | * 552 | * This is required to support kernels <= 6.16. 553 | */ 554 | if (!bpf_ksym_exists(scx_bpf_select_cpu_and)) { 555 | bool is_idle = false; 556 | 557 | if (!wake_flags) 558 | return -EBUSY; 559 | 560 | cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); 561 | 562 | return is_idle ? cpu : -EBUSY; 563 | } 564 | 565 | /* 566 | * Pick any idle CPU usable by the task. 567 | */ 568 | return scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, p->cpus_ptr, 0); 569 | } 570 | 571 | /* 572 | * Wake-up a target @cpu for the dispatched task @p. 
If @cpu can't be used 573 | * wakeup another valid CPU. 574 | */ 575 | static void kick_task_cpu(const struct task_struct *p, s32 cpu) 576 | { 577 | if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { 578 | /* 579 | * Kick the target CPU anyway, since it may be locked and 580 | * needs to go back to idle to reset its state. 581 | */ 582 | scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); 583 | 584 | /* 585 | * Pick any other idle CPU that the task can use. 586 | */ 587 | cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); 588 | if (cpu < 0) 589 | return; 590 | } 591 | scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); 592 | } 593 | 594 | /* 595 | * Dispatch a task to a target per-CPU DSQ, waking up the corresponding CPU, if 596 | * needed. 597 | */ 598 | static void dispatch_task(const struct dispatched_task_ctx *task) 599 | { 600 | struct task_struct *p; 601 | s32 prev_cpu; 602 | 603 | /* Ignore entry if the task doesn't exist anymore */ 604 | p = bpf_task_from_pid(task->pid); 605 | if (!p) 606 | return; 607 | prev_cpu = scx_bpf_task_cpu(p); 608 | 609 | /* 610 | * Dispatch task to the shared DSQ if the user-space scheduler 611 | * didn't select any specific target CPU. 612 | */ 613 | if (task->cpu == RL_CPU_ANY) { 614 | scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, 615 | task->slice_ns, task->vtime, task->flags); 616 | kick_task_cpu(p, prev_cpu); 617 | 618 | goto out_release; 619 | } 620 | 621 | /* 622 | * If the target CPU selected by the user-space scheduler is not 623 | * valid, dispatch it to the SHARED_DSQ, independently on what the 624 | * user-space scheduler has decided. 625 | */ 626 | if (!bpf_cpumask_test_cpu(task->cpu, p->cpus_ptr)) { 627 | scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, 628 | task->slice_ns, task->vtime, task->flags); 629 | __sync_fetch_and_add(&nr_bounce_dispatches, 1); 630 | kick_task_cpu(p, prev_cpu); 631 | 632 | goto out_release; 633 | } 634 | 635 | /* 636 | * Dispatch a task to a target CPU selected by the user-space 637 | * scheduler. 638 | */ 639 | if (task->vtime) { 640 | scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(task->cpu), 641 | task->slice_ns, task->vtime, task->flags); 642 | __sync_fetch_and_add(&nr_user_dispatches, 1); 643 | } else { 644 | s32 cur_pid; 645 | u64* elem; 646 | cur_pid = task->pid; 647 | elem = bpf_map_lookup_elem(&priority_tasks, &cur_pid); 648 | if (!elem){ 649 | scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(task->cpu), 650 | task->slice_ns, task->vtime, task->flags); 651 | __sync_fetch_and_add(&nr_user_dispatches, 1); 652 | } 653 | } 654 | update_priority_task_map(task->pid, task->vtime, task->slice_ns); 655 | 656 | /* 657 | * If the cpumask is not valid anymore, ignore the dispatch event. 658 | * 659 | * This can happen if the task has changed its affinity and the 660 | * target CPU has become invalid. In this case cancelling the 661 | * dispatch allows to prevent potential stalls in the scheduler, 662 | * since the task will be re-enqueued by the core sched-ext code, 663 | * potentially selecting a different CPU. 664 | */ 665 | if (!bpf_cpumask_test_cpu(task->cpu, p->cpus_ptr)) { 666 | scx_bpf_dispatch_cancel(); 667 | __sync_fetch_and_add(&nr_cancel_dispatches, 1); 668 | 669 | goto out_release; 670 | } 671 | 672 | scx_bpf_kick_cpu(task->cpu, SCX_KICK_IDLE); 673 | 674 | out_release: 675 | bpf_task_release(p); 676 | } 677 | 678 | /* 679 | * Return true if the waker commits to release the CPU after waking up @p, 680 | * false otherwise. 
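 *
 * (A waker flagged PF_EXITING is excluded: an exiting task will not
 * reliably hand its CPU over to the wakee, so the sync hint is ignored.)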
681 |  */
682 | static bool is_wake_sync(u64 wake_flags)
683 | {
684 | 	const struct task_struct *current = (void *)bpf_get_current_task_btf();
685 | 
686 | 	return (wake_flags & SCX_WAKE_SYNC) && !(current->flags & PF_EXITING);
687 | }
688 | 
689 | /*
690 |  * Return true if it's safe to dispatch directly on @cpu, false otherwise.
691 |  */
692 | static bool can_direct_dispatch(s32 cpu)
693 | {
694 | 	return !scx_bpf_dsq_nr_queued(SHARED_DSQ) &&
695 | 		!scx_bpf_dsq_nr_queued(cpu_to_dsq(cpu));
696 | }
697 | 
698 | s32 BPF_STRUCT_OPS(goland_select_cpu, struct task_struct *p, s32 prev_cpu,
699 | 		   u64 wake_flags)
700 | {
701 | 	s32 cpu, this_cpu = bpf_get_smp_processor_id();
702 | 	bool is_this_cpu_allowed = bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr);
703 | 
704 | 	/*
705 | 	 * Make sure @prev_cpu is usable, otherwise try to move close to
706 | 	 * the waker's CPU. If the waker's CPU is also not usable, then
707 | 	 * pick the first usable CPU.
708 | 	 */
709 | 	if (!bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr))
710 | 		prev_cpu = is_this_cpu_allowed ? this_cpu : bpf_cpumask_first(p->cpus_ptr);
711 | 
712 | 	/*
713 | 	 * Scheduler is dispatched directly in .dispatch() when needed, so
714 | 	 * we can skip it here.
715 | 	 */
716 | 	if (is_belong_usersched_task(p))
717 | 		return prev_cpu;
718 | 
719 | 	/*
720 | 	 * If the built-in idle CPU policy is not enabled, completely delegate
721 | 	 * the idle selection policy to user-space and keep re-using the
722 | 	 * same CPU here.
723 | 	 */
724 | 	if (!builtin_idle)
725 | 		return prev_cpu;
726 | 
727 | 	/*
728 | 	 * Pick the idle CPU closest to @prev_cpu usable by the task.
729 | 	 */
730 | 	cpu = pick_idle_cpu(p, prev_cpu, wake_flags);
731 | 	if (cpu >= 0) {
732 | 		if (can_direct_dispatch(cpu)) {
733 | 			scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(cpu),
734 | 					default_slice, p->scx.dsq_vtime, 0);
735 | 			__sync_fetch_and_add(&nr_kernel_dispatches, 1);
736 | 		}
737 | 		return cpu;
738 | 	}
739 | 
740 | 	/*
741 | 	 * If we couldn't find an idle CPU, fall back to the previously
742 | 	 * used (or waker's) CPU computed above.
743 | 	 */
744 | 	return prev_cpu;
745 | }
746 | 
747 | SEC("syscall")
748 | int do_preempt(struct preempt_cpu_arg *input)
749 | {
750 | 	scx_bpf_kick_cpu(input->cpu_id, SCX_KICK_PREEMPT);
751 | 	return 0;
752 | }
753 | 
754 | /*
755 |  * Select and wake up an idle CPU for a specific task from the user-space
756 |  * scheduler.
757 |  */
758 | SEC("syscall")
759 | int rs_select_cpu(struct task_cpu_arg *input)
760 | {
761 | 	struct task_struct *p;
762 | 	int cpu = input->cpu;
763 | 
764 | 	p = bpf_task_from_pid(input->pid);
765 | 	if (!p)
766 | 		return -EINVAL;
767 | 
768 | 	/*
769 | 	 * If the target CPU is the current one, treat it as idle when no
770 | 	 * other tasks are queued.
771 | 	 *
772 | 	 * Since this function is invoked by the user-space scheduler,
773 | 	 * which will release the CPU shortly, there is no need to migrate
774 | 	 * the task elsewhere.
775 | 	 */
776 | 	if (cpu == bpf_get_smp_processor_id()) {
777 | 		u64 nr_tasks = nr_running + nr_queued + nr_scheduled + 1;
778 | 
779 | 		if (nr_tasks < nr_online_cpus && !scx_bpf_dsq_nr_queued(cpu))
780 | 			goto out_release;
781 | 	}
782 | 
783 | 	bpf_rcu_read_lock();
784 | 	/*
785 | 	 * Kernels that don't provide scx_bpf_select_cpu_and() allow the
786 | 	 * built-in idle CPU selection policy to be used only from
787 | 	 * ops.select_cpu() and ops.enqueue(); return any idle CPU usable
788 | 	 * by the task in this case.
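 *
 * (bpf_ksym_exists() is resolved against the running kernel at load time,
 * so the verifier prunes whichever branch does not apply.)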
789 | */ 790 | if (!bpf_ksym_exists(scx_bpf_select_cpu_and)) { 791 | if (!scx_bpf_test_and_clear_cpu_idle(cpu)) 792 | cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); 793 | } else { 794 | /* 795 | * Set SCX_WAKE_TTWU, pretending to be a wakeup, to prioritize 796 | * faster CPU selection (we probably want to add an option to allow 797 | * the user-space scheduler to use this logic or not). 798 | */ 799 | cpu = pick_idle_cpu(p, cpu, SCX_WAKE_TTWU); 800 | } 801 | bpf_rcu_read_unlock(); 802 | 803 | out_release: 804 | bpf_task_release(p); 805 | 806 | return cpu; 807 | } 808 | 809 | /* 810 | * Fill @task with all the information that need to be sent to the user-space 811 | * scheduler. 812 | */ 813 | static void get_task_info(struct queued_task_ctx *task, 814 | const struct task_struct *p, s32 prev_cpu, u64 enq_flags) 815 | { 816 | struct task_ctx *tctx = try_lookup_task_ctx(p); 817 | 818 | task->pid = p->pid; 819 | task->cpu = prev_cpu; 820 | task->nr_cpus_allowed = p->nr_cpus_allowed; 821 | task->flags = enq_flags; 822 | task->start_ts = tctx ? tctx->start_ts : 0; 823 | task->stop_ts = tctx ? tctx->stop_ts : 0; 824 | task->exec_runtime = tctx ? tctx->exec_runtime : 0; 825 | task->weight = p->scx.weight; 826 | task->vtime = p->scx.dsq_vtime; 827 | task->tgid = p->tgid; 828 | } 829 | 830 | /* 831 | * User-space scheduler is congested: log that and increment congested counter. 832 | */ 833 | static void sched_congested(struct task_struct *p) 834 | { 835 | dbg_msg("congested: pid=%d (%s)", p->pid, p->comm); 836 | __sync_fetch_and_add(&nr_sched_congested, 1); 837 | } 838 | 839 | /* 840 | * Return true if a task has been enqueued as a remote wakeup, false 841 | * otherwise. 842 | */ 843 | static bool is_queued_wakeup(const struct task_struct *p, u64 enq_flags) 844 | { 845 | return !__COMPAT_is_enq_cpu_selected(enq_flags) && !scx_bpf_task_running(p); 846 | } 847 | 848 | /* 849 | * Queue a task to the user-space scheduler. 850 | */ 851 | static void queue_task_to_userspace(struct task_struct *p, s32 prev_cpu, u64 enq_flags) 852 | { 853 | struct queued_task_ctx *task; 854 | 855 | /* 856 | * Allocate a new entry in the ring buffer. 857 | * 858 | * If ring buffer is full, the user-space scheduler is congested, 859 | * so dispatch the task directly using the shared DSQ (the task 860 | * will be consumed by the first CPU available). 861 | */ 862 | task = bpf_ringbuf_reserve(&queued, sizeof(*task), 0); 863 | if (!task) { 864 | sched_congested(p); 865 | scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, 866 | SCX_SLICE_DFL, p->scx.dsq_vtime, enq_flags); 867 | __sync_fetch_and_add(&nr_kernel_dispatches, 1); 868 | return; 869 | } 870 | 871 | /* 872 | * Collect task information and store them in the ring buffer that 873 | * will be consumed by the user-space scheduler. 874 | */ 875 | dbg_msg("enqueue: pid=%d (%s)", p->pid, p->comm); 876 | get_task_info(task, p, prev_cpu, enq_flags); 877 | bpf_ringbuf_submit(task, 0); 878 | __sync_fetch_and_add(&nr_queued, 1); 879 | } 880 | 881 | /* 882 | * Task @p becomes ready to run. We can dispatch the task directly here if the 883 | * user-space scheduler is not required, or enqueue it to be processed by the 884 | * scheduler. 
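 *
 * Branch order in the body below (a summary of existing behavior, not new
 * logic):
 *   1. the user-space scheduler itself     -> SCHED_DSQ, or an idle CPU directly
 *   2. pinned kthreads, kswapd, khugepaged -> their per-CPU DSQ
 *   3. PIDs found in @priority_tasks       -> SCX_DSQ_LOCAL_ON, possibly preempting
 *   4. everything else                     -> @queued ringbuf to user space, unless
 *      an idle CPU allows a direct dispatch on a queued wakeup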
880 |
881 | /*
882 |  * Task @p becomes ready to run. We can dispatch the task directly here if the
883 |  * user-space scheduler is not required, or enqueue it to be processed by the
884 |  * scheduler.
885 |  */
886 | void BPF_STRUCT_OPS(goland_enqueue, struct task_struct *p, u64 enq_flags)
887 | {
888 | 	s32 prev_cpu = scx_bpf_task_cpu(p), cpu;
889 | 	bool is_wakeup = is_queued_wakeup(p, enq_flags);
890 |
891 | 	/*
892 | 	 * Insert the user-space scheduler into its dedicated DSQ; it will be
893 | 	 * consumed from ops.dispatch() only when there is a pending
894 | 	 * scheduling action to do.
895 | 	 */
896 | 	if (is_belong_usersched_task(p)) {
897 | 		if (usersched_has_pending_tasks()) {
898 | 			/*
899 | 			 * Try to find an idle CPU and dispatch directly to reduce latency.
900 | 			 * This avoids the overhead of going through SCHED_DSQ.
901 | 			 */
902 | 			cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
903 | 			if (cpu >= 0) {
904 | 				scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu,
905 | 						   default_slice, SCX_ENQ_LAST);
906 | 				scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
907 | 				return;
908 | 			}
909 | 		}
910 | 		scx_bpf_dsq_insert(p, SCHED_DSQ, default_slice, SCX_ENQ_LAST);
911 | 		return;
912 | 	}
913 |
914 | 	/*
915 | 	 * Always dispatch per-CPU kthreads directly on their target CPU.
916 | 	 *
917 | 	 * This allows prioritizing critical kernel threads that may
918 | 	 * potentially stall the entire system if they are blocked for too long
919 | 	 * (e.g., ksoftirqd/N, rcuop/N, etc.).
920 | 	 */
921 | 	if (is_kthread(p) && p->nr_cpus_allowed == 1 && early_processing) {
922 | 		cpu = scx_bpf_task_cpu(p);
923 | 		scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(cpu),
924 | 					 default_slice, p->scx.dsq_vtime, enq_flags);
925 | 		__sync_fetch_and_add(&nr_kernel_dispatches, 1);
926 | 		return;
927 | 	}
928 | 	if (is_kswapd(p) || is_khugepaged(p)) {
929 | 		cpu = scx_bpf_task_cpu(p);
930 | 		scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(cpu),
931 | 					 default_slice, p->scx.dsq_vtime, enq_flags);
932 | 		__sync_fetch_and_add(&nr_kernel_dispatches, 1);
933 | 		return;
934 | 	}
935 |
936 | 	/*
937 | 	 * Handle priority tasks with custom dispatch logic.
938 | 	 */
939 | 	u64 *elem;
940 | 	u64 slice;
941 | 	u32 pid = p->pid;
942 | 	s32 prio_cpu = -EBUSY;
943 | 	u64 prio_enq_flags = SCX_ENQ_PREEMPT;
944 | 	u32 *cur_pid_val;
945 | 	u32 cur_pid;
946 |
947 | 	elem = bpf_map_lookup_elem(&priority_tasks, &pid);
948 | 	if (elem) {
949 | 		prio_cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
950 | 		if (prio_cpu == -EBUSY) {
951 | 			prio_cpu = scx_bpf_task_cpu(p);
952 | 		}
953 | 		slice = *elem;
954 | 		if (prio_cpu >= 0) {
955 | 			cur_pid_val = bpf_map_lookup_elem(&running_task, &prio_cpu);
956 | 			if (cur_pid_val) {
957 | 				cur_pid = *cur_pid_val;
958 | 				elem = bpf_map_lookup_elem(&priority_tasks, &cur_pid);
959 | 				// If the currently running task is also prioritized, do not
960 | 				// preempt it (SCX_ENQ_HEAD); otherwise keep SCX_ENQ_PREEMPT.
961 | 				if (elem) {
962 | 					prio_enq_flags = SCX_ENQ_HEAD;
963 | 				}
964 | 			}
965 | 			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | prio_cpu,
966 | 					   slice, prio_enq_flags);
967 | 			__sync_fetch_and_add(&nr_user_dispatches, 1);
968 | 			scx_bpf_kick_cpu(prio_cpu, SCX_KICK_IDLE);
969 | 			return;
970 | 		}
971 | 	}
972 |
973 | 	/*
974 | 	 * If @builtin_idle is enabled, give the task a chance to be
975 | 	 * directly dispatched only on a wakeup and only if
976 | 	 * ops.select_cpu() was skipped; otherwise the task is always
977 | 	 * queued to the user-space scheduler.
978 | 	 */
979 | 	if (!(builtin_idle && is_wakeup)) {
980 | 		queue_task_to_userspace(p, prev_cpu, enq_flags);
981 | 		goto out_kick;
982 | 	}
983 |
984 | 	/*
985 | 	 * Try to find an idle CPU in the system; if all CPUs are busy,
986 | 	 * queue the task to the user-space scheduler.
987 | 	 */
988 | 	cpu = pick_idle_cpu(p, prev_cpu, 0);
989 | 	if (cpu < 0) {
990 | 		queue_task_to_userspace(p, prev_cpu, enq_flags);
991 | 		goto out_kick;
992 | 	}
993 |
994 | 	/*
995 | 	 * Always force a CPU wakeup, so that the allocated CPU can be
996 | 	 * released and go back idle even if the task isn't directly
997 | 	 * dispatched.
998 | 	 */
999 | 	prev_cpu = cpu;
1000 | 	is_wakeup = true;
1001 |
1002 | 	/*
1003 | 	 * Perform direct dispatch only if both the SHARED_DSQ and the
1004 | 	 * per-CPU DSQ are empty; otherwise we risk starving the tasks
1005 | 	 * already waiting in those queues.
1006 | 	 */
1007 | 	if (!scx_bpf_dsq_nr_queued(SHARED_DSQ) && !scx_bpf_dsq_nr_queued(cpu_to_dsq(cpu))) {
1008 | 		/*
1009 | 		 * We can race with a dequeue here and the selected idle CPU
1010 | 		 * might not be valid anymore if the task's affinity has changed.
1011 | 		 *
1012 | 		 * In this case just wake up the picked CPU and ignore the enqueue;
1013 | 		 * another enqueue event for the same task will be received later.
1014 | 		 */
1015 | 		if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
1016 | 			goto out_kick;
1017 |
1018 | 		/*
1019 | 		 * Directly dispatch the task to the selected idle CPU (queued wakeup).
1020 | 		 */
1021 | 		scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(cpu),
1022 | 					 SCX_SLICE_DFL, p->scx.dsq_vtime, enq_flags);
1023 | 		__sync_fetch_and_add(&nr_kernel_dispatches, 1);
1024 | 		goto out_kick;
1025 | 	}
1026 |
1027 | 	/*
1028 | 	 * If we can't directly dispatch, queue the task to user-space.
1029 | 	 */
1030 | 	queue_task_to_userspace(p, prev_cpu, enq_flags);
1031 |
1032 | out_kick:
1033 | 	/*
1034 | 	 * Wake up the task's CPU if needed.
1035 | 	 */
1036 | 	if (is_wakeup)
1037 | 		scx_bpf_kick_cpu(prev_cpu, SCX_KICK_IDLE);
1038 | }
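/*
 * The priority path above is driven entirely by the `priority_tasks` map:
 * user space grants a pid a custom time slice, and goland_enqueue() then
 * dispatches that task ahead of (or preempting) the regular flow. A hedged
 * sketch of the user-space side, assuming the map fd was obtained from the
 * loaded object (e.g. via a skeleton or bpf_obj_get()):
 */
#include <bpf/bpf.h>

/* Mark @pid as a priority task with a dedicated @slice_ns time slice. */
static int set_priority_task(int prio_map_fd, __u32 pid, __u64 slice_ns)
{
	return bpf_map_update_elem(prio_map_fd, &pid, &slice_ns, BPF_ANY);
}

/* Dropping the entry demotes the task back to the normal enqueue path. */
static int clear_priority_task(int prio_map_fd, __u32 pid)
{
	return bpf_map_delete_elem(prio_map_fd, &pid);
}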
1039 |
1040 | /*
1041 |  * Handle a task dispatched from user-space, performing the actual low-level
1042 |  * BPF dispatch.
1043 |  */
1044 | static long handle_dispatched_task(struct bpf_dynptr *dynptr, void *context)
1045 | {
1046 | 	const struct dispatched_task_ctx *task;
1047 |
1048 | 	task = bpf_dynptr_data(dynptr, 0, sizeof(*task));
1049 | 	if (!task)
1050 | 		return 0;
1051 |
1052 | 	dispatch_task(task);
1053 |
1054 | 	return !!scx_bpf_dispatch_nr_slots();
1055 | }
1056 |
1057 | /*
1058 |  * Dispatch tasks that are ready to run.
1059 |  *
1060 |  * This function is called when a CPU's local DSQ is empty and ready to accept
1061 |  * new dispatched tasks.
1062 |  *
1063 |  * We may also dispatch tasks to other CPUs from here, if the scheduler
1064 |  * decided so (usually, when other CPUs are idle, we may want to send more
1065 |  * tasks to their local DSQs to optimize the scheduling pipeline).
1066 |  */
1067 | void BPF_STRUCT_OPS(goland_dispatch, s32 cpu, struct task_struct *prev)
1068 | {
1069 | 	/*
1070 | 	 * Dispatch the user-space scheduler if there's any pending action
1071 | 	 * to do, consuming up to MAX_USERSCHED_DISPATCH tasks from SCHED_DSQ.
1072 | 	 */
1073 | 	if (usersched_has_pending_tasks()) {
1074 | 		int consumed = 0;
1075 | 		while (scx_bpf_dsq_move_to_local(SCHED_DSQ) && consumed++ < MAX_USERSCHED_DISPATCH)
1076 | 			;
1077 | 		return;
1078 | 	}
1079 |
1080 | 	/*
1081 | 	 * Consume all tasks from the @dispatched list and immediately
1082 | 	 * dispatch them on the target CPU decided by the user-space
1083 | 	 * scheduler.
1084 | 	 */
1085 | 	s32 ret = bpf_user_ringbuf_drain(&dispatched,
1086 | 					 handle_dispatched_task, NULL, BPF_RB_NO_WAKEUP);
1087 | 	if (ret < 0)
1088 | 		dbg_msg("User ringbuf drain error: %d", ret);
1089 |
1090 | 	/*
1091 | 	 * Consume a task from the per-CPU DSQ.
1092 | 	 */
1093 | 	if (scx_bpf_dsq_move_to_local(cpu_to_dsq(cpu)))
1094 | 		return;
1095 |
1096 | 	/*
1097 | 	 * Consume a task from the shared DSQ.
1098 | 	 */
1099 | 	if (scx_bpf_dsq_move_to_local(SHARED_DSQ))
1100 | 		return;
1101 |
1102 | 	/*
1103 | 	 * If the current task expired its time slice and no other task
1104 | 	 * wants to run, simply replenish its time slice and let it run for
1105 | 	 * another round on the same CPU.
1106 | 	 *
1107 | 	 * In case of the user-space scheduler task, replenish its time
1108 | 	 * slice only if there are still pending scheduling actions to do.
1109 | 	 */
1110 | 	if (prev && is_queued(prev) &&
1111 | 	    (!is_belong_usersched_task(prev) || usersched_has_pending_tasks()))
1112 | 		prev->scx.slice = default_slice;
1113 | }
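/*
 * The `dispatched` map drained above is a user ring buffer, i.e. data flows
 * from user space into the kernel. The producer side (Go/libbpfgo in this
 * project) follows libbpf's user ring buffer API; a minimal C sketch, with
 * struct dispatched_task_ctx assumed to match the BPF side and the buffer
 * created beforehand via user_ring_buffer__new():
 */
#include <errno.h>
#include <bpf/libbpf.h>

static int publish_dispatch(struct user_ring_buffer *rb,
			    const struct dispatched_task_ctx *decision)
{
	struct dispatched_task_ctx *slot;

	slot = user_ring_buffer__reserve(rb, sizeof(*slot));
	if (!slot)
		return -errno;	/* ring full: back off and retry later */

	*slot = *decision;
	/* goland_dispatch() picks this up via bpf_user_ringbuf_drain(). */
	user_ring_buffer__submit(rb, slot);
	return 0;
}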
1114 |
1115 | void BPF_STRUCT_OPS(goland_runnable, struct task_struct *p, u64 enq_flags)
1116 | {
1117 | 	struct task_ctx *tctx;
1118 |
1119 | 	if (is_belong_usersched_task(p))
1120 | 		return;
1121 |
1122 | 	tctx = try_lookup_task_ctx(p);
1123 | 	if (!tctx)
1124 | 		return;
1125 |
1126 | 	tctx->exec_runtime = 0;	/* reset the on-CPU time accumulated since the last wakeup */
1127 | }
1128 |
1129 | /*
1130 |  * Task @p starts running on its selected CPU (update the CPU ownership map).
1131 |  */
1132 | void BPF_STRUCT_OPS(goland_running, struct task_struct *p)
1133 | {
1134 | 	s32 cpu = scx_bpf_task_cpu(p);
1135 | 	struct task_ctx *tctx;
1136 |
1137 | 	u32 pid = p->pid;
1138 | 	bpf_map_update_elem(&running_task, &cpu, &pid, BPF_ANY);
1139 |
1140 | 	if (is_usersched_task(p)) {
1141 | 		usersched_last_run_at = scx_bpf_now();
1142 | 		return;
1143 | 	}
1144 |
1145 | 	dbg_msg("start: pid=%d (%s) cpu=%d", p->pid, p->comm, cpu);
1146 |
1147 | 	/*
1148 | 	 * Account one more running task (the user-space scheduler is
1149 | 	 * intentionally excluded from this count).
1150 | 	 */
1151 | 	__sync_fetch_and_add(&nr_running, 1);
1152 |
1153 | 	tctx = try_lookup_task_ctx(p);
1154 | 	if (!tctx)
1155 | 		return;
1156 | 	tctx->start_ts = scx_bpf_now();
1157 | }
1158 |
1159 | /*
1160 |  * Task @p stops running on its associated CPU (update runtime accounting).
1161 |  */
1162 | void BPF_STRUCT_OPS(goland_stopping, struct task_struct *p, bool runnable)
1163 | {
1164 | 	u64 now = scx_bpf_now();
1165 | 	s32 cpu = scx_bpf_task_cpu(p);
1166 | 	struct task_ctx *tctx;
1167 |
1168 | 	if (is_belong_usersched_task(p)) {
1169 | 		if (nr_scheduled + nr_queued == 0) {
1170 | 			test_and_clear_usersched_needed();
1171 | 		}
1172 | 		return;
1173 | 	}
1174 |
1175 | 	dbg_msg("stop: pid=%d (%s) cpu=%d", p->pid, p->comm, cpu);
1176 |
1177 | 	__sync_fetch_and_sub(&nr_running, 1);
1178 |
1179 | 	tctx = try_lookup_task_ctx(p);
1180 | 	if (!tctx)
1181 | 		return;
1182 | 	tctx->stop_ts = now;
1183 |
1184 | 	/*
1185 | 	 * Update the partial execution time accumulated since the last sleep.
1186 | 	 */
1187 | 	tctx->exec_runtime += now - tctx->start_ts;
1188 | }
1189 |
1190 | /*
1191 |  * A task joins the sched_ext scheduling class.
1192 |  */
1193 | void BPF_STRUCT_OPS(goland_enable, struct task_struct *p)
1194 | {
1195 | 	p->scx.dsq_vtime = 0;
1196 | 	p->scx.slice = SCX_SLICE_DFL;
1197 | }
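/*
 * Putting the three callbacks above together, exec_runtime tracks the
 * on-CPU time a task has accumulated since it last became runnable. A
 * worked trace with made-up timestamps (in us):
 *
 *   runnable:             exec_runtime = 0
 *   running  (t = 100):   start_ts = 100
 *   stopping (t = 400):   stop_ts = 400, exec_runtime += 300  -> 300
 *   running  (t = 500):   start_ts = 500
 *   stopping (t = 600):   stop_ts = 600, exec_runtime += 100  -> 400
 *
 * get_task_info() then ships this value to the user-space scheduler, which
 * can use it, for instance, to penalize CPU-bound tasks in its policy.
 */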
1198 |
1199 | /*
1200 |  * A new task @p is being created.
1201 |  *
1202 |  * Allocate and initialize all the internal structures for the task (this
1203 |  * function is allowed to block, so it can be used to preallocate memory).
1204 |  */
1205 | s32 BPF_STRUCT_OPS(goland_init_task, struct task_struct *p,
1206 | 		   struct scx_init_task_args *args)
1207 | {
1208 | 	struct task_ctx *tctx;
1209 | 	struct bpf_cpumask *cpumask;
1210 |
1211 | 	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
1212 | 				    BPF_LOCAL_STORAGE_GET_F_CREATE);
1213 | 	if (!tctx)
1214 | 		return -ENOMEM;
1215 |
1216 | 	/*
1217 | 	 * Create the task's L2 cache cpumask.
1218 | 	 */
1219 | 	cpumask = bpf_cpumask_create();
1220 | 	if (!cpumask)
1221 | 		return -ENOMEM;
1222 | 	cpumask = bpf_kptr_xchg(&tctx->l2_cpumask, cpumask);
1223 | 	if (cpumask)
1224 | 		bpf_cpumask_release(cpumask);
1225 |
1226 | 	/*
1227 | 	 * Create the task's L3 cache cpumask.
1228 | 	 */
1229 | 	cpumask = bpf_cpumask_create();
1230 | 	if (!cpumask)
1231 | 		return -ENOMEM;
1232 | 	cpumask = bpf_kptr_xchg(&tctx->l3_cpumask, cpumask);
1233 | 	if (cpumask)
1234 | 		bpf_cpumask_release(cpumask);
1235 |
1236 | 	return 0;
1237 | }
1238 |
1239 | /*
1240 |  * Heartbeat scheduler timer callback.
1241 |  *
1242 |  * If the system is completely idle, the sched-ext watchdog may incorrectly
1243 |  * detect that as a stall and automatically disable the scheduler. So, use this
1244 |  * timer to periodically wake up the scheduler and avoid long inactivity.
1245 |  *
1246 |  * This can also help to prevent real "stalling" conditions in the scheduler.
1247 |  */
1248 | static int usersched_timer_fn(void *map, int *key, struct bpf_timer *timer)
1249 | {
1250 | 	struct task_struct *p;
1251 | 	int err = 0;
1252 |
1253 | 	/*
1254 | 	 * Trigger the user-space scheduler if it has been inactive for
1255 | 	 * more than USERSCHED_TIMER_NS.
1256 | 	 */
1257 | 	if (time_delta(scx_bpf_now(), usersched_last_run_at) >= USERSCHED_TIMER_NS) {
1258 | 		bpf_rcu_read_lock();
1259 | 		p = bpf_task_from_pid(usersched_pid);
1260 | 		if (p) {
1261 | 			set_usersched_needed();
1262 | 			scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
1263 | 			bpf_task_release(p);
1264 | 		}
1265 | 		bpf_rcu_read_unlock();
1266 | 	}
1267 |
1268 | 	/* Re-arm the timer */
1269 | 	err = bpf_timer_start(timer, USERSCHED_TIMER_NS, 0);
1270 | 	if (err)
1271 | 		scx_bpf_error("Failed to re-arm scheduler timer");
1272 |
1273 | 	return 0;
1274 | }
1275 |
1276 | /*
1277 |  * Initialize the heartbeat scheduler timer.
1278 |  */
1279 | static int usersched_timer_init(void)
1280 | {
1281 | 	struct bpf_timer *timer;
1282 | 	u32 key = 0;
1283 | 	int err;
1284 |
1285 | 	timer = bpf_map_lookup_elem(&usersched_timer, &key);
1286 | 	if (!timer) {
1287 | 		scx_bpf_error("Failed to lookup scheduler timer");
1288 | 		return -ESRCH;
1289 | 	}
1290 | 	bpf_timer_init(timer, &usersched_timer, CLOCK_BOOTTIME);
1291 | 	bpf_timer_set_callback(timer, usersched_timer_fn);
1292 | 	err = bpf_timer_start(timer, USERSCHED_TIMER_NS, 0);
1293 | 	if (err)
1294 | 		scx_bpf_error("Failed to arm scheduler timer");
1295 |
1296 | 	return err;
1297 | }
1298 |
1299 | /*
1300 |  * Count the number of online CPUs.
1301 |  */
1302 | static s32 get_nr_online_cpus(void)
1303 | {
1304 | 	const struct cpumask *online_cpumask;
1305 | 	int i, cpus = 0;
1306 |
1307 | 	online_cpumask = scx_bpf_get_online_cpumask();
1308 |
1309 | 	bpf_for(i, 0, nr_cpu_ids) {
1310 | 		if (!bpf_cpumask_test_cpu(i, online_cpumask))
1311 | 			continue;
1312 | 		cpus++;
1313 | 	}
1314 |
1315 | 	scx_bpf_put_cpumask(online_cpumask);
1316 |
1317 | 	return cpus;
1318 | }
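/*
 * For reference, the usersched_timer map used above has to be a map whose
 * value type embeds a struct bpf_timer, since that is what bpf_timer_init()
 * operates on. One plausible single-entry shape, sketched here only because
 * the actual definition lives earlier in this file:
 */
struct heartbeat_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct heartbeat_timer);
} heartbeat_timer_example SEC(".maps");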
1319 |
1320 | /*
1321 |  * Create a DSQ for each CPU available in the system and a global shared DSQ.
1322 |  *
1323 |  * All the tasks processed by the user-space scheduler can be dispatched either
1324 |  * to a specific CPU/DSQ or to the first CPU available (SHARED_DSQ).
1325 |  *
1326 |  * Custom DSQs are then consumed from the .dispatch() callback, which will
1327 |  * transfer all the enqueued tasks to the consuming CPU's local DSQ.
1328 |  */
1329 | static int dsq_init(void)
1330 | {
1331 | 	int err;
1332 | 	s32 cpu;
1333 |
1334 | 	/* Initialize the number of online CPUs */
1335 | 	nr_online_cpus = get_nr_online_cpus();
1336 |
1337 | 	/* Create per-CPU DSQs */
1338 | 	bpf_for(cpu, 0, nr_cpu_ids) {
1339 | 		err = scx_bpf_create_dsq(cpu_to_dsq(cpu), -1);
1340 | 		if (err) {
1341 | 			scx_bpf_error("failed to create pcpu DSQ %d: %d",
1342 | 				      cpu, err);
1343 | 			return err;
1344 | 		}
1345 | 	}
1346 |
1347 | 	/* Create the global shared DSQ */
1348 | 	err = scx_bpf_create_dsq(SHARED_DSQ, -1);
1349 | 	if (err) {
1350 | 		scx_bpf_error("failed to create shared DSQ: %d", err);
1351 | 		return err;
1352 | 	}
1353 |
1354 | 	/* Create the scheduler's DSQ */
1355 | 	err = scx_bpf_create_dsq(SCHED_DSQ, -1);
1356 | 	if (err) {
1357 | 		scx_bpf_error("failed to create scheduler DSQ: %d", err);
1358 | 		return err;
1359 | 	}
1360 |
1361 | 	return 0;
1362 | }
1363 |
1364 | static int init_cpumask(struct bpf_cpumask **cpumask)
1365 | {
1366 | 	struct bpf_cpumask *mask;
1367 | 	int err = 0;
1368 |
1369 | 	/*
1370 | 	 * Do nothing if the mask is already initialized.
1371 | 	 */
1372 | 	mask = *cpumask;
1373 | 	if (mask)
1374 | 		return 0;
1375 | 	/*
1376 | 	 * Create the CPU mask.
1377 | 	 */
1378 | 	err = calloc_cpumask(cpumask);
1379 | 	if (!err)
1380 | 		mask = *cpumask;
1381 | 	if (!mask)
1382 | 		err = -ENOMEM;
1383 |
1384 | 	return err;
1385 | }
1386 |
1387 | SEC("syscall")
1388 | int enable_sibling_cpu(struct domain_arg *input)
1389 | {
1390 | 	struct cpu_ctx *cctx;
1391 | 	struct bpf_cpumask *mask, **pmask;
1392 | 	int err = 0;
1393 |
1394 | 	cctx = try_lookup_cpu_ctx(input->cpu_id);
1395 | 	if (!cctx)
1396 | 		return -ENOENT;
1397 |
1398 | 	/* Make sure the target CPU mask is initialized */
1399 | 	switch (input->lvl_id) {
1400 | 	case 2:
1401 | 		pmask = &cctx->l2_cpumask;
1402 | 		break;
1403 | 	case 3:
1404 | 		pmask = &cctx->l3_cpumask;
1405 | 		break;
1406 | 	default:
1407 | 		return -EINVAL;
1408 | 	}
1409 | 	err = init_cpumask(pmask);
1410 | 	if (err)
1411 | 		return err;
1412 |
1413 | 	bpf_rcu_read_lock();
1414 | 	mask = *pmask;
1415 | 	if (mask)
1416 | 		bpf_cpumask_set_cpu(input->sibling_cpu_id, mask);
1417 | 	bpf_rcu_read_unlock();
1418 |
1419 | 	return err;
1420 | }
1421 |
1422 | /*
1423 |  * Initialize the scheduling class.
1424 |  */
1425 | s32 BPF_STRUCT_OPS_SLEEPABLE(goland_init)
1426 | {
1427 | 	int err;
1428 |
1429 | 	/* Compile-time checks */
1430 | 	BUILD_BUG_ON((MAX_CPUS % 2));
1431 |
1432 | 	/* Initialize the maximum possible CPU number */
1433 | 	nr_cpu_ids = scx_bpf_nr_cpu_ids();
1434 |
1435 | 	/* Initialize goland core */
1436 | 	err = dsq_init();
1437 | 	if (err)
1438 | 		return err;
1439 | 	err = usersched_timer_init();
1440 | 	if (err)
1441 | 		return err;
1442 |
1443 | 	return 0;
1444 | }
1445 |
1446 | /*
1447 |  * A task is being destroyed.
1448 |  *
1449 |  * Clean up the task's entry in the priority tasks map.
1450 |  */
1451 | void BPF_STRUCT_OPS(goland_exit_task, struct task_struct *p,
1452 | 		    struct scx_exit_task_args *args)
1453 | {
1454 | 	/* Remove the task from the priority tasks map */
1455 | 	update_priority_task_map(p->pid, 1, 0);
1456 | }
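/*
 * The L2/L3 sibling masks populated by enable_sibling_cpu() are fed from
 * user space (in this project, from the Go side) by running the program
 * once per (cpu, sibling) pair, through the same BPF_PROG_RUN mechanism
 * sketched after rs_select_cpu(). E.g., to record CPU 1 as an L3 sibling
 * of CPU 0 (struct domain_arg assumed to match the BPF side):
 */
#include <bpf/bpf.h>

static int add_l3_sibling(int prog_fd, int cpu, int sibling)
{
	struct domain_arg arg = {
		.lvl_id = 3,
		.cpu_id = cpu,
		.sibling_cpu_id = sibling,
	};
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.ctx_in = &arg,
		.ctx_size_in = sizeof(arg),
	);

	return bpf_prog_test_run_opts(prog_fd, &opts);
}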
1457 |
1458 | /*
1459 |  * Unregister the scheduling class.
1460 |  */
1461 | void BPF_STRUCT_OPS(goland_exit, struct scx_exit_info *ei)
1462 | {
1463 | 	UEI_RECORD(uei, ei);
1464 | }
1465 |
1466 | /*
1467 |  * A CPU is about to change its idle state. If the CPU is going idle, ensure
1468 |  * that the user-space scheduler has a chance to run if there is any remaining
1469 |  * work to do.
1470 |  */
1471 | void BPF_STRUCT_OPS(goland_update_idle, s32 cpu, bool idle)
1472 | {
1473 | 	/*
1474 | 	 * Don't do anything if we exit from an idle state; a CPU owner will
1475 | 	 * be assigned in .running().
1476 | 	 */
1477 | 	if (!idle)
1478 | 		return;
1479 | 	/*
1480 | 	 * A CPU is now available: notify the user-space scheduler that tasks
1481 | 	 * can be dispatched, if there is at least one task waiting to be
1482 | 	 * scheduled, either queued (accounted in nr_queued) or scheduled
1483 | 	 * (accounted in nr_scheduled).
1484 | 	 *
1485 | 	 * NOTE: nr_queued is incremented by the BPF component, specifically in
1486 | 	 * .enqueue(), when a task is sent to the user-space scheduler; the
1487 | 	 * scheduler then drains the queued tasks (updating nr_queued) and adds
1488 | 	 * them to its internal data structures / state. At this point tasks
1489 | 	 * become "scheduled" and the user-space scheduler takes care of
1490 | 	 * updating nr_scheduled accordingly; lastly, tasks are dispatched
1491 | 	 * and the user-space scheduler updates nr_scheduled again.
1492 | 	 *
1493 | 	 * Checking both counters allows us to determine if there is still some
1494 | 	 * pending work for the scheduler: new tasks have been queued since
1495 | 	 * the last check, or there are still tasks "queued" or "scheduled"
1496 | 	 * since the previous user-space scheduler run. If both counters are
1497 | 	 * zero, it is pointless to wake up the scheduler (even if a CPU
1498 | 	 * becomes idle), because there is nothing to do.
1499 | 	 *
1500 | 	 * Keep in mind that update_idle() doesn't run concurrently with the
1501 | 	 * user-space scheduler (which is single-threaded): this function is
1502 | 	 * naturally serialized with the user-space scheduler code, therefore
1503 | 	 * this check is also safe from a concurrency perspective.
1504 | 	 */
1505 | 	if (nr_queued || nr_scheduled) {
1506 | 		/*
1507 | 		 * Notify the user-space scheduler that it should run, and kick
1508 | 		 * this CPU to make it immediately ready to accept dispatched tasks.
1509 | 		 */
1510 | 		set_usersched_needed();
1511 | 		scx_bpf_kick_cpu(cpu, 0);
1512 | 	}
1513 | }
1514 |
1515 | /*
1516 |  * Scheduling class declaration.
1517 |  */
1518 | SCX_OPS_DEFINE(goland,
1519 | 	       .select_cpu = (void *)goland_select_cpu,
1520 | 	       .enqueue = (void *)goland_enqueue,
1521 | 	       .dispatch = (void *)goland_dispatch,
1522 | 	       .update_idle = (void *)goland_update_idle,
1523 | 	       .runnable = (void *)goland_runnable,
1524 | 	       .running = (void *)goland_running,
1525 | 	       .stopping = (void *)goland_stopping,
1526 | 	       .enable = (void *)goland_enable,
1527 | 	       .init_task = (void *)goland_init_task,
1528 | 	       .exit_task = (void *)goland_exit_task,
1529 | 	       .init = (void *)goland_init,
1530 | 	       .exit = (void *)goland_exit,
1531 | 	       .timeout_ms = 5000,
1532 | 	       .dispatch_max_batch = MAX_DISPATCH_SLOT,
1533 | 	       .flags = SCX_OPS_ENQ_LAST |
1534 | 			SCX_OPS_KEEP_BUILTIN_IDLE,
1535 | 	       .name = "goland");
--------------------------------------------------------------------------------
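The `goland` struct_ops declared at the end of main.bpf.c only takes effect once user space registers it with the kernel. With libbpf that boils down to attaching the struct_ops map; a hedged C sketch, where the skeleton name `main_bpf` and its generated header are assumptions (this project actually drives the lifecycle from Go via libbpfgo):

#include <bpf/libbpf.h>
#include "main.skeleton.h"	/* hypothetical generated skeleton header */

int run_goland(void)
{
	struct main_bpf *skel = main_bpf__open_and_load();
	struct bpf_link *link;

	if (!skel)
		return -1;

	/* Registering the struct_ops map activates the "goland" sched class. */
	link = bpf_map__attach_struct_ops(skel->maps.goland);
	if (!link) {
		main_bpf__destroy(skel);
		return -1;
	}

	/* ... main loop: drain `queued`, feed `dispatched` ... */

	bpf_link__destroy(link);	/* unregisters the scheduler */
	main_bpf__destroy(skel);
	return 0;
}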