├── assets
    ├── demo.gif
    └── design.png
├── .gitmodules
├── .gitignore
├── util
    ├── endian.go
    ├── topology.go
    ├── emun_import.go
    └── emun.go
├── go.mod
├── goland_core
    ├── scheduler.go
    ├── uei.go
    ├── bss.go
    ├── task.go
    ├── rodata.go
    └── obj.go
├── .github
    ├── workflows
    │   └── go.yaml
    └── actions
    │   └── build-dependencies
    │       └── action.yaml
├── scripts
    └── test_scheduler.sh
├── go.sum
├── wrapper.h
├── intf.h
├── README.md
├── Makefile
├── wrapper.c
├── main.go
├── LICENSE
└── main.bpf.c


/assets/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Gthulhu/qumun/HEAD/assets/demo.gif


--------------------------------------------------------------------------------
/assets/design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Gthulhu/qumun/HEAD/assets/design.png


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "scx"]
2 | 	path = scx
3 | 	url = https://github.com/sched-ext/scx.git
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode
 2 | libbpf
 3 | libbpfgo
 4 | output
 5 | *.ll
 6 | *.o
 7 | *.skeleton.h
 8 | main
 9 | scx
10 | libwrapper.a
11 | 


--------------------------------------------------------------------------------
/util/endian.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import (
 4 | 	"encoding/binary"
 5 | 	"unsafe"
 6 | )
 7 | 
 8 | func Endian() binary.ByteOrder {
 9 | 	var i int32 = 0x01020304
10 | 	u := unsafe.Pointer(&i)
11 | 	pb := (*byte)(u)
12 | 	b := *pb
13 | 	if b == 0x04 {
14 | 		return binary.LittleEndian
15 | 	}
16 | 
17 | 	return binary.BigEndian
18 | }
19 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/Gthulhu/qumun
 2 | 
 3 | go 1.24.0
 4 | 
 5 | toolchain go1.24.2
 6 | 
 7 | require (
 8 | 	github.com/Gthulhu/plugin v1.0.1
 9 | 	github.com/aquasecurity/libbpfgo v0.8.0-libbpf-1.5
10 | 	golang.org/x/sys v0.37.0
11 | )
12 | 
13 | require github.com/cilium/ebpf v0.20.0
14 | 
15 | replace github.com/aquasecurity/libbpfgo => ./libbpfgo
16 | 


--------------------------------------------------------------------------------
/goland_core/scheduler.go:
--------------------------------------------------------------------------------
 1 | package core
 2 | 
 3 | import (
 4 | 	"github.com/Gthulhu/plugin/models"
 5 | )
 6 | 
 7 | func (s *Sched) DrainQueuedTask() int {
 8 | 	if s.plugin != nil {
 9 | 		return s.plugin.DrainQueuedTask(s)
10 | 	}
11 | 	return 0
12 | }
13 | 
14 | func (s *Sched) SelectQueuedTask() *models.QueuedTask {
15 | 	if s.plugin != nil {
16 | 		return s.plugin.SelectQueuedTask(s)
17 | 	}
18 | 	return nil
19 | }
20 | 
21 | func (s *Sched) SelectCPU(t *models.QueuedTask) (error, int32) {
22 | 	if s.plugin != nil {
23 | 		return s.plugin.SelectCPU(s, t)
24 | 	}
25 | 	return s.selectCPU(t)
26 | }
27 | 
28 | func (s *Sched) DetermineTimeSlice(t *models.QueuedTask) uint64 {
29 | 	if s.plugin != nil {
30 | 		return s.plugin.DetermineTimeSlice(s, t)
31 | 	}
32 | 	return 0
33 | }
34 | 
35 | func (s *Sched) GetPoolCount() uint64 {
36 | 	if s.plugin != nil {
37 | 		return s.plugin.GetPoolCount()
38 | 	}
39 | 	return 0
40 | }
41 | 


--------------------------------------------------------------------------------
/.github/workflows/go.yaml:
--------------------------------------------------------------------------------
 1 | name: Go
 2 | on:
 3 |   push:
 4 |     branches: [ main ]
 5 |   pull_request:
 6 |     branches:
 7 |       - main
 8 |   workflow_call:
 9 | jobs:
10 |   self-tests:
11 |     name: Selftests
12 |     runs-on: ubuntu-24.04
13 |     strategy:
14 |       matrix:
15 |         go-version: [ 'stable' ]
16 |     steps:
17 |       - name: Checkout Code
18 |         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
19 |       - name: Install Dependencies
20 |         uses: ./.github/actions/build-dependencies
21 |         with:
22 |           go-version: ${{ matrix.go-version }}
23 |       - name: Static Selftests
24 |         run: |
25 |           make dep
26 |           git submodule init
27 |           git submodule sync
28 |           git submodule update
29 |           cd scx
30 |           meson setup build --prefix ~
31 |           meson compile -C build
32 |           cd ..
33 |           cd libbpfgo
34 |           make
35 |           cd ..
36 |           make build
37 |           make test
38 |         shell: bash


--------------------------------------------------------------------------------
/.github/actions/build-dependencies/action.yaml:
--------------------------------------------------------------------------------
 1 | name: Build Dependencies
 2 | description: |
 3 |   Install build dependencies to test and compile tracee artifacts
 4 | inputs:
 5 |   go-version:
 6 |     description: go version
 7 |     default: "1.21"
 8 | runs:
 9 |   using: composite
10 |   steps:
11 |     - name: Setup Go
12 |       uses: actions/setup-go@cdcb36043654635271a94b9a6d1392de5bb323a7 # v5.0.1
13 |       with:
14 |         go-version: "${{ inputs.go-version }}"
15 |     - name: Install Compilers & Formatters
16 |       run: |
17 |         sudo apt-get update
18 |         sudo apt-get install --yes bsdutils
19 |         sudo apt-get install --yes build-essential
20 |         sudo apt-get install --yes pkgconf
21 |         sudo apt-get install --yes llvm-17 clang-17 clang-format-17
22 |         sudo apt-get install --yes libbpf-dev libelf-dev libzstd-dev zlib1g-dev
23 |         sudo apt-get install --yes virtme-ng
24 |         sudo apt-get install --yes gcc-multilib
25 |         sudo apt-get install --yes systemtap-sdt-dev
26 |         sudo apt-get install --yes python3 python3-pip ninja-build
27 |         sudo apt-get install --yes libseccomp-dev protobuf-compiler
28 |         pip3 install --user meson
29 |         for tool in "clang" "clang-format" "llc" "llvm-strip"
30 |         do
31 |           sudo rm -f /usr/bin/$tool
32 |           sudo ln -s /usr/bin/$tool-17 /usr/bin/$tool
33 |         done
34 |       shell: bash
35 | 


--------------------------------------------------------------------------------
/scripts/test_scheduler.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Scheduler test script
 3 | # This script runs the scheduler and verifies it starts successfully
 4 | 
 5 | set -e
 6 | 
 7 | LOGFILE="/tmp/scheduler_test.log"
 8 | TIMEOUT_DURATION=60
 9 | WARMUP_TIME=15
10 | 
11 | echo "Starting scheduler test..."
12 | 
13 | # Run scheduler in background
14 | timeout ${TIMEOUT_DURATION} ./main > "${LOGFILE}" 2>&1 &
15 | SCHED_PID=$!
16 | 
17 | echo "Scheduler PID: ${SCHED_PID}"
18 | 
19 | # Wait for scheduler to initialize
20 | sleep ${WARMUP_TIME}
21 | 
22 | # Check if scheduler is still running
23 | if ! ps -p ${SCHED_PID} > /dev/null 2>&1; then
24 |     echo "✗ Scheduler crashed during initialization"
25 |     echo "Log output:"
26 |     cat "${LOGFILE}"
27 |     exit 1
28 | fi
29 | 
30 | echo "✓ Scheduler is running"
31 | 
32 | # Check if scheduler started successfully
33 | if grep -q "scheduler started" "${LOGFILE}"; then
34 |     echo "✓ Scheduler started successfully"
35 | else
36 |     echo "✗ Scheduler did not start properly"
37 |     echo "Log output:"
38 |     cat "${LOGFILE}"
39 |     kill ${SCHED_PID} 2>/dev/null || true
40 |     exit 1
41 | fi
42 | 
43 | # Let it run for a few more seconds
44 | sleep 20
45 | 
46 | # Check final stats
47 | if grep -q "bss data" "${LOGFILE}"; then
48 |     echo "✓ Scheduler produced stats"
49 | fi
50 | 
51 | # Clean shutdown
52 | echo "Stopping scheduler..."
53 | kill ${SCHED_PID} 2>/dev/null || true
54 | wait ${SCHED_PID} 2>/dev/null || true
55 | 
56 | echo "✓ Test completed successfully"
57 | exit 0
58 | 


--------------------------------------------------------------------------------
/goland_core/uei.go:
--------------------------------------------------------------------------------
 1 | package core
 2 | 
 3 | import "C"
 4 | 
 5 | import (
 6 | 	"bytes"
 7 | 	"encoding/binary"
 8 | 	"fmt"
 9 | 	"log"
10 | 	"unsafe"
11 | 
12 | 	bpf "github.com/aquasecurity/libbpfgo"
13 | )
14 | 
15 | const (
16 | 	UEI_REASON_LEN = 128
17 | 	UEI_MSG_LEN    = 1024
18 | )
19 | 
20 | type UserExitInfo struct {
21 | 	Kind     int32
22 | 	Paid     uint32
23 | 	ExitCode int64
24 | 	Reason   [UEI_REASON_LEN]C.char
25 | 	Message  [UEI_MSG_LEN]C.char
26 | }
27 | 
28 | type UeiMap struct {
29 | 	*bpf.BPFMap
30 | }
31 | 
32 | func (s *Sched) Stopped() bool {
33 | 	uei, err := s.GetUeiData()
34 | 	if err != nil {
35 | 		log.Printf("uei: %v", err)
36 | 		return true
37 | 	}
38 | 	if uei.Kind != 0 || uei.ExitCode != 0 {
39 | 		log.Printf("uei.kind %v, uei.ExitCode: %v", uei.Kind, uei.ExitCode)
40 | 		return true
41 | 	}
42 | 	return false
43 | }
44 | 
45 | func (s *Sched) GetUeiData() (UserExitInfo, error) {
46 | 	if s.uei == nil {
47 | 		return UserExitInfo{}, fmt.Errorf("UeiMap is nil")
48 | 	}
49 | 	i := 0
50 | 	b, err := s.uei.BPFMap.GetValue(unsafe.Pointer(&i))
51 | 	if err != nil {
52 | 		return UserExitInfo{}, err
53 | 	}
54 | 	var uei UserExitInfo
55 | 	buff := bytes.NewBuffer(b)
56 | 	err = binary.Read(buff, binary.LittleEndian, &uei)
57 | 	if err != nil {
58 | 		return UserExitInfo{}, err
59 | 	}
60 | 	return uei, nil
61 | }
62 | 
63 | func (uei *UserExitInfo) GetReason() string {
64 | 	return C.GoString(&uei.Reason[0])
65 | }
66 | 
67 | func (uei *UserExitInfo) GetMessage() string {
68 | 	return C.GoString(&uei.Message[0])
69 | }
70 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/Gthulhu/plugin v1.0.1 h1:RC76Xah9D6IsvSa1N/H3kTTVgHRuRyMM9+y4B1veilM=
 2 | github.com/Gthulhu/plugin v1.0.1/go.mod h1:PJn7yc+XAtSD8peMRyyNN/kznESJSnfQaaTX68yKBDo=
 3 | github.com/cilium/ebpf v0.20.0 h1:atwWj9d3NffHyPZzVlx3hmw1on5CLe9eljR8VuHTwhM=
 4 | github.com/cilium/ebpf v0.20.0/go.mod h1:pzLjFymM+uZPLk/IXZUL63xdx5VXEo+enTzxkZXdycw=
 5 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 6 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 7 | github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6 h1:teYtXy9B7y5lHTp8V9KPxpYRAVA7dozigQcMiBust1s=
 8 | github.com/go-quicktest/qt v1.101.1-0.20240301121107-c6c8733fa1e6/go.mod h1:p4lGIVX+8Wa6ZPNDvqcxq36XpUDLh42FLetFU7odllI=
 9 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
10 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
11 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
12 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
13 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
14 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
15 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
16 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
17 | github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
18 | github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
19 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
20 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
21 | golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
22 | golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
23 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
24 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
25 | kernel.org/pub/linux/libs/security/libcap/cap v1.2.76 h1:mrdLPj8ujM6eIKGtd1PkkuCIodpFFDM42Cfm0YODkIM=
26 | kernel.org/pub/linux/libs/security/libcap/cap v1.2.76/go.mod h1:7V2BQeHnVAQwhCnCPJ977giCeGDiywVewWF+8vkpPlc=
27 | kernel.org/pub/linux/libs/security/libcap/psx v1.2.76 h1:3DyzQ30OHt3wiOZVL1se2g1PAPJIU7+tMUyvfMUj1dY=
28 | kernel.org/pub/linux/libs/security/libcap/psx v1.2.76/go.mod h1:+l6Ee2F59XiJ2I6WR5ObpC1utCQJZ/VLsEbQCD8RG24=
29 | 


--------------------------------------------------------------------------------
/wrapper.h:
--------------------------------------------------------------------------------
  1 | #ifndef WRAPPER_H__
  2 | #define WRAPPER_H__
  3 | typedef unsigned int __u32;
  4 | 
  5 | typedef __u32 u32;
  6 | 
  7 | typedef signed char __s8;
  8 | 
  9 | typedef unsigned char __u8;
 10 | 
 11 | typedef short unsigned int __u16;
 12 | 
 13 | typedef int __s32;
 14 | 
 15 | typedef long long int __s64;
 16 | 
 17 | typedef long long unsigned int __u64;
 18 | 
 19 | typedef __s8 s8;
 20 | 
 21 | typedef __u8 u8;
 22 | 
 23 | typedef __u16 u16;
 24 | 
 25 | typedef __s32 s32;
 26 | 
 27 | typedef __s64 s64;
 28 | 
 29 | typedef __u64 u64;
 30 | 
 31 | enum uei_sizes {
 32 | 	UEI_REASON_LEN		= 128,
 33 | 	UEI_MSG_LEN		= 1024,
 34 | 	UEI_DUMP_DFL_LEN	= 32768,
 35 | };
 36 | 
 37 | struct user_exit_info {
 38 | 	int		kind;
 39 | 	s64		exit_code;
 40 | 	char	reason[UEI_REASON_LEN];
 41 | 	char	msg[UEI_MSG_LEN];
 42 | };
 43 | #include "main.skeleton.h"
 44 | 
 45 | void *open_skel();
 46 | 
 47 | u32 get_usersched_pid();
 48 | 
 49 | void set_usersched_pid(u32 id);
 50 | 
 51 | void set_kugepagepid(u32 id);
 52 | 
 53 | void set_debug(bool enabled);
 54 | 
 55 | void set_builtin_idle(bool enabled);
 56 | 
 57 | void set_early_processing(bool enabled);
 58 | 
 59 | void set_default_slice(u64 t);
 60 | 
 61 | u64 get_nr_scheduled();
 62 | 
 63 | u64 get_nr_queued();
 64 | 
 65 | void notify_complete(u64 nr_pending);
 66 | 
 67 | void sub_nr_queued();
 68 | 
 69 | void dec_nr_queued(u64 num);
 70 | 
 71 | void destroy_skel(void *);
 72 | 
 73 | void set_scx_enums(
 74 | 	u64 SCX_OPS_NAME_LEN,
 75 | 	u64 SCX_SLICE_DFL,
 76 | 	u64 SCX_SLICE_INF,
 77 | 	u64 SCX_RQ_ONLINE,
 78 | 	u64 SCX_RQ_CAN_STOP_TICK,
 79 | 	u64 SCX_RQ_BAL_PENDING,
 80 | 	u64 SCX_RQ_BAL_KEEP,
 81 | 	u64 SCX_RQ_BYPASSING,
 82 | 	u64 SCX_RQ_CLK_VALID,
 83 | 	u64 SCX_RQ_IN_WAKEUP,
 84 | 	u64 SCX_RQ_IN_BALANCE,
 85 | 	u64 SCX_DSQ_FLAG_BUILTIN,
 86 | 	u64 SCX_DSQ_FLAG_LOCAL_ON,
 87 | 	u64 SCX_DSQ_INVALID,
 88 | 	u64 SCX_DSQ_GLOBAL,
 89 | 	u64 SCX_DSQ_LOCAL,
 90 | 	u64 SCX_DSQ_LOCAL_ON,
 91 | 	u64 SCX_DSQ_LOCAL_CPU_MASK,
 92 | 	u64 SCX_TASK_QUEUED,
 93 | 	u64 SCX_TASK_RESET_RUNNABLE_AT,
 94 | 	u64 SCX_TASK_DEQD_FOR_SLEEP,
 95 | 	u64 SCX_TASK_STATE_SHIFT,
 96 | 	u64 SCX_TASK_STATE_BITS,
 97 | 	u64 SCX_TASK_STATE_MASK,
 98 | 	u64 SCX_TASK_CURSOR,
 99 | 	u64 SCX_TASK_NONE,
100 | 	u64 SCX_TASK_INIT,
101 | 	u64 SCX_TASK_READY,
102 | 	u64 SCX_TASK_ENABLED,
103 | 	u64 SCX_TASK_NR_STATES,
104 | 	u64 SCX_TASK_DSQ_ON_PRIQ,
105 | 	u64 SCX_KICK_IDLE,
106 | 	u64 SCX_KICK_PREEMPT,
107 | 	u64 SCX_KICK_WAIT,
108 | 	u64 SCX_ENQ_WAKEUP,
109 | 	u64 SCX_ENQ_HEAD,
110 | 	u64 SCX_ENQ_PREEMPT,
111 | 	u64 SCX_ENQ_REENQ,
112 | 	u64 SCX_ENQ_LAST,
113 | 	u64 SCX_ENQ_CLEAR_OPSS,
114 | 	u64 SCX_ENQ_DSQ_PRIQ
115 | );
116 | 
117 | #endif


--------------------------------------------------------------------------------
/goland_core/bss.go:
--------------------------------------------------------------------------------
 1 | package core
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"encoding/binary"
 6 | 	"fmt"
 7 | 	"unsafe"
 8 | 
 9 | 	bpf "github.com/aquasecurity/libbpfgo"
10 | )
11 | 
12 | /*
13 | #include "wrapper.h"
14 | */
15 | import "C"
16 | 
17 | type BssData struct {
18 | 	Nr_running            uint64 `json:"nr_running"`            // Number of tasks currently running in the userspace scheduler
19 | 	Nr_queued             uint64 `json:"nr_queued"`             // Number of tasks queued in the userspace scheduler
20 | 	Nr_scheduled          uint64 `json:"nr_scheduled"`          // Number of tasks scheduled by the userspace scheduler
21 | 	Nr_online_cpus        uint64 `json:"nr_online_cpus"`        // Number of online CPUs in the system
22 | 	Usersched_last_run_at uint64 `json:"usersched_last_run_at"` // The PID of the userspace scheduler
23 | 	Nr_user_dispatches    uint64 `json:"nr_user_dispatches"`    // Number of user-space dispatches
24 | 	Nr_kernel_dispatches  uint64 `json:"nr_kernel_dispatches"`  // Number of kernel-space dispatches
25 | 	Nr_cancel_dispatches  uint64 `json:"nr_cancel_dispatches"`  // Number of cancelled dispatches
26 | 	Nr_bounce_dispatches  uint64 `json:"nr_bounce_dispatches"`  // Number of bounce dispatches
27 | 	Nr_failed_dispatches  uint64 `json:"nr_failed_dispatches"`  // Number of failed dispatches
28 | 	Nr_sched_congested    uint64 `json:"nr_sched_congested"`    // Number of times the scheduler was congested
29 | }
30 | 
31 | func (data BssData) String() string {
32 | 	return fmt.Sprintf("Usersched_last_run_at: %v, Nr_queued: %v ", data.Usersched_last_run_at, data.Nr_queued) +
33 | 		fmt.Sprintf("Nr_scheduled: %v, Nr_running: %v ", data.Nr_scheduled, data.Nr_running) +
34 | 		fmt.Sprintf("Nr_online_cpus: %v, Nr_user_dispatches: %v ", data.Nr_online_cpus, data.Nr_user_dispatches) +
35 | 		fmt.Sprintf("Nr_kernel_dispatches: %v, Nr_cancel_dispatches: %v ", data.Nr_kernel_dispatches, data.Nr_cancel_dispatches) +
36 | 		fmt.Sprintf("Nr_bounce_dispatches: %v, Nr_failed_dispatches: %v", data.Nr_bounce_dispatches, data.Nr_failed_dispatches) +
37 | 		fmt.Sprintf("Nr_sched_congested: %v", data.Nr_sched_congested)
38 | }
39 | 
40 | func LoadSkel() unsafe.Pointer {
41 | 	return C.open_skel()
42 | }
43 | 
44 | func GetUserSchedPid() int {
45 | 	return int(C.get_usersched_pid())
46 | }
47 | 
48 | func GetNrQueued() uint64 {
49 | 	return uint64(C.get_nr_queued())
50 | }
51 | func GetNrScheduled() uint64 {
52 | 	return uint64(C.get_nr_scheduled())
53 | }
54 | 
55 | func NotifyComplete(nr_pending uint64) error {
56 | 	C.notify_complete(C.u64(nr_pending))
57 | 	return nil
58 | }
59 | 
60 | func (s *Sched) SubNrQueued() error {
61 | 	C.sub_nr_queued()
62 | 	return nil
63 | }
64 | 
65 | func (s *Sched) DecNrQueued(num int) error {
66 | 	C.dec_nr_queued(C.u64(num))
67 | 	return nil
68 | }
69 | 
70 | type BssMap struct {
71 | 	*bpf.BPFMap
72 | }
73 | 
74 | func (s *Sched) GetBssData() (BssData, error) {
75 | 	if s.bss == nil {
76 | 		return BssData{}, fmt.Errorf("BssMap is nil")
77 | 	}
78 | 	i := 0
79 | 	b, err := s.bss.BPFMap.GetValue(unsafe.Pointer(&i))
80 | 	if err != nil {
81 | 		return BssData{}, err
82 | 	}
83 | 	var bss BssData
84 | 	buff := bytes.NewBuffer(b)
85 | 	err = binary.Read(buff, binary.LittleEndian, &bss)
86 | 	if err != nil {
87 | 		return BssData{}, err
88 | 	}
89 | 	return bss, nil
90 | }
91 | 


--------------------------------------------------------------------------------
/intf.h:
--------------------------------------------------------------------------------
  1 | // This software may be used and distributed according to the terms of the
  2 | // GNU General Public License version 2.
  3 | 
  4 | #ifndef __INTF_H
  5 | #define __INTF_H
  6 | 
  7 | #define MAX(x, y) ((x) > (y) ? (x) : (y))
  8 | #define MIN(x, y) ((x) < (y) ? (x) : (y))
  9 | 
 10 | #define NSEC_PER_SEC	1000000000L
 11 | #define CLOCK_BOOTTIME	7
 12 | 
 13 | #include <stdbool.h>
 14 | #ifndef __kptr
 15 | #ifdef __KERNEL__
 16 | #error "__kptr_ref not defined in the kernel"
 17 | #endif
 18 | #define __kptr
 19 | #endif
 20 | 
 21 | #ifndef __VMLINUX_H__
 22 | typedef unsigned char u8;
 23 | typedef unsigned short u16;
 24 | typedef unsigned int u32;
 25 | typedef unsigned long u64;
 26 | 
 27 | typedef signed char s8;
 28 | typedef signed short s16;
 29 | typedef signed int s32;
 30 | typedef signed long s64;
 31 | 
 32 | typedef int pid_t;
 33 | #endif /* __VMLINUX_H__ */
 34 | 
 35 | /* Check a condition at build time */
 36 | #define BUILD_BUG_ON(expr) \
 37 | 	do { \
 38 | 		extern char __build_assert__[(expr) ? -1 : 1] \
 39 | 			__attribute__((unused)); \
 40 | 	} while(0)
 41 | 
 42 | /*
 43 |  * Maximum amount of CPUs supported by this scheduler (this defines the size of
 44 |  * cpu_map that is used to store the idle state and CPU ownership).
 45 |  */
 46 | #define MAX_CPUS 1024
 47 | 
 48 | /* Special dispatch flags */
 49 | enum {
 50 | 	/*
 51 | 	 * Do not assign any specific CPU to the task.
 52 | 	 *
 53 | 	 * The task will be dispatched to the global shared DSQ and it will run
 54 | 	 * on the first CPU available.
 55 | 	 */
 56 | 	RL_CPU_ANY = 1 << 20,
 57 | };
 58 | 
 59 | /*
 60 |  * Specify a target CPU for a specific PID.
 61 |  */
 62 | struct task_cpu_arg {
 63 | 	pid_t pid;
 64 | 	s32 cpu;
 65 | 	u64 flags;
 66 | };
 67 | 
 68 | struct preempt_cpu_arg {
 69 | 	s32 cpu_id;
 70 | };
 71 | 
 72 | /*
 73 |  * Specify a sibling CPU relationship for a specific scheduling domain.
 74 |  */
 75 | struct domain_arg {
 76 | 	s32 lvl_id;
 77 | 	s32 cpu_id;
 78 | 	s32 sibling_cpu_id;
 79 | };
 80 | 
 81 | /*
 82 |  * Task sent to the user-space scheduler by the BPF dispatcher.
 83 |  *
 84 |  * All attributes are collected from the kernel by the the BPF component.
 85 |  */
 86 | struct queued_task_ctx {
 87 | 	s32 pid;
 88 | 	s32 cpu; /* CPU where the task is running */
 89 | 	u64 nr_cpus_allowed; /* Number of CPUs that the task can use */
 90 | 	u64 flags; /* Task enqueue flags */
 91 | 	u64 start_ts; /* Timestamp since last time the task ran on a CPU */
 92 | 	u64 stop_ts; /* Timestamp since last time the task released a CPU */
 93 | 	u64 exec_runtime; /* Total cpu time since last sleep */
 94 | 	u64 weight; /* Task static priority */
 95 | 	u64 vtime; /* Current task's vruntime */
 96 | 	s32 tgid;
 97 | };
 98 | 
 99 | /*
100 |  * Task sent to the BPF dispatcher by the user-space scheduler.
101 |  *
102 |  * This struct can be easily extended to send more information to the
103 |  * dispatcher (i.e., a target CPU, a variable time slice, etc.).
104 |  */
105 | struct dispatched_task_ctx {
106 | 	s32 pid;
107 | 	s32 cpu; /* CPU where the task should be dispatched */
108 | 	u64 flags; /* task enqueue flags */
109 | 	u64 slice_ns; /* time slice assigned to the task (0=default) */
110 | 	u64 vtime; /* task deadline / vruntime */
111 | 	u64 enq_cnt;
112 | };
113 | 
114 | #endif /* __INTF_H */
115 | 


--------------------------------------------------------------------------------
/util/topology.go:
--------------------------------------------------------------------------------
  1 | package util
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io/fs"
  6 | 	"os"
  7 | 	"path/filepath"
  8 | 	"strconv"
  9 | 	"strings"
 10 | 
 11 | 	core "github.com/Gthulhu/qumun/goland_core"
 12 | )
 13 | 
 14 | func parseCPUs(cpuList string) ([]int, error) {
 15 | 	var result []int
 16 | 	segments := strings.Split(cpuList, ",")
 17 | 
 18 | 	for _, segment := range segments {
 19 | 		segment = strings.TrimSpace(segment)
 20 | 		if strings.Contains(segment, "-") {
 21 | 			bounds := strings.Split(segment, "-")
 22 | 			if len(bounds) != 2 {
 23 | 				return nil, fmt.Errorf("invalid range: %s", segment)
 24 | 			}
 25 | 
 26 | 			start, err := strconv.Atoi(bounds[0])
 27 | 			if err != nil {
 28 | 				return nil, fmt.Errorf("invalid start of range: %s", bounds[0])
 29 | 			}
 30 | 
 31 | 			end, err := strconv.Atoi(bounds[1])
 32 | 			if err != nil {
 33 | 				return nil, fmt.Errorf("invalid end of range: %s", bounds[1])
 34 | 			}
 35 | 
 36 | 			if start > end {
 37 | 				return nil, fmt.Errorf("start greater than end in range: %s", segment)
 38 | 			}
 39 | 			for i := start; i <= end; i++ {
 40 | 				result = append(result, i)
 41 | 			}
 42 | 		} else {
 43 | 			num, err := strconv.Atoi(segment)
 44 | 			if err != nil {
 45 | 				return nil, fmt.Errorf("invalid number: %s", segment)
 46 | 			}
 47 | 			result = append(result, num)
 48 | 		}
 49 | 	}
 50 | 
 51 | 	return result, nil
 52 | }
 53 | 
 54 | func GetTopology() (map[string]map[string][]int, error) {
 55 | 	cacheDir := "/sys/devices/system/cpu/"
 56 | 	cacheMap := map[string]map[string][]int{
 57 | 		"L2": {},
 58 | 		"L3": {},
 59 | 	}
 60 | 
 61 | 	err := filepath.Walk(cacheDir, func(path string, info fs.FileInfo, err error) error {
 62 | 		if err != nil {
 63 | 			return err
 64 | 		}
 65 | 		var content []byte
 66 | 		var key string
 67 | 		if strings.HasSuffix(path, "shared_cpu_list") {
 68 | 			if strings.Contains(path, "/cache/index2/") {
 69 | 				content, err = os.ReadFile(path)
 70 | 				if err != nil {
 71 | 					return err
 72 | 				}
 73 | 				key = "L2"
 74 | 
 75 | 			} else if strings.Contains(path, "/cache/index3/") {
 76 | 				content, err = os.ReadFile(path)
 77 | 				if err != nil {
 78 | 					return err
 79 | 				}
 80 | 				key = "L3"
 81 | 			}
 82 | 			cpuIdList, err := parseCPUs(strings.TrimSpace(string(content)))
 83 | 			if err != nil {
 84 | 				return nil
 85 | 			}
 86 | 			cacheMap[key][strings.TrimSpace(string(content))] = cpuIdList
 87 | 		}
 88 | 		return nil
 89 | 	})
 90 | 
 91 | 	if err != nil {
 92 | 		return cacheMap, err
 93 | 	}
 94 | 
 95 | 	return cacheMap, nil
 96 | }
 97 | 
 98 | func initCacheDomains(bpfModule *core.Sched, level int32) error {
 99 | 	topo, err := GetTopology()
100 | 	if err != nil {
101 | 		return err
102 | 	}
103 | 	l := "L2"
104 | 	if level == 3 {
105 | 		l = "L3"
106 | 	}
107 | 	for _, cpuIdList := range topo[l] {
108 | 		for _, cpuId := range cpuIdList {
109 | 			for _, sibCpuId := range cpuIdList {
110 | 				err = bpfModule.EnableSiblingCpu(level, int32(cpuId), int32(sibCpuId))
111 | 				if err != nil {
112 | 					return fmt.Errorf("EnableSiblingCpu failed: lvl %v cpuId %v sibCpuId %v", level, cpuId, sibCpuId)
113 | 				}
114 | 			}
115 | 		}
116 | 	}
117 | 	return nil
118 | }
119 | 
120 | func InitCacheDomains(bpfModule *core.Sched) error {
121 | 	err := initCacheDomains(bpfModule, 2)
122 | 	if err != nil {
123 | 		return err
124 | 	}
125 | 	err = initCacheDomains(bpfModule, 3)
126 | 	if err != nil {
127 | 		return err
128 | 	}
129 | 	return nil
130 | }
131 | 


--------------------------------------------------------------------------------
/util/emun_import.go:
--------------------------------------------------------------------------------
  1 | package util
  2 | 
  3 | /*
  4 | #include "wrapper.h"
  5 | */
  6 | import "C"
  7 | import (
  8 | 	"fmt"
  9 | )
 10 | 
 11 | func defaultScxEnums() *ScxEnums {
 12 | 	return &ScxEnums{
 13 | 		SCX_OPS_NAME_LEN:           128,
 14 | 		SCX_SLICE_DFL:              20000000,
 15 | 		SCX_SLICE_INF:              18446744073709551615,
 16 | 		SCX_RQ_ONLINE:              1,
 17 | 		SCX_RQ_CAN_STOP_TICK:       2,
 18 | 		SCX_RQ_BAL_PENDING:         4,
 19 | 		SCX_RQ_BAL_KEEP:            8,
 20 | 		SCX_RQ_BYPASSING:           16,
 21 | 		SCX_RQ_CLK_VALID:           32,
 22 | 		SCX_RQ_IN_WAKEUP:           65536,
 23 | 		SCX_RQ_IN_BALANCE:          131072,
 24 | 		SCX_DSQ_FLAG_BUILTIN:       9223372036854775808,
 25 | 		SCX_DSQ_FLAG_LOCAL_ON:      4611686018427387904,
 26 | 		SCX_DSQ_INVALID:            9223372036854775808,
 27 | 		SCX_DSQ_GLOBAL:             9223372036854775809,
 28 | 		SCX_DSQ_LOCAL:              9223372036854775810,
 29 | 		SCX_DSQ_LOCAL_ON:           13835058055282163712,
 30 | 		SCX_DSQ_LOCAL_CPU_MASK:     4294967295,
 31 | 		SCX_TASK_QUEUED:            1,
 32 | 		SCX_TASK_RESET_RUNNABLE_AT: 4,
 33 | 		SCX_TASK_DEQD_FOR_SLEEP:    8,
 34 | 		SCX_TASK_STATE_SHIFT:       8,
 35 | 		SCX_TASK_STATE_BITS:        2,
 36 | 		SCX_TASK_STATE_MASK:        768,
 37 | 		SCX_TASK_CURSOR:            18446744071562067968, // -2147483648 as uint64
 38 | 		SCX_TASK_NONE:              0,
 39 | 		SCX_TASK_INIT:              1,
 40 | 		SCX_TASK_READY:             2,
 41 | 		SCX_TASK_ENABLED:           3,
 42 | 		SCX_TASK_NR_STATES:         4,
 43 | 		SCX_TASK_DSQ_ON_PRIQ:       1,
 44 | 		SCX_KICK_IDLE:              1,
 45 | 		SCX_KICK_PREEMPT:           2,
 46 | 		SCX_KICK_WAIT:              4,
 47 | 		SCX_ENQ_WAKEUP:             1,
 48 | 		SCX_ENQ_HEAD:               16,
 49 | 		SCX_ENQ_PREEMPT:            4294967296,
 50 | 		SCX_ENQ_REENQ:              1099511627776,
 51 | 		SCX_ENQ_LAST:               2199023255552,
 52 | 		SCX_ENQ_CLEAR_OPSS:         72057594037927936,
 53 | 		SCX_ENQ_DSQ_PRIQ:           144115188075855872,
 54 | 	}
 55 | }
 56 | 
 57 | func ImportScxEnums() error {
 58 | 	e, err := GetScxEnums()
 59 | 	if err != nil {
 60 | 		e = defaultScxEnums()
 61 | 	}
 62 | 	if e == nil {
 63 | 		return fmt.Errorf("ScxEnums instance is nil")
 64 | 	}
 65 | 	C.set_scx_enums(
 66 | 		(C.u64)(e.SCX_OPS_NAME_LEN),
 67 | 		(C.u64)(e.SCX_SLICE_DFL),
 68 | 		(C.u64)(e.SCX_SLICE_INF),
 69 | 		(C.u64)(e.SCX_RQ_ONLINE),
 70 | 		(C.u64)(e.SCX_RQ_CAN_STOP_TICK),
 71 | 		(C.u64)(e.SCX_RQ_BAL_PENDING),
 72 | 		(C.u64)(e.SCX_RQ_BAL_KEEP),
 73 | 		(C.u64)(e.SCX_RQ_BYPASSING),
 74 | 		(C.u64)(e.SCX_RQ_CLK_VALID),
 75 | 		(C.u64)(e.SCX_RQ_IN_WAKEUP),
 76 | 		(C.u64)(e.SCX_RQ_IN_BALANCE),
 77 | 		(C.u64)(e.SCX_DSQ_FLAG_BUILTIN),
 78 | 		(C.u64)(e.SCX_DSQ_FLAG_LOCAL_ON),
 79 | 		(C.u64)(e.SCX_DSQ_INVALID),
 80 | 		(C.u64)(e.SCX_DSQ_GLOBAL),
 81 | 		(C.u64)(e.SCX_DSQ_LOCAL),
 82 | 		(C.u64)(e.SCX_DSQ_LOCAL_ON),
 83 | 		(C.u64)(e.SCX_DSQ_LOCAL_CPU_MASK),
 84 | 		(C.u64)(e.SCX_TASK_QUEUED),
 85 | 		(C.u64)(e.SCX_TASK_RESET_RUNNABLE_AT),
 86 | 		(C.u64)(e.SCX_TASK_DEQD_FOR_SLEEP),
 87 | 		(C.u64)(e.SCX_TASK_STATE_SHIFT),
 88 | 		(C.u64)(e.SCX_TASK_STATE_BITS),
 89 | 		(C.u64)(e.SCX_TASK_STATE_MASK),
 90 | 		(C.u64)(e.SCX_TASK_CURSOR),
 91 | 		(C.u64)(e.SCX_TASK_NONE),
 92 | 		(C.u64)(e.SCX_TASK_INIT),
 93 | 		(C.u64)(e.SCX_TASK_READY),
 94 | 		(C.u64)(e.SCX_TASK_ENABLED),
 95 | 		(C.u64)(e.SCX_TASK_NR_STATES),
 96 | 		(C.u64)(e.SCX_TASK_DSQ_ON_PRIQ),
 97 | 		(C.u64)(e.SCX_KICK_IDLE),
 98 | 		(C.u64)(e.SCX_KICK_PREEMPT),
 99 | 		(C.u64)(e.SCX_KICK_WAIT),
100 | 		(C.u64)(e.SCX_ENQ_WAKEUP),
101 | 		(C.u64)(e.SCX_ENQ_HEAD),
102 | 		(C.u64)(e.SCX_ENQ_PREEMPT),
103 | 		(C.u64)(e.SCX_ENQ_REENQ),
104 | 		(C.u64)(e.SCX_ENQ_LAST),
105 | 		(C.u64)(e.SCX_ENQ_CLEAR_OPSS),
106 | 		(C.u64)(e.SCX_ENQ_DSQ_PRIQ),
107 | 	)
108 | 	return nil
109 | }
110 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # qumun - A Custom Linux Scheduler Framework using eBPF and Go
  2 | 
  3 | **qumun** is a scheduler development framework that empowers developers to build custom Linux schedulers using Golang and eBPF.
  4 | 
  5 | The name qumun comes from the Bunun language (an Indigenous people of Taiwan), where it means “heart.”
  6 | Just as the heart powers and sustains the body, a scheduler is the core heartbeat of the operating system — orchestrating tasks, balancing workloads, and keeping everything alive and running.
  7 | 
  8 | By choosing this name, we hope to highlight both the technical role of a scheduler and also share a piece of Taiwan’s Indigenous culture with the global open-source community.
  9 | 
 10 | ## DEMO
 11 | 
 12 | ![](./assets/demo.gif)
 13 | 
 14 | ## Overview
 15 | 
 16 | ![](./assets/design.png)
 17 | 
 18 | This scheduler is designed to prioritize interactive workloads over background CPU-intensive tasks. It's particularly suitable for:
 19 | 
 20 | - Low-latency interactive applications
 21 | - Gaming
 22 | - Video conferencing
 23 | - Live streaming
 24 | 
 25 | The scheduler consists of two main components:
 26 | 1. A BPF component that implements low-level sched-ext functionalities
 27 | 2. A user-space scheduler written in Go with scx_goland_core that implements the actual scheduling policy
 28 | 
 29 | ## Key Features
 30 | 
 31 | - Virtual runtime (vruntime) based scheduling
 32 | - Latency-sensitive task prioritization
 33 | - Dynamic time slice adjustment
 34 | - CPU topology aware task placement
 35 | - Automatic idle CPU selection
 36 | 
 37 | ## How It Works
 38 | 
 39 | The scheduling policy is based on virtual runtime:
 40 | - Each task receives a time slice of execution (slice_ns)
 41 | - The actual execution time is adjusted based on task's static priority (weight)
 42 | - Tasks are dispatched from lowest to highest vruntime
 43 | - Latency-sensitive tasks receive priority boost based on voluntary context switches
 44 | 
 45 | ## Building
 46 | 
 47 | Prerequisites:
 48 | - Go 1.22+
 49 | - LLVM/Clang 17+
 50 | - libbpf
 51 | - Linux kernel 6.12+ with sched_ext support
 52 | 
 53 | ## Usage
 54 | 
 55 | ### Setting Up Dependencies
 56 | 
 57 | First, clone the required dependencies:
 58 | 
 59 | ```bash
 60 | make dep
 61 | git submodule init
 62 | git submodule sync
 63 | git submodule update
 64 | cd scx
 65 | meson setup build --prefix ~
 66 | meson compile -C build
 67 | ```
 68 | 
 69 | This will clone libbpf and the custom libbpfgo fork needed for the project.
 70 | 
 71 | ### Building the Scheduler
 72 | 
 73 | Build the scheduler with:
 74 | 
 75 | ```bash
 76 | make build
 77 | ```
 78 | 
 79 | This compiles the BPF program, builds libbpf, generates the skeleton, and builds the Go application.
 80 | 
 81 | ### Testing the Scheduler
 82 | 
 83 | To test the scheduler in a virtual environment using kernel v6.12.2:
 84 | 
 85 | ```bash
 86 | make test
 87 | ```
 88 | 
 89 | This uses `vng` (virtual kernel playground) to run the scheduler with the appropriate kernel version.
 90 | 
 91 | ### Running in Production
 92 | 
 93 | To run the scheduler on your system:
 94 | 
 95 | ```bash
 96 | sudo ./main
 97 | ```
 98 | 
 99 | The scheduler will run until terminated with Ctrl+C (SIGINT) or SIGTERM.
100 | 
101 | ### Debugging
102 | 
103 | If you need to inspect the BPF components, you can use:
104 | 
105 | ```bash
106 | sudo bpftool prog list            # List loaded BPF programs
107 | sudo bpftool map list             # List BPF maps
108 | sudo cat /sys/kernel/debug/tracing/trace_pipe # View BPF trace output
109 | ```
110 | 
111 | ### Stress Testing by using `stress-ng`
112 | 
113 | ```
114 | stress-ng -c 20 --timeout 20s --metrics-brief
115 | ```
116 | 
117 | ## License
118 | 
119 | This software is distributed under the terms of the GNU General Public License version 2.
120 | 
121 | ## Contributing
122 | 
123 | Contributions are welcome! Please feel free to submit pull requests or open issues for bugs and feature requests.


--------------------------------------------------------------------------------
/goland_core/task.go:
--------------------------------------------------------------------------------
  1 | package core
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"encoding/binary"
  6 | 	"fmt"
  7 | 	"os"
  8 | 	"runtime"
  9 | 	"strconv"
 10 | 	"strings"
 11 | 	"unsafe"
 12 | 
 13 | 	"github.com/Gthulhu/plugin/models"
 14 | )
 15 | 
 16 | func (s *Sched) BlockTilReadyForDequeue(ctx context.Context) {
 17 | 	for {
 18 | 		select {
 19 | 		case t, ok := <-s.queue:
 20 | 			if !ok {
 21 | 				runtime.Gosched()
 22 | 				continue
 23 | 			}
 24 | 			s.queue <- t
 25 | 			return
 26 | 		case <-ctx.Done():
 27 | 			return
 28 | 		}
 29 | 	}
 30 | }
 31 | 
 32 | func (s *Sched) ReadyForDequeue() bool {
 33 | 	select {
 34 | 	case t, ok := <-s.queue:
 35 | 		if !ok {
 36 | 			return false
 37 | 		}
 38 | 		s.queue <- t
 39 | 		return true
 40 | 	default:
 41 | 		return false
 42 | 	}
 43 | }
 44 | 
 45 | func (s *Sched) DequeueTask(task *models.QueuedTask) {
 46 | 	select {
 47 | 	case t := <-s.queue:
 48 | 		err := fastDecode(t, task)
 49 | 		if err != nil {
 50 | 			task.Pid = -1
 51 | 			return
 52 | 		}
 53 | 		return
 54 | 	default:
 55 | 		task.Pid = -1
 56 | 		return
 57 | 	}
 58 | }
 59 | 
 60 | // Task queued for dispatching to the BPF component (see bpf_intf::dispatched_task_ctx).
 61 | type DispatchedTask struct {
 62 | 	Pid        int32  // pid that uniquely identifies a task
 63 | 	Cpu        int32  // target CPU selected by the scheduler
 64 | 	Flags      uint64 // special dispatch flags
 65 | 	SliceNs    uint64 // time slice assigned to the task (0 = default)
 66 | 	Vtime      uint64 // task deadline / vruntime
 67 | 	CpuMaskCnt uint64 // cpumask generation counter (private)
 68 | }
 69 | 
 70 | // NewDispatchedTask creates a DispatchedTask from a QueuedTask.
 71 | func NewDispatchedTask(task *models.QueuedTask) *DispatchedTask {
 72 | 	return &DispatchedTask{
 73 | 		Pid:     task.Pid,
 74 | 		Cpu:     task.Cpu,
 75 | 		Flags:   task.Flags,
 76 | 		SliceNs: 0, // use default time slice
 77 | 		Vtime:   0,
 78 | 	}
 79 | }
 80 | 
 81 | // func (s *Sched) DispatchTask(t *DispatchedTask) error {
 82 | // 	if err := s.urb.Error(); err != nil {
 83 | // 		return err
 84 | // 	}
 85 | // 	s.dispatch <- fastEncode(t)
 86 | // 	return nil
 87 | // }
 88 | 
 89 | func (s *Sched) DispatchTask(t *DispatchedTask) error {
 90 | 	return s.urb.Submit(fastEncode(t))
 91 | }
 92 | 
 93 | func fastDecode(data []byte, task *models.QueuedTask) error {
 94 | 	if len(data) < int(unsafe.Sizeof(models.QueuedTask{})) {
 95 | 		return fmt.Errorf("data length is less than QueuedTask size")
 96 | 	}
 97 | 	task.Pid = int32(binary.LittleEndian.Uint32(data[0:4]))
 98 | 	task.Cpu = int32(binary.LittleEndian.Uint32(data[4:8]))
 99 | 	task.NrCpusAllowed = binary.LittleEndian.Uint64(data[8:16])
100 | 	task.Flags = binary.LittleEndian.Uint64(data[16:24])
101 | 	task.StartTs = binary.LittleEndian.Uint64(data[24:32])
102 | 	task.StopTs = binary.LittleEndian.Uint64(data[32:40])
103 | 	task.SumExecRuntime = binary.LittleEndian.Uint64(data[40:48])
104 | 	task.Weight = binary.LittleEndian.Uint64(data[48:56])
105 | 	task.Vtime = binary.LittleEndian.Uint64(data[56:64])
106 | 	task.Tgid = int32(binary.LittleEndian.Uint32(data[64:68]))
107 | 
108 | 	return nil
109 | }
110 | 
111 | func fastEncode(t *DispatchedTask) []byte {
112 | 	data := make([]byte, 8*8) // 64 bytes
113 | 
114 | 	binary.LittleEndian.PutUint32(data[0:4], uint32(t.Pid))
115 | 	binary.LittleEndian.PutUint32(data[4:8], uint32(t.Cpu))
116 | 	binary.LittleEndian.PutUint64(data[8:16], t.Flags)
117 | 	binary.LittleEndian.PutUint64(data[16:24], t.SliceNs)
118 | 	binary.LittleEndian.PutUint64(data[24:32], t.Vtime)
119 | 	binary.LittleEndian.PutUint64(data[32:40], t.CpuMaskCnt)
120 | 
121 | 	return data
122 | }
123 | 
124 | func IsSMTActive() (bool, error) {
125 | 	data, err := os.ReadFile("/sys/devices/system/cpu/smt/active")
126 | 	if err != nil {
127 | 		return false, err
128 | 	}
129 | 
130 | 	contents := strings.TrimSpace(string(data))
131 | 	smtActive, err := strconv.Atoi(contents)
132 | 	if err != nil {
133 | 		return false, err
134 | 	}
135 | 
136 | 	return smtActive == 1, nil
137 | }
138 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | # Architecture configuration (default: x86_64, can override with ARCH=arm64)
  2 | # 
  3 | # Usage: 
  4 | #   make build              # Build for x86_64 (default)
  5 | #   make build ARCH=arm64   # Build for ARM64 (requires cross-compilation tools)
  6 | #
  7 | # ARM64 cross-compilation requirements:
  8 | #   1. Install cross-compiler: sudo apt-get install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
  9 | #   2. Install ARM64 dev libraries: sudo apt-get install libelf-dev:arm64 zlib1g-dev:arm64 libzstd-dev:arm64
 10 | #   3. May need to configure dpkg for multi-arch: sudo dpkg --add-architecture arm64
 11 | #
 12 | ARCH ?= x86_64
 13 | 
 14 | # Architecture-specific settings
 15 | ifeq ($(ARCH),arm64)
 16 |     ARCH_DEFINE = -D__TARGET_ARCH_arm64
 17 |     ARCH_CPU_FLAGS = -mcpu=v3
 18 |     ARCH_SCHED_INCLUDE = -I scx/scheds/include/arch/aarch64
 19 |     ARCH_INCLUDE_DIR = aarch64-linux-gnu
 20 |     GOARCH_ENV = CGO_ENABLED=1 GOARCH=arm64
 21 |     CGO_CC = aarch64-linux-gnu-gcc
 22 |     LIBBPF_CC = aarch64-linux-gnu-gcc
 23 | else
 24 |     ARCH_DEFINE = -D__TARGET_ARCH_x86
 25 |     ARCH_CPU_FLAGS = -mcpu=v3
 26 |     ARCH_SCHED_INCLUDE = -I scx/scheds/include/arch/x86
 27 |     ARCH_INCLUDE_DIR = x86_64-linux-gnu
 28 |     GOARCH_ENV = 
 29 |     CGO_CC = clang
 30 |     LIBBPF_CC = gcc
 31 | endif
 32 | 
 33 | OUTPUT = output
 34 | LIBBPF_SRC = $(abspath libbpf/src)
 35 | LIBBPF_OBJ = $(abspath $(OUTPUT)/libbpf.a)
 36 | LIBBPF_OBJDIR = $(abspath ./$(OUTPUT)/libbpf)
 37 | LIBBPF_DESTDIR = $(abspath ./$(OUTPUT))
 38 | 
 39 | 
 40 | TARGET = main
 41 | BPF_TARGET = ${TARGET:=.bpf}
 42 | BPF_C = ${BPF_TARGET:=.c}
 43 | BPF_OBJ = ${BPF_C:.c=.o}
 44 | 
 45 | BASEDIR = $(abspath .)
 46 | OUTPUT = output
 47 | LIBBPF_INCLUDE_UAPI = $(abspath ./libbpf/include/uapi)
 48 | LIBBPF_OBJ = $(abspath $(OUTPUT)/libbpf.a)
 49 | LIBBPF_OBJDIR = $(abspath ./$(OUTPUT)/libbpf)
 50 | LIBBPF_DESTDIR = $(abspath ./$(OUTPUT))
 51 | CLANG_BPF_SYS_INCLUDES := `shell $(CLANG) -v -E - </dev/null 2>&1 | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }'`
 52 | CGOFLAG = $(GOARCH_ENV) CC=$(CGO_CC) CGO_CFLAGS="-I$(BASEDIR) -I$(BASEDIR)/$(OUTPUT)" CGO_LDFLAGS="-lelf -lz $(LIBBPF_OBJ) -lzstd $(BASEDIR)/libwrapper.a"
 53 | 
 54 | .PHONY: build
 55 | build: clean $(BPF_OBJ) libbpf libbpf-uapi wrapper
 56 | 	$(CGOFLAG) go build -ldflags "-w -s" main.go
 57 | 
 58 | test: build
 59 | 	@echo "Running scheduler test..."
 60 | 	@chmod +x scripts/test_scheduler.sh
 61 | 	@vng -r v6.12.2 -- bash scripts/test_scheduler.sh
 62 | 
 63 | .PHONY: libbpf-uapi
 64 | libbpf-uapi: $(LIBBPF_SRC)
 65 | 	UAPIDIR=$(LIBBPF_DESTDIR) \
 66 | 		$(MAKE) -C $(LIBBPF_SRC) install_uapi_headers
 67 | 
 68 | .PHONY: libbpf
 69 | libbpf: $(LIBBPF_SRC) $(wildcard $(LIBBPF_SRC)/*.[ch])
 70 | 	$(MAKE) -C $(LIBBPF_SRC) clean
 71 | 	CC="$(LIBBPF_CC)" CFLAGS="-g -O2 -Wall -fpie" \
 72 | 	   $(MAKE) -C $(LIBBPF_SRC) \
 73 | 		BUILD_STATIC_ONLY=1 \
 74 | 		OBJDIR=$(LIBBPF_OBJDIR) \
 75 | 		DESTDIR=$(LIBBPF_DESTDIR) \
 76 | 		INCLUDEDIR= LIBDIR= UAPIDIR= install
 77 | 	$(eval STATIC=-extldflags -static)
 78 | 
 79 | dep:
 80 | 	git clone https://github.com/libbpf/libbpf.git && \
 81 | 	cd libbpf/src && \
 82 | 	git checkout 09b9e83 && \
 83 | 	make && \
 84 | 	sudo make install && \
 85 | 	cd - && \
 86 | 	git clone -b feat/skel https://github.com/Gthulhu/libbpfgo.git
 87 | 
 88 | $(BPF_OBJ): %.o: %.c
 89 | 	clang-17 \
 90 | 		-O2 -g -Wall -target bpf \
 91 | 		$(ARCH_DEFINE) $(ARCH_CPU_FLAGS) -mlittle-endian \
 92 | 		-idirafter /usr/lib/llvm-17/lib/clang/17/include -idirafter /usr/local/include -idirafter /usr/include/$(ARCH_INCLUDE_DIR) -idirafter /usr/include \
 93 | 		-I scx/scheds/vmlinux -I scx/build/libbpf/src/usr/include -I scx/build/libbpf/include/uapi -I scx/scheds/include $(ARCH_SCHED_INCLUDE) -I scx/scheds/include/bpf-compat -I scx/scheds/include/lib \
 94 | 		-Wno-compare-distinct-pointer-types \
 95 | 		-c $< -o $@
 96 | 
 97 | wrapper:
 98 | 	bpftool gen skeleton main.bpf.o > main.skeleton.h
 99 | 	$(CGO_CC) -g -O2 -Wall -fPIC -I scx/build/libbpf/src/usr/include -I scx/build/libbpf/include/uapi -I scx/scheds/include $(ARCH_SCHED_INCLUDE) -I scx/scheds/include/bpf-compat -I scx/scheds/include/lib -c wrapper.c -o wrapper.o
100 | 	ar rcs libwrapper.a wrapper.o
101 | 
102 | clean:
103 | 	rm libwrapper.a || true
104 | 	rm *.skeleton.h || true
105 | 	rm *.ll *.o || true
106 | 	rm main || true


--------------------------------------------------------------------------------
/wrapper.c:
--------------------------------------------------------------------------------
  1 | #include "wrapper.h"
  2 | 
  3 | struct main_bpf *global_obj;
  4 | 
  5 | void *open_skel() {
  6 |     struct main_bpf *obj = NULL;
  7 |     obj = main_bpf__open();
  8 |     main_bpf__create_skeleton(obj);
  9 |     global_obj = obj;
 10 |     return obj->obj;
 11 | }
 12 | 
 13 | u32 get_usersched_pid() {
 14 |     return global_obj->rodata->usersched_pid;
 15 | }
 16 | 
 17 | void set_usersched_pid(u32 id) {
 18 |     global_obj->rodata->usersched_pid = id;
 19 | }
 20 | 
 21 | void set_kugepagepid(u32 id) {
 22 |     global_obj->rodata->khugepaged_pid = id;
 23 | }
 24 | 
 25 | void set_early_processing(bool enabled) {
 26 |     global_obj->rodata->early_processing = enabled;
 27 | }
 28 | 
 29 | void set_default_slice(u64 t) {
 30 |     global_obj->rodata->default_slice = t;
 31 | }
 32 | 
 33 | void set_debug(bool enabled) {
 34 |     global_obj->rodata->debug = enabled;
 35 | }
 36 | 
 37 | void set_builtin_idle(bool enabled) {
 38 |     global_obj->rodata->builtin_idle = enabled;
 39 | }
 40 | 
 41 | u64 get_nr_scheduled() {
 42 |     return global_obj->bss->nr_scheduled;
 43 | }
 44 | 
 45 | u64 get_nr_queued() {
 46 |     return global_obj->bss->nr_queued;
 47 | }
 48 | 
 49 | void notify_complete(u64 nr_pending) {
 50 |     global_obj->bss->nr_scheduled = nr_pending;
 51 | }
 52 | 
 53 | void sub_nr_queued() {
 54 |     if (global_obj->bss->nr_queued){
 55 |         global_obj->bss->nr_queued--;
 56 |     }
 57 | }
 58 | 
 59 | void dec_nr_queued(u64 num) {
 60 |     if (global_obj->bss->nr_queued){
 61 |         global_obj->bss->nr_queued-=num;
 62 |     }
 63 | }
 64 | 
 65 | void destroy_skel(void*skel) {
 66 |     main_bpf__destroy(skel);
 67 | }
 68 | 
 69 | void set_scx_enums(
 70 |     u64 SCX_OPS_NAME_LEN,
 71 |     u64 SCX_SLICE_DFL,
 72 |     u64 SCX_SLICE_INF,
 73 |     u64 SCX_RQ_ONLINE,
 74 |     u64 SCX_RQ_CAN_STOP_TICK,
 75 |     u64 SCX_RQ_BAL_PENDING,
 76 |     u64 SCX_RQ_BAL_KEEP,
 77 |     u64 SCX_RQ_BYPASSING,
 78 |     u64 SCX_RQ_CLK_VALID,
 79 |     u64 SCX_RQ_IN_WAKEUP,
 80 |     u64 SCX_RQ_IN_BALANCE,
 81 |     u64 SCX_DSQ_FLAG_BUILTIN,
 82 |     u64 SCX_DSQ_FLAG_LOCAL_ON,
 83 |     u64 SCX_DSQ_INVALID,
 84 |     u64 SCX_DSQ_GLOBAL,
 85 |     u64 SCX_DSQ_LOCAL,
 86 |     u64 SCX_DSQ_LOCAL_ON,
 87 |     u64 SCX_DSQ_LOCAL_CPU_MASK,
 88 |     u64 SCX_TASK_QUEUED,
 89 |     u64 SCX_TASK_RESET_RUNNABLE_AT,
 90 |     u64 SCX_TASK_DEQD_FOR_SLEEP,
 91 |     u64 SCX_TASK_STATE_SHIFT,
 92 |     u64 SCX_TASK_STATE_BITS,
 93 |     u64 SCX_TASK_STATE_MASK,
 94 |     u64 SCX_TASK_CURSOR,
 95 |     u64 SCX_TASK_NONE,
 96 |     u64 SCX_TASK_INIT,
 97 |     u64 SCX_TASK_READY,
 98 |     u64 SCX_TASK_ENABLED,
 99 |     u64 SCX_TASK_NR_STATES,
100 |     u64 SCX_TASK_DSQ_ON_PRIQ,
101 |     u64 SCX_KICK_IDLE,
102 |     u64 SCX_KICK_PREEMPT,
103 |     u64 SCX_KICK_WAIT,
104 |     u64 SCX_ENQ_WAKEUP,
105 |     u64 SCX_ENQ_HEAD,
106 |     u64 SCX_ENQ_PREEMPT,
107 |     u64 SCX_ENQ_REENQ,
108 |     u64 SCX_ENQ_LAST,
109 |     u64 SCX_ENQ_CLEAR_OPSS,
110 |     u64 SCX_ENQ_DSQ_PRIQ
111 | ) {
112 |     if (!global_obj || !global_obj->rodata) return;
113 |     global_obj->rodata->__SCX_OPS_NAME_LEN = SCX_OPS_NAME_LEN;
114 |     global_obj->rodata->__SCX_SLICE_DFL = SCX_SLICE_DFL;
115 |     global_obj->rodata->__SCX_SLICE_INF = SCX_SLICE_INF;
116 |     global_obj->rodata->__SCX_RQ_ONLINE = SCX_RQ_ONLINE;
117 |     global_obj->rodata->__SCX_RQ_CAN_STOP_TICK = SCX_RQ_CAN_STOP_TICK;
118 |     global_obj->rodata->__SCX_RQ_BAL_PENDING = SCX_RQ_BAL_PENDING;
119 |     global_obj->rodata->__SCX_RQ_BAL_KEEP = SCX_RQ_BAL_KEEP;
120 |     global_obj->rodata->__SCX_RQ_BYPASSING = SCX_RQ_BYPASSING;
121 |     global_obj->rodata->__SCX_RQ_CLK_VALID = SCX_RQ_CLK_VALID;
122 |     global_obj->rodata->__SCX_RQ_IN_WAKEUP = SCX_RQ_IN_WAKEUP;
123 |     global_obj->rodata->__SCX_RQ_IN_BALANCE = SCX_RQ_IN_BALANCE;
124 |     global_obj->rodata->__SCX_DSQ_FLAG_BUILTIN = SCX_DSQ_FLAG_BUILTIN;
125 |     global_obj->rodata->__SCX_DSQ_FLAG_LOCAL_ON = SCX_DSQ_FLAG_LOCAL_ON;
126 |     global_obj->rodata->__SCX_DSQ_INVALID = SCX_DSQ_INVALID;
127 |     global_obj->rodata->__SCX_DSQ_GLOBAL = SCX_DSQ_GLOBAL;
128 |     global_obj->rodata->__SCX_DSQ_LOCAL = SCX_DSQ_LOCAL;
129 |     global_obj->rodata->__SCX_DSQ_LOCAL_ON = SCX_DSQ_LOCAL_ON;
130 |     global_obj->rodata->__SCX_DSQ_LOCAL_CPU_MASK = SCX_DSQ_LOCAL_CPU_MASK;
131 |     global_obj->rodata->__SCX_TASK_QUEUED = SCX_TASK_QUEUED;
132 |     global_obj->rodata->__SCX_TASK_RESET_RUNNABLE_AT = SCX_TASK_RESET_RUNNABLE_AT;
133 |     global_obj->rodata->__SCX_TASK_DEQD_FOR_SLEEP = SCX_TASK_DEQD_FOR_SLEEP;
134 |     global_obj->rodata->__SCX_TASK_STATE_SHIFT = SCX_TASK_STATE_SHIFT;
135 |     global_obj->rodata->__SCX_TASK_STATE_BITS = SCX_TASK_STATE_BITS;
136 |     global_obj->rodata->__SCX_TASK_STATE_MASK = SCX_TASK_STATE_MASK;
137 |     global_obj->rodata->__SCX_TASK_CURSOR = SCX_TASK_CURSOR;
138 |     global_obj->rodata->__SCX_TASK_NONE = SCX_TASK_NONE;
139 |     global_obj->rodata->__SCX_TASK_INIT = SCX_TASK_INIT;
140 |     global_obj->rodata->__SCX_TASK_READY = SCX_TASK_READY;
141 |     global_obj->rodata->__SCX_TASK_ENABLED = SCX_TASK_ENABLED;
142 |     global_obj->rodata->__SCX_TASK_NR_STATES = SCX_TASK_NR_STATES;
143 |     global_obj->rodata->__SCX_TASK_DSQ_ON_PRIQ = SCX_TASK_DSQ_ON_PRIQ;
144 |     global_obj->rodata->__SCX_KICK_IDLE = SCX_KICK_IDLE;
145 |     global_obj->rodata->__SCX_KICK_PREEMPT = SCX_KICK_PREEMPT;
146 |     global_obj->rodata->__SCX_KICK_WAIT = SCX_KICK_WAIT;
147 |     global_obj->rodata->__SCX_ENQ_WAKEUP = SCX_ENQ_WAKEUP;
148 |     global_obj->rodata->__SCX_ENQ_HEAD = SCX_ENQ_HEAD;
149 |     global_obj->rodata->__SCX_ENQ_PREEMPT = SCX_ENQ_PREEMPT;
150 |     global_obj->rodata->__SCX_ENQ_REENQ = SCX_ENQ_REENQ;
151 |     global_obj->rodata->__SCX_ENQ_LAST = SCX_ENQ_LAST;
152 |     global_obj->rodata->__SCX_ENQ_CLEAR_OPSS = SCX_ENQ_CLEAR_OPSS;
153 |     global_obj->rodata->__SCX_ENQ_DSQ_PRIQ = SCX_ENQ_DSQ_PRIQ;
154 | }


--------------------------------------------------------------------------------
/goland_core/rodata.go:
--------------------------------------------------------------------------------
  1 | package core
  2 | 
  3 | /*
  4 | #include "wrapper.h"
  5 | */
  6 | import "C"
  7 | 
  8 | import (
  9 | 	"bytes"
 10 | 	"encoding/binary"
 11 | 	"fmt"
 12 | 	"os"
 13 | 	"path/filepath"
 14 | 	"strconv"
 15 | 	"strings"
 16 | 	"unsafe"
 17 | 
 18 | 	bpf "github.com/aquasecurity/libbpfgo"
 19 | )
 20 | 
 21 | type RodataMap struct {
 22 | 	*bpf.BPFMap
 23 | }
 24 | 
 25 | type Rodata struct {
 26 | 	DefaultSlice           uint64   `json:"default_slice"`
 27 | 	SmtEnabled             bool     `json:"smt_enabled"`
 28 | 	Debug                  bool     `json:"debug"`
 29 | 	Pad0                   [54]byte `json:"-"` // __pad0[54]
 30 | 	SCXOpsNameLen          uint64   `json:"scx_ops_name_len"`
 31 | 	SCXSliceDfl            uint64   `json:"scx_slice_dfl"`
 32 | 	SCXSliceInf            uint64   `json:"scx_slice_inf"`
 33 | 	SCXRqOnline            uint64   `json:"scx_rq_online"`
 34 | 	SCXRqCanStopTick       uint64   `json:"scx_rq_can_stop_tick"`
 35 | 	SCXRqBalPending        uint64   `json:"scx_rq_bal_pending"`
 36 | 	SCXRqBalKeep           uint64   `json:"scx_rq_bal_keep"`
 37 | 	SCXRqBypassing         uint64   `json:"scx_rq_bypassing"`
 38 | 	SCXRqClkValid          uint64   `json:"scx_rq_clk_valid"`
 39 | 	SCXRqInWakeup          uint64   `json:"scx_rq_in_wakeup"`
 40 | 	SCXRqInBalance         uint64   `json:"scx_rq_in_balance"`
 41 | 	SCXDsqFlagBuiltin      uint64   `json:"scx_dsq_flag_builtin"`
 42 | 	SCXDsqFlagLocalOn      uint64   `json:"scx_dsq_flag_local_on"`
 43 | 	SCXDsqInvalid          uint64   `json:"scx_dsq_invalid"`
 44 | 	SCXDsqGlobal           uint64   `json:"scx_dsq_global"`
 45 | 	SCXDsqLocal            uint64   `json:"scx_dsq_local"`
 46 | 	SCXDsqLocalOn          uint64   `json:"scx_dsq_local_on"`
 47 | 	SCXDsqLocalCpuMask     uint64   `json:"scx_dsq_local_cpu_mask"`
 48 | 	SCXTaskQueued          uint64   `json:"scx_task_queued"`
 49 | 	SCXTaskResetRunnableAt uint64   `json:"scx_task_reset_runnable_at"`
 50 | 	SCXTaskDeqdForSleep    uint64   `json:"scx_task_deqd_for_sleep"`
 51 | 	SCXTaskStateShift      uint64   `json:"scx_task_state_shift"`
 52 | 	SCXTaskStateBits       uint64   `json:"scx_task_state_bits"`
 53 | 	SCXTaskStateMask       uint64   `json:"scx_task_state_mask"`
 54 | 	SCXTaskCursor          uint64   `json:"scx_task_cursor"`
 55 | 	SCXTaskNone            uint64   `json:"scx_task_none"`
 56 | 	SCXTaskInit            uint64   `json:"scx_task_init"`
 57 | 	SCXTaskReady           uint64   `json:"scx_task_ready"`
 58 | 	SCXTaskEnabled         uint64   `json:"scx_task_enabled"`
 59 | 	SCXTaskNrStates        uint64   `json:"scx_task_nr_states"`
 60 | 	SCXTaskDsqOnPriq       uint64   `json:"scx_task_dsq_on_priq"`
 61 | 	SCXKickIdle            uint64   `json:"scx_kick_idle"`
 62 | 	SCXKickPreempt         uint64   `json:"scx_kick_preempt"`
 63 | 	SCXKickWait            uint64   `json:"scx_kick_wait"`
 64 | 	SCXEnqWakeup           uint64   `json:"scx_enq_wakeup"`
 65 | 	SCXEnqHead             uint64   `json:"scx_enq_head"`
 66 | 	SCXEnqPreempt          uint64   `json:"scx_enq_preempt"`
 67 | 	SCXEnqReenq            uint64   `json:"scx_enq_reenq"`
 68 | 	SCXEnqLast             uint64   `json:"scx_enq_last"`
 69 | 	SCXEnqClearOpss        uint64   `json:"scx_enq_clear_opss"`
 70 | 	SCXEnqDsqPriq          uint64   `json:"scx_enq_dsq_priq"`
 71 | 	UeiDumpLen             uint32   `json:"uei_dump_len"`
 72 | 	UserschedPid           uint32   `json:"usersched_pid"`
 73 | 	KhugepagePid           uint32   `json:"khugepage_pid"`
 74 | 	SwitchPartial          bool     `json:"switch_partial"`
 75 | 	EarlyProcessing        bool     `json:"early_processing"`
 76 | 	BuiltinIdle            bool     `json:"builtin_idle"`
 77 | }
 78 | 
 79 | func (s *Sched) GetRoData() (Rodata, error) {
 80 | 	if s.rodata == nil {
 81 | 		return Rodata{}, fmt.Errorf("BssMap is nil")
 82 | 	}
 83 | 	i := 0
 84 | 	b, err := s.rodata.BPFMap.GetValue(unsafe.Pointer(&i))
 85 | 	if err != nil {
 86 | 		return Rodata{}, err
 87 | 	}
 88 | 	var ro Rodata
 89 | 	buff := bytes.NewBuffer(b)
 90 | 	err = binary.Read(buff, binary.LittleEndian, &ro)
 91 | 	if err != nil {
 92 | 		return Rodata{}, err
 93 | 	}
 94 | 	return ro, nil
 95 | }
 96 | 
 97 | func (s *Sched) AssignUserSchedPid(pid int) error {
 98 | 	C.set_kugepagepid(C.u32(KhugepagePid()))
 99 | 	C.set_usersched_pid(C.u32(pid))
100 | 	return nil
101 | }
102 | 
103 | func (s *Sched) SetDebug(enabled bool) {
104 | 	C.set_debug(C.bool(enabled))
105 | }
106 | 
107 | func (s *Sched) SetBuiltinIdle(enabled bool) {
108 | 	C.set_builtin_idle(C.bool(enabled))
109 | }
110 | 
111 | func (s *Sched) SetEarlyProcessing(enabled bool) {
112 | 	C.set_early_processing(C.bool(enabled))
113 | }
114 | 
115 | func (s *Sched) SetDefaultSlice(t uint64) {
116 | 	C.set_default_slice(C.u64(t))
117 | }
118 | 
119 | // KhugepagePid finds and returns the PID of the khugepaged process
120 | func KhugepagePid() uint32 {
121 | 	procDir := "/proc"
122 | 
123 | 	// Read all entries in /proc
124 | 	entries, err := os.ReadDir(procDir)
125 | 	if err != nil {
126 | 		return 0
127 | 	}
128 | 
129 | 	for _, entry := range entries {
130 | 		// Skip non-directories and non-numeric directories
131 | 		if !entry.IsDir() {
132 | 			continue
133 | 		}
134 | 
135 | 		pidStr := entry.Name()
136 | 		// Check if directory name is numeric (PID)
137 | 		if _, err := strconv.Atoi(pidStr); err != nil {
138 | 			continue
139 | 		}
140 | 
141 | 		// Read the comm file to get process name
142 | 		commPath := filepath.Join(procDir, pidStr, "comm")
143 | 		commData, err := os.ReadFile(commPath)
144 | 		if err != nil {
145 | 			continue
146 | 		}
147 | 
148 | 		comm := strings.TrimSpace(string(commData))
149 | 		if comm != "khugepaged" {
150 | 			continue
151 | 		}
152 | 
153 | 		// Check if exe symlink exists (should not exist for kernel threads like khugepaged)
154 | 		exePath := filepath.Join(procDir, pidStr, "exe")
155 | 		if _, err := os.Readlink(exePath); err == nil {
156 | 			// exe symlink exists, this is not a kernel thread
157 | 			continue
158 | 		}
159 | 
160 | 		// Convert PID string to uint32
161 | 		if pid, err := strconv.ParseUint(pidStr, 10, 32); err == nil {
162 | 			return uint32(pid)
163 | 		}
164 | 	}
165 | 
166 | 	return 0
167 | }
168 | 


--------------------------------------------------------------------------------
/goland_core/obj.go:
--------------------------------------------------------------------------------
  1 | package core
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"log"
  6 | 	"syscall"
  7 | 	"unsafe"
  8 | 
  9 | 	"github.com/Gthulhu/plugin/models"
 10 | 	"github.com/Gthulhu/plugin/plugin"
 11 | 	bpf "github.com/aquasecurity/libbpfgo"
 12 | 	"github.com/cilium/ebpf"
 13 | 	"golang.org/x/sys/unix"
 14 | )
 15 | 
 16 | const (
 17 | 	RL_CPU_ANY = 1 << 20
 18 | )
 19 | 
 20 | type Sched struct {
 21 | 	mod          *bpf.Module
 22 | 	plugin       plugin.CustomScheduler
 23 | 	bss          *BssMap
 24 | 	uei          *UeiMap
 25 | 	rodata       *RodataMap
 26 | 	structOps    *bpf.BPFMap
 27 | 	urb          *bpf.UserRingBuffer
 28 | 	queue        chan []byte // The map containing tasks that are queued to user space from the kernel.
 29 | 	dispatch     chan []byte
 30 | 	preemptCpu   *ebpf.Program
 31 | 	siblingCpu   *ebpf.Program
 32 | 	selectCpuPrg *ebpf.Program // Cilium eBPF program for syscall-based invocation
 33 | }
 34 | 
 35 | func init() {
 36 | 	unix.Mlockall(syscall.MCL_CURRENT | syscall.MCL_FUTURE)
 37 | }
 38 | 
 39 | func LoadSched(objPath string) *Sched {
 40 | 	obj := LoadSkel()
 41 | 	bpfModule, err := bpf.NewModuleFromFileArgs(bpf.NewModuleArgs{
 42 | 		BPFObjPath:     "",
 43 | 		KernelLogLevel: 0,
 44 | 	})
 45 | 	if err != nil {
 46 | 		panic(err)
 47 | 	}
 48 | 	if err := bpfModule.BPFReplaceExistedObject(obj); err != nil {
 49 | 		panic(err)
 50 | 	}
 51 | 
 52 | 	s := &Sched{
 53 | 		mod: bpfModule,
 54 | 	}
 55 | 
 56 | 	return s
 57 | }
 58 | 
 59 | func (s *Sched) SetPlugin(p plugin.CustomScheduler) {
 60 | 	s.plugin = p
 61 | }
 62 | 
 63 | func (s *Sched) Start() {
 64 | 	var err error
 65 | 	bpfModule := s.mod
 66 | 	bpfModule.BPFLoadObject()
 67 | 	iters := bpfModule.Iterator()
 68 | 	for {
 69 | 		prog := iters.NextProgram()
 70 | 		if prog == nil {
 71 | 			break
 72 | 		}
 73 | 		if prog.Name() == "kprobe_handle_mm_fault" {
 74 | 			log.Println("attach kprobe_handle_mm_fault")
 75 | 			_, err := prog.AttachGeneric()
 76 | 			if err != nil {
 77 | 				log.Panicf("attach kprobe_handle_mm_fault failed: %v", err)
 78 | 			}
 79 | 			continue
 80 | 		}
 81 | 		if prog.Name() == "kretprobe_handle_mm_fault" {
 82 | 			log.Println("attach kretprobe_handle_mm_fault")
 83 | 			_, err := prog.AttachGeneric()
 84 | 			if err != nil {
 85 | 				log.Panicf("attach kretprobe_handle_mm_fault failed: %v", err)
 86 | 			}
 87 | 			continue
 88 | 		}
 89 | 	}
 90 | 	iters = bpfModule.Iterator()
 91 | 	for {
 92 | 		m := iters.NextMap()
 93 | 		if m == nil {
 94 | 			break
 95 | 		}
 96 | 		fmt.Printf("map: %s, type: %s, fd: %d\n", m.Name(), m.Type().String(), m.FileDescriptor())
 97 | 		if m.Name() == "main_bpf.bss" {
 98 | 			s.bss = &BssMap{m}
 99 | 		} else if m.Name() == "main_bpf.data" {
100 | 			s.uei = &UeiMap{m}
101 | 		} else if m.Name() == "main_bpf.rodata" {
102 | 			s.rodata = &RodataMap{m}
103 | 		} else if m.Name() == "queued" {
104 | 			s.queue = make(chan []byte, 128)
105 | 			rb, err := s.mod.InitRingBuf("queued", s.queue)
106 | 			if err != nil {
107 | 				panic(err)
108 | 			}
109 | 			rb.Poll(10)
110 | 		} else if m.Name() == "dispatched" {
111 | 			s.dispatch = make(chan []byte, 128)
112 | 			s.urb, err = s.mod.InitUserRingBuf("dispatched", s.dispatch)
113 | 			if err != nil {
114 | 				panic(err)
115 | 			}
116 | 			// s.urb.Start()
117 | 		}
118 | 		if m.Type().String() == "BPF_MAP_TYPE_STRUCT_OPS" {
119 | 			s.structOps = m
120 | 		}
121 | 	}
122 | 
123 | 	iters = bpfModule.Iterator()
124 | 	for {
125 | 		prog := iters.NextProgram()
126 | 		if prog == nil {
127 | 			break
128 | 		}
129 | 
130 | 		if prog.Name() == "rs_select_cpu" {
131 | 			if ciliumProg, err := ebpf.NewProgramFromFD(prog.FileDescriptor()); err == nil {
132 | 				s.selectCpuPrg = ciliumProg
133 | 			}
134 | 		}
135 | 
136 | 		if prog.Name() == "enable_sibling_cpu" {
137 | 			if ciliumProg, err := ebpf.NewProgramFromFD(prog.FileDescriptor()); err == nil {
138 | 				s.siblingCpu = ciliumProg
139 | 			}
140 | 		}
141 | 
142 | 		if prog.Name() == "do_preempt" {
143 | 			if ciliumProg, err := ebpf.NewProgramFromFD(prog.FileDescriptor()); err == nil {
144 | 				s.preemptCpu = ciliumProg
145 | 			}
146 | 		}
147 | 	}
148 | }
149 | 
150 | type task_cpu_arg struct {
151 | 	pid   int32
152 | 	cpu   int32
153 | 	flags uint64
154 | }
155 | 
156 | var selectFailed error = fmt.Errorf("prog (selectCpu) not found")
157 | 
158 | func (s *Sched) DefaultSelectCPU(t *models.QueuedTask) (error, int32) {
159 | 	return s.selectCPU(t)
160 | }
161 | 
162 | func (s *Sched) selectCPU(t *models.QueuedTask) (error, int32) {
163 | 	if s.selectCpuPrg == nil {
164 | 		return selectFailed, 0
165 | 	}
166 | 
167 | 	arg := task_cpu_arg{
168 | 		pid:   t.Pid,
169 | 		cpu:   t.Cpu,
170 | 		flags: t.Flags,
171 | 	}
172 | 
173 | 	data := (*[16]byte)(unsafe.Pointer(&arg))[:]
174 | 
175 | 	ret, err := s.selectCpuPrg.Run(&ebpf.RunOptions{
176 | 		Context: data[:],
177 | 	})
178 | 	if err != nil {
179 | 		return err, 0
180 | 	}
181 | 
182 | 	retVal := int32(ret)
183 | 	if ret > 2147483647 {
184 | 		return nil, RL_CPU_ANY
185 | 	}
186 | 	return nil, retVal
187 | }
188 | 
189 | type preempt_arg struct {
190 | 	cpuId int32
191 | }
192 | 
193 | type domain_arg struct {
194 | 	lvlId        int32
195 | 	cpuId        int32
196 | 	siblingCpuId int32
197 | }
198 | 
199 | func (s *Sched) PreemptCpu(cpuId int32) error {
200 | 	if s.preemptCpu == nil {
201 | 		return fmt.Errorf("prog (preemptCpu) not found")
202 | 	}
203 | 
204 | 	arg := preempt_arg{
205 | 		cpuId: cpuId,
206 | 	}
207 | 	data := (*[4]byte)(unsafe.Pointer(&arg))[:]
208 | 
209 | 	ret, err := s.preemptCpu.Run(&ebpf.RunOptions{
210 | 		Context: data[:],
211 | 	})
212 | 	if err != nil {
213 | 		return err
214 | 	}
215 | 	if ret != 0 {
216 | 		return fmt.Errorf("retVal: %v", ret)
217 | 	}
218 | 	return nil
219 | }
220 | 
221 | func (s *Sched) EnableSiblingCpu(lvlId, cpuId, siblingCpuId int32) error {
222 | 	if s.siblingCpu == nil {
223 | 		return fmt.Errorf("prog (siblingCpu) not found")
224 | 	}
225 | 
226 | 	arg := domain_arg{
227 | 		lvlId:        lvlId,
228 | 		cpuId:        cpuId,
229 | 		siblingCpuId: siblingCpuId,
230 | 	}
231 | 	data := (*[12]byte)(unsafe.Pointer(&arg))[:]
232 | 
233 | 	ret, err := s.siblingCpu.Run(&ebpf.RunOptions{
234 | 		Context: data[:],
235 | 	})
236 | 	if err != nil {
237 | 		return err
238 | 	}
239 | 	if ret != 0 {
240 | 		return fmt.Errorf("retVal: %v", ret)
241 | 	}
242 | 	return nil
243 | }
244 | 
245 | func (s *Sched) Attach() error {
246 | 	_, err := s.structOps.AttachStructOps()
247 | 	return err
248 | }
249 | 
250 | func (s *Sched) Close() {
251 | 	if s.selectCpuPrg != nil {
252 | 		s.selectCpuPrg.Close()
253 | 	}
254 | 	if s.siblingCpu != nil {
255 | 		s.siblingCpu.Close()
256 | 	}
257 | 	if s.preemptCpu != nil {
258 | 		s.preemptCpu.Close()
259 | 	}
260 | 	s.urb.Close()
261 | 	s.mod.Close()
262 | }
263 | 


--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"encoding/json"
  6 | 	"log"
  7 | 	"os"
  8 | 	"os/signal"
  9 | 	"syscall"
 10 | 	"time"
 11 | 
 12 | 	"github.com/Gthulhu/plugin/models"
 13 | 	core "github.com/Gthulhu/qumun/goland_core"
 14 | 	"github.com/Gthulhu/qumun/util"
 15 | )
 16 | 
 17 | const (
 18 | 	MAX_LATENCY_WEIGHT = 1000
 19 | 	SLICE_NS_DEFAULT   = 5000 * 1000 // 5ms
 20 | 	SLICE_NS_MIN       = 500 * 1000
 21 | 	SCX_ENQ_WAKEUP     = 1
 22 | 	NSEC_PER_SEC       = 1000000000 // 1 second in nanoseconds
 23 | 	PF_WQ_WORKER       = 0x00000020
 24 | )
 25 | 
 26 | const taskPoolSize = 4096
 27 | 
 28 | var taskPool = make([]Task, taskPoolSize)
 29 | var taskPoolCount = 0
 30 | var taskPoolHead, taskPoolTail int
 31 | 
 32 | func DrainQueuedTask(s *core.Sched) int {
 33 | 	var count int
 34 | 	for (taskPoolTail+1)%taskPoolSize != taskPoolHead {
 35 | 		var newQueuedTask models.QueuedTask
 36 | 		s.DequeueTask(&newQueuedTask)
 37 | 		if newQueuedTask.Pid == -1 {
 38 | 			s.DecNrQueued(count)
 39 | 			return count
 40 | 		}
 41 | 		deadline := updatedEnqueueTask(s, &newQueuedTask)
 42 | 		t := Task{
 43 | 			QueuedTask: &newQueuedTask,
 44 | 			Deadline:   deadline,
 45 | 		}
 46 | 		InsertTaskToPool(t)
 47 | 		count++
 48 | 	}
 49 | 	return 0
 50 | }
 51 | 
 52 | var timeout = uint64(3 * NSEC_PER_SEC)
 53 | 
 54 | func updatedEnqueueTask(s *core.Sched, t *models.QueuedTask) uint64 {
 55 | 	if minVruntime < t.Vtime {
 56 | 		minVruntime = t.Vtime
 57 | 	}
 58 | 	minVruntimeLocal := saturating_sub(minVruntime, SLICE_NS_DEFAULT)
 59 | 	if t.Vtime == 0 {
 60 | 		t.Vtime = minVruntimeLocal + (SLICE_NS_DEFAULT * 100 / t.Weight)
 61 | 	} else if t.Vtime < minVruntimeLocal {
 62 | 		t.Vtime = minVruntimeLocal
 63 | 	}
 64 | 	t.Vtime += (t.StopTs - t.StartTs) * t.Weight / 100
 65 | 
 66 | 	return t.Vtime + min(t.SumExecRuntime, SLICE_NS_DEFAULT*100)
 67 | }
 68 | 
 69 | func GetTaskFromPool() *models.QueuedTask {
 70 | 	if taskPoolHead == taskPoolTail {
 71 | 		return nil
 72 | 	}
 73 | 	t := &taskPool[taskPoolHead]
 74 | 	taskPoolHead = (taskPoolHead + 1) % taskPoolSize
 75 | 	taskPoolCount--
 76 | 	return t.QueuedTask
 77 | }
 78 | 
 79 | var minVruntime uint64 = 0 // global vruntime
 80 | 
 81 | func now() uint64 {
 82 | 	return uint64(time.Now().UnixNano())
 83 | }
 84 | 
 85 | func calcAvg(oldVal uint64, newVal uint64) uint64 {
 86 | 	return (oldVal - (oldVal >> 2)) + (newVal >> 2)
 87 | }
 88 | 
 89 | func saturating_sub(a, b uint64) uint64 {
 90 | 	if a > b {
 91 | 		return a - b
 92 | 	}
 93 | 	return 0
 94 | }
 95 | 
 96 | type Task struct {
 97 | 	*models.QueuedTask
 98 | 	Deadline  uint64
 99 | 	Timestamp uint64
100 | }
101 | 
102 | func LessQueuedTask(
103 | 	a, b *Task,
104 | ) bool {
105 | 	if a.Deadline != b.Deadline {
106 | 		return a.Deadline < b.Deadline
107 | 	}
108 | 	if a.Timestamp != b.Timestamp {
109 | 		return a.Timestamp < b.Timestamp
110 | 	}
111 | 	return a.Pid < b.Pid
112 | }
113 | 
114 | func InsertTaskToPool(
115 | 	newTask Task,
116 | ) bool {
117 | 	if taskPoolCount >= taskPoolSize-1 {
118 | 		return false
119 | 	}
120 | 	insertIdx := taskPoolTail
121 | 	for i := 0; i < taskPoolCount; i++ {
122 | 		idx := (taskPoolHead + i) % taskPoolSize
123 | 		if LessQueuedTask(
124 | 			&newTask,
125 | 			&taskPool[idx],
126 | 		) {
127 | 			insertIdx = idx
128 | 			break
129 | 		}
130 | 	}
131 | 
132 | 	cur := taskPoolTail
133 | 	for cur != insertIdx {
134 | 		next := (cur - 1 + taskPoolSize) % taskPoolSize
135 | 		taskPool[cur] = taskPool[next]
136 | 		cur = next
137 | 	}
138 | 	taskPool[insertIdx] = newTask
139 | 	taskPoolTail = (taskPoolTail + 1) % taskPoolSize
140 | 	taskPoolCount++
141 | 	return true
142 | }
143 | 
144 | func main() {
145 | 	bpfModule := core.LoadSched("main.bpf.o")
146 | 	defer bpfModule.Close()
147 | 	pid := os.Getpid()
148 | 	err := bpfModule.AssignUserSchedPid(pid)
149 | 	if err != nil {
150 | 		log.Printf("AssignUserSchedPid failed: %v", err)
151 | 	}
152 | 
153 | 	err = util.ImportScxEnums()
154 | 	if err != nil {
155 | 		log.Panicf("ImportScxEnums failed: %v", err)
156 | 	}
157 | 
158 | 	bpfModule.SetDebug(true)
159 | 	bpfModule.SetBuiltinIdle(true)
160 | 	bpfModule.Start()
161 | 
162 | 	err = util.InitCacheDomains(bpfModule)
163 | 	if err != nil {
164 | 		log.Panicf("InitCacheDomains failed: %v", err)
165 | 	}
166 | 
167 | 	if err := bpfModule.Attach(); err != nil {
168 | 		log.Panicf("bpfModule attach failed: %v", err)
169 | 	}
170 | 
171 | 	log.Printf("UserSched's Pid: %v", core.GetUserSchedPid())
172 | 	log.Printf("scheduler started")
173 | 
174 | 	signalChan := make(chan os.Signal, 1)
175 | 	signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM)
176 | 	cont := true
177 | 	timer := time.NewTicker(1 * time.Second)
178 | 	notifyCount := 0
179 | 
180 | 	ctx, cancel := context.WithCancel(context.Background())
181 | 
182 | 	go func() {
183 | 		var t *models.QueuedTask
184 | 		var task *core.DispatchedTask
185 | 		var err error
186 | 		var cpu int32
187 | 
188 | 		for true {
189 | 			select {
190 | 			case <-ctx.Done():
191 | 				return
192 | 			default:
193 | 			}
194 | 			t = GetTaskFromPool()
195 | 			if t == nil {
196 | 				bpfModule.BlockTilReadyForDequeue(ctx)
197 | 				DrainQueuedTask(bpfModule)
198 | 			} else if t.Pid != -1 {
199 | 				task = core.NewDispatchedTask(t)
200 | 				err, cpu = bpfModule.SelectCPU(t)
201 | 				if err != nil {
202 | 					log.Printf("SelectCPU failed: %v", err)
203 | 					return
204 | 				}
205 | 
206 | 				// Evaluate used task time slice.
207 | 				nrWaiting := core.GetNrQueued() + core.GetNrScheduled() + 1
208 | 				task.Vtime = t.Vtime
209 | 				task.SliceNs = max(SLICE_NS_DEFAULT/nrWaiting, SLICE_NS_MIN)
210 | 				task.Cpu = cpu
211 | 
212 | 				err = bpfModule.DispatchTask(task)
213 | 				if err != nil {
214 | 					log.Printf("DispatchTask failed: %v", err)
215 | 					return
216 | 				}
217 | 
218 | 				err = core.NotifyComplete(uint64(taskPoolCount))
219 | 				if err != nil {
220 | 					log.Printf("NotifyComplete failed: %v", err)
221 | 					return
222 | 				}
223 | 			}
224 | 		}
225 | 	}()
226 | 
227 | 	for cont {
228 | 		select {
229 | 		case <-signalChan:
230 | 			log.Println("receive os signal")
231 | 			cancel()
232 | 			cont = false
233 | 		case <-timer.C:
234 | 			notifyCount++
235 | 			if notifyCount%10 == 0 {
236 | 				bss, err := bpfModule.GetBssData()
237 | 				if err != nil {
238 | 					log.Println("GetBssData failed", "error", err)
239 | 				} else {
240 | 					b, err := json.Marshal(bss)
241 | 					if err != nil {
242 | 						log.Println("json.Marshal failed", "error", err)
243 | 					} else {
244 | 						log.Println("bss data", "data", string(b))
245 | 					}
246 | 				}
247 | 			}
248 | 			if bpfModule.Stopped() {
249 | 				log.Println("bpfModule stopped")
250 | 				uei, err := bpfModule.GetUeiData()
251 | 				if err == nil {
252 | 					log.Println("uei", "kind", uei.Kind, "exitCode", uei.ExitCode, "reason", uei.GetReason(), "message", uei.GetMessage())
253 | 				} else {
254 | 					log.Println("GetUeiData failed", "error", err)
255 | 				}
256 | 				cont = false
257 | 			}
258 | 		}
259 | 	}
260 | 	timer.Stop()
261 | 	log.Println("scheduler exit")
262 | }
263 | 


--------------------------------------------------------------------------------
/util/emun.go:
--------------------------------------------------------------------------------
  1 | package util
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"fmt"
  6 | 	"os"
  7 | 	"sync"
  8 | 
  9 | 	"github.com/cilium/ebpf/btf"
 10 | )
 11 | 
 12 | // ScxEnums mirrors the Rust Enums struct, holding values read from BTF enums
 13 | // in vmlinux. Missing symbols are left as zero.
 14 | type ScxEnums struct {
 15 | 	SCX_OPS_NAME_LEN           uint64
 16 | 	SCX_SLICE_DFL              uint64
 17 | 	SCX_SLICE_INF              uint64
 18 | 	SCX_RQ_ONLINE              uint64
 19 | 	SCX_RQ_CAN_STOP_TICK       uint64
 20 | 	SCX_RQ_BAL_PENDING         uint64
 21 | 	SCX_RQ_BAL_KEEP            uint64
 22 | 	SCX_RQ_BYPASSING           uint64
 23 | 	SCX_RQ_CLK_VALID           uint64
 24 | 	SCX_RQ_IN_WAKEUP           uint64
 25 | 	SCX_RQ_IN_BALANCE          uint64
 26 | 	SCX_DSQ_FLAG_BUILTIN       uint64
 27 | 	SCX_DSQ_FLAG_LOCAL_ON      uint64
 28 | 	SCX_DSQ_INVALID            uint64
 29 | 	SCX_DSQ_GLOBAL             uint64
 30 | 	SCX_DSQ_LOCAL              uint64
 31 | 	SCX_DSQ_LOCAL_ON           uint64
 32 | 	SCX_DSQ_LOCAL_CPU_MASK     uint64
 33 | 	SCX_TASK_QUEUED            uint64
 34 | 	SCX_TASK_RESET_RUNNABLE_AT uint64
 35 | 	SCX_TASK_DEQD_FOR_SLEEP    uint64
 36 | 	SCX_TASK_STATE_SHIFT       uint64
 37 | 	SCX_TASK_STATE_BITS        uint64
 38 | 	SCX_TASK_STATE_MASK        uint64
 39 | 	SCX_TASK_CURSOR            uint64
 40 | 	SCX_TASK_NONE              uint64
 41 | 	SCX_TASK_INIT              uint64
 42 | 	SCX_TASK_READY             uint64
 43 | 	SCX_TASK_ENABLED           uint64
 44 | 	SCX_TASK_NR_STATES         uint64
 45 | 	SCX_TASK_DSQ_ON_PRIQ       uint64
 46 | 	SCX_KICK_IDLE              uint64
 47 | 	SCX_KICK_PREEMPT           uint64
 48 | 	SCX_KICK_WAIT              uint64
 49 | 	SCX_ENQ_WAKEUP             uint64
 50 | 	SCX_ENQ_HEAD               uint64
 51 | 	SCX_ENQ_PREEMPT            uint64
 52 | 	SCX_ENQ_REENQ              uint64
 53 | 	SCX_ENQ_LAST               uint64
 54 | 	SCX_ENQ_CLEAR_OPSS         uint64
 55 | 	SCX_ENQ_DSQ_PRIQ           uint64
 56 | }
 57 | 
 58 | var (
 59 | 	loadOnce  sync.Once
 60 | 	enumsInst *ScxEnums
 61 | 	loadErr   error
 62 | )
 63 | 
 64 | // VmlinuxBTFPathEnv allows overriding the BTF vmlinux path.
 65 | const VmlinuxBTFPathEnv = "QUMUN_VMLINUX_BTF"
 66 | 
 67 | // Default vmlinux BTF path.
 68 | const defaultVmlinuxBTF = "/sys/kernel/btf/vmlinux"
 69 | 
 70 | // GetScxEnums returns the loaded enumeration values, performing a lazy load on first call.
 71 | func GetScxEnums() (*ScxEnums, error) {
 72 | 	loadOnce.Do(func() {
 73 | 		enumsInst, loadErr = loadFromBTF()
 74 | 	})
 75 | 	return enumsInst, loadErr
 76 | }
 77 | 
 78 | // loadFromBTF performs the actual parsing of BTF enums from vmlinux.
 79 | func loadFromBTF() (*ScxEnums, error) {
 80 | 	path := os.Getenv(VmlinuxBTFPathEnv)
 81 | 	if path == "" {
 82 | 		path = defaultVmlinuxBTF
 83 | 	}
 84 | 	spec, err := btf.LoadSpec(path)
 85 | 	if err != nil {
 86 | 		return nil, fmt.Errorf("load BTF spec from %s: %w", path, err)
 87 | 	}
 88 | 
 89 | 	enumCache := map[string]*btf.Enum{}
 90 | 	// Build a lookup map for required enum type names.
 91 | 	needed := map[string]struct{}{
 92 | 		"scx_public_consts": {},
 93 | 		"scx_rq_flags":      {},
 94 | 		"scx_dsq_id_flags":  {},
 95 | 		"scx_ent_flags":     {},
 96 | 		"scx_task_state":    {},
 97 | 		"scx_ent_dsq_flags": {},
 98 | 		"scx_kick_flags":    {},
 99 | 		"scx_enq_flags":     {},
100 | 	}
101 | 
102 | 	for t, err := range spec.All() {
103 | 		if err != nil {
104 | 			return nil, fmt.Errorf("iterate BTF types: %w", err)
105 | 		}
106 | 		if e, ok := t.(*btf.Enum); ok {
107 | 			if _, wanted := needed[e.Name]; wanted {
108 | 				enumCache[e.Name] = e
109 | 			}
110 | 		}
111 | 	}
112 | 	if len(enumCache) == 0 {
113 | 		return nil, errors.New("no required SCX enum types found in BTF")
114 | 	}
115 | 
116 | 	read := func(enumType, name string) uint64 {
117 | 		e := enumCache[enumType]
118 | 		if e == nil {
119 | 			return 0
120 | 		}
121 | 		for _, v := range e.Values {
122 | 			if v.Name == name {
123 | 				return uint64(v.Value)
124 | 			}
125 | 		}
126 | 		return 0
127 | 	}
128 | 
129 | 	scx := &ScxEnums{
130 | 		SCX_OPS_NAME_LEN:           read("scx_public_consts", "SCX_OPS_NAME_LEN"),
131 | 		SCX_SLICE_DFL:              read("scx_public_consts", "SCX_SLICE_DFL"),
132 | 		SCX_SLICE_INF:              read("scx_public_consts", "SCX_SLICE_INF"),
133 | 		SCX_RQ_ONLINE:              read("scx_rq_flags", "SCX_RQ_ONLINE"),
134 | 		SCX_RQ_CAN_STOP_TICK:       read("scx_rq_flags", "SCX_RQ_CAN_STOP_TICK"),
135 | 		SCX_RQ_BAL_PENDING:         read("scx_rq_flags", "SCX_RQ_BAL_PENDING"),
136 | 		SCX_RQ_BAL_KEEP:            read("scx_rq_flags", "SCX_RQ_BAL_KEEP"),
137 | 		SCX_RQ_BYPASSING:           read("scx_rq_flags", "SCX_RQ_BYPASSING"),
138 | 		SCX_RQ_CLK_VALID:           read("scx_rq_flags", "SCX_RQ_CLK_VALID"),
139 | 		SCX_RQ_IN_WAKEUP:           read("scx_rq_flags", "SCX_RQ_IN_WAKEUP"),
140 | 		SCX_RQ_IN_BALANCE:          read("scx_rq_flags", "SCX_RQ_IN_BALANCE"),
141 | 		SCX_DSQ_FLAG_BUILTIN:       read("scx_dsq_id_flags", "SCX_DSQ_FLAG_BUILTIN"),
142 | 		SCX_DSQ_FLAG_LOCAL_ON:      read("scx_dsq_id_flags", "SCX_DSQ_FLAG_LOCAL_ON"),
143 | 		SCX_DSQ_INVALID:            read("scx_dsq_id_flags", "SCX_DSQ_INVALID"),
144 | 		SCX_DSQ_GLOBAL:             read("scx_dsq_id_flags", "SCX_DSQ_GLOBAL"),
145 | 		SCX_DSQ_LOCAL:              read("scx_dsq_id_flags", "SCX_DSQ_LOCAL"),
146 | 		SCX_DSQ_LOCAL_ON:           read("scx_dsq_id_flags", "SCX_DSQ_LOCAL_ON"),
147 | 		SCX_DSQ_LOCAL_CPU_MASK:     read("scx_dsq_id_flags", "SCX_DSQ_LOCAL_CPU_MASK"),
148 | 		SCX_TASK_QUEUED:            read("scx_ent_flags", "SCX_TASK_QUEUED"),
149 | 		SCX_TASK_RESET_RUNNABLE_AT: read("scx_ent_flags", "SCX_TASK_RESET_RUNNABLE_AT"),
150 | 		SCX_TASK_DEQD_FOR_SLEEP:    read("scx_ent_flags", "SCX_TASK_DEQD_FOR_SLEEP"),
151 | 		SCX_TASK_STATE_SHIFT:       read("scx_ent_flags", "SCX_TASK_STATE_SHIFT"),
152 | 		SCX_TASK_STATE_BITS:        read("scx_ent_flags", "SCX_TASK_STATE_BITS"),
153 | 		SCX_TASK_STATE_MASK:        read("scx_ent_flags", "SCX_TASK_STATE_MASK"),
154 | 		SCX_TASK_CURSOR:            read("scx_ent_flags", "SCX_TASK_CURSOR"),
155 | 		SCX_TASK_NONE:              read("scx_task_state", "SCX_TASK_NONE"),
156 | 		SCX_TASK_INIT:              read("scx_task_state", "SCX_TASK_INIT"),
157 | 		SCX_TASK_READY:             read("scx_task_state", "SCX_TASK_READY"),
158 | 		SCX_TASK_ENABLED:           read("scx_task_state", "SCX_TASK_ENABLED"),
159 | 		SCX_TASK_NR_STATES:         read("scx_task_state", "SCX_TASK_NR_STATES"),
160 | 		SCX_TASK_DSQ_ON_PRIQ:       read("scx_ent_dsq_flags", "SCX_TASK_DSQ_ON_PRIQ"),
161 | 		SCX_KICK_IDLE:              read("scx_kick_flags", "SCX_KICK_IDLE"),
162 | 		SCX_KICK_PREEMPT:           read("scx_kick_flags", "SCX_KICK_PREEMPT"),
163 | 		SCX_KICK_WAIT:              read("scx_kick_flags", "SCX_KICK_WAIT"),
164 | 		SCX_ENQ_WAKEUP:             read("scx_enq_flags", "SCX_ENQ_WAKEUP"),
165 | 		SCX_ENQ_HEAD:               read("scx_enq_flags", "SCX_ENQ_HEAD"),
166 | 		SCX_ENQ_PREEMPT:            read("scx_enq_flags", "SCX_ENQ_PREEMPT"),
167 | 		SCX_ENQ_REENQ:              read("scx_enq_flags", "SCX_ENQ_REENQ"),
168 | 		SCX_ENQ_LAST:               read("scx_enq_flags", "SCX_ENQ_LAST"),
169 | 		SCX_ENQ_CLEAR_OPSS:         read("scx_enq_flags", "SCX_ENQ_CLEAR_OPSS"),
170 | 		SCX_ENQ_DSQ_PRIQ:           read("scx_enq_flags", "SCX_ENQ_DSQ_PRIQ"),
171 | 	}
172 | 
173 | 	return scx, nil
174 | }
175 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   <signature of Ty Coon>, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.


--------------------------------------------------------------------------------
/main.bpf.c:
--------------------------------------------------------------------------------
   1 | /* Copyright (c) Ian Chen <ychen.desl@gmail.com> */
   2 | /* The scx_goland is based on scx_rustland_core */
   3 | /* Copyright (c) Andrea Righi <andrea.righi@linux.dev> */
   4 | /*
   5 |  * scx_rustland_core: BPF backend for schedulers running in user-space.
   6 |  *
   7 |  * This BPF backend implements the low level sched-ext functionalities for a
   8 |  * user-space counterpart, that implements the actual scheduling policy.
   9 |  *
  10 |  * The BPF part collects total cputime and weight from the tasks that need to
  11 |  * run, then it sends all details to the user-space scheduler that decides the
  12 |  * best order of execution of the tasks (based on the collected metrics).
  13 |  *
  14 |  * The user-space scheduler then returns to the BPF component the list of tasks
  15 |  * to be dispatched in the proper order.
  16 |  *
  17 |  * Messages between the BPF component and the user-space scheduler are passed
  18 |  * using BPF_MAP_TYPE_RINGBUF / BPF_MAP_TYPE_USER_RINGBUF maps: @queued for
  19 |  * the messages sent by the BPF dispatcher to the user-space scheduler and
  20 |  * @dispatched for the messages sent by the user-space scheduler to the BPF
  21 |  * dispatcher.
  22 |  *
  23 |  * The BPF dispatcher is completely agnostic of the particular scheduling
  24 |  * policy implemented in user-space. For this reason developers that are
  25 |  * willing to use this scheduler to experiment scheduling policies should be
  26 |  * able to simply modify the Rust component, without having to deal with any
  27 |  * internal kernel / BPF details.
  28 |  *
  29 |  * This software may be used and distributed according to the terms of the
  30 |  * GNU General Public License version 2.
  31 |  */
  32 | #ifdef LSP
  33 | #define __bpf__
  34 | #include "../../../../scheds/include/scx/common.bpf.h"
  35 | #else
  36 | #include <scx/common.bpf.h>
  37 | #endif
  38 | 
  39 | #include <scx/percpu.bpf.h>
  40 | #include "intf.h"
  41 | #include <bpf/bpf_helpers.h>
  42 | 
  43 | /* Compatibility fallbacks for kernel flag macros that may not be defined
  44 |  * in older build environments or trimmed header sets used during BPF
  45 |  * compilation. Define them as 0 if missing so bitwise checks become no-op.
  46 |  */
  47 | #ifndef PF_KSWAPD
  48 | #define PF_KSWAPD 0
  49 | #endif
  50 | #ifndef PF_KCOMPACTD
  51 | #define PF_KCOMPACTD 0
  52 | #endif
  53 | 
  54 | char _license[] SEC("license") = "GPL";
  55 | 
  56 | UEI_DEFINE(uei);
  57 | 
  58 | /*
  59 |  * Introduce a custom DSQ shared across all the CPUs, where we can dispatch
  60 |  * tasks that will be executed on the first CPU available.
  61 |  *
  62 |  * Per-CPU DSQs are also provided, to allow the scheduler to run a task on a
  63 |  * specific CPU (see dsq_init()).
  64 |  */
  65 | #define SHARED_DSQ MAX_CPUS
  66 | 
  67 | /*
  68 |  * The user-space scheduler itself is dispatched using a separate DSQ, that
  69 |  * is consumed after all other DSQs.
  70 |  *
  71 |  * This ensures to work in bursts: tasks are queued, then the user-space
  72 |  * scheduler runs and dispatches them. Once all these tasks exhaust their
  73 |  * time slices, the scheduler is invoked again, repeating the cycle.
  74 |  */
  75 | #define SCHED_DSQ (MAX_CPUS + 1)
  76 | 
  77 | /*
  78 |  * Safety cap for dispatching usersched threads per invocation.
  79 |  */
  80 | #define MAX_USERSCHED_DISPATCH 64
  81 | 
  82 | /*
  83 |  * Scheduler attributes and statistics.
  84 |  */
  85 | const volatile u32 usersched_pid; /* User-space scheduler PID */
  86 | const volatile u32 khugepaged_pid; /* khugepaged PID */
  87 | u64 usersched_last_run_at; /* Timestamp of the last user-space scheduler execution */
  88 | static u64 nr_cpu_ids; /* Maximum possible CPU number */
  89 | 
  90 | /*
  91 |  * Number of tasks that are queued for scheduling.
  92 |  *
  93 |  * This number is incremented by the BPF component when a task is queued to the
  94 |  * user-space scheduler and it must be decremented by the user-space scheduler
  95 |  * when a task is consumed.
  96 |  */
  97 | volatile u64 nr_queued;
  98 | 
  99 | /*
 100 |  * Number of tasks that are waiting for scheduling.
 101 |  *
 102 |  * This number must be updated by the user-space scheduler to keep track if
 103 |  * there is still some scheduling work to do.
 104 |  */
 105 | volatile u64 nr_scheduled;
 106 | 
 107 | /*
 108 |  * Amount of currently running tasks.
 109 |  */
 110 | volatile u64 nr_running, nr_online_cpus;
 111 | 
 112 | /* Dispatch statistics */
 113 | volatile u64 nr_user_dispatches, nr_kernel_dispatches,
 114 | 	     nr_cancel_dispatches, nr_bounce_dispatches;
 115 | 
 116 | /* Failure statistics */
 117 | volatile u64 nr_failed_dispatches, nr_sched_congested;
 118 | 
 119 | /* Report additional debugging information */
 120 | const volatile bool debug;
 121 | 
 122 | const volatile bool early_processing;
 123 | 
 124 | const volatile u64 default_slice = 20000000ULL; 
 125 | 
 126 | /* Rely on the in-kernel idle CPU selection policy */
 127 | const volatile bool builtin_idle;
 128 | 
 129 | /* Allow to use bpf_printk() only when @debug is set */
 130 | #define dbg_msg(_fmt, ...) do {						\
 131 | 	if (debug)										\
 132 | 		bpf_printk(_fmt, ##__VA_ARGS__);			\
 133 | } while(0)
 134 | 
 135 | /*
 136 |  * CPUs in the system have SMT is enabled.
 137 |  */
 138 | const volatile bool smt_enabled = true;
 139 | 
 140 | /*
 141 |  * Allocate/re-allocate a new cpumask.
 142 |  */
 143 | static int calloc_cpumask(struct bpf_cpumask **p_cpumask)
 144 | {
 145 | 	struct bpf_cpumask *cpumask;
 146 | 
 147 | 	cpumask = bpf_cpumask_create();
 148 | 	if (!cpumask)
 149 | 		return -ENOMEM;
 150 | 
 151 | 	cpumask = bpf_kptr_xchg(p_cpumask, cpumask);
 152 | 	if (cpumask)
 153 | 		bpf_cpumask_release(cpumask);
 154 | 
 155 | 	return 0;
 156 | }
 157 | 
 158 | /*
 159 |  * Maximum amount of tasks queued between kernel and user-space at a certain
 160 |  * time.
 161 |  *
 162 |  * The @queued and @dispatched lists are used in a producer/consumer fashion
 163 |  * between the BPF part and the user-space part.
 164 |  */
 165 | #define MAX_ENQUEUED_TASKS 4096
 166 | 
 167 | /*
 168 |  * Maximum amount of slots reserved to the tasks dispatched via shared queue.
 169 |  */
 170 | #define MAX_DISPATCH_SLOT (MAX_ENQUEUED_TASKS / 8)
 171 | 
 172 | /*
 173 |  * The map containing tasks that are queued to user space from the kernel.
 174 |  *
 175 |  * This map is drained by the user-space scheduler.
 176 |  */
 177 | struct {
 178 | 	__uint(type, BPF_MAP_TYPE_RINGBUF);
 179 | 	__uint(max_entries, MAX_ENQUEUED_TASKS *
 180 | 				sizeof(struct queued_task_ctx));
 181 | } queued SEC(".maps");
 182 | 
 183 | /*
 184 |  * The user ring buffer containing pids that are dispatched from user space to
 185 |  * the kernel.
 186 |  *
 187 |  * Drained by the kernel in .dispatch().
 188 |  */
 189 | struct {
 190 | 	__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
 191 | 	__uint(max_entries, MAX_ENQUEUED_TASKS *
 192 | 				sizeof(struct dispatched_task_ctx));
 193 | } dispatched SEC(".maps");
 194 | 
 195 | /*
 196 |  * Map to track PIDs with vtime==0 (priority tasks).
 197 |  *
 198 |  * This hashmap stores PIDs as both key and value for tasks that have
 199 |  * vtime set to 0, indicating they are high priority tasks.
 200 |  */
 201 | struct {
 202 | 	__uint(type, BPF_MAP_TYPE_HASH);
 203 | 	__type(key, u32);    /* PID */
 204 | 	__type(value, u64);   /* time slice */
 205 | 	__uint(max_entries, MAX_ENQUEUED_TASKS);
 206 | } priority_tasks SEC(".maps");
 207 | 
 208 | struct {
 209 | 	__uint(type, BPF_MAP_TYPE_HASH);
 210 | 	__type(key, s32);    /* CPU */
 211 | 	__type(value, u32);   /* PID */
 212 | 	__uint(max_entries, MAX_CPUS);
 213 | } running_task SEC(".maps");
 214 | 
 215 | /*
 216 |  * Per-CPU context.
 217 |  */
 218 | struct cpu_ctx {
 219 | 	struct bpf_cpumask __kptr *l2_cpumask;
 220 | 	struct bpf_cpumask __kptr *l3_cpumask;
 221 | };
 222 | 
 223 | struct {
 224 | 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 225 | 	__type(key, u32);
 226 | 	__type(value, struct cpu_ctx);
 227 | 	__uint(max_entries, 1);
 228 | } cpu_ctx_stor SEC(".maps");
 229 | 
 230 | /*
 231 |  * Return a CPU context.
 232 |  */
 233 | struct cpu_ctx *try_lookup_cpu_ctx(s32 cpu)
 234 | {
 235 | 	const u32 idx = 0;
 236 | 	return bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &idx, cpu);
 237 | }
 238 | 
 239 | /*
 240 |  * Per-task local storage.
 241 |  *
 242 |  * This contain all the per-task information used internally by the BPF code.
 243 |  */
 244 | struct task_ctx {
 245 | 	/*
 246 | 	 * Temporary cpumask for calculating scheduling domains.
 247 | 	 */
 248 | 	struct bpf_cpumask __kptr *l2_cpumask;
 249 | 	struct bpf_cpumask __kptr *l3_cpumask;
 250 | 
 251 | 	/*
 252 | 	 * Timestamp since last time the task ran on a CPU.
 253 | 	 */
 254 | 	u64 start_ts;
 255 | 
 256 | 	/*
 257 | 	 * Timestamp since last time the task released a CPU.
 258 | 	 */
 259 | 	u64 stop_ts;
 260 | 
 261 | 	/*
 262 | 	 * Execution time (in nanoseconds) since the last sleep event.
 263 | 	 */
 264 | 	u64 exec_runtime;
 265 | };
 266 | 
 267 | /* Map that contains task-local storage. */
 268 | struct {
 269 | 	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
 270 | 	__uint(map_flags, BPF_F_NO_PREALLOC);
 271 | 	__type(key, int);
 272 | 	__type(value, struct task_ctx);
 273 | } task_ctx_stor SEC(".maps");
 274 | 
 275 | /*
 276 |  * Return a local task context from a generic task or NULL if the context
 277 |  * doesn't exist.
 278 |  */
 279 | struct task_ctx *try_lookup_task_ctx(const struct task_struct *p)
 280 | {
 281 | 	struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor,
 282 | 						(struct task_struct *)p, 0, 0);
 283 | 	if (!tctx)
 284 | 		dbg_msg("warning: failed to get task context for pid=%d (%s)",
 285 | 			p->pid, p->comm);
 286 | 	return tctx;
 287 | }
 288 | 
 289 | /*
 290 |  * Heartbeat timer used to periodically trigger the check to run the user-space
 291 |  * scheduler.
 292 |  *
 293 |  * Without this timer we may starve the scheduler if the system is completely
 294 |  * idle and hit the watchdog that would auto-kill this scheduler.
 295 |  */
 296 | struct usersched_timer {
 297 | 	struct bpf_timer timer;
 298 | };
 299 | 
 300 | struct {
 301 | 	__uint(type, BPF_MAP_TYPE_ARRAY);
 302 | 	__uint(max_entries, 1);
 303 | 	__type(key, u32);
 304 | 	__type(value, struct usersched_timer);
 305 | } usersched_timer SEC(".maps");
 306 | 
 307 | /*
 308 |  * Time period of the scheduler heartbeat, used to periodically kick the
 309 |  * user-space scheduler and check if there is any pending activity.
 310 |  */
 311 | #define USERSCHED_TIMER_NS (NSEC_PER_SEC / 10)
 312 | 
 313 | /*
 314 |  * Return true if the target task @p is the user-space scheduler.
 315 |  */
 316 | static inline bool is_usersched_task(const struct task_struct *p)
 317 | {
 318 | 	return p->pid == usersched_pid;
 319 | }
 320 | 
 321 | /*
 322 |  * Return true if the target task @p belongs to the user-space scheduler.
 323 |  */
 324 | static inline bool is_belong_usersched_task(const struct task_struct *p)
 325 | {
 326 | 	return p->tgid == usersched_pid;
 327 | }
 328 | 
 329 | /*
 330 |  * Return true if the target task @p is a kernel thread.
 331 |  */
 332 | static inline bool is_kthread(const struct task_struct *p)
 333 | {
 334 | 	return p->flags & PF_KTHREAD;
 335 | }
 336 | 
 337 | /*
 338 |  * Return true if the target task @p is a kworker thread.
 339 |  */
 340 | static inline bool is_kworker(const struct task_struct *p)
 341 | {
 342 | 	return p->flags & PF_WQ_WORKER;
 343 | }
 344 | 
 345 | /*
 346 |  * Return true if the target task @p is kswapd.
 347 |  */
 348 | static inline bool is_kswapd(const struct task_struct *p)
 349 | {
 350 |         return p->flags & (PF_KSWAPD | PF_KCOMPACTD);
 351 | }
 352 | 
 353 | /*
 354 |  * Return true if the target task @p is khugepaged, false otherwise.
 355 |  */
 356 | static inline bool is_khugepaged(const struct task_struct *p)
 357 | {
 358 | 	return khugepaged_pid && p->pid == khugepaged_pid;
 359 | }
 360 | 
 361 | /*
 362 |  * Return true if @p still wants to run, false otherwise.
 363 |  */
 364 | static bool is_queued(const struct task_struct *p)
 365 | {
 366 | 	return p->scx.flags & SCX_TASK_QUEUED;
 367 | }
 368 | 
 369 | /*
 370 |  * Flag used to wake-up the user-space scheduler.
 371 |  */
 372 | static volatile u32 usersched_needed;
 373 | 
 374 | /*
 375 |  * Set user-space scheduler wake-up flag (equivalent to an atomic release
 376 |  * operation).
 377 |  */
 378 | static void set_usersched_needed(void)
 379 | {
 380 | 	__sync_fetch_and_or(&usersched_needed, 1);
 381 | }
 382 | 
 383 | /*
 384 |  * Check and clear user-space scheduler wake-up flag (equivalent to an atomic
 385 |  * acquire operation).
 386 |  */
 387 | static bool test_and_clear_usersched_needed(void)
 388 | {
 389 | 	return __sync_fetch_and_and(&usersched_needed, 0) == 1;
 390 | }
 391 | 
 392 | /*
 393 |  * Return true if there's any pending activity to do for the scheduler, false
 394 |  * otherwise.
 395 |  *
 396 |  * NOTE: a task is sent to the user-space scheduler using the "queued"
 397 |  * ringbuffer, then the scheduler drains the queued tasks and adds them to
 398 |  * its internal data structures / state; at this point tasks become
 399 |  * "scheduled" and the user-space scheduler will take care of updating
 400 |  * nr_scheduled accordingly; lastly tasks will be dispatched and the
 401 |  * user-space scheduler will update nr_scheduled again.
 402 |  *
 403 |  * Checking nr_scheduled and the available data in the ringbuffer allows to
 404 |  * determine if there is still some pending work to do for the scheduler:
 405 |  * new tasks have been queued since last check, or there are still tasks
 406 |  * "queued" or "scheduled" since the previous user-space scheduler run.
 407 |  *
 408 |  * If there's no pending action, it is pointless to wake-up the scheduler
 409 |  * (even if a CPU becomes idle), because there is nothing to do.
 410 |  *
 411 |  * Also keep in mind that we don't need any protection here since this code
 412 |  * doesn't run concurrently with the user-space scheduler (that is single
 413 |  * threaded), therefore this check is also safe from a concurrency perspective.
 414 |  */
 415 | static bool usersched_has_pending_tasks(void)
 416 | {
 417 | 	if (usersched_needed)
 418 | 		return true;
 419 | 
 420 | 	if (nr_queued || nr_scheduled)
 421 | 		return true;
 422 | 
 423 | 	return bpf_ringbuf_query(&queued, BPF_RB_AVAIL_DATA) > 0;
 424 | }
 425 | 
 426 | /*
 427 |  * Return the DSQ ID associated to a CPU, or SHARED_DSQ if the CPU is not
 428 |  * valid.
 429 |  */
 430 | static u64 cpu_to_dsq(s32 cpu)
 431 | {
 432 | 	if (cpu < 0 || cpu >= MAX_CPUS) {
 433 | 		scx_bpf_error("Invalid cpu: %d", cpu);
 434 | 		return SHARED_DSQ;
 435 | 	}
 436 | 	return (u64)cpu;
 437 | }
 438 | 
 439 | /*
 440 |  * Helper function to update priority tasks map based on vtime.
 441 |  * If vtime == 0, add PID to map. If vtime != 0, remove PID from map.
 442 |  */
 443 | static void update_priority_task_map(u32 pid, u64 vtime, u64 slice)
 444 | {
 445 | 	if (vtime == 0) {
 446 | 		bpf_map_update_elem(&priority_tasks, &pid, &slice, BPF_ANY);
 447 | 	} else {
 448 | 		bpf_map_delete_elem(&priority_tasks, &pid);
 449 | 	}
 450 | }
 451 | 
 452 | /*
 453 |  * Return true if @this_cpu and @that_cpu are in the same LLC, false
 454 |  * otherwise.
 455 |  */
 456 | static inline bool cpus_share_cache(s32 this_cpu, s32 that_cpu)
 457 | {
 458 |         if (this_cpu == that_cpu)
 459 |                 return true;
 460 | 
 461 | 	return cpu_llc_id(this_cpu) == cpu_llc_id(that_cpu);
 462 | }
 463 | 
 464 | /*
 465 |  * Return true if @this_cpu is faster than @that_cpu, false otherwise.
 466 |  */
 467 | static inline bool is_cpu_faster(s32 this_cpu, s32 that_cpu)
 468 | {
 469 |         if (this_cpu == that_cpu)
 470 |                 return false;
 471 | 
 472 | 	return cpu_priority(this_cpu) > cpu_priority(that_cpu);
 473 | }
 474 | 
 475 | /*
 476 |  * Return true if @cpu is a fully-idle SMT core, false otherwise.
 477 |  */
 478 | static inline bool is_smt_idle(s32 cpu)
 479 | {
 480 | 	const struct cpumask *idle_smtmask;
 481 |         bool is_idle;
 482 | 
 483 | 	if (!smt_enabled)
 484 | 		return true;
 485 | 
 486 | 	idle_smtmask = scx_bpf_get_idle_smtmask();
 487 |         is_idle = bpf_cpumask_test_cpu(cpu, idle_smtmask);
 488 |         scx_bpf_put_cpumask(idle_smtmask);
 489 | 
 490 | 	return is_idle;
 491 | }
 492 | 
 493 | /*
 494 |  * Return true on a wake-up event, false otherwise.
 495 |  */
 496 | static inline bool is_wakeup(u64 wake_flags)
 497 | {
 498 | 	return wake_flags & SCX_WAKE_TTWU;
 499 | }
 500 | 
 501 | /*
 502 |  * Find an idle CPU in the system for the task.
 503 |  *
 504 |  * NOTE: the idle CPU selection doesn't need to be formally perfect, it is
 505 |  * totally fine to accept racy conditions and potentially make mistakes, by
 506 |  * picking CPUs that are not idle or even offline, the logic has been designed
 507 |  * to handle these mistakes in favor of a more efficient response and a reduced
 508 |  * scheduling overhead.
 509 |  */
 510 | static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
 511 | {
 512 | 	s32 cpu, this_cpu = bpf_get_smp_processor_id();
 513 | 	bool is_this_cpu_allowed = bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr);
 514 | 
 515 | 	/*
 516 | 	 * For tasks that can run only on a single CPU, we can simply verify if
 517 | 	 * their only allowed CPU is still idle.
 518 | 	 */
 519 | 	if (p->nr_cpus_allowed == 1) {
 520 | 		if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
 521 | 			return prev_cpu;
 522 | 
 523 | 		return -EBUSY;
 524 | 	}
 525 | 
 526 | 	/*
 527 | 	 * On wakeup if the waker's CPU is faster than the wakee's CPU, try
 528 | 	 * to move the wakee closer to the waker.
 529 | 	 *
 530 | 	 * In presence of hybrid cores this helps to naturally migrate
 531 | 	 * tasks over to the faster cores.
 532 | 	 */
 533 | 	if (is_wakeup(wake_flags) &&
 534 | 	    is_cpu_faster(this_cpu, prev_cpu) && is_this_cpu_allowed) {
 535 | 		/*
 536 | 		 * If both the waker's CPU and the wakee's CPU are in the
 537 | 		 * same LLC and the wakee's CPU is a fully idle SMT core,
 538 | 		 * don't migrate.
 539 | 		 */
 540 | 		if (cpus_share_cache(this_cpu, prev_cpu) &&
 541 | 		    is_smt_idle(prev_cpu) &&
 542 | 		    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
 543 | 			return prev_cpu;
 544 | 
 545 | 		prev_cpu = this_cpu;
 546 | 	}
 547 | 
 548 | 	/*
 549 | 	 * Fallback to the old API if the kernel doesn't support
 550 | 	 * scx_bpf_select_cpu_and().
 551 | 	 *
 552 | 	 * This is required to support kernels <= 6.16.
 553 | 	 */
 554 | 	if (!bpf_ksym_exists(scx_bpf_select_cpu_and)) {
 555 | 		bool is_idle = false;
 556 | 
 557 | 		if (!wake_flags)
 558 | 			return -EBUSY;
 559 | 
 560 | 		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
 561 | 
 562 | 		return is_idle ? cpu : -EBUSY;
 563 | 	}
 564 | 
 565 | 	/*
 566 | 	 * Pick any idle CPU usable by the task.
 567 | 	 */
 568 | 	return scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, p->cpus_ptr, 0);
 569 | }
 570 | 
 571 | /*
 572 |  * Wake-up a target @cpu for the dispatched task @p. If @cpu can't be used
 573 |  * wakeup another valid CPU.
 574 |  */
 575 | static void kick_task_cpu(const struct task_struct *p, s32 cpu)
 576 | {
 577 | 	if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
 578 | 		/*
 579 | 		 * Kick the target CPU anyway, since it may be locked and
 580 | 		 * needs to go back to idle to reset its state.
 581 | 		 */
 582 | 		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
 583 | 
 584 | 		/*
 585 | 		 * Pick any other idle CPU that the task can use.
 586 | 		 */
 587 | 		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 588 | 		if (cpu < 0)
 589 | 			return;
 590 | 	}
 591 | 	scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
 592 | }
 593 | 
 594 | /*
 595 |  * Dispatch a task to a target per-CPU DSQ, waking up the corresponding CPU, if
 596 |  * needed.
 597 |  */
 598 | static void dispatch_task(const struct dispatched_task_ctx *task)
 599 | {
 600 | 	struct task_struct *p;
 601 | 	s32 prev_cpu;
 602 | 
 603 | 	/* Ignore entry if the task doesn't exist anymore */
 604 | 	p = bpf_task_from_pid(task->pid);
 605 | 	if (!p)
 606 | 		return;
 607 | 	prev_cpu = scx_bpf_task_cpu(p);
 608 | 
 609 | 	/*
 610 | 	 * Dispatch task to the shared DSQ if the user-space scheduler
 611 | 	 * didn't select any specific target CPU.
 612 | 	 */
 613 | 	if (task->cpu == RL_CPU_ANY) {
 614 | 		scx_bpf_dsq_insert_vtime(p, SHARED_DSQ,
 615 | 					 task->slice_ns, task->vtime, task->flags);
 616 | 		kick_task_cpu(p, prev_cpu);
 617 | 
 618 | 		goto out_release;
 619 | 	}
 620 | 
 621 | 	/*
 622 | 	 * If the target CPU selected by the user-space scheduler is not
 623 | 	 * valid, dispatch it to the SHARED_DSQ, independently on what the
 624 | 	 * user-space scheduler has decided.
 625 | 	 */
 626 | 	if (!bpf_cpumask_test_cpu(task->cpu, p->cpus_ptr)) {
 627 | 		scx_bpf_dsq_insert_vtime(p, SHARED_DSQ,
 628 | 					 task->slice_ns, task->vtime, task->flags);
 629 | 		__sync_fetch_and_add(&nr_bounce_dispatches, 1);
 630 | 		kick_task_cpu(p, prev_cpu);
 631 | 
 632 | 		goto out_release;
 633 | 	}
 634 | 
 635 | 	/*
 636 | 	 * Dispatch a task to a target CPU selected by the user-space
 637 | 	 * scheduler.
 638 | 	 */
 639 | 	if (task->vtime) {
 640 | 		scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(task->cpu),
 641 | 				task->slice_ns, task->vtime, task->flags);
 642 | 		__sync_fetch_and_add(&nr_user_dispatches, 1);
 643 | 	} else {
 644 | 		s32 cur_pid;
 645 | 		u64* elem;
 646 | 		cur_pid = task->pid;
 647 | 		elem = bpf_map_lookup_elem(&priority_tasks, &cur_pid);
 648 | 		if (!elem){
 649 | 			scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(task->cpu),
 650 | 				task->slice_ns, task->vtime, task->flags);
 651 | 			__sync_fetch_and_add(&nr_user_dispatches, 1);
 652 | 		}
 653 | 	}
 654 | 	update_priority_task_map(task->pid, task->vtime, task->slice_ns);
 655 | 
 656 | 	/*
 657 | 	 * If the cpumask is not valid anymore, ignore the dispatch event.
 658 | 	 *
 659 | 	 * This can happen if the task has changed its affinity and the
 660 | 	 * target CPU has become invalid. In this case cancelling the
 661 | 	 * dispatch allows to prevent potential stalls in the scheduler,
 662 | 	 * since the task will be re-enqueued by the core sched-ext code,
 663 | 	 * potentially selecting a different CPU.
 664 | 	 */
 665 | 	if (!bpf_cpumask_test_cpu(task->cpu, p->cpus_ptr)) {
 666 | 		scx_bpf_dispatch_cancel();
 667 | 		__sync_fetch_and_add(&nr_cancel_dispatches, 1);
 668 | 
 669 | 		goto out_release;
 670 | 	}
 671 | 
 672 | 	scx_bpf_kick_cpu(task->cpu, SCX_KICK_IDLE);
 673 | 
 674 | out_release:
 675 | 	bpf_task_release(p);
 676 | }
 677 | 
 678 | /*
 679 |  * Return true if the waker commits to release the CPU after waking up @p,
 680 |  * false otherwise.
 681 |  */
 682 | static bool is_wake_sync(u64 wake_flags)
 683 | {
 684 | 	const struct task_struct *current = (void *)bpf_get_current_task_btf();
 685 | 
 686 | 	return (wake_flags & SCX_WAKE_SYNC) && !(current->flags & PF_EXITING);
 687 | }
 688 | 
 689 | /*
 690 |  * Return true it's safe to dispatch directly on @cpu, false otherwise.
 691 |  */
 692 | static bool can_direct_dispatch(s32 cpu)
 693 | {
 694 | 	return !scx_bpf_dsq_nr_queued(SHARED_DSQ) &&
 695 | 	       !scx_bpf_dsq_nr_queued(cpu_to_dsq(cpu));
 696 | }
 697 | 
 698 | s32 BPF_STRUCT_OPS(goland_select_cpu, struct task_struct *p, s32 prev_cpu,
 699 | 		   u64 wake_flags)
 700 | {
 701 | 	s32 cpu, this_cpu = bpf_get_smp_processor_id();
 702 | 	bool is_this_cpu_allowed = bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr);
 703 | 
 704 | 	/*
 705 | 	 * Make sure @prev_cpu is usable, otherwise try to move close to
 706 | 	 * the waker's CPU. If the waker's CPU is also not usable, then
 707 | 	 * pick the first usable CPU.
 708 | 	 */
 709 | 	if (!bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr))
 710 | 		prev_cpu = is_this_cpu_allowed ? this_cpu : bpf_cpumask_first(p->cpus_ptr);
 711 | 
 712 | 	/*
 713 | 	 * Scheduler is dispatched directly in .dispatch() when needed, so
 714 | 	 * we can skip it here.
 715 | 	 */
 716 | 	if (is_belong_usersched_task(p))
 717 | 		return prev_cpu;
 718 | 
 719 | 	/*
 720 | 	 * If built-in idle CPU policy is not enabled, completely delegate
 721 | 	 * the idle selection policy to user-space and keep re-using the
 722 | 	 * same CPU here.
 723 | 	 */
 724 | 	if (!builtin_idle)
 725 | 		return prev_cpu;
 726 | 
 727 | 	/*
 728 | 	 * Pick the idle CPU closest to @prev_cpu usable by the task.
 729 | 	 */
 730 | 	cpu = pick_idle_cpu(p, prev_cpu, wake_flags);
 731 | 	if (cpu >= 0) {
 732 | 		if (can_direct_dispatch(cpu)) {
 733 | 			scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(cpu),
 734 | 						 default_slice, p->scx.dsq_vtime, 0);
 735 | 			__sync_fetch_and_add(&nr_kernel_dispatches, 1);
 736 | 		}
 737 | 		return cpu;
 738 | 	}
 739 | 
 740 | 	/*
 741 | 	 * If we couldn't find an idle CPU, in case of a sync wakeup
 742 | 	 * prioritize the waker's CPU.
 743 | 	 */
 744 | 	return prev_cpu;
 745 | }
 746 | 
 747 | SEC("syscall")
 748 | int do_preempt(struct preempt_cpu_arg *input)
 749 | {	
 750 | 	scx_bpf_kick_cpu(input->cpu_id, SCX_KICK_PREEMPT);
 751 | 	return 0;
 752 | }
 753 | 
 754 | /*
 755 |  * Select and wake-up an idle CPU for a specific task from the user-space
 756 |  * scheduler.
 757 |  */
 758 | SEC("syscall")
 759 | int rs_select_cpu(struct task_cpu_arg *input)
 760 | {
 761 | 	struct task_struct *p;
 762 | 	int cpu = input->cpu;
 763 | 
 764 | 	p = bpf_task_from_pid(input->pid);
 765 | 	if (!p)
 766 | 		return -EINVAL;
 767 | 
 768 | 	/*
 769 | 	 * If the target CPU is the current one, treat it as idle when no
 770 | 	 * other tasks are queued.
 771 | 	 *
 772 | 	 * Since this function is invoked by the user-space scheduler,
 773 | 	 * which will release the CPU shortly, there is no need to migrate
 774 | 	 * the task elsewhere.
 775 | 	 */
 776 | 	if (cpu == bpf_get_smp_processor_id()) {
 777 | 		u64 nr_tasks = nr_running + nr_queued + nr_scheduled + 1;
 778 | 
 779 | 		if (nr_tasks < nr_online_cpus && !scx_bpf_dsq_nr_queued(cpu))
 780 | 			goto out_release;
 781 | 	}
 782 | 
 783 | 	bpf_rcu_read_lock();
 784 | 	/*
 785 | 	 * Kernels that don't provide scx_bpf_select_cpu_and() only allow
 786 | 	 * to use the built-in idle CPU selection policy only from
 787 | 	 * ops.select_cpu() and opt.enqueue(), return any idle CPU usable
 788 | 	 * by the task in this case.
 789 | 	 */
 790 | 	if (!bpf_ksym_exists(scx_bpf_select_cpu_and)) {
 791 | 		if (!scx_bpf_test_and_clear_cpu_idle(cpu))
 792 | 			cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 793 | 	} else {
 794 | 		/*
 795 | 		 * Set SCX_WAKE_TTWU, pretending to be a wakeup, to prioritize
 796 | 		 * faster CPU selection (we probably want to add an option to allow
 797 | 		 * the user-space scheduler to use this logic or not).
 798 | 		 */
 799 | 		cpu = pick_idle_cpu(p, cpu, SCX_WAKE_TTWU);
 800 | 	}
 801 | 	bpf_rcu_read_unlock();
 802 | 
 803 | out_release:
 804 | 	bpf_task_release(p);
 805 | 
 806 | 	return cpu;
 807 | }
 808 | 
 809 | /*
 810 |  * Fill @task with all the information that need to be sent to the user-space
 811 |  * scheduler.
 812 |  */
 813 | static void get_task_info(struct queued_task_ctx *task,
 814 | 			  const struct task_struct *p, s32 prev_cpu, u64 enq_flags)
 815 | {
 816 | 	struct task_ctx *tctx = try_lookup_task_ctx(p);
 817 | 
 818 | 	task->pid = p->pid;
 819 | 	task->cpu = prev_cpu;
 820 | 	task->nr_cpus_allowed = p->nr_cpus_allowed;
 821 | 	task->flags = enq_flags;
 822 | 	task->start_ts = tctx ? tctx->start_ts : 0;
 823 | 	task->stop_ts = tctx ? tctx->stop_ts : 0;
 824 | 	task->exec_runtime = tctx ? tctx->exec_runtime : 0;
 825 | 	task->weight = p->scx.weight;
 826 | 	task->vtime = p->scx.dsq_vtime;
 827 | 	task->tgid = p->tgid;
 828 | }
 829 | 
 830 | /*
 831 |  * User-space scheduler is congested: log that and increment congested counter.
 832 |  */
 833 | static void sched_congested(struct task_struct *p)
 834 | {
 835 | 	dbg_msg("congested: pid=%d (%s)", p->pid, p->comm);
 836 | 	__sync_fetch_and_add(&nr_sched_congested, 1);
 837 | }
 838 | 
 839 | /*
 840 |  * Return true if a task has been enqueued as a remote wakeup, false
 841 |  * otherwise.
 842 |  */
 843 | static bool is_queued_wakeup(const struct task_struct *p, u64 enq_flags)
 844 | {
 845 | 	return !__COMPAT_is_enq_cpu_selected(enq_flags) && !scx_bpf_task_running(p);
 846 | }
 847 | 
 848 | /*
 849 |  * Queue a task to the user-space scheduler.
 850 |  */
 851 | static void queue_task_to_userspace(struct task_struct *p, s32 prev_cpu, u64 enq_flags)
 852 | {
 853 | 	struct queued_task_ctx *task;
 854 | 
 855 | 	/*
 856 | 	 * Allocate a new entry in the ring buffer.
 857 | 	 *
 858 | 	 * If ring buffer is full, the user-space scheduler is congested,
 859 | 	 * so dispatch the task directly using the shared DSQ (the task
 860 | 	 * will be consumed by the first CPU available).
 861 | 	 */
 862 | 	task = bpf_ringbuf_reserve(&queued, sizeof(*task), 0);
 863 | 	if (!task) {
 864 | 		sched_congested(p);
 865 | 		scx_bpf_dsq_insert_vtime(p, SHARED_DSQ,
 866 | 					 SCX_SLICE_DFL, p->scx.dsq_vtime, enq_flags);
 867 | 		__sync_fetch_and_add(&nr_kernel_dispatches, 1);
 868 | 		return;
 869 | 	}
 870 | 
 871 | 	/*
 872 | 	 * Collect task information and store them in the ring buffer that
 873 | 	 * will be consumed by the user-space scheduler.
 874 | 	 */
 875 | 	dbg_msg("enqueue: pid=%d (%s)", p->pid, p->comm);
 876 | 	get_task_info(task, p, prev_cpu, enq_flags);
 877 | 	bpf_ringbuf_submit(task, 0);
 878 | 	__sync_fetch_and_add(&nr_queued, 1);
 879 | }
 880 | 
 881 | /*
 882 |  * Task @p becomes ready to run. We can dispatch the task directly here if the
 883 |  * user-space scheduler is not required, or enqueue it to be processed by the
 884 |  * scheduler.
 885 |  */
 886 | void BPF_STRUCT_OPS(goland_enqueue, struct task_struct *p, u64 enq_flags)
 887 | {
 888 | 	s32 prev_cpu = scx_bpf_task_cpu(p), cpu;
 889 | 	bool is_wakeup = is_queued_wakeup(p, enq_flags);
 890 | 
 891 | 	/*
 892 | 	 * Insert the user-space scheduler to its dedicated DSQ, it will be
 893 | 	 * consumed from ops.dispatch() only when there's any pending
 894 | 	 * scheduling action to do.
 895 | 	 */
 896 | 	if (is_belong_usersched_task(p)) {
 897 | 		if (usersched_has_pending_tasks()) {
 898 | 			/*
 899 | 			 * Try to find an idle CPU and dispatch directly to reduce latency.
 900 | 			 * This avoids the overhead of going through SCHED_DSQ.
 901 | 			 */
 902 | 			cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 903 | 			if (cpu >= 0) {
 904 | 				scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu,
 905 | 					default_slice, SCX_ENQ_LAST);
 906 | 				scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
 907 | 				return;
 908 | 			}
 909 | 		}
 910 | 		scx_bpf_dsq_insert(p, SCHED_DSQ, default_slice, SCX_ENQ_LAST);
 911 | 		return;
 912 | 	}
 913 | 
 914 | 	/*
 915 | 	 * Always dispatch per-CPU kthreads directly on their target CPU.
 916 | 	 *
 917 | 	 * This allows to prioritize critical kernel threads that may
 918 | 	 * potentially stall the entire system if they are blocked for too long
 919 | 	 * (i.e., ksoftirqd/N, rcuop/N, etc.).
 920 | 	 */
 921 | 	if (is_kthread(p) && p->nr_cpus_allowed == 1 && early_processing) {
 922 | 		cpu = scx_bpf_task_cpu(p);
 923 | 		scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(cpu),
 924 | 					 default_slice, p->scx.dsq_vtime, enq_flags);
 925 | 		__sync_fetch_and_add(&nr_kernel_dispatches, 1);
 926 | 		return;
 927 | 	}
 928 | 	if (is_kswapd(p) || is_khugepaged(p)) {
 929 | 		cpu = scx_bpf_task_cpu(p);
 930 | 		scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(cpu),
 931 | 					 default_slice, p->scx.dsq_vtime, enq_flags);
 932 | 		__sync_fetch_and_add(&nr_kernel_dispatches, 1);
 933 | 		return;
 934 | 	}
 935 | 
 936 | 	/*
 937 | 	 * Handle priority tasks with custom dispatch logic.
 938 | 	 */
 939 | 	u64* elem;
 940 | 	u64 slice;
 941 | 	u32 pid = p->pid;
 942 | 	s32 prio_cpu = -EBUSY;
 943 | 	u64 prio_enq_flags = SCX_ENQ_PREEMPT;
 944 | 	u32* cur_pid_val;
 945 | 	u32 cur_pid;
 946 | 
 947 | 	elem = bpf_map_lookup_elem(&priority_tasks, &pid);
 948 | 	if (elem) {
 949 | 		prio_cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 950 | 		if (prio_cpu == -EBUSY) {
 951 | 			prio_cpu = scx_bpf_task_cpu(p);
 952 | 		}
 953 | 		slice = *elem;
 954 | 		if (prio_cpu >= 0) {
 955 | 			cur_pid_val = bpf_map_lookup_elem(&running_task, &prio_cpu);
 956 | 			if (cur_pid_val) {
 957 | 				cur_pid = *cur_pid_val;
 958 | 				elem = bpf_map_lookup_elem(&priority_tasks, &cur_pid);
 959 | 				// If current running task is prioritized, do not preempt it (SCX_ENQ_HEAD).
 960 | 				// Otherwise, keep the flag equals to SCX_ENQ_PREEMPT
 961 | 				if (elem) {
 962 | 					prio_enq_flags = SCX_ENQ_HEAD;
 963 | 				}
 964 | 			}
 965 | 			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | prio_cpu,
 966 | 				slice, prio_enq_flags);
 967 | 			__sync_fetch_and_add(&nr_user_dispatches, 1);
 968 | 			scx_bpf_kick_cpu(prio_cpu, SCX_KICK_IDLE);
 969 | 			return;
 970 | 		}
 971 | 	}
 972 | 
 973 | 	/*
 974 | 	 * If @builtin_idle is enabled, give the task a chance to be
 975 | 	 * directly dispatched only on a wakeup and only if
 976 | 	 * ops.select_cpu() was skipped, otherwise the task is always
 977 | 	 * queued to the user-space scheduler.
 978 | 	 */
 979 | 	if (!(builtin_idle && is_wakeup)) {
 980 | 		queue_task_to_userspace(p, prev_cpu, enq_flags);
 981 | 		goto out_kick;
 982 | 	}
 983 | 
 984 | 	/*
 985 | 	 * Try to find an idle CPU in the system, if all CPUs are busy
 986 | 	 * queue the task to the user-space scheduler.
 987 | 	 */
 988 | 	cpu = pick_idle_cpu(p, prev_cpu, 0);
 989 | 	if (cpu < 0) {
 990 | 		queue_task_to_userspace(p, prev_cpu, enq_flags);
 991 | 		goto out_kick;
 992 | 	}
 993 | 
 994 | 	/*
 995 | 	 * Always force a CPU wakeup, so that the allocated CPU can be
 996 | 	 * released and go back idle even if the task isn't directly
 997 | 	 * dispatched.
 998 | 	 */
 999 | 	prev_cpu = cpu;
1000 | 	is_wakeup = true;
1001 | 
1002 | 	/*
1003 | 	 * Perform direct dispatch only if the SHARED_DSQ is empty and
1004 | 	 * the per-CPU DSQ is empty, otherwise we may risk to starve the
1005 | 	 * tasks waiting in the queues.
1006 | 	 */
1007 | 	if (!scx_bpf_dsq_nr_queued(SHARED_DSQ) && !scx_bpf_dsq_nr_queued(cpu_to_dsq(cpu))) {
1008 | 		/*
1009 | 		 * We can race with a dequeue here and the selected idle CPU
1010 | 		 * might be not valid anymore, if the task affinity has changed.
1011 | 		 *
1012 | 		 * In this case just wakeup the picked CPU and ignore the enqueue,
1013 | 		 * another enqueue event for the same task will be received later.
1014 | 		 */
1015 | 		if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
1016 | 			goto out_kick;
1017 | 
1018 | 		/*
1019 | 		 * Directly dispatch the task to selected idle CPU (queued wakeup).
1020 | 		 */
1021 | 		scx_bpf_dsq_insert_vtime(p, cpu_to_dsq(cpu),
1022 | 					 SCX_SLICE_DFL, p->scx.dsq_vtime, enq_flags);
1023 | 		__sync_fetch_and_add(&nr_kernel_dispatches, 1);
1024 | 		goto out_kick;
1025 | 	}
1026 | 
1027 | 	/*
1028 | 	 * If we can't directly dispatch, queue the task to user-space.
1029 | 	 */
1030 | 	queue_task_to_userspace(p, prev_cpu, enq_flags);
1031 | 
1032 | out_kick:
1033 | 	/*
1034 | 	 * Wakeup the task's CPU if needed.
1035 | 	 */
1036 | 	if (is_wakeup)
1037 | 		scx_bpf_kick_cpu(prev_cpu, SCX_KICK_IDLE);
1038 | }
1039 | 
1040 | /*
1041 |  * Handle a task dispatched from user-space, performing the actual low-level
1042 |  * BPF dispatch.
1043 |  */
1044 | static long handle_dispatched_task(struct bpf_dynptr *dynptr, void *context)
1045 | {
1046 | 	const struct dispatched_task_ctx *task;
1047 | 
1048 | 	task = bpf_dynptr_data(dynptr, 0, sizeof(*task));
1049 | 	if (!task)
1050 | 		return 0;
1051 | 
1052 | 	dispatch_task(task);
1053 | 
1054 | 	return !!scx_bpf_dispatch_nr_slots();
1055 | }
1056 | 
1057 | /*
1058 |  * Dispatch tasks that are ready to run.
1059 |  *
1060 |  * This function is called when a CPU's local DSQ is empty and ready to accept
1061 |  * new dispatched tasks.
1062 |  *
1063 |  * We may dispatch tasks also on other CPUs from here, if the scheduler decided
1064 |  * so (usually if other CPUs are idle we may want to send more tasks to their
1065 |  * local DSQ to optimize the scheduling pipeline).
1066 |  */
1067 | void BPF_STRUCT_OPS(goland_dispatch, s32 cpu, struct task_struct *prev)
1068 | {
1069 | 	/*
1070 | 	 * Dispatch the user-space scheduler if there's any pending action
1071 | 	 * to do. Keep consuming from SCHED_DSQ until it's empty.
1072 | 	 */
1073 | 	if (usersched_has_pending_tasks()) {
1074 | 		int consumed = 0;
1075 | 		while (scx_bpf_dsq_move_to_local(SCHED_DSQ) && consumed++ < MAX_USERSCHED_DISPATCH)
1076 | 			;
1077 | 		return;
1078 | 	}
1079 | 
1080 | 	/*
1081 | 	 * Consume all tasks from the @dispatched list and immediately
1082 | 	 * dispatch them on the target CPU decided by the user-space
1083 | 	 * scheduler.
1084 | 	 */
1085 | 	s32 ret = bpf_user_ringbuf_drain(&dispatched,
1086 | 				handle_dispatched_task, NULL, BPF_RB_NO_WAKEUP);
1087 | 	if (ret < 0)
1088 | 		dbg_msg("User ringbuf drain error: %d", ret);
1089 | 
1090 | 	/*
1091 | 	 * Consume a task from the per-CPU DSQ.
1092 | 	 */
1093 | 	if (scx_bpf_dsq_move_to_local(cpu_to_dsq(cpu)))
1094 | 		return;
1095 | 
1096 | 	/*
1097 | 	 * Consume a task from the shared DSQ.
1098 | 	 */
1099 | 	if (scx_bpf_dsq_move_to_local(SHARED_DSQ))
1100 | 		return;
1101 | 
1102 | 	/*
1103 | 	 * If the current task expired its time slice and no other task
1104 | 	 * wants to run, simply replenish its time slice and let it run for
1105 | 	 * another round on the same CPU.
1106 | 	 *
1107 | 	 * In case of the user-space scheduler task, replenish its time
1108 | 	 * slice only if there're still pending scheduling actions to do.
1109 | 	 */
1110 | 	if (prev && is_queued(prev) &&
1111 | 	    (!is_belong_usersched_task(prev) || usersched_has_pending_tasks()))
1112 | 		prev->scx.slice = default_slice;
1113 | }
1114 | 
1115 | void BPF_STRUCT_OPS(goland_runnable, struct task_struct *p, u64 enq_flags)
1116 | {
1117 | 	struct task_ctx *tctx;
1118 | 
1119 | 	if (is_belong_usersched_task(p))
1120 | 		return;
1121 | 
1122 | 	tctx = try_lookup_task_ctx(p);
1123 | 	if (!tctx)
1124 | 		return;
1125 | 
1126 | 	tctx->exec_runtime = 0;
1127 | }
1128 | 
1129 | /*
1130 |  * Task @p starts on its selected CPU (update CPU ownership map).
1131 |  */
1132 | void BPF_STRUCT_OPS(goland_running, struct task_struct *p)
1133 | {
1134 | 	s32 cpu = scx_bpf_task_cpu(p);
1135 | 	struct task_ctx *tctx;
1136 | 
1137 | 	u32 pid = p->pid;
1138 | 	bpf_map_update_elem(&running_task, &cpu, &pid, BPF_ANY);
1139 | 
1140 | 	if (is_usersched_task(p)) {
1141 | 		usersched_last_run_at = scx_bpf_now();
1142 | 		return;
1143 | 	}
1144 | 
1145 | 	dbg_msg("start: pid=%d (%s) cpu=%ld", p->pid, p->comm, cpu);
1146 | 
1147 | 	/*
1148 | 	 * Mark the CPU as busy by setting the pid as owner (ignoring the
1149 | 	 * user-space scheduler).
1150 | 	 */
1151 | 	__sync_fetch_and_add(&nr_running, 1);
1152 | 
1153 | 	tctx = try_lookup_task_ctx(p);
1154 | 	if (!tctx)
1155 | 		return;
1156 | 	tctx->start_ts = scx_bpf_now();
1157 | }
1158 | 
1159 | /*
1160 |  * Task @p stops running on its associated CPU (update CPU ownership map).
1161 |  */
1162 | void BPF_STRUCT_OPS(goland_stopping, struct task_struct *p, bool runnable)
1163 | {
1164 | 	u64 now = scx_bpf_now();
1165 | 	s32 cpu = scx_bpf_task_cpu(p);
1166 | 	struct task_ctx *tctx;
1167 | 
1168 | 	if (is_belong_usersched_task(p)) {
1169 | 		if (nr_scheduled + nr_queued == 0) {
1170 | 			test_and_clear_usersched_needed();
1171 | 		}
1172 | 		return;
1173 | 	}
1174 | 
1175 | 	dbg_msg("stop: pid=%d (%s) cpu=%ld", p->pid, p->comm, cpu);
1176 | 
1177 | 	__sync_fetch_and_sub(&nr_running, 1);
1178 | 
1179 | 	tctx = try_lookup_task_ctx(p);
1180 | 	if (!tctx)
1181 | 		return;
1182 | 	tctx->stop_ts = now;
1183 | 
1184 | 	/*
1185 | 	 * Update the partial execution time since last sleep.
1186 | 	 */
1187 | 	tctx->exec_runtime += now - tctx->start_ts;
1188 | }
1189 | 
1190 | /*
1191 |  * A task joins the sched_ext scheduler.
1192 |  */
1193 | void BPF_STRUCT_OPS(goland_enable, struct task_struct *p)
1194 | {
1195 | 	p->scx.dsq_vtime = 0;
1196 | 	p->scx.slice = SCX_SLICE_DFL;
1197 | }
1198 | 
1199 | /*
1200 |  * A new task @p is being created.
1201 |  *
1202 |  * Allocate and initialize all the internal structures for the task (this
1203 |  * function is allowed to block, so it can be used to preallocate memory).
1204 |  */
1205 | s32 BPF_STRUCT_OPS(goland_init_task, struct task_struct *p,
1206 | 		   struct scx_init_task_args *args)
1207 | {
1208 | 	struct task_ctx *tctx;
1209 | 	struct bpf_cpumask *cpumask;
1210 | 
1211 | 	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
1212 | 				    BPF_LOCAL_STORAGE_GET_F_CREATE);
1213 | 	if (!tctx)
1214 | 		return -ENOMEM;
1215 | 
1216 | 	/*
1217 | 	 * Create task's L2 cache cpumask.
1218 | 	 */
1219 | 	cpumask = bpf_cpumask_create();
1220 | 	if (!cpumask)
1221 | 		return -ENOMEM;
1222 | 	cpumask = bpf_kptr_xchg(&tctx->l2_cpumask, cpumask);
1223 | 	if (cpumask)
1224 | 		bpf_cpumask_release(cpumask);
1225 | 
1226 | 	/*
1227 | 	 * Create task's L3 cache cpumask.
1228 | 	 */
1229 | 	cpumask = bpf_cpumask_create();
1230 | 	if (!cpumask)
1231 | 		return -ENOMEM;
1232 | 	cpumask = bpf_kptr_xchg(&tctx->l3_cpumask, cpumask);
1233 | 	if (cpumask)
1234 | 		bpf_cpumask_release(cpumask);
1235 | 
1236 | 	return 0;
1237 | }
1238 | 
1239 | /*
1240 |  * Heartbeat scheduler timer callback.
1241 |  *
1242 |  * If the system is completely idle the sched-ext watchdog may incorrectly
1243 |  * detect that as a stall and automatically disable the scheduler. So, use this
1244 |  * timer to periodically wake-up the scheduler and avoid long inactivity.
1245 |  *
1246 |  * This can also help to prevent real "stalling" conditions in the scheduler.
1247 |  */
1248 | static int usersched_timer_fn(void *map, int *key, struct bpf_timer *timer)
1249 | {
1250 | 	struct task_struct *p;
1251 | 	int err = 0;
1252 | 
1253 | 	/*
1254 | 	 * Trigger the user-space scheduler if it has been inactive for
1255 | 	 * more than USERSCHED_TIMER_NS.
1256 | 	 */
1257 | 	if (time_delta(scx_bpf_now(), usersched_last_run_at) >= USERSCHED_TIMER_NS) {
1258 | 		bpf_rcu_read_lock();
1259 | 		p = bpf_task_from_pid(usersched_pid);
1260 | 		if (p) {
1261 | 			set_usersched_needed();
1262 | 			scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
1263 | 			bpf_task_release(p);
1264 | 		}
1265 | 		bpf_rcu_read_unlock();
1266 | 	}
1267 | 
1268 | 	/* Re-arm the timer */
1269 | 	err = bpf_timer_start(timer, USERSCHED_TIMER_NS, 0);
1270 | 	if (err)
1271 | 		scx_bpf_error("Failed to arm stats timer");
1272 | 
1273 | 	return 0;
1274 | }
1275 | 
1276 | /*
1277 |  * Initialize the heartbeat scheduler timer.
1278 |  */
1279 | static int usersched_timer_init(void)
1280 | {
1281 | 	struct bpf_timer *timer;
1282 | 	u32 key = 0;
1283 | 	int err;
1284 | 
1285 | 	timer = bpf_map_lookup_elem(&usersched_timer, &key);
1286 | 	if (!timer) {
1287 | 		scx_bpf_error("Failed to lookup scheduler timer");
1288 | 		return -ESRCH;
1289 | 	}
1290 | 	bpf_timer_init(timer, &usersched_timer, CLOCK_BOOTTIME);
1291 | 	bpf_timer_set_callback(timer, usersched_timer_fn);
1292 | 	err = bpf_timer_start(timer, USERSCHED_TIMER_NS, 0);
1293 | 	if (err)
1294 | 		scx_bpf_error("Failed to arm scheduler timer");
1295 | 
1296 | 	return err;
1297 | }
1298 | 
1299 | /*
1300 |  * Evaluate the amount of online CPUs.
1301 |  */
1302 | static s32 get_nr_online_cpus(void)
1303 | {
1304 | 	const struct cpumask *online_cpumask;
1305 | 	int i, cpus = 0;
1306 | 
1307 | 	online_cpumask = scx_bpf_get_online_cpumask();
1308 | 
1309 | 	bpf_for(i, 0, nr_cpu_ids) {
1310 | 		if (!bpf_cpumask_test_cpu(i, online_cpumask))
1311 | 			continue;
1312 | 		cpus++;
1313 | 	}
1314 | 
1315 | 	scx_bpf_put_cpumask(online_cpumask);
1316 | 
1317 | 	return cpus;
1318 | }
1319 | 
1320 | /*
1321 |  * Create a DSQ for each CPU available in the system and a global shared DSQ.
1322 |  *
1323 |  * All the tasks processed by the user-space scheduler can be dispatched either
1324 |  * to a specific CPU/DSQ or to the first CPU available (SHARED_DSQ).
1325 |  *
1326 |  * Custom DSQs are then consumed from the .dispatch() callback, that will
1327 |  * transfer all the enqueued tasks to the consuming CPU's local DSQ.
1328 |  */
1329 | static int dsq_init(void)
1330 | {
1331 | 	int err;
1332 | 	s32 cpu;
1333 | 
1334 | 	/* Initialize amount of online CPUs */
1335 | 	nr_online_cpus = get_nr_online_cpus();
1336 | 
1337 | 	/* Create per-CPU DSQs */
1338 | 	bpf_for(cpu, 0, nr_cpu_ids) {
1339 | 		err = scx_bpf_create_dsq(cpu_to_dsq(cpu), -1);
1340 | 		if (err) {
1341 | 			scx_bpf_error("failed to create pcpu DSQ %d: %d",
1342 | 				      cpu, err);
1343 | 			return err;
1344 | 		}
1345 | 	}
1346 | 
1347 | 	/* Create the global shared DSQ */
1348 | 	err = scx_bpf_create_dsq(SHARED_DSQ, -1);
1349 | 	if (err) {
1350 | 		scx_bpf_error("failed to create shared DSQ: %d", err);
1351 | 		return err;
1352 | 	}
1353 | 
1354 | 	/* Create the scheduler's DSQ */
1355 | 	err = scx_bpf_create_dsq(SCHED_DSQ, -1);
1356 | 	if (err) {
1357 | 		scx_bpf_error("failed to create scheduler DSQ: %d", err);
1358 | 		return err;
1359 | 	}
1360 | 
1361 | 	return 0;
1362 | }
1363 | 
1364 | static int init_cpumask(struct bpf_cpumask **cpumask)
1365 | {
1366 | 	struct bpf_cpumask *mask;
1367 | 	int err = 0;
1368 | 
1369 | 	/*
1370 | 	 * Do nothing if the mask is already initialized.
1371 | 	 */
1372 | 	mask = *cpumask;
1373 | 	if (mask)
1374 | 		return 0;
1375 | 	/*
1376 | 	 * Create the CPU mask.
1377 | 	 */
1378 | 	err = calloc_cpumask(cpumask);
1379 | 	if (!err)
1380 | 		mask = *cpumask;
1381 | 	if (!mask)
1382 | 		err = -ENOMEM;
1383 | 
1384 | 	return err;
1385 | }
1386 | 
1387 | SEC("syscall")
1388 | int enable_sibling_cpu(struct domain_arg *input)
1389 | {
1390 | 	struct cpu_ctx *cctx;
1391 | 	struct bpf_cpumask *mask, **pmask;
1392 | 	int err = 0;
1393 | 
1394 | 	cctx = try_lookup_cpu_ctx(input->cpu_id);
1395 | 	if (!cctx)
1396 | 		return -ENOENT;
1397 | 
1398 | 	/* Make sure the target CPU mask is initialized */
1399 | 	switch (input->lvl_id) {
1400 | 	case 2:
1401 | 		pmask = &cctx->l2_cpumask;
1402 | 		break;
1403 | 	case 3:
1404 | 		pmask = &cctx->l3_cpumask;
1405 | 		break;
1406 | 	default:
1407 | 		return -EINVAL;
1408 | 	}
1409 | 	err = init_cpumask(pmask);
1410 | 	if (err)
1411 | 		return err;
1412 | 
1413 | 	bpf_rcu_read_lock();
1414 | 	mask = *pmask;
1415 | 	if (mask)
1416 | 		bpf_cpumask_set_cpu(input->sibling_cpu_id, mask);
1417 | 	bpf_rcu_read_unlock();
1418 | 
1419 | 	return err;
1420 | }
1421 | 
1422 | /*
1423 |  * Initialize the scheduling class.
1424 |  */
1425 | s32 BPF_STRUCT_OPS_SLEEPABLE(goland_init)
1426 | {
1427 | 	int err;
1428 | 
1429 | 	/* Compile-time checks */
1430 | 	BUILD_BUG_ON((MAX_CPUS % 2));
1431 | 
1432 | 	/* Initialize maximum possible CPU number */
1433 | 	nr_cpu_ids = scx_bpf_nr_cpu_ids();
1434 | 
1435 | 	/* Initialize goland core */
1436 | 	err = dsq_init();
1437 | 	if (err)
1438 | 		return err;
1439 | 	err = usersched_timer_init();
1440 | 	if (err)
1441 | 		return err;
1442 | 
1443 | 	return 0;
1444 | }
1445 | 
1446 | /*
1447 |  * A task is being destroyed.
1448 |  *
1449 |  * Clean up the task from priority tasks map.
1450 |  */
1451 | void BPF_STRUCT_OPS(goland_exit_task, struct task_struct *p,
1452 | 		    struct scx_exit_task_args *args)
1453 | {
1454 | 	/* Remove task from priority tasks map */
1455 | 	update_priority_task_map(p->pid, 1, 0);
1456 | }
1457 | 
1458 | /*
1459 |  * Unregister the scheduling class.
1460 |  */
1461 | void BPF_STRUCT_OPS(goland_exit, struct scx_exit_info *ei)
1462 | {
1463 | 	UEI_RECORD(uei, ei);
1464 | }
1465 | 
1466 | /*
1467 |  * A CPU is about to change its idle state. If the CPU is going idle, ensure
1468 |  * that the user-space scheduler has a chance to run if there is any remaining
1469 |  * work to do.
1470 |  */
1471 | void BPF_STRUCT_OPS(goland_update_idle, s32 cpu, bool idle)
1472 | {
1473 | 	/*
1474 | 	 * Don't do anything if we exit from and idle state, a CPU owner will
1475 | 	 * be assigned in .running().
1476 | 	 */
1477 | 	if (!idle)
1478 | 		return;
1479 | 	/*
1480 | 	 * A CPU is now available, notify the user-space scheduler that tasks
1481 | 	 * can be dispatched, if there is at least one task waiting to be
1482 | 	 * scheduled, either queued (accounted in nr_queued) or scheduled
1483 | 	 * (accounted in nr_scheduled).
1484 | 	 *
1485 | 	 * NOTE: nr_queued is incremented by the BPF component, more exactly in
1486 | 	 * enqueue(), when a task is sent to the user-space scheduler, then
1487 | 	 * the scheduler drains the queued tasks (updating nr_queued) and adds
1488 | 	 * them to its internal data structures / state; at this point tasks
1489 | 	 * become "scheduled" and the user-space scheduler will take care of
1490 | 	 * updating nr_scheduled accordingly; lastly tasks will be dispatched
1491 | 	 * and the user-space scheduler will update nr_scheduled again.
1492 | 	 *
1493 | 	 * Checking both counters allows to determine if there is still some
1494 | 	 * pending work to do for the scheduler: new tasks have been queued
1495 | 	 * since last check, or there are still tasks "queued" or "scheduled"
1496 | 	 * since the previous user-space scheduler run. If the counters are
1497 | 	 * both zero it is pointless to wake-up the scheduler (even if a CPU
1498 | 	 * becomes idle), because there is nothing to do.
1499 | 	 *
1500 | 	 * Keep in mind that update_idle() doesn't run concurrently with the
1501 | 	 * user-space scheduler (that is single-threaded): this function is
1502 | 	 * naturally serialized with the user-space scheduler code, therefore
1503 | 	 * this check here is also safe from a concurrency perspective.
1504 | 	 */
1505 | 	if (nr_queued || nr_scheduled) {
1506 | 		/*
1507 | 		 * Notify that user-space scheduler should run and kick this CPU
1508 | 		 * to make it immediately ready to accept dispatched tasks.
1509 | 		 */
1510 | 		set_usersched_needed();
1511 | 		scx_bpf_kick_cpu(cpu, 0);
1512 | 	}
1513 | }
1514 | 
1515 | /*
1516 |  * Scheduling class declaration.
1517 |  */
1518 | SCX_OPS_DEFINE(goland,
1519 | 	       .select_cpu		= (void *)goland_select_cpu,
1520 | 	       .enqueue			= (void *)goland_enqueue,
1521 | 	       .dispatch		= (void *)goland_dispatch,
1522 | 		   .update_idle		= (void *)goland_update_idle,
1523 | 	       .runnable		= (void *)goland_runnable,
1524 | 	       .running			= (void *)goland_running,
1525 | 	       .stopping		= (void *)goland_stopping,
1526 | 	       .enable			= (void *)goland_enable,
1527 | 	       .init_task		= (void *)goland_init_task,
1528 | 	       .exit_task		= (void *)goland_exit_task,
1529 | 	       .init			= (void *)goland_init,
1530 | 	       .exit			= (void *)goland_exit,
1531 | 	       .timeout_ms		= 5000,
1532 | 	       .dispatch_max_batch	= MAX_DISPATCH_SLOT,
1533 | 		   .flags			= SCX_OPS_ENQ_LAST |
1534 | 					  SCX_OPS_KEEP_BUILTIN_IDLE,
1535 | 	       .name			= "goland");


--------------------------------------------------------------------------------