├── .gitignore ├── log.sh ├── stop.sh ├── img ├── scheduling.png └── round_robin.png ├── scheduler.sh ├── setup.sh ├── w2n.sh ├── start.sh ├── prio.bpf.c ├── fifo.bpf.c ├── fcfs.bpf.c ├── sched_ext.bpf.c ├── n2w.sh ├── lottery.bpf.c ├── lottery_prio.bpf.c ├── vtime.bpf.c └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | linux-* 3 | vmlinux.h 4 | *-log 5 | *.gz 6 | *.dsc -------------------------------------------------------------------------------- /log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Stream the kernel trace pipe (this is where bpf_printk output shows up) 3 | sudo cat /sys/kernel/debug/tracing/trace_pipe 4 | -------------------------------------------------------------------------------- /stop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Remove the scheduler (-f: succeed quietly if no scheduler is attached) 4 | sudo rm -f /sys/fs/bpf/sched_ext/sched_ops -------------------------------------------------------------------------------- /img/scheduling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parttimenerd/minimal-scheduler/HEAD/img/scheduling.png -------------------------------------------------------------------------------- /img/round_robin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parttimenerd/minimal-scheduler/HEAD/img/round_robin.png -------------------------------------------------------------------------------- /scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Prints the currently running custom scheduler 4 | 5 | cat /sys/kernel/sched_ext/root/ops 6 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Generate the 
vmlinux.h file to start using in the IDE 3 | 4 | bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h 5 | -------------------------------------------------------------------------------- /w2n.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute the closest nice value for a given weight 4 | # (n2w.sh is looked up next to this script, so w2n.sh works from any directory) 5 | 6 | # Function to compute nice value from cgroup weight using n2w.sh 7 | compute_nice_value() { 8 | local input_weight=$1 9 | local closest_nice=-20 10 | local min_diff=100000000 11 | 12 | for ((nice=-20; nice<=19; nice++)); do 13 | local weight=$("$(dirname "$0")/n2w.sh" "$nice") 14 | local diff=$(( input_weight - weight )) 15 | if (( diff < 0 )); then 16 | diff=$(( -diff )) 17 | fi 18 | if (( diff < min_diff )); then 19 | min_diff=$diff 20 | closest_nice=$nice 21 | fi 22 | done 23 | 24 | echo "$closest_nice" 25 | } 26 | 27 | # Main function 28 | if [[ -z "$1" ]]; then 29 | echo "Usage: $0 <cgroup_weight>" 30 | exit 1 31 | fi 32 | 33 | cgroup_weight=$1 34 | nice_value=$(compute_nice_value "$cgroup_weight") 35 | 36 | echo "$nice_value" 37 | -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Usage: ./start.sh scheduler_file.c 4 | 5 | # Builds the scheduler into a .c.o object file and attaches it, 6 | # use sched_ext.bpf.c as default 7 | C_FILE=${1:-sched_ext.bpf.c} 8 | 9 | # if --help is passed, print the usage 10 | 11 | if [ "$1" = "--help" ]; then 12 | echo "Usage: ./start.sh scheduler_file.c" 13 | # print all the available scheduler files in the directory 14 | echo "Available scheduler files:" 15 | ls -1 *.bpf.c 16 | exit 0 17 | fi 18 | 19 | # Generate the vmlinux.h BPF header if it is missing or empty, then compile the scheduler to BPF bytecode 20 | [ -s vmlinux.h ] || bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h || exit 1 21 | clang -target bpf -g -O2 -c "$C_FILE" -o "${C_FILE}.o" -I. || exit 1 22 | 23 | sudo ./stop.sh 24 | 25 | # Register the scheduler 26 | sudo bpftool struct_ops register "${C_FILE}.o" /sys/fs/bpf/sched_ext || { echo "Error attaching scheduler, consider calling 
stop.sh before"; exit 1; } 27 | 28 | # Print scheduler name, fails if it isn't registered properly 29 | cat /sys/kernel/sched_ext/root/ops || { echo "No sched-ext scheduler installed"; exit 1; } 30 | -------------------------------------------------------------------------------- /prio.bpf.c: -------------------------------------------------------------------------------- 1 | // Simple priority scheduler 2 | 3 | #include "vmlinux.h" 4 | #include <bpf/bpf_helpers.h> 5 | #include <bpf/bpf_tracing.h> 6 | 7 | // Define a shared Dispatch Queue (DSQ) ID 8 | #define SHARED_DSQ_ID 0 9 | 10 | #define BPF_STRUCT_OPS(name, args...) \ 11 | SEC("struct_ops/"#name) BPF_PROG(name, ##args) 12 | 13 | #define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ 14 | SEC("struct_ops.s/"#name) \ 15 | BPF_PROG(name, ##args) 16 | 17 | 18 | // Initialize the scheduler by creating a shared dispatch queue (DSQ) 19 | s32 BPF_STRUCT_OPS_SLEEPABLE(sched_init) { 20 | return scx_bpf_create_dsq(SHARED_DSQ_ID, -1); 21 | } 22 | 23 | // Enqueue a task to the shared DSQ, dispatching it with a time slice 24 | int BPF_STRUCT_OPS(sched_enqueue, struct task_struct *p, u64 enq_flags) { 25 | // The queue can be empty at this point, so clamp the divisor to at least one 26 | s32 nr_queued = scx_bpf_dsq_nr_queued(SHARED_DSQ_ID); 27 | u64 slice = 5000000u / (nr_queued > 0 ? nr_queued : 1); 28 | scx_bpf_dsq_insert_vtime(p, SHARED_DSQ_ID, slice, 100 - p->scx.weight, enq_flags); 29 | return 0; 30 | } 31 | 32 | // Dispatch a task from the shared DSQ to a CPU 33 | int BPF_STRUCT_OPS(sched_dispatch, s32 cpu, struct task_struct *prev) { 34 | scx_bpf_dsq_move_to_local(SHARED_DSQ_ID); 35 | return 0; 36 | } 37 | 38 | 39 | // Define the main scheduler operations structure (sched_ops) 40 | SEC(".struct_ops.link") 41 | struct sched_ext_ops sched_ops = { 42 | .enqueue = (void *)sched_enqueue, 43 | .dispatch = (void *)sched_dispatch, 44 | .init = (void *)sched_init, 45 | .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, 46 | .name = "priority_scheduler" 47 | }; 48 | 49 | // License for the BPF program 50 | char _license[] SEC("license") = "GPL"; 51 | 
-------------------------------------------------------------------------------- /fifo.bpf.c: -------------------------------------------------------------------------------- 1 | #include "vmlinux.h" 2 | #include <bpf/bpf_helpers.h> 3 | #include <bpf/bpf_tracing.h> 4 | 5 | // Define a shared Dispatch Queue (DSQ) ID 6 | #define SHARED_DSQ_ID 0 7 | 8 | #define BPF_STRUCT_OPS(name, args...) \ 9 | SEC("struct_ops/"#name) BPF_PROG(name, ##args) 10 | 11 | #define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ 12 | SEC("struct_ops.s/"#name) \ 13 | BPF_PROG(name, ##args) 14 | 15 | 16 | // Initialize the scheduler by creating a shared dispatch queue (DSQ) 17 | s32 BPF_STRUCT_OPS_SLEEPABLE(sched_init) { 18 | return scx_bpf_create_dsq(SHARED_DSQ_ID, -1); 19 | } 20 | 21 | // Enqueue a task to the shared DSQ, dispatching it with a time slice 22 | int BPF_STRUCT_OPS(sched_enqueue, struct task_struct *p, u64 enq_flags) { 23 | // Calculate the time slice for the task based on the number of tasks in the queue; 24 | // the queue can be empty at this point, so clamp the divisor to at least one 25 | s32 nr_queued = scx_bpf_dsq_nr_queued(SHARED_DSQ_ID); 26 | u64 slice = 5000000u / (nr_queued > 0 ? nr_queued : 1); 27 | scx_bpf_dsq_insert(p, SHARED_DSQ_ID, slice, enq_flags); 28 | return 0; 29 | } 30 | 31 | // Dispatch a task from the shared DSQ to a CPU 32 | int BPF_STRUCT_OPS(sched_dispatch, s32 cpu, struct task_struct *prev) { 33 | scx_bpf_dsq_move_to_local(SHARED_DSQ_ID); 34 | return 0; 35 | } 36 | 37 | 38 | // Define the main scheduler operations structure (sched_ops) 39 | SEC(".struct_ops.link") 40 | struct sched_ext_ops sched_ops = { 41 | .enqueue = (void *)sched_enqueue, 42 | .dispatch = (void *)sched_dispatch, 43 | .init = (void *)sched_init, 44 | .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, 45 | .name = "fifo_scheduler" 46 | }; 47 | 48 | // License for the BPF program 49 | char _license[] SEC("license") = "GPL"; 50 | -------------------------------------------------------------------------------- /fcfs.bpf.c: -------------------------------------------------------------------------------- 1 | // A FIFO scheduler that doesn't interrupt 
processes 2 | 3 | #include "vmlinux.h" 4 | #include <bpf/bpf_helpers.h> 5 | #include <bpf/bpf_tracing.h> 6 | 7 | // Define a shared Dispatch Queue (DSQ) ID 8 | #define SHARED_DSQ_ID 0 9 | 10 | #define BPF_STRUCT_OPS(name, args...) \ 11 | SEC("struct_ops/"#name) BPF_PROG(name, ##args) 12 | 13 | #define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ 14 | SEC("struct_ops.s/"#name) \ 15 | BPF_PROG(name, ##args) 16 | 17 | 18 | // Initialize the scheduler by creating a shared dispatch queue (DSQ) 19 | s32 BPF_STRUCT_OPS_SLEEPABLE(sched_init) { 20 | return scx_bpf_create_dsq(SHARED_DSQ_ID, -1); 21 | } 22 | 23 | // Enqueue a task to the shared DSQ, dispatching it with a time slice 24 | int BPF_STRUCT_OPS(sched_enqueue, struct task_struct *p, u64 enq_flags) { 25 | // Give the task an infinite time slice, so it only leaves the CPU 26 | // of its own accord 27 | scx_bpf_dsq_insert(p, SHARED_DSQ_ID, -1, enq_flags); 28 | return 0; 29 | } 30 | 31 | // Dispatch a task from the shared DSQ to a CPU 32 | int BPF_STRUCT_OPS(sched_dispatch, s32 cpu, struct task_struct *prev) { 33 | scx_bpf_dsq_move_to_local(SHARED_DSQ_ID); 34 | return 0; 35 | } 36 | 37 | 38 | 39 | 40 | // Define the main scheduler operations structure (sched_ops) 41 | SEC(".struct_ops.link") 42 | struct sched_ext_ops sched_ops = { 43 | .enqueue = (void *)sched_enqueue, 44 | .dispatch = (void *)sched_dispatch, 45 | .init = (void *)sched_init, 46 | .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, 47 | .name = "fcfs_scheduler" 48 | }; 49 | 50 | // License for the BPF program 51 | char _license[] SEC("license") = "GPL"; 52 | -------------------------------------------------------------------------------- /sched_ext.bpf.c: -------------------------------------------------------------------------------- 1 | #include "vmlinux.h" 2 | #include <bpf/bpf_helpers.h> 3 | #include <bpf/bpf_tracing.h> 4 | 5 | // Define a shared Dispatch Queue (DSQ) ID 6 | #define SHARED_DSQ_ID 0 7 | 8 | #define BPF_STRUCT_OPS(name, args...) 
\ 9 | SEC("struct_ops/"#name) BPF_PROG(name, ##args) 10 | 11 | #define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ 12 | SEC("struct_ops.s/"#name) \ 13 | BPF_PROG(name, ##args) 14 | 15 | 16 | // Initialize the scheduler by creating a shared dispatch queue (DSQ) 17 | s32 BPF_STRUCT_OPS_SLEEPABLE(sched_init) { 18 | return scx_bpf_create_dsq(SHARED_DSQ_ID, -1); 19 | } 20 | 21 | // Enqueue a task to the shared DSQ, dispatching it with a time slice 22 | int BPF_STRUCT_OPS(sched_enqueue, struct task_struct *p, u64 enq_flags) { 23 | // Calculate the time slice for the task based on the number of tasks in the queue; 24 | // the queue can be empty at this point, so clamp the divisor to at least one 25 | s32 nr_queued = scx_bpf_dsq_nr_queued(SHARED_DSQ_ID); 26 | u64 slice = 5000000u / (nr_queued > 0 ? nr_queued : 1); 27 | scx_bpf_dsq_insert(p, SHARED_DSQ_ID, slice, enq_flags); 28 | return 0; 29 | } 30 | 31 | // Dispatch a task from the shared DSQ to a CPU 32 | int BPF_STRUCT_OPS(sched_dispatch, s32 cpu, struct task_struct *prev) { 33 | scx_bpf_dsq_move_to_local(SHARED_DSQ_ID); 34 | return 0; 35 | } 36 | 37 | 38 | 39 | // Define the main scheduler operations structure (sched_ops) 40 | SEC(".struct_ops.link") 41 | struct sched_ext_ops sched_ops = { 42 | .enqueue = (void *)sched_enqueue, 43 | .dispatch = (void *)sched_dispatch, 44 | .init = (void *)sched_init, 45 | .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, 46 | .name = "minimal_scheduler" 47 | }; 48 | 49 | // License for the BPF program 50 | char _license[] SEC("license") = "GPL"; 51 | -------------------------------------------------------------------------------- /n2w.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute the weight in p->scx.weight for a given nice value 4 | 5 | # The weight is evaluated using this table https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/core.c#n10122 6 | # and this function https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/sched.h#n268 7 | # 8 | # In practice weight = 
sched_weight_to_cgroup(sched_prio_to_weight[nice]) 9 | # so for example if you do the math when nice = 0 you get weight = 100 which is the default weight 10 | # 11 | # – Andrea Righi 12 | 13 | # Nice value to weight mapping 14 | weights=( 15 | 88761 71755 56483 46273 36291 16 | 29154 23254 18705 14949 11916 17 | 9548 7620 6100 4904 3906 18 | 3121 2501 1991 1586 1277 19 | 1024 820 655 526 423 20 | 335 272 215 172 137 21 | 110 87 70 56 45 22 | 36 29 23 18 15 23 | ) 24 | 25 | CGROUP_WEIGHT_MIN=1 26 | CGROUP_WEIGHT_DFL=100 27 | CGROUP_WEIGHT_MAX=10000 28 | 29 | # Function to compute cgroup weight from sched weight 30 | compute_cgroup_weight() { 31 | local weight=$1 32 | local result=$(( (weight * CGROUP_WEIGHT_DFL + 512) / 1024 )) 33 | 34 | # Clamp the result 35 | if (( result < CGROUP_WEIGHT_MIN )); then 36 | result=$CGROUP_WEIGHT_MIN 37 | elif (( result > CGROUP_WEIGHT_MAX )); then 38 | result=$CGROUP_WEIGHT_MAX 39 | fi 40 | 41 | echo "$result" 42 | } 43 | 44 | # Main function 45 | if [[ -z "$1" ]]; then 46 | echo "Usage: $0 " 47 | exit 1 48 | fi 49 | 50 | nice_val=$1 51 | if (( nice_val < -20 || nice_val > 19 )); then 52 | echo "Error: Nice value must be between -20 and 19" 53 | exit 1 54 | fi 55 | 56 | # Convert nice value to index 57 | index=$(( nice_val + 20 )) 58 | weight=${weights[$index]} 59 | cgroup_weight=$(compute_cgroup_weight "$weight") 60 | 61 | echo "$cgroup_weight" 62 | -------------------------------------------------------------------------------- /lottery.bpf.c: -------------------------------------------------------------------------------- 1 | #include "vmlinux.h" 2 | #include 3 | #include 4 | 5 | #define BPF_FOR_EACH_ITER (&___it) 6 | 7 | // Define a shared Dispatch Queue (DSQ) ID 8 | #define SHARED_DSQ_ID 0 9 | 10 | #define BPF_STRUCT_OPS(name, args...) \ 11 | SEC("struct_ops/"#name) BPF_PROG(name, ##args) 12 | 13 | #define BPF_STRUCT_OPS_SLEEPABLE(name, args...) 
\ 14 | SEC("struct_ops.s/"#name) \ 15 | BPF_PROG(name, ##args) 16 | 17 | 18 | // Initialize the scheduler by creating a shared dispatch queue (DSQ) 19 | s32 BPF_STRUCT_OPS_SLEEPABLE(sched_init) { 20 | return scx_bpf_create_dsq(SHARED_DSQ_ID, -1); 21 | } 22 | 23 | // Enqueue a task to the shared DSQ, dispatching it with a time slice 24 | int BPF_STRUCT_OPS(sched_enqueue, struct task_struct *p, u64 enq_flags) { 25 | // Calculate the time slice for the task based on the number of tasks in the queue; 26 | // the queue can be empty at this point, so clamp the divisor to at least one 27 | s32 nr_queued = scx_bpf_dsq_nr_queued(SHARED_DSQ_ID); 28 | u64 slice = 5000000u / (nr_queued > 0 ? nr_queued : 1); 29 | scx_bpf_dsq_insert(p, SHARED_DSQ_ID, slice, enq_flags); 30 | return 0; 31 | } 32 | 33 | // Dispatch a randomly drawn task from the shared DSQ to a CPU 34 | int BPF_STRUCT_OPS(sched_dispatch, s32 cpu, struct task_struct *prev) { 35 | struct task_struct *p; 36 | // Nothing to draw from an empty queue; returning early also avoids a modulo by zero 37 | s32 nr_queued = scx_bpf_dsq_nr_queued(SHARED_DSQ_ID); 38 | if (nr_queued <= 0) { 39 | return 0; 40 | } 41 | // Draw a position in [0, nr_queued) and take the first task at or after it that passes the CPU mask check 42 | s32 random = bpf_get_prandom_u32() % nr_queued; 43 | bpf_for_each(scx_dsq, p, SHARED_DSQ_ID, 0) { 44 | random = random - 1; 45 | if (random < 0 && 46 | bpf_cpumask_test_cpu(cpu, p->cpus_ptr) && 47 | scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, 48 | SCX_DSQ_LOCAL_ON | cpu, SCX_ENQ_PREEMPT)) { 49 | bpf_printk("Dispatched task %s to CPU %d", p->comm, cpu); 50 | return 0; 51 | } 52 | }; 53 | return 0; 54 | } 55 | 56 | 57 | // Define the main scheduler operations structure (sched_ops) 58 | SEC(".struct_ops.link") 59 | struct sched_ext_ops sched_ops = { 60 | .enqueue = (void *)sched_enqueue, 61 | .dispatch = (void *)sched_dispatch, 62 | .init = (void *)sched_init, 63 | .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, 64 | .name = "lottery_scheduler" 65 | }; 66 | 67 | // License for the BPF program 68 | char _license[] SEC("license") = "GPL"; 69 | -------------------------------------------------------------------------------- /lottery_prio.bpf.c: -------------------------------------------------------------------------------- 1 | // A lottery scheduler that respects scheduling priorities 2 | 3 | #include "vmlinux.h" 4 | 
#include 5 | #include 6 | 7 | #define BPF_FOR_EACH_ITER (&___it) 8 | 9 | // Define a shared Dispatch Queue (DSQ) ID 10 | #define SHARED_DSQ_ID 0 11 | 12 | #define BPF_STRUCT_OPS(name, args...) \ 13 | SEC("struct_ops/"#name) BPF_PROG(name, ##args) 14 | 15 | #define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ 16 | SEC("struct_ops.s/"#name) \ 17 | BPF_PROG(name, ##args) 18 | 19 | 20 | // Initialize the scheduler by creating a shared dispatch queue (DSQ) 21 | s32 BPF_STRUCT_OPS_SLEEPABLE(sched_init) { 22 | return scx_bpf_create_dsq(SHARED_DSQ_ID, -1); 23 | } 24 | 25 | // Enqueue a task to the shared DSQ, dispatching it with a time slice 26 | int BPF_STRUCT_OPS(sched_enqueue, struct task_struct *p, u64 enq_flags) { 27 | // Calculate the time slice for the task based on the number of tasks in the queue 28 | u64 slice = 5000000u / scx_bpf_dsq_nr_queued(SHARED_DSQ_ID); 29 | scx_bpf_dsq_insert(p, SHARED_DSQ_ID, slice, enq_flags); 30 | return 0; 31 | } 32 | 33 | // Dispatch a task from the shared DSQ to a CPU 34 | int BPF_STRUCT_OPS(sched_dispatch, s32 cpu, struct task_struct *prev) { 35 | struct task_struct *p; 36 | 37 | u32 weight_sum = 0; 38 | bpf_for_each(scx_dsq, p, SHARED_DSQ_ID, 0) { 39 | weight_sum += p->scx.weight; 40 | }; 41 | 42 | s32 random = bpf_get_prandom_u32() % weight_sum; 43 | bpf_for_each(scx_dsq, p, SHARED_DSQ_ID, 0) { 44 | random = random - p->scx.weight; 45 | if (random <= 0 && 46 | bpf_cpumask_test_cpu(cpu, p->cpus_ptr) && 47 | scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, 48 | SCX_DSQ_LOCAL_ON | cpu, SCX_ENQ_PREEMPT)) { 49 | bpf_printk("Dispatched task %s to CPU %d, %d of %d", p->comm, cpu, p->scx.weight, weight_sum); 50 | return 0; 51 | } 52 | }; 53 | return 0; 54 | } 55 | 56 | 57 | 58 | 59 | 60 | // Define the main scheduler operations structure (sched_ops) 61 | SEC(".struct_ops.link") 62 | struct sched_ext_ops sched_ops = { 63 | .enqueue = (void *)sched_enqueue, 64 | .dispatch = (void *)sched_dispatch, 65 | .init = (void *)sched_init, 66 | .flags = 
SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, 67 | .name = "lottery_prio_scheduler" 68 | }; 69 | 70 | // License for the BPF program 71 | char _license[] SEC("license") = "GPL"; 72 | -------------------------------------------------------------------------------- /vtime.bpf.c: -------------------------------------------------------------------------------- 1 | // A vruntime scheduler based on https://github.com/sched-ext/scx/blob/main/scheds/c/scx_simple.bpf.c 2 | 3 | #include "vmlinux.h" 4 | #include 5 | #include 6 | 7 | // Define a shared Dispatch Queue (DSQ) ID 8 | #define SHARED_DSQ_ID 0 9 | 10 | #define BPF_STRUCT_OPS(name, args...) \ 11 | SEC("struct_ops/"#name) BPF_PROG(name, ##args) 12 | 13 | #define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ 14 | SEC("struct_ops.s/"#name) \ 15 | BPF_PROG(name, ##args) 16 | 17 | 18 | #define SLICE SCX_SLICE_DFL 19 | 20 | u64 vtime_now SEC(".data"); 21 | 22 | __always_inline bool isSmaller(u64 a, u64 b) { 23 | return (a - b) < 0; 24 | } 25 | 26 | // Initialize the scheduler by creating a shared dispatch queue (DSQ) 27 | s32 BPF_STRUCT_OPS_SLEEPABLE(sched_init) { 28 | return scx_bpf_create_dsq(SHARED_DSQ_ID, -1); 29 | } 30 | 31 | s32 BPF_STRUCT_OPS(sched_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { 32 | bool is_idle = 0; 33 | s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); 34 | if (is_idle) { 35 | scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SLICE, 0); 36 | } 37 | return cpu; 38 | } 39 | 40 | // Enqueue a task to the shared DSQ, dispatching it with a time slice 41 | int BPF_STRUCT_OPS(sched_enqueue, struct task_struct *p, u64 enq_flags) { 42 | u64 vtime = p->scx.dsq_vtime; 43 | /* 44 | * Limit the amount of budget that an idling task can accumulate 45 | * to one slice. 
46 | */ 47 | if ((isSmaller(vtime, vtime_now - SLICE))) { 48 | vtime = vtime_now - SLICE; 49 | } 50 | scx_bpf_dsq_insert_vtime(p, SHARED_DSQ_ID, SLICE, vtime, enq_flags); 51 | return 0; 52 | } 53 | 54 | // Dispatch a task from the shared DSQ to a CPU 55 | int BPF_STRUCT_OPS(sched_dispatch, s32 cpu, struct task_struct *prev) { 56 | scx_bpf_dsq_move_to_local(SHARED_DSQ_ID); 57 | return 0; 58 | } 59 | 60 | void BPF_STRUCT_OPS(sched_running, struct task_struct *p) 61 | { 62 | /* 63 | * Global vtime always progresses forward as tasks start executing. The 64 | * test and update can be performed concurrently from multiple CPUs and 65 | * thus racy. Any error should be contained and temporary. Let's just 66 | * live with it. 67 | */ 68 | if (isSmaller(vtime_now, p->scx.dsq_vtime)) 69 | vtime_now = p->scx.dsq_vtime; 70 | } 71 | 72 | void BPF_STRUCT_OPS(sched_stopping, struct task_struct *p, bool runnable) 73 | { 74 | /* 75 | * Scale the execution time by the inverse of the weight and charge. 76 | * 77 | * Note that the default yield implementation yields by setting 78 | * @p->scx.slice to zero and the following would treat the yielding task 79 | * as if it has consumed all its slice. If this penalizes yielding tasks 80 | * too much, determine the execution time by taking explicit timestamps 81 | * instead of depending on @p->scx.slice. 
82 | */ 83 | p->scx.dsq_vtime += (SLICE - p->scx.slice) * 100 / p->scx.weight; 84 | } 85 | 86 | void BPF_STRUCT_OPS(sched_enable, struct task_struct *p) 87 | { 88 | // New tasks get the current max vtime 89 | // This prevents old tasks from being starved by new tasks 90 | p->scx.dsq_vtime = vtime_now; 91 | } 92 | 93 | 94 | 95 | 96 | 97 | // Define the main scheduler operations structure (sched_ops) 98 | SEC(".struct_ops.link") 99 | struct sched_ext_ops sched_ops = { 100 | .enqueue = (void *)sched_enqueue, 101 | .dispatch = (void *)sched_dispatch, 102 | .init = (void *)sched_init, 103 | .select_cpu = (void *)sched_select_cpu, 104 | .running = (void *)sched_running, 105 | .stopping = (void *)sched_stopping, 106 | .enable = (void *)sched_enable, 107 | .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, 108 | .name = "vtime_scheduler" 109 | }; 110 | 111 | // License for the BPF program 112 | char _license[] SEC("license") = "GPL"; 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Minimal Scheduler 2 | ================= 3 | 4 | The following is a short tutorial on creating a minimal scheduler written with sched_ext in C. 5 | This scheduler uses a global scheduling queue from which 6 | every CPU gets its tasks to run for a time slice. 7 | The scheduling order is First-In-First-Out. So it essentially implements a [round-robin scheduler](https://en.wikipedia.org/wiki/Round-robin_scheduling): 8 | 9 | ![Round Robin Diagram](https://github.com/parttimenerd/minimal-scheduler/raw/main/img/round_robin.png) 10 | 11 | This short tutorial covers the basics; to learn more, visit the resources from the [scx wiki](https://github.com/sched-ext/scx/wiki). 12 | 13 | Requirements 14 | ------------ 15 | We need a 6.13 or newer kernel. 
16 | You can get a kernel patched with the scheduler extensions on Ubuntu 25.04 and older from 17 | [here](https://launchpad.net/~arighi/+archive/ubuntu/sched-ext-unstable), 18 | or you can use [CachyOS](https://cachyos.org/) and install a patched kernel from there. 19 | 20 | Furthermore, you also need 21 | - a recent `clang` for compilation 22 | - `bpftool` for attaching the scheduler 23 | 24 | On Ubuntu, for example, you can run: `apt install clang linux-tools-common linux-tools-$(uname -r)`. 25 | 26 | Nothing more is needed to run it, and you can find the code of this tutorial in the [minimal-scheduler](https://github.com/parttimenerd/minimal-scheduler) repository. I would advise just cloning it: 27 | 28 | ```sh 29 | git clone https://github.com/parttimenerd/minimal-scheduler 30 | cd minimal-scheduler 31 | ``` 32 | 33 | The scheduler lives in the `sched_ext.bpf.c` file, but before we take a look at it, 34 | I want to show you how you can use this scheduler: 35 | 36 | Usage 37 | ----- 38 | 39 | In short, we only need two steps: 40 | 41 | ```bash 42 | # build and start the scheduler 43 | ./start.sh 44 | 45 | # do something ... 46 | 47 | # stop the scheduler 48 | ./stop.sh 49 | ``` 50 | 51 | I'll show you later what's in these scripts, but first, let's get to the scheduler code: 52 | 53 | The Scheduler 54 | ------------- 55 | 56 | _We assume that you have some experience writing eBPF programs. If not, Liz Rice's book [Learning eBPF](https://cilium.isovalent.com/hubfs/Learning-eBPF%20-%20Full%20book.pdf) is a good starting point._ 57 | 58 | The scheduler code only depends on the Linux bpf kernel headers and sched-ext. 
So here is the code from [`sched_ext.bpf.c`](https://github.com/parttimenerd/minimal-scheduler/blob/main/sched_ext.bpf.c): 59 | 60 | ```c 61 | // This header is autogenerated, as explained later 62 | #include "vmlinux.h" 63 | // The following two headers come from the Linux headers library 64 | #include <bpf/bpf_helpers.h> 65 | #include <bpf/bpf_tracing.h> 66 | 67 | 68 | // Define a shared Dispatch Queue (DSQ) ID 69 | // We use this as our global scheduling queue 70 | #define SHARED_DSQ_ID 0 71 | 72 | // Two macros that make the later code more readable 73 | // and place the functions in the correct sections 74 | // of the binary file 75 | #define BPF_STRUCT_OPS(name, args...) \ 76 | SEC("struct_ops/"#name) BPF_PROG(name, ##args) 77 | 78 | #define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ 79 | SEC("struct_ops.s/"#name) \ 80 | BPF_PROG(name, ##args) 81 | 82 | // Initialize the scheduler by creating a shared dispatch queue (DSQ) 83 | s32 BPF_STRUCT_OPS_SLEEPABLE(sched_init) { 84 | // All scx_ functions come from vmlinux.h 85 | return scx_bpf_create_dsq(SHARED_DSQ_ID, -1); 86 | } 87 | 88 | // Enqueue a task to the shared DSQ that wants to run, 89 | // dispatching it with a time slice 90 | int BPF_STRUCT_OPS(sched_enqueue, struct task_struct *p, u64 enq_flags) { 91 | // Calculate the time slice for the task based on the number of tasks in the queue 92 | // This makes the system slightly more responsive than a basic round-robin 93 | // scheduler, which assigns every task the same time slice all the time 94 | // The base time slice is 5_000_000ns or 5ms 95 | u64 slice = 5000000u / scx_bpf_dsq_nr_queued(SHARED_DSQ_ID); 96 | scx_bpf_dsq_insert(p, SHARED_DSQ_ID, slice, enq_flags); 97 | return 0; 98 | } 99 | 100 | // Dispatch a task from the shared DSQ to a CPU, 101 | // whenever a CPU needs something to run, usually after it is finished 102 | // running the previous task for the allotted time slice 103 | int BPF_STRUCT_OPS(sched_dispatch, s32 cpu, struct task_struct *prev) { 104 | 
scx_bpf_dsq_move_to_local(SHARED_DSQ_ID); 105 | return 0; 106 | } 107 | 108 | // Define the main scheduler operations structure (sched_ops) 109 | SEC(".struct_ops.link") 110 | struct sched_ext_ops sched_ops = { 111 | .enqueue = (void *)sched_enqueue, 112 | .dispatch = (void *)sched_dispatch, 113 | .init = (void *)sched_init, 114 | // There are more functions available, but we'll focus 115 | // on the important ones for a minimal scheduler 116 | .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, 117 | // A name that will appear in 118 | // /sys/kernel/sched_ext/root/ops 119 | // after we attached the scheduler 120 | // The name has to be a valid C identifier 121 | .name = "minimal_scheduler" 122 | }; 123 | 124 | // All schedulers have to be GPLv2 licensed 125 | char _license[] SEC("license") = "GPL"; 126 | ``` 127 | 128 | We can visualize the interaction of all functions in the scheduler with the following diagram: 129 | 130 | ![Scheduler Diagram](https://github.com/parttimenerd/minimal-scheduler/raw/main/img/scheduling.png) 131 | 132 | Now, after you've seen the code, 133 | run the [`start.sh`](https://github.com/parttimenerd/minimal-scheduler/blob/main/start.sh) script to generate the `vmlinux.h` BPF header 134 | and then compile the scheduler code to BPF bytecode: 135 | 136 | ```bash 137 | bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h 138 | clang -target bpf -g -O2 -c sched_ext.bpf.c -o sched_ext.bpf.o -I. 139 | ``` 140 | 141 | And attach the scheduler using the `bpftool`: 142 | ```bash 143 | bpftool struct_ops register sched_ext.bpf.o /sys/fs/bpf/sched_ext 144 | ``` 145 | 146 | The custom scheduler is now the scheduler of this system. 
You can check this 147 | by accessing the `/sys/kernel/sched_ext/root/ops` file: 148 | 149 | ```bash 150 | > ./scheduler.sh 151 | # or 152 | > cat /sys/kernel/sched_ext/root/ops 153 | minimal_scheduler 154 | ``` 155 | 156 | And by checking `dmesg | tail`: 157 | 158 | ```bash 159 | > sudo dmesg | tail 160 | # ... 161 | [32490.366637] sched_ext: BPF scheduler "minimal_scheduler" enabled 162 | ``` 163 | 164 | Play around with your system and see how it behaves. 165 | 166 | There are several schedulers available, among them: 167 | 168 | - [`fifo.bpf.c`](https://github.com/parttimenerd/minimal-scheduler/blob/main/fifo.bpf.c) (also `sched_ext.bpf.c`) the aforementioned FIFO scheduler 169 | - [`lottery.bpf.c`](https://github.com/parttimenerd/minimal-scheduler/blob/main/lottery.bpf.c) a lottery scheduler as presented on [my blog](https://mostlynerdless.de/blog/2024/12/17/hello-ebpf-writing-a-lottery-scheduler-in-java-with-sched-ext-17/) 170 | 171 | Try them via `./start.sh [scheduler file name]`. 172 | 173 | If you're done, you can detach the scheduler by running the [`stop.sh`](https://github.com/parttimenerd/minimal-scheduler/blob/main/stop.sh) script 174 | using root privileges. This removes the `/sys/fs/bpf/sched_ext/sched_ops` file. 175 | 176 | Tasks for the Reader 177 | -------------------- 178 | 179 | Now that you know what a basic scheduler looks like, you can start modifying it. 180 | Here are a few suggestions: 181 | 182 | ### Vary the Time Slice 183 | How does your system behave when you increase or decrease the time slice? 184 | 185 | For example, try a time slice of 1s. Do you see any difference in how your cursor moves? 186 | Or try a small time slice of 100us and run a program that does some computation; 187 | do you see a difference in its performance? 188 | 189 | ### Use a fixed Time Slice 190 | How does your system behave when you change the scheduler to 191 | use the same time slice, ignoring the number of enqueued processes? 
192 | 193 | Try, for example, to create load on your system and see how it behaves. 194 | 195 | ### Limit the Used CPUs 196 | How does your system behave if the scheduler only schedules to specific CPUs? 197 | 198 | Try, for example, to make your system effectively single-core by only consuming tasks 199 | on CPU 0 in `sched_dispatch` (Hint: the `cpu` parameter is the CPU id). 200 | 201 | ### Create multiple Scheduling Queues 202 | How does your system behave with multiple scheduling queues for different 203 | CPUs and processes? 204 | 205 | Try, for example, to create two scheduling queues, with one scheduling queue only 206 | for a process with a specific id (Hint: `task_struct#tgid` gives you the process id) 207 | which is scheduled on half of your CPUs. 208 | 209 | Look into the [linux/sched.h](https://github.com/torvalds/linux/blob/ae90f6a6170d7a7a1aa4fddf664fbd093e3023bc/include/linux/sched.h#L778) header to learn more about `task_struct`. 210 | 211 | ### Use more BPF features 212 | If you already know how to write basic eBPF programs, 213 | use `bpf_trace_printk` and the `running` and `stopping` hooks. 214 | 215 | The `running` hook is called whenever a task starts running on a CPU; get the current CPU id via `bpf_get_smp_processor_id()`: 216 | 217 | ```c 218 | int BPF_STRUCT_OPS(sched_running, struct task_struct *p) { 219 | // ... 220 | return 0; // there are no void functions in eBPF 221 | } 222 | ``` 223 | 224 | The `stopping` hook is called whenever a task stops running: 225 | 226 | ```c 227 | int BPF_STRUCT_OPS(sched_stopping, struct task_struct *p, bool runnable) { 228 | // ... 229 | return 0; 230 | } 231 | ``` 232 | You can use this to create visualizations of the scheduling order. 
233 | 234 | ### Going Further 235 | 236 | To do even more, you can look at the collected resources in the [scx wiki](https://github.com/sched-ext/scx/wiki), 237 | especially the well-documented [sched-ext code in the kernel](https://github.com/torvalds/linux/blob/master/kernel/sched/ext.c). 238 | 239 | If you're interested in how to use it in Rust, take a look at [scx](https://github.com/sched-ext/scx) and [scx_rust_scheduler](https://github.com/arighi/scx_rust_scheduler), and for Java at [hello-ebpf](https://mostlynerdless.de/blog/2024/09/10/hello-ebpf-writing-a-linux-scheduler-in-java-with-ebpf-15/). 240 | 241 | License 242 | ------- 243 | GPLv2 244 | --------------------------------------------------------------------------------