├── 6.12.y
    └── linux-6.12-echo.patch
├── 6.7.y
    └── linux-6.7-echo.patch
├── 6.8.y
    ├── lat_sensitive.patch
    ├── linux-6.8-echo.patch
    └── powersave.patch
└── README.md

/6.8.y/lat_sensitive.patch:
--------------------------------------------------------------------------------
1 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c
2 | index 49df55bb0ba7..ff6e2d0a1107 100644
3 | --- a/kernel/sched/core.c
4 | +++ b/kernel/sched/core.c
5 | @@ -3347,6 +3347,24 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
6 | WARN_ON_ONCE(ret);
7 | }
8 |
9 | +#ifdef CONFIG_ECHO_SCHED
10 | +inline void inc_nr_lat_sensitive(unsigned int cpu, struct task_struct *p)
11 | +{
12 | + if (per_cpu(nr_lat_sensitive, cpu) == 0 || per_cpu(nr_lat_sensitive, cpu) == -10)
13 | + per_cpu(nr_lat_sensitive, cpu) = HZ / 78;
14 | +}
15 | +
16 | +inline void dec_nr_lat_sensitive(unsigned int cpu)
17 | +{
18 | + if (per_cpu(nr_lat_sensitive, cpu) > -10) {
19 | + per_cpu(nr_lat_sensitive, cpu)--;
20 | +
21 | + if (per_cpu(nr_lat_sensitive, cpu) == 0)
22 | + per_cpu(nr_lat_sensitive, cpu) = -1;
23 | + }
24 | +}
25 | +#endif
26 | +
27 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
28 | {
29 | #ifdef CONFIG_SCHED_DEBUG
30 | @@ -5704,6 +5722,13 @@ void scheduler_tick(void)
31 | if (curr->flags & PF_WQ_WORKER)
32 | wq_worker_tick(curr);
33 |
34 | +#ifdef CONFIG_ECHO_SCHED
35 | + if (idle_cpu(cpu))
36 | + inc_nr_lat_sensitive(cpu, NULL);
37 | + else
38 | + dec_nr_lat_sensitive(cpu);
39 | +#endif
40 | +
41 | #ifdef CONFIG_SMP
42 | rq->idle_balance = idle_cpu(cpu);
43 | trigger_load_balance(rq);
44 | @@ -9912,6 +9937,10 @@ LIST_HEAD(task_groups);
45 | static struct kmem_cache *task_group_cache __ro_after_init;
46 | #endif
47 |
48 | +#ifdef CONFIG_ECHO_SCHED
49 | +DEFINE_PER_CPU(int, nr_lat_sensitive);
50 | +#endif
51 | +
52 | void __init sched_init(void)
53 | {
54 | unsigned long ptr = 0;
55 | @@ -10050,6 +10079,10 @@ void __init sched_init(void)
56 | hrtick_rq_init(rq);
57 | atomic_set(&rq->nr_iowait, 0);
58 |
59 | +#ifdef CONFIG_ECHO_SCHED
60 | + per_cpu(nr_lat_sensitive, i) = 0;
61 | +#endif
62 | +
63 | #ifdef CONFIG_SCHED_CORE
64 | rq->core = rq;
65 | rq->core_pick = NULL;
66 | diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
67 | index 95e7f83b5ab8..dbfc307103e5 100644
68 | --- a/kernel/sched/idle.c
69 | +++ b/kernel/sched/idle.c
70 | @@ -237,7 +237,9 @@ static void cpuidle_idle_call(void)
71 | static void do_idle(void)
72 | {
73 | int cpu = smp_processor_id();
74 | -
75 | +#ifdef CONFIG_ECHO_SCHED
76 | + int pm_disabled = per_cpu(nr_lat_sensitive, cpu);
77 | +#endif
78 | /*
79 | * Check if we need to update blocked load
80 | */
81 | @@ -305,13 +307,22 @@ static void do_idle(void)
82 | * broadcast device expired for us, we don't want to go deep
83 | * idle as we know that the IPI is going to arrive right away.
84 | */ 85 | - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { 86 | + if ( 87 | +#ifdef CONFIG_ECHO_SCHED 88 | + pm_disabled > 0 || 89 | +#endif 90 | + cpu_idle_force_poll || tick_check_broadcast_expired()) { 91 | tick_nohz_idle_restart_tick(); 92 | cpu_idle_poll(); 93 | + dec_nr_lat_sensitive(cpu); 94 | } else { 95 | cpuidle_idle_call(); 96 | } 97 | 98 | +#ifdef CONFIG_ECHO_SCHED 99 | + if (pm_disabled < 0) 100 | + dec_nr_lat_sensitive(cpu); 101 | +#endif 102 | arch_cpu_idle_exit(); 103 | } 104 | 105 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 106 | index e27be055ca86..56b5c0114613 100644 107 | --- a/kernel/sched/sched.h 108 | +++ b/kernel/sched/sched.h 109 | @@ -1901,7 +1901,9 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); 110 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); 111 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); 112 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); 113 | - 114 | +#ifdef CONFIG_ECHO_SCHED 115 | +DECLARE_PER_CPU(int, nr_lat_sensitive); 116 | +#endif 117 | extern struct static_key_false sched_asym_cpucapacity; 118 | extern struct static_key_false sched_cluster_active; 119 | 120 | @@ -2559,6 +2561,11 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); 121 | #define SCHED_NR_MIGRATE_BREAK 32 122 | #endif 123 | 124 | +#ifdef CONFIG_ECHO_SCHED 125 | +extern inline void inc_nr_lat_sensitive(unsigned int cpu, struct task_struct *p); 126 | +extern inline void dec_nr_lat_sensitive(unsigned int cpu); 127 | +#endif 128 | + 129 | extern const_debug unsigned int sysctl_sched_nr_migrate; 130 | extern const_debug unsigned int sysctl_sched_migration_cost; 131 | 132 | -------------------------------------------------------------------------------- /6.8.y/linux-6.8-echo.patch: -------------------------------------------------------------------------------- 1 | diff --git a/include/linux/sched.h b/include/linux/sched.h 2 | index ffe8f618ab86..947d94be2437 100644 3 | --- a/include/linux/sched.h 4 | +++ b/include/linux/sched.h 5 | @@ -533,6 +533,19 @@ struct sched_statistics { 6 | #endif /* CONFIG_SCHEDSTATS */ 7 | } ____cacheline_aligned; 8 | 9 | +#ifdef CONFIG_ECHO_SCHED 10 | +struct bs_node { 11 | + struct bs_node* next; 12 | + u64 c_vrt_start; 13 | + u64 r_vrt_start; 14 | + u64 vburst; 15 | +#ifdef CONFIG_SCHED_DEBUG 16 | + u64 prev_vburst; 17 | +#endif 18 | + u64 est; 19 | +}; 20 | +#endif 21 | + 22 | struct sched_entity { 23 | /* For load-balancing: */ 24 | struct load_weight load; 25 | @@ -542,14 +555,18 @@ struct sched_entity { 26 | 27 | struct list_head group_node; 28 | unsigned int on_rq; 29 | - 30 | +#ifdef CONFIG_ECHO_SCHED 31 | + struct bs_node bs_node; 32 | +#endif 33 | u64 exec_start; 34 | u64 sum_exec_runtime; 35 | u64 prev_sum_exec_runtime; 36 | u64 vruntime; 37 | s64 vlag; 38 | u64 slice; 39 | - 40 | +#ifdef CONFIG_ECHO_SCHED 41 | + bool yielded; 42 | +#endif 43 | u64 nr_migrations; 44 | 45 | #ifdef CONFIG_FAIR_GROUP_SCHED 46 | diff --git a/init/Kconfig b/init/Kconfig 47 | index bee58f7468c3..933ec5c9a941 100644 48 | --- a/init/Kconfig 49 | +++ b/init/Kconfig 50 | @@ -130,6 +130,12 @@ config THREAD_INFO_IN_TASK 51 | One subtle change that will be needed is to use try_get_task_stack() 52 | and put_task_stack() in save_thread_stack_tsk() and get_wchan(). 
53 | 54 | +config ECHO_SCHED 55 | + bool "ECHO CPU Scheduler" 56 | + default y 57 | + help 58 | + https://github.com/hamadmarri/ECHO-CPU-Scheduler 59 | + 60 | menu "General setup" 61 | 62 | config BROKEN 63 | @@ -1008,11 +1014,12 @@ menuconfig CGROUP_SCHED 64 | if CGROUP_SCHED 65 | config FAIR_GROUP_SCHED 66 | bool "Group scheduling for SCHED_OTHER" 67 | - depends on CGROUP_SCHED 68 | - default CGROUP_SCHED 69 | + depends on CGROUP_SCHED && !ECHO_SCHED 70 | + default n 71 | 72 | config CFS_BANDWIDTH 73 | bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" 74 | + depends on !ECHO_SCHED 75 | depends on FAIR_GROUP_SCHED 76 | default n 77 | help 78 | @@ -1281,9 +1288,11 @@ config CHECKPOINT_RESTORE 79 | 80 | config SCHED_AUTOGROUP 81 | bool "Automatic process group scheduling" 82 | + depends on !ECHO_SCHED 83 | select CGROUPS 84 | select CGROUP_SCHED 85 | select FAIR_GROUP_SCHED 86 | + default n 87 | help 88 | This option optimizes the scheduler for common desktop workloads by 89 | automatically creating and populating task groups. This separation 90 | diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz 91 | index 38ef6d06888e..80861d9044e3 100644 92 | --- a/kernel/Kconfig.hz 93 | +++ b/kernel/Kconfig.hz 94 | @@ -5,7 +5,7 @@ 95 | 96 | choice 97 | prompt "Timer frequency" 98 | - default HZ_250 99 | + default HZ_625 100 | help 101 | Allows the configuration of the timer frequency. It is customary 102 | to have the timer interrupt run at 1000 Hz but 100 Hz may be more 103 | @@ -40,6 +40,13 @@ choice 104 | on SMP and NUMA systems and exactly dividing by both PAL and 105 | NTSC frame rates for video and multimedia work. 106 | 107 | + config HZ_625 108 | + bool "625 HZ" 109 | + help 110 | + The default HZ for ECHO is 625HZ - ticks every 1.6ms. 111 | + No need to increase it since the HighRes clock handles 112 | + the task preemption in 105us max. 113 | + 114 | config HZ_1000 115 | bool "1000 HZ" 116 | help 117 | @@ -53,6 +60,7 @@ config HZ 118 | default 100 if HZ_100 119 | default 250 if HZ_250 120 | default 300 if HZ_300 121 | + default 625 if HZ_625 122 | default 1000 if HZ_1000 123 | 124 | config SCHED_HRTICK 125 | diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt 126 | index c2f1fd95a821..d54bb52ccccc 100644 127 | --- a/kernel/Kconfig.preempt 128 | +++ b/kernel/Kconfig.preempt 129 | @@ -117,7 +117,8 @@ config PREEMPT_DYNAMIC 130 | 131 | config SCHED_CORE 132 | bool "Core Scheduling for SMT" 133 | - depends on SCHED_SMT 134 | + depends on SCHED_SMT && !ECHO_SCHED 135 | + default n 136 | help 137 | This option permits Core Scheduling, a means of coordinated task 138 | selection across SMT siblings. 
When enabled -- see 139 | diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile 140 | index 976092b7bd45..f78ee1bffe16 100644 141 | --- a/kernel/sched/Makefile 142 | +++ b/kernel/sched/Makefile 143 | @@ -29,6 +29,10 @@ endif 144 | # build parallelizes well and finishes roughly at once: 145 | # 146 | obj-y += core.o 147 | +ifeq ($(CONFIG_ECHO_SCHED),y) 148 | +obj-y += bs.o 149 | +else 150 | obj-y += fair.o 151 | +endif 152 | obj-y += build_policy.o 153 | obj-y += build_utility.o 154 | diff --git a/kernel/sched/balancer.h b/kernel/sched/balancer.h 155 | new file mode 100644 156 | index 000000000000..852faad1fc1d 157 | --- /dev/null 158 | +++ b/kernel/sched/balancer.h 159 | @@ -0,0 +1,881 @@ 160 | +#ifdef CONFIG_SMP 161 | +static int 162 | +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 163 | +{ 164 | + if (rq->nr_running) 165 | + return 1; 166 | + 167 | + return newidle_balance(rq, rf) != 0; 168 | +} 169 | + 170 | +static int 171 | +wake_affine_idle(int this_cpu, int prev_cpu, int sync) 172 | +{ 173 | + /* 174 | + * If this_cpu is idle, it implies the wakeup is from interrupt 175 | + * context. Only allow the move if cache is shared. Otherwise an 176 | + * interrupt intensive workload could force all tasks onto one 177 | + * node depending on the IO topology or IRQ affinity settings. 178 | + * 179 | + * If the prev_cpu is idle and cache affine then avoid a migration. 180 | + * There is no guarantee that the cache hot data from an interrupt 181 | + * is more important than cache hot data on the prev_cpu and from 182 | + * a cpufreq perspective, it's better to have higher utilisation 183 | + * on one CPU. 184 | + */ 185 | + if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) 186 | + return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; 187 | + 188 | + if (sync && cpu_rq(this_cpu)->nr_running == 1) 189 | + return this_cpu; 190 | + 191 | + if (available_idle_cpu(prev_cpu)) 192 | + return prev_cpu; 193 | + 194 | + return nr_cpumask_bits; 195 | +} 196 | + 197 | +static int 198 | +wake_affine(struct task_struct *p, int this_cpu, int prev_cpu, int sync) 199 | +{ 200 | + int target = nr_cpumask_bits; 201 | + 202 | + target = wake_affine_idle(this_cpu, prev_cpu, sync); 203 | + 204 | + if (target == nr_cpumask_bits) 205 | + return prev_cpu; 206 | + 207 | + return target; 208 | +} 209 | + 210 | +static int wake_wide(struct task_struct *p) 211 | +{ 212 | + unsigned int master = current->wakee_flips; 213 | + unsigned int slave = p->wakee_flips; 214 | + int factor = __this_cpu_read(sd_llc_size); 215 | + 216 | + if (master < slave) 217 | + swap(master, slave); 218 | + if (slave < factor || master < slave * factor) 219 | + return 0; 220 | + return 1; 221 | +} 222 | + 223 | +static void record_wakee(struct task_struct *p) 224 | +{ 225 | + /* 226 | + * Only decay a single time; tasks that have less then 1 wakeup per 227 | + * jiffy will not have built up many flips. 
228 | + */ 229 | + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { 230 | + current->wakee_flips >>= 1; 231 | + current->wakee_flip_decay_ts = jiffies; 232 | + } 233 | + 234 | + if (current->last_wakee != p) { 235 | + current->last_wakee = p; 236 | + current->wakee_flips++; 237 | + } 238 | +} 239 | + 240 | +static int 241 | +select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) 242 | +{ 243 | + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); 244 | + int cpu = smp_processor_id(); 245 | + int new_cpu = prev_cpu; 246 | + int want_affine = 0; 247 | + struct rq *rq = cpu_rq(prev_cpu); 248 | + unsigned int min_prev = rq->nr_running; 249 | + unsigned int min = rq->nr_running; 250 | + int this_cpu = smp_processor_id(); 251 | + 252 | + if (wake_flags & WF_TTWU) { 253 | + record_wakee(p); 254 | + 255 | + if ((wake_flags & WF_CURRENT_CPU) && 256 | + cpumask_test_cpu(cpu, p->cpus_ptr)) 257 | + return cpu; 258 | + 259 | + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); 260 | + } 261 | + 262 | + for_each_cpu_wrap(cpu, cpu_online_mask, this_cpu) { 263 | + if (unlikely(!cpumask_test_cpu(cpu, p->cpus_ptr))) 264 | + continue; 265 | + 266 | + if (want_affine) { 267 | + if (cpu != prev_cpu) 268 | + new_cpu = wake_affine(p, cpu, prev_cpu, sync); 269 | + 270 | + return new_cpu; 271 | + } 272 | + 273 | + if (cpu_rq(cpu)->nr_running < min) { 274 | + new_cpu = cpu; 275 | + min = cpu_rq(cpu)->nr_running; 276 | + } 277 | + } 278 | + 279 | + if (min == min_prev) 280 | + return prev_cpu; 281 | + 282 | + return new_cpu; 283 | +} 284 | + 285 | +#ifdef CONFIG_NO_HZ_COMMON 286 | +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) 287 | +{ 288 | + if (cfs_rq->avg.load_avg) 289 | + return true; 290 | + 291 | + if (cfs_rq->avg.util_avg) 292 | + return true; 293 | + 294 | + return false; 295 | +} 296 | + 297 | +static inline bool others_have_blocked(struct rq *rq) 298 | +{ 299 | + if (READ_ONCE(rq->avg_rt.util_avg)) 300 | + return true; 301 | + 302 | + if (READ_ONCE(rq->avg_dl.util_avg)) 303 | + return true; 304 | + 305 | + if (thermal_load_avg(rq)) 306 | + return true; 307 | + 308 | +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ 309 | + if (READ_ONCE(rq->avg_irq.util_avg)) 310 | + return true; 311 | +#endif 312 | + 313 | + return false; 314 | +} 315 | + 316 | +static inline void update_blocked_load_tick(struct rq *rq) 317 | +{ 318 | + WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); 319 | +} 320 | + 321 | +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) 322 | +{ 323 | + if (!has_blocked) 324 | + rq->has_blocked_load = 0; 325 | +} 326 | +#else 327 | +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } 328 | +static inline bool others_have_blocked(struct rq *rq) { return false; } 329 | +static inline void update_blocked_load_tick(struct rq *rq) {} 330 | +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} 331 | +#endif 332 | + 333 | +static inline int 334 | +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 335 | +{ 336 | + unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; 337 | + struct sched_avg *sa = &cfs_rq->avg; 338 | + int decayed = 0; 339 | + 340 | + if (cfs_rq->removed.nr) { 341 | + unsigned long r; 342 | + u32 divider = get_pelt_divider(&cfs_rq->avg); 343 | + 344 | + raw_spin_lock(&cfs_rq->removed.lock); 345 | + swap(cfs_rq->removed.util_avg, removed_util); 346 | + swap(cfs_rq->removed.load_avg, removed_load); 347 | + 
swap(cfs_rq->removed.runnable_avg, removed_runnable); 348 | + cfs_rq->removed.nr = 0; 349 | + raw_spin_unlock(&cfs_rq->removed.lock); 350 | + 351 | + r = removed_load; 352 | + sub_positive(&sa->load_avg, r); 353 | + sub_positive(&sa->load_sum, r * divider); 354 | + /* See sa->util_sum below */ 355 | + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); 356 | + 357 | + r = removed_util; 358 | + sub_positive(&sa->util_avg, r); 359 | + sub_positive(&sa->util_sum, r * divider); 360 | + /* 361 | + * Because of rounding, se->util_sum might ends up being +1 more than 362 | + * cfs->util_sum. Although this is not a problem by itself, detaching 363 | + * a lot of tasks with the rounding problem between 2 updates of 364 | + * util_avg (~1ms) can make cfs->util_sum becoming null whereas 365 | + * cfs_util_avg is not. 366 | + * Check that util_sum is still above its lower bound for the new 367 | + * util_avg. Given that period_contrib might have moved since the last 368 | + * sync, we are only sure that util_sum must be above or equal to 369 | + * util_avg * minimum possible divider 370 | + */ 371 | + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); 372 | + 373 | + r = removed_runnable; 374 | + sub_positive(&sa->runnable_avg, r); 375 | + sub_positive(&sa->runnable_sum, r * divider); 376 | + /* See sa->util_sum above */ 377 | + sa->runnable_sum = max_t(u32, sa->runnable_sum, 378 | + sa->runnable_avg * PELT_MIN_DIVIDER); 379 | + 380 | + decayed = 1; 381 | + } 382 | + 383 | + decayed |= __update_load_avg_cfs_rq(now, cfs_rq); 384 | + u64_u32_store_copy(sa->last_update_time, 385 | + cfs_rq->last_update_time_copy, 386 | + sa->last_update_time); 387 | + return decayed; 388 | +} 389 | + 390 | +static bool __update_blocked_fair(struct rq *rq, bool *done) 391 | +{ 392 | + struct cfs_rq *cfs_rq = &rq->cfs; 393 | + bool decayed; 394 | + 395 | + decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); 396 | + if (cfs_rq_has_blocked(cfs_rq)) 397 | + *done = false; 398 | + 399 | + return decayed; 400 | +} 401 | + 402 | +static bool __update_blocked_others(struct rq *rq, bool *done) 403 | +{ 404 | + const struct sched_class *curr_class; 405 | + u64 now = rq_clock_pelt(rq); 406 | + unsigned long thermal_pressure; 407 | + bool decayed; 408 | + 409 | + /* 410 | + * update_load_avg() can call cpufreq_update_util(). Make sure that RT, 411 | + * DL and IRQ signals have been updated before updating CFS. 
412 | + */ 413 | + curr_class = rq->curr->sched_class; 414 | + 415 | + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); 416 | + 417 | + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | 418 | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | 419 | + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | 420 | + update_irq_load_avg(rq, 0); 421 | + 422 | + if (others_have_blocked(rq)) 423 | + *done = false; 424 | + 425 | + return decayed; 426 | +} 427 | + 428 | +static void update_blocked_averages(int cpu) 429 | +{ 430 | + bool decayed = false, done = true; 431 | + struct rq *rq = cpu_rq(cpu); 432 | + struct rq_flags rf; 433 | + 434 | + rq_lock_irqsave(rq, &rf); 435 | + update_blocked_load_tick(rq); 436 | + update_rq_clock(rq); 437 | + 438 | + decayed |= __update_blocked_others(rq, &done); 439 | + decayed |= __update_blocked_fair(rq, &done); 440 | + 441 | + update_blocked_load_status(rq, !done); 442 | + if (decayed) 443 | + cpufreq_update_util(rq, 0); 444 | + rq_unlock_irqrestore(rq, &rf); 445 | +} 446 | + 447 | +static void pull_from(struct task_struct *p, struct lb_env *env) 448 | +{ 449 | + struct rq_flags rf; 450 | + 451 | + // detach task 452 | + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); 453 | + set_task_cpu(p, env->dst_cpu); 454 | + 455 | + // unlock src rq 456 | + rq_unlock(env->src_rq, env->src_rf); 457 | + 458 | + // lock this rq 459 | + rq_lock(env->dst_rq, &rf); 460 | + update_rq_clock(env->dst_rq); 461 | + 462 | + activate_task(env->dst_rq, p, ENQUEUE_NOCLOCK); 463 | + wakeup_preempt(env->dst_rq, p, 0); 464 | + 465 | + // unlock this rq 466 | + rq_unlock(env->dst_rq, &rf); 467 | + 468 | + local_irq_restore(env->src_rf->flags); 469 | +} 470 | + 471 | +#ifdef CONFIG_NUMA_BALANCING 472 | +/* Runqueue only has SCHED_IDLE tasks enqueued */ 473 | +static int sched_idle_rq(struct rq *rq) 474 | +{ 475 | + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && 476 | + rq->nr_running); 477 | +} 478 | + 479 | +#ifdef CONFIG_SMP 480 | +static int sched_idle_cpu(int cpu) 481 | +{ 482 | + return sched_idle_rq(cpu_rq(cpu)); 483 | +} 484 | +#endif 485 | + 486 | +/* 487 | + * Returns 1, if task migration degrades locality 488 | + * Returns 0, if task migration improves locality i.e migration preferred. 489 | + * Returns -1, if task migration is not affected by locality. 490 | + */ 491 | +static int migrate_degrades_locality(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 492 | +{ 493 | + struct numa_group *numa_group = rcu_dereference(p->numa_group); 494 | + unsigned long src_weight, dst_weight; 495 | + int src_nid, dst_nid, dist; 496 | + 497 | + if (!static_branch_likely(&sched_numa_balancing)) 498 | + return -1; 499 | + 500 | + if (!p->numa_faults) 501 | + return -1; 502 | + 503 | + src_nid = cpu_to_node(cpu_of(src_rq)); 504 | + dst_nid = cpu_to_node(cpu_of(dst_rq)); 505 | + 506 | + if (src_nid == dst_nid) 507 | + return -1; 508 | + 509 | + /* Migrating away from the preferred node is always bad. */ 510 | + if (src_nid == p->numa_preferred_nid) { 511 | + if (src_rq->nr_running > src_rq->nr_preferred_running) 512 | + return 1; 513 | + else 514 | + return -1; 515 | + } 516 | + 517 | + /* Encourage migration to the preferred node. */ 518 | + if (dst_nid == p->numa_preferred_nid) 519 | + return 0; 520 | + 521 | + /* Leaving a core idle is often worse than degrading locality. 
*/ 522 | + if (sched_idle_cpu(cpu_of(dst_rq))) 523 | + return -1; 524 | + 525 | + dist = node_distance(src_nid, dst_nid); 526 | + if (numa_group) { 527 | + src_weight = group_weight(p, src_nid, dist); 528 | + dst_weight = group_weight(p, dst_nid, dist); 529 | + } else { 530 | + src_weight = task_weight(p, src_nid, dist); 531 | + dst_weight = task_weight(p, dst_nid, dist); 532 | + } 533 | + 534 | + return dst_weight < src_weight; 535 | +} 536 | + 537 | +#else 538 | +static inline int migrate_degrades_locality(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 539 | +{ 540 | + return -1; 541 | +} 542 | +#endif 543 | + 544 | +#define MIN_HOTNESS 0x7FFFFFFFFFFFFFFLL 545 | + 546 | +static s64 task_hotness(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 547 | +{ 548 | + s64 delta; 549 | + 550 | + lockdep_assert_rq_held(src_rq); 551 | + 552 | + if (unlikely(task_has_idle_policy(p))) 553 | + return 0; 554 | + 555 | + /* SMT siblings share cache */ 556 | + if (cpus_share_cache(cpu_of(dst_rq), cpu_of(src_rq))) 557 | + return MIN_HOTNESS; 558 | + 559 | + if (sysctl_sched_migration_cost == -1) 560 | + return 0; 561 | + 562 | + if (sysctl_sched_migration_cost == 0) 563 | + return MIN_HOTNESS; 564 | + 565 | + delta = rq_clock_task(src_rq) - p->se.exec_start; 566 | + 567 | + return delta; 568 | +} 569 | + 570 | +static s64 hotness_of(struct task_struct *p, struct lb_env *env) 571 | +{ 572 | + int tsk_cache_hot; 573 | + 574 | + tsk_cache_hot = migrate_degrades_locality(p, env->dst_rq, env->src_rq); 575 | + 576 | + // 0, if task migration improves locality i.e migration preferred. 577 | + if (tsk_cache_hot == 0) 578 | + return MIN_HOTNESS; 579 | + 580 | + // 1, if task migration degrades locality 581 | + if (tsk_cache_hot == 1) 582 | + return 0; 583 | + 584 | + // -1, if task migration is not affected by locality. 585 | + return task_hotness(p, env->dst_rq, env->src_rq); 586 | +} 587 | + 588 | +static int 589 | +can_migrate_task(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 590 | +{ 591 | + /* Disregard pcpu kthreads; they are where they need to be. */ 592 | + if (kthread_is_per_cpu(p)) 593 | + return 0; 594 | + 595 | + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) 596 | + return 0; 597 | + 598 | + if (task_on_cpu(src_rq, p)) 599 | + return 0; 600 | + 601 | + return 1; 602 | +} 603 | + 604 | +static int move_task(struct rq *dst_rq, struct rq *src_rq, 605 | + struct rq_flags *src_rf) 606 | +{ 607 | + struct cfs_rq *src_cfs_rq = &src_rq->cfs; 608 | + struct task_struct *p = NULL, *tsk_itr; 609 | + struct bs_node *bsn = src_cfs_rq->head; 610 | + s64 tsk_coldest = 0, tsk_hotness; 611 | + 612 | + struct lb_env env = { 613 | + .dst_cpu = cpu_of(dst_rq), 614 | + .dst_rq = dst_rq, 615 | + .src_cpu = cpu_of(src_rq), 616 | + .src_rq = src_rq, 617 | + .src_rf = src_rf, 618 | + .idle = dst_rq->idle_balance ? 
CPU_IDLE : CPU_NOT_IDLE, 619 | + }; 620 | + 621 | + while (bsn) { 622 | + tsk_itr = task_of(se_of(bsn)); 623 | + 624 | + if (!can_migrate_task(tsk_itr, dst_rq, src_rq)) { 625 | + bsn = bsn->next; 626 | + continue; 627 | + } 628 | + 629 | + tsk_hotness = hotness_of(tsk_itr, &env); 630 | + 631 | + if (!p) { 632 | + tsk_coldest = tsk_hotness; 633 | + p = tsk_itr; 634 | + } else if (tsk_hotness > tsk_coldest) { 635 | + // greater value means it is colder 636 | + 637 | + tsk_coldest = tsk_hotness; 638 | + p = tsk_itr; 639 | + } 640 | + 641 | + bsn = bsn->next; 642 | + } 643 | + 644 | + if (p) { 645 | + pull_from(p, &env); 646 | + return 1; 647 | + } else { 648 | + rq_unlock(src_rq, src_rf); 649 | + local_irq_restore(src_rf->flags); 650 | + } 651 | + 652 | + return 0; 653 | +} 654 | + 655 | +static int idle_pull_global_candidate(struct rq *dist_rq) 656 | +{ 657 | + struct rq *src_rq; 658 | + struct task_struct *p; 659 | + struct rq_flags rf, src_rf; 660 | + struct bs_node *cand = READ_ONCE(global_candidate.candidate); 661 | + 662 | + if (!cand) 663 | + return 0; 664 | + 665 | + src_rq = READ_ONCE(global_candidate.rq); 666 | + if (!src_rq || src_rq == dist_rq) 667 | + return 0; 668 | + 669 | + rq_lock_irqsave(src_rq, &src_rf); 670 | + update_rq_clock(src_rq); 671 | + raw_spin_lock(&global_candidate.lock); 672 | + cand = global_candidate.candidate; 673 | + if (!cand) 674 | + goto fail_unlock; 675 | + 676 | + p = task_of(se_of(cand)); 677 | + if (task_rq(p) != src_rq || 678 | + !can_migrate_task(p, dist_rq, src_rq)) 679 | + goto fail_unlock; 680 | + 681 | + global_candidate.rq = NULL; 682 | + global_candidate.candidate = NULL; 683 | + global_candidate.est = MAX_EST; 684 | + raw_spin_unlock(&global_candidate.lock); 685 | + 686 | + // detach task 687 | + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); 688 | + set_task_cpu(p, cpu_of(dist_rq)); 689 | + // unlock src rq 690 | + rq_unlock(src_rq, &src_rf); 691 | + 692 | + // lock dist rq 693 | + rq_lock(dist_rq, &rf); 694 | + update_rq_clock(dist_rq); 695 | + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); 696 | + wakeup_preempt(dist_rq, p, 0); 697 | + // unlock dist rq 698 | + rq_unlock(dist_rq, &rf); 699 | + 700 | + local_irq_restore(src_rf.flags); 701 | + 702 | + // printk(KERN_INFO "idle_pull_global_candidate"); 703 | + 704 | + return 1; 705 | + 706 | +fail_unlock: 707 | + raw_spin_unlock(&global_candidate.lock); 708 | + rq_unlock(src_rq, &src_rf); 709 | + local_irq_restore(src_rf.flags); 710 | + return 0; 711 | +} 712 | + 713 | +static void idle_balance(struct rq *this_rq) 714 | +{ 715 | + int this_cpu = this_rq->cpu; 716 | + struct rq *src_rq; 717 | + int src_cpu = -1, cpu; 718 | + unsigned int max = 0; 719 | + struct rq_flags src_rf; 720 | + 721 | + if (idle_pull_global_candidate(this_rq)) 722 | + return; 723 | + 724 | + for_each_online_cpu(cpu) { 725 | + /* 726 | + * Stop searching for tasks to pull if there are 727 | + * now runnable tasks on this rq. 
728 | + */ 729 | + if (this_rq->nr_running > 0) 730 | + return; 731 | + 732 | + if (cpu == this_cpu) 733 | + continue; 734 | + 735 | + src_rq = cpu_rq(cpu); 736 | + 737 | + if (src_rq->nr_running <= 1) 738 | + continue; 739 | + 740 | + if (src_rq->nr_running > max) { 741 | + max = src_rq->nr_running; 742 | + src_cpu = cpu; 743 | + } 744 | + } 745 | + 746 | + if (src_cpu == -1) 747 | + return; 748 | + 749 | + src_rq = cpu_rq(src_cpu); 750 | + 751 | + rq_lock_irqsave(src_rq, &src_rf); 752 | + update_rq_clock(src_rq); 753 | + 754 | + if (src_rq->nr_running < 2) { 755 | + rq_unlock(src_rq, &src_rf); 756 | + local_irq_restore(src_rf.flags); 757 | + } else { 758 | + move_task(this_rq, src_rq, &src_rf); 759 | + } 760 | +} 761 | + 762 | +static void active_pull_global_candidate(struct rq *dist_rq) 763 | +{ 764 | + struct cfs_rq *cfs_rq = &dist_rq->cfs; 765 | + u64 cand_est = READ_ONCE(global_candidate.est); 766 | + u64 local_est = READ_ONCE(cfs_rq->local_cand_est); 767 | + struct rq *src_rq; 768 | + struct task_struct *p; 769 | + struct rq_flags rf, src_rf; 770 | + struct bs_node *cand; 771 | + 772 | + cand = READ_ONCE(global_candidate.candidate); 773 | + 774 | + if (!cand) 775 | + return; 776 | + 777 | + if ((s64)(local_est - cand_est) <= 0) 778 | + return; 779 | + 780 | + src_rq = READ_ONCE(global_candidate.rq); 781 | + if (!src_rq || src_rq == dist_rq) 782 | + return; 783 | + 784 | + rq_lock_irqsave(src_rq, &src_rf); 785 | + update_rq_clock(src_rq); 786 | + raw_spin_lock(&global_candidate.lock); 787 | + cand = global_candidate.candidate; 788 | + cand_est = global_candidate.est; 789 | + 790 | + if (!cand) 791 | + goto fail_unlock; 792 | + 793 | + p = task_of(se_of(cand)); 794 | + if (task_rq(p) != src_rq || 795 | + !can_migrate_task(p, dist_rq, src_rq)) 796 | + goto fail_unlock; 797 | + 798 | + if ((s64)(local_est - cand_est) <= 0) 799 | + goto fail_unlock; 800 | + 801 | + global_candidate.rq = NULL; 802 | + global_candidate.candidate = NULL; 803 | + global_candidate.est = MAX_EST; 804 | + raw_spin_unlock(&global_candidate.lock); 805 | + 806 | + // detach task 807 | + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); 808 | + set_task_cpu(p, cpu_of(dist_rq)); 809 | + // unlock src rq 810 | + rq_unlock(src_rq, &src_rf); 811 | + 812 | + // lock dist rq 813 | + rq_lock(dist_rq, &rf); 814 | + update_rq_clock(dist_rq); 815 | + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); 816 | + wakeup_preempt(dist_rq, p, 0); 817 | + // unlock dist rq 818 | + rq_unlock(dist_rq, &rf); 819 | + 820 | + local_irq_restore(src_rf.flags); 821 | + 822 | + // printk(KERN_INFO "active_pull_global_candidate"); 823 | + return; 824 | + 825 | +fail_unlock: 826 | + raw_spin_unlock(&global_candidate.lock); 827 | + rq_unlock(src_rq, &src_rf); 828 | + local_irq_restore(src_rf.flags); 829 | +} 830 | + 831 | +static void nohz_try_pull_from_candidate(void) 832 | +{ 833 | + int cpu; 834 | + struct rq *rq; 835 | + struct cfs_rq *cfs_rq; 836 | +#ifdef CONFIG_NO_HZ_FULL 837 | + struct rq_flags rf; 838 | +#endif 839 | + 840 | + /* first, push to grq*/ 841 | + for_each_online_cpu(cpu) { 842 | + rq = cpu_rq(cpu); 843 | +#ifdef CONFIG_NO_HZ_FULL 844 | + cfs_rq = &rq->cfs; 845 | + 846 | + if (idle_cpu(cpu) || cfs_rq->nr_running > 1) 847 | + goto out; 848 | + 849 | + rq_lock_irqsave(rq, &rf); 850 | + update_rq_clock(rq); 851 | + update_curr(cfs_rq); 852 | + rq_unlock_irqrestore(rq, &rf); 853 | +out: 854 | +#endif 855 | + if (idle_cpu(cpu) || !sched_fair_runnable(rq)) 856 | + idle_pull_global_candidate(rq); 857 | + else 858 | + 
active_pull_global_candidate(rq); 859 | + } 860 | +} 861 | + 862 | +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) 863 | +{ 864 | + int this_cpu = this_rq->cpu; 865 | + struct rq *src_rq; 866 | + int src_cpu = -1, cpu; 867 | + int pulled_task = 0; 868 | + unsigned int max = 0; 869 | + struct rq_flags src_rf; 870 | + 871 | + update_misfit_status(NULL, this_rq); 872 | + 873 | + /* 874 | + * There is a task waiting to run. No need to search for one. 875 | + * Return 0; the task will be enqueued when switching to idle. 876 | + */ 877 | + if (this_rq->ttwu_pending) 878 | + return 0; 879 | + 880 | + /* 881 | + * We must set idle_stamp _before_ calling idle_balance(), such that we 882 | + * measure the duration of idle_balance() as idle time. 883 | + */ 884 | + this_rq->idle_stamp = rq_clock(this_rq); 885 | + 886 | + /* 887 | + * Do not pull tasks towards !active CPUs... 888 | + */ 889 | + if (!cpu_active(this_cpu)) 890 | + return 0; 891 | + 892 | + rq_unpin_lock(this_rq, rf); 893 | + raw_spin_unlock(&this_rq->__lock); 894 | + 895 | + update_blocked_averages(this_cpu); 896 | + 897 | + pulled_task = idle_pull_global_candidate(this_rq); 898 | + if (pulled_task) 899 | + goto out; 900 | + 901 | + for_each_online_cpu(cpu) { 902 | + /* 903 | + * Stop searching for tasks to pull if there are 904 | + * now runnable tasks on this rq. 905 | + */ 906 | + if (this_rq->nr_running > 0) 907 | + goto out; 908 | + 909 | + if (cpu == this_cpu) 910 | + continue; 911 | + 912 | + src_rq = cpu_rq(cpu); 913 | + 914 | + if (src_rq->nr_running <= 1) 915 | + continue; 916 | + 917 | + if (src_rq->nr_running > max) { 918 | + max = src_rq->nr_running; 919 | + src_cpu = cpu; 920 | + } 921 | + } 922 | + 923 | + if (src_cpu != -1) { 924 | + src_rq = cpu_rq(src_cpu); 925 | + 926 | + rq_lock_irqsave(src_rq, &src_rf); 927 | + update_rq_clock(src_rq); 928 | + 929 | + if (src_rq->nr_running <= 1) { 930 | + rq_unlock(src_rq, &src_rf); 931 | + local_irq_restore(src_rf.flags); 932 | + } else { 933 | + pulled_task = move_task(this_rq, src_rq, &src_rf); 934 | + } 935 | + } 936 | + 937 | +out: 938 | + raw_spin_lock(&this_rq->__lock); 939 | + 940 | + /* 941 | + * While browsing the domains, we released the rq lock, a task could 942 | + * have been enqueued in the meantime. Since we're not going idle, 943 | + * pretend we pulled a task. 944 | + */ 945 | + if (this_rq->cfs.h_nr_running && !pulled_task) 946 | + pulled_task = 1; 947 | + 948 | + /* Is there a task of a high priority class? 
*/ 949 | + if (this_rq->nr_running != this_rq->cfs.h_nr_running) 950 | + pulled_task = -1; 951 | + 952 | + if (pulled_task) 953 | + this_rq->idle_stamp = 0; 954 | + 955 | + rq_repin_lock(this_rq, rf); 956 | + 957 | + return pulled_task; 958 | +} 959 | + 960 | +static inline int on_null_domain(struct rq *rq) 961 | +{ 962 | + return unlikely(!rcu_dereference_sched(rq->sd)); 963 | +} 964 | + 965 | +static void rebalance(struct rq *this_rq) 966 | +{ 967 | + int cpu; 968 | + unsigned int max, min; 969 | + struct rq *max_rq, *min_rq, *c_rq; 970 | + struct rq_flags src_rf; 971 | + 972 | + update_blocked_averages(this_rq->cpu); 973 | + 974 | +again: 975 | + max = min = this_rq->nr_running; 976 | + max_rq = min_rq = this_rq; 977 | + 978 | + for_each_online_cpu(cpu) { 979 | + c_rq = cpu_rq(cpu); 980 | + 981 | + /* 982 | + * Don't need to rebalance while attached to NULL domain or 983 | + * runqueue CPU is not active 984 | + */ 985 | + if (unlikely(on_null_domain(c_rq) || !cpu_active(cpu))) 986 | + continue; 987 | + 988 | + if (c_rq->nr_running < min) { 989 | + min = c_rq->nr_running; 990 | + min_rq = c_rq; 991 | + } 992 | + 993 | + if (c_rq->nr_running > max) { 994 | + max = c_rq->nr_running; 995 | + max_rq = c_rq; 996 | + } 997 | + } 998 | + 999 | + if (min_rq == max_rq || max - min <= 1) 1000 | + return; 1001 | + 1002 | + rq_lock_irqsave(max_rq, &src_rf); 1003 | + update_rq_clock(max_rq); 1004 | + 1005 | + if (max_rq->nr_running <= 1) { 1006 | + rq_unlock(max_rq, &src_rf); 1007 | + local_irq_restore(src_rf.flags); 1008 | + return; 1009 | + } 1010 | + 1011 | + if(move_task(min_rq, max_rq, &src_rf)) 1012 | + goto again; 1013 | +} 1014 | + 1015 | +static void nohz_balancer_kick(struct rq *rq); 1016 | + 1017 | +void trigger_load_balance(struct rq *this_rq) 1018 | +{ 1019 | + int this_cpu = cpu_of(this_rq); 1020 | + 1021 | + if (this_cpu != 0) 1022 | + goto out; 1023 | + 1024 | + nohz_try_pull_from_candidate(); 1025 | + 1026 | + rebalance(this_rq); 1027 | + 1028 | +out: 1029 | + if (time_after_eq(jiffies, this_rq->next_balance)) { 1030 | + this_rq->next_balance = jiffies + msecs_to_jiffies(19); 1031 | + update_blocked_averages(this_rq->cpu); 1032 | + } 1033 | + 1034 | + nohz_balancer_kick(this_rq); 1035 | +} 1036 | + 1037 | +#include "nohz.h" 1038 | + 1039 | +void update_group_capacity(struct sched_domain *sd, int cpu) {} 1040 | +#endif /* CONFIG_SMP */ 1041 | diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c 1042 | new file mode 100644 1043 | index 000000000000..6b3d51b4366c 1044 | --- /dev/null 1045 | +++ b/kernel/sched/bs.c 1046 | @@ -0,0 +1,888 @@ 1047 | +// SPDX-License-Identifier: GPL-2.0 1048 | +/* 1049 | + * Baby Scheduler (BS) Class (SCHED_NORMAL/SCHED_BATCH) 1050 | + * 1051 | + * Copyright (C) 2021, Hamad Al Marri 1052 | + */ 1053 | +#include 1054 | +#include 1055 | +#include 1056 | +#include 1057 | +#include 1058 | +#include 1059 | + 1060 | +#include "sched.h" 1061 | +#include "pelt.h" 1062 | + 1063 | +unsigned int sysctl_sched_base_slice = 4200ULL; 1064 | +unsigned int bs_shared_quota = 35000ULL; // 35us 1065 | +u32 alpha = 500U; 1066 | + 1067 | +struct lb_env { 1068 | + struct rq *src_rq; 1069 | + int src_cpu; 1070 | + 1071 | + int dst_cpu; 1072 | + struct rq *dst_rq; 1073 | + 1074 | + enum cpu_idle_type idle; 1075 | + 1076 | + struct rq_flags *src_rf; 1077 | + unsigned int flags; 1078 | +}; 1079 | + 1080 | +struct global_candidate { 1081 | + struct rq *rq; 1082 | + struct bs_node *candidate; 1083 | + u64 est; 1084 | + 1085 | + // for update 1086 | + raw_spinlock_t lock; 1087 | +}; 1088 | 
+ 1089 | +#define MAX_EST 0xFFFFFFFFFFFFFFFULL 1090 | + 1091 | +struct global_candidate global_candidate = {0, 0, MAX_EST}; 1092 | + 1093 | +#include "fair_numa.h" 1094 | +#include "fair_debug.h" 1095 | +#include "fair_dep_funcs.h" 1096 | + 1097 | +static inline int clear_this_candidate(struct sched_entity *se) 1098 | +{ 1099 | + struct bs_node *bsn = &se->bs_node; 1100 | + struct bs_node *curr_can = READ_ONCE(global_candidate.candidate); 1101 | + 1102 | + if (bsn != curr_can) 1103 | + return 0; 1104 | + 1105 | + WRITE_ONCE(global_candidate.candidate, NULL); 1106 | + WRITE_ONCE(global_candidate.rq, NULL); 1107 | + WRITE_ONCE(global_candidate.est, MAX_EST); 1108 | + 1109 | + return 1; 1110 | +} 1111 | + 1112 | +static inline void clear_rq_candidate(struct cfs_rq *cfs_rq) 1113 | +{ 1114 | + struct rq *rq = READ_ONCE(global_candidate.rq); 1115 | + 1116 | + if (rq != rq_of(cfs_rq)) 1117 | + return; 1118 | + 1119 | + WRITE_ONCE(global_candidate.candidate, NULL); 1120 | + WRITE_ONCE(global_candidate.rq, NULL); 1121 | + WRITE_ONCE(global_candidate.est, MAX_EST); 1122 | +} 1123 | + 1124 | +static inline void __update_candidate(struct cfs_rq *cfs_rq, struct bs_node *bsn) 1125 | +{ 1126 | + unsigned long flags; 1127 | + u64 curr_cand_est; 1128 | + 1129 | + curr_cand_est = READ_ONCE(global_candidate.est); 1130 | + 1131 | + if ((s64)(bsn->est - curr_cand_est) < 0) { 1132 | + raw_spin_lock_irqsave(&global_candidate.lock, flags); 1133 | + global_candidate.rq = rq_of(cfs_rq); 1134 | + global_candidate.candidate = bsn; 1135 | + global_candidate.est = bsn->est; 1136 | + raw_spin_unlock_irqrestore(&global_candidate.lock, flags); 1137 | + } 1138 | +} 1139 | + 1140 | +static inline bool 1141 | +can_be_candidate(struct bs_node *bsn, int this_cpu) 1142 | +{ 1143 | + struct task_struct *p; 1144 | + 1145 | + if (!bsn) 1146 | + return 0; 1147 | + 1148 | + p = task_of(se_of(bsn)); 1149 | + 1150 | + if (kthread_is_per_cpu(p)) 1151 | + return 0; 1152 | + 1153 | + // just migrated 1154 | + if (p->se.avg.last_update_time == 0) 1155 | + return 0; 1156 | + 1157 | + if (task_on_cpu(cpu_rq(this_cpu), p)) 1158 | + return 0; 1159 | + 1160 | + // some tasks are pinned to this cpu 1161 | + if (p->nr_cpus_allowed <= 1) 1162 | + return 0; 1163 | + 1164 | + if (is_migration_disabled(p)) 1165 | + return 0; 1166 | + 1167 | + return 1; 1168 | +} 1169 | + 1170 | +static void update_candidate(struct cfs_rq *cfs_rq) 1171 | +{ 1172 | + struct bs_node *bsn = NULL; 1173 | + int this_cpu = cpu_of(rq_of(cfs_rq)); 1174 | + 1175 | + if (can_be_candidate(cfs_rq->head, this_cpu)) 1176 | + bsn = cfs_rq->head; 1177 | + else if (can_be_candidate(cfs_rq->q2_head, this_cpu)) 1178 | + bsn = cfs_rq->q2_head; 1179 | + 1180 | + if (bsn) 1181 | + __update_candidate(cfs_rq, bsn); 1182 | +} 1183 | + 1184 | +static void update_curr(struct cfs_rq *cfs_rq) 1185 | +{ 1186 | + struct sched_entity *curr = cfs_rq->curr; 1187 | + struct task_struct *curtask = task_of(curr); 1188 | + u64 now = rq_clock_task(rq_of(cfs_rq)); 1189 | + s64 delta_exec, calc; 1190 | + 1191 | + if (unlikely(!curr)) 1192 | + return; 1193 | + 1194 | + delta_exec = now - curr->exec_start; 1195 | + if (unlikely(delta_exec <= 0)) 1196 | + return; 1197 | + 1198 | + curr->exec_start = now; 1199 | + curr->sum_exec_runtime += delta_exec; 1200 | + 1201 | + if (schedstat_enabled()) { 1202 | + struct sched_statistics *stats; 1203 | + 1204 | + stats = __schedstats_from_se(curr); 1205 | + __schedstat_set(stats->exec_max, 1206 | + max(delta_exec, stats->exec_max)); 1207 | + } 1208 | + 1209 | + calc = 
calc_delta_fair(delta_exec, curr);
1210 | + curr->vruntime += calc;
1211 | + curr->bs_node.vburst += calc;
1212 | + curr->bs_node.c_vrt_start += calc;
1213 | + curr->bs_node.r_vrt_start += calc;
1214 | +#ifdef CONFIG_SCHED_DEBUG
1215 | + curr->bs_node.prev_vburst = curr->bs_node.vburst;
1216 | +#endif
1217 | + update_deadline(cfs_rq, curr);
1218 | +
1219 | + cfs_rq->local_cand_est = curr->bs_node.est;
1220 | +
1221 | + trace_sched_stat_runtime(curtask, delta_exec);
1222 | + account_group_exec_runtime(curtask, delta_exec);
1223 | + cgroup_account_cputime(curtask, delta_exec);
1224 | + if (curtask->dl_server)
1225 | + dl_server_update(curtask->dl_server, delta_exec);
1226 | +}
1227 | +
1228 | +static void update_curr_fair(struct rq *rq)
1229 | +{
1230 | + update_curr(cfs_rq_of(&rq->curr->se));
1231 | +}
1232 | +
1233 | +/**
1234 | + * Should `a` preempt `b`?
1235 | + */
1236 | +static inline bool entity_before(struct bs_node *a, struct bs_node *b)
1237 | +{
1238 | + return (s64)(a->est - b->est) < 0;
1239 | +}
1240 | +
1241 | +static void __enqueue_entity(struct bs_node **q, struct bs_node *bsn)
1242 | +{
1243 | + struct bs_node *prev;
1244 | +
1245 | + if (!(*q) || entity_before(bsn, *q)) {
1246 | + bsn->next = *q;
1247 | + *q = bsn;
1248 | + return;
1249 | + }
1250 | +
1251 | + // insert after prev
1252 | + prev = *q;
1253 | + while (prev->next && entity_before(prev->next, bsn))
1254 | + prev = prev->next;
1255 | +
1256 | + bsn->next = prev->next;
1257 | + prev->next = bsn;
1258 | +}
1259 | +
1260 | +static void __dequeue_entity_from_q2(struct cfs_rq *cfs_rq, struct bs_node *bsn)
1261 | +{
1262 | + struct bs_node *prev, *itr;
1263 | +
1264 | + itr = cfs_rq->q2_head;
1265 | + prev = NULL;
1266 | +
1267 | + while (itr && itr != bsn) {
1268 | + prev = itr;
1269 | + itr = itr->next;
1270 | + }
1271 | +
1272 | + if (bsn == cfs_rq->q2_head)
1273 | + // if it is the head
1274 | + cfs_rq->q2_head = cfs_rq->q2_head->next;
1275 | + else
1276 | + prev->next = itr->next;
1277 | +}
1278 | +
1279 | +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct bs_node *bsn)
1280 | +{
1281 | + struct bs_node *prev, *itr;
1282 | +
1283 | + itr = cfs_rq->head;
1284 | + prev = NULL;
1285 | +
1286 | + while (itr && itr != bsn) {
1287 | + prev = itr;
1288 | + itr = itr->next;
1289 | + }
1290 | +
1291 | + if (!itr) {
1292 | + // then it is in q2
1293 | + __dequeue_entity_from_q2(cfs_rq, bsn);
1294 | + return;
1295 | + }
1296 | +
1297 | + if (bsn == cfs_rq->head)
1298 | + // if it is the head
1299 | + cfs_rq->head = cfs_rq->head->next;
1300 | + else
1301 | + prev->next = itr->next;
1302 | +}
1303 | +
1304 | +static void
1305 | +update_est_entity(struct sched_entity *se)
1306 | +{
1307 | + struct bs_node *bsn = &se->bs_node;
1308 | + u64 vburst = bsn->vburst;
1309 | + u64 prev_est = bsn->est;
1310 | + u64 next_est;
1311 | +
1312 | + /*
1313 | + * alpha * vburst + (1 - alpha) * prev_est
1314 | + */
1315 | + next_est = (alpha * vburst) + ((1000 - alpha) * prev_est);
1316 | + next_est /= 1000;
1317 | +
1318 | + bsn->est = next_est;
1319 | +}
1320 | +
1321 | +static void
1322 | +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1323 | +{
1324 | + bool curr = cfs_rq->curr == se;
1325 | + bool wakeup = (flags & ENQUEUE_WAKEUP);
1326 | +
1327 | + update_curr(cfs_rq);
1328 | + account_entity_enqueue(cfs_rq, se);
1329 | +
1330 | + if (!wakeup)
1331 | + update_est_entity(se);
1332 | +
1333 | + /* Entity has migrated, no longer consider this task hot */
1334 | + if (flags & ENQUEUE_MIGRATED)
1335 | + se->exec_start = 0;
1336 |
+ 1337 | + if (!curr) 1338 | + __enqueue_entity(&cfs_rq->head, &se->bs_node); 1339 | + 1340 | + se->on_rq = 1; 1341 | +} 1342 | + 1343 | +static void 1344 | +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1345 | +{ 1346 | + update_curr(cfs_rq); 1347 | + update_est_entity(se); 1348 | + 1349 | + if (flags & DEQUEUE_SLEEP) 1350 | + se->bs_node.vburst = 0; 1351 | + 1352 | + if (se != cfs_rq->curr) 1353 | + __dequeue_entity(cfs_rq, &se->bs_node); 1354 | + 1355 | + if (clear_this_candidate(se)) 1356 | + update_candidate(cfs_rq); 1357 | + 1358 | + se->on_rq = 0; 1359 | + account_entity_dequeue(cfs_rq, se); 1360 | +} 1361 | + 1362 | +static void 1363 | +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) 1364 | +{ 1365 | + struct sched_entity *se = &p->se; 1366 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 1367 | + int idle_h_nr_running = task_has_idle_policy(p); 1368 | + int task_new = !(flags & ENQUEUE_WAKEUP); 1369 | + 1370 | + /* 1371 | + * The code below (indirectly) updates schedutil which looks at 1372 | + * the cfs_rq utilization to select a frequency. 1373 | + * Let's add the task's estimated utilization to the cfs_rq's 1374 | + * estimated utilization, before we update schedutil. 1375 | + */ 1376 | + util_est_enqueue(&rq->cfs, p); 1377 | + 1378 | + /* 1379 | + * If in_iowait is set, the code below may not trigger any cpufreq 1380 | + * utilization updates, so do it here explicitly with the IOWAIT flag 1381 | + * passed. 1382 | + */ 1383 | + if (p->in_iowait) 1384 | + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); 1385 | + 1386 | + if (!se->on_rq) { 1387 | + enqueue_entity(cfs_rq, se, flags); 1388 | + cfs_rq->h_nr_running++; 1389 | + cfs_rq->idle_h_nr_running += idle_h_nr_running; 1390 | + } 1391 | + 1392 | + se->bs_node.r_vrt_start = 0; 1393 | + 1394 | + update_candidate(cfs_rq); 1395 | + 1396 | + add_nr_running(rq, 1); 1397 | + 1398 | + if (!task_new) 1399 | + update_overutilized_status(rq); 1400 | + 1401 | + hrtick_update(rq); 1402 | +} 1403 | + 1404 | +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 1405 | +{ 1406 | + struct sched_entity *se = &p->se; 1407 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 1408 | + int task_sleep = flags & DEQUEUE_SLEEP; 1409 | + int idle_h_nr_running = task_has_idle_policy(p); 1410 | + 1411 | + util_est_dequeue(&rq->cfs, p); 1412 | + 1413 | + dequeue_entity(cfs_rq, se, flags); 1414 | + 1415 | + cfs_rq->h_nr_running--; 1416 | + cfs_rq->idle_h_nr_running -= idle_h_nr_running; 1417 | + 1418 | + sub_nr_running(rq, 1); 1419 | + util_est_update(&rq->cfs, p, task_sleep); 1420 | + hrtick_update(rq); 1421 | +} 1422 | + 1423 | +static void yield_task_fair(struct rq *rq) 1424 | +{ 1425 | + struct task_struct *curr = rq->curr; 1426 | + struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1427 | + 1428 | + /* 1429 | + * Are we the only task in the tree? 1430 | + */ 1431 | + if (unlikely(rq->nr_running == 1)) 1432 | + return; 1433 | + 1434 | + curr->se.yielded = true; 1435 | + 1436 | + update_rq_clock(rq); 1437 | + /* 1438 | + * Update run-time statistics of the 'current'. 1439 | + */ 1440 | + update_curr(cfs_rq); 1441 | + /* 1442 | + * Tell update_rq_clock() that we've just updated, 1443 | + * so we don't do microscopic update in schedule() 1444 | + * and double the fastpath cost. 
1445 | + */ 1446 | + rq_clock_skip_update(rq); 1447 | +} 1448 | + 1449 | +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) 1450 | +{ 1451 | + struct sched_entity *se = &p->se; 1452 | + 1453 | + if (!se->on_rq) 1454 | + return false; 1455 | + 1456 | + yield_task_fair(rq); 1457 | + return true; 1458 | +} 1459 | + 1460 | +static __always_inline 1461 | +int __entity_end_quota(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1462 | +{ 1463 | + unsigned int n = max(cfs_rq->nr_running, 1); 1464 | + unsigned int quota; 1465 | + struct bs_node *bs = &curr->bs_node; 1466 | + 1467 | + quota = max(bs_shared_quota / n, sysctl_sched_base_slice); 1468 | + 1469 | + return (s64)(bs->r_vrt_start - (u64)quota) >= 0; 1470 | +} 1471 | + 1472 | +static int entity_end_quota(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1473 | +{ 1474 | + unsigned int n = cfs_rq->nr_running; 1475 | + 1476 | + if (n <= 1) 1477 | + return 0; 1478 | + 1479 | + return __entity_end_quota(cfs_rq, curr); 1480 | +} 1481 | + 1482 | +static int entity_end_min_slice(struct sched_entity *curr) 1483 | +{ 1484 | + struct bs_node *bs = &curr->bs_node; 1485 | + 1486 | + return (s64)(bs->c_vrt_start - (u64)sysctl_sched_base_slice) >= 0; 1487 | +} 1488 | + 1489 | +static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) 1490 | +{ 1491 | + struct cfs_rq *cfs_rq = &rq->cfs; 1492 | + struct task_struct *curr = rq->curr; 1493 | + struct sched_entity *curr_se = &curr->se, *pse = &p->se; 1494 | + int cse_is_idle, pse_is_idle; 1495 | + 1496 | + if (unlikely(curr_se == pse)) 1497 | + return; 1498 | + 1499 | + if (test_tsk_need_resched(curr)) 1500 | + return; 1501 | + 1502 | + /* Idle tasks are by definition preempted by non-idle tasks. */ 1503 | + if (unlikely(task_has_idle_policy(curr)) && 1504 | + likely(!task_has_idle_policy(p))) 1505 | + goto preempt; 1506 | + 1507 | + /* 1508 | + * Batch and idle tasks do not preempt non-idle tasks (their preemption 1509 | + * is driven by the tick): 1510 | + */ 1511 | + if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) 1512 | + return; 1513 | + 1514 | + cse_is_idle = se_is_idle(curr_se); 1515 | + pse_is_idle = se_is_idle(pse); 1516 | + 1517 | + /* 1518 | + * Preempt an idle group in favor of a non-idle group (and don't preempt 1519 | + * in the inverse case). 
1520 | + */ 1521 | + if (cse_is_idle && !pse_is_idle) 1522 | + goto preempt; 1523 | + if (cse_is_idle != pse_is_idle) 1524 | + return; 1525 | + 1526 | + update_curr(cfs_rq_of(curr_se)); 1527 | + 1528 | + /* 1529 | + * - if curr_se ended quoat then preempt 1530 | + * - if waked entity is before curr_se and 1531 | + * curr_se ended min slice 1532 | + */ 1533 | + if (__entity_end_quota(cfs_rq, curr_se)) 1534 | + goto preempt; 1535 | + 1536 | + if (entity_before(&pse->bs_node, &curr_se->bs_node)) 1537 | + goto preempt; 1538 | + 1539 | + return; 1540 | + 1541 | +preempt: 1542 | + resched_curr(rq); 1543 | +} 1544 | + 1545 | +static void 1546 | +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 1547 | +{ 1548 | + if (se->on_rq) 1549 | + __dequeue_entity(cfs_rq, &se->bs_node); 1550 | + 1551 | + se->exec_start = rq_clock_task(rq_of(cfs_rq)); 1552 | + 1553 | + se->bs_node.c_vrt_start = 0; 1554 | + 1555 | + update_candidate(cfs_rq); 1556 | + cfs_rq->local_cand_est = se->bs_node.est; 1557 | + 1558 | + cfs_rq->curr = se; 1559 | + se->prev_sum_exec_runtime = se->sum_exec_runtime; 1560 | +} 1561 | + 1562 | +struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 1563 | +{ 1564 | + if (!cfs_rq->head) 1565 | + return NULL; 1566 | + 1567 | + return se_of(cfs_rq->head); 1568 | +} 1569 | + 1570 | +static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 1571 | +{ 1572 | + struct bs_node *bs_curr = &cfs_rq->curr->bs_node; 1573 | + 1574 | + /* 1575 | + * Here we avoid picking curr 1576 | + * while __pick_first_entity picks the 1577 | + * min since curr == NULL 1578 | + */ 1579 | + if (cfs_rq->head == bs_curr) { 1580 | + if (!cfs_rq->head->next) 1581 | + return NULL; 1582 | + 1583 | + return se_of(cfs_rq->head->next); 1584 | + } 1585 | + 1586 | + return se_of(cfs_rq->head); 1587 | +} 1588 | + 1589 | +static struct sched_entity* pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1590 | +{ 1591 | + if (!cfs_rq->head) { 1592 | + // need to switch to q2 1593 | + cfs_rq->head = cfs_rq->q2_head; 1594 | + cfs_rq->q2_head = NULL; 1595 | + } 1596 | + 1597 | + if (!cfs_rq->head) 1598 | + return NULL; 1599 | + 1600 | + if (!cfs_rq->curr) 1601 | + return __pick_first_entity(cfs_rq); 1602 | + 1603 | + return __pick_next_entity(cfs_rq); 1604 | +} 1605 | + 1606 | +struct task_struct * 1607 | +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 1608 | +{ 1609 | + struct cfs_rq *cfs_rq = &rq->cfs; 1610 | + struct sched_entity *se; 1611 | + struct task_struct *p; 1612 | + int new_tasks; 1613 | + 1614 | + /* 1615 | + * to cpu0, don't push any 1616 | + * candidates to this rq 1617 | + */ 1618 | + cfs_rq->local_cand_est = 0; 1619 | + clear_rq_candidate(cfs_rq); 1620 | + 1621 | +again: 1622 | + if (!sched_fair_runnable(rq)) 1623 | + goto idle; 1624 | + 1625 | + if (prev) 1626 | + put_prev_task(rq, prev); 1627 | + 1628 | + se = pick_next_entity(cfs_rq, NULL); 1629 | + set_next_entity(cfs_rq, se); 1630 | + 1631 | + p = task_of(se); 1632 | + 1633 | +done: __maybe_unused; 1634 | + if (hrtick_enabled_fair(rq)) 1635 | + hrtick_start_fair(rq, p); 1636 | + 1637 | + update_misfit_status(p, rq); 1638 | + 1639 | + return p; 1640 | + 1641 | +idle: 1642 | + cfs_rq->local_cand_est = MAX_EST; 1643 | + 1644 | + if (!rf) 1645 | + return NULL; 1646 | + 1647 | + new_tasks = newidle_balance(rq, rf); 1648 | + 1649 | + /* 1650 | + * Because newidle_balance() releases (and re-acquires) rq->lock, it is 1651 | + * possible for any higher priority task to appear. 
In that case we 1652 | + * must re-start the pick_next_entity() loop. 1653 | + */ 1654 | + if (new_tasks < 0) 1655 | + return RETRY_TASK; 1656 | + 1657 | + if (new_tasks > 0) 1658 | + goto again; 1659 | + 1660 | + /* 1661 | + * rq is about to be idle, check if we need to update the 1662 | + * lost_idle_time of clock_pelt 1663 | + */ 1664 | + update_idle_rq_clock_pelt(rq); 1665 | + 1666 | + return NULL; 1667 | +} 1668 | + 1669 | +static struct task_struct *__pick_next_task_fair(struct rq *rq) 1670 | +{ 1671 | + return pick_next_task_fair(rq, NULL, NULL); 1672 | +} 1673 | + 1674 | +#ifdef CONFIG_SMP 1675 | +static struct task_struct *pick_task_fair(struct rq *rq) 1676 | +{ 1677 | + struct sched_entity *se; 1678 | + struct cfs_rq *cfs_rq = &rq->cfs; 1679 | + struct sched_entity *curr = cfs_rq->curr; 1680 | + 1681 | + /* 1682 | + * to cpu0, don't push any 1683 | + * candidates to this rq 1684 | + */ 1685 | + cfs_rq->local_cand_est = 0; 1686 | + clear_rq_candidate(cfs_rq); 1687 | + 1688 | + if (!cfs_rq->nr_running) 1689 | + return NULL; 1690 | + 1691 | + /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ 1692 | + if (curr) { 1693 | + if (curr->on_rq) 1694 | + update_curr(cfs_rq); 1695 | + else 1696 | + curr = NULL; 1697 | + } 1698 | + 1699 | + se = pick_next_entity(cfs_rq, curr); 1700 | + 1701 | + return task_of(se); 1702 | +} 1703 | +#endif 1704 | + 1705 | +static void __enqueue_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 1706 | +{ 1707 | + if (se->yielded || entity_end_quota(cfs_rq, se)) { 1708 | + se->yielded = false; 1709 | + se->bs_node.r_vrt_start = 0; 1710 | + 1711 | + __enqueue_entity(&cfs_rq->q2_head, &se->bs_node); 1712 | + } else { 1713 | + __enqueue_entity(&cfs_rq->head, &se->bs_node); 1714 | + } 1715 | +} 1716 | + 1717 | +static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1718 | +{ 1719 | + /* 1720 | + * If still on the runqueue then deactivate_task() 1721 | + * was not called and update_curr() has to be done: 1722 | + */ 1723 | + if (prev->on_rq) { 1724 | + update_curr(cfs_rq); 1725 | + __enqueue_prev_entity(cfs_rq, prev); 1726 | + } 1727 | + 1728 | + update_est_entity(prev); 1729 | + 1730 | + cfs_rq->curr = NULL; 1731 | +} 1732 | + 1733 | +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) 1734 | +{ 1735 | + struct sched_entity *se = &prev->se; 1736 | + 1737 | + put_prev_entity(cfs_rq_of(se), se); 1738 | +} 1739 | + 1740 | +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) 1741 | +{ 1742 | + struct sched_entity *se = &p->se; 1743 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 1744 | + 1745 | + set_next_entity(cfs_rq, se); 1746 | +} 1747 | + 1748 | + 1749 | +static void 1750 | +entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) 1751 | +{ 1752 | + struct sched_entity *se; 1753 | + 1754 | + update_curr(cfs_rq); 1755 | + 1756 | +#ifdef CONFIG_SCHED_HRTICK 1757 | + /* 1758 | + * queued ticks are scheduled to match the slice, so don't bother 1759 | + * validating it and just reschedule. 
1760 | + */ 1761 | + if (queued) { 1762 | + resched_curr(rq_of(cfs_rq)); 1763 | + return; 1764 | + } 1765 | + 1766 | + if (cfs_rq->nr_running <= 1) { 1767 | + clear_rq_candidate(cfs_rq); 1768 | + } else { 1769 | + if (curr->yielded || entity_end_quota(cfs_rq, curr)) { 1770 | + resched_curr(rq_of(cfs_rq)); 1771 | + return; 1772 | + } 1773 | + 1774 | + se = __pick_first_entity(cfs_rq); 1775 | + if (!se) 1776 | + return; 1777 | + 1778 | + if (entity_before(&se->bs_node, &curr->bs_node) && entity_end_min_slice(curr)) { 1779 | + resched_curr(rq_of(cfs_rq)); 1780 | + return; 1781 | + } 1782 | + } 1783 | + 1784 | + /* 1785 | + * don't let the period tick interfere with the hrtick preemption 1786 | + */ 1787 | + if (!sched_feat(DOUBLE_TICK) && 1788 | + hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) 1789 | + return; 1790 | +#endif 1791 | +} 1792 | + 1793 | +#include "balancer.h" 1794 | + 1795 | +static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) 1796 | +{ 1797 | + struct sched_entity *se = &curr->se; 1798 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 1799 | + 1800 | + entity_tick(cfs_rq, se, queued); 1801 | + 1802 | + if (static_branch_unlikely(&sched_numa_balancing)) 1803 | + task_tick_numa(rq, curr); 1804 | + 1805 | + update_misfit_status(curr, rq); 1806 | + update_overutilized_status(task_rq(curr)); 1807 | +} 1808 | + 1809 | +static void task_fork_fair(struct task_struct *p) 1810 | +{ 1811 | + struct cfs_rq *cfs_rq; 1812 | + struct sched_entity *curr; 1813 | + struct rq *rq = this_rq(); 1814 | + struct rq_flags rf; 1815 | + 1816 | + rq_lock(rq, &rf); 1817 | + update_rq_clock(rq); 1818 | + 1819 | + cfs_rq = task_cfs_rq(current); 1820 | + curr = cfs_rq->curr; 1821 | + if (curr) 1822 | + update_curr(cfs_rq); 1823 | + 1824 | + rq_unlock(rq, &rf); 1825 | +} 1826 | + 1827 | +/* 1828 | + * All the scheduling class methods: 1829 | + */ 1830 | +DEFINE_SCHED_CLASS(fair) = { 1831 | + 1832 | + .enqueue_task = enqueue_task_fair, 1833 | + .dequeue_task = dequeue_task_fair, 1834 | + .yield_task = yield_task_fair, 1835 | + .yield_to_task = yield_to_task_fair, 1836 | + 1837 | + .wakeup_preempt = check_preempt_wakeup_fair, 1838 | + 1839 | + .pick_next_task = __pick_next_task_fair, 1840 | + .put_prev_task = put_prev_task_fair, 1841 | + .set_next_task = set_next_task_fair, 1842 | + 1843 | +#ifdef CONFIG_SMP 1844 | + .balance = balance_fair, 1845 | + .pick_task = pick_task_fair, 1846 | + .select_task_rq = select_task_rq_fair, 1847 | + .migrate_task_rq = migrate_task_rq_fair, 1848 | + 1849 | + .rq_online = rq_online_fair, 1850 | + .rq_offline = rq_offline_fair, 1851 | + 1852 | + .task_dead = task_dead_fair, 1853 | + .set_cpus_allowed = set_cpus_allowed_common, 1854 | +#endif 1855 | + 1856 | + .task_tick = task_tick_fair, 1857 | + .task_fork = task_fork_fair, 1858 | + 1859 | + .prio_changed = prio_changed_fair, 1860 | + .switched_from = switched_from_fair, 1861 | + .switched_to = switched_to_fair, 1862 | + 1863 | + .get_rr_interval = get_rr_interval_fair, 1864 | + 1865 | + .update_curr = update_curr_fair, 1866 | +}; 1867 | + 1868 | +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 1869 | + unsigned long weight) 1870 | +{ 1871 | + bool curr = cfs_rq->curr == se; 1872 | + 1873 | + if (se->on_rq) { 1874 | + /* commit outstanding execution time */ 1875 | + if (curr) 1876 | + update_curr(cfs_rq); 1877 | + 1878 | + update_load_sub(&cfs_rq->load, se->load.weight); 1879 | + } 1880 | + dequeue_load_avg(cfs_rq, se); 1881 | + 1882 | + update_load_set(&se->load, weight); 1883 | 
+ 1884 | +#ifdef CONFIG_SMP 1885 | + do { 1886 | + u32 divider = get_pelt_divider(&se->avg); 1887 | + 1888 | + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); 1889 | + } while (0); 1890 | +#endif 1891 | + 1892 | + enqueue_load_avg(cfs_rq, se); 1893 | + if (se->on_rq) 1894 | + update_load_add(&cfs_rq->load, se->load.weight); 1895 | +} 1896 | + 1897 | +void reweight_task(struct task_struct *p, int prio) 1898 | +{ 1899 | + struct sched_entity *se = &p->se; 1900 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 1901 | + struct load_weight *load = &se->load; 1902 | + unsigned long weight = scale_load(sched_prio_to_weight[prio]); 1903 | + 1904 | + reweight_entity(cfs_rq, se, weight); 1905 | + load->inv_weight = sched_prio_to_wmult[prio]; 1906 | +} 1907 | + 1908 | +/* Working cpumask for: load_balance, load_balance_newidle. */ 1909 | +static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); 1910 | +static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); 1911 | +static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); 1912 | + 1913 | +__init void init_sched_fair_class(void) 1914 | +{ 1915 | +#ifdef CONFIG_SMP 1916 | + int i; 1917 | + 1918 | + for_each_possible_cpu(i) { 1919 | + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); 1920 | + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); 1921 | + zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i), 1922 | + GFP_KERNEL, cpu_to_node(i)); 1923 | + } 1924 | + 1925 | + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 1926 | + 1927 | +#ifdef CONFIG_NO_HZ_COMMON 1928 | + nohz.next_balance = jiffies; 1929 | + nohz.next_blocked = jiffies; 1930 | + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 1931 | +#endif 1932 | +#endif /* SMP */ 1933 | + 1934 | +} 1935 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 1936 | index 9116bcc90346..f8f5ad0d2f90 100644 1937 | --- a/kernel/sched/core.c 1938 | +++ b/kernel/sched/core.c 1939 | @@ -4525,6 +4525,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 1940 | p->se.vruntime = 0; 1941 | p->se.vlag = 0; 1942 | p->se.slice = sysctl_sched_base_slice; 1943 | + 1944 | +#ifdef CONFIG_ECHO_SCHED 1945 | + p->se.bs_node.vburst = 0; 1946 | + p->se.bs_node.est = 0; 1947 | +#endif 1948 | + 1949 | INIT_LIST_HEAD(&p->se.group_node); 1950 | 1951 | #ifdef CONFIG_FAIR_GROUP_SCHED 1952 | @@ -4687,6 +4693,15 @@ static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, 1953 | 1954 | #ifdef CONFIG_SYSCTL 1955 | static struct ctl_table sched_core_sysctls[] = { 1956 | +#ifdef CONFIG_ECHO_SCHED 1957 | + { 1958 | + .procname = "sched_bs_shared_quota", 1959 | + .data = &bs_shared_quota, 1960 | + .maxlen = sizeof(unsigned int), 1961 | + .mode = 0644, 1962 | + .proc_handler = proc_dointvec, 1963 | + }, 1964 | +#endif 1965 | #ifdef CONFIG_SCHEDSTATS 1966 | { 1967 | .procname = "sched_schedstats", 1968 | @@ -9912,6 +9927,10 @@ void __init sched_init(void) 1969 | 1970 | wait_bit_init(); 1971 | 1972 | +#ifdef CONFIG_ECHO_SCHED 1973 | + printk(KERN_INFO "ECHO CPU scheduler v6.8 by Hamad Al Marri."); 1974 | +#endif 1975 | + 1976 | #ifdef CONFIG_FAIR_GROUP_SCHED 1977 | ptr += 2 * nr_cpu_ids * sizeof(void **); 1978 | #endif 1979 | diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c 1980 | index 8d5d98a5834d..ec7d41bc6d44 100644 1981 | --- a/kernel/sched/debug.c 1982 | +++ b/kernel/sched/debug.c 1983 | @@ -1003,6 +1003,11 @@ void proc_sched_show_task(struct task_struct *p, struct 
pid_namespace *ns, 1984 | PN(se.exec_start); 1985 | PN(se.vruntime); 1986 | PN(se.sum_exec_runtime); 1987 | +#ifdef CONFIG_ECHO_SCHED 1988 | + PN(se.bs_node.vburst); 1989 | + PN(se.bs_node.prev_vburst); 1990 | + PN(se.bs_node.est); 1991 | +#endif 1992 | 1993 | nr_switches = p->nvcsw + p->nivcsw; 1994 | 1995 | diff --git a/kernel/sched/fair_debug.h b/kernel/sched/fair_debug.h 1996 | new file mode 100644 1997 | index 000000000000..2778cf58000f 1998 | --- /dev/null 1999 | +++ b/kernel/sched/fair_debug.h 2000 | @@ -0,0 +1,137 @@ 2001 | +#ifdef CONFIG_SCHED_DEBUG 2002 | +/* 2003 | + * The initial- and re-scaling of tunables is configurable 2004 | + * 2005 | + * Options are: 2006 | + * 2007 | + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 2008 | + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) 2009 | + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus 2010 | + * 2011 | + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) 2012 | + */ 2013 | +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; 2014 | +static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; 2015 | + 2016 | +struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) 2017 | +{ 2018 | + return NULL; 2019 | +} 2020 | + 2021 | +struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 2022 | +{ 2023 | + return NULL; 2024 | +} 2025 | + 2026 | +static unsigned int get_update_sysctl_factor(void) 2027 | +{ 2028 | + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); 2029 | + unsigned int factor; 2030 | + 2031 | + switch (sysctl_sched_tunable_scaling) { 2032 | + case SCHED_TUNABLESCALING_NONE: 2033 | + factor = 1; 2034 | + break; 2035 | + case SCHED_TUNABLESCALING_LINEAR: 2036 | + factor = cpus; 2037 | + break; 2038 | + case SCHED_TUNABLESCALING_LOG: 2039 | + default: 2040 | + factor = 1 + ilog2(cpus); 2041 | + break; 2042 | + } 2043 | + 2044 | + return factor; 2045 | +} 2046 | + 2047 | +/************************************************************** 2048 | + * Scheduling class statistics methods: 2049 | + */ 2050 | +#ifdef CONFIG_SMP 2051 | +int sched_update_scaling(void) 2052 | +{ 2053 | + unsigned int factor = get_update_sysctl_factor(); 2054 | + 2055 | +#define WRT_SYSCTL(name) \ 2056 | + (normalized_sysctl_##name = sysctl_##name / (factor)) 2057 | + WRT_SYSCTL(sched_base_slice); 2058 | +#undef WRT_SYSCTL 2059 | + 2060 | + return 0; 2061 | +} 2062 | +#endif 2063 | + 2064 | +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) 2065 | +{ 2066 | + return se->vruntime < 750000ULL; 2067 | +} 2068 | + 2069 | +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ 2070 | + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) 2071 | + 2072 | +void print_cfs_stats(struct seq_file *m, int cpu) 2073 | +{ 2074 | + struct cfs_rq *cfs_rq, *pos; 2075 | + 2076 | + rcu_read_lock(); 2077 | + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) 2078 | + print_cfs_rq(m, cpu, cfs_rq); 2079 | + rcu_read_unlock(); 2080 | +} 2081 | + 2082 | +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 2083 | +{ 2084 | + return (s64)se->vruntime; 2085 | +} 2086 | + 2087 | +/* 2088 | + * Specifically: avg_runtime() + 0 must result in entity_eligible() := true 2089 | + * For this to be so, the result of this function must have a left bias. 
2090 | + */ 2091 | +u64 avg_vruntime(struct cfs_rq *cfs_rq) 2092 | +{ 2093 | + struct sched_entity *curr = cfs_rq->curr; 2094 | + s64 avg = cfs_rq->avg_vruntime; 2095 | + long load = cfs_rq->avg_load; 2096 | + 2097 | + if (curr && curr->on_rq) { 2098 | + unsigned long weight = scale_load_down(curr->load.weight); 2099 | + 2100 | + avg += entity_key(cfs_rq, curr) * weight; 2101 | + load += weight; 2102 | + } 2103 | + 2104 | + if (load) { 2105 | + /* sign flips effective floor / ceil */ 2106 | + if (avg < 0) 2107 | + avg -= (load - 1); 2108 | + avg = div_s64(avg, load); 2109 | + } 2110 | + 2111 | + return avg; 2112 | +} 2113 | + 2114 | +#ifdef CONFIG_NUMA_BALANCING 2115 | +void show_numa_stats(struct task_struct *p, struct seq_file *m) 2116 | +{ 2117 | + int node; 2118 | + unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; 2119 | + struct numa_group *ng; 2120 | + 2121 | + rcu_read_lock(); 2122 | + ng = rcu_dereference(p->numa_group); 2123 | + for_each_online_node(node) { 2124 | + if (p->numa_faults) { 2125 | + tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; 2126 | + tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; 2127 | + } 2128 | + if (ng) { 2129 | + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], 2130 | + gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; 2131 | + } 2132 | + print_numa_stats(m, node, tsf, tpf, gsf, gpf); 2133 | + } 2134 | + rcu_read_unlock(); 2135 | +} 2136 | +#endif // CONFIG_NUMA_BALANCING 2137 | +#endif // CONFIG_SCHED_DEBUG 2138 | diff --git a/kernel/sched/fair_dep_funcs.h b/kernel/sched/fair_dep_funcs.h 2139 | new file mode 100644 2140 | index 000000000000..d4411cded78b 2141 | --- /dev/null 2142 | +++ b/kernel/sched/fair_dep_funcs.h 2143 | @@ -0,0 +1,828 @@ 2144 | +/* 2145 | + * Used by other classes to account runtime. 2146 | + */ 2147 | +s64 update_curr_common(struct rq *rq) 2148 | +{ 2149 | + struct sched_entity *curr = &rq->curr->se; 2150 | + struct task_struct *curtask = task_of(curr); 2151 | + u64 now = rq_clock_task(rq); 2152 | + s64 delta_exec; 2153 | + 2154 | + if (unlikely(!curr)) 2155 | + return 0; 2156 | + 2157 | + delta_exec = now - curr->exec_start; 2158 | + if (unlikely(delta_exec <= 0)) 2159 | + return delta_exec; 2160 | + 2161 | + curr->exec_start = now; 2162 | + curr->sum_exec_runtime += delta_exec; 2163 | + 2164 | + if (schedstat_enabled()) { 2165 | + struct sched_statistics *stats; 2166 | + 2167 | + stats = __schedstats_from_se(curr); 2168 | + __schedstat_set(stats->exec_max, 2169 | + max(delta_exec, stats->exec_max)); 2170 | + } 2171 | + 2172 | + trace_sched_stat_runtime(curtask, delta_exec); 2173 | + account_group_exec_runtime(curtask, delta_exec); 2174 | + cgroup_account_cputime(curtask, delta_exec); 2175 | + if (curtask->dl_server) 2176 | + dl_server_update(curtask->dl_server, delta_exec); 2177 | + 2178 | + return delta_exec; 2179 | +} 2180 | + 2181 | +#if defined(CONFIG_NO_HZ_FULL) && defined(CONFIG_CGROUP_SCHED) 2182 | +bool cfs_task_bw_constrained(struct task_struct *p) 2183 | +{ 2184 | + return false; 2185 | +} 2186 | +#endif 2187 | + 2188 | +/* 2189 | + * After fork, child runs first. If set to 0 (default) then 2190 | + * parent will (try to) run first. 
2191 | + */ 2192 | +unsigned int sysctl_sched_child_runs_first __read_mostly; 2193 | + 2194 | +const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 2195 | + 2196 | +void __init sched_init_granularity(void) {} 2197 | + 2198 | +#ifdef CONFIG_SMP 2199 | +/* Give new sched_entity start runnable values to heavy its load in infant time */ 2200 | +void init_entity_runnable_average(struct sched_entity *se) {} 2201 | +void post_init_entity_util_avg(struct task_struct *p) {} 2202 | +void update_max_interval(void) {} 2203 | +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); 2204 | +#endif /** CONFIG_SMP */ 2205 | + 2206 | +void init_cfs_rq(struct cfs_rq *cfs_rq) 2207 | +{ 2208 | + cfs_rq->tasks_timeline = RB_ROOT_CACHED; 2209 | +#ifdef CONFIG_SMP 2210 | + raw_spin_lock_init(&cfs_rq->removed.lock); 2211 | +#endif 2212 | +} 2213 | + 2214 | +static inline struct sched_entity *se_of(struct bs_node *bsn) 2215 | +{ 2216 | + return container_of(bsn, struct sched_entity, bs_node); 2217 | +} 2218 | + 2219 | +#ifdef CONFIG_SCHED_SMT 2220 | +DEFINE_STATIC_KEY_FALSE(sched_smt_present); 2221 | +EXPORT_SYMBOL_GPL(sched_smt_present); 2222 | + 2223 | +static inline void set_idle_cores(int cpu, int val) 2224 | +{ 2225 | + struct sched_domain_shared *sds; 2226 | + 2227 | + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 2228 | + if (sds) 2229 | + WRITE_ONCE(sds->has_idle_cores, val); 2230 | +} 2231 | + 2232 | +static inline bool test_idle_cores(int cpu) 2233 | +{ 2234 | + struct sched_domain_shared *sds; 2235 | + 2236 | + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 2237 | + if (sds) 2238 | + return READ_ONCE(sds->has_idle_cores); 2239 | + 2240 | + return false; 2241 | +} 2242 | + 2243 | +void __update_idle_core(struct rq *rq) 2244 | +{ 2245 | + int core = cpu_of(rq); 2246 | + int cpu; 2247 | + 2248 | + rcu_read_lock(); 2249 | + if (test_idle_cores(core)) 2250 | + goto unlock; 2251 | + 2252 | + for_each_cpu(cpu, cpu_smt_mask(core)) { 2253 | + if (cpu == core) 2254 | + continue; 2255 | + 2256 | + if (!available_idle_cpu(cpu)) 2257 | + goto unlock; 2258 | + } 2259 | + 2260 | + set_idle_cores(core, 1); 2261 | +unlock: 2262 | + rcu_read_unlock(); 2263 | +} 2264 | +#endif 2265 | + 2266 | +static inline void update_load_add(struct load_weight *lw, unsigned long inc) 2267 | +{ 2268 | + lw->weight += inc; 2269 | + lw->inv_weight = 0; 2270 | +} 2271 | + 2272 | +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 2273 | +{ 2274 | + lw->weight -= dec; 2275 | + lw->inv_weight = 0; 2276 | +} 2277 | + 2278 | +static inline void update_load_set(struct load_weight *lw, unsigned long w) 2279 | +{ 2280 | + lw->weight = w; 2281 | + lw->inv_weight = 0; 2282 | +} 2283 | + 2284 | +static int se_is_idle(struct sched_entity *se) 2285 | +{ 2286 | + return task_has_idle_policy(task_of(se)); 2287 | +} 2288 | + 2289 | +static void 2290 | +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2291 | +{ 2292 | + update_load_add(&cfs_rq->load, se->load.weight); 2293 | +#ifdef CONFIG_SMP 2294 | + struct rq *rq = rq_of(cfs_rq); 2295 | + 2296 | + account_numa_enqueue(rq, task_of(se)); 2297 | + list_add(&se->group_node, &rq->cfs_tasks); 2298 | +#endif 2299 | + cfs_rq->nr_running++; 2300 | + if (se_is_idle(se)) 2301 | + cfs_rq->idle_nr_running++; 2302 | +} 2303 | + 2304 | +static void 2305 | +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2306 | +{ 2307 | + update_load_sub(&cfs_rq->load, se->load.weight); 2308 | +#ifdef CONFIG_SMP 2309 | + 
account_numa_dequeue(rq_of(cfs_rq), task_of(se)); 2310 | + list_del_init(&se->group_node); 2311 | +#endif 2312 | + cfs_rq->nr_running--; 2313 | + if (se_is_idle(se)) 2314 | + cfs_rq->idle_nr_running--; 2315 | +} 2316 | + 2317 | +/* 2318 | + * Task first catches up with cfs_rq, and then subtract 2319 | + * itself from the cfs_rq (task must be off the queue now). 2320 | + */ 2321 | +static void remove_entity_load_avg(struct sched_entity *se) 2322 | +{ 2323 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 2324 | + unsigned long flags; 2325 | + 2326 | + raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); 2327 | + ++cfs_rq->removed.nr; 2328 | + cfs_rq->removed.util_avg += se->avg.util_avg; 2329 | + cfs_rq->removed.load_avg += se->avg.load_avg; 2330 | + cfs_rq->removed.runnable_avg += se->avg.runnable_avg; 2331 | + raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); 2332 | +} 2333 | + 2334 | +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) 2335 | +{ 2336 | + struct sched_entity *se = &p->se; 2337 | + 2338 | + /* Tell new CPU we are migrated */ 2339 | + se->avg.last_update_time = 0; 2340 | + 2341 | + p->se.yielded = false; 2342 | + 2343 | + update_scan_period(p, new_cpu); 2344 | +} 2345 | + 2346 | +static void rq_online_fair(struct rq *rq) {} 2347 | + 2348 | +static void rq_offline_fair(struct rq *rq) {} 2349 | + 2350 | +static void task_dead_fair(struct task_struct *p) 2351 | +{ 2352 | + remove_entity_load_avg(&p->se); 2353 | +} 2354 | + 2355 | +static void 2356 | +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 2357 | +{ 2358 | + if (!task_on_rq_queued(p)) 2359 | + return; 2360 | + 2361 | + if (rq->cfs.nr_running == 1) 2362 | + return; 2363 | + 2364 | + /* 2365 | + * Reschedule if we are currently running on this runqueue and 2366 | + * our priority decreased, or if we are not currently running on 2367 | + * this runqueue and our priority is higher than the current's 2368 | + */ 2369 | + if (task_current(rq, p)) { 2370 | + if (p->prio > oldprio) 2371 | + resched_curr(rq); 2372 | + } else 2373 | + wakeup_preempt(rq, p, 0); 2374 | +} 2375 | + 2376 | +static void switched_from_fair(struct rq *rq, struct task_struct *p) {} 2377 | + 2378 | +static void switched_to_fair(struct rq *rq, struct task_struct *p) 2379 | +{ 2380 | + if (task_on_rq_queued(p)) { 2381 | + /* 2382 | + * We were most likely switched from sched_rt, so 2383 | + * kick off the schedule if running, otherwise just see 2384 | + * if we can still preempt the current task. 2385 | + */ 2386 | + if (task_current(rq, p)) 2387 | + resched_curr(rq); 2388 | + else 2389 | + wakeup_preempt(rq, p, 0); 2390 | + } 2391 | +} 2392 | + 2393 | +static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 2394 | +{ 2395 | + struct sched_entity *se = &task->se; 2396 | + unsigned int rr_interval = 0; 2397 | + 2398 | + /* 2399 | + * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 2400 | + * idle runqueue: 2401 | + */ 2402 | + if (rq->cfs.load.weight) 2403 | + rr_interval = NS_TO_JIFFIES(se->slice); 2404 | + 2405 | + return rr_interval; 2406 | +} 2407 | + 2408 | +/* 2409 | + * Remove and clamp on negative, from a local variable. 2410 | + * 2411 | + * A variant of sub_positive(), which does not use explicit load-store 2412 | + * and is thus optimized for local variable updates. 
2413 | + */ 2414 | +#define lsub_positive(_ptr, _val) do { \ 2415 | + typeof(_ptr) ptr = (_ptr); \ 2416 | + *ptr -= min_t(typeof(*ptr), *ptr, _val); \ 2417 | +} while (0) 2418 | + 2419 | +static inline unsigned long task_util(struct task_struct *p) 2420 | +{ 2421 | + return READ_ONCE(p->se.avg.util_avg); 2422 | +} 2423 | + 2424 | +static inline unsigned long _task_util_est(struct task_struct *p) 2425 | +{ 2426 | + return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED; 2427 | +} 2428 | + 2429 | +static unsigned long 2430 | +cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) 2431 | +{ 2432 | + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; 2433 | + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); 2434 | + unsigned long runnable; 2435 | + 2436 | + if (boost) { 2437 | + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); 2438 | + util = max(util, runnable); 2439 | + } 2440 | + 2441 | + /* 2442 | + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its 2443 | + * contribution. If @p migrates from another CPU to @cpu add its 2444 | + * contribution. In all the other cases @cpu is not impacted by the 2445 | + * migration so its util_avg is already correct. 2446 | + */ 2447 | + if (p && task_cpu(p) == cpu && dst_cpu != cpu) 2448 | + lsub_positive(&util, task_util(p)); 2449 | + else if (p && task_cpu(p) != cpu && dst_cpu == cpu) 2450 | + util += task_util(p); 2451 | + 2452 | + if (sched_feat(UTIL_EST)) { 2453 | + unsigned long util_est; 2454 | + 2455 | + util_est = READ_ONCE(cfs_rq->avg.util_est); 2456 | + 2457 | + /* 2458 | + * During wake-up @p isn't enqueued yet and doesn't contribute 2459 | + * to any cpu_rq(cpu)->cfs.avg.util_est. 2460 | + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p 2461 | + * has been enqueued. 2462 | + * 2463 | + * During exec (@dst_cpu = -1) @p is enqueued and does 2464 | + * contribute to cpu_rq(cpu)->cfs.util_est. 2465 | + * Remove it to "simulate" cpu_util without @p's contribution. 2466 | + * 2467 | + * Despite the task_on_rq_queued(@p) check there is still a 2468 | + * small window for a possible race when an exec 2469 | + * select_task_rq_fair() races with LB's detach_task(). 2470 | + * 2471 | + * detach_task() 2472 | + * deactivate_task() 2473 | + * p->on_rq = TASK_ON_RQ_MIGRATING; 2474 | + * -------------------------------- A 2475 | + * dequeue_task() \ 2476 | + * dequeue_task_fair() + Race Time 2477 | + * util_est_dequeue() / 2478 | + * -------------------------------- B 2479 | + * 2480 | + * The additional check "current == p" is required to further 2481 | + * reduce the race window. 
2482 | + */ 2483 | + if (dst_cpu == cpu) 2484 | + util_est += _task_util_est(p); 2485 | + else if (p && unlikely(task_on_rq_queued(p) || current == p)) 2486 | + lsub_positive(&util_est, _task_util_est(p)); 2487 | + 2488 | + util = max(util, util_est); 2489 | + } 2490 | + 2491 | + return min(util, arch_scale_cpu_capacity(cpu)); 2492 | +} 2493 | + 2494 | +unsigned long cpu_util_cfs(int cpu) 2495 | +{ 2496 | + return cpu_util(cpu, NULL, -1, 0); 2497 | +} 2498 | + 2499 | +unsigned long cpu_util_cfs_boost(int cpu) 2500 | +{ 2501 | + return cpu_util(cpu, NULL, -1, 1); 2502 | +} 2503 | + 2504 | +#define WMULT_CONST (~0U) 2505 | +#define WMULT_SHIFT 32 2506 | + 2507 | +static void __update_inv_weight(struct load_weight *lw) 2508 | +{ 2509 | + unsigned long w; 2510 | + 2511 | + if (likely(lw->inv_weight)) 2512 | + return; 2513 | + 2514 | + w = scale_load_down(lw->weight); 2515 | + 2516 | + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 2517 | + lw->inv_weight = 1; 2518 | + else if (unlikely(!w)) 2519 | + lw->inv_weight = WMULT_CONST; 2520 | + else 2521 | + lw->inv_weight = WMULT_CONST / w; 2522 | +} 2523 | + 2524 | +/* 2525 | + * delta_exec * weight / lw.weight 2526 | + * OR 2527 | + * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT 2528 | + * 2529 | + * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case 2530 | + * we're guaranteed shift stays positive because inv_weight is guaranteed to 2531 | + * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. 2532 | + * 2533 | + * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus 2534 | + * weight/lw.weight <= 1, and therefore our shift will also be positive. 2535 | + */ 2536 | +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) 2537 | +{ 2538 | + u64 fact = scale_load_down(weight); 2539 | + u32 fact_hi = (u32)(fact >> 32); 2540 | + int shift = WMULT_SHIFT; 2541 | + int fs; 2542 | + 2543 | + __update_inv_weight(lw); 2544 | + 2545 | + if (unlikely(fact_hi)) { 2546 | + fs = fls(fact_hi); 2547 | + shift -= fs; 2548 | + fact >>= fs; 2549 | + } 2550 | + 2551 | + fact = mul_u32_u32(fact, lw->inv_weight); 2552 | + 2553 | + fact_hi = (u32)(fact >> 32); 2554 | + if (fact_hi) { 2555 | + fs = fls(fact_hi); 2556 | + shift -= fs; 2557 | + fact >>= fs; 2558 | + } 2559 | + 2560 | + return mul_u64_u32_shr(delta_exec, fact, shift); 2561 | +} 2562 | + 2563 | +/* 2564 | + * delta /= w 2565 | + */ 2566 | +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) 2567 | +{ 2568 | + if (unlikely(se->load.weight != NICE_0_LOAD)) 2569 | + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); 2570 | + 2571 | + return delta; 2572 | +} 2573 | + 2574 | +static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) 2575 | +{ 2576 | + unsigned int n = cfs_rq->nr_running; 2577 | + 2578 | + if (n <= 1) 2579 | + se->slice = bs_shared_quota; 2580 | + else 2581 | + se->slice = max(bs_shared_quota / n, sysctl_sched_base_slice); 2582 | +} 2583 | + 2584 | +#ifdef CONFIG_SCHED_HRTICK 2585 | +static void hrtick_start_fair(struct rq *rq, struct task_struct *p) 2586 | +{ 2587 | + struct sched_entity *se = &p->se; 2588 | + 2589 | + SCHED_WARN_ON(task_rq(p) != rq); 2590 | + 2591 | + if (rq->cfs.h_nr_running > 1) { 2592 | + u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 2593 | + u64 slice = se->slice; 2594 | + s64 delta = slice - ran; 2595 | + 2596 | + if (se->yielded || delta < 0) { 2597 | + if (task_current(rq, p)) 2598 | + resched_curr(rq); 
2599 | + return; 2600 | + } 2601 | + hrtick_start(rq, delta); 2602 | + } 2603 | +} 2604 | + 2605 | +/* 2606 | + * called from enqueue/dequeue and updates the hrtick when the 2607 | + * current task is from our class and nr_running is low enough 2608 | + * to matter. 2609 | + */ 2610 | +static void hrtick_update(struct rq *rq) 2611 | +{ 2612 | + struct task_struct *curr = rq->curr; 2613 | + 2614 | + if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) 2615 | + return; 2616 | + 2617 | + hrtick_start_fair(rq, curr); 2618 | +} 2619 | +#else /* !CONFIG_SCHED_HRTICK */ 2620 | +static inline void 2621 | +hrtick_start_fair(struct rq *rq, struct task_struct *p) 2622 | +{ 2623 | +} 2624 | + 2625 | +static inline void hrtick_update(struct rq *rq) 2626 | +{ 2627 | +} 2628 | +#endif 2629 | + 2630 | +/* 2631 | + * The margin used when comparing utilization with CPU capacity. 2632 | + * 2633 | + * (default: ~20%) 2634 | + */ 2635 | +#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) 2636 | + 2637 | +static inline int util_fits_cpu(unsigned long util, 2638 | + unsigned long uclamp_min, 2639 | + unsigned long uclamp_max, 2640 | + int cpu) 2641 | +{ 2642 | + unsigned long capacity_orig, capacity_orig_thermal; 2643 | + unsigned long capacity = capacity_of(cpu); 2644 | + bool fits, uclamp_max_fits; 2645 | + 2646 | + /* 2647 | + * Check if the real util fits without any uclamp boost/cap applied. 2648 | + */ 2649 | + fits = fits_capacity(util, capacity); 2650 | + 2651 | + if (!uclamp_is_used()) 2652 | + return fits; 2653 | + 2654 | + /* 2655 | + * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and 2656 | + * uclamp_max. We only care about capacity pressure (by using 2657 | + * capacity_of()) for comparing against the real util. 2658 | + * 2659 | + * If a task is boosted to 1024 for example, we don't want a tiny 2660 | + * pressure to skew the check whether it fits a CPU or not. 2661 | + * 2662 | + * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it 2663 | + * should fit a little cpu even if there's some pressure. 2664 | + * 2665 | + * Only exception is for thermal pressure since it has a direct impact 2666 | + * on available OPP of the system. 2667 | + * 2668 | + * We honour it for uclamp_min only as a drop in performance level 2669 | + * could result in not getting the requested minimum performance level. 2670 | + * 2671 | + * For uclamp_max, we can tolerate a drop in performance level as the 2672 | + * goal is to cap the task. So it's okay if it's getting less. 2673 | + */ 2674 | + capacity_orig = arch_scale_cpu_capacity(cpu); 2675 | + capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); 2676 | + 2677 | + /* 2678 | + * We want to force a task to fit a cpu as implied by uclamp_max. 2679 | + * But we do have some corner cases to cater for.. 
2680 | + * 2681 | + * 2682 | + * C=z 2683 | + * | ___ 2684 | + * | C=y | | 2685 | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max 2686 | + * | C=x | | | | 2687 | + * | ___ | | | | 2688 | + * | | | | | | | (util somewhere in this region) 2689 | + * | | | | | | | 2690 | + * | | | | | | | 2691 | + * +---------------------------------------- 2692 | + * cpu0 cpu1 cpu2 2693 | + * 2694 | + * In the above example if a task is capped to a specific performance 2695 | + * point, y, then when: 2696 | + * 2697 | + * * util = 80% of x then it does not fit on cpu0 and should migrate 2698 | + * to cpu1 2699 | + * * util = 80% of y then it is forced to fit on cpu1 to honour 2700 | + * uclamp_max request. 2701 | + * 2702 | + * which is what we're enforcing here. A task always fits if 2703 | + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig, 2704 | + * the normal upmigration rules should withhold still. 2705 | + * 2706 | + * Only exception is when we are on max capacity, then we need to be 2707 | + * careful not to block overutilized state. This is so because: 2708 | + * 2709 | + * 1. There's no concept of capping at max_capacity! We can't go 2710 | + * beyond this performance level anyway. 2711 | + * 2. The system is being saturated when we're operating near 2712 | + * max capacity, it doesn't make sense to block overutilized. 2713 | + */ 2714 | + uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE); 2715 | + uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig); 2716 | + fits = fits || uclamp_max_fits; 2717 | + 2718 | + /* 2719 | + * 2720 | + * C=z 2721 | + * | ___ (region a, capped, util >= uclamp_max) 2722 | + * | C=y | | 2723 | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max 2724 | + * | C=x | | | | 2725 | + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max) 2726 | + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min 2727 | + * | | | | | | | 2728 | + * | | | | | | | (region c, boosted, util < uclamp_min) 2729 | + * +---------------------------------------- 2730 | + * cpu0 cpu1 cpu2 2731 | + * 2732 | + * a) If util > uclamp_max, then we're capped, we don't care about 2733 | + * actual fitness value here. We only care if uclamp_max fits 2734 | + * capacity without taking margin/pressure into account. 2735 | + * See comment above. 2736 | + * 2737 | + * b) If uclamp_min <= util <= uclamp_max, then the normal 2738 | + * fits_capacity() rules apply. Except we need to ensure that we 2739 | + * enforce we remain within uclamp_max, see comment above. 2740 | + * 2741 | + * c) If util < uclamp_min, then we are boosted. Same as (b) but we 2742 | + * need to take into account the boosted value fits the CPU without 2743 | + * taking margin/pressure into account. 2744 | + * 2745 | + * Cases (a) and (b) are handled in the 'fits' variable already. We 2746 | + * just need to consider an extra check for case (c) after ensuring we 2747 | + * handle the case uclamp_min > uclamp_max. 
2748 | + */ 2749 | + uclamp_min = min(uclamp_min, uclamp_max); 2750 | + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) 2751 | + return -1; 2752 | + 2753 | + return fits; 2754 | +} 2755 | + 2756 | +static inline bool cpu_overutilized(int cpu) 2757 | +{ 2758 | + unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); 2759 | + unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); 2760 | + 2761 | + /* Return true only if the utilization doesn't fit CPU's capacity */ 2762 | + return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); 2763 | +} 2764 | + 2765 | +static inline void update_overutilized_status(struct rq *rq) 2766 | +{ 2767 | + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { 2768 | + WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); 2769 | + trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); 2770 | + } 2771 | +} 2772 | + 2773 | +static inline unsigned long task_util_est(struct task_struct *p) 2774 | +{ 2775 | + return max(task_util(p), _task_util_est(p)); 2776 | +} 2777 | + 2778 | +static inline void util_est_enqueue(struct cfs_rq *cfs_rq, 2779 | + struct task_struct *p) 2780 | +{ 2781 | + unsigned int enqueued; 2782 | + 2783 | + if (!sched_feat(UTIL_EST)) 2784 | + return; 2785 | + 2786 | + /* Update root cfs_rq's estimated utilization */ 2787 | + enqueued = cfs_rq->avg.util_est; 2788 | + enqueued += _task_util_est(p); 2789 | + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); 2790 | + 2791 | + trace_sched_util_est_cfs_tp(cfs_rq); 2792 | +} 2793 | + 2794 | +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, 2795 | + struct task_struct *p) 2796 | +{ 2797 | + unsigned int enqueued; 2798 | + 2799 | + if (!sched_feat(UTIL_EST)) 2800 | + return; 2801 | + 2802 | + /* Update root cfs_rq's estimated utilization */ 2803 | + enqueued = cfs_rq->avg.util_est; 2804 | + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); 2805 | + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); 2806 | + 2807 | + trace_sched_util_est_cfs_tp(cfs_rq); 2808 | +} 2809 | + 2810 | +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) 2811 | + 2812 | +static inline unsigned long task_runnable(struct task_struct *p) 2813 | +{ 2814 | + return READ_ONCE(p->se.avg.runnable_avg); 2815 | +} 2816 | + 2817 | +static inline void util_est_update(struct cfs_rq *cfs_rq, 2818 | + struct task_struct *p, 2819 | + bool task_sleep) 2820 | +{ 2821 | + unsigned int ewma, dequeued, last_ewma_diff; 2822 | + 2823 | + if (!sched_feat(UTIL_EST)) 2824 | + return; 2825 | + 2826 | + /* 2827 | + * Skip update of task's estimated utilization when the task has not 2828 | + * yet completed an activation, e.g. being migrated. 2829 | + */ 2830 | + if (!task_sleep) 2831 | + return; 2832 | + 2833 | + /* Get current estimate of utilization */ 2834 | + ewma = READ_ONCE(p->se.avg.util_est); 2835 | + 2836 | + /* 2837 | + * If the PELT values haven't changed since enqueue time, 2838 | + * skip the util_est update. 2839 | + */ 2840 | + if (ewma & UTIL_AVG_UNCHANGED) 2841 | + return; 2842 | + 2843 | + /* Get utilization at dequeue */ 2844 | + dequeued = task_util(p); 2845 | + 2846 | + /* 2847 | + * Reset EWMA on utilization increases, the moving average is used only 2848 | + * to smooth utilization decreases. 2849 | + */ 2850 | + if (ewma <= dequeued) { 2851 | + ewma = dequeued; 2852 | + goto done; 2853 | + } 2854 | + 2855 | + /* 2856 | + * Skip update of task's estimated utilization when its members are 2857 | + * already ~1% close to its last activation value. 
2858 | + */ 2859 | + last_ewma_diff = ewma - dequeued; 2860 | + if (last_ewma_diff < UTIL_EST_MARGIN) 2861 | + goto done; 2862 | + 2863 | + /* 2864 | + * To avoid overestimation of actual task utilization, skip updates if 2865 | + * we cannot grant there is idle time in this CPU. 2866 | + */ 2867 | + if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) 2868 | + return; 2869 | + 2870 | + /* 2871 | + * To avoid underestimate of task utilization, skip updates of EWMA if 2872 | + * we cannot grant that thread got all CPU time it wanted. 2873 | + */ 2874 | + if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) 2875 | + goto done; 2876 | + 2877 | + 2878 | + /* 2879 | + * Update Task's estimated utilization 2880 | + * 2881 | + * When *p completes an activation we can consolidate another sample 2882 | + * of the task size. This is done by using this value to update the 2883 | + * Exponential Weighted Moving Average (EWMA): 2884 | + * 2885 | + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) 2886 | + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) 2887 | + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) 2888 | + * = w * ( -last_ewma_diff ) + ewma(t-1) 2889 | + * = w * (-last_ewma_diff + ewma(t-1) / w) 2890 | + * 2891 | + * Where 'w' is the weight of new samples, which is configured to be 2892 | + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) 2893 | + */ 2894 | + ewma <<= UTIL_EST_WEIGHT_SHIFT; 2895 | + ewma -= last_ewma_diff; 2896 | + ewma >>= UTIL_EST_WEIGHT_SHIFT; 2897 | +done: 2898 | + ewma |= UTIL_AVG_UNCHANGED; 2899 | + WRITE_ONCE(p->se.avg.util_est, ewma); 2900 | + 2901 | + trace_sched_util_est_se_tp(&p->se); 2902 | +} 2903 | + 2904 | +static inline int task_fits_cpu(struct task_struct *p, int cpu) 2905 | +{ 2906 | + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); 2907 | + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); 2908 | + unsigned long util = task_util_est(p); 2909 | + /* 2910 | + * Return true only if the cpu fully fits the task requirements, which 2911 | + * include the utilization but also the performance hints. 2912 | + */ 2913 | + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); 2914 | +} 2915 | + 2916 | + 2917 | +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) 2918 | +{ 2919 | + if (!sched_asym_cpucap_active()) 2920 | + return; 2921 | + 2922 | + if (!p || p->nr_cpus_allowed == 1) { 2923 | + rq->misfit_task_load = 0; 2924 | + return; 2925 | + } 2926 | + 2927 | + if (task_fits_cpu(p, cpu_of(rq))) { 2928 | + rq->misfit_task_load = 0; 2929 | + return; 2930 | + } 2931 | + 2932 | + /* 2933 | + * Make sure that misfit_task_load will not be null even if 2934 | + * task_h_load() returns 0. 2935 | + */ 2936 | + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); 2937 | +} 2938 | + 2939 | +static inline void 2940 | +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2941 | +{ 2942 | + cfs_rq->avg.load_avg += se->avg.load_avg; 2943 | + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; 2944 | +} 2945 | + 2946 | +/* 2947 | + * Unsigned subtract and clamp on underflow. 2948 | + * 2949 | + * Explicitly do a load-store to ensure the intermediate value never hits 2950 | + * memory. This allows lockless observations without ever seeing the negative 2951 | + * values. 
2952 | + */ 2953 | +#define sub_positive(_ptr, _val) do { \ 2954 | + typeof(_ptr) ptr = (_ptr); \ 2955 | + typeof(*ptr) val = (_val); \ 2956 | + typeof(*ptr) res, var = READ_ONCE(*ptr); \ 2957 | + res = var - val; \ 2958 | + if (res > var) \ 2959 | + res = 0; \ 2960 | + WRITE_ONCE(*ptr, res); \ 2961 | +} while (0) 2962 | + 2963 | +static inline void 2964 | +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2965 | +{ 2966 | + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); 2967 | + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); 2968 | + /* See update_cfs_rq_load_avg() */ 2969 | + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, 2970 | + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); 2971 | +} 2972 | diff --git a/kernel/sched/fair_numa.h b/kernel/sched/fair_numa.h 2973 | new file mode 100644 2974 | index 000000000000..1d9f6ff65a3e 2975 | --- /dev/null 2976 | +++ b/kernel/sched/fair_numa.h 2977 | @@ -0,0 +1,2288 @@ 2978 | +static unsigned long capacity_of(int cpu) 2979 | +{ 2980 | + return cpu_rq(cpu)->cpu_capacity; 2981 | +} 2982 | + 2983 | +static unsigned long task_h_load(struct task_struct *p) 2984 | +{ 2985 | + return p->se.avg.load_avg; 2986 | +} 2987 | + 2988 | +static inline bool is_core_idle(int cpu) 2989 | +{ 2990 | +#ifdef CONFIG_SCHED_SMT 2991 | + int sibling; 2992 | + 2993 | + for_each_cpu(sibling, cpu_smt_mask(cpu)) { 2994 | + if (cpu == sibling) 2995 | + continue; 2996 | + 2997 | + if (!idle_cpu(sibling)) 2998 | + return false; 2999 | + } 3000 | +#endif 3001 | + 3002 | + return true; 3003 | +} 3004 | + 3005 | +#ifdef CONFIG_NUMA_BALANCING 3006 | +/* 3007 | + * Approximate time to scan a full NUMA task in ms. The task scan period is 3008 | + * calculated based on the tasks virtual memory size and 3009 | + * numa_balancing_scan_size. 3010 | + */ 3011 | +unsigned int sysctl_numa_balancing_scan_period_min = 1000; 3012 | +unsigned int sysctl_numa_balancing_scan_period_max = 60000; 3013 | + 3014 | +/* Portion of address space to scan in MB */ 3015 | +unsigned int sysctl_numa_balancing_scan_size = 256; 3016 | + 3017 | +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 3018 | +unsigned int sysctl_numa_balancing_scan_delay = 1000; 3019 | + 3020 | +/* The page with hint page fault latency < threshold in ms is considered hot */ 3021 | +unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; 3022 | + 3023 | +struct numa_group { 3024 | + refcount_t refcount; 3025 | + 3026 | + spinlock_t lock; /* nr_tasks, tasks */ 3027 | + int nr_tasks; 3028 | + pid_t gid; 3029 | + int active_nodes; 3030 | + 3031 | + struct rcu_head rcu; 3032 | + unsigned long total_faults; 3033 | + unsigned long max_faults_cpu; 3034 | + /* 3035 | + * faults[] array is split into two regions: faults_mem and faults_cpu. 3036 | + * 3037 | + * Faults_cpu is used to decide whether memory should move 3038 | + * towards the CPU. As a consequence, these stats are weighted 3039 | + * more by CPU use than by memory faults. 3040 | + */ 3041 | + unsigned long faults[]; 3042 | +}; 3043 | + 3044 | +/* 3045 | + * For functions that can be called in multiple contexts that permit reading 3046 | + * ->numa_group (see struct task_struct for locking rules). 
3047 | + */ 3048 | +static struct numa_group *deref_task_numa_group(struct task_struct *p) 3049 | +{ 3050 | + return rcu_dereference_check(p->numa_group, p == current || 3051 | + (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu))); 3052 | +} 3053 | + 3054 | +static struct numa_group *deref_curr_numa_group(struct task_struct *p) 3055 | +{ 3056 | + return rcu_dereference_protected(p->numa_group, p == current); 3057 | +} 3058 | + 3059 | +static inline unsigned long group_faults_priv(struct numa_group *ng); 3060 | +static inline unsigned long group_faults_shared(struct numa_group *ng); 3061 | + 3062 | +static unsigned int task_nr_scan_windows(struct task_struct *p) 3063 | +{ 3064 | + unsigned long rss = 0; 3065 | + unsigned long nr_scan_pages; 3066 | + 3067 | + /* 3068 | + * Calculations based on RSS as non-present and empty pages are skipped 3069 | + * by the PTE scanner and NUMA hinting faults should be trapped based 3070 | + * on resident pages 3071 | + */ 3072 | + nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); 3073 | + rss = get_mm_rss(p->mm); 3074 | + if (!rss) 3075 | + rss = nr_scan_pages; 3076 | + 3077 | + rss = round_up(rss, nr_scan_pages); 3078 | + return rss / nr_scan_pages; 3079 | +} 3080 | + 3081 | +/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ 3082 | +#define MAX_SCAN_WINDOW 2560 3083 | + 3084 | +static unsigned int task_scan_min(struct task_struct *p) 3085 | +{ 3086 | + unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size); 3087 | + unsigned int scan, floor; 3088 | + unsigned int windows = 1; 3089 | + 3090 | + if (scan_size < MAX_SCAN_WINDOW) 3091 | + windows = MAX_SCAN_WINDOW / scan_size; 3092 | + floor = 1000 / windows; 3093 | + 3094 | + scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); 3095 | + return max_t(unsigned int, floor, scan); 3096 | +} 3097 | + 3098 | +static unsigned int task_scan_start(struct task_struct *p) 3099 | +{ 3100 | + unsigned long smin = task_scan_min(p); 3101 | + unsigned long period = smin; 3102 | + struct numa_group *ng; 3103 | + 3104 | + /* Scale the maximum scan period with the amount of shared memory. */ 3105 | + rcu_read_lock(); 3106 | + ng = rcu_dereference(p->numa_group); 3107 | + if (ng) { 3108 | + unsigned long shared = group_faults_shared(ng); 3109 | + unsigned long private = group_faults_priv(ng); 3110 | + 3111 | + period *= refcount_read(&ng->refcount); 3112 | + period *= shared + 1; 3113 | + period /= private + shared + 1; 3114 | + } 3115 | + rcu_read_unlock(); 3116 | + 3117 | + return max(smin, period); 3118 | +} 3119 | + 3120 | +static unsigned int task_scan_max(struct task_struct *p) 3121 | +{ 3122 | + unsigned long smin = task_scan_min(p); 3123 | + unsigned long smax; 3124 | + struct numa_group *ng; 3125 | + 3126 | + /* Watch for min being lower than max due to floor calculations */ 3127 | + smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); 3128 | + 3129 | + /* Scale the maximum scan period with the amount of shared memory. 
*/ 3130 | + ng = deref_curr_numa_group(p); 3131 | + if (ng) { 3132 | + unsigned long shared = group_faults_shared(ng); 3133 | + unsigned long private = group_faults_priv(ng); 3134 | + unsigned long period = smax; 3135 | + 3136 | + period *= refcount_read(&ng->refcount); 3137 | + period *= shared + 1; 3138 | + period /= private + shared + 1; 3139 | + 3140 | + smax = max(smax, period); 3141 | + } 3142 | + 3143 | + return max(smin, smax); 3144 | +} 3145 | + 3146 | +static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 3147 | +{ 3148 | + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); 3149 | + rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); 3150 | +} 3151 | + 3152 | +static void account_numa_dequeue(struct rq *rq, struct task_struct *p) 3153 | +{ 3154 | + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); 3155 | + rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); 3156 | +} 3157 | + 3158 | +/* Shared or private faults. */ 3159 | +#define NR_NUMA_HINT_FAULT_TYPES 2 3160 | + 3161 | +/* Memory and CPU locality */ 3162 | +#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) 3163 | + 3164 | +/* Averaged statistics, and temporary buffers. */ 3165 | +#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) 3166 | + 3167 | +pid_t task_numa_group_id(struct task_struct *p) 3168 | +{ 3169 | + struct numa_group *ng; 3170 | + pid_t gid = 0; 3171 | + 3172 | + rcu_read_lock(); 3173 | + ng = rcu_dereference(p->numa_group); 3174 | + if (ng) 3175 | + gid = ng->gid; 3176 | + rcu_read_unlock(); 3177 | + 3178 | + return gid; 3179 | +} 3180 | + 3181 | +/* 3182 | + * The averaged statistics, shared & private, memory & CPU, 3183 | + * occupy the first half of the array. The second half of the 3184 | + * array is for current counters, which are averaged into the 3185 | + * first set by task_numa_placement. 
3186 | + */ 3187 | +static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) 3188 | +{ 3189 | + return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; 3190 | +} 3191 | + 3192 | +static inline unsigned long task_faults(struct task_struct *p, int nid) 3193 | +{ 3194 | + if (!p->numa_faults) 3195 | + return 0; 3196 | + 3197 | + return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + 3198 | + p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; 3199 | +} 3200 | + 3201 | +static inline unsigned long group_faults(struct task_struct *p, int nid) 3202 | +{ 3203 | + struct numa_group *ng = deref_task_numa_group(p); 3204 | + 3205 | + if (!ng) 3206 | + return 0; 3207 | + 3208 | + return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] + 3209 | + ng->faults[task_faults_idx(NUMA_MEM, nid, 1)]; 3210 | +} 3211 | + 3212 | +static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) 3213 | +{ 3214 | + return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] + 3215 | + group->faults[task_faults_idx(NUMA_CPU, nid, 1)]; 3216 | +} 3217 | + 3218 | +static inline unsigned long group_faults_priv(struct numa_group *ng) 3219 | +{ 3220 | + unsigned long faults = 0; 3221 | + int node; 3222 | + 3223 | + for_each_online_node(node) { 3224 | + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; 3225 | + } 3226 | + 3227 | + return faults; 3228 | +} 3229 | + 3230 | +static inline unsigned long group_faults_shared(struct numa_group *ng) 3231 | +{ 3232 | + unsigned long faults = 0; 3233 | + int node; 3234 | + 3235 | + for_each_online_node(node) { 3236 | + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; 3237 | + } 3238 | + 3239 | + return faults; 3240 | +} 3241 | + 3242 | +/* 3243 | + * A node triggering more than 1/3 as many NUMA faults as the maximum is 3244 | + * considered part of a numa group's pseudo-interleaving set. Migrations 3245 | + * between these nodes are slowed down, to allow things to settle down. 3246 | + */ 3247 | +#define ACTIVE_NODE_FRACTION 3 3248 | + 3249 | +static bool numa_is_active_node(int nid, struct numa_group *ng) 3250 | +{ 3251 | + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; 3252 | +} 3253 | + 3254 | +/* Handle placement on systems where not all nodes are directly connected. */ 3255 | +static unsigned long score_nearby_nodes(struct task_struct *p, int nid, 3256 | + int lim_dist, bool task) 3257 | +{ 3258 | + unsigned long score = 0; 3259 | + int node, max_dist; 3260 | + 3261 | + /* 3262 | + * All nodes are directly connected, and the same distance 3263 | + * from each other. No need for fancy placement algorithms. 3264 | + */ 3265 | + if (sched_numa_topology_type == NUMA_DIRECT) 3266 | + return 0; 3267 | + 3268 | + /* sched_max_numa_distance may be changed in parallel. */ 3269 | + max_dist = READ_ONCE(sched_max_numa_distance); 3270 | + /* 3271 | + * This code is called for each node, introducing N^2 complexity, 3272 | + * which should be ok given the number of nodes rarely exceeds 8. 3273 | + */ 3274 | + for_each_online_node(node) { 3275 | + unsigned long faults; 3276 | + int dist = node_distance(nid, node); 3277 | + 3278 | + /* 3279 | + * The furthest away nodes in the system are not interesting 3280 | + * for placement; nid was already counted. 
3281 | + */ 3282 | + if (dist >= max_dist || node == nid) 3283 | + continue; 3284 | + 3285 | + /* 3286 | + * On systems with a backplane NUMA topology, compare groups 3287 | + * of nodes, and move tasks towards the group with the most 3288 | + * memory accesses. When comparing two nodes at distance 3289 | + * "hoplimit", only nodes closer by than "hoplimit" are part 3290 | + * of each group. Skip other nodes. 3291 | + */ 3292 | + if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) 3293 | + continue; 3294 | + 3295 | + /* Add up the faults from nearby nodes. */ 3296 | + if (task) 3297 | + faults = task_faults(p, node); 3298 | + else 3299 | + faults = group_faults(p, node); 3300 | + 3301 | + /* 3302 | + * On systems with a glueless mesh NUMA topology, there are 3303 | + * no fixed "groups of nodes". Instead, nodes that are not 3304 | + * directly connected bounce traffic through intermediate 3305 | + * nodes; a numa_group can occupy any set of nodes. 3306 | + * The further away a node is, the less the faults count. 3307 | + * This seems to result in good task placement. 3308 | + */ 3309 | + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { 3310 | + faults *= (max_dist - dist); 3311 | + faults /= (max_dist - LOCAL_DISTANCE); 3312 | + } 3313 | + 3314 | + score += faults; 3315 | + } 3316 | + 3317 | + return score; 3318 | +} 3319 | + 3320 | +/* 3321 | + * These return the fraction of accesses done by a particular task, or 3322 | + * task group, on a particular numa node. The group weight is given a 3323 | + * larger multiplier, in order to group tasks together that are almost 3324 | + * evenly spread out between numa nodes. 3325 | + */ 3326 | +static inline unsigned long task_weight(struct task_struct *p, int nid, 3327 | + int dist) 3328 | +{ 3329 | + unsigned long faults, total_faults; 3330 | + 3331 | + if (!p->numa_faults) 3332 | + return 0; 3333 | + 3334 | + total_faults = p->total_numa_faults; 3335 | + 3336 | + if (!total_faults) 3337 | + return 0; 3338 | + 3339 | + faults = task_faults(p, nid); 3340 | + faults += score_nearby_nodes(p, nid, dist, true); 3341 | + 3342 | + return 1000 * faults / total_faults; 3343 | +} 3344 | + 3345 | +static inline unsigned long group_weight(struct task_struct *p, int nid, 3346 | + int dist) 3347 | +{ 3348 | + struct numa_group *ng = deref_task_numa_group(p); 3349 | + unsigned long faults, total_faults; 3350 | + 3351 | + if (!ng) 3352 | + return 0; 3353 | + 3354 | + total_faults = ng->total_faults; 3355 | + 3356 | + if (!total_faults) 3357 | + return 0; 3358 | + 3359 | + faults = group_faults(p, nid); 3360 | + faults += score_nearby_nodes(p, nid, dist, false); 3361 | + 3362 | + return 1000 * faults / total_faults; 3363 | +} 3364 | + 3365 | +/* 3366 | + * If memory tiering mode is enabled, cpupid of slow memory page is 3367 | + * used to record scan time instead of CPU and PID. When tiering mode 3368 | + * is disabled at run time, the scan time (in cpupid) will be 3369 | + * interpreted as CPU and PID. So CPU needs to be checked to avoid to 3370 | + * access out of array bound. 
3371 | + */ 3372 | +static inline bool cpupid_valid(int cpupid) 3373 | +{ 3374 | + return cpupid_to_cpu(cpupid) < nr_cpu_ids; 3375 | +} 3376 | + 3377 | +/* 3378 | + * For memory tiering mode, if there are enough free pages (more than 3379 | + * enough watermark defined here) in fast memory node, to take full 3380 | + * advantage of fast memory capacity, all recently accessed slow 3381 | + * memory pages will be migrated to fast memory node without 3382 | + * considering hot threshold. 3383 | + */ 3384 | +static bool pgdat_free_space_enough(struct pglist_data *pgdat) 3385 | +{ 3386 | + int z; 3387 | + unsigned long enough_wmark; 3388 | + 3389 | + enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, 3390 | + pgdat->node_present_pages >> 4); 3391 | + for (z = pgdat->nr_zones - 1; z >= 0; z--) { 3392 | + struct zone *zone = pgdat->node_zones + z; 3393 | + 3394 | + if (!populated_zone(zone)) 3395 | + continue; 3396 | + 3397 | + if (zone_watermark_ok(zone, 0, 3398 | + wmark_pages(zone, WMARK_PROMO) + enough_wmark, 3399 | + ZONE_MOVABLE, 0)) 3400 | + return true; 3401 | + } 3402 | + return false; 3403 | +} 3404 | + 3405 | +/* 3406 | + * For memory tiering mode, when page tables are scanned, the scan 3407 | + * time will be recorded in struct page in addition to make page 3408 | + * PROT_NONE for slow memory page. So when the page is accessed, in 3409 | + * hint page fault handler, the hint page fault latency is calculated 3410 | + * via, 3411 | + * 3412 | + * hint page fault latency = hint page fault time - scan time 3413 | + * 3414 | + * The smaller the hint page fault latency, the higher the possibility 3415 | + * for the page to be hot. 3416 | + */ 3417 | +static int numa_hint_fault_latency(struct folio *folio) 3418 | +{ 3419 | + int last_time, time; 3420 | + 3421 | + time = jiffies_to_msecs(jiffies); 3422 | + last_time = folio_xchg_access_time(folio, time); 3423 | + 3424 | + return (time - last_time) & PAGE_ACCESS_TIME_MASK; 3425 | +} 3426 | + 3427 | +/* 3428 | + * For memory tiering mode, too high promotion/demotion throughput may 3429 | + * hurt application latency. So we provide a mechanism to rate limit 3430 | + * the number of pages that are tried to be promoted. 
3431 | + */ 3432 | +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, 3433 | + unsigned long rate_limit, int nr) 3434 | +{ 3435 | + unsigned long nr_cand; 3436 | + unsigned int now, start; 3437 | + 3438 | + now = jiffies_to_msecs(jiffies); 3439 | + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); 3440 | + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); 3441 | + start = pgdat->nbp_rl_start; 3442 | + if (now - start > MSEC_PER_SEC && 3443 | + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) 3444 | + pgdat->nbp_rl_nr_cand = nr_cand; 3445 | + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) 3446 | + return true; 3447 | + return false; 3448 | +} 3449 | + 3450 | +#define NUMA_MIGRATION_ADJUST_STEPS 16 3451 | + 3452 | +static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, 3453 | + unsigned long rate_limit, 3454 | + unsigned int ref_th) 3455 | +{ 3456 | + unsigned int now, start, th_period, unit_th, th; 3457 | + unsigned long nr_cand, ref_cand, diff_cand; 3458 | + 3459 | + now = jiffies_to_msecs(jiffies); 3460 | + th_period = sysctl_numa_balancing_scan_period_max; 3461 | + start = pgdat->nbp_th_start; 3462 | + if (now - start > th_period && 3463 | + cmpxchg(&pgdat->nbp_th_start, start, now) == start) { 3464 | + ref_cand = rate_limit * 3465 | + sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC; 3466 | + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); 3467 | + diff_cand = nr_cand - pgdat->nbp_th_nr_cand; 3468 | + unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS; 3469 | + th = pgdat->nbp_threshold ? : ref_th; 3470 | + if (diff_cand > ref_cand * 11 / 10) 3471 | + th = max(th - unit_th, unit_th); 3472 | + else if (diff_cand < ref_cand * 9 / 10) 3473 | + th = min(th + unit_th, ref_th * 2); 3474 | + pgdat->nbp_th_nr_cand = nr_cand; 3475 | + pgdat->nbp_threshold = th; 3476 | + } 3477 | +} 3478 | + 3479 | +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */ 3480 | +static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; 3481 | + 3482 | +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, 3483 | + int src_nid, int dst_cpu) 3484 | +{ 3485 | + struct numa_group *ng = deref_curr_numa_group(p); 3486 | + int dst_nid = cpu_to_node(dst_cpu); 3487 | + int last_cpupid, this_cpupid; 3488 | + 3489 | + /* 3490 | + * The pages in slow memory node should be migrated according 3491 | + * to hot/cold instead of private/shared. 3492 | + */ 3493 | + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && 3494 | + !node_is_toptier(src_nid)) { 3495 | + struct pglist_data *pgdat; 3496 | + unsigned long rate_limit; 3497 | + unsigned int latency, th, def_th; 3498 | + 3499 | + pgdat = NODE_DATA(dst_nid); 3500 | + if (pgdat_free_space_enough(pgdat)) { 3501 | + /* workload changed, reset hot threshold */ 3502 | + pgdat->nbp_threshold = 0; 3503 | + return true; 3504 | + } 3505 | + 3506 | + def_th = sysctl_numa_balancing_hot_threshold; 3507 | + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ 3508 | + (20 - PAGE_SHIFT); 3509 | + numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); 3510 | + 3511 | + th = pgdat->nbp_threshold ? 
: def_th; 3512 | + latency = numa_hint_fault_latency(folio); 3513 | + if (latency >= th) 3514 | + return false; 3515 | + 3516 | + return !numa_promotion_rate_limit(pgdat, rate_limit, 3517 | + folio_nr_pages(folio)); 3518 | + } 3519 | + 3520 | + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); 3521 | + last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid); 3522 | + 3523 | + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && 3524 | + !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) 3525 | + return false; 3526 | + 3527 | + /* 3528 | + * Allow first faults or private faults to migrate immediately early in 3529 | + * the lifetime of a task. The magic number 4 is based on waiting for 3530 | + * two full passes of the "multi-stage node selection" test that is 3531 | + * executed below. 3532 | + */ 3533 | + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && 3534 | + (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) 3535 | + return true; 3536 | + 3537 | + /* 3538 | + * Multi-stage node selection is used in conjunction with a periodic 3539 | + * migration fault to build a temporal task<->page relation. By using 3540 | + * a two-stage filter we remove short/unlikely relations. 3541 | + * 3542 | + * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate 3543 | + * a task's usage of a particular page (n_p) per total usage of this 3544 | + * page (n_t) (in a given time-span) to a probability. 3545 | + * 3546 | + * Our periodic faults will sample this probability and getting the 3547 | + * same result twice in a row, given these samples are fully 3548 | + * independent, is then given by P(n)^2, provided our sample period 3549 | + * is sufficiently short compared to the usage pattern. 3550 | + * 3551 | + * This quadric squishes small probabilities, making it less likely we 3552 | + * act on an unlikely task<->page relation. 3553 | + */ 3554 | + if (!cpupid_pid_unset(last_cpupid) && 3555 | + cpupid_to_nid(last_cpupid) != dst_nid) 3556 | + return false; 3557 | + 3558 | + /* Always allow migrate on private faults */ 3559 | + if (cpupid_match_pid(p, last_cpupid)) 3560 | + return true; 3561 | + 3562 | + /* A shared fault, but p->numa_group has not been set up yet. */ 3563 | + if (!ng) 3564 | + return true; 3565 | + 3566 | + /* 3567 | + * Destination node is much more heavily used than the source 3568 | + * node? Allow migration. 3569 | + */ 3570 | + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * 3571 | + ACTIVE_NODE_FRACTION) 3572 | + return true; 3573 | + 3574 | + /* 3575 | + * Distribute memory according to CPU & memory use on each node, 3576 | + * with 3/4 hysteresis to avoid unnecessary memory migrations: 3577 | + * 3578 | + * faults_cpu(dst) 3 faults_cpu(src) 3579 | + * --------------- * - > --------------- 3580 | + * faults_mem(dst) 4 faults_mem(src) 3581 | + */ 3582 | + return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > 3583 | + group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; 3584 | +} 3585 | + 3586 | +/* 3587 | + * 'numa_type' describes the node at the moment of load balancing. 3588 | + */ 3589 | +enum numa_type { 3590 | + /* The node has spare capacity that can be used to run more tasks. */ 3591 | + node_has_spare = 0, 3592 | + /* 3593 | + * The node is fully used and the tasks don't compete for more CPU 3594 | + * cycles. Nevertheless, some tasks might wait before running. 
3595 | + */ 3596 | + node_fully_busy, 3597 | + /* 3598 | + * The node is overloaded and can't provide expected CPU cycles to all 3599 | + * tasks. 3600 | + */ 3601 | + node_overloaded 3602 | +}; 3603 | + 3604 | +/* Cached statistics for all CPUs within a node */ 3605 | +struct numa_stats { 3606 | + unsigned long load; 3607 | + unsigned long runnable; 3608 | + unsigned long util; 3609 | + /* Total compute capacity of CPUs on a node */ 3610 | + unsigned long compute_capacity; 3611 | + unsigned int nr_running; 3612 | + unsigned int weight; 3613 | + enum numa_type node_type; 3614 | + int idle_cpu; 3615 | +}; 3616 | + 3617 | +struct task_numa_env { 3618 | + struct task_struct *p; 3619 | + 3620 | + int src_cpu, src_nid; 3621 | + int dst_cpu, dst_nid; 3622 | + int imb_numa_nr; 3623 | + 3624 | + struct numa_stats src_stats, dst_stats; 3625 | + 3626 | + int imbalance_pct; 3627 | + int dist; 3628 | + 3629 | + struct task_struct *best_task; 3630 | + long best_imp; 3631 | + int best_cpu; 3632 | +}; 3633 | + 3634 | +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) 3635 | +{ 3636 | + return cfs_rq->avg.load_avg; 3637 | +} 3638 | + 3639 | +static unsigned long cpu_load(struct rq *rq) 3640 | +{ 3641 | + return cfs_rq_load_avg(&rq->cfs); 3642 | +} 3643 | + 3644 | +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) 3645 | +{ 3646 | + return cfs_rq->avg.runnable_avg; 3647 | +} 3648 | + 3649 | +static unsigned long cpu_runnable(struct rq *rq) 3650 | +{ 3651 | + return cfs_rq_runnable_avg(&rq->cfs); 3652 | +} 3653 | + 3654 | +static inline enum 3655 | +numa_type numa_classify(unsigned int imbalance_pct, 3656 | + struct numa_stats *ns) 3657 | +{ 3658 | + if ((ns->nr_running > ns->weight) && 3659 | + (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || 3660 | + ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) 3661 | + return node_overloaded; 3662 | + 3663 | + if ((ns->nr_running < ns->weight) || 3664 | + (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) && 3665 | + ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100)))) 3666 | + return node_has_spare; 3667 | + 3668 | + return node_fully_busy; 3669 | +} 3670 | + 3671 | +#ifdef CONFIG_SCHED_SMT 3672 | +/* Forward declarations of select_idle_sibling helpers */ 3673 | +static inline bool test_idle_cores(int cpu); 3674 | +static inline int numa_idle_core(int idle_core, int cpu) 3675 | +{ 3676 | + if (!static_branch_likely(&sched_smt_present) || 3677 | + idle_core >= 0 || !test_idle_cores(cpu)) 3678 | + return idle_core; 3679 | + 3680 | + /* 3681 | + * Prefer cores instead of packing HT siblings 3682 | + * and triggering future load balancing. 3683 | + */ 3684 | + if (is_core_idle(cpu)) 3685 | + idle_core = cpu; 3686 | + 3687 | + return idle_core; 3688 | +} 3689 | +#else 3690 | +static inline int numa_idle_core(int idle_core, int cpu) 3691 | +{ 3692 | + return idle_core; 3693 | +} 3694 | +#endif 3695 | + 3696 | +/* 3697 | + * Gather all necessary information to make NUMA balancing placement 3698 | + * decisions that are compatible with standard load balancer. This 3699 | + * borrows code and logic from update_sg_lb_stats but sharing a 3700 | + * common implementation is impractical. 
3701 | + */ 3702 | +static void update_numa_stats(struct task_numa_env *env, 3703 | + struct numa_stats *ns, int nid, 3704 | + bool find_idle) 3705 | +{ 3706 | + int cpu, idle_core = -1; 3707 | + 3708 | + memset(ns, 0, sizeof(*ns)); 3709 | + ns->idle_cpu = -1; 3710 | + 3711 | + rcu_read_lock(); 3712 | + for_each_cpu(cpu, cpumask_of_node(nid)) { 3713 | + struct rq *rq = cpu_rq(cpu); 3714 | + 3715 | + ns->load += cpu_load(rq); 3716 | + ns->runnable += cpu_runnable(rq); 3717 | + ns->util += cpu_util_cfs(cpu); 3718 | + ns->nr_running += rq->cfs.h_nr_running; 3719 | + ns->compute_capacity += capacity_of(cpu); 3720 | + 3721 | + if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { 3722 | + if (READ_ONCE(rq->numa_migrate_on) || 3723 | + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) 3724 | + continue; 3725 | + 3726 | + if (ns->idle_cpu == -1) 3727 | + ns->idle_cpu = cpu; 3728 | + 3729 | + idle_core = numa_idle_core(idle_core, cpu); 3730 | + } 3731 | + } 3732 | + rcu_read_unlock(); 3733 | + 3734 | + ns->weight = cpumask_weight(cpumask_of_node(nid)); 3735 | + 3736 | + ns->node_type = numa_classify(env->imbalance_pct, ns); 3737 | + 3738 | + if (idle_core >= 0) 3739 | + ns->idle_cpu = idle_core; 3740 | +} 3741 | + 3742 | +static void task_numa_assign(struct task_numa_env *env, 3743 | + struct task_struct *p, long imp) 3744 | +{ 3745 | + struct rq *rq = cpu_rq(env->dst_cpu); 3746 | + 3747 | + /* Check if run-queue part of active NUMA balance. */ 3748 | + if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { 3749 | + int cpu; 3750 | + int start = env->dst_cpu; 3751 | + 3752 | + /* Find alternative idle CPU. */ 3753 | + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) { 3754 | + if (cpu == env->best_cpu || !idle_cpu(cpu) || 3755 | + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { 3756 | + continue; 3757 | + } 3758 | + 3759 | + env->dst_cpu = cpu; 3760 | + rq = cpu_rq(env->dst_cpu); 3761 | + if (!xchg(&rq->numa_migrate_on, 1)) 3762 | + goto assign; 3763 | + } 3764 | + 3765 | + /* Failed to find an alternative idle CPU */ 3766 | + return; 3767 | + } 3768 | + 3769 | +assign: 3770 | + /* 3771 | + * Clear previous best_cpu/rq numa-migrate flag, since task now 3772 | + * found a better CPU to move/swap. 3773 | + */ 3774 | + if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) { 3775 | + rq = cpu_rq(env->best_cpu); 3776 | + WRITE_ONCE(rq->numa_migrate_on, 0); 3777 | + } 3778 | + 3779 | + if (env->best_task) 3780 | + put_task_struct(env->best_task); 3781 | + if (p) 3782 | + get_task_struct(p); 3783 | + 3784 | + env->best_task = p; 3785 | + env->best_imp = imp; 3786 | + env->best_cpu = env->dst_cpu; 3787 | +} 3788 | + 3789 | +static bool load_too_imbalanced(long src_load, long dst_load, 3790 | + struct task_numa_env *env) 3791 | +{ 3792 | + long imb, old_imb; 3793 | + long orig_src_load, orig_dst_load; 3794 | + long src_capacity, dst_capacity; 3795 | + 3796 | + /* 3797 | + * The load is corrected for the CPU capacity available on each node. 
3798 | + * 3799 | + * src_load dst_load 3800 | + * ------------ vs --------- 3801 | + * src_capacity dst_capacity 3802 | + */ 3803 | + src_capacity = env->src_stats.compute_capacity; 3804 | + dst_capacity = env->dst_stats.compute_capacity; 3805 | + 3806 | + imb = abs(dst_load * src_capacity - src_load * dst_capacity); 3807 | + 3808 | + orig_src_load = env->src_stats.load; 3809 | + orig_dst_load = env->dst_stats.load; 3810 | + 3811 | + old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity); 3812 | + 3813 | + /* Would this change make things worse? */ 3814 | + return (imb > old_imb); 3815 | +} 3816 | + 3817 | +/* 3818 | + * Maximum NUMA importance can be 1998 (2*999); 3819 | + * SMALLIMP @ 30 would be close to 1998/64. 3820 | + * Used to deter task migration. 3821 | + */ 3822 | +#define SMALLIMP 30 3823 | + 3824 | +/* 3825 | + * This checks if the overall compute and NUMA accesses of the system would 3826 | + * be improved if the source tasks was migrated to the target dst_cpu taking 3827 | + * into account that it might be best if task running on the dst_cpu should 3828 | + * be exchanged with the source task 3829 | + */ 3830 | +static bool task_numa_compare(struct task_numa_env *env, 3831 | + long taskimp, long groupimp, bool maymove) 3832 | +{ 3833 | + struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); 3834 | + struct rq *dst_rq = cpu_rq(env->dst_cpu); 3835 | + long imp = p_ng ? groupimp : taskimp; 3836 | + struct task_struct *cur; 3837 | + long src_load, dst_load; 3838 | + int dist = env->dist; 3839 | + long moveimp = imp; 3840 | + long load; 3841 | + bool stopsearch = false; 3842 | + 3843 | + if (READ_ONCE(dst_rq->numa_migrate_on)) 3844 | + return false; 3845 | + 3846 | + rcu_read_lock(); 3847 | + cur = rcu_dereference(dst_rq->curr); 3848 | + if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) 3849 | + cur = NULL; 3850 | + 3851 | + /* 3852 | + * Because we have preemption enabled we can get migrated around and 3853 | + * end try selecting ourselves (current == env->p) as a swap candidate. 3854 | + */ 3855 | + if (cur == env->p) { 3856 | + stopsearch = true; 3857 | + goto unlock; 3858 | + } 3859 | + 3860 | + if (!cur) { 3861 | + if (maymove && moveimp >= env->best_imp) 3862 | + goto assign; 3863 | + else 3864 | + goto unlock; 3865 | + } 3866 | + 3867 | + /* Skip this swap candidate if cannot move to the source cpu. */ 3868 | + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) 3869 | + goto unlock; 3870 | + 3871 | + /* 3872 | + * Skip this swap candidate if it is not moving to its preferred 3873 | + * node and the best task is. 3874 | + */ 3875 | + if (env->best_task && 3876 | + env->best_task->numa_preferred_nid == env->src_nid && 3877 | + cur->numa_preferred_nid != env->src_nid) { 3878 | + goto unlock; 3879 | + } 3880 | + 3881 | + /* 3882 | + * "imp" is the fault differential for the source task between the 3883 | + * source and destination node. Calculate the total differential for 3884 | + * the source task and potential destination task. The more negative 3885 | + * the value is, the more remote accesses that would be expected to 3886 | + * be incurred if the tasks were swapped. 3887 | + * 3888 | + * If dst and source tasks are in the same NUMA group, or not 3889 | + * in any group then look only at task weights. 3890 | + */ 3891 | + cur_ng = rcu_dereference(cur->numa_group); 3892 | + if (cur_ng == p_ng) { 3893 | + /* 3894 | + * Do not swap within a group or between tasks that have 3895 | + * no group if there is spare capacity. 
Swapping does 3896 | + * not address the load imbalance and helps one task at 3897 | + * the cost of punishing another. 3898 | + */ 3899 | + if (env->dst_stats.node_type == node_has_spare) 3900 | + goto unlock; 3901 | + 3902 | + imp = taskimp + task_weight(cur, env->src_nid, dist) - 3903 | + task_weight(cur, env->dst_nid, dist); 3904 | + /* 3905 | + * Add some hysteresis to prevent swapping the 3906 | + * tasks within a group over tiny differences. 3907 | + */ 3908 | + if (cur_ng) 3909 | + imp -= imp / 16; 3910 | + } else { 3911 | + /* 3912 | + * Compare the group weights. If a task is all by itself 3913 | + * (not part of a group), use the task weight instead. 3914 | + */ 3915 | + if (cur_ng && p_ng) 3916 | + imp += group_weight(cur, env->src_nid, dist) - 3917 | + group_weight(cur, env->dst_nid, dist); 3918 | + else 3919 | + imp += task_weight(cur, env->src_nid, dist) - 3920 | + task_weight(cur, env->dst_nid, dist); 3921 | + } 3922 | + 3923 | + /* Discourage picking a task already on its preferred node */ 3924 | + if (cur->numa_preferred_nid == env->dst_nid) 3925 | + imp -= imp / 16; 3926 | + 3927 | + /* 3928 | + * Encourage picking a task that moves to its preferred node. 3929 | + * This potentially makes imp larger than it's maximum of 3930 | + * 1998 (see SMALLIMP and task_weight for why) but in this 3931 | + * case, it does not matter. 3932 | + */ 3933 | + if (cur->numa_preferred_nid == env->src_nid) 3934 | + imp += imp / 8; 3935 | + 3936 | + if (maymove && moveimp > imp && moveimp > env->best_imp) { 3937 | + imp = moveimp; 3938 | + cur = NULL; 3939 | + goto assign; 3940 | + } 3941 | + 3942 | + /* 3943 | + * Prefer swapping with a task moving to its preferred node over a 3944 | + * task that is not. 3945 | + */ 3946 | + if (env->best_task && cur->numa_preferred_nid == env->src_nid && 3947 | + env->best_task->numa_preferred_nid != env->src_nid) { 3948 | + goto assign; 3949 | + } 3950 | + 3951 | + /* 3952 | + * If the NUMA importance is less than SMALLIMP, 3953 | + * task migration might only result in ping pong 3954 | + * of tasks and also hurt performance due to cache 3955 | + * misses. 3956 | + */ 3957 | + if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) 3958 | + goto unlock; 3959 | + 3960 | + /* 3961 | + * In the overloaded case, try and keep the load balanced. 3962 | + */ 3963 | + load = task_h_load(env->p) - task_h_load(cur); 3964 | + if (!load) 3965 | + goto assign; 3966 | + 3967 | + dst_load = env->dst_stats.load + load; 3968 | + src_load = env->src_stats.load - load; 3969 | + 3970 | + if (load_too_imbalanced(src_load, dst_load, env)) 3971 | + goto unlock; 3972 | + 3973 | +assign: 3974 | + /* Evaluate an idle CPU for a task numa move. */ 3975 | + if (!cur) { 3976 | + int cpu = env->dst_stats.idle_cpu; 3977 | + 3978 | + /* Nothing cached so current CPU went idle since the search. */ 3979 | + if (cpu < 0) 3980 | + cpu = env->dst_cpu; 3981 | + 3982 | + /* 3983 | + * If the CPU is no longer truly idle and the previous best CPU 3984 | + * is, keep using it. 3985 | + */ 3986 | + if (!idle_cpu(cpu) && env->best_cpu >= 0 && 3987 | + idle_cpu(env->best_cpu)) { 3988 | + cpu = env->best_cpu; 3989 | + } 3990 | + 3991 | + env->dst_cpu = cpu; 3992 | + } 3993 | + 3994 | + task_numa_assign(env, cur, imp); 3995 | + 3996 | + /* 3997 | + * If a move to idle is allowed because there is capacity or load 3998 | + * balance improves then stop the search. While a better swap 3999 | + * candidate may exist, a search is not free. 
4000 | + */ 4001 | + if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu)) 4002 | + stopsearch = true; 4003 | + 4004 | + /* 4005 | + * If a swap candidate must be identified and the current best task 4006 | + * moves its preferred node then stop the search. 4007 | + */ 4008 | + if (!maymove && env->best_task && 4009 | + env->best_task->numa_preferred_nid == env->src_nid) { 4010 | + stopsearch = true; 4011 | + } 4012 | +unlock: 4013 | + rcu_read_unlock(); 4014 | + 4015 | + return stopsearch; 4016 | +} 4017 | + 4018 | +#define NUMA_IMBALANCE_MIN 2 4019 | + 4020 | +static inline long 4021 | +adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr) 4022 | +{ 4023 | + /* 4024 | + * Allow a NUMA imbalance if busy CPUs is less than the maximum 4025 | + * threshold. Above this threshold, individual tasks may be contending 4026 | + * for both memory bandwidth and any shared HT resources. This is an 4027 | + * approximation as the number of running tasks may not be related to 4028 | + * the number of busy CPUs due to sched_setaffinity. 4029 | + */ 4030 | + if (dst_running > imb_numa_nr) 4031 | + return imbalance; 4032 | + 4033 | + /* 4034 | + * Allow a small imbalance based on a simple pair of communicating 4035 | + * tasks that remain local when the destination is lightly loaded. 4036 | + */ 4037 | + if (imbalance <= NUMA_IMBALANCE_MIN) 4038 | + return 0; 4039 | + 4040 | + return imbalance; 4041 | +} 4042 | + 4043 | +static void task_numa_find_cpu(struct task_numa_env *env, 4044 | + long taskimp, long groupimp) 4045 | +{ 4046 | + bool maymove = false; 4047 | + int cpu; 4048 | + 4049 | + /* 4050 | + * If dst node has spare capacity, then check if there is an 4051 | + * imbalance that would be overruled by the load balancer. 4052 | + */ 4053 | + if (env->dst_stats.node_type == node_has_spare) { 4054 | + unsigned int imbalance; 4055 | + int src_running, dst_running; 4056 | + 4057 | + /* 4058 | + * Would movement cause an imbalance? Note that if src has 4059 | + * more running tasks that the imbalance is ignored as the 4060 | + * move improves the imbalance from the perspective of the 4061 | + * CPU load balancer. 4062 | + * */ 4063 | + src_running = env->src_stats.nr_running - 1; 4064 | + dst_running = env->dst_stats.nr_running + 1; 4065 | + imbalance = max(0, dst_running - src_running); 4066 | + imbalance = adjust_numa_imbalance(imbalance, dst_running, 4067 | + env->imb_numa_nr); 4068 | + 4069 | + /* Use idle CPU if there is no imbalance */ 4070 | + if (!imbalance) { 4071 | + maymove = true; 4072 | + if (env->dst_stats.idle_cpu >= 0) { 4073 | + env->dst_cpu = env->dst_stats.idle_cpu; 4074 | + task_numa_assign(env, NULL, 0); 4075 | + return; 4076 | + } 4077 | + } 4078 | + } else { 4079 | + long src_load, dst_load, load; 4080 | + /* 4081 | + * If the improvement from just moving env->p direction is better 4082 | + * than swapping tasks around, check if a move is possible. 
4083 | + */ 4084 | + load = task_h_load(env->p); 4085 | + dst_load = env->dst_stats.load + load; 4086 | + src_load = env->src_stats.load - load; 4087 | + maymove = !load_too_imbalanced(src_load, dst_load, env); 4088 | + } 4089 | + 4090 | + for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 4091 | + /* Skip this CPU if the source task cannot migrate */ 4092 | + if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) 4093 | + continue; 4094 | + 4095 | + env->dst_cpu = cpu; 4096 | + if (task_numa_compare(env, taskimp, groupimp, maymove)) 4097 | + break; 4098 | + } 4099 | +} 4100 | + 4101 | +static int task_numa_migrate(struct task_struct *p) 4102 | +{ 4103 | + struct task_numa_env env = { 4104 | + .p = p, 4105 | + 4106 | + .src_cpu = task_cpu(p), 4107 | + .src_nid = task_node(p), 4108 | + 4109 | + .imbalance_pct = 112, 4110 | + 4111 | + .best_task = NULL, 4112 | + .best_imp = 0, 4113 | + .best_cpu = -1, 4114 | + }; 4115 | + unsigned long taskweight, groupweight; 4116 | + struct sched_domain *sd; 4117 | + long taskimp, groupimp; 4118 | + struct numa_group *ng; 4119 | + struct rq *best_rq; 4120 | + int nid, ret, dist; 4121 | + 4122 | + /* 4123 | + * Pick the lowest SD_NUMA domain, as that would have the smallest 4124 | + * imbalance and would be the first to start moving tasks about. 4125 | + * 4126 | + * And we want to avoid any moving of tasks about, as that would create 4127 | + * random movement of tasks -- counter the numa conditions we're trying 4128 | + * to satisfy here. 4129 | + */ 4130 | + rcu_read_lock(); 4131 | + sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); 4132 | + if (sd) { 4133 | + env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; 4134 | + env.imb_numa_nr = sd->imb_numa_nr; 4135 | + } 4136 | + rcu_read_unlock(); 4137 | + 4138 | + /* 4139 | + * Cpusets can break the scheduler domain tree into smaller 4140 | + * balance domains, some of which do not cross NUMA boundaries. 4141 | + * Tasks that are "trapped" in such domains cannot be migrated 4142 | + * elsewhere, so there is no point in (re)trying. 4143 | + */ 4144 | + if (unlikely(!sd)) { 4145 | + sched_setnuma(p, task_node(p)); 4146 | + return -EINVAL; 4147 | + } 4148 | + 4149 | + env.dst_nid = p->numa_preferred_nid; 4150 | + dist = env.dist = node_distance(env.src_nid, env.dst_nid); 4151 | + taskweight = task_weight(p, env.src_nid, dist); 4152 | + groupweight = group_weight(p, env.src_nid, dist); 4153 | + update_numa_stats(&env, &env.src_stats, env.src_nid, false); 4154 | + taskimp = task_weight(p, env.dst_nid, dist) - taskweight; 4155 | + groupimp = group_weight(p, env.dst_nid, dist) - groupweight; 4156 | + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); 4157 | + 4158 | + /* Try to find a spot on the preferred nid. */ 4159 | + task_numa_find_cpu(&env, taskimp, groupimp); 4160 | + 4161 | + /* 4162 | + * Look at other nodes in these cases: 4163 | + * - there is no space available on the preferred_nid 4164 | + * - the task is part of a numa_group that is interleaved across 4165 | + * multiple NUMA nodes; in order to better consolidate the group, 4166 | + * we need to check other locations. 
4167 | + */ 4168 | + ng = deref_curr_numa_group(p); 4169 | + if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { 4170 | + for_each_node_state(nid, N_CPU) { 4171 | + if (nid == env.src_nid || nid == p->numa_preferred_nid) 4172 | + continue; 4173 | + 4174 | + dist = node_distance(env.src_nid, env.dst_nid); 4175 | + if (sched_numa_topology_type == NUMA_BACKPLANE && 4176 | + dist != env.dist) { 4177 | + taskweight = task_weight(p, env.src_nid, dist); 4178 | + groupweight = group_weight(p, env.src_nid, dist); 4179 | + } 4180 | + 4181 | + /* Only consider nodes where both task and groups benefit */ 4182 | + taskimp = task_weight(p, nid, dist) - taskweight; 4183 | + groupimp = group_weight(p, nid, dist) - groupweight; 4184 | + if (taskimp < 0 && groupimp < 0) 4185 | + continue; 4186 | + 4187 | + env.dist = dist; 4188 | + env.dst_nid = nid; 4189 | + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); 4190 | + task_numa_find_cpu(&env, taskimp, groupimp); 4191 | + } 4192 | + } 4193 | + 4194 | + /* 4195 | + * If the task is part of a workload that spans multiple NUMA nodes, 4196 | + * and is migrating into one of the workload's active nodes, remember 4197 | + * this node as the task's preferred numa node, so the workload can 4198 | + * settle down. 4199 | + * A task that migrated to a second choice node will be better off 4200 | + * trying for a better one later. Do not set the preferred node here. 4201 | + */ 4202 | + if (ng) { 4203 | + if (env.best_cpu == -1) 4204 | + nid = env.src_nid; 4205 | + else 4206 | + nid = cpu_to_node(env.best_cpu); 4207 | + 4208 | + if (nid != p->numa_preferred_nid) 4209 | + sched_setnuma(p, nid); 4210 | + } 4211 | + 4212 | + /* No better CPU than the current one was found. */ 4213 | + if (env.best_cpu == -1) { 4214 | + trace_sched_stick_numa(p, env.src_cpu, NULL, -1); 4215 | + return -EAGAIN; 4216 | + } 4217 | + 4218 | + best_rq = cpu_rq(env.best_cpu); 4219 | + if (env.best_task == NULL) { 4220 | + ret = migrate_task_to(p, env.best_cpu); 4221 | + WRITE_ONCE(best_rq->numa_migrate_on, 0); 4222 | + if (ret != 0) 4223 | + trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu); 4224 | + return ret; 4225 | + } 4226 | + 4227 | + ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); 4228 | + WRITE_ONCE(best_rq->numa_migrate_on, 0); 4229 | + 4230 | + if (ret != 0) 4231 | + trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu); 4232 | + put_task_struct(env.best_task); 4233 | + return ret; 4234 | +} 4235 | + 4236 | +/* Attempt to migrate a task to a CPU on the preferred node. */ 4237 | +static void numa_migrate_preferred(struct task_struct *p) 4238 | +{ 4239 | + unsigned long interval = HZ; 4240 | + 4241 | + /* This task has no NUMA fault statistics yet */ 4242 | + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) 4243 | + return; 4244 | + 4245 | + /* Periodically retry migrating the task to the preferred node */ 4246 | + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); 4247 | + p->numa_migrate_retry = jiffies + interval; 4248 | + 4249 | + /* Success if task is already running on preferred CPU */ 4250 | + if (task_node(p) == p->numa_preferred_nid) 4251 | + return; 4252 | + 4253 | + /* Otherwise, try migrate to a CPU on the preferred node */ 4254 | + task_numa_migrate(p); 4255 | +} 4256 | + 4257 | +/* 4258 | + * Find out how many nodes the workload is actively running on. Do this by 4259 | + * tracking the nodes from which NUMA hinting faults are triggered. 
This can 4260 | + * be different from the set of nodes where the workload's memory is currently 4261 | + * located. 4262 | + */ 4263 | +static void numa_group_count_active_nodes(struct numa_group *numa_group) 4264 | +{ 4265 | + unsigned long faults, max_faults = 0; 4266 | + int nid, active_nodes = 0; 4267 | + 4268 | + for_each_node_state(nid, N_CPU) { 4269 | + faults = group_faults_cpu(numa_group, nid); 4270 | + if (faults > max_faults) 4271 | + max_faults = faults; 4272 | + } 4273 | + 4274 | + for_each_node_state(nid, N_CPU) { 4275 | + faults = group_faults_cpu(numa_group, nid); 4276 | + if (faults * ACTIVE_NODE_FRACTION > max_faults) 4277 | + active_nodes++; 4278 | + } 4279 | + 4280 | + numa_group->max_faults_cpu = max_faults; 4281 | + numa_group->active_nodes = active_nodes; 4282 | +} 4283 | + 4284 | +/* 4285 | + * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 4286 | + * increments. The more local the fault statistics are, the higher the scan 4287 | + * period will be for the next scan window. If local/(local+remote) ratio is 4288 | + * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) 4289 | + * the scan period will decrease. Aim for 70% local accesses. 4290 | + */ 4291 | +#define NUMA_PERIOD_SLOTS 10 4292 | +#define NUMA_PERIOD_THRESHOLD 7 4293 | + 4294 | +/* 4295 | + * Increase the scan period (slow down scanning) if the majority of 4296 | + * our memory is already on our local node, or if the majority of 4297 | + * the page accesses are shared with other processes. 4298 | + * Otherwise, decrease the scan period. 4299 | + */ 4300 | +static void update_task_scan_period(struct task_struct *p, 4301 | + unsigned long shared, unsigned long private) 4302 | +{ 4303 | + unsigned int period_slot; 4304 | + int lr_ratio, ps_ratio; 4305 | + int diff; 4306 | + 4307 | + unsigned long remote = p->numa_faults_locality[0]; 4308 | + unsigned long local = p->numa_faults_locality[1]; 4309 | + 4310 | + /* 4311 | + * If there were no record hinting faults then either the task is 4312 | + * completely idle or all activity is in areas that are not of interest 4313 | + * to automatic numa balancing. Related to that, if there were failed 4314 | + * migration then it implies we are migrating too quickly or the local 4315 | + * node is overloaded. In either case, scan slower 4316 | + */ 4317 | + if (local + shared == 0 || p->numa_faults_locality[2]) { 4318 | + p->numa_scan_period = min(p->numa_scan_period_max, 4319 | + p->numa_scan_period << 1); 4320 | + 4321 | + p->mm->numa_next_scan = jiffies + 4322 | + msecs_to_jiffies(p->numa_scan_period); 4323 | + 4324 | + return; 4325 | + } 4326 | + 4327 | + /* 4328 | + * Prepare to scale scan period relative to the current period. 4329 | + * == NUMA_PERIOD_THRESHOLD scan period stays the same 4330 | + * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) 4331 | + * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) 4332 | + */ 4333 | + period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); 4334 | + lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); 4335 | + ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared); 4336 | + 4337 | + if (ps_ratio >= NUMA_PERIOD_THRESHOLD) { 4338 | + /* 4339 | + * Most memory accesses are local. There is no need to 4340 | + * do fast NUMA scanning, since memory is already local. 
4341 | + */ 4342 | + int slot = ps_ratio - NUMA_PERIOD_THRESHOLD; 4343 | + if (!slot) 4344 | + slot = 1; 4345 | + diff = slot * period_slot; 4346 | + } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) { 4347 | + /* 4348 | + * Most memory accesses are shared with other tasks. 4349 | + * There is no point in continuing fast NUMA scanning, 4350 | + * since other tasks may just move the memory elsewhere. 4351 | + */ 4352 | + int slot = lr_ratio - NUMA_PERIOD_THRESHOLD; 4353 | + if (!slot) 4354 | + slot = 1; 4355 | + diff = slot * period_slot; 4356 | + } else { 4357 | + /* 4358 | + * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS, 4359 | + * yet they are not on the local NUMA node. Speed up 4360 | + * NUMA scanning to get the memory moved over. 4361 | + */ 4362 | + int ratio = max(lr_ratio, ps_ratio); 4363 | + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; 4364 | + } 4365 | + 4366 | + p->numa_scan_period = clamp(p->numa_scan_period + diff, 4367 | + task_scan_min(p), task_scan_max(p)); 4368 | + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 4369 | +} 4370 | + 4371 | +/* 4372 | + * Get the fraction of time the task has been running since the last 4373 | + * NUMA placement cycle. The scheduler keeps similar statistics, but 4374 | + * decays those on a 32ms period, which is orders of magnitude off 4375 | + * from the dozens-of-seconds NUMA balancing period. Use the scheduler 4376 | + * stats only if the task is so new there are no NUMA statistics yet. 4377 | + */ 4378 | +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) 4379 | +{ 4380 | + u64 runtime, delta, now; 4381 | + /* Use the start of this time slice to avoid calculations. */ 4382 | + now = p->se.exec_start; 4383 | + runtime = p->se.sum_exec_runtime; 4384 | + 4385 | + if (p->last_task_numa_placement) { 4386 | + delta = runtime - p->last_sum_exec_runtime; 4387 | + *period = now - p->last_task_numa_placement; 4388 | + 4389 | + /* Avoid time going backwards, prevent potential divide error: */ 4390 | + if (unlikely((s64)*period < 0)) 4391 | + *period = 0; 4392 | + } else { 4393 | + delta = p->se.avg.load_sum; 4394 | + *period = LOAD_AVG_MAX; 4395 | + } 4396 | + 4397 | + p->last_sum_exec_runtime = runtime; 4398 | + p->last_task_numa_placement = now; 4399 | + 4400 | + return delta; 4401 | +} 4402 | + 4403 | +/* 4404 | + * Determine the preferred nid for a task in a numa_group. This needs to 4405 | + * be done in a way that produces consistent results with group_weight, 4406 | + * otherwise workloads might not converge. 4407 | + */ 4408 | +static int preferred_group_nid(struct task_struct *p, int nid) 4409 | +{ 4410 | + nodemask_t nodes; 4411 | + int dist; 4412 | + 4413 | + /* Direct connections between all NUMA nodes. */ 4414 | + if (sched_numa_topology_type == NUMA_DIRECT) 4415 | + return nid; 4416 | + 4417 | + /* 4418 | + * On a system with glueless mesh NUMA topology, group_weight 4419 | + * scores nodes according to the number of NUMA hinting faults on 4420 | + * both the node itself, and on nearby nodes. 
4421 | + */ 4422 | + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { 4423 | + unsigned long score, max_score = 0; 4424 | + int node, max_node = nid; 4425 | + 4426 | + dist = sched_max_numa_distance; 4427 | + 4428 | + for_each_node_state(node, N_CPU) { 4429 | + score = group_weight(p, node, dist); 4430 | + if (score > max_score) { 4431 | + max_score = score; 4432 | + max_node = node; 4433 | + } 4434 | + } 4435 | + return max_node; 4436 | + } 4437 | + 4438 | + /* 4439 | + * Finding the preferred nid in a system with NUMA backplane 4440 | + * interconnect topology is more involved. The goal is to locate 4441 | + * tasks from numa_groups near each other in the system, and 4442 | + * untangle workloads from different sides of the system. This requires 4443 | + * searching down the hierarchy of node groups, recursively searching 4444 | + * inside the highest scoring group of nodes. The nodemask tricks 4445 | + * keep the complexity of the search down. 4446 | + */ 4447 | + nodes = node_states[N_CPU]; 4448 | + for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { 4449 | + unsigned long max_faults = 0; 4450 | + nodemask_t max_group = NODE_MASK_NONE; 4451 | + int a, b; 4452 | + 4453 | + /* Are there nodes at this distance from each other? */ 4454 | + if (!find_numa_distance(dist)) 4455 | + continue; 4456 | + 4457 | + for_each_node_mask(a, nodes) { 4458 | + unsigned long faults = 0; 4459 | + nodemask_t this_group; 4460 | + nodes_clear(this_group); 4461 | + 4462 | + /* Sum group's NUMA faults; includes a==b case. */ 4463 | + for_each_node_mask(b, nodes) { 4464 | + if (node_distance(a, b) < dist) { 4465 | + faults += group_faults(p, b); 4466 | + node_set(b, this_group); 4467 | + node_clear(b, nodes); 4468 | + } 4469 | + } 4470 | + 4471 | + /* Remember the top group. */ 4472 | + if (faults > max_faults) { 4473 | + max_faults = faults; 4474 | + max_group = this_group; 4475 | + /* 4476 | + * subtle: at the smallest distance there is 4477 | + * just one node left in each "group", the 4478 | + * winner is the preferred nid. 4479 | + */ 4480 | + nid = a; 4481 | + } 4482 | + } 4483 | + /* Next round, evaluate the nodes within max_group. */ 4484 | + if (!max_faults) 4485 | + break; 4486 | + nodes = max_group; 4487 | + } 4488 | + return nid; 4489 | +} 4490 | + 4491 | +static void task_numa_placement(struct task_struct *p) 4492 | +{ 4493 | + int seq, nid, max_nid = NUMA_NO_NODE; 4494 | + unsigned long max_faults = 0; 4495 | + unsigned long fault_types[2] = { 0, 0 }; 4496 | + unsigned long total_faults; 4497 | + u64 runtime, period; 4498 | + spinlock_t *group_lock = NULL; 4499 | + struct numa_group *ng; 4500 | + 4501 | + /* 4502 | + * The p->mm->numa_scan_seq field gets updated without 4503 | + * exclusive access. 
Use READ_ONCE() here to ensure 4504 | + * that the field is read in a single access: 4505 | + */ 4506 | + seq = READ_ONCE(p->mm->numa_scan_seq); 4507 | + if (p->numa_scan_seq == seq) 4508 | + return; 4509 | + p->numa_scan_seq = seq; 4510 | + p->numa_scan_period_max = task_scan_max(p); 4511 | + 4512 | + total_faults = p->numa_faults_locality[0] + 4513 | + p->numa_faults_locality[1]; 4514 | + runtime = numa_get_avg_runtime(p, &period); 4515 | + 4516 | + /* If the task is part of a group prevent parallel updates to group stats */ 4517 | + ng = deref_curr_numa_group(p); 4518 | + if (ng) { 4519 | + group_lock = &ng->lock; 4520 | + spin_lock_irq(group_lock); 4521 | + } 4522 | + 4523 | + /* Find the node with the highest number of faults */ 4524 | + for_each_online_node(nid) { 4525 | + /* Keep track of the offsets in numa_faults array */ 4526 | + int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; 4527 | + unsigned long faults = 0, group_faults = 0; 4528 | + int priv; 4529 | + 4530 | + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { 4531 | + long diff, f_diff, f_weight; 4532 | + 4533 | + mem_idx = task_faults_idx(NUMA_MEM, nid, priv); 4534 | + membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); 4535 | + cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); 4536 | + cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); 4537 | + 4538 | + /* Decay existing window, copy faults since last scan */ 4539 | + diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; 4540 | + fault_types[priv] += p->numa_faults[membuf_idx]; 4541 | + p->numa_faults[membuf_idx] = 0; 4542 | + 4543 | + /* 4544 | + * Normalize the faults_from, so all tasks in a group 4545 | + * count according to CPU use, instead of by the raw 4546 | + * number of faults. Tasks with little runtime have 4547 | + * little over-all impact on throughput, and thus their 4548 | + * faults are less important. 4549 | + */ 4550 | + f_weight = div64_u64(runtime << 16, period + 1); 4551 | + f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / 4552 | + (total_faults + 1); 4553 | + f_diff = f_weight - p->numa_faults[cpu_idx] / 2; 4554 | + p->numa_faults[cpubuf_idx] = 0; 4555 | + 4556 | + p->numa_faults[mem_idx] += diff; 4557 | + p->numa_faults[cpu_idx] += f_diff; 4558 | + faults += p->numa_faults[mem_idx]; 4559 | + p->total_numa_faults += diff; 4560 | + if (ng) { 4561 | + /* 4562 | + * safe because we can only change our own group 4563 | + * 4564 | + * mem_idx represents the offset for a given 4565 | + * nid and priv in a specific region because it 4566 | + * is at the beginning of the numa_faults array. 
4567 | + */ 4568 | + ng->faults[mem_idx] += diff; 4569 | + ng->faults[cpu_idx] += f_diff; 4570 | + ng->total_faults += diff; 4571 | + group_faults += ng->faults[mem_idx]; 4572 | + } 4573 | + } 4574 | + 4575 | + if (!ng) { 4576 | + if (faults > max_faults) { 4577 | + max_faults = faults; 4578 | + max_nid = nid; 4579 | + } 4580 | + } else if (group_faults > max_faults) { 4581 | + max_faults = group_faults; 4582 | + max_nid = nid; 4583 | + } 4584 | + } 4585 | + 4586 | + /* Cannot migrate task to CPU-less node */ 4587 | + max_nid = numa_nearest_node(max_nid, N_CPU); 4588 | + 4589 | + if (ng) { 4590 | + numa_group_count_active_nodes(ng); 4591 | + spin_unlock_irq(group_lock); 4592 | + max_nid = preferred_group_nid(p, max_nid); 4593 | + } 4594 | + 4595 | + if (max_faults) { 4596 | + /* Set the new preferred node */ 4597 | + if (max_nid != p->numa_preferred_nid) 4598 | + sched_setnuma(p, max_nid); 4599 | + } 4600 | + 4601 | + update_task_scan_period(p, fault_types[0], fault_types[1]); 4602 | +} 4603 | + 4604 | +static inline int get_numa_group(struct numa_group *grp) 4605 | +{ 4606 | + return refcount_inc_not_zero(&grp->refcount); 4607 | +} 4608 | + 4609 | +static inline void put_numa_group(struct numa_group *grp) 4610 | +{ 4611 | + if (refcount_dec_and_test(&grp->refcount)) 4612 | + kfree_rcu(grp, rcu); 4613 | +} 4614 | + 4615 | +static void task_numa_group(struct task_struct *p, int cpupid, int flags, 4616 | + int *priv) 4617 | +{ 4618 | + struct numa_group *grp, *my_grp; 4619 | + struct task_struct *tsk; 4620 | + bool join = false; 4621 | + int cpu = cpupid_to_cpu(cpupid); 4622 | + int i; 4623 | + 4624 | + if (unlikely(!deref_curr_numa_group(p))) { 4625 | + unsigned int size = sizeof(struct numa_group) + 4626 | + NR_NUMA_HINT_FAULT_STATS * 4627 | + nr_node_ids * sizeof(unsigned long); 4628 | + 4629 | + grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 4630 | + if (!grp) 4631 | + return; 4632 | + 4633 | + refcount_set(&grp->refcount, 1); 4634 | + grp->active_nodes = 1; 4635 | + grp->max_faults_cpu = 0; 4636 | + spin_lock_init(&grp->lock); 4637 | + grp->gid = p->pid; 4638 | + 4639 | + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 4640 | + grp->faults[i] = p->numa_faults[i]; 4641 | + 4642 | + grp->total_faults = p->total_numa_faults; 4643 | + 4644 | + grp->nr_tasks++; 4645 | + rcu_assign_pointer(p->numa_group, grp); 4646 | + } 4647 | + 4648 | + rcu_read_lock(); 4649 | + tsk = READ_ONCE(cpu_rq(cpu)->curr); 4650 | + 4651 | + if (!cpupid_match_pid(tsk, cpupid)) 4652 | + goto no_join; 4653 | + 4654 | + grp = rcu_dereference(tsk->numa_group); 4655 | + if (!grp) 4656 | + goto no_join; 4657 | + 4658 | + my_grp = deref_curr_numa_group(p); 4659 | + if (grp == my_grp) 4660 | + goto no_join; 4661 | + 4662 | + /* 4663 | + * Only join the other group if its bigger; if we're the bigger group, 4664 | + * the other task will join us. 4665 | + */ 4666 | + if (my_grp->nr_tasks > grp->nr_tasks) 4667 | + goto no_join; 4668 | + 4669 | + /* 4670 | + * Tie-break on the grp address. 4671 | + */ 4672 | + if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) 4673 | + goto no_join; 4674 | + 4675 | + /* Always join threads in the same process. 
*/ 4676 | + if (tsk->mm == current->mm) 4677 | + join = true; 4678 | + 4679 | + /* Simple filter to avoid false positives due to PID collisions */ 4680 | + if (flags & TNF_SHARED) 4681 | + join = true; 4682 | + 4683 | + /* Update priv based on whether false sharing was detected */ 4684 | + *priv = !join; 4685 | + 4686 | + if (join && !get_numa_group(grp)) 4687 | + goto no_join; 4688 | + 4689 | + rcu_read_unlock(); 4690 | + 4691 | + if (!join) 4692 | + return; 4693 | + 4694 | + WARN_ON_ONCE(irqs_disabled()); 4695 | + double_lock_irq(&my_grp->lock, &grp->lock); 4696 | + 4697 | + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { 4698 | + my_grp->faults[i] -= p->numa_faults[i]; 4699 | + grp->faults[i] += p->numa_faults[i]; 4700 | + } 4701 | + my_grp->total_faults -= p->total_numa_faults; 4702 | + grp->total_faults += p->total_numa_faults; 4703 | + 4704 | + my_grp->nr_tasks--; 4705 | + grp->nr_tasks++; 4706 | + 4707 | + spin_unlock(&my_grp->lock); 4708 | + spin_unlock_irq(&grp->lock); 4709 | + 4710 | + rcu_assign_pointer(p->numa_group, grp); 4711 | + 4712 | + put_numa_group(my_grp); 4713 | + return; 4714 | + 4715 | +no_join: 4716 | + rcu_read_unlock(); 4717 | + return; 4718 | +} 4719 | + 4720 | +/* 4721 | + * Get rid of NUMA statistics associated with a task (either current or dead). 4722 | + * If @final is set, the task is dead and has reached refcount zero, so we can 4723 | + * safely free all relevant data structures. Otherwise, there might be 4724 | + * concurrent reads from places like load balancing and procfs, and we should 4725 | + * reset the data back to default state without freeing ->numa_faults. 4726 | + */ 4727 | +void task_numa_free(struct task_struct *p, bool final) 4728 | +{ 4729 | + /* safe: p either is current or is being freed by current */ 4730 | + struct numa_group *grp = rcu_dereference_raw(p->numa_group); 4731 | + unsigned long *numa_faults = p->numa_faults; 4732 | + unsigned long flags; 4733 | + int i; 4734 | + 4735 | + if (!numa_faults) 4736 | + return; 4737 | + 4738 | + if (grp) { 4739 | + spin_lock_irqsave(&grp->lock, flags); 4740 | + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 4741 | + grp->faults[i] -= p->numa_faults[i]; 4742 | + grp->total_faults -= p->total_numa_faults; 4743 | + 4744 | + grp->nr_tasks--; 4745 | + spin_unlock_irqrestore(&grp->lock, flags); 4746 | + RCU_INIT_POINTER(p->numa_group, NULL); 4747 | + put_numa_group(grp); 4748 | + } 4749 | + 4750 | + if (final) { 4751 | + p->numa_faults = NULL; 4752 | + kfree(numa_faults); 4753 | + } else { 4754 | + p->total_numa_faults = 0; 4755 | + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 4756 | + numa_faults[i] = 0; 4757 | + } 4758 | +} 4759 | + 4760 | +/* 4761 | + * Got a PROT_NONE fault for a page on @node. 4762 | + */ 4763 | +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) 4764 | +{ 4765 | + struct task_struct *p = current; 4766 | + bool migrated = flags & TNF_MIGRATED; 4767 | + int cpu_node = task_node(current); 4768 | + int local = !!(flags & TNF_FAULT_LOCAL); 4769 | + struct numa_group *ng; 4770 | + int priv; 4771 | + 4772 | + if (!static_branch_likely(&sched_numa_balancing)) 4773 | + return; 4774 | + 4775 | + /* for example, ksmd faulting in a user's mm */ 4776 | + if (!p->mm) 4777 | + return; 4778 | + 4779 | + /* 4780 | + * NUMA faults statistics are unnecessary for the slow memory 4781 | + * node for memory tiering mode. 
4782 | + */ 4783 | + if (!node_is_toptier(mem_node) && 4784 | + (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING || 4785 | + !cpupid_valid(last_cpupid))) 4786 | + return; 4787 | + 4788 | + /* Allocate buffer to track faults on a per-node basis */ 4789 | + if (unlikely(!p->numa_faults)) { 4790 | + int size = sizeof(*p->numa_faults) * 4791 | + NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; 4792 | + 4793 | + p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); 4794 | + if (!p->numa_faults) 4795 | + return; 4796 | + 4797 | + p->total_numa_faults = 0; 4798 | + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 4799 | + } 4800 | + 4801 | + /* 4802 | + * First accesses are treated as private, otherwise consider accesses 4803 | + * to be private if the accessing pid has not changed 4804 | + */ 4805 | + if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { 4806 | + priv = 1; 4807 | + } else { 4808 | + priv = cpupid_match_pid(p, last_cpupid); 4809 | + if (!priv && !(flags & TNF_NO_GROUP)) 4810 | + task_numa_group(p, last_cpupid, flags, &priv); 4811 | + } 4812 | + 4813 | + /* 4814 | + * If a workload spans multiple NUMA nodes, a shared fault that 4815 | + * occurs wholly within the set of nodes that the workload is 4816 | + * actively using should be counted as local. This allows the 4817 | + * scan rate to slow down when a workload has settled down. 4818 | + */ 4819 | + ng = deref_curr_numa_group(p); 4820 | + if (!priv && !local && ng && ng->active_nodes > 1 && 4821 | + numa_is_active_node(cpu_node, ng) && 4822 | + numa_is_active_node(mem_node, ng)) 4823 | + local = 1; 4824 | + 4825 | + /* 4826 | + * Retry to migrate task to preferred node periodically, in case it 4827 | + * previously failed, or the scheduler moved us. 4828 | + */ 4829 | + if (time_after(jiffies, p->numa_migrate_retry)) { 4830 | + task_numa_placement(p); 4831 | + numa_migrate_preferred(p); 4832 | + } 4833 | + 4834 | + if (migrated) 4835 | + p->numa_pages_migrated += pages; 4836 | + if (flags & TNF_MIGRATE_FAIL) 4837 | + p->numa_faults_locality[2] += pages; 4838 | + 4839 | + p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; 4840 | + p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; 4841 | + p->numa_faults_locality[local] += pages; 4842 | +} 4843 | + 4844 | +static void reset_ptenuma_scan(struct task_struct *p) 4845 | +{ 4846 | + /* 4847 | + * We only did a read acquisition of the mmap sem, so 4848 | + * p->mm->numa_scan_seq is written to without exclusive access 4849 | + * and the update is not guaranteed to be atomic. That's not 4850 | + * much of an issue though, since this is just used for 4851 | + * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not 4852 | + * expensive, to avoid any form of compiler optimizations: 4853 | + */ 4854 | + WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1); 4855 | + p->mm->numa_scan_offset = 0; 4856 | +} 4857 | + 4858 | +static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) 4859 | +{ 4860 | + unsigned long pids; 4861 | + /* 4862 | + * Allow unconditional access first two times, so that all the (pages) 4863 | + * of VMAs get prot_none fault introduced irrespective of accesses. 4864 | + * This is also done to avoid any side effect of task scanning 4865 | + * amplifying the unfairness of disjoint set of VMAs' access. 
4866 | + */ 4867 | + if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2) 4868 | + return true; 4869 | + 4870 | + pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; 4871 | + if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) 4872 | + return true; 4873 | + 4874 | + /* 4875 | + * Complete a scan that has already started regardless of PID access, or 4876 | + * some VMAs may never be scanned in multi-threaded applications: 4877 | + */ 4878 | + if (mm->numa_scan_offset > vma->vm_start) { 4879 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); 4880 | + return true; 4881 | + } 4882 | + 4883 | + return false; 4884 | +} 4885 | + 4886 | +#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) 4887 | + 4888 | +/* 4889 | + * The expensive part of numa migration is done from task_work context. 4890 | + * Triggered from task_tick_numa(). 4891 | + */ 4892 | +static void task_numa_work(struct callback_head *work) 4893 | +{ 4894 | + unsigned long migrate, next_scan, now = jiffies; 4895 | + struct task_struct *p = current; 4896 | + struct mm_struct *mm = p->mm; 4897 | + u64 runtime = p->se.sum_exec_runtime; 4898 | + struct vm_area_struct *vma; 4899 | + unsigned long start, end; 4900 | + unsigned long nr_pte_updates = 0; 4901 | + long pages, virtpages; 4902 | + struct vma_iterator vmi; 4903 | + bool vma_pids_skipped; 4904 | + bool vma_pids_forced = false; 4905 | + 4906 | + SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); 4907 | + 4908 | + work->next = work; 4909 | + /* 4910 | + * Who cares about NUMA placement when they're dying. 4911 | + * 4912 | + * NOTE: make sure not to dereference p->mm before this check, 4913 | + * exit_task_work() happens _after_ exit_mm() so we could be called 4914 | + * without p->mm even though we still had it when we enqueued this 4915 | + * work. 4916 | + */ 4917 | + if (p->flags & PF_EXITING) 4918 | + return; 4919 | + 4920 | + if (!mm->numa_next_scan) { 4921 | + mm->numa_next_scan = now + 4922 | + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 4923 | + } 4924 | + 4925 | + /* 4926 | + * Enforce maximal scan/migration frequency.. 4927 | + */ 4928 | + migrate = mm->numa_next_scan; 4929 | + if (time_before(now, migrate)) 4930 | + return; 4931 | + 4932 | + if (p->numa_scan_period == 0) { 4933 | + p->numa_scan_period_max = task_scan_max(p); 4934 | + p->numa_scan_period = task_scan_start(p); 4935 | + } 4936 | + 4937 | + next_scan = now + msecs_to_jiffies(p->numa_scan_period); 4938 | + if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan)) 4939 | + return; 4940 | + 4941 | + /* 4942 | + * Delay this task enough that another task of this mm will likely win 4943 | + * the next time around. 4944 | + */ 4945 | + p->node_stamp += 2 * TICK_NSEC; 4946 | + 4947 | + pages = sysctl_numa_balancing_scan_size; 4948 | + pages <<= 20 - PAGE_SHIFT; /* MB in pages */ 4949 | + virtpages = pages * 8; /* Scan up to this much virtual space */ 4950 | + if (!pages) 4951 | + return; 4952 | + 4953 | + 4954 | + if (!mmap_read_trylock(mm)) 4955 | + return; 4956 | + 4957 | + /* 4958 | + * VMAs are skipped if the current PID has not trapped a fault within 4959 | + * the VMA recently. Allow scanning to be forced if there is no 4960 | + * suitable VMA remaining. 
4961 | + */ 4962 | + vma_pids_skipped = false; 4963 | + 4964 | +retry_pids: 4965 | + start = mm->numa_scan_offset; 4966 | + vma_iter_init(&vmi, mm, start); 4967 | + vma = vma_next(&vmi); 4968 | + if (!vma) { 4969 | + reset_ptenuma_scan(p); 4970 | + start = 0; 4971 | + vma_iter_set(&vmi, start); 4972 | + vma = vma_next(&vmi); 4973 | + } 4974 | + 4975 | + do { 4976 | + if (!vma_migratable(vma) || !vma_policy_mof(vma) || 4977 | + is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { 4978 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); 4979 | + continue; 4980 | + } 4981 | + 4982 | + /* 4983 | + * Shared library pages mapped by multiple processes are not 4984 | + * migrated as it is expected they are cache replicated. Avoid 4985 | + * hinting faults in read-only file-backed mappings or the vdso 4986 | + * as migrating the pages will be of marginal benefit. 4987 | + */ 4988 | + if (!vma->vm_mm || 4989 | + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) { 4990 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); 4991 | + continue; 4992 | + } 4993 | + 4994 | + /* 4995 | + * Skip inaccessible VMAs to avoid any confusion between 4996 | + * PROT_NONE and NUMA hinting ptes 4997 | + */ 4998 | + if (!vma_is_accessible(vma)) { 4999 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); 5000 | + continue; 5001 | + } 5002 | + 5003 | + /* Initialise new per-VMA NUMAB state. */ 5004 | + if (!vma->numab_state) { 5005 | + vma->numab_state = kzalloc(sizeof(struct vma_numab_state), 5006 | + GFP_KERNEL); 5007 | + if (!vma->numab_state) 5008 | + continue; 5009 | + 5010 | + vma->numab_state->start_scan_seq = mm->numa_scan_seq; 5011 | + 5012 | + vma->numab_state->next_scan = now + 5013 | + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 5014 | + 5015 | + /* Reset happens after 4 times scan delay of scan start */ 5016 | + vma->numab_state->pids_active_reset = vma->numab_state->next_scan + 5017 | + msecs_to_jiffies(VMA_PID_RESET_PERIOD); 5018 | + 5019 | + /* 5020 | + * Ensure prev_scan_seq does not match numa_scan_seq, 5021 | + * to prevent VMAs being skipped prematurely on the 5022 | + * first scan: 5023 | + */ 5024 | + vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1; 5025 | + } 5026 | + 5027 | + /* 5028 | + * Scanning the VMA's of short lived tasks add more overhead. So 5029 | + * delay the scan for new VMAs. 5030 | + */ 5031 | + if (mm->numa_scan_seq && time_before(jiffies, 5032 | + vma->numab_state->next_scan)) { 5033 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); 5034 | + continue; 5035 | + } 5036 | + 5037 | + /* RESET access PIDs regularly for old VMAs. */ 5038 | + if (mm->numa_scan_seq && 5039 | + time_after(jiffies, vma->numab_state->pids_active_reset)) { 5040 | + vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset + 5041 | + msecs_to_jiffies(VMA_PID_RESET_PERIOD); 5042 | + vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]); 5043 | + vma->numab_state->pids_active[1] = 0; 5044 | + } 5045 | + 5046 | + /* Do not rescan VMAs twice within the same sequence. */ 5047 | + if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) { 5048 | + mm->numa_scan_offset = vma->vm_end; 5049 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); 5050 | + continue; 5051 | + } 5052 | + 5053 | + /* 5054 | + * Do not scan the VMA if task has not accessed it, unless no other 5055 | + * VMA candidate exists. 
5056 | + */ 5057 | + if (!vma_pids_forced && !vma_is_accessed(mm, vma)) { 5058 | + vma_pids_skipped = true; 5059 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); 5060 | + continue; 5061 | + } 5062 | + 5063 | + do { 5064 | + start = max(start, vma->vm_start); 5065 | + end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 5066 | + end = min(end, vma->vm_end); 5067 | + nr_pte_updates = change_prot_numa(vma, start, end); 5068 | + 5069 | + /* 5070 | + * Try to scan sysctl_numa_balancing_size worth of 5071 | + * hpages that have at least one present PTE that 5072 | + * is not already pte-numa. If the VMA contains 5073 | + * areas that are unused or already full of prot_numa 5074 | + * PTEs, scan up to virtpages, to skip through those 5075 | + * areas faster. 5076 | + */ 5077 | + if (nr_pte_updates) 5078 | + pages -= (end - start) >> PAGE_SHIFT; 5079 | + virtpages -= (end - start) >> PAGE_SHIFT; 5080 | + 5081 | + start = end; 5082 | + if (pages <= 0 || virtpages <= 0) 5083 | + goto out; 5084 | + 5085 | + cond_resched(); 5086 | + } while (end != vma->vm_end); 5087 | + 5088 | + /* VMA scan is complete, do not scan until next sequence. */ 5089 | + vma->numab_state->prev_scan_seq = mm->numa_scan_seq; 5090 | + 5091 | + /* 5092 | + * Only force scan within one VMA at a time, to limit the 5093 | + * cost of scanning a potentially uninteresting VMA. 5094 | + */ 5095 | + if (vma_pids_forced) 5096 | + break; 5097 | + } for_each_vma(vmi, vma); 5098 | + 5099 | + /* 5100 | + * If no VMAs are remaining and VMAs were skipped due to the PID 5101 | + * not accessing the VMA previously, then force a scan to ensure 5102 | + * forward progress: 5103 | + */ 5104 | + if (!vma && !vma_pids_forced && vma_pids_skipped) { 5105 | + vma_pids_forced = true; 5106 | + goto retry_pids; 5107 | + } 5108 | + 5109 | +out: 5110 | + /* 5111 | + * It is possible to reach the end of the VMA list but the last few 5112 | + * VMAs are not guaranteed to the vma_migratable. If they are not, we 5113 | + * would find the !migratable VMA on the next scan but not reset the 5114 | + * scanner to the start so check it now. 5115 | + */ 5116 | + if (vma) 5117 | + mm->numa_scan_offset = start; 5118 | + else 5119 | + reset_ptenuma_scan(p); 5120 | + mmap_read_unlock(mm); 5121 | + 5122 | + /* 5123 | + * Make sure tasks use at least 32x as much time to run other code 5124 | + * than they used here, to limit NUMA PTE scanning overhead to 3% max. 5125 | + * Usually update_task_scan_period slows down scanning enough; on an 5126 | + * overloaded system we need to limit overhead on a per task basis. 5127 | + */ 5128 | + if (unlikely(p->se.sum_exec_runtime != runtime)) { 5129 | + u64 diff = p->se.sum_exec_runtime - runtime; 5130 | + p->node_stamp += 32 * diff; 5131 | + } 5132 | +} 5133 | + 5134 | +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) 5135 | +{ 5136 | + int mm_users = 0; 5137 | + struct mm_struct *mm = p->mm; 5138 | + 5139 | + if (mm) { 5140 | + mm_users = atomic_read(&mm->mm_users); 5141 | + if (mm_users == 1) { 5142 | + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 5143 | + mm->numa_scan_seq = 0; 5144 | + } 5145 | + } 5146 | + p->node_stamp = 0; 5147 | + p->numa_scan_seq = mm ? 
mm->numa_scan_seq : 0; 5148 | + p->numa_scan_period = sysctl_numa_balancing_scan_delay; 5149 | + p->numa_migrate_retry = 0; 5150 | + /* Protect against double add, see task_tick_numa and task_numa_work */ 5151 | + p->numa_work.next = &p->numa_work; 5152 | + p->numa_faults = NULL; 5153 | + p->numa_pages_migrated = 0; 5154 | + p->total_numa_faults = 0; 5155 | + RCU_INIT_POINTER(p->numa_group, NULL); 5156 | + p->last_task_numa_placement = 0; 5157 | + p->last_sum_exec_runtime = 0; 5158 | + 5159 | + init_task_work(&p->numa_work, task_numa_work); 5160 | + 5161 | + /* New address space, reset the preferred nid */ 5162 | + if (!(clone_flags & CLONE_VM)) { 5163 | + p->numa_preferred_nid = NUMA_NO_NODE; 5164 | + return; 5165 | + } 5166 | + 5167 | + /* 5168 | + * New thread, keep existing numa_preferred_nid which should be copied 5169 | + * already by arch_dup_task_struct but stagger when scans start. 5170 | + */ 5171 | + if (mm) { 5172 | + unsigned int delay; 5173 | + 5174 | + delay = min_t(unsigned int, task_scan_max(current), 5175 | + current->numa_scan_period * mm_users * NSEC_PER_MSEC); 5176 | + delay += 2 * TICK_NSEC; 5177 | + p->node_stamp = delay; 5178 | + } 5179 | +} 5180 | + 5181 | +/* 5182 | + * Drive the periodic memory faults.. 5183 | + */ 5184 | +static void task_tick_numa(struct rq *rq, struct task_struct *curr) 5185 | +{ 5186 | + struct callback_head *work = &curr->numa_work; 5187 | + u64 period, now; 5188 | + 5189 | + /* 5190 | + * We don't care about NUMA placement if we don't have memory. 5191 | + */ 5192 | + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) 5193 | + return; 5194 | + 5195 | + /* 5196 | + * Using runtime rather than walltime has the dual advantage that 5197 | + * we (mostly) drive the selection from busy threads and that the 5198 | + * task needs to have done some actual work before we bother with 5199 | + * NUMA placement. 5200 | + */ 5201 | + now = curr->se.sum_exec_runtime; 5202 | + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; 5203 | + 5204 | + if (now > curr->node_stamp + period) { 5205 | + if (!curr->node_stamp) 5206 | + curr->numa_scan_period = task_scan_start(curr); 5207 | + curr->node_stamp += period; 5208 | + 5209 | + if (!time_before(jiffies, curr->mm->numa_next_scan)) 5210 | + task_work_add(curr, work, TWA_RESUME); 5211 | + } 5212 | +} 5213 | + 5214 | +static void update_scan_period(struct task_struct *p, int new_cpu) 5215 | +{ 5216 | + int src_nid = cpu_to_node(task_cpu(p)); 5217 | + int dst_nid = cpu_to_node(new_cpu); 5218 | + 5219 | + if (!static_branch_likely(&sched_numa_balancing)) 5220 | + return; 5221 | + 5222 | + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) 5223 | + return; 5224 | + 5225 | + if (src_nid == dst_nid) 5226 | + return; 5227 | + 5228 | + /* 5229 | + * Allow resets if faults have been trapped before one scan 5230 | + * has completed. This is most likely due to a new task that 5231 | + * is pulled cross-node due to wakeups or load balancing. 5232 | + */ 5233 | + if (p->numa_scan_seq) { 5234 | + /* 5235 | + * Avoid scan adjustments if moving to the preferred 5236 | + * node or if the task was not previously running on 5237 | + * the preferred node. 
5238 | + */ 5239 | + if (dst_nid == p->numa_preferred_nid || 5240 | + (p->numa_preferred_nid != NUMA_NO_NODE && 5241 | + src_nid != p->numa_preferred_nid)) 5242 | + return; 5243 | + } 5244 | + 5245 | + p->numa_scan_period = task_scan_start(p); 5246 | +} 5247 | + 5248 | +#else 5249 | +static void task_tick_numa(struct rq *rq, struct task_struct *curr) 5250 | +{ 5251 | +} 5252 | + 5253 | +static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) 5254 | +{ 5255 | +} 5256 | + 5257 | +static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) 5258 | +{ 5259 | +} 5260 | + 5261 | +static inline void update_scan_period(struct task_struct *p, int new_cpu) 5262 | +{ 5263 | +} 5264 | + 5265 | +#endif /* CONFIG_NUMA_BALANCING */ 5266 | diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c 5267 | index 31231925f1ec..95e7f83b5ab8 100644 5268 | --- a/kernel/sched/idle.c 5269 | +++ b/kernel/sched/idle.c 5270 | @@ -311,6 +311,7 @@ static void do_idle(void) 5271 | } else { 5272 | cpuidle_idle_call(); 5273 | } 5274 | + 5275 | arch_cpu_idle_exit(); 5276 | } 5277 | 5278 | diff --git a/kernel/sched/nohz.h b/kernel/sched/nohz.h 5279 | new file mode 100644 5280 | index 000000000000..f00aeacb8f23 5281 | --- /dev/null 5282 | +++ b/kernel/sched/nohz.h 5283 | @@ -0,0 +1,511 @@ 5284 | +#ifdef CONFIG_NO_HZ_COMMON 5285 | +static struct { 5286 | + cpumask_var_t idle_cpus_mask; 5287 | + atomic_t nr_cpus; 5288 | + int has_blocked; /* Idle CPUS has blocked load */ 5289 | + int needs_update; /* Newly idle CPUs need their next_balance collated */ 5290 | + unsigned long next_balance; /* in jiffy units */ 5291 | + unsigned long next_blocked; /* Next update of blocked load in jiffies */ 5292 | +} nohz ____cacheline_aligned; 5293 | + 5294 | +static bool update_nohz_stats(struct rq *rq) 5295 | +{ 5296 | + unsigned int cpu = rq->cpu; 5297 | + 5298 | + if (!rq->has_blocked_load) 5299 | + return false; 5300 | + 5301 | + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) 5302 | + return false; 5303 | + 5304 | + if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) 5305 | + return true; 5306 | + 5307 | + return rq->has_blocked_load; 5308 | +} 5309 | + 5310 | +/* 5311 | + * Internal function that runs load balance for all idle cpus. The load balance 5312 | + * can be a simple update of blocked load or a complete load balance with 5313 | + * tasks movement depending of flags. 5314 | + */ 5315 | +static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) 5316 | +{ 5317 | + /* Earliest time when we have to do rebalance again */ 5318 | + unsigned long now = jiffies; 5319 | + unsigned long next_balance = now + 60*HZ; 5320 | + bool has_blocked_load = false; 5321 | + int update_next_balance = 0; 5322 | + int this_cpu = this_rq->cpu; 5323 | + int balance_cpu; 5324 | + struct rq *rq; 5325 | + 5326 | + SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); 5327 | + 5328 | + /* 5329 | + * We assume there will be no idle load after this update and clear 5330 | + * the has_blocked flag. If a cpu enters idle in the mean time, it will 5331 | + * set the has_blocked flag and trigger another update of idle load. 5332 | + * Because a cpu that becomes idle, is added to idle_cpus_mask before 5333 | + * setting the flag, we are sure to not clear the state and not 5334 | + * check the load of an idle cpu. 5335 | + * 5336 | + * Same applies to idle_cpus_mask vs needs_update. 
5337 | + */ 5338 | + if (flags & NOHZ_STATS_KICK) 5339 | + WRITE_ONCE(nohz.has_blocked, 0); 5340 | + if (flags & NOHZ_NEXT_KICK) 5341 | + WRITE_ONCE(nohz.needs_update, 0); 5342 | + 5343 | + /* 5344 | + * Ensures that if we miss the CPU, we must see the has_blocked 5345 | + * store from nohz_balance_enter_idle(). 5346 | + */ 5347 | + smp_mb(); 5348 | + 5349 | + /* 5350 | + * Start with the next CPU after this_cpu so we will end with this_cpu and let a 5351 | + * chance for other idle cpu to pull load. 5352 | + */ 5353 | + for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) { 5354 | + if (!idle_cpu(balance_cpu)) 5355 | + continue; 5356 | + 5357 | + /* 5358 | + * If this CPU gets work to do, stop the load balancing 5359 | + * work being done for other CPUs. Next load 5360 | + * balancing owner will pick it up. 5361 | + */ 5362 | + if (need_resched()) { 5363 | + if (flags & NOHZ_STATS_KICK) 5364 | + has_blocked_load = true; 5365 | + if (flags & NOHZ_NEXT_KICK) 5366 | + WRITE_ONCE(nohz.needs_update, 1); 5367 | + goto abort; 5368 | + } 5369 | + 5370 | + rq = cpu_rq(balance_cpu); 5371 | + 5372 | + if (flags & NOHZ_STATS_KICK) 5373 | + has_blocked_load |= update_nohz_stats(rq); 5374 | + 5375 | + /* 5376 | + * If time for next balance is due, 5377 | + * do the balance. 5378 | + */ 5379 | + if (time_after_eq(jiffies, rq->next_balance)) { 5380 | + struct rq_flags rf; 5381 | + 5382 | + rq_lock_irqsave(rq, &rf); 5383 | + update_rq_clock(rq); 5384 | + rq_unlock_irqrestore(rq, &rf); 5385 | + 5386 | + if (flags & NOHZ_BALANCE_KICK) 5387 | + idle_balance(rq); 5388 | + } 5389 | + 5390 | + if (time_after(next_balance, rq->next_balance)) { 5391 | + next_balance = rq->next_balance; 5392 | + update_next_balance = 1; 5393 | + } 5394 | + } 5395 | + 5396 | + /* 5397 | + * next_balance will be updated only when there is a need. 5398 | + * When the CPU is attached to null domain for ex, it will not be 5399 | + * updated. 5400 | + */ 5401 | + if (likely(update_next_balance)) 5402 | + nohz.next_balance = next_balance; 5403 | + 5404 | + if (flags & NOHZ_STATS_KICK) 5405 | + WRITE_ONCE(nohz.next_blocked, 5406 | + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); 5407 | + 5408 | +abort: 5409 | + /* There is still blocked load, enable periodic update */ 5410 | + if (has_blocked_load) 5411 | + WRITE_ONCE(nohz.has_blocked, 1); 5412 | +} 5413 | + 5414 | +/* 5415 | + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 5416 | + * rebalancing for all the cpus for whom scheduler ticks are stopped. 5417 | + */ 5418 | +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) 5419 | +{ 5420 | + unsigned int flags = this_rq->nohz_idle_balance; 5421 | + 5422 | + if (!flags) 5423 | + return false; 5424 | + 5425 | + this_rq->nohz_idle_balance = 0; 5426 | + 5427 | + if (idle != CPU_IDLE) 5428 | + return false; 5429 | + 5430 | + _nohz_idle_balance(this_rq, flags); 5431 | + 5432 | + return true; 5433 | +} 5434 | + 5435 | +/* 5436 | + * Check if we need to directly run the ILB for updating blocked load before 5437 | + * entering idle state. Here we run ILB directly without issuing IPIs. 5438 | + * 5439 | + * Note that when this function is called, the tick may not yet be stopped on 5440 | + * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and 5441 | + * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates 5442 | + * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle 5443 | + * entry/exit rate (usec). 
So it is possible that _nohz_idle_balance() is 5444 | + * called from this function on (this) CPU that's not yet in the mask. That's 5445 | + * OK because the goal of nohz_run_idle_balance() is to run ILB only for 5446 | + * updating the blocked load of already idle CPUs without waking up one of 5447 | + * those idle CPUs and outside the preempt disable / irq off phase of the local 5448 | + * cpu about to enter idle, because it can take a long time. 5449 | + */ 5450 | +void nohz_run_idle_balance(int cpu) 5451 | +{ 5452 | + unsigned int flags; 5453 | + 5454 | + flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu)); 5455 | + 5456 | + /* 5457 | + * Update the blocked load only if no SCHED_SOFTIRQ is about to happen 5458 | + * (ie NOHZ_STATS_KICK set) and will do the same. 5459 | + */ 5460 | + if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) 5461 | + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); 5462 | +} 5463 | + 5464 | +static void set_cpu_sd_state_busy(int cpu) 5465 | +{ 5466 | + struct sched_domain *sd; 5467 | + 5468 | + rcu_read_lock(); 5469 | + sd = rcu_dereference(per_cpu(sd_llc, cpu)); 5470 | + 5471 | + if (!sd || !sd->nohz_idle) 5472 | + goto unlock; 5473 | + sd->nohz_idle = 0; 5474 | + 5475 | + atomic_inc(&sd->shared->nr_busy_cpus); 5476 | +unlock: 5477 | + rcu_read_unlock(); 5478 | +} 5479 | + 5480 | +void nohz_balance_exit_idle(struct rq *rq) 5481 | +{ 5482 | + SCHED_WARN_ON(rq != this_rq()); 5483 | + 5484 | + if (likely(!rq->nohz_tick_stopped)) 5485 | + return; 5486 | + 5487 | + rq->nohz_tick_stopped = 0; 5488 | + cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); 5489 | + atomic_dec(&nohz.nr_cpus); 5490 | + 5491 | + set_cpu_sd_state_busy(rq->cpu); 5492 | +} 5493 | + 5494 | +static void set_cpu_sd_state_idle(int cpu) 5495 | +{ 5496 | + struct sched_domain *sd; 5497 | + 5498 | + rcu_read_lock(); 5499 | + sd = rcu_dereference(per_cpu(sd_llc, cpu)); 5500 | + 5501 | + if (!sd || sd->nohz_idle) 5502 | + goto unlock; 5503 | + sd->nohz_idle = 1; 5504 | + 5505 | + atomic_dec(&sd->shared->nr_busy_cpus); 5506 | +unlock: 5507 | + rcu_read_unlock(); 5508 | +} 5509 | + 5510 | +/* 5511 | + * This routine will record that the CPU is going idle with tick stopped. 5512 | + * This info will be used in performing idle load balancing in the future. 5513 | + */ 5514 | +void nohz_balance_enter_idle(int cpu) 5515 | +{ 5516 | + struct rq *rq = cpu_rq(cpu); 5517 | + 5518 | + SCHED_WARN_ON(cpu != smp_processor_id()); 5519 | + 5520 | + /* If this CPU is going down, then nothing needs to be done: */ 5521 | + if (!cpu_active(cpu)) 5522 | + return; 5523 | + 5524 | + /* Spare idle load balancing on CPUs that don't want to be disturbed: */ 5525 | + if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) 5526 | + return; 5527 | + 5528 | + /* 5529 | + * Can be set safely without rq->lock held 5530 | + * If a clear happens, it will have evaluated last additions because 5531 | + * rq->lock is held during the check and the clear 5532 | + */ 5533 | + rq->has_blocked_load = 1; 5534 | + 5535 | + /* 5536 | + * The tick is still stopped but load could have been added in the 5537 | + * meantime. We set the nohz.has_blocked flag to trig a check of the 5538 | + * *_avg. 
The CPU is already part of nohz.idle_cpus_mask so the clear 5539 | + * of nohz.has_blocked can only happen after checking the new load 5540 | + */ 5541 | + if (rq->nohz_tick_stopped) 5542 | + goto out; 5543 | + 5544 | + /* If we're a completely isolated CPU, we don't play: */ 5545 | + if (on_null_domain(rq)) 5546 | + return; 5547 | + 5548 | + rq->nohz_tick_stopped = 1; 5549 | + 5550 | + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 5551 | + atomic_inc(&nohz.nr_cpus); 5552 | + 5553 | + /* 5554 | + * Ensures that if nohz_idle_balance() fails to observe our 5555 | + * @idle_cpus_mask store, it must observe the @has_blocked 5556 | + * and @needs_update stores. 5557 | + */ 5558 | + smp_mb__after_atomic(); 5559 | + 5560 | + set_cpu_sd_state_idle(cpu); 5561 | + 5562 | + WRITE_ONCE(nohz.needs_update, 1); 5563 | +out: 5564 | + /* 5565 | + * Each time a cpu enter idle, we assume that it has blocked load and 5566 | + * enable the periodic update of the load of idle cpus 5567 | + */ 5568 | + WRITE_ONCE(nohz.has_blocked, 1); 5569 | +} 5570 | + 5571 | +/* 5572 | + * run_rebalance_domains is triggered when needed from the scheduler tick. 5573 | + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). 5574 | + */ 5575 | +static __latent_entropy void run_rebalance_domains(struct softirq_action *h) 5576 | +{ 5577 | + struct rq *this_rq = this_rq(); 5578 | + enum cpu_idle_type idle = this_rq->idle_balance ? 5579 | + CPU_IDLE : CPU_NOT_IDLE; 5580 | + 5581 | + /* 5582 | + * If this CPU has a pending nohz_balance_kick, then do the 5583 | + * balancing on behalf of the other idle CPUs whose ticks are 5584 | + * stopped. Do nohz_idle_balance *before* rebalance_domains to 5585 | + * give the idle CPUs a chance to load balance. Else we may 5586 | + * load balance only within the local sched_domain hierarchy 5587 | + * and abort nohz_idle_balance altogether if we pull some load. 5588 | + */ 5589 | + if (nohz_idle_balance(this_rq, idle)) 5590 | + return; 5591 | + 5592 | + /* normal load balance */ 5593 | + update_blocked_averages(this_rq->cpu); 5594 | +} 5595 | + 5596 | +static inline int find_new_ilb(void) 5597 | +{ 5598 | + const struct cpumask *hk_mask; 5599 | + int ilb_cpu; 5600 | + 5601 | + hk_mask = housekeeping_cpumask(HK_TYPE_MISC); 5602 | + 5603 | + for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { 5604 | + 5605 | + if (ilb_cpu == smp_processor_id()) 5606 | + continue; 5607 | + 5608 | + if (idle_cpu(ilb_cpu)) 5609 | + return ilb_cpu; 5610 | + } 5611 | + 5612 | + return -1; 5613 | +} 5614 | + 5615 | +/* 5616 | + * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU 5617 | + * SMP function call (IPI). 5618 | + * 5619 | + * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). 5620 | + */ 5621 | +static void kick_ilb(unsigned int flags) 5622 | +{ 5623 | + int ilb_cpu; 5624 | + 5625 | + /* 5626 | + * Increase nohz.next_balance only when if full ilb is triggered but 5627 | + * not if we only update stats. 5628 | + */ 5629 | + if (flags & NOHZ_BALANCE_KICK) 5630 | + nohz.next_balance = jiffies+1; 5631 | + 5632 | + ilb_cpu = find_new_ilb(); 5633 | + if (ilb_cpu < 0) 5634 | + return; 5635 | + 5636 | + /* 5637 | + * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets 5638 | + * the first flag owns it; cleared by nohz_csd_func(). 
5639 | + */ 5640 | + flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); 5641 | + if (flags & NOHZ_KICK_MASK) 5642 | + return; 5643 | + 5644 | + /* 5645 | + * This way we generate an IPI on the target CPU which 5646 | + * is idle, and the softirq performing NOHZ idle load balancing 5647 | + * will be run before returning from the IPI. 5648 | + */ 5649 | + smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); 5650 | +} 5651 | + 5652 | +static inline int 5653 | +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) 5654 | +{ 5655 | + return ((rq->cpu_capacity * sd->imbalance_pct) < 5656 | + (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); 5657 | +} 5658 | + 5659 | +static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) 5660 | +{ 5661 | + if (!sched_smt_active()) 5662 | + return true; 5663 | + 5664 | + return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); 5665 | +} 5666 | + 5667 | +static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) 5668 | +{ 5669 | + return rq->misfit_task_load && 5670 | + (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || 5671 | + check_cpu_capacity(rq, sd)); 5672 | +} 5673 | + 5674 | +/* 5675 | + * Current decision point for kicking the idle load balancer in the presence 5676 | + * of idle CPUs in the system. 5677 | + */ 5678 | +static void nohz_balancer_kick(struct rq *rq) 5679 | +{ 5680 | + unsigned long now = jiffies; 5681 | + struct sched_domain_shared *sds; 5682 | + struct sched_domain *sd; 5683 | + int nr_busy, i, cpu = rq->cpu; 5684 | + unsigned int flags = 0; 5685 | + 5686 | + if (unlikely(rq->idle_balance)) 5687 | + return; 5688 | + 5689 | + /* 5690 | + * We may be recently in ticked or tickless idle mode. At the first 5691 | + * busy tick after returning from idle, we will update the busy stats. 5692 | + */ 5693 | + nohz_balance_exit_idle(rq); 5694 | + 5695 | + /* 5696 | + * None are in tickless mode and hence no need for NOHZ idle load 5697 | + * balancing: 5698 | + */ 5699 | + if (likely(!atomic_read(&nohz.nr_cpus))) 5700 | + return; 5701 | + 5702 | + if (READ_ONCE(nohz.has_blocked) && 5703 | + time_after(now, READ_ONCE(nohz.next_blocked))) 5704 | + flags = NOHZ_STATS_KICK; 5705 | + 5706 | + if (time_before(now, nohz.next_balance)) 5707 | + goto out; 5708 | + 5709 | + if (rq->nr_running >= 2) { 5710 | + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 5711 | + goto out; 5712 | + } 5713 | + 5714 | + rcu_read_lock(); 5715 | + 5716 | + sd = rcu_dereference(rq->sd); 5717 | + if (sd) { 5718 | + /* 5719 | + * If there's a runnable CFS task and the current CPU has reduced 5720 | + * capacity, kick the ILB to see if there's a better CPU to run on: 5721 | + */ 5722 | + if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { 5723 | + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 5724 | + goto unlock; 5725 | + } 5726 | + } 5727 | + 5728 | + sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); 5729 | + if (sd) { 5730 | + /* 5731 | + * When ASYM_PACKING; see if there's a more preferred CPU 5732 | + * currently idle; in which case, kick the ILB to move tasks 5733 | + * around. 5734 | + * 5735 | + * When balancing betwen cores, all the SMT siblings of the 5736 | + * preferred CPU must be idle. 
5737 | + */ 5738 | + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { 5739 | + if (sched_use_asym_prio(sd, i) && 5740 | + sched_asym_prefer(i, cpu)) { 5741 | + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 5742 | + goto unlock; 5743 | + } 5744 | + } 5745 | + } 5746 | + 5747 | + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); 5748 | + if (sd) { 5749 | + /* 5750 | + * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU 5751 | + * to run the misfit task on. 5752 | + */ 5753 | + if (check_misfit_status(rq, sd)) { 5754 | + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 5755 | + goto unlock; 5756 | + } 5757 | + 5758 | + /* 5759 | + * For asymmetric systems, we do not want to nicely balance 5760 | + * cache use, instead we want to embrace asymmetry and only 5761 | + * ensure tasks have enough CPU capacity. 5762 | + * 5763 | + * Skip the LLC logic because it's not relevant in that case. 5764 | + */ 5765 | + goto unlock; 5766 | + } 5767 | + 5768 | + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 5769 | + if (sds) { 5770 | + /* 5771 | + * If there is an imbalance between LLC domains (IOW we could 5772 | + * increase the overall cache utilization), we need a less-loaded LLC 5773 | + * domain to pull some load from. Likewise, we may need to spread 5774 | + * load within the current LLC domain (e.g. packed SMT cores but 5775 | + * other CPUs are idle). We can't really know from here how busy 5776 | + * the others are - so just get a NOHZ balance going if it looks 5777 | + * like this LLC domain has tasks we could move. 5778 | + */ 5779 | + nr_busy = atomic_read(&sds->nr_busy_cpus); 5780 | + if (nr_busy > 1) { 5781 | + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 5782 | + goto unlock; 5783 | + } 5784 | + } 5785 | +unlock: 5786 | + rcu_read_unlock(); 5787 | +out: 5788 | + if (READ_ONCE(nohz.needs_update)) 5789 | + flags |= NOHZ_NEXT_KICK; 5790 | + 5791 | + if (flags) 5792 | + kick_ilb(flags); 5793 | +} 5794 | +#endif /* CONFIG_NO_HZ_COMMON */ 5795 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 5796 | index 001fe047bd5d..e27be055ca86 100644 5797 | --- a/kernel/sched/sched.h 5798 | +++ b/kernel/sched/sched.h 5799 | @@ -109,6 +109,10 @@ extern int sysctl_sched_rt_period; 5800 | extern int sysctl_sched_rt_runtime; 5801 | extern int sched_rr_timeslice; 5802 | 5803 | +#ifdef CONFIG_ECHO_SCHED 5804 | +extern unsigned int bs_shared_quota; 5805 | +#endif 5806 | + 5807 | /* 5808 | * Helpers for converting nanosecond timing to jiffy resolution 5809 | */ 5810 | @@ -574,7 +578,9 @@ struct cfs_rq { 5811 | unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ 5812 | unsigned int idle_nr_running; /* SCHED_IDLE */ 5813 | unsigned int idle_h_nr_running; /* SCHED_IDLE */ 5814 | - 5815 | +#ifdef CONFIG_ECHO_SCHED 5816 | + u64 local_cand_est; 5817 | +#endif 5818 | s64 avg_vruntime; 5819 | u64 avg_load; 5820 | 5821 | @@ -596,6 +602,10 @@ struct cfs_rq { 5822 | * It is set to NULL otherwise (i.e when none are currently running). 
5823 | */ 5824 | struct sched_entity *curr; 5825 | +#ifdef CONFIG_ECHO_SCHED 5826 | + struct bs_node *head; 5827 | + struct bs_node *q2_head; 5828 | +#endif 5829 | struct sched_entity *next; 5830 | 5831 | #ifdef CONFIG_SCHED_DEBUG 5832 | @@ -1891,6 +1901,7 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); 5833 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); 5834 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); 5835 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); 5836 | + 5837 | extern struct static_key_false sched_asym_cpucapacity; 5838 | extern struct static_key_false sched_cluster_active; 5839 | 5840 | diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig 5841 | index bae8f11070be..93caca5d2528 100644 5842 | --- a/kernel/time/Kconfig 5843 | +++ b/kernel/time/Kconfig 5844 | @@ -119,7 +119,7 @@ config NO_HZ_FULL 5845 | bool "Full dynticks system (tickless)" 5846 | # NO_HZ_COMMON dependency 5847 | # We need at least one periodic CPU for timekeeping 5848 | - depends on SMP 5849 | + depends on SMP && !ECHO_SCHED 5850 | depends on HAVE_CONTEXT_TRACKING_USER 5851 | # VIRT_CPU_ACCOUNTING_GEN dependency 5852 | depends on HAVE_VIRT_CPU_ACCOUNTING_GEN 5853 | -------------------------------------------------------------------------------- /6.8.y/powersave.patch: -------------------------------------------------------------------------------- 1 | diff --git a/kernel/sched/balancer.h b/kernel/sched/balancer.h 2 | index 852faad1fc1d..bd44a2512fc0 100644 3 | --- a/kernel/sched/balancer.h 4 | +++ b/kernel/sched/balancer.h 5 | @@ -8,6 +8,308 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 6 | return newidle_balance(rq, rf) != 0; 7 | } 8 | 9 | +struct energy_env { 10 | + unsigned long task_busy_time; 11 | + unsigned long pd_busy_time; 12 | + unsigned long cpu_cap; 13 | + unsigned long pd_cap; 14 | +}; 15 | + 16 | +static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); 17 | + 18 | +static inline unsigned long 19 | +eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, 20 | + struct task_struct *p, int dst_cpu) 21 | +{ 22 | + unsigned long max_util = 0; 23 | + int cpu; 24 | + 25 | + for_each_cpu(cpu, pd_cpus) { 26 | + struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; 27 | + unsigned long util = cpu_util(cpu, p, dst_cpu, 1); 28 | + unsigned long eff_util, min, max; 29 | + 30 | + /* 31 | + * Performance domain frequency: utilization clamping 32 | + * must be considered since it affects the selection 33 | + * of the performance domain frequency. 34 | + * NOTE: in case RT tasks are running, by default the 35 | + * FREQUENCY_UTIL's utilization can be max OPP. 36 | + */ 37 | + eff_util = effective_cpu_util(cpu, util, &min, &max); 38 | + 39 | + /* Task's uclamp can modify min and max value */ 40 | + if (tsk && uclamp_is_used()) { 41 | + min = max(min, uclamp_eff_value(p, UCLAMP_MIN)); 42 | + 43 | + /* 44 | + * If there is no active max uclamp constraint, 45 | + * directly use task's one, otherwise keep max. 
46 | + */ 47 | + if (uclamp_rq_is_idle(cpu_rq(cpu))) 48 | + max = uclamp_eff_value(p, UCLAMP_MAX); 49 | + else 50 | + max = max(max, uclamp_eff_value(p, UCLAMP_MAX)); 51 | + } 52 | + 53 | + eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max); 54 | + max_util = max(max_util, eff_util); 55 | + } 56 | + 57 | + return min(max_util, eenv->cpu_cap); 58 | +} 59 | + 60 | +static inline unsigned long 61 | +compute_energy(struct energy_env *eenv, struct perf_domain *pd, 62 | + struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu) 63 | +{ 64 | + unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); 65 | + unsigned long busy_time = eenv->pd_busy_time; 66 | + unsigned long energy; 67 | + 68 | + if (dst_cpu >= 0) 69 | + busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); 70 | + 71 | + energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); 72 | + 73 | + trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time); 74 | + 75 | + return energy; 76 | +} 77 | + 78 | +static inline void eenv_pd_busy_time(struct energy_env *eenv, 79 | + struct cpumask *pd_cpus, 80 | + struct task_struct *p) 81 | +{ 82 | + unsigned long busy_time = 0; 83 | + int cpu; 84 | + 85 | + for_each_cpu(cpu, pd_cpus) { 86 | + unsigned long util = cpu_util(cpu, p, -1, 0); 87 | + 88 | + busy_time += effective_cpu_util(cpu, util, NULL, NULL); 89 | + } 90 | + 91 | + eenv->pd_busy_time = min(eenv->pd_cap, busy_time); 92 | +} 93 | + 94 | +static inline void eenv_task_busy_time(struct energy_env *eenv, 95 | + struct task_struct *p, int prev_cpu) 96 | +{ 97 | + unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu); 98 | + unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu)); 99 | + 100 | + if (unlikely(irq >= max_cap)) 101 | + busy_time = max_cap; 102 | + else 103 | + busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap); 104 | + 105 | + eenv->task_busy_time = busy_time; 106 | +} 107 | + 108 | +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) 109 | +{ 110 | + return u64_u32_load_copy(cfs_rq->avg.last_update_time, 111 | + cfs_rq->last_update_time_copy); 112 | +} 113 | + 114 | +static void sync_entity_load_avg(struct sched_entity *se) 115 | +{ 116 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 117 | + u64 last_update_time; 118 | + 119 | + last_update_time = cfs_rq_last_update_time(cfs_rq); 120 | + __update_load_avg_blocked_se(last_update_time, se); 121 | +} 122 | + 123 | +static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) 124 | +{ 125 | + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); 126 | + unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; 127 | + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0; 128 | + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; 129 | + struct root_domain *rd = this_rq()->rd; 130 | + int cpu, best_energy_cpu, target = -1; 131 | + int prev_fits = -1, best_fits = -1; 132 | + unsigned long best_thermal_cap = 0; 133 | + unsigned long prev_thermal_cap = 0; 134 | + struct sched_domain *sd; 135 | + struct perf_domain *pd; 136 | + struct energy_env eenv; 137 | + 138 | + rcu_read_lock(); 139 | + pd = rcu_dereference(rd->pd); 140 | + if (!pd || READ_ONCE(rd->overutilized)) 141 | + goto unlock; 142 | + 143 | + /* 144 | + * Energy-aware wake-up happens on the lowest sched_domain starting 145 | + * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. 
146 | + */ 147 | + sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); 148 | + while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 149 | + sd = sd->parent; 150 | + if (!sd) 151 | + goto unlock; 152 | + 153 | + target = prev_cpu; 154 | + 155 | + sync_entity_load_avg(&p->se); 156 | + if (!task_util_est(p) && p_util_min == 0) 157 | + goto unlock; 158 | + 159 | + eenv_task_busy_time(&eenv, p, prev_cpu); 160 | + 161 | + for (; pd; pd = pd->next) { 162 | + unsigned long util_min = p_util_min, util_max = p_util_max; 163 | + unsigned long cpu_cap, cpu_thermal_cap, util; 164 | + long prev_spare_cap = -1, max_spare_cap = -1; 165 | + unsigned long rq_util_min, rq_util_max; 166 | + unsigned long cur_delta, base_energy; 167 | + int max_spare_cap_cpu = -1; 168 | + int fits, max_fits = -1; 169 | + 170 | + cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); 171 | + 172 | + if (cpumask_empty(cpus)) 173 | + continue; 174 | + 175 | + /* Account thermal pressure for the energy estimation */ 176 | + cpu = cpumask_first(cpus); 177 | + cpu_thermal_cap = arch_scale_cpu_capacity(cpu); 178 | + cpu_thermal_cap -= arch_scale_thermal_pressure(cpu); 179 | + 180 | + eenv.cpu_cap = cpu_thermal_cap; 181 | + eenv.pd_cap = 0; 182 | + 183 | + for_each_cpu(cpu, cpus) { 184 | + struct rq *rq = cpu_rq(cpu); 185 | + 186 | + eenv.pd_cap += cpu_thermal_cap; 187 | + 188 | + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) 189 | + continue; 190 | + 191 | + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 192 | + continue; 193 | + 194 | + util = cpu_util(cpu, p, cpu, 0); 195 | + cpu_cap = capacity_of(cpu); 196 | + 197 | + /* 198 | + * Skip CPUs that cannot satisfy the capacity request. 199 | + * IOW, placing the task there would make the CPU 200 | + * overutilized. Take uclamp into account to see how 201 | + * much capacity we can get out of the CPU; this is 202 | + * aligned with sched_cpu_util(). 203 | + */ 204 | + if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { 205 | + /* 206 | + * Open code uclamp_rq_util_with() except for 207 | + * the clamp() part. Ie: apply max aggregation 208 | + * only. util_fits_cpu() logic requires to 209 | + * operate on non clamped util but must use the 210 | + * max-aggregated uclamp_{min, max}. 211 | + */ 212 | + rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN); 213 | + rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX); 214 | + 215 | + util_min = max(rq_util_min, p_util_min); 216 | + util_max = max(rq_util_max, p_util_max); 217 | + } 218 | + 219 | + fits = util_fits_cpu(util, util_min, util_max, cpu); 220 | + if (!fits) 221 | + continue; 222 | + 223 | + lsub_positive(&cpu_cap, util); 224 | + 225 | + if (cpu == prev_cpu) { 226 | + /* Always use prev_cpu as a candidate. */ 227 | + prev_spare_cap = cpu_cap; 228 | + prev_fits = fits; 229 | + } else if ((fits > max_fits) || 230 | + ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) { 231 | + /* 232 | + * Find the CPU with the maximum spare capacity 233 | + * among the remaining CPUs in the performance 234 | + * domain. 235 | + */ 236 | + max_spare_cap = cpu_cap; 237 | + max_spare_cap_cpu = cpu; 238 | + max_fits = fits; 239 | + } 240 | + } 241 | + 242 | + if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) 243 | + continue; 244 | + 245 | + eenv_pd_busy_time(&eenv, cpus, p); 246 | + /* Compute the 'base' energy of the pd, without @p */ 247 | + base_energy = compute_energy(&eenv, pd, cpus, p, -1); 248 | + 249 | + /* Evaluate the energy impact of using prev_cpu. 
*/ 250 | + if (prev_spare_cap > -1) { 251 | + prev_delta = compute_energy(&eenv, pd, cpus, p, 252 | + prev_cpu); 253 | + /* CPU utilization has changed */ 254 | + if (prev_delta < base_energy) 255 | + goto unlock; 256 | + prev_delta -= base_energy; 257 | + prev_thermal_cap = cpu_thermal_cap; 258 | + best_delta = min(best_delta, prev_delta); 259 | + } 260 | + 261 | + /* Evaluate the energy impact of using max_spare_cap_cpu. */ 262 | + if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { 263 | + /* Current best energy cpu fits better */ 264 | + if (max_fits < best_fits) 265 | + continue; 266 | + 267 | + /* 268 | + * Both don't fit performance hint (i.e. uclamp_min) 269 | + * but best energy cpu has better capacity. 270 | + */ 271 | + if ((max_fits < 0) && 272 | + (cpu_thermal_cap <= best_thermal_cap)) 273 | + continue; 274 | + 275 | + cur_delta = compute_energy(&eenv, pd, cpus, p, 276 | + max_spare_cap_cpu); 277 | + /* CPU utilization has changed */ 278 | + if (cur_delta < base_energy) 279 | + goto unlock; 280 | + cur_delta -= base_energy; 281 | + 282 | + /* 283 | + * Both fit for the task but best energy cpu has lower 284 | + * energy impact. 285 | + */ 286 | + if ((max_fits > 0) && (best_fits > 0) && 287 | + (cur_delta >= best_delta)) 288 | + continue; 289 | + 290 | + best_delta = cur_delta; 291 | + best_energy_cpu = max_spare_cap_cpu; 292 | + best_fits = max_fits; 293 | + best_thermal_cap = cpu_thermal_cap; 294 | + } 295 | + } 296 | + rcu_read_unlock(); 297 | + 298 | + if ((best_fits > prev_fits) || 299 | + ((best_fits > 0) && (best_delta < prev_delta)) || 300 | + ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) 301 | + target = best_energy_cpu; 302 | + 303 | + return target; 304 | + 305 | +unlock: 306 | + rcu_read_unlock(); 307 | + 308 | + return target; 309 | +} 310 | + 311 | static int 312 | wake_affine_idle(int this_cpu, int prev_cpu, int sync) 313 | { 314 | @@ -90,13 +392,20 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) 315 | unsigned int min = rq->nr_running; 316 | int this_cpu = smp_processor_id(); 317 | 318 | - if (wake_flags & WF_TTWU) { 319 | + if (IS_PWR_SAVE_ENABLED || (wake_flags & WF_TTWU)) { 320 | record_wakee(p); 321 | 322 | if ((wake_flags & WF_CURRENT_CPU) && 323 | cpumask_test_cpu(cpu, p->cpus_ptr)) 324 | return cpu; 325 | 326 | + if (IS_PWR_SAVE_ENABLED || sched_energy_enabled()) { 327 | + new_cpu = find_energy_efficient_cpu(p, prev_cpu); 328 | + if (new_cpu >= 0) 329 | + return new_cpu; 330 | + new_cpu = prev_cpu; 331 | + } 332 | + 333 | want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); 334 | } 335 | 336 | @@ -382,6 +691,64 @@ static inline int migrate_degrades_locality(struct task_struct *p, struct rq *ds 337 | } 338 | #endif 339 | 340 | +static int 341 | +can_migrate_task_powersave(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 342 | +{ 343 | + int tsk_cache_hot; 344 | + 345 | + /* Disregard pcpu kthreads; they are where they need to be. 
*/ 346 | + if (kthread_is_per_cpu(p)) 347 | + return 0; 348 | + 349 | + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) 350 | + return 0; 351 | + 352 | + if (task_on_cpu(src_rq, p)) 353 | + return 0; 354 | + 355 | + tsk_cache_hot = migrate_degrades_locality(p, dst_rq, src_rq); 356 | + if (tsk_cache_hot > 0) 357 | + return 0; 358 | + 359 | + return 1; 360 | +} 361 | + 362 | +static int move_task_powersave(struct rq *dist_rq, struct rq *src_rq, 363 | + struct rq_flags *src_rf) 364 | +{ 365 | + struct cfs_rq *src_cfs_rq = &src_rq->cfs; 366 | + struct task_struct *p; 367 | + struct bs_node *bsn = src_cfs_rq->head; 368 | + struct lb_env env = { 369 | + .dst_cpu = cpu_of(dist_rq), 370 | + .dst_rq = dist_rq, 371 | + .src_cpu = cpu_of(src_rq), 372 | + .src_rq = src_rq, 373 | + .src_rf = src_rf, 374 | + .idle = dist_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE, 375 | + }; 376 | + 377 | + while (bsn) { 378 | + p = task_of(se_of(bsn)); 379 | + if (can_migrate_task_powersave(p, dist_rq, src_rq)) { 380 | + pull_from(p, &env); 381 | + return 1; 382 | + } 383 | + 384 | + bsn = bsn->next; 385 | + } 386 | + 387 | + /* 388 | + * Here we know we have not migrated any task, 389 | + * thus, we need to unlock and return 0 390 | + * Note: the pull_from does the unlocking for us. 391 | + */ 392 | + rq_unlock(src_rq, src_rf); 393 | + local_irq_restore(src_rf->flags); 394 | + 395 | + return 0; 396 | +} 397 | + 398 | #define MIN_HOTNESS 0x7FFFFFFFFFFFFFFLL 399 | 400 | static s64 task_hotness(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 401 | @@ -559,6 +926,9 @@ static void idle_balance(struct rq *this_rq) 402 | unsigned int max = 0; 403 | struct rq_flags src_rf; 404 | 405 | + if (IS_PWR_SAVE_ENABLED) 406 | + return; 407 | + 408 | if (idle_pull_global_candidate(this_rq)) 409 | return; 410 | 411 | @@ -727,7 +1097,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) 412 | /* 413 | * Do not pull tasks towards !active CPUs... 
414 | */
415 | - if (!cpu_active(this_cpu))
416 | + if (IS_PWR_SAVE_ENABLED || !cpu_active(this_cpu))
417 | return 0;
418 | 
419 | rq_unpin_lock(this_rq, rf);
420 | @@ -849,7 +1219,9 @@ static void rebalance(struct rq *this_rq)
421 | return;
422 | }
423 | 
424 | - if(move_task(min_rq, max_rq, &src_rf))
425 | + if (IS_PWR_SAVE_ENABLED && idle_cpu(cpu_of(min_rq)) && max - min == 2)
426 | + move_task_powersave(min_rq, max_rq, &src_rf);
427 | + else if(move_task(min_rq, max_rq, &src_rf))
428 | goto again;
429 | }
430 | 
431 | diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c
432 | index 7760327e5194..f1c67710e8cd 100644
433 | --- a/kernel/sched/bs.c
434 | +++ b/kernel/sched/bs.c
435 | @@ -17,6 +17,7 @@
436 | unsigned int sysctl_sched_base_slice = 7000ULL;
437 | unsigned int bs_shared_quota = 105000ULL; // 105us
438 | u32 alpha = 500U;
439 | +unsigned int __read_mostly echo_powersave = 0;
440 | 
441 | struct lb_env {
442 | struct rq *src_rq;
443 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c
444 | index 49df55bb0ba7..6e9deddc59d6 100644
445 | --- a/kernel/sched/core.c
446 | +++ b/kernel/sched/core.c
447 | @@ -4701,6 +4701,15 @@ static struct ctl_table sched_core_sysctls[] = {
448 | .mode = 0644,
449 | .proc_handler = proc_dointvec,
450 | },
451 | + {
452 | + .procname = "sched_echo_powersave",
453 | + .data = &echo_powersave,
454 | + .maxlen = sizeof(int),
455 | + .mode = 0644,
456 | + .proc_handler = proc_dointvec_minmax,
457 | + .extra1 = SYSCTL_ZERO,
458 | + .extra2 = SYSCTL_THREE,
459 | + },
460 | #endif
461 | #ifdef CONFIG_SCHEDSTATS
462 | {
463 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
464 | index e27be055ca86..071d09f66fa0 100644
465 | --- a/kernel/sched/sched.h
466 | +++ b/kernel/sched/sched.h
467 | @@ -88,6 +88,10 @@
468 | # define SCHED_WARN_ON(x) ({ (void)(x), 0; })
469 | #endif
470 | 
471 | +#ifdef CONFIG_ECHO_SCHED
472 | +#define IS_PWR_SAVE_ENABLED (echo_powersave == 1)
473 | +#endif
474 | +
475 | struct rq;
476 | struct cpuidle_state;
477 | 
478 | @@ -111,6 +115,7 @@ extern int sched_rr_timeslice;
479 | 
480 | #ifdef CONFIG_ECHO_SCHED
481 | extern unsigned int bs_shared_quota;
482 | +extern unsigned int echo_powersave;
483 | #endif
484 | 
485 | /*
486 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ECHO CPU Scheduler
2 | 
3 | **Enhanced CPU Handling Orchestrator**
4 | 
5 | ECHO is a CPU process scheduler patch for the Linux kernel.
6 | 
7 | This scheduler includes the following features:
8 | 
9 | - Handles heavy multitasking with a maximum quota of 35us.
10 | - All tasks on a CPU share a 35us quota, so every task runs for 35us / # of tasks (a small worked sketch follows this list).
11 | - The minimum slice for a running task is 7us, unless a woken-up task must run before the current task, in which case it preempts it.
12 | - Task estimation follows SRTF (Shortest Remaining Time First), using a moving average to calculate the virtual runtime.
13 | - The next task picked is the one with the smallest estimated virtual runtime.
14 | - The load balancer is as in the TT scheduler, with tiny changes: CPU0 is responsible for moving tasks among the other CPUs. Also, the candidate
15 | balancer is enabled by default.
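To make the quota arithmetic above concrete, here is a minimal, stand-alone C sketch. It is not taken from the patches; `SHARED_QUOTA_NS`, `BASE_SLICE_NS` and `task_slice_ns` are illustrative names, and clamping the share to the 7us minimum is an assumption based on the list above.

```c
#include <stdio.h>

/* Illustrative values taken from the README: 35us shared quota, 7us base slice. */
#define SHARED_QUOTA_NS	35000ULL
#define BASE_SLICE_NS	 7000ULL

/*
 * Hypothetical helper: split the shared quota among the queued tasks and
 * (assumption) never hand out less than the minimum slice.
 */
static unsigned long long task_slice_ns(unsigned int nr_queued)
{
	unsigned long long slice = SHARED_QUOTA_NS / (nr_queued ? nr_queued : 1);

	return slice < BASE_SLICE_NS ? BASE_SLICE_NS : slice;
}

int main(void)
{
	/* 3 tasks -> ~11.7us each; 10 tasks -> clamped to the 7us minimum. */
	printf("%llu %llu\n", task_slice_ns(3), task_slice_ns(10));
	return 0;
}
```

With 3 runnable tasks the per-task slice is about 11.7us; with 10 tasks the raw share (3.5us) would fall below the minimum, so it is held at 7us in this sketch.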
16 | 
17 | ## Comparison with other schedulers
18 | 
19 | https://github.com/hamadmarri/benchmarks
20 | 
21 | 
22 | 
23 | ## Policy
24 | The policy is a mix of SRTF and RR (Round Robin), where the virtual runtime calculation is
25 | ported from CFS (the burst is adjusted based on the priority of the task). Each round, the tasks run starting from
26 | the least estimated vruntime, and each task runs for `shared_quota / #tasks`, e.g. `35us / 3 = ~11.7us`.
27 | 
28 | If a woken-up task has a smaller estimated vruntime, it preempts the current task and runs. Every time a task consumes its
29 | quota, it is placed in a second queue, unless it is the only task running. After the round finishes, all tasks have been placed
30 | in the second queue. The scheduler then switches the queue head from q1 to q2: q2 becomes q1 and vice versa.
31 | 
32 | 
33 | ## Defaults and Sysctls
34 | - The default HZ for ECHO is 625Hz, i.e. a tick every 1.6ms. There is no need to increase it, since the high-resolution clock handles task preemption within 35us at most.
35 | - `kernel.sched_bs_shared_quota` defaults to 35000 (35us) and can be tuned with sysctl,
36 | e.g. `sysctl kernel.sched_bs_shared_quota=4800000`. Larger values favor CPU caches but reduce interactivity and multitasking.
37 | - The following kernel configuration options must be disabled:
38 | - CONFIG_FAIR_GROUP_SCHED
39 | - CONFIG_SCHED_AUTOGROUP
40 | - CONFIG_SCHED_CORE
41 | 
42 | 
43 | ## Telegram Group
44 | 
45 | https://t.me/tt_sched
46 | 
47 | Hamad
48 | 
--------------------------------------------------------------------------------
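As an addendum to the Policy section of the README above, the following is a minimal, stand-alone C sketch of the two-queue round it describes. It is not the patch code; `struct demo_task`, `q1`, `q2` and the helper names are illustrative only.

```c
#include <stddef.h>

/*
 * Stand-alone model of the two-queue round: tasks are picked from q1 in order
 * of estimated vruntime, parked on q2 once they consume their share of the
 * quota, and the queues swap when the round (q1) is empty.
 */
struct demo_task {
	struct demo_task *next;
	unsigned long long est_vruntime;
};

static struct demo_task *q1, *q2;

/* Pick the task with the smallest estimated vruntime in the current round. */
static struct demo_task *pick_next(void)
{
	struct demo_task *t, *best = NULL;

	for (t = q1; t; t = t->next)
		if (!best || t->est_vruntime < best->est_vruntime)
			best = t;
	return best;
}

/* A task has consumed its share of the quota: move it from q1 to q2. */
static void park_task(struct demo_task *t)
{
	struct demo_task **pp;

	for (pp = &q1; *pp; pp = &(*pp)->next) {
		if (*pp == t) {
			*pp = t->next;
			break;
		}
	}
	t->next = q2;
	q2 = t;
}

/* End of the round: q2 becomes q1 and a fresh round starts. */
static void switch_round(void)
{
	if (!q1) {
		q1 = q2;
		q2 = NULL;
	}
}
```

In the patches themselves, the equivalent per-CPU state is kept in the `head` and `q2_head` `struct bs_node` pointers that CONFIG_ECHO_SCHED adds to `struct cfs_rq`.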