├── 6.12.y
    └── linux-6.12-echo.patch
├── 6.7.y
    └── linux-6.7-echo.patch
├── 6.8.y
    ├── lat_sensitive.patch
    ├── linux-6.8-echo.patch
    └── powersave.patch
└── README.md

/6.8.y/lat_sensitive.patch:
--------------------------------------------------------------------------------
1 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c
2 | index 49df55bb0ba7..ff6e2d0a1107 100644
3 | --- a/kernel/sched/core.c
4 | +++ b/kernel/sched/core.c
5 | @@ -3347,6 +3347,24 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
6 | WARN_ON_ONCE(ret);
7 | }
8 |
9 | +#ifdef CONFIG_ECHO_SCHED
10 | +inline void inc_nr_lat_sensitive(unsigned int cpu, struct task_struct *p)
11 | +{
12 | + if (per_cpu(nr_lat_sensitive, cpu) == 0 || per_cpu(nr_lat_sensitive, cpu) == -10)
13 | + per_cpu(nr_lat_sensitive, cpu) = HZ / 78;
14 | +}
15 | +
16 | +inline void dec_nr_lat_sensitive(unsigned int cpu)
17 | +{
18 | + if (per_cpu(nr_lat_sensitive, cpu) > -10) {
19 | + per_cpu(nr_lat_sensitive, cpu)--;
20 | +
21 | + if (per_cpu(nr_lat_sensitive, cpu) == 0)
22 | + per_cpu(nr_lat_sensitive, cpu) = -1;
23 | + }
24 | +}
25 | +#endif
26 | +
27 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
28 | {
29 | #ifdef CONFIG_SCHED_DEBUG
30 | @@ -5704,6 +5722,13 @@ void scheduler_tick(void)
31 | if (curr->flags & PF_WQ_WORKER)
32 | wq_worker_tick(curr);
33 |
34 | +#ifdef CONFIG_ECHO_SCHED
35 | + if (idle_cpu(cpu))
36 | + inc_nr_lat_sensitive(cpu, NULL);
37 | + else
38 | + dec_nr_lat_sensitive(cpu);
39 | +#endif
40 | +
41 | #ifdef CONFIG_SMP
42 | rq->idle_balance = idle_cpu(cpu);
43 | trigger_load_balance(rq);
44 | @@ -9912,6 +9937,10 @@ LIST_HEAD(task_groups);
45 | static struct kmem_cache *task_group_cache __ro_after_init;
46 | #endif
47 |
48 | +#ifdef CONFIG_ECHO_SCHED
49 | +DEFINE_PER_CPU(int, nr_lat_sensitive);
50 | +#endif
51 | +
52 | void __init sched_init(void)
53 | {
54 | unsigned long ptr = 0;
55 | @@ -10050,6 +10079,10 @@ void __init sched_init(void)
56 | hrtick_rq_init(rq);
57 | atomic_set(&rq->nr_iowait, 0);
58 |
59 | +#ifdef CONFIG_ECHO_SCHED
60 | + per_cpu(nr_lat_sensitive, i) = 0;
61 | +#endif
62 | +
63 | #ifdef CONFIG_SCHED_CORE
64 | rq->core = rq;
65 | rq->core_pick = NULL;
66 | diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
67 | index 95e7f83b5ab8..dbfc307103e5 100644
68 | --- a/kernel/sched/idle.c
69 | +++ b/kernel/sched/idle.c
70 | @@ -237,7 +237,9 @@ static void cpuidle_idle_call(void)
71 | static void do_idle(void)
72 | {
73 | int cpu = smp_processor_id();
74 | -
75 | +#ifdef CONFIG_ECHO_SCHED
76 | + int pm_disabled = per_cpu(nr_lat_sensitive, cpu);
77 | +#endif
78 | /*
79 | * Check if we need to update blocked load
80 | */
81 | @@ -305,13 +307,22 @@ static void do_idle(void)
82 | * broadcast device expired for us, we don't want to go deep
83 | * idle as we know that the IPI is going to arrive right away.
84 | */ 85 | - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { 86 | + if ( 87 | +#ifdef CONFIG_ECHO_SCHED 88 | + pm_disabled > 0 || 89 | +#endif 90 | + cpu_idle_force_poll || tick_check_broadcast_expired()) { 91 | tick_nohz_idle_restart_tick(); 92 | cpu_idle_poll(); 93 | + dec_nr_lat_sensitive(cpu); 94 | } else { 95 | cpuidle_idle_call(); 96 | } 97 | 98 | +#ifdef CONFIG_ECHO_SCHED 99 | + if (pm_disabled < 0) 100 | + dec_nr_lat_sensitive(cpu); 101 | +#endif 102 | arch_cpu_idle_exit(); 103 | } 104 | 105 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 106 | index e27be055ca86..56b5c0114613 100644 107 | --- a/kernel/sched/sched.h 108 | +++ b/kernel/sched/sched.h 109 | @@ -1901,7 +1901,9 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); 110 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); 111 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); 112 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); 113 | - 114 | +#ifdef CONFIG_ECHO_SCHED 115 | +DECLARE_PER_CPU(int, nr_lat_sensitive); 116 | +#endif 117 | extern struct static_key_false sched_asym_cpucapacity; 118 | extern struct static_key_false sched_cluster_active; 119 | 120 | @@ -2559,6 +2561,11 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); 121 | #define SCHED_NR_MIGRATE_BREAK 32 122 | #endif 123 | 124 | +#ifdef CONFIG_ECHO_SCHED 125 | +extern inline void inc_nr_lat_sensitive(unsigned int cpu, struct task_struct *p); 126 | +extern inline void dec_nr_lat_sensitive(unsigned int cpu); 127 | +#endif 128 | + 129 | extern const_debug unsigned int sysctl_sched_nr_migrate; 130 | extern const_debug unsigned int sysctl_sched_migration_cost; 131 | 132 | -------------------------------------------------------------------------------- /6.8.y/linux-6.8-echo.patch: -------------------------------------------------------------------------------- 1 | diff --git a/include/linux/sched.h b/include/linux/sched.h 2 | index ffe8f618ab86..947d94be2437 100644 3 | --- a/include/linux/sched.h 4 | +++ b/include/linux/sched.h 5 | @@ -533,6 +533,19 @@ struct sched_statistics { 6 | #endif /* CONFIG_SCHEDSTATS */ 7 | } ____cacheline_aligned; 8 | 9 | +#ifdef CONFIG_ECHO_SCHED 10 | +struct bs_node { 11 | + struct bs_node* next; 12 | + u64 c_vrt_start; 13 | + u64 r_vrt_start; 14 | + u64 vburst; 15 | +#ifdef CONFIG_SCHED_DEBUG 16 | + u64 prev_vburst; 17 | +#endif 18 | + u64 est; 19 | +}; 20 | +#endif 21 | + 22 | struct sched_entity { 23 | /* For load-balancing: */ 24 | struct load_weight load; 25 | @@ -542,14 +555,18 @@ struct sched_entity { 26 | 27 | struct list_head group_node; 28 | unsigned int on_rq; 29 | - 30 | +#ifdef CONFIG_ECHO_SCHED 31 | + struct bs_node bs_node; 32 | +#endif 33 | u64 exec_start; 34 | u64 sum_exec_runtime; 35 | u64 prev_sum_exec_runtime; 36 | u64 vruntime; 37 | s64 vlag; 38 | u64 slice; 39 | - 40 | +#ifdef CONFIG_ECHO_SCHED 41 | + bool yielded; 42 | +#endif 43 | u64 nr_migrations; 44 | 45 | #ifdef CONFIG_FAIR_GROUP_SCHED 46 | diff --git a/init/Kconfig b/init/Kconfig 47 | index bee58f7468c3..933ec5c9a941 100644 48 | --- a/init/Kconfig 49 | +++ b/init/Kconfig 50 | @@ -130,6 +130,12 @@ config THREAD_INFO_IN_TASK 51 | One subtle change that will be needed is to use try_get_task_stack() 52 | and put_task_stack() in save_thread_stack_tsk() and get_wchan(). 
53 | 54 | +config ECHO_SCHED 55 | + bool "ECHO CPU Scheduler" 56 | + default y 57 | + help 58 | + https://github.com/hamadmarri/ECHO-CPU-Scheduler 59 | + 60 | menu "General setup" 61 | 62 | config BROKEN 63 | @@ -1008,11 +1014,12 @@ menuconfig CGROUP_SCHED 64 | if CGROUP_SCHED 65 | config FAIR_GROUP_SCHED 66 | bool "Group scheduling for SCHED_OTHER" 67 | - depends on CGROUP_SCHED 68 | - default CGROUP_SCHED 69 | + depends on CGROUP_SCHED && !ECHO_SCHED 70 | + default n 71 | 72 | config CFS_BANDWIDTH 73 | bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" 74 | + depends on !ECHO_SCHED 75 | depends on FAIR_GROUP_SCHED 76 | default n 77 | help 78 | @@ -1281,9 +1288,11 @@ config CHECKPOINT_RESTORE 79 | 80 | config SCHED_AUTOGROUP 81 | bool "Automatic process group scheduling" 82 | + depends on !ECHO_SCHED 83 | select CGROUPS 84 | select CGROUP_SCHED 85 | select FAIR_GROUP_SCHED 86 | + default n 87 | help 88 | This option optimizes the scheduler for common desktop workloads by 89 | automatically creating and populating task groups. This separation 90 | diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz 91 | index 38ef6d06888e..80861d9044e3 100644 92 | --- a/kernel/Kconfig.hz 93 | +++ b/kernel/Kconfig.hz 94 | @@ -5,7 +5,7 @@ 95 | 96 | choice 97 | prompt "Timer frequency" 98 | - default HZ_250 99 | + default HZ_625 100 | help 101 | Allows the configuration of the timer frequency. It is customary 102 | to have the timer interrupt run at 1000 Hz but 100 Hz may be more 103 | @@ -40,6 +40,13 @@ choice 104 | on SMP and NUMA systems and exactly dividing by both PAL and 105 | NTSC frame rates for video and multimedia work. 106 | 107 | + config HZ_625 108 | + bool "625 HZ" 109 | + help 110 | + The default HZ for ECHO is 625HZ - ticks every 1.6ms. 111 | + No need to increase it since the HighRes clock handles 112 | + the task preemption in 105us max. 113 | + 114 | config HZ_1000 115 | bool "1000 HZ" 116 | help 117 | @@ -53,6 +60,7 @@ config HZ 118 | default 100 if HZ_100 119 | default 250 if HZ_250 120 | default 300 if HZ_300 121 | + default 625 if HZ_625 122 | default 1000 if HZ_1000 123 | 124 | config SCHED_HRTICK 125 | diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt 126 | index c2f1fd95a821..d54bb52ccccc 100644 127 | --- a/kernel/Kconfig.preempt 128 | +++ b/kernel/Kconfig.preempt 129 | @@ -117,7 +117,8 @@ config PREEMPT_DYNAMIC 130 | 131 | config SCHED_CORE 132 | bool "Core Scheduling for SMT" 133 | - depends on SCHED_SMT 134 | + depends on SCHED_SMT && !ECHO_SCHED 135 | + default n 136 | help 137 | This option permits Core Scheduling, a means of coordinated task 138 | selection across SMT siblings. 
When enabled -- see 139 | diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile 140 | index 976092b7bd45..f78ee1bffe16 100644 141 | --- a/kernel/sched/Makefile 142 | +++ b/kernel/sched/Makefile 143 | @@ -29,6 +29,10 @@ endif 144 | # build parallelizes well and finishes roughly at once: 145 | # 146 | obj-y += core.o 147 | +ifeq ($(CONFIG_ECHO_SCHED),y) 148 | +obj-y += bs.o 149 | +else 150 | obj-y += fair.o 151 | +endif 152 | obj-y += build_policy.o 153 | obj-y += build_utility.o 154 | diff --git a/kernel/sched/balancer.h b/kernel/sched/balancer.h 155 | new file mode 100644 156 | index 000000000000..852faad1fc1d 157 | --- /dev/null 158 | +++ b/kernel/sched/balancer.h 159 | @@ -0,0 +1,881 @@ 160 | +#ifdef CONFIG_SMP 161 | +static int 162 | +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 163 | +{ 164 | + if (rq->nr_running) 165 | + return 1; 166 | + 167 | + return newidle_balance(rq, rf) != 0; 168 | +} 169 | + 170 | +static int 171 | +wake_affine_idle(int this_cpu, int prev_cpu, int sync) 172 | +{ 173 | + /* 174 | + * If this_cpu is idle, it implies the wakeup is from interrupt 175 | + * context. Only allow the move if cache is shared. Otherwise an 176 | + * interrupt intensive workload could force all tasks onto one 177 | + * node depending on the IO topology or IRQ affinity settings. 178 | + * 179 | + * If the prev_cpu is idle and cache affine then avoid a migration. 180 | + * There is no guarantee that the cache hot data from an interrupt 181 | + * is more important than cache hot data on the prev_cpu and from 182 | + * a cpufreq perspective, it's better to have higher utilisation 183 | + * on one CPU. 184 | + */ 185 | + if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) 186 | + return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; 187 | + 188 | + if (sync && cpu_rq(this_cpu)->nr_running == 1) 189 | + return this_cpu; 190 | + 191 | + if (available_idle_cpu(prev_cpu)) 192 | + return prev_cpu; 193 | + 194 | + return nr_cpumask_bits; 195 | +} 196 | + 197 | +static int 198 | +wake_affine(struct task_struct *p, int this_cpu, int prev_cpu, int sync) 199 | +{ 200 | + int target = nr_cpumask_bits; 201 | + 202 | + target = wake_affine_idle(this_cpu, prev_cpu, sync); 203 | + 204 | + if (target == nr_cpumask_bits) 205 | + return prev_cpu; 206 | + 207 | + return target; 208 | +} 209 | + 210 | +static int wake_wide(struct task_struct *p) 211 | +{ 212 | + unsigned int master = current->wakee_flips; 213 | + unsigned int slave = p->wakee_flips; 214 | + int factor = __this_cpu_read(sd_llc_size); 215 | + 216 | + if (master < slave) 217 | + swap(master, slave); 218 | + if (slave < factor || master < slave * factor) 219 | + return 0; 220 | + return 1; 221 | +} 222 | + 223 | +static void record_wakee(struct task_struct *p) 224 | +{ 225 | + /* 226 | + * Only decay a single time; tasks that have less then 1 wakeup per 227 | + * jiffy will not have built up many flips. 
228 | + */ 229 | + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { 230 | + current->wakee_flips >>= 1; 231 | + current->wakee_flip_decay_ts = jiffies; 232 | + } 233 | + 234 | + if (current->last_wakee != p) { 235 | + current->last_wakee = p; 236 | + current->wakee_flips++; 237 | + } 238 | +} 239 | + 240 | +static int 241 | +select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) 242 | +{ 243 | + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); 244 | + int cpu = smp_processor_id(); 245 | + int new_cpu = prev_cpu; 246 | + int want_affine = 0; 247 | + struct rq *rq = cpu_rq(prev_cpu); 248 | + unsigned int min_prev = rq->nr_running; 249 | + unsigned int min = rq->nr_running; 250 | + int this_cpu = smp_processor_id(); 251 | + 252 | + if (wake_flags & WF_TTWU) { 253 | + record_wakee(p); 254 | + 255 | + if ((wake_flags & WF_CURRENT_CPU) && 256 | + cpumask_test_cpu(cpu, p->cpus_ptr)) 257 | + return cpu; 258 | + 259 | + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); 260 | + } 261 | + 262 | + for_each_cpu_wrap(cpu, cpu_online_mask, this_cpu) { 263 | + if (unlikely(!cpumask_test_cpu(cpu, p->cpus_ptr))) 264 | + continue; 265 | + 266 | + if (want_affine) { 267 | + if (cpu != prev_cpu) 268 | + new_cpu = wake_affine(p, cpu, prev_cpu, sync); 269 | + 270 | + return new_cpu; 271 | + } 272 | + 273 | + if (cpu_rq(cpu)->nr_running < min) { 274 | + new_cpu = cpu; 275 | + min = cpu_rq(cpu)->nr_running; 276 | + } 277 | + } 278 | + 279 | + if (min == min_prev) 280 | + return prev_cpu; 281 | + 282 | + return new_cpu; 283 | +} 284 | + 285 | +#ifdef CONFIG_NO_HZ_COMMON 286 | +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) 287 | +{ 288 | + if (cfs_rq->avg.load_avg) 289 | + return true; 290 | + 291 | + if (cfs_rq->avg.util_avg) 292 | + return true; 293 | + 294 | + return false; 295 | +} 296 | + 297 | +static inline bool others_have_blocked(struct rq *rq) 298 | +{ 299 | + if (READ_ONCE(rq->avg_rt.util_avg)) 300 | + return true; 301 | + 302 | + if (READ_ONCE(rq->avg_dl.util_avg)) 303 | + return true; 304 | + 305 | + if (thermal_load_avg(rq)) 306 | + return true; 307 | + 308 | +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ 309 | + if (READ_ONCE(rq->avg_irq.util_avg)) 310 | + return true; 311 | +#endif 312 | + 313 | + return false; 314 | +} 315 | + 316 | +static inline void update_blocked_load_tick(struct rq *rq) 317 | +{ 318 | + WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); 319 | +} 320 | + 321 | +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) 322 | +{ 323 | + if (!has_blocked) 324 | + rq->has_blocked_load = 0; 325 | +} 326 | +#else 327 | +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } 328 | +static inline bool others_have_blocked(struct rq *rq) { return false; } 329 | +static inline void update_blocked_load_tick(struct rq *rq) {} 330 | +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} 331 | +#endif 332 | + 333 | +static inline int 334 | +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 335 | +{ 336 | + unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; 337 | + struct sched_avg *sa = &cfs_rq->avg; 338 | + int decayed = 0; 339 | + 340 | + if (cfs_rq->removed.nr) { 341 | + unsigned long r; 342 | + u32 divider = get_pelt_divider(&cfs_rq->avg); 343 | + 344 | + raw_spin_lock(&cfs_rq->removed.lock); 345 | + swap(cfs_rq->removed.util_avg, removed_util); 346 | + swap(cfs_rq->removed.load_avg, removed_load); 347 | + 
swap(cfs_rq->removed.runnable_avg, removed_runnable); 348 | + cfs_rq->removed.nr = 0; 349 | + raw_spin_unlock(&cfs_rq->removed.lock); 350 | + 351 | + r = removed_load; 352 | + sub_positive(&sa->load_avg, r); 353 | + sub_positive(&sa->load_sum, r * divider); 354 | + /* See sa->util_sum below */ 355 | + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); 356 | + 357 | + r = removed_util; 358 | + sub_positive(&sa->util_avg, r); 359 | + sub_positive(&sa->util_sum, r * divider); 360 | + /* 361 | + * Because of rounding, se->util_sum might ends up being +1 more than 362 | + * cfs->util_sum. Although this is not a problem by itself, detaching 363 | + * a lot of tasks with the rounding problem between 2 updates of 364 | + * util_avg (~1ms) can make cfs->util_sum becoming null whereas 365 | + * cfs_util_avg is not. 366 | + * Check that util_sum is still above its lower bound for the new 367 | + * util_avg. Given that period_contrib might have moved since the last 368 | + * sync, we are only sure that util_sum must be above or equal to 369 | + * util_avg * minimum possible divider 370 | + */ 371 | + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); 372 | + 373 | + r = removed_runnable; 374 | + sub_positive(&sa->runnable_avg, r); 375 | + sub_positive(&sa->runnable_sum, r * divider); 376 | + /* See sa->util_sum above */ 377 | + sa->runnable_sum = max_t(u32, sa->runnable_sum, 378 | + sa->runnable_avg * PELT_MIN_DIVIDER); 379 | + 380 | + decayed = 1; 381 | + } 382 | + 383 | + decayed |= __update_load_avg_cfs_rq(now, cfs_rq); 384 | + u64_u32_store_copy(sa->last_update_time, 385 | + cfs_rq->last_update_time_copy, 386 | + sa->last_update_time); 387 | + return decayed; 388 | +} 389 | + 390 | +static bool __update_blocked_fair(struct rq *rq, bool *done) 391 | +{ 392 | + struct cfs_rq *cfs_rq = &rq->cfs; 393 | + bool decayed; 394 | + 395 | + decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); 396 | + if (cfs_rq_has_blocked(cfs_rq)) 397 | + *done = false; 398 | + 399 | + return decayed; 400 | +} 401 | + 402 | +static bool __update_blocked_others(struct rq *rq, bool *done) 403 | +{ 404 | + const struct sched_class *curr_class; 405 | + u64 now = rq_clock_pelt(rq); 406 | + unsigned long thermal_pressure; 407 | + bool decayed; 408 | + 409 | + /* 410 | + * update_load_avg() can call cpufreq_update_util(). Make sure that RT, 411 | + * DL and IRQ signals have been updated before updating CFS. 
412 | + */ 413 | + curr_class = rq->curr->sched_class; 414 | + 415 | + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); 416 | + 417 | + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | 418 | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | 419 | + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | 420 | + update_irq_load_avg(rq, 0); 421 | + 422 | + if (others_have_blocked(rq)) 423 | + *done = false; 424 | + 425 | + return decayed; 426 | +} 427 | + 428 | +static void update_blocked_averages(int cpu) 429 | +{ 430 | + bool decayed = false, done = true; 431 | + struct rq *rq = cpu_rq(cpu); 432 | + struct rq_flags rf; 433 | + 434 | + rq_lock_irqsave(rq, &rf); 435 | + update_blocked_load_tick(rq); 436 | + update_rq_clock(rq); 437 | + 438 | + decayed |= __update_blocked_others(rq, &done); 439 | + decayed |= __update_blocked_fair(rq, &done); 440 | + 441 | + update_blocked_load_status(rq, !done); 442 | + if (decayed) 443 | + cpufreq_update_util(rq, 0); 444 | + rq_unlock_irqrestore(rq, &rf); 445 | +} 446 | + 447 | +static void pull_from(struct task_struct *p, struct lb_env *env) 448 | +{ 449 | + struct rq_flags rf; 450 | + 451 | + // detach task 452 | + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); 453 | + set_task_cpu(p, env->dst_cpu); 454 | + 455 | + // unlock src rq 456 | + rq_unlock(env->src_rq, env->src_rf); 457 | + 458 | + // lock this rq 459 | + rq_lock(env->dst_rq, &rf); 460 | + update_rq_clock(env->dst_rq); 461 | + 462 | + activate_task(env->dst_rq, p, ENQUEUE_NOCLOCK); 463 | + wakeup_preempt(env->dst_rq, p, 0); 464 | + 465 | + // unlock this rq 466 | + rq_unlock(env->dst_rq, &rf); 467 | + 468 | + local_irq_restore(env->src_rf->flags); 469 | +} 470 | + 471 | +#ifdef CONFIG_NUMA_BALANCING 472 | +/* Runqueue only has SCHED_IDLE tasks enqueued */ 473 | +static int sched_idle_rq(struct rq *rq) 474 | +{ 475 | + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && 476 | + rq->nr_running); 477 | +} 478 | + 479 | +#ifdef CONFIG_SMP 480 | +static int sched_idle_cpu(int cpu) 481 | +{ 482 | + return sched_idle_rq(cpu_rq(cpu)); 483 | +} 484 | +#endif 485 | + 486 | +/* 487 | + * Returns 1, if task migration degrades locality 488 | + * Returns 0, if task migration improves locality i.e migration preferred. 489 | + * Returns -1, if task migration is not affected by locality. 490 | + */ 491 | +static int migrate_degrades_locality(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 492 | +{ 493 | + struct numa_group *numa_group = rcu_dereference(p->numa_group); 494 | + unsigned long src_weight, dst_weight; 495 | + int src_nid, dst_nid, dist; 496 | + 497 | + if (!static_branch_likely(&sched_numa_balancing)) 498 | + return -1; 499 | + 500 | + if (!p->numa_faults) 501 | + return -1; 502 | + 503 | + src_nid = cpu_to_node(cpu_of(src_rq)); 504 | + dst_nid = cpu_to_node(cpu_of(dst_rq)); 505 | + 506 | + if (src_nid == dst_nid) 507 | + return -1; 508 | + 509 | + /* Migrating away from the preferred node is always bad. */ 510 | + if (src_nid == p->numa_preferred_nid) { 511 | + if (src_rq->nr_running > src_rq->nr_preferred_running) 512 | + return 1; 513 | + else 514 | + return -1; 515 | + } 516 | + 517 | + /* Encourage migration to the preferred node. */ 518 | + if (dst_nid == p->numa_preferred_nid) 519 | + return 0; 520 | + 521 | + /* Leaving a core idle is often worse than degrading locality. 
*/ 522 | + if (sched_idle_cpu(cpu_of(dst_rq))) 523 | + return -1; 524 | + 525 | + dist = node_distance(src_nid, dst_nid); 526 | + if (numa_group) { 527 | + src_weight = group_weight(p, src_nid, dist); 528 | + dst_weight = group_weight(p, dst_nid, dist); 529 | + } else { 530 | + src_weight = task_weight(p, src_nid, dist); 531 | + dst_weight = task_weight(p, dst_nid, dist); 532 | + } 533 | + 534 | + return dst_weight < src_weight; 535 | +} 536 | + 537 | +#else 538 | +static inline int migrate_degrades_locality(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 539 | +{ 540 | + return -1; 541 | +} 542 | +#endif 543 | + 544 | +#define MIN_HOTNESS 0x7FFFFFFFFFFFFFFLL 545 | + 546 | +static s64 task_hotness(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 547 | +{ 548 | + s64 delta; 549 | + 550 | + lockdep_assert_rq_held(src_rq); 551 | + 552 | + if (unlikely(task_has_idle_policy(p))) 553 | + return 0; 554 | + 555 | + /* SMT siblings share cache */ 556 | + if (cpus_share_cache(cpu_of(dst_rq), cpu_of(src_rq))) 557 | + return MIN_HOTNESS; 558 | + 559 | + if (sysctl_sched_migration_cost == -1) 560 | + return 0; 561 | + 562 | + if (sysctl_sched_migration_cost == 0) 563 | + return MIN_HOTNESS; 564 | + 565 | + delta = rq_clock_task(src_rq) - p->se.exec_start; 566 | + 567 | + return delta; 568 | +} 569 | + 570 | +static s64 hotness_of(struct task_struct *p, struct lb_env *env) 571 | +{ 572 | + int tsk_cache_hot; 573 | + 574 | + tsk_cache_hot = migrate_degrades_locality(p, env->dst_rq, env->src_rq); 575 | + 576 | + // 0, if task migration improves locality i.e migration preferred. 577 | + if (tsk_cache_hot == 0) 578 | + return MIN_HOTNESS; 579 | + 580 | + // 1, if task migration degrades locality 581 | + if (tsk_cache_hot == 1) 582 | + return 0; 583 | + 584 | + // -1, if task migration is not affected by locality. 585 | + return task_hotness(p, env->dst_rq, env->src_rq); 586 | +} 587 | + 588 | +static int 589 | +can_migrate_task(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 590 | +{ 591 | + /* Disregard pcpu kthreads; they are where they need to be. */ 592 | + if (kthread_is_per_cpu(p)) 593 | + return 0; 594 | + 595 | + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) 596 | + return 0; 597 | + 598 | + if (task_on_cpu(src_rq, p)) 599 | + return 0; 600 | + 601 | + return 1; 602 | +} 603 | + 604 | +static int move_task(struct rq *dst_rq, struct rq *src_rq, 605 | + struct rq_flags *src_rf) 606 | +{ 607 | + struct cfs_rq *src_cfs_rq = &src_rq->cfs; 608 | + struct task_struct *p = NULL, *tsk_itr; 609 | + struct bs_node *bsn = src_cfs_rq->head; 610 | + s64 tsk_coldest = 0, tsk_hotness; 611 | + 612 | + struct lb_env env = { 613 | + .dst_cpu = cpu_of(dst_rq), 614 | + .dst_rq = dst_rq, 615 | + .src_cpu = cpu_of(src_rq), 616 | + .src_rq = src_rq, 617 | + .src_rf = src_rf, 618 | + .idle = dst_rq->idle_balance ? 
CPU_IDLE : CPU_NOT_IDLE, 619 | + }; 620 | + 621 | + while (bsn) { 622 | + tsk_itr = task_of(se_of(bsn)); 623 | + 624 | + if (!can_migrate_task(tsk_itr, dst_rq, src_rq)) { 625 | + bsn = bsn->next; 626 | + continue; 627 | + } 628 | + 629 | + tsk_hotness = hotness_of(tsk_itr, &env); 630 | + 631 | + if (!p) { 632 | + tsk_coldest = tsk_hotness; 633 | + p = tsk_itr; 634 | + } else if (tsk_hotness > tsk_coldest) { 635 | + // greater value means it is colder 636 | + 637 | + tsk_coldest = tsk_hotness; 638 | + p = tsk_itr; 639 | + } 640 | + 641 | + bsn = bsn->next; 642 | + } 643 | + 644 | + if (p) { 645 | + pull_from(p, &env); 646 | + return 1; 647 | + } else { 648 | + rq_unlock(src_rq, src_rf); 649 | + local_irq_restore(src_rf->flags); 650 | + } 651 | + 652 | + return 0; 653 | +} 654 | + 655 | +static int idle_pull_global_candidate(struct rq *dist_rq) 656 | +{ 657 | + struct rq *src_rq; 658 | + struct task_struct *p; 659 | + struct rq_flags rf, src_rf; 660 | + struct bs_node *cand = READ_ONCE(global_candidate.candidate); 661 | + 662 | + if (!cand) 663 | + return 0; 664 | + 665 | + src_rq = READ_ONCE(global_candidate.rq); 666 | + if (!src_rq || src_rq == dist_rq) 667 | + return 0; 668 | + 669 | + rq_lock_irqsave(src_rq, &src_rf); 670 | + update_rq_clock(src_rq); 671 | + raw_spin_lock(&global_candidate.lock); 672 | + cand = global_candidate.candidate; 673 | + if (!cand) 674 | + goto fail_unlock; 675 | + 676 | + p = task_of(se_of(cand)); 677 | + if (task_rq(p) != src_rq || 678 | + !can_migrate_task(p, dist_rq, src_rq)) 679 | + goto fail_unlock; 680 | + 681 | + global_candidate.rq = NULL; 682 | + global_candidate.candidate = NULL; 683 | + global_candidate.est = MAX_EST; 684 | + raw_spin_unlock(&global_candidate.lock); 685 | + 686 | + // detach task 687 | + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); 688 | + set_task_cpu(p, cpu_of(dist_rq)); 689 | + // unlock src rq 690 | + rq_unlock(src_rq, &src_rf); 691 | + 692 | + // lock dist rq 693 | + rq_lock(dist_rq, &rf); 694 | + update_rq_clock(dist_rq); 695 | + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); 696 | + wakeup_preempt(dist_rq, p, 0); 697 | + // unlock dist rq 698 | + rq_unlock(dist_rq, &rf); 699 | + 700 | + local_irq_restore(src_rf.flags); 701 | + 702 | + // printk(KERN_INFO "idle_pull_global_candidate"); 703 | + 704 | + return 1; 705 | + 706 | +fail_unlock: 707 | + raw_spin_unlock(&global_candidate.lock); 708 | + rq_unlock(src_rq, &src_rf); 709 | + local_irq_restore(src_rf.flags); 710 | + return 0; 711 | +} 712 | + 713 | +static void idle_balance(struct rq *this_rq) 714 | +{ 715 | + int this_cpu = this_rq->cpu; 716 | + struct rq *src_rq; 717 | + int src_cpu = -1, cpu; 718 | + unsigned int max = 0; 719 | + struct rq_flags src_rf; 720 | + 721 | + if (idle_pull_global_candidate(this_rq)) 722 | + return; 723 | + 724 | + for_each_online_cpu(cpu) { 725 | + /* 726 | + * Stop searching for tasks to pull if there are 727 | + * now runnable tasks on this rq. 
728 | + */ 729 | + if (this_rq->nr_running > 0) 730 | + return; 731 | + 732 | + if (cpu == this_cpu) 733 | + continue; 734 | + 735 | + src_rq = cpu_rq(cpu); 736 | + 737 | + if (src_rq->nr_running <= 1) 738 | + continue; 739 | + 740 | + if (src_rq->nr_running > max) { 741 | + max = src_rq->nr_running; 742 | + src_cpu = cpu; 743 | + } 744 | + } 745 | + 746 | + if (src_cpu == -1) 747 | + return; 748 | + 749 | + src_rq = cpu_rq(src_cpu); 750 | + 751 | + rq_lock_irqsave(src_rq, &src_rf); 752 | + update_rq_clock(src_rq); 753 | + 754 | + if (src_rq->nr_running < 2) { 755 | + rq_unlock(src_rq, &src_rf); 756 | + local_irq_restore(src_rf.flags); 757 | + } else { 758 | + move_task(this_rq, src_rq, &src_rf); 759 | + } 760 | +} 761 | + 762 | +static void active_pull_global_candidate(struct rq *dist_rq) 763 | +{ 764 | + struct cfs_rq *cfs_rq = &dist_rq->cfs; 765 | + u64 cand_est = READ_ONCE(global_candidate.est); 766 | + u64 local_est = READ_ONCE(cfs_rq->local_cand_est); 767 | + struct rq *src_rq; 768 | + struct task_struct *p; 769 | + struct rq_flags rf, src_rf; 770 | + struct bs_node *cand; 771 | + 772 | + cand = READ_ONCE(global_candidate.candidate); 773 | + 774 | + if (!cand) 775 | + return; 776 | + 777 | + if ((s64)(local_est - cand_est) <= 0) 778 | + return; 779 | + 780 | + src_rq = READ_ONCE(global_candidate.rq); 781 | + if (!src_rq || src_rq == dist_rq) 782 | + return; 783 | + 784 | + rq_lock_irqsave(src_rq, &src_rf); 785 | + update_rq_clock(src_rq); 786 | + raw_spin_lock(&global_candidate.lock); 787 | + cand = global_candidate.candidate; 788 | + cand_est = global_candidate.est; 789 | + 790 | + if (!cand) 791 | + goto fail_unlock; 792 | + 793 | + p = task_of(se_of(cand)); 794 | + if (task_rq(p) != src_rq || 795 | + !can_migrate_task(p, dist_rq, src_rq)) 796 | + goto fail_unlock; 797 | + 798 | + if ((s64)(local_est - cand_est) <= 0) 799 | + goto fail_unlock; 800 | + 801 | + global_candidate.rq = NULL; 802 | + global_candidate.candidate = NULL; 803 | + global_candidate.est = MAX_EST; 804 | + raw_spin_unlock(&global_candidate.lock); 805 | + 806 | + // detach task 807 | + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); 808 | + set_task_cpu(p, cpu_of(dist_rq)); 809 | + // unlock src rq 810 | + rq_unlock(src_rq, &src_rf); 811 | + 812 | + // lock dist rq 813 | + rq_lock(dist_rq, &rf); 814 | + update_rq_clock(dist_rq); 815 | + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); 816 | + wakeup_preempt(dist_rq, p, 0); 817 | + // unlock dist rq 818 | + rq_unlock(dist_rq, &rf); 819 | + 820 | + local_irq_restore(src_rf.flags); 821 | + 822 | + // printk(KERN_INFO "active_pull_global_candidate"); 823 | + return; 824 | + 825 | +fail_unlock: 826 | + raw_spin_unlock(&global_candidate.lock); 827 | + rq_unlock(src_rq, &src_rf); 828 | + local_irq_restore(src_rf.flags); 829 | +} 830 | + 831 | +static void nohz_try_pull_from_candidate(void) 832 | +{ 833 | + int cpu; 834 | + struct rq *rq; 835 | + struct cfs_rq *cfs_rq; 836 | +#ifdef CONFIG_NO_HZ_FULL 837 | + struct rq_flags rf; 838 | +#endif 839 | + 840 | + /* first, push to grq*/ 841 | + for_each_online_cpu(cpu) { 842 | + rq = cpu_rq(cpu); 843 | +#ifdef CONFIG_NO_HZ_FULL 844 | + cfs_rq = &rq->cfs; 845 | + 846 | + if (idle_cpu(cpu) || cfs_rq->nr_running > 1) 847 | + goto out; 848 | + 849 | + rq_lock_irqsave(rq, &rf); 850 | + update_rq_clock(rq); 851 | + update_curr(cfs_rq); 852 | + rq_unlock_irqrestore(rq, &rf); 853 | +out: 854 | +#endif 855 | + if (idle_cpu(cpu) || !sched_fair_runnable(rq)) 856 | + idle_pull_global_candidate(rq); 857 | + else 858 | + 
active_pull_global_candidate(rq); 859 | + } 860 | +} 861 | + 862 | +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) 863 | +{ 864 | + int this_cpu = this_rq->cpu; 865 | + struct rq *src_rq; 866 | + int src_cpu = -1, cpu; 867 | + int pulled_task = 0; 868 | + unsigned int max = 0; 869 | + struct rq_flags src_rf; 870 | + 871 | + update_misfit_status(NULL, this_rq); 872 | + 873 | + /* 874 | + * There is a task waiting to run. No need to search for one. 875 | + * Return 0; the task will be enqueued when switching to idle. 876 | + */ 877 | + if (this_rq->ttwu_pending) 878 | + return 0; 879 | + 880 | + /* 881 | + * We must set idle_stamp _before_ calling idle_balance(), such that we 882 | + * measure the duration of idle_balance() as idle time. 883 | + */ 884 | + this_rq->idle_stamp = rq_clock(this_rq); 885 | + 886 | + /* 887 | + * Do not pull tasks towards !active CPUs... 888 | + */ 889 | + if (!cpu_active(this_cpu)) 890 | + return 0; 891 | + 892 | + rq_unpin_lock(this_rq, rf); 893 | + raw_spin_unlock(&this_rq->__lock); 894 | + 895 | + update_blocked_averages(this_cpu); 896 | + 897 | + pulled_task = idle_pull_global_candidate(this_rq); 898 | + if (pulled_task) 899 | + goto out; 900 | + 901 | + for_each_online_cpu(cpu) { 902 | + /* 903 | + * Stop searching for tasks to pull if there are 904 | + * now runnable tasks on this rq. 905 | + */ 906 | + if (this_rq->nr_running > 0) 907 | + goto out; 908 | + 909 | + if (cpu == this_cpu) 910 | + continue; 911 | + 912 | + src_rq = cpu_rq(cpu); 913 | + 914 | + if (src_rq->nr_running <= 1) 915 | + continue; 916 | + 917 | + if (src_rq->nr_running > max) { 918 | + max = src_rq->nr_running; 919 | + src_cpu = cpu; 920 | + } 921 | + } 922 | + 923 | + if (src_cpu != -1) { 924 | + src_rq = cpu_rq(src_cpu); 925 | + 926 | + rq_lock_irqsave(src_rq, &src_rf); 927 | + update_rq_clock(src_rq); 928 | + 929 | + if (src_rq->nr_running <= 1) { 930 | + rq_unlock(src_rq, &src_rf); 931 | + local_irq_restore(src_rf.flags); 932 | + } else { 933 | + pulled_task = move_task(this_rq, src_rq, &src_rf); 934 | + } 935 | + } 936 | + 937 | +out: 938 | + raw_spin_lock(&this_rq->__lock); 939 | + 940 | + /* 941 | + * While browsing the domains, we released the rq lock, a task could 942 | + * have been enqueued in the meantime. Since we're not going idle, 943 | + * pretend we pulled a task. 944 | + */ 945 | + if (this_rq->cfs.h_nr_running && !pulled_task) 946 | + pulled_task = 1; 947 | + 948 | + /* Is there a task of a high priority class? 
*/ 949 | + if (this_rq->nr_running != this_rq->cfs.h_nr_running) 950 | + pulled_task = -1; 951 | + 952 | + if (pulled_task) 953 | + this_rq->idle_stamp = 0; 954 | + 955 | + rq_repin_lock(this_rq, rf); 956 | + 957 | + return pulled_task; 958 | +} 959 | + 960 | +static inline int on_null_domain(struct rq *rq) 961 | +{ 962 | + return unlikely(!rcu_dereference_sched(rq->sd)); 963 | +} 964 | + 965 | +static void rebalance(struct rq *this_rq) 966 | +{ 967 | + int cpu; 968 | + unsigned int max, min; 969 | + struct rq *max_rq, *min_rq, *c_rq; 970 | + struct rq_flags src_rf; 971 | + 972 | + update_blocked_averages(this_rq->cpu); 973 | + 974 | +again: 975 | + max = min = this_rq->nr_running; 976 | + max_rq = min_rq = this_rq; 977 | + 978 | + for_each_online_cpu(cpu) { 979 | + c_rq = cpu_rq(cpu); 980 | + 981 | + /* 982 | + * Don't need to rebalance while attached to NULL domain or 983 | + * runqueue CPU is not active 984 | + */ 985 | + if (unlikely(on_null_domain(c_rq) || !cpu_active(cpu))) 986 | + continue; 987 | + 988 | + if (c_rq->nr_running < min) { 989 | + min = c_rq->nr_running; 990 | + min_rq = c_rq; 991 | + } 992 | + 993 | + if (c_rq->nr_running > max) { 994 | + max = c_rq->nr_running; 995 | + max_rq = c_rq; 996 | + } 997 | + } 998 | + 999 | + if (min_rq == max_rq || max - min <= 1) 1000 | + return; 1001 | + 1002 | + rq_lock_irqsave(max_rq, &src_rf); 1003 | + update_rq_clock(max_rq); 1004 | + 1005 | + if (max_rq->nr_running <= 1) { 1006 | + rq_unlock(max_rq, &src_rf); 1007 | + local_irq_restore(src_rf.flags); 1008 | + return; 1009 | + } 1010 | + 1011 | + if(move_task(min_rq, max_rq, &src_rf)) 1012 | + goto again; 1013 | +} 1014 | + 1015 | +static void nohz_balancer_kick(struct rq *rq); 1016 | + 1017 | +void trigger_load_balance(struct rq *this_rq) 1018 | +{ 1019 | + int this_cpu = cpu_of(this_rq); 1020 | + 1021 | + if (this_cpu != 0) 1022 | + goto out; 1023 | + 1024 | + nohz_try_pull_from_candidate(); 1025 | + 1026 | + rebalance(this_rq); 1027 | + 1028 | +out: 1029 | + if (time_after_eq(jiffies, this_rq->next_balance)) { 1030 | + this_rq->next_balance = jiffies + msecs_to_jiffies(19); 1031 | + update_blocked_averages(this_rq->cpu); 1032 | + } 1033 | + 1034 | + nohz_balancer_kick(this_rq); 1035 | +} 1036 | + 1037 | +#include "nohz.h" 1038 | + 1039 | +void update_group_capacity(struct sched_domain *sd, int cpu) {} 1040 | +#endif /* CONFIG_SMP */ 1041 | diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c 1042 | new file mode 100644 1043 | index 000000000000..6b3d51b4366c 1044 | --- /dev/null 1045 | +++ b/kernel/sched/bs.c 1046 | @@ -0,0 +1,888 @@ 1047 | +// SPDX-License-Identifier: GPL-2.0 1048 | +/* 1049 | + * Baby Scheduler (BS) Class (SCHED_NORMAL/SCHED_BATCH) 1050 | + * 1051 | + * Copyright (C) 2021, Hamad Al Marri 1052 | + */ 1053 | +#include 1054 | +#include 1055 | +#include 1056 | +#include 1057 | +#include 1058 | +#include 1059 | + 1060 | +#include "sched.h" 1061 | +#include "pelt.h" 1062 | + 1063 | +unsigned int sysctl_sched_base_slice = 4200ULL; 1064 | +unsigned int bs_shared_quota = 35000ULL; // 35us 1065 | +u32 alpha = 500U; 1066 | + 1067 | +struct lb_env { 1068 | + struct rq *src_rq; 1069 | + int src_cpu; 1070 | + 1071 | + int dst_cpu; 1072 | + struct rq *dst_rq; 1073 | + 1074 | + enum cpu_idle_type idle; 1075 | + 1076 | + struct rq_flags *src_rf; 1077 | + unsigned int flags; 1078 | +}; 1079 | + 1080 | +struct global_candidate { 1081 | + struct rq *rq; 1082 | + struct bs_node *candidate; 1083 | + u64 est; 1084 | + 1085 | + // for update 1086 | + raw_spinlock_t lock; 1087 | +}; 1088 | 
+ 1089 | +#define MAX_EST 0xFFFFFFFFFFFFFFFULL 1090 | + 1091 | +struct global_candidate global_candidate = {0, 0, MAX_EST}; 1092 | + 1093 | +#include "fair_numa.h" 1094 | +#include "fair_debug.h" 1095 | +#include "fair_dep_funcs.h" 1096 | + 1097 | +static inline int clear_this_candidate(struct sched_entity *se) 1098 | +{ 1099 | + struct bs_node *bsn = &se->bs_node; 1100 | + struct bs_node *curr_can = READ_ONCE(global_candidate.candidate); 1101 | + 1102 | + if (bsn != curr_can) 1103 | + return 0; 1104 | + 1105 | + WRITE_ONCE(global_candidate.candidate, NULL); 1106 | + WRITE_ONCE(global_candidate.rq, NULL); 1107 | + WRITE_ONCE(global_candidate.est, MAX_EST); 1108 | + 1109 | + return 1; 1110 | +} 1111 | + 1112 | +static inline void clear_rq_candidate(struct cfs_rq *cfs_rq) 1113 | +{ 1114 | + struct rq *rq = READ_ONCE(global_candidate.rq); 1115 | + 1116 | + if (rq != rq_of(cfs_rq)) 1117 | + return; 1118 | + 1119 | + WRITE_ONCE(global_candidate.candidate, NULL); 1120 | + WRITE_ONCE(global_candidate.rq, NULL); 1121 | + WRITE_ONCE(global_candidate.est, MAX_EST); 1122 | +} 1123 | + 1124 | +static inline void __update_candidate(struct cfs_rq *cfs_rq, struct bs_node *bsn) 1125 | +{ 1126 | + unsigned long flags; 1127 | + u64 curr_cand_est; 1128 | + 1129 | + curr_cand_est = READ_ONCE(global_candidate.est); 1130 | + 1131 | + if ((s64)(bsn->est - curr_cand_est) < 0) { 1132 | + raw_spin_lock_irqsave(&global_candidate.lock, flags); 1133 | + global_candidate.rq = rq_of(cfs_rq); 1134 | + global_candidate.candidate = bsn; 1135 | + global_candidate.est = bsn->est; 1136 | + raw_spin_unlock_irqrestore(&global_candidate.lock, flags); 1137 | + } 1138 | +} 1139 | + 1140 | +static inline bool 1141 | +can_be_candidate(struct bs_node *bsn, int this_cpu) 1142 | +{ 1143 | + struct task_struct *p; 1144 | + 1145 | + if (!bsn) 1146 | + return 0; 1147 | + 1148 | + p = task_of(se_of(bsn)); 1149 | + 1150 | + if (kthread_is_per_cpu(p)) 1151 | + return 0; 1152 | + 1153 | + // just migrated 1154 | + if (p->se.avg.last_update_time == 0) 1155 | + return 0; 1156 | + 1157 | + if (task_on_cpu(cpu_rq(this_cpu), p)) 1158 | + return 0; 1159 | + 1160 | + // some tasks are pinned to this cpu 1161 | + if (p->nr_cpus_allowed <= 1) 1162 | + return 0; 1163 | + 1164 | + if (is_migration_disabled(p)) 1165 | + return 0; 1166 | + 1167 | + return 1; 1168 | +} 1169 | + 1170 | +static void update_candidate(struct cfs_rq *cfs_rq) 1171 | +{ 1172 | + struct bs_node *bsn = NULL; 1173 | + int this_cpu = cpu_of(rq_of(cfs_rq)); 1174 | + 1175 | + if (can_be_candidate(cfs_rq->head, this_cpu)) 1176 | + bsn = cfs_rq->head; 1177 | + else if (can_be_candidate(cfs_rq->q2_head, this_cpu)) 1178 | + bsn = cfs_rq->q2_head; 1179 | + 1180 | + if (bsn) 1181 | + __update_candidate(cfs_rq, bsn); 1182 | +} 1183 | + 1184 | +static void update_curr(struct cfs_rq *cfs_rq) 1185 | +{ 1186 | + struct sched_entity *curr = cfs_rq->curr; 1187 | + struct task_struct *curtask = task_of(curr); 1188 | + u64 now = rq_clock_task(rq_of(cfs_rq)); 1189 | + s64 delta_exec, calc; 1190 | + 1191 | + if (unlikely(!curr)) 1192 | + return; 1193 | + 1194 | + delta_exec = now - curr->exec_start; 1195 | + if (unlikely(delta_exec <= 0)) 1196 | + return; 1197 | + 1198 | + curr->exec_start = now; 1199 | + curr->sum_exec_runtime += delta_exec; 1200 | + 1201 | + if (schedstat_enabled()) { 1202 | + struct sched_statistics *stats; 1203 | + 1204 | + stats = __schedstats_from_se(curr); 1205 | + __schedstat_set(stats->exec_max, 1206 | + max(delta_exec, stats->exec_max)); 1207 | + } 1208 | + 1209 | + calc = 
calc_delta_fair(delta_exec, curr);
1210 | + curr->vruntime += calc;
1211 | + curr->bs_node.vburst += calc;
1212 | + curr->bs_node.c_vrt_start += calc;
1213 | + curr->bs_node.r_vrt_start += calc;
1214 | +#ifdef CONFIG_SCHED_DEBUG
1215 | + curr->bs_node.prev_vburst = curr->bs_node.vburst;
1216 | +#endif
1217 | + update_deadline(cfs_rq, curr);
1218 | +
1219 | + cfs_rq->local_cand_est = curr->bs_node.est;
1220 | +
1221 | + trace_sched_stat_runtime(curtask, delta_exec);
1222 | + account_group_exec_runtime(curtask, delta_exec);
1223 | + cgroup_account_cputime(curtask, delta_exec);
1224 | + if (curtask->dl_server)
1225 | + dl_server_update(curtask->dl_server, delta_exec);
1226 | +}
1227 | +
1228 | +static void update_curr_fair(struct rq *rq)
1229 | +{
1230 | + update_curr(cfs_rq_of(&rq->curr->se));
1231 | +}
1232 | +
1233 | +/**
1234 | + * Should `a` preempt `b`?
1235 | + */
1236 | +static inline bool entity_before(struct bs_node *a, struct bs_node *b)
1237 | +{
1238 | + return (s64)(a->est - b->est) < 0;
1239 | +}
1240 | +
1241 | +static void __enqueue_entity(struct bs_node **q, struct bs_node *bsn)
1242 | +{
1243 | + struct bs_node *prev;
1244 | +
1245 | + if (!(*q) || entity_before(bsn, *q)) {
1246 | + bsn->next = *q;
1247 | + *q = bsn;
1248 | + return;
1249 | + }
1250 | +
1251 | + // insert after prev
1252 | + prev = *q;
1253 | + while (prev->next && entity_before(prev->next, bsn))
1254 | + prev = prev->next;
1255 | +
1256 | + bsn->next = prev->next;
1257 | + prev->next = bsn;
1258 | +}
1259 | +
1260 | +static void __dequeue_entity_from_q2(struct cfs_rq *cfs_rq, struct bs_node *bsn)
1261 | +{
1262 | + struct bs_node *prev, *itr;
1263 | +
1264 | + itr = cfs_rq->q2_head;
1265 | + prev = NULL;
1266 | +
1267 | + while (itr && itr != bsn) {
1268 | + prev = itr;
1269 | + itr = itr->next;
1270 | + }
1271 | +
1272 | + if (bsn == cfs_rq->q2_head)
1273 | + // if it is the head
1274 | + cfs_rq->q2_head = cfs_rq->q2_head->next;
1275 | + else
1276 | + prev->next = itr->next;
1277 | +}
1278 | +
1279 | +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct bs_node *bsn)
1280 | +{
1281 | + struct bs_node *prev, *itr;
1282 | +
1283 | + itr = cfs_rq->head;
1284 | + prev = NULL;
1285 | +
1286 | + while (itr && itr != bsn) {
1287 | + prev = itr;
1288 | + itr = itr->next;
1289 | + }
1290 | +
1291 | + if (!itr) {
1292 | + // then it is in q2
1293 | + __dequeue_entity_from_q2(cfs_rq, bsn);
1294 | + return;
1295 | + }
1296 | +
1297 | + if (bsn == cfs_rq->head)
1298 | + // if it is the head
1299 | + cfs_rq->head = cfs_rq->head->next;
1300 | + else
1301 | + prev->next = itr->next;
1302 | +}
1303 | +
1304 | +static void
1305 | +update_est_entity(struct sched_entity *se)
1306 | +{
1307 | + struct bs_node *bsn = &se->bs_node;
1308 | + u64 vburst = bsn->vburst;
1309 | + u64 prev_est = bsn->est;
1310 | + u64 next_est;
1311 | +
1312 | + /*
1313 | + * alpha * vburst + (1 - alpha) * prev_est
1314 | + */
1315 | + next_est = (alpha * vburst) + ((1000 - alpha) * prev_est);
1316 | + next_est /= 1000;
1317 | +
1318 | + bsn->est = next_est;
1319 | +}
1320 | +
1321 | +static void
1322 | +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1323 | +{
1324 | + bool curr = cfs_rq->curr == se;
1325 | + bool wakeup = (flags & ENQUEUE_WAKEUP);
1326 | +
1327 | + update_curr(cfs_rq);
1328 | + account_entity_enqueue(cfs_rq, se);
1329 | +
1330 | + if (!wakeup)
1331 | + update_est_entity(se);
1332 | +
1333 | + /* Entity has migrated, no longer consider this task hot */
1334 | + if (flags & ENQUEUE_MIGRATED)
1335 | + se->exec_start = 0;
1336 |
+ 1337 | + if (!curr) 1338 | + __enqueue_entity(&cfs_rq->head, &se->bs_node); 1339 | + 1340 | + se->on_rq = 1; 1341 | +} 1342 | + 1343 | +static void 1344 | +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1345 | +{ 1346 | + update_curr(cfs_rq); 1347 | + update_est_entity(se); 1348 | + 1349 | + if (flags & DEQUEUE_SLEEP) 1350 | + se->bs_node.vburst = 0; 1351 | + 1352 | + if (se != cfs_rq->curr) 1353 | + __dequeue_entity(cfs_rq, &se->bs_node); 1354 | + 1355 | + if (clear_this_candidate(se)) 1356 | + update_candidate(cfs_rq); 1357 | + 1358 | + se->on_rq = 0; 1359 | + account_entity_dequeue(cfs_rq, se); 1360 | +} 1361 | + 1362 | +static void 1363 | +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) 1364 | +{ 1365 | + struct sched_entity *se = &p->se; 1366 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 1367 | + int idle_h_nr_running = task_has_idle_policy(p); 1368 | + int task_new = !(flags & ENQUEUE_WAKEUP); 1369 | + 1370 | + /* 1371 | + * The code below (indirectly) updates schedutil which looks at 1372 | + * the cfs_rq utilization to select a frequency. 1373 | + * Let's add the task's estimated utilization to the cfs_rq's 1374 | + * estimated utilization, before we update schedutil. 1375 | + */ 1376 | + util_est_enqueue(&rq->cfs, p); 1377 | + 1378 | + /* 1379 | + * If in_iowait is set, the code below may not trigger any cpufreq 1380 | + * utilization updates, so do it here explicitly with the IOWAIT flag 1381 | + * passed. 1382 | + */ 1383 | + if (p->in_iowait) 1384 | + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); 1385 | + 1386 | + if (!se->on_rq) { 1387 | + enqueue_entity(cfs_rq, se, flags); 1388 | + cfs_rq->h_nr_running++; 1389 | + cfs_rq->idle_h_nr_running += idle_h_nr_running; 1390 | + } 1391 | + 1392 | + se->bs_node.r_vrt_start = 0; 1393 | + 1394 | + update_candidate(cfs_rq); 1395 | + 1396 | + add_nr_running(rq, 1); 1397 | + 1398 | + if (!task_new) 1399 | + update_overutilized_status(rq); 1400 | + 1401 | + hrtick_update(rq); 1402 | +} 1403 | + 1404 | +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 1405 | +{ 1406 | + struct sched_entity *se = &p->se; 1407 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 1408 | + int task_sleep = flags & DEQUEUE_SLEEP; 1409 | + int idle_h_nr_running = task_has_idle_policy(p); 1410 | + 1411 | + util_est_dequeue(&rq->cfs, p); 1412 | + 1413 | + dequeue_entity(cfs_rq, se, flags); 1414 | + 1415 | + cfs_rq->h_nr_running--; 1416 | + cfs_rq->idle_h_nr_running -= idle_h_nr_running; 1417 | + 1418 | + sub_nr_running(rq, 1); 1419 | + util_est_update(&rq->cfs, p, task_sleep); 1420 | + hrtick_update(rq); 1421 | +} 1422 | + 1423 | +static void yield_task_fair(struct rq *rq) 1424 | +{ 1425 | + struct task_struct *curr = rq->curr; 1426 | + struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1427 | + 1428 | + /* 1429 | + * Are we the only task in the tree? 1430 | + */ 1431 | + if (unlikely(rq->nr_running == 1)) 1432 | + return; 1433 | + 1434 | + curr->se.yielded = true; 1435 | + 1436 | + update_rq_clock(rq); 1437 | + /* 1438 | + * Update run-time statistics of the 'current'. 1439 | + */ 1440 | + update_curr(cfs_rq); 1441 | + /* 1442 | + * Tell update_rq_clock() that we've just updated, 1443 | + * so we don't do microscopic update in schedule() 1444 | + * and double the fastpath cost. 
1445 | + */ 1446 | + rq_clock_skip_update(rq); 1447 | +} 1448 | + 1449 | +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) 1450 | +{ 1451 | + struct sched_entity *se = &p->se; 1452 | + 1453 | + if (!se->on_rq) 1454 | + return false; 1455 | + 1456 | + yield_task_fair(rq); 1457 | + return true; 1458 | +} 1459 | + 1460 | +static __always_inline 1461 | +int __entity_end_quota(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1462 | +{ 1463 | + unsigned int n = max(cfs_rq->nr_running, 1); 1464 | + unsigned int quota; 1465 | + struct bs_node *bs = &curr->bs_node; 1466 | + 1467 | + quota = max(bs_shared_quota / n, sysctl_sched_base_slice); 1468 | + 1469 | + return (s64)(bs->r_vrt_start - (u64)quota) >= 0; 1470 | +} 1471 | + 1472 | +static int entity_end_quota(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1473 | +{ 1474 | + unsigned int n = cfs_rq->nr_running; 1475 | + 1476 | + if (n <= 1) 1477 | + return 0; 1478 | + 1479 | + return __entity_end_quota(cfs_rq, curr); 1480 | +} 1481 | + 1482 | +static int entity_end_min_slice(struct sched_entity *curr) 1483 | +{ 1484 | + struct bs_node *bs = &curr->bs_node; 1485 | + 1486 | + return (s64)(bs->c_vrt_start - (u64)sysctl_sched_base_slice) >= 0; 1487 | +} 1488 | + 1489 | +static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) 1490 | +{ 1491 | + struct cfs_rq *cfs_rq = &rq->cfs; 1492 | + struct task_struct *curr = rq->curr; 1493 | + struct sched_entity *curr_se = &curr->se, *pse = &p->se; 1494 | + int cse_is_idle, pse_is_idle; 1495 | + 1496 | + if (unlikely(curr_se == pse)) 1497 | + return; 1498 | + 1499 | + if (test_tsk_need_resched(curr)) 1500 | + return; 1501 | + 1502 | + /* Idle tasks are by definition preempted by non-idle tasks. */ 1503 | + if (unlikely(task_has_idle_policy(curr)) && 1504 | + likely(!task_has_idle_policy(p))) 1505 | + goto preempt; 1506 | + 1507 | + /* 1508 | + * Batch and idle tasks do not preempt non-idle tasks (their preemption 1509 | + * is driven by the tick): 1510 | + */ 1511 | + if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) 1512 | + return; 1513 | + 1514 | + cse_is_idle = se_is_idle(curr_se); 1515 | + pse_is_idle = se_is_idle(pse); 1516 | + 1517 | + /* 1518 | + * Preempt an idle group in favor of a non-idle group (and don't preempt 1519 | + * in the inverse case). 
1520 | + */ 1521 | + if (cse_is_idle && !pse_is_idle) 1522 | + goto preempt; 1523 | + if (cse_is_idle != pse_is_idle) 1524 | + return; 1525 | + 1526 | + update_curr(cfs_rq_of(curr_se)); 1527 | + 1528 | + /* 1529 | + * - if curr_se ended quoat then preempt 1530 | + * - if waked entity is before curr_se and 1531 | + * curr_se ended min slice 1532 | + */ 1533 | + if (__entity_end_quota(cfs_rq, curr_se)) 1534 | + goto preempt; 1535 | + 1536 | + if (entity_before(&pse->bs_node, &curr_se->bs_node)) 1537 | + goto preempt; 1538 | + 1539 | + return; 1540 | + 1541 | +preempt: 1542 | + resched_curr(rq); 1543 | +} 1544 | + 1545 | +static void 1546 | +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 1547 | +{ 1548 | + if (se->on_rq) 1549 | + __dequeue_entity(cfs_rq, &se->bs_node); 1550 | + 1551 | + se->exec_start = rq_clock_task(rq_of(cfs_rq)); 1552 | + 1553 | + se->bs_node.c_vrt_start = 0; 1554 | + 1555 | + update_candidate(cfs_rq); 1556 | + cfs_rq->local_cand_est = se->bs_node.est; 1557 | + 1558 | + cfs_rq->curr = se; 1559 | + se->prev_sum_exec_runtime = se->sum_exec_runtime; 1560 | +} 1561 | + 1562 | +struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 1563 | +{ 1564 | + if (!cfs_rq->head) 1565 | + return NULL; 1566 | + 1567 | + return se_of(cfs_rq->head); 1568 | +} 1569 | + 1570 | +static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 1571 | +{ 1572 | + struct bs_node *bs_curr = &cfs_rq->curr->bs_node; 1573 | + 1574 | + /* 1575 | + * Here we avoid picking curr 1576 | + * while __pick_first_entity picks the 1577 | + * min since curr == NULL 1578 | + */ 1579 | + if (cfs_rq->head == bs_curr) { 1580 | + if (!cfs_rq->head->next) 1581 | + return NULL; 1582 | + 1583 | + return se_of(cfs_rq->head->next); 1584 | + } 1585 | + 1586 | + return se_of(cfs_rq->head); 1587 | +} 1588 | + 1589 | +static struct sched_entity* pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1590 | +{ 1591 | + if (!cfs_rq->head) { 1592 | + // need to switch to q2 1593 | + cfs_rq->head = cfs_rq->q2_head; 1594 | + cfs_rq->q2_head = NULL; 1595 | + } 1596 | + 1597 | + if (!cfs_rq->head) 1598 | + return NULL; 1599 | + 1600 | + if (!cfs_rq->curr) 1601 | + return __pick_first_entity(cfs_rq); 1602 | + 1603 | + return __pick_next_entity(cfs_rq); 1604 | +} 1605 | + 1606 | +struct task_struct * 1607 | +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 1608 | +{ 1609 | + struct cfs_rq *cfs_rq = &rq->cfs; 1610 | + struct sched_entity *se; 1611 | + struct task_struct *p; 1612 | + int new_tasks; 1613 | + 1614 | + /* 1615 | + * to cpu0, don't push any 1616 | + * candidates to this rq 1617 | + */ 1618 | + cfs_rq->local_cand_est = 0; 1619 | + clear_rq_candidate(cfs_rq); 1620 | + 1621 | +again: 1622 | + if (!sched_fair_runnable(rq)) 1623 | + goto idle; 1624 | + 1625 | + if (prev) 1626 | + put_prev_task(rq, prev); 1627 | + 1628 | + se = pick_next_entity(cfs_rq, NULL); 1629 | + set_next_entity(cfs_rq, se); 1630 | + 1631 | + p = task_of(se); 1632 | + 1633 | +done: __maybe_unused; 1634 | + if (hrtick_enabled_fair(rq)) 1635 | + hrtick_start_fair(rq, p); 1636 | + 1637 | + update_misfit_status(p, rq); 1638 | + 1639 | + return p; 1640 | + 1641 | +idle: 1642 | + cfs_rq->local_cand_est = MAX_EST; 1643 | + 1644 | + if (!rf) 1645 | + return NULL; 1646 | + 1647 | + new_tasks = newidle_balance(rq, rf); 1648 | + 1649 | + /* 1650 | + * Because newidle_balance() releases (and re-acquires) rq->lock, it is 1651 | + * possible for any higher priority task to appear. 
In that case we 1652 | + * must re-start the pick_next_entity() loop. 1653 | + */ 1654 | + if (new_tasks < 0) 1655 | + return RETRY_TASK; 1656 | + 1657 | + if (new_tasks > 0) 1658 | + goto again; 1659 | + 1660 | + /* 1661 | + * rq is about to be idle, check if we need to update the 1662 | + * lost_idle_time of clock_pelt 1663 | + */ 1664 | + update_idle_rq_clock_pelt(rq); 1665 | + 1666 | + return NULL; 1667 | +} 1668 | + 1669 | +static struct task_struct *__pick_next_task_fair(struct rq *rq) 1670 | +{ 1671 | + return pick_next_task_fair(rq, NULL, NULL); 1672 | +} 1673 | + 1674 | +#ifdef CONFIG_SMP 1675 | +static struct task_struct *pick_task_fair(struct rq *rq) 1676 | +{ 1677 | + struct sched_entity *se; 1678 | + struct cfs_rq *cfs_rq = &rq->cfs; 1679 | + struct sched_entity *curr = cfs_rq->curr; 1680 | + 1681 | + /* 1682 | + * to cpu0, don't push any 1683 | + * candidates to this rq 1684 | + */ 1685 | + cfs_rq->local_cand_est = 0; 1686 | + clear_rq_candidate(cfs_rq); 1687 | + 1688 | + if (!cfs_rq->nr_running) 1689 | + return NULL; 1690 | + 1691 | + /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ 1692 | + if (curr) { 1693 | + if (curr->on_rq) 1694 | + update_curr(cfs_rq); 1695 | + else 1696 | + curr = NULL; 1697 | + } 1698 | + 1699 | + se = pick_next_entity(cfs_rq, curr); 1700 | + 1701 | + return task_of(se); 1702 | +} 1703 | +#endif 1704 | + 1705 | +static void __enqueue_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 1706 | +{ 1707 | + if (se->yielded || entity_end_quota(cfs_rq, se)) { 1708 | + se->yielded = false; 1709 | + se->bs_node.r_vrt_start = 0; 1710 | + 1711 | + __enqueue_entity(&cfs_rq->q2_head, &se->bs_node); 1712 | + } else { 1713 | + __enqueue_entity(&cfs_rq->head, &se->bs_node); 1714 | + } 1715 | +} 1716 | + 1717 | +static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1718 | +{ 1719 | + /* 1720 | + * If still on the runqueue then deactivate_task() 1721 | + * was not called and update_curr() has to be done: 1722 | + */ 1723 | + if (prev->on_rq) { 1724 | + update_curr(cfs_rq); 1725 | + __enqueue_prev_entity(cfs_rq, prev); 1726 | + } 1727 | + 1728 | + update_est_entity(prev); 1729 | + 1730 | + cfs_rq->curr = NULL; 1731 | +} 1732 | + 1733 | +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) 1734 | +{ 1735 | + struct sched_entity *se = &prev->se; 1736 | + 1737 | + put_prev_entity(cfs_rq_of(se), se); 1738 | +} 1739 | + 1740 | +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) 1741 | +{ 1742 | + struct sched_entity *se = &p->se; 1743 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 1744 | + 1745 | + set_next_entity(cfs_rq, se); 1746 | +} 1747 | + 1748 | + 1749 | +static void 1750 | +entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) 1751 | +{ 1752 | + struct sched_entity *se; 1753 | + 1754 | + update_curr(cfs_rq); 1755 | + 1756 | +#ifdef CONFIG_SCHED_HRTICK 1757 | + /* 1758 | + * queued ticks are scheduled to match the slice, so don't bother 1759 | + * validating it and just reschedule. 
1760 | + */ 1761 | + if (queued) { 1762 | + resched_curr(rq_of(cfs_rq)); 1763 | + return; 1764 | + } 1765 | + 1766 | + if (cfs_rq->nr_running <= 1) { 1767 | + clear_rq_candidate(cfs_rq); 1768 | + } else { 1769 | + if (curr->yielded || entity_end_quota(cfs_rq, curr)) { 1770 | + resched_curr(rq_of(cfs_rq)); 1771 | + return; 1772 | + } 1773 | + 1774 | + se = __pick_first_entity(cfs_rq); 1775 | + if (!se) 1776 | + return; 1777 | + 1778 | + if (entity_before(&se->bs_node, &curr->bs_node) && entity_end_min_slice(curr)) { 1779 | + resched_curr(rq_of(cfs_rq)); 1780 | + return; 1781 | + } 1782 | + } 1783 | + 1784 | + /* 1785 | + * don't let the period tick interfere with the hrtick preemption 1786 | + */ 1787 | + if (!sched_feat(DOUBLE_TICK) && 1788 | + hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) 1789 | + return; 1790 | +#endif 1791 | +} 1792 | + 1793 | +#include "balancer.h" 1794 | + 1795 | +static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) 1796 | +{ 1797 | + struct sched_entity *se = &curr->se; 1798 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 1799 | + 1800 | + entity_tick(cfs_rq, se, queued); 1801 | + 1802 | + if (static_branch_unlikely(&sched_numa_balancing)) 1803 | + task_tick_numa(rq, curr); 1804 | + 1805 | + update_misfit_status(curr, rq); 1806 | + update_overutilized_status(task_rq(curr)); 1807 | +} 1808 | + 1809 | +static void task_fork_fair(struct task_struct *p) 1810 | +{ 1811 | + struct cfs_rq *cfs_rq; 1812 | + struct sched_entity *curr; 1813 | + struct rq *rq = this_rq(); 1814 | + struct rq_flags rf; 1815 | + 1816 | + rq_lock(rq, &rf); 1817 | + update_rq_clock(rq); 1818 | + 1819 | + cfs_rq = task_cfs_rq(current); 1820 | + curr = cfs_rq->curr; 1821 | + if (curr) 1822 | + update_curr(cfs_rq); 1823 | + 1824 | + rq_unlock(rq, &rf); 1825 | +} 1826 | + 1827 | +/* 1828 | + * All the scheduling class methods: 1829 | + */ 1830 | +DEFINE_SCHED_CLASS(fair) = { 1831 | + 1832 | + .enqueue_task = enqueue_task_fair, 1833 | + .dequeue_task = dequeue_task_fair, 1834 | + .yield_task = yield_task_fair, 1835 | + .yield_to_task = yield_to_task_fair, 1836 | + 1837 | + .wakeup_preempt = check_preempt_wakeup_fair, 1838 | + 1839 | + .pick_next_task = __pick_next_task_fair, 1840 | + .put_prev_task = put_prev_task_fair, 1841 | + .set_next_task = set_next_task_fair, 1842 | + 1843 | +#ifdef CONFIG_SMP 1844 | + .balance = balance_fair, 1845 | + .pick_task = pick_task_fair, 1846 | + .select_task_rq = select_task_rq_fair, 1847 | + .migrate_task_rq = migrate_task_rq_fair, 1848 | + 1849 | + .rq_online = rq_online_fair, 1850 | + .rq_offline = rq_offline_fair, 1851 | + 1852 | + .task_dead = task_dead_fair, 1853 | + .set_cpus_allowed = set_cpus_allowed_common, 1854 | +#endif 1855 | + 1856 | + .task_tick = task_tick_fair, 1857 | + .task_fork = task_fork_fair, 1858 | + 1859 | + .prio_changed = prio_changed_fair, 1860 | + .switched_from = switched_from_fair, 1861 | + .switched_to = switched_to_fair, 1862 | + 1863 | + .get_rr_interval = get_rr_interval_fair, 1864 | + 1865 | + .update_curr = update_curr_fair, 1866 | +}; 1867 | + 1868 | +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 1869 | + unsigned long weight) 1870 | +{ 1871 | + bool curr = cfs_rq->curr == se; 1872 | + 1873 | + if (se->on_rq) { 1874 | + /* commit outstanding execution time */ 1875 | + if (curr) 1876 | + update_curr(cfs_rq); 1877 | + 1878 | + update_load_sub(&cfs_rq->load, se->load.weight); 1879 | + } 1880 | + dequeue_load_avg(cfs_rq, se); 1881 | + 1882 | + update_load_set(&se->load, weight); 1883 | 
+ 1884 | +#ifdef CONFIG_SMP 1885 | + do { 1886 | + u32 divider = get_pelt_divider(&se->avg); 1887 | + 1888 | + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); 1889 | + } while (0); 1890 | +#endif 1891 | + 1892 | + enqueue_load_avg(cfs_rq, se); 1893 | + if (se->on_rq) 1894 | + update_load_add(&cfs_rq->load, se->load.weight); 1895 | +} 1896 | + 1897 | +void reweight_task(struct task_struct *p, int prio) 1898 | +{ 1899 | + struct sched_entity *se = &p->se; 1900 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 1901 | + struct load_weight *load = &se->load; 1902 | + unsigned long weight = scale_load(sched_prio_to_weight[prio]); 1903 | + 1904 | + reweight_entity(cfs_rq, se, weight); 1905 | + load->inv_weight = sched_prio_to_wmult[prio]; 1906 | +} 1907 | + 1908 | +/* Working cpumask for: load_balance, load_balance_newidle. */ 1909 | +static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); 1910 | +static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); 1911 | +static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); 1912 | + 1913 | +__init void init_sched_fair_class(void) 1914 | +{ 1915 | +#ifdef CONFIG_SMP 1916 | + int i; 1917 | + 1918 | + for_each_possible_cpu(i) { 1919 | + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); 1920 | + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); 1921 | + zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i), 1922 | + GFP_KERNEL, cpu_to_node(i)); 1923 | + } 1924 | + 1925 | + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 1926 | + 1927 | +#ifdef CONFIG_NO_HZ_COMMON 1928 | + nohz.next_balance = jiffies; 1929 | + nohz.next_blocked = jiffies; 1930 | + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 1931 | +#endif 1932 | +#endif /* SMP */ 1933 | + 1934 | +} 1935 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 1936 | index 9116bcc90346..f8f5ad0d2f90 100644 1937 | --- a/kernel/sched/core.c 1938 | +++ b/kernel/sched/core.c 1939 | @@ -4525,6 +4525,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 1940 | p->se.vruntime = 0; 1941 | p->se.vlag = 0; 1942 | p->se.slice = sysctl_sched_base_slice; 1943 | + 1944 | +#ifdef CONFIG_ECHO_SCHED 1945 | + p->se.bs_node.vburst = 0; 1946 | + p->se.bs_node.est = 0; 1947 | +#endif 1948 | + 1949 | INIT_LIST_HEAD(&p->se.group_node); 1950 | 1951 | #ifdef CONFIG_FAIR_GROUP_SCHED 1952 | @@ -4687,6 +4693,15 @@ static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, 1953 | 1954 | #ifdef CONFIG_SYSCTL 1955 | static struct ctl_table sched_core_sysctls[] = { 1956 | +#ifdef CONFIG_ECHO_SCHED 1957 | + { 1958 | + .procname = "sched_bs_shared_quota", 1959 | + .data = &bs_shared_quota, 1960 | + .maxlen = sizeof(unsigned int), 1961 | + .mode = 0644, 1962 | + .proc_handler = proc_dointvec, 1963 | + }, 1964 | +#endif 1965 | #ifdef CONFIG_SCHEDSTATS 1966 | { 1967 | .procname = "sched_schedstats", 1968 | @@ -9912,6 +9927,10 @@ void __init sched_init(void) 1969 | 1970 | wait_bit_init(); 1971 | 1972 | +#ifdef CONFIG_ECHO_SCHED 1973 | + printk(KERN_INFO "ECHO CPU scheduler v6.8 by Hamad Al Marri."); 1974 | +#endif 1975 | + 1976 | #ifdef CONFIG_FAIR_GROUP_SCHED 1977 | ptr += 2 * nr_cpu_ids * sizeof(void **); 1978 | #endif 1979 | diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c 1980 | index 8d5d98a5834d..ec7d41bc6d44 100644 1981 | --- a/kernel/sched/debug.c 1982 | +++ b/kernel/sched/debug.c 1983 | @@ -1003,6 +1003,11 @@ void proc_sched_show_task(struct task_struct *p, struct 
pid_namespace *ns, 1984 | PN(se.exec_start); 1985 | PN(se.vruntime); 1986 | PN(se.sum_exec_runtime); 1987 | +#ifdef CONFIG_ECHO_SCHED 1988 | + PN(se.bs_node.vburst); 1989 | + PN(se.bs_node.prev_vburst); 1990 | + PN(se.bs_node.est); 1991 | +#endif 1992 | 1993 | nr_switches = p->nvcsw + p->nivcsw; 1994 | 1995 | diff --git a/kernel/sched/fair_debug.h b/kernel/sched/fair_debug.h 1996 | new file mode 100644 1997 | index 000000000000..2778cf58000f 1998 | --- /dev/null 1999 | +++ b/kernel/sched/fair_debug.h 2000 | @@ -0,0 +1,137 @@ 2001 | +#ifdef CONFIG_SCHED_DEBUG 2002 | +/* 2003 | + * The initial- and re-scaling of tunables is configurable 2004 | + * 2005 | + * Options are: 2006 | + * 2007 | + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 2008 | + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) 2009 | + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus 2010 | + * 2011 | + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) 2012 | + */ 2013 | +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; 2014 | +static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; 2015 | + 2016 | +struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) 2017 | +{ 2018 | + return NULL; 2019 | +} 2020 | + 2021 | +struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 2022 | +{ 2023 | + return NULL; 2024 | +} 2025 | + 2026 | +static unsigned int get_update_sysctl_factor(void) 2027 | +{ 2028 | + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); 2029 | + unsigned int factor; 2030 | + 2031 | + switch (sysctl_sched_tunable_scaling) { 2032 | + case SCHED_TUNABLESCALING_NONE: 2033 | + factor = 1; 2034 | + break; 2035 | + case SCHED_TUNABLESCALING_LINEAR: 2036 | + factor = cpus; 2037 | + break; 2038 | + case SCHED_TUNABLESCALING_LOG: 2039 | + default: 2040 | + factor = 1 + ilog2(cpus); 2041 | + break; 2042 | + } 2043 | + 2044 | + return factor; 2045 | +} 2046 | + 2047 | +/************************************************************** 2048 | + * Scheduling class statistics methods: 2049 | + */ 2050 | +#ifdef CONFIG_SMP 2051 | +int sched_update_scaling(void) 2052 | +{ 2053 | + unsigned int factor = get_update_sysctl_factor(); 2054 | + 2055 | +#define WRT_SYSCTL(name) \ 2056 | + (normalized_sysctl_##name = sysctl_##name / (factor)) 2057 | + WRT_SYSCTL(sched_base_slice); 2058 | +#undef WRT_SYSCTL 2059 | + 2060 | + return 0; 2061 | +} 2062 | +#endif 2063 | + 2064 | +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) 2065 | +{ 2066 | + return se->vruntime < 750000ULL; 2067 | +} 2068 | + 2069 | +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ 2070 | + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) 2071 | + 2072 | +void print_cfs_stats(struct seq_file *m, int cpu) 2073 | +{ 2074 | + struct cfs_rq *cfs_rq, *pos; 2075 | + 2076 | + rcu_read_lock(); 2077 | + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) 2078 | + print_cfs_rq(m, cpu, cfs_rq); 2079 | + rcu_read_unlock(); 2080 | +} 2081 | + 2082 | +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 2083 | +{ 2084 | + return (s64)se->vruntime; 2085 | +} 2086 | + 2087 | +/* 2088 | + * Specifically: avg_runtime() + 0 must result in entity_eligible() := true 2089 | + * For this to be so, the result of this function must have a left bias. 
2090 | + */ 2091 | +u64 avg_vruntime(struct cfs_rq *cfs_rq) 2092 | +{ 2093 | + struct sched_entity *curr = cfs_rq->curr; 2094 | + s64 avg = cfs_rq->avg_vruntime; 2095 | + long load = cfs_rq->avg_load; 2096 | + 2097 | + if (curr && curr->on_rq) { 2098 | + unsigned long weight = scale_load_down(curr->load.weight); 2099 | + 2100 | + avg += entity_key(cfs_rq, curr) * weight; 2101 | + load += weight; 2102 | + } 2103 | + 2104 | + if (load) { 2105 | + /* sign flips effective floor / ceil */ 2106 | + if (avg < 0) 2107 | + avg -= (load - 1); 2108 | + avg = div_s64(avg, load); 2109 | + } 2110 | + 2111 | + return avg; 2112 | +} 2113 | + 2114 | +#ifdef CONFIG_NUMA_BALANCING 2115 | +void show_numa_stats(struct task_struct *p, struct seq_file *m) 2116 | +{ 2117 | + int node; 2118 | + unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; 2119 | + struct numa_group *ng; 2120 | + 2121 | + rcu_read_lock(); 2122 | + ng = rcu_dereference(p->numa_group); 2123 | + for_each_online_node(node) { 2124 | + if (p->numa_faults) { 2125 | + tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; 2126 | + tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; 2127 | + } 2128 | + if (ng) { 2129 | + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], 2130 | + gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; 2131 | + } 2132 | + print_numa_stats(m, node, tsf, tpf, gsf, gpf); 2133 | + } 2134 | + rcu_read_unlock(); 2135 | +} 2136 | +#endif // CONFIG_NUMA_BALANCING 2137 | +#endif // CONFIG_SCHED_DEBUG 2138 | diff --git a/kernel/sched/fair_dep_funcs.h b/kernel/sched/fair_dep_funcs.h 2139 | new file mode 100644 2140 | index 000000000000..d4411cded78b 2141 | --- /dev/null 2142 | +++ b/kernel/sched/fair_dep_funcs.h 2143 | @@ -0,0 +1,828 @@ 2144 | +/* 2145 | + * Used by other classes to account runtime. 2146 | + */ 2147 | +s64 update_curr_common(struct rq *rq) 2148 | +{ 2149 | + struct sched_entity *curr = &rq->curr->se; 2150 | + struct task_struct *curtask = task_of(curr); 2151 | + u64 now = rq_clock_task(rq); 2152 | + s64 delta_exec; 2153 | + 2154 | + if (unlikely(!curr)) 2155 | + return 0; 2156 | + 2157 | + delta_exec = now - curr->exec_start; 2158 | + if (unlikely(delta_exec <= 0)) 2159 | + return delta_exec; 2160 | + 2161 | + curr->exec_start = now; 2162 | + curr->sum_exec_runtime += delta_exec; 2163 | + 2164 | + if (schedstat_enabled()) { 2165 | + struct sched_statistics *stats; 2166 | + 2167 | + stats = __schedstats_from_se(curr); 2168 | + __schedstat_set(stats->exec_max, 2169 | + max(delta_exec, stats->exec_max)); 2170 | + } 2171 | + 2172 | + trace_sched_stat_runtime(curtask, delta_exec); 2173 | + account_group_exec_runtime(curtask, delta_exec); 2174 | + cgroup_account_cputime(curtask, delta_exec); 2175 | + if (curtask->dl_server) 2176 | + dl_server_update(curtask->dl_server, delta_exec); 2177 | + 2178 | + return delta_exec; 2179 | +} 2180 | + 2181 | +#if defined(CONFIG_NO_HZ_FULL) && defined(CONFIG_CGROUP_SCHED) 2182 | +bool cfs_task_bw_constrained(struct task_struct *p) 2183 | +{ 2184 | + return false; 2185 | +} 2186 | +#endif 2187 | + 2188 | +/* 2189 | + * After fork, child runs first. If set to 0 (default) then 2190 | + * parent will (try to) run first. 
2191 | + */ 2192 | +unsigned int sysctl_sched_child_runs_first __read_mostly; 2193 | + 2194 | +const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 2195 | + 2196 | +void __init sched_init_granularity(void) {} 2197 | + 2198 | +#ifdef CONFIG_SMP 2199 | +/* Give new sched_entity start runnable values to heavy its load in infant time */ 2200 | +void init_entity_runnable_average(struct sched_entity *se) {} 2201 | +void post_init_entity_util_avg(struct task_struct *p) {} 2202 | +void update_max_interval(void) {} 2203 | +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); 2204 | +#endif /** CONFIG_SMP */ 2205 | + 2206 | +void init_cfs_rq(struct cfs_rq *cfs_rq) 2207 | +{ 2208 | + cfs_rq->tasks_timeline = RB_ROOT_CACHED; 2209 | +#ifdef CONFIG_SMP 2210 | + raw_spin_lock_init(&cfs_rq->removed.lock); 2211 | +#endif 2212 | +} 2213 | + 2214 | +static inline struct sched_entity *se_of(struct bs_node *bsn) 2215 | +{ 2216 | + return container_of(bsn, struct sched_entity, bs_node); 2217 | +} 2218 | + 2219 | +#ifdef CONFIG_SCHED_SMT 2220 | +DEFINE_STATIC_KEY_FALSE(sched_smt_present); 2221 | +EXPORT_SYMBOL_GPL(sched_smt_present); 2222 | + 2223 | +static inline void set_idle_cores(int cpu, int val) 2224 | +{ 2225 | + struct sched_domain_shared *sds; 2226 | + 2227 | + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 2228 | + if (sds) 2229 | + WRITE_ONCE(sds->has_idle_cores, val); 2230 | +} 2231 | + 2232 | +static inline bool test_idle_cores(int cpu) 2233 | +{ 2234 | + struct sched_domain_shared *sds; 2235 | + 2236 | + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 2237 | + if (sds) 2238 | + return READ_ONCE(sds->has_idle_cores); 2239 | + 2240 | + return false; 2241 | +} 2242 | + 2243 | +void __update_idle_core(struct rq *rq) 2244 | +{ 2245 | + int core = cpu_of(rq); 2246 | + int cpu; 2247 | + 2248 | + rcu_read_lock(); 2249 | + if (test_idle_cores(core)) 2250 | + goto unlock; 2251 | + 2252 | + for_each_cpu(cpu, cpu_smt_mask(core)) { 2253 | + if (cpu == core) 2254 | + continue; 2255 | + 2256 | + if (!available_idle_cpu(cpu)) 2257 | + goto unlock; 2258 | + } 2259 | + 2260 | + set_idle_cores(core, 1); 2261 | +unlock: 2262 | + rcu_read_unlock(); 2263 | +} 2264 | +#endif 2265 | + 2266 | +static inline void update_load_add(struct load_weight *lw, unsigned long inc) 2267 | +{ 2268 | + lw->weight += inc; 2269 | + lw->inv_weight = 0; 2270 | +} 2271 | + 2272 | +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 2273 | +{ 2274 | + lw->weight -= dec; 2275 | + lw->inv_weight = 0; 2276 | +} 2277 | + 2278 | +static inline void update_load_set(struct load_weight *lw, unsigned long w) 2279 | +{ 2280 | + lw->weight = w; 2281 | + lw->inv_weight = 0; 2282 | +} 2283 | + 2284 | +static int se_is_idle(struct sched_entity *se) 2285 | +{ 2286 | + return task_has_idle_policy(task_of(se)); 2287 | +} 2288 | + 2289 | +static void 2290 | +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2291 | +{ 2292 | + update_load_add(&cfs_rq->load, se->load.weight); 2293 | +#ifdef CONFIG_SMP 2294 | + struct rq *rq = rq_of(cfs_rq); 2295 | + 2296 | + account_numa_enqueue(rq, task_of(se)); 2297 | + list_add(&se->group_node, &rq->cfs_tasks); 2298 | +#endif 2299 | + cfs_rq->nr_running++; 2300 | + if (se_is_idle(se)) 2301 | + cfs_rq->idle_nr_running++; 2302 | +} 2303 | + 2304 | +static void 2305 | +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2306 | +{ 2307 | + update_load_sub(&cfs_rq->load, se->load.weight); 2308 | +#ifdef CONFIG_SMP 2309 | + 
account_numa_dequeue(rq_of(cfs_rq), task_of(se)); 2310 | + list_del_init(&se->group_node); 2311 | +#endif 2312 | + cfs_rq->nr_running--; 2313 | + if (se_is_idle(se)) 2314 | + cfs_rq->idle_nr_running--; 2315 | +} 2316 | + 2317 | +/* 2318 | + * Task first catches up with cfs_rq, and then subtract 2319 | + * itself from the cfs_rq (task must be off the queue now). 2320 | + */ 2321 | +static void remove_entity_load_avg(struct sched_entity *se) 2322 | +{ 2323 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 2324 | + unsigned long flags; 2325 | + 2326 | + raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); 2327 | + ++cfs_rq->removed.nr; 2328 | + cfs_rq->removed.util_avg += se->avg.util_avg; 2329 | + cfs_rq->removed.load_avg += se->avg.load_avg; 2330 | + cfs_rq->removed.runnable_avg += se->avg.runnable_avg; 2331 | + raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); 2332 | +} 2333 | + 2334 | +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) 2335 | +{ 2336 | + struct sched_entity *se = &p->se; 2337 | + 2338 | + /* Tell new CPU we are migrated */ 2339 | + se->avg.last_update_time = 0; 2340 | + 2341 | + p->se.yielded = false; 2342 | + 2343 | + update_scan_period(p, new_cpu); 2344 | +} 2345 | + 2346 | +static void rq_online_fair(struct rq *rq) {} 2347 | + 2348 | +static void rq_offline_fair(struct rq *rq) {} 2349 | + 2350 | +static void task_dead_fair(struct task_struct *p) 2351 | +{ 2352 | + remove_entity_load_avg(&p->se); 2353 | +} 2354 | + 2355 | +static void 2356 | +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 2357 | +{ 2358 | + if (!task_on_rq_queued(p)) 2359 | + return; 2360 | + 2361 | + if (rq->cfs.nr_running == 1) 2362 | + return; 2363 | + 2364 | + /* 2365 | + * Reschedule if we are currently running on this runqueue and 2366 | + * our priority decreased, or if we are not currently running on 2367 | + * this runqueue and our priority is higher than the current's 2368 | + */ 2369 | + if (task_current(rq, p)) { 2370 | + if (p->prio > oldprio) 2371 | + resched_curr(rq); 2372 | + } else 2373 | + wakeup_preempt(rq, p, 0); 2374 | +} 2375 | + 2376 | +static void switched_from_fair(struct rq *rq, struct task_struct *p) {} 2377 | + 2378 | +static void switched_to_fair(struct rq *rq, struct task_struct *p) 2379 | +{ 2380 | + if (task_on_rq_queued(p)) { 2381 | + /* 2382 | + * We were most likely switched from sched_rt, so 2383 | + * kick off the schedule if running, otherwise just see 2384 | + * if we can still preempt the current task. 2385 | + */ 2386 | + if (task_current(rq, p)) 2387 | + resched_curr(rq); 2388 | + else 2389 | + wakeup_preempt(rq, p, 0); 2390 | + } 2391 | +} 2392 | + 2393 | +static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 2394 | +{ 2395 | + struct sched_entity *se = &task->se; 2396 | + unsigned int rr_interval = 0; 2397 | + 2398 | + /* 2399 | + * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 2400 | + * idle runqueue: 2401 | + */ 2402 | + if (rq->cfs.load.weight) 2403 | + rr_interval = NS_TO_JIFFIES(se->slice); 2404 | + 2405 | + return rr_interval; 2406 | +} 2407 | + 2408 | +/* 2409 | + * Remove and clamp on negative, from a local variable. 2410 | + * 2411 | + * A variant of sub_positive(), which does not use explicit load-store 2412 | + * and is thus optimized for local variable updates. 
2413 | + */ 2414 | +#define lsub_positive(_ptr, _val) do { \ 2415 | + typeof(_ptr) ptr = (_ptr); \ 2416 | + *ptr -= min_t(typeof(*ptr), *ptr, _val); \ 2417 | +} while (0) 2418 | + 2419 | +static inline unsigned long task_util(struct task_struct *p) 2420 | +{ 2421 | + return READ_ONCE(p->se.avg.util_avg); 2422 | +} 2423 | + 2424 | +static inline unsigned long _task_util_est(struct task_struct *p) 2425 | +{ 2426 | + return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED; 2427 | +} 2428 | + 2429 | +static unsigned long 2430 | +cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) 2431 | +{ 2432 | + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; 2433 | + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); 2434 | + unsigned long runnable; 2435 | + 2436 | + if (boost) { 2437 | + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); 2438 | + util = max(util, runnable); 2439 | + } 2440 | + 2441 | + /* 2442 | + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its 2443 | + * contribution. If @p migrates from another CPU to @cpu add its 2444 | + * contribution. In all the other cases @cpu is not impacted by the 2445 | + * migration so its util_avg is already correct. 2446 | + */ 2447 | + if (p && task_cpu(p) == cpu && dst_cpu != cpu) 2448 | + lsub_positive(&util, task_util(p)); 2449 | + else if (p && task_cpu(p) != cpu && dst_cpu == cpu) 2450 | + util += task_util(p); 2451 | + 2452 | + if (sched_feat(UTIL_EST)) { 2453 | + unsigned long util_est; 2454 | + 2455 | + util_est = READ_ONCE(cfs_rq->avg.util_est); 2456 | + 2457 | + /* 2458 | + * During wake-up @p isn't enqueued yet and doesn't contribute 2459 | + * to any cpu_rq(cpu)->cfs.avg.util_est. 2460 | + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p 2461 | + * has been enqueued. 2462 | + * 2463 | + * During exec (@dst_cpu = -1) @p is enqueued and does 2464 | + * contribute to cpu_rq(cpu)->cfs.util_est. 2465 | + * Remove it to "simulate" cpu_util without @p's contribution. 2466 | + * 2467 | + * Despite the task_on_rq_queued(@p) check there is still a 2468 | + * small window for a possible race when an exec 2469 | + * select_task_rq_fair() races with LB's detach_task(). 2470 | + * 2471 | + * detach_task() 2472 | + * deactivate_task() 2473 | + * p->on_rq = TASK_ON_RQ_MIGRATING; 2474 | + * -------------------------------- A 2475 | + * dequeue_task() \ 2476 | + * dequeue_task_fair() + Race Time 2477 | + * util_est_dequeue() / 2478 | + * -------------------------------- B 2479 | + * 2480 | + * The additional check "current == p" is required to further 2481 | + * reduce the race window. 
2482 | + */ 2483 | + if (dst_cpu == cpu) 2484 | + util_est += _task_util_est(p); 2485 | + else if (p && unlikely(task_on_rq_queued(p) || current == p)) 2486 | + lsub_positive(&util_est, _task_util_est(p)); 2487 | + 2488 | + util = max(util, util_est); 2489 | + } 2490 | + 2491 | + return min(util, arch_scale_cpu_capacity(cpu)); 2492 | +} 2493 | + 2494 | +unsigned long cpu_util_cfs(int cpu) 2495 | +{ 2496 | + return cpu_util(cpu, NULL, -1, 0); 2497 | +} 2498 | + 2499 | +unsigned long cpu_util_cfs_boost(int cpu) 2500 | +{ 2501 | + return cpu_util(cpu, NULL, -1, 1); 2502 | +} 2503 | + 2504 | +#define WMULT_CONST (~0U) 2505 | +#define WMULT_SHIFT 32 2506 | + 2507 | +static void __update_inv_weight(struct load_weight *lw) 2508 | +{ 2509 | + unsigned long w; 2510 | + 2511 | + if (likely(lw->inv_weight)) 2512 | + return; 2513 | + 2514 | + w = scale_load_down(lw->weight); 2515 | + 2516 | + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 2517 | + lw->inv_weight = 1; 2518 | + else if (unlikely(!w)) 2519 | + lw->inv_weight = WMULT_CONST; 2520 | + else 2521 | + lw->inv_weight = WMULT_CONST / w; 2522 | +} 2523 | + 2524 | +/* 2525 | + * delta_exec * weight / lw.weight 2526 | + * OR 2527 | + * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT 2528 | + * 2529 | + * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case 2530 | + * we're guaranteed shift stays positive because inv_weight is guaranteed to 2531 | + * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. 2532 | + * 2533 | + * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus 2534 | + * weight/lw.weight <= 1, and therefore our shift will also be positive. 2535 | + */ 2536 | +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) 2537 | +{ 2538 | + u64 fact = scale_load_down(weight); 2539 | + u32 fact_hi = (u32)(fact >> 32); 2540 | + int shift = WMULT_SHIFT; 2541 | + int fs; 2542 | + 2543 | + __update_inv_weight(lw); 2544 | + 2545 | + if (unlikely(fact_hi)) { 2546 | + fs = fls(fact_hi); 2547 | + shift -= fs; 2548 | + fact >>= fs; 2549 | + } 2550 | + 2551 | + fact = mul_u32_u32(fact, lw->inv_weight); 2552 | + 2553 | + fact_hi = (u32)(fact >> 32); 2554 | + if (fact_hi) { 2555 | + fs = fls(fact_hi); 2556 | + shift -= fs; 2557 | + fact >>= fs; 2558 | + } 2559 | + 2560 | + return mul_u64_u32_shr(delta_exec, fact, shift); 2561 | +} 2562 | + 2563 | +/* 2564 | + * delta /= w 2565 | + */ 2566 | +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) 2567 | +{ 2568 | + if (unlikely(se->load.weight != NICE_0_LOAD)) 2569 | + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); 2570 | + 2571 | + return delta; 2572 | +} 2573 | + 2574 | +static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) 2575 | +{ 2576 | + unsigned int n = cfs_rq->nr_running; 2577 | + 2578 | + if (n <= 1) 2579 | + se->slice = bs_shared_quota; 2580 | + else 2581 | + se->slice = max(bs_shared_quota / n, sysctl_sched_base_slice); 2582 | +} 2583 | + 2584 | +#ifdef CONFIG_SCHED_HRTICK 2585 | +static void hrtick_start_fair(struct rq *rq, struct task_struct *p) 2586 | +{ 2587 | + struct sched_entity *se = &p->se; 2588 | + 2589 | + SCHED_WARN_ON(task_rq(p) != rq); 2590 | + 2591 | + if (rq->cfs.h_nr_running > 1) { 2592 | + u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 2593 | + u64 slice = se->slice; 2594 | + s64 delta = slice - ran; 2595 | + 2596 | + if (se->yielded || delta < 0) { 2597 | + if (task_current(rq, p)) 2598 | + resched_curr(rq); 
2599 | + return; 2600 | + } 2601 | + hrtick_start(rq, delta); 2602 | + } 2603 | +} 2604 | + 2605 | +/* 2606 | + * called from enqueue/dequeue and updates the hrtick when the 2607 | + * current task is from our class and nr_running is low enough 2608 | + * to matter. 2609 | + */ 2610 | +static void hrtick_update(struct rq *rq) 2611 | +{ 2612 | + struct task_struct *curr = rq->curr; 2613 | + 2614 | + if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) 2615 | + return; 2616 | + 2617 | + hrtick_start_fair(rq, curr); 2618 | +} 2619 | +#else /* !CONFIG_SCHED_HRTICK */ 2620 | +static inline void 2621 | +hrtick_start_fair(struct rq *rq, struct task_struct *p) 2622 | +{ 2623 | +} 2624 | + 2625 | +static inline void hrtick_update(struct rq *rq) 2626 | +{ 2627 | +} 2628 | +#endif 2629 | + 2630 | +/* 2631 | + * The margin used when comparing utilization with CPU capacity. 2632 | + * 2633 | + * (default: ~20%) 2634 | + */ 2635 | +#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) 2636 | + 2637 | +static inline int util_fits_cpu(unsigned long util, 2638 | + unsigned long uclamp_min, 2639 | + unsigned long uclamp_max, 2640 | + int cpu) 2641 | +{ 2642 | + unsigned long capacity_orig, capacity_orig_thermal; 2643 | + unsigned long capacity = capacity_of(cpu); 2644 | + bool fits, uclamp_max_fits; 2645 | + 2646 | + /* 2647 | + * Check if the real util fits without any uclamp boost/cap applied. 2648 | + */ 2649 | + fits = fits_capacity(util, capacity); 2650 | + 2651 | + if (!uclamp_is_used()) 2652 | + return fits; 2653 | + 2654 | + /* 2655 | + * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and 2656 | + * uclamp_max. We only care about capacity pressure (by using 2657 | + * capacity_of()) for comparing against the real util. 2658 | + * 2659 | + * If a task is boosted to 1024 for example, we don't want a tiny 2660 | + * pressure to skew the check whether it fits a CPU or not. 2661 | + * 2662 | + * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it 2663 | + * should fit a little cpu even if there's some pressure. 2664 | + * 2665 | + * Only exception is for thermal pressure since it has a direct impact 2666 | + * on available OPP of the system. 2667 | + * 2668 | + * We honour it for uclamp_min only as a drop in performance level 2669 | + * could result in not getting the requested minimum performance level. 2670 | + * 2671 | + * For uclamp_max, we can tolerate a drop in performance level as the 2672 | + * goal is to cap the task. So it's okay if it's getting less. 2673 | + */ 2674 | + capacity_orig = arch_scale_cpu_capacity(cpu); 2675 | + capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); 2676 | + 2677 | + /* 2678 | + * We want to force a task to fit a cpu as implied by uclamp_max. 2679 | + * But we do have some corner cases to cater for.. 
2680 | + * 2681 | + * 2682 | + * C=z 2683 | + * | ___ 2684 | + * | C=y | | 2685 | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max 2686 | + * | C=x | | | | 2687 | + * | ___ | | | | 2688 | + * | | | | | | | (util somewhere in this region) 2689 | + * | | | | | | | 2690 | + * | | | | | | | 2691 | + * +---------------------------------------- 2692 | + * cpu0 cpu1 cpu2 2693 | + * 2694 | + * In the above example if a task is capped to a specific performance 2695 | + * point, y, then when: 2696 | + * 2697 | + * * util = 80% of x then it does not fit on cpu0 and should migrate 2698 | + * to cpu1 2699 | + * * util = 80% of y then it is forced to fit on cpu1 to honour 2700 | + * uclamp_max request. 2701 | + * 2702 | + * which is what we're enforcing here. A task always fits if 2703 | + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig, 2704 | + * the normal upmigration rules should withhold still. 2705 | + * 2706 | + * Only exception is when we are on max capacity, then we need to be 2707 | + * careful not to block overutilized state. This is so because: 2708 | + * 2709 | + * 1. There's no concept of capping at max_capacity! We can't go 2710 | + * beyond this performance level anyway. 2711 | + * 2. The system is being saturated when we're operating near 2712 | + * max capacity, it doesn't make sense to block overutilized. 2713 | + */ 2714 | + uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE); 2715 | + uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig); 2716 | + fits = fits || uclamp_max_fits; 2717 | + 2718 | + /* 2719 | + * 2720 | + * C=z 2721 | + * | ___ (region a, capped, util >= uclamp_max) 2722 | + * | C=y | | 2723 | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max 2724 | + * | C=x | | | | 2725 | + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max) 2726 | + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min 2727 | + * | | | | | | | 2728 | + * | | | | | | | (region c, boosted, util < uclamp_min) 2729 | + * +---------------------------------------- 2730 | + * cpu0 cpu1 cpu2 2731 | + * 2732 | + * a) If util > uclamp_max, then we're capped, we don't care about 2733 | + * actual fitness value here. We only care if uclamp_max fits 2734 | + * capacity without taking margin/pressure into account. 2735 | + * See comment above. 2736 | + * 2737 | + * b) If uclamp_min <= util <= uclamp_max, then the normal 2738 | + * fits_capacity() rules apply. Except we need to ensure that we 2739 | + * enforce we remain within uclamp_max, see comment above. 2740 | + * 2741 | + * c) If util < uclamp_min, then we are boosted. Same as (b) but we 2742 | + * need to take into account the boosted value fits the CPU without 2743 | + * taking margin/pressure into account. 2744 | + * 2745 | + * Cases (a) and (b) are handled in the 'fits' variable already. We 2746 | + * just need to consider an extra check for case (c) after ensuring we 2747 | + * handle the case uclamp_min > uclamp_max. 
2748 | + */ 2749 | + uclamp_min = min(uclamp_min, uclamp_max); 2750 | + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) 2751 | + return -1; 2752 | + 2753 | + return fits; 2754 | +} 2755 | + 2756 | +static inline bool cpu_overutilized(int cpu) 2757 | +{ 2758 | + unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); 2759 | + unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); 2760 | + 2761 | + /* Return true only if the utilization doesn't fit CPU's capacity */ 2762 | + return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); 2763 | +} 2764 | + 2765 | +static inline void update_overutilized_status(struct rq *rq) 2766 | +{ 2767 | + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { 2768 | + WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); 2769 | + trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); 2770 | + } 2771 | +} 2772 | + 2773 | +static inline unsigned long task_util_est(struct task_struct *p) 2774 | +{ 2775 | + return max(task_util(p), _task_util_est(p)); 2776 | +} 2777 | + 2778 | +static inline void util_est_enqueue(struct cfs_rq *cfs_rq, 2779 | + struct task_struct *p) 2780 | +{ 2781 | + unsigned int enqueued; 2782 | + 2783 | + if (!sched_feat(UTIL_EST)) 2784 | + return; 2785 | + 2786 | + /* Update root cfs_rq's estimated utilization */ 2787 | + enqueued = cfs_rq->avg.util_est; 2788 | + enqueued += _task_util_est(p); 2789 | + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); 2790 | + 2791 | + trace_sched_util_est_cfs_tp(cfs_rq); 2792 | +} 2793 | + 2794 | +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, 2795 | + struct task_struct *p) 2796 | +{ 2797 | + unsigned int enqueued; 2798 | + 2799 | + if (!sched_feat(UTIL_EST)) 2800 | + return; 2801 | + 2802 | + /* Update root cfs_rq's estimated utilization */ 2803 | + enqueued = cfs_rq->avg.util_est; 2804 | + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); 2805 | + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); 2806 | + 2807 | + trace_sched_util_est_cfs_tp(cfs_rq); 2808 | +} 2809 | + 2810 | +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) 2811 | + 2812 | +static inline unsigned long task_runnable(struct task_struct *p) 2813 | +{ 2814 | + return READ_ONCE(p->se.avg.runnable_avg); 2815 | +} 2816 | + 2817 | +static inline void util_est_update(struct cfs_rq *cfs_rq, 2818 | + struct task_struct *p, 2819 | + bool task_sleep) 2820 | +{ 2821 | + unsigned int ewma, dequeued, last_ewma_diff; 2822 | + 2823 | + if (!sched_feat(UTIL_EST)) 2824 | + return; 2825 | + 2826 | + /* 2827 | + * Skip update of task's estimated utilization when the task has not 2828 | + * yet completed an activation, e.g. being migrated. 2829 | + */ 2830 | + if (!task_sleep) 2831 | + return; 2832 | + 2833 | + /* Get current estimate of utilization */ 2834 | + ewma = READ_ONCE(p->se.avg.util_est); 2835 | + 2836 | + /* 2837 | + * If the PELT values haven't changed since enqueue time, 2838 | + * skip the util_est update. 2839 | + */ 2840 | + if (ewma & UTIL_AVG_UNCHANGED) 2841 | + return; 2842 | + 2843 | + /* Get utilization at dequeue */ 2844 | + dequeued = task_util(p); 2845 | + 2846 | + /* 2847 | + * Reset EWMA on utilization increases, the moving average is used only 2848 | + * to smooth utilization decreases. 2849 | + */ 2850 | + if (ewma <= dequeued) { 2851 | + ewma = dequeued; 2852 | + goto done; 2853 | + } 2854 | + 2855 | + /* 2856 | + * Skip update of task's estimated utilization when its members are 2857 | + * already ~1% close to its last activation value. 
2858 | + */ 2859 | + last_ewma_diff = ewma - dequeued; 2860 | + if (last_ewma_diff < UTIL_EST_MARGIN) 2861 | + goto done; 2862 | + 2863 | + /* 2864 | + * To avoid overestimation of actual task utilization, skip updates if 2865 | + * we cannot grant there is idle time in this CPU. 2866 | + */ 2867 | + if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) 2868 | + return; 2869 | + 2870 | + /* 2871 | + * To avoid underestimate of task utilization, skip updates of EWMA if 2872 | + * we cannot grant that thread got all CPU time it wanted. 2873 | + */ 2874 | + if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) 2875 | + goto done; 2876 | + 2877 | + 2878 | + /* 2879 | + * Update Task's estimated utilization 2880 | + * 2881 | + * When *p completes an activation we can consolidate another sample 2882 | + * of the task size. This is done by using this value to update the 2883 | + * Exponential Weighted Moving Average (EWMA): 2884 | + * 2885 | + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) 2886 | + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) 2887 | + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) 2888 | + * = w * ( -last_ewma_diff ) + ewma(t-1) 2889 | + * = w * (-last_ewma_diff + ewma(t-1) / w) 2890 | + * 2891 | + * Where 'w' is the weight of new samples, which is configured to be 2892 | + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) 2893 | + */ 2894 | + ewma <<= UTIL_EST_WEIGHT_SHIFT; 2895 | + ewma -= last_ewma_diff; 2896 | + ewma >>= UTIL_EST_WEIGHT_SHIFT; 2897 | +done: 2898 | + ewma |= UTIL_AVG_UNCHANGED; 2899 | + WRITE_ONCE(p->se.avg.util_est, ewma); 2900 | + 2901 | + trace_sched_util_est_se_tp(&p->se); 2902 | +} 2903 | + 2904 | +static inline int task_fits_cpu(struct task_struct *p, int cpu) 2905 | +{ 2906 | + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); 2907 | + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); 2908 | + unsigned long util = task_util_est(p); 2909 | + /* 2910 | + * Return true only if the cpu fully fits the task requirements, which 2911 | + * include the utilization but also the performance hints. 2912 | + */ 2913 | + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); 2914 | +} 2915 | + 2916 | + 2917 | +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) 2918 | +{ 2919 | + if (!sched_asym_cpucap_active()) 2920 | + return; 2921 | + 2922 | + if (!p || p->nr_cpus_allowed == 1) { 2923 | + rq->misfit_task_load = 0; 2924 | + return; 2925 | + } 2926 | + 2927 | + if (task_fits_cpu(p, cpu_of(rq))) { 2928 | + rq->misfit_task_load = 0; 2929 | + return; 2930 | + } 2931 | + 2932 | + /* 2933 | + * Make sure that misfit_task_load will not be null even if 2934 | + * task_h_load() returns 0. 2935 | + */ 2936 | + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); 2937 | +} 2938 | + 2939 | +static inline void 2940 | +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2941 | +{ 2942 | + cfs_rq->avg.load_avg += se->avg.load_avg; 2943 | + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; 2944 | +} 2945 | + 2946 | +/* 2947 | + * Unsigned subtract and clamp on underflow. 2948 | + * 2949 | + * Explicitly do a load-store to ensure the intermediate value never hits 2950 | + * memory. This allows lockless observations without ever seeing the negative 2951 | + * values. 
2952 | + */ 2953 | +#define sub_positive(_ptr, _val) do { \ 2954 | + typeof(_ptr) ptr = (_ptr); \ 2955 | + typeof(*ptr) val = (_val); \ 2956 | + typeof(*ptr) res, var = READ_ONCE(*ptr); \ 2957 | + res = var - val; \ 2958 | + if (res > var) \ 2959 | + res = 0; \ 2960 | + WRITE_ONCE(*ptr, res); \ 2961 | +} while (0) 2962 | + 2963 | +static inline void 2964 | +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2965 | +{ 2966 | + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); 2967 | + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); 2968 | + /* See update_cfs_rq_load_avg() */ 2969 | + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, 2970 | + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); 2971 | +} 2972 | diff --git a/kernel/sched/fair_numa.h b/kernel/sched/fair_numa.h 2973 | new file mode 100644 2974 | index 000000000000..1d9f6ff65a3e 2975 | --- /dev/null 2976 | +++ b/kernel/sched/fair_numa.h 2977 | @@ -0,0 +1,2288 @@ 2978 | +static unsigned long capacity_of(int cpu) 2979 | +{ 2980 | + return cpu_rq(cpu)->cpu_capacity; 2981 | +} 2982 | + 2983 | +static unsigned long task_h_load(struct task_struct *p) 2984 | +{ 2985 | + return p->se.avg.load_avg; 2986 | +} 2987 | + 2988 | +static inline bool is_core_idle(int cpu) 2989 | +{ 2990 | +#ifdef CONFIG_SCHED_SMT 2991 | + int sibling; 2992 | + 2993 | + for_each_cpu(sibling, cpu_smt_mask(cpu)) { 2994 | + if (cpu == sibling) 2995 | + continue; 2996 | + 2997 | + if (!idle_cpu(sibling)) 2998 | + return false; 2999 | + } 3000 | +#endif 3001 | + 3002 | + return true; 3003 | +} 3004 | + 3005 | +#ifdef CONFIG_NUMA_BALANCING 3006 | +/* 3007 | + * Approximate time to scan a full NUMA task in ms. The task scan period is 3008 | + * calculated based on the tasks virtual memory size and 3009 | + * numa_balancing_scan_size. 3010 | + */ 3011 | +unsigned int sysctl_numa_balancing_scan_period_min = 1000; 3012 | +unsigned int sysctl_numa_balancing_scan_period_max = 60000; 3013 | + 3014 | +/* Portion of address space to scan in MB */ 3015 | +unsigned int sysctl_numa_balancing_scan_size = 256; 3016 | + 3017 | +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 3018 | +unsigned int sysctl_numa_balancing_scan_delay = 1000; 3019 | + 3020 | +/* The page with hint page fault latency < threshold in ms is considered hot */ 3021 | +unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; 3022 | + 3023 | +struct numa_group { 3024 | + refcount_t refcount; 3025 | + 3026 | + spinlock_t lock; /* nr_tasks, tasks */ 3027 | + int nr_tasks; 3028 | + pid_t gid; 3029 | + int active_nodes; 3030 | + 3031 | + struct rcu_head rcu; 3032 | + unsigned long total_faults; 3033 | + unsigned long max_faults_cpu; 3034 | + /* 3035 | + * faults[] array is split into two regions: faults_mem and faults_cpu. 3036 | + * 3037 | + * Faults_cpu is used to decide whether memory should move 3038 | + * towards the CPU. As a consequence, these stats are weighted 3039 | + * more by CPU use than by memory faults. 3040 | + */ 3041 | + unsigned long faults[]; 3042 | +}; 3043 | + 3044 | +/* 3045 | + * For functions that can be called in multiple contexts that permit reading 3046 | + * ->numa_group (see struct task_struct for locking rules). 
3047 | + */ 3048 | +static struct numa_group *deref_task_numa_group(struct task_struct *p) 3049 | +{ 3050 | + return rcu_dereference_check(p->numa_group, p == current || 3051 | + (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu))); 3052 | +} 3053 | + 3054 | +static struct numa_group *deref_curr_numa_group(struct task_struct *p) 3055 | +{ 3056 | + return rcu_dereference_protected(p->numa_group, p == current); 3057 | +} 3058 | + 3059 | +static inline unsigned long group_faults_priv(struct numa_group *ng); 3060 | +static inline unsigned long group_faults_shared(struct numa_group *ng); 3061 | + 3062 | +static unsigned int task_nr_scan_windows(struct task_struct *p) 3063 | +{ 3064 | + unsigned long rss = 0; 3065 | + unsigned long nr_scan_pages; 3066 | + 3067 | + /* 3068 | + * Calculations based on RSS as non-present and empty pages are skipped 3069 | + * by the PTE scanner and NUMA hinting faults should be trapped based 3070 | + * on resident pages 3071 | + */ 3072 | + nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); 3073 | + rss = get_mm_rss(p->mm); 3074 | + if (!rss) 3075 | + rss = nr_scan_pages; 3076 | + 3077 | + rss = round_up(rss, nr_scan_pages); 3078 | + return rss / nr_scan_pages; 3079 | +} 3080 | + 3081 | +/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ 3082 | +#define MAX_SCAN_WINDOW 2560 3083 | + 3084 | +static unsigned int task_scan_min(struct task_struct *p) 3085 | +{ 3086 | + unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size); 3087 | + unsigned int scan, floor; 3088 | + unsigned int windows = 1; 3089 | + 3090 | + if (scan_size < MAX_SCAN_WINDOW) 3091 | + windows = MAX_SCAN_WINDOW / scan_size; 3092 | + floor = 1000 / windows; 3093 | + 3094 | + scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); 3095 | + return max_t(unsigned int, floor, scan); 3096 | +} 3097 | + 3098 | +static unsigned int task_scan_start(struct task_struct *p) 3099 | +{ 3100 | + unsigned long smin = task_scan_min(p); 3101 | + unsigned long period = smin; 3102 | + struct numa_group *ng; 3103 | + 3104 | + /* Scale the maximum scan period with the amount of shared memory. */ 3105 | + rcu_read_lock(); 3106 | + ng = rcu_dereference(p->numa_group); 3107 | + if (ng) { 3108 | + unsigned long shared = group_faults_shared(ng); 3109 | + unsigned long private = group_faults_priv(ng); 3110 | + 3111 | + period *= refcount_read(&ng->refcount); 3112 | + period *= shared + 1; 3113 | + period /= private + shared + 1; 3114 | + } 3115 | + rcu_read_unlock(); 3116 | + 3117 | + return max(smin, period); 3118 | +} 3119 | + 3120 | +static unsigned int task_scan_max(struct task_struct *p) 3121 | +{ 3122 | + unsigned long smin = task_scan_min(p); 3123 | + unsigned long smax; 3124 | + struct numa_group *ng; 3125 | + 3126 | + /* Watch for min being lower than max due to floor calculations */ 3127 | + smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); 3128 | + 3129 | + /* Scale the maximum scan period with the amount of shared memory. 
*/ 3130 | + ng = deref_curr_numa_group(p); 3131 | + if (ng) { 3132 | + unsigned long shared = group_faults_shared(ng); 3133 | + unsigned long private = group_faults_priv(ng); 3134 | + unsigned long period = smax; 3135 | + 3136 | + period *= refcount_read(&ng->refcount); 3137 | + period *= shared + 1; 3138 | + period /= private + shared + 1; 3139 | + 3140 | + smax = max(smax, period); 3141 | + } 3142 | + 3143 | + return max(smin, smax); 3144 | +} 3145 | + 3146 | +static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 3147 | +{ 3148 | + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); 3149 | + rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); 3150 | +} 3151 | + 3152 | +static void account_numa_dequeue(struct rq *rq, struct task_struct *p) 3153 | +{ 3154 | + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); 3155 | + rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); 3156 | +} 3157 | + 3158 | +/* Shared or private faults. */ 3159 | +#define NR_NUMA_HINT_FAULT_TYPES 2 3160 | + 3161 | +/* Memory and CPU locality */ 3162 | +#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) 3163 | + 3164 | +/* Averaged statistics, and temporary buffers. */ 3165 | +#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) 3166 | + 3167 | +pid_t task_numa_group_id(struct task_struct *p) 3168 | +{ 3169 | + struct numa_group *ng; 3170 | + pid_t gid = 0; 3171 | + 3172 | + rcu_read_lock(); 3173 | + ng = rcu_dereference(p->numa_group); 3174 | + if (ng) 3175 | + gid = ng->gid; 3176 | + rcu_read_unlock(); 3177 | + 3178 | + return gid; 3179 | +} 3180 | + 3181 | +/* 3182 | + * The averaged statistics, shared & private, memory & CPU, 3183 | + * occupy the first half of the array. The second half of the 3184 | + * array is for current counters, which are averaged into the 3185 | + * first set by task_numa_placement. 
3186 | + */ 3187 | +static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) 3188 | +{ 3189 | + return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; 3190 | +} 3191 | + 3192 | +static inline unsigned long task_faults(struct task_struct *p, int nid) 3193 | +{ 3194 | + if (!p->numa_faults) 3195 | + return 0; 3196 | + 3197 | + return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + 3198 | + p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; 3199 | +} 3200 | + 3201 | +static inline unsigned long group_faults(struct task_struct *p, int nid) 3202 | +{ 3203 | + struct numa_group *ng = deref_task_numa_group(p); 3204 | + 3205 | + if (!ng) 3206 | + return 0; 3207 | + 3208 | + return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] + 3209 | + ng->faults[task_faults_idx(NUMA_MEM, nid, 1)]; 3210 | +} 3211 | + 3212 | +static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) 3213 | +{ 3214 | + return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] + 3215 | + group->faults[task_faults_idx(NUMA_CPU, nid, 1)]; 3216 | +} 3217 | + 3218 | +static inline unsigned long group_faults_priv(struct numa_group *ng) 3219 | +{ 3220 | + unsigned long faults = 0; 3221 | + int node; 3222 | + 3223 | + for_each_online_node(node) { 3224 | + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; 3225 | + } 3226 | + 3227 | + return faults; 3228 | +} 3229 | + 3230 | +static inline unsigned long group_faults_shared(struct numa_group *ng) 3231 | +{ 3232 | + unsigned long faults = 0; 3233 | + int node; 3234 | + 3235 | + for_each_online_node(node) { 3236 | + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; 3237 | + } 3238 | + 3239 | + return faults; 3240 | +} 3241 | + 3242 | +/* 3243 | + * A node triggering more than 1/3 as many NUMA faults as the maximum is 3244 | + * considered part of a numa group's pseudo-interleaving set. Migrations 3245 | + * between these nodes are slowed down, to allow things to settle down. 3246 | + */ 3247 | +#define ACTIVE_NODE_FRACTION 3 3248 | + 3249 | +static bool numa_is_active_node(int nid, struct numa_group *ng) 3250 | +{ 3251 | + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; 3252 | +} 3253 | + 3254 | +/* Handle placement on systems where not all nodes are directly connected. */ 3255 | +static unsigned long score_nearby_nodes(struct task_struct *p, int nid, 3256 | + int lim_dist, bool task) 3257 | +{ 3258 | + unsigned long score = 0; 3259 | + int node, max_dist; 3260 | + 3261 | + /* 3262 | + * All nodes are directly connected, and the same distance 3263 | + * from each other. No need for fancy placement algorithms. 3264 | + */ 3265 | + if (sched_numa_topology_type == NUMA_DIRECT) 3266 | + return 0; 3267 | + 3268 | + /* sched_max_numa_distance may be changed in parallel. */ 3269 | + max_dist = READ_ONCE(sched_max_numa_distance); 3270 | + /* 3271 | + * This code is called for each node, introducing N^2 complexity, 3272 | + * which should be ok given the number of nodes rarely exceeds 8. 3273 | + */ 3274 | + for_each_online_node(node) { 3275 | + unsigned long faults; 3276 | + int dist = node_distance(nid, node); 3277 | + 3278 | + /* 3279 | + * The furthest away nodes in the system are not interesting 3280 | + * for placement; nid was already counted. 
3281 | + */ 3282 | + if (dist >= max_dist || node == nid) 3283 | + continue; 3284 | + 3285 | + /* 3286 | + * On systems with a backplane NUMA topology, compare groups 3287 | + * of nodes, and move tasks towards the group with the most 3288 | + * memory accesses. When comparing two nodes at distance 3289 | + * "hoplimit", only nodes closer by than "hoplimit" are part 3290 | + * of each group. Skip other nodes. 3291 | + */ 3292 | + if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) 3293 | + continue; 3294 | + 3295 | + /* Add up the faults from nearby nodes. */ 3296 | + if (task) 3297 | + faults = task_faults(p, node); 3298 | + else 3299 | + faults = group_faults(p, node); 3300 | + 3301 | + /* 3302 | + * On systems with a glueless mesh NUMA topology, there are 3303 | + * no fixed "groups of nodes". Instead, nodes that are not 3304 | + * directly connected bounce traffic through intermediate 3305 | + * nodes; a numa_group can occupy any set of nodes. 3306 | + * The further away a node is, the less the faults count. 3307 | + * This seems to result in good task placement. 3308 | + */ 3309 | + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { 3310 | + faults *= (max_dist - dist); 3311 | + faults /= (max_dist - LOCAL_DISTANCE); 3312 | + } 3313 | + 3314 | + score += faults; 3315 | + } 3316 | + 3317 | + return score; 3318 | +} 3319 | + 3320 | +/* 3321 | + * These return the fraction of accesses done by a particular task, or 3322 | + * task group, on a particular numa node. The group weight is given a 3323 | + * larger multiplier, in order to group tasks together that are almost 3324 | + * evenly spread out between numa nodes. 3325 | + */ 3326 | +static inline unsigned long task_weight(struct task_struct *p, int nid, 3327 | + int dist) 3328 | +{ 3329 | + unsigned long faults, total_faults; 3330 | + 3331 | + if (!p->numa_faults) 3332 | + return 0; 3333 | + 3334 | + total_faults = p->total_numa_faults; 3335 | + 3336 | + if (!total_faults) 3337 | + return 0; 3338 | + 3339 | + faults = task_faults(p, nid); 3340 | + faults += score_nearby_nodes(p, nid, dist, true); 3341 | + 3342 | + return 1000 * faults / total_faults; 3343 | +} 3344 | + 3345 | +static inline unsigned long group_weight(struct task_struct *p, int nid, 3346 | + int dist) 3347 | +{ 3348 | + struct numa_group *ng = deref_task_numa_group(p); 3349 | + unsigned long faults, total_faults; 3350 | + 3351 | + if (!ng) 3352 | + return 0; 3353 | + 3354 | + total_faults = ng->total_faults; 3355 | + 3356 | + if (!total_faults) 3357 | + return 0; 3358 | + 3359 | + faults = group_faults(p, nid); 3360 | + faults += score_nearby_nodes(p, nid, dist, false); 3361 | + 3362 | + return 1000 * faults / total_faults; 3363 | +} 3364 | + 3365 | +/* 3366 | + * If memory tiering mode is enabled, cpupid of slow memory page is 3367 | + * used to record scan time instead of CPU and PID. When tiering mode 3368 | + * is disabled at run time, the scan time (in cpupid) will be 3369 | + * interpreted as CPU and PID. So CPU needs to be checked to avoid to 3370 | + * access out of array bound. 
3371 | + */ 3372 | +static inline bool cpupid_valid(int cpupid) 3373 | +{ 3374 | + return cpupid_to_cpu(cpupid) < nr_cpu_ids; 3375 | +} 3376 | + 3377 | +/* 3378 | + * For memory tiering mode, if there are enough free pages (more than 3379 | + * enough watermark defined here) in fast memory node, to take full 3380 | + * advantage of fast memory capacity, all recently accessed slow 3381 | + * memory pages will be migrated to fast memory node without 3382 | + * considering hot threshold. 3383 | + */ 3384 | +static bool pgdat_free_space_enough(struct pglist_data *pgdat) 3385 | +{ 3386 | + int z; 3387 | + unsigned long enough_wmark; 3388 | + 3389 | + enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, 3390 | + pgdat->node_present_pages >> 4); 3391 | + for (z = pgdat->nr_zones - 1; z >= 0; z--) { 3392 | + struct zone *zone = pgdat->node_zones + z; 3393 | + 3394 | + if (!populated_zone(zone)) 3395 | + continue; 3396 | + 3397 | + if (zone_watermark_ok(zone, 0, 3398 | + wmark_pages(zone, WMARK_PROMO) + enough_wmark, 3399 | + ZONE_MOVABLE, 0)) 3400 | + return true; 3401 | + } 3402 | + return false; 3403 | +} 3404 | + 3405 | +/* 3406 | + * For memory tiering mode, when page tables are scanned, the scan 3407 | + * time will be recorded in struct page in addition to make page 3408 | + * PROT_NONE for slow memory page. So when the page is accessed, in 3409 | + * hint page fault handler, the hint page fault latency is calculated 3410 | + * via, 3411 | + * 3412 | + * hint page fault latency = hint page fault time - scan time 3413 | + * 3414 | + * The smaller the hint page fault latency, the higher the possibility 3415 | + * for the page to be hot. 3416 | + */ 3417 | +static int numa_hint_fault_latency(struct folio *folio) 3418 | +{ 3419 | + int last_time, time; 3420 | + 3421 | + time = jiffies_to_msecs(jiffies); 3422 | + last_time = folio_xchg_access_time(folio, time); 3423 | + 3424 | + return (time - last_time) & PAGE_ACCESS_TIME_MASK; 3425 | +} 3426 | + 3427 | +/* 3428 | + * For memory tiering mode, too high promotion/demotion throughput may 3429 | + * hurt application latency. So we provide a mechanism to rate limit 3430 | + * the number of pages that are tried to be promoted. 
3431 | + */ 3432 | +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, 3433 | + unsigned long rate_limit, int nr) 3434 | +{ 3435 | + unsigned long nr_cand; 3436 | + unsigned int now, start; 3437 | + 3438 | + now = jiffies_to_msecs(jiffies); 3439 | + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); 3440 | + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); 3441 | + start = pgdat->nbp_rl_start; 3442 | + if (now - start > MSEC_PER_SEC && 3443 | + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) 3444 | + pgdat->nbp_rl_nr_cand = nr_cand; 3445 | + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) 3446 | + return true; 3447 | + return false; 3448 | +} 3449 | + 3450 | +#define NUMA_MIGRATION_ADJUST_STEPS 16 3451 | + 3452 | +static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, 3453 | + unsigned long rate_limit, 3454 | + unsigned int ref_th) 3455 | +{ 3456 | + unsigned int now, start, th_period, unit_th, th; 3457 | + unsigned long nr_cand, ref_cand, diff_cand; 3458 | + 3459 | + now = jiffies_to_msecs(jiffies); 3460 | + th_period = sysctl_numa_balancing_scan_period_max; 3461 | + start = pgdat->nbp_th_start; 3462 | + if (now - start > th_period && 3463 | + cmpxchg(&pgdat->nbp_th_start, start, now) == start) { 3464 | + ref_cand = rate_limit * 3465 | + sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC; 3466 | + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); 3467 | + diff_cand = nr_cand - pgdat->nbp_th_nr_cand; 3468 | + unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS; 3469 | + th = pgdat->nbp_threshold ? : ref_th; 3470 | + if (diff_cand > ref_cand * 11 / 10) 3471 | + th = max(th - unit_th, unit_th); 3472 | + else if (diff_cand < ref_cand * 9 / 10) 3473 | + th = min(th + unit_th, ref_th * 2); 3474 | + pgdat->nbp_th_nr_cand = nr_cand; 3475 | + pgdat->nbp_threshold = th; 3476 | + } 3477 | +} 3478 | + 3479 | +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */ 3480 | +static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; 3481 | + 3482 | +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, 3483 | + int src_nid, int dst_cpu) 3484 | +{ 3485 | + struct numa_group *ng = deref_curr_numa_group(p); 3486 | + int dst_nid = cpu_to_node(dst_cpu); 3487 | + int last_cpupid, this_cpupid; 3488 | + 3489 | + /* 3490 | + * The pages in slow memory node should be migrated according 3491 | + * to hot/cold instead of private/shared. 3492 | + */ 3493 | + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && 3494 | + !node_is_toptier(src_nid)) { 3495 | + struct pglist_data *pgdat; 3496 | + unsigned long rate_limit; 3497 | + unsigned int latency, th, def_th; 3498 | + 3499 | + pgdat = NODE_DATA(dst_nid); 3500 | + if (pgdat_free_space_enough(pgdat)) { 3501 | + /* workload changed, reset hot threshold */ 3502 | + pgdat->nbp_threshold = 0; 3503 | + return true; 3504 | + } 3505 | + 3506 | + def_th = sysctl_numa_balancing_hot_threshold; 3507 | + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ 3508 | + (20 - PAGE_SHIFT); 3509 | + numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); 3510 | + 3511 | + th = pgdat->nbp_threshold ? 
: def_th; 3512 | + latency = numa_hint_fault_latency(folio); 3513 | + if (latency >= th) 3514 | + return false; 3515 | + 3516 | + return !numa_promotion_rate_limit(pgdat, rate_limit, 3517 | + folio_nr_pages(folio)); 3518 | + } 3519 | + 3520 | + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); 3521 | + last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid); 3522 | + 3523 | + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && 3524 | + !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) 3525 | + return false; 3526 | + 3527 | + /* 3528 | + * Allow first faults or private faults to migrate immediately early in 3529 | + * the lifetime of a task. The magic number 4 is based on waiting for 3530 | + * two full passes of the "multi-stage node selection" test that is 3531 | + * executed below. 3532 | + */ 3533 | + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && 3534 | + (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) 3535 | + return true; 3536 | + 3537 | + /* 3538 | + * Multi-stage node selection is used in conjunction with a periodic 3539 | + * migration fault to build a temporal task<->page relation. By using 3540 | + * a two-stage filter we remove short/unlikely relations. 3541 | + * 3542 | + * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate 3543 | + * a task's usage of a particular page (n_p) per total usage of this 3544 | + * page (n_t) (in a given time-span) to a probability. 3545 | + * 3546 | + * Our periodic faults will sample this probability and getting the 3547 | + * same result twice in a row, given these samples are fully 3548 | + * independent, is then given by P(n)^2, provided our sample period 3549 | + * is sufficiently short compared to the usage pattern. 3550 | + * 3551 | + * This quadric squishes small probabilities, making it less likely we 3552 | + * act on an unlikely task<->page relation. 3553 | + */ 3554 | + if (!cpupid_pid_unset(last_cpupid) && 3555 | + cpupid_to_nid(last_cpupid) != dst_nid) 3556 | + return false; 3557 | + 3558 | + /* Always allow migrate on private faults */ 3559 | + if (cpupid_match_pid(p, last_cpupid)) 3560 | + return true; 3561 | + 3562 | + /* A shared fault, but p->numa_group has not been set up yet. */ 3563 | + if (!ng) 3564 | + return true; 3565 | + 3566 | + /* 3567 | + * Destination node is much more heavily used than the source 3568 | + * node? Allow migration. 3569 | + */ 3570 | + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * 3571 | + ACTIVE_NODE_FRACTION) 3572 | + return true; 3573 | + 3574 | + /* 3575 | + * Distribute memory according to CPU & memory use on each node, 3576 | + * with 3/4 hysteresis to avoid unnecessary memory migrations: 3577 | + * 3578 | + * faults_cpu(dst) 3 faults_cpu(src) 3579 | + * --------------- * - > --------------- 3580 | + * faults_mem(dst) 4 faults_mem(src) 3581 | + */ 3582 | + return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > 3583 | + group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; 3584 | +} 3585 | + 3586 | +/* 3587 | + * 'numa_type' describes the node at the moment of load balancing. 3588 | + */ 3589 | +enum numa_type { 3590 | + /* The node has spare capacity that can be used to run more tasks. */ 3591 | + node_has_spare = 0, 3592 | + /* 3593 | + * The node is fully used and the tasks don't compete for more CPU 3594 | + * cycles. Nevertheless, some tasks might wait before running. 
3595 | + */ 3596 | + node_fully_busy, 3597 | + /* 3598 | + * The node is overloaded and can't provide expected CPU cycles to all 3599 | + * tasks. 3600 | + */ 3601 | + node_overloaded 3602 | +}; 3603 | + 3604 | +/* Cached statistics for all CPUs within a node */ 3605 | +struct numa_stats { 3606 | + unsigned long load; 3607 | + unsigned long runnable; 3608 | + unsigned long util; 3609 | + /* Total compute capacity of CPUs on a node */ 3610 | + unsigned long compute_capacity; 3611 | + unsigned int nr_running; 3612 | + unsigned int weight; 3613 | + enum numa_type node_type; 3614 | + int idle_cpu; 3615 | +}; 3616 | + 3617 | +struct task_numa_env { 3618 | + struct task_struct *p; 3619 | + 3620 | + int src_cpu, src_nid; 3621 | + int dst_cpu, dst_nid; 3622 | + int imb_numa_nr; 3623 | + 3624 | + struct numa_stats src_stats, dst_stats; 3625 | + 3626 | + int imbalance_pct; 3627 | + int dist; 3628 | + 3629 | + struct task_struct *best_task; 3630 | + long best_imp; 3631 | + int best_cpu; 3632 | +}; 3633 | + 3634 | +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) 3635 | +{ 3636 | + return cfs_rq->avg.load_avg; 3637 | +} 3638 | + 3639 | +static unsigned long cpu_load(struct rq *rq) 3640 | +{ 3641 | + return cfs_rq_load_avg(&rq->cfs); 3642 | +} 3643 | + 3644 | +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) 3645 | +{ 3646 | + return cfs_rq->avg.runnable_avg; 3647 | +} 3648 | + 3649 | +static unsigned long cpu_runnable(struct rq *rq) 3650 | +{ 3651 | + return cfs_rq_runnable_avg(&rq->cfs); 3652 | +} 3653 | + 3654 | +static inline enum 3655 | +numa_type numa_classify(unsigned int imbalance_pct, 3656 | + struct numa_stats *ns) 3657 | +{ 3658 | + if ((ns->nr_running > ns->weight) && 3659 | + (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || 3660 | + ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) 3661 | + return node_overloaded; 3662 | + 3663 | + if ((ns->nr_running < ns->weight) || 3664 | + (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) && 3665 | + ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100)))) 3666 | + return node_has_spare; 3667 | + 3668 | + return node_fully_busy; 3669 | +} 3670 | + 3671 | +#ifdef CONFIG_SCHED_SMT 3672 | +/* Forward declarations of select_idle_sibling helpers */ 3673 | +static inline bool test_idle_cores(int cpu); 3674 | +static inline int numa_idle_core(int idle_core, int cpu) 3675 | +{ 3676 | + if (!static_branch_likely(&sched_smt_present) || 3677 | + idle_core >= 0 || !test_idle_cores(cpu)) 3678 | + return idle_core; 3679 | + 3680 | + /* 3681 | + * Prefer cores instead of packing HT siblings 3682 | + * and triggering future load balancing. 3683 | + */ 3684 | + if (is_core_idle(cpu)) 3685 | + idle_core = cpu; 3686 | + 3687 | + return idle_core; 3688 | +} 3689 | +#else 3690 | +static inline int numa_idle_core(int idle_core, int cpu) 3691 | +{ 3692 | + return idle_core; 3693 | +} 3694 | +#endif 3695 | + 3696 | +/* 3697 | + * Gather all necessary information to make NUMA balancing placement 3698 | + * decisions that are compatible with standard load balancer. This 3699 | + * borrows code and logic from update_sg_lb_stats but sharing a 3700 | + * common implementation is impractical. 
3701 | + */ 3702 | +static void update_numa_stats(struct task_numa_env *env, 3703 | + struct numa_stats *ns, int nid, 3704 | + bool find_idle) 3705 | +{ 3706 | + int cpu, idle_core = -1; 3707 | + 3708 | + memset(ns, 0, sizeof(*ns)); 3709 | + ns->idle_cpu = -1; 3710 | + 3711 | + rcu_read_lock(); 3712 | + for_each_cpu(cpu, cpumask_of_node(nid)) { 3713 | + struct rq *rq = cpu_rq(cpu); 3714 | + 3715 | + ns->load += cpu_load(rq); 3716 | + ns->runnable += cpu_runnable(rq); 3717 | + ns->util += cpu_util_cfs(cpu); 3718 | + ns->nr_running += rq->cfs.h_nr_running; 3719 | + ns->compute_capacity += capacity_of(cpu); 3720 | + 3721 | + if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { 3722 | + if (READ_ONCE(rq->numa_migrate_on) || 3723 | + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) 3724 | + continue; 3725 | + 3726 | + if (ns->idle_cpu == -1) 3727 | + ns->idle_cpu = cpu; 3728 | + 3729 | + idle_core = numa_idle_core(idle_core, cpu); 3730 | + } 3731 | + } 3732 | + rcu_read_unlock(); 3733 | + 3734 | + ns->weight = cpumask_weight(cpumask_of_node(nid)); 3735 | + 3736 | + ns->node_type = numa_classify(env->imbalance_pct, ns); 3737 | + 3738 | + if (idle_core >= 0) 3739 | + ns->idle_cpu = idle_core; 3740 | +} 3741 | + 3742 | +static void task_numa_assign(struct task_numa_env *env, 3743 | + struct task_struct *p, long imp) 3744 | +{ 3745 | + struct rq *rq = cpu_rq(env->dst_cpu); 3746 | + 3747 | + /* Check if run-queue part of active NUMA balance. */ 3748 | + if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { 3749 | + int cpu; 3750 | + int start = env->dst_cpu; 3751 | + 3752 | + /* Find alternative idle CPU. */ 3753 | + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) { 3754 | + if (cpu == env->best_cpu || !idle_cpu(cpu) || 3755 | + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { 3756 | + continue; 3757 | + } 3758 | + 3759 | + env->dst_cpu = cpu; 3760 | + rq = cpu_rq(env->dst_cpu); 3761 | + if (!xchg(&rq->numa_migrate_on, 1)) 3762 | + goto assign; 3763 | + } 3764 | + 3765 | + /* Failed to find an alternative idle CPU */ 3766 | + return; 3767 | + } 3768 | + 3769 | +assign: 3770 | + /* 3771 | + * Clear previous best_cpu/rq numa-migrate flag, since task now 3772 | + * found a better CPU to move/swap. 3773 | + */ 3774 | + if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) { 3775 | + rq = cpu_rq(env->best_cpu); 3776 | + WRITE_ONCE(rq->numa_migrate_on, 0); 3777 | + } 3778 | + 3779 | + if (env->best_task) 3780 | + put_task_struct(env->best_task); 3781 | + if (p) 3782 | + get_task_struct(p); 3783 | + 3784 | + env->best_task = p; 3785 | + env->best_imp = imp; 3786 | + env->best_cpu = env->dst_cpu; 3787 | +} 3788 | + 3789 | +static bool load_too_imbalanced(long src_load, long dst_load, 3790 | + struct task_numa_env *env) 3791 | +{ 3792 | + long imb, old_imb; 3793 | + long orig_src_load, orig_dst_load; 3794 | + long src_capacity, dst_capacity; 3795 | + 3796 | + /* 3797 | + * The load is corrected for the CPU capacity available on each node. 
3798 | + * 3799 | + * src_load dst_load 3800 | + * ------------ vs --------- 3801 | + * src_capacity dst_capacity 3802 | + */ 3803 | + src_capacity = env->src_stats.compute_capacity; 3804 | + dst_capacity = env->dst_stats.compute_capacity; 3805 | + 3806 | + imb = abs(dst_load * src_capacity - src_load * dst_capacity); 3807 | + 3808 | + orig_src_load = env->src_stats.load; 3809 | + orig_dst_load = env->dst_stats.load; 3810 | + 3811 | + old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity); 3812 | + 3813 | + /* Would this change make things worse? */ 3814 | + return (imb > old_imb); 3815 | +} 3816 | + 3817 | +/* 3818 | + * Maximum NUMA importance can be 1998 (2*999); 3819 | + * SMALLIMP @ 30 would be close to 1998/64. 3820 | + * Used to deter task migration. 3821 | + */ 3822 | +#define SMALLIMP 30 3823 | + 3824 | +/* 3825 | + * This checks if the overall compute and NUMA accesses of the system would 3826 | + * be improved if the source tasks was migrated to the target dst_cpu taking 3827 | + * into account that it might be best if task running on the dst_cpu should 3828 | + * be exchanged with the source task 3829 | + */ 3830 | +static bool task_numa_compare(struct task_numa_env *env, 3831 | + long taskimp, long groupimp, bool maymove) 3832 | +{ 3833 | + struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); 3834 | + struct rq *dst_rq = cpu_rq(env->dst_cpu); 3835 | + long imp = p_ng ? groupimp : taskimp; 3836 | + struct task_struct *cur; 3837 | + long src_load, dst_load; 3838 | + int dist = env->dist; 3839 | + long moveimp = imp; 3840 | + long load; 3841 | + bool stopsearch = false; 3842 | + 3843 | + if (READ_ONCE(dst_rq->numa_migrate_on)) 3844 | + return false; 3845 | + 3846 | + rcu_read_lock(); 3847 | + cur = rcu_dereference(dst_rq->curr); 3848 | + if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) 3849 | + cur = NULL; 3850 | + 3851 | + /* 3852 | + * Because we have preemption enabled we can get migrated around and 3853 | + * end try selecting ourselves (current == env->p) as a swap candidate. 3854 | + */ 3855 | + if (cur == env->p) { 3856 | + stopsearch = true; 3857 | + goto unlock; 3858 | + } 3859 | + 3860 | + if (!cur) { 3861 | + if (maymove && moveimp >= env->best_imp) 3862 | + goto assign; 3863 | + else 3864 | + goto unlock; 3865 | + } 3866 | + 3867 | + /* Skip this swap candidate if cannot move to the source cpu. */ 3868 | + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) 3869 | + goto unlock; 3870 | + 3871 | + /* 3872 | + * Skip this swap candidate if it is not moving to its preferred 3873 | + * node and the best task is. 3874 | + */ 3875 | + if (env->best_task && 3876 | + env->best_task->numa_preferred_nid == env->src_nid && 3877 | + cur->numa_preferred_nid != env->src_nid) { 3878 | + goto unlock; 3879 | + } 3880 | + 3881 | + /* 3882 | + * "imp" is the fault differential for the source task between the 3883 | + * source and destination node. Calculate the total differential for 3884 | + * the source task and potential destination task. The more negative 3885 | + * the value is, the more remote accesses that would be expected to 3886 | + * be incurred if the tasks were swapped. 3887 | + * 3888 | + * If dst and source tasks are in the same NUMA group, or not 3889 | + * in any group then look only at task weights. 3890 | + */ 3891 | + cur_ng = rcu_dereference(cur->numa_group); 3892 | + if (cur_ng == p_ng) { 3893 | + /* 3894 | + * Do not swap within a group or between tasks that have 3895 | + * no group if there is spare capacity. 
Swapping does 3896 | + * not address the load imbalance and helps one task at 3897 | + * the cost of punishing another. 3898 | + */ 3899 | + if (env->dst_stats.node_type == node_has_spare) 3900 | + goto unlock; 3901 | + 3902 | + imp = taskimp + task_weight(cur, env->src_nid, dist) - 3903 | + task_weight(cur, env->dst_nid, dist); 3904 | + /* 3905 | + * Add some hysteresis to prevent swapping the 3906 | + * tasks within a group over tiny differences. 3907 | + */ 3908 | + if (cur_ng) 3909 | + imp -= imp / 16; 3910 | + } else { 3911 | + /* 3912 | + * Compare the group weights. If a task is all by itself 3913 | + * (not part of a group), use the task weight instead. 3914 | + */ 3915 | + if (cur_ng && p_ng) 3916 | + imp += group_weight(cur, env->src_nid, dist) - 3917 | + group_weight(cur, env->dst_nid, dist); 3918 | + else 3919 | + imp += task_weight(cur, env->src_nid, dist) - 3920 | + task_weight(cur, env->dst_nid, dist); 3921 | + } 3922 | + 3923 | + /* Discourage picking a task already on its preferred node */ 3924 | + if (cur->numa_preferred_nid == env->dst_nid) 3925 | + imp -= imp / 16; 3926 | + 3927 | + /* 3928 | + * Encourage picking a task that moves to its preferred node. 3929 | + * This potentially makes imp larger than it's maximum of 3930 | + * 1998 (see SMALLIMP and task_weight for why) but in this 3931 | + * case, it does not matter. 3932 | + */ 3933 | + if (cur->numa_preferred_nid == env->src_nid) 3934 | + imp += imp / 8; 3935 | + 3936 | + if (maymove && moveimp > imp && moveimp > env->best_imp) { 3937 | + imp = moveimp; 3938 | + cur = NULL; 3939 | + goto assign; 3940 | + } 3941 | + 3942 | + /* 3943 | + * Prefer swapping with a task moving to its preferred node over a 3944 | + * task that is not. 3945 | + */ 3946 | + if (env->best_task && cur->numa_preferred_nid == env->src_nid && 3947 | + env->best_task->numa_preferred_nid != env->src_nid) { 3948 | + goto assign; 3949 | + } 3950 | + 3951 | + /* 3952 | + * If the NUMA importance is less than SMALLIMP, 3953 | + * task migration might only result in ping pong 3954 | + * of tasks and also hurt performance due to cache 3955 | + * misses. 3956 | + */ 3957 | + if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) 3958 | + goto unlock; 3959 | + 3960 | + /* 3961 | + * In the overloaded case, try and keep the load balanced. 3962 | + */ 3963 | + load = task_h_load(env->p) - task_h_load(cur); 3964 | + if (!load) 3965 | + goto assign; 3966 | + 3967 | + dst_load = env->dst_stats.load + load; 3968 | + src_load = env->src_stats.load - load; 3969 | + 3970 | + if (load_too_imbalanced(src_load, dst_load, env)) 3971 | + goto unlock; 3972 | + 3973 | +assign: 3974 | + /* Evaluate an idle CPU for a task numa move. */ 3975 | + if (!cur) { 3976 | + int cpu = env->dst_stats.idle_cpu; 3977 | + 3978 | + /* Nothing cached so current CPU went idle since the search. */ 3979 | + if (cpu < 0) 3980 | + cpu = env->dst_cpu; 3981 | + 3982 | + /* 3983 | + * If the CPU is no longer truly idle and the previous best CPU 3984 | + * is, keep using it. 3985 | + */ 3986 | + if (!idle_cpu(cpu) && env->best_cpu >= 0 && 3987 | + idle_cpu(env->best_cpu)) { 3988 | + cpu = env->best_cpu; 3989 | + } 3990 | + 3991 | + env->dst_cpu = cpu; 3992 | + } 3993 | + 3994 | + task_numa_assign(env, cur, imp); 3995 | + 3996 | + /* 3997 | + * If a move to idle is allowed because there is capacity or load 3998 | + * balance improves then stop the search. While a better swap 3999 | + * candidate may exist, a search is not free. 
4000 | + */ 4001 | + if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu)) 4002 | + stopsearch = true; 4003 | + 4004 | + /* 4005 | + * If a swap candidate must be identified and the current best task 4006 | + * moves its preferred node then stop the search. 4007 | + */ 4008 | + if (!maymove && env->best_task && 4009 | + env->best_task->numa_preferred_nid == env->src_nid) { 4010 | + stopsearch = true; 4011 | + } 4012 | +unlock: 4013 | + rcu_read_unlock(); 4014 | + 4015 | + return stopsearch; 4016 | +} 4017 | + 4018 | +#define NUMA_IMBALANCE_MIN 2 4019 | + 4020 | +static inline long 4021 | +adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr) 4022 | +{ 4023 | + /* 4024 | + * Allow a NUMA imbalance if busy CPUs is less than the maximum 4025 | + * threshold. Above this threshold, individual tasks may be contending 4026 | + * for both memory bandwidth and any shared HT resources. This is an 4027 | + * approximation as the number of running tasks may not be related to 4028 | + * the number of busy CPUs due to sched_setaffinity. 4029 | + */ 4030 | + if (dst_running > imb_numa_nr) 4031 | + return imbalance; 4032 | + 4033 | + /* 4034 | + * Allow a small imbalance based on a simple pair of communicating 4035 | + * tasks that remain local when the destination is lightly loaded. 4036 | + */ 4037 | + if (imbalance <= NUMA_IMBALANCE_MIN) 4038 | + return 0; 4039 | + 4040 | + return imbalance; 4041 | +} 4042 | + 4043 | +static void task_numa_find_cpu(struct task_numa_env *env, 4044 | + long taskimp, long groupimp) 4045 | +{ 4046 | + bool maymove = false; 4047 | + int cpu; 4048 | + 4049 | + /* 4050 | + * If dst node has spare capacity, then check if there is an 4051 | + * imbalance that would be overruled by the load balancer. 4052 | + */ 4053 | + if (env->dst_stats.node_type == node_has_spare) { 4054 | + unsigned int imbalance; 4055 | + int src_running, dst_running; 4056 | + 4057 | + /* 4058 | + * Would movement cause an imbalance? Note that if src has 4059 | + * more running tasks that the imbalance is ignored as the 4060 | + * move improves the imbalance from the perspective of the 4061 | + * CPU load balancer. 4062 | + * */ 4063 | + src_running = env->src_stats.nr_running - 1; 4064 | + dst_running = env->dst_stats.nr_running + 1; 4065 | + imbalance = max(0, dst_running - src_running); 4066 | + imbalance = adjust_numa_imbalance(imbalance, dst_running, 4067 | + env->imb_numa_nr); 4068 | + 4069 | + /* Use idle CPU if there is no imbalance */ 4070 | + if (!imbalance) { 4071 | + maymove = true; 4072 | + if (env->dst_stats.idle_cpu >= 0) { 4073 | + env->dst_cpu = env->dst_stats.idle_cpu; 4074 | + task_numa_assign(env, NULL, 0); 4075 | + return; 4076 | + } 4077 | + } 4078 | + } else { 4079 | + long src_load, dst_load, load; 4080 | + /* 4081 | + * If the improvement from just moving env->p direction is better 4082 | + * than swapping tasks around, check if a move is possible. 
4083 | + */ 4084 | + load = task_h_load(env->p); 4085 | + dst_load = env->dst_stats.load + load; 4086 | + src_load = env->src_stats.load - load; 4087 | + maymove = !load_too_imbalanced(src_load, dst_load, env); 4088 | + } 4089 | + 4090 | + for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 4091 | + /* Skip this CPU if the source task cannot migrate */ 4092 | + if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) 4093 | + continue; 4094 | + 4095 | + env->dst_cpu = cpu; 4096 | + if (task_numa_compare(env, taskimp, groupimp, maymove)) 4097 | + break; 4098 | + } 4099 | +} 4100 | + 4101 | +static int task_numa_migrate(struct task_struct *p) 4102 | +{ 4103 | + struct task_numa_env env = { 4104 | + .p = p, 4105 | + 4106 | + .src_cpu = task_cpu(p), 4107 | + .src_nid = task_node(p), 4108 | + 4109 | + .imbalance_pct = 112, 4110 | + 4111 | + .best_task = NULL, 4112 | + .best_imp = 0, 4113 | + .best_cpu = -1, 4114 | + }; 4115 | + unsigned long taskweight, groupweight; 4116 | + struct sched_domain *sd; 4117 | + long taskimp, groupimp; 4118 | + struct numa_group *ng; 4119 | + struct rq *best_rq; 4120 | + int nid, ret, dist; 4121 | + 4122 | + /* 4123 | + * Pick the lowest SD_NUMA domain, as that would have the smallest 4124 | + * imbalance and would be the first to start moving tasks about. 4125 | + * 4126 | + * And we want to avoid any moving of tasks about, as that would create 4127 | + * random movement of tasks -- counter the numa conditions we're trying 4128 | + * to satisfy here. 4129 | + */ 4130 | + rcu_read_lock(); 4131 | + sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); 4132 | + if (sd) { 4133 | + env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; 4134 | + env.imb_numa_nr = sd->imb_numa_nr; 4135 | + } 4136 | + rcu_read_unlock(); 4137 | + 4138 | + /* 4139 | + * Cpusets can break the scheduler domain tree into smaller 4140 | + * balance domains, some of which do not cross NUMA boundaries. 4141 | + * Tasks that are "trapped" in such domains cannot be migrated 4142 | + * elsewhere, so there is no point in (re)trying. 4143 | + */ 4144 | + if (unlikely(!sd)) { 4145 | + sched_setnuma(p, task_node(p)); 4146 | + return -EINVAL; 4147 | + } 4148 | + 4149 | + env.dst_nid = p->numa_preferred_nid; 4150 | + dist = env.dist = node_distance(env.src_nid, env.dst_nid); 4151 | + taskweight = task_weight(p, env.src_nid, dist); 4152 | + groupweight = group_weight(p, env.src_nid, dist); 4153 | + update_numa_stats(&env, &env.src_stats, env.src_nid, false); 4154 | + taskimp = task_weight(p, env.dst_nid, dist) - taskweight; 4155 | + groupimp = group_weight(p, env.dst_nid, dist) - groupweight; 4156 | + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); 4157 | + 4158 | + /* Try to find a spot on the preferred nid. */ 4159 | + task_numa_find_cpu(&env, taskimp, groupimp); 4160 | + 4161 | + /* 4162 | + * Look at other nodes in these cases: 4163 | + * - there is no space available on the preferred_nid 4164 | + * - the task is part of a numa_group that is interleaved across 4165 | + * multiple NUMA nodes; in order to better consolidate the group, 4166 | + * we need to check other locations. 
4167 | + */ 4168 | + ng = deref_curr_numa_group(p); 4169 | + if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { 4170 | + for_each_node_state(nid, N_CPU) { 4171 | + if (nid == env.src_nid || nid == p->numa_preferred_nid) 4172 | + continue; 4173 | + 4174 | + dist = node_distance(env.src_nid, env.dst_nid); 4175 | + if (sched_numa_topology_type == NUMA_BACKPLANE && 4176 | + dist != env.dist) { 4177 | + taskweight = task_weight(p, env.src_nid, dist); 4178 | + groupweight = group_weight(p, env.src_nid, dist); 4179 | + } 4180 | + 4181 | + /* Only consider nodes where both task and groups benefit */ 4182 | + taskimp = task_weight(p, nid, dist) - taskweight; 4183 | + groupimp = group_weight(p, nid, dist) - groupweight; 4184 | + if (taskimp < 0 && groupimp < 0) 4185 | + continue; 4186 | + 4187 | + env.dist = dist; 4188 | + env.dst_nid = nid; 4189 | + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); 4190 | + task_numa_find_cpu(&env, taskimp, groupimp); 4191 | + } 4192 | + } 4193 | + 4194 | + /* 4195 | + * If the task is part of a workload that spans multiple NUMA nodes, 4196 | + * and is migrating into one of the workload's active nodes, remember 4197 | + * this node as the task's preferred numa node, so the workload can 4198 | + * settle down. 4199 | + * A task that migrated to a second choice node will be better off 4200 | + * trying for a better one later. Do not set the preferred node here. 4201 | + */ 4202 | + if (ng) { 4203 | + if (env.best_cpu == -1) 4204 | + nid = env.src_nid; 4205 | + else 4206 | + nid = cpu_to_node(env.best_cpu); 4207 | + 4208 | + if (nid != p->numa_preferred_nid) 4209 | + sched_setnuma(p, nid); 4210 | + } 4211 | + 4212 | + /* No better CPU than the current one was found. */ 4213 | + if (env.best_cpu == -1) { 4214 | + trace_sched_stick_numa(p, env.src_cpu, NULL, -1); 4215 | + return -EAGAIN; 4216 | + } 4217 | + 4218 | + best_rq = cpu_rq(env.best_cpu); 4219 | + if (env.best_task == NULL) { 4220 | + ret = migrate_task_to(p, env.best_cpu); 4221 | + WRITE_ONCE(best_rq->numa_migrate_on, 0); 4222 | + if (ret != 0) 4223 | + trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu); 4224 | + return ret; 4225 | + } 4226 | + 4227 | + ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); 4228 | + WRITE_ONCE(best_rq->numa_migrate_on, 0); 4229 | + 4230 | + if (ret != 0) 4231 | + trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu); 4232 | + put_task_struct(env.best_task); 4233 | + return ret; 4234 | +} 4235 | + 4236 | +/* Attempt to migrate a task to a CPU on the preferred node. */ 4237 | +static void numa_migrate_preferred(struct task_struct *p) 4238 | +{ 4239 | + unsigned long interval = HZ; 4240 | + 4241 | + /* This task has no NUMA fault statistics yet */ 4242 | + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) 4243 | + return; 4244 | + 4245 | + /* Periodically retry migrating the task to the preferred node */ 4246 | + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); 4247 | + p->numa_migrate_retry = jiffies + interval; 4248 | + 4249 | + /* Success if task is already running on preferred CPU */ 4250 | + if (task_node(p) == p->numa_preferred_nid) 4251 | + return; 4252 | + 4253 | + /* Otherwise, try migrate to a CPU on the preferred node */ 4254 | + task_numa_migrate(p); 4255 | +} 4256 | + 4257 | +/* 4258 | + * Find out how many nodes the workload is actively running on. Do this by 4259 | + * tracking the nodes from which NUMA hinting faults are triggered. 
This can 4260 | + * be different from the set of nodes where the workload's memory is currently 4261 | + * located. 4262 | + */ 4263 | +static void numa_group_count_active_nodes(struct numa_group *numa_group) 4264 | +{ 4265 | + unsigned long faults, max_faults = 0; 4266 | + int nid, active_nodes = 0; 4267 | + 4268 | + for_each_node_state(nid, N_CPU) { 4269 | + faults = group_faults_cpu(numa_group, nid); 4270 | + if (faults > max_faults) 4271 | + max_faults = faults; 4272 | + } 4273 | + 4274 | + for_each_node_state(nid, N_CPU) { 4275 | + faults = group_faults_cpu(numa_group, nid); 4276 | + if (faults * ACTIVE_NODE_FRACTION > max_faults) 4277 | + active_nodes++; 4278 | + } 4279 | + 4280 | + numa_group->max_faults_cpu = max_faults; 4281 | + numa_group->active_nodes = active_nodes; 4282 | +} 4283 | + 4284 | +/* 4285 | + * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 4286 | + * increments. The more local the fault statistics are, the higher the scan 4287 | + * period will be for the next scan window. If local/(local+remote) ratio is 4288 | + * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) 4289 | + * the scan period will decrease. Aim for 70% local accesses. 4290 | + */ 4291 | +#define NUMA_PERIOD_SLOTS 10 4292 | +#define NUMA_PERIOD_THRESHOLD 7 4293 | + 4294 | +/* 4295 | + * Increase the scan period (slow down scanning) if the majority of 4296 | + * our memory is already on our local node, or if the majority of 4297 | + * the page accesses are shared with other processes. 4298 | + * Otherwise, decrease the scan period. 4299 | + */ 4300 | +static void update_task_scan_period(struct task_struct *p, 4301 | + unsigned long shared, unsigned long private) 4302 | +{ 4303 | + unsigned int period_slot; 4304 | + int lr_ratio, ps_ratio; 4305 | + int diff; 4306 | + 4307 | + unsigned long remote = p->numa_faults_locality[0]; 4308 | + unsigned long local = p->numa_faults_locality[1]; 4309 | + 4310 | + /* 4311 | + * If there were no record hinting faults then either the task is 4312 | + * completely idle or all activity is in areas that are not of interest 4313 | + * to automatic numa balancing. Related to that, if there were failed 4314 | + * migration then it implies we are migrating too quickly or the local 4315 | + * node is overloaded. In either case, scan slower 4316 | + */ 4317 | + if (local + shared == 0 || p->numa_faults_locality[2]) { 4318 | + p->numa_scan_period = min(p->numa_scan_period_max, 4319 | + p->numa_scan_period << 1); 4320 | + 4321 | + p->mm->numa_next_scan = jiffies + 4322 | + msecs_to_jiffies(p->numa_scan_period); 4323 | + 4324 | + return; 4325 | + } 4326 | + 4327 | + /* 4328 | + * Prepare to scale scan period relative to the current period. 4329 | + * == NUMA_PERIOD_THRESHOLD scan period stays the same 4330 | + * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) 4331 | + * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) 4332 | + */ 4333 | + period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); 4334 | + lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); 4335 | + ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared); 4336 | + 4337 | + if (ps_ratio >= NUMA_PERIOD_THRESHOLD) { 4338 | + /* 4339 | + * Most memory accesses are local. There is no need to 4340 | + * do fast NUMA scanning, since memory is already local. 
4341 | + */ 4342 | + int slot = ps_ratio - NUMA_PERIOD_THRESHOLD; 4343 | + if (!slot) 4344 | + slot = 1; 4345 | + diff = slot * period_slot; 4346 | + } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) { 4347 | + /* 4348 | + * Most memory accesses are shared with other tasks. 4349 | + * There is no point in continuing fast NUMA scanning, 4350 | + * since other tasks may just move the memory elsewhere. 4351 | + */ 4352 | + int slot = lr_ratio - NUMA_PERIOD_THRESHOLD; 4353 | + if (!slot) 4354 | + slot = 1; 4355 | + diff = slot * period_slot; 4356 | + } else { 4357 | + /* 4358 | + * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS, 4359 | + * yet they are not on the local NUMA node. Speed up 4360 | + * NUMA scanning to get the memory moved over. 4361 | + */ 4362 | + int ratio = max(lr_ratio, ps_ratio); 4363 | + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; 4364 | + } 4365 | + 4366 | + p->numa_scan_period = clamp(p->numa_scan_period + diff, 4367 | + task_scan_min(p), task_scan_max(p)); 4368 | + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 4369 | +} 4370 | + 4371 | +/* 4372 | + * Get the fraction of time the task has been running since the last 4373 | + * NUMA placement cycle. The scheduler keeps similar statistics, but 4374 | + * decays those on a 32ms period, which is orders of magnitude off 4375 | + * from the dozens-of-seconds NUMA balancing period. Use the scheduler 4376 | + * stats only if the task is so new there are no NUMA statistics yet. 4377 | + */ 4378 | +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) 4379 | +{ 4380 | + u64 runtime, delta, now; 4381 | + /* Use the start of this time slice to avoid calculations. */ 4382 | + now = p->se.exec_start; 4383 | + runtime = p->se.sum_exec_runtime; 4384 | + 4385 | + if (p->last_task_numa_placement) { 4386 | + delta = runtime - p->last_sum_exec_runtime; 4387 | + *period = now - p->last_task_numa_placement; 4388 | + 4389 | + /* Avoid time going backwards, prevent potential divide error: */ 4390 | + if (unlikely((s64)*period < 0)) 4391 | + *period = 0; 4392 | + } else { 4393 | + delta = p->se.avg.load_sum; 4394 | + *period = LOAD_AVG_MAX; 4395 | + } 4396 | + 4397 | + p->last_sum_exec_runtime = runtime; 4398 | + p->last_task_numa_placement = now; 4399 | + 4400 | + return delta; 4401 | +} 4402 | + 4403 | +/* 4404 | + * Determine the preferred nid for a task in a numa_group. This needs to 4405 | + * be done in a way that produces consistent results with group_weight, 4406 | + * otherwise workloads might not converge. 4407 | + */ 4408 | +static int preferred_group_nid(struct task_struct *p, int nid) 4409 | +{ 4410 | + nodemask_t nodes; 4411 | + int dist; 4412 | + 4413 | + /* Direct connections between all NUMA nodes. */ 4414 | + if (sched_numa_topology_type == NUMA_DIRECT) 4415 | + return nid; 4416 | + 4417 | + /* 4418 | + * On a system with glueless mesh NUMA topology, group_weight 4419 | + * scores nodes according to the number of NUMA hinting faults on 4420 | + * both the node itself, and on nearby nodes. 
4421 | + */ 4422 | + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { 4423 | + unsigned long score, max_score = 0; 4424 | + int node, max_node = nid; 4425 | + 4426 | + dist = sched_max_numa_distance; 4427 | + 4428 | + for_each_node_state(node, N_CPU) { 4429 | + score = group_weight(p, node, dist); 4430 | + if (score > max_score) { 4431 | + max_score = score; 4432 | + max_node = node; 4433 | + } 4434 | + } 4435 | + return max_node; 4436 | + } 4437 | + 4438 | + /* 4439 | + * Finding the preferred nid in a system with NUMA backplane 4440 | + * interconnect topology is more involved. The goal is to locate 4441 | + * tasks from numa_groups near each other in the system, and 4442 | + * untangle workloads from different sides of the system. This requires 4443 | + * searching down the hierarchy of node groups, recursively searching 4444 | + * inside the highest scoring group of nodes. The nodemask tricks 4445 | + * keep the complexity of the search down. 4446 | + */ 4447 | + nodes = node_states[N_CPU]; 4448 | + for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { 4449 | + unsigned long max_faults = 0; 4450 | + nodemask_t max_group = NODE_MASK_NONE; 4451 | + int a, b; 4452 | + 4453 | + /* Are there nodes at this distance from each other? */ 4454 | + if (!find_numa_distance(dist)) 4455 | + continue; 4456 | + 4457 | + for_each_node_mask(a, nodes) { 4458 | + unsigned long faults = 0; 4459 | + nodemask_t this_group; 4460 | + nodes_clear(this_group); 4461 | + 4462 | + /* Sum group's NUMA faults; includes a==b case. */ 4463 | + for_each_node_mask(b, nodes) { 4464 | + if (node_distance(a, b) < dist) { 4465 | + faults += group_faults(p, b); 4466 | + node_set(b, this_group); 4467 | + node_clear(b, nodes); 4468 | + } 4469 | + } 4470 | + 4471 | + /* Remember the top group. */ 4472 | + if (faults > max_faults) { 4473 | + max_faults = faults; 4474 | + max_group = this_group; 4475 | + /* 4476 | + * subtle: at the smallest distance there is 4477 | + * just one node left in each "group", the 4478 | + * winner is the preferred nid. 4479 | + */ 4480 | + nid = a; 4481 | + } 4482 | + } 4483 | + /* Next round, evaluate the nodes within max_group. */ 4484 | + if (!max_faults) 4485 | + break; 4486 | + nodes = max_group; 4487 | + } 4488 | + return nid; 4489 | +} 4490 | + 4491 | +static void task_numa_placement(struct task_struct *p) 4492 | +{ 4493 | + int seq, nid, max_nid = NUMA_NO_NODE; 4494 | + unsigned long max_faults = 0; 4495 | + unsigned long fault_types[2] = { 0, 0 }; 4496 | + unsigned long total_faults; 4497 | + u64 runtime, period; 4498 | + spinlock_t *group_lock = NULL; 4499 | + struct numa_group *ng; 4500 | + 4501 | + /* 4502 | + * The p->mm->numa_scan_seq field gets updated without 4503 | + * exclusive access. 
Use READ_ONCE() here to ensure 4504 | + * that the field is read in a single access: 4505 | + */ 4506 | + seq = READ_ONCE(p->mm->numa_scan_seq); 4507 | + if (p->numa_scan_seq == seq) 4508 | + return; 4509 | + p->numa_scan_seq = seq; 4510 | + p->numa_scan_period_max = task_scan_max(p); 4511 | + 4512 | + total_faults = p->numa_faults_locality[0] + 4513 | + p->numa_faults_locality[1]; 4514 | + runtime = numa_get_avg_runtime(p, &period); 4515 | + 4516 | + /* If the task is part of a group prevent parallel updates to group stats */ 4517 | + ng = deref_curr_numa_group(p); 4518 | + if (ng) { 4519 | + group_lock = &ng->lock; 4520 | + spin_lock_irq(group_lock); 4521 | + } 4522 | + 4523 | + /* Find the node with the highest number of faults */ 4524 | + for_each_online_node(nid) { 4525 | + /* Keep track of the offsets in numa_faults array */ 4526 | + int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; 4527 | + unsigned long faults = 0, group_faults = 0; 4528 | + int priv; 4529 | + 4530 | + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { 4531 | + long diff, f_diff, f_weight; 4532 | + 4533 | + mem_idx = task_faults_idx(NUMA_MEM, nid, priv); 4534 | + membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); 4535 | + cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); 4536 | + cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); 4537 | + 4538 | + /* Decay existing window, copy faults since last scan */ 4539 | + diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; 4540 | + fault_types[priv] += p->numa_faults[membuf_idx]; 4541 | + p->numa_faults[membuf_idx] = 0; 4542 | + 4543 | + /* 4544 | + * Normalize the faults_from, so all tasks in a group 4545 | + * count according to CPU use, instead of by the raw 4546 | + * number of faults. Tasks with little runtime have 4547 | + * little over-all impact on throughput, and thus their 4548 | + * faults are less important. 4549 | + */ 4550 | + f_weight = div64_u64(runtime << 16, period + 1); 4551 | + f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / 4552 | + (total_faults + 1); 4553 | + f_diff = f_weight - p->numa_faults[cpu_idx] / 2; 4554 | + p->numa_faults[cpubuf_idx] = 0; 4555 | + 4556 | + p->numa_faults[mem_idx] += diff; 4557 | + p->numa_faults[cpu_idx] += f_diff; 4558 | + faults += p->numa_faults[mem_idx]; 4559 | + p->total_numa_faults += diff; 4560 | + if (ng) { 4561 | + /* 4562 | + * safe because we can only change our own group 4563 | + * 4564 | + * mem_idx represents the offset for a given 4565 | + * nid and priv in a specific region because it 4566 | + * is at the beginning of the numa_faults array. 
4567 | + */ 4568 | + ng->faults[mem_idx] += diff; 4569 | + ng->faults[cpu_idx] += f_diff; 4570 | + ng->total_faults += diff; 4571 | + group_faults += ng->faults[mem_idx]; 4572 | + } 4573 | + } 4574 | + 4575 | + if (!ng) { 4576 | + if (faults > max_faults) { 4577 | + max_faults = faults; 4578 | + max_nid = nid; 4579 | + } 4580 | + } else if (group_faults > max_faults) { 4581 | + max_faults = group_faults; 4582 | + max_nid = nid; 4583 | + } 4584 | + } 4585 | + 4586 | + /* Cannot migrate task to CPU-less node */ 4587 | + max_nid = numa_nearest_node(max_nid, N_CPU); 4588 | + 4589 | + if (ng) { 4590 | + numa_group_count_active_nodes(ng); 4591 | + spin_unlock_irq(group_lock); 4592 | + max_nid = preferred_group_nid(p, max_nid); 4593 | + } 4594 | + 4595 | + if (max_faults) { 4596 | + /* Set the new preferred node */ 4597 | + if (max_nid != p->numa_preferred_nid) 4598 | + sched_setnuma(p, max_nid); 4599 | + } 4600 | + 4601 | + update_task_scan_period(p, fault_types[0], fault_types[1]); 4602 | +} 4603 | + 4604 | +static inline int get_numa_group(struct numa_group *grp) 4605 | +{ 4606 | + return refcount_inc_not_zero(&grp->refcount); 4607 | +} 4608 | + 4609 | +static inline void put_numa_group(struct numa_group *grp) 4610 | +{ 4611 | + if (refcount_dec_and_test(&grp->refcount)) 4612 | + kfree_rcu(grp, rcu); 4613 | +} 4614 | + 4615 | +static void task_numa_group(struct task_struct *p, int cpupid, int flags, 4616 | + int *priv) 4617 | +{ 4618 | + struct numa_group *grp, *my_grp; 4619 | + struct task_struct *tsk; 4620 | + bool join = false; 4621 | + int cpu = cpupid_to_cpu(cpupid); 4622 | + int i; 4623 | + 4624 | + if (unlikely(!deref_curr_numa_group(p))) { 4625 | + unsigned int size = sizeof(struct numa_group) + 4626 | + NR_NUMA_HINT_FAULT_STATS * 4627 | + nr_node_ids * sizeof(unsigned long); 4628 | + 4629 | + grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 4630 | + if (!grp) 4631 | + return; 4632 | + 4633 | + refcount_set(&grp->refcount, 1); 4634 | + grp->active_nodes = 1; 4635 | + grp->max_faults_cpu = 0; 4636 | + spin_lock_init(&grp->lock); 4637 | + grp->gid = p->pid; 4638 | + 4639 | + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 4640 | + grp->faults[i] = p->numa_faults[i]; 4641 | + 4642 | + grp->total_faults = p->total_numa_faults; 4643 | + 4644 | + grp->nr_tasks++; 4645 | + rcu_assign_pointer(p->numa_group, grp); 4646 | + } 4647 | + 4648 | + rcu_read_lock(); 4649 | + tsk = READ_ONCE(cpu_rq(cpu)->curr); 4650 | + 4651 | + if (!cpupid_match_pid(tsk, cpupid)) 4652 | + goto no_join; 4653 | + 4654 | + grp = rcu_dereference(tsk->numa_group); 4655 | + if (!grp) 4656 | + goto no_join; 4657 | + 4658 | + my_grp = deref_curr_numa_group(p); 4659 | + if (grp == my_grp) 4660 | + goto no_join; 4661 | + 4662 | + /* 4663 | + * Only join the other group if its bigger; if we're the bigger group, 4664 | + * the other task will join us. 4665 | + */ 4666 | + if (my_grp->nr_tasks > grp->nr_tasks) 4667 | + goto no_join; 4668 | + 4669 | + /* 4670 | + * Tie-break on the grp address. 4671 | + */ 4672 | + if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) 4673 | + goto no_join; 4674 | + 4675 | + /* Always join threads in the same process. 
*/ 4676 | + if (tsk->mm == current->mm) 4677 | + join = true; 4678 | + 4679 | + /* Simple filter to avoid false positives due to PID collisions */ 4680 | + if (flags & TNF_SHARED) 4681 | + join = true; 4682 | + 4683 | + /* Update priv based on whether false sharing was detected */ 4684 | + *priv = !join; 4685 | + 4686 | + if (join && !get_numa_group(grp)) 4687 | + goto no_join; 4688 | + 4689 | + rcu_read_unlock(); 4690 | + 4691 | + if (!join) 4692 | + return; 4693 | + 4694 | + WARN_ON_ONCE(irqs_disabled()); 4695 | + double_lock_irq(&my_grp->lock, &grp->lock); 4696 | + 4697 | + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { 4698 | + my_grp->faults[i] -= p->numa_faults[i]; 4699 | + grp->faults[i] += p->numa_faults[i]; 4700 | + } 4701 | + my_grp->total_faults -= p->total_numa_faults; 4702 | + grp->total_faults += p->total_numa_faults; 4703 | + 4704 | + my_grp->nr_tasks--; 4705 | + grp->nr_tasks++; 4706 | + 4707 | + spin_unlock(&my_grp->lock); 4708 | + spin_unlock_irq(&grp->lock); 4709 | + 4710 | + rcu_assign_pointer(p->numa_group, grp); 4711 | + 4712 | + put_numa_group(my_grp); 4713 | + return; 4714 | + 4715 | +no_join: 4716 | + rcu_read_unlock(); 4717 | + return; 4718 | +} 4719 | + 4720 | +/* 4721 | + * Get rid of NUMA statistics associated with a task (either current or dead). 4722 | + * If @final is set, the task is dead and has reached refcount zero, so we can 4723 | + * safely free all relevant data structures. Otherwise, there might be 4724 | + * concurrent reads from places like load balancing and procfs, and we should 4725 | + * reset the data back to default state without freeing ->numa_faults. 4726 | + */ 4727 | +void task_numa_free(struct task_struct *p, bool final) 4728 | +{ 4729 | + /* safe: p either is current or is being freed by current */ 4730 | + struct numa_group *grp = rcu_dereference_raw(p->numa_group); 4731 | + unsigned long *numa_faults = p->numa_faults; 4732 | + unsigned long flags; 4733 | + int i; 4734 | + 4735 | + if (!numa_faults) 4736 | + return; 4737 | + 4738 | + if (grp) { 4739 | + spin_lock_irqsave(&grp->lock, flags); 4740 | + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 4741 | + grp->faults[i] -= p->numa_faults[i]; 4742 | + grp->total_faults -= p->total_numa_faults; 4743 | + 4744 | + grp->nr_tasks--; 4745 | + spin_unlock_irqrestore(&grp->lock, flags); 4746 | + RCU_INIT_POINTER(p->numa_group, NULL); 4747 | + put_numa_group(grp); 4748 | + } 4749 | + 4750 | + if (final) { 4751 | + p->numa_faults = NULL; 4752 | + kfree(numa_faults); 4753 | + } else { 4754 | + p->total_numa_faults = 0; 4755 | + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 4756 | + numa_faults[i] = 0; 4757 | + } 4758 | +} 4759 | + 4760 | +/* 4761 | + * Got a PROT_NONE fault for a page on @node. 4762 | + */ 4763 | +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) 4764 | +{ 4765 | + struct task_struct *p = current; 4766 | + bool migrated = flags & TNF_MIGRATED; 4767 | + int cpu_node = task_node(current); 4768 | + int local = !!(flags & TNF_FAULT_LOCAL); 4769 | + struct numa_group *ng; 4770 | + int priv; 4771 | + 4772 | + if (!static_branch_likely(&sched_numa_balancing)) 4773 | + return; 4774 | + 4775 | + /* for example, ksmd faulting in a user's mm */ 4776 | + if (!p->mm) 4777 | + return; 4778 | + 4779 | + /* 4780 | + * NUMA faults statistics are unnecessary for the slow memory 4781 | + * node for memory tiering mode. 
4782 | + */ 4783 | + if (!node_is_toptier(mem_node) && 4784 | + (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING || 4785 | + !cpupid_valid(last_cpupid))) 4786 | + return; 4787 | + 4788 | + /* Allocate buffer to track faults on a per-node basis */ 4789 | + if (unlikely(!p->numa_faults)) { 4790 | + int size = sizeof(*p->numa_faults) * 4791 | + NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; 4792 | + 4793 | + p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); 4794 | + if (!p->numa_faults) 4795 | + return; 4796 | + 4797 | + p->total_numa_faults = 0; 4798 | + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 4799 | + } 4800 | + 4801 | + /* 4802 | + * First accesses are treated as private, otherwise consider accesses 4803 | + * to be private if the accessing pid has not changed 4804 | + */ 4805 | + if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { 4806 | + priv = 1; 4807 | + } else { 4808 | + priv = cpupid_match_pid(p, last_cpupid); 4809 | + if (!priv && !(flags & TNF_NO_GROUP)) 4810 | + task_numa_group(p, last_cpupid, flags, &priv); 4811 | + } 4812 | + 4813 | + /* 4814 | + * If a workload spans multiple NUMA nodes, a shared fault that 4815 | + * occurs wholly within the set of nodes that the workload is 4816 | + * actively using should be counted as local. This allows the 4817 | + * scan rate to slow down when a workload has settled down. 4818 | + */ 4819 | + ng = deref_curr_numa_group(p); 4820 | + if (!priv && !local && ng && ng->active_nodes > 1 && 4821 | + numa_is_active_node(cpu_node, ng) && 4822 | + numa_is_active_node(mem_node, ng)) 4823 | + local = 1; 4824 | + 4825 | + /* 4826 | + * Retry to migrate task to preferred node periodically, in case it 4827 | + * previously failed, or the scheduler moved us. 4828 | + */ 4829 | + if (time_after(jiffies, p->numa_migrate_retry)) { 4830 | + task_numa_placement(p); 4831 | + numa_migrate_preferred(p); 4832 | + } 4833 | + 4834 | + if (migrated) 4835 | + p->numa_pages_migrated += pages; 4836 | + if (flags & TNF_MIGRATE_FAIL) 4837 | + p->numa_faults_locality[2] += pages; 4838 | + 4839 | + p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; 4840 | + p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; 4841 | + p->numa_faults_locality[local] += pages; 4842 | +} 4843 | + 4844 | +static void reset_ptenuma_scan(struct task_struct *p) 4845 | +{ 4846 | + /* 4847 | + * We only did a read acquisition of the mmap sem, so 4848 | + * p->mm->numa_scan_seq is written to without exclusive access 4849 | + * and the update is not guaranteed to be atomic. That's not 4850 | + * much of an issue though, since this is just used for 4851 | + * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not 4852 | + * expensive, to avoid any form of compiler optimizations: 4853 | + */ 4854 | + WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1); 4855 | + p->mm->numa_scan_offset = 0; 4856 | +} 4857 | + 4858 | +static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) 4859 | +{ 4860 | + unsigned long pids; 4861 | + /* 4862 | + * Allow unconditional access first two times, so that all the (pages) 4863 | + * of VMAs get prot_none fault introduced irrespective of accesses. 4864 | + * This is also done to avoid any side effect of task scanning 4865 | + * amplifying the unfairness of disjoint set of VMAs' access. 
4866 | + */ 4867 | + if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2) 4868 | + return true; 4869 | + 4870 | + pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; 4871 | + if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) 4872 | + return true; 4873 | + 4874 | + /* 4875 | + * Complete a scan that has already started regardless of PID access, or 4876 | + * some VMAs may never be scanned in multi-threaded applications: 4877 | + */ 4878 | + if (mm->numa_scan_offset > vma->vm_start) { 4879 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); 4880 | + return true; 4881 | + } 4882 | + 4883 | + return false; 4884 | +} 4885 | + 4886 | +#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) 4887 | + 4888 | +/* 4889 | + * The expensive part of numa migration is done from task_work context. 4890 | + * Triggered from task_tick_numa(). 4891 | + */ 4892 | +static void task_numa_work(struct callback_head *work) 4893 | +{ 4894 | + unsigned long migrate, next_scan, now = jiffies; 4895 | + struct task_struct *p = current; 4896 | + struct mm_struct *mm = p->mm; 4897 | + u64 runtime = p->se.sum_exec_runtime; 4898 | + struct vm_area_struct *vma; 4899 | + unsigned long start, end; 4900 | + unsigned long nr_pte_updates = 0; 4901 | + long pages, virtpages; 4902 | + struct vma_iterator vmi; 4903 | + bool vma_pids_skipped; 4904 | + bool vma_pids_forced = false; 4905 | + 4906 | + SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); 4907 | + 4908 | + work->next = work; 4909 | + /* 4910 | + * Who cares about NUMA placement when they're dying. 4911 | + * 4912 | + * NOTE: make sure not to dereference p->mm before this check, 4913 | + * exit_task_work() happens _after_ exit_mm() so we could be called 4914 | + * without p->mm even though we still had it when we enqueued this 4915 | + * work. 4916 | + */ 4917 | + if (p->flags & PF_EXITING) 4918 | + return; 4919 | + 4920 | + if (!mm->numa_next_scan) { 4921 | + mm->numa_next_scan = now + 4922 | + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 4923 | + } 4924 | + 4925 | + /* 4926 | + * Enforce maximal scan/migration frequency.. 4927 | + */ 4928 | + migrate = mm->numa_next_scan; 4929 | + if (time_before(now, migrate)) 4930 | + return; 4931 | + 4932 | + if (p->numa_scan_period == 0) { 4933 | + p->numa_scan_period_max = task_scan_max(p); 4934 | + p->numa_scan_period = task_scan_start(p); 4935 | + } 4936 | + 4937 | + next_scan = now + msecs_to_jiffies(p->numa_scan_period); 4938 | + if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan)) 4939 | + return; 4940 | + 4941 | + /* 4942 | + * Delay this task enough that another task of this mm will likely win 4943 | + * the next time around. 4944 | + */ 4945 | + p->node_stamp += 2 * TICK_NSEC; 4946 | + 4947 | + pages = sysctl_numa_balancing_scan_size; 4948 | + pages <<= 20 - PAGE_SHIFT; /* MB in pages */ 4949 | + virtpages = pages * 8; /* Scan up to this much virtual space */ 4950 | + if (!pages) 4951 | + return; 4952 | + 4953 | + 4954 | + if (!mmap_read_trylock(mm)) 4955 | + return; 4956 | + 4957 | + /* 4958 | + * VMAs are skipped if the current PID has not trapped a fault within 4959 | + * the VMA recently. Allow scanning to be forced if there is no 4960 | + * suitable VMA remaining. 
4961 | + */ 4962 | + vma_pids_skipped = false; 4963 | + 4964 | +retry_pids: 4965 | + start = mm->numa_scan_offset; 4966 | + vma_iter_init(&vmi, mm, start); 4967 | + vma = vma_next(&vmi); 4968 | + if (!vma) { 4969 | + reset_ptenuma_scan(p); 4970 | + start = 0; 4971 | + vma_iter_set(&vmi, start); 4972 | + vma = vma_next(&vmi); 4973 | + } 4974 | + 4975 | + do { 4976 | + if (!vma_migratable(vma) || !vma_policy_mof(vma) || 4977 | + is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { 4978 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); 4979 | + continue; 4980 | + } 4981 | + 4982 | + /* 4983 | + * Shared library pages mapped by multiple processes are not 4984 | + * migrated as it is expected they are cache replicated. Avoid 4985 | + * hinting faults in read-only file-backed mappings or the vdso 4986 | + * as migrating the pages will be of marginal benefit. 4987 | + */ 4988 | + if (!vma->vm_mm || 4989 | + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) { 4990 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); 4991 | + continue; 4992 | + } 4993 | + 4994 | + /* 4995 | + * Skip inaccessible VMAs to avoid any confusion between 4996 | + * PROT_NONE and NUMA hinting ptes 4997 | + */ 4998 | + if (!vma_is_accessible(vma)) { 4999 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); 5000 | + continue; 5001 | + } 5002 | + 5003 | + /* Initialise new per-VMA NUMAB state. */ 5004 | + if (!vma->numab_state) { 5005 | + vma->numab_state = kzalloc(sizeof(struct vma_numab_state), 5006 | + GFP_KERNEL); 5007 | + if (!vma->numab_state) 5008 | + continue; 5009 | + 5010 | + vma->numab_state->start_scan_seq = mm->numa_scan_seq; 5011 | + 5012 | + vma->numab_state->next_scan = now + 5013 | + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 5014 | + 5015 | + /* Reset happens after 4 times scan delay of scan start */ 5016 | + vma->numab_state->pids_active_reset = vma->numab_state->next_scan + 5017 | + msecs_to_jiffies(VMA_PID_RESET_PERIOD); 5018 | + 5019 | + /* 5020 | + * Ensure prev_scan_seq does not match numa_scan_seq, 5021 | + * to prevent VMAs being skipped prematurely on the 5022 | + * first scan: 5023 | + */ 5024 | + vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1; 5025 | + } 5026 | + 5027 | + /* 5028 | + * Scanning the VMA's of short lived tasks add more overhead. So 5029 | + * delay the scan for new VMAs. 5030 | + */ 5031 | + if (mm->numa_scan_seq && time_before(jiffies, 5032 | + vma->numab_state->next_scan)) { 5033 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); 5034 | + continue; 5035 | + } 5036 | + 5037 | + /* RESET access PIDs regularly for old VMAs. */ 5038 | + if (mm->numa_scan_seq && 5039 | + time_after(jiffies, vma->numab_state->pids_active_reset)) { 5040 | + vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset + 5041 | + msecs_to_jiffies(VMA_PID_RESET_PERIOD); 5042 | + vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]); 5043 | + vma->numab_state->pids_active[1] = 0; 5044 | + } 5045 | + 5046 | + /* Do not rescan VMAs twice within the same sequence. */ 5047 | + if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) { 5048 | + mm->numa_scan_offset = vma->vm_end; 5049 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); 5050 | + continue; 5051 | + } 5052 | + 5053 | + /* 5054 | + * Do not scan the VMA if task has not accessed it, unless no other 5055 | + * VMA candidate exists. 
5056 | + */ 5057 | + if (!vma_pids_forced && !vma_is_accessed(mm, vma)) { 5058 | + vma_pids_skipped = true; 5059 | + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); 5060 | + continue; 5061 | + } 5062 | + 5063 | + do { 5064 | + start = max(start, vma->vm_start); 5065 | + end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 5066 | + end = min(end, vma->vm_end); 5067 | + nr_pte_updates = change_prot_numa(vma, start, end); 5068 | + 5069 | + /* 5070 | + * Try to scan sysctl_numa_balancing_size worth of 5071 | + * hpages that have at least one present PTE that 5072 | + * is not already pte-numa. If the VMA contains 5073 | + * areas that are unused or already full of prot_numa 5074 | + * PTEs, scan up to virtpages, to skip through those 5075 | + * areas faster. 5076 | + */ 5077 | + if (nr_pte_updates) 5078 | + pages -= (end - start) >> PAGE_SHIFT; 5079 | + virtpages -= (end - start) >> PAGE_SHIFT; 5080 | + 5081 | + start = end; 5082 | + if (pages <= 0 || virtpages <= 0) 5083 | + goto out; 5084 | + 5085 | + cond_resched(); 5086 | + } while (end != vma->vm_end); 5087 | + 5088 | + /* VMA scan is complete, do not scan until next sequence. */ 5089 | + vma->numab_state->prev_scan_seq = mm->numa_scan_seq; 5090 | + 5091 | + /* 5092 | + * Only force scan within one VMA at a time, to limit the 5093 | + * cost of scanning a potentially uninteresting VMA. 5094 | + */ 5095 | + if (vma_pids_forced) 5096 | + break; 5097 | + } for_each_vma(vmi, vma); 5098 | + 5099 | + /* 5100 | + * If no VMAs are remaining and VMAs were skipped due to the PID 5101 | + * not accessing the VMA previously, then force a scan to ensure 5102 | + * forward progress: 5103 | + */ 5104 | + if (!vma && !vma_pids_forced && vma_pids_skipped) { 5105 | + vma_pids_forced = true; 5106 | + goto retry_pids; 5107 | + } 5108 | + 5109 | +out: 5110 | + /* 5111 | + * It is possible to reach the end of the VMA list but the last few 5112 | + * VMAs are not guaranteed to the vma_migratable. If they are not, we 5113 | + * would find the !migratable VMA on the next scan but not reset the 5114 | + * scanner to the start so check it now. 5115 | + */ 5116 | + if (vma) 5117 | + mm->numa_scan_offset = start; 5118 | + else 5119 | + reset_ptenuma_scan(p); 5120 | + mmap_read_unlock(mm); 5121 | + 5122 | + /* 5123 | + * Make sure tasks use at least 32x as much time to run other code 5124 | + * than they used here, to limit NUMA PTE scanning overhead to 3% max. 5125 | + * Usually update_task_scan_period slows down scanning enough; on an 5126 | + * overloaded system we need to limit overhead on a per task basis. 5127 | + */ 5128 | + if (unlikely(p->se.sum_exec_runtime != runtime)) { 5129 | + u64 diff = p->se.sum_exec_runtime - runtime; 5130 | + p->node_stamp += 32 * diff; 5131 | + } 5132 | +} 5133 | + 5134 | +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) 5135 | +{ 5136 | + int mm_users = 0; 5137 | + struct mm_struct *mm = p->mm; 5138 | + 5139 | + if (mm) { 5140 | + mm_users = atomic_read(&mm->mm_users); 5141 | + if (mm_users == 1) { 5142 | + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 5143 | + mm->numa_scan_seq = 0; 5144 | + } 5145 | + } 5146 | + p->node_stamp = 0; 5147 | + p->numa_scan_seq = mm ? 
mm->numa_scan_seq : 0; 5148 | + p->numa_scan_period = sysctl_numa_balancing_scan_delay; 5149 | + p->numa_migrate_retry = 0; 5150 | + /* Protect against double add, see task_tick_numa and task_numa_work */ 5151 | + p->numa_work.next = &p->numa_work; 5152 | + p->numa_faults = NULL; 5153 | + p->numa_pages_migrated = 0; 5154 | + p->total_numa_faults = 0; 5155 | + RCU_INIT_POINTER(p->numa_group, NULL); 5156 | + p->last_task_numa_placement = 0; 5157 | + p->last_sum_exec_runtime = 0; 5158 | + 5159 | + init_task_work(&p->numa_work, task_numa_work); 5160 | + 5161 | + /* New address space, reset the preferred nid */ 5162 | + if (!(clone_flags & CLONE_VM)) { 5163 | + p->numa_preferred_nid = NUMA_NO_NODE; 5164 | + return; 5165 | + } 5166 | + 5167 | + /* 5168 | + * New thread, keep existing numa_preferred_nid which should be copied 5169 | + * already by arch_dup_task_struct but stagger when scans start. 5170 | + */ 5171 | + if (mm) { 5172 | + unsigned int delay; 5173 | + 5174 | + delay = min_t(unsigned int, task_scan_max(current), 5175 | + current->numa_scan_period * mm_users * NSEC_PER_MSEC); 5176 | + delay += 2 * TICK_NSEC; 5177 | + p->node_stamp = delay; 5178 | + } 5179 | +} 5180 | + 5181 | +/* 5182 | + * Drive the periodic memory faults.. 5183 | + */ 5184 | +static void task_tick_numa(struct rq *rq, struct task_struct *curr) 5185 | +{ 5186 | + struct callback_head *work = &curr->numa_work; 5187 | + u64 period, now; 5188 | + 5189 | + /* 5190 | + * We don't care about NUMA placement if we don't have memory. 5191 | + */ 5192 | + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) 5193 | + return; 5194 | + 5195 | + /* 5196 | + * Using runtime rather than walltime has the dual advantage that 5197 | + * we (mostly) drive the selection from busy threads and that the 5198 | + * task needs to have done some actual work before we bother with 5199 | + * NUMA placement. 5200 | + */ 5201 | + now = curr->se.sum_exec_runtime; 5202 | + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; 5203 | + 5204 | + if (now > curr->node_stamp + period) { 5205 | + if (!curr->node_stamp) 5206 | + curr->numa_scan_period = task_scan_start(curr); 5207 | + curr->node_stamp += period; 5208 | + 5209 | + if (!time_before(jiffies, curr->mm->numa_next_scan)) 5210 | + task_work_add(curr, work, TWA_RESUME); 5211 | + } 5212 | +} 5213 | + 5214 | +static void update_scan_period(struct task_struct *p, int new_cpu) 5215 | +{ 5216 | + int src_nid = cpu_to_node(task_cpu(p)); 5217 | + int dst_nid = cpu_to_node(new_cpu); 5218 | + 5219 | + if (!static_branch_likely(&sched_numa_balancing)) 5220 | + return; 5221 | + 5222 | + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) 5223 | + return; 5224 | + 5225 | + if (src_nid == dst_nid) 5226 | + return; 5227 | + 5228 | + /* 5229 | + * Allow resets if faults have been trapped before one scan 5230 | + * has completed. This is most likely due to a new task that 5231 | + * is pulled cross-node due to wakeups or load balancing. 5232 | + */ 5233 | + if (p->numa_scan_seq) { 5234 | + /* 5235 | + * Avoid scan adjustments if moving to the preferred 5236 | + * node or if the task was not previously running on 5237 | + * the preferred node. 
5238 | + */ 5239 | + if (dst_nid == p->numa_preferred_nid || 5240 | + (p->numa_preferred_nid != NUMA_NO_NODE && 5241 | + src_nid != p->numa_preferred_nid)) 5242 | + return; 5243 | + } 5244 | + 5245 | + p->numa_scan_period = task_scan_start(p); 5246 | +} 5247 | + 5248 | +#else 5249 | +static void task_tick_numa(struct rq *rq, struct task_struct *curr) 5250 | +{ 5251 | +} 5252 | + 5253 | +static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) 5254 | +{ 5255 | +} 5256 | + 5257 | +static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) 5258 | +{ 5259 | +} 5260 | + 5261 | +static inline void update_scan_period(struct task_struct *p, int new_cpu) 5262 | +{ 5263 | +} 5264 | + 5265 | +#endif /* CONFIG_NUMA_BALANCING */ 5266 | diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c 5267 | index 31231925f1ec..95e7f83b5ab8 100644 5268 | --- a/kernel/sched/idle.c 5269 | +++ b/kernel/sched/idle.c 5270 | @@ -311,6 +311,7 @@ static void do_idle(void) 5271 | } else { 5272 | cpuidle_idle_call(); 5273 | } 5274 | + 5275 | arch_cpu_idle_exit(); 5276 | } 5277 | 5278 | diff --git a/kernel/sched/nohz.h b/kernel/sched/nohz.h 5279 | new file mode 100644 5280 | index 000000000000..f00aeacb8f23 5281 | --- /dev/null 5282 | +++ b/kernel/sched/nohz.h 5283 | @@ -0,0 +1,511 @@ 5284 | +#ifdef CONFIG_NO_HZ_COMMON 5285 | +static struct { 5286 | + cpumask_var_t idle_cpus_mask; 5287 | + atomic_t nr_cpus; 5288 | + int has_blocked; /* Idle CPUS has blocked load */ 5289 | + int needs_update; /* Newly idle CPUs need their next_balance collated */ 5290 | + unsigned long next_balance; /* in jiffy units */ 5291 | + unsigned long next_blocked; /* Next update of blocked load in jiffies */ 5292 | +} nohz ____cacheline_aligned; 5293 | + 5294 | +static bool update_nohz_stats(struct rq *rq) 5295 | +{ 5296 | + unsigned int cpu = rq->cpu; 5297 | + 5298 | + if (!rq->has_blocked_load) 5299 | + return false; 5300 | + 5301 | + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) 5302 | + return false; 5303 | + 5304 | + if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) 5305 | + return true; 5306 | + 5307 | + return rq->has_blocked_load; 5308 | +} 5309 | + 5310 | +/* 5311 | + * Internal function that runs load balance for all idle cpus. The load balance 5312 | + * can be a simple update of blocked load or a complete load balance with 5313 | + * tasks movement depending of flags. 5314 | + */ 5315 | +static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) 5316 | +{ 5317 | + /* Earliest time when we have to do rebalance again */ 5318 | + unsigned long now = jiffies; 5319 | + unsigned long next_balance = now + 60*HZ; 5320 | + bool has_blocked_load = false; 5321 | + int update_next_balance = 0; 5322 | + int this_cpu = this_rq->cpu; 5323 | + int balance_cpu; 5324 | + struct rq *rq; 5325 | + 5326 | + SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); 5327 | + 5328 | + /* 5329 | + * We assume there will be no idle load after this update and clear 5330 | + * the has_blocked flag. If a cpu enters idle in the mean time, it will 5331 | + * set the has_blocked flag and trigger another update of idle load. 5332 | + * Because a cpu that becomes idle, is added to idle_cpus_mask before 5333 | + * setting the flag, we are sure to not clear the state and not 5334 | + * check the load of an idle cpu. 5335 | + * 5336 | + * Same applies to idle_cpus_mask vs needs_update. 
5337 | + */ 5338 | + if (flags & NOHZ_STATS_KICK) 5339 | + WRITE_ONCE(nohz.has_blocked, 0); 5340 | + if (flags & NOHZ_NEXT_KICK) 5341 | + WRITE_ONCE(nohz.needs_update, 0); 5342 | + 5343 | + /* 5344 | + * Ensures that if we miss the CPU, we must see the has_blocked 5345 | + * store from nohz_balance_enter_idle(). 5346 | + */ 5347 | + smp_mb(); 5348 | + 5349 | + /* 5350 | + * Start with the next CPU after this_cpu so we will end with this_cpu and let a 5351 | + * chance for other idle cpu to pull load. 5352 | + */ 5353 | + for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) { 5354 | + if (!idle_cpu(balance_cpu)) 5355 | + continue; 5356 | + 5357 | + /* 5358 | + * If this CPU gets work to do, stop the load balancing 5359 | + * work being done for other CPUs. Next load 5360 | + * balancing owner will pick it up. 5361 | + */ 5362 | + if (need_resched()) { 5363 | + if (flags & NOHZ_STATS_KICK) 5364 | + has_blocked_load = true; 5365 | + if (flags & NOHZ_NEXT_KICK) 5366 | + WRITE_ONCE(nohz.needs_update, 1); 5367 | + goto abort; 5368 | + } 5369 | + 5370 | + rq = cpu_rq(balance_cpu); 5371 | + 5372 | + if (flags & NOHZ_STATS_KICK) 5373 | + has_blocked_load |= update_nohz_stats(rq); 5374 | + 5375 | + /* 5376 | + * If time for next balance is due, 5377 | + * do the balance. 5378 | + */ 5379 | + if (time_after_eq(jiffies, rq->next_balance)) { 5380 | + struct rq_flags rf; 5381 | + 5382 | + rq_lock_irqsave(rq, &rf); 5383 | + update_rq_clock(rq); 5384 | + rq_unlock_irqrestore(rq, &rf); 5385 | + 5386 | + if (flags & NOHZ_BALANCE_KICK) 5387 | + idle_balance(rq); 5388 | + } 5389 | + 5390 | + if (time_after(next_balance, rq->next_balance)) { 5391 | + next_balance = rq->next_balance; 5392 | + update_next_balance = 1; 5393 | + } 5394 | + } 5395 | + 5396 | + /* 5397 | + * next_balance will be updated only when there is a need. 5398 | + * When the CPU is attached to null domain for ex, it will not be 5399 | + * updated. 5400 | + */ 5401 | + if (likely(update_next_balance)) 5402 | + nohz.next_balance = next_balance; 5403 | + 5404 | + if (flags & NOHZ_STATS_KICK) 5405 | + WRITE_ONCE(nohz.next_blocked, 5406 | + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); 5407 | + 5408 | +abort: 5409 | + /* There is still blocked load, enable periodic update */ 5410 | + if (has_blocked_load) 5411 | + WRITE_ONCE(nohz.has_blocked, 1); 5412 | +} 5413 | + 5414 | +/* 5415 | + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 5416 | + * rebalancing for all the cpus for whom scheduler ticks are stopped. 5417 | + */ 5418 | +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) 5419 | +{ 5420 | + unsigned int flags = this_rq->nohz_idle_balance; 5421 | + 5422 | + if (!flags) 5423 | + return false; 5424 | + 5425 | + this_rq->nohz_idle_balance = 0; 5426 | + 5427 | + if (idle != CPU_IDLE) 5428 | + return false; 5429 | + 5430 | + _nohz_idle_balance(this_rq, flags); 5431 | + 5432 | + return true; 5433 | +} 5434 | + 5435 | +/* 5436 | + * Check if we need to directly run the ILB for updating blocked load before 5437 | + * entering idle state. Here we run ILB directly without issuing IPIs. 5438 | + * 5439 | + * Note that when this function is called, the tick may not yet be stopped on 5440 | + * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and 5441 | + * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates 5442 | + * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle 5443 | + * entry/exit rate (usec). 
So it is possible that _nohz_idle_balance() is 5444 | + * called from this function on (this) CPU that's not yet in the mask. That's 5445 | + * OK because the goal of nohz_run_idle_balance() is to run ILB only for 5446 | + * updating the blocked load of already idle CPUs without waking up one of 5447 | + * those idle CPUs and outside the preempt disable / irq off phase of the local 5448 | + * cpu about to enter idle, because it can take a long time. 5449 | + */ 5450 | +void nohz_run_idle_balance(int cpu) 5451 | +{ 5452 | + unsigned int flags; 5453 | + 5454 | + flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu)); 5455 | + 5456 | + /* 5457 | + * Update the blocked load only if no SCHED_SOFTIRQ is about to happen 5458 | + * (ie NOHZ_STATS_KICK set) and will do the same. 5459 | + */ 5460 | + if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) 5461 | + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); 5462 | +} 5463 | + 5464 | +static void set_cpu_sd_state_busy(int cpu) 5465 | +{ 5466 | + struct sched_domain *sd; 5467 | + 5468 | + rcu_read_lock(); 5469 | + sd = rcu_dereference(per_cpu(sd_llc, cpu)); 5470 | + 5471 | + if (!sd || !sd->nohz_idle) 5472 | + goto unlock; 5473 | + sd->nohz_idle = 0; 5474 | + 5475 | + atomic_inc(&sd->shared->nr_busy_cpus); 5476 | +unlock: 5477 | + rcu_read_unlock(); 5478 | +} 5479 | + 5480 | +void nohz_balance_exit_idle(struct rq *rq) 5481 | +{ 5482 | + SCHED_WARN_ON(rq != this_rq()); 5483 | + 5484 | + if (likely(!rq->nohz_tick_stopped)) 5485 | + return; 5486 | + 5487 | + rq->nohz_tick_stopped = 0; 5488 | + cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); 5489 | + atomic_dec(&nohz.nr_cpus); 5490 | + 5491 | + set_cpu_sd_state_busy(rq->cpu); 5492 | +} 5493 | + 5494 | +static void set_cpu_sd_state_idle(int cpu) 5495 | +{ 5496 | + struct sched_domain *sd; 5497 | + 5498 | + rcu_read_lock(); 5499 | + sd = rcu_dereference(per_cpu(sd_llc, cpu)); 5500 | + 5501 | + if (!sd || sd->nohz_idle) 5502 | + goto unlock; 5503 | + sd->nohz_idle = 1; 5504 | + 5505 | + atomic_dec(&sd->shared->nr_busy_cpus); 5506 | +unlock: 5507 | + rcu_read_unlock(); 5508 | +} 5509 | + 5510 | +/* 5511 | + * This routine will record that the CPU is going idle with tick stopped. 5512 | + * This info will be used in performing idle load balancing in the future. 5513 | + */ 5514 | +void nohz_balance_enter_idle(int cpu) 5515 | +{ 5516 | + struct rq *rq = cpu_rq(cpu); 5517 | + 5518 | + SCHED_WARN_ON(cpu != smp_processor_id()); 5519 | + 5520 | + /* If this CPU is going down, then nothing needs to be done: */ 5521 | + if (!cpu_active(cpu)) 5522 | + return; 5523 | + 5524 | + /* Spare idle load balancing on CPUs that don't want to be disturbed: */ 5525 | + if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) 5526 | + return; 5527 | + 5528 | + /* 5529 | + * Can be set safely without rq->lock held 5530 | + * If a clear happens, it will have evaluated last additions because 5531 | + * rq->lock is held during the check and the clear 5532 | + */ 5533 | + rq->has_blocked_load = 1; 5534 | + 5535 | + /* 5536 | + * The tick is still stopped but load could have been added in the 5537 | + * meantime. We set the nohz.has_blocked flag to trig a check of the 5538 | + * *_avg. 
The CPU is already part of nohz.idle_cpus_mask so the clear 5539 | + * of nohz.has_blocked can only happen after checking the new load 5540 | + */ 5541 | + if (rq->nohz_tick_stopped) 5542 | + goto out; 5543 | + 5544 | + /* If we're a completely isolated CPU, we don't play: */ 5545 | + if (on_null_domain(rq)) 5546 | + return; 5547 | + 5548 | + rq->nohz_tick_stopped = 1; 5549 | + 5550 | + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 5551 | + atomic_inc(&nohz.nr_cpus); 5552 | + 5553 | + /* 5554 | + * Ensures that if nohz_idle_balance() fails to observe our 5555 | + * @idle_cpus_mask store, it must observe the @has_blocked 5556 | + * and @needs_update stores. 5557 | + */ 5558 | + smp_mb__after_atomic(); 5559 | + 5560 | + set_cpu_sd_state_idle(cpu); 5561 | + 5562 | + WRITE_ONCE(nohz.needs_update, 1); 5563 | +out: 5564 | + /* 5565 | + * Each time a cpu enter idle, we assume that it has blocked load and 5566 | + * enable the periodic update of the load of idle cpus 5567 | + */ 5568 | + WRITE_ONCE(nohz.has_blocked, 1); 5569 | +} 5570 | + 5571 | +/* 5572 | + * run_rebalance_domains is triggered when needed from the scheduler tick. 5573 | + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). 5574 | + */ 5575 | +static __latent_entropy void run_rebalance_domains(struct softirq_action *h) 5576 | +{ 5577 | + struct rq *this_rq = this_rq(); 5578 | + enum cpu_idle_type idle = this_rq->idle_balance ? 5579 | + CPU_IDLE : CPU_NOT_IDLE; 5580 | + 5581 | + /* 5582 | + * If this CPU has a pending nohz_balance_kick, then do the 5583 | + * balancing on behalf of the other idle CPUs whose ticks are 5584 | + * stopped. Do nohz_idle_balance *before* rebalance_domains to 5585 | + * give the idle CPUs a chance to load balance. Else we may 5586 | + * load balance only within the local sched_domain hierarchy 5587 | + * and abort nohz_idle_balance altogether if we pull some load. 5588 | + */ 5589 | + if (nohz_idle_balance(this_rq, idle)) 5590 | + return; 5591 | + 5592 | + /* normal load balance */ 5593 | + update_blocked_averages(this_rq->cpu); 5594 | +} 5595 | + 5596 | +static inline int find_new_ilb(void) 5597 | +{ 5598 | + const struct cpumask *hk_mask; 5599 | + int ilb_cpu; 5600 | + 5601 | + hk_mask = housekeeping_cpumask(HK_TYPE_MISC); 5602 | + 5603 | + for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { 5604 | + 5605 | + if (ilb_cpu == smp_processor_id()) 5606 | + continue; 5607 | + 5608 | + if (idle_cpu(ilb_cpu)) 5609 | + return ilb_cpu; 5610 | + } 5611 | + 5612 | + return -1; 5613 | +} 5614 | + 5615 | +/* 5616 | + * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU 5617 | + * SMP function call (IPI). 5618 | + * 5619 | + * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). 5620 | + */ 5621 | +static void kick_ilb(unsigned int flags) 5622 | +{ 5623 | + int ilb_cpu; 5624 | + 5625 | + /* 5626 | + * Increase nohz.next_balance only when if full ilb is triggered but 5627 | + * not if we only update stats. 5628 | + */ 5629 | + if (flags & NOHZ_BALANCE_KICK) 5630 | + nohz.next_balance = jiffies+1; 5631 | + 5632 | + ilb_cpu = find_new_ilb(); 5633 | + if (ilb_cpu < 0) 5634 | + return; 5635 | + 5636 | + /* 5637 | + * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets 5638 | + * the first flag owns it; cleared by nohz_csd_func(). 
5639 | + */ 5640 | + flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); 5641 | + if (flags & NOHZ_KICK_MASK) 5642 | + return; 5643 | + 5644 | + /* 5645 | + * This way we generate an IPI on the target CPU which 5646 | + * is idle, and the softirq performing NOHZ idle load balancing 5647 | + * will be run before returning from the IPI. 5648 | + */ 5649 | + smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); 5650 | +} 5651 | + 5652 | +static inline int 5653 | +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) 5654 | +{ 5655 | + return ((rq->cpu_capacity * sd->imbalance_pct) < 5656 | + (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); 5657 | +} 5658 | + 5659 | +static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) 5660 | +{ 5661 | + if (!sched_smt_active()) 5662 | + return true; 5663 | + 5664 | + return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); 5665 | +} 5666 | + 5667 | +static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) 5668 | +{ 5669 | + return rq->misfit_task_load && 5670 | + (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || 5671 | + check_cpu_capacity(rq, sd)); 5672 | +} 5673 | + 5674 | +/* 5675 | + * Current decision point for kicking the idle load balancer in the presence 5676 | + * of idle CPUs in the system. 5677 | + */ 5678 | +static void nohz_balancer_kick(struct rq *rq) 5679 | +{ 5680 | + unsigned long now = jiffies; 5681 | + struct sched_domain_shared *sds; 5682 | + struct sched_domain *sd; 5683 | + int nr_busy, i, cpu = rq->cpu; 5684 | + unsigned int flags = 0; 5685 | + 5686 | + if (unlikely(rq->idle_balance)) 5687 | + return; 5688 | + 5689 | + /* 5690 | + * We may be recently in ticked or tickless idle mode. At the first 5691 | + * busy tick after returning from idle, we will update the busy stats. 5692 | + */ 5693 | + nohz_balance_exit_idle(rq); 5694 | + 5695 | + /* 5696 | + * None are in tickless mode and hence no need for NOHZ idle load 5697 | + * balancing: 5698 | + */ 5699 | + if (likely(!atomic_read(&nohz.nr_cpus))) 5700 | + return; 5701 | + 5702 | + if (READ_ONCE(nohz.has_blocked) && 5703 | + time_after(now, READ_ONCE(nohz.next_blocked))) 5704 | + flags = NOHZ_STATS_KICK; 5705 | + 5706 | + if (time_before(now, nohz.next_balance)) 5707 | + goto out; 5708 | + 5709 | + if (rq->nr_running >= 2) { 5710 | + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 5711 | + goto out; 5712 | + } 5713 | + 5714 | + rcu_read_lock(); 5715 | + 5716 | + sd = rcu_dereference(rq->sd); 5717 | + if (sd) { 5718 | + /* 5719 | + * If there's a runnable CFS task and the current CPU has reduced 5720 | + * capacity, kick the ILB to see if there's a better CPU to run on: 5721 | + */ 5722 | + if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { 5723 | + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 5724 | + goto unlock; 5725 | + } 5726 | + } 5727 | + 5728 | + sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); 5729 | + if (sd) { 5730 | + /* 5731 | + * When ASYM_PACKING; see if there's a more preferred CPU 5732 | + * currently idle; in which case, kick the ILB to move tasks 5733 | + * around. 5734 | + * 5735 | + * When balancing betwen cores, all the SMT siblings of the 5736 | + * preferred CPU must be idle. 
5737 | + */ 5738 | + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { 5739 | + if (sched_use_asym_prio(sd, i) && 5740 | + sched_asym_prefer(i, cpu)) { 5741 | + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 5742 | + goto unlock; 5743 | + } 5744 | + } 5745 | + } 5746 | + 5747 | + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); 5748 | + if (sd) { 5749 | + /* 5750 | + * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU 5751 | + * to run the misfit task on. 5752 | + */ 5753 | + if (check_misfit_status(rq, sd)) { 5754 | + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 5755 | + goto unlock; 5756 | + } 5757 | + 5758 | + /* 5759 | + * For asymmetric systems, we do not want to nicely balance 5760 | + * cache use, instead we want to embrace asymmetry and only 5761 | + * ensure tasks have enough CPU capacity. 5762 | + * 5763 | + * Skip the LLC logic because it's not relevant in that case. 5764 | + */ 5765 | + goto unlock; 5766 | + } 5767 | + 5768 | + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 5769 | + if (sds) { 5770 | + /* 5771 | + * If there is an imbalance between LLC domains (IOW we could 5772 | + * increase the overall cache utilization), we need a less-loaded LLC 5773 | + * domain to pull some load from. Likewise, we may need to spread 5774 | + * load within the current LLC domain (e.g. packed SMT cores but 5775 | + * other CPUs are idle). We can't really know from here how busy 5776 | + * the others are - so just get a NOHZ balance going if it looks 5777 | + * like this LLC domain has tasks we could move. 5778 | + */ 5779 | + nr_busy = atomic_read(&sds->nr_busy_cpus); 5780 | + if (nr_busy > 1) { 5781 | + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 5782 | + goto unlock; 5783 | + } 5784 | + } 5785 | +unlock: 5786 | + rcu_read_unlock(); 5787 | +out: 5788 | + if (READ_ONCE(nohz.needs_update)) 5789 | + flags |= NOHZ_NEXT_KICK; 5790 | + 5791 | + if (flags) 5792 | + kick_ilb(flags); 5793 | +} 5794 | +#endif /* CONFIG_NO_HZ_COMMON */ 5795 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 5796 | index 001fe047bd5d..e27be055ca86 100644 5797 | --- a/kernel/sched/sched.h 5798 | +++ b/kernel/sched/sched.h 5799 | @@ -109,6 +109,10 @@ extern int sysctl_sched_rt_period; 5800 | extern int sysctl_sched_rt_runtime; 5801 | extern int sched_rr_timeslice; 5802 | 5803 | +#ifdef CONFIG_ECHO_SCHED 5804 | +extern unsigned int bs_shared_quota; 5805 | +#endif 5806 | + 5807 | /* 5808 | * Helpers for converting nanosecond timing to jiffy resolution 5809 | */ 5810 | @@ -574,7 +578,9 @@ struct cfs_rq { 5811 | unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ 5812 | unsigned int idle_nr_running; /* SCHED_IDLE */ 5813 | unsigned int idle_h_nr_running; /* SCHED_IDLE */ 5814 | - 5815 | +#ifdef CONFIG_ECHO_SCHED 5816 | + u64 local_cand_est; 5817 | +#endif 5818 | s64 avg_vruntime; 5819 | u64 avg_load; 5820 | 5821 | @@ -596,6 +602,10 @@ struct cfs_rq { 5822 | * It is set to NULL otherwise (i.e when none are currently running). 
5823 | */ 5824 | struct sched_entity *curr; 5825 | +#ifdef CONFIG_ECHO_SCHED 5826 | + struct bs_node *head; 5827 | + struct bs_node *q2_head; 5828 | +#endif 5829 | struct sched_entity *next; 5830 | 5831 | #ifdef CONFIG_SCHED_DEBUG 5832 | @@ -1891,6 +1901,7 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); 5833 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); 5834 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); 5835 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); 5836 | + 5837 | extern struct static_key_false sched_asym_cpucapacity; 5838 | extern struct static_key_false sched_cluster_active; 5839 | 5840 | diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig 5841 | index bae8f11070be..93caca5d2528 100644 5842 | --- a/kernel/time/Kconfig 5843 | +++ b/kernel/time/Kconfig 5844 | @@ -119,7 +119,7 @@ config NO_HZ_FULL 5845 | bool "Full dynticks system (tickless)" 5846 | # NO_HZ_COMMON dependency 5847 | # We need at least one periodic CPU for timekeeping 5848 | - depends on SMP 5849 | + depends on SMP && !ECHO_SCHED 5850 | depends on HAVE_CONTEXT_TRACKING_USER 5851 | # VIRT_CPU_ACCOUNTING_GEN dependency 5852 | depends on HAVE_VIRT_CPU_ACCOUNTING_GEN 5853 | -------------------------------------------------------------------------------- /6.8.y/powersave.patch: -------------------------------------------------------------------------------- 1 | diff --git a/kernel/sched/balancer.h b/kernel/sched/balancer.h 2 | index 852faad1fc1d..bd44a2512fc0 100644 3 | --- a/kernel/sched/balancer.h 4 | +++ b/kernel/sched/balancer.h 5 | @@ -8,6 +8,308 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 6 | return newidle_balance(rq, rf) != 0; 7 | } 8 | 9 | +struct energy_env { 10 | + unsigned long task_busy_time; 11 | + unsigned long pd_busy_time; 12 | + unsigned long cpu_cap; 13 | + unsigned long pd_cap; 14 | +}; 15 | + 16 | +static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); 17 | + 18 | +static inline unsigned long 19 | +eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, 20 | + struct task_struct *p, int dst_cpu) 21 | +{ 22 | + unsigned long max_util = 0; 23 | + int cpu; 24 | + 25 | + for_each_cpu(cpu, pd_cpus) { 26 | + struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; 27 | + unsigned long util = cpu_util(cpu, p, dst_cpu, 1); 28 | + unsigned long eff_util, min, max; 29 | + 30 | + /* 31 | + * Performance domain frequency: utilization clamping 32 | + * must be considered since it affects the selection 33 | + * of the performance domain frequency. 34 | + * NOTE: in case RT tasks are running, by default the 35 | + * FREQUENCY_UTIL's utilization can be max OPP. 36 | + */ 37 | + eff_util = effective_cpu_util(cpu, util, &min, &max); 38 | + 39 | + /* Task's uclamp can modify min and max value */ 40 | + if (tsk && uclamp_is_used()) { 41 | + min = max(min, uclamp_eff_value(p, UCLAMP_MIN)); 42 | + 43 | + /* 44 | + * If there is no active max uclamp constraint, 45 | + * directly use task's one, otherwise keep max. 
46 | + */ 47 | + if (uclamp_rq_is_idle(cpu_rq(cpu))) 48 | + max = uclamp_eff_value(p, UCLAMP_MAX); 49 | + else 50 | + max = max(max, uclamp_eff_value(p, UCLAMP_MAX)); 51 | + } 52 | + 53 | + eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max); 54 | + max_util = max(max_util, eff_util); 55 | + } 56 | + 57 | + return min(max_util, eenv->cpu_cap); 58 | +} 59 | + 60 | +static inline unsigned long 61 | +compute_energy(struct energy_env *eenv, struct perf_domain *pd, 62 | + struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu) 63 | +{ 64 | + unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); 65 | + unsigned long busy_time = eenv->pd_busy_time; 66 | + unsigned long energy; 67 | + 68 | + if (dst_cpu >= 0) 69 | + busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); 70 | + 71 | + energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); 72 | + 73 | + trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time); 74 | + 75 | + return energy; 76 | +} 77 | + 78 | +static inline void eenv_pd_busy_time(struct energy_env *eenv, 79 | + struct cpumask *pd_cpus, 80 | + struct task_struct *p) 81 | +{ 82 | + unsigned long busy_time = 0; 83 | + int cpu; 84 | + 85 | + for_each_cpu(cpu, pd_cpus) { 86 | + unsigned long util = cpu_util(cpu, p, -1, 0); 87 | + 88 | + busy_time += effective_cpu_util(cpu, util, NULL, NULL); 89 | + } 90 | + 91 | + eenv->pd_busy_time = min(eenv->pd_cap, busy_time); 92 | +} 93 | + 94 | +static inline void eenv_task_busy_time(struct energy_env *eenv, 95 | + struct task_struct *p, int prev_cpu) 96 | +{ 97 | + unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu); 98 | + unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu)); 99 | + 100 | + if (unlikely(irq >= max_cap)) 101 | + busy_time = max_cap; 102 | + else 103 | + busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap); 104 | + 105 | + eenv->task_busy_time = busy_time; 106 | +} 107 | + 108 | +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) 109 | +{ 110 | + return u64_u32_load_copy(cfs_rq->avg.last_update_time, 111 | + cfs_rq->last_update_time_copy); 112 | +} 113 | + 114 | +static void sync_entity_load_avg(struct sched_entity *se) 115 | +{ 116 | + struct cfs_rq *cfs_rq = cfs_rq_of(se); 117 | + u64 last_update_time; 118 | + 119 | + last_update_time = cfs_rq_last_update_time(cfs_rq); 120 | + __update_load_avg_blocked_se(last_update_time, se); 121 | +} 122 | + 123 | +static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) 124 | +{ 125 | + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); 126 | + unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; 127 | + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0; 128 | + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; 129 | + struct root_domain *rd = this_rq()->rd; 130 | + int cpu, best_energy_cpu, target = -1; 131 | + int prev_fits = -1, best_fits = -1; 132 | + unsigned long best_thermal_cap = 0; 133 | + unsigned long prev_thermal_cap = 0; 134 | + struct sched_domain *sd; 135 | + struct perf_domain *pd; 136 | + struct energy_env eenv; 137 | + 138 | + rcu_read_lock(); 139 | + pd = rcu_dereference(rd->pd); 140 | + if (!pd || READ_ONCE(rd->overutilized)) 141 | + goto unlock; 142 | + 143 | + /* 144 | + * Energy-aware wake-up happens on the lowest sched_domain starting 145 | + * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. 
146 | + */ 147 | + sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); 148 | + while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 149 | + sd = sd->parent; 150 | + if (!sd) 151 | + goto unlock; 152 | + 153 | + target = prev_cpu; 154 | + 155 | + sync_entity_load_avg(&p->se); 156 | + if (!task_util_est(p) && p_util_min == 0) 157 | + goto unlock; 158 | + 159 | + eenv_task_busy_time(&eenv, p, prev_cpu); 160 | + 161 | + for (; pd; pd = pd->next) { 162 | + unsigned long util_min = p_util_min, util_max = p_util_max; 163 | + unsigned long cpu_cap, cpu_thermal_cap, util; 164 | + long prev_spare_cap = -1, max_spare_cap = -1; 165 | + unsigned long rq_util_min, rq_util_max; 166 | + unsigned long cur_delta, base_energy; 167 | + int max_spare_cap_cpu = -1; 168 | + int fits, max_fits = -1; 169 | + 170 | + cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); 171 | + 172 | + if (cpumask_empty(cpus)) 173 | + continue; 174 | + 175 | + /* Account thermal pressure for the energy estimation */ 176 | + cpu = cpumask_first(cpus); 177 | + cpu_thermal_cap = arch_scale_cpu_capacity(cpu); 178 | + cpu_thermal_cap -= arch_scale_thermal_pressure(cpu); 179 | + 180 | + eenv.cpu_cap = cpu_thermal_cap; 181 | + eenv.pd_cap = 0; 182 | + 183 | + for_each_cpu(cpu, cpus) { 184 | + struct rq *rq = cpu_rq(cpu); 185 | + 186 | + eenv.pd_cap += cpu_thermal_cap; 187 | + 188 | + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) 189 | + continue; 190 | + 191 | + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 192 | + continue; 193 | + 194 | + util = cpu_util(cpu, p, cpu, 0); 195 | + cpu_cap = capacity_of(cpu); 196 | + 197 | + /* 198 | + * Skip CPUs that cannot satisfy the capacity request. 199 | + * IOW, placing the task there would make the CPU 200 | + * overutilized. Take uclamp into account to see how 201 | + * much capacity we can get out of the CPU; this is 202 | + * aligned with sched_cpu_util(). 203 | + */ 204 | + if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { 205 | + /* 206 | + * Open code uclamp_rq_util_with() except for 207 | + * the clamp() part. Ie: apply max aggregation 208 | + * only. util_fits_cpu() logic requires to 209 | + * operate on non clamped util but must use the 210 | + * max-aggregated uclamp_{min, max}. 211 | + */ 212 | + rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN); 213 | + rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX); 214 | + 215 | + util_min = max(rq_util_min, p_util_min); 216 | + util_max = max(rq_util_max, p_util_max); 217 | + } 218 | + 219 | + fits = util_fits_cpu(util, util_min, util_max, cpu); 220 | + if (!fits) 221 | + continue; 222 | + 223 | + lsub_positive(&cpu_cap, util); 224 | + 225 | + if (cpu == prev_cpu) { 226 | + /* Always use prev_cpu as a candidate. */ 227 | + prev_spare_cap = cpu_cap; 228 | + prev_fits = fits; 229 | + } else if ((fits > max_fits) || 230 | + ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) { 231 | + /* 232 | + * Find the CPU with the maximum spare capacity 233 | + * among the remaining CPUs in the performance 234 | + * domain. 235 | + */ 236 | + max_spare_cap = cpu_cap; 237 | + max_spare_cap_cpu = cpu; 238 | + max_fits = fits; 239 | + } 240 | + } 241 | + 242 | + if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) 243 | + continue; 244 | + 245 | + eenv_pd_busy_time(&eenv, cpus, p); 246 | + /* Compute the 'base' energy of the pd, without @p */ 247 | + base_energy = compute_energy(&eenv, pd, cpus, p, -1); 248 | + 249 | + /* Evaluate the energy impact of using prev_cpu. 
*/ 250 | + if (prev_spare_cap > -1) { 251 | + prev_delta = compute_energy(&eenv, pd, cpus, p, 252 | + prev_cpu); 253 | + /* CPU utilization has changed */ 254 | + if (prev_delta < base_energy) 255 | + goto unlock; 256 | + prev_delta -= base_energy; 257 | + prev_thermal_cap = cpu_thermal_cap; 258 | + best_delta = min(best_delta, prev_delta); 259 | + } 260 | + 261 | + /* Evaluate the energy impact of using max_spare_cap_cpu. */ 262 | + if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { 263 | + /* Current best energy cpu fits better */ 264 | + if (max_fits < best_fits) 265 | + continue; 266 | + 267 | + /* 268 | + * Both don't fit performance hint (i.e. uclamp_min) 269 | + * but best energy cpu has better capacity. 270 | + */ 271 | + if ((max_fits < 0) && 272 | + (cpu_thermal_cap <= best_thermal_cap)) 273 | + continue; 274 | + 275 | + cur_delta = compute_energy(&eenv, pd, cpus, p, 276 | + max_spare_cap_cpu); 277 | + /* CPU utilization has changed */ 278 | + if (cur_delta < base_energy) 279 | + goto unlock; 280 | + cur_delta -= base_energy; 281 | + 282 | + /* 283 | + * Both fit for the task but best energy cpu has lower 284 | + * energy impact. 285 | + */ 286 | + if ((max_fits > 0) && (best_fits > 0) && 287 | + (cur_delta >= best_delta)) 288 | + continue; 289 | + 290 | + best_delta = cur_delta; 291 | + best_energy_cpu = max_spare_cap_cpu; 292 | + best_fits = max_fits; 293 | + best_thermal_cap = cpu_thermal_cap; 294 | + } 295 | + } 296 | + rcu_read_unlock(); 297 | + 298 | + if ((best_fits > prev_fits) || 299 | + ((best_fits > 0) && (best_delta < prev_delta)) || 300 | + ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) 301 | + target = best_energy_cpu; 302 | + 303 | + return target; 304 | + 305 | +unlock: 306 | + rcu_read_unlock(); 307 | + 308 | + return target; 309 | +} 310 | + 311 | static int 312 | wake_affine_idle(int this_cpu, int prev_cpu, int sync) 313 | { 314 | @@ -90,13 +392,20 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) 315 | unsigned int min = rq->nr_running; 316 | int this_cpu = smp_processor_id(); 317 | 318 | - if (wake_flags & WF_TTWU) { 319 | + if (IS_PWR_SAVE_ENABLED || (wake_flags & WF_TTWU)) { 320 | record_wakee(p); 321 | 322 | if ((wake_flags & WF_CURRENT_CPU) && 323 | cpumask_test_cpu(cpu, p->cpus_ptr)) 324 | return cpu; 325 | 326 | + if (IS_PWR_SAVE_ENABLED || sched_energy_enabled()) { 327 | + new_cpu = find_energy_efficient_cpu(p, prev_cpu); 328 | + if (new_cpu >= 0) 329 | + return new_cpu; 330 | + new_cpu = prev_cpu; 331 | + } 332 | + 333 | want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); 334 | } 335 | 336 | @@ -382,6 +691,64 @@ static inline int migrate_degrades_locality(struct task_struct *p, struct rq *ds 337 | } 338 | #endif 339 | 340 | +static int 341 | +can_migrate_task_powersave(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 342 | +{ 343 | + int tsk_cache_hot; 344 | + 345 | + /* Disregard pcpu kthreads; they are where they need to be. 
*/ 346 | + if (kthread_is_per_cpu(p)) 347 | + return 0; 348 | + 349 | + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) 350 | + return 0; 351 | + 352 | + if (task_on_cpu(src_rq, p)) 353 | + return 0; 354 | + 355 | + tsk_cache_hot = migrate_degrades_locality(p, dst_rq, src_rq); 356 | + if (tsk_cache_hot > 0) 357 | + return 0; 358 | + 359 | + return 1; 360 | +} 361 | + 362 | +static int move_task_powersave(struct rq *dist_rq, struct rq *src_rq, 363 | + struct rq_flags *src_rf) 364 | +{ 365 | + struct cfs_rq *src_cfs_rq = &src_rq->cfs; 366 | + struct task_struct *p; 367 | + struct bs_node *bsn = src_cfs_rq->head; 368 | + struct lb_env env = { 369 | + .dst_cpu = cpu_of(dist_rq), 370 | + .dst_rq = dist_rq, 371 | + .src_cpu = cpu_of(src_rq), 372 | + .src_rq = src_rq, 373 | + .src_rf = src_rf, 374 | + .idle = dist_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE, 375 | + }; 376 | + 377 | + while (bsn) { 378 | + p = task_of(se_of(bsn)); 379 | + if (can_migrate_task_powersave(p, dist_rq, src_rq)) { 380 | + pull_from(p, &env); 381 | + return 1; 382 | + } 383 | + 384 | + bsn = bsn->next; 385 | + } 386 | + 387 | + /* 388 | + * Here we know we have not migrated any task, 389 | + * thus, we need to unlock and return 0 390 | + * Note: the pull_from does the unlocking for us. 391 | + */ 392 | + rq_unlock(src_rq, src_rf); 393 | + local_irq_restore(src_rf->flags); 394 | + 395 | + return 0; 396 | +} 397 | + 398 | #define MIN_HOTNESS 0x7FFFFFFFFFFFFFFLL 399 | 400 | static s64 task_hotness(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) 401 | @@ -559,6 +926,9 @@ static void idle_balance(struct rq *this_rq) 402 | unsigned int max = 0; 403 | struct rq_flags src_rf; 404 | 405 | + if (IS_PWR_SAVE_ENABLED) 406 | + return; 407 | + 408 | if (idle_pull_global_candidate(this_rq)) 409 | return; 410 | 411 | @@ -727,7 +1097,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) 412 | /* 413 | * Do not pull tasks towards !active CPUs... 
414 | */
415 | - if (!cpu_active(this_cpu))
416 | + if (IS_PWR_SAVE_ENABLED || !cpu_active(this_cpu))
417 | return 0;
418 | 
419 | rq_unpin_lock(this_rq, rf);
420 | @@ -849,7 +1219,9 @@ static void rebalance(struct rq *this_rq)
421 | return;
422 | }
423 | 
424 | - if(move_task(min_rq, max_rq, &src_rf))
425 | + if (IS_PWR_SAVE_ENABLED && idle_cpu(cpu_of(min_rq)) && max - min == 2)
426 | + move_task_powersave(min_rq, max_rq, &src_rf);
427 | + else if(move_task(min_rq, max_rq, &src_rf))
428 | goto again;
429 | }
430 | 
431 | diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c
432 | index 7760327e5194..f1c67710e8cd 100644
433 | --- a/kernel/sched/bs.c
434 | +++ b/kernel/sched/bs.c
435 | @@ -17,6 +17,7 @@
436 | unsigned int sysctl_sched_base_slice = 7000ULL;
437 | unsigned int bs_shared_quota = 105000ULL; // 105us
438 | u32 alpha = 500U;
439 | +unsigned int __read_mostly echo_powersave = 0;
440 | 
441 | struct lb_env {
442 | struct rq *src_rq;
443 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c
444 | index 49df55bb0ba7..6e9deddc59d6 100644
445 | --- a/kernel/sched/core.c
446 | +++ b/kernel/sched/core.c
447 | @@ -4701,6 +4701,15 @@ static struct ctl_table sched_core_sysctls[] = {
448 | .mode = 0644,
449 | .proc_handler = proc_dointvec,
450 | },
451 | + {
452 | + .procname = "sched_echo_powersave",
453 | + .data = &echo_powersave,
454 | + .maxlen = sizeof(int),
455 | + .mode = 0644,
456 | + .proc_handler = proc_dointvec_minmax,
457 | + .extra1 = SYSCTL_ZERO,
458 | + .extra2 = SYSCTL_THREE,
459 | + },
460 | #endif
461 | #ifdef CONFIG_SCHEDSTATS
462 | {
463 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
464 | index e27be055ca86..071d09f66fa0 100644
465 | --- a/kernel/sched/sched.h
466 | +++ b/kernel/sched/sched.h
467 | @@ -88,6 +88,10 @@
468 | # define SCHED_WARN_ON(x) ({ (void)(x), 0; })
469 | #endif
470 | 
471 | +#ifdef CONFIG_ECHO_SCHED
472 | +#define IS_PWR_SAVE_ENABLED (echo_powersave == 1)
473 | +#endif
474 | +
475 | struct rq;
476 | struct cpuidle_state;
477 | 
478 | @@ -111,6 +115,7 @@ extern int sched_rr_timeslice;
479 | 
480 | #ifdef CONFIG_ECHO_SCHED
481 | extern unsigned int bs_shared_quota;
482 | +extern unsigned int echo_powersave;
483 | #endif
484 | 
485 | /*
486 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ECHO CPU Scheduler
2 | 
3 | **Enhanced CPU Handling Orchestrator**
4 | 
5 | ECHO is a CPU process scheduler patch for the Linux kernel.
6 | 
7 | This scheduler includes the following features:
8 | 
9 | - Handles heavy multitasking with a maximum quota of 35us.
10 | - All tasks on a CPU share a 35us quota, so every task runs for 35us / # of tasks (a small worked sketch follows this list).
11 | - The minimum slice for a running task is 7us, unless a woken-up task must run before the current task, in which case it preempts it.
12 | - Task estimation follows SRTF (Shortest Remaining Time First), using a moving average to calculate the virtual runtime.
13 | - The next task picked is the one with the smallest estimated virtual runtime.
14 | - The load balancer is as in the TT scheduler, with tiny changes: CPU0 is responsible for moving tasks among the other CPUs. Also, the candidate
15 | balancer is enabled by default.
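To make the quota arithmetic above concrete, here is a minimal, stand-alone C sketch. It is not taken from the patches; `SHARED_QUOTA_NS`, `BASE_SLICE_NS` and `task_slice_ns` are illustrative names, and clamping the share to the 7us minimum is an assumption based on the list above.

```c
#include <stdio.h>

/* Illustrative values taken from the README: 35us shared quota, 7us base slice. */
#define SHARED_QUOTA_NS	35000ULL
#define BASE_SLICE_NS	 7000ULL

/*
 * Hypothetical helper: split the shared quota among the queued tasks and
 * (assumption) never hand out less than the minimum slice.
 */
static unsigned long long task_slice_ns(unsigned int nr_queued)
{
	unsigned long long slice = SHARED_QUOTA_NS / (nr_queued ? nr_queued : 1);

	return slice < BASE_SLICE_NS ? BASE_SLICE_NS : slice;
}

int main(void)
{
	/* 3 tasks -> ~11.7us each; 10 tasks -> clamped to the 7us minimum. */
	printf("%llu %llu\n", task_slice_ns(3), task_slice_ns(10));
	return 0;
}
```

With 3 runnable tasks the per-task slice is about 11.7us; with 10 tasks the raw share (3.5us) would fall below the minimum, so it is held at 7us in this sketch.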
16 | 
17 | ## Comparison with other schedulers
18 | 
19 | https://github.com/hamadmarri/benchmarks
20 | 
21 | 
22 | 
23 | ## Policy
24 | The policy is a mix of SRTF and RR (Round Robin), where the virtual runtime calculation is
25 | ported from CFS (the burst is adjusted based on the priority of the task). Each round, the tasks run starting from
26 | the least estimated vruntime, and each task runs for `shared_quota / #tasks`, e.g. `35us / 3 = ~11.7us`.
27 | 
28 | If a woken-up task has a smaller estimated vruntime, it preempts the current task and runs. Every time a task consumes its
29 | quota, it is placed in a second queue, unless it is the only task running. After the round finishes, all tasks have been placed
30 | in the second queue. The scheduler then switches the queue head from q1 to q2: q2 becomes q1 and vice versa.
31 | 
32 | 
33 | ## Defaults and Sysctls
34 | - The default HZ for ECHO is 625Hz, i.e. a tick every 1.6ms. There is no need to increase it, since the high-resolution clock handles task preemption within 35us at most.
35 | - `kernel.sched_bs_shared_quota` defaults to 35000 (35us) and can be tuned with sysctl,
36 | e.g. `sysctl kernel.sched_bs_shared_quota=4800000`. Larger values favor CPU caches but reduce interactivity and multitasking.
37 | - The following kernel configuration options must be disabled:
38 | - CONFIG_FAIR_GROUP_SCHED
39 | - CONFIG_SCHED_AUTOGROUP
40 | - CONFIG_SCHED_CORE
41 | 
42 | 
43 | ## Telegram Group
44 | 
45 | https://t.me/tt_sched
46 | 
47 | Hamad
48 | 
--------------------------------------------------------------------------------
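As an addendum to the Policy section of the README above, the following is a minimal, stand-alone C sketch of the two-queue round it describes. It is not the patch code; `struct demo_task`, `q1`, `q2` and the helper names are illustrative only.

```c
#include <stddef.h>

/*
 * Stand-alone model of the two-queue round: tasks are picked from q1 in order
 * of estimated vruntime, parked on q2 once they consume their share of the
 * quota, and the queues swap when the round (q1) is empty.
 */
struct demo_task {
	struct demo_task *next;
	unsigned long long est_vruntime;
};

static struct demo_task *q1, *q2;

/* Pick the task with the smallest estimated vruntime in the current round. */
static struct demo_task *pick_next(void)
{
	struct demo_task *t, *best = NULL;

	for (t = q1; t; t = t->next)
		if (!best || t->est_vruntime < best->est_vruntime)
			best = t;
	return best;
}

/* A task has consumed its share of the quota: move it from q1 to q2. */
static void park_task(struct demo_task *t)
{
	struct demo_task **pp;

	for (pp = &q1; *pp; pp = &(*pp)->next) {
		if (*pp == t) {
			*pp = t->next;
			break;
		}
	}
	t->next = q2;
	q2 = t;
}

/* End of the round: q2 becomes q1 and a fresh round starts. */
static void switch_round(void)
{
	if (!q1) {
		q1 = q2;
		q2 = NULL;
	}
}
```

In the patches themselves, the equivalent per-CPU state is kept in the `head` and `q2_head` `struct bs_node` pointers that CONFIG_ECHO_SCHED adds to `struct cfs_rq`.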