├── README.md ├── BLD-3.16.patch ├── BLD-3.19.patch ├── BLD-3.17.patch ├── BLD-3.18.patch ├── BLD-4.8.patch ├── BLD-4.1.patch ├── BLD-4.4.patch ├── BLD-4.5.patch ├── BLD-4.3.patch └── BLD-4.6.patch /README.md: -------------------------------------------------------------------------------- 1 | bld-patches 2 | =========== 3 | 4 | Directory of BLD patches, where all the patches will be kept as single patch for stable Linux releases. See Wiki for some idea. 5 | -------------------------------------------------------------------------------- /BLD-3.16.patch: -------------------------------------------------------------------------------- 1 | BLD-3.16 for Linux kernel 3.16. Nothing special, just rebased 2 | for 3.16. 3 | 4 | Thanks, 5 | Rakib 6 | 7 | Signed-off-by: Rakib Mullick 8 | 9 | diff --git a/init/Kconfig b/init/Kconfig 10 | index 9d76b99..847f34d 100644 11 | --- a/init/Kconfig 12 | +++ b/init/Kconfig 13 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 14 | depends on BROKEN || !SMP 15 | default y 16 | 17 | +config BLD 18 | + bool "An alternate CPU load distribution technique for task scheduler" 19 | + depends on SMP 20 | + default y 21 | + help 22 | + This is an alternate CPU load distribution technique based for task 23 | + scheduler based on The Barbershop Load Distribution algorithm. Not 24 | + suitable for NUMA, should work well on SMP. 25 | + 26 | config INIT_ENV_ARG_LIMIT 27 | int 28 | default 32 if !UML 29 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 30 | new file mode 100644 31 | index 0000000..5a067c1 32 | --- /dev/null 33 | +++ b/kernel/sched/bld.h 34 | @@ -0,0 +1,207 @@ 35 | +#ifdef CONFIG_BLD 36 | + 37 | +static DEFINE_RWLOCK(rt_list_lock); 38 | +static LIST_HEAD(rt_rq_head); 39 | +static LIST_HEAD(cfs_rq_head); 40 | +static DEFINE_RWLOCK(cfs_list_lock); 41 | + 42 | +#ifdef CONFIG_FAIR_GROUP_SCHED 43 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 44 | +{ 45 | + return cfs_rq->rq; 46 | +} 47 | +#else 48 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 49 | +{ 50 | + return container_of(cfs_rq, struct rq, cfs); 51 | +} 52 | +#endif 53 | + 54 | +#ifdef CONFIG_RT_GROUP_SCHED 55 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 56 | +{ 57 | + return rt_rq->rq; 58 | +} 59 | +#else 60 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 61 | +{ 62 | + return container_of(rt_rq, struct rq, rt); 63 | +} 64 | +#endif 65 | + 66 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 67 | +{ 68 | + int cpu = smp_processor_id(), i; 69 | + unsigned long load, min_load = ULONG_MAX; 70 | + struct rq *rq; 71 | + 72 | + if (task_type) { 73 | + for_each_cpu(i, mask) { 74 | + rq = cpu_rq(i); 75 | + load = rq->cfs.load.weight; 76 | + if (load < min_load) { 77 | + min_load = load; 78 | + cpu = i; 79 | + } 80 | + } 81 | + } else { 82 | + min_load = -1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load > min_load) { 88 | + min_load = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | 
+ return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = p->prio; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if 
(rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index bc1638b..b429ce5 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 252 | */ 253 | 254 | #include 255 | @@ -86,6 +88,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -831,6 +834,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | update_rq_clock(rq); 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -838,6 +843,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | update_rq_clock(rq); 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1398,7 +1405,14 @@ out: 282 | static inline 283 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 284 | { 285 | +#ifndef CONFIG_BLD 286 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 287 | +#else 288 | + if (dl_task(p)) 289 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 290 | + else 291 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 292 | +#endif 293 | 294 | /* 295 | * In order not to call set_task_cpu() on a blocking task we need 296 | @@ -1568,9 +1582,13 @@ void scheduler_ipi(void) 297 | */ 298 | preempt_fold_need_resched(); 299 | 300 | +#ifndef CONFIG_BLD 301 | if (llist_empty(&this_rq()->wake_list) 302 | && !tick_nohz_full_cpu(smp_processor_id()) 303 | && !got_nohz_idle_kick()) 304 | +#else 305 | + if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id())) 306 | +#endif 307 | return; 308 | 309 | /* 310 | @@ -1593,13 +1611,16 @@ void scheduler_ipi(void) 311 | /* 312 | * Check if someone kicked us for doing the nohz idle load balance. 
313 | */ 314 | +#ifndef CONFIG_BLD 315 | if (unlikely(got_nohz_idle_kick())) { 316 | this_rq()->idle_balance = 1; 317 | raise_softirq_irqoff(SCHED_SOFTIRQ); 318 | } 319 | +#endif 320 | irq_exit(); 321 | } 322 | 323 | +#ifndef CONFIG_BLD 324 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 325 | { 326 | struct rq *rq = cpu_rq(cpu); 327 | @@ -1611,6 +1632,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 328 | trace_sched_wake_idle_without_ipi(cpu); 329 | } 330 | } 331 | +#endif 332 | 333 | bool cpus_share_cache(int this_cpu, int that_cpu) 334 | { 335 | @@ -1622,7 +1644,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) 336 | { 337 | struct rq *rq = cpu_rq(cpu); 338 | 339 | -#if defined(CONFIG_SMP) 340 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 341 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 342 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 343 | ttwu_queue_remote(p, cpu); 344 | @@ -1930,7 +1952,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 345 | * Silence PROVE_RCU. 346 | */ 347 | raw_spin_lock_irqsave(&p->pi_lock, flags); 348 | - set_task_cpu(p, cpu); 349 | + __set_task_cpu(p, cpu); 350 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 351 | 352 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 353 | @@ -2398,7 +2420,14 @@ void sched_exec(void) 354 | int dest_cpu; 355 | 356 | raw_spin_lock_irqsave(&p->pi_lock, flags); 357 | +#ifndef CONFIG_BLD 358 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 359 | +#else 360 | + if (dl_task(p)) 361 | + dest_cpu = task_cpu(p); 362 | + else 363 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 364 | +#endif 365 | if (dest_cpu == smp_processor_id()) 366 | goto unlock; 367 | 368 | @@ -2508,8 +2537,10 @@ void scheduler_tick(void) 369 | 370 | #ifdef CONFIG_SMP 371 | rq->idle_balance = idle_cpu(cpu); 372 | +#ifndef CONFIG_BLD 373 | trigger_load_balance(rq); 374 | #endif 375 | +#endif 376 | rq_last_tick_reset(rq); 377 | } 378 | 379 | @@ -6990,6 +7021,15 @@ void __init sched_init(void) 380 | #endif 381 | init_rq_hrtick(rq); 382 | atomic_set(&rq->nr_iowait, 0); 383 | +#ifdef CONFIG_BLD 384 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 385 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 386 | + rq->cfs.pos = 0; 387 | + 388 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 389 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 390 | + rq->rt.lowbit = INT_MAX; 391 | +#endif 392 | } 393 | 394 | set_load_weight(&init_task); 395 | @@ -7030,6 +7070,9 @@ void __init sched_init(void) 396 | init_sched_fair_class(); 397 | 398 | scheduler_running = 1; 399 | +#ifdef CONFIG_BLD 400 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 401 | +#endif 402 | } 403 | 404 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 405 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 406 | index fea7d33..651aa1d 100644 407 | --- a/kernel/sched/fair.c 408 | +++ b/kernel/sched/fair.c 409 | @@ -4101,6 +4101,7 @@ static void task_waking_fair(struct task_struct *p) 410 | record_wakee(p); 411 | } 412 | 413 | +#ifndef CONFIG_BLD 414 | #ifdef CONFIG_FAIR_GROUP_SCHED 415 | /* 416 | * effective_load() calculates the load change as seen from the root_task_group 417 | @@ -4550,6 +4551,7 @@ unlock: 418 | 419 | return new_cpu; 420 | } 421 | +#endif /* CONFIG_BLD */ 422 | 423 | /* 424 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 425 | @@ -4845,6 +4847,7 @@ simple: 426 | return p; 427 | 428 | idle: 429 | 
+#ifndef CONFIG_BLD 430 | new_tasks = idle_balance(rq); 431 | /* 432 | * Because idle_balance() releases (and re-acquires) rq->lock, it is 433 | @@ -4856,7 +4859,7 @@ idle: 434 | 435 | if (new_tasks > 0) 436 | goto again; 437 | - 438 | +#endif 439 | return NULL; 440 | } 441 | 442 | @@ -6931,12 +6934,40 @@ static inline int on_null_domain(struct rq *rq) 443 | * needed, they will kick the idle load balancer, which then does idle 444 | * load balancing for all the idle CPUs. 445 | */ 446 | +#ifndef CONFIG_BLD 447 | static struct { 448 | cpumask_var_t idle_cpus_mask; 449 | atomic_t nr_cpus; 450 | unsigned long next_balance; /* in jiffy units */ 451 | } nohz ____cacheline_aligned; 452 | 453 | +static inline void nohz_balance_exit_idle(int cpu) 454 | +{ 455 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 456 | + /* 457 | + * Completely isolated CPUs don't ever set, so we must test. 458 | + */ 459 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 460 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 461 | + atomic_dec(&nohz.nr_cpus); 462 | + } 463 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 464 | + } 465 | +} 466 | + 467 | +static int sched_ilb_notifier(struct notifier_block *nfb, 468 | + unsigned long action, void *hcpu) 469 | +{ 470 | + switch (action & ~CPU_TASKS_FROZEN) { 471 | + case CPU_DYING: 472 | + nohz_balance_exit_idle(smp_processor_id()); 473 | + return NOTIFY_OK; 474 | + default: 475 | + return NOTIFY_DONE; 476 | + } 477 | +} 478 | +#endif /* CONFIG_BLD */ 479 | + 480 | static inline int find_new_ilb(void) 481 | { 482 | int ilb = cpumask_first(nohz.idle_cpus_mask); 483 | @@ -6975,20 +7006,6 @@ static void nohz_balancer_kick(void) 484 | return; 485 | } 486 | 487 | -static inline void nohz_balance_exit_idle(int cpu) 488 | -{ 489 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 490 | - /* 491 | - * Completely isolated CPUs don't ever set, so we must test. 492 | - */ 493 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 494 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 495 | - atomic_dec(&nohz.nr_cpus); 496 | - } 497 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 498 | - } 499 | -} 500 | - 501 | static inline void set_cpu_sd_state_busy(void) 502 | { 503 | struct sched_domain *sd; 504 | @@ -7029,6 +7046,7 @@ unlock: 505 | */ 506 | void nohz_balance_enter_idle(int cpu) 507 | { 508 | +#ifndef CONFIG_BLD 509 | /* 510 | * If this cpu is going down, then nothing needs to be done. 511 | */ 512 | @@ -7047,23 +7065,10 @@ void nohz_balance_enter_idle(int cpu) 513 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 514 | atomic_inc(&nohz.nr_cpus); 515 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 516 | -} 517 | - 518 | -static int sched_ilb_notifier(struct notifier_block *nfb, 519 | - unsigned long action, void *hcpu) 520 | -{ 521 | - switch (action & ~CPU_TASKS_FROZEN) { 522 | - case CPU_DYING: 523 | - nohz_balance_exit_idle(smp_processor_id()); 524 | - return NOTIFY_OK; 525 | - default: 526 | - return NOTIFY_DONE; 527 | - } 528 | +#endif 529 | } 530 | #endif 531 | 532 | -static DEFINE_SPINLOCK(balancing); 533 | - 534 | /* 535 | * Scale the max load_balance interval with the number of CPUs in the system. 536 | * This trades load-balance latency on larger machines for less cross talk. 
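/*
 * Annotation, not part of the diff: with CONFIG_BLD enabled the hunks around
 * this point compile out the mainline balancing machinery -- ttwu_queue_remote(),
 * the SCHED_SOFTIRQ rebalance path, idle_balance() and the nohz idle-balance
 * bookkeeping -- because, for non-deadline tasks, BLD already picks the CPU in
 * select_task_rq() and sched_exec() from the load-ordered runqueue lists that
 * bld.h maintains on every enqueue_task()/dequeue_task().
 */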
537 | @@ -7073,6 +7078,9 @@ void update_max_interval(void) 538 | max_load_balance_interval = HZ*num_online_cpus()/10; 539 | } 540 | 541 | +#ifndef CONFIG_BLD 542 | +static DEFINE_SPINLOCK(balancing); 543 | + 544 | /* 545 | * It checks each scheduling domain to see if it is due to be balanced, 546 | * and initiates a balancing operation if so. 547 | @@ -7321,6 +7329,7 @@ void trigger_load_balance(struct rq *rq) 548 | nohz_balancer_kick(); 549 | #endif 550 | } 551 | +#endif /* CONFIG_BLD */ 552 | 553 | static void rq_online_fair(struct rq *rq) 554 | { 555 | @@ -7764,7 +7773,9 @@ const struct sched_class fair_sched_class = { 556 | .put_prev_task = put_prev_task_fair, 557 | 558 | #ifdef CONFIG_SMP 559 | +#ifndef CONFIG_BLD 560 | .select_task_rq = select_task_rq_fair, 561 | +#endif 562 | .migrate_task_rq = migrate_task_rq_fair, 563 | 564 | .rq_online = rq_online_fair, 565 | @@ -7802,6 +7813,7 @@ void print_cfs_stats(struct seq_file *m, int cpu) 566 | 567 | __init void init_sched_fair_class(void) 568 | { 569 | +#ifndef CONFIG_BLD 570 | #ifdef CONFIG_SMP 571 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 572 | 573 | @@ -7811,5 +7823,5 @@ __init void init_sched_fair_class(void) 574 | cpu_notifier(sched_ilb_notifier, 0); 575 | #endif 576 | #endif /* SMP */ 577 | - 578 | +#endif /* BLD */ 579 | } 580 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 581 | index a490831..c9d22c3 100644 582 | --- a/kernel/sched/rt.c 583 | +++ b/kernel/sched/rt.c 584 | @@ -1291,6 +1291,7 @@ static void yield_task_rt(struct rq *rq) 585 | #ifdef CONFIG_SMP 586 | static int find_lowest_rq(struct task_struct *task); 587 | 588 | +#ifndef CONFIG_BLD 589 | static int 590 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 591 | { 592 | @@ -1344,6 +1345,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 593 | out: 594 | return cpu; 595 | } 596 | +#endif /* CONFIG_BLD */ 597 | 598 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 599 | { 600 | @@ -2108,7 +2110,9 @@ const struct sched_class rt_sched_class = { 601 | .put_prev_task = put_prev_task_rt, 602 | 603 | #ifdef CONFIG_SMP 604 | +#ifndef CONFIG_BLD 605 | .select_task_rq = select_task_rq_rt, 606 | +#endif 607 | 608 | .set_cpus_allowed = set_cpus_allowed_rt, 609 | .rq_online = rq_online_rt, 610 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 611 | index 31cc02e..1c497d2 100644 612 | --- a/kernel/sched/sched.h 613 | +++ b/kernel/sched/sched.h 614 | @@ -358,9 +358,8 @@ struct cfs_rq { 615 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 616 | #endif /* CONFIG_SMP */ 617 | 618 | -#ifdef CONFIG_FAIR_GROUP_SCHED 619 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 620 | - 621 | +#ifdef CONFIG_FAIR_GROUP_SCHED 622 | /* 623 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 624 | * a hierarchy). 
Non-leaf lrqs hold other higher schedulable entities 625 | @@ -384,6 +383,11 @@ struct cfs_rq { 626 | struct list_head throttled_list; 627 | #endif /* CONFIG_CFS_BANDWIDTH */ 628 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 629 | + 630 | +#ifdef CONFIG_BLD 631 | + struct list_head bld_cfs_list; 632 | + char pos; 633 | +#endif 634 | }; 635 | 636 | static inline int rt_bandwidth_enabled(void) 637 | @@ -417,12 +421,16 @@ struct rt_rq { 638 | /* Nests inside the rq lock: */ 639 | raw_spinlock_t rt_runtime_lock; 640 | 641 | + struct rq *rq; 642 | #ifdef CONFIG_RT_GROUP_SCHED 643 | unsigned long rt_nr_boosted; 644 | 645 | - struct rq *rq; 646 | struct task_group *tg; 647 | #endif 648 | +#ifdef CONFIG_BLD 649 | + struct list_head bld_rt_list; 650 | + int lowbit; 651 | +#endif 652 | }; 653 | 654 | /* Deadline class' related fields in a runqueue */ 655 | -------------------------------------------------------------------------------- /BLD-3.19.patch: -------------------------------------------------------------------------------- 1 | BLD patch for Linux-3.19. Rebased on for Linux 3.19. 2 | 3 | diff --git a/init/Kconfig b/init/Kconfig 4 | index 9afb971..062ca7f 100644 5 | --- a/init/Kconfig 6 | +++ b/init/Kconfig 7 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 8 | depends on BROKEN || !SMP 9 | default y 10 | 11 | +config BLD 12 | + bool "An alternate CPU load distribution technique for task scheduler" 13 | + depends on SMP 14 | + default y 15 | + help 16 | + This is an alternate CPU load distribution technique based for task 17 | + scheduler based on The Barbershop Load Distribution algorithm. Not 18 | + suitable for NUMA, should work well on SMP. 19 | + 20 | config INIT_ENV_ARG_LIMIT 21 | int 22 | default 32 if !UML 23 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 24 | new file mode 100644 25 | index 0000000..f1f9fba 26 | --- /dev/null 27 | +++ b/kernel/sched/bld.h 28 | @@ -0,0 +1,215 @@ 29 | +#ifdef CONFIG_BLD 30 | + 31 | +static DEFINE_RWLOCK(rt_list_lock); 32 | +static LIST_HEAD(rt_rq_head); 33 | +static LIST_HEAD(cfs_rq_head); 34 | +static DEFINE_RWLOCK(cfs_list_lock); 35 | + 36 | +#ifdef CONFIG_FAIR_GROUP_SCHED 37 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 38 | +{ 39 | + return cfs_rq->rq; 40 | +} 41 | +#else 42 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 43 | +{ 44 | + return container_of(cfs_rq, struct rq, cfs); 45 | +} 46 | +#endif 47 | + 48 | +#ifdef CONFIG_RT_GROUP_SCHED 49 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 50 | +{ 51 | + return rt_rq->rq; 52 | +} 53 | +#else 54 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 55 | +{ 56 | + return container_of(rt_rq, struct rq, rt); 57 | +} 58 | +#endif 59 | + 60 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 61 | +{ 62 | + int cpu = smp_processor_id(), i; 63 | + unsigned long load, varload; 64 | + struct rq *rq; 65 | + 66 | + if (task_type) { 67 | + varload = ULONG_MAX; 68 | + for_each_cpu(i, mask) { 69 | + rq = cpu_rq(i); 70 | + load = rq->cfs.load.weight; 71 | + if (load < varload) { 72 | + varload = load; 73 | + cpu = i; 74 | + } 75 | + } 76 | + } else { 77 | + /* Here's an attempt to get a CPU within the mask where 78 | + * we can preempt easily. To achieve this we tried to 79 | + * maintain a lowbit, which indicate the lowest bit set on 80 | + * array bitmap. Since all CPUs contains high priority 81 | + * kernel threads therefore we eliminate 0, so it might not 82 | + * be right every time, but it's just an indicator. 
83 | + */ 84 | + varload = 1; 85 | + 86 | + for_each_cpu(i, mask) { 87 | + rq = cpu_rq(i); 88 | + load = rq->rt.lowbit; 89 | + if (load >= varload) { 90 | + varload = load; 91 | + cpu = i; 92 | + } 93 | + } 94 | + } 95 | + 96 | + return cpu; 97 | +} 98 | + 99 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 100 | +{ 101 | + struct cfs_rq *cfs; 102 | + unsigned long flags; 103 | + unsigned int cpu = smp_processor_id(); 104 | + 105 | + read_lock_irqsave(&cfs_list_lock, flags); 106 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 107 | + cpu = cpu_of(rq_of_cfs(cfs)); 108 | + if (cpu_online(cpu)) 109 | + break; 110 | + } 111 | + read_unlock_irqrestore(&cfs_list_lock, flags); 112 | + return cpu; 113 | +} 114 | + 115 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 116 | +{ 117 | + struct rt_rq *rt; 118 | + unsigned long flags; 119 | + unsigned int cpu = smp_processor_id(); 120 | + 121 | + read_lock_irqsave(&rt_list_lock, flags); 122 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 123 | + cpu = cpu_of(rq_of_rt(rt)); 124 | + if (cpu_online(cpu)) 125 | + break; 126 | + } 127 | + read_unlock_irqrestore(&rt_list_lock, flags); 128 | + return cpu; 129 | +} 130 | + 131 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 132 | +{ 133 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 134 | + struct cpumask *tmpmask; 135 | + 136 | + if (p->nr_cpus_allowed == 1) 137 | + return task_cpu(p); 138 | + 139 | + if (sd_flags & SD_BALANCE_WAKE) { 140 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 141 | + want_affine = 1; 142 | + } 143 | + } 144 | + 145 | + if (want_affine) 146 | + tmpmask = tsk_cpus_allowed(p); 147 | + else 148 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 149 | + 150 | + if (rt_task(p)) 151 | + cpu = select_cpu_for_wakeup(0, tmpmask); 152 | + else 153 | + cpu = select_cpu_for_wakeup(1, tmpmask); 154 | + 155 | + return cpu; 156 | +} 157 | + 158 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 159 | +{ 160 | + unsigned long flag; 161 | + int firstbit; 162 | + struct rt_rq *first; 163 | + struct rt_prio_array *array = &rq->rt.active; 164 | + 165 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 166 | + firstbit = sched_find_first_bit(array->bitmap); 167 | + 168 | + /* Maintaining rt.lowbit */ 169 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 170 | + rq->rt.lowbit = firstbit; 171 | + 172 | + if (rq->rt.lowbit < first->lowbit) { 173 | + write_lock_irqsave(&rt_list_lock, flag); 174 | + list_del(&rq->rt.bld_rt_list); 175 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 176 | + write_unlock_irqrestore(&rt_list_lock, flag); 177 | + } 178 | +} 179 | + 180 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 181 | +{ 182 | + unsigned int cpu; 183 | + 184 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 185 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 186 | + else { 187 | + if (rt_task(p)) 188 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 189 | + else 190 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 191 | + } 192 | + 193 | + return cpu; 194 | +} 195 | + 196 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 197 | +{ 198 | + unsigned long flag; 199 | + if (rt_task(p)) { 200 | + track_load_rt(rq, p); 201 | + } else { 202 | + if (rq->cfs.pos != 2) { 203 | + struct cfs_rq *last; 204 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 205 | + if (rq->cfs.load.weight >= last->load.weight) { 206 | + write_lock_irqsave(&cfs_list_lock, flag); 207 | + list_del(&rq->cfs.bld_cfs_list); 208 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 209 | + rq->cfs.pos = 2; last->pos = 1; 210 | + write_unlock_irqrestore(&cfs_list_lock, flag); 211 | + } 212 | + } 213 | + } 214 | +} 215 | + 216 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 217 | +{ 218 | + unsigned long flag; 219 | + if (rt_task(p)) { 220 | + track_load_rt(rq, p); 221 | + } else { 222 | + if (rq->cfs.pos != 0) { 223 | + struct cfs_rq *first; 224 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 225 | + if (rq->cfs.load.weight <= first->load.weight) { 226 | + write_lock_irqsave(&cfs_list_lock, flag); 227 | + list_del(&rq->cfs.bld_cfs_list); 228 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 229 | + rq->cfs.pos = 0; first->pos = 1; 230 | + write_unlock_irqrestore(&cfs_list_lock, flag); 231 | + } 232 | + } 233 | + } 234 | +} 235 | +#else 236 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 237 | +{ 238 | +} 239 | + 240 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 241 | +{ 242 | +} 243 | +#endif /* CONFIG_BLD */ 244 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 245 | index 5eab11d..ab18e8d 100644 246 | --- a/kernel/sched/core.c 247 | +++ b/kernel/sched/core.c 248 | @@ -24,6 +24,8 @@ 249 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 250 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 251 | * Thomas Gleixner, Mike Kravetz 252 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 253 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
254 | */ 255 | 256 | #include 257 | @@ -86,6 +88,7 @@ 258 | #include "sched.h" 259 | #include "../workqueue_internal.h" 260 | #include "../smpboot.h" 261 | +#include "bld.h" 262 | 263 | #define CREATE_TRACE_POINTS 264 | #include 265 | @@ -840,6 +843,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 266 | update_rq_clock(rq); 267 | sched_info_queued(rq, p); 268 | p->sched_class->enqueue_task(rq, p, flags); 269 | + if (!dl_task(p)) 270 | + bld_track_load_activate(rq, p); 271 | } 272 | 273 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 274 | @@ -847,6 +852,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 275 | update_rq_clock(rq); 276 | sched_info_dequeued(rq, p); 277 | p->sched_class->dequeue_task(rq, p, flags); 278 | + if (!dl_task(p)) 279 | + bld_track_load_deactivate(rq, p); 280 | } 281 | 282 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 283 | @@ -1412,7 +1419,14 @@ static inline 284 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 285 | { 286 | if (p->nr_cpus_allowed > 1) 287 | +#ifndef CONFIG_BLD 288 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 289 | +#else 290 | + if(dl_task(p)) 291 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 292 | + else 293 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 294 | +#endif 295 | 296 | /* 297 | * In order not to call set_task_cpu() on a blocking task we need 298 | @@ -1582,7 +1596,11 @@ void scheduler_ipi(void) 299 | */ 300 | preempt_fold_need_resched(); 301 | 302 | +#ifndef CONFIG_BLD 303 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 304 | +#else 305 | + if (llist_empty(&this_rq()->wake_list)) 306 | +#endif 307 | return; 308 | 309 | /* 310 | @@ -1604,13 +1622,16 @@ void scheduler_ipi(void) 311 | /* 312 | * Check if someone kicked us for doing the nohz idle load balance. 313 | */ 314 | +#ifndef CONFIG_BLD 315 | if (unlikely(got_nohz_idle_kick())) { 316 | this_rq()->idle_balance = 1; 317 | raise_softirq_irqoff(SCHED_SOFTIRQ); 318 | } 319 | +#endif 320 | irq_exit(); 321 | } 322 | 323 | +#ifndef CONFIG_BLD 324 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 325 | { 326 | struct rq *rq = cpu_rq(cpu); 327 | @@ -1623,6 +1644,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 328 | } 329 | } 330 | 331 | +#endif 332 | + 333 | +bool cpus_share_cache(int this_cpu, int that_cpu) 334 | +{ 335 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 336 | +} 337 | + 338 | void wake_up_if_idle(int cpu) 339 | { 340 | struct rq *rq = cpu_rq(cpu); 341 | @@ -1646,18 +1674,13 @@ void wake_up_if_idle(int cpu) 342 | out: 343 | rcu_read_unlock(); 344 | } 345 | - 346 | -bool cpus_share_cache(int this_cpu, int that_cpu) 347 | -{ 348 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 349 | -} 350 | #endif /* CONFIG_SMP */ 351 | 352 | static void ttwu_queue(struct task_struct *p, int cpu) 353 | { 354 | struct rq *rq = cpu_rq(cpu); 355 | 356 | -#if defined(CONFIG_SMP) 357 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 358 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 359 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 360 | ttwu_queue_remote(p, cpu); 361 | @@ -1978,7 +2001,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 362 | * Silence PROVE_RCU. 
363 | */ 364 | raw_spin_lock_irqsave(&p->pi_lock, flags); 365 | - set_task_cpu(p, cpu); 366 | + __set_task_cpu(p, cpu); 367 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 368 | 369 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 370 | @@ -2445,7 +2468,14 @@ void sched_exec(void) 371 | int dest_cpu; 372 | 373 | raw_spin_lock_irqsave(&p->pi_lock, flags); 374 | +#ifndef CONFIG_BLD 375 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 376 | +#else 377 | + if (dl_task(p)) 378 | + dest_cpu = task_cpu(p); 379 | + else 380 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 381 | +#endif 382 | if (dest_cpu == smp_processor_id()) 383 | goto unlock; 384 | 385 | @@ -2533,8 +2563,10 @@ void scheduler_tick(void) 386 | 387 | #ifdef CONFIG_SMP 388 | rq->idle_balance = idle_cpu(cpu); 389 | +#ifndef CONFIG_BLD 390 | trigger_load_balance(rq); 391 | #endif 392 | +#endif 393 | rq_last_tick_reset(rq); 394 | } 395 | 396 | @@ -7261,6 +7293,15 @@ void __init sched_init(void) 397 | #endif 398 | init_rq_hrtick(rq); 399 | atomic_set(&rq->nr_iowait, 0); 400 | +#ifdef CONFIG_BLD 401 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 402 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 403 | + rq->cfs.pos = 0; 404 | + 405 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 406 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 407 | + rq->rt.lowbit = INT_MAX; 408 | +#endif 409 | } 410 | 411 | set_load_weight(&init_task); 412 | @@ -7301,6 +7342,9 @@ void __init sched_init(void) 413 | init_sched_fair_class(); 414 | 415 | scheduler_running = 1; 416 | +#ifdef CONFIG_BLD 417 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 418 | +#endif 419 | } 420 | 421 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 422 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 423 | index fe331fc..37d3839 100644 424 | --- a/kernel/sched/fair.c 425 | +++ b/kernel/sched/fair.c 426 | @@ -4351,6 +4351,7 @@ static void task_waking_fair(struct task_struct *p) 427 | record_wakee(p); 428 | } 429 | 430 | +#ifndef CONFIG_BLD 431 | #ifdef CONFIG_FAIR_GROUP_SCHED 432 | /* 433 | * effective_load() calculates the load change as seen from the root_task_group 434 | @@ -4803,6 +4804,7 @@ unlock: 435 | 436 | return new_cpu; 437 | } 438 | +#endif /* CONFIG_BLD */ 439 | 440 | /* 441 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 442 | @@ -5098,6 +5100,7 @@ simple: 443 | return p; 444 | 445 | idle: 446 | +#ifndef CONFIG_BLD 447 | new_tasks = idle_balance(rq); 448 | /* 449 | * Because idle_balance() releases (and re-acquires) rq->lock, it is 450 | @@ -5109,7 +5112,7 @@ idle: 451 | 452 | if (new_tasks > 0) 453 | goto again; 454 | - 455 | +#endif 456 | return NULL; 457 | } 458 | 459 | @@ -7293,12 +7296,39 @@ static inline int on_null_domain(struct rq *rq) 460 | * needed, they will kick the idle load balancer, which then does idle 461 | * load balancing for all the idle CPUs. 462 | */ 463 | +#ifndef CONFIG_BLD 464 | static struct { 465 | cpumask_var_t idle_cpus_mask; 466 | atomic_t nr_cpus; 467 | unsigned long next_balance; /* in jiffy units */ 468 | } nohz ____cacheline_aligned; 469 | 470 | +static inline void nohz_balance_exit_idle(int cpu) 471 | +{ 472 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 473 | + /* 474 | + * Completely isolated CPUs don't ever set, so we must test. 
475 | + */ 476 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 477 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 478 | + atomic_dec(&nohz.nr_cpus); 479 | + } 480 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 481 | + } 482 | +} 483 | + 484 | +static int sched_ilb_notifier(struct notifier_block *nfb, 485 | + unsigned long action, void *hcpu) 486 | +{ 487 | + switch (action & ~CPU_TASKS_FROZEN) { 488 | + case CPU_DYING: 489 | + nohz_balance_exit_idle(smp_processor_id()); 490 | + return NOTIFY_OK; 491 | + default: 492 | + return NOTIFY_DONE; 493 | + } 494 | +} 495 | + 496 | static inline int find_new_ilb(void) 497 | { 498 | int ilb = cpumask_first(nohz.idle_cpus_mask); 499 | @@ -7336,20 +7366,7 @@ static void nohz_balancer_kick(void) 500 | smp_send_reschedule(ilb_cpu); 501 | return; 502 | } 503 | - 504 | -static inline void nohz_balance_exit_idle(int cpu) 505 | -{ 506 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 507 | - /* 508 | - * Completely isolated CPUs don't ever set, so we must test. 509 | - */ 510 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 511 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 512 | - atomic_dec(&nohz.nr_cpus); 513 | - } 514 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 515 | - } 516 | -} 517 | +#endif /* CONFIG_BLD */ 518 | 519 | static inline void set_cpu_sd_state_busy(void) 520 | { 521 | @@ -7391,6 +7408,7 @@ unlock: 522 | */ 523 | void nohz_balance_enter_idle(int cpu) 524 | { 525 | +#ifndef CONFIG_BLD 526 | /* 527 | * If this cpu is going down, then nothing needs to be done. 528 | */ 529 | @@ -7409,23 +7427,10 @@ void nohz_balance_enter_idle(int cpu) 530 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 531 | atomic_inc(&nohz.nr_cpus); 532 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 533 | -} 534 | - 535 | -static int sched_ilb_notifier(struct notifier_block *nfb, 536 | - unsigned long action, void *hcpu) 537 | -{ 538 | - switch (action & ~CPU_TASKS_FROZEN) { 539 | - case CPU_DYING: 540 | - nohz_balance_exit_idle(smp_processor_id()); 541 | - return NOTIFY_OK; 542 | - default: 543 | - return NOTIFY_DONE; 544 | - } 545 | +#endif 546 | } 547 | #endif 548 | 549 | -static DEFINE_SPINLOCK(balancing); 550 | - 551 | /* 552 | * Scale the max load_balance interval with the number of CPUs in the system. 553 | * This trades load-balance latency on larger machines for less cross talk. 554 | @@ -7435,6 +7440,9 @@ void update_max_interval(void) 555 | max_load_balance_interval = HZ*num_online_cpus()/10; 556 | } 557 | 558 | +#ifndef CONFIG_BLD 559 | +static DEFINE_SPINLOCK(balancing); 560 | + 561 | /* 562 | * It checks each scheduling domain to see if it is due to be balanced, 563 | * and initiates a balancing operation if so. 
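/*
 * Annotation, not part of the diff: the lists that replace this balancing
 * live in bld.h.  bld_track_load_activate() moves a cfs_rq to the tail of
 * cfs_rq_head once its weight reaches the current tail's, and
 * bld_track_load_deactivate() moves it back to the head once its weight
 * drops to the current head's, so the list stays roughly ordered from
 * lightest to heaviest and bld_pick_cpu_cfs() can simply return the first
 * online CPU it finds walking from the head.
 */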
564 | @@ -7683,6 +7691,7 @@ void trigger_load_balance(struct rq *rq) 565 | nohz_balancer_kick(); 566 | #endif 567 | } 568 | +#endif /* CONFIG_BLD */ 569 | 570 | static void rq_online_fair(struct rq *rq) 571 | { 572 | @@ -8128,7 +8137,9 @@ const struct sched_class fair_sched_class = { 573 | .put_prev_task = put_prev_task_fair, 574 | 575 | #ifdef CONFIG_SMP 576 | +#ifndef CONFIG_BLD 577 | .select_task_rq = select_task_rq_fair, 578 | +#endif 579 | .migrate_task_rq = migrate_task_rq_fair, 580 | 581 | .rq_online = rq_online_fair, 582 | @@ -8168,6 +8179,7 @@ void print_cfs_stats(struct seq_file *m, int cpu) 583 | 584 | __init void init_sched_fair_class(void) 585 | { 586 | +#ifndef CONFIG_BLD 587 | #ifdef CONFIG_SMP 588 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 589 | 590 | @@ -8177,5 +8189,5 @@ __init void init_sched_fair_class(void) 591 | cpu_notifier(sched_ilb_notifier, 0); 592 | #endif 593 | #endif /* SMP */ 594 | - 595 | +#endif /* BLD */ 596 | } 597 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 598 | index ee15f5a..bfdd0b7 100644 599 | --- a/kernel/sched/rt.c 600 | +++ b/kernel/sched/rt.c 601 | @@ -1295,6 +1295,7 @@ static void yield_task_rt(struct rq *rq) 602 | #ifdef CONFIG_SMP 603 | static int find_lowest_rq(struct task_struct *task); 604 | 605 | +#ifndef CONFIG_BLD 606 | static int 607 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 608 | { 609 | @@ -1345,6 +1346,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 610 | out: 611 | return cpu; 612 | } 613 | +#endif /* CONFIG_BLD */ 614 | 615 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 616 | { 617 | @@ -2114,7 +2116,9 @@ const struct sched_class rt_sched_class = { 618 | .put_prev_task = put_prev_task_rt, 619 | 620 | #ifdef CONFIG_SMP 621 | +#ifndef CONFIG_BLD 622 | .select_task_rq = select_task_rq_rt, 623 | +#endif 624 | 625 | .set_cpus_allowed = set_cpus_allowed_rt, 626 | .rq_online = rq_online_rt, 627 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 628 | index 9a2a45c..24b1c7e 100644 629 | --- a/kernel/sched/sched.h 630 | +++ b/kernel/sched/sched.h 631 | @@ -385,9 +385,8 @@ struct cfs_rq { 632 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 633 | #endif /* CONFIG_SMP */ 634 | 635 | -#ifdef CONFIG_FAIR_GROUP_SCHED 636 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 637 | - 638 | +#ifdef CONFIG_FAIR_GROUP_SCHED 639 | /* 640 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 641 | * a hierarchy). 
Non-leaf lrqs hold other higher schedulable entities 642 | @@ -411,6 +410,11 @@ struct cfs_rq { 643 | struct list_head throttled_list; 644 | #endif /* CONFIG_CFS_BANDWIDTH */ 645 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 646 | + 647 | +#ifdef CONFIG_BLD 648 | + struct list_head bld_cfs_list; 649 | + char pos; 650 | +#endif 651 | }; 652 | 653 | static inline int rt_bandwidth_enabled(void) 654 | @@ -444,12 +448,16 @@ struct rt_rq { 655 | /* Nests inside the rq lock: */ 656 | raw_spinlock_t rt_runtime_lock; 657 | 658 | + struct rq *rq; 659 | #ifdef CONFIG_RT_GROUP_SCHED 660 | unsigned long rt_nr_boosted; 661 | 662 | - struct rq *rq; 663 | struct task_group *tg; 664 | #endif 665 | +#ifdef CONFIG_BLD 666 | + struct list_head bld_rt_list; 667 | + int lowbit; 668 | +#endif 669 | }; 670 | 671 | /* Deadline class' related fields in a runqueue */ 672 | -------------------------------------------------------------------------------- /BLD-3.17.patch: -------------------------------------------------------------------------------- 1 | BLD patch for Linux 3.17, contains a build fix when CONFIG_BLD=n. 2 | Below shows a stat of default netperf run on localhost system 3 | (client/server) running on local system (core i3, 2g ram). 4 | 5 | tcp_stream tcp_rr udp_stream udp_rr 6 | 7 | mainline 9343.54 20812.03 18231.74 24396.074 8 | 18210.71 9 | 10 | bld 14738.35 29224.54 26475.75 34910.08 11 | 26462.53 12 | 13 | These are average of 5 runs of each tests. BLD performs better 14 | and shows ~(35-40)% improvement. And, recently Luis Cruz backports 15 | BLD's previous release BLD-3.16 for Android and experimentally 16 | ran it on his galaxy SIII, these could be found at following link: 17 | 18 | https://github.com/SyNtheticNightmar3/bld-patches 19 | 20 | If you are interested in running it on Android, take a look at the 21 | above link. 22 | 23 | Thanks, 24 | Rakib 25 | 26 | Signed-off-by: Rakib Mullick 27 | --- 28 | 29 | diff --git a/init/Kconfig b/init/Kconfig 30 | index 80a6907..65319c6 100644 31 | --- a/init/Kconfig 32 | +++ b/init/Kconfig 33 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 34 | depends on BROKEN || !SMP 35 | default y 36 | 37 | +config BLD 38 | + bool "An alternate CPU load distribution technique for task scheduler" 39 | + depends on SMP 40 | + default y 41 | + help 42 | + This is an alternate CPU load distribution technique based for task 43 | + scheduler based on The Barbershop Load Distribution algorithm. Not 44 | + suitable for NUMA, should work well on SMP. 
45 | + 46 | config INIT_ENV_ARG_LIMIT 47 | int 48 | default 32 if !UML 49 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 50 | new file mode 100644 51 | index 0000000..097dd23 52 | --- /dev/null 53 | +++ b/kernel/sched/bld.h 54 | @@ -0,0 +1,207 @@ 55 | +#ifdef CONFIG_BLD 56 | + 57 | +static DEFINE_RWLOCK(rt_list_lock); 58 | +static LIST_HEAD(rt_rq_head); 59 | +static LIST_HEAD(cfs_rq_head); 60 | +static DEFINE_RWLOCK(cfs_list_lock); 61 | + 62 | +#ifdef CONFIG_FAIR_GROUP_SCHED 63 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 64 | +{ 65 | + return cfs_rq->rq; 66 | +} 67 | +#else 68 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 69 | +{ 70 | + return container_of(cfs_rq, struct rq, cfs); 71 | +} 72 | +#endif 73 | + 74 | +#ifdef CONFIG_RT_GROUP_SCHED 75 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 76 | +{ 77 | + return rt_rq->rq; 78 | +} 79 | +#else 80 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 81 | +{ 82 | + return container_of(rt_rq, struct rq, rt); 83 | +} 84 | +#endif 85 | + 86 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 87 | +{ 88 | + int cpu = smp_processor_id(), i; 89 | + unsigned long load, min_load = ULONG_MAX; 90 | + struct rq *rq; 91 | + 92 | + if (task_type) { 93 | + for_each_cpu(i, mask) { 94 | + rq = cpu_rq(i); 95 | + load = rq->cfs.load.weight; 96 | + if (load < min_load) { 97 | + min_load = load; 98 | + cpu = i; 99 | + } 100 | + } 101 | + } else { 102 | + min_load = -1; 103 | + 104 | + for_each_cpu(i, mask) { 105 | + rq = cpu_rq(i); 106 | + load = rq->rt.lowbit; 107 | + if (load > min_load) { 108 | + min_load = load; 109 | + cpu = i; 110 | + } 111 | + } 112 | + } 113 | + 114 | + return cpu; 115 | +} 116 | + 117 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 118 | +{ 119 | + struct cfs_rq *cfs; 120 | + unsigned long flags; 121 | + unsigned int cpu = smp_processor_id(); 122 | + 123 | + read_lock_irqsave(&cfs_list_lock, flags); 124 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 125 | + cpu = cpu_of(rq_of_cfs(cfs)); 126 | + if (cpu_online(cpu)) 127 | + break; 128 | + } 129 | + read_unlock_irqrestore(&cfs_list_lock, flags); 130 | + return cpu; 131 | +} 132 | + 133 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 134 | +{ 135 | + struct rt_rq *rt; 136 | + unsigned long flags; 137 | + unsigned int cpu = smp_processor_id(); 138 | + 139 | + read_lock_irqsave(&rt_list_lock, flags); 140 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 141 | + cpu = cpu_of(rq_of_rt(rt)); 142 | + if (cpu_online(cpu)) 143 | + break; 144 | + } 145 | + read_unlock_irqrestore(&rt_list_lock, flags); 146 | + return cpu; 147 | +} 148 | + 149 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 150 | +{ 151 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 152 | + struct cpumask *tmpmask; 153 | + 154 | + if (p->nr_cpus_allowed == 1) 155 | + return task_cpu(p); 156 | + 157 | + if (sd_flags & SD_BALANCE_WAKE) { 158 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 159 | + want_affine = 1; 160 | + } 161 | + } 162 | + 163 | + if (want_affine) 164 | + tmpmask = tsk_cpus_allowed(p); 165 | + else 166 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 167 | + 168 | + if (rt_task(p)) 169 | + cpu = select_cpu_for_wakeup(0, tmpmask); 170 | + else 171 | + cpu = select_cpu_for_wakeup(1, tmpmask); 172 | + 173 | + return cpu; 174 | +} 175 | + 176 | +static void track_load_rt(struct rq *rq, struct 
task_struct *p) 177 | +{ 178 | + unsigned long flag; 179 | + int firstbit; 180 | + struct rt_rq *first; 181 | + struct rt_prio_array *array = &rq->rt.active; 182 | + 183 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 184 | + firstbit = sched_find_first_bit(array->bitmap); 185 | + 186 | + /* Maintaining rt.lowbit */ 187 | + if (firstbit <= rq->rt.lowbit) 188 | + rq->rt.lowbit = p->prio; 189 | + 190 | + if (rq->rt.lowbit < first->lowbit) { 191 | + write_lock_irqsave(&rt_list_lock, flag); 192 | + list_del(&rq->rt.bld_rt_list); 193 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 194 | + write_unlock_irqrestore(&rt_list_lock, flag); 195 | + } 196 | +} 197 | + 198 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 199 | +{ 200 | + unsigned int cpu; 201 | + 202 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 203 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 204 | + else { 205 | + if (rt_task(p)) 206 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 207 | + else 208 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 209 | + } 210 | + 211 | + return cpu; 212 | +} 213 | + 214 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 2) { 221 | + struct cfs_rq *last; 222 | + last = list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight >= last->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 2; last->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | + 234 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | + unsigned long flag; 237 | + if (rt_task(p)) { 238 | + track_load_rt(rq, p); 239 | + } else { 240 | + if (rq->cfs.pos != 0) { 241 | + struct cfs_rq *first; 242 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 243 | + if (rq->cfs.load.weight <= first->load.weight) { 244 | + write_lock_irqsave(&cfs_list_lock, flag); 245 | + list_del(&rq->cfs.bld_cfs_list); 246 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 247 | + rq->cfs.pos = 0; first->pos = 1; 248 | + write_unlock_irqrestore(&cfs_list_lock, flag); 249 | + } 250 | + } 251 | + } 252 | +} 253 | +#else 254 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 255 | +{ 256 | +} 257 | + 258 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 259 | +{ 260 | +} 261 | +#endif /* CONFIG_BLD */ 262 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 263 | index ec1a286..e2c3cef 100644 264 | --- a/kernel/sched/core.c 265 | +++ b/kernel/sched/core.c 266 | @@ -24,6 +24,8 @@ 267 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 268 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 269 | * Thomas Gleixner, Mike Kravetz 270 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 271 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
272 | */ 273 | 274 | #include 275 | @@ -86,6 +88,7 @@ 276 | #include "sched.h" 277 | #include "../workqueue_internal.h" 278 | #include "../smpboot.h" 279 | +#include "bld.h" 280 | 281 | #define CREATE_TRACE_POINTS 282 | #include 283 | @@ -842,6 +845,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 284 | update_rq_clock(rq); 285 | sched_info_queued(rq, p); 286 | p->sched_class->enqueue_task(rq, p, flags); 287 | + if (!dl_task(p)) 288 | + bld_track_load_activate(rq, p); 289 | } 290 | 291 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 292 | @@ -849,6 +854,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 293 | update_rq_clock(rq); 294 | sched_info_dequeued(rq, p); 295 | p->sched_class->dequeue_task(rq, p, flags); 296 | + if (!dl_task(p)) 297 | + bld_track_load_deactivate(rq, p); 298 | } 299 | 300 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 301 | @@ -1409,7 +1416,14 @@ out: 302 | static inline 303 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 304 | { 305 | +#ifndef CONFIG_BLD 306 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 307 | +#else 308 | + if (dl_task(p)) 309 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 310 | + else 311 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 312 | +#endif 313 | 314 | /* 315 | * In order not to call set_task_cpu() on a blocking task we need 316 | @@ -1579,7 +1593,11 @@ void scheduler_ipi(void) 317 | */ 318 | preempt_fold_need_resched(); 319 | 320 | +#ifndef CONFIG_BLD 321 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 322 | +#else 323 | + if (llist_empty(&this_rq()->wake_list)) 324 | +#endif 325 | return; 326 | 327 | /* 328 | @@ -1601,13 +1619,16 @@ void scheduler_ipi(void) 329 | /* 330 | * Check if someone kicked us for doing the nohz idle load balance. 331 | */ 332 | +#ifndef CONFIG_BLD 333 | if (unlikely(got_nohz_idle_kick())) { 334 | this_rq()->idle_balance = 1; 335 | raise_softirq_irqoff(SCHED_SOFTIRQ); 336 | } 337 | +#endif 338 | irq_exit(); 339 | } 340 | 341 | +#ifndef CONFIG_BLD 342 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 343 | { 344 | struct rq *rq = cpu_rq(cpu); 345 | @@ -1619,6 +1640,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 346 | trace_sched_wake_idle_without_ipi(cpu); 347 | } 348 | } 349 | +#endif 350 | 351 | bool cpus_share_cache(int this_cpu, int that_cpu) 352 | { 353 | @@ -1630,7 +1652,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) 354 | { 355 | struct rq *rq = cpu_rq(cpu); 356 | 357 | -#if defined(CONFIG_SMP) 358 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 359 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 360 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 361 | ttwu_queue_remote(p, cpu); 362 | @@ -1938,7 +1960,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 363 | * Silence PROVE_RCU. 
364 | */ 365 | raw_spin_lock_irqsave(&p->pi_lock, flags); 366 | - set_task_cpu(p, cpu); 367 | + __set_task_cpu(p, cpu); 368 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 369 | 370 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 371 | @@ -2413,7 +2435,14 @@ void sched_exec(void) 372 | int dest_cpu; 373 | 374 | raw_spin_lock_irqsave(&p->pi_lock, flags); 375 | +#ifndef CONFIG_BLD 376 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 377 | +#else 378 | + if (dl_task(p)) 379 | + dest_cpu = task_cpu(p); 380 | + else 381 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 382 | +#endif 383 | if (dest_cpu == smp_processor_id()) 384 | goto unlock; 385 | 386 | @@ -2530,8 +2559,10 @@ void scheduler_tick(void) 387 | 388 | #ifdef CONFIG_SMP 389 | rq->idle_balance = idle_cpu(cpu); 390 | +#ifndef CONFIG_BLD 391 | trigger_load_balance(rq); 392 | #endif 393 | +#endif 394 | rq_last_tick_reset(rq); 395 | } 396 | 397 | @@ -7030,6 +7061,15 @@ void __init sched_init(void) 398 | #endif 399 | init_rq_hrtick(rq); 400 | atomic_set(&rq->nr_iowait, 0); 401 | +#ifdef CONFIG_BLD 402 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 403 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 404 | + rq->cfs.pos = 0; 405 | + 406 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 407 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 408 | + rq->rt.lowbit = INT_MAX; 409 | +#endif 410 | } 411 | 412 | set_load_weight(&init_task); 413 | @@ -7070,6 +7110,9 @@ void __init sched_init(void) 414 | init_sched_fair_class(); 415 | 416 | scheduler_running = 1; 417 | +#ifdef CONFIG_BLD 418 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 419 | +#endif 420 | } 421 | 422 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 423 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 424 | index bfa3c86..20fc00c 100644 425 | --- a/kernel/sched/fair.c 426 | +++ b/kernel/sched/fair.c 427 | @@ -4136,6 +4136,7 @@ static void task_waking_fair(struct task_struct *p) 428 | record_wakee(p); 429 | } 430 | 431 | +#ifndef CONFIG_BLD 432 | #ifdef CONFIG_FAIR_GROUP_SCHED 433 | /* 434 | * effective_load() calculates the load change as seen from the root_task_group 435 | @@ -4585,6 +4586,7 @@ unlock: 436 | 437 | return new_cpu; 438 | } 439 | +#endif /* CONFIG_BLD */ 440 | 441 | /* 442 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 443 | @@ -4880,6 +4882,7 @@ simple: 444 | return p; 445 | 446 | idle: 447 | +#ifndef CONFIG_BLD 448 | new_tasks = idle_balance(rq); 449 | /* 450 | * Because idle_balance() releases (and re-acquires) rq->lock, it is 451 | @@ -4891,7 +4894,7 @@ idle: 452 | 453 | if (new_tasks > 0) 454 | goto again; 455 | - 456 | +#endif 457 | return NULL; 458 | } 459 | 460 | @@ -6981,12 +6984,39 @@ static inline int on_null_domain(struct rq *rq) 461 | * needed, they will kick the idle load balancer, which then does idle 462 | * load balancing for all the idle CPUs. 463 | */ 464 | +#ifndef CONFIG_BLD 465 | static struct { 466 | cpumask_var_t idle_cpus_mask; 467 | atomic_t nr_cpus; 468 | unsigned long next_balance; /* in jiffy units */ 469 | } nohz ____cacheline_aligned; 470 | 471 | +static inline void nohz_balance_exit_idle(int cpu) 472 | +{ 473 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 474 | + /* 475 | + * Completely isolated CPUs don't ever set, so we must test. 
476 | + */ 477 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 478 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 479 | + atomic_dec(&nohz.nr_cpus); 480 | + } 481 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 482 | + } 483 | +} 484 | + 485 | +static int sched_ilb_notifier(struct notifier_block *nfb, 486 | + unsigned long action, void *hcpu) 487 | +{ 488 | + switch (action & ~CPU_TASKS_FROZEN) { 489 | + case CPU_DYING: 490 | + nohz_balance_exit_idle(smp_processor_id()); 491 | + return NOTIFY_OK; 492 | + default: 493 | + return NOTIFY_DONE; 494 | + } 495 | +} 496 | + 497 | static inline int find_new_ilb(void) 498 | { 499 | int ilb = cpumask_first(nohz.idle_cpus_mask); 500 | @@ -7024,20 +7054,7 @@ static void nohz_balancer_kick(void) 501 | smp_send_reschedule(ilb_cpu); 502 | return; 503 | } 504 | - 505 | -static inline void nohz_balance_exit_idle(int cpu) 506 | -{ 507 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 508 | - /* 509 | - * Completely isolated CPUs don't ever set, so we must test. 510 | - */ 511 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 512 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 513 | - atomic_dec(&nohz.nr_cpus); 514 | - } 515 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 516 | - } 517 | -} 518 | +#endif /* CONFIG_BLD */ 519 | 520 | static inline void set_cpu_sd_state_busy(void) 521 | { 522 | @@ -7079,6 +7096,7 @@ unlock: 523 | */ 524 | void nohz_balance_enter_idle(int cpu) 525 | { 526 | +#ifndef CONFIG_BLD 527 | /* 528 | * If this cpu is going down, then nothing needs to be done. 529 | */ 530 | @@ -7097,23 +7115,10 @@ void nohz_balance_enter_idle(int cpu) 531 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 532 | atomic_inc(&nohz.nr_cpus); 533 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 534 | -} 535 | - 536 | -static int sched_ilb_notifier(struct notifier_block *nfb, 537 | - unsigned long action, void *hcpu) 538 | -{ 539 | - switch (action & ~CPU_TASKS_FROZEN) { 540 | - case CPU_DYING: 541 | - nohz_balance_exit_idle(smp_processor_id()); 542 | - return NOTIFY_OK; 543 | - default: 544 | - return NOTIFY_DONE; 545 | - } 546 | +#endif 547 | } 548 | #endif 549 | 550 | -static DEFINE_SPINLOCK(balancing); 551 | - 552 | /* 553 | * Scale the max load_balance interval with the number of CPUs in the system. 554 | * This trades load-balance latency on larger machines for less cross talk. 555 | @@ -7123,6 +7128,9 @@ void update_max_interval(void) 556 | max_load_balance_interval = HZ*num_online_cpus()/10; 557 | } 558 | 559 | +#ifndef CONFIG_BLD 560 | +static DEFINE_SPINLOCK(balancing); 561 | + 562 | /* 563 | * It checks each scheduling domain to see if it is due to be balanced, 564 | * and initiates a balancing operation if so. 
565 | @@ -7371,6 +7379,7 @@ void trigger_load_balance(struct rq *rq) 566 | nohz_balancer_kick(); 567 | #endif 568 | } 569 | +#endif /* CONFIG_BLD */ 570 | 571 | static void rq_online_fair(struct rq *rq) 572 | { 573 | @@ -7816,7 +7825,9 @@ const struct sched_class fair_sched_class = { 574 | .put_prev_task = put_prev_task_fair, 575 | 576 | #ifdef CONFIG_SMP 577 | +#ifndef CONFIG_BLD 578 | .select_task_rq = select_task_rq_fair, 579 | +#endif 580 | .migrate_task_rq = migrate_task_rq_fair, 581 | 582 | .rq_online = rq_online_fair, 583 | @@ -7854,6 +7865,7 @@ void print_cfs_stats(struct seq_file *m, int cpu) 584 | 585 | __init void init_sched_fair_class(void) 586 | { 587 | +#ifndef CONFIG_BLD 588 | #ifdef CONFIG_SMP 589 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 590 | 591 | @@ -7863,5 +7875,5 @@ __init void init_sched_fair_class(void) 592 | cpu_notifier(sched_ilb_notifier, 0); 593 | #endif 594 | #endif /* SMP */ 595 | - 596 | +#endif /* BLD */ 597 | } 598 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 599 | index 5f6edca..ea0946c 100644 600 | --- a/kernel/sched/rt.c 601 | +++ b/kernel/sched/rt.c 602 | @@ -1295,6 +1295,7 @@ static void yield_task_rt(struct rq *rq) 603 | #ifdef CONFIG_SMP 604 | static int find_lowest_rq(struct task_struct *task); 605 | 606 | +#ifndef CONFIG_BLD 607 | static int 608 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 609 | { 610 | @@ -1348,6 +1349,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 611 | out: 612 | return cpu; 613 | } 614 | +#endif /* CONFIG_BLD */ 615 | 616 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 617 | { 618 | @@ -2112,7 +2114,9 @@ const struct sched_class rt_sched_class = { 619 | .put_prev_task = put_prev_task_rt, 620 | 621 | #ifdef CONFIG_SMP 622 | +#ifndef CONFIG_BLD 623 | .select_task_rq = select_task_rq_rt, 624 | +#endif 625 | 626 | .set_cpus_allowed = set_cpus_allowed_rt, 627 | .rq_online = rq_online_rt, 628 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 629 | index 579712f..a00914d 100644 630 | --- a/kernel/sched/sched.h 631 | +++ b/kernel/sched/sched.h 632 | @@ -358,9 +358,8 @@ struct cfs_rq { 633 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 634 | #endif /* CONFIG_SMP */ 635 | 636 | -#ifdef CONFIG_FAIR_GROUP_SCHED 637 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 638 | - 639 | +#ifdef CONFIG_FAIR_GROUP_SCHED 640 | /* 641 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 642 | * a hierarchy). 
Non-leaf lrqs hold other higher schedulable entities 643 | @@ -384,6 +383,11 @@ struct cfs_rq { 644 | struct list_head throttled_list; 645 | #endif /* CONFIG_CFS_BANDWIDTH */ 646 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 647 | + 648 | +#ifdef CONFIG_BLD 649 | + struct list_head bld_cfs_list; 650 | + char pos; 651 | +#endif 652 | }; 653 | 654 | static inline int rt_bandwidth_enabled(void) 655 | @@ -417,12 +421,16 @@ struct rt_rq { 656 | /* Nests inside the rq lock: */ 657 | raw_spinlock_t rt_runtime_lock; 658 | 659 | + struct rq *rq; 660 | #ifdef CONFIG_RT_GROUP_SCHED 661 | unsigned long rt_nr_boosted; 662 | 663 | - struct rq *rq; 664 | struct task_group *tg; 665 | #endif 666 | +#ifdef CONFIG_BLD 667 | + struct list_head bld_rt_list; 668 | + int lowbit; 669 | +#endif 670 | }; 671 | 672 | /* Deadline class' related fields in a runqueue */ 673 | -------------------------------------------------------------------------------- /BLD-3.18.patch: -------------------------------------------------------------------------------- 1 | BLD patch for Linux-3.18. Changes since previous release: 2 | 3 | * Contains changes to address an issue at wakeup time of 4 | rt tasks, reported by Peter Junos. 5 | 6 | BLD also has a small positive impact on vmlinux size, as the 7 | following shows: 8 | 9 | $ cat size.mainline 10 | text data bss dec hex filename 11 | 12769041 2056008 11722752 26547801 1951659 vmlinux 12 | 13 | $ cat size.bld 14 | text data bss dec hex filename 15 | 12755462 2056040 11722752 26534254 194e16e vmlinux 16 | 17 | The config used can be found here: 18 | 19 | https://raw.githubusercontent.com/rmullick/bld-patches/master/config.benchmark-3.17 20 | 21 | Since the previous release, Mike Galbraith has shown that BLD can 22 | reduce throughput significantly, due to L2 misses, on systems where 23 | no L3 cache is available; that issue is yet to be addressed. I lack 24 | that kind of hardware, so it might take some time. 25 | 26 | Thanks, 27 | Rakib 28 | 29 | --- 30 | 31 | diff --git a/init/Kconfig b/init/Kconfig 32 | index 2081a4d..becfd85 100644 33 | --- a/init/Kconfig 34 | +++ b/init/Kconfig 35 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 36 | depends on BROKEN || !SMP 37 | default y 38 | 39 | +config BLD 40 | + bool "An alternate CPU load distribution technique for task scheduler" 41 | + depends on SMP 42 | + default y 43 | + help 44 | + This is an alternate CPU load distribution technique based for task 45 | + scheduler based on The Barbershop Load Distribution algorithm. Not 46 | + suitable for NUMA, should work well on SMP.
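For readers new to the algorithm named in the help text, the core idea is easy to sketch outside the kernel. The toy program below is an illustrative userspace model only (made-up load values and a linear scan instead of the patch's ordered runqueue list): every waking task is handed to the currently least-loaded CPU, which is the "barbershop" queue discipline the patch applies to struct rq objects.

/* Toy model of the Barbershop idea: hand every waking task to the
 * currently least-loaded CPU. Illustrative only -- the patch orders
 * struct rq objects by cfs load weight and picks from the head of
 * that list; here the loads are made up and we simply scan. */
#include <stdio.h>

#define NR_CPUS 4

struct toy_rq {
	int cpu;
	unsigned long load;	/* stand-in for rq->cfs.load.weight */
};

static int least_loaded(struct toy_rq rqs[])
{
	int i, best = 0;

	for (i = 1; i < NR_CPUS; i++)
		if (rqs[i].load < rqs[best].load)
			best = i;
	return best;
}

int main(void)
{
	struct toy_rq rqs[NR_CPUS] = { {0, 0}, {1, 0}, {2, 0}, {3, 0} };
	int t, cpu;

	/* Wake eight equal-weight tasks; each lands on the emptiest CPU. */
	for (t = 0; t < 8; t++) {
		cpu = least_loaded(rqs);
		rqs[cpu].load += 1024;	/* roughly one nice-0 task */
		printf("task %d -> cpu %d (load now %lu)\n",
		       t, cpu, rqs[cpu].load);
	}
	return 0;
}

With equal weights the tasks end up two per CPU. Distributing load at wakeup and exec time like this is also why the periodic load balancer is compiled out elsewhere in the patch.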
47 | + 48 | config INIT_ENV_ARG_LIMIT 49 | int 50 | default 32 if !UML 51 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 52 | new file mode 100644 53 | index 0000000..f1f9fba 54 | --- /dev/null 55 | +++ b/kernel/sched/bld.h 56 | @@ -0,0 +1,215 @@ 57 | +#ifdef CONFIG_BLD 58 | + 59 | +static DEFINE_RWLOCK(rt_list_lock); 60 | +static LIST_HEAD(rt_rq_head); 61 | +static LIST_HEAD(cfs_rq_head); 62 | +static DEFINE_RWLOCK(cfs_list_lock); 63 | + 64 | +#ifdef CONFIG_FAIR_GROUP_SCHED 65 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 66 | +{ 67 | + return cfs_rq->rq; 68 | +} 69 | +#else 70 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 71 | +{ 72 | + return container_of(cfs_rq, struct rq, cfs); 73 | +} 74 | +#endif 75 | + 76 | +#ifdef CONFIG_RT_GROUP_SCHED 77 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 78 | +{ 79 | + return rt_rq->rq; 80 | +} 81 | +#else 82 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 83 | +{ 84 | + return container_of(rt_rq, struct rq, rt); 85 | +} 86 | +#endif 87 | + 88 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 89 | +{ 90 | + int cpu = smp_processor_id(), i; 91 | + unsigned long load, varload; 92 | + struct rq *rq; 93 | + 94 | + if (task_type) { 95 | + varload = ULONG_MAX; 96 | + for_each_cpu(i, mask) { 97 | + rq = cpu_rq(i); 98 | + load = rq->cfs.load.weight; 99 | + if (load < varload) { 100 | + varload = load; 101 | + cpu = i; 102 | + } 103 | + } 104 | + } else { 105 | + /* Here's an attempt to get a CPU within the mask where 106 | + * we can preempt easily. To achieve this we tried to 107 | + * maintain a lowbit, which indicate the lowest bit set on 108 | + * array bitmap. Since all CPUs contains high priority 109 | + * kernel threads therefore we eliminate 0, so it might not 110 | + * be right every time, but it's just an indicator. 
111 | + */ 112 | + varload = 1; 113 | + 114 | + for_each_cpu(i, mask) { 115 | + rq = cpu_rq(i); 116 | + load = rq->rt.lowbit; 117 | + if (load >= varload) { 118 | + varload = load; 119 | + cpu = i; 120 | + } 121 | + } 122 | + } 123 | + 124 | + return cpu; 125 | +} 126 | + 127 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 128 | +{ 129 | + struct cfs_rq *cfs; 130 | + unsigned long flags; 131 | + unsigned int cpu = smp_processor_id(); 132 | + 133 | + read_lock_irqsave(&cfs_list_lock, flags); 134 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 135 | + cpu = cpu_of(rq_of_cfs(cfs)); 136 | + if (cpu_online(cpu)) 137 | + break; 138 | + } 139 | + read_unlock_irqrestore(&cfs_list_lock, flags); 140 | + return cpu; 141 | +} 142 | + 143 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 144 | +{ 145 | + struct rt_rq *rt; 146 | + unsigned long flags; 147 | + unsigned int cpu = smp_processor_id(); 148 | + 149 | + read_lock_irqsave(&rt_list_lock, flags); 150 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 151 | + cpu = cpu_of(rq_of_rt(rt)); 152 | + if (cpu_online(cpu)) 153 | + break; 154 | + } 155 | + read_unlock_irqrestore(&rt_list_lock, flags); 156 | + return cpu; 157 | +} 158 | + 159 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 160 | +{ 161 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 162 | + struct cpumask *tmpmask; 163 | + 164 | + if (p->nr_cpus_allowed == 1) 165 | + return task_cpu(p); 166 | + 167 | + if (sd_flags & SD_BALANCE_WAKE) { 168 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 169 | + want_affine = 1; 170 | + } 171 | + } 172 | + 173 | + if (want_affine) 174 | + tmpmask = tsk_cpus_allowed(p); 175 | + else 176 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 177 | + 178 | + if (rt_task(p)) 179 | + cpu = select_cpu_for_wakeup(0, tmpmask); 180 | + else 181 | + cpu = select_cpu_for_wakeup(1, tmpmask); 182 | + 183 | + return cpu; 184 | +} 185 | + 186 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 187 | +{ 188 | + unsigned long flag; 189 | + int firstbit; 190 | + struct rt_rq *first; 191 | + struct rt_prio_array *array = &rq->rt.active; 192 | + 193 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 194 | + firstbit = sched_find_first_bit(array->bitmap); 195 | + 196 | + /* Maintaining rt.lowbit */ 197 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 198 | + rq->rt.lowbit = firstbit; 199 | + 200 | + if (rq->rt.lowbit < first->lowbit) { 201 | + write_lock_irqsave(&rt_list_lock, flag); 202 | + list_del(&rq->rt.bld_rt_list); 203 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 204 | + write_unlock_irqrestore(&rt_list_lock, flag); 205 | + } 206 | +} 207 | + 208 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 209 | +{ 210 | + unsigned int cpu; 211 | + 212 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 213 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 214 | + else { 215 | + if (rt_task(p)) 216 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 217 | + else 218 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 219 | + } 220 | + 221 | + return cpu; 222 | +} 223 | + 224 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 225 | +{ 226 | + unsigned long flag; 227 | + if (rt_task(p)) { 228 | + track_load_rt(rq, p); 229 | + } else { 230 | + if (rq->cfs.pos != 2) { 231 | + struct cfs_rq *last; 
232 | + last = list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 233 | + if (rq->cfs.load.weight >= last->load.weight) { 234 | + write_lock_irqsave(&cfs_list_lock, flag); 235 | + list_del(&rq->cfs.bld_cfs_list); 236 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 237 | + rq->cfs.pos = 2; last->pos = 1; 238 | + write_unlock_irqrestore(&cfs_list_lock, flag); 239 | + } 240 | + } 241 | + } 242 | +} 243 | + 244 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 245 | +{ 246 | + unsigned long flag; 247 | + if (rt_task(p)) { 248 | + track_load_rt(rq, p); 249 | + } else { 250 | + if (rq->cfs.pos != 0) { 251 | + struct cfs_rq *first; 252 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 253 | + if (rq->cfs.load.weight <= first->load.weight) { 254 | + write_lock_irqsave(&cfs_list_lock, flag); 255 | + list_del(&rq->cfs.bld_cfs_list); 256 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 257 | + rq->cfs.pos = 0; first->pos = 1; 258 | + write_unlock_irqrestore(&cfs_list_lock, flag); 259 | + } 260 | + } 261 | + } 262 | +} 263 | +#else 264 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 265 | +{ 266 | +} 267 | + 268 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 269 | +{ 270 | +} 271 | +#endif /* CONFIG_BLD */ 272 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 273 | index 89e7283..bd702c6 100644 274 | --- a/kernel/sched/core.c 275 | +++ b/kernel/sched/core.c 276 | @@ -24,6 +24,8 @@ 277 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 278 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 279 | * Thomas Gleixner, Mike Kravetz 280 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 281 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
282 | */ 283 | 284 | #include 285 | @@ -86,6 +88,7 @@ 286 | #include "sched.h" 287 | #include "../workqueue_internal.h" 288 | #include "../smpboot.h" 289 | +#include "bld.h" 290 | 291 | #define CREATE_TRACE_POINTS 292 | #include 293 | @@ -840,6 +843,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 294 | update_rq_clock(rq); 295 | sched_info_queued(rq, p); 296 | p->sched_class->enqueue_task(rq, p, flags); 297 | + if (!dl_task(p)) 298 | + bld_track_load_activate(rq, p); 299 | } 300 | 301 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 302 | @@ -847,6 +852,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 303 | update_rq_clock(rq); 304 | sched_info_dequeued(rq, p); 305 | p->sched_class->dequeue_task(rq, p, flags); 306 | + if (!dl_task(p)) 307 | + bld_track_load_deactivate(rq, p); 308 | } 309 | 310 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 311 | @@ -1407,7 +1414,14 @@ out: 312 | static inline 313 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 314 | { 315 | +#ifndef CONFIG_BLD 316 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 317 | +#else 318 | + if (dl_task(p)) 319 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 320 | + else 321 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 322 | +#endif 323 | 324 | /* 325 | * In order not to call set_task_cpu() on a blocking task we need 326 | @@ -1577,7 +1591,11 @@ void scheduler_ipi(void) 327 | */ 328 | preempt_fold_need_resched(); 329 | 330 | +#ifndef CONFIG_BLD 331 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 332 | +#else 333 | + if (llist_empty(&this_rq()->wake_list)) 334 | +#endif 335 | return; 336 | 337 | /* 338 | @@ -1599,13 +1617,16 @@ void scheduler_ipi(void) 339 | /* 340 | * Check if someone kicked us for doing the nohz idle load balance. 341 | */ 342 | +#ifndef CONFIG_BLD 343 | if (unlikely(got_nohz_idle_kick())) { 344 | this_rq()->idle_balance = 1; 345 | raise_softirq_irqoff(SCHED_SOFTIRQ); 346 | } 347 | +#endif 348 | irq_exit(); 349 | } 350 | 351 | +#ifndef CONFIG_BLD 352 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 353 | { 354 | struct rq *rq = cpu_rq(cpu); 355 | @@ -1618,6 +1639,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 356 | } 357 | } 358 | 359 | +#endif 360 | + 361 | +bool cpus_share_cache(int this_cpu, int that_cpu) 362 | +{ 363 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 364 | +} 365 | + 366 | void wake_up_if_idle(int cpu) 367 | { 368 | struct rq *rq = cpu_rq(cpu); 369 | @@ -1636,18 +1664,13 @@ void wake_up_if_idle(int cpu) 370 | raw_spin_unlock_irqrestore(&rq->lock, flags); 371 | } 372 | } 373 | - 374 | -bool cpus_share_cache(int this_cpu, int that_cpu) 375 | -{ 376 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 377 | -} 378 | #endif /* CONFIG_SMP */ 379 | 380 | static void ttwu_queue(struct task_struct *p, int cpu) 381 | { 382 | struct rq *rq = cpu_rq(cpu); 383 | 384 | -#if defined(CONFIG_SMP) 385 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 386 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 387 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 388 | ttwu_queue_remote(p, cpu); 389 | @@ -1966,7 +1989,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 390 | * Silence PROVE_RCU. 
391 | */ 392 | raw_spin_lock_irqsave(&p->pi_lock, flags); 393 | - set_task_cpu(p, cpu); 394 | + __set_task_cpu(p, cpu); 395 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 396 | 397 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 398 | @@ -2451,7 +2474,14 @@ void sched_exec(void) 399 | int dest_cpu; 400 | 401 | raw_spin_lock_irqsave(&p->pi_lock, flags); 402 | +#ifndef CONFIG_BLD 403 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 404 | +#else 405 | + if (dl_task(p)) 406 | + dest_cpu = task_cpu(p); 407 | + else 408 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 409 | +#endif 410 | if (dest_cpu == smp_processor_id()) 411 | goto unlock; 412 | 413 | @@ -2539,8 +2569,10 @@ void scheduler_tick(void) 414 | 415 | #ifdef CONFIG_SMP 416 | rq->idle_balance = idle_cpu(cpu); 417 | +#ifndef CONFIG_BLD 418 | trigger_load_balance(rq); 419 | #endif 420 | +#endif 421 | rq_last_tick_reset(rq); 422 | } 423 | 424 | @@ -7126,6 +7158,15 @@ void __init sched_init(void) 425 | #endif 426 | init_rq_hrtick(rq); 427 | atomic_set(&rq->nr_iowait, 0); 428 | +#ifdef CONFIG_BLD 429 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 430 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 431 | + rq->cfs.pos = 0; 432 | + 433 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 434 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 435 | + rq->rt.lowbit = INT_MAX; 436 | +#endif 437 | } 438 | 439 | set_load_weight(&init_task); 440 | @@ -7166,6 +7207,9 @@ void __init sched_init(void) 441 | init_sched_fair_class(); 442 | 443 | scheduler_running = 1; 444 | +#ifdef CONFIG_BLD 445 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 446 | +#endif 447 | } 448 | 449 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 450 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 451 | index ef2b104..ea45f76 100644 452 | --- a/kernel/sched/fair.c 453 | +++ b/kernel/sched/fair.c 454 | @@ -4164,6 +4164,7 @@ static void task_waking_fair(struct task_struct *p) 455 | record_wakee(p); 456 | } 457 | 458 | +#ifndef CONFIG_BLD 459 | #ifdef CONFIG_FAIR_GROUP_SCHED 460 | /* 461 | * effective_load() calculates the load change as seen from the root_task_group 462 | @@ -4619,6 +4620,7 @@ unlock: 463 | 464 | return new_cpu; 465 | } 466 | +#endif /* CONFIG_BLD */ 467 | 468 | /* 469 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 470 | @@ -4914,6 +4916,7 @@ simple: 471 | return p; 472 | 473 | idle: 474 | +#ifndef CONFIG_BLD 475 | new_tasks = idle_balance(rq); 476 | /* 477 | * Because idle_balance() releases (and re-acquires) rq->lock, it is 478 | @@ -4925,7 +4928,7 @@ idle: 479 | 480 | if (new_tasks > 0) 481 | goto again; 482 | - 483 | +#endif 484 | return NULL; 485 | } 486 | 487 | @@ -7107,12 +7110,39 @@ static inline int on_null_domain(struct rq *rq) 488 | * needed, they will kick the idle load balancer, which then does idle 489 | * load balancing for all the idle CPUs. 490 | */ 491 | +#ifndef CONFIG_BLD 492 | static struct { 493 | cpumask_var_t idle_cpus_mask; 494 | atomic_t nr_cpus; 495 | unsigned long next_balance; /* in jiffy units */ 496 | } nohz ____cacheline_aligned; 497 | 498 | +static inline void nohz_balance_exit_idle(int cpu) 499 | +{ 500 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 501 | + /* 502 | + * Completely isolated CPUs don't ever set, so we must test. 
503 | + */ 504 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 505 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 506 | + atomic_dec(&nohz.nr_cpus); 507 | + } 508 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 509 | + } 510 | +} 511 | + 512 | +static int sched_ilb_notifier(struct notifier_block *nfb, 513 | + unsigned long action, void *hcpu) 514 | +{ 515 | + switch (action & ~CPU_TASKS_FROZEN) { 516 | + case CPU_DYING: 517 | + nohz_balance_exit_idle(smp_processor_id()); 518 | + return NOTIFY_OK; 519 | + default: 520 | + return NOTIFY_DONE; 521 | + } 522 | +} 523 | + 524 | static inline int find_new_ilb(void) 525 | { 526 | int ilb = cpumask_first(nohz.idle_cpus_mask); 527 | @@ -7150,20 +7180,7 @@ static void nohz_balancer_kick(void) 528 | smp_send_reschedule(ilb_cpu); 529 | return; 530 | } 531 | - 532 | -static inline void nohz_balance_exit_idle(int cpu) 533 | -{ 534 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 535 | - /* 536 | - * Completely isolated CPUs don't ever set, so we must test. 537 | - */ 538 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 539 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 540 | - atomic_dec(&nohz.nr_cpus); 541 | - } 542 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 543 | - } 544 | -} 545 | +#endif /* CONFIG_BLD */ 546 | 547 | static inline void set_cpu_sd_state_busy(void) 548 | { 549 | @@ -7205,6 +7222,7 @@ unlock: 550 | */ 551 | void nohz_balance_enter_idle(int cpu) 552 | { 553 | +#ifndef CONFIG_BLD 554 | /* 555 | * If this cpu is going down, then nothing needs to be done. 556 | */ 557 | @@ -7223,23 +7241,10 @@ void nohz_balance_enter_idle(int cpu) 558 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 559 | atomic_inc(&nohz.nr_cpus); 560 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 561 | -} 562 | - 563 | -static int sched_ilb_notifier(struct notifier_block *nfb, 564 | - unsigned long action, void *hcpu) 565 | -{ 566 | - switch (action & ~CPU_TASKS_FROZEN) { 567 | - case CPU_DYING: 568 | - nohz_balance_exit_idle(smp_processor_id()); 569 | - return NOTIFY_OK; 570 | - default: 571 | - return NOTIFY_DONE; 572 | - } 573 | +#endif 574 | } 575 | #endif 576 | 577 | -static DEFINE_SPINLOCK(balancing); 578 | - 579 | /* 580 | * Scale the max load_balance interval with the number of CPUs in the system. 581 | * This trades load-balance latency on larger machines for less cross talk. 582 | @@ -7249,6 +7254,9 @@ void update_max_interval(void) 583 | max_load_balance_interval = HZ*num_online_cpus()/10; 584 | } 585 | 586 | +#ifndef CONFIG_BLD 587 | +static DEFINE_SPINLOCK(balancing); 588 | + 589 | /* 590 | * It checks each scheduling domain to see if it is due to be balanced, 591 | * and initiates a balancing operation if so. 
592 | @@ -7497,6 +7505,7 @@ void trigger_load_balance(struct rq *rq) 593 | nohz_balancer_kick(); 594 | #endif 595 | } 596 | +#endif /* CONFIG_BLD */ 597 | 598 | static void rq_online_fair(struct rq *rq) 599 | { 600 | @@ -7942,7 +7951,9 @@ const struct sched_class fair_sched_class = { 601 | .put_prev_task = put_prev_task_fair, 602 | 603 | #ifdef CONFIG_SMP 604 | +#ifndef CONFIG_BLD 605 | .select_task_rq = select_task_rq_fair, 606 | +#endif 607 | .migrate_task_rq = migrate_task_rq_fair, 608 | 609 | .rq_online = rq_online_fair, 610 | @@ -7982,6 +7993,7 @@ void print_cfs_stats(struct seq_file *m, int cpu) 611 | 612 | __init void init_sched_fair_class(void) 613 | { 614 | +#ifndef CONFIG_BLD 615 | #ifdef CONFIG_SMP 616 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 617 | 618 | @@ -7991,5 +8003,5 @@ __init void init_sched_fair_class(void) 619 | cpu_notifier(sched_ilb_notifier, 0); 620 | #endif 621 | #endif /* SMP */ 622 | - 623 | +#endif /* BLD */ 624 | } 625 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 626 | index 20bca39..03a0ed3 100644 627 | --- a/kernel/sched/rt.c 628 | +++ b/kernel/sched/rt.c 629 | @@ -1295,6 +1295,7 @@ static void yield_task_rt(struct rq *rq) 630 | #ifdef CONFIG_SMP 631 | static int find_lowest_rq(struct task_struct *task); 632 | 633 | +#ifndef CONFIG_BLD 634 | static int 635 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 636 | { 637 | @@ -1348,6 +1349,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 638 | out: 639 | return cpu; 640 | } 641 | +#endif /* CONFIG_BLD */ 642 | 643 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 644 | { 645 | @@ -2111,7 +2113,9 @@ const struct sched_class rt_sched_class = { 646 | .put_prev_task = put_prev_task_rt, 647 | 648 | #ifdef CONFIG_SMP 649 | +#ifndef CONFIG_BLD 650 | .select_task_rq = select_task_rq_rt, 651 | +#endif 652 | 653 | .set_cpus_allowed = set_cpus_allowed_rt, 654 | .rq_online = rq_online_rt, 655 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 656 | index 2df8ef0..79fce51 100644 657 | --- a/kernel/sched/sched.h 658 | +++ b/kernel/sched/sched.h 659 | @@ -366,9 +366,8 @@ struct cfs_rq { 660 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 661 | #endif /* CONFIG_SMP */ 662 | 663 | -#ifdef CONFIG_FAIR_GROUP_SCHED 664 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 665 | - 666 | +#ifdef CONFIG_FAIR_GROUP_SCHED 667 | /* 668 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 669 | * a hierarchy). 
Non-leaf lrqs hold other higher schedulable entities 670 | @@ -392,6 +391,11 @@ struct cfs_rq { 671 | struct list_head throttled_list; 672 | #endif /* CONFIG_CFS_BANDWIDTH */ 673 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 674 | + 675 | +#ifdef CONFIG_BLD 676 | + struct list_head bld_cfs_list; 677 | + char pos; 678 | +#endif 679 | }; 680 | 681 | static inline int rt_bandwidth_enabled(void) 682 | @@ -425,12 +429,16 @@ struct rt_rq { 683 | /* Nests inside the rq lock: */ 684 | raw_spinlock_t rt_runtime_lock; 685 | 686 | + struct rq *rq; 687 | #ifdef CONFIG_RT_GROUP_SCHED 688 | unsigned long rt_nr_boosted; 689 | 690 | - struct rq *rq; 691 | struct task_group *tg; 692 | #endif 693 | +#ifdef CONFIG_BLD 694 | + struct list_head bld_rt_list; 695 | + int lowbit; 696 | +#endif 697 | }; 698 | 699 | /* Deadline class' related fields in a runqueue */ 700 | -------------------------------------------------------------------------------- /BLD-4.8.patch: -------------------------------------------------------------------------------- 1 | diff --git a/init/Kconfig b/init/Kconfig 2 | index cac3f09..4e49d16 100644 3 | --- a/init/Kconfig 4 | +++ b/init/Kconfig 5 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 6 | depends on BROKEN || !SMP 7 | default y 8 | 9 | +config BLD 10 | + bool "An alternate CPU load distribution technique for task scheduler" 11 | + depends on SMP 12 | + default y 13 | + help 14 | + This is an alternate CPU load distribution technique based for task 15 | + scheduler based on The Barbershop Load Distribution algorithm. Not 16 | + suitable for NUMA, should work well on SMP. 17 | + 18 | config INIT_ENV_ARG_LIMIT 19 | int 20 | default 32 if !UML 21 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 22 | new file mode 100644 23 | index 0000000..f1f9fba 24 | --- /dev/null 25 | +++ b/kernel/sched/bld.h 26 | @@ -0,0 +1,215 @@ 27 | +#ifdef CONFIG_BLD 28 | + 29 | +static DEFINE_RWLOCK(rt_list_lock); 30 | +static LIST_HEAD(rt_rq_head); 31 | +static LIST_HEAD(cfs_rq_head); 32 | +static DEFINE_RWLOCK(cfs_list_lock); 33 | + 34 | +#ifdef CONFIG_FAIR_GROUP_SCHED 35 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 36 | +{ 37 | + return cfs_rq->rq; 38 | +} 39 | +#else 40 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 41 | +{ 42 | + return container_of(cfs_rq, struct rq, cfs); 43 | +} 44 | +#endif 45 | + 46 | +#ifdef CONFIG_RT_GROUP_SCHED 47 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 48 | +{ 49 | + return rt_rq->rq; 50 | +} 51 | +#else 52 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 53 | +{ 54 | + return container_of(rt_rq, struct rq, rt); 55 | +} 56 | +#endif 57 | + 58 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 59 | +{ 60 | + int cpu = smp_processor_id(), i; 61 | + unsigned long load, varload; 62 | + struct rq *rq; 63 | + 64 | + if (task_type) { 65 | + varload = ULONG_MAX; 66 | + for_each_cpu(i, mask) { 67 | + rq = cpu_rq(i); 68 | + load = rq->cfs.load.weight; 69 | + if (load < varload) { 70 | + varload = load; 71 | + cpu = i; 72 | + } 73 | + } 74 | + } else { 75 | + /* Here's an attempt to get a CPU within the mask where 76 | + * we can preempt easily. To achieve this we tried to 77 | + * maintain a lowbit, which indicate the lowest bit set on 78 | + * array bitmap. Since all CPUs contains high priority 79 | + * kernel threads therefore we eliminate 0, so it might not 80 | + * be right every time, but it's just an indicator. 
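Put differently, the rt side prefers the CPU whose most urgent queued rt task is the least urgent among the candidates, so a waking rt task stands a good chance of preempting quickly. A standalone illustration with assumed bitmap values follows; the GCC builtin __builtin_ffsll stands in for the kernel's sched_find_first_bit().

/* Illustration of the lowbit heuristic: each CPU has a bitmap of
 * queued rt priorities (lower bit index = more urgent task), and the
 * wakeup picks the CPU whose first set bit is highest, i.e. the CPU
 * that is cheapest to preempt. Bitmap values are made up; in the
 * patch, sched_find_first_bit() on the rt_prio_array bitmap supplies
 * the per-runqueue "lowbit". */
#include <stdio.h>

int main(void)
{
	unsigned long long bitmap[3] = {
		1ULL << 3,	/* cpu0: rt task queued at priority 3 */
		1ULL << 40,	/* cpu1: only a task at priority 40 */
		1ULL << 10,	/* cpu2: rt task queued at priority 10 */
	};
	int i, firstbit, best_cpu = 0, best = 1;	/* bit 0 ignored, as in the patch */

	for (i = 0; i < 3; i++) {
		firstbit = __builtin_ffsll(bitmap[i]) - 1;
		printf("cpu%d lowbit=%d\n", i, firstbit);
		if (firstbit >= best) {
			best = firstbit;
			best_cpu = i;
		}
	}
	printf("wake the rt task on cpu%d\n", best_cpu);	/* -> cpu1 */
	return 0;
}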
81 | + */ 82 | + varload = 1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load >= varload) { 88 | + varload = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | + return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = firstbit; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index 44817c6..f0f3321 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
252 | */ 253 | 254 | #include 255 | @@ -87,6 +89,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -751,6 +754,8 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | if (!(flags & ENQUEUE_RESTORE)) 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -759,6 +764,8 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | if (!(flags & DEQUEUE_SAVE)) 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1588,11 +1595,17 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 282 | { 283 | lockdep_assert_held(&p->pi_lock); 284 | 285 | +#ifndef CONFIG_BLD 286 | if (tsk_nr_cpus_allowed(p) > 1) 287 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 288 | else 289 | cpu = cpumask_any(tsk_cpus_allowed(p)); 290 | - 291 | +#else 292 | + if (dl_task(p)) 293 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 294 | + else 295 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 296 | +#endif 297 | /* 298 | * In order not to call set_task_cpu() on a blocking task we need 299 | * to rely on ttwu() to place the task on a valid ->cpus_allowed 300 | @@ -1795,7 +1808,11 @@ void scheduler_ipi(void) 301 | */ 302 | preempt_fold_need_resched(); 303 | 304 | +#ifndef CONFIG_BLD 305 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 306 | +#else 307 | + if (llist_empty(&this_rq()->wake_list)) 308 | +#endif 309 | return; 310 | 311 | /* 312 | @@ -1817,13 +1834,16 @@ void scheduler_ipi(void) 313 | /* 314 | * Check if someone kicked us for doing the nohz idle load balance. 
315 | */ 316 | +#ifndef CONFIG_BLD 317 | if (unlikely(got_nohz_idle_kick())) { 318 | this_rq()->idle_balance = 1; 319 | raise_softirq_irqoff(SCHED_SOFTIRQ); 320 | } 321 | +#endif 322 | irq_exit(); 323 | } 324 | 325 | +#ifndef CONFIG_BLD 326 | static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) 327 | { 328 | struct rq *rq = cpu_rq(cpu); 329 | @@ -1837,6 +1857,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) 330 | trace_sched_wake_idle_without_ipi(cpu); 331 | } 332 | } 333 | +#endif /* CONFIG_BLD */ 334 | 335 | void wake_up_if_idle(int cpu) 336 | { 337 | @@ -1873,7 +1894,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) 338 | struct rq *rq = cpu_rq(cpu); 339 | struct pin_cookie cookie; 340 | 341 | -#if defined(CONFIG_SMP) 342 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 343 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 344 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 345 | ttwu_queue_remote(p, cpu, wake_flags); 346 | @@ -2971,7 +2992,14 @@ void sched_exec(void) 347 | int dest_cpu; 348 | 349 | raw_spin_lock_irqsave(&p->pi_lock, flags); 350 | +#ifndef CONFIG_BLD 351 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 352 | +#else 353 | + if (dl_task(p)) 354 | + dest_cpu = task_cpu(p); 355 | + else 356 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 357 | +#endif 358 | if (dest_cpu == smp_processor_id()) 359 | goto unlock; 360 | 361 | @@ -3078,8 +3106,10 @@ void scheduler_tick(void) 362 | 363 | #ifdef CONFIG_SMP 364 | rq->idle_balance = idle_cpu(cpu); 365 | +#ifndef CONFIG_BLD 366 | trigger_load_balance(rq); 367 | #endif 368 | +#endif 369 | rq_last_tick_reset(rq); 370 | } 371 | 372 | @@ -7313,7 +7343,9 @@ int sched_cpu_dying(unsigned int cpu) 373 | raw_spin_unlock_irqrestore(&rq->lock, flags); 374 | calc_load_migrate(rq); 375 | update_max_interval(); 376 | +#ifndef CONFIG_BLD 377 | nohz_balance_exit_idle(cpu); 378 | +#endif 379 | hrtick_clear(rq); 380 | return 0; 381 | } 382 | @@ -7519,6 +7551,15 @@ void __init sched_init(void) 383 | #endif /* CONFIG_SMP */ 384 | init_rq_hrtick(rq); 385 | atomic_set(&rq->nr_iowait, 0); 386 | +#ifdef CONFIG_BLD 387 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 388 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 389 | + rq->cfs.pos = 0; 390 | + 391 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 392 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 393 | + rq->rt.lowbit = INT_MAX; 394 | +#endif 395 | } 396 | 397 | set_load_weight(&init_task); 398 | @@ -7561,6 +7602,9 @@ void __init sched_init(void) 399 | init_schedstats(); 400 | 401 | scheduler_running = 1; 402 | +#ifdef CONFIG_BLD 403 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 404 | +#endif 405 | } 406 | 407 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 408 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 409 | index 039de34..f823e5b 100644 410 | --- a/kernel/sched/fair.c 411 | +++ b/kernel/sched/fair.c 412 | @@ -4924,6 +4924,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) 413 | return 0; 414 | } 415 | 416 | +#ifndef CONFIG_BLD 417 | #ifdef CONFIG_FAIR_GROUP_SCHED 418 | /* 419 | * effective_load() calculates the load change as seen from the root_task_group 420 | @@ -5455,6 +5456,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f 421 | 422 | return new_cpu; 423 | } 424 | +#endif /* CONFIG_BLD */ 425 | 426 | /* 427 | * Called immediately before a task is migrated to a new cpu; 
task_cpu(p) and 428 | @@ -5785,6 +5787,7 @@ idle: 429 | * further scheduler activity on it and we're being very careful to 430 | * re-start the picking loop. 431 | */ 432 | +#ifndef CONFIG_BLD 433 | lockdep_unpin_lock(&rq->lock, cookie); 434 | new_tasks = idle_balance(rq); 435 | lockdep_repin_lock(&rq->lock, cookie); 436 | @@ -5798,7 +5801,7 @@ idle: 437 | 438 | if (new_tasks > 0) 439 | goto again; 440 | - 441 | +#endif /* CONFIG_BLD */ 442 | return NULL; 443 | } 444 | 445 | @@ -6459,8 +6462,9 @@ static unsigned long task_h_load(struct task_struct *p) 446 | } 447 | #endif 448 | 449 | -/********** Helpers for find_busiest_group ************************/ 450 | +#ifndef CONFIG_BLD 451 | 452 | +/********** Helpers for find_busiest_group ************************/ 453 | enum group_type { 454 | group_other = 0, 455 | group_imbalanced, 456 | @@ -6551,6 +6555,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 457 | 458 | return load_idx; 459 | } 460 | +#endif /* CONFIG_BLD */ 461 | 462 | static unsigned long scale_rt_capacity(int cpu) 463 | { 464 | @@ -6659,6 +6664,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 465 | sdg->sgc->capacity = capacity; 466 | } 467 | 468 | +#ifndef CONFIG_BLD 469 | /* 470 | * Check whether the capacity of the rq has been noticeably reduced by side 471 | * activity. The imbalance_pct is used for the threshold. 472 | @@ -7892,6 +7898,7 @@ static inline int on_null_domain(struct rq *rq) 473 | { 474 | return unlikely(!rcu_dereference_sched(rq->sd)); 475 | } 476 | +#endif /* CONFIG_BLD */ 477 | 478 | #ifdef CONFIG_NO_HZ_COMMON 479 | /* 480 | @@ -7900,12 +7907,39 @@ static inline int on_null_domain(struct rq *rq) 481 | * needed, they will kick the idle load balancer, which then does idle 482 | * load balancing for all the idle CPUs. 483 | */ 484 | +#ifndef CONFIG_BLD 485 | static struct { 486 | cpumask_var_t idle_cpus_mask; 487 | atomic_t nr_cpus; 488 | unsigned long next_balance; /* in jiffy units */ 489 | } nohz ____cacheline_aligned; 490 | 491 | +void nohz_balance_exit_idle(unsigned int cpu) 492 | +{ 493 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 494 | + /* 495 | + * Completely isolated CPUs don't ever set, so we must test. 496 | + */ 497 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 498 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 499 | + atomic_dec(&nohz.nr_cpus); 500 | + } 501 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 502 | + } 503 | +} 504 | + 505 | +static int sched_ilb_notifier(struct notifier_block *nfb, 506 | + unsigned long action, void *hcpu) 507 | +{ 508 | + switch (action & ~CPU_TASKS_FROZEN) { 509 | + case CPU_DYING: 510 | + nohz_balance_exit_idle(smp_processor_id()); 511 | + return NOTIFY_OK; 512 | + default: 513 | + return NOTIFY_DONE; 514 | + } 515 | +} 516 | + 517 | static inline int find_new_ilb(void) 518 | { 519 | int ilb = cpumask_first(nohz.idle_cpus_mask); 520 | @@ -7944,20 +7978,6 @@ static void nohz_balancer_kick(void) 521 | return; 522 | } 523 | 524 | -void nohz_balance_exit_idle(unsigned int cpu) 525 | -{ 526 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 527 | - /* 528 | - * Completely isolated CPUs don't ever set, so we must test. 
529 | - */ 530 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 531 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 532 | - atomic_dec(&nohz.nr_cpus); 533 | - } 534 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 535 | - } 536 | -} 537 | - 538 | static inline void set_cpu_sd_state_busy(void) 539 | { 540 | struct sched_domain *sd; 541 | @@ -7974,6 +7994,8 @@ static inline void set_cpu_sd_state_busy(void) 542 | unlock: 543 | rcu_read_unlock(); 544 | } 545 | +#endif /* CONFIG_BLD */ 546 | +#endif /* NO_HZ_COMMON */ 547 | 548 | void set_cpu_sd_state_idle(void) 549 | { 550 | @@ -7998,6 +8020,7 @@ unlock: 551 | */ 552 | void nohz_balance_enter_idle(int cpu) 553 | { 554 | +#ifndef CONFIG_BLD 555 | /* 556 | * If this cpu is going down, then nothing needs to be done. 557 | */ 558 | @@ -8016,10 +8039,8 @@ void nohz_balance_enter_idle(int cpu) 559 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 560 | atomic_inc(&nohz.nr_cpus); 561 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 562 | -} 563 | #endif 564 | - 565 | -static DEFINE_SPINLOCK(balancing); 566 | +} 567 | 568 | /* 569 | * Scale the max load_balance interval with the number of CPUs in the system. 570 | @@ -8030,6 +8051,9 @@ void update_max_interval(void) 571 | max_load_balance_interval = HZ*num_online_cpus()/10; 572 | } 573 | 574 | +#ifndef CONFIG_BLD 575 | +static DEFINE_SPINLOCK(balancing); 576 | + 577 | /* 578 | * It checks each scheduling domain to see if it is due to be balanced, 579 | * and initiates a balancing operation if so. 580 | @@ -8317,6 +8341,7 @@ void trigger_load_balance(struct rq *rq) 581 | nohz_balancer_kick(); 582 | #endif 583 | } 584 | +#endif /* CONFIG_BLD */ 585 | 586 | static void rq_online_fair(struct rq *rq) 587 | { 588 | @@ -8332,7 +8357,6 @@ static void rq_offline_fair(struct rq *rq) 589 | /* Ensure any throttled groups are reachable by pick_next_task */ 590 | unthrottle_offline_cfs_rqs(rq); 591 | } 592 | - 593 | #endif /* CONFIG_SMP */ 594 | 595 | /* 596 | @@ -8791,7 +8815,9 @@ const struct sched_class fair_sched_class = { 597 | .put_prev_task = put_prev_task_fair, 598 | 599 | #ifdef CONFIG_SMP 600 | +#ifndef CONFIG_BLD 601 | .select_task_rq = select_task_rq_fair, 602 | +#endif 603 | .migrate_task_rq = migrate_task_rq_fair, 604 | 605 | .rq_online = rq_online_fair, 606 | @@ -8852,6 +8878,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) 607 | 608 | __init void init_sched_fair_class(void) 609 | { 610 | +#ifndef CONFIG_BLD 611 | #ifdef CONFIG_SMP 612 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 613 | 614 | @@ -8860,5 +8887,5 @@ __init void init_sched_fair_class(void) 615 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 616 | #endif 617 | #endif /* SMP */ 618 | - 619 | +#endif /* BLD */ 620 | } 621 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 622 | index d5690b7..6f3589e 100644 623 | --- a/kernel/sched/rt.c 624 | +++ b/kernel/sched/rt.c 625 | @@ -1375,6 +1375,7 @@ static void yield_task_rt(struct rq *rq) 626 | #ifdef CONFIG_SMP 627 | static int find_lowest_rq(struct task_struct *task); 628 | 629 | +#ifndef CONFIG_BLD 630 | static int 631 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 632 | { 633 | @@ -1430,6 +1431,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 634 | out: 635 | return cpu; 636 | } 637 | +#endif /* CONFIG_BLD */ 638 | 639 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 640 | { 641 | @@ -2335,7 +2337,9 @@ const struct sched_class rt_sched_class = { 642 | 
.put_prev_task = put_prev_task_rt, 643 | 644 | #ifdef CONFIG_SMP 645 | +#ifndef CONFIG_BLD 646 | .select_task_rq = select_task_rq_rt, 647 | +#endif 648 | 649 | .set_cpus_allowed = set_cpus_allowed_common, 650 | .rq_online = rq_online_rt, 651 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 652 | index c64fc51..a1d329b 100644 653 | --- a/kernel/sched/sched.h 654 | +++ b/kernel/sched/sched.h 655 | @@ -416,9 +416,8 @@ struct cfs_rq { 656 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 657 | #endif /* CONFIG_SMP */ 658 | 659 | -#ifdef CONFIG_FAIR_GROUP_SCHED 660 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 661 | - 662 | +#ifdef CONFIG_FAIR_GROUP_SCHED 663 | /* 664 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 665 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 666 | @@ -442,6 +441,11 @@ struct cfs_rq { 667 | struct list_head throttled_list; 668 | #endif /* CONFIG_CFS_BANDWIDTH */ 669 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 670 | + 671 | +#ifdef CONFIG_BLD 672 | + struct list_head bld_cfs_list; 673 | + char pos; 674 | +#endif 675 | }; 676 | 677 | static inline int rt_bandwidth_enabled(void) 678 | @@ -487,12 +491,16 @@ struct rt_rq { 679 | /* Nests inside the rq lock: */ 680 | raw_spinlock_t rt_runtime_lock; 681 | 682 | + struct rq *rq; 683 | #ifdef CONFIG_RT_GROUP_SCHED 684 | unsigned long rt_nr_boosted; 685 | 686 | - struct rq *rq; 687 | struct task_group *tg; 688 | #endif 689 | +#ifdef CONFIG_BLD 690 | + struct list_head bld_rt_list; 691 | + int lowbit; 692 | +#endif 693 | }; 694 | 695 | /* Deadline class' related fields in a runqueue */ 696 | -------------------------------------------------------------------------------- /BLD-4.1.patch: -------------------------------------------------------------------------------- 1 | BLD patch for Linux-4.1. Just code rebase on Linux-4.1. 2 | 3 | 4 | diff --git a/init/Kconfig b/init/Kconfig 5 | index dc24dec..87860d4 100644 6 | --- a/init/Kconfig 7 | +++ b/init/Kconfig 8 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 9 | depends on BROKEN || !SMP 10 | default y 11 | 12 | +config BLD 13 | + bool "An alternate CPU load distribution technique for task scheduler" 14 | + depends on SMP 15 | + default y 16 | + help 17 | + This is an alternate CPU load distribution technique based for task 18 | + scheduler based on The Barbershop Load Distribution algorithm. Not 19 | + suitable for NUMA, should work well on SMP. 
20 | + 21 | config INIT_ENV_ARG_LIMIT 22 | int 23 | default 32 if !UML 24 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 25 | new file mode 100644 26 | index 0000000..f1f9fba 27 | --- /dev/null 28 | +++ b/kernel/sched/bld.h 29 | @@ -0,0 +1,215 @@ 30 | +#ifdef CONFIG_BLD 31 | + 32 | +static DEFINE_RWLOCK(rt_list_lock); 33 | +static LIST_HEAD(rt_rq_head); 34 | +static LIST_HEAD(cfs_rq_head); 35 | +static DEFINE_RWLOCK(cfs_list_lock); 36 | + 37 | +#ifdef CONFIG_FAIR_GROUP_SCHED 38 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 39 | +{ 40 | + return cfs_rq->rq; 41 | +} 42 | +#else 43 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 44 | +{ 45 | + return container_of(cfs_rq, struct rq, cfs); 46 | +} 47 | +#endif 48 | + 49 | +#ifdef CONFIG_RT_GROUP_SCHED 50 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 51 | +{ 52 | + return rt_rq->rq; 53 | +} 54 | +#else 55 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 56 | +{ 57 | + return container_of(rt_rq, struct rq, rt); 58 | +} 59 | +#endif 60 | + 61 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 62 | +{ 63 | + int cpu = smp_processor_id(), i; 64 | + unsigned long load, varload; 65 | + struct rq *rq; 66 | + 67 | + if (task_type) { 68 | + varload = ULONG_MAX; 69 | + for_each_cpu(i, mask) { 70 | + rq = cpu_rq(i); 71 | + load = rq->cfs.load.weight; 72 | + if (load < varload) { 73 | + varload = load; 74 | + cpu = i; 75 | + } 76 | + } 77 | + } else { 78 | + /* Here's an attempt to get a CPU within the mask where 79 | + * we can preempt easily. To achieve this we tried to 80 | + * maintain a lowbit, which indicate the lowest bit set on 81 | + * array bitmap. Since all CPUs contains high priority 82 | + * kernel threads therefore we eliminate 0, so it might not 83 | + * be right every time, but it's just an indicator. 
84 | + */ 85 | + varload = 1; 86 | + 87 | + for_each_cpu(i, mask) { 88 | + rq = cpu_rq(i); 89 | + load = rq->rt.lowbit; 90 | + if (load >= varload) { 91 | + varload = load; 92 | + cpu = i; 93 | + } 94 | + } 95 | + } 96 | + 97 | + return cpu; 98 | +} 99 | + 100 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 101 | +{ 102 | + struct cfs_rq *cfs; 103 | + unsigned long flags; 104 | + unsigned int cpu = smp_processor_id(); 105 | + 106 | + read_lock_irqsave(&cfs_list_lock, flags); 107 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 108 | + cpu = cpu_of(rq_of_cfs(cfs)); 109 | + if (cpu_online(cpu)) 110 | + break; 111 | + } 112 | + read_unlock_irqrestore(&cfs_list_lock, flags); 113 | + return cpu; 114 | +} 115 | + 116 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 117 | +{ 118 | + struct rt_rq *rt; 119 | + unsigned long flags; 120 | + unsigned int cpu = smp_processor_id(); 121 | + 122 | + read_lock_irqsave(&rt_list_lock, flags); 123 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 124 | + cpu = cpu_of(rq_of_rt(rt)); 125 | + if (cpu_online(cpu)) 126 | + break; 127 | + } 128 | + read_unlock_irqrestore(&rt_list_lock, flags); 129 | + return cpu; 130 | +} 131 | + 132 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 133 | +{ 134 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 135 | + struct cpumask *tmpmask; 136 | + 137 | + if (p->nr_cpus_allowed == 1) 138 | + return task_cpu(p); 139 | + 140 | + if (sd_flags & SD_BALANCE_WAKE) { 141 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 142 | + want_affine = 1; 143 | + } 144 | + } 145 | + 146 | + if (want_affine) 147 | + tmpmask = tsk_cpus_allowed(p); 148 | + else 149 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 150 | + 151 | + if (rt_task(p)) 152 | + cpu = select_cpu_for_wakeup(0, tmpmask); 153 | + else 154 | + cpu = select_cpu_for_wakeup(1, tmpmask); 155 | + 156 | + return cpu; 157 | +} 158 | + 159 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 160 | +{ 161 | + unsigned long flag; 162 | + int firstbit; 163 | + struct rt_rq *first; 164 | + struct rt_prio_array *array = &rq->rt.active; 165 | + 166 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 167 | + firstbit = sched_find_first_bit(array->bitmap); 168 | + 169 | + /* Maintaining rt.lowbit */ 170 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 171 | + rq->rt.lowbit = firstbit; 172 | + 173 | + if (rq->rt.lowbit < first->lowbit) { 174 | + write_lock_irqsave(&rt_list_lock, flag); 175 | + list_del(&rq->rt.bld_rt_list); 176 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 177 | + write_unlock_irqrestore(&rt_list_lock, flag); 178 | + } 179 | +} 180 | + 181 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 182 | +{ 183 | + unsigned int cpu; 184 | + 185 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 186 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 187 | + else { 188 | + if (rt_task(p)) 189 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 190 | + else 191 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 192 | + } 193 | + 194 | + return cpu; 195 | +} 196 | + 197 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 198 | +{ 199 | + unsigned long flag; 200 | + if (rt_task(p)) { 201 | + track_load_rt(rq, p); 202 | + } else { 203 | + if (rq->cfs.pos != 2) { 204 | + struct cfs_rq *last; 205 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 206 | + if (rq->cfs.load.weight >= last->load.weight) { 207 | + write_lock_irqsave(&cfs_list_lock, flag); 208 | + list_del(&rq->cfs.bld_cfs_list); 209 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 210 | + rq->cfs.pos = 2; last->pos = 1; 211 | + write_unlock_irqrestore(&cfs_list_lock, flag); 212 | + } 213 | + } 214 | + } 215 | +} 216 | + 217 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 218 | +{ 219 | + unsigned long flag; 220 | + if (rt_task(p)) { 221 | + track_load_rt(rq, p); 222 | + } else { 223 | + if (rq->cfs.pos != 0) { 224 | + struct cfs_rq *first; 225 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 226 | + if (rq->cfs.load.weight <= first->load.weight) { 227 | + write_lock_irqsave(&cfs_list_lock, flag); 228 | + list_del(&rq->cfs.bld_cfs_list); 229 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 230 | + rq->cfs.pos = 0; first->pos = 1; 231 | + write_unlock_irqrestore(&cfs_list_lock, flag); 232 | + } 233 | + } 234 | + } 235 | +} 236 | +#else 237 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 238 | +{ 239 | +} 240 | + 241 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 242 | +{ 243 | +} 244 | +#endif /* CONFIG_BLD */ 245 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 246 | index 1236732..5fb8bf7 100644 247 | --- a/kernel/sched/core.c 248 | +++ b/kernel/sched/core.c 249 | @@ -24,6 +24,8 @@ 250 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 251 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 252 | * Thomas Gleixner, Mike Kravetz 253 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 254 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
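The bld_track_load_activate()/bld_track_load_deactivate() helpers added in bld.h above keep cfs_rq_head only approximately ordered: an enqueue can push a runqueue to the tail, a dequeue can pull it to the head, and cfs.pos merely records which end the runqueue last landed on (0 = head, 2 = tail, 1 = neither). The sketch below models that move-to-front/move-to-back behaviour with plain arrays and made-up loads; it is an illustration, not the patch code.

/* Simplified model of the ordering the helpers above maintain: when a
 * runqueue becomes at least as heavy as the current tail it moves to
 * the tail, and when it becomes at least as light as the current head
 * it moves to the head. Plain arrays, made-up loads. */
#include <stdio.h>

#define NR_CPUS 4

static unsigned long load[NR_CPUS];
static int order[NR_CPUS] = {0, 1, 2, 3};	/* head ... tail */

static void move_cpu(int cpu, int to_tail)
{
	int i, j;

	for (i = 0; order[i] != cpu; i++)
		;
	if (to_tail) {
		for (j = i; j < NR_CPUS - 1; j++)
			order[j] = order[j + 1];
		order[NR_CPUS - 1] = cpu;
	} else {
		for (j = i; j > 0; j--)
			order[j] = order[j - 1];
		order[0] = cpu;
	}
}

static void activate(int cpu, unsigned long w)		/* enqueue side */
{
	load[cpu] += w;
	if (load[cpu] >= load[order[NR_CPUS - 1]])
		move_cpu(cpu, 1);
}

static void deactivate(int cpu, unsigned long w)	/* dequeue side */
{
	load[cpu] -= w;
	if (load[cpu] <= load[order[0]])
		move_cpu(cpu, 0);
}

int main(void)
{
	int i;

	activate(2, 3072);	/* cpu2 takes on three tasks' worth of load */
	activate(1, 1024);	/* lighter than the current tail, so cpu1 stays put */
	deactivate(2, 2048);	/* cpu2 drains most of it again */

	for (i = 0; i < NR_CPUS; i++)
		printf("%scpu%d(%lu)", i ? " -> " : "head ", order[i], load[order[i]]);
	printf(" <- tail\n");
	return 0;
}

Note that cpu1 ends up ahead of the still-idle cpu3: the list is only approximately sorted, which is exactly the cheap trade-off these helpers make on the enqueue/dequeue fast path.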
255 | */ 256 | 257 | #include 258 | @@ -86,6 +88,7 @@ 259 | #include "sched.h" 260 | #include "../workqueue_internal.h" 261 | #include "../smpboot.h" 262 | +#include "bld.h" 263 | 264 | #define CREATE_TRACE_POINTS 265 | #include 266 | @@ -807,6 +810,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 267 | update_rq_clock(rq); 268 | sched_info_queued(rq, p); 269 | p->sched_class->enqueue_task(rq, p, flags); 270 | + if (!dl_task(p)) 271 | + bld_track_load_activate(rq, p); 272 | } 273 | 274 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 275 | @@ -814,6 +819,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 276 | update_rq_clock(rq); 277 | sched_info_dequeued(rq, p); 278 | p->sched_class->dequeue_task(rq, p, flags); 279 | + if (!dl_task(p)) 280 | + bld_track_load_deactivate(rq, p); 281 | } 282 | 283 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 284 | @@ -1379,7 +1386,14 @@ static inline 285 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 286 | { 287 | if (p->nr_cpus_allowed > 1) 288 | +#ifndef CONFIG_BLD 289 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 290 | +#else 291 | + if(dl_task(p)) 292 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 293 | + else 294 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 295 | +#endif 296 | 297 | /* 298 | * In order not to call set_task_cpu() on a blocking task we need 299 | @@ -1549,7 +1563,11 @@ void scheduler_ipi(void) 300 | */ 301 | preempt_fold_need_resched(); 302 | 303 | +#ifndef CONFIG_BLD 304 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 305 | +#else 306 | + if (llist_empty(&this_rq()->wake_list)) 307 | +#endif 308 | return; 309 | 310 | /* 311 | @@ -1571,13 +1589,16 @@ void scheduler_ipi(void) 312 | /* 313 | * Check if someone kicked us for doing the nohz idle load balance. 314 | */ 315 | +#ifndef CONFIG_BLD 316 | if (unlikely(got_nohz_idle_kick())) { 317 | this_rq()->idle_balance = 1; 318 | raise_softirq_irqoff(SCHED_SOFTIRQ); 319 | } 320 | +#endif 321 | irq_exit(); 322 | } 323 | 324 | +#ifndef CONFIG_BLD 325 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 326 | { 327 | struct rq *rq = cpu_rq(cpu); 328 | @@ -1590,6 +1611,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 329 | } 330 | } 331 | 332 | +#endif 333 | + 334 | +bool cpus_share_cache(int this_cpu, int that_cpu) 335 | +{ 336 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 337 | +} 338 | + 339 | void wake_up_if_idle(int cpu) 340 | { 341 | struct rq *rq = cpu_rq(cpu); 342 | @@ -1613,18 +1641,13 @@ void wake_up_if_idle(int cpu) 343 | out: 344 | rcu_read_unlock(); 345 | } 346 | - 347 | -bool cpus_share_cache(int this_cpu, int that_cpu) 348 | -{ 349 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 350 | -} 351 | #endif /* CONFIG_SMP */ 352 | 353 | static void ttwu_queue(struct task_struct *p, int cpu) 354 | { 355 | struct rq *rq = cpu_rq(cpu); 356 | 357 | -#if defined(CONFIG_SMP) 358 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 359 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 360 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 361 | ttwu_queue_remote(p, cpu); 362 | @@ -1948,7 +1971,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 363 | * Silence PROVE_RCU. 
364 | */ 365 | raw_spin_lock_irqsave(&p->pi_lock, flags); 366 | - set_task_cpu(p, cpu); 367 | + __set_task_cpu(p, cpu); 368 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 369 | 370 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 371 | @@ -2415,7 +2438,14 @@ void sched_exec(void) 372 | int dest_cpu; 373 | 374 | raw_spin_lock_irqsave(&p->pi_lock, flags); 375 | +#ifndef CONFIG_BLD 376 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 377 | +#else 378 | + if (dl_task(p)) 379 | + dest_cpu = task_cpu(p); 380 | + else 381 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 382 | +#endif 383 | if (dest_cpu == smp_processor_id()) 384 | goto unlock; 385 | 386 | @@ -2503,8 +2533,10 @@ void scheduler_tick(void) 387 | 388 | #ifdef CONFIG_SMP 389 | rq->idle_balance = idle_cpu(cpu); 390 | +#ifndef CONFIG_BLD 391 | trigger_load_balance(rq); 392 | #endif 393 | +#endif 394 | rq_last_tick_reset(rq); 395 | } 396 | 397 | @@ -7221,6 +7253,15 @@ void __init sched_init(void) 398 | #endif 399 | init_rq_hrtick(rq); 400 | atomic_set(&rq->nr_iowait, 0); 401 | +#ifdef CONFIG_BLD 402 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 403 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 404 | + rq->cfs.pos = 0; 405 | + 406 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 407 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 408 | + rq->rt.lowbit = INT_MAX; 409 | +#endif 410 | } 411 | 412 | set_load_weight(&init_task); 413 | @@ -7261,6 +7302,9 @@ void __init sched_init(void) 414 | init_sched_fair_class(); 415 | 416 | scheduler_running = 1; 417 | +#ifdef CONFIG_BLD 418 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 419 | +#endif 420 | } 421 | 422 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 423 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 424 | index c2980e8..ffe8e78 100644 425 | --- a/kernel/sched/fair.c 426 | +++ b/kernel/sched/fair.c 427 | @@ -4424,6 +4424,7 @@ static void task_waking_fair(struct task_struct *p) 428 | record_wakee(p); 429 | } 430 | 431 | +#ifndef CONFIG_BLD 432 | #ifdef CONFIG_FAIR_GROUP_SCHED 433 | /* 434 | * effective_load() calculates the load change as seen from the root_task_group 435 | @@ -4903,6 +4904,7 @@ unlock: 436 | 437 | return new_cpu; 438 | } 439 | +#endif /* CONFIG_BLD */ 440 | 441 | /* 442 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 443 | @@ -5198,6 +5200,7 @@ simple: 444 | return p; 445 | 446 | idle: 447 | +#ifndef CONFIG_BLD 448 | new_tasks = idle_balance(rq); 449 | /* 450 | * Because idle_balance() releases (and re-acquires) rq->lock, it is 451 | @@ -5209,7 +5212,7 @@ idle: 452 | 453 | if (new_tasks > 0) 454 | goto again; 455 | - 456 | +#endif 457 | return NULL; 458 | } 459 | 460 | @@ -5921,8 +5924,9 @@ static unsigned long task_h_load(struct task_struct *p) 461 | } 462 | #endif 463 | 464 | -/********** Helpers for find_busiest_group ************************/ 465 | +#ifndef CONFIG_BLD 466 | 467 | +/********** Helpers for find_busiest_group ************************/ 468 | enum group_type { 469 | group_other = 0, 470 | group_imbalanced, 471 | @@ -6014,6 +6018,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 472 | return load_idx; 473 | } 474 | 475 | +#endif /* CONFIG_BLD */ 476 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) 477 | { 478 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) 479 | @@ -6141,6 +6146,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 480 | sdg->sgc->capacity = 
capacity; 481 | } 482 | 483 | +#ifndef CONFIG_BLD 484 | /* 485 | * Check whether the capacity of the rq has been noticeably reduced by side 486 | * activity. The imbalance_pct is used for the threshold. 487 | @@ -7377,6 +7383,8 @@ static inline int on_null_domain(struct rq *rq) 488 | return unlikely(!rcu_dereference_sched(rq->sd)); 489 | } 490 | 491 | +#endif /* CONFIG_BLD */ 492 | + 493 | #ifdef CONFIG_NO_HZ_COMMON 494 | /* 495 | * idle load balancing details 496 | @@ -7384,12 +7392,39 @@ static inline int on_null_domain(struct rq *rq) 497 | * needed, they will kick the idle load balancer, which then does idle 498 | * load balancing for all the idle CPUs. 499 | */ 500 | +#ifndef CONFIG_BLD 501 | static struct { 502 | cpumask_var_t idle_cpus_mask; 503 | atomic_t nr_cpus; 504 | unsigned long next_balance; /* in jiffy units */ 505 | } nohz ____cacheline_aligned; 506 | 507 | +static inline void nohz_balance_exit_idle(int cpu) 508 | +{ 509 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 510 | + /* 511 | + * Completely isolated CPUs don't ever set, so we must test. 512 | + */ 513 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 514 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 515 | + atomic_dec(&nohz.nr_cpus); 516 | + } 517 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 518 | + } 519 | +} 520 | + 521 | +static int sched_ilb_notifier(struct notifier_block *nfb, 522 | + unsigned long action, void *hcpu) 523 | +{ 524 | + switch (action & ~CPU_TASKS_FROZEN) { 525 | + case CPU_DYING: 526 | + nohz_balance_exit_idle(smp_processor_id()); 527 | + return NOTIFY_OK; 528 | + default: 529 | + return NOTIFY_DONE; 530 | + } 531 | +} 532 | + 533 | static inline int find_new_ilb(void) 534 | { 535 | int ilb = cpumask_first(nohz.idle_cpus_mask); 536 | @@ -7427,20 +7462,7 @@ static void nohz_balancer_kick(void) 537 | smp_send_reschedule(ilb_cpu); 538 | return; 539 | } 540 | - 541 | -static inline void nohz_balance_exit_idle(int cpu) 542 | -{ 543 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 544 | - /* 545 | - * Completely isolated CPUs don't ever set, so we must test. 546 | - */ 547 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 548 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 549 | - atomic_dec(&nohz.nr_cpus); 550 | - } 551 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 552 | - } 553 | -} 554 | +#endif /* CONFIG_BLD */ 555 | 556 | static inline void set_cpu_sd_state_busy(void) 557 | { 558 | @@ -7482,6 +7504,7 @@ unlock: 559 | */ 560 | void nohz_balance_enter_idle(int cpu) 561 | { 562 | +#ifndef CONFIG_BLD 563 | /* 564 | * If this cpu is going down, then nothing needs to be done. 565 | */ 566 | @@ -7500,23 +7523,10 @@ void nohz_balance_enter_idle(int cpu) 567 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 568 | atomic_inc(&nohz.nr_cpus); 569 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 570 | -} 571 | - 572 | -static int sched_ilb_notifier(struct notifier_block *nfb, 573 | - unsigned long action, void *hcpu) 574 | -{ 575 | - switch (action & ~CPU_TASKS_FROZEN) { 576 | - case CPU_DYING: 577 | - nohz_balance_exit_idle(smp_processor_id()); 578 | - return NOTIFY_OK; 579 | - default: 580 | - return NOTIFY_DONE; 581 | - } 582 | +#endif 583 | } 584 | #endif 585 | 586 | -static DEFINE_SPINLOCK(balancing); 587 | - 588 | /* 589 | * Scale the max load_balance interval with the number of CPUs in the system. 590 | * This trades load-balance latency on larger machines for less cross talk. 
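The core.c hunks above hook BLD into the common enqueue/dequeue path: every time a non-deadline task is queued or dequeued, bld_track_load_activate()/_deactivate() get a chance to reposition that runqueue in the global lists. A compilable toy sketch of that hook shape (stub types only, not the patch's actual code):

/* Sketch of the enqueue-side hook (toy types; the real hooks are
 * bld_track_load_activate()/_deactivate() and skip deadline tasks).    */
#include <stdio.h>

struct toy_task { int is_dl; int weight; };
struct toy_rq   { int load; };

static void class_enqueue(struct toy_rq *rq, struct toy_task *p)
{
	rq->load += p->weight;              /* what the sched class does  */
}

static void bld_track_activate(struct toy_rq *rq, struct toy_task *p)
{
	printf("reposition rq (load %d, +%d) in the global BLD list\n",
	       rq->load, p->weight);
}

static void enqueue_task(struct toy_rq *rq, struct toy_task *p)
{
	class_enqueue(rq, p);
	if (!p->is_dl)                      /* deadline tasks are skipped */
		bld_track_activate(rq, p);
}

int main(void)
{
	struct toy_rq rq = { 0 };
	struct toy_task p = { .is_dl = 0, .weight = 1024 };

	enqueue_task(&rq, &p);
	return 0;
}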
591 | @@ -7526,6 +7536,9 @@ void update_max_interval(void) 592 | max_load_balance_interval = HZ*num_online_cpus()/10; 593 | } 594 | 595 | +#ifndef CONFIG_BLD 596 | +static DEFINE_SPINLOCK(balancing); 597 | + 598 | /* 599 | * It checks each scheduling domain to see if it is due to be balanced, 600 | * and initiates a balancing operation if so. 601 | @@ -7787,6 +7800,7 @@ void trigger_load_balance(struct rq *rq) 602 | nohz_balancer_kick(); 603 | #endif 604 | } 605 | +#endif /* CONFIG_BLD */ 606 | 607 | static void rq_online_fair(struct rq *rq) 608 | { 609 | @@ -8232,7 +8246,9 @@ const struct sched_class fair_sched_class = { 610 | .put_prev_task = put_prev_task_fair, 611 | 612 | #ifdef CONFIG_SMP 613 | +#ifndef CONFIG_BLD 614 | .select_task_rq = select_task_rq_fair, 615 | +#endif 616 | .migrate_task_rq = migrate_task_rq_fair, 617 | 618 | .rq_online = rq_online_fair, 619 | @@ -8272,6 +8288,7 @@ void print_cfs_stats(struct seq_file *m, int cpu) 620 | 621 | __init void init_sched_fair_class(void) 622 | { 623 | +#ifndef CONFIG_BLD 624 | #ifdef CONFIG_SMP 625 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 626 | 627 | @@ -8281,5 +8298,5 @@ __init void init_sched_fair_class(void) 628 | cpu_notifier(sched_ilb_notifier, 0); 629 | #endif 630 | #endif /* SMP */ 631 | - 632 | +#endif /* BLD */ 633 | } 634 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 635 | index 575da76..105468eb 100644 636 | --- a/kernel/sched/rt.c 637 | +++ b/kernel/sched/rt.c 638 | @@ -1310,6 +1310,7 @@ static void yield_task_rt(struct rq *rq) 639 | #ifdef CONFIG_SMP 640 | static int find_lowest_rq(struct task_struct *task); 641 | 642 | +#ifndef CONFIG_BLD 643 | static int 644 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 645 | { 646 | @@ -1365,6 +1366,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 647 | out: 648 | return cpu; 649 | } 650 | +#endif /* CONFIG_BLD */ 651 | 652 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 653 | { 654 | @@ -2309,7 +2311,9 @@ const struct sched_class rt_sched_class = { 655 | .put_prev_task = put_prev_task_rt, 656 | 657 | #ifdef CONFIG_SMP 658 | +#ifndef CONFIG_BLD 659 | .select_task_rq = select_task_rq_rt, 660 | +#endif 661 | 662 | .set_cpus_allowed = set_cpus_allowed_rt, 663 | .rq_online = rq_online_rt, 664 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 665 | index e0e1299..11aad03 100644 666 | --- a/kernel/sched/sched.h 667 | +++ b/kernel/sched/sched.h 668 | @@ -392,9 +392,8 @@ struct cfs_rq { 669 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 670 | #endif /* CONFIG_SMP */ 671 | 672 | -#ifdef CONFIG_FAIR_GROUP_SCHED 673 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 674 | - 675 | +#ifdef CONFIG_FAIR_GROUP_SCHED 676 | /* 677 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 678 | * a hierarchy). 
Non-leaf lrqs hold other higher schedulable entities 679 | @@ -418,6 +417,11 @@ struct cfs_rq { 680 | struct list_head throttled_list; 681 | #endif /* CONFIG_CFS_BANDWIDTH */ 682 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 683 | + 684 | +#ifdef CONFIG_BLD 685 | + struct list_head bld_cfs_list; 686 | + char pos; 687 | +#endif 688 | }; 689 | 690 | static inline int rt_bandwidth_enabled(void) 691 | @@ -462,12 +466,16 @@ struct rt_rq { 692 | /* Nests inside the rq lock: */ 693 | raw_spinlock_t rt_runtime_lock; 694 | 695 | + struct rq *rq; 696 | #ifdef CONFIG_RT_GROUP_SCHED 697 | unsigned long rt_nr_boosted; 698 | 699 | - struct rq *rq; 700 | struct task_group *tg; 701 | #endif 702 | +#ifdef CONFIG_BLD 703 | + struct list_head bld_rt_list; 704 | + int lowbit; 705 | +#endif 706 | }; 707 | 708 | /* Deadline class' related fields in a runqueue */ 709 | -------------------------------------------------------------------------------- /BLD-4.4.patch: -------------------------------------------------------------------------------- 1 | diff --git a/init/Kconfig b/init/Kconfig 2 | index 235c7a2..01a91fb 100644 3 | --- a/init/Kconfig 4 | +++ b/init/Kconfig 5 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 6 | depends on BROKEN || !SMP 7 | default y 8 | 9 | +config BLD 10 | + bool "An alternate CPU load distribution technique for task scheduler" 11 | + depends on SMP 12 | + default y 13 | + help 14 | + This is an alternate CPU load distribution technique based for task 15 | + scheduler based on The Barbershop Load Distribution algorithm. Not 16 | + suitable for NUMA, should work well on SMP. 17 | + 18 | config INIT_ENV_ARG_LIMIT 19 | int 20 | default 32 if !UML 21 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 22 | new file mode 100644 23 | index 0000000..f1f9fba 24 | --- /dev/null 25 | +++ b/kernel/sched/bld.h 26 | @@ -0,0 +1,215 @@ 27 | +#ifdef CONFIG_BLD 28 | + 29 | +static DEFINE_RWLOCK(rt_list_lock); 30 | +static LIST_HEAD(rt_rq_head); 31 | +static LIST_HEAD(cfs_rq_head); 32 | +static DEFINE_RWLOCK(cfs_list_lock); 33 | + 34 | +#ifdef CONFIG_FAIR_GROUP_SCHED 35 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 36 | +{ 37 | + return cfs_rq->rq; 38 | +} 39 | +#else 40 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 41 | +{ 42 | + return container_of(cfs_rq, struct rq, cfs); 43 | +} 44 | +#endif 45 | + 46 | +#ifdef CONFIG_RT_GROUP_SCHED 47 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 48 | +{ 49 | + return rt_rq->rq; 50 | +} 51 | +#else 52 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 53 | +{ 54 | + return container_of(rt_rq, struct rq, rt); 55 | +} 56 | +#endif 57 | + 58 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 59 | +{ 60 | + int cpu = smp_processor_id(), i; 61 | + unsigned long load, varload; 62 | + struct rq *rq; 63 | + 64 | + if (task_type) { 65 | + varload = ULONG_MAX; 66 | + for_each_cpu(i, mask) { 67 | + rq = cpu_rq(i); 68 | + load = rq->cfs.load.weight; 69 | + if (load < varload) { 70 | + varload = load; 71 | + cpu = i; 72 | + } 73 | + } 74 | + } else { 75 | + /* Here's an attempt to get a CPU within the mask where 76 | + * we can preempt easily. To achieve this we tried to 77 | + * maintain a lowbit, which indicate the lowest bit set on 78 | + * array bitmap. Since all CPUs contains high priority 79 | + * kernel threads therefore we eliminate 0, so it might not 80 | + * be right every time, but it's just an indicator. 
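The lowbit idea in the comment above can be modelled outside the kernel: each CPU exposes the index of the lowest set bit in its RT priority bitmap (a small index means high-priority work is queued), and the wakeup path picks the CPU whose best queued work has the lowest priority, since that is the easiest one to preempt. A hedged sketch, assuming the per-CPU lowbit values are already known (in the patch they come from sched_find_first_bit() on rt.active.bitmap):

/* Model of the lowbit pick (assumption: per-CPU lowbit values are given;
 * in the patch they come from sched_find_first_bit() on rt.active).    */
#include <stdio.h>

#define NR_TOY_CPUS 4

/* A larger lowbit means the best queued RT work on that CPU is of lower
 * priority, so a newly woken RT task can preempt it more easily.        */
static int pick_rt_cpu(const int lowbit[], int ncpus)
{
	int cpu = 0, best = 1, i;           /* start at 1: bit 0 ignored  */

	for (i = 0; i < ncpus; i++) {
		if (lowbit[i] >= best) {
			best = lowbit[i];
			cpu = i;
		}
	}
	return cpu;
}

int main(void)
{
	int lowbit[NR_TOY_CPUS] = { 5, 40, 12, 99 };   /* 99: nearly idle */

	printf("wake the RT task on cpu %d\n", pick_rt_cpu(lowbit, NR_TOY_CPUS));
	return 0;
}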
81 | + */ 82 | + varload = 1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load >= varload) { 88 | + varload = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | + return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = firstbit; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index 732e993..ffb231a 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
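bld_get_cpu() above is the single entry point the core scheduler calls instead of the per-class select_task_rq() hooks: wakeups (and exec balancing for multi-threaded tasks) scan a cpumask for the emptiest runqueue, while everything else simply takes the head of the globally ordered rt or cfs list. A simplified, compilable decision tree with stubbed helpers (an illustration of the control flow, not the patch's code):

/* Simplified decision tree of bld_get_cpu() (stubbed helpers; an
 * illustration of the control flow only, not the patch's code).         */
#include <stdio.h>

enum { BAL_WAKE = 1, BAL_EXEC = 2 };    /* toy stand-ins for SD_* flags  */

static int pick_from_domain(void) { return 0; }  /* mask scan            */
static int pick_rt_head(void)     { return 1; }  /* head of rt list      */
static int pick_cfs_head(void)    { return 2; }  /* head of cfs list     */

static int bld_get_cpu_model(int sd_flags, int is_rt, int nr_threads)
{
	/* wakeups, and exec balancing for multi-threaded tasks, scan a
	 * domain mask; everything else takes the head of the globally
	 * ordered rt or cfs runqueue list                                */
	if (sd_flags == BAL_WAKE || (sd_flags == BAL_EXEC && nr_threads > 1))
		return pick_from_domain();
	return is_rt ? pick_rt_head() : pick_cfs_head();
}

int main(void)
{
	printf("fork of a CFS task   -> cpu %d\n", bld_get_cpu_model(0, 0, 1));
	printf("wakeup of an RT task -> cpu %d\n",
	       bld_get_cpu_model(BAL_WAKE, 1, 1));
	return 0;
}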
252 | */ 253 | 254 | #include 255 | @@ -86,6 +88,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -833,6 +836,8 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | if (!(flags & ENQUEUE_RESTORE)) 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -841,6 +846,8 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | if (!(flags & DEQUEUE_SAVE)) 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1625,7 +1632,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 282 | lockdep_assert_held(&p->pi_lock); 283 | 284 | if (p->nr_cpus_allowed > 1) 285 | +#ifndef CONFIG_BLD 286 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 287 | +#else 288 | + if(dl_task(p)) 289 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 290 | + else 291 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 292 | +#endif 293 | 294 | /* 295 | * In order not to call set_task_cpu() on a blocking task we need 296 | @@ -1815,7 +1829,11 @@ void scheduler_ipi(void) 297 | */ 298 | preempt_fold_need_resched(); 299 | 300 | +#ifndef CONFIG_BLD 301 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 302 | +#else 303 | + if (llist_empty(&this_rq()->wake_list)) 304 | +#endif 305 | return; 306 | 307 | /* 308 | @@ -1837,13 +1855,16 @@ void scheduler_ipi(void) 309 | /* 310 | * Check if someone kicked us for doing the nohz idle load balance. 311 | */ 312 | +#ifndef CONFIG_BLD 313 | if (unlikely(got_nohz_idle_kick())) { 314 | this_rq()->idle_balance = 1; 315 | raise_softirq_irqoff(SCHED_SOFTIRQ); 316 | } 317 | +#endif 318 | irq_exit(); 319 | } 320 | 321 | +#ifndef CONFIG_BLD 322 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 323 | { 324 | struct rq *rq = cpu_rq(cpu); 325 | @@ -1856,6 +1877,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 326 | } 327 | } 328 | 329 | +#endif 330 | + 331 | +bool cpus_share_cache(int this_cpu, int that_cpu) 332 | +{ 333 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 334 | +} 335 | + 336 | void wake_up_if_idle(int cpu) 337 | { 338 | struct rq *rq = cpu_rq(cpu); 339 | @@ -1879,18 +1907,13 @@ void wake_up_if_idle(int cpu) 340 | out: 341 | rcu_read_unlock(); 342 | } 343 | - 344 | -bool cpus_share_cache(int this_cpu, int that_cpu) 345 | -{ 346 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 347 | -} 348 | #endif /* CONFIG_SMP */ 349 | 350 | static void ttwu_queue(struct task_struct *p, int cpu) 351 | { 352 | struct rq *rq = cpu_rq(cpu); 353 | 354 | -#if defined(CONFIG_SMP) 355 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 356 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 357 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 358 | ttwu_queue_remote(p, cpu); 359 | @@ -2243,7 +2266,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 360 | * Silence PROVE_RCU. 
361 | */ 362 | raw_spin_lock_irqsave(&p->pi_lock, flags); 363 | - set_task_cpu(p, cpu); 364 | + __set_task_cpu(p, cpu); 365 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 366 | 367 | #ifdef CONFIG_SCHED_INFO 368 | @@ -2788,7 +2811,14 @@ void sched_exec(void) 369 | int dest_cpu; 370 | 371 | raw_spin_lock_irqsave(&p->pi_lock, flags); 372 | +#ifndef CONFIG_BLD 373 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 374 | +#else 375 | + if (dl_task(p)) 376 | + dest_cpu = task_cpu(p); 377 | + else 378 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 379 | +#endif 380 | if (dest_cpu == smp_processor_id()) 381 | goto unlock; 382 | 383 | @@ -2877,8 +2907,10 @@ void scheduler_tick(void) 384 | 385 | #ifdef CONFIG_SMP 386 | rq->idle_balance = idle_cpu(cpu); 387 | +#ifndef CONFIG_BLD 388 | trigger_load_balance(rq); 389 | #endif 390 | +#endif 391 | rq_last_tick_reset(rq); 392 | } 393 | 394 | @@ -7492,6 +7524,15 @@ void __init sched_init(void) 395 | #endif 396 | init_rq_hrtick(rq); 397 | atomic_set(&rq->nr_iowait, 0); 398 | +#ifdef CONFIG_BLD 399 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 400 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 401 | + rq->cfs.pos = 0; 402 | + 403 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 404 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 405 | + rq->rt.lowbit = INT_MAX; 406 | +#endif 407 | } 408 | 409 | set_load_weight(&init_task); 410 | @@ -7532,6 +7573,9 @@ void __init sched_init(void) 411 | init_sched_fair_class(); 412 | 413 | scheduler_running = 1; 414 | +#ifdef CONFIG_BLD 415 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 416 | +#endif 417 | } 418 | 419 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 420 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 421 | index cfdc0e6..08fd5f4 100644 422 | --- a/kernel/sched/fair.c 423 | +++ b/kernel/sched/fair.c 424 | @@ -4518,6 +4518,7 @@ static void task_waking_fair(struct task_struct *p) 425 | record_wakee(p); 426 | } 427 | 428 | +#ifndef CONFIG_BLD 429 | #ifdef CONFIG_FAIR_GROUP_SCHED 430 | /* 431 | * effective_load() calculates the load change as seen from the root_task_group 432 | @@ -5003,6 +5004,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f 433 | 434 | return new_cpu; 435 | } 436 | +#endif /* CONFIG_BLD */ 437 | 438 | /* 439 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 440 | @@ -5308,6 +5310,7 @@ idle: 441 | * further scheduler activity on it and we're being very careful to 442 | * re-start the picking loop. 
443 | */ 444 | +#ifndef CONFIG_BLD 445 | lockdep_unpin_lock(&rq->lock); 446 | new_tasks = idle_balance(rq); 447 | lockdep_pin_lock(&rq->lock); 448 | @@ -5321,7 +5324,7 @@ idle: 449 | 450 | if (new_tasks > 0) 451 | goto again; 452 | - 453 | +#endif 454 | return NULL; 455 | } 456 | 457 | @@ -5982,8 +5985,9 @@ static unsigned long task_h_load(struct task_struct *p) 458 | } 459 | #endif 460 | 461 | -/********** Helpers for find_busiest_group ************************/ 462 | +#ifndef CONFIG_BLD 463 | 464 | +/********** Helpers for find_busiest_group ************************/ 465 | enum group_type { 466 | group_other = 0, 467 | group_imbalanced, 468 | @@ -6074,6 +6078,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 469 | 470 | return load_idx; 471 | } 472 | +#endif /* CONFIG_BLD */ 473 | 474 | static unsigned long scale_rt_capacity(int cpu) 475 | { 476 | @@ -6182,6 +6187,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 477 | sdg->sgc->capacity = capacity; 478 | } 479 | 480 | +#ifndef CONFIG_BLD 481 | /* 482 | * Check whether the capacity of the rq has been noticeably reduced by side 483 | * activity. The imbalance_pct is used for the threshold. 484 | @@ -7415,6 +7421,8 @@ static inline int on_null_domain(struct rq *rq) 485 | return unlikely(!rcu_dereference_sched(rq->sd)); 486 | } 487 | 488 | +#endif /* CONFIG_BLD */ 489 | + 490 | #ifdef CONFIG_NO_HZ_COMMON 491 | /* 492 | * idle load balancing details 493 | @@ -7422,12 +7430,39 @@ static inline int on_null_domain(struct rq *rq) 494 | * needed, they will kick the idle load balancer, which then does idle 495 | * load balancing for all the idle CPUs. 496 | */ 497 | +#ifndef CONFIG_BLD 498 | static struct { 499 | cpumask_var_t idle_cpus_mask; 500 | atomic_t nr_cpus; 501 | unsigned long next_balance; /* in jiffy units */ 502 | } nohz ____cacheline_aligned; 503 | 504 | +static inline void nohz_balance_exit_idle(int cpu) 505 | +{ 506 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 507 | + /* 508 | + * Completely isolated CPUs don't ever set, so we must test. 509 | + */ 510 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 511 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 512 | + atomic_dec(&nohz.nr_cpus); 513 | + } 514 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 515 | + } 516 | +} 517 | + 518 | +static int sched_ilb_notifier(struct notifier_block *nfb, 519 | + unsigned long action, void *hcpu) 520 | +{ 521 | + switch (action & ~CPU_TASKS_FROZEN) { 522 | + case CPU_DYING: 523 | + nohz_balance_exit_idle(smp_processor_id()); 524 | + return NOTIFY_OK; 525 | + default: 526 | + return NOTIFY_DONE; 527 | + } 528 | +} 529 | + 530 | static inline int find_new_ilb(void) 531 | { 532 | int ilb = cpumask_first(nohz.idle_cpus_mask); 533 | @@ -7465,20 +7500,7 @@ static void nohz_balancer_kick(void) 534 | smp_send_reschedule(ilb_cpu); 535 | return; 536 | } 537 | - 538 | -static inline void nohz_balance_exit_idle(int cpu) 539 | -{ 540 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 541 | - /* 542 | - * Completely isolated CPUs don't ever set, so we must test. 
543 | - */ 544 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 545 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 546 | - atomic_dec(&nohz.nr_cpus); 547 | - } 548 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 549 | - } 550 | -} 551 | +#endif /* CONFIG_BLD */ 552 | 553 | static inline void set_cpu_sd_state_busy(void) 554 | { 555 | @@ -7520,6 +7542,7 @@ unlock: 556 | */ 557 | void nohz_balance_enter_idle(int cpu) 558 | { 559 | +#ifndef CONFIG_BLD 560 | /* 561 | * If this cpu is going down, then nothing needs to be done. 562 | */ 563 | @@ -7538,23 +7561,10 @@ void nohz_balance_enter_idle(int cpu) 564 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 565 | atomic_inc(&nohz.nr_cpus); 566 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 567 | -} 568 | - 569 | -static int sched_ilb_notifier(struct notifier_block *nfb, 570 | - unsigned long action, void *hcpu) 571 | -{ 572 | - switch (action & ~CPU_TASKS_FROZEN) { 573 | - case CPU_DYING: 574 | - nohz_balance_exit_idle(smp_processor_id()); 575 | - return NOTIFY_OK; 576 | - default: 577 | - return NOTIFY_DONE; 578 | - } 579 | +#endif 580 | } 581 | #endif 582 | 583 | -static DEFINE_SPINLOCK(balancing); 584 | - 585 | /* 586 | * Scale the max load_balance interval with the number of CPUs in the system. 587 | * This trades load-balance latency on larger machines for less cross talk. 588 | @@ -7564,6 +7574,9 @@ void update_max_interval(void) 589 | max_load_balance_interval = HZ*num_online_cpus()/10; 590 | } 591 | 592 | +#ifndef CONFIG_BLD 593 | +static DEFINE_SPINLOCK(balancing); 594 | + 595 | /* 596 | * It checks each scheduling domain to see if it is due to be balanced, 597 | * and initiates a balancing operation if so. 598 | @@ -7851,6 +7864,7 @@ void trigger_load_balance(struct rq *rq) 599 | nohz_balancer_kick(); 600 | #endif 601 | } 602 | +#endif /* CONFIG_BLD */ 603 | 604 | static void rq_online_fair(struct rq *rq) 605 | { 606 | @@ -8282,7 +8296,9 @@ const struct sched_class fair_sched_class = { 607 | .put_prev_task = put_prev_task_fair, 608 | 609 | #ifdef CONFIG_SMP 610 | +#ifndef CONFIG_BLD 611 | .select_task_rq = select_task_rq_fair, 612 | +#endif 613 | .migrate_task_rq = migrate_task_rq_fair, 614 | 615 | .rq_online = rq_online_fair, 616 | @@ -8344,6 +8360,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) 617 | 618 | __init void init_sched_fair_class(void) 619 | { 620 | +#ifndef CONFIG_BLD 621 | #ifdef CONFIG_SMP 622 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 623 | 624 | @@ -8353,5 +8370,5 @@ __init void init_sched_fair_class(void) 625 | cpu_notifier(sched_ilb_notifier, 0); 626 | #endif 627 | #endif /* SMP */ 628 | - 629 | +#endif /* BLD */ 630 | } 631 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 632 | index 8ec86ab..cada34d 100644 633 | --- a/kernel/sched/rt.c 634 | +++ b/kernel/sched/rt.c 635 | @@ -1313,6 +1313,7 @@ static void yield_task_rt(struct rq *rq) 636 | #ifdef CONFIG_SMP 637 | static int find_lowest_rq(struct task_struct *task); 638 | 639 | +#ifndef CONFIG_BLD 640 | static int 641 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 642 | { 643 | @@ -1368,6 +1369,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 644 | out: 645 | return cpu; 646 | } 647 | +#endif /* CONFIG_BLD */ 648 | 649 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 650 | { 651 | @@ -2272,7 +2274,9 @@ const struct sched_class rt_sched_class = { 652 | .put_prev_task = put_prev_task_rt, 653 | 654 | #ifdef CONFIG_SMP 655 | +#ifndef 
CONFIG_BLD 656 | .select_task_rq = select_task_rq_rt, 657 | +#endif 658 | 659 | .set_cpus_allowed = set_cpus_allowed_common, 660 | .rq_online = rq_online_rt, 661 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 662 | index b242775..256ad05 100644 663 | --- a/kernel/sched/sched.h 664 | +++ b/kernel/sched/sched.h 665 | @@ -395,9 +395,8 @@ struct cfs_rq { 666 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 667 | #endif /* CONFIG_SMP */ 668 | 669 | -#ifdef CONFIG_FAIR_GROUP_SCHED 670 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 671 | - 672 | +#ifdef CONFIG_FAIR_GROUP_SCHED 673 | /* 674 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 675 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 676 | @@ -421,6 +420,11 @@ struct cfs_rq { 677 | struct list_head throttled_list; 678 | #endif /* CONFIG_CFS_BANDWIDTH */ 679 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 680 | + 681 | +#ifdef CONFIG_BLD 682 | + struct list_head bld_cfs_list; 683 | + char pos; 684 | +#endif 685 | }; 686 | 687 | static inline int rt_bandwidth_enabled(void) 688 | @@ -465,12 +469,16 @@ struct rt_rq { 689 | /* Nests inside the rq lock: */ 690 | raw_spinlock_t rt_runtime_lock; 691 | 692 | + struct rq *rq; 693 | #ifdef CONFIG_RT_GROUP_SCHED 694 | unsigned long rt_nr_boosted; 695 | 696 | - struct rq *rq; 697 | struct task_group *tg; 698 | #endif 699 | +#ifdef CONFIG_BLD 700 | + struct list_head bld_rt_list; 701 | + int lowbit; 702 | +#endif 703 | }; 704 | 705 | /* Deadline class' related fields in a runqueue */ 706 | -------------------------------------------------------------------------------- /BLD-4.5.patch: -------------------------------------------------------------------------------- 1 | diff --git a/init/Kconfig b/init/Kconfig 2 | index 2232080..627f6ca 100644 3 | --- a/init/Kconfig 4 | +++ b/init/Kconfig 5 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 6 | depends on BROKEN || !SMP 7 | default y 8 | 9 | +config BLD 10 | + bool "An alternate CPU load distribution technique for task scheduler" 11 | + depends on SMP 12 | + default y 13 | + help 14 | + This is an alternate CPU load distribution technique based for task 15 | + scheduler based on The Barbershop Load Distribution algorithm. Not 16 | + suitable for NUMA, should work well on SMP. 
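Alongside the Kconfig switch, each of these patches teaches struct cfs_rq and struct rt_rq to carry a list node plus a position/lowbit marker (the sched.h hunk above) and seeds them in sched_init(): every runqueue starts on the head side of the cfs list with pos = 0, and rt.lowbit starts at INT_MAX, meaning nothing is queued there and it can be preempted freely. A toy version of that per-runqueue state and its initialisation, using plain arrays instead of list_heads:

/* Toy version of the per-runqueue BLD state added in sched.h and seeded
 * in sched_init() (plain arrays instead of list_heads and rwlocks).     */
#include <limits.h>
#include <stdio.h>

#define NR_TOY_CPUS 4

struct toy_cfs { unsigned long weight; int pos; };   /* bld_cfs_list, pos   */
struct toy_rt  { int lowbit; };                      /* bld_rt_list, lowbit */
struct toy_rq  { struct toy_cfs cfs; struct toy_rt rt; };

static struct toy_rq rqs[NR_TOY_CPUS];

static void toy_sched_init(void)
{
	int i;

	for (i = 0; i < NR_TOY_CPUS; i++) {
		rqs[i].cfs.weight = 0;
		rqs[i].cfs.pos = 0;             /* every rq starts at the head     */
		rqs[i].rt.lowbit = INT_MAX;     /* nothing queued: easy to preempt */
	}
}

int main(void)
{
	toy_sched_init();
	printf("cpu0: pos=%d lowbit=%d\n", rqs[0].cfs.pos, rqs[0].rt.lowbit);
	return 0;
}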
17 | + 18 | config INIT_ENV_ARG_LIMIT 19 | int 20 | default 32 if !UML 21 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 22 | new file mode 100644 23 | index 0000000..f1f9fba 24 | --- /dev/null 25 | +++ b/kernel/sched/bld.h 26 | @@ -0,0 +1,215 @@ 27 | +#ifdef CONFIG_BLD 28 | + 29 | +static DEFINE_RWLOCK(rt_list_lock); 30 | +static LIST_HEAD(rt_rq_head); 31 | +static LIST_HEAD(cfs_rq_head); 32 | +static DEFINE_RWLOCK(cfs_list_lock); 33 | + 34 | +#ifdef CONFIG_FAIR_GROUP_SCHED 35 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 36 | +{ 37 | + return cfs_rq->rq; 38 | +} 39 | +#else 40 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 41 | +{ 42 | + return container_of(cfs_rq, struct rq, cfs); 43 | +} 44 | +#endif 45 | + 46 | +#ifdef CONFIG_RT_GROUP_SCHED 47 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 48 | +{ 49 | + return rt_rq->rq; 50 | +} 51 | +#else 52 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 53 | +{ 54 | + return container_of(rt_rq, struct rq, rt); 55 | +} 56 | +#endif 57 | + 58 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 59 | +{ 60 | + int cpu = smp_processor_id(), i; 61 | + unsigned long load, varload; 62 | + struct rq *rq; 63 | + 64 | + if (task_type) { 65 | + varload = ULONG_MAX; 66 | + for_each_cpu(i, mask) { 67 | + rq = cpu_rq(i); 68 | + load = rq->cfs.load.weight; 69 | + if (load < varload) { 70 | + varload = load; 71 | + cpu = i; 72 | + } 73 | + } 74 | + } else { 75 | + /* Here's an attempt to get a CPU within the mask where 76 | + * we can preempt easily. To achieve this we tried to 77 | + * maintain a lowbit, which indicate the lowest bit set on 78 | + * array bitmap. Since all CPUs contains high priority 79 | + * kernel threads therefore we eliminate 0, so it might not 80 | + * be right every time, but it's just an indicator. 
81 | + */ 82 | + varload = 1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load >= varload) { 88 | + varload = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | + return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = firstbit; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index 41f6b22..d000500 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
252 | */ 253 | 254 | #include 255 | @@ -87,6 +89,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -834,6 +837,8 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | if (!(flags & ENQUEUE_RESTORE)) 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -842,6 +847,8 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | if (!(flags & DEQUEUE_SAVE)) 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1637,7 +1644,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 282 | lockdep_assert_held(&p->pi_lock); 283 | 284 | if (p->nr_cpus_allowed > 1) 285 | +#ifndef CONFIG_BLD 286 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 287 | +#else 288 | + if(dl_task(p)) 289 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 290 | + else 291 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 292 | +#endif 293 | 294 | /* 295 | * In order not to call set_task_cpu() on a blocking task we need 296 | @@ -1827,7 +1841,11 @@ void scheduler_ipi(void) 297 | */ 298 | preempt_fold_need_resched(); 299 | 300 | +#ifndef CONFIG_BLD 301 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 302 | +#else 303 | + if (llist_empty(&this_rq()->wake_list)) 304 | +#endif 305 | return; 306 | 307 | /* 308 | @@ -1849,13 +1867,16 @@ void scheduler_ipi(void) 309 | /* 310 | * Check if someone kicked us for doing the nohz idle load balance. 311 | */ 312 | +#ifndef CONFIG_BLD 313 | if (unlikely(got_nohz_idle_kick())) { 314 | this_rq()->idle_balance = 1; 315 | raise_softirq_irqoff(SCHED_SOFTIRQ); 316 | } 317 | +#endif 318 | irq_exit(); 319 | } 320 | 321 | +#ifndef CONFIG_BLD 322 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 323 | { 324 | struct rq *rq = cpu_rq(cpu); 325 | @@ -1868,6 +1889,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 326 | } 327 | } 328 | 329 | +#endif 330 | + 331 | +bool cpus_share_cache(int this_cpu, int that_cpu) 332 | +{ 333 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 334 | +} 335 | + 336 | void wake_up_if_idle(int cpu) 337 | { 338 | struct rq *rq = cpu_rq(cpu); 339 | @@ -1891,18 +1919,13 @@ void wake_up_if_idle(int cpu) 340 | out: 341 | rcu_read_unlock(); 342 | } 343 | - 344 | -bool cpus_share_cache(int this_cpu, int that_cpu) 345 | -{ 346 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 347 | -} 348 | #endif /* CONFIG_SMP */ 349 | 350 | static void ttwu_queue(struct task_struct *p, int cpu) 351 | { 352 | struct rq *rq = cpu_rq(cpu); 353 | 354 | -#if defined(CONFIG_SMP) 355 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 356 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 357 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 358 | ttwu_queue_remote(p, cpu); 359 | @@ -2344,7 +2367,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 360 | * Silence PROVE_RCU. 
361 | */ 362 | raw_spin_lock_irqsave(&p->pi_lock, flags); 363 | - set_task_cpu(p, cpu); 364 | + __set_task_cpu(p, cpu); 365 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 366 | 367 | #ifdef CONFIG_SCHED_INFO 368 | @@ -2889,7 +2912,14 @@ void sched_exec(void) 369 | int dest_cpu; 370 | 371 | raw_spin_lock_irqsave(&p->pi_lock, flags); 372 | +#ifndef CONFIG_BLD 373 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 374 | +#else 375 | + if (dl_task(p)) 376 | + dest_cpu = task_cpu(p); 377 | + else 378 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 379 | +#endif 380 | if (dest_cpu == smp_processor_id()) 381 | goto unlock; 382 | 383 | @@ -2978,8 +3008,10 @@ void scheduler_tick(void) 384 | 385 | #ifdef CONFIG_SMP 386 | rq->idle_balance = idle_cpu(cpu); 387 | +#ifndef CONFIG_BLD 388 | trigger_load_balance(rq); 389 | #endif 390 | +#endif 391 | rq_last_tick_reset(rq); 392 | } 393 | 394 | @@ -7601,6 +7633,15 @@ void __init sched_init(void) 395 | #endif 396 | init_rq_hrtick(rq); 397 | atomic_set(&rq->nr_iowait, 0); 398 | +#ifdef CONFIG_BLD 399 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 400 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 401 | + rq->cfs.pos = 0; 402 | + 403 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 404 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 405 | + rq->rt.lowbit = INT_MAX; 406 | +#endif 407 | } 408 | 409 | set_load_weight(&init_task); 410 | @@ -7641,6 +7682,9 @@ void __init sched_init(void) 411 | init_sched_fair_class(); 412 | 413 | scheduler_running = 1; 414 | +#ifdef CONFIG_BLD 415 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 416 | +#endif 417 | } 418 | 419 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 420 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 421 | index 56b7d4b..7fe9d9b 100644 422 | --- a/kernel/sched/fair.c 423 | +++ b/kernel/sched/fair.c 424 | @@ -4655,6 +4655,7 @@ static void task_waking_fair(struct task_struct *p) 425 | record_wakee(p); 426 | } 427 | 428 | +#ifndef CONFIG_BLD 429 | #ifdef CONFIG_FAIR_GROUP_SCHED 430 | /* 431 | * effective_load() calculates the load change as seen from the root_task_group 432 | @@ -5140,6 +5141,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f 433 | 434 | return new_cpu; 435 | } 436 | +#endif /* CONFIG_BLD */ 437 | 438 | /* 439 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 440 | @@ -5444,6 +5446,7 @@ idle: 441 | * further scheduler activity on it and we're being very careful to 442 | * re-start the picking loop. 
443 | */ 444 | +#ifndef CONFIG_BLD 445 | lockdep_unpin_lock(&rq->lock); 446 | new_tasks = idle_balance(rq); 447 | lockdep_pin_lock(&rq->lock); 448 | @@ -5457,7 +5460,7 @@ idle: 449 | 450 | if (new_tasks > 0) 451 | goto again; 452 | - 453 | +#endif 454 | return NULL; 455 | } 456 | 457 | @@ -6118,8 +6121,9 @@ static unsigned long task_h_load(struct task_struct *p) 458 | } 459 | #endif 460 | 461 | -/********** Helpers for find_busiest_group ************************/ 462 | +#ifndef CONFIG_BLD 463 | 464 | +/********** Helpers for find_busiest_group ************************/ 465 | enum group_type { 466 | group_other = 0, 467 | group_imbalanced, 468 | @@ -6210,6 +6214,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 469 | 470 | return load_idx; 471 | } 472 | +#endif /* CONFIG_BLD */ 473 | 474 | static unsigned long scale_rt_capacity(int cpu) 475 | { 476 | @@ -6318,6 +6323,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 477 | sdg->sgc->capacity = capacity; 478 | } 479 | 480 | +#ifndef CONFIG_BLD 481 | /* 482 | * Check whether the capacity of the rq has been noticeably reduced by side 483 | * activity. The imbalance_pct is used for the threshold. 484 | @@ -7551,6 +7557,8 @@ static inline int on_null_domain(struct rq *rq) 485 | return unlikely(!rcu_dereference_sched(rq->sd)); 486 | } 487 | 488 | +#endif /* CONFIG_BLD */ 489 | + 490 | #ifdef CONFIG_NO_HZ_COMMON 491 | /* 492 | * idle load balancing details 493 | @@ -7558,12 +7566,39 @@ static inline int on_null_domain(struct rq *rq) 494 | * needed, they will kick the idle load balancer, which then does idle 495 | * load balancing for all the idle CPUs. 496 | */ 497 | +#ifndef CONFIG_BLD 498 | static struct { 499 | cpumask_var_t idle_cpus_mask; 500 | atomic_t nr_cpus; 501 | unsigned long next_balance; /* in jiffy units */ 502 | } nohz ____cacheline_aligned; 503 | 504 | +static inline void nohz_balance_exit_idle(int cpu) 505 | +{ 506 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 507 | + /* 508 | + * Completely isolated CPUs don't ever set, so we must test. 509 | + */ 510 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 511 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 512 | + atomic_dec(&nohz.nr_cpus); 513 | + } 514 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 515 | + } 516 | +} 517 | + 518 | +static int sched_ilb_notifier(struct notifier_block *nfb, 519 | + unsigned long action, void *hcpu) 520 | +{ 521 | + switch (action & ~CPU_TASKS_FROZEN) { 522 | + case CPU_DYING: 523 | + nohz_balance_exit_idle(smp_processor_id()); 524 | + return NOTIFY_OK; 525 | + default: 526 | + return NOTIFY_DONE; 527 | + } 528 | +} 529 | + 530 | static inline int find_new_ilb(void) 531 | { 532 | int ilb = cpumask_first(nohz.idle_cpus_mask); 533 | @@ -7601,20 +7636,7 @@ static void nohz_balancer_kick(void) 534 | smp_send_reschedule(ilb_cpu); 535 | return; 536 | } 537 | - 538 | -static inline void nohz_balance_exit_idle(int cpu) 539 | -{ 540 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 541 | - /* 542 | - * Completely isolated CPUs don't ever set, so we must test. 
543 | - */ 544 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 545 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 546 | - atomic_dec(&nohz.nr_cpus); 547 | - } 548 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 549 | - } 550 | -} 551 | +#endif /* CONFIG_BLD */ 552 | 553 | static inline void set_cpu_sd_state_busy(void) 554 | { 555 | @@ -7656,6 +7678,7 @@ unlock: 556 | */ 557 | void nohz_balance_enter_idle(int cpu) 558 | { 559 | +#ifndef CONFIG_BLD 560 | /* 561 | * If this cpu is going down, then nothing needs to be done. 562 | */ 563 | @@ -7674,23 +7697,10 @@ void nohz_balance_enter_idle(int cpu) 564 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 565 | atomic_inc(&nohz.nr_cpus); 566 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 567 | -} 568 | - 569 | -static int sched_ilb_notifier(struct notifier_block *nfb, 570 | - unsigned long action, void *hcpu) 571 | -{ 572 | - switch (action & ~CPU_TASKS_FROZEN) { 573 | - case CPU_DYING: 574 | - nohz_balance_exit_idle(smp_processor_id()); 575 | - return NOTIFY_OK; 576 | - default: 577 | - return NOTIFY_DONE; 578 | - } 579 | +#endif 580 | } 581 | #endif 582 | 583 | -static DEFINE_SPINLOCK(balancing); 584 | - 585 | /* 586 | * Scale the max load_balance interval with the number of CPUs in the system. 587 | * This trades load-balance latency on larger machines for less cross talk. 588 | @@ -7700,6 +7710,9 @@ void update_max_interval(void) 589 | max_load_balance_interval = HZ*num_online_cpus()/10; 590 | } 591 | 592 | +#ifndef CONFIG_BLD 593 | +static DEFINE_SPINLOCK(balancing); 594 | + 595 | /* 596 | * It checks each scheduling domain to see if it is due to be balanced, 597 | * and initiates a balancing operation if so. 598 | @@ -7987,6 +8000,7 @@ void trigger_load_balance(struct rq *rq) 599 | nohz_balancer_kick(); 600 | #endif 601 | } 602 | +#endif /* CONFIG_BLD */ 603 | 604 | static void rq_online_fair(struct rq *rq) 605 | { 606 | @@ -8418,7 +8432,9 @@ const struct sched_class fair_sched_class = { 607 | .put_prev_task = put_prev_task_fair, 608 | 609 | #ifdef CONFIG_SMP 610 | +#ifndef CONFIG_BLD 611 | .select_task_rq = select_task_rq_fair, 612 | +#endif 613 | .migrate_task_rq = migrate_task_rq_fair, 614 | 615 | .rq_online = rq_online_fair, 616 | @@ -8480,6 +8496,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) 617 | 618 | __init void init_sched_fair_class(void) 619 | { 620 | +#ifndef CONFIG_BLD 621 | #ifdef CONFIG_SMP 622 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 623 | 624 | @@ -8489,5 +8506,5 @@ __init void init_sched_fair_class(void) 625 | cpu_notifier(sched_ilb_notifier, 0); 626 | #endif 627 | #endif /* SMP */ 628 | - 629 | +#endif /* BLD */ 630 | } 631 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 632 | index 8ec86ab..cada34d 100644 633 | --- a/kernel/sched/rt.c 634 | +++ b/kernel/sched/rt.c 635 | @@ -1313,6 +1313,7 @@ static void yield_task_rt(struct rq *rq) 636 | #ifdef CONFIG_SMP 637 | static int find_lowest_rq(struct task_struct *task); 638 | 639 | +#ifndef CONFIG_BLD 640 | static int 641 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 642 | { 643 | @@ -1368,6 +1369,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 644 | out: 645 | return cpu; 646 | } 647 | +#endif /* CONFIG_BLD */ 648 | 649 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 650 | { 651 | @@ -2272,7 +2274,9 @@ const struct sched_class rt_sched_class = { 652 | .put_prev_task = put_prev_task_rt, 653 | 654 | #ifdef CONFIG_SMP 655 | +#ifndef 
CONFIG_BLD 656 | .select_task_rq = select_task_rq_rt, 657 | +#endif 658 | 659 | .set_cpus_allowed = set_cpus_allowed_common, 660 | .rq_online = rq_online_rt, 661 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 662 | index 10f1637..33a2aa97 100644 663 | --- a/kernel/sched/sched.h 664 | +++ b/kernel/sched/sched.h 665 | @@ -408,9 +408,8 @@ struct cfs_rq { 666 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 667 | #endif /* CONFIG_SMP */ 668 | 669 | -#ifdef CONFIG_FAIR_GROUP_SCHED 670 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 671 | - 672 | +#ifdef CONFIG_FAIR_GROUP_SCHED 673 | /* 674 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 675 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 676 | @@ -434,6 +433,11 @@ struct cfs_rq { 677 | struct list_head throttled_list; 678 | #endif /* CONFIG_CFS_BANDWIDTH */ 679 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 680 | + 681 | +#ifdef CONFIG_BLD 682 | + struct list_head bld_cfs_list; 683 | + char pos; 684 | +#endif 685 | }; 686 | 687 | static inline int rt_bandwidth_enabled(void) 688 | @@ -478,12 +482,16 @@ struct rt_rq { 689 | /* Nests inside the rq lock: */ 690 | raw_spinlock_t rt_runtime_lock; 691 | 692 | + struct rq *rq; 693 | #ifdef CONFIG_RT_GROUP_SCHED 694 | unsigned long rt_nr_boosted; 695 | 696 | - struct rq *rq; 697 | struct task_group *tg; 698 | #endif 699 | +#ifdef CONFIG_BLD 700 | + struct list_head bld_rt_list; 701 | + int lowbit; 702 | +#endif 703 | }; 704 | 705 | /* Deadline class' related fields in a runqueue */ 706 | -------------------------------------------------------------------------------- /BLD-4.3.patch: -------------------------------------------------------------------------------- 1 | diff --git a/init/Kconfig b/init/Kconfig 2 | index c24b6f7..898db4a 100644 3 | --- a/init/Kconfig 4 | +++ b/init/Kconfig 5 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 6 | depends on BROKEN || !SMP 7 | default y 8 | 9 | +config BLD 10 | + bool "An alternate CPU load distribution technique for task scheduler" 11 | + depends on SMP 12 | + default y 13 | + help 14 | + This is an alternate CPU load distribution technique based for task 15 | + scheduler based on The Barbershop Load Distribution algorithm. Not 16 | + suitable for NUMA, should work well on SMP. 
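The two global lists that bld.h introduces are read on every wakeup and written only when a runqueue actually changes position, which is why the patches guard them with rwlocks (cfs_list_lock, rt_list_lock) rather than spinlocks: many concurrent pickers may walk a list while repositioning stays comparatively rare. A loose userspace analogue of that read-mostly discipline, using POSIX rwlocks and a counter in place of the real list surgery (build with -lpthread):

/* Userspace analogue of the read-mostly locking around the BLD lists
 * (pthreads instead of the kernel's rwlock_t; a generation counter
 * stands in for the actual list_del()/list_add_tail() surgery).         */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t cfs_list_lock = PTHREAD_RWLOCK_INITIALIZER;
static int list_generation;              /* bumped when the list moves   */

static void reorder_if_needed(int needs_move)
{
	if (!needs_move)
		return;                      /* common case: no write lock  */

	pthread_rwlock_wrlock(&cfs_list_lock);
	list_generation++;                   /* the list_del/list_add part  */
	pthread_rwlock_unlock(&cfs_list_lock);
}

static int pick_cpu(void)
{
	int snapshot;

	pthread_rwlock_rdlock(&cfs_list_lock);   /* pickers only read      */
	snapshot = list_generation;
	pthread_rwlock_unlock(&cfs_list_lock);
	return snapshot;                         /* stands in for a cpu id */
}

int main(void)
{
	reorder_if_needed(1);
	printf("picked after generation %d\n", pick_cpu());
	return 0;
}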
17 | + 18 | config INIT_ENV_ARG_LIMIT 19 | int 20 | default 32 if !UML 21 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 22 | new file mode 100644 23 | index 0000000..f1f9fba 24 | --- /dev/null 25 | +++ b/kernel/sched/bld.h 26 | @@ -0,0 +1,215 @@ 27 | +#ifdef CONFIG_BLD 28 | + 29 | +static DEFINE_RWLOCK(rt_list_lock); 30 | +static LIST_HEAD(rt_rq_head); 31 | +static LIST_HEAD(cfs_rq_head); 32 | +static DEFINE_RWLOCK(cfs_list_lock); 33 | + 34 | +#ifdef CONFIG_FAIR_GROUP_SCHED 35 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 36 | +{ 37 | + return cfs_rq->rq; 38 | +} 39 | +#else 40 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 41 | +{ 42 | + return container_of(cfs_rq, struct rq, cfs); 43 | +} 44 | +#endif 45 | + 46 | +#ifdef CONFIG_RT_GROUP_SCHED 47 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 48 | +{ 49 | + return rt_rq->rq; 50 | +} 51 | +#else 52 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 53 | +{ 54 | + return container_of(rt_rq, struct rq, rt); 55 | +} 56 | +#endif 57 | + 58 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 59 | +{ 60 | + int cpu = smp_processor_id(), i; 61 | + unsigned long load, varload; 62 | + struct rq *rq; 63 | + 64 | + if (task_type) { 65 | + varload = ULONG_MAX; 66 | + for_each_cpu(i, mask) { 67 | + rq = cpu_rq(i); 68 | + load = rq->cfs.load.weight; 69 | + if (load < varload) { 70 | + varload = load; 71 | + cpu = i; 72 | + } 73 | + } 74 | + } else { 75 | + /* Here's an attempt to get a CPU within the mask where 76 | + * we can preempt easily. To achieve this we tried to 77 | + * maintain a lowbit, which indicate the lowest bit set on 78 | + * array bitmap. Since all CPUs contains high priority 79 | + * kernel threads therefore we eliminate 0, so it might not 80 | + * be right every time, but it's just an indicator. 
81 | + */ 82 | + varload = 1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load >= varload) { 88 | + varload = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | + return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = firstbit; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index bcd214e..71e4a81 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
252 | */ 253 | 254 | #include 255 | @@ -86,6 +88,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -832,6 +835,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | update_rq_clock(rq); 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -839,6 +844,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | update_rq_clock(rq); 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1617,7 +1624,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 282 | lockdep_assert_held(&p->pi_lock); 283 | 284 | if (p->nr_cpus_allowed > 1) 285 | +#ifndef CONFIG_BLD 286 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 287 | +#else 288 | + if(dl_task(p)) 289 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 290 | + else 291 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 292 | +#endif 293 | 294 | /* 295 | * In order not to call set_task_cpu() on a blocking task we need 296 | @@ -1807,7 +1821,11 @@ void scheduler_ipi(void) 297 | */ 298 | preempt_fold_need_resched(); 299 | 300 | +#ifndef CONFIG_BLD 301 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 302 | +#else 303 | + if (llist_empty(&this_rq()->wake_list)) 304 | +#endif 305 | return; 306 | 307 | /* 308 | @@ -1829,13 +1847,16 @@ void scheduler_ipi(void) 309 | /* 310 | * Check if someone kicked us for doing the nohz idle load balance. 311 | */ 312 | +#ifndef CONFIG_BLD 313 | if (unlikely(got_nohz_idle_kick())) { 314 | this_rq()->idle_balance = 1; 315 | raise_softirq_irqoff(SCHED_SOFTIRQ); 316 | } 317 | +#endif 318 | irq_exit(); 319 | } 320 | 321 | +#ifndef CONFIG_BLD 322 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 323 | { 324 | struct rq *rq = cpu_rq(cpu); 325 | @@ -1848,6 +1869,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 326 | } 327 | } 328 | 329 | +#endif 330 | + 331 | +bool cpus_share_cache(int this_cpu, int that_cpu) 332 | +{ 333 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 334 | +} 335 | + 336 | void wake_up_if_idle(int cpu) 337 | { 338 | struct rq *rq = cpu_rq(cpu); 339 | @@ -1871,18 +1899,13 @@ void wake_up_if_idle(int cpu) 340 | out: 341 | rcu_read_unlock(); 342 | } 343 | - 344 | -bool cpus_share_cache(int this_cpu, int that_cpu) 345 | -{ 346 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 347 | -} 348 | #endif /* CONFIG_SMP */ 349 | 350 | static void ttwu_queue(struct task_struct *p, int cpu) 351 | { 352 | struct rq *rq = cpu_rq(cpu); 353 | 354 | -#if defined(CONFIG_SMP) 355 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 356 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 357 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 358 | ttwu_queue_remote(p, cpu); 359 | @@ -2217,7 +2240,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 360 | * Silence PROVE_RCU. 
361 | */ 362 | raw_spin_lock_irqsave(&p->pi_lock, flags); 363 | - set_task_cpu(p, cpu); 364 | + __set_task_cpu(p, cpu); 365 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 366 | 367 | #ifdef CONFIG_SCHED_INFO 368 | @@ -2740,7 +2763,14 @@ void sched_exec(void) 369 | int dest_cpu; 370 | 371 | raw_spin_lock_irqsave(&p->pi_lock, flags); 372 | +#ifndef CONFIG_BLD 373 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 374 | +#else 375 | + if (dl_task(p)) 376 | + dest_cpu = task_cpu(p); 377 | + else 378 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 379 | +#endif 380 | if (dest_cpu == smp_processor_id()) 381 | goto unlock; 382 | 383 | @@ -2829,8 +2859,10 @@ void scheduler_tick(void) 384 | 385 | #ifdef CONFIG_SMP 386 | rq->idle_balance = idle_cpu(cpu); 387 | +#ifndef CONFIG_BLD 388 | trigger_load_balance(rq); 389 | #endif 390 | +#endif 391 | rq_last_tick_reset(rq); 392 | } 393 | 394 | @@ -7432,6 +7464,15 @@ void __init sched_init(void) 395 | #endif 396 | init_rq_hrtick(rq); 397 | atomic_set(&rq->nr_iowait, 0); 398 | +#ifdef CONFIG_BLD 399 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 400 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 401 | + rq->cfs.pos = 0; 402 | + 403 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 404 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 405 | + rq->rt.lowbit = INT_MAX; 406 | +#endif 407 | } 408 | 409 | set_load_weight(&init_task); 410 | @@ -7472,6 +7513,9 @@ void __init sched_init(void) 411 | init_sched_fair_class(); 412 | 413 | scheduler_running = 1; 414 | +#ifdef CONFIG_BLD 415 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 416 | +#endif 417 | } 418 | 419 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 420 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 421 | index 9a5e60f..af65fb9 100644 422 | --- a/kernel/sched/fair.c 423 | +++ b/kernel/sched/fair.c 424 | @@ -4460,6 +4460,7 @@ static void task_waking_fair(struct task_struct *p) 425 | record_wakee(p); 426 | } 427 | 428 | +#ifndef CONFIG_BLD 429 | #ifdef CONFIG_FAIR_GROUP_SCHED 430 | /* 431 | * effective_load() calculates the load change as seen from the root_task_group 432 | @@ -4938,6 +4939,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f 433 | 434 | return new_cpu; 435 | } 436 | +#endif /* CONFIG_BLD */ 437 | 438 | /* 439 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 440 | @@ -5243,6 +5245,7 @@ idle: 441 | * further scheduler activity on it and we're being very careful to 442 | * re-start the picking loop. 
443 | */ 444 | +#ifndef CONFIG_BLD 445 | lockdep_unpin_lock(&rq->lock); 446 | new_tasks = idle_balance(rq); 447 | lockdep_pin_lock(&rq->lock); 448 | @@ -5256,7 +5259,7 @@ idle: 449 | 450 | if (new_tasks > 0) 451 | goto again; 452 | - 453 | +#endif 454 | return NULL; 455 | } 456 | 457 | @@ -5917,8 +5920,9 @@ static unsigned long task_h_load(struct task_struct *p) 458 | } 459 | #endif 460 | 461 | -/********** Helpers for find_busiest_group ************************/ 462 | +#ifndef CONFIG_BLD 463 | 464 | +/********** Helpers for find_busiest_group ************************/ 465 | enum group_type { 466 | group_other = 0, 467 | group_imbalanced, 468 | @@ -6010,6 +6014,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 469 | return load_idx; 470 | } 471 | 472 | +#endif /* CONFIG_BLD */ 473 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) 474 | { 475 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) 476 | @@ -6137,6 +6142,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 477 | sdg->sgc->capacity = capacity; 478 | } 479 | 480 | +#ifndef CONFIG_BLD 481 | /* 482 | * Check whether the capacity of the rq has been noticeably reduced by side 483 | * activity. The imbalance_pct is used for the threshold. 484 | @@ -7370,6 +7376,8 @@ static inline int on_null_domain(struct rq *rq) 485 | return unlikely(!rcu_dereference_sched(rq->sd)); 486 | } 487 | 488 | +#endif /* CONFIG_BLD */ 489 | + 490 | #ifdef CONFIG_NO_HZ_COMMON 491 | /* 492 | * idle load balancing details 493 | @@ -7377,12 +7385,39 @@ static inline int on_null_domain(struct rq *rq) 494 | * needed, they will kick the idle load balancer, which then does idle 495 | * load balancing for all the idle CPUs. 496 | */ 497 | +#ifndef CONFIG_BLD 498 | static struct { 499 | cpumask_var_t idle_cpus_mask; 500 | atomic_t nr_cpus; 501 | unsigned long next_balance; /* in jiffy units */ 502 | } nohz ____cacheline_aligned; 503 | 504 | +static inline void nohz_balance_exit_idle(int cpu) 505 | +{ 506 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 507 | + /* 508 | + * Completely isolated CPUs don't ever set, so we must test. 509 | + */ 510 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 511 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 512 | + atomic_dec(&nohz.nr_cpus); 513 | + } 514 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 515 | + } 516 | +} 517 | + 518 | +static int sched_ilb_notifier(struct notifier_block *nfb, 519 | + unsigned long action, void *hcpu) 520 | +{ 521 | + switch (action & ~CPU_TASKS_FROZEN) { 522 | + case CPU_DYING: 523 | + nohz_balance_exit_idle(smp_processor_id()); 524 | + return NOTIFY_OK; 525 | + default: 526 | + return NOTIFY_DONE; 527 | + } 528 | +} 529 | + 530 | static inline int find_new_ilb(void) 531 | { 532 | int ilb = cpumask_first(nohz.idle_cpus_mask); 533 | @@ -7420,20 +7455,7 @@ static void nohz_balancer_kick(void) 534 | smp_send_reschedule(ilb_cpu); 535 | return; 536 | } 537 | - 538 | -static inline void nohz_balance_exit_idle(int cpu) 539 | -{ 540 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 541 | - /* 542 | - * Completely isolated CPUs don't ever set, so we must test. 
543 | - */ 544 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 545 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 546 | - atomic_dec(&nohz.nr_cpus); 547 | - } 548 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 549 | - } 550 | -} 551 | +#endif /* CONFIG_BLD */ 552 | 553 | static inline void set_cpu_sd_state_busy(void) 554 | { 555 | @@ -7475,6 +7497,7 @@ unlock: 556 | */ 557 | void nohz_balance_enter_idle(int cpu) 558 | { 559 | +#ifndef CONFIG_BLD 560 | /* 561 | * If this cpu is going down, then nothing needs to be done. 562 | */ 563 | @@ -7493,23 +7516,10 @@ void nohz_balance_enter_idle(int cpu) 564 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 565 | atomic_inc(&nohz.nr_cpus); 566 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 567 | -} 568 | - 569 | -static int sched_ilb_notifier(struct notifier_block *nfb, 570 | - unsigned long action, void *hcpu) 571 | -{ 572 | - switch (action & ~CPU_TASKS_FROZEN) { 573 | - case CPU_DYING: 574 | - nohz_balance_exit_idle(smp_processor_id()); 575 | - return NOTIFY_OK; 576 | - default: 577 | - return NOTIFY_DONE; 578 | - } 579 | +#endif 580 | } 581 | #endif 582 | 583 | -static DEFINE_SPINLOCK(balancing); 584 | - 585 | /* 586 | * Scale the max load_balance interval with the number of CPUs in the system. 587 | * This trades load-balance latency on larger machines for less cross talk. 588 | @@ -7519,6 +7529,9 @@ void update_max_interval(void) 589 | max_load_balance_interval = HZ*num_online_cpus()/10; 590 | } 591 | 592 | +#ifndef CONFIG_BLD 593 | +static DEFINE_SPINLOCK(balancing); 594 | + 595 | /* 596 | * It checks each scheduling domain to see if it is due to be balanced, 597 | * and initiates a balancing operation if so. 598 | @@ -7780,6 +7793,7 @@ void trigger_load_balance(struct rq *rq) 599 | nohz_balancer_kick(); 600 | #endif 601 | } 602 | +#endif /* CONFIG_BLD */ 603 | 604 | static void rq_online_fair(struct rq *rq) 605 | { 606 | @@ -8245,7 +8259,9 @@ const struct sched_class fair_sched_class = { 607 | .put_prev_task = put_prev_task_fair, 608 | 609 | #ifdef CONFIG_SMP 610 | +#ifndef CONFIG_BLD 611 | .select_task_rq = select_task_rq_fair, 612 | +#endif 613 | .migrate_task_rq = migrate_task_rq_fair, 614 | 615 | .rq_online = rq_online_fair, 616 | @@ -8307,6 +8323,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) 617 | 618 | __init void init_sched_fair_class(void) 619 | { 620 | +#ifndef CONFIG_BLD 621 | #ifdef CONFIG_SMP 622 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 623 | 624 | @@ -8316,5 +8333,5 @@ __init void init_sched_fair_class(void) 625 | cpu_notifier(sched_ilb_notifier, 0); 626 | #endif 627 | #endif /* SMP */ 628 | - 629 | +#endif /* BLD */ 630 | } 631 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 632 | index d2ea593..d4d3159 100644 633 | --- a/kernel/sched/rt.c 634 | +++ b/kernel/sched/rt.c 635 | @@ -1323,6 +1323,7 @@ static void yield_task_rt(struct rq *rq) 636 | #ifdef CONFIG_SMP 637 | static int find_lowest_rq(struct task_struct *task); 638 | 639 | +#ifndef CONFIG_BLD 640 | static int 641 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 642 | { 643 | @@ -1378,6 +1379,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 644 | out: 645 | return cpu; 646 | } 647 | +#endif /* CONFIG_BLD */ 648 | 649 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 650 | { 651 | @@ -2282,7 +2284,9 @@ const struct sched_class rt_sched_class = { 652 | .put_prev_task = put_prev_task_rt, 653 | 654 | #ifdef CONFIG_SMP 655 | +#ifndef 
CONFIG_BLD 656 | .select_task_rq = select_task_rq_rt, 657 | +#endif 658 | 659 | .set_cpus_allowed = set_cpus_allowed_common, 660 | .rq_online = rq_online_rt, 661 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 662 | index 6d2a119..cce9116 100644 663 | --- a/kernel/sched/sched.h 664 | +++ b/kernel/sched/sched.h 665 | @@ -391,9 +391,8 @@ struct cfs_rq { 666 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 667 | #endif /* CONFIG_SMP */ 668 | 669 | -#ifdef CONFIG_FAIR_GROUP_SCHED 670 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 671 | - 672 | +#ifdef CONFIG_FAIR_GROUP_SCHED 673 | /* 674 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 675 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 676 | @@ -417,6 +416,11 @@ struct cfs_rq { 677 | struct list_head throttled_list; 678 | #endif /* CONFIG_CFS_BANDWIDTH */ 679 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 680 | + 681 | +#ifdef CONFIG_BLD 682 | + struct list_head bld_cfs_list; 683 | + char pos; 684 | +#endif 685 | }; 686 | 687 | static inline int rt_bandwidth_enabled(void) 688 | @@ -461,12 +465,16 @@ struct rt_rq { 689 | /* Nests inside the rq lock: */ 690 | raw_spinlock_t rt_runtime_lock; 691 | 692 | + struct rq *rq; 693 | #ifdef CONFIG_RT_GROUP_SCHED 694 | unsigned long rt_nr_boosted; 695 | 696 | - struct rq *rq; 697 | struct task_group *tg; 698 | #endif 699 | +#ifdef CONFIG_BLD 700 | + struct list_head bld_rt_list; 701 | + int lowbit; 702 | +#endif 703 | }; 704 | 705 | /* Deadline class' related fields in a runqueue */ 706 | -------------------------------------------------------------------------------- /BLD-4.6.patch: -------------------------------------------------------------------------------- 1 | diff --git a/init/Kconfig b/init/Kconfig 2 | index 0dfd09d..8d704e5 100644 3 | --- a/init/Kconfig 4 | +++ b/init/Kconfig 5 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 6 | depends on BROKEN || !SMP 7 | default y 8 | 9 | +config BLD 10 | + bool "An alternate CPU load distribution technique for task scheduler" 11 | + depends on SMP 12 | + default y 13 | + help 14 | + This is an alternate CPU load distribution technique based for task 15 | + scheduler based on The Barbershop Load Distribution algorithm. Not 16 | + suitable for NUMA, should work well on SMP. 
17 | + 18 | config INIT_ENV_ARG_LIMIT 19 | int 20 | default 32 if !UML 21 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 22 | new file mode 100644 23 | index 0000000..f1f9fba 24 | --- /dev/null 25 | +++ b/kernel/sched/bld.h 26 | @@ -0,0 +1,215 @@ 27 | +#ifdef CONFIG_BLD 28 | + 29 | +static DEFINE_RWLOCK(rt_list_lock); 30 | +static LIST_HEAD(rt_rq_head); 31 | +static LIST_HEAD(cfs_rq_head); 32 | +static DEFINE_RWLOCK(cfs_list_lock); 33 | + 34 | +#ifdef CONFIG_FAIR_GROUP_SCHED 35 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 36 | +{ 37 | + return cfs_rq->rq; 38 | +} 39 | +#else 40 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 41 | +{ 42 | + return container_of(cfs_rq, struct rq, cfs); 43 | +} 44 | +#endif 45 | + 46 | +#ifdef CONFIG_RT_GROUP_SCHED 47 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 48 | +{ 49 | + return rt_rq->rq; 50 | +} 51 | +#else 52 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 53 | +{ 54 | + return container_of(rt_rq, struct rq, rt); 55 | +} 56 | +#endif 57 | + 58 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 59 | +{ 60 | + int cpu = smp_processor_id(), i; 61 | + unsigned long load, varload; 62 | + struct rq *rq; 63 | + 64 | + if (task_type) { 65 | + varload = ULONG_MAX; 66 | + for_each_cpu(i, mask) { 67 | + rq = cpu_rq(i); 68 | + load = rq->cfs.load.weight; 69 | + if (load < varload) { 70 | + varload = load; 71 | + cpu = i; 72 | + } 73 | + } 74 | + } else { 75 | + /* Here's an attempt to get a CPU within the mask where 76 | + * we can preempt easily. To achieve this we tried to 77 | + * maintain a lowbit, which indicate the lowest bit set on 78 | + * array bitmap. Since all CPUs contains high priority 79 | + * kernel threads therefore we eliminate 0, so it might not 80 | + * be right every time, but it's just an indicator. 
81 | + */ 82 | + varload = 1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load >= varload) { 88 | + varload = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | + return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = firstbit; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index d1f7149..c3236de 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
252 | */ 253 | 254 | #include 255 | @@ -86,6 +88,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -713,6 +716,8 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | if (!(flags & ENQUEUE_RESTORE)) 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -721,6 +726,8 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | if (!(flags & DEQUEUE_SAVE)) 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1515,8 +1522,16 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 282 | { 283 | lockdep_assert_held(&p->pi_lock); 284 | 285 | - if (p->nr_cpus_allowed > 1) 286 | + if (p->nr_cpus_allowed > 1) { 287 | +#ifndef CONFIG_BLD 288 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 289 | +#else 290 | + if(dl_task(p)) 291 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 292 | + else 293 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 294 | +#endif 295 | + } 296 | 297 | /* 298 | * In order not to call set_task_cpu() on a blocking task we need 299 | @@ -1706,7 +1721,11 @@ void scheduler_ipi(void) 300 | */ 301 | preempt_fold_need_resched(); 302 | 303 | +#ifndef CONFIG_BLD 304 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 305 | +#else 306 | + if (llist_empty(&this_rq()->wake_list)) 307 | +#endif 308 | return; 309 | 310 | /* 311 | @@ -1728,13 +1747,16 @@ void scheduler_ipi(void) 312 | /* 313 | * Check if someone kicked us for doing the nohz idle load balance. 
314 | */ 315 | +#ifndef CONFIG_BLD 316 | if (unlikely(got_nohz_idle_kick())) { 317 | this_rq()->idle_balance = 1; 318 | raise_softirq_irqoff(SCHED_SOFTIRQ); 319 | } 320 | +#endif 321 | irq_exit(); 322 | } 323 | 324 | +#ifndef CONFIG_BLD 325 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 326 | { 327 | struct rq *rq = cpu_rq(cpu); 328 | @@ -1747,6 +1769,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 329 | } 330 | } 331 | 332 | +#endif 333 | + 334 | +bool cpus_share_cache(int this_cpu, int that_cpu) 335 | +{ 336 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 337 | +} 338 | + 339 | void wake_up_if_idle(int cpu) 340 | { 341 | struct rq *rq = cpu_rq(cpu); 342 | @@ -1770,18 +1799,13 @@ void wake_up_if_idle(int cpu) 343 | out: 344 | rcu_read_unlock(); 345 | } 346 | - 347 | -bool cpus_share_cache(int this_cpu, int that_cpu) 348 | -{ 349 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 350 | -} 351 | #endif /* CONFIG_SMP */ 352 | 353 | static void ttwu_queue(struct task_struct *p, int cpu) 354 | { 355 | struct rq *rq = cpu_rq(cpu); 356 | 357 | -#if defined(CONFIG_SMP) 358 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 359 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 360 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 361 | ttwu_queue_remote(p, cpu); 362 | @@ -2292,7 +2316,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 363 | * Silence PROVE_RCU. 364 | */ 365 | raw_spin_lock_irqsave(&p->pi_lock, flags); 366 | - set_task_cpu(p, cpu); 367 | + __set_task_cpu(p, cpu); 368 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 369 | 370 | #ifdef CONFIG_SCHED_INFO 371 | @@ -2837,7 +2861,14 @@ void sched_exec(void) 372 | int dest_cpu; 373 | 374 | raw_spin_lock_irqsave(&p->pi_lock, flags); 375 | +#ifndef CONFIG_BLD 376 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 377 | +#else 378 | + if (dl_task(p)) 379 | + dest_cpu = task_cpu(p); 380 | + else 381 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 382 | +#endif 383 | if (dest_cpu == smp_processor_id()) 384 | goto unlock; 385 | 386 | @@ -2926,8 +2957,10 @@ void scheduler_tick(void) 387 | 388 | #ifdef CONFIG_SMP 389 | rq->idle_balance = idle_cpu(cpu); 390 | +#ifndef CONFIG_BLD 391 | trigger_load_balance(rq); 392 | #endif 393 | +#endif 394 | rq_last_tick_reset(rq); 395 | } 396 | 397 | @@ -7359,6 +7392,15 @@ void __init sched_init(void) 398 | #endif 399 | init_rq_hrtick(rq); 400 | atomic_set(&rq->nr_iowait, 0); 401 | +#ifdef CONFIG_BLD 402 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 403 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 404 | + rq->cfs.pos = 0; 405 | + 406 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 407 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 408 | + rq->rt.lowbit = INT_MAX; 409 | +#endif 410 | } 411 | 412 | set_load_weight(&init_task); 413 | @@ -7399,6 +7441,9 @@ void __init sched_init(void) 414 | init_sched_fair_class(); 415 | 416 | scheduler_running = 1; 417 | +#ifdef CONFIG_BLD 418 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 419 | +#endif 420 | } 421 | 422 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 423 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 424 | index e7dd0ec..555572f 100644 425 | --- a/kernel/sched/fair.c 426 | +++ b/kernel/sched/fair.c 427 | @@ -4746,6 +4746,7 @@ static void task_waking_fair(struct task_struct *p) 428 | record_wakee(p); 429 | } 430 | 431 | +#ifndef CONFIG_BLD 432 | #ifdef 
CONFIG_FAIR_GROUP_SCHED 433 | /* 434 | * effective_load() calculates the load change as seen from the root_task_group 435 | @@ -5248,6 +5249,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f 436 | 437 | return new_cpu; 438 | } 439 | +#endif /* CONFIG_BLD */ 440 | 441 | /* 442 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 443 | @@ -5552,6 +5554,7 @@ idle: 444 | * further scheduler activity on it and we're being very careful to 445 | * re-start the picking loop. 446 | */ 447 | +#ifndef CONFIG_BLD 448 | lockdep_unpin_lock(&rq->lock); 449 | new_tasks = idle_balance(rq); 450 | lockdep_pin_lock(&rq->lock); 451 | @@ -5565,7 +5568,7 @@ idle: 452 | 453 | if (new_tasks > 0) 454 | goto again; 455 | - 456 | +#endif 457 | return NULL; 458 | } 459 | 460 | @@ -6226,8 +6229,9 @@ static unsigned long task_h_load(struct task_struct *p) 461 | } 462 | #endif 463 | 464 | -/********** Helpers for find_busiest_group ************************/ 465 | +#ifndef CONFIG_BLD 466 | 467 | +/********** Helpers for find_busiest_group ************************/ 468 | enum group_type { 469 | group_other = 0, 470 | group_imbalanced, 471 | @@ -6318,6 +6322,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 472 | 473 | return load_idx; 474 | } 475 | +#endif /* CONFIG_BLD */ 476 | 477 | static unsigned long scale_rt_capacity(int cpu) 478 | { 479 | @@ -6426,6 +6431,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 480 | sdg->sgc->capacity = capacity; 481 | } 482 | 483 | +#ifndef CONFIG_BLD 484 | /* 485 | * Check whether the capacity of the rq has been noticeably reduced by side 486 | * activity. The imbalance_pct is used for the threshold. 487 | @@ -7659,6 +7665,8 @@ static inline int on_null_domain(struct rq *rq) 488 | return unlikely(!rcu_dereference_sched(rq->sd)); 489 | } 490 | 491 | +#endif /* CONFIG_BLD */ 492 | + 493 | #ifdef CONFIG_NO_HZ_COMMON 494 | /* 495 | * idle load balancing details 496 | @@ -7666,12 +7674,39 @@ static inline int on_null_domain(struct rq *rq) 497 | * needed, they will kick the idle load balancer, which then does idle 498 | * load balancing for all the idle CPUs. 499 | */ 500 | +#ifndef CONFIG_BLD 501 | static struct { 502 | cpumask_var_t idle_cpus_mask; 503 | atomic_t nr_cpus; 504 | unsigned long next_balance; /* in jiffy units */ 505 | } nohz ____cacheline_aligned; 506 | 507 | +static inline void nohz_balance_exit_idle(int cpu) 508 | +{ 509 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 510 | + /* 511 | + * Completely isolated CPUs don't ever set, so we must test. 
512 | + */ 513 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 514 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 515 | + atomic_dec(&nohz.nr_cpus); 516 | + } 517 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 518 | + } 519 | +} 520 | + 521 | +static int sched_ilb_notifier(struct notifier_block *nfb, 522 | + unsigned long action, void *hcpu) 523 | +{ 524 | + switch (action & ~CPU_TASKS_FROZEN) { 525 | + case CPU_DYING: 526 | + nohz_balance_exit_idle(smp_processor_id()); 527 | + return NOTIFY_OK; 528 | + default: 529 | + return NOTIFY_DONE; 530 | + } 531 | +} 532 | + 533 | static inline int find_new_ilb(void) 534 | { 535 | int ilb = cpumask_first(nohz.idle_cpus_mask); 536 | @@ -7709,20 +7744,7 @@ static void nohz_balancer_kick(void) 537 | smp_send_reschedule(ilb_cpu); 538 | return; 539 | } 540 | - 541 | -static inline void nohz_balance_exit_idle(int cpu) 542 | -{ 543 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 544 | - /* 545 | - * Completely isolated CPUs don't ever set, so we must test. 546 | - */ 547 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 548 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 549 | - atomic_dec(&nohz.nr_cpus); 550 | - } 551 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 552 | - } 553 | -} 554 | +#endif /* CONFIG_BLD */ 555 | 556 | static inline void set_cpu_sd_state_busy(void) 557 | { 558 | @@ -7764,6 +7786,7 @@ unlock: 559 | */ 560 | void nohz_balance_enter_idle(int cpu) 561 | { 562 | +#ifndef CONFIG_BLD 563 | /* 564 | * If this cpu is going down, then nothing needs to be done. 565 | */ 566 | @@ -7782,23 +7805,10 @@ void nohz_balance_enter_idle(int cpu) 567 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 568 | atomic_inc(&nohz.nr_cpus); 569 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 570 | -} 571 | - 572 | -static int sched_ilb_notifier(struct notifier_block *nfb, 573 | - unsigned long action, void *hcpu) 574 | -{ 575 | - switch (action & ~CPU_TASKS_FROZEN) { 576 | - case CPU_DYING: 577 | - nohz_balance_exit_idle(smp_processor_id()); 578 | - return NOTIFY_OK; 579 | - default: 580 | - return NOTIFY_DONE; 581 | - } 582 | +#endif 583 | } 584 | #endif 585 | 586 | -static DEFINE_SPINLOCK(balancing); 587 | - 588 | /* 589 | * Scale the max load_balance interval with the number of CPUs in the system. 590 | * This trades load-balance latency on larger machines for less cross talk. 591 | @@ -7808,6 +7818,9 @@ void update_max_interval(void) 592 | max_load_balance_interval = HZ*num_online_cpus()/10; 593 | } 594 | 595 | +#ifndef CONFIG_BLD 596 | +static DEFINE_SPINLOCK(balancing); 597 | + 598 | /* 599 | * It checks each scheduling domain to see if it is due to be balanced, 600 | * and initiates a balancing operation if so. 
601 | @@ -8095,6 +8108,7 @@ void trigger_load_balance(struct rq *rq) 602 | nohz_balancer_kick(); 603 | #endif 604 | } 605 | +#endif /* CONFIG_BLD */ 606 | 607 | static void rq_online_fair(struct rq *rq) 608 | { 609 | @@ -8531,7 +8545,9 @@ const struct sched_class fair_sched_class = { 610 | .put_prev_task = put_prev_task_fair, 611 | 612 | #ifdef CONFIG_SMP 613 | +#ifndef CONFIG_BLD 614 | .select_task_rq = select_task_rq_fair, 615 | +#endif 616 | .migrate_task_rq = migrate_task_rq_fair, 617 | 618 | .rq_online = rq_online_fair, 619 | @@ -8593,6 +8609,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) 620 | 621 | __init void init_sched_fair_class(void) 622 | { 623 | +#ifndef CONFIG_BLD 624 | #ifdef CONFIG_SMP 625 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 626 | 627 | @@ -8602,5 +8619,5 @@ __init void init_sched_fair_class(void) 628 | cpu_notifier(sched_ilb_notifier, 0); 629 | #endif 630 | #endif /* SMP */ 631 | - 632 | +#endif /* BLD */ 633 | } 634 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 635 | index ec4f538d..4462bed 100644 636 | --- a/kernel/sched/rt.c 637 | +++ b/kernel/sched/rt.c 638 | @@ -1375,6 +1375,7 @@ static void yield_task_rt(struct rq *rq) 639 | #ifdef CONFIG_SMP 640 | static int find_lowest_rq(struct task_struct *task); 641 | 642 | +#ifndef CONFIG_BLD 643 | static int 644 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 645 | { 646 | @@ -1430,6 +1431,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 647 | out: 648 | return cpu; 649 | } 650 | +#endif /* CONFIG_BLD */ 651 | 652 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 653 | { 654 | @@ -2335,7 +2337,9 @@ const struct sched_class rt_sched_class = { 655 | .put_prev_task = put_prev_task_rt, 656 | 657 | #ifdef CONFIG_SMP 658 | +#ifndef CONFIG_BLD 659 | .select_task_rq = select_task_rq_rt, 660 | +#endif 661 | 662 | .set_cpus_allowed = set_cpus_allowed_common, 663 | .rq_online = rq_online_rt, 664 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 665 | index ec2e8d2..aaab735 100644 666 | --- a/kernel/sched/sched.h 667 | +++ b/kernel/sched/sched.h 668 | @@ -408,9 +408,8 @@ struct cfs_rq { 669 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 670 | #endif /* CONFIG_SMP */ 671 | 672 | -#ifdef CONFIG_FAIR_GROUP_SCHED 673 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 674 | - 675 | +#ifdef CONFIG_FAIR_GROUP_SCHED 676 | /* 677 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 678 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 679 | @@ -434,6 +433,11 @@ struct cfs_rq { 680 | struct list_head throttled_list; 681 | #endif /* CONFIG_CFS_BANDWIDTH */ 682 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 683 | + 684 | +#ifdef CONFIG_BLD 685 | + struct list_head bld_cfs_list; 686 | + char pos; 687 | +#endif 688 | }; 689 | 690 | static inline int rt_bandwidth_enabled(void) 691 | @@ -479,12 +483,16 @@ struct rt_rq { 692 | /* Nests inside the rq lock: */ 693 | raw_spinlock_t rt_runtime_lock; 694 | 695 | + struct rq *rq; 696 | #ifdef CONFIG_RT_GROUP_SCHED 697 | unsigned long rt_nr_boosted; 698 | 699 | - struct rq *rq; 700 | struct task_group *tg; 701 | #endif 702 | +#ifdef CONFIG_BLD 703 | + struct list_head bld_rt_list; 704 | + int lowbit; 705 | +#endif 706 | }; 707 | 708 | /* Deadline class' related fields in a runqueue */ 709 | --------------------------------------------------------------------------------
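
A note on the algorithm, for readers skimming the patches above: every version of bld.h implements the same core idea. Each CPU's runqueue sits on a global list that is kept roughly ordered by load; the enqueue/dequeue hooks (bld_track_load_activate()/bld_track_load_deactivate()) move a runqueue toward the head of the list when it becomes the lightest and toward the tail when it becomes the heaviest, and a wakeup simply takes the first online CPU from the head of that list (bld_pick_cpu_cfs()), which is why the patches compile out trigger_load_balance() and the nohz idle-balance machinery. The stand-alone C sketch below is only an illustration of that ordering trick, not code from the patches: the toy_* names are invented, it runs in user space, and it keeps the list fully sorted, whereas the real code keeps it only approximately ordered via the pos and lowbit hints and takes cfs_list_lock/rt_list_lock around list updates.

/*
 * Toy user-space model of the Barbershop ordering used by these patches.
 * All toy_* names are invented for illustration; the real code lives in
 * kernel/sched/bld.h and operates on struct rq / struct cfs_rq.
 */
#include <stdio.h>

#define NCPU 4

struct toy_rq {
	int cpu;
	unsigned long load;
	struct toy_rq *prev, *next;	/* doubly linked; head = least loaded */
};

static struct toy_rq rqs[NCPU];
static struct toy_rq *head;

/* Unlink rq and re-insert it so the list stays ordered by load. */
static void reposition(struct toy_rq *rq)
{
	struct toy_rq *it;

	if (rq->prev)
		rq->prev->next = rq->next;
	if (rq->next)
		rq->next->prev = rq->prev;
	if (head == rq)
		head = rq->next;
	rq->prev = rq->next = NULL;

	if (!head || rq->load <= head->load) {	/* new lightest runqueue */
		rq->next = head;
		if (head)
			head->prev = rq;
		head = rq;
		return;
	}
	for (it = head; it->next && it->next->load < rq->load; it = it->next)
		;
	rq->next = it->next;
	if (it->next)
		it->next->prev = rq;
	it->next = rq;
	rq->prev = it;
}

/* Model of the bld_track_load_activate()/_deactivate() idea: load changed, re-order. */
static void toy_set_load(int cpu, unsigned long load)
{
	rqs[cpu].load = load;
	reposition(&rqs[cpu]);
}

/* Model of the bld_pick_cpu_cfs() idea: the least loaded runqueue sits at the head. */
static int toy_pick_cpu(void)
{
	return head->cpu;
}

int main(void)
{
	int i;

	for (i = 0; i < NCPU; i++) {
		rqs[i].cpu = i;
		toy_set_load(i, 0);
	}

	toy_set_load(0, 3072);	/* CPU 0 gets busy   */
	toy_set_load(1, 1024);	/* CPU 1 mildly busy */
	toy_set_load(2, 2048);

	printf("wake next task on CPU %d\n", toy_pick_cpu());	/* CPU 3, still idle */
	return 0;
}

Built with any C compiler, the sketch prints "wake next task on CPU 3", the idle runqueue at the head of the list. In the patches the equivalent pick additionally checks cpu_online() while walking the list, and RT tasks use the separate rt_rq list keyed on lowbit rather than cfs load.weight.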