├── README.md ├── BLD-3.16.patch ├── BLD-3.19.patch ├── BLD-3.17.patch ├── BLD-3.18.patch ├── BLD-4.8.patch ├── BLD-4.1.patch ├── BLD-4.4.patch ├── BLD-4.5.patch ├── BLD-4.3.patch └── BLD-4.6.patch /README.md: -------------------------------------------------------------------------------- 1 | bld-patches 2 | =========== 3 | 4 | Directory of BLD patches, where all the patches will be kept as single patch for stable Linux releases. See Wiki for some idea. 5 | -------------------------------------------------------------------------------- /BLD-3.16.patch: -------------------------------------------------------------------------------- 1 | BLD-3.16 for Linux kernel 3.16. Nothing special, just rebased 2 | for 3.16. 3 | 4 | Thanks, 5 | Rakib 6 | 7 | Signed-off-by: Rakib Mullick 8 | 9 | diff --git a/init/Kconfig b/init/Kconfig 10 | index 9d76b99..847f34d 100644 11 | --- a/init/Kconfig 12 | +++ b/init/Kconfig 13 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 14 | depends on BROKEN || !SMP 15 | default y 16 | 17 | +config BLD 18 | + bool "An alternate CPU load distribution technique for task scheduler" 19 | + depends on SMP 20 | + default y 21 | + help 22 | + This is an alternate CPU load distribution technique based for task 23 | + scheduler based on The Barbershop Load Distribution algorithm. Not 24 | + suitable for NUMA, should work well on SMP. 25 | + 26 | config INIT_ENV_ARG_LIMIT 27 | int 28 | default 32 if !UML 29 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 30 | new file mode 100644 31 | index 0000000..5a067c1 32 | --- /dev/null 33 | +++ b/kernel/sched/bld.h 34 | @@ -0,0 +1,207 @@ 35 | +#ifdef CONFIG_BLD 36 | + 37 | +static DEFINE_RWLOCK(rt_list_lock); 38 | +static LIST_HEAD(rt_rq_head); 39 | +static LIST_HEAD(cfs_rq_head); 40 | +static DEFINE_RWLOCK(cfs_list_lock); 41 | + 42 | +#ifdef CONFIG_FAIR_GROUP_SCHED 43 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 44 | +{ 45 | + return cfs_rq->rq; 46 | +} 47 | +#else 48 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 49 | +{ 50 | + return container_of(cfs_rq, struct rq, cfs); 51 | +} 52 | +#endif 53 | + 54 | +#ifdef CONFIG_RT_GROUP_SCHED 55 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 56 | +{ 57 | + return rt_rq->rq; 58 | +} 59 | +#else 60 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 61 | +{ 62 | + return container_of(rt_rq, struct rq, rt); 63 | +} 64 | +#endif 65 | + 66 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 67 | +{ 68 | + int cpu = smp_processor_id(), i; 69 | + unsigned long load, min_load = ULONG_MAX; 70 | + struct rq *rq; 71 | + 72 | + if (task_type) { 73 | + for_each_cpu(i, mask) { 74 | + rq = cpu_rq(i); 75 | + load = rq->cfs.load.weight; 76 | + if (load < min_load) { 77 | + min_load = load; 78 | + cpu = i; 79 | + } 80 | + } 81 | + } else { 82 | + min_load = -1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load > min_load) { 88 | + min_load = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | 
+ return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = p->prio; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if 
(rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index bc1638b..b429ce5 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 252 | */ 253 | 254 | #include 255 | @@ -86,6 +88,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -831,6 +834,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | update_rq_clock(rq); 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -838,6 +843,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | update_rq_clock(rq); 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1398,7 +1405,14 @@ out: 282 | static inline 283 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 284 | { 285 | +#ifndef CONFIG_BLD 286 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 287 | +#else 288 | + if (dl_task(p)) 289 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 290 | + else 291 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 292 | +#endif 293 | 294 | /* 295 | * In order not to call set_task_cpu() on a blocking task we need 296 | @@ -1568,9 +1582,13 @@ void scheduler_ipi(void) 297 | */ 298 | preempt_fold_need_resched(); 299 | 300 | +#ifndef CONFIG_BLD 301 | if (llist_empty(&this_rq()->wake_list) 302 | && !tick_nohz_full_cpu(smp_processor_id()) 303 | && !got_nohz_idle_kick()) 304 | +#else 305 | + if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id())) 306 | +#endif 307 | return; 308 | 309 | /* 310 | @@ -1593,13 +1611,16 @@ void scheduler_ipi(void) 311 | /* 312 | * Check if someone kicked us for doing the nohz idle load balance. 
313 | */ 314 | +#ifndef CONFIG_BLD 315 | if (unlikely(got_nohz_idle_kick())) { 316 | this_rq()->idle_balance = 1; 317 | raise_softirq_irqoff(SCHED_SOFTIRQ); 318 | } 319 | +#endif 320 | irq_exit(); 321 | } 322 | 323 | +#ifndef CONFIG_BLD 324 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 325 | { 326 | struct rq *rq = cpu_rq(cpu); 327 | @@ -1611,6 +1632,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 328 | trace_sched_wake_idle_without_ipi(cpu); 329 | } 330 | } 331 | +#endif 332 | 333 | bool cpus_share_cache(int this_cpu, int that_cpu) 334 | { 335 | @@ -1622,7 +1644,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) 336 | { 337 | struct rq *rq = cpu_rq(cpu); 338 | 339 | -#if defined(CONFIG_SMP) 340 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 341 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 342 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 343 | ttwu_queue_remote(p, cpu); 344 | @@ -1930,7 +1952,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 345 | * Silence PROVE_RCU. 346 | */ 347 | raw_spin_lock_irqsave(&p->pi_lock, flags); 348 | - set_task_cpu(p, cpu); 349 | + __set_task_cpu(p, cpu); 350 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 351 | 352 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 353 | @@ -2398,7 +2420,14 @@ void sched_exec(void) 354 | int dest_cpu; 355 | 356 | raw_spin_lock_irqsave(&p->pi_lock, flags); 357 | +#ifndef CONFIG_BLD 358 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 359 | +#else 360 | + if (dl_task(p)) 361 | + dest_cpu = task_cpu(p); 362 | + else 363 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 364 | +#endif 365 | if (dest_cpu == smp_processor_id()) 366 | goto unlock; 367 | 368 | @@ -2508,8 +2537,10 @@ void scheduler_tick(void) 369 | 370 | #ifdef CONFIG_SMP 371 | rq->idle_balance = idle_cpu(cpu); 372 | +#ifndef CONFIG_BLD 373 | trigger_load_balance(rq); 374 | #endif 375 | +#endif 376 | rq_last_tick_reset(rq); 377 | } 378 | 379 | @@ -6990,6 +7021,15 @@ void __init sched_init(void) 380 | #endif 381 | init_rq_hrtick(rq); 382 | atomic_set(&rq->nr_iowait, 0); 383 | +#ifdef CONFIG_BLD 384 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 385 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 386 | + rq->cfs.pos = 0; 387 | + 388 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 389 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 390 | + rq->rt.lowbit = INT_MAX; 391 | +#endif 392 | } 393 | 394 | set_load_weight(&init_task); 395 | @@ -7030,6 +7070,9 @@ void __init sched_init(void) 396 | init_sched_fair_class(); 397 | 398 | scheduler_running = 1; 399 | +#ifdef CONFIG_BLD 400 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 401 | +#endif 402 | } 403 | 404 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 405 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 406 | index fea7d33..651aa1d 100644 407 | --- a/kernel/sched/fair.c 408 | +++ b/kernel/sched/fair.c 409 | @@ -4101,6 +4101,7 @@ static void task_waking_fair(struct task_struct *p) 410 | record_wakee(p); 411 | } 412 | 413 | +#ifndef CONFIG_BLD 414 | #ifdef CONFIG_FAIR_GROUP_SCHED 415 | /* 416 | * effective_load() calculates the load change as seen from the root_task_group 417 | @@ -4550,6 +4551,7 @@ unlock: 418 | 419 | return new_cpu; 420 | } 421 | +#endif /* CONFIG_BLD */ 422 | 423 | /* 424 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 425 | @@ -4845,6 +4847,7 @@ simple: 426 | return p; 427 | 428 | idle: 429 | 
+#ifndef CONFIG_BLD 430 | new_tasks = idle_balance(rq); 431 | /* 432 | * Because idle_balance() releases (and re-acquires) rq->lock, it is 433 | @@ -4856,7 +4859,7 @@ idle: 434 | 435 | if (new_tasks > 0) 436 | goto again; 437 | - 438 | +#endif 439 | return NULL; 440 | } 441 | 442 | @@ -6931,12 +6934,40 @@ static inline int on_null_domain(struct rq *rq) 443 | * needed, they will kick the idle load balancer, which then does idle 444 | * load balancing for all the idle CPUs. 445 | */ 446 | +#ifndef CONFIG_BLD 447 | static struct { 448 | cpumask_var_t idle_cpus_mask; 449 | atomic_t nr_cpus; 450 | unsigned long next_balance; /* in jiffy units */ 451 | } nohz ____cacheline_aligned; 452 | 453 | +static inline void nohz_balance_exit_idle(int cpu) 454 | +{ 455 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 456 | + /* 457 | + * Completely isolated CPUs don't ever set, so we must test. 458 | + */ 459 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 460 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 461 | + atomic_dec(&nohz.nr_cpus); 462 | + } 463 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 464 | + } 465 | +} 466 | + 467 | +static int sched_ilb_notifier(struct notifier_block *nfb, 468 | + unsigned long action, void *hcpu) 469 | +{ 470 | + switch (action & ~CPU_TASKS_FROZEN) { 471 | + case CPU_DYING: 472 | + nohz_balance_exit_idle(smp_processor_id()); 473 | + return NOTIFY_OK; 474 | + default: 475 | + return NOTIFY_DONE; 476 | + } 477 | +} 478 | +#endif /* CONFIG_BLD */ 479 | + 480 | static inline int find_new_ilb(void) 481 | { 482 | int ilb = cpumask_first(nohz.idle_cpus_mask); 483 | @@ -6975,20 +7006,6 @@ static void nohz_balancer_kick(void) 484 | return; 485 | } 486 | 487 | -static inline void nohz_balance_exit_idle(int cpu) 488 | -{ 489 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 490 | - /* 491 | - * Completely isolated CPUs don't ever set, so we must test. 492 | - */ 493 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 494 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 495 | - atomic_dec(&nohz.nr_cpus); 496 | - } 497 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 498 | - } 499 | -} 500 | - 501 | static inline void set_cpu_sd_state_busy(void) 502 | { 503 | struct sched_domain *sd; 504 | @@ -7029,6 +7046,7 @@ unlock: 505 | */ 506 | void nohz_balance_enter_idle(int cpu) 507 | { 508 | +#ifndef CONFIG_BLD 509 | /* 510 | * If this cpu is going down, then nothing needs to be done. 511 | */ 512 | @@ -7047,23 +7065,10 @@ void nohz_balance_enter_idle(int cpu) 513 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 514 | atomic_inc(&nohz.nr_cpus); 515 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 516 | -} 517 | - 518 | -static int sched_ilb_notifier(struct notifier_block *nfb, 519 | - unsigned long action, void *hcpu) 520 | -{ 521 | - switch (action & ~CPU_TASKS_FROZEN) { 522 | - case CPU_DYING: 523 | - nohz_balance_exit_idle(smp_processor_id()); 524 | - return NOTIFY_OK; 525 | - default: 526 | - return NOTIFY_DONE; 527 | - } 528 | +#endif 529 | } 530 | #endif 531 | 532 | -static DEFINE_SPINLOCK(balancing); 533 | - 534 | /* 535 | * Scale the max load_balance interval with the number of CPUs in the system. 536 | * This trades load-balance latency on larger machines for less cross talk. 
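/*
 * Annotation, not part of the diff: with CONFIG_BLD enabled the hunks around
 * this point compile out the mainline balancing machinery -- ttwu_queue_remote(),
 * the SCHED_SOFTIRQ rebalance path, idle_balance() and the nohz idle-balance
 * bookkeeping -- because, for non-deadline tasks, BLD already picks the CPU in
 * select_task_rq() and sched_exec() from the load-ordered runqueue lists that
 * bld.h maintains on every enqueue_task()/dequeue_task().
 */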
537 | @@ -7073,6 +7078,9 @@ void update_max_interval(void) 538 | max_load_balance_interval = HZ*num_online_cpus()/10; 539 | } 540 | 541 | +#ifndef CONFIG_BLD 542 | +static DEFINE_SPINLOCK(balancing); 543 | + 544 | /* 545 | * It checks each scheduling domain to see if it is due to be balanced, 546 | * and initiates a balancing operation if so. 547 | @@ -7321,6 +7329,7 @@ void trigger_load_balance(struct rq *rq) 548 | nohz_balancer_kick(); 549 | #endif 550 | } 551 | +#endif /* CONFIG_BLD */ 552 | 553 | static void rq_online_fair(struct rq *rq) 554 | { 555 | @@ -7764,7 +7773,9 @@ const struct sched_class fair_sched_class = { 556 | .put_prev_task = put_prev_task_fair, 557 | 558 | #ifdef CONFIG_SMP 559 | +#ifndef CONFIG_BLD 560 | .select_task_rq = select_task_rq_fair, 561 | +#endif 562 | .migrate_task_rq = migrate_task_rq_fair, 563 | 564 | .rq_online = rq_online_fair, 565 | @@ -7802,6 +7813,7 @@ void print_cfs_stats(struct seq_file *m, int cpu) 566 | 567 | __init void init_sched_fair_class(void) 568 | { 569 | +#ifndef CONFIG_BLD 570 | #ifdef CONFIG_SMP 571 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 572 | 573 | @@ -7811,5 +7823,5 @@ __init void init_sched_fair_class(void) 574 | cpu_notifier(sched_ilb_notifier, 0); 575 | #endif 576 | #endif /* SMP */ 577 | - 578 | +#endif /* BLD */ 579 | } 580 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 581 | index a490831..c9d22c3 100644 582 | --- a/kernel/sched/rt.c 583 | +++ b/kernel/sched/rt.c 584 | @@ -1291,6 +1291,7 @@ static void yield_task_rt(struct rq *rq) 585 | #ifdef CONFIG_SMP 586 | static int find_lowest_rq(struct task_struct *task); 587 | 588 | +#ifndef CONFIG_BLD 589 | static int 590 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 591 | { 592 | @@ -1344,6 +1345,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 593 | out: 594 | return cpu; 595 | } 596 | +#endif /* CONFIG_BLD */ 597 | 598 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 599 | { 600 | @@ -2108,7 +2110,9 @@ const struct sched_class rt_sched_class = { 601 | .put_prev_task = put_prev_task_rt, 602 | 603 | #ifdef CONFIG_SMP 604 | +#ifndef CONFIG_BLD 605 | .select_task_rq = select_task_rq_rt, 606 | +#endif 607 | 608 | .set_cpus_allowed = set_cpus_allowed_rt, 609 | .rq_online = rq_online_rt, 610 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 611 | index 31cc02e..1c497d2 100644 612 | --- a/kernel/sched/sched.h 613 | +++ b/kernel/sched/sched.h 614 | @@ -358,9 +358,8 @@ struct cfs_rq { 615 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 616 | #endif /* CONFIG_SMP */ 617 | 618 | -#ifdef CONFIG_FAIR_GROUP_SCHED 619 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 620 | - 621 | +#ifdef CONFIG_FAIR_GROUP_SCHED 622 | /* 623 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 624 | * a hierarchy). 
Non-leaf lrqs hold other higher schedulable entities 625 | @@ -384,6 +383,11 @@ struct cfs_rq { 626 | struct list_head throttled_list; 627 | #endif /* CONFIG_CFS_BANDWIDTH */ 628 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 629 | + 630 | +#ifdef CONFIG_BLD 631 | + struct list_head bld_cfs_list; 632 | + char pos; 633 | +#endif 634 | }; 635 | 636 | static inline int rt_bandwidth_enabled(void) 637 | @@ -417,12 +421,16 @@ struct rt_rq { 638 | /* Nests inside the rq lock: */ 639 | raw_spinlock_t rt_runtime_lock; 640 | 641 | + struct rq *rq; 642 | #ifdef CONFIG_RT_GROUP_SCHED 643 | unsigned long rt_nr_boosted; 644 | 645 | - struct rq *rq; 646 | struct task_group *tg; 647 | #endif 648 | +#ifdef CONFIG_BLD 649 | + struct list_head bld_rt_list; 650 | + int lowbit; 651 | +#endif 652 | }; 653 | 654 | /* Deadline class' related fields in a runqueue */ 655 | -------------------------------------------------------------------------------- /BLD-3.19.patch: -------------------------------------------------------------------------------- 1 | BLD patch for Linux-3.19. Rebased on for Linux 3.19. 2 | 3 | diff --git a/init/Kconfig b/init/Kconfig 4 | index 9afb971..062ca7f 100644 5 | --- a/init/Kconfig 6 | +++ b/init/Kconfig 7 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 8 | depends on BROKEN || !SMP 9 | default y 10 | 11 | +config BLD 12 | + bool "An alternate CPU load distribution technique for task scheduler" 13 | + depends on SMP 14 | + default y 15 | + help 16 | + This is an alternate CPU load distribution technique based for task 17 | + scheduler based on The Barbershop Load Distribution algorithm. Not 18 | + suitable for NUMA, should work well on SMP. 19 | + 20 | config INIT_ENV_ARG_LIMIT 21 | int 22 | default 32 if !UML 23 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 24 | new file mode 100644 25 | index 0000000..f1f9fba 26 | --- /dev/null 27 | +++ b/kernel/sched/bld.h 28 | @@ -0,0 +1,215 @@ 29 | +#ifdef CONFIG_BLD 30 | + 31 | +static DEFINE_RWLOCK(rt_list_lock); 32 | +static LIST_HEAD(rt_rq_head); 33 | +static LIST_HEAD(cfs_rq_head); 34 | +static DEFINE_RWLOCK(cfs_list_lock); 35 | + 36 | +#ifdef CONFIG_FAIR_GROUP_SCHED 37 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 38 | +{ 39 | + return cfs_rq->rq; 40 | +} 41 | +#else 42 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 43 | +{ 44 | + return container_of(cfs_rq, struct rq, cfs); 45 | +} 46 | +#endif 47 | + 48 | +#ifdef CONFIG_RT_GROUP_SCHED 49 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 50 | +{ 51 | + return rt_rq->rq; 52 | +} 53 | +#else 54 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 55 | +{ 56 | + return container_of(rt_rq, struct rq, rt); 57 | +} 58 | +#endif 59 | + 60 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 61 | +{ 62 | + int cpu = smp_processor_id(), i; 63 | + unsigned long load, varload; 64 | + struct rq *rq; 65 | + 66 | + if (task_type) { 67 | + varload = ULONG_MAX; 68 | + for_each_cpu(i, mask) { 69 | + rq = cpu_rq(i); 70 | + load = rq->cfs.load.weight; 71 | + if (load < varload) { 72 | + varload = load; 73 | + cpu = i; 74 | + } 75 | + } 76 | + } else { 77 | + /* Here's an attempt to get a CPU within the mask where 78 | + * we can preempt easily. To achieve this we tried to 79 | + * maintain a lowbit, which indicate the lowest bit set on 80 | + * array bitmap. Since all CPUs contains high priority 81 | + * kernel threads therefore we eliminate 0, so it might not 82 | + * be right every time, but it's just an indicator. 
83 | + */ 84 | + varload = 1; 85 | + 86 | + for_each_cpu(i, mask) { 87 | + rq = cpu_rq(i); 88 | + load = rq->rt.lowbit; 89 | + if (load >= varload) { 90 | + varload = load; 91 | + cpu = i; 92 | + } 93 | + } 94 | + } 95 | + 96 | + return cpu; 97 | +} 98 | + 99 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 100 | +{ 101 | + struct cfs_rq *cfs; 102 | + unsigned long flags; 103 | + unsigned int cpu = smp_processor_id(); 104 | + 105 | + read_lock_irqsave(&cfs_list_lock, flags); 106 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 107 | + cpu = cpu_of(rq_of_cfs(cfs)); 108 | + if (cpu_online(cpu)) 109 | + break; 110 | + } 111 | + read_unlock_irqrestore(&cfs_list_lock, flags); 112 | + return cpu; 113 | +} 114 | + 115 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 116 | +{ 117 | + struct rt_rq *rt; 118 | + unsigned long flags; 119 | + unsigned int cpu = smp_processor_id(); 120 | + 121 | + read_lock_irqsave(&rt_list_lock, flags); 122 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 123 | + cpu = cpu_of(rq_of_rt(rt)); 124 | + if (cpu_online(cpu)) 125 | + break; 126 | + } 127 | + read_unlock_irqrestore(&rt_list_lock, flags); 128 | + return cpu; 129 | +} 130 | + 131 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 132 | +{ 133 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 134 | + struct cpumask *tmpmask; 135 | + 136 | + if (p->nr_cpus_allowed == 1) 137 | + return task_cpu(p); 138 | + 139 | + if (sd_flags & SD_BALANCE_WAKE) { 140 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 141 | + want_affine = 1; 142 | + } 143 | + } 144 | + 145 | + if (want_affine) 146 | + tmpmask = tsk_cpus_allowed(p); 147 | + else 148 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 149 | + 150 | + if (rt_task(p)) 151 | + cpu = select_cpu_for_wakeup(0, tmpmask); 152 | + else 153 | + cpu = select_cpu_for_wakeup(1, tmpmask); 154 | + 155 | + return cpu; 156 | +} 157 | + 158 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 159 | +{ 160 | + unsigned long flag; 161 | + int firstbit; 162 | + struct rt_rq *first; 163 | + struct rt_prio_array *array = &rq->rt.active; 164 | + 165 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 166 | + firstbit = sched_find_first_bit(array->bitmap); 167 | + 168 | + /* Maintaining rt.lowbit */ 169 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 170 | + rq->rt.lowbit = firstbit; 171 | + 172 | + if (rq->rt.lowbit < first->lowbit) { 173 | + write_lock_irqsave(&rt_list_lock, flag); 174 | + list_del(&rq->rt.bld_rt_list); 175 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 176 | + write_unlock_irqrestore(&rt_list_lock, flag); 177 | + } 178 | +} 179 | + 180 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 181 | +{ 182 | + unsigned int cpu; 183 | + 184 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 185 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 186 | + else { 187 | + if (rt_task(p)) 188 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 189 | + else 190 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 191 | + } 192 | + 193 | + return cpu; 194 | +} 195 | + 196 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 197 | +{ 198 | + unsigned long flag; 199 | + if (rt_task(p)) { 200 | + track_load_rt(rq, p); 201 | + } else { 202 | + if (rq->cfs.pos != 2) { 203 | + struct cfs_rq *last; 204 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 205 | + if (rq->cfs.load.weight >= last->load.weight) { 206 | + write_lock_irqsave(&cfs_list_lock, flag); 207 | + list_del(&rq->cfs.bld_cfs_list); 208 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 209 | + rq->cfs.pos = 2; last->pos = 1; 210 | + write_unlock_irqrestore(&cfs_list_lock, flag); 211 | + } 212 | + } 213 | + } 214 | +} 215 | + 216 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 217 | +{ 218 | + unsigned long flag; 219 | + if (rt_task(p)) { 220 | + track_load_rt(rq, p); 221 | + } else { 222 | + if (rq->cfs.pos != 0) { 223 | + struct cfs_rq *first; 224 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 225 | + if (rq->cfs.load.weight <= first->load.weight) { 226 | + write_lock_irqsave(&cfs_list_lock, flag); 227 | + list_del(&rq->cfs.bld_cfs_list); 228 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 229 | + rq->cfs.pos = 0; first->pos = 1; 230 | + write_unlock_irqrestore(&cfs_list_lock, flag); 231 | + } 232 | + } 233 | + } 234 | +} 235 | +#else 236 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 237 | +{ 238 | +} 239 | + 240 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 241 | +{ 242 | +} 243 | +#endif /* CONFIG_BLD */ 244 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 245 | index 5eab11d..ab18e8d 100644 246 | --- a/kernel/sched/core.c 247 | +++ b/kernel/sched/core.c 248 | @@ -24,6 +24,8 @@ 249 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 250 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 251 | * Thomas Gleixner, Mike Kravetz 252 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 253 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
254 | */ 255 | 256 | #include 257 | @@ -86,6 +88,7 @@ 258 | #include "sched.h" 259 | #include "../workqueue_internal.h" 260 | #include "../smpboot.h" 261 | +#include "bld.h" 262 | 263 | #define CREATE_TRACE_POINTS 264 | #include 265 | @@ -840,6 +843,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 266 | update_rq_clock(rq); 267 | sched_info_queued(rq, p); 268 | p->sched_class->enqueue_task(rq, p, flags); 269 | + if (!dl_task(p)) 270 | + bld_track_load_activate(rq, p); 271 | } 272 | 273 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 274 | @@ -847,6 +852,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 275 | update_rq_clock(rq); 276 | sched_info_dequeued(rq, p); 277 | p->sched_class->dequeue_task(rq, p, flags); 278 | + if (!dl_task(p)) 279 | + bld_track_load_deactivate(rq, p); 280 | } 281 | 282 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 283 | @@ -1412,7 +1419,14 @@ static inline 284 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 285 | { 286 | if (p->nr_cpus_allowed > 1) 287 | +#ifndef CONFIG_BLD 288 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 289 | +#else 290 | + if(dl_task(p)) 291 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 292 | + else 293 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 294 | +#endif 295 | 296 | /* 297 | * In order not to call set_task_cpu() on a blocking task we need 298 | @@ -1582,7 +1596,11 @@ void scheduler_ipi(void) 299 | */ 300 | preempt_fold_need_resched(); 301 | 302 | +#ifndef CONFIG_BLD 303 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 304 | +#else 305 | + if (llist_empty(&this_rq()->wake_list)) 306 | +#endif 307 | return; 308 | 309 | /* 310 | @@ -1604,13 +1622,16 @@ void scheduler_ipi(void) 311 | /* 312 | * Check if someone kicked us for doing the nohz idle load balance. 313 | */ 314 | +#ifndef CONFIG_BLD 315 | if (unlikely(got_nohz_idle_kick())) { 316 | this_rq()->idle_balance = 1; 317 | raise_softirq_irqoff(SCHED_SOFTIRQ); 318 | } 319 | +#endif 320 | irq_exit(); 321 | } 322 | 323 | +#ifndef CONFIG_BLD 324 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 325 | { 326 | struct rq *rq = cpu_rq(cpu); 327 | @@ -1623,6 +1644,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 328 | } 329 | } 330 | 331 | +#endif 332 | + 333 | +bool cpus_share_cache(int this_cpu, int that_cpu) 334 | +{ 335 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 336 | +} 337 | + 338 | void wake_up_if_idle(int cpu) 339 | { 340 | struct rq *rq = cpu_rq(cpu); 341 | @@ -1646,18 +1674,13 @@ void wake_up_if_idle(int cpu) 342 | out: 343 | rcu_read_unlock(); 344 | } 345 | - 346 | -bool cpus_share_cache(int this_cpu, int that_cpu) 347 | -{ 348 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 349 | -} 350 | #endif /* CONFIG_SMP */ 351 | 352 | static void ttwu_queue(struct task_struct *p, int cpu) 353 | { 354 | struct rq *rq = cpu_rq(cpu); 355 | 356 | -#if defined(CONFIG_SMP) 357 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 358 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 359 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 360 | ttwu_queue_remote(p, cpu); 361 | @@ -1978,7 +2001,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 362 | * Silence PROVE_RCU. 
363 | */ 364 | raw_spin_lock_irqsave(&p->pi_lock, flags); 365 | - set_task_cpu(p, cpu); 366 | + __set_task_cpu(p, cpu); 367 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 368 | 369 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 370 | @@ -2445,7 +2468,14 @@ void sched_exec(void) 371 | int dest_cpu; 372 | 373 | raw_spin_lock_irqsave(&p->pi_lock, flags); 374 | +#ifndef CONFIG_BLD 375 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 376 | +#else 377 | + if (dl_task(p)) 378 | + dest_cpu = task_cpu(p); 379 | + else 380 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 381 | +#endif 382 | if (dest_cpu == smp_processor_id()) 383 | goto unlock; 384 | 385 | @@ -2533,8 +2563,10 @@ void scheduler_tick(void) 386 | 387 | #ifdef CONFIG_SMP 388 | rq->idle_balance = idle_cpu(cpu); 389 | +#ifndef CONFIG_BLD 390 | trigger_load_balance(rq); 391 | #endif 392 | +#endif 393 | rq_last_tick_reset(rq); 394 | } 395 | 396 | @@ -7261,6 +7293,15 @@ void __init sched_init(void) 397 | #endif 398 | init_rq_hrtick(rq); 399 | atomic_set(&rq->nr_iowait, 0); 400 | +#ifdef CONFIG_BLD 401 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 402 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 403 | + rq->cfs.pos = 0; 404 | + 405 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 406 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 407 | + rq->rt.lowbit = INT_MAX; 408 | +#endif 409 | } 410 | 411 | set_load_weight(&init_task); 412 | @@ -7301,6 +7342,9 @@ void __init sched_init(void) 413 | init_sched_fair_class(); 414 | 415 | scheduler_running = 1; 416 | +#ifdef CONFIG_BLD 417 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 418 | +#endif 419 | } 420 | 421 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 422 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 423 | index fe331fc..37d3839 100644 424 | --- a/kernel/sched/fair.c 425 | +++ b/kernel/sched/fair.c 426 | @@ -4351,6 +4351,7 @@ static void task_waking_fair(struct task_struct *p) 427 | record_wakee(p); 428 | } 429 | 430 | +#ifndef CONFIG_BLD 431 | #ifdef CONFIG_FAIR_GROUP_SCHED 432 | /* 433 | * effective_load() calculates the load change as seen from the root_task_group 434 | @@ -4803,6 +4804,7 @@ unlock: 435 | 436 | return new_cpu; 437 | } 438 | +#endif /* CONFIG_BLD */ 439 | 440 | /* 441 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 442 | @@ -5098,6 +5100,7 @@ simple: 443 | return p; 444 | 445 | idle: 446 | +#ifndef CONFIG_BLD 447 | new_tasks = idle_balance(rq); 448 | /* 449 | * Because idle_balance() releases (and re-acquires) rq->lock, it is 450 | @@ -5109,7 +5112,7 @@ idle: 451 | 452 | if (new_tasks > 0) 453 | goto again; 454 | - 455 | +#endif 456 | return NULL; 457 | } 458 | 459 | @@ -7293,12 +7296,39 @@ static inline int on_null_domain(struct rq *rq) 460 | * needed, they will kick the idle load balancer, which then does idle 461 | * load balancing for all the idle CPUs. 462 | */ 463 | +#ifndef CONFIG_BLD 464 | static struct { 465 | cpumask_var_t idle_cpus_mask; 466 | atomic_t nr_cpus; 467 | unsigned long next_balance; /* in jiffy units */ 468 | } nohz ____cacheline_aligned; 469 | 470 | +static inline void nohz_balance_exit_idle(int cpu) 471 | +{ 472 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 473 | + /* 474 | + * Completely isolated CPUs don't ever set, so we must test. 
475 | + */ 476 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 477 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 478 | + atomic_dec(&nohz.nr_cpus); 479 | + } 480 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 481 | + } 482 | +} 483 | + 484 | +static int sched_ilb_notifier(struct notifier_block *nfb, 485 | + unsigned long action, void *hcpu) 486 | +{ 487 | + switch (action & ~CPU_TASKS_FROZEN) { 488 | + case CPU_DYING: 489 | + nohz_balance_exit_idle(smp_processor_id()); 490 | + return NOTIFY_OK; 491 | + default: 492 | + return NOTIFY_DONE; 493 | + } 494 | +} 495 | + 496 | static inline int find_new_ilb(void) 497 | { 498 | int ilb = cpumask_first(nohz.idle_cpus_mask); 499 | @@ -7336,20 +7366,7 @@ static void nohz_balancer_kick(void) 500 | smp_send_reschedule(ilb_cpu); 501 | return; 502 | } 503 | - 504 | -static inline void nohz_balance_exit_idle(int cpu) 505 | -{ 506 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 507 | - /* 508 | - * Completely isolated CPUs don't ever set, so we must test. 509 | - */ 510 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 511 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 512 | - atomic_dec(&nohz.nr_cpus); 513 | - } 514 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 515 | - } 516 | -} 517 | +#endif /* CONFIG_BLD */ 518 | 519 | static inline void set_cpu_sd_state_busy(void) 520 | { 521 | @@ -7391,6 +7408,7 @@ unlock: 522 | */ 523 | void nohz_balance_enter_idle(int cpu) 524 | { 525 | +#ifndef CONFIG_BLD 526 | /* 527 | * If this cpu is going down, then nothing needs to be done. 528 | */ 529 | @@ -7409,23 +7427,10 @@ void nohz_balance_enter_idle(int cpu) 530 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 531 | atomic_inc(&nohz.nr_cpus); 532 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 533 | -} 534 | - 535 | -static int sched_ilb_notifier(struct notifier_block *nfb, 536 | - unsigned long action, void *hcpu) 537 | -{ 538 | - switch (action & ~CPU_TASKS_FROZEN) { 539 | - case CPU_DYING: 540 | - nohz_balance_exit_idle(smp_processor_id()); 541 | - return NOTIFY_OK; 542 | - default: 543 | - return NOTIFY_DONE; 544 | - } 545 | +#endif 546 | } 547 | #endif 548 | 549 | -static DEFINE_SPINLOCK(balancing); 550 | - 551 | /* 552 | * Scale the max load_balance interval with the number of CPUs in the system. 553 | * This trades load-balance latency on larger machines for less cross talk. 554 | @@ -7435,6 +7440,9 @@ void update_max_interval(void) 555 | max_load_balance_interval = HZ*num_online_cpus()/10; 556 | } 557 | 558 | +#ifndef CONFIG_BLD 559 | +static DEFINE_SPINLOCK(balancing); 560 | + 561 | /* 562 | * It checks each scheduling domain to see if it is due to be balanced, 563 | * and initiates a balancing operation if so. 
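/*
 * Annotation, not part of the diff: the lists that replace this balancing
 * live in bld.h.  bld_track_load_activate() moves a cfs_rq to the tail of
 * cfs_rq_head once its weight reaches the current tail's, and
 * bld_track_load_deactivate() moves it back to the head once its weight
 * drops to the current head's, so the list stays roughly ordered from
 * lightest to heaviest and bld_pick_cpu_cfs() can simply return the first
 * online CPU it finds walking from the head.
 */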
564 | @@ -7683,6 +7691,7 @@ void trigger_load_balance(struct rq *rq) 565 | nohz_balancer_kick(); 566 | #endif 567 | } 568 | +#endif /* CONFIG_BLD */ 569 | 570 | static void rq_online_fair(struct rq *rq) 571 | { 572 | @@ -8128,7 +8137,9 @@ const struct sched_class fair_sched_class = { 573 | .put_prev_task = put_prev_task_fair, 574 | 575 | #ifdef CONFIG_SMP 576 | +#ifndef CONFIG_BLD 577 | .select_task_rq = select_task_rq_fair, 578 | +#endif 579 | .migrate_task_rq = migrate_task_rq_fair, 580 | 581 | .rq_online = rq_online_fair, 582 | @@ -8168,6 +8179,7 @@ void print_cfs_stats(struct seq_file *m, int cpu) 583 | 584 | __init void init_sched_fair_class(void) 585 | { 586 | +#ifndef CONFIG_BLD 587 | #ifdef CONFIG_SMP 588 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 589 | 590 | @@ -8177,5 +8189,5 @@ __init void init_sched_fair_class(void) 591 | cpu_notifier(sched_ilb_notifier, 0); 592 | #endif 593 | #endif /* SMP */ 594 | - 595 | +#endif /* BLD */ 596 | } 597 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 598 | index ee15f5a..bfdd0b7 100644 599 | --- a/kernel/sched/rt.c 600 | +++ b/kernel/sched/rt.c 601 | @@ -1295,6 +1295,7 @@ static void yield_task_rt(struct rq *rq) 602 | #ifdef CONFIG_SMP 603 | static int find_lowest_rq(struct task_struct *task); 604 | 605 | +#ifndef CONFIG_BLD 606 | static int 607 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 608 | { 609 | @@ -1345,6 +1346,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 610 | out: 611 | return cpu; 612 | } 613 | +#endif /* CONFIG_BLD */ 614 | 615 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 616 | { 617 | @@ -2114,7 +2116,9 @@ const struct sched_class rt_sched_class = { 618 | .put_prev_task = put_prev_task_rt, 619 | 620 | #ifdef CONFIG_SMP 621 | +#ifndef CONFIG_BLD 622 | .select_task_rq = select_task_rq_rt, 623 | +#endif 624 | 625 | .set_cpus_allowed = set_cpus_allowed_rt, 626 | .rq_online = rq_online_rt, 627 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 628 | index 9a2a45c..24b1c7e 100644 629 | --- a/kernel/sched/sched.h 630 | +++ b/kernel/sched/sched.h 631 | @@ -385,9 +385,8 @@ struct cfs_rq { 632 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 633 | #endif /* CONFIG_SMP */ 634 | 635 | -#ifdef CONFIG_FAIR_GROUP_SCHED 636 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 637 | - 638 | +#ifdef CONFIG_FAIR_GROUP_SCHED 639 | /* 640 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 641 | * a hierarchy). 
Non-leaf lrqs hold other higher schedulable entities 642 | @@ -411,6 +410,11 @@ struct cfs_rq { 643 | struct list_head throttled_list; 644 | #endif /* CONFIG_CFS_BANDWIDTH */ 645 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 646 | + 647 | +#ifdef CONFIG_BLD 648 | + struct list_head bld_cfs_list; 649 | + char pos; 650 | +#endif 651 | }; 652 | 653 | static inline int rt_bandwidth_enabled(void) 654 | @@ -444,12 +448,16 @@ struct rt_rq { 655 | /* Nests inside the rq lock: */ 656 | raw_spinlock_t rt_runtime_lock; 657 | 658 | + struct rq *rq; 659 | #ifdef CONFIG_RT_GROUP_SCHED 660 | unsigned long rt_nr_boosted; 661 | 662 | - struct rq *rq; 663 | struct task_group *tg; 664 | #endif 665 | +#ifdef CONFIG_BLD 666 | + struct list_head bld_rt_list; 667 | + int lowbit; 668 | +#endif 669 | }; 670 | 671 | /* Deadline class' related fields in a runqueue */ 672 | -------------------------------------------------------------------------------- /BLD-3.17.patch: -------------------------------------------------------------------------------- 1 | BLD patch for Linux 3.17, contains a build fix when CONFIG_BLD=n. 2 | Below shows a stat of default netperf run on localhost system 3 | (client/server) running on local system (core i3, 2g ram). 4 | 5 | tcp_stream tcp_rr udp_stream udp_rr 6 | 7 | mainline 9343.54 20812.03 18231.74 24396.074 8 | 18210.71 9 | 10 | bld 14738.35 29224.54 26475.75 34910.08 11 | 26462.53 12 | 13 | These are average of 5 runs of each tests. BLD performs better 14 | and shows ~(35-40)% improvement. And, recently Luis Cruz backports 15 | BLD's previous release BLD-3.16 for Android and experimentally 16 | ran it on his galaxy SIII, these could be found at following link: 17 | 18 | https://github.com/SyNtheticNightmar3/bld-patches 19 | 20 | If you are interested in running it on Android, take a look at the 21 | above link. 22 | 23 | Thanks, 24 | Rakib 25 | 26 | Signed-off-by: Rakib Mullick 27 | --- 28 | 29 | diff --git a/init/Kconfig b/init/Kconfig 30 | index 80a6907..65319c6 100644 31 | --- a/init/Kconfig 32 | +++ b/init/Kconfig 33 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 34 | depends on BROKEN || !SMP 35 | default y 36 | 37 | +config BLD 38 | + bool "An alternate CPU load distribution technique for task scheduler" 39 | + depends on SMP 40 | + default y 41 | + help 42 | + This is an alternate CPU load distribution technique based for task 43 | + scheduler based on The Barbershop Load Distribution algorithm. Not 44 | + suitable for NUMA, should work well on SMP. 
45 | + 46 | config INIT_ENV_ARG_LIMIT 47 | int 48 | default 32 if !UML 49 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 50 | new file mode 100644 51 | index 0000000..097dd23 52 | --- /dev/null 53 | +++ b/kernel/sched/bld.h 54 | @@ -0,0 +1,207 @@ 55 | +#ifdef CONFIG_BLD 56 | + 57 | +static DEFINE_RWLOCK(rt_list_lock); 58 | +static LIST_HEAD(rt_rq_head); 59 | +static LIST_HEAD(cfs_rq_head); 60 | +static DEFINE_RWLOCK(cfs_list_lock); 61 | + 62 | +#ifdef CONFIG_FAIR_GROUP_SCHED 63 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 64 | +{ 65 | + return cfs_rq->rq; 66 | +} 67 | +#else 68 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 69 | +{ 70 | + return container_of(cfs_rq, struct rq, cfs); 71 | +} 72 | +#endif 73 | + 74 | +#ifdef CONFIG_RT_GROUP_SCHED 75 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 76 | +{ 77 | + return rt_rq->rq; 78 | +} 79 | +#else 80 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 81 | +{ 82 | + return container_of(rt_rq, struct rq, rt); 83 | +} 84 | +#endif 85 | + 86 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 87 | +{ 88 | + int cpu = smp_processor_id(), i; 89 | + unsigned long load, min_load = ULONG_MAX; 90 | + struct rq *rq; 91 | + 92 | + if (task_type) { 93 | + for_each_cpu(i, mask) { 94 | + rq = cpu_rq(i); 95 | + load = rq->cfs.load.weight; 96 | + if (load < min_load) { 97 | + min_load = load; 98 | + cpu = i; 99 | + } 100 | + } 101 | + } else { 102 | + min_load = -1; 103 | + 104 | + for_each_cpu(i, mask) { 105 | + rq = cpu_rq(i); 106 | + load = rq->rt.lowbit; 107 | + if (load > min_load) { 108 | + min_load = load; 109 | + cpu = i; 110 | + } 111 | + } 112 | + } 113 | + 114 | + return cpu; 115 | +} 116 | + 117 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 118 | +{ 119 | + struct cfs_rq *cfs; 120 | + unsigned long flags; 121 | + unsigned int cpu = smp_processor_id(); 122 | + 123 | + read_lock_irqsave(&cfs_list_lock, flags); 124 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 125 | + cpu = cpu_of(rq_of_cfs(cfs)); 126 | + if (cpu_online(cpu)) 127 | + break; 128 | + } 129 | + read_unlock_irqrestore(&cfs_list_lock, flags); 130 | + return cpu; 131 | +} 132 | + 133 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 134 | +{ 135 | + struct rt_rq *rt; 136 | + unsigned long flags; 137 | + unsigned int cpu = smp_processor_id(); 138 | + 139 | + read_lock_irqsave(&rt_list_lock, flags); 140 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 141 | + cpu = cpu_of(rq_of_rt(rt)); 142 | + if (cpu_online(cpu)) 143 | + break; 144 | + } 145 | + read_unlock_irqrestore(&rt_list_lock, flags); 146 | + return cpu; 147 | +} 148 | + 149 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 150 | +{ 151 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 152 | + struct cpumask *tmpmask; 153 | + 154 | + if (p->nr_cpus_allowed == 1) 155 | + return task_cpu(p); 156 | + 157 | + if (sd_flags & SD_BALANCE_WAKE) { 158 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 159 | + want_affine = 1; 160 | + } 161 | + } 162 | + 163 | + if (want_affine) 164 | + tmpmask = tsk_cpus_allowed(p); 165 | + else 166 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 167 | + 168 | + if (rt_task(p)) 169 | + cpu = select_cpu_for_wakeup(0, tmpmask); 170 | + else 171 | + cpu = select_cpu_for_wakeup(1, tmpmask); 172 | + 173 | + return cpu; 174 | +} 175 | + 176 | +static void track_load_rt(struct rq *rq, struct 
task_struct *p) 177 | +{ 178 | + unsigned long flag; 179 | + int firstbit; 180 | + struct rt_rq *first; 181 | + struct rt_prio_array *array = &rq->rt.active; 182 | + 183 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 184 | + firstbit = sched_find_first_bit(array->bitmap); 185 | + 186 | + /* Maintaining rt.lowbit */ 187 | + if (firstbit <= rq->rt.lowbit) 188 | + rq->rt.lowbit = p->prio; 189 | + 190 | + if (rq->rt.lowbit < first->lowbit) { 191 | + write_lock_irqsave(&rt_list_lock, flag); 192 | + list_del(&rq->rt.bld_rt_list); 193 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 194 | + write_unlock_irqrestore(&rt_list_lock, flag); 195 | + } 196 | +} 197 | + 198 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 199 | +{ 200 | + unsigned int cpu; 201 | + 202 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 203 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 204 | + else { 205 | + if (rt_task(p)) 206 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 207 | + else 208 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 209 | + } 210 | + 211 | + return cpu; 212 | +} 213 | + 214 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 2) { 221 | + struct cfs_rq *last; 222 | + last = list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight >= last->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 2; last->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | + 234 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | + unsigned long flag; 237 | + if (rt_task(p)) { 238 | + track_load_rt(rq, p); 239 | + } else { 240 | + if (rq->cfs.pos != 0) { 241 | + struct cfs_rq *first; 242 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 243 | + if (rq->cfs.load.weight <= first->load.weight) { 244 | + write_lock_irqsave(&cfs_list_lock, flag); 245 | + list_del(&rq->cfs.bld_cfs_list); 246 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 247 | + rq->cfs.pos = 0; first->pos = 1; 248 | + write_unlock_irqrestore(&cfs_list_lock, flag); 249 | + } 250 | + } 251 | + } 252 | +} 253 | +#else 254 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 255 | +{ 256 | +} 257 | + 258 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 259 | +{ 260 | +} 261 | +#endif /* CONFIG_BLD */ 262 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 263 | index ec1a286..e2c3cef 100644 264 | --- a/kernel/sched/core.c 265 | +++ b/kernel/sched/core.c 266 | @@ -24,6 +24,8 @@ 267 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 268 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 269 | * Thomas Gleixner, Mike Kravetz 270 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 271 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
272 | */ 273 | 274 | #include 275 | @@ -86,6 +88,7 @@ 276 | #include "sched.h" 277 | #include "../workqueue_internal.h" 278 | #include "../smpboot.h" 279 | +#include "bld.h" 280 | 281 | #define CREATE_TRACE_POINTS 282 | #include 283 | @@ -842,6 +845,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 284 | update_rq_clock(rq); 285 | sched_info_queued(rq, p); 286 | p->sched_class->enqueue_task(rq, p, flags); 287 | + if (!dl_task(p)) 288 | + bld_track_load_activate(rq, p); 289 | } 290 | 291 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 292 | @@ -849,6 +854,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 293 | update_rq_clock(rq); 294 | sched_info_dequeued(rq, p); 295 | p->sched_class->dequeue_task(rq, p, flags); 296 | + if (!dl_task(p)) 297 | + bld_track_load_deactivate(rq, p); 298 | } 299 | 300 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 301 | @@ -1409,7 +1416,14 @@ out: 302 | static inline 303 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 304 | { 305 | +#ifndef CONFIG_BLD 306 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 307 | +#else 308 | + if (dl_task(p)) 309 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 310 | + else 311 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 312 | +#endif 313 | 314 | /* 315 | * In order not to call set_task_cpu() on a blocking task we need 316 | @@ -1579,7 +1593,11 @@ void scheduler_ipi(void) 317 | */ 318 | preempt_fold_need_resched(); 319 | 320 | +#ifndef CONFIG_BLD 321 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 322 | +#else 323 | + if (llist_empty(&this_rq()->wake_list)) 324 | +#endif 325 | return; 326 | 327 | /* 328 | @@ -1601,13 +1619,16 @@ void scheduler_ipi(void) 329 | /* 330 | * Check if someone kicked us for doing the nohz idle load balance. 331 | */ 332 | +#ifndef CONFIG_BLD 333 | if (unlikely(got_nohz_idle_kick())) { 334 | this_rq()->idle_balance = 1; 335 | raise_softirq_irqoff(SCHED_SOFTIRQ); 336 | } 337 | +#endif 338 | irq_exit(); 339 | } 340 | 341 | +#ifndef CONFIG_BLD 342 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 343 | { 344 | struct rq *rq = cpu_rq(cpu); 345 | @@ -1619,6 +1640,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 346 | trace_sched_wake_idle_without_ipi(cpu); 347 | } 348 | } 349 | +#endif 350 | 351 | bool cpus_share_cache(int this_cpu, int that_cpu) 352 | { 353 | @@ -1630,7 +1652,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) 354 | { 355 | struct rq *rq = cpu_rq(cpu); 356 | 357 | -#if defined(CONFIG_SMP) 358 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 359 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 360 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 361 | ttwu_queue_remote(p, cpu); 362 | @@ -1938,7 +1960,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 363 | * Silence PROVE_RCU. 
364 | */ 365 | raw_spin_lock_irqsave(&p->pi_lock, flags); 366 | - set_task_cpu(p, cpu); 367 | + __set_task_cpu(p, cpu); 368 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 369 | 370 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 371 | @@ -2413,7 +2435,14 @@ void sched_exec(void) 372 | int dest_cpu; 373 | 374 | raw_spin_lock_irqsave(&p->pi_lock, flags); 375 | +#ifndef CONFIG_BLD 376 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 377 | +#else 378 | + if (dl_task(p)) 379 | + dest_cpu = task_cpu(p); 380 | + else 381 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 382 | +#endif 383 | if (dest_cpu == smp_processor_id()) 384 | goto unlock; 385 | 386 | @@ -2530,8 +2559,10 @@ void scheduler_tick(void) 387 | 388 | #ifdef CONFIG_SMP 389 | rq->idle_balance = idle_cpu(cpu); 390 | +#ifndef CONFIG_BLD 391 | trigger_load_balance(rq); 392 | #endif 393 | +#endif 394 | rq_last_tick_reset(rq); 395 | } 396 | 397 | @@ -7030,6 +7061,15 @@ void __init sched_init(void) 398 | #endif 399 | init_rq_hrtick(rq); 400 | atomic_set(&rq->nr_iowait, 0); 401 | +#ifdef CONFIG_BLD 402 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 403 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 404 | + rq->cfs.pos = 0; 405 | + 406 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 407 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 408 | + rq->rt.lowbit = INT_MAX; 409 | +#endif 410 | } 411 | 412 | set_load_weight(&init_task); 413 | @@ -7070,6 +7110,9 @@ void __init sched_init(void) 414 | init_sched_fair_class(); 415 | 416 | scheduler_running = 1; 417 | +#ifdef CONFIG_BLD 418 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 419 | +#endif 420 | } 421 | 422 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 423 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 424 | index bfa3c86..20fc00c 100644 425 | --- a/kernel/sched/fair.c 426 | +++ b/kernel/sched/fair.c 427 | @@ -4136,6 +4136,7 @@ static void task_waking_fair(struct task_struct *p) 428 | record_wakee(p); 429 | } 430 | 431 | +#ifndef CONFIG_BLD 432 | #ifdef CONFIG_FAIR_GROUP_SCHED 433 | /* 434 | * effective_load() calculates the load change as seen from the root_task_group 435 | @@ -4585,6 +4586,7 @@ unlock: 436 | 437 | return new_cpu; 438 | } 439 | +#endif /* CONFIG_BLD */ 440 | 441 | /* 442 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 443 | @@ -4880,6 +4882,7 @@ simple: 444 | return p; 445 | 446 | idle: 447 | +#ifndef CONFIG_BLD 448 | new_tasks = idle_balance(rq); 449 | /* 450 | * Because idle_balance() releases (and re-acquires) rq->lock, it is 451 | @@ -4891,7 +4894,7 @@ idle: 452 | 453 | if (new_tasks > 0) 454 | goto again; 455 | - 456 | +#endif 457 | return NULL; 458 | } 459 | 460 | @@ -6981,12 +6984,39 @@ static inline int on_null_domain(struct rq *rq) 461 | * needed, they will kick the idle load balancer, which then does idle 462 | * load balancing for all the idle CPUs. 463 | */ 464 | +#ifndef CONFIG_BLD 465 | static struct { 466 | cpumask_var_t idle_cpus_mask; 467 | atomic_t nr_cpus; 468 | unsigned long next_balance; /* in jiffy units */ 469 | } nohz ____cacheline_aligned; 470 | 471 | +static inline void nohz_balance_exit_idle(int cpu) 472 | +{ 473 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 474 | + /* 475 | + * Completely isolated CPUs don't ever set, so we must test. 
476 | + */ 477 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 478 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 479 | + atomic_dec(&nohz.nr_cpus); 480 | + } 481 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 482 | + } 483 | +} 484 | + 485 | +static int sched_ilb_notifier(struct notifier_block *nfb, 486 | + unsigned long action, void *hcpu) 487 | +{ 488 | + switch (action & ~CPU_TASKS_FROZEN) { 489 | + case CPU_DYING: 490 | + nohz_balance_exit_idle(smp_processor_id()); 491 | + return NOTIFY_OK; 492 | + default: 493 | + return NOTIFY_DONE; 494 | + } 495 | +} 496 | + 497 | static inline int find_new_ilb(void) 498 | { 499 | int ilb = cpumask_first(nohz.idle_cpus_mask); 500 | @@ -7024,20 +7054,7 @@ static void nohz_balancer_kick(void) 501 | smp_send_reschedule(ilb_cpu); 502 | return; 503 | } 504 | - 505 | -static inline void nohz_balance_exit_idle(int cpu) 506 | -{ 507 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 508 | - /* 509 | - * Completely isolated CPUs don't ever set, so we must test. 510 | - */ 511 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 512 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 513 | - atomic_dec(&nohz.nr_cpus); 514 | - } 515 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 516 | - } 517 | -} 518 | +#endif /* CONFIG_BLD */ 519 | 520 | static inline void set_cpu_sd_state_busy(void) 521 | { 522 | @@ -7079,6 +7096,7 @@ unlock: 523 | */ 524 | void nohz_balance_enter_idle(int cpu) 525 | { 526 | +#ifndef CONFIG_BLD 527 | /* 528 | * If this cpu is going down, then nothing needs to be done. 529 | */ 530 | @@ -7097,23 +7115,10 @@ void nohz_balance_enter_idle(int cpu) 531 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 532 | atomic_inc(&nohz.nr_cpus); 533 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 534 | -} 535 | - 536 | -static int sched_ilb_notifier(struct notifier_block *nfb, 537 | - unsigned long action, void *hcpu) 538 | -{ 539 | - switch (action & ~CPU_TASKS_FROZEN) { 540 | - case CPU_DYING: 541 | - nohz_balance_exit_idle(smp_processor_id()); 542 | - return NOTIFY_OK; 543 | - default: 544 | - return NOTIFY_DONE; 545 | - } 546 | +#endif 547 | } 548 | #endif 549 | 550 | -static DEFINE_SPINLOCK(balancing); 551 | - 552 | /* 553 | * Scale the max load_balance interval with the number of CPUs in the system. 554 | * This trades load-balance latency on larger machines for less cross talk. 555 | @@ -7123,6 +7128,9 @@ void update_max_interval(void) 556 | max_load_balance_interval = HZ*num_online_cpus()/10; 557 | } 558 | 559 | +#ifndef CONFIG_BLD 560 | +static DEFINE_SPINLOCK(balancing); 561 | + 562 | /* 563 | * It checks each scheduling domain to see if it is due to be balanced, 564 | * and initiates a balancing operation if so. 
565 | @@ -7371,6 +7379,7 @@ void trigger_load_balance(struct rq *rq) 566 | nohz_balancer_kick(); 567 | #endif 568 | } 569 | +#endif /* CONFIG_BLD */ 570 | 571 | static void rq_online_fair(struct rq *rq) 572 | { 573 | @@ -7816,7 +7825,9 @@ const struct sched_class fair_sched_class = { 574 | .put_prev_task = put_prev_task_fair, 575 | 576 | #ifdef CONFIG_SMP 577 | +#ifndef CONFIG_BLD 578 | .select_task_rq = select_task_rq_fair, 579 | +#endif 580 | .migrate_task_rq = migrate_task_rq_fair, 581 | 582 | .rq_online = rq_online_fair, 583 | @@ -7854,6 +7865,7 @@ void print_cfs_stats(struct seq_file *m, int cpu) 584 | 585 | __init void init_sched_fair_class(void) 586 | { 587 | +#ifndef CONFIG_BLD 588 | #ifdef CONFIG_SMP 589 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 590 | 591 | @@ -7863,5 +7875,5 @@ __init void init_sched_fair_class(void) 592 | cpu_notifier(sched_ilb_notifier, 0); 593 | #endif 594 | #endif /* SMP */ 595 | - 596 | +#endif /* BLD */ 597 | } 598 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 599 | index 5f6edca..ea0946c 100644 600 | --- a/kernel/sched/rt.c 601 | +++ b/kernel/sched/rt.c 602 | @@ -1295,6 +1295,7 @@ static void yield_task_rt(struct rq *rq) 603 | #ifdef CONFIG_SMP 604 | static int find_lowest_rq(struct task_struct *task); 605 | 606 | +#ifndef CONFIG_BLD 607 | static int 608 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 609 | { 610 | @@ -1348,6 +1349,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 611 | out: 612 | return cpu; 613 | } 614 | +#endif /* CONFIG_BLD */ 615 | 616 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 617 | { 618 | @@ -2112,7 +2114,9 @@ const struct sched_class rt_sched_class = { 619 | .put_prev_task = put_prev_task_rt, 620 | 621 | #ifdef CONFIG_SMP 622 | +#ifndef CONFIG_BLD 623 | .select_task_rq = select_task_rq_rt, 624 | +#endif 625 | 626 | .set_cpus_allowed = set_cpus_allowed_rt, 627 | .rq_online = rq_online_rt, 628 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 629 | index 579712f..a00914d 100644 630 | --- a/kernel/sched/sched.h 631 | +++ b/kernel/sched/sched.h 632 | @@ -358,9 +358,8 @@ struct cfs_rq { 633 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 634 | #endif /* CONFIG_SMP */ 635 | 636 | -#ifdef CONFIG_FAIR_GROUP_SCHED 637 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 638 | - 639 | +#ifdef CONFIG_FAIR_GROUP_SCHED 640 | /* 641 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 642 | * a hierarchy). 
Non-leaf lrqs hold other higher schedulable entities 643 | @@ -384,6 +383,11 @@ struct cfs_rq { 644 | struct list_head throttled_list; 645 | #endif /* CONFIG_CFS_BANDWIDTH */ 646 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 647 | + 648 | +#ifdef CONFIG_BLD 649 | + struct list_head bld_cfs_list; 650 | + char pos; 651 | +#endif 652 | }; 653 | 654 | static inline int rt_bandwidth_enabled(void) 655 | @@ -417,12 +421,16 @@ struct rt_rq { 656 | /* Nests inside the rq lock: */ 657 | raw_spinlock_t rt_runtime_lock; 658 | 659 | + struct rq *rq; 660 | #ifdef CONFIG_RT_GROUP_SCHED 661 | unsigned long rt_nr_boosted; 662 | 663 | - struct rq *rq; 664 | struct task_group *tg; 665 | #endif 666 | +#ifdef CONFIG_BLD 667 | + struct list_head bld_rt_list; 668 | + int lowbit; 669 | +#endif 670 | }; 671 | 672 | /* Deadline class' related fields in a runqueue */ 673 | -------------------------------------------------------------------------------- /BLD-3.18.patch: -------------------------------------------------------------------------------- 1 | BLD patch for Linux-3.18. Changes since previous release: 2 | 3 | * Contains changes to address an issue at wakeup time of 4 | rt tasks, reported by Peter Junos. 5 | 6 | BLD also has a small positive impact on vmlinux size, as the 7 | following shows: 8 | 9 | $ cat size.mainline 10 | text data bss dec hex filename 11 | 12769041 2056008 11722752 26547801 1951659 vmlinux 12 | 13 | $ cat size.bld 14 | text data bss dec hex filename 15 | 12755462 2056040 11722752 26534254 194e16e vmlinux 16 | 17 | The config used can be found here: 18 | 19 | https://raw.githubusercontent.com/rmullick/bld-patches/master/config.benchmark-3.17 20 | 21 | Since the previous release, Mike Galbraith has shown that BLD can 22 | reduce throughput significantly, due to L2 misses, on systems where 23 | no L3 cache is available; that issue is yet to be addressed. I lack 24 | that kind of hardware, so it might take some time. 25 | 26 | Thanks, 27 | Rakib 28 | 29 | --- 30 | 31 | diff --git a/init/Kconfig b/init/Kconfig 32 | index 2081a4d..becfd85 100644 33 | --- a/init/Kconfig 34 | +++ b/init/Kconfig 35 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 36 | depends on BROKEN || !SMP 37 | default y 38 | 39 | +config BLD 40 | + bool "An alternate CPU load distribution technique for task scheduler" 41 | + depends on SMP 42 | + default y 43 | + help 44 | + This is an alternate CPU load distribution technique based for task 45 | + scheduler based on The Barbershop Load Distribution algorithm. Not 46 | + suitable for NUMA, should work well on SMP.
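For readers new to the algorithm named in the help text, the core idea is easy to sketch outside the kernel. The toy program below is an illustrative userspace model only (made-up load values and a linear scan instead of the patch's ordered runqueue list): every waking task is handed to the currently least-loaded CPU, which is the "barbershop" queue discipline the patch applies to struct rq objects.

/* Toy model of the Barbershop idea: hand every waking task to the
 * currently least-loaded CPU. Illustrative only -- the patch orders
 * struct rq objects by cfs load weight and picks from the head of
 * that list; here the loads are made up and we simply scan. */
#include <stdio.h>

#define NR_CPUS 4

struct toy_rq {
	int cpu;
	unsigned long load;	/* stand-in for rq->cfs.load.weight */
};

static int least_loaded(struct toy_rq rqs[])
{
	int i, best = 0;

	for (i = 1; i < NR_CPUS; i++)
		if (rqs[i].load < rqs[best].load)
			best = i;
	return best;
}

int main(void)
{
	struct toy_rq rqs[NR_CPUS] = { {0, 0}, {1, 0}, {2, 0}, {3, 0} };
	int t, cpu;

	/* Wake eight equal-weight tasks; each lands on the emptiest CPU. */
	for (t = 0; t < 8; t++) {
		cpu = least_loaded(rqs);
		rqs[cpu].load += 1024;	/* roughly one nice-0 task */
		printf("task %d -> cpu %d (load now %lu)\n",
		       t, cpu, rqs[cpu].load);
	}
	return 0;
}

With equal weights the tasks end up two per CPU. Distributing load at wakeup and exec time like this is also why the periodic load balancer is compiled out elsewhere in the patch.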
47 | + 48 | config INIT_ENV_ARG_LIMIT 49 | int 50 | default 32 if !UML 51 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 52 | new file mode 100644 53 | index 0000000..f1f9fba 54 | --- /dev/null 55 | +++ b/kernel/sched/bld.h 56 | @@ -0,0 +1,215 @@ 57 | +#ifdef CONFIG_BLD 58 | + 59 | +static DEFINE_RWLOCK(rt_list_lock); 60 | +static LIST_HEAD(rt_rq_head); 61 | +static LIST_HEAD(cfs_rq_head); 62 | +static DEFINE_RWLOCK(cfs_list_lock); 63 | + 64 | +#ifdef CONFIG_FAIR_GROUP_SCHED 65 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 66 | +{ 67 | + return cfs_rq->rq; 68 | +} 69 | +#else 70 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 71 | +{ 72 | + return container_of(cfs_rq, struct rq, cfs); 73 | +} 74 | +#endif 75 | + 76 | +#ifdef CONFIG_RT_GROUP_SCHED 77 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 78 | +{ 79 | + return rt_rq->rq; 80 | +} 81 | +#else 82 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 83 | +{ 84 | + return container_of(rt_rq, struct rq, rt); 85 | +} 86 | +#endif 87 | + 88 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 89 | +{ 90 | + int cpu = smp_processor_id(), i; 91 | + unsigned long load, varload; 92 | + struct rq *rq; 93 | + 94 | + if (task_type) { 95 | + varload = ULONG_MAX; 96 | + for_each_cpu(i, mask) { 97 | + rq = cpu_rq(i); 98 | + load = rq->cfs.load.weight; 99 | + if (load < varload) { 100 | + varload = load; 101 | + cpu = i; 102 | + } 103 | + } 104 | + } else { 105 | + /* Here's an attempt to get a CPU within the mask where 106 | + * we can preempt easily. To achieve this we tried to 107 | + * maintain a lowbit, which indicate the lowest bit set on 108 | + * array bitmap. Since all CPUs contains high priority 109 | + * kernel threads therefore we eliminate 0, so it might not 110 | + * be right every time, but it's just an indicator. 
111 | + */ 112 | + varload = 1; 113 | + 114 | + for_each_cpu(i, mask) { 115 | + rq = cpu_rq(i); 116 | + load = rq->rt.lowbit; 117 | + if (load >= varload) { 118 | + varload = load; 119 | + cpu = i; 120 | + } 121 | + } 122 | + } 123 | + 124 | + return cpu; 125 | +} 126 | + 127 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 128 | +{ 129 | + struct cfs_rq *cfs; 130 | + unsigned long flags; 131 | + unsigned int cpu = smp_processor_id(); 132 | + 133 | + read_lock_irqsave(&cfs_list_lock, flags); 134 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 135 | + cpu = cpu_of(rq_of_cfs(cfs)); 136 | + if (cpu_online(cpu)) 137 | + break; 138 | + } 139 | + read_unlock_irqrestore(&cfs_list_lock, flags); 140 | + return cpu; 141 | +} 142 | + 143 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 144 | +{ 145 | + struct rt_rq *rt; 146 | + unsigned long flags; 147 | + unsigned int cpu = smp_processor_id(); 148 | + 149 | + read_lock_irqsave(&rt_list_lock, flags); 150 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 151 | + cpu = cpu_of(rq_of_rt(rt)); 152 | + if (cpu_online(cpu)) 153 | + break; 154 | + } 155 | + read_unlock_irqrestore(&rt_list_lock, flags); 156 | + return cpu; 157 | +} 158 | + 159 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 160 | +{ 161 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 162 | + struct cpumask *tmpmask; 163 | + 164 | + if (p->nr_cpus_allowed == 1) 165 | + return task_cpu(p); 166 | + 167 | + if (sd_flags & SD_BALANCE_WAKE) { 168 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 169 | + want_affine = 1; 170 | + } 171 | + } 172 | + 173 | + if (want_affine) 174 | + tmpmask = tsk_cpus_allowed(p); 175 | + else 176 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 177 | + 178 | + if (rt_task(p)) 179 | + cpu = select_cpu_for_wakeup(0, tmpmask); 180 | + else 181 | + cpu = select_cpu_for_wakeup(1, tmpmask); 182 | + 183 | + return cpu; 184 | +} 185 | + 186 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 187 | +{ 188 | + unsigned long flag; 189 | + int firstbit; 190 | + struct rt_rq *first; 191 | + struct rt_prio_array *array = &rq->rt.active; 192 | + 193 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 194 | + firstbit = sched_find_first_bit(array->bitmap); 195 | + 196 | + /* Maintaining rt.lowbit */ 197 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 198 | + rq->rt.lowbit = firstbit; 199 | + 200 | + if (rq->rt.lowbit < first->lowbit) { 201 | + write_lock_irqsave(&rt_list_lock, flag); 202 | + list_del(&rq->rt.bld_rt_list); 203 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 204 | + write_unlock_irqrestore(&rt_list_lock, flag); 205 | + } 206 | +} 207 | + 208 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 209 | +{ 210 | + unsigned int cpu; 211 | + 212 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 213 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 214 | + else { 215 | + if (rt_task(p)) 216 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 217 | + else 218 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 219 | + } 220 | + 221 | + return cpu; 222 | +} 223 | + 224 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 225 | +{ 226 | + unsigned long flag; 227 | + if (rt_task(p)) { 228 | + track_load_rt(rq, p); 229 | + } else { 230 | + if (rq->cfs.pos != 2) { 231 | + struct cfs_rq *last; 
232 | + last = list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 233 | + if (rq->cfs.load.weight >= last->load.weight) { 234 | + write_lock_irqsave(&cfs_list_lock, flag); 235 | + list_del(&rq->cfs.bld_cfs_list); 236 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 237 | + rq->cfs.pos = 2; last->pos = 1; 238 | + write_unlock_irqrestore(&cfs_list_lock, flag); 239 | + } 240 | + } 241 | + } 242 | +} 243 | + 244 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 245 | +{ 246 | + unsigned long flag; 247 | + if (rt_task(p)) { 248 | + track_load_rt(rq, p); 249 | + } else { 250 | + if (rq->cfs.pos != 0) { 251 | + struct cfs_rq *first; 252 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 253 | + if (rq->cfs.load.weight <= first->load.weight) { 254 | + write_lock_irqsave(&cfs_list_lock, flag); 255 | + list_del(&rq->cfs.bld_cfs_list); 256 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 257 | + rq->cfs.pos = 0; first->pos = 1; 258 | + write_unlock_irqrestore(&cfs_list_lock, flag); 259 | + } 260 | + } 261 | + } 262 | +} 263 | +#else 264 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 265 | +{ 266 | +} 267 | + 268 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 269 | +{ 270 | +} 271 | +#endif /* CONFIG_BLD */ 272 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 273 | index 89e7283..bd702c6 100644 274 | --- a/kernel/sched/core.c 275 | +++ b/kernel/sched/core.c 276 | @@ -24,6 +24,8 @@ 277 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 278 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 279 | * Thomas Gleixner, Mike Kravetz 280 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 281 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
282 | */ 283 | 284 | #include 285 | @@ -86,6 +88,7 @@ 286 | #include "sched.h" 287 | #include "../workqueue_internal.h" 288 | #include "../smpboot.h" 289 | +#include "bld.h" 290 | 291 | #define CREATE_TRACE_POINTS 292 | #include 293 | @@ -840,6 +843,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 294 | update_rq_clock(rq); 295 | sched_info_queued(rq, p); 296 | p->sched_class->enqueue_task(rq, p, flags); 297 | + if (!dl_task(p)) 298 | + bld_track_load_activate(rq, p); 299 | } 300 | 301 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 302 | @@ -847,6 +852,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 303 | update_rq_clock(rq); 304 | sched_info_dequeued(rq, p); 305 | p->sched_class->dequeue_task(rq, p, flags); 306 | + if (!dl_task(p)) 307 | + bld_track_load_deactivate(rq, p); 308 | } 309 | 310 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 311 | @@ -1407,7 +1414,14 @@ out: 312 | static inline 313 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 314 | { 315 | +#ifndef CONFIG_BLD 316 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 317 | +#else 318 | + if (dl_task(p)) 319 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 320 | + else 321 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 322 | +#endif 323 | 324 | /* 325 | * In order not to call set_task_cpu() on a blocking task we need 326 | @@ -1577,7 +1591,11 @@ void scheduler_ipi(void) 327 | */ 328 | preempt_fold_need_resched(); 329 | 330 | +#ifndef CONFIG_BLD 331 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 332 | +#else 333 | + if (llist_empty(&this_rq()->wake_list)) 334 | +#endif 335 | return; 336 | 337 | /* 338 | @@ -1599,13 +1617,16 @@ void scheduler_ipi(void) 339 | /* 340 | * Check if someone kicked us for doing the nohz idle load balance. 341 | */ 342 | +#ifndef CONFIG_BLD 343 | if (unlikely(got_nohz_idle_kick())) { 344 | this_rq()->idle_balance = 1; 345 | raise_softirq_irqoff(SCHED_SOFTIRQ); 346 | } 347 | +#endif 348 | irq_exit(); 349 | } 350 | 351 | +#ifndef CONFIG_BLD 352 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 353 | { 354 | struct rq *rq = cpu_rq(cpu); 355 | @@ -1618,6 +1639,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 356 | } 357 | } 358 | 359 | +#endif 360 | + 361 | +bool cpus_share_cache(int this_cpu, int that_cpu) 362 | +{ 363 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 364 | +} 365 | + 366 | void wake_up_if_idle(int cpu) 367 | { 368 | struct rq *rq = cpu_rq(cpu); 369 | @@ -1636,18 +1664,13 @@ void wake_up_if_idle(int cpu) 370 | raw_spin_unlock_irqrestore(&rq->lock, flags); 371 | } 372 | } 373 | - 374 | -bool cpus_share_cache(int this_cpu, int that_cpu) 375 | -{ 376 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 377 | -} 378 | #endif /* CONFIG_SMP */ 379 | 380 | static void ttwu_queue(struct task_struct *p, int cpu) 381 | { 382 | struct rq *rq = cpu_rq(cpu); 383 | 384 | -#if defined(CONFIG_SMP) 385 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 386 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 387 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 388 | ttwu_queue_remote(p, cpu); 389 | @@ -1966,7 +1989,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 390 | * Silence PROVE_RCU. 
391 | */ 392 | raw_spin_lock_irqsave(&p->pi_lock, flags); 393 | - set_task_cpu(p, cpu); 394 | + __set_task_cpu(p, cpu); 395 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 396 | 397 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 398 | @@ -2451,7 +2474,14 @@ void sched_exec(void) 399 | int dest_cpu; 400 | 401 | raw_spin_lock_irqsave(&p->pi_lock, flags); 402 | +#ifndef CONFIG_BLD 403 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 404 | +#else 405 | + if (dl_task(p)) 406 | + dest_cpu = task_cpu(p); 407 | + else 408 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 409 | +#endif 410 | if (dest_cpu == smp_processor_id()) 411 | goto unlock; 412 | 413 | @@ -2539,8 +2569,10 @@ void scheduler_tick(void) 414 | 415 | #ifdef CONFIG_SMP 416 | rq->idle_balance = idle_cpu(cpu); 417 | +#ifndef CONFIG_BLD 418 | trigger_load_balance(rq); 419 | #endif 420 | +#endif 421 | rq_last_tick_reset(rq); 422 | } 423 | 424 | @@ -7126,6 +7158,15 @@ void __init sched_init(void) 425 | #endif 426 | init_rq_hrtick(rq); 427 | atomic_set(&rq->nr_iowait, 0); 428 | +#ifdef CONFIG_BLD 429 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 430 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 431 | + rq->cfs.pos = 0; 432 | + 433 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 434 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 435 | + rq->rt.lowbit = INT_MAX; 436 | +#endif 437 | } 438 | 439 | set_load_weight(&init_task); 440 | @@ -7166,6 +7207,9 @@ void __init sched_init(void) 441 | init_sched_fair_class(); 442 | 443 | scheduler_running = 1; 444 | +#ifdef CONFIG_BLD 445 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 446 | +#endif 447 | } 448 | 449 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 450 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 451 | index ef2b104..ea45f76 100644 452 | --- a/kernel/sched/fair.c 453 | +++ b/kernel/sched/fair.c 454 | @@ -4164,6 +4164,7 @@ static void task_waking_fair(struct task_struct *p) 455 | record_wakee(p); 456 | } 457 | 458 | +#ifndef CONFIG_BLD 459 | #ifdef CONFIG_FAIR_GROUP_SCHED 460 | /* 461 | * effective_load() calculates the load change as seen from the root_task_group 462 | @@ -4619,6 +4620,7 @@ unlock: 463 | 464 | return new_cpu; 465 | } 466 | +#endif /* CONFIG_BLD */ 467 | 468 | /* 469 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 470 | @@ -4914,6 +4916,7 @@ simple: 471 | return p; 472 | 473 | idle: 474 | +#ifndef CONFIG_BLD 475 | new_tasks = idle_balance(rq); 476 | /* 477 | * Because idle_balance() releases (and re-acquires) rq->lock, it is 478 | @@ -4925,7 +4928,7 @@ idle: 479 | 480 | if (new_tasks > 0) 481 | goto again; 482 | - 483 | +#endif 484 | return NULL; 485 | } 486 | 487 | @@ -7107,12 +7110,39 @@ static inline int on_null_domain(struct rq *rq) 488 | * needed, they will kick the idle load balancer, which then does idle 489 | * load balancing for all the idle CPUs. 490 | */ 491 | +#ifndef CONFIG_BLD 492 | static struct { 493 | cpumask_var_t idle_cpus_mask; 494 | atomic_t nr_cpus; 495 | unsigned long next_balance; /* in jiffy units */ 496 | } nohz ____cacheline_aligned; 497 | 498 | +static inline void nohz_balance_exit_idle(int cpu) 499 | +{ 500 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 501 | + /* 502 | + * Completely isolated CPUs don't ever set, so we must test. 
503 | + */ 504 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 505 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 506 | + atomic_dec(&nohz.nr_cpus); 507 | + } 508 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 509 | + } 510 | +} 511 | + 512 | +static int sched_ilb_notifier(struct notifier_block *nfb, 513 | + unsigned long action, void *hcpu) 514 | +{ 515 | + switch (action & ~CPU_TASKS_FROZEN) { 516 | + case CPU_DYING: 517 | + nohz_balance_exit_idle(smp_processor_id()); 518 | + return NOTIFY_OK; 519 | + default: 520 | + return NOTIFY_DONE; 521 | + } 522 | +} 523 | + 524 | static inline int find_new_ilb(void) 525 | { 526 | int ilb = cpumask_first(nohz.idle_cpus_mask); 527 | @@ -7150,20 +7180,7 @@ static void nohz_balancer_kick(void) 528 | smp_send_reschedule(ilb_cpu); 529 | return; 530 | } 531 | - 532 | -static inline void nohz_balance_exit_idle(int cpu) 533 | -{ 534 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 535 | - /* 536 | - * Completely isolated CPUs don't ever set, so we must test. 537 | - */ 538 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 539 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 540 | - atomic_dec(&nohz.nr_cpus); 541 | - } 542 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 543 | - } 544 | -} 545 | +#endif /* CONFIG_BLD */ 546 | 547 | static inline void set_cpu_sd_state_busy(void) 548 | { 549 | @@ -7205,6 +7222,7 @@ unlock: 550 | */ 551 | void nohz_balance_enter_idle(int cpu) 552 | { 553 | +#ifndef CONFIG_BLD 554 | /* 555 | * If this cpu is going down, then nothing needs to be done. 556 | */ 557 | @@ -7223,23 +7241,10 @@ void nohz_balance_enter_idle(int cpu) 558 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 559 | atomic_inc(&nohz.nr_cpus); 560 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 561 | -} 562 | - 563 | -static int sched_ilb_notifier(struct notifier_block *nfb, 564 | - unsigned long action, void *hcpu) 565 | -{ 566 | - switch (action & ~CPU_TASKS_FROZEN) { 567 | - case CPU_DYING: 568 | - nohz_balance_exit_idle(smp_processor_id()); 569 | - return NOTIFY_OK; 570 | - default: 571 | - return NOTIFY_DONE; 572 | - } 573 | +#endif 574 | } 575 | #endif 576 | 577 | -static DEFINE_SPINLOCK(balancing); 578 | - 579 | /* 580 | * Scale the max load_balance interval with the number of CPUs in the system. 581 | * This trades load-balance latency on larger machines for less cross talk. 582 | @@ -7249,6 +7254,9 @@ void update_max_interval(void) 583 | max_load_balance_interval = HZ*num_online_cpus()/10; 584 | } 585 | 586 | +#ifndef CONFIG_BLD 587 | +static DEFINE_SPINLOCK(balancing); 588 | + 589 | /* 590 | * It checks each scheduling domain to see if it is due to be balanced, 591 | * and initiates a balancing operation if so. 
592 | @@ -7497,6 +7505,7 @@ void trigger_load_balance(struct rq *rq) 593 | nohz_balancer_kick(); 594 | #endif 595 | } 596 | +#endif /* CONFIG_BLD */ 597 | 598 | static void rq_online_fair(struct rq *rq) 599 | { 600 | @@ -7942,7 +7951,9 @@ const struct sched_class fair_sched_class = { 601 | .put_prev_task = put_prev_task_fair, 602 | 603 | #ifdef CONFIG_SMP 604 | +#ifndef CONFIG_BLD 605 | .select_task_rq = select_task_rq_fair, 606 | +#endif 607 | .migrate_task_rq = migrate_task_rq_fair, 608 | 609 | .rq_online = rq_online_fair, 610 | @@ -7982,6 +7993,7 @@ void print_cfs_stats(struct seq_file *m, int cpu) 611 | 612 | __init void init_sched_fair_class(void) 613 | { 614 | +#ifndef CONFIG_BLD 615 | #ifdef CONFIG_SMP 616 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 617 | 618 | @@ -7991,5 +8003,5 @@ __init void init_sched_fair_class(void) 619 | cpu_notifier(sched_ilb_notifier, 0); 620 | #endif 621 | #endif /* SMP */ 622 | - 623 | +#endif /* BLD */ 624 | } 625 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 626 | index 20bca39..03a0ed3 100644 627 | --- a/kernel/sched/rt.c 628 | +++ b/kernel/sched/rt.c 629 | @@ -1295,6 +1295,7 @@ static void yield_task_rt(struct rq *rq) 630 | #ifdef CONFIG_SMP 631 | static int find_lowest_rq(struct task_struct *task); 632 | 633 | +#ifndef CONFIG_BLD 634 | static int 635 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 636 | { 637 | @@ -1348,6 +1349,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 638 | out: 639 | return cpu; 640 | } 641 | +#endif /* CONFIG_BLD */ 642 | 643 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 644 | { 645 | @@ -2111,7 +2113,9 @@ const struct sched_class rt_sched_class = { 646 | .put_prev_task = put_prev_task_rt, 647 | 648 | #ifdef CONFIG_SMP 649 | +#ifndef CONFIG_BLD 650 | .select_task_rq = select_task_rq_rt, 651 | +#endif 652 | 653 | .set_cpus_allowed = set_cpus_allowed_rt, 654 | .rq_online = rq_online_rt, 655 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 656 | index 2df8ef0..79fce51 100644 657 | --- a/kernel/sched/sched.h 658 | +++ b/kernel/sched/sched.h 659 | @@ -366,9 +366,8 @@ struct cfs_rq { 660 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 661 | #endif /* CONFIG_SMP */ 662 | 663 | -#ifdef CONFIG_FAIR_GROUP_SCHED 664 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 665 | - 666 | +#ifdef CONFIG_FAIR_GROUP_SCHED 667 | /* 668 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 669 | * a hierarchy). 
Non-leaf lrqs hold other higher schedulable entities 670 | @@ -392,6 +391,11 @@ struct cfs_rq { 671 | struct list_head throttled_list; 672 | #endif /* CONFIG_CFS_BANDWIDTH */ 673 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 674 | + 675 | +#ifdef CONFIG_BLD 676 | + struct list_head bld_cfs_list; 677 | + char pos; 678 | +#endif 679 | }; 680 | 681 | static inline int rt_bandwidth_enabled(void) 682 | @@ -425,12 +429,16 @@ struct rt_rq { 683 | /* Nests inside the rq lock: */ 684 | raw_spinlock_t rt_runtime_lock; 685 | 686 | + struct rq *rq; 687 | #ifdef CONFIG_RT_GROUP_SCHED 688 | unsigned long rt_nr_boosted; 689 | 690 | - struct rq *rq; 691 | struct task_group *tg; 692 | #endif 693 | +#ifdef CONFIG_BLD 694 | + struct list_head bld_rt_list; 695 | + int lowbit; 696 | +#endif 697 | }; 698 | 699 | /* Deadline class' related fields in a runqueue */ 700 | -------------------------------------------------------------------------------- /BLD-4.8.patch: -------------------------------------------------------------------------------- 1 | diff --git a/init/Kconfig b/init/Kconfig 2 | index cac3f09..4e49d16 100644 3 | --- a/init/Kconfig 4 | +++ b/init/Kconfig 5 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 6 | depends on BROKEN || !SMP 7 | default y 8 | 9 | +config BLD 10 | + bool "An alternate CPU load distribution technique for task scheduler" 11 | + depends on SMP 12 | + default y 13 | + help 14 | + This is an alternate CPU load distribution technique based for task 15 | + scheduler based on The Barbershop Load Distribution algorithm. Not 16 | + suitable for NUMA, should work well on SMP. 17 | + 18 | config INIT_ENV_ARG_LIMIT 19 | int 20 | default 32 if !UML 21 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 22 | new file mode 100644 23 | index 0000000..f1f9fba 24 | --- /dev/null 25 | +++ b/kernel/sched/bld.h 26 | @@ -0,0 +1,215 @@ 27 | +#ifdef CONFIG_BLD 28 | + 29 | +static DEFINE_RWLOCK(rt_list_lock); 30 | +static LIST_HEAD(rt_rq_head); 31 | +static LIST_HEAD(cfs_rq_head); 32 | +static DEFINE_RWLOCK(cfs_list_lock); 33 | + 34 | +#ifdef CONFIG_FAIR_GROUP_SCHED 35 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 36 | +{ 37 | + return cfs_rq->rq; 38 | +} 39 | +#else 40 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 41 | +{ 42 | + return container_of(cfs_rq, struct rq, cfs); 43 | +} 44 | +#endif 45 | + 46 | +#ifdef CONFIG_RT_GROUP_SCHED 47 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 48 | +{ 49 | + return rt_rq->rq; 50 | +} 51 | +#else 52 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 53 | +{ 54 | + return container_of(rt_rq, struct rq, rt); 55 | +} 56 | +#endif 57 | + 58 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 59 | +{ 60 | + int cpu = smp_processor_id(), i; 61 | + unsigned long load, varload; 62 | + struct rq *rq; 63 | + 64 | + if (task_type) { 65 | + varload = ULONG_MAX; 66 | + for_each_cpu(i, mask) { 67 | + rq = cpu_rq(i); 68 | + load = rq->cfs.load.weight; 69 | + if (load < varload) { 70 | + varload = load; 71 | + cpu = i; 72 | + } 73 | + } 74 | + } else { 75 | + /* Here's an attempt to get a CPU within the mask where 76 | + * we can preempt easily. To achieve this we tried to 77 | + * maintain a lowbit, which indicate the lowest bit set on 78 | + * array bitmap. Since all CPUs contains high priority 79 | + * kernel threads therefore we eliminate 0, so it might not 80 | + * be right every time, but it's just an indicator. 
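Put differently, the rt side prefers the CPU whose most urgent queued rt task is the least urgent among the candidates, so a waking rt task stands a good chance of preempting quickly. A standalone illustration with assumed bitmap values follows; the GCC builtin __builtin_ffsll stands in for the kernel's sched_find_first_bit().

/* Illustration of the lowbit heuristic: each CPU has a bitmap of
 * queued rt priorities (lower bit index = more urgent task), and the
 * wakeup picks the CPU whose first set bit is highest, i.e. the CPU
 * that is cheapest to preempt. Bitmap values are made up; in the
 * patch, sched_find_first_bit() on the rt_prio_array bitmap supplies
 * the per-runqueue "lowbit". */
#include <stdio.h>

int main(void)
{
	unsigned long long bitmap[3] = {
		1ULL << 3,	/* cpu0: rt task queued at priority 3 */
		1ULL << 40,	/* cpu1: only a task at priority 40 */
		1ULL << 10,	/* cpu2: rt task queued at priority 10 */
	};
	int i, firstbit, best_cpu = 0, best = 1;	/* bit 0 ignored, as in the patch */

	for (i = 0; i < 3; i++) {
		firstbit = __builtin_ffsll(bitmap[i]) - 1;
		printf("cpu%d lowbit=%d\n", i, firstbit);
		if (firstbit >= best) {
			best = firstbit;
			best_cpu = i;
		}
	}
	printf("wake the rt task on cpu%d\n", best_cpu);	/* -> cpu1 */
	return 0;
}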
81 | + */ 82 | + varload = 1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load >= varload) { 88 | + varload = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | + return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = firstbit; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index 44817c6..f0f3321 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
252 | */ 253 | 254 | #include 255 | @@ -87,6 +89,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -751,6 +754,8 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | if (!(flags & ENQUEUE_RESTORE)) 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -759,6 +764,8 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | if (!(flags & DEQUEUE_SAVE)) 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1588,11 +1595,17 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 282 | { 283 | lockdep_assert_held(&p->pi_lock); 284 | 285 | +#ifndef CONFIG_BLD 286 | if (tsk_nr_cpus_allowed(p) > 1) 287 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 288 | else 289 | cpu = cpumask_any(tsk_cpus_allowed(p)); 290 | - 291 | +#else 292 | + if (dl_task(p)) 293 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 294 | + else 295 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 296 | +#endif 297 | /* 298 | * In order not to call set_task_cpu() on a blocking task we need 299 | * to rely on ttwu() to place the task on a valid ->cpus_allowed 300 | @@ -1795,7 +1808,11 @@ void scheduler_ipi(void) 301 | */ 302 | preempt_fold_need_resched(); 303 | 304 | +#ifndef CONFIG_BLD 305 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 306 | +#else 307 | + if (llist_empty(&this_rq()->wake_list)) 308 | +#endif 309 | return; 310 | 311 | /* 312 | @@ -1817,13 +1834,16 @@ void scheduler_ipi(void) 313 | /* 314 | * Check if someone kicked us for doing the nohz idle load balance. 
315 | */ 316 | +#ifndef CONFIG_BLD 317 | if (unlikely(got_nohz_idle_kick())) { 318 | this_rq()->idle_balance = 1; 319 | raise_softirq_irqoff(SCHED_SOFTIRQ); 320 | } 321 | +#endif 322 | irq_exit(); 323 | } 324 | 325 | +#ifndef CONFIG_BLD 326 | static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) 327 | { 328 | struct rq *rq = cpu_rq(cpu); 329 | @@ -1837,6 +1857,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) 330 | trace_sched_wake_idle_without_ipi(cpu); 331 | } 332 | } 333 | +#endif /* CONFIG_BLD */ 334 | 335 | void wake_up_if_idle(int cpu) 336 | { 337 | @@ -1873,7 +1894,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) 338 | struct rq *rq = cpu_rq(cpu); 339 | struct pin_cookie cookie; 340 | 341 | -#if defined(CONFIG_SMP) 342 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 343 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 344 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 345 | ttwu_queue_remote(p, cpu, wake_flags); 346 | @@ -2971,7 +2992,14 @@ void sched_exec(void) 347 | int dest_cpu; 348 | 349 | raw_spin_lock_irqsave(&p->pi_lock, flags); 350 | +#ifndef CONFIG_BLD 351 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 352 | +#else 353 | + if (dl_task(p)) 354 | + dest_cpu = task_cpu(p); 355 | + else 356 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 357 | +#endif 358 | if (dest_cpu == smp_processor_id()) 359 | goto unlock; 360 | 361 | @@ -3078,8 +3106,10 @@ void scheduler_tick(void) 362 | 363 | #ifdef CONFIG_SMP 364 | rq->idle_balance = idle_cpu(cpu); 365 | +#ifndef CONFIG_BLD 366 | trigger_load_balance(rq); 367 | #endif 368 | +#endif 369 | rq_last_tick_reset(rq); 370 | } 371 | 372 | @@ -7313,7 +7343,9 @@ int sched_cpu_dying(unsigned int cpu) 373 | raw_spin_unlock_irqrestore(&rq->lock, flags); 374 | calc_load_migrate(rq); 375 | update_max_interval(); 376 | +#ifndef CONFIG_BLD 377 | nohz_balance_exit_idle(cpu); 378 | +#endif 379 | hrtick_clear(rq); 380 | return 0; 381 | } 382 | @@ -7519,6 +7551,15 @@ void __init sched_init(void) 383 | #endif /* CONFIG_SMP */ 384 | init_rq_hrtick(rq); 385 | atomic_set(&rq->nr_iowait, 0); 386 | +#ifdef CONFIG_BLD 387 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 388 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 389 | + rq->cfs.pos = 0; 390 | + 391 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 392 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 393 | + rq->rt.lowbit = INT_MAX; 394 | +#endif 395 | } 396 | 397 | set_load_weight(&init_task); 398 | @@ -7561,6 +7602,9 @@ void __init sched_init(void) 399 | init_schedstats(); 400 | 401 | scheduler_running = 1; 402 | +#ifdef CONFIG_BLD 403 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 404 | +#endif 405 | } 406 | 407 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 408 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 409 | index 039de34..f823e5b 100644 410 | --- a/kernel/sched/fair.c 411 | +++ b/kernel/sched/fair.c 412 | @@ -4924,6 +4924,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) 413 | return 0; 414 | } 415 | 416 | +#ifndef CONFIG_BLD 417 | #ifdef CONFIG_FAIR_GROUP_SCHED 418 | /* 419 | * effective_load() calculates the load change as seen from the root_task_group 420 | @@ -5455,6 +5456,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f 421 | 422 | return new_cpu; 423 | } 424 | +#endif /* CONFIG_BLD */ 425 | 426 | /* 427 | * Called immediately before a task is migrated to a new cpu; 
task_cpu(p) and 428 | @@ -5785,6 +5787,7 @@ idle: 429 | * further scheduler activity on it and we're being very careful to 430 | * re-start the picking loop. 431 | */ 432 | +#ifndef CONFIG_BLD 433 | lockdep_unpin_lock(&rq->lock, cookie); 434 | new_tasks = idle_balance(rq); 435 | lockdep_repin_lock(&rq->lock, cookie); 436 | @@ -5798,7 +5801,7 @@ idle: 437 | 438 | if (new_tasks > 0) 439 | goto again; 440 | - 441 | +#endif /* CONFIG_BLD */ 442 | return NULL; 443 | } 444 | 445 | @@ -6459,8 +6462,9 @@ static unsigned long task_h_load(struct task_struct *p) 446 | } 447 | #endif 448 | 449 | -/********** Helpers for find_busiest_group ************************/ 450 | +#ifndef CONFIG_BLD 451 | 452 | +/********** Helpers for find_busiest_group ************************/ 453 | enum group_type { 454 | group_other = 0, 455 | group_imbalanced, 456 | @@ -6551,6 +6555,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 457 | 458 | return load_idx; 459 | } 460 | +#endif /* CONFIG_BLD */ 461 | 462 | static unsigned long scale_rt_capacity(int cpu) 463 | { 464 | @@ -6659,6 +6664,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 465 | sdg->sgc->capacity = capacity; 466 | } 467 | 468 | +#ifndef CONFIG_BLD 469 | /* 470 | * Check whether the capacity of the rq has been noticeably reduced by side 471 | * activity. The imbalance_pct is used for the threshold. 472 | @@ -7892,6 +7898,7 @@ static inline int on_null_domain(struct rq *rq) 473 | { 474 | return unlikely(!rcu_dereference_sched(rq->sd)); 475 | } 476 | +#endif /* CONFIG_BLD */ 477 | 478 | #ifdef CONFIG_NO_HZ_COMMON 479 | /* 480 | @@ -7900,12 +7907,39 @@ static inline int on_null_domain(struct rq *rq) 481 | * needed, they will kick the idle load balancer, which then does idle 482 | * load balancing for all the idle CPUs. 483 | */ 484 | +#ifndef CONFIG_BLD 485 | static struct { 486 | cpumask_var_t idle_cpus_mask; 487 | atomic_t nr_cpus; 488 | unsigned long next_balance; /* in jiffy units */ 489 | } nohz ____cacheline_aligned; 490 | 491 | +void nohz_balance_exit_idle(unsigned int cpu) 492 | +{ 493 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 494 | + /* 495 | + * Completely isolated CPUs don't ever set, so we must test. 496 | + */ 497 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 498 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 499 | + atomic_dec(&nohz.nr_cpus); 500 | + } 501 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 502 | + } 503 | +} 504 | + 505 | +static int sched_ilb_notifier(struct notifier_block *nfb, 506 | + unsigned long action, void *hcpu) 507 | +{ 508 | + switch (action & ~CPU_TASKS_FROZEN) { 509 | + case CPU_DYING: 510 | + nohz_balance_exit_idle(smp_processor_id()); 511 | + return NOTIFY_OK; 512 | + default: 513 | + return NOTIFY_DONE; 514 | + } 515 | +} 516 | + 517 | static inline int find_new_ilb(void) 518 | { 519 | int ilb = cpumask_first(nohz.idle_cpus_mask); 520 | @@ -7944,20 +7978,6 @@ static void nohz_balancer_kick(void) 521 | return; 522 | } 523 | 524 | -void nohz_balance_exit_idle(unsigned int cpu) 525 | -{ 526 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 527 | - /* 528 | - * Completely isolated CPUs don't ever set, so we must test. 
529 | - */ 530 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 531 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 532 | - atomic_dec(&nohz.nr_cpus); 533 | - } 534 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 535 | - } 536 | -} 537 | - 538 | static inline void set_cpu_sd_state_busy(void) 539 | { 540 | struct sched_domain *sd; 541 | @@ -7974,6 +7994,8 @@ static inline void set_cpu_sd_state_busy(void) 542 | unlock: 543 | rcu_read_unlock(); 544 | } 545 | +#endif /* CONFIG_BLD */ 546 | +#endif /* NO_HZ_COMMON */ 547 | 548 | void set_cpu_sd_state_idle(void) 549 | { 550 | @@ -7998,6 +8020,7 @@ unlock: 551 | */ 552 | void nohz_balance_enter_idle(int cpu) 553 | { 554 | +#ifndef CONFIG_BLD 555 | /* 556 | * If this cpu is going down, then nothing needs to be done. 557 | */ 558 | @@ -8016,10 +8039,8 @@ void nohz_balance_enter_idle(int cpu) 559 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 560 | atomic_inc(&nohz.nr_cpus); 561 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 562 | -} 563 | #endif 564 | - 565 | -static DEFINE_SPINLOCK(balancing); 566 | +} 567 | 568 | /* 569 | * Scale the max load_balance interval with the number of CPUs in the system. 570 | @@ -8030,6 +8051,9 @@ void update_max_interval(void) 571 | max_load_balance_interval = HZ*num_online_cpus()/10; 572 | } 573 | 574 | +#ifndef CONFIG_BLD 575 | +static DEFINE_SPINLOCK(balancing); 576 | + 577 | /* 578 | * It checks each scheduling domain to see if it is due to be balanced, 579 | * and initiates a balancing operation if so. 580 | @@ -8317,6 +8341,7 @@ void trigger_load_balance(struct rq *rq) 581 | nohz_balancer_kick(); 582 | #endif 583 | } 584 | +#endif /* CONFIG_BLD */ 585 | 586 | static void rq_online_fair(struct rq *rq) 587 | { 588 | @@ -8332,7 +8357,6 @@ static void rq_offline_fair(struct rq *rq) 589 | /* Ensure any throttled groups are reachable by pick_next_task */ 590 | unthrottle_offline_cfs_rqs(rq); 591 | } 592 | - 593 | #endif /* CONFIG_SMP */ 594 | 595 | /* 596 | @@ -8791,7 +8815,9 @@ const struct sched_class fair_sched_class = { 597 | .put_prev_task = put_prev_task_fair, 598 | 599 | #ifdef CONFIG_SMP 600 | +#ifndef CONFIG_BLD 601 | .select_task_rq = select_task_rq_fair, 602 | +#endif 603 | .migrate_task_rq = migrate_task_rq_fair, 604 | 605 | .rq_online = rq_online_fair, 606 | @@ -8852,6 +8878,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) 607 | 608 | __init void init_sched_fair_class(void) 609 | { 610 | +#ifndef CONFIG_BLD 611 | #ifdef CONFIG_SMP 612 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 613 | 614 | @@ -8860,5 +8887,5 @@ __init void init_sched_fair_class(void) 615 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 616 | #endif 617 | #endif /* SMP */ 618 | - 619 | +#endif /* BLD */ 620 | } 621 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 622 | index d5690b7..6f3589e 100644 623 | --- a/kernel/sched/rt.c 624 | +++ b/kernel/sched/rt.c 625 | @@ -1375,6 +1375,7 @@ static void yield_task_rt(struct rq *rq) 626 | #ifdef CONFIG_SMP 627 | static int find_lowest_rq(struct task_struct *task); 628 | 629 | +#ifndef CONFIG_BLD 630 | static int 631 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 632 | { 633 | @@ -1430,6 +1431,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 634 | out: 635 | return cpu; 636 | } 637 | +#endif /* CONFIG_BLD */ 638 | 639 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 640 | { 641 | @@ -2335,7 +2337,9 @@ const struct sched_class rt_sched_class = { 642 | 
.put_prev_task = put_prev_task_rt, 643 | 644 | #ifdef CONFIG_SMP 645 | +#ifndef CONFIG_BLD 646 | .select_task_rq = select_task_rq_rt, 647 | +#endif 648 | 649 | .set_cpus_allowed = set_cpus_allowed_common, 650 | .rq_online = rq_online_rt, 651 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 652 | index c64fc51..a1d329b 100644 653 | --- a/kernel/sched/sched.h 654 | +++ b/kernel/sched/sched.h 655 | @@ -416,9 +416,8 @@ struct cfs_rq { 656 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 657 | #endif /* CONFIG_SMP */ 658 | 659 | -#ifdef CONFIG_FAIR_GROUP_SCHED 660 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 661 | - 662 | +#ifdef CONFIG_FAIR_GROUP_SCHED 663 | /* 664 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 665 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 666 | @@ -442,6 +441,11 @@ struct cfs_rq { 667 | struct list_head throttled_list; 668 | #endif /* CONFIG_CFS_BANDWIDTH */ 669 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 670 | + 671 | +#ifdef CONFIG_BLD 672 | + struct list_head bld_cfs_list; 673 | + char pos; 674 | +#endif 675 | }; 676 | 677 | static inline int rt_bandwidth_enabled(void) 678 | @@ -487,12 +491,16 @@ struct rt_rq { 679 | /* Nests inside the rq lock: */ 680 | raw_spinlock_t rt_runtime_lock; 681 | 682 | + struct rq *rq; 683 | #ifdef CONFIG_RT_GROUP_SCHED 684 | unsigned long rt_nr_boosted; 685 | 686 | - struct rq *rq; 687 | struct task_group *tg; 688 | #endif 689 | +#ifdef CONFIG_BLD 690 | + struct list_head bld_rt_list; 691 | + int lowbit; 692 | +#endif 693 | }; 694 | 695 | /* Deadline class' related fields in a runqueue */ 696 | -------------------------------------------------------------------------------- /BLD-4.1.patch: -------------------------------------------------------------------------------- 1 | BLD patch for Linux-4.1. Just code rebase on Linux-4.1. 2 | 3 | 4 | diff --git a/init/Kconfig b/init/Kconfig 5 | index dc24dec..87860d4 100644 6 | --- a/init/Kconfig 7 | +++ b/init/Kconfig 8 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 9 | depends on BROKEN || !SMP 10 | default y 11 | 12 | +config BLD 13 | + bool "An alternate CPU load distribution technique for task scheduler" 14 | + depends on SMP 15 | + default y 16 | + help 17 | + This is an alternate CPU load distribution technique based for task 18 | + scheduler based on The Barbershop Load Distribution algorithm. Not 19 | + suitable for NUMA, should work well on SMP. 
20 | + 21 | config INIT_ENV_ARG_LIMIT 22 | int 23 | default 32 if !UML 24 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 25 | new file mode 100644 26 | index 0000000..f1f9fba 27 | --- /dev/null 28 | +++ b/kernel/sched/bld.h 29 | @@ -0,0 +1,215 @@ 30 | +#ifdef CONFIG_BLD 31 | + 32 | +static DEFINE_RWLOCK(rt_list_lock); 33 | +static LIST_HEAD(rt_rq_head); 34 | +static LIST_HEAD(cfs_rq_head); 35 | +static DEFINE_RWLOCK(cfs_list_lock); 36 | + 37 | +#ifdef CONFIG_FAIR_GROUP_SCHED 38 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 39 | +{ 40 | + return cfs_rq->rq; 41 | +} 42 | +#else 43 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 44 | +{ 45 | + return container_of(cfs_rq, struct rq, cfs); 46 | +} 47 | +#endif 48 | + 49 | +#ifdef CONFIG_RT_GROUP_SCHED 50 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 51 | +{ 52 | + return rt_rq->rq; 53 | +} 54 | +#else 55 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 56 | +{ 57 | + return container_of(rt_rq, struct rq, rt); 58 | +} 59 | +#endif 60 | + 61 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 62 | +{ 63 | + int cpu = smp_processor_id(), i; 64 | + unsigned long load, varload; 65 | + struct rq *rq; 66 | + 67 | + if (task_type) { 68 | + varload = ULONG_MAX; 69 | + for_each_cpu(i, mask) { 70 | + rq = cpu_rq(i); 71 | + load = rq->cfs.load.weight; 72 | + if (load < varload) { 73 | + varload = load; 74 | + cpu = i; 75 | + } 76 | + } 77 | + } else { 78 | + /* Here's an attempt to get a CPU within the mask where 79 | + * we can preempt easily. To achieve this we tried to 80 | + * maintain a lowbit, which indicate the lowest bit set on 81 | + * array bitmap. Since all CPUs contains high priority 82 | + * kernel threads therefore we eliminate 0, so it might not 83 | + * be right every time, but it's just an indicator. 
84 | + */ 85 | + varload = 1; 86 | + 87 | + for_each_cpu(i, mask) { 88 | + rq = cpu_rq(i); 89 | + load = rq->rt.lowbit; 90 | + if (load >= varload) { 91 | + varload = load; 92 | + cpu = i; 93 | + } 94 | + } 95 | + } 96 | + 97 | + return cpu; 98 | +} 99 | + 100 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 101 | +{ 102 | + struct cfs_rq *cfs; 103 | + unsigned long flags; 104 | + unsigned int cpu = smp_processor_id(); 105 | + 106 | + read_lock_irqsave(&cfs_list_lock, flags); 107 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 108 | + cpu = cpu_of(rq_of_cfs(cfs)); 109 | + if (cpu_online(cpu)) 110 | + break; 111 | + } 112 | + read_unlock_irqrestore(&cfs_list_lock, flags); 113 | + return cpu; 114 | +} 115 | + 116 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 117 | +{ 118 | + struct rt_rq *rt; 119 | + unsigned long flags; 120 | + unsigned int cpu = smp_processor_id(); 121 | + 122 | + read_lock_irqsave(&rt_list_lock, flags); 123 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 124 | + cpu = cpu_of(rq_of_rt(rt)); 125 | + if (cpu_online(cpu)) 126 | + break; 127 | + } 128 | + read_unlock_irqrestore(&rt_list_lock, flags); 129 | + return cpu; 130 | +} 131 | + 132 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 133 | +{ 134 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 135 | + struct cpumask *tmpmask; 136 | + 137 | + if (p->nr_cpus_allowed == 1) 138 | + return task_cpu(p); 139 | + 140 | + if (sd_flags & SD_BALANCE_WAKE) { 141 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 142 | + want_affine = 1; 143 | + } 144 | + } 145 | + 146 | + if (want_affine) 147 | + tmpmask = tsk_cpus_allowed(p); 148 | + else 149 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 150 | + 151 | + if (rt_task(p)) 152 | + cpu = select_cpu_for_wakeup(0, tmpmask); 153 | + else 154 | + cpu = select_cpu_for_wakeup(1, tmpmask); 155 | + 156 | + return cpu; 157 | +} 158 | + 159 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 160 | +{ 161 | + unsigned long flag; 162 | + int firstbit; 163 | + struct rt_rq *first; 164 | + struct rt_prio_array *array = &rq->rt.active; 165 | + 166 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 167 | + firstbit = sched_find_first_bit(array->bitmap); 168 | + 169 | + /* Maintaining rt.lowbit */ 170 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 171 | + rq->rt.lowbit = firstbit; 172 | + 173 | + if (rq->rt.lowbit < first->lowbit) { 174 | + write_lock_irqsave(&rt_list_lock, flag); 175 | + list_del(&rq->rt.bld_rt_list); 176 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 177 | + write_unlock_irqrestore(&rt_list_lock, flag); 178 | + } 179 | +} 180 | + 181 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 182 | +{ 183 | + unsigned int cpu; 184 | + 185 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 186 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 187 | + else { 188 | + if (rt_task(p)) 189 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 190 | + else 191 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 192 | + } 193 | + 194 | + return cpu; 195 | +} 196 | + 197 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 198 | +{ 199 | + unsigned long flag; 200 | + if (rt_task(p)) { 201 | + track_load_rt(rq, p); 202 | + } else { 203 | + if (rq->cfs.pos != 2) { 204 | + struct cfs_rq *last; 205 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 206 | + if (rq->cfs.load.weight >= last->load.weight) { 207 | + write_lock_irqsave(&cfs_list_lock, flag); 208 | + list_del(&rq->cfs.bld_cfs_list); 209 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 210 | + rq->cfs.pos = 2; last->pos = 1; 211 | + write_unlock_irqrestore(&cfs_list_lock, flag); 212 | + } 213 | + } 214 | + } 215 | +} 216 | + 217 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 218 | +{ 219 | + unsigned long flag; 220 | + if (rt_task(p)) { 221 | + track_load_rt(rq, p); 222 | + } else { 223 | + if (rq->cfs.pos != 0) { 224 | + struct cfs_rq *first; 225 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 226 | + if (rq->cfs.load.weight <= first->load.weight) { 227 | + write_lock_irqsave(&cfs_list_lock, flag); 228 | + list_del(&rq->cfs.bld_cfs_list); 229 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 230 | + rq->cfs.pos = 0; first->pos = 1; 231 | + write_unlock_irqrestore(&cfs_list_lock, flag); 232 | + } 233 | + } 234 | + } 235 | +} 236 | +#else 237 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 238 | +{ 239 | +} 240 | + 241 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 242 | +{ 243 | +} 244 | +#endif /* CONFIG_BLD */ 245 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 246 | index 1236732..5fb8bf7 100644 247 | --- a/kernel/sched/core.c 248 | +++ b/kernel/sched/core.c 249 | @@ -24,6 +24,8 @@ 250 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 251 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 252 | * Thomas Gleixner, Mike Kravetz 253 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 254 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
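The bld_track_load_activate()/bld_track_load_deactivate() helpers added in bld.h above keep cfs_rq_head only approximately ordered: an enqueue can push a runqueue to the tail, a dequeue can pull it to the head, and cfs.pos merely records which end the runqueue last landed on (0 = head, 2 = tail, 1 = neither). The sketch below models that move-to-front/move-to-back behaviour with plain arrays and made-up loads; it is an illustration, not the patch code.

/* Simplified model of the ordering the helpers above maintain: when a
 * runqueue becomes at least as heavy as the current tail it moves to
 * the tail, and when it becomes at least as light as the current head
 * it moves to the head. Plain arrays, made-up loads. */
#include <stdio.h>

#define NR_CPUS 4

static unsigned long load[NR_CPUS];
static int order[NR_CPUS] = {0, 1, 2, 3};	/* head ... tail */

static void move_cpu(int cpu, int to_tail)
{
	int i, j;

	for (i = 0; order[i] != cpu; i++)
		;
	if (to_tail) {
		for (j = i; j < NR_CPUS - 1; j++)
			order[j] = order[j + 1];
		order[NR_CPUS - 1] = cpu;
	} else {
		for (j = i; j > 0; j--)
			order[j] = order[j - 1];
		order[0] = cpu;
	}
}

static void activate(int cpu, unsigned long w)		/* enqueue side */
{
	load[cpu] += w;
	if (load[cpu] >= load[order[NR_CPUS - 1]])
		move_cpu(cpu, 1);
}

static void deactivate(int cpu, unsigned long w)	/* dequeue side */
{
	load[cpu] -= w;
	if (load[cpu] <= load[order[0]])
		move_cpu(cpu, 0);
}

int main(void)
{
	int i;

	activate(2, 3072);	/* cpu2 takes on three tasks' worth of load */
	activate(1, 1024);	/* lighter than the current tail, so cpu1 stays put */
	deactivate(2, 2048);	/* cpu2 drains most of it again */

	for (i = 0; i < NR_CPUS; i++)
		printf("%scpu%d(%lu)", i ? " -> " : "head ", order[i], load[order[i]]);
	printf(" <- tail\n");
	return 0;
}

Note that cpu1 ends up ahead of the still-idle cpu3: the list is only approximately sorted, which is exactly the cheap trade-off these helpers make on the enqueue/dequeue fast path.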
255 | */ 256 | 257 | #include 258 | @@ -86,6 +88,7 @@ 259 | #include "sched.h" 260 | #include "../workqueue_internal.h" 261 | #include "../smpboot.h" 262 | +#include "bld.h" 263 | 264 | #define CREATE_TRACE_POINTS 265 | #include 266 | @@ -807,6 +810,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 267 | update_rq_clock(rq); 268 | sched_info_queued(rq, p); 269 | p->sched_class->enqueue_task(rq, p, flags); 270 | + if (!dl_task(p)) 271 | + bld_track_load_activate(rq, p); 272 | } 273 | 274 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 275 | @@ -814,6 +819,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 276 | update_rq_clock(rq); 277 | sched_info_dequeued(rq, p); 278 | p->sched_class->dequeue_task(rq, p, flags); 279 | + if (!dl_task(p)) 280 | + bld_track_load_deactivate(rq, p); 281 | } 282 | 283 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 284 | @@ -1379,7 +1386,14 @@ static inline 285 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 286 | { 287 | if (p->nr_cpus_allowed > 1) 288 | +#ifndef CONFIG_BLD 289 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 290 | +#else 291 | + if(dl_task(p)) 292 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 293 | + else 294 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 295 | +#endif 296 | 297 | /* 298 | * In order not to call set_task_cpu() on a blocking task we need 299 | @@ -1549,7 +1563,11 @@ void scheduler_ipi(void) 300 | */ 301 | preempt_fold_need_resched(); 302 | 303 | +#ifndef CONFIG_BLD 304 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 305 | +#else 306 | + if (llist_empty(&this_rq()->wake_list)) 307 | +#endif 308 | return; 309 | 310 | /* 311 | @@ -1571,13 +1589,16 @@ void scheduler_ipi(void) 312 | /* 313 | * Check if someone kicked us for doing the nohz idle load balance. 314 | */ 315 | +#ifndef CONFIG_BLD 316 | if (unlikely(got_nohz_idle_kick())) { 317 | this_rq()->idle_balance = 1; 318 | raise_softirq_irqoff(SCHED_SOFTIRQ); 319 | } 320 | +#endif 321 | irq_exit(); 322 | } 323 | 324 | +#ifndef CONFIG_BLD 325 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 326 | { 327 | struct rq *rq = cpu_rq(cpu); 328 | @@ -1590,6 +1611,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 329 | } 330 | } 331 | 332 | +#endif 333 | + 334 | +bool cpus_share_cache(int this_cpu, int that_cpu) 335 | +{ 336 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 337 | +} 338 | + 339 | void wake_up_if_idle(int cpu) 340 | { 341 | struct rq *rq = cpu_rq(cpu); 342 | @@ -1613,18 +1641,13 @@ void wake_up_if_idle(int cpu) 343 | out: 344 | rcu_read_unlock(); 345 | } 346 | - 347 | -bool cpus_share_cache(int this_cpu, int that_cpu) 348 | -{ 349 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 350 | -} 351 | #endif /* CONFIG_SMP */ 352 | 353 | static void ttwu_queue(struct task_struct *p, int cpu) 354 | { 355 | struct rq *rq = cpu_rq(cpu); 356 | 357 | -#if defined(CONFIG_SMP) 358 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 359 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 360 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 361 | ttwu_queue_remote(p, cpu); 362 | @@ -1948,7 +1971,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 363 | * Silence PROVE_RCU. 
364 | */ 365 | raw_spin_lock_irqsave(&p->pi_lock, flags); 366 | - set_task_cpu(p, cpu); 367 | + __set_task_cpu(p, cpu); 368 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 369 | 370 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 371 | @@ -2415,7 +2438,14 @@ void sched_exec(void) 372 | int dest_cpu; 373 | 374 | raw_spin_lock_irqsave(&p->pi_lock, flags); 375 | +#ifndef CONFIG_BLD 376 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 377 | +#else 378 | + if (dl_task(p)) 379 | + dest_cpu = task_cpu(p); 380 | + else 381 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 382 | +#endif 383 | if (dest_cpu == smp_processor_id()) 384 | goto unlock; 385 | 386 | @@ -2503,8 +2533,10 @@ void scheduler_tick(void) 387 | 388 | #ifdef CONFIG_SMP 389 | rq->idle_balance = idle_cpu(cpu); 390 | +#ifndef CONFIG_BLD 391 | trigger_load_balance(rq); 392 | #endif 393 | +#endif 394 | rq_last_tick_reset(rq); 395 | } 396 | 397 | @@ -7221,6 +7253,15 @@ void __init sched_init(void) 398 | #endif 399 | init_rq_hrtick(rq); 400 | atomic_set(&rq->nr_iowait, 0); 401 | +#ifdef CONFIG_BLD 402 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 403 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 404 | + rq->cfs.pos = 0; 405 | + 406 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 407 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 408 | + rq->rt.lowbit = INT_MAX; 409 | +#endif 410 | } 411 | 412 | set_load_weight(&init_task); 413 | @@ -7261,6 +7302,9 @@ void __init sched_init(void) 414 | init_sched_fair_class(); 415 | 416 | scheduler_running = 1; 417 | +#ifdef CONFIG_BLD 418 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 419 | +#endif 420 | } 421 | 422 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 423 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 424 | index c2980e8..ffe8e78 100644 425 | --- a/kernel/sched/fair.c 426 | +++ b/kernel/sched/fair.c 427 | @@ -4424,6 +4424,7 @@ static void task_waking_fair(struct task_struct *p) 428 | record_wakee(p); 429 | } 430 | 431 | +#ifndef CONFIG_BLD 432 | #ifdef CONFIG_FAIR_GROUP_SCHED 433 | /* 434 | * effective_load() calculates the load change as seen from the root_task_group 435 | @@ -4903,6 +4904,7 @@ unlock: 436 | 437 | return new_cpu; 438 | } 439 | +#endif /* CONFIG_BLD */ 440 | 441 | /* 442 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 443 | @@ -5198,6 +5200,7 @@ simple: 444 | return p; 445 | 446 | idle: 447 | +#ifndef CONFIG_BLD 448 | new_tasks = idle_balance(rq); 449 | /* 450 | * Because idle_balance() releases (and re-acquires) rq->lock, it is 451 | @@ -5209,7 +5212,7 @@ idle: 452 | 453 | if (new_tasks > 0) 454 | goto again; 455 | - 456 | +#endif 457 | return NULL; 458 | } 459 | 460 | @@ -5921,8 +5924,9 @@ static unsigned long task_h_load(struct task_struct *p) 461 | } 462 | #endif 463 | 464 | -/********** Helpers for find_busiest_group ************************/ 465 | +#ifndef CONFIG_BLD 466 | 467 | +/********** Helpers for find_busiest_group ************************/ 468 | enum group_type { 469 | group_other = 0, 470 | group_imbalanced, 471 | @@ -6014,6 +6018,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 472 | return load_idx; 473 | } 474 | 475 | +#endif /* CONFIG_BLD */ 476 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) 477 | { 478 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) 479 | @@ -6141,6 +6146,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 480 | sdg->sgc->capacity = 
capacity; 481 | } 482 | 483 | +#ifndef CONFIG_BLD 484 | /* 485 | * Check whether the capacity of the rq has been noticeably reduced by side 486 | * activity. The imbalance_pct is used for the threshold. 487 | @@ -7377,6 +7383,8 @@ static inline int on_null_domain(struct rq *rq) 488 | return unlikely(!rcu_dereference_sched(rq->sd)); 489 | } 490 | 491 | +#endif /* CONFIG_BLD */ 492 | + 493 | #ifdef CONFIG_NO_HZ_COMMON 494 | /* 495 | * idle load balancing details 496 | @@ -7384,12 +7392,39 @@ static inline int on_null_domain(struct rq *rq) 497 | * needed, they will kick the idle load balancer, which then does idle 498 | * load balancing for all the idle CPUs. 499 | */ 500 | +#ifndef CONFIG_BLD 501 | static struct { 502 | cpumask_var_t idle_cpus_mask; 503 | atomic_t nr_cpus; 504 | unsigned long next_balance; /* in jiffy units */ 505 | } nohz ____cacheline_aligned; 506 | 507 | +static inline void nohz_balance_exit_idle(int cpu) 508 | +{ 509 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 510 | + /* 511 | + * Completely isolated CPUs don't ever set, so we must test. 512 | + */ 513 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 514 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 515 | + atomic_dec(&nohz.nr_cpus); 516 | + } 517 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 518 | + } 519 | +} 520 | + 521 | +static int sched_ilb_notifier(struct notifier_block *nfb, 522 | + unsigned long action, void *hcpu) 523 | +{ 524 | + switch (action & ~CPU_TASKS_FROZEN) { 525 | + case CPU_DYING: 526 | + nohz_balance_exit_idle(smp_processor_id()); 527 | + return NOTIFY_OK; 528 | + default: 529 | + return NOTIFY_DONE; 530 | + } 531 | +} 532 | + 533 | static inline int find_new_ilb(void) 534 | { 535 | int ilb = cpumask_first(nohz.idle_cpus_mask); 536 | @@ -7427,20 +7462,7 @@ static void nohz_balancer_kick(void) 537 | smp_send_reschedule(ilb_cpu); 538 | return; 539 | } 540 | - 541 | -static inline void nohz_balance_exit_idle(int cpu) 542 | -{ 543 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 544 | - /* 545 | - * Completely isolated CPUs don't ever set, so we must test. 546 | - */ 547 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 548 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 549 | - atomic_dec(&nohz.nr_cpus); 550 | - } 551 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 552 | - } 553 | -} 554 | +#endif /* CONFIG_BLD */ 555 | 556 | static inline void set_cpu_sd_state_busy(void) 557 | { 558 | @@ -7482,6 +7504,7 @@ unlock: 559 | */ 560 | void nohz_balance_enter_idle(int cpu) 561 | { 562 | +#ifndef CONFIG_BLD 563 | /* 564 | * If this cpu is going down, then nothing needs to be done. 565 | */ 566 | @@ -7500,23 +7523,10 @@ void nohz_balance_enter_idle(int cpu) 567 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 568 | atomic_inc(&nohz.nr_cpus); 569 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 570 | -} 571 | - 572 | -static int sched_ilb_notifier(struct notifier_block *nfb, 573 | - unsigned long action, void *hcpu) 574 | -{ 575 | - switch (action & ~CPU_TASKS_FROZEN) { 576 | - case CPU_DYING: 577 | - nohz_balance_exit_idle(smp_processor_id()); 578 | - return NOTIFY_OK; 579 | - default: 580 | - return NOTIFY_DONE; 581 | - } 582 | +#endif 583 | } 584 | #endif 585 | 586 | -static DEFINE_SPINLOCK(balancing); 587 | - 588 | /* 589 | * Scale the max load_balance interval with the number of CPUs in the system. 590 | * This trades load-balance latency on larger machines for less cross talk. 
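The core.c hunks above hook BLD into the common enqueue/dequeue path: every time a non-deadline task is queued or dequeued, bld_track_load_activate()/_deactivate() get a chance to reposition that runqueue in the global lists. A compilable toy sketch of that hook shape (stub types only, not the patch's actual code):

/* Sketch of the enqueue-side hook (toy types; the real hooks are
 * bld_track_load_activate()/_deactivate() and skip deadline tasks).    */
#include <stdio.h>

struct toy_task { int is_dl; int weight; };
struct toy_rq   { int load; };

static void class_enqueue(struct toy_rq *rq, struct toy_task *p)
{
	rq->load += p->weight;              /* what the sched class does  */
}

static void bld_track_activate(struct toy_rq *rq, struct toy_task *p)
{
	printf("reposition rq (load %d, +%d) in the global BLD list\n",
	       rq->load, p->weight);
}

static void enqueue_task(struct toy_rq *rq, struct toy_task *p)
{
	class_enqueue(rq, p);
	if (!p->is_dl)                      /* deadline tasks are skipped */
		bld_track_activate(rq, p);
}

int main(void)
{
	struct toy_rq rq = { 0 };
	struct toy_task p = { .is_dl = 0, .weight = 1024 };

	enqueue_task(&rq, &p);
	return 0;
}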
591 | @@ -7526,6 +7536,9 @@ void update_max_interval(void) 592 | max_load_balance_interval = HZ*num_online_cpus()/10; 593 | } 594 | 595 | +#ifndef CONFIG_BLD 596 | +static DEFINE_SPINLOCK(balancing); 597 | + 598 | /* 599 | * It checks each scheduling domain to see if it is due to be balanced, 600 | * and initiates a balancing operation if so. 601 | @@ -7787,6 +7800,7 @@ void trigger_load_balance(struct rq *rq) 602 | nohz_balancer_kick(); 603 | #endif 604 | } 605 | +#endif /* CONFIG_BLD */ 606 | 607 | static void rq_online_fair(struct rq *rq) 608 | { 609 | @@ -8232,7 +8246,9 @@ const struct sched_class fair_sched_class = { 610 | .put_prev_task = put_prev_task_fair, 611 | 612 | #ifdef CONFIG_SMP 613 | +#ifndef CONFIG_BLD 614 | .select_task_rq = select_task_rq_fair, 615 | +#endif 616 | .migrate_task_rq = migrate_task_rq_fair, 617 | 618 | .rq_online = rq_online_fair, 619 | @@ -8272,6 +8288,7 @@ void print_cfs_stats(struct seq_file *m, int cpu) 620 | 621 | __init void init_sched_fair_class(void) 622 | { 623 | +#ifndef CONFIG_BLD 624 | #ifdef CONFIG_SMP 625 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 626 | 627 | @@ -8281,5 +8298,5 @@ __init void init_sched_fair_class(void) 628 | cpu_notifier(sched_ilb_notifier, 0); 629 | #endif 630 | #endif /* SMP */ 631 | - 632 | +#endif /* BLD */ 633 | } 634 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 635 | index 575da76..105468eb 100644 636 | --- a/kernel/sched/rt.c 637 | +++ b/kernel/sched/rt.c 638 | @@ -1310,6 +1310,7 @@ static void yield_task_rt(struct rq *rq) 639 | #ifdef CONFIG_SMP 640 | static int find_lowest_rq(struct task_struct *task); 641 | 642 | +#ifndef CONFIG_BLD 643 | static int 644 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 645 | { 646 | @@ -1365,6 +1366,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 647 | out: 648 | return cpu; 649 | } 650 | +#endif /* CONFIG_BLD */ 651 | 652 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 653 | { 654 | @@ -2309,7 +2311,9 @@ const struct sched_class rt_sched_class = { 655 | .put_prev_task = put_prev_task_rt, 656 | 657 | #ifdef CONFIG_SMP 658 | +#ifndef CONFIG_BLD 659 | .select_task_rq = select_task_rq_rt, 660 | +#endif 661 | 662 | .set_cpus_allowed = set_cpus_allowed_rt, 663 | .rq_online = rq_online_rt, 664 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 665 | index e0e1299..11aad03 100644 666 | --- a/kernel/sched/sched.h 667 | +++ b/kernel/sched/sched.h 668 | @@ -392,9 +392,8 @@ struct cfs_rq { 669 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 670 | #endif /* CONFIG_SMP */ 671 | 672 | -#ifdef CONFIG_FAIR_GROUP_SCHED 673 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 674 | - 675 | +#ifdef CONFIG_FAIR_GROUP_SCHED 676 | /* 677 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 678 | * a hierarchy). 
Non-leaf lrqs hold other higher schedulable entities 679 | @@ -418,6 +417,11 @@ struct cfs_rq { 680 | struct list_head throttled_list; 681 | #endif /* CONFIG_CFS_BANDWIDTH */ 682 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 683 | + 684 | +#ifdef CONFIG_BLD 685 | + struct list_head bld_cfs_list; 686 | + char pos; 687 | +#endif 688 | }; 689 | 690 | static inline int rt_bandwidth_enabled(void) 691 | @@ -462,12 +466,16 @@ struct rt_rq { 692 | /* Nests inside the rq lock: */ 693 | raw_spinlock_t rt_runtime_lock; 694 | 695 | + struct rq *rq; 696 | #ifdef CONFIG_RT_GROUP_SCHED 697 | unsigned long rt_nr_boosted; 698 | 699 | - struct rq *rq; 700 | struct task_group *tg; 701 | #endif 702 | +#ifdef CONFIG_BLD 703 | + struct list_head bld_rt_list; 704 | + int lowbit; 705 | +#endif 706 | }; 707 | 708 | /* Deadline class' related fields in a runqueue */ 709 | -------------------------------------------------------------------------------- /BLD-4.4.patch: -------------------------------------------------------------------------------- 1 | diff --git a/init/Kconfig b/init/Kconfig 2 | index 235c7a2..01a91fb 100644 3 | --- a/init/Kconfig 4 | +++ b/init/Kconfig 5 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 6 | depends on BROKEN || !SMP 7 | default y 8 | 9 | +config BLD 10 | + bool "An alternate CPU load distribution technique for task scheduler" 11 | + depends on SMP 12 | + default y 13 | + help 14 | + This is an alternate CPU load distribution technique based for task 15 | + scheduler based on The Barbershop Load Distribution algorithm. Not 16 | + suitable for NUMA, should work well on SMP. 17 | + 18 | config INIT_ENV_ARG_LIMIT 19 | int 20 | default 32 if !UML 21 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 22 | new file mode 100644 23 | index 0000000..f1f9fba 24 | --- /dev/null 25 | +++ b/kernel/sched/bld.h 26 | @@ -0,0 +1,215 @@ 27 | +#ifdef CONFIG_BLD 28 | + 29 | +static DEFINE_RWLOCK(rt_list_lock); 30 | +static LIST_HEAD(rt_rq_head); 31 | +static LIST_HEAD(cfs_rq_head); 32 | +static DEFINE_RWLOCK(cfs_list_lock); 33 | + 34 | +#ifdef CONFIG_FAIR_GROUP_SCHED 35 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 36 | +{ 37 | + return cfs_rq->rq; 38 | +} 39 | +#else 40 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 41 | +{ 42 | + return container_of(cfs_rq, struct rq, cfs); 43 | +} 44 | +#endif 45 | + 46 | +#ifdef CONFIG_RT_GROUP_SCHED 47 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 48 | +{ 49 | + return rt_rq->rq; 50 | +} 51 | +#else 52 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 53 | +{ 54 | + return container_of(rt_rq, struct rq, rt); 55 | +} 56 | +#endif 57 | + 58 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 59 | +{ 60 | + int cpu = smp_processor_id(), i; 61 | + unsigned long load, varload; 62 | + struct rq *rq; 63 | + 64 | + if (task_type) { 65 | + varload = ULONG_MAX; 66 | + for_each_cpu(i, mask) { 67 | + rq = cpu_rq(i); 68 | + load = rq->cfs.load.weight; 69 | + if (load < varload) { 70 | + varload = load; 71 | + cpu = i; 72 | + } 73 | + } 74 | + } else { 75 | + /* Here's an attempt to get a CPU within the mask where 76 | + * we can preempt easily. To achieve this we tried to 77 | + * maintain a lowbit, which indicate the lowest bit set on 78 | + * array bitmap. Since all CPUs contains high priority 79 | + * kernel threads therefore we eliminate 0, so it might not 80 | + * be right every time, but it's just an indicator. 
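The lowbit idea in the comment above can be modelled outside the kernel: each CPU exposes the index of the lowest set bit in its RT priority bitmap (a small index means high-priority work is queued), and the wakeup path picks the CPU whose best queued work has the lowest priority, since that is the easiest one to preempt. A hedged sketch, assuming the per-CPU lowbit values are already known (in the patch they come from sched_find_first_bit() on rt.active.bitmap):

/* Model of the lowbit pick (assumption: per-CPU lowbit values are given;
 * in the patch they come from sched_find_first_bit() on rt.active).    */
#include <stdio.h>

#define NR_TOY_CPUS 4

/* A larger lowbit means the best queued RT work on that CPU is of lower
 * priority, so a newly woken RT task can preempt it more easily.        */
static int pick_rt_cpu(const int lowbit[], int ncpus)
{
	int cpu = 0, best = 1, i;           /* start at 1: bit 0 ignored  */

	for (i = 0; i < ncpus; i++) {
		if (lowbit[i] >= best) {
			best = lowbit[i];
			cpu = i;
		}
	}
	return cpu;
}

int main(void)
{
	int lowbit[NR_TOY_CPUS] = { 5, 40, 12, 99 };   /* 99: nearly idle */

	printf("wake the RT task on cpu %d\n", pick_rt_cpu(lowbit, NR_TOY_CPUS));
	return 0;
}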
81 | + */ 82 | + varload = 1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load >= varload) { 88 | + varload = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | + return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = firstbit; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index 732e993..ffb231a 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
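bld_get_cpu() above is the single entry point the core scheduler calls instead of the per-class select_task_rq() hooks: wakeups (and exec balancing for multi-threaded tasks) scan a cpumask for the emptiest runqueue, while everything else simply takes the head of the globally ordered rt or cfs list. A simplified, compilable decision tree with stubbed helpers (an illustration of the control flow, not the patch's code):

/* Simplified decision tree of bld_get_cpu() (stubbed helpers; an
 * illustration of the control flow only, not the patch's code).         */
#include <stdio.h>

enum { BAL_WAKE = 1, BAL_EXEC = 2 };    /* toy stand-ins for SD_* flags  */

static int pick_from_domain(void) { return 0; }  /* mask scan            */
static int pick_rt_head(void)     { return 1; }  /* head of rt list      */
static int pick_cfs_head(void)    { return 2; }  /* head of cfs list     */

static int bld_get_cpu_model(int sd_flags, int is_rt, int nr_threads)
{
	/* wakeups, and exec balancing for multi-threaded tasks, scan a
	 * domain mask; everything else takes the head of the globally
	 * ordered rt or cfs runqueue list                                */
	if (sd_flags == BAL_WAKE || (sd_flags == BAL_EXEC && nr_threads > 1))
		return pick_from_domain();
	return is_rt ? pick_rt_head() : pick_cfs_head();
}

int main(void)
{
	printf("fork of a CFS task   -> cpu %d\n", bld_get_cpu_model(0, 0, 1));
	printf("wakeup of an RT task -> cpu %d\n",
	       bld_get_cpu_model(BAL_WAKE, 1, 1));
	return 0;
}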
252 | */ 253 | 254 | #include 255 | @@ -86,6 +88,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -833,6 +836,8 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | if (!(flags & ENQUEUE_RESTORE)) 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -841,6 +846,8 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | if (!(flags & DEQUEUE_SAVE)) 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1625,7 +1632,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 282 | lockdep_assert_held(&p->pi_lock); 283 | 284 | if (p->nr_cpus_allowed > 1) 285 | +#ifndef CONFIG_BLD 286 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 287 | +#else 288 | + if(dl_task(p)) 289 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 290 | + else 291 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 292 | +#endif 293 | 294 | /* 295 | * In order not to call set_task_cpu() on a blocking task we need 296 | @@ -1815,7 +1829,11 @@ void scheduler_ipi(void) 297 | */ 298 | preempt_fold_need_resched(); 299 | 300 | +#ifndef CONFIG_BLD 301 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 302 | +#else 303 | + if (llist_empty(&this_rq()->wake_list)) 304 | +#endif 305 | return; 306 | 307 | /* 308 | @@ -1837,13 +1855,16 @@ void scheduler_ipi(void) 309 | /* 310 | * Check if someone kicked us for doing the nohz idle load balance. 311 | */ 312 | +#ifndef CONFIG_BLD 313 | if (unlikely(got_nohz_idle_kick())) { 314 | this_rq()->idle_balance = 1; 315 | raise_softirq_irqoff(SCHED_SOFTIRQ); 316 | } 317 | +#endif 318 | irq_exit(); 319 | } 320 | 321 | +#ifndef CONFIG_BLD 322 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 323 | { 324 | struct rq *rq = cpu_rq(cpu); 325 | @@ -1856,6 +1877,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 326 | } 327 | } 328 | 329 | +#endif 330 | + 331 | +bool cpus_share_cache(int this_cpu, int that_cpu) 332 | +{ 333 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 334 | +} 335 | + 336 | void wake_up_if_idle(int cpu) 337 | { 338 | struct rq *rq = cpu_rq(cpu); 339 | @@ -1879,18 +1907,13 @@ void wake_up_if_idle(int cpu) 340 | out: 341 | rcu_read_unlock(); 342 | } 343 | - 344 | -bool cpus_share_cache(int this_cpu, int that_cpu) 345 | -{ 346 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 347 | -} 348 | #endif /* CONFIG_SMP */ 349 | 350 | static void ttwu_queue(struct task_struct *p, int cpu) 351 | { 352 | struct rq *rq = cpu_rq(cpu); 353 | 354 | -#if defined(CONFIG_SMP) 355 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 356 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 357 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 358 | ttwu_queue_remote(p, cpu); 359 | @@ -2243,7 +2266,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 360 | * Silence PROVE_RCU. 
361 | */ 362 | raw_spin_lock_irqsave(&p->pi_lock, flags); 363 | - set_task_cpu(p, cpu); 364 | + __set_task_cpu(p, cpu); 365 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 366 | 367 | #ifdef CONFIG_SCHED_INFO 368 | @@ -2788,7 +2811,14 @@ void sched_exec(void) 369 | int dest_cpu; 370 | 371 | raw_spin_lock_irqsave(&p->pi_lock, flags); 372 | +#ifndef CONFIG_BLD 373 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 374 | +#else 375 | + if (dl_task(p)) 376 | + dest_cpu = task_cpu(p); 377 | + else 378 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 379 | +#endif 380 | if (dest_cpu == smp_processor_id()) 381 | goto unlock; 382 | 383 | @@ -2877,8 +2907,10 @@ void scheduler_tick(void) 384 | 385 | #ifdef CONFIG_SMP 386 | rq->idle_balance = idle_cpu(cpu); 387 | +#ifndef CONFIG_BLD 388 | trigger_load_balance(rq); 389 | #endif 390 | +#endif 391 | rq_last_tick_reset(rq); 392 | } 393 | 394 | @@ -7492,6 +7524,15 @@ void __init sched_init(void) 395 | #endif 396 | init_rq_hrtick(rq); 397 | atomic_set(&rq->nr_iowait, 0); 398 | +#ifdef CONFIG_BLD 399 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 400 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 401 | + rq->cfs.pos = 0; 402 | + 403 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 404 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 405 | + rq->rt.lowbit = INT_MAX; 406 | +#endif 407 | } 408 | 409 | set_load_weight(&init_task); 410 | @@ -7532,6 +7573,9 @@ void __init sched_init(void) 411 | init_sched_fair_class(); 412 | 413 | scheduler_running = 1; 414 | +#ifdef CONFIG_BLD 415 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 416 | +#endif 417 | } 418 | 419 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 420 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 421 | index cfdc0e6..08fd5f4 100644 422 | --- a/kernel/sched/fair.c 423 | +++ b/kernel/sched/fair.c 424 | @@ -4518,6 +4518,7 @@ static void task_waking_fair(struct task_struct *p) 425 | record_wakee(p); 426 | } 427 | 428 | +#ifndef CONFIG_BLD 429 | #ifdef CONFIG_FAIR_GROUP_SCHED 430 | /* 431 | * effective_load() calculates the load change as seen from the root_task_group 432 | @@ -5003,6 +5004,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f 433 | 434 | return new_cpu; 435 | } 436 | +#endif /* CONFIG_BLD */ 437 | 438 | /* 439 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 440 | @@ -5308,6 +5310,7 @@ idle: 441 | * further scheduler activity on it and we're being very careful to 442 | * re-start the picking loop. 
443 | */ 444 | +#ifndef CONFIG_BLD 445 | lockdep_unpin_lock(&rq->lock); 446 | new_tasks = idle_balance(rq); 447 | lockdep_pin_lock(&rq->lock); 448 | @@ -5321,7 +5324,7 @@ idle: 449 | 450 | if (new_tasks > 0) 451 | goto again; 452 | - 453 | +#endif 454 | return NULL; 455 | } 456 | 457 | @@ -5982,8 +5985,9 @@ static unsigned long task_h_load(struct task_struct *p) 458 | } 459 | #endif 460 | 461 | -/********** Helpers for find_busiest_group ************************/ 462 | +#ifndef CONFIG_BLD 463 | 464 | +/********** Helpers for find_busiest_group ************************/ 465 | enum group_type { 466 | group_other = 0, 467 | group_imbalanced, 468 | @@ -6074,6 +6078,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 469 | 470 | return load_idx; 471 | } 472 | +#endif /* CONFIG_BLD */ 473 | 474 | static unsigned long scale_rt_capacity(int cpu) 475 | { 476 | @@ -6182,6 +6187,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 477 | sdg->sgc->capacity = capacity; 478 | } 479 | 480 | +#ifndef CONFIG_BLD 481 | /* 482 | * Check whether the capacity of the rq has been noticeably reduced by side 483 | * activity. The imbalance_pct is used for the threshold. 484 | @@ -7415,6 +7421,8 @@ static inline int on_null_domain(struct rq *rq) 485 | return unlikely(!rcu_dereference_sched(rq->sd)); 486 | } 487 | 488 | +#endif /* CONFIG_BLD */ 489 | + 490 | #ifdef CONFIG_NO_HZ_COMMON 491 | /* 492 | * idle load balancing details 493 | @@ -7422,12 +7430,39 @@ static inline int on_null_domain(struct rq *rq) 494 | * needed, they will kick the idle load balancer, which then does idle 495 | * load balancing for all the idle CPUs. 496 | */ 497 | +#ifndef CONFIG_BLD 498 | static struct { 499 | cpumask_var_t idle_cpus_mask; 500 | atomic_t nr_cpus; 501 | unsigned long next_balance; /* in jiffy units */ 502 | } nohz ____cacheline_aligned; 503 | 504 | +static inline void nohz_balance_exit_idle(int cpu) 505 | +{ 506 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 507 | + /* 508 | + * Completely isolated CPUs don't ever set, so we must test. 509 | + */ 510 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 511 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 512 | + atomic_dec(&nohz.nr_cpus); 513 | + } 514 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 515 | + } 516 | +} 517 | + 518 | +static int sched_ilb_notifier(struct notifier_block *nfb, 519 | + unsigned long action, void *hcpu) 520 | +{ 521 | + switch (action & ~CPU_TASKS_FROZEN) { 522 | + case CPU_DYING: 523 | + nohz_balance_exit_idle(smp_processor_id()); 524 | + return NOTIFY_OK; 525 | + default: 526 | + return NOTIFY_DONE; 527 | + } 528 | +} 529 | + 530 | static inline int find_new_ilb(void) 531 | { 532 | int ilb = cpumask_first(nohz.idle_cpus_mask); 533 | @@ -7465,20 +7500,7 @@ static void nohz_balancer_kick(void) 534 | smp_send_reschedule(ilb_cpu); 535 | return; 536 | } 537 | - 538 | -static inline void nohz_balance_exit_idle(int cpu) 539 | -{ 540 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 541 | - /* 542 | - * Completely isolated CPUs don't ever set, so we must test. 
543 | - */ 544 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 545 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 546 | - atomic_dec(&nohz.nr_cpus); 547 | - } 548 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 549 | - } 550 | -} 551 | +#endif /* CONFIG_BLD */ 552 | 553 | static inline void set_cpu_sd_state_busy(void) 554 | { 555 | @@ -7520,6 +7542,7 @@ unlock: 556 | */ 557 | void nohz_balance_enter_idle(int cpu) 558 | { 559 | +#ifndef CONFIG_BLD 560 | /* 561 | * If this cpu is going down, then nothing needs to be done. 562 | */ 563 | @@ -7538,23 +7561,10 @@ void nohz_balance_enter_idle(int cpu) 564 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 565 | atomic_inc(&nohz.nr_cpus); 566 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 567 | -} 568 | - 569 | -static int sched_ilb_notifier(struct notifier_block *nfb, 570 | - unsigned long action, void *hcpu) 571 | -{ 572 | - switch (action & ~CPU_TASKS_FROZEN) { 573 | - case CPU_DYING: 574 | - nohz_balance_exit_idle(smp_processor_id()); 575 | - return NOTIFY_OK; 576 | - default: 577 | - return NOTIFY_DONE; 578 | - } 579 | +#endif 580 | } 581 | #endif 582 | 583 | -static DEFINE_SPINLOCK(balancing); 584 | - 585 | /* 586 | * Scale the max load_balance interval with the number of CPUs in the system. 587 | * This trades load-balance latency on larger machines for less cross talk. 588 | @@ -7564,6 +7574,9 @@ void update_max_interval(void) 589 | max_load_balance_interval = HZ*num_online_cpus()/10; 590 | } 591 | 592 | +#ifndef CONFIG_BLD 593 | +static DEFINE_SPINLOCK(balancing); 594 | + 595 | /* 596 | * It checks each scheduling domain to see if it is due to be balanced, 597 | * and initiates a balancing operation if so. 598 | @@ -7851,6 +7864,7 @@ void trigger_load_balance(struct rq *rq) 599 | nohz_balancer_kick(); 600 | #endif 601 | } 602 | +#endif /* CONFIG_BLD */ 603 | 604 | static void rq_online_fair(struct rq *rq) 605 | { 606 | @@ -8282,7 +8296,9 @@ const struct sched_class fair_sched_class = { 607 | .put_prev_task = put_prev_task_fair, 608 | 609 | #ifdef CONFIG_SMP 610 | +#ifndef CONFIG_BLD 611 | .select_task_rq = select_task_rq_fair, 612 | +#endif 613 | .migrate_task_rq = migrate_task_rq_fair, 614 | 615 | .rq_online = rq_online_fair, 616 | @@ -8344,6 +8360,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) 617 | 618 | __init void init_sched_fair_class(void) 619 | { 620 | +#ifndef CONFIG_BLD 621 | #ifdef CONFIG_SMP 622 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 623 | 624 | @@ -8353,5 +8370,5 @@ __init void init_sched_fair_class(void) 625 | cpu_notifier(sched_ilb_notifier, 0); 626 | #endif 627 | #endif /* SMP */ 628 | - 629 | +#endif /* BLD */ 630 | } 631 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 632 | index 8ec86ab..cada34d 100644 633 | --- a/kernel/sched/rt.c 634 | +++ b/kernel/sched/rt.c 635 | @@ -1313,6 +1313,7 @@ static void yield_task_rt(struct rq *rq) 636 | #ifdef CONFIG_SMP 637 | static int find_lowest_rq(struct task_struct *task); 638 | 639 | +#ifndef CONFIG_BLD 640 | static int 641 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 642 | { 643 | @@ -1368,6 +1369,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 644 | out: 645 | return cpu; 646 | } 647 | +#endif /* CONFIG_BLD */ 648 | 649 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 650 | { 651 | @@ -2272,7 +2274,9 @@ const struct sched_class rt_sched_class = { 652 | .put_prev_task = put_prev_task_rt, 653 | 654 | #ifdef CONFIG_SMP 655 | +#ifndef 
CONFIG_BLD 656 | .select_task_rq = select_task_rq_rt, 657 | +#endif 658 | 659 | .set_cpus_allowed = set_cpus_allowed_common, 660 | .rq_online = rq_online_rt, 661 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 662 | index b242775..256ad05 100644 663 | --- a/kernel/sched/sched.h 664 | +++ b/kernel/sched/sched.h 665 | @@ -395,9 +395,8 @@ struct cfs_rq { 666 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 667 | #endif /* CONFIG_SMP */ 668 | 669 | -#ifdef CONFIG_FAIR_GROUP_SCHED 670 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 671 | - 672 | +#ifdef CONFIG_FAIR_GROUP_SCHED 673 | /* 674 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 675 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 676 | @@ -421,6 +420,11 @@ struct cfs_rq { 677 | struct list_head throttled_list; 678 | #endif /* CONFIG_CFS_BANDWIDTH */ 679 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 680 | + 681 | +#ifdef CONFIG_BLD 682 | + struct list_head bld_cfs_list; 683 | + char pos; 684 | +#endif 685 | }; 686 | 687 | static inline int rt_bandwidth_enabled(void) 688 | @@ -465,12 +469,16 @@ struct rt_rq { 689 | /* Nests inside the rq lock: */ 690 | raw_spinlock_t rt_runtime_lock; 691 | 692 | + struct rq *rq; 693 | #ifdef CONFIG_RT_GROUP_SCHED 694 | unsigned long rt_nr_boosted; 695 | 696 | - struct rq *rq; 697 | struct task_group *tg; 698 | #endif 699 | +#ifdef CONFIG_BLD 700 | + struct list_head bld_rt_list; 701 | + int lowbit; 702 | +#endif 703 | }; 704 | 705 | /* Deadline class' related fields in a runqueue */ 706 | -------------------------------------------------------------------------------- /BLD-4.5.patch: -------------------------------------------------------------------------------- 1 | diff --git a/init/Kconfig b/init/Kconfig 2 | index 2232080..627f6ca 100644 3 | --- a/init/Kconfig 4 | +++ b/init/Kconfig 5 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 6 | depends on BROKEN || !SMP 7 | default y 8 | 9 | +config BLD 10 | + bool "An alternate CPU load distribution technique for task scheduler" 11 | + depends on SMP 12 | + default y 13 | + help 14 | + This is an alternate CPU load distribution technique based for task 15 | + scheduler based on The Barbershop Load Distribution algorithm. Not 16 | + suitable for NUMA, should work well on SMP. 
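Alongside the Kconfig switch, each of these patches teaches struct cfs_rq and struct rt_rq to carry a list node plus a position/lowbit marker (the sched.h hunk above) and seeds them in sched_init(): every runqueue starts on the head side of the cfs list with pos = 0, and rt.lowbit starts at INT_MAX, meaning nothing is queued there and it can be preempted freely. A toy version of that per-runqueue state and its initialisation, using plain arrays instead of list_heads:

/* Toy version of the per-runqueue BLD state added in sched.h and seeded
 * in sched_init() (plain arrays instead of list_heads and rwlocks).     */
#include <limits.h>
#include <stdio.h>

#define NR_TOY_CPUS 4

struct toy_cfs { unsigned long weight; int pos; };   /* bld_cfs_list, pos   */
struct toy_rt  { int lowbit; };                      /* bld_rt_list, lowbit */
struct toy_rq  { struct toy_cfs cfs; struct toy_rt rt; };

static struct toy_rq rqs[NR_TOY_CPUS];

static void toy_sched_init(void)
{
	int i;

	for (i = 0; i < NR_TOY_CPUS; i++) {
		rqs[i].cfs.weight = 0;
		rqs[i].cfs.pos = 0;             /* every rq starts at the head     */
		rqs[i].rt.lowbit = INT_MAX;     /* nothing queued: easy to preempt */
	}
}

int main(void)
{
	toy_sched_init();
	printf("cpu0: pos=%d lowbit=%d\n", rqs[0].cfs.pos, rqs[0].rt.lowbit);
	return 0;
}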
17 | + 18 | config INIT_ENV_ARG_LIMIT 19 | int 20 | default 32 if !UML 21 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 22 | new file mode 100644 23 | index 0000000..f1f9fba 24 | --- /dev/null 25 | +++ b/kernel/sched/bld.h 26 | @@ -0,0 +1,215 @@ 27 | +#ifdef CONFIG_BLD 28 | + 29 | +static DEFINE_RWLOCK(rt_list_lock); 30 | +static LIST_HEAD(rt_rq_head); 31 | +static LIST_HEAD(cfs_rq_head); 32 | +static DEFINE_RWLOCK(cfs_list_lock); 33 | + 34 | +#ifdef CONFIG_FAIR_GROUP_SCHED 35 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 36 | +{ 37 | + return cfs_rq->rq; 38 | +} 39 | +#else 40 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 41 | +{ 42 | + return container_of(cfs_rq, struct rq, cfs); 43 | +} 44 | +#endif 45 | + 46 | +#ifdef CONFIG_RT_GROUP_SCHED 47 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 48 | +{ 49 | + return rt_rq->rq; 50 | +} 51 | +#else 52 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 53 | +{ 54 | + return container_of(rt_rq, struct rq, rt); 55 | +} 56 | +#endif 57 | + 58 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 59 | +{ 60 | + int cpu = smp_processor_id(), i; 61 | + unsigned long load, varload; 62 | + struct rq *rq; 63 | + 64 | + if (task_type) { 65 | + varload = ULONG_MAX; 66 | + for_each_cpu(i, mask) { 67 | + rq = cpu_rq(i); 68 | + load = rq->cfs.load.weight; 69 | + if (load < varload) { 70 | + varload = load; 71 | + cpu = i; 72 | + } 73 | + } 74 | + } else { 75 | + /* Here's an attempt to get a CPU within the mask where 76 | + * we can preempt easily. To achieve this we tried to 77 | + * maintain a lowbit, which indicate the lowest bit set on 78 | + * array bitmap. Since all CPUs contains high priority 79 | + * kernel threads therefore we eliminate 0, so it might not 80 | + * be right every time, but it's just an indicator. 
81 | + */ 82 | + varload = 1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load >= varload) { 88 | + varload = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | + return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = firstbit; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index 41f6b22..d000500 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
252 | */ 253 | 254 | #include 255 | @@ -87,6 +89,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -834,6 +837,8 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | if (!(flags & ENQUEUE_RESTORE)) 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -842,6 +847,8 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | if (!(flags & DEQUEUE_SAVE)) 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1637,7 +1644,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 282 | lockdep_assert_held(&p->pi_lock); 283 | 284 | if (p->nr_cpus_allowed > 1) 285 | +#ifndef CONFIG_BLD 286 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 287 | +#else 288 | + if(dl_task(p)) 289 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 290 | + else 291 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 292 | +#endif 293 | 294 | /* 295 | * In order not to call set_task_cpu() on a blocking task we need 296 | @@ -1827,7 +1841,11 @@ void scheduler_ipi(void) 297 | */ 298 | preempt_fold_need_resched(); 299 | 300 | +#ifndef CONFIG_BLD 301 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 302 | +#else 303 | + if (llist_empty(&this_rq()->wake_list)) 304 | +#endif 305 | return; 306 | 307 | /* 308 | @@ -1849,13 +1867,16 @@ void scheduler_ipi(void) 309 | /* 310 | * Check if someone kicked us for doing the nohz idle load balance. 311 | */ 312 | +#ifndef CONFIG_BLD 313 | if (unlikely(got_nohz_idle_kick())) { 314 | this_rq()->idle_balance = 1; 315 | raise_softirq_irqoff(SCHED_SOFTIRQ); 316 | } 317 | +#endif 318 | irq_exit(); 319 | } 320 | 321 | +#ifndef CONFIG_BLD 322 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 323 | { 324 | struct rq *rq = cpu_rq(cpu); 325 | @@ -1868,6 +1889,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 326 | } 327 | } 328 | 329 | +#endif 330 | + 331 | +bool cpus_share_cache(int this_cpu, int that_cpu) 332 | +{ 333 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 334 | +} 335 | + 336 | void wake_up_if_idle(int cpu) 337 | { 338 | struct rq *rq = cpu_rq(cpu); 339 | @@ -1891,18 +1919,13 @@ void wake_up_if_idle(int cpu) 340 | out: 341 | rcu_read_unlock(); 342 | } 343 | - 344 | -bool cpus_share_cache(int this_cpu, int that_cpu) 345 | -{ 346 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 347 | -} 348 | #endif /* CONFIG_SMP */ 349 | 350 | static void ttwu_queue(struct task_struct *p, int cpu) 351 | { 352 | struct rq *rq = cpu_rq(cpu); 353 | 354 | -#if defined(CONFIG_SMP) 355 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 356 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 357 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 358 | ttwu_queue_remote(p, cpu); 359 | @@ -2344,7 +2367,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 360 | * Silence PROVE_RCU. 
361 | */ 362 | raw_spin_lock_irqsave(&p->pi_lock, flags); 363 | - set_task_cpu(p, cpu); 364 | + __set_task_cpu(p, cpu); 365 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 366 | 367 | #ifdef CONFIG_SCHED_INFO 368 | @@ -2889,7 +2912,14 @@ void sched_exec(void) 369 | int dest_cpu; 370 | 371 | raw_spin_lock_irqsave(&p->pi_lock, flags); 372 | +#ifndef CONFIG_BLD 373 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 374 | +#else 375 | + if (dl_task(p)) 376 | + dest_cpu = task_cpu(p); 377 | + else 378 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 379 | +#endif 380 | if (dest_cpu == smp_processor_id()) 381 | goto unlock; 382 | 383 | @@ -2978,8 +3008,10 @@ void scheduler_tick(void) 384 | 385 | #ifdef CONFIG_SMP 386 | rq->idle_balance = idle_cpu(cpu); 387 | +#ifndef CONFIG_BLD 388 | trigger_load_balance(rq); 389 | #endif 390 | +#endif 391 | rq_last_tick_reset(rq); 392 | } 393 | 394 | @@ -7601,6 +7633,15 @@ void __init sched_init(void) 395 | #endif 396 | init_rq_hrtick(rq); 397 | atomic_set(&rq->nr_iowait, 0); 398 | +#ifdef CONFIG_BLD 399 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 400 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 401 | + rq->cfs.pos = 0; 402 | + 403 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 404 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 405 | + rq->rt.lowbit = INT_MAX; 406 | +#endif 407 | } 408 | 409 | set_load_weight(&init_task); 410 | @@ -7641,6 +7682,9 @@ void __init sched_init(void) 411 | init_sched_fair_class(); 412 | 413 | scheduler_running = 1; 414 | +#ifdef CONFIG_BLD 415 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 416 | +#endif 417 | } 418 | 419 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 420 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 421 | index 56b7d4b..7fe9d9b 100644 422 | --- a/kernel/sched/fair.c 423 | +++ b/kernel/sched/fair.c 424 | @@ -4655,6 +4655,7 @@ static void task_waking_fair(struct task_struct *p) 425 | record_wakee(p); 426 | } 427 | 428 | +#ifndef CONFIG_BLD 429 | #ifdef CONFIG_FAIR_GROUP_SCHED 430 | /* 431 | * effective_load() calculates the load change as seen from the root_task_group 432 | @@ -5140,6 +5141,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f 433 | 434 | return new_cpu; 435 | } 436 | +#endif /* CONFIG_BLD */ 437 | 438 | /* 439 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 440 | @@ -5444,6 +5446,7 @@ idle: 441 | * further scheduler activity on it and we're being very careful to 442 | * re-start the picking loop. 
443 | */ 444 | +#ifndef CONFIG_BLD 445 | lockdep_unpin_lock(&rq->lock); 446 | new_tasks = idle_balance(rq); 447 | lockdep_pin_lock(&rq->lock); 448 | @@ -5457,7 +5460,7 @@ idle: 449 | 450 | if (new_tasks > 0) 451 | goto again; 452 | - 453 | +#endif 454 | return NULL; 455 | } 456 | 457 | @@ -6118,8 +6121,9 @@ static unsigned long task_h_load(struct task_struct *p) 458 | } 459 | #endif 460 | 461 | -/********** Helpers for find_busiest_group ************************/ 462 | +#ifndef CONFIG_BLD 463 | 464 | +/********** Helpers for find_busiest_group ************************/ 465 | enum group_type { 466 | group_other = 0, 467 | group_imbalanced, 468 | @@ -6210,6 +6214,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 469 | 470 | return load_idx; 471 | } 472 | +#endif /* CONFIG_BLD */ 473 | 474 | static unsigned long scale_rt_capacity(int cpu) 475 | { 476 | @@ -6318,6 +6323,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 477 | sdg->sgc->capacity = capacity; 478 | } 479 | 480 | +#ifndef CONFIG_BLD 481 | /* 482 | * Check whether the capacity of the rq has been noticeably reduced by side 483 | * activity. The imbalance_pct is used for the threshold. 484 | @@ -7551,6 +7557,8 @@ static inline int on_null_domain(struct rq *rq) 485 | return unlikely(!rcu_dereference_sched(rq->sd)); 486 | } 487 | 488 | +#endif /* CONFIG_BLD */ 489 | + 490 | #ifdef CONFIG_NO_HZ_COMMON 491 | /* 492 | * idle load balancing details 493 | @@ -7558,12 +7566,39 @@ static inline int on_null_domain(struct rq *rq) 494 | * needed, they will kick the idle load balancer, which then does idle 495 | * load balancing for all the idle CPUs. 496 | */ 497 | +#ifndef CONFIG_BLD 498 | static struct { 499 | cpumask_var_t idle_cpus_mask; 500 | atomic_t nr_cpus; 501 | unsigned long next_balance; /* in jiffy units */ 502 | } nohz ____cacheline_aligned; 503 | 504 | +static inline void nohz_balance_exit_idle(int cpu) 505 | +{ 506 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 507 | + /* 508 | + * Completely isolated CPUs don't ever set, so we must test. 509 | + */ 510 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 511 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 512 | + atomic_dec(&nohz.nr_cpus); 513 | + } 514 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 515 | + } 516 | +} 517 | + 518 | +static int sched_ilb_notifier(struct notifier_block *nfb, 519 | + unsigned long action, void *hcpu) 520 | +{ 521 | + switch (action & ~CPU_TASKS_FROZEN) { 522 | + case CPU_DYING: 523 | + nohz_balance_exit_idle(smp_processor_id()); 524 | + return NOTIFY_OK; 525 | + default: 526 | + return NOTIFY_DONE; 527 | + } 528 | +} 529 | + 530 | static inline int find_new_ilb(void) 531 | { 532 | int ilb = cpumask_first(nohz.idle_cpus_mask); 533 | @@ -7601,20 +7636,7 @@ static void nohz_balancer_kick(void) 534 | smp_send_reschedule(ilb_cpu); 535 | return; 536 | } 537 | - 538 | -static inline void nohz_balance_exit_idle(int cpu) 539 | -{ 540 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 541 | - /* 542 | - * Completely isolated CPUs don't ever set, so we must test. 
543 | - */ 544 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 545 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 546 | - atomic_dec(&nohz.nr_cpus); 547 | - } 548 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 549 | - } 550 | -} 551 | +#endif /* CONFIG_BLD */ 552 | 553 | static inline void set_cpu_sd_state_busy(void) 554 | { 555 | @@ -7656,6 +7678,7 @@ unlock: 556 | */ 557 | void nohz_balance_enter_idle(int cpu) 558 | { 559 | +#ifndef CONFIG_BLD 560 | /* 561 | * If this cpu is going down, then nothing needs to be done. 562 | */ 563 | @@ -7674,23 +7697,10 @@ void nohz_balance_enter_idle(int cpu) 564 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 565 | atomic_inc(&nohz.nr_cpus); 566 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 567 | -} 568 | - 569 | -static int sched_ilb_notifier(struct notifier_block *nfb, 570 | - unsigned long action, void *hcpu) 571 | -{ 572 | - switch (action & ~CPU_TASKS_FROZEN) { 573 | - case CPU_DYING: 574 | - nohz_balance_exit_idle(smp_processor_id()); 575 | - return NOTIFY_OK; 576 | - default: 577 | - return NOTIFY_DONE; 578 | - } 579 | +#endif 580 | } 581 | #endif 582 | 583 | -static DEFINE_SPINLOCK(balancing); 584 | - 585 | /* 586 | * Scale the max load_balance interval with the number of CPUs in the system. 587 | * This trades load-balance latency on larger machines for less cross talk. 588 | @@ -7700,6 +7710,9 @@ void update_max_interval(void) 589 | max_load_balance_interval = HZ*num_online_cpus()/10; 590 | } 591 | 592 | +#ifndef CONFIG_BLD 593 | +static DEFINE_SPINLOCK(balancing); 594 | + 595 | /* 596 | * It checks each scheduling domain to see if it is due to be balanced, 597 | * and initiates a balancing operation if so. 598 | @@ -7987,6 +8000,7 @@ void trigger_load_balance(struct rq *rq) 599 | nohz_balancer_kick(); 600 | #endif 601 | } 602 | +#endif /* CONFIG_BLD */ 603 | 604 | static void rq_online_fair(struct rq *rq) 605 | { 606 | @@ -8418,7 +8432,9 @@ const struct sched_class fair_sched_class = { 607 | .put_prev_task = put_prev_task_fair, 608 | 609 | #ifdef CONFIG_SMP 610 | +#ifndef CONFIG_BLD 611 | .select_task_rq = select_task_rq_fair, 612 | +#endif 613 | .migrate_task_rq = migrate_task_rq_fair, 614 | 615 | .rq_online = rq_online_fair, 616 | @@ -8480,6 +8496,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) 617 | 618 | __init void init_sched_fair_class(void) 619 | { 620 | +#ifndef CONFIG_BLD 621 | #ifdef CONFIG_SMP 622 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 623 | 624 | @@ -8489,5 +8506,5 @@ __init void init_sched_fair_class(void) 625 | cpu_notifier(sched_ilb_notifier, 0); 626 | #endif 627 | #endif /* SMP */ 628 | - 629 | +#endif /* BLD */ 630 | } 631 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 632 | index 8ec86ab..cada34d 100644 633 | --- a/kernel/sched/rt.c 634 | +++ b/kernel/sched/rt.c 635 | @@ -1313,6 +1313,7 @@ static void yield_task_rt(struct rq *rq) 636 | #ifdef CONFIG_SMP 637 | static int find_lowest_rq(struct task_struct *task); 638 | 639 | +#ifndef CONFIG_BLD 640 | static int 641 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 642 | { 643 | @@ -1368,6 +1369,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 644 | out: 645 | return cpu; 646 | } 647 | +#endif /* CONFIG_BLD */ 648 | 649 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 650 | { 651 | @@ -2272,7 +2274,9 @@ const struct sched_class rt_sched_class = { 652 | .put_prev_task = put_prev_task_rt, 653 | 654 | #ifdef CONFIG_SMP 655 | +#ifndef 
CONFIG_BLD 656 | .select_task_rq = select_task_rq_rt, 657 | +#endif 658 | 659 | .set_cpus_allowed = set_cpus_allowed_common, 660 | .rq_online = rq_online_rt, 661 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 662 | index 10f1637..33a2aa97 100644 663 | --- a/kernel/sched/sched.h 664 | +++ b/kernel/sched/sched.h 665 | @@ -408,9 +408,8 @@ struct cfs_rq { 666 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 667 | #endif /* CONFIG_SMP */ 668 | 669 | -#ifdef CONFIG_FAIR_GROUP_SCHED 670 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 671 | - 672 | +#ifdef CONFIG_FAIR_GROUP_SCHED 673 | /* 674 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 675 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 676 | @@ -434,6 +433,11 @@ struct cfs_rq { 677 | struct list_head throttled_list; 678 | #endif /* CONFIG_CFS_BANDWIDTH */ 679 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 680 | + 681 | +#ifdef CONFIG_BLD 682 | + struct list_head bld_cfs_list; 683 | + char pos; 684 | +#endif 685 | }; 686 | 687 | static inline int rt_bandwidth_enabled(void) 688 | @@ -478,12 +482,16 @@ struct rt_rq { 689 | /* Nests inside the rq lock: */ 690 | raw_spinlock_t rt_runtime_lock; 691 | 692 | + struct rq *rq; 693 | #ifdef CONFIG_RT_GROUP_SCHED 694 | unsigned long rt_nr_boosted; 695 | 696 | - struct rq *rq; 697 | struct task_group *tg; 698 | #endif 699 | +#ifdef CONFIG_BLD 700 | + struct list_head bld_rt_list; 701 | + int lowbit; 702 | +#endif 703 | }; 704 | 705 | /* Deadline class' related fields in a runqueue */ 706 | -------------------------------------------------------------------------------- /BLD-4.3.patch: -------------------------------------------------------------------------------- 1 | diff --git a/init/Kconfig b/init/Kconfig 2 | index c24b6f7..898db4a 100644 3 | --- a/init/Kconfig 4 | +++ b/init/Kconfig 5 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 6 | depends on BROKEN || !SMP 7 | default y 8 | 9 | +config BLD 10 | + bool "An alternate CPU load distribution technique for task scheduler" 11 | + depends on SMP 12 | + default y 13 | + help 14 | + This is an alternate CPU load distribution technique based for task 15 | + scheduler based on The Barbershop Load Distribution algorithm. Not 16 | + suitable for NUMA, should work well on SMP. 
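The two global lists that bld.h introduces are read on every wakeup and written only when a runqueue actually changes position, which is why the patches guard them with rwlocks (cfs_list_lock, rt_list_lock) rather than spinlocks: many concurrent pickers may walk a list while repositioning stays comparatively rare. A loose userspace analogue of that read-mostly discipline, using POSIX rwlocks and a counter in place of the real list surgery (build with -lpthread):

/* Userspace analogue of the read-mostly locking around the BLD lists
 * (pthreads instead of the kernel's rwlock_t; a generation counter
 * stands in for the actual list_del()/list_add_tail() surgery).         */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t cfs_list_lock = PTHREAD_RWLOCK_INITIALIZER;
static int list_generation;              /* bumped when the list moves   */

static void reorder_if_needed(int needs_move)
{
	if (!needs_move)
		return;                      /* common case: no write lock  */

	pthread_rwlock_wrlock(&cfs_list_lock);
	list_generation++;                   /* the list_del/list_add part  */
	pthread_rwlock_unlock(&cfs_list_lock);
}

static int pick_cpu(void)
{
	int snapshot;

	pthread_rwlock_rdlock(&cfs_list_lock);   /* pickers only read      */
	snapshot = list_generation;
	pthread_rwlock_unlock(&cfs_list_lock);
	return snapshot;                         /* stands in for a cpu id */
}

int main(void)
{
	reorder_if_needed(1);
	printf("picked after generation %d\n", pick_cpu());
	return 0;
}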
17 | + 18 | config INIT_ENV_ARG_LIMIT 19 | int 20 | default 32 if !UML 21 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 22 | new file mode 100644 23 | index 0000000..f1f9fba 24 | --- /dev/null 25 | +++ b/kernel/sched/bld.h 26 | @@ -0,0 +1,215 @@ 27 | +#ifdef CONFIG_BLD 28 | + 29 | +static DEFINE_RWLOCK(rt_list_lock); 30 | +static LIST_HEAD(rt_rq_head); 31 | +static LIST_HEAD(cfs_rq_head); 32 | +static DEFINE_RWLOCK(cfs_list_lock); 33 | + 34 | +#ifdef CONFIG_FAIR_GROUP_SCHED 35 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 36 | +{ 37 | + return cfs_rq->rq; 38 | +} 39 | +#else 40 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 41 | +{ 42 | + return container_of(cfs_rq, struct rq, cfs); 43 | +} 44 | +#endif 45 | + 46 | +#ifdef CONFIG_RT_GROUP_SCHED 47 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 48 | +{ 49 | + return rt_rq->rq; 50 | +} 51 | +#else 52 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 53 | +{ 54 | + return container_of(rt_rq, struct rq, rt); 55 | +} 56 | +#endif 57 | + 58 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 59 | +{ 60 | + int cpu = smp_processor_id(), i; 61 | + unsigned long load, varload; 62 | + struct rq *rq; 63 | + 64 | + if (task_type) { 65 | + varload = ULONG_MAX; 66 | + for_each_cpu(i, mask) { 67 | + rq = cpu_rq(i); 68 | + load = rq->cfs.load.weight; 69 | + if (load < varload) { 70 | + varload = load; 71 | + cpu = i; 72 | + } 73 | + } 74 | + } else { 75 | + /* Here's an attempt to get a CPU within the mask where 76 | + * we can preempt easily. To achieve this we tried to 77 | + * maintain a lowbit, which indicate the lowest bit set on 78 | + * array bitmap. Since all CPUs contains high priority 79 | + * kernel threads therefore we eliminate 0, so it might not 80 | + * be right every time, but it's just an indicator. 
81 | + */ 82 | + varload = 1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load >= varload) { 88 | + varload = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | + return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = firstbit; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index bcd214e..71e4a81 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
252 | */ 253 | 254 | #include 255 | @@ -86,6 +88,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -832,6 +835,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | update_rq_clock(rq); 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -839,6 +844,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | update_rq_clock(rq); 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1617,7 +1624,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 282 | lockdep_assert_held(&p->pi_lock); 283 | 284 | if (p->nr_cpus_allowed > 1) 285 | +#ifndef CONFIG_BLD 286 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 287 | +#else 288 | + if(dl_task(p)) 289 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 290 | + else 291 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 292 | +#endif 293 | 294 | /* 295 | * In order not to call set_task_cpu() on a blocking task we need 296 | @@ -1807,7 +1821,11 @@ void scheduler_ipi(void) 297 | */ 298 | preempt_fold_need_resched(); 299 | 300 | +#ifndef CONFIG_BLD 301 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 302 | +#else 303 | + if (llist_empty(&this_rq()->wake_list)) 304 | +#endif 305 | return; 306 | 307 | /* 308 | @@ -1829,13 +1847,16 @@ void scheduler_ipi(void) 309 | /* 310 | * Check if someone kicked us for doing the nohz idle load balance. 311 | */ 312 | +#ifndef CONFIG_BLD 313 | if (unlikely(got_nohz_idle_kick())) { 314 | this_rq()->idle_balance = 1; 315 | raise_softirq_irqoff(SCHED_SOFTIRQ); 316 | } 317 | +#endif 318 | irq_exit(); 319 | } 320 | 321 | +#ifndef CONFIG_BLD 322 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 323 | { 324 | struct rq *rq = cpu_rq(cpu); 325 | @@ -1848,6 +1869,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 326 | } 327 | } 328 | 329 | +#endif 330 | + 331 | +bool cpus_share_cache(int this_cpu, int that_cpu) 332 | +{ 333 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 334 | +} 335 | + 336 | void wake_up_if_idle(int cpu) 337 | { 338 | struct rq *rq = cpu_rq(cpu); 339 | @@ -1871,18 +1899,13 @@ void wake_up_if_idle(int cpu) 340 | out: 341 | rcu_read_unlock(); 342 | } 343 | - 344 | -bool cpus_share_cache(int this_cpu, int that_cpu) 345 | -{ 346 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 347 | -} 348 | #endif /* CONFIG_SMP */ 349 | 350 | static void ttwu_queue(struct task_struct *p, int cpu) 351 | { 352 | struct rq *rq = cpu_rq(cpu); 353 | 354 | -#if defined(CONFIG_SMP) 355 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 356 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 357 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 358 | ttwu_queue_remote(p, cpu); 359 | @@ -2217,7 +2240,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 360 | * Silence PROVE_RCU. 
361 | */ 362 | raw_spin_lock_irqsave(&p->pi_lock, flags); 363 | - set_task_cpu(p, cpu); 364 | + __set_task_cpu(p, cpu); 365 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 366 | 367 | #ifdef CONFIG_SCHED_INFO 368 | @@ -2740,7 +2763,14 @@ void sched_exec(void) 369 | int dest_cpu; 370 | 371 | raw_spin_lock_irqsave(&p->pi_lock, flags); 372 | +#ifndef CONFIG_BLD 373 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 374 | +#else 375 | + if (dl_task(p)) 376 | + dest_cpu = task_cpu(p); 377 | + else 378 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 379 | +#endif 380 | if (dest_cpu == smp_processor_id()) 381 | goto unlock; 382 | 383 | @@ -2829,8 +2859,10 @@ void scheduler_tick(void) 384 | 385 | #ifdef CONFIG_SMP 386 | rq->idle_balance = idle_cpu(cpu); 387 | +#ifndef CONFIG_BLD 388 | trigger_load_balance(rq); 389 | #endif 390 | +#endif 391 | rq_last_tick_reset(rq); 392 | } 393 | 394 | @@ -7432,6 +7464,15 @@ void __init sched_init(void) 395 | #endif 396 | init_rq_hrtick(rq); 397 | atomic_set(&rq->nr_iowait, 0); 398 | +#ifdef CONFIG_BLD 399 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 400 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 401 | + rq->cfs.pos = 0; 402 | + 403 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 404 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 405 | + rq->rt.lowbit = INT_MAX; 406 | +#endif 407 | } 408 | 409 | set_load_weight(&init_task); 410 | @@ -7472,6 +7513,9 @@ void __init sched_init(void) 411 | init_sched_fair_class(); 412 | 413 | scheduler_running = 1; 414 | +#ifdef CONFIG_BLD 415 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 416 | +#endif 417 | } 418 | 419 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 420 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 421 | index 9a5e60f..af65fb9 100644 422 | --- a/kernel/sched/fair.c 423 | +++ b/kernel/sched/fair.c 424 | @@ -4460,6 +4460,7 @@ static void task_waking_fair(struct task_struct *p) 425 | record_wakee(p); 426 | } 427 | 428 | +#ifndef CONFIG_BLD 429 | #ifdef CONFIG_FAIR_GROUP_SCHED 430 | /* 431 | * effective_load() calculates the load change as seen from the root_task_group 432 | @@ -4938,6 +4939,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f 433 | 434 | return new_cpu; 435 | } 436 | +#endif /* CONFIG_BLD */ 437 | 438 | /* 439 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 440 | @@ -5243,6 +5245,7 @@ idle: 441 | * further scheduler activity on it and we're being very careful to 442 | * re-start the picking loop. 
443 | */ 444 | +#ifndef CONFIG_BLD 445 | lockdep_unpin_lock(&rq->lock); 446 | new_tasks = idle_balance(rq); 447 | lockdep_pin_lock(&rq->lock); 448 | @@ -5256,7 +5259,7 @@ idle: 449 | 450 | if (new_tasks > 0) 451 | goto again; 452 | - 453 | +#endif 454 | return NULL; 455 | } 456 | 457 | @@ -5917,8 +5920,9 @@ static unsigned long task_h_load(struct task_struct *p) 458 | } 459 | #endif 460 | 461 | -/********** Helpers for find_busiest_group ************************/ 462 | +#ifndef CONFIG_BLD 463 | 464 | +/********** Helpers for find_busiest_group ************************/ 465 | enum group_type { 466 | group_other = 0, 467 | group_imbalanced, 468 | @@ -6010,6 +6014,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 469 | return load_idx; 470 | } 471 | 472 | +#endif /* CONFIG_BLD */ 473 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) 474 | { 475 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) 476 | @@ -6137,6 +6142,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 477 | sdg->sgc->capacity = capacity; 478 | } 479 | 480 | +#ifndef CONFIG_BLD 481 | /* 482 | * Check whether the capacity of the rq has been noticeably reduced by side 483 | * activity. The imbalance_pct is used for the threshold. 484 | @@ -7370,6 +7376,8 @@ static inline int on_null_domain(struct rq *rq) 485 | return unlikely(!rcu_dereference_sched(rq->sd)); 486 | } 487 | 488 | +#endif /* CONFIG_BLD */ 489 | + 490 | #ifdef CONFIG_NO_HZ_COMMON 491 | /* 492 | * idle load balancing details 493 | @@ -7377,12 +7385,39 @@ static inline int on_null_domain(struct rq *rq) 494 | * needed, they will kick the idle load balancer, which then does idle 495 | * load balancing for all the idle CPUs. 496 | */ 497 | +#ifndef CONFIG_BLD 498 | static struct { 499 | cpumask_var_t idle_cpus_mask; 500 | atomic_t nr_cpus; 501 | unsigned long next_balance; /* in jiffy units */ 502 | } nohz ____cacheline_aligned; 503 | 504 | +static inline void nohz_balance_exit_idle(int cpu) 505 | +{ 506 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 507 | + /* 508 | + * Completely isolated CPUs don't ever set, so we must test. 509 | + */ 510 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 511 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 512 | + atomic_dec(&nohz.nr_cpus); 513 | + } 514 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 515 | + } 516 | +} 517 | + 518 | +static int sched_ilb_notifier(struct notifier_block *nfb, 519 | + unsigned long action, void *hcpu) 520 | +{ 521 | + switch (action & ~CPU_TASKS_FROZEN) { 522 | + case CPU_DYING: 523 | + nohz_balance_exit_idle(smp_processor_id()); 524 | + return NOTIFY_OK; 525 | + default: 526 | + return NOTIFY_DONE; 527 | + } 528 | +} 529 | + 530 | static inline int find_new_ilb(void) 531 | { 532 | int ilb = cpumask_first(nohz.idle_cpus_mask); 533 | @@ -7420,20 +7455,7 @@ static void nohz_balancer_kick(void) 534 | smp_send_reschedule(ilb_cpu); 535 | return; 536 | } 537 | - 538 | -static inline void nohz_balance_exit_idle(int cpu) 539 | -{ 540 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 541 | - /* 542 | - * Completely isolated CPUs don't ever set, so we must test. 
543 | - */ 544 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 545 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 546 | - atomic_dec(&nohz.nr_cpus); 547 | - } 548 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 549 | - } 550 | -} 551 | +#endif /* CONFIG_BLD */ 552 | 553 | static inline void set_cpu_sd_state_busy(void) 554 | { 555 | @@ -7475,6 +7497,7 @@ unlock: 556 | */ 557 | void nohz_balance_enter_idle(int cpu) 558 | { 559 | +#ifndef CONFIG_BLD 560 | /* 561 | * If this cpu is going down, then nothing needs to be done. 562 | */ 563 | @@ -7493,23 +7516,10 @@ void nohz_balance_enter_idle(int cpu) 564 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 565 | atomic_inc(&nohz.nr_cpus); 566 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 567 | -} 568 | - 569 | -static int sched_ilb_notifier(struct notifier_block *nfb, 570 | - unsigned long action, void *hcpu) 571 | -{ 572 | - switch (action & ~CPU_TASKS_FROZEN) { 573 | - case CPU_DYING: 574 | - nohz_balance_exit_idle(smp_processor_id()); 575 | - return NOTIFY_OK; 576 | - default: 577 | - return NOTIFY_DONE; 578 | - } 579 | +#endif 580 | } 581 | #endif 582 | 583 | -static DEFINE_SPINLOCK(balancing); 584 | - 585 | /* 586 | * Scale the max load_balance interval with the number of CPUs in the system. 587 | * This trades load-balance latency on larger machines for less cross talk. 588 | @@ -7519,6 +7529,9 @@ void update_max_interval(void) 589 | max_load_balance_interval = HZ*num_online_cpus()/10; 590 | } 591 | 592 | +#ifndef CONFIG_BLD 593 | +static DEFINE_SPINLOCK(balancing); 594 | + 595 | /* 596 | * It checks each scheduling domain to see if it is due to be balanced, 597 | * and initiates a balancing operation if so. 598 | @@ -7780,6 +7793,7 @@ void trigger_load_balance(struct rq *rq) 599 | nohz_balancer_kick(); 600 | #endif 601 | } 602 | +#endif /* CONFIG_BLD */ 603 | 604 | static void rq_online_fair(struct rq *rq) 605 | { 606 | @@ -8245,7 +8259,9 @@ const struct sched_class fair_sched_class = { 607 | .put_prev_task = put_prev_task_fair, 608 | 609 | #ifdef CONFIG_SMP 610 | +#ifndef CONFIG_BLD 611 | .select_task_rq = select_task_rq_fair, 612 | +#endif 613 | .migrate_task_rq = migrate_task_rq_fair, 614 | 615 | .rq_online = rq_online_fair, 616 | @@ -8307,6 +8323,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) 617 | 618 | __init void init_sched_fair_class(void) 619 | { 620 | +#ifndef CONFIG_BLD 621 | #ifdef CONFIG_SMP 622 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 623 | 624 | @@ -8316,5 +8333,5 @@ __init void init_sched_fair_class(void) 625 | cpu_notifier(sched_ilb_notifier, 0); 626 | #endif 627 | #endif /* SMP */ 628 | - 629 | +#endif /* BLD */ 630 | } 631 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 632 | index d2ea593..d4d3159 100644 633 | --- a/kernel/sched/rt.c 634 | +++ b/kernel/sched/rt.c 635 | @@ -1323,6 +1323,7 @@ static void yield_task_rt(struct rq *rq) 636 | #ifdef CONFIG_SMP 637 | static int find_lowest_rq(struct task_struct *task); 638 | 639 | +#ifndef CONFIG_BLD 640 | static int 641 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 642 | { 643 | @@ -1378,6 +1379,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 644 | out: 645 | return cpu; 646 | } 647 | +#endif /* CONFIG_BLD */ 648 | 649 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 650 | { 651 | @@ -2282,7 +2284,9 @@ const struct sched_class rt_sched_class = { 652 | .put_prev_task = put_prev_task_rt, 653 | 654 | #ifdef CONFIG_SMP 655 | +#ifndef 
CONFIG_BLD 656 | .select_task_rq = select_task_rq_rt, 657 | +#endif 658 | 659 | .set_cpus_allowed = set_cpus_allowed_common, 660 | .rq_online = rq_online_rt, 661 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 662 | index 6d2a119..cce9116 100644 663 | --- a/kernel/sched/sched.h 664 | +++ b/kernel/sched/sched.h 665 | @@ -391,9 +391,8 @@ struct cfs_rq { 666 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 667 | #endif /* CONFIG_SMP */ 668 | 669 | -#ifdef CONFIG_FAIR_GROUP_SCHED 670 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 671 | - 672 | +#ifdef CONFIG_FAIR_GROUP_SCHED 673 | /* 674 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 675 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 676 | @@ -417,6 +416,11 @@ struct cfs_rq { 677 | struct list_head throttled_list; 678 | #endif /* CONFIG_CFS_BANDWIDTH */ 679 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 680 | + 681 | +#ifdef CONFIG_BLD 682 | + struct list_head bld_cfs_list; 683 | + char pos; 684 | +#endif 685 | }; 686 | 687 | static inline int rt_bandwidth_enabled(void) 688 | @@ -461,12 +465,16 @@ struct rt_rq { 689 | /* Nests inside the rq lock: */ 690 | raw_spinlock_t rt_runtime_lock; 691 | 692 | + struct rq *rq; 693 | #ifdef CONFIG_RT_GROUP_SCHED 694 | unsigned long rt_nr_boosted; 695 | 696 | - struct rq *rq; 697 | struct task_group *tg; 698 | #endif 699 | +#ifdef CONFIG_BLD 700 | + struct list_head bld_rt_list; 701 | + int lowbit; 702 | +#endif 703 | }; 704 | 705 | /* Deadline class' related fields in a runqueue */ 706 | -------------------------------------------------------------------------------- /BLD-4.6.patch: -------------------------------------------------------------------------------- 1 | diff --git a/init/Kconfig b/init/Kconfig 2 | index 0dfd09d..8d704e5 100644 3 | --- a/init/Kconfig 4 | +++ b/init/Kconfig 5 | @@ -36,6 +36,15 @@ config BROKEN_ON_SMP 6 | depends on BROKEN || !SMP 7 | default y 8 | 9 | +config BLD 10 | + bool "An alternate CPU load distribution technique for task scheduler" 11 | + depends on SMP 12 | + default y 13 | + help 14 | + This is an alternate CPU load distribution technique based for task 15 | + scheduler based on The Barbershop Load Distribution algorithm. Not 16 | + suitable for NUMA, should work well on SMP. 
17 | + 18 | config INIT_ENV_ARG_LIMIT 19 | int 20 | default 32 if !UML 21 | diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h 22 | new file mode 100644 23 | index 0000000..f1f9fba 24 | --- /dev/null 25 | +++ b/kernel/sched/bld.h 26 | @@ -0,0 +1,215 @@ 27 | +#ifdef CONFIG_BLD 28 | + 29 | +static DEFINE_RWLOCK(rt_list_lock); 30 | +static LIST_HEAD(rt_rq_head); 31 | +static LIST_HEAD(cfs_rq_head); 32 | +static DEFINE_RWLOCK(cfs_list_lock); 33 | + 34 | +#ifdef CONFIG_FAIR_GROUP_SCHED 35 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 36 | +{ 37 | + return cfs_rq->rq; 38 | +} 39 | +#else 40 | +static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq) 41 | +{ 42 | + return container_of(cfs_rq, struct rq, cfs); 43 | +} 44 | +#endif 45 | + 46 | +#ifdef CONFIG_RT_GROUP_SCHED 47 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 48 | +{ 49 | + return rt_rq->rq; 50 | +} 51 | +#else 52 | +static inline struct rq *rq_of_rt(struct rt_rq *rt_rq) 53 | +{ 54 | + return container_of(rt_rq, struct rq, rt); 55 | +} 56 | +#endif 57 | + 58 | +static int select_cpu_for_wakeup(int task_type, struct cpumask *mask) 59 | +{ 60 | + int cpu = smp_processor_id(), i; 61 | + unsigned long load, varload; 62 | + struct rq *rq; 63 | + 64 | + if (task_type) { 65 | + varload = ULONG_MAX; 66 | + for_each_cpu(i, mask) { 67 | + rq = cpu_rq(i); 68 | + load = rq->cfs.load.weight; 69 | + if (load < varload) { 70 | + varload = load; 71 | + cpu = i; 72 | + } 73 | + } 74 | + } else { 75 | + /* Here's an attempt to get a CPU within the mask where 76 | + * we can preempt easily. To achieve this we tried to 77 | + * maintain a lowbit, which indicate the lowest bit set on 78 | + * array bitmap. Since all CPUs contains high priority 79 | + * kernel threads therefore we eliminate 0, so it might not 80 | + * be right every time, but it's just an indicator. 
81 | + */ 82 | + varload = 1; 83 | + 84 | + for_each_cpu(i, mask) { 85 | + rq = cpu_rq(i); 86 | + load = rq->rt.lowbit; 87 | + if (load >= varload) { 88 | + varload = load; 89 | + cpu = i; 90 | + } 91 | + } 92 | + } 93 | + 94 | + return cpu; 95 | +} 96 | + 97 | +static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags) 98 | +{ 99 | + struct cfs_rq *cfs; 100 | + unsigned long flags; 101 | + unsigned int cpu = smp_processor_id(); 102 | + 103 | + read_lock_irqsave(&cfs_list_lock, flags); 104 | + list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) { 105 | + cpu = cpu_of(rq_of_cfs(cfs)); 106 | + if (cpu_online(cpu)) 107 | + break; 108 | + } 109 | + read_unlock_irqrestore(&cfs_list_lock, flags); 110 | + return cpu; 111 | +} 112 | + 113 | +static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags) 114 | +{ 115 | + struct rt_rq *rt; 116 | + unsigned long flags; 117 | + unsigned int cpu = smp_processor_id(); 118 | + 119 | + read_lock_irqsave(&rt_list_lock, flags); 120 | + list_for_each_entry(rt, &rt_rq_head, bld_rt_list) { 121 | + cpu = cpu_of(rq_of_rt(rt)); 122 | + if (cpu_online(cpu)) 123 | + break; 124 | + } 125 | + read_unlock_irqrestore(&rt_list_lock, flags); 126 | + return cpu; 127 | +} 128 | + 129 | +static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags) 130 | +{ 131 | + unsigned int cpu = smp_processor_id(), want_affine = 0; 132 | + struct cpumask *tmpmask; 133 | + 134 | + if (p->nr_cpus_allowed == 1) 135 | + return task_cpu(p); 136 | + 137 | + if (sd_flags & SD_BALANCE_WAKE) { 138 | + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 139 | + want_affine = 1; 140 | + } 141 | + } 142 | + 143 | + if (want_affine) 144 | + tmpmask = tsk_cpus_allowed(p); 145 | + else 146 | + tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd); 147 | + 148 | + if (rt_task(p)) 149 | + cpu = select_cpu_for_wakeup(0, tmpmask); 150 | + else 151 | + cpu = select_cpu_for_wakeup(1, tmpmask); 152 | + 153 | + return cpu; 154 | +} 155 | + 156 | +static void track_load_rt(struct rq *rq, struct task_struct *p) 157 | +{ 158 | + unsigned long flag; 159 | + int firstbit; 160 | + struct rt_rq *first; 161 | + struct rt_prio_array *array = &rq->rt.active; 162 | + 163 | + first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list); 164 | + firstbit = sched_find_first_bit(array->bitmap); 165 | + 166 | + /* Maintaining rt.lowbit */ 167 | + if (firstbit > 0 && firstbit <= rq->rt.lowbit) 168 | + rq->rt.lowbit = firstbit; 169 | + 170 | + if (rq->rt.lowbit < first->lowbit) { 171 | + write_lock_irqsave(&rt_list_lock, flag); 172 | + list_del(&rq->rt.bld_rt_list); 173 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 174 | + write_unlock_irqrestore(&rt_list_lock, flag); 175 | + } 176 | +} 177 | + 178 | +static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags) 179 | +{ 180 | + unsigned int cpu; 181 | + 182 | + if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1))) 183 | + cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags); 184 | + else { 185 | + if (rt_task(p)) 186 | + cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags); 187 | + else 188 | + cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags); 189 | + } 190 | + 191 | + return cpu; 192 | +} 193 | + 194 | +static void bld_track_load_activate(struct rq *rq, struct task_struct *p) 195 | +{ 196 | + unsigned long flag; 197 | + if (rt_task(p)) { 198 | + track_load_rt(rq, p); 199 | + } else { 200 | + if (rq->cfs.pos != 2) { 201 | + struct cfs_rq *last; 202 | + last = 
list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list); 203 | + if (rq->cfs.load.weight >= last->load.weight) { 204 | + write_lock_irqsave(&cfs_list_lock, flag); 205 | + list_del(&rq->cfs.bld_cfs_list); 206 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 207 | + rq->cfs.pos = 2; last->pos = 1; 208 | + write_unlock_irqrestore(&cfs_list_lock, flag); 209 | + } 210 | + } 211 | + } 212 | +} 213 | + 214 | +static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 215 | +{ 216 | + unsigned long flag; 217 | + if (rt_task(p)) { 218 | + track_load_rt(rq, p); 219 | + } else { 220 | + if (rq->cfs.pos != 0) { 221 | + struct cfs_rq *first; 222 | + first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list); 223 | + if (rq->cfs.load.weight <= first->load.weight) { 224 | + write_lock_irqsave(&cfs_list_lock, flag); 225 | + list_del(&rq->cfs.bld_cfs_list); 226 | + list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head); 227 | + rq->cfs.pos = 0; first->pos = 1; 228 | + write_unlock_irqrestore(&cfs_list_lock, flag); 229 | + } 230 | + } 231 | + } 232 | +} 233 | +#else 234 | +static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p) 235 | +{ 236 | +} 237 | + 238 | +static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p) 239 | +{ 240 | +} 241 | +#endif /* CONFIG_BLD */ 242 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c 243 | index d1f7149..c3236de 100644 244 | --- a/kernel/sched/core.c 245 | +++ b/kernel/sched/core.c 246 | @@ -24,6 +24,8 @@ 247 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 248 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 249 | * Thomas Gleixner, Mike Kravetz 250 | + * 2012-Feb The Barbershop Load Distribution (BLD) algorithm - an alternate 251 | + * CPU load distribution technique for kernel scheduler by Rakib Mullick. 
252 | */ 253 | 254 | #include 255 | @@ -86,6 +88,7 @@ 256 | #include "sched.h" 257 | #include "../workqueue_internal.h" 258 | #include "../smpboot.h" 259 | +#include "bld.h" 260 | 261 | #define CREATE_TRACE_POINTS 262 | #include 263 | @@ -713,6 +716,8 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 264 | if (!(flags & ENQUEUE_RESTORE)) 265 | sched_info_queued(rq, p); 266 | p->sched_class->enqueue_task(rq, p, flags); 267 | + if (!dl_task(p)) 268 | + bld_track_load_activate(rq, p); 269 | } 270 | 271 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 272 | @@ -721,6 +726,8 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 273 | if (!(flags & DEQUEUE_SAVE)) 274 | sched_info_dequeued(rq, p); 275 | p->sched_class->dequeue_task(rq, p, flags); 276 | + if (!dl_task(p)) 277 | + bld_track_load_deactivate(rq, p); 278 | } 279 | 280 | void activate_task(struct rq *rq, struct task_struct *p, int flags) 281 | @@ -1515,8 +1522,16 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 282 | { 283 | lockdep_assert_held(&p->pi_lock); 284 | 285 | - if (p->nr_cpus_allowed > 1) 286 | + if (p->nr_cpus_allowed > 1) { 287 | +#ifndef CONFIG_BLD 288 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 289 | +#else 290 | + if(dl_task(p)) 291 | + cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags); 292 | + else 293 | + cpu = bld_get_cpu(p, sd_flags, wake_flags); 294 | +#endif 295 | + } 296 | 297 | /* 298 | * In order not to call set_task_cpu() on a blocking task we need 299 | @@ -1706,7 +1721,11 @@ void scheduler_ipi(void) 300 | */ 301 | preempt_fold_need_resched(); 302 | 303 | +#ifndef CONFIG_BLD 304 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 305 | +#else 306 | + if (llist_empty(&this_rq()->wake_list)) 307 | +#endif 308 | return; 309 | 310 | /* 311 | @@ -1728,13 +1747,16 @@ void scheduler_ipi(void) 312 | /* 313 | * Check if someone kicked us for doing the nohz idle load balance. 
314 | */ 315 | +#ifndef CONFIG_BLD 316 | if (unlikely(got_nohz_idle_kick())) { 317 | this_rq()->idle_balance = 1; 318 | raise_softirq_irqoff(SCHED_SOFTIRQ); 319 | } 320 | +#endif 321 | irq_exit(); 322 | } 323 | 324 | +#ifndef CONFIG_BLD 325 | static void ttwu_queue_remote(struct task_struct *p, int cpu) 326 | { 327 | struct rq *rq = cpu_rq(cpu); 328 | @@ -1747,6 +1769,13 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) 329 | } 330 | } 331 | 332 | +#endif 333 | + 334 | +bool cpus_share_cache(int this_cpu, int that_cpu) 335 | +{ 336 | + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 337 | +} 338 | + 339 | void wake_up_if_idle(int cpu) 340 | { 341 | struct rq *rq = cpu_rq(cpu); 342 | @@ -1770,18 +1799,13 @@ void wake_up_if_idle(int cpu) 343 | out: 344 | rcu_read_unlock(); 345 | } 346 | - 347 | -bool cpus_share_cache(int this_cpu, int that_cpu) 348 | -{ 349 | - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 350 | -} 351 | #endif /* CONFIG_SMP */ 352 | 353 | static void ttwu_queue(struct task_struct *p, int cpu) 354 | { 355 | struct rq *rq = cpu_rq(cpu); 356 | 357 | -#if defined(CONFIG_SMP) 358 | +#if defined(CONFIG_SMP) && !defined(CONFIG_BLD) 359 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 360 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ 361 | ttwu_queue_remote(p, cpu); 362 | @@ -2292,7 +2316,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) 363 | * Silence PROVE_RCU. 364 | */ 365 | raw_spin_lock_irqsave(&p->pi_lock, flags); 366 | - set_task_cpu(p, cpu); 367 | + __set_task_cpu(p, cpu); 368 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); 369 | 370 | #ifdef CONFIG_SCHED_INFO 371 | @@ -2837,7 +2861,14 @@ void sched_exec(void) 372 | int dest_cpu; 373 | 374 | raw_spin_lock_irqsave(&p->pi_lock, flags); 375 | +#ifndef CONFIG_BLD 376 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 377 | +#else 378 | + if (dl_task(p)) 379 | + dest_cpu = task_cpu(p); 380 | + else 381 | + dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0); 382 | +#endif 383 | if (dest_cpu == smp_processor_id()) 384 | goto unlock; 385 | 386 | @@ -2926,8 +2957,10 @@ void scheduler_tick(void) 387 | 388 | #ifdef CONFIG_SMP 389 | rq->idle_balance = idle_cpu(cpu); 390 | +#ifndef CONFIG_BLD 391 | trigger_load_balance(rq); 392 | #endif 393 | +#endif 394 | rq_last_tick_reset(rq); 395 | } 396 | 397 | @@ -7359,6 +7392,15 @@ void __init sched_init(void) 398 | #endif 399 | init_rq_hrtick(rq); 400 | atomic_set(&rq->nr_iowait, 0); 401 | +#ifdef CONFIG_BLD 402 | + INIT_LIST_HEAD(&rq->cfs.bld_cfs_list); 403 | + list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head); 404 | + rq->cfs.pos = 0; 405 | + 406 | + INIT_LIST_HEAD(&rq->rt.bld_rt_list); 407 | + list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head); 408 | + rq->rt.lowbit = INT_MAX; 409 | +#endif 410 | } 411 | 412 | set_load_weight(&init_task); 413 | @@ -7399,6 +7441,9 @@ void __init sched_init(void) 414 | init_sched_fair_class(); 415 | 416 | scheduler_running = 1; 417 | +#ifdef CONFIG_BLD 418 | + printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n"); 419 | +#endif 420 | } 421 | 422 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 423 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 424 | index e7dd0ec..555572f 100644 425 | --- a/kernel/sched/fair.c 426 | +++ b/kernel/sched/fair.c 427 | @@ -4746,6 +4746,7 @@ static void task_waking_fair(struct task_struct *p) 428 | record_wakee(p); 429 | } 430 | 431 | +#ifndef CONFIG_BLD 432 | #ifdef 
CONFIG_FAIR_GROUP_SCHED 433 | /* 434 | * effective_load() calculates the load change as seen from the root_task_group 435 | @@ -5248,6 +5249,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f 436 | 437 | return new_cpu; 438 | } 439 | +#endif /* CONFIG_BLD */ 440 | 441 | /* 442 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 443 | @@ -5552,6 +5554,7 @@ idle: 444 | * further scheduler activity on it and we're being very careful to 445 | * re-start the picking loop. 446 | */ 447 | +#ifndef CONFIG_BLD 448 | lockdep_unpin_lock(&rq->lock); 449 | new_tasks = idle_balance(rq); 450 | lockdep_pin_lock(&rq->lock); 451 | @@ -5565,7 +5568,7 @@ idle: 452 | 453 | if (new_tasks > 0) 454 | goto again; 455 | - 456 | +#endif 457 | return NULL; 458 | } 459 | 460 | @@ -6226,8 +6229,9 @@ static unsigned long task_h_load(struct task_struct *p) 461 | } 462 | #endif 463 | 464 | -/********** Helpers for find_busiest_group ************************/ 465 | +#ifndef CONFIG_BLD 466 | 467 | +/********** Helpers for find_busiest_group ************************/ 468 | enum group_type { 469 | group_other = 0, 470 | group_imbalanced, 471 | @@ -6318,6 +6322,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, 472 | 473 | return load_idx; 474 | } 475 | +#endif /* CONFIG_BLD */ 476 | 477 | static unsigned long scale_rt_capacity(int cpu) 478 | { 479 | @@ -6426,6 +6431,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) 480 | sdg->sgc->capacity = capacity; 481 | } 482 | 483 | +#ifndef CONFIG_BLD 484 | /* 485 | * Check whether the capacity of the rq has been noticeably reduced by side 486 | * activity. The imbalance_pct is used for the threshold. 487 | @@ -7659,6 +7665,8 @@ static inline int on_null_domain(struct rq *rq) 488 | return unlikely(!rcu_dereference_sched(rq->sd)); 489 | } 490 | 491 | +#endif /* CONFIG_BLD */ 492 | + 493 | #ifdef CONFIG_NO_HZ_COMMON 494 | /* 495 | * idle load balancing details 496 | @@ -7666,12 +7674,39 @@ static inline int on_null_domain(struct rq *rq) 497 | * needed, they will kick the idle load balancer, which then does idle 498 | * load balancing for all the idle CPUs. 499 | */ 500 | +#ifndef CONFIG_BLD 501 | static struct { 502 | cpumask_var_t idle_cpus_mask; 503 | atomic_t nr_cpus; 504 | unsigned long next_balance; /* in jiffy units */ 505 | } nohz ____cacheline_aligned; 506 | 507 | +static inline void nohz_balance_exit_idle(int cpu) 508 | +{ 509 | + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 510 | + /* 511 | + * Completely isolated CPUs don't ever set, so we must test. 
512 | + */ 513 | + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 514 | + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 515 | + atomic_dec(&nohz.nr_cpus); 516 | + } 517 | + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 518 | + } 519 | +} 520 | + 521 | +static int sched_ilb_notifier(struct notifier_block *nfb, 522 | + unsigned long action, void *hcpu) 523 | +{ 524 | + switch (action & ~CPU_TASKS_FROZEN) { 525 | + case CPU_DYING: 526 | + nohz_balance_exit_idle(smp_processor_id()); 527 | + return NOTIFY_OK; 528 | + default: 529 | + return NOTIFY_DONE; 530 | + } 531 | +} 532 | + 533 | static inline int find_new_ilb(void) 534 | { 535 | int ilb = cpumask_first(nohz.idle_cpus_mask); 536 | @@ -7709,20 +7744,7 @@ static void nohz_balancer_kick(void) 537 | smp_send_reschedule(ilb_cpu); 538 | return; 539 | } 540 | - 541 | -static inline void nohz_balance_exit_idle(int cpu) 542 | -{ 543 | - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 544 | - /* 545 | - * Completely isolated CPUs don't ever set, so we must test. 546 | - */ 547 | - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 548 | - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 549 | - atomic_dec(&nohz.nr_cpus); 550 | - } 551 | - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 552 | - } 553 | -} 554 | +#endif /* CONFIG_BLD */ 555 | 556 | static inline void set_cpu_sd_state_busy(void) 557 | { 558 | @@ -7764,6 +7786,7 @@ unlock: 559 | */ 560 | void nohz_balance_enter_idle(int cpu) 561 | { 562 | +#ifndef CONFIG_BLD 563 | /* 564 | * If this cpu is going down, then nothing needs to be done. 565 | */ 566 | @@ -7782,23 +7805,10 @@ void nohz_balance_enter_idle(int cpu) 567 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 568 | atomic_inc(&nohz.nr_cpus); 569 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 570 | -} 571 | - 572 | -static int sched_ilb_notifier(struct notifier_block *nfb, 573 | - unsigned long action, void *hcpu) 574 | -{ 575 | - switch (action & ~CPU_TASKS_FROZEN) { 576 | - case CPU_DYING: 577 | - nohz_balance_exit_idle(smp_processor_id()); 578 | - return NOTIFY_OK; 579 | - default: 580 | - return NOTIFY_DONE; 581 | - } 582 | +#endif 583 | } 584 | #endif 585 | 586 | -static DEFINE_SPINLOCK(balancing); 587 | - 588 | /* 589 | * Scale the max load_balance interval with the number of CPUs in the system. 590 | * This trades load-balance latency on larger machines for less cross talk. 591 | @@ -7808,6 +7818,9 @@ void update_max_interval(void) 592 | max_load_balance_interval = HZ*num_online_cpus()/10; 593 | } 594 | 595 | +#ifndef CONFIG_BLD 596 | +static DEFINE_SPINLOCK(balancing); 597 | + 598 | /* 599 | * It checks each scheduling domain to see if it is due to be balanced, 600 | * and initiates a balancing operation if so. 
601 | @@ -8095,6 +8108,7 @@ void trigger_load_balance(struct rq *rq) 602 | nohz_balancer_kick(); 603 | #endif 604 | } 605 | +#endif /* CONFIG_BLD */ 606 | 607 | static void rq_online_fair(struct rq *rq) 608 | { 609 | @@ -8531,7 +8545,9 @@ const struct sched_class fair_sched_class = { 610 | .put_prev_task = put_prev_task_fair, 611 | 612 | #ifdef CONFIG_SMP 613 | +#ifndef CONFIG_BLD 614 | .select_task_rq = select_task_rq_fair, 615 | +#endif 616 | .migrate_task_rq = migrate_task_rq_fair, 617 | 618 | .rq_online = rq_online_fair, 619 | @@ -8593,6 +8609,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) 620 | 621 | __init void init_sched_fair_class(void) 622 | { 623 | +#ifndef CONFIG_BLD 624 | #ifdef CONFIG_SMP 625 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 626 | 627 | @@ -8602,5 +8619,5 @@ __init void init_sched_fair_class(void) 628 | cpu_notifier(sched_ilb_notifier, 0); 629 | #endif 630 | #endif /* SMP */ 631 | - 632 | +#endif /* BLD */ 633 | } 634 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c 635 | index ec4f538d..4462bed 100644 636 | --- a/kernel/sched/rt.c 637 | +++ b/kernel/sched/rt.c 638 | @@ -1375,6 +1375,7 @@ static void yield_task_rt(struct rq *rq) 639 | #ifdef CONFIG_SMP 640 | static int find_lowest_rq(struct task_struct *task); 641 | 642 | +#ifndef CONFIG_BLD 643 | static int 644 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 645 | { 646 | @@ -1430,6 +1431,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 647 | out: 648 | return cpu; 649 | } 650 | +#endif /* CONFIG_BLD */ 651 | 652 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 653 | { 654 | @@ -2335,7 +2337,9 @@ const struct sched_class rt_sched_class = { 655 | .put_prev_task = put_prev_task_rt, 656 | 657 | #ifdef CONFIG_SMP 658 | +#ifndef CONFIG_BLD 659 | .select_task_rq = select_task_rq_rt, 660 | +#endif 661 | 662 | .set_cpus_allowed = set_cpus_allowed_common, 663 | .rq_online = rq_online_rt, 664 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 665 | index ec2e8d2..aaab735 100644 666 | --- a/kernel/sched/sched.h 667 | +++ b/kernel/sched/sched.h 668 | @@ -408,9 +408,8 @@ struct cfs_rq { 669 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 670 | #endif /* CONFIG_SMP */ 671 | 672 | -#ifdef CONFIG_FAIR_GROUP_SCHED 673 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 674 | - 675 | +#ifdef CONFIG_FAIR_GROUP_SCHED 676 | /* 677 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 678 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 679 | @@ -434,6 +433,11 @@ struct cfs_rq { 680 | struct list_head throttled_list; 681 | #endif /* CONFIG_CFS_BANDWIDTH */ 682 | #endif /* CONFIG_FAIR_GROUP_SCHED */ 683 | + 684 | +#ifdef CONFIG_BLD 685 | + struct list_head bld_cfs_list; 686 | + char pos; 687 | +#endif 688 | }; 689 | 690 | static inline int rt_bandwidth_enabled(void) 691 | @@ -479,12 +483,16 @@ struct rt_rq { 692 | /* Nests inside the rq lock: */ 693 | raw_spinlock_t rt_runtime_lock; 694 | 695 | + struct rq *rq; 696 | #ifdef CONFIG_RT_GROUP_SCHED 697 | unsigned long rt_nr_boosted; 698 | 699 | - struct rq *rq; 700 | struct task_group *tg; 701 | #endif 702 | +#ifdef CONFIG_BLD 703 | + struct list_head bld_rt_list; 704 | + int lowbit; 705 | +#endif 706 | }; 707 | 708 | /* Deadline class' related fields in a runqueue */ 709 | --------------------------------------------------------------------------------
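
A note on the algorithm, for readers skimming the patches above: every version of bld.h implements the same core idea. Each CPU's runqueue sits on a global list that is kept roughly ordered by load; the enqueue/dequeue hooks (bld_track_load_activate()/bld_track_load_deactivate()) move a runqueue toward the head of the list when it becomes the lightest and toward the tail when it becomes the heaviest, and a wakeup simply takes the first online CPU from the head of that list (bld_pick_cpu_cfs()), which is why the patches compile out trigger_load_balance() and the nohz idle-balance machinery. The stand-alone C sketch below is only an illustration of that ordering trick, not code from the patches: the toy_* names are invented, it runs in user space, and it keeps the list fully sorted, whereas the real code keeps it only approximately ordered via the pos and lowbit hints and takes cfs_list_lock/rt_list_lock around list updates.

/*
 * Toy user-space model of the Barbershop ordering used by these patches.
 * All toy_* names are invented for illustration; the real code lives in
 * kernel/sched/bld.h and operates on struct rq / struct cfs_rq.
 */
#include <stdio.h>

#define NCPU 4

struct toy_rq {
	int cpu;
	unsigned long load;
	struct toy_rq *prev, *next;	/* doubly linked; head = least loaded */
};

static struct toy_rq rqs[NCPU];
static struct toy_rq *head;

/* Unlink rq and re-insert it so the list stays ordered by load. */
static void reposition(struct toy_rq *rq)
{
	struct toy_rq *it;

	if (rq->prev)
		rq->prev->next = rq->next;
	if (rq->next)
		rq->next->prev = rq->prev;
	if (head == rq)
		head = rq->next;
	rq->prev = rq->next = NULL;

	if (!head || rq->load <= head->load) {	/* new lightest runqueue */
		rq->next = head;
		if (head)
			head->prev = rq;
		head = rq;
		return;
	}
	for (it = head; it->next && it->next->load < rq->load; it = it->next)
		;
	rq->next = it->next;
	if (it->next)
		it->next->prev = rq;
	it->next = rq;
	rq->prev = it;
}

/* Model of the bld_track_load_activate()/_deactivate() idea: load changed, re-order. */
static void toy_set_load(int cpu, unsigned long load)
{
	rqs[cpu].load = load;
	reposition(&rqs[cpu]);
}

/* Model of the bld_pick_cpu_cfs() idea: the least loaded runqueue sits at the head. */
static int toy_pick_cpu(void)
{
	return head->cpu;
}

int main(void)
{
	int i;

	for (i = 0; i < NCPU; i++) {
		rqs[i].cpu = i;
		toy_set_load(i, 0);
	}

	toy_set_load(0, 3072);	/* CPU 0 gets busy   */
	toy_set_load(1, 1024);	/* CPU 1 mildly busy */
	toy_set_load(2, 2048);

	printf("wake next task on CPU %d\n", toy_pick_cpu());	/* CPU 3, still idle */
	return 0;
}

Built with any C compiler, the sketch prints "wake next task on CPU 3", the idle runqueue at the head of the list. In the patches the equivalent pick additionally checks cpu_online() while walking the list, and RT tasks use the separate rt_rq list keyed on lowbit rather than cfs load.weight.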