├── README.md ├── linux_cpu_scheduler.lyx ├── linux_cpu_scheduler.pdf └── sched_comments.patch /README.md: -------------------------------------------------------------------------------- 1 | # Understanding the Linux 2.6.8.1 CPU Scheduler 2 | 3 | This paper is intended to be an introduction to the Linux 2.6.8.1 CPU scheduler implementation. 4 | 5 | It is available in LyX and PDF formats. 6 | 7 | There is also a patch against the Linux 2.6.8.1 sched.c that adds more comments. This did not receive as much attention to detail as the paper, but it should be quite accurate. 8 | -------------------------------------------------------------------------------- /linux_cpu_scheduler.lyx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bdaehlie/linux-cpu-scheduler-docs/33f52ff95ac23fa0cdb53b18a88531e3fb1b9754/linux_cpu_scheduler.lyx -------------------------------------------------------------------------------- /linux_cpu_scheduler.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bdaehlie/linux-cpu-scheduler-docs/33f52ff95ac23fa0cdb53b18a88531e3fb1b9754/linux_cpu_scheduler.pdf -------------------------------------------------------------------------------- /sched_comments.patch: -------------------------------------------------------------------------------- 1 | --- /Users/josh/Desktop/cs_capstone/reference/linux-2.6.8.1-unpatched/kernel/sched.c Sat Aug 14 05:55:59 2004 2 | +++ /Users/josh/Desktop/cs_capstone/josh_capstone_work/sched_commented_2.6.8.1.c Sun Jan 2 03:24:40 2005 3 | @@ -18,6 +18,24 @@ 4 | * 2004-04-02 Scheduler domains code by Nick Piggin 5 | */ 6 | 7 | +/* 8 | + * Additional comments by Josh Aas. 9 | + * Copyright (c)2004 Silicon Graphics, Inc. (SGI) 10 | + * 11 | + * Comments are situated above what they describe. 12 | + * 13 | + * Abbreviations: 14 | + * RT - real-time (as in a "real-time process") 15 | + * UP - uniprocessor 16 | + * 17 | + * Notes: 18 | + * - SMT means simultaneous multithreading. This is not the same thing as 19 | + * SMP. An example of an SMT system is an Intel Pentium 4 Hyper-Threading (HT) 20 | + * enabled processor. Basically, a single SMT chip can run multiple threads, 21 | + * which has some interesting scheduler implications since the threads 22 | + * share certain physical CPU resources. 23 | + */ 24 | + 25 | #include 26 | #include 27 | #include 28 | @@ -44,6 +62,18 @@ 29 | 30 | #include 31 | 32 | +/* 33 | + * NUMA architectures have groups of CPUs (and memory) organized 34 | + * into nodes. These macros are for getting the CPU mask for 35 | + * a node that a CPU belongs to. 36 | + * 37 | + * If the kernel is compiled for a NUMA architecture, do a node lookup 38 | + * by getting a CPU's node and then getting the CPU mask/map for 39 | + * that node. If non-NUMA, there will only be one mask/map, so insert that. 40 | + * 41 | + * Note that these NUMA macros are not used. They should probably have been 42 | + * removed from this file. 43 | + */ 44 | #ifdef CONFIG_NUMA 45 | #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) 46 | #else 47 | @@ -54,6 +84,25 @@ 48 | * Convert user-nice values [ -20 ... 0 ... 19 ] 49 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 50 | * and back. 51 | + * 52 | + * PRIO values are the priority values that the Linux scheduler uses internally. 
53 | + * Possible PRIO values for RT tasks are 0 through (MAX_RT_PRIO - 1), and possible PRIO 54 | + * values for non-RT tasks are MAX_RT_PRIO through (MAX_PRIO - 1). The lower a task's 55 | + * PRIO value, the higher its priority. With this setup, RT tasks will always have 56 | + * a higher priority than non-RT tasks. 57 | + * 58 | + * For non-RT tasks, in order to convert a user-nice value to a PRIO value, one would 59 | + * start with MAX_RT_PRIO, add the user-nice value, and then add 20 to make up for the 60 | + * fact that the highest possible priority user-nice value is -20. Converting from a 61 | + * PRIO value to a user-nice value is just the opposite. This is what the 62 | + * NICE_TO_PRIO(nice) and PRIO_TO_NICE(prio) macros do. 63 | + * 64 | + * TASK_NICE(p) simply gets the user-nice value for a given task. Each task has a 65 | + * static and a dynamic priority value. The static priority value is set by users 66 | + * via the nice() system call and ranges from -20 to 19. It is stored as a PRIO. The 67 | + * dynamic priority is based on a task's static priority, but it is modified based 68 | + * on interactivity. The dynamic priority is not relevant here, but is mentioned in 69 | + * order to explain why TASK_NICE(p) is determined by a task's static_prio field. 70 | + */ 71 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 72 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 73 | @@ -63,6 +112,23 @@ 74 | * 'User priority' is the nice value converted to something we 75 | * can work with better when scaling various scheduler parameters, 76 | * it's a [ 0 ... 39 ] range. 77 | + * 78 | + * USER_PRIO(p) takes an internal non-RT priority and returns its 79 | + * priority in terms of 0-39. It is only used by the other macros 80 | + * in this group as values of 0-39 don't mean anything in terms of 81 | + * internal PRIO values or user-nice values. It is simply a shortcut. 82 | + * 83 | + * TASK_USER_PRIO is not used by anything, and should be removed from 84 | + * the kernel. It is a useless calculation for the reason described above. 85 | + * All it does is return a task's USER_PRIO. 86 | + * 87 | + * MAX_USER_PRIO returns the total number of different priority levels 88 | + * non-RT processes can have. In this case, it resolves to 40 (100-139). 89 | + * 90 | + * AVG_TIMESLICE basically resolves to the half-way point between MIN_TIMESLICE 91 | + * and MAX_TIMESLICE. The reason it isn't written simply like that is so the 92 | + * algorithm can withstand changes to the priority system. It resolves to about 93 | + * 100ms. 94 | + */ 95 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 96 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 97 | @@ -72,6 +138,13 @@ 98 | 99 | /* 100 | * Some helpers for converting nanosecond timing to jiffy resolution 101 | + * 102 | + * A nanosecond (NS) is one-billionth of a second. A jiffy is a period of time 103 | + * calculated by 1/HZ, where HZ is the architecture-defined number of ticks 104 | + * per second. So, to convert from nanoseconds to jiffies, one divides a billion 105 | + * by HZ (which results in the number of nanoseconds in a jiffy), and divides 106 | + * the number of nanoseconds by that. Jiffies to NS is the reverse: multiply 107 | + * the number of jiffies by the number of nanoseconds in a jiffy. 
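(Illustrative aside, not part of sched.c or this patch: the nice/PRIO and nanosecond/jiffy conversions described above can be sanity-checked with a small standalone C program. MAX_RT_PRIO (100), MAX_PRIO (140), and HZ (assumed 1000 here) follow this kernel's usual values.)

#include <stdio.h>

#define MAX_RT_PRIO 100                 /* prio 0..99 are reserved for RT tasks */
#define MAX_PRIO    140                 /* prio 100..139 are for normal tasks   */
#define HZ          1000                /* assumed tick rate for this sketch    */

#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)

#define NS_TO_JIFFIES(t)   ((t) / (1000000000 / HZ))
#define JIFFIES_TO_NS(t)   ((t) * (1000000000 / HZ))

int main(void)
{
        /* nice -20..19 maps onto prio 100..139; lower prio means higher priority */
        printf("nice -20 -> prio %d\n", NICE_TO_PRIO(-20));   /* 100 */
        printf("nice   0 -> prio %d\n", NICE_TO_PRIO(0));     /* 120 */
        printf("prio 139 -> nice %d\n", PRIO_TO_NICE(139));   /* 19  */

        /* 250 ms expressed in nanoseconds is 250 jiffies at HZ=1000 */
        printf("250000000 ns = %llu jiffies\n",
               (unsigned long long)NS_TO_JIFFIES(250000000ULL));
        return 0;
}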
108 | */ 109 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 110 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 111 | @@ -79,9 +152,46 @@ 112 | /* 113 | * These are the 'tuning knobs' of the scheduler: 114 | * 115 | - * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, 116 | - * maximum timeslice is 200 msecs. Timeslices get refilled after 117 | - * they expire. 118 | + * MIN_TIMESLICE is the minimum timeslice that a task can be given. It resolves to about 10ms. 119 | + * 120 | + * MAX_TIMESLICE is the maximum timeslice that a task can be given. It resolves to about 200ms. 121 | + * 122 | + * ON_RUNQUEUE_WEIGHT ... 123 | + * 124 | + * CHILD_PENALTY is the penalty that the sleep_avg of forked child tasks gets 125 | + * in order to prevent very interactive tasks from spawning other very interactive 126 | + * tasks. 127 | + * 128 | + * PARENT_PENALTY is the penalty that the sleep_avg of parents who forked tasks 129 | + * gets in order to prevent very interactive tasks from spawning other very interactive 130 | + * tasks. 131 | + * 132 | + * EXIT_WEIGHT ... 133 | + * 134 | + * PRIO_BONUS_RATIO is the ratio used to determine MAX_BONUS. 135 | + * 136 | + * MAX_BONUS ... MAX_USER_PRIO resolves to 40, and PRIO_BONUS_RATIO is 25. 137 | + * So essentially this means that the max bonus that can be 138 | + * given to a task is 25% of the total non-RT priority 139 | + * range. Since there are 40 possible non-RT priorities, this 140 | + * resolves to 10. 141 | + * 142 | + * INTERACTIVE_DELTA is the static component used to determine whether or not a task 143 | + * should be considered interactive. The higher this is, the more difficult it is for 144 | + * tasks to be considered interactive. See the DELTA and TASK_INTERACTIVE macros for 145 | + * more information. 146 | + * 147 | + * MAX_SLEEP_AVG is the number of jiffies that is the maximum average sleep time for 148 | + * a task. The higher a task's sleep_avg, the more interactive it is, so this essentially 149 | + * puts a limit on how interactive a task can be. 150 | + * 151 | + * STARVATION_LIMIT is the time limit for which a runnable task may be deprived of 152 | + * CPU time before it is considered to be starving. 153 | + * 154 | + * NS_MAX_SLEEP_AVG is the same as MAX_SLEEP_AVG, but in nanoseconds. 155 | + * 156 | + * CREDIT_LIMIT is used to determine whether or not a task has high or low interactivity 157 | + * credit. See the macros HIGH_CREDIT and LOW_CREDIT. 158 | + */ 159 | #define MIN_TIMESLICE ( 10 * HZ / 1000) 160 | #define MAX_TIMESLICE (200 * HZ / 1000) 161 | @@ -101,7 +211,9 @@ 162 | * If a task is 'interactive' then we reinsert it in the active 163 | * array after it has expired its current timeslice. (it will not 164 | * continue to run immediately, it will still roundrobin with 165 | - * other interactive tasks.) 166 | + * other interactive tasks.) This behavior does not prevent the expired 167 | + * and unexpired queues from ever being swapped - they will get swapped 168 | + * as soon as something in the expired queue is going to starve. 169 | * 170 | * This part scales the interactivity limit depending on niceness. 171 | * 172 | @@ -116,7 +228,9 @@ 173 | * 174 | * (the X axis represents the possible -5 ... 0 ... +5 dynamic 175 | * priority range a task can explore, a value of '1' means the 176 | - * task is rated interactive.) 177 | + * task is rated interactive. So - there are 11 columns. 
The middle 178 | + * column is whether or not a task with a certain user-nice level 179 | + * is considered interactive if given no + or - bonus at all.) 180 | * 181 | * Ie. nice +19 tasks can never get 'interactive' enough to be 182 | * reinserted into the active array. And only heavily CPU-hog nice -20 183 | @@ -125,10 +239,26 @@ 184 | * too hard. 185 | */ 186 | 187 | +/* 188 | + * The process's current bonus is its sleep average in jiffies times MAX_BONUS 189 | + * divided by MAX_SLEEP_AVG. Essentially it scales a process's sleep average into 190 | + * the range 0 to MAX_BONUS. 191 | + */ 192 | #define CURRENT_BONUS(p) \ 193 | (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ 194 | MAX_SLEEP_AVG) 195 | 196 | +/* 197 | + * If an interactive task has too long a timeslice, it may 198 | + * be preempted by a task of equal priority. The task 199 | + * does not lose its timeslice, it is just put on the bottom of the 200 | + * list of tasks of its priority waiting to run. If there 201 | + * was a task of higher priority, it would have already preempted 202 | + * a given task. TIMESLICE_GRANULARITY is the time limit for 203 | + * what is considered "too long" a timeslice. It is called granularity 204 | + * because the timeslice is effectively broken up if it is longer than 205 | + * TIMESLICE_GRANULARITY. 206 | + */ 207 | #ifdef CONFIG_SMP 208 | #define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ 209 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ 210 | @@ -138,12 +268,38 @@ 211 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) 212 | #endif 213 | 214 | +/* 215 | + * This macro is used in the TASK_INTERACTIVE macro to decide if a 216 | + * task should be considered interactive. SCALE calculates how much 217 | + * higher in priority a task must be from its nice value, minus the 218 | + * INTERACTIVE_DELTA, in order to be considered interactive. The idea 219 | + * is that tasks with a higher priority nice value should not need to 220 | + * be given as much of a bonus in order to be considered interactive 221 | + * as tasks given a lower priority nice value. So, a task with a -10 222 | + * nice value will be more easily considered interactive than a task 223 | + * with a +10 nice value. Since INTERACTIVE_DELTA is static, SCALE 224 | + * provides a value to add to it in order to do the interactivity scaling. 225 | + */ 226 | #define SCALE(v1,v1_max,v2_max) \ 227 | (v1) * (v2_max) / (v1_max) 228 | 229 | +/* 230 | + * A task must be DELTA higher in priority than its nice 231 | + * value in order to be considered interactive. This value 232 | + * is the combination of the scaled factor and the constant 233 | + * INTERACTIVE_DELTA factor. 234 | + */ 235 | #define DELTA(p) \ 236 | (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 237 | 238 | +/* 239 | + * This macro returns whether or not a task should be considered 240 | + * interactive. If a task's priority value (lower values are higher 241 | + * priority) is less than or equal to its static_prio (i.e. nice value) 242 | + * minus DELTA, then it is interactive. This is because tasks are given 243 | + * priority-raising bonuses (prio lowering) based on heuristics 244 | + * that measure characteristics of interactivity. 
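(Illustrative aside, not part of the patch: a simplified userspace model of the bonus/interactivity arithmetic described above. The struct task type and helper names are stand-ins, the constants follow the defaults discussed in these comments (MAX_BONUS 10, INTERACTIVE_DELTA 2, MAX_SLEEP_AVG about 1000 jiffies), and the clamping to the non-RT priority range done by effective_prio() is omitted.)

#include <stdio.h>

#define MAX_RT_PRIO       100
#define MAX_PRIO          140
#define MAX_USER_PRIO     (MAX_PRIO - MAX_RT_PRIO)                  /* 40 */
#define PRIO_BONUS_RATIO  25
#define MAX_BONUS         (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)  /* 10 */
#define INTERACTIVE_DELTA 2
#define MAX_SLEEP_AVG     1000   /* jiffies; roughly AVG_TIMESLICE * MAX_BONUS at HZ=1000 */

/* toy stand-in for task_struct: sleep_avg is kept in jiffies here, not nanoseconds */
struct task { int static_prio; int prio; unsigned long sleep_avg; };

/* sleep_avg scaled onto 0..MAX_BONUS, like CURRENT_BONUS() */
static int current_bonus(const struct task *p)
{
        return (int)(p->sleep_avg * MAX_BONUS / MAX_SLEEP_AVG);
}

/* how far prio must sit below static_prio to count as interactive, like DELTA() */
static int delta(const struct task *p)
{
        int nice = p->static_prio - MAX_RT_PRIO - 20;         /* TASK_NICE()              */
        return nice * MAX_BONUS / 40 + INTERACTIVE_DELTA;     /* SCALE(nice, 40, MAX_BONUS) */
}

/* TASK_INTERACTIVE(): has the dynamic priority earned enough of a bonus over static_prio? */
static int task_interactive(const struct task *p)
{
        return p->prio <= p->static_prio - delta(p);
}

int main(void)
{
        /* a nice 0 task (static_prio 120) that has been sleeping about 80% of the time */
        struct task p = { 120, 0, 800 };
        int bonus = current_bonus(&p) - MAX_BONUS / 2;        /* as in effective_prio() */

        p.prio = p.static_prio - bonus;
        printf("bonus=%d prio=%d interactive=%d\n", bonus, p.prio, task_interactive(&p));
        return 0;
}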
245 | + */ 246 | #define TASK_INTERACTIVE(p) \ 247 | ((p)->prio <= (p)->static_prio - DELTA(p)) 248 | 249 | @@ -157,6 +313,10 @@ 250 | #define LOW_CREDIT(p) \ 251 | ((p)->interactive_credit < -CREDIT_LIMIT) 252 | 253 | +/* 254 | + * just tells whether or not there is a task in rq that 255 | + * should preempt the task p. 256 | + */ 257 | #define TASK_PREEMPTS_CURR(p, rq) \ 258 | ((p)->prio < (rq)->curr->prio) 259 | 260 | @@ -170,30 +330,91 @@ 261 | * 262 | * task_timeslice() is the interface that is used by the scheduler. 263 | */ 264 | - 265 | -#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ 266 | - ((MAX_TIMESLICE - MIN_TIMESLICE) * \ 267 | - (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) 268 | +#define BASE_TIMESLICE(p) (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) * \ 269 | + (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) 270 | 271 | static unsigned int task_timeslice(task_t *p) 272 | { 273 | return BASE_TIMESLICE(p); 274 | } 275 | 276 | +/* 277 | + * The task_hot macro takes a process, the current time, and a scheduler domain. 278 | + * A scheduler domain is essentially a grouping of processors that share cache. 279 | + * task_hot determines whether or not cache in a scheduler domain is likely 280 | + * to contain data that the given process could use. The value cache_hot_time 281 | + * is the amount of time that data is likely to remain in the cache. Thus, if 282 | + * the time between when the process was last run and now is less than that 283 | + * amount of time, it is likely that the cache will still be hot (i.e. contain 284 | + * relevant data). 285 | + */ 286 | #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) 287 | 288 | +/* These are the runqueue data structures: */ 289 | + 290 | /* 291 | - * These are the runqueue data structures: 292 | + * The BITMAP_SIZE macro resolves to the number of long integers 293 | + * required to create a bitmap with one bit per scheduler priority 294 | + * (there are MAX_PRIO priorities). 295 | + * 296 | + * The "...+1+7)/8" part might seem odd. MAX_PRIO + 1 covers all priorities, 297 | + * adding 7 ensures that the integer division by 8 rounds up to a whole number of bytes. 298 | + */ 299 | - 300 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) 301 | 302 | typedef struct runqueue runqueue_t; 303 | 304 | +/* 305 | + * The prio_array data structure is extremely important as it is what allows 306 | + * the Linux scheduling algorithm to perform in O(1) time. 307 | + * 308 | + * The basic structure in the Linux scheduler is the runqueue, defined below. 309 | + * There is one runqueue per processor, and within that runqueue there are two 310 | + * structures of type prio_array. One is for tasks that have not used up their 311 | + * timeslice yet, the other is for tasks that have used up their timeslice. The 312 | + * former are considered active, the latter expired. Note that active and expired 313 | + * have nothing to do with whether or not a task is runnable - active simply means 314 | + * that since the last time timeslices were allocated, a given task in that queue 315 | + * has not used up its timeslice. A task in the active list still has time available 316 | + * on the CPU, tasks in the expired list have used up their timeslice. 317 | + * 318 | + * The nr_active value stores the number of runnable tasks in the prio_array. 
The 319 | + * bitmap is a string of bits, one for each priority level on the system (140 by 320 | + * default), that indicates whether or not there are any tasks in the prio_array 321 | + * at a given priority level. The queue value is an array of pointers to arrays 322 | + * that store all tasks at a given priority level. 323 | + * 324 | + * So if there is only one runnable task in the prio_array, nr_active will be equal to 325 | + * one. If that task is not RT, and it has a nice value of 0, there will be 326 | + * a one in the 120th position of the bitmap to indicate that there is a task in the 327 | + * prio_array at that priority level. The queue array would have a pointer at the 120th 328 | + * position pointing to an array of length 1, its single element being the task in question. 329 | + * 330 | + * This is very useful because in order to determine the next task to run, the scheduler simply 331 | + * 1) looks to see if there are any runnable tasks in its active prio_array (i.e. is nr_active > 0) 332 | + * 2) if so, go to step 3 otherwise go to step 6 333 | + * 3) find the first 1 in the active prio_array's bitmap. There must be a 1 somewhere since 334 | + * we know that there is a task in the prio_array and it must have a priority level. 335 | + * 4) run the first task in the array at the position in the prio_array's queue equal to 336 | + * the first 1 found in the bitmap. 337 | + * 5) when the task is done running for some reason, recalculate its new timeslice and put it 338 | + * in the expired prio_array. decrement nr_active in the active prio_array, and increment 339 | + * it in the expired prio_array. if the task was the last task at a given priority, 340 | + * clear the priority's bit in the active prio_array and make sure the priority's bit 341 | + * is set in the expired prio_array. repeat from step 1 until no tasks exist in the active 342 | + * prio_array. 343 | + * 6) when no tasks exist in the active prio_array, swap the active and expired prio_arrays 344 | + * and start over again. since timeslices are recalculated for each process when 345 | + * it is put onto the expired array, the swap of prio_arrays is fast (i.e. no 346 | + * sitting around recalculating a timeslice for every task) 347 | + * 348 | + * This results in O(1) behavior since no step in the process requires iterating over a number 349 | + * of tasks that grows larger when the total number of tasks grows. 350 | + */ 351 | struct prio_array { 352 | - unsigned int nr_active; 353 | - unsigned long bitmap[BITMAP_SIZE]; 354 | - struct list_head queue[MAX_PRIO]; 355 | + unsigned int nr_active; /* number of runnable tasks in this prio_array */ 356 | + unsigned long bitmap[BITMAP_SIZE]; /* bitmap showing which priority levels contain tasks */ 357 | + struct list_head queue[MAX_PRIO]; /* an array of list heads, one for each priority on the system */ 358 | }; 359 | 360 | /* 361 | @@ -204,50 +425,61 @@ struct prio_array { 362 | * acquire operations must be ordered by ascending &runqueue. 363 | */ 364 | struct runqueue { 365 | - spinlock_t lock; 366 | + spinlock_t lock; /* lock that protects this runqueue */ 367 | 368 | - /* 369 | - * nr_running and cpu_load should be in the same cacheline because 370 | - * remote CPUs use both these fields when doing load calculation. 371 | - */ 372 | - unsigned long nr_running; 373 | + /* 374 | + * nr_running and cpu_load should be in the same cacheline because 375 | + * remote CPUs use both these fields when doing load calculation. 
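(Illustrative aside, not part of the patch: a toy userspace model of the prio_array bookkeeping and the O(1) lookup walked through above. It replaces the kernel's per-priority task lists with simple counters and uses a GCC builtin instead of sched_find_first_bit(), but the enqueue/dequeue/bitmap logic mirrors the description.)

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MAX_PRIO 140
#define WORDS    ((MAX_PRIO + 63) / 64)

struct prio_array {
        unsigned int nr_active;           /* runnable tasks in this array            */
        uint64_t bitmap[WORDS];           /* bit set => some task at that priority   */
        unsigned int queue_len[MAX_PRIO]; /* stand-in for the per-priority task lists */
};

static void enqueue(struct prio_array *a, int prio)
{
        a->queue_len[prio]++;
        a->bitmap[prio / 64] |= 1ULL << (prio % 64);
        a->nr_active++;
}

static void dequeue(struct prio_array *a, int prio)
{
        if (--a->queue_len[prio] == 0)
                a->bitmap[prio / 64] &= ~(1ULL << (prio % 64));
        a->nr_active--;
}

/* the O(1) step: scan a fixed number of bitmap words, never a task list */
static int find_first_prio(const struct prio_array *a)
{
        for (int w = 0; w < WORDS; w++)
                if (a->bitmap[w])
                        return w * 64 + __builtin_ctzll(a->bitmap[w]);
        return -1;                        /* array is empty */
}

int main(void)
{
        struct prio_array active;
        memset(&active, 0, sizeof(active));

        enqueue(&active, 120);            /* a nice 0 task   */
        enqueue(&active, 110);            /* a nice -10 task */
        printf("next prio to run: %d\n", find_first_prio(&active)); /* 110 */
        dequeue(&active, 110);
        printf("next prio to run: %d\n", find_first_prio(&active)); /* 120 */
        return 0;
}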
376 | + */ 377 | + unsigned long nr_running; /* number of runnable tasks */ 378 | #ifdef CONFIG_SMP 379 | - unsigned long cpu_load; 380 | + unsigned long cpu_load; /* this CPU's load */ 381 | #endif 382 | - unsigned long long nr_switches; 383 | - unsigned long expired_timestamp, nr_uninterruptible; 384 | - unsigned long long timestamp_last_tick; 385 | - task_t *curr, *idle; 386 | - struct mm_struct *prev_mm; 387 | - prio_array_t *active, *expired, arrays[2]; 388 | - int best_expired_prio; 389 | - atomic_t nr_iowait; 390 | + unsigned long long nr_switches; /* number of context switches */ 391 | + unsigned long expired_timestamp, nr_uninterruptible; /* time of last array swap and number of 392 | + uninterruptible processes in queue */ 393 | + unsigned long long timestamp_last_tick; /* timestamp of last scheduler tick */ 394 | + task_t *curr, *idle; /* this processor's current and idle tasks */ 395 | + struct mm_struct *prev_mm; /* the last running task's mm_struct */ 396 | + prio_array_t *active, *expired, arrays[2]; /* the active and expired prio_arrays */ 397 | + int best_expired_prio; /* highest priority that exists in the expired prio_array */ 398 | + atomic_t nr_iowait; /* number of tasks in the queue waiting on i/o */ 399 | 400 | #ifdef CONFIG_SMP 401 | - struct sched_domain *sd; 402 | + struct sched_domain *sd; /* in SMP systems there can be different scheduler domains */ 403 | 404 | /* For active balancing */ 405 | - int active_balance; 406 | + int active_balance; /* set when active load balancing is needed (see migration_thread()) */ 407 | int push_cpu; 408 | 409 | + /* the migration thread for the processor that this runqueue belongs to */ 410 | task_t *migration_thread; 411 | struct list_head migration_queue; 412 | #endif 413 | }; 414 | 415 | +/* Define one runqueue per CPU. */ 416 | static DEFINE_PER_CPU(struct runqueue, runqueues); 417 | 418 | +/* Iterate through domains that a CPU is a part of */ 419 | #define for_each_domain(cpu, domain) \ 420 | for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) 421 | 422 | +/* 423 | + * cpu_rq gets the runqueue for a given cpu 424 | + * 425 | + * this_rq gets the runqueue for the current cpu 426 | + * 427 | + * task_rq gets the runqueue that a certain task is in 428 | + * 429 | + * cpu_curr gets the current task on a given CPU 430 | + */ 431 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 432 | #define this_rq() (&__get_cpu_var(runqueues)) 433 | #define task_rq(p) cpu_rq(task_cpu(p)) 434 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 435 | 436 | -/* 437 | - * Default context-switch locking: 438 | - */ 439 | +/* Default context-switch locking */ 440 | #ifndef prepare_arch_switch 441 | # define prepare_arch_switch(rq, next) do { } while (0) 442 | # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) 443 | @@ -264,23 +496,28 @@ static runqueue_t *task_rq_lock(task_t * 444 | struct runqueue *rq; 445 | 446 | repeat_lock_task: 447 | - local_irq_save(*flags); 448 | - rq = task_rq(p); 449 | - spin_lock(&rq->lock); 450 | + local_irq_save(*flags); /* save irq flags */ 451 | + rq = task_rq(p); /* get runqueue for the task */ 452 | + spin_lock(&rq->lock); /* lock the runqueue */ 453 | + /* make sure the task is still on the runqueue we just locked */ 454 | if (unlikely(rq != task_rq(p))) { 455 | + /* if not, unlock and restore irq flags, then try again */ 456 | spin_unlock_irqrestore(&rq->lock, *flags); 457 | goto repeat_lock_task; 458 | } 459 | return rq; 460 | } 461 | 462 | +/* simply unlock a runqueue, not as touchy as locking! 
*/ 463 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 464 | { 465 | spin_unlock_irqrestore(&rq->lock, *flags); 466 | } 467 | 468 | /* 469 | - * rq_lock - lock a given runqueue and disable interrupts. 470 | + * rq_lock - lock the current processor's runqueue and disable interrupts. 471 | + * Since the current CPU is executing this code, its runqueue is easier to 472 | + * lock than if we were trying to lock some other CPU's runqueue (see task_rq_lock()). 473 | */ 474 | static runqueue_t *this_rq_lock(void) 475 | { 476 | @@ -293,6 +530,10 @@ static runqueue_t *this_rq_lock(void) 477 | return rq; 478 | } 479 | 480 | +/* 481 | + * A convenience method for making sure that runqueues get unlocked 482 | + * via the right lock mechanism. 483 | + */ 484 | static inline void rq_unlock(runqueue_t *rq) 485 | { 486 | spin_unlock_irq(&rq->lock); 487 | @@ -303,24 +544,37 @@ static inline void rq_unlock(runqueue_t 488 | */ 489 | static void dequeue_task(struct task_struct *p, prio_array_t *array) 490 | { 491 | - array->nr_active--; 492 | + array->nr_active--; /* one less active task in the array */ 493 | list_del(&p->run_list); 494 | + /* 495 | + * Clear the bit that says there is a task in the prio array with a certain priority 496 | + * if no more tasks at p's priority in the prio array. 497 | + */ 498 | if (list_empty(array->queue + p->prio)) 499 | __clear_bit(p->prio, array->bitmap); 500 | } 501 | 502 | static void enqueue_task(struct task_struct *p, prio_array_t *array) 503 | { 504 | + /* add the task at the right spot in the prio array */ 505 | list_add_tail(&p->run_list, array->queue + p->prio); 506 | + /* 507 | + * set the bit that says there is at least one task in the prio array 508 | + * with priority p->prio 509 | + */ 510 | __set_bit(p->prio, array->bitmap); 511 | - array->nr_active++; 512 | - p->array = array; 513 | + array->nr_active++; /* one more active task in the array */ 514 | + p->array = array; /* set the field in the task that says what prio array it is in */ 515 | } 516 | 517 | /* 518 | - * Used by the migration code - we pull tasks from the head of the 519 | - * remote queue so we want these tasks to show up at the head of the 520 | - * local queue: 521 | + * Migration code always has the highest priority. When CPUs go down (become 522 | + * idle), the idle task must get a higher priority than the migration code. 523 | + * This function is used by __activate_idle_task, which is called by 524 | + * sched_idle_next. sched_idle_next is called when CPUs get taken down. 525 | + * 526 | + * This is really similar to enqueue task, except it adds to the top of the list 527 | + * instead of the tail (list_add() instead of list_add_tail()). 528 | */ 529 | static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) 530 | { 531 | @@ -347,13 +601,28 @@ static inline void enqueue_task_head(str 532 | static int effective_prio(task_t *p) 533 | { 534 | int bonus, prio; 535 | - 536 | + 537 | + /* don't do anything if this is a RT task */ 538 | if (rt_task(p)) 539 | return p->prio; 540 | 541 | + /* 542 | + * take the CURRENT_BONUS, which is sleep_avg mapped onto 543 | + * 0-MAX_BONUS, and subtract half of MAX_BONUS since it is 544 | + * twice the possible + or - bonus. So if MAX_BONUS is 10, 545 | + * and a task sleeps a lot, it might get a CURRENT_BONUS of 546 | + * say, 8. Subtracting 5, that makes 3. This will be subtracted 547 | + * from static_prio since the task should have a high priority 548 | + * and lower prio values are higher priority. 
If a task sleeps 549 | + * very little, the bonus value calculated here will be negative. 550 | + * In that case, the negative value will get subtracted from 551 | + * static_prio, lowering the priority. 552 | + */ 553 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 554 | 555 | + /* give the task a prio based on the just-calculated bonus and static_prio */ 556 | prio = p->static_prio - bonus; 557 | + /* make sure the prio value is within non-RT bounds and return it */ 558 | if (prio < MAX_RT_PRIO) 559 | prio = MAX_RT_PRIO; 560 | if (prio > MAX_PRIO-1) 561 | @@ -379,8 +648,21 @@ static inline void __activate_idle_task( 562 | rq->nr_running++; 563 | } 564 | 565 | +/* 566 | + * This function recalculates a task's priority ("I know this because I can 567 | + * read" - John Fraser Hart). It is called by the main schedule() function 568 | + * when a task is moved to the expired prio array, and also when tasks are 569 | + * activated. 570 | + */ 571 | static void recalc_task_prio(task_t *p, unsigned long long now) 572 | { 573 | + /* 574 | + * __sleep_time is used because an unsigned long long will be able 575 | + * to hold a huge number, which might be the case in the calculation 576 | + * of "now - p-> timestamp" but will not be the case if the number 577 | + * is kept <= NS_MAX_SLEEP_AVG. So, once the number is calculated to 578 | + * be <= NS_MAX_SLEEP_AVG, then the unsigned long sleep_time is used. 579 | + */ 580 | unsigned long long __sleep_time = now - p->timestamp; 581 | unsigned long sleep_time; 582 | 583 | @@ -393,7 +675,7 @@ static void recalc_task_prio(task_t *p, 584 | /* 585 | * User tasks that sleep a long time are categorised as 586 | * idle and will get just interactive status to stay active & 587 | - * prevent them suddenly becoming cpu hogs and starving 588 | + * prevent them from suddenly becoming cpu hogs and starving 589 | * other processes. 590 | */ 591 | if (p->mm && p->activated != -1 && 592 | @@ -405,7 +687,9 @@ static void recalc_task_prio(task_t *p, 593 | } else { 594 | /* 595 | * The lower the sleep avg a task has the more 596 | - * rapidly it will rise with sleep time. 597 | + * rapidly it will rise with sleep time. If a task 598 | + * has a high sleep avg, CURRENT_BONUS(p) will be high, 599 | + * and thus MAX_BONUS - CURRENT_BONUS(p) will be low. 600 | */ 601 | sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; 602 | 603 | @@ -507,7 +791,13 @@ static void activate_task(task_t *p, run 604 | */ 605 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) 606 | { 607 | + /* one less running task */ 608 | rq->nr_running--; 609 | + /* 610 | + * this is leaving the running state and 611 | + * becoming uninterruptible, so increment 612 | + * nr_uninterruptible 613 | + */ 614 | if (p->state == TASK_UNINTERRUPTIBLE) 615 | rq->nr_uninterruptible++; 616 | dequeue_task(p, p->array); 617 | @@ -527,7 +817,7 @@ static void resched_task(task_t *p) 618 | int need_resched, nrpolling; 619 | 620 | preempt_disable(); 621 | - /* minimise the chance of sending an interrupt to poll_idle() */ 622 | + /* minimize the chance of sending an interrupt to poll_idle() */ 623 | nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 624 | need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); 625 | nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 626 | @@ -543,15 +833,19 @@ static inline void resched_task(task_t * 627 | } 628 | #endif 629 | 630 | -/** 631 | +/* 632 | * task_curr - is this task currently executing on a CPU? 633 | - * @p: the task in question. 
634 | */ 635 | inline int task_curr(const task_t *p) 636 | { 637 | return cpu_curr(task_cpu(p)) == p; 638 | } 639 | 640 | +/* 641 | + * This section contains code for migrating tasks between CPUs on 642 | + * SMP systems 643 | + */ 644 | + 645 | #ifdef CONFIG_SMP 646 | enum request_type { 647 | REQ_MOVE_TASK, 648 | @@ -563,11 +857,11 @@ typedef struct { 649 | enum request_type type; 650 | 651 | /* For REQ_MOVE_TASK */ 652 | - task_t *task; 653 | - int dest_cpu; 654 | + task_t *task; /* task to operate on */ 655 | + int dest_cpu; /* if REQ_MOVE_TASK, this is the destination CPU */ 656 | 657 | /* For REQ_SET_DOMAIN */ 658 | - struct sched_domain *sd; 659 | + struct sched_domain *sd; /* destination domain */ 660 | 661 | struct completion done; 662 | } migration_req_t; 663 | @@ -589,6 +883,10 @@ static int migrate_task(task_t *p, int d 664 | return 0; 665 | } 666 | 667 | + /* 668 | + * fill in migration request fields and add task to a 669 | + * migration queue, to be migrated later 670 | + */ 671 | init_completion(&req->done); 672 | req->type = REQ_MOVE_TASK; 673 | req->task = p; 674 | @@ -640,6 +938,13 @@ void kick_process(task_t *p) 675 | 676 | preempt_disable(); 677 | cpu = task_cpu(p); 678 | + /* 679 | + * If the process is on this CPU, then it's already in kernel mode, because we're 680 | + * executing right now. In that case, don't tell it to reschedule. If the process 681 | + * is not the current process on some CPU, then kernel mode must kick in before 682 | + * it runs, so again, don't bother rescheduling it. It should be obvious why this 683 | + * function doesn't apply on a UP system. 684 | + */ 685 | if ((cpu != smp_processor_id()) && task_curr(p)) 686 | smp_send_reschedule(cpu); 687 | preempt_enable(); 688 | @@ -661,9 +966,7 @@ static inline unsigned long source_load( 689 | return min(rq->cpu_load, load_now); 690 | } 691 | 692 | -/* 693 | - * Return a high guess at the load of a migration-target cpu 694 | - */ 695 | +/* Return a high guess at the load of a migration-target cpu */ 696 | static inline unsigned long target_load(int cpu) 697 | { 698 | runqueue_t *rq = cpu_rq(cpu); 699 | @@ -672,7 +975,7 @@ static inline unsigned long target_load( 700 | return max(rq->cpu_load, load_now); 701 | } 702 | 703 | -#endif 704 | +#endif /* CONFIG_SMP */ 705 | 706 | /* 707 | * wake_idle() is useful especially on SMT architectures to wake a 708 | @@ -689,16 +992,28 @@ static int wake_idle(int cpu, task_t *p) 709 | struct sched_domain *sd; 710 | int i; 711 | 712 | + /* if the task is already on an idle CPU, leave it there */ 713 | if (idle_cpu(cpu)) 714 | return cpu; 715 | 716 | + /* don't change CPUs if the scheduler domain does not support WAKE_IDLE */ 717 | sd = rq->sd; 718 | if (!(sd->flags & SD_WAKE_IDLE)) 719 | return cpu; 720 | 721 | + /* 722 | + * First, put the &'ed value of the scheduler domain span 723 | + * and the online CPU map into tmp. Then, & tmp with the 724 | + * cpus that p is allowed to run on. That gives a list 725 | + * of potential CPUs in the map tmp. 726 | + */ 727 | cpus_and(tmp, sd->span, cpu_online_map); 728 | cpus_and(tmp, tmp, p->cpus_allowed); 729 | 730 | + /* 731 | + * cycle through the cpu map tmp, made above, 732 | + * and send the task to the first idle CPU. 
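(Illustrative aside, not part of the patch: a simplified model of the cpumask filtering that wake_idle() performs, using plain 64-bit masks instead of cpumask_t. domain_span, online, and allowed stand in for sd->span, cpu_online_map, and p->cpus_allowed.)

#include <stdio.h>
#include <stdint.h>

/* return the first idle CPU the task may run on, or -1 to keep the old CPU */
static int pick_idle_cpu(uint64_t domain_span, uint64_t online,
                         uint64_t allowed, const int *cpu_is_idle, int ncpus)
{
        uint64_t tmp = domain_span & online & allowed;  /* candidate CPUs */

        for (int cpu = 0; cpu < ncpus; cpu++)
                if ((tmp & (1ULL << cpu)) && cpu_is_idle[cpu])
                        return cpu;
        return -1;
}

int main(void)
{
        int idle[4] = { 0, 0, 1, 1 };    /* CPUs 2 and 3 are idle              */
        /* task may run anywhere; domain spans CPUs 0-3; CPU 3 is offline      */
        int cpu = pick_idle_cpu(0xFULL, 0x7ULL, 0xFULL, idle, 4);
        printf("wake on CPU %d\n", cpu); /* prints 2 */
        return 0;
}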
733 | + */ 734 | for_each_cpu_mask(i, tmp) { 735 | if (idle_cpu(i)) 736 | return i; 737 | @@ -739,26 +1054,35 @@ static int try_to_wake_up(task_t * p, un 738 | int new_cpu; 739 | #endif 740 | 741 | + /* 742 | + * lock the task's runqueue, disabling interrupts, 743 | + * then check to see if the task is in one of the 744 | + * states we wish to wake it from. If not, get out. 745 | + */ 746 | rq = task_rq_lock(p, &flags); 747 | old_state = p->state; 748 | if (!(old_state & state)) 749 | goto out; 750 | 751 | + /* the task is already awake if it is in a prio array! */ 752 | if (p->array) 753 | goto out_running; 754 | - 755 | + 756 | cpu = task_cpu(p); 757 | this_cpu = smp_processor_id(); 758 | 759 | #ifdef CONFIG_SMP 760 | + /* if the task is running but was interrupted, we just need to activate it */ 761 | if (unlikely(task_running(rq, p))) 762 | goto out_activate; 763 | 764 | new_cpu = cpu; 765 | 766 | + /* if the task's CPU is this CPU or this CPU is not one it is allowed on... */ 767 | if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 768 | goto out_set_cpu; 769 | 770 | + /* grab the load on the source and target CPUs */ 771 | load = source_load(cpu); 772 | this_load = target_load(this_cpu); 773 | 774 | @@ -809,8 +1133,10 @@ out_set_cpu: 775 | /* might preempt at this point */ 776 | rq = task_rq_lock(p, &flags); 777 | old_state = p->state; 778 | + /* If the state of p is not one we wish to wake from, get out */ 779 | if (!(old_state & state)) 780 | goto out; 781 | + /* if p is in a prio array, it is already running */ 782 | if (p->array) 783 | goto out_running; 784 | 785 | @@ -852,6 +1178,7 @@ out: 786 | return success; 787 | } 788 | 789 | +/* just an exported convenience function for try_to_wake_up() */ 790 | int fastcall wake_up_process(task_t * p) 791 | { 792 | return try_to_wake_up(p, TASK_STOPPED | 793 | @@ -899,7 +1226,8 @@ void fastcall sched_fork(task_t *p) 794 | p->time_slice = (current->time_slice + 1) >> 1; 795 | /* 796 | * The remainder of the first timeslice might be recovered by 797 | - * the parent if the child exits early enough. 798 | + * the parent if the child exits early enough. Set first_time_slice 799 | + * in order to indicate that p's timeslice is reclaimable. 800 | */ 801 | p->first_time_slice = 1; 802 | current->time_slice >>= 1; 803 | @@ -930,6 +1258,7 @@ void fastcall wake_up_forked_process(tas 804 | unsigned long flags; 805 | runqueue_t *rq = task_rq_lock(current, &flags); 806 | 807 | + /* The freshly forked process should not already be running! */ 808 | BUG_ON(p->state != TASK_RUNNING); 809 | 810 | /* 811 | @@ -943,14 +1272,17 @@ void fastcall wake_up_forked_process(tas 812 | p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * 813 | CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 814 | 815 | + /* Start off with an interactive credit of 0. */ 816 | p->interactive_credit = 0; 817 | 818 | + /* Set an initial priority and CPU. The initial CPU is the current CPU. */ 819 | p->prio = effective_prio(p); 820 | set_task_cpu(p, smp_processor_id()); 821 | 822 | + /* If the task is not already on a runqueue prio array, put it on one. */ 823 | if (unlikely(!current->array)) 824 | __activate_task(p, rq); 825 | - else { 826 | + else { /* Otherwise just situate it in the runqueue it's in. 
*/ 827 | p->prio = current->prio; 828 | list_add_tail(&p->run_list, ¤t->run_list); 829 | p->array = current->array; 830 | @@ -975,6 +1307,10 @@ void fastcall sched_exit(task_t * p) 831 | runqueue_t *rq; 832 | 833 | local_irq_save(flags); 834 | + /* 835 | + * if the exiting child was only on its first time slice, 836 | + * give it back to the parent 837 | + */ 838 | if (p->first_time_slice) { 839 | p->parent->time_slice += p->time_slice; 840 | if (unlikely(p->parent->time_slice > MAX_TIMESLICE)) 841 | @@ -1055,6 +1391,10 @@ task_t * context_switch(runqueue_t *rq, 842 | struct mm_struct *mm = next->mm; 843 | struct mm_struct *oldmm = prev->active_mm; 844 | 845 | + /* 846 | + * If the new task doesn't have an mm, make it the same 847 | + * as the old task's. 848 | + */ 849 | if (unlikely(!mm)) { 850 | next->active_mm = oldmm; 851 | atomic_inc(&oldmm->mm_count); 852 | @@ -1062,6 +1402,13 @@ task_t * context_switch(runqueue_t *rq, 853 | } else 854 | switch_mm(oldmm, mm, next); 855 | 856 | + /* 857 | + * If the previous task does not have an mm, 858 | + * set its active_mm field to NULL, warn, and 859 | + * then set the runqueue's previous mm to the previous 860 | + * task's active_mm for use in making good cache hotness 861 | + * decisions in the future. 862 | + */ 863 | if (unlikely(!prev->mm)) { 864 | prev->active_mm = NULL; 865 | WARN_ON(rq->prev_mm); 866 | @@ -1177,9 +1524,15 @@ static int find_idlest_cpu(struct task_s 867 | min_cpu = UINT_MAX; 868 | min_load = ULONG_MAX; 869 | 870 | + /* set mask to a map created by 871 | + * 1) getting a bitmap of online CPUs in the right scheduler domain 872 | + * 2) & the map from step 1 with p's allowed CPU 873 | + * The result is a map of CPUs that p could potentially run on. 874 | + */ 875 | cpus_and(mask, sd->span, cpu_online_map); 876 | cpus_and(mask, mask, p->cpus_allowed); 877 | 878 | + /* cycle through each CPU looking for the one with the lowest load */ 879 | for_each_cpu_mask(i, mask) { 880 | load = target_load(i); 881 | 882 | @@ -1226,11 +1579,13 @@ void fastcall wake_up_forked_thread(task 883 | 884 | /* 885 | * Find the largest domain that this CPU is part of that 886 | - * is willing to balance on clone: 887 | + * is willing to balance on clone; that is, a domain willing 888 | + * to accept cloned tasks onto its CPUs. 889 | */ 890 | for_each_domain(this_cpu, tmp) 891 | if (tmp->flags & SD_BALANCE_CLONE) 892 | sd = tmp; 893 | + /* If a domain was found, choose its idlest CPU, otherwise just use this CPU */ 894 | if (sd) 895 | cpu = find_idlest_cpu(p, this_cpu, sd); 896 | else 897 | @@ -1256,7 +1611,9 @@ lock_again: 898 | /* 899 | * We decrease the sleep average of forking parents 900 | * and children as well, to keep max-interactive tasks 901 | - * from forking tasks that are max-interactive. 902 | + * from forking tasks that are max-interactive. This is similar 903 | + * to what we do when new processes are forked 904 | + * (in wake_up_forked_process()) 905 | */ 906 | current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * 907 | PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 908 | @@ -1341,10 +1698,16 @@ void sched_balance_exec(void) 909 | if (this_rq()->nr_running <= 1) 910 | goto out; 911 | 912 | + /* 913 | + * Find the largest domain this CPU belongs to that is willing to 914 | + * balance on exec. 915 | + */ 916 | for_each_domain(this_cpu, tmp) 917 | if (tmp->flags & SD_BALANCE_EXEC) 918 | sd = tmp; 919 | 920 | + /* If a domain was found, find its idlest CPU and migrate there 921 | + * Otherwise, just stay on this CPU. 
922 | if (sd) { 923 | new_cpu = find_idlest_cpu(current, this_cpu, sd); 924 | if (new_cpu != this_cpu) { 925 | @@ -1385,10 +1748,11 @@ void pull_task(runqueue_t *src_rq, prio_ 926 | set_task_cpu(p, this_cpu); 927 | this_rq->nr_running++; 928 | enqueue_task(p, this_array); 929 | - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 930 | + /* account for differences in timestamp between CPUs */ 931 | + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 932 | + this_rq->timestamp_last_tick; 933 | /* 934 | - * Note that idle threads have a prio of MAX_PRIO, for this test 935 | + * Note that idle threads have a prio of MAX_PRIO, causing this test 936 | * to be always true for them. 937 | */ 938 | if (TASK_PREEMPTS_CURR(p, this_rq)) 939 | @@ -1514,17 +1878,21 @@ find_busiest_group(struct sched_domain * 940 | 941 | max_load = this_load = total_load = total_pwr = 0; 942 | 943 | + /* go through each group, done with a do loop since this is a circular linked list */ 944 | do { 945 | cpumask_t tmp; 946 | unsigned long load; 947 | int local_group; 948 | int i, nr_cpus = 0; 949 | 950 | + /* is the current CPU in the group we're looking at? */ 951 | local_group = cpu_isset(this_cpu, group->cpumask); 952 | 953 | /* Tally up the load of all CPUs in the group */ 954 | avg_load = 0; 955 | + /* make a map, tmp, of CPUs in this group and online */ 956 | cpus_and(tmp, group->cpumask, cpu_online_map); 957 | + /* if this group doesn't contain any online CPUs, move on */ 958 | if (unlikely(cpus_empty(tmp))) 959 | goto nextgroup; 960 | 961 | @@ -1539,6 +1907,10 @@ find_busiest_group(struct sched_domain * 962 | avg_load += load; 963 | } 964 | 965 | + /* 966 | + * This check is redundant since it can never be true, and has 967 | + * apparently been fixed in Linux 2.6.10rc3 968 | + */ 969 | if (!nr_cpus) 970 | goto nextgroup; 971 | 972 | @@ -1551,7 +1923,7 @@ find_busiest_group(struct sched_domain * 973 | if (local_group) { 974 | this_load = avg_load; 975 | this = group; 976 | - goto nextgroup; 977 | + goto nextgroup; /* pointless goto since it goes there anyway */ 978 | } else if (avg_load > max_load) { 979 | max_load = avg_load; 980 | busiest = group; 981 | @@ -1573,7 +1945,7 @@ nextgroup: 982 | * We're trying to get all the cpus to the average_load, so we don't 983 | * want to push ourselves above the average load, nor do we wish to 984 | * reduce the max loaded cpu below the average load, as either of these 985 | - * actions would just result in more rebalancing later, and ping-pong 986 | + * actions would just result in more rebalancing later, and ping-ponging 987 | * tasks around. Thus we look for the minimum possible imbalance. 988 | * Negative imbalances (*we* are more loaded than anyone else) will 989 | * be counted as no imbalance for these purposes -- we can't fix that 990 | @@ -1985,6 +2357,7 @@ void scheduler_tick(int user_ticks, int 991 | runqueue_t *rq = this_rq(); 992 | task_t *p = current; 993 | 994 | + /* update last tick timestamp to now */ 995 | rq->timestamp_last_tick = sched_clock(); 996 | 997 | if (rcu_pending(cpu)) 998 | @@ -1998,24 +2371,36 @@ void scheduler_tick(int user_ticks, int 999 | cpustat->softirq += sys_ticks; 1000 | sys_ticks = 0; 1001 | } 1002 | - 1003 | + 1004 | + /* if the current task is the idle task... */ 1005 | if (p == rq->idle) { 1006 | + /* If at least one task is waiting on i/o, then 1007 | + * the time since the last tick was spent waiting 1008 | + * on I/O, and that is why we're idle. Otherwise, we just 1009 | + * have nothing to do. 
Update cpustat accordingly. 1010 | + */ 1011 | if (atomic_read(&rq->nr_iowait) > 0) 1012 | cpustat->iowait += sys_ticks; 1013 | else 1014 | cpustat->idle += sys_ticks; 1015 | + /* wake up a priority sleeper since we're idle for one reason or another */ 1016 | if (wake_priority_sleeper(rq)) 1017 | goto out; 1018 | + /* if we couldn't wake anything up, then try to rebalance */ 1019 | rebalance_tick(cpu, rq, IDLE); 1020 | + /* leave since we were idle and did what we could */ 1021 | return; 1022 | } 1023 | + 1024 | if (TASK_NICE(p) > 0) 1025 | cpustat->nice += user_ticks; 1026 | else 1027 | cpustat->user += user_ticks; 1028 | cpustat->system += sys_ticks; 1029 | 1030 | - /* Task might have expired already, but not scheduled off yet */ 1031 | + /* Task might have expired already, but not scheduled off yet. 1032 | + * Possible since we're in a timer interrupt right now. 1033 | + */ 1034 | if (p->array != rq->active) { 1035 | set_tsk_need_resched(p); 1036 | goto out; 1037 | } 1038 | @@ -2044,22 +2429,34 @@ void scheduler_tick(int user_ticks, int 1038 | } 1039 | goto out_unlock; 1040 | } 1041 | + /* if the task is out of time */ 1042 | if (!--p->time_slice) { 1043 | + /* dequeue it from the active prio array */ 1044 | dequeue_task(p, rq->active); 1045 | + /* reschedule it */ 1046 | set_tsk_need_resched(p); 1047 | + /* recalculate its priority */ 1048 | p->prio = effective_prio(p); 1049 | + /* give it a new timeslice */ 1050 | p->time_slice = task_timeslice(p); 1051 | + /* 1052 | + * This can't be its first timeslice since it just ran out 1053 | + * of one. Remember that tasks that exit on their first timeslice 1054 | + * can give part of their timeslice back to the parent task. 1055 | + */ 1056 | p->first_time_slice = 0; 1057 | 1058 | if (!rq->expired_timestamp) 1059 | rq->expired_timestamp = jiffies; 1060 | + /* if the task is not interactive or there is something starving on the expired list */ 1061 | if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { 1062 | + /* enqueue the task on the expired list */ 1063 | enqueue_task(p, rq->expired); 1064 | if (p->static_prio < rq->best_expired_prio) 1065 | rq->best_expired_prio = p->static_prio; 1066 | - } else 1067 | + } else /* otherwise put it back on the active list */ 1068 | enqueue_task(p, rq->active); 1069 | - } else { 1070 | + } else { /* task is not out of time */ 1071 | /* 1072 | * Prevent a too long timeslice allowing a task to monopolize 1073 | * the CPU. We do this by splitting up the timeslice into 1074 | @@ -2088,12 +2485,22 @@ void scheduler_tick(int user_ticks, int 1075 | } 1076 | } 1077 | out_unlock: 1078 | + /* we are done messing with this runqueue so unlock it */ 1079 | spin_unlock(&rq->lock); 1080 | out: 1081 | + /* see if we need to do some rebalancing */ 1082 | rebalance_tick(cpu, rq, NOT_IDLE); 1083 | } 1084 | 1085 | #ifdef CONFIG_SCHED_SMT 1086 | +/* 1087 | + * If there are other idle virtual processors associated with the given cpu, 1088 | + * and they have runnable tasks, try to wake them up. This is called in 1089 | + * schedule(), when the current CPU has no runnable tasks and idle rebalancing 1090 | + * fails to add any runnable tasks. This is because on SMT, tasks can be sleeping 1091 | + * in order to give other sibling processors with higher priority tasks full 1092 | + * access to cache. 
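(Illustrative aside, not part of the patch: a reduced model of the timeslice-expiry path scheduler_tick() takes, as described in the comments above: decrement time_slice, refill it from the static priority, and requeue to the expired array unless the task is interactive and nothing on the expired array is starving. The struct task type and constants are stand-ins for this sketch.)

#include <stdio.h>

#define MIN_TIMESLICE 10                 /* roughly 10ms in ticks for this sketch */
#define MAX_TIMESLICE 200

struct task { int static_prio; int time_slice; int interactive; };

/* static_prio 100..139 mapped linearly onto MAX..MIN timeslice, like BASE_TIMESLICE */
static int task_timeslice(const struct task *p)
{
        return MIN_TIMESLICE + (MAX_TIMESLICE - MIN_TIMESLICE) *
               (139 - p->static_prio) / 39;
}

/* returns 1 if the task should go to the expired array, 0 if it stays active */
static int tick(struct task *p, int expired_starving)
{
        if (--p->time_slice > 0)
                return 0;                        /* still has time left         */

        p->time_slice = task_timeslice(p);       /* refill for its next round   */
        if (p->interactive && !expired_starving)
                return 0;                        /* reinserted into the active array */
        return 1;                                /* goes to the expired array   */
}

int main(void)
{
        struct task p = { 120, 1, 1 };           /* nice 0, interactive, 1 tick left */
        printf("to expired: %d, new slice: %d\n", tick(&p, 0), p.time_slice);
        return 0;
}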
1093 | + */ 1094 | static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) 1095 | { 1096 | int i; 1097 | @@ -2205,12 +2612,12 @@ asmlinkage void __sched schedule(void) 1098 | } 1099 | 1100 | need_resched: 1101 | - preempt_disable(); 1102 | - prev = current; 1103 | - rq = this_rq(); 1104 | + preempt_disable(); /* do not allow this algorithm to be preempted */ 1105 | + prev = current; /* whatever task is running now will be the previous task */ 1106 | + rq = this_rq(); /* get the runqueue for the processor that needs scheduling */ 1107 | 1108 | release_kernel_lock(prev); 1109 | - now = sched_clock(); 1110 | + now = sched_clock(); /* get the current time in nanoseconds */ 1111 | if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) 1112 | run_time = now - prev->timestamp; 1113 | else 1114 | @@ -2226,10 +2633,6 @@ need_resched: 1115 | 1116 | spin_lock_irq(&rq->lock); 1117 | 1118 | - /* 1119 | - * if entering off of a kernel preemption go straight 1120 | - * to picking the next task. 1121 | - */ 1122 | switch_count = &prev->nivcsw; 1123 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 1124 | switch_count = &prev->nvcsw; 1125 | @@ -2241,6 +2644,11 @@ need_resched: 1126 | } 1127 | 1128 | cpu = smp_processor_id(); 1129 | + /* 1130 | + * If there are no runnable tasks in the runqueue, try to do an idle balance. 1131 | + * If nothing is runnable after that, just switch to idle. No need to swap arrays 1132 | + * since there is nothing runnable in the expired array or the active one. 1133 | + */ 1134 | if (unlikely(!rq->nr_running)) { 1135 | idle_balance(cpu, rq); 1136 | if (!rq->nr_running) { 1137 | @@ -2251,6 +2659,7 @@ need_resched: 1138 | } 1139 | } 1140 | 1141 | + /* If there are no runnable tasks in the active prio array, swap arrays. */ 1142 | array = rq->active; 1143 | if (unlikely(!array->nr_active)) { 1144 | /* 1145 | @@ -2263,15 +2672,27 @@ need_resched: 1146 | rq->best_expired_prio = MAX_PRIO; 1147 | } 1148 | 1149 | + /* find the first priority level with tasks in it, and grab the first task in it */ 1150 | idx = sched_find_first_bit(array->bitmap); 1151 | queue = array->queue + idx; 1152 | next = list_entry(queue->next, task_t, run_list); 1153 | 1154 | + /* 1155 | + * If there is a dependent sleeper, a task sleeping on a sibling virtual 1156 | + * CPU on SMT systems, just switch to idle and let dependent_sleeper() wake 1157 | + * up the dependent task. 1158 | + */ 1159 | if (dependent_sleeper(cpu, rq, next)) { 1160 | next = rq->idle; 1161 | goto switch_tasks; 1162 | } 1163 | 1164 | + /* 1165 | + * If the next task is not an RT task and has been woken up, 1166 | + * give it a new priority calculated with a longer sleep time 1167 | + * as a boost. If the task is a normal first-time wakeup 1168 | + * (next->activated == 1), weigh down the bonus. 
1169 | + */ 1170 | if (!rt_task(next) && next->activated > 0) { 1171 | unsigned long long delta = now - next->timestamp; 1172 | 1173 | @@ -2283,13 +2704,23 @@ need_resched: 1174 | recalc_task_prio(next, next->timestamp + delta); 1175 | enqueue_task(next, array); 1176 | } 1177 | + /* clear the next task's activated status */ 1178 | next->activated = 0; 1179 | + 1180 | + /* make the switch to whatever next task was selected */ 1181 | switch_tasks: 1182 | prefetch(next); 1183 | clear_tsk_need_resched(prev); 1184 | RCU_qsctr(task_cpu(prev))++; 1185 | 1186 | + /* subtract running time from previous task's sleep_avg */ 1187 | prev->sleep_avg -= run_time; 1188 | + 1189 | + /* 1190 | + * If the previous task's sleep average is 0 or lower now, 1191 | + * set it to 0 and then drop its interactive credit since 1192 | + * it obviously wasn't sleeping much and is thus less I/O bound. 1193 | + */ 1194 | if ((long)prev->sleep_avg <= 0) { 1195 | prev->sleep_avg = 0; 1196 | if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) 1197 | @@ -2297,6 +2728,7 @@ switch_tasks: 1198 | } 1199 | prev->timestamp = now; 1200 | 1201 | + /* make the actual context switch if prev and next are not the same */ 1202 | if (likely(prev != next)) { 1203 | next->timestamp = now; 1204 | rq->nr_switches++; 1205 | @@ -2313,6 +2745,10 @@ switch_tasks: 1206 | 1207 | reacquire_kernel_lock(current); 1208 | preempt_enable_no_resched(); 1209 | + /* 1210 | + * Since preemption was disabled this whole time, check to see if kernel 1211 | + * preemption was requested (reschedule requested) and reschedule if so. 1212 | + */ 1213 | if (test_thread_flag(TIF_NEED_RESCHED)) 1214 | goto need_resched; 1215 | } 1216 | @@ -2331,12 +2767,18 @@ asmlinkage void __sched preempt_schedule 1217 | 1218 | /* 1219 | * If there is a non-zero preempt_count or interrupts are disabled, 1220 | - * we do not want to preempt the current task. Just return.. 1221 | + * we do not want to preempt the current task. Just return. 1222 | */ 1223 | if (unlikely(ti->preempt_count || irqs_disabled())) 1224 | return; 1225 | 1226 | need_resched: 1227 | + /* 1228 | + * Set preempt count to indicate that we are preempting, reschedule, 1229 | + * and then clear the preempt count as rescheduling has happened. 1230 | + * Only needs to reschedule once no matter how many times the reschedule 1231 | + * was requested. 1232 | + */ 1233 | ti->preempt_count = PREEMPT_ACTIVE; 1234 | schedule(); 1235 | ti->preempt_count = 0; 1236 | @@ -2350,6 +2792,7 @@ need_resched: 1237 | EXPORT_SYMBOL(preempt_schedule); 1238 | #endif /* CONFIG_PREEMPT */ 1239 | 1240 | +/* exported call for trying to wake up a task */ 1241 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) 1242 | { 1243 | task_t *p = curr->task; 1244 | @@ -2372,11 +2815,17 @@ static void __wake_up_common(wait_queue_ 1245 | { 1246 | struct list_head *tmp, *next; 1247 | 1248 | + /* go through each task in the wait queue */ 1249 | list_for_each_safe(tmp, next, &q->task_list) { 1250 | wait_queue_t *curr; 1251 | unsigned flags; 1252 | curr = list_entry(tmp, wait_queue_t, task_list); 1253 | flags = curr->flags; 1254 | + /* 1255 | + * Try to wake up the task, and if it was exclusive and there are more 1256 | + * exclusive tasks in the wait queue, then quit. Don't want to wake up 1257 | + * more than one exclusive task at a time. 
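(Illustrative aside, not part of the patch: the exclusive-wakeup rule described above, modeled in plain C. Waiters are woken in order, but the loop stops once nr_exclusive exclusive waiters have been woken; struct waiter and try_wake() are hypothetical stand-ins for the wait_queue_t machinery.)

#include <stdio.h>

struct waiter { const char *name; int exclusive; };

/* stand-in for curr->func(): pretend every wakeup attempt succeeds */
static int try_wake(const struct waiter *w)
{
        printf("woke %s\n", w->name);
        return 1;
}

static void wake_up_common(struct waiter *q, int n, int nr_exclusive)
{
        for (int i = 0; i < n; i++) {
                /* stop after waking nr_exclusive exclusive waiters */
                if (try_wake(&q[i]) && q[i].exclusive && !--nr_exclusive)
                        break;
        }
}

int main(void)
{
        struct waiter q[] = {
                { "A (non-exclusive)", 0 },
                { "B (exclusive)",     1 },
                { "C (exclusive)",     1 },   /* not woken: only 1 exclusive wakeup allowed */
        };
        wake_up_common(q, 3, 1);
        return 0;
}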
1258 | + */ 1259 | if (curr->func(curr, mode, sync, key) && 1260 | (flags & WQ_FLAG_EXCLUSIVE) && 1261 | !--nr_exclusive) 1262 | @@ -2428,9 +2877,11 @@ void fastcall __wake_up_sync(wait_queue_ 1263 | unsigned long flags; 1264 | int sync = 1; 1265 | 1266 | + /* obviously, leave if there is no wait queue */ 1267 | if (unlikely(!q)) 1268 | return; 1269 | 1270 | + /* if there are no exclusive tasks, don't do sync */ 1271 | if (unlikely(!nr_exclusive)) 1272 | sync = 0; 1273 | 1274 | @@ -2585,6 +3036,7 @@ void set_user_nice(task_t *p, long nice) 1275 | if (array) 1276 | dequeue_task(p, array); 1277 | 1278 | + /* set the new static_prio and just adjust the dynamic prio instead of recalculating */ 1279 | old_prio = p->prio; 1280 | new_prio = NICE_TO_PRIO(nice); 1281 | delta = new_prio - old_prio; 1282 | @@ -2743,6 +3195,7 @@ static int setscheduler(pid_t pid, int p 1283 | */ 1284 | rq = task_rq_lock(p, &flags); 1285 | 1286 | + /* makes sure the policy is sane */ 1287 | if (policy < 0) 1288 | policy = p->policy; 1289 | else { 1290 | @@ -2910,6 +3363,7 @@ asmlinkage long sys_sched_setaffinity(pi 1291 | if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) 1292 | return -EFAULT; 1293 | 1294 | + /* don't allow CPU hotplugging while we do this - obvious consequences */ 1295 | lock_cpu_hotplug(); 1296 | read_lock(&tasklist_lock); 1297 | 1298 | @@ -3397,9 +3851,9 @@ static void __migrate_task(struct task_s 1299 | */ 1300 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick 1301 | + rq_dest->timestamp_last_tick; 1302 | - deactivate_task(p, rq_src); 1303 | - activate_task(p, rq_dest, 0); 1304 | - if (TASK_PREEMPTS_CURR(p, rq_dest)) 1305 | + deactivate_task(p, rq_src); /* off the runqueue it is on */ 1306 | + activate_task(p, rq_dest, 0); /* on the runqueue it should be on */ 1307 | + if (TASK_PREEMPTS_CURR(p, rq_dest)) /* perhaps preempt dest cpu's current task */ 1308 | resched_task(rq_dest->curr); 1309 | } 1310 | 1311 | @@ -3421,25 +3875,33 @@ static int migration_thread(void * data) 1312 | BUG_ON(rq->migration_thread != current); 1313 | 1314 | set_current_state(TASK_INTERRUPTIBLE); 1315 | + 1316 | + /* basically, just keep trying to be helpful in one way or another 1317 | + * until we're told to die... 1318 | + */ 1319 | while (!kthread_should_stop()) { 1320 | struct list_head *head; 1321 | migration_req_t *req; 1322 | 1323 | + /* can I freeze the current thread for you? */ 1324 | if (current->flags & PF_FREEZE) 1325 | refrigerator(PF_FREEZE); 1326 | 1327 | spin_lock_irq(&rq->lock); 1328 | 1329 | + /* Is this CPU offline? If so, I'll just go die. */ 1330 | if (cpu_is_offline(cpu)) { 1331 | spin_unlock_irq(&rq->lock); 1332 | goto wait_to_die; 1333 | } 1334 | 1335 | + /* does my runqueue need to be balanced? */ 1336 | if (rq->active_balance) { 1337 | active_load_balance(rq, cpu); 1338 | rq->active_balance = 0; 1339 | } 1340 | 1341 | + /* anything need to be migrated? If not, schedule me out. 
*/ 1342 | head = &rq->migration_queue; 1343 | 1344 | if (list_empty(head)) { 1345 | @@ -3448,6 +3910,7 @@ static int migration_thread(void * data) 1346 | set_current_state(TASK_INTERRUPTIBLE); 1347 | continue; 1348 | } 1349 | + /* do some migration */ 1350 | req = list_entry(head->next, migration_req_t, list); 1351 | list_del_init(head->next); 1352 | 1353 | @@ -3466,9 +3929,11 @@ static int migration_thread(void * data) 1354 | 1355 | complete(&req->done); 1356 | } 1357 | + /* migration thread suicide */ 1358 | __set_current_state(TASK_RUNNING); 1359 | return 0; 1360 | 1361 | + /* migration thread hospice... with no CPU, time is running out for us... */ 1362 | wait_to_die: 1363 | /* Wait for kthread_stop */ 1364 | set_current_state(TASK_INTERRUPTIBLE); 1365 | @@ -3481,7 +3946,7 @@ wait_to_die: 1366 | } 1367 | 1368 | #ifdef CONFIG_HOTPLUG_CPU 1369 | -/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */ 1370 | +/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */ 1371 | static void migrate_all_tasks(int src_cpu) 1372 | { 1373 | struct task_struct *tsk, *t; 1374 | @@ -3729,6 +4194,7 @@ static void __init arch_init_sched_domai 1375 | struct sched_group *node = &sched_group_nodes[i]; 1376 | int j; 1377 | 1378 | + /* get mask of node cpus that are possible */ 1379 | cpus_and(nodemask, tmp, cpu_possible_map); 1380 | 1381 | if (cpus_empty(nodemask)) 1382 | @@ -3744,6 +4210,7 @@ static void __init arch_init_sched_domai 1383 | cpu_set(j, cpu->cpumask); 1384 | cpu->cpu_power = SCHED_LOAD_SCALE; 1385 | 1386 | + /* set up circular linked list */ 1387 | if (!first_cpu) 1388 | first_cpu = cpu; 1389 | if (last_cpu) 1390 | --------------------------------------------------------------------------------
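(Illustrative aside, not part of the patch: the "set up circular linked list" comment above refers to chaining sched_group structures so that the last group points back to the first. Below is a minimal sketch of that first/last pointer pattern, with struct group standing in for struct sched_group; allocation checks and cleanup are omitted for brevity.)

#include <stdio.h>
#include <stdlib.h>

struct group { int id; struct group *next; };

/* build a circular singly linked list using the first/last pattern above */
static struct group *build_ring(int n)
{
        struct group *first = NULL, *last = NULL;

        for (int i = 0; i < n; i++) {
                struct group *g = malloc(sizeof(*g));
                g->id = i;
                g->next = NULL;
                if (!first)
                        first = g;
                if (last)
                        last->next = g;
                last = g;
        }
        if (last)
                last->next = first;      /* close the ring */
        return first;
}

int main(void)
{
        struct group *g = build_ring(3);
        /* walking the ring revisits group 0 after the last group */
        for (int i = 0; i < 4; i++, g = g->next)
                printf("group %d\n", g->id);
        return 0;
}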