├── README.md ├── linux_cpu_scheduler.lyx ├── linux_cpu_scheduler.pdf └── sched_comments.patch /README.md: -------------------------------------------------------------------------------- 1 | # Understanding the Linux 2.6.8.1 CPU Scheduler 2 | 3 | This paper is intended to be an introduction to the Linux 2.6.8.1 CPU scheduler implementation. 4 | 5 | It is available in LyX and PDF formats. 6 | 7 | There is also a patch against the Linux 2.6.8.1 sched.c that adds more comments. This did not receive as much attention to detail as the paper, but it should be quite accurate. 8 | -------------------------------------------------------------------------------- /linux_cpu_scheduler.lyx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bdaehlie/linux-cpu-scheduler-docs/33f52ff95ac23fa0cdb53b18a88531e3fb1b9754/linux_cpu_scheduler.lyx -------------------------------------------------------------------------------- /linux_cpu_scheduler.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bdaehlie/linux-cpu-scheduler-docs/33f52ff95ac23fa0cdb53b18a88531e3fb1b9754/linux_cpu_scheduler.pdf -------------------------------------------------------------------------------- /sched_comments.patch: -------------------------------------------------------------------------------- 1 | --- /Users/josh/Desktop/cs_capstone/reference/linux-2.6.8.1-unpatched/kernel/sched.c Sat Aug 14 05:55:59 2004 2 | +++ /Users/josh/Desktop/cs_capstone/josh_capstone_work/sched_commented_2.6.8.1.c Sun Jan 2 03:24:40 2005 3 | @@ -18,6 +18,24 @@ 4 | * 2004-04-02 Scheduler domains code by Nick Piggin 5 | */ 6 | 7 | +/* 8 | + * Additional comments by Josh Aas. 9 | + * Copyright (c)2004 Silicon Graphics, Inc. (SGI) 10 | + * 11 | + * Comments are situated above what they describe. 12 | + * 13 | + * Abbreviations: 14 | + * RT - real-time (as in a "real-time process") 15 | + * UP - uniprocessor 16 | + * 17 | + * Notes: 18 | + * - SMT means simultaneous multithreading. This is not the same thing as 19 | + * SMP. An example of an SMT system is an Intel Pentium 4 Hyper-Threading (HT) 20 | + * enabled processor. Basically, a single SMT chip can run multiple threads, 21 | + * which has some interesting scheduler implications since the threads 22 | + * share certain physical CPU resources. 23 | + */ 24 | + 25 | #include 26 | #include 27 | #include 28 | @@ -44,6 +62,18 @@ 29 | 30 | #include 31 | 32 | +/* 33 | + * NUMA architectures have groups of CPUs (and memory) organized 34 | + * into nodes. These macros are for getting the CPU mask for 35 | + * a node that a CPU belongs to. 36 | + * 37 | + * If the kernel is compiled for a NUMA architecture, do a node lookup 38 | + * by getting a CPU's node and then getting the CPU mask/map for 39 | + * that node. If non-NUMA, there will only be one mask/map, so insert that. 40 | + * 41 | + * Note that these NUMA macros are not used. They should probably have been 42 | + * removed from this file. 43 | + */ 44 | #ifdef CONFIG_NUMA 45 | #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) 46 | #else 47 | @@ -54,6 +84,25 @@ 48 | * Convert user-nice values [ -20 ... 0 ... 19 ] 49 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 50 | * and back. 51 | + * 52 | + * PRIO values are the priority values that the Linux scheduler uses internally. 
53 | + * Possible PRIO values for RT tasks are 0 through (MAX_RT_PRIO - 1), and possible PRIO 54 | + * values for non-RT tasks are MAX_RT_PRIO through (MAX_PRIO - 1). The lower a task's 55 | + * PRIO value, the higher its priority. With this setup, RT tasks will always have 56 | + * a higher priority than non-RT tasks. 57 | + * 58 | + * For non-RT tasks, in order to convert a user-nice value to a PRIO value, one would 59 | + * start with MAX_RT_PRIO, add the user-nice value, and then add 20 to make up for the 60 | + * fact that the highest possible priority user-nice value is -20. Converting from a 61 | + * PRIO value to a user-nice value is just the opposite. This is what the 62 | + * NICE_TO_PRIO(nice) and PRIO_TO_NICE(prio) macros do. 63 | + * 64 | + * TASK_NICE(p) simply gets the user-nice value for a given task. Each task has a 65 | + * static and a dynamic priority value. The static priority value is set by users 66 | + * via the nice() system call and ranges from -20 to 19. It is stored as a PRIO. The 67 | + * dynamic priority is based on a task's static priority, but it is modified based 68 | + * on interactivity. The dynamic priority is not relevant here, but is mentioned in 69 | + * order to explain why TASK_NICE(p) is determined by a task's static_prio field. 70 | + */ 71 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 72 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 73 | @@ -63,6 +112,23 @@ 74 | * 'User priority' is the nice value converted to something we 75 | * can work with better when scaling various scheduler parameters, 76 | * it's a [ 0 ... 39 ] range. 77 | + * 78 | + * USER_PRIO(p) takes an internal non-RT priority and returns its 79 | + * priority in terms of 0-39. It is only used by the other macros 80 | + * in this group as values of 0-39 don't mean anything in terms of 81 | + * internal PRIO values or user-nice values. It is simply a shortcut. 82 | + * 83 | + * TASK_USER_PRIO is not used by anything, and should be removed from 84 | + * the kernel. It is a useless calculation for the reason described above. 85 | + * All it does is return a task's USER_PRIO. 86 | + * 87 | + * MAX_USER_PRIO returns the total number of different priority levels 88 | + * non-RT processes can have. In this case, it resolves to 40 (100-139). 89 | + * 90 | + * AVG_TIMESLICE basically resolves to the half-way point between MIN_TIMESLICE 91 | + * and MAX_TIMESLICE. The reason it isn't written simply like that is so the 92 | + * algorithm can withstand changes to the priority system. It resolves to about 93 | + * 100ms. 94 | + */ 95 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 96 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 97 | @@ -72,6 +138,13 @@ 98 | 99 | /* 100 | * Some helpers for converting nanosecond timing to jiffy resolution 101 | + * 102 | + * A nanosecond (NS) is one-billionth of a second. A jiffy is a period of time 103 | + * calculated by 1/HZ, where HZ is the architecture-defined number of ticks 104 | + * per second. So, to convert from nanoseconds to jiffies, one divides a billion 105 | + * by HZ (which results in the number of nanoseconds in a jiffy), and divides 106 | + * the number of nanoseconds by that. Jiffies to NS is the reverse: multiply 107 | + * the number of jiffies by the number of nanoseconds in a jiffy. 
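(Illustrative aside, not part of sched.c or this patch: the nice/PRIO and nanosecond/jiffy conversions described above can be sanity-checked with a small standalone C program. MAX_RT_PRIO (100), MAX_PRIO (140), and HZ (assumed 1000 here) follow this kernel's usual values.)

#include <stdio.h>

#define MAX_RT_PRIO 100                 /* prio 0..99 are reserved for RT tasks */
#define MAX_PRIO    140                 /* prio 100..139 are for normal tasks   */
#define HZ          1000                /* assumed tick rate for this sketch    */

#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)

#define NS_TO_JIFFIES(t)   ((t) / (1000000000 / HZ))
#define JIFFIES_TO_NS(t)   ((t) * (1000000000 / HZ))

int main(void)
{
        /* nice -20..19 maps onto prio 100..139; lower prio means higher priority */
        printf("nice -20 -> prio %d\n", NICE_TO_PRIO(-20));   /* 100 */
        printf("nice   0 -> prio %d\n", NICE_TO_PRIO(0));     /* 120 */
        printf("prio 139 -> nice %d\n", PRIO_TO_NICE(139));   /* 19  */

        /* 250 ms expressed in nanoseconds is 250 jiffies at HZ=1000 */
        printf("250000000 ns = %llu jiffies\n",
               (unsigned long long)NS_TO_JIFFIES(250000000ULL));
        return 0;
}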
108 | */ 109 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 110 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 111 | @@ -79,9 +152,46 @@ 112 | /* 113 | * These are the 'tuning knobs' of the scheduler: 114 | * 115 | - * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, 116 | - * maximum timeslice is 200 msecs. Timeslices get refilled after 117 | - * they expire. 118 | + * MIN_TIMESLICE is the minimum timeslice that a task can be given. It resolves to about 10ms. 119 | + * 120 | + * MAX_TIMESLICE is the maximum timeslice that a task can be given. It resolves to about 200ms. 121 | + * 122 | + * ON_RUNQUEUE_WEIGHT ... 123 | + * 124 | + * CHILD_PENALTY is the penalty that the sleep_avg of forked child tasks gets 125 | + * in order to prevent very interactive tasks from spawning other very interactive 126 | + * tasks. 127 | + * 128 | + * PARENT_PENALTY is the penalty that the sleep_avg of parents who forked tasks 129 | + * gets in order to prevent very interactive tasks from spawning other very interactive 130 | + * tasks. 131 | + * 132 | + * EXIT_WEIGHT ... 133 | + * 134 | + * PRIO_BONUS_RATIO is the ratio used to determine MAX_BONUS. 135 | + * 136 | + * MAX_BONUS ... MAX_USER_PRIO resolves to 40, and PRIO_BONUS_RATIO is 25. 137 | + * So essentially this means that the max bonus that can be 138 | + * given to a task is 25% of the total non-RT priority 139 | + * range. Since there are 40 possible non-RT priorities, this 140 | + * resolves to 10. 141 | + * 142 | + * INTERACTIVE_DELTA is the static component used to determine whether or not a task 143 | + * should be considered interactive. The higher this is, the more difficult it is for 144 | + * tasks to be considered interactive. See the DELTA and TASK_INTERACTIVE macros for 145 | + * more information. 146 | + * 147 | + * MAX_SLEEP_AVG is the number of jiffies that is the maximum average sleep time for 148 | + * a task. The higher a task's sleep_avg, the more interactive it is, so this essentially 149 | + * puts a limit on how interactive a task can be. 150 | + * 151 | + * STARVATION_LIMIT is the time limit for which a runnable task may be deprived of 152 | + * CPU time before it is considered to be starving. 153 | + * 154 | + * NS_MAX_SLEEP_AVG is the same as MAX_SLEEP_AVG, but in nanoseconds. 155 | + * 156 | + * CREDIT_LIMIT is used to determine whether or not a task has high or low interactivity 157 | + * credit. See the macros HIGH_CREDIT and LOW_CREDIT. 158 | + */ 159 | #define MIN_TIMESLICE ( 10 * HZ / 1000) 160 | #define MAX_TIMESLICE (200 * HZ / 1000) 161 | @@ -101,7 +211,9 @@ 162 | * If a task is 'interactive' then we reinsert it in the active 163 | * array after it has expired its current timeslice. (it will not 164 | * continue to run immediately, it will still roundrobin with 165 | - * other interactive tasks.) 166 | + * other interactive tasks.) This behavior does not prevent the expired 167 | + * and unexpired queues from ever being swapped - they will get swapped 168 | + * as soon as something in the expired queue is going to starve. 169 | * 170 | * This part scales the interactivity limit depending on niceness. 171 | * 172 | @@ -116,7 +228,9 @@ 173 | * 174 | * (the X axis represents the possible -5 ... 0 ... +5 dynamic 175 | * priority range a task can explore, a value of '1' means the 176 | - * task is rated interactive.) 177 | + * task is rated interactive. So - there are 11 columns. 
The middle 178 | + * column is whether or not a task with a certain user-nice level 179 | + * is considered interactive if given no + or - bonus at all.) 180 | * 181 | * Ie. nice +19 tasks can never get 'interactive' enough to be 182 | * reinserted into the active array. And only heavily CPU-hog nice -20 183 | @@ -125,10 +239,26 @@ 184 | * too hard. 185 | */ 186 | 187 | +/* 188 | + * The process's current bonus is its sleep average in jiffies times MAX_BONUS 189 | + * divided by MAX_SLEEP_AVG. Essentially it scales a process's sleep average into 190 | + * the range 0 to MAX_BONUS. 191 | + */ 192 | #define CURRENT_BONUS(p) \ 193 | (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ 194 | MAX_SLEEP_AVG) 195 | 196 | +/* 197 | + * If an interactive task has too long a timeslice, it may 198 | + * be preempted by a task of equal priority. The task 199 | + * does not lose its timeslice, it is just put on the bottom of the 200 | + * list of tasks of its priority waiting to run. If there 201 | + * was a task of higher priority, it would have already preempted 202 | + * a given task. TIMESLICE_GRANULARITY is the time limit for 203 | + * what is considered "too long" a timeslice. It is called granularity 204 | + * because the timeslice is effectively broken up if it is longer than 205 | + * TIMESLICE_GRANULARITY. 206 | + */ 207 | #ifdef CONFIG_SMP 208 | #define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ 209 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ 210 | @@ -138,12 +268,38 @@ 211 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) 212 | #endif 213 | 214 | +/* 215 | + * This macro is used in the TASK_INTERACTIVE macro to decide if a 216 | + * task should be considered interactive. SCALE calculates how much 217 | + * higher in priority a task must be from its nice value, minus the 218 | + * INTERACTIVE_DELTA, in order to be considered interactive. The idea 219 | + * is that tasks with a higher priority nice value should not need to 220 | + * be given as much of a bonus in order to be considered interactive 221 | + * as tasks given a lower priority nice value. So, a task with a -10 222 | + * nice value will be more easily considered interactive than a task 223 | + * with a +10 nice value. Since INTERACTIVE_DELTA is static, SCALE 224 | + * provides a value to add to it in order to do the interactivity scaling. 225 | + */ 226 | #define SCALE(v1,v1_max,v2_max) \ 227 | (v1) * (v2_max) / (v1_max) 228 | 229 | +/* 230 | + * A task must be DELTA higher in priority than its nice 231 | + * value in order to be considered interactive. This value 232 | + * is the combination of the scaled factor and the constant 233 | + * INTERACTIVE_DELTA factor. 234 | + */ 235 | #define DELTA(p) \ 236 | (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 237 | 238 | +/* 239 | + * This macro returns whether or not a task should be considered 240 | + * interactive. If a task's priority value (lower values are higher 241 | + * priority) is less than or equal to its static_prio (i.e. nice value) 242 | + * minus DELTA, then it is interactive. This is because tasks are given 243 | + * priority-raising bonuses (prio lowering) based on heuristics 244 | + * that measure characteristics of interactivity. 
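(Illustrative aside, not part of the patch: a simplified userspace model of the bonus/interactivity arithmetic described above. The struct task type and helper names are stand-ins, the constants follow the defaults discussed in these comments (MAX_BONUS 10, INTERACTIVE_DELTA 2, MAX_SLEEP_AVG about 1000 jiffies), and the clamping to the non-RT priority range done by effective_prio() is omitted.)

#include <stdio.h>

#define MAX_RT_PRIO       100
#define MAX_PRIO          140
#define MAX_USER_PRIO     (MAX_PRIO - MAX_RT_PRIO)                  /* 40 */
#define PRIO_BONUS_RATIO  25
#define MAX_BONUS         (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)  /* 10 */
#define INTERACTIVE_DELTA 2
#define MAX_SLEEP_AVG     1000   /* jiffies; roughly AVG_TIMESLICE * MAX_BONUS at HZ=1000 */

/* toy stand-in for task_struct: sleep_avg is kept in jiffies here, not nanoseconds */
struct task { int static_prio; int prio; unsigned long sleep_avg; };

/* sleep_avg scaled onto 0..MAX_BONUS, like CURRENT_BONUS() */
static int current_bonus(const struct task *p)
{
        return (int)(p->sleep_avg * MAX_BONUS / MAX_SLEEP_AVG);
}

/* how far prio must sit below static_prio to count as interactive, like DELTA() */
static int delta(const struct task *p)
{
        int nice = p->static_prio - MAX_RT_PRIO - 20;         /* TASK_NICE()              */
        return nice * MAX_BONUS / 40 + INTERACTIVE_DELTA;     /* SCALE(nice, 40, MAX_BONUS) */
}

/* TASK_INTERACTIVE(): has the dynamic priority earned enough of a bonus over static_prio? */
static int task_interactive(const struct task *p)
{
        return p->prio <= p->static_prio - delta(p);
}

int main(void)
{
        /* a nice 0 task (static_prio 120) that has been sleeping about 80% of the time */
        struct task p = { 120, 0, 800 };
        int bonus = current_bonus(&p) - MAX_BONUS / 2;        /* as in effective_prio() */

        p.prio = p.static_prio - bonus;
        printf("bonus=%d prio=%d interactive=%d\n", bonus, p.prio, task_interactive(&p));
        return 0;
}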
245 | + */ 246 | #define TASK_INTERACTIVE(p) \ 247 | ((p)->prio <= (p)->static_prio - DELTA(p)) 248 | 249 | @@ -157,6 +313,10 @@ 250 | #define LOW_CREDIT(p) \ 251 | ((p)->interactive_credit < -CREDIT_LIMIT) 252 | 253 | +/* 254 | + * just tells whether or not there is a task in rq that 255 | + * should preempt the task p. 256 | + */ 257 | #define TASK_PREEMPTS_CURR(p, rq) \ 258 | ((p)->prio < (rq)->curr->prio) 259 | 260 | @@ -170,30 +330,91 @@ 261 | * 262 | * task_timeslice() is the interface that is used by the scheduler. 263 | */ 264 | - 265 | -#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ 266 | - ((MAX_TIMESLICE - MIN_TIMESLICE) * \ 267 | - (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) 268 | +#define BASE_TIMESLICE(p) (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) * \ 269 | + (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) 270 | 271 | static unsigned int task_timeslice(task_t *p) 272 | { 273 | return BASE_TIMESLICE(p); 274 | } 275 | 276 | +/* 277 | + * The task_hot macro takes a process, the current time, and a scheduler domain. 278 | + * A scheduler domain is essentially a grouping of processors that share cache. 279 | + * task_hot determines whether or not cache in a scheduler domain is likely 280 | + * to contain data that the given process could use. The value cache_hot_time 281 | + * is the amount of time that data is likely to remain in the cache. Thus, if 282 | + * the time between when the process was last run and now is less than that 283 | + * amount of time, it is likely that the cache will still be hot (i.e. contain 284 | + * relevant data). 285 | + */ 286 | #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) 287 | 288 | +/* These are the runqueue data structures: */ 289 | + 290 | /* 291 | - * These are the runqueue data structures: 292 | + * The BITMAP_SIZE macro resolves to the number of long integers 293 | + * required to create a bitmap with one bit per scheduler priority 294 | + * (there are MAX_PRIO priorities). 295 | + * 296 | + * The "...+1+7)/8" part might seem odd. MAX_PRIO + 1 covers all priorities, 297 | + * adding 7 ensures that the integer division by 8 rounds up to a whole number of bytes. 298 | + */ 299 | - 300 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) 301 | 302 | typedef struct runqueue runqueue_t; 303 | 304 | +/* 305 | + * The prio_array data structure is extremely important as it is what allows 306 | + * the Linux scheduling algorithm to perform in O(1) time. 307 | + * 308 | + * The basic structure in the Linux scheduler is the runqueue, defined below. 309 | + * There is one runqueue per processor, and within that runqueue there are two 310 | + * structures of type prio_array. One is for tasks that have not used up their 311 | + * timeslice yet, the other is for tasks that have used up their timeslice. The 312 | + * former are considered active, the latter expired. Note that active and expired 313 | + * have nothing to do with whether or not a task is runnable - active simply means 314 | + * that since the last time timeslices were allocated, a given task in that queue 315 | + * has not used up its timeslice. A task in the active list still has time available 316 | + * on the CPU, tasks in the expired list have used up their timeslice. 317 | + * 318 | + * The nr_active value stores the number of runnable tasks in the prio_array. 
The 319 | + * bitmap is a string of bits, one for each priority level on the system (140 by 320 | + * default), that indicates whether or not there are any tasks in the prio_array 321 | + * at a given priority level. The queue value is an array of pointers to arrays 322 | + * that store all tasks at a given priority level. 323 | + * 324 | + * So if there is only one runnable task in the prio_array, nr_active will be equal to 325 | + * one. If that task is not RT, and it has a nice value of 0, there will be 326 | + * a one in the 120th position of the bitmap to indicate that there is a task in the 327 | + * prio_array at that priority level. The queue array would have a pointer at the 120th 328 | + * position pointing to an array of length 1, its single element being the task in question. 329 | + * 330 | + * This is very useful because in order to determine the next task to run, the scheduler simply 331 | + * 1) looks to see if there are any runnable tasks in its active prio_array (i.e. is nr_active > 0) 332 | + * 2) if so, go to step 3 otherwise go to step 6 333 | + * 3) find the first 1 in the active prio_array's bitmap. There must be a 1 somewhere since 334 | + * we know that there is a task in the prio_array and it must have a priority level. 335 | + * 4) run the first task in the array at the position in the prio_array's queue equal to 336 | + * the first 1 found in the bitmap. 337 | + * 5) when the task is done running for some reason, recalculate its new timeslice and put it 338 | + * in the expired prio_array. decrement nr_active in the active prio_array, and increment 339 | + * it in the expired prio_array. if the task was the last task at a given priority, 340 | + * clear the priority's bit in the active prio_array and make sure the priority's bit 341 | + * is set in the expired prio_array. repeat from step 1 until no tasks exist in the active 342 | + * prio_array. 343 | + * 6) when no tasks exist in the active prio_array, swap the active and expired prio_arrays 344 | + * and start over again. since timeslices are recalculated for each process when 345 | + * it is put onto the expired array, the swap of prio_arrays is fast (i.e. no 346 | + * sitting around recalculating a timeslice for every task) 347 | + * 348 | + * This results in O(1) behavior since no step in the process requires iterating over a number 349 | + * of tasks that grows larger when the total number of tasks grows. 350 | + */ 351 | struct prio_array { 352 | - unsigned int nr_active; 353 | - unsigned long bitmap[BITMAP_SIZE]; 354 | - struct list_head queue[MAX_PRIO]; 355 | + unsigned int nr_active; /* number of runnable tasks in this prio_array */ 356 | + unsigned long bitmap[BITMAP_SIZE]; /* bitmap showing which priority levels contain tasks */ 357 | + struct list_head queue[MAX_PRIO]; /* an array of list heads, one for each priority on the system */ 358 | }; 359 | 360 | /* 361 | @@ -204,50 +425,61 @@ struct prio_array { 362 | * acquire operations must be ordered by ascending &runqueue. 363 | */ 364 | struct runqueue { 365 | - spinlock_t lock; 366 | + spinlock_t lock; /* lock that protects this runqueue */ 367 | 368 | - /* 369 | - * nr_running and cpu_load should be in the same cacheline because 370 | - * remote CPUs use both these fields when doing load calculation. 371 | - */ 372 | - unsigned long nr_running; 373 | + /* 374 | + * nr_running and cpu_load should be in the same cacheline because 375 | + * remote CPUs use both these fields when doing load calculation. 
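(Illustrative aside, not part of the patch: a toy userspace model of the prio_array bookkeeping and the O(1) lookup walked through above. It replaces the kernel's per-priority task lists with simple counters and uses a GCC builtin instead of sched_find_first_bit(), but the enqueue/dequeue/bitmap logic mirrors the description.)

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MAX_PRIO 140
#define WORDS    ((MAX_PRIO + 63) / 64)

struct prio_array {
        unsigned int nr_active;           /* runnable tasks in this array            */
        uint64_t bitmap[WORDS];           /* bit set => some task at that priority   */
        unsigned int queue_len[MAX_PRIO]; /* stand-in for the per-priority task lists */
};

static void enqueue(struct prio_array *a, int prio)
{
        a->queue_len[prio]++;
        a->bitmap[prio / 64] |= 1ULL << (prio % 64);
        a->nr_active++;
}

static void dequeue(struct prio_array *a, int prio)
{
        if (--a->queue_len[prio] == 0)
                a->bitmap[prio / 64] &= ~(1ULL << (prio % 64));
        a->nr_active--;
}

/* the O(1) step: scan a fixed number of bitmap words, never a task list */
static int find_first_prio(const struct prio_array *a)
{
        for (int w = 0; w < WORDS; w++)
                if (a->bitmap[w])
                        return w * 64 + __builtin_ctzll(a->bitmap[w]);
        return -1;                        /* array is empty */
}

int main(void)
{
        struct prio_array active;
        memset(&active, 0, sizeof(active));

        enqueue(&active, 120);            /* a nice 0 task   */
        enqueue(&active, 110);            /* a nice -10 task */
        printf("next prio to run: %d\n", find_first_prio(&active)); /* 110 */
        dequeue(&active, 110);
        printf("next prio to run: %d\n", find_first_prio(&active)); /* 120 */
        return 0;
}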
376 | + */ 377 | + unsigned long nr_running; /* number of runnable tasks */ 378 | #ifdef CONFIG_SMP 379 | - unsigned long cpu_load; 380 | + unsigned long cpu_load; /* this CPU's load */ 381 | #endif 382 | - unsigned long long nr_switches; 383 | - unsigned long expired_timestamp, nr_uninterruptible; 384 | - unsigned long long timestamp_last_tick; 385 | - task_t *curr, *idle; 386 | - struct mm_struct *prev_mm; 387 | - prio_array_t *active, *expired, arrays[2]; 388 | - int best_expired_prio; 389 | - atomic_t nr_iowait; 390 | + unsigned long long nr_switches; /* number of context switches */ 391 | + unsigned long expired_timestamp, nr_uninterruptible; /* time of last array swap and number of 392 | + uninterruptible processes in queue */ 393 | + unsigned long long timestamp_last_tick; /* timestamp of last scheduler tick */ 394 | + task_t *curr, *idle; /* this processor's current and idle tasks */ 395 | + struct mm_struct *prev_mm; /* the last running task's mm_struct */ 396 | + prio_array_t *active, *expired, arrays[2]; /* the active and expired prio_arrays */ 397 | + int best_expired_prio; /* highest priority that exists in the expired prio_array */ 398 | + atomic_t nr_iowait; /* number of tasks in the queue waiting on i/o */ 399 | 400 | #ifdef CONFIG_SMP 401 | - struct sched_domain *sd; 402 | + struct sched_domain *sd; /* in SMP systems there can be different scheduler domains */ 403 | 404 | /* For active balancing */ 405 | - int active_balance; 406 | + int active_balance; /* set when active load balancing is needed (see migration_thread()) */ 407 | int push_cpu; 408 | 409 | + /* the migration thread for the processor that this runqueue belongs to */ 410 | task_t *migration_thread; 411 | struct list_head migration_queue; 412 | #endif 413 | }; 414 | 415 | +/* Define one runqueue per CPU. */ 416 | static DEFINE_PER_CPU(struct runqueue, runqueues); 417 | 418 | +/* Iterate through domains that a CPU is a part of */ 419 | #define for_each_domain(cpu, domain) \ 420 | for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) 421 | 422 | +/* 423 | + * cpu_rq gets the runqueue for a given cpu 424 | + * 425 | + * this_rq gets the runqueue for the current cpu 426 | + * 427 | + * task_rq gets the runqueue that a certain task is in 428 | + * 429 | + * cpu_curr gets the current task on a given CPU 430 | + */ 431 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 432 | #define this_rq() (&__get_cpu_var(runqueues)) 433 | #define task_rq(p) cpu_rq(task_cpu(p)) 434 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 435 | 436 | -/* 437 | - * Default context-switch locking: 438 | - */ 439 | +/* Default context-switch locking */ 440 | #ifndef prepare_arch_switch 441 | # define prepare_arch_switch(rq, next) do { } while (0) 442 | # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) 443 | @@ -264,23 +496,28 @@ static runqueue_t *task_rq_lock(task_t * 444 | struct runqueue *rq; 445 | 446 | repeat_lock_task: 447 | - local_irq_save(*flags); 448 | - rq = task_rq(p); 449 | - spin_lock(&rq->lock); 450 | + local_irq_save(*flags); /* save irq flags */ 451 | + rq = task_rq(p); /* get runqueue for the task */ 452 | + spin_lock(&rq->lock); /* lock the runqueue */ 453 | + /* make sure the task is still on the runqueue we just locked */ 454 | if (unlikely(rq != task_rq(p))) { 455 | + /* if not, unlock and restore irq flags, then try again */ 456 | spin_unlock_irqrestore(&rq->lock, *flags); 457 | goto repeat_lock_task; 458 | } 459 | return rq; 460 | } 461 | 462 | +/* simply unlock a runqueue, not as touchy as locking! 
*/ 463 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 464 | { 465 | spin_unlock_irqrestore(&rq->lock, *flags); 466 | } 467 | 468 | /* 469 | - * rq_lock - lock a given runqueue and disable interrupts. 470 | + * rq_lock - lock the current processor's runqueue and disable interrupts. 471 | + * Since the current CPU is executing this code, its runqueue is easier to 472 | + * lock than if we were trying to lock some other CPU's runqueue (see task_rq_lock()). 473 | */ 474 | static runqueue_t *this_rq_lock(void) 475 | { 476 | @@ -293,6 +530,10 @@ static runqueue_t *this_rq_lock(void) 477 | return rq; 478 | } 479 | 480 | +/* 481 | + * A convenience method for making sure that runqueues get unlocked 482 | + * via the right lock mechanism. 483 | + */ 484 | static inline void rq_unlock(runqueue_t *rq) 485 | { 486 | spin_unlock_irq(&rq->lock); 487 | @@ -303,24 +544,37 @@ static inline void rq_unlock(runqueue_t 488 | */ 489 | static void dequeue_task(struct task_struct *p, prio_array_t *array) 490 | { 491 | - array->nr_active--; 492 | + array->nr_active--; /* one less active task in the array */ 493 | list_del(&p->run_list); 494 | + /* 495 | + * Clear the bit that says there is a task in the prio array with a certain priority 496 | + * if no more tasks at p's priority in the prio array. 497 | + */ 498 | if (list_empty(array->queue + p->prio)) 499 | __clear_bit(p->prio, array->bitmap); 500 | } 501 | 502 | static void enqueue_task(struct task_struct *p, prio_array_t *array) 503 | { 504 | + /* add the task at the right spot in the prio array */ 505 | list_add_tail(&p->run_list, array->queue + p->prio); 506 | + /* 507 | + * set the bit that says there is at least one task in the prio array 508 | + * with priority p->prio 509 | + */ 510 | __set_bit(p->prio, array->bitmap); 511 | - array->nr_active++; 512 | - p->array = array; 513 | + array->nr_active++; /* one more active task in the array */ 514 | + p->array = array; /* set the field in the task that says what prio array it is in */ 515 | } 516 | 517 | /* 518 | - * Used by the migration code - we pull tasks from the head of the 519 | - * remote queue so we want these tasks to show up at the head of the 520 | - * local queue: 521 | + * Migration code always has the highest priority. When CPUs go down (become 522 | + * idle), the idle task must get a higher priority than the migration code. 523 | + * This function is used by __activate_idle_task, which is called by 524 | + * sched_idle_next. sched_idle_next is called when CPUs get taken down. 525 | + * 526 | + * This is really similar to enqueue task, except it adds to the top of the list 527 | + * instead of the tail (list_add() instead of list_add_tail()). 528 | */ 529 | static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) 530 | { 531 | @@ -347,13 +601,28 @@ static inline void enqueue_task_head(str 532 | static int effective_prio(task_t *p) 533 | { 534 | int bonus, prio; 535 | - 536 | + 537 | + /* don't do anything if this is a RT task */ 538 | if (rt_task(p)) 539 | return p->prio; 540 | 541 | + /* 542 | + * take the CURRENT_BONUS, which is sleep_avg mapped onto 543 | + * 0-MAX_BONUS, and subtract half of MAX_BONUS since it is 544 | + * twice the possible + or - bonus. So if MAX_BONUS is 10, 545 | + * and a task sleeps a lot, it might get a CURRENT_BONUS of 546 | + * say, 8. Subtracting 5, that makes 3. This will be subtracted 547 | + * from static_prio since the task should have a high priority 548 | + * and lower prio values are higher priority. 
If a task sleeps 549 | + * very little, the bonus value calculated here will be negative. 550 | + * In that case, the negative value will get subtracted from 551 | + * static_prio, lowering the priority. 552 | + */ 553 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 554 | 555 | + /* give the task a prio based on the just-calculated bonus and static_prio */ 556 | prio = p->static_prio - bonus; 557 | + /* make sure the prio value is within non-RT bounds and return it */ 558 | if (prio < MAX_RT_PRIO) 559 | prio = MAX_RT_PRIO; 560 | if (prio > MAX_PRIO-1) 561 | @@ -379,8 +648,21 @@ static inline void __activate_idle_task( 562 | rq->nr_running++; 563 | } 564 | 565 | +/* 566 | + * This function recalculates a task's priority ("I know this because I can 567 | + * read" - John Fraser Hart). It is called by the main schedule() function 568 | + * when a task is moved to the expired prio array, and also when tasks are 569 | + * activated. 570 | + */ 571 | static void recalc_task_prio(task_t *p, unsigned long long now) 572 | { 573 | + /* 574 | + * __sleep_time is used because an unsigned long long will be able 575 | + * to hold a huge number, which might be the case in the calculation 576 | + * of "now - p-> timestamp" but will not be the case if the number 577 | + * is kept <= NS_MAX_SLEEP_AVG. So, once the number is calculated to 578 | + * be <= NS_MAX_SLEEP_AVG, then the unsigned long sleep_time is used. 579 | + */ 580 | unsigned long long __sleep_time = now - p->timestamp; 581 | unsigned long sleep_time; 582 | 583 | @@ -393,7 +675,7 @@ static void recalc_task_prio(task_t *p, 584 | /* 585 | * User tasks that sleep a long time are categorised as 586 | * idle and will get just interactive status to stay active & 587 | - * prevent them suddenly becoming cpu hogs and starving 588 | + * prevent them from suddenly becoming cpu hogs and starving 589 | * other processes. 590 | */ 591 | if (p->mm && p->activated != -1 && 592 | @@ -405,7 +687,9 @@ static void recalc_task_prio(task_t *p, 593 | } else { 594 | /* 595 | * The lower the sleep avg a task has the more 596 | - * rapidly it will rise with sleep time. 597 | + * rapidly it will rise with sleep time. If a task 598 | + * has a high sleep avg, CURRENT_BONUS(p) will be high, 599 | + * and thus MAX_BONUS - CURRENT_BONUS(p) will be low. 600 | */ 601 | sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; 602 | 603 | @@ -507,7 +791,13 @@ static void activate_task(task_t *p, run 604 | */ 605 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) 606 | { 607 | + /* one less running task */ 608 | rq->nr_running--; 609 | + /* 610 | + * this is leaving the running state and 611 | + * becoming uninterruptible, so increment 612 | + * nr_uninterruptible 613 | + */ 614 | if (p->state == TASK_UNINTERRUPTIBLE) 615 | rq->nr_uninterruptible++; 616 | dequeue_task(p, p->array); 617 | @@ -527,7 +817,7 @@ static void resched_task(task_t *p) 618 | int need_resched, nrpolling; 619 | 620 | preempt_disable(); 621 | - /* minimise the chance of sending an interrupt to poll_idle() */ 622 | + /* minimize the chance of sending an interrupt to poll_idle() */ 623 | nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 624 | need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); 625 | nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 626 | @@ -543,15 +833,19 @@ static inline void resched_task(task_t * 627 | } 628 | #endif 629 | 630 | -/** 631 | +/* 632 | * task_curr - is this task currently executing on a CPU? 633 | - * @p: the task in question. 
634 | */ 635 | inline int task_curr(const task_t *p) 636 | { 637 | return cpu_curr(task_cpu(p)) == p; 638 | } 639 | 640 | +/* 641 | + * This section contains code for migrating tasks between CPUs on 642 | + * SMP systems 643 | + */ 644 | + 645 | #ifdef CONFIG_SMP 646 | enum request_type { 647 | REQ_MOVE_TASK, 648 | @@ -563,11 +857,11 @@ typedef struct { 649 | enum request_type type; 650 | 651 | /* For REQ_MOVE_TASK */ 652 | - task_t *task; 653 | - int dest_cpu; 654 | + task_t *task; /* task to operate on */ 655 | + int dest_cpu; /* if REQ_MOVE_TASK, this is the destination CPU */ 656 | 657 | /* For REQ_SET_DOMAIN */ 658 | - struct sched_domain *sd; 659 | + struct sched_domain *sd; /* destination domain */ 660 | 661 | struct completion done; 662 | } migration_req_t; 663 | @@ -589,6 +883,10 @@ static int migrate_task(task_t *p, int d 664 | return 0; 665 | } 666 | 667 | + /* 668 | + * fill in migration request fields and add task to a 669 | + * migration queue, to be migrated later 670 | + */ 671 | init_completion(&req->done); 672 | req->type = REQ_MOVE_TASK; 673 | req->task = p; 674 | @@ -640,6 +938,13 @@ void kick_process(task_t *p) 675 | 676 | preempt_disable(); 677 | cpu = task_cpu(p); 678 | + /* 679 | + * If the process is on this CPU, then it's already in kernel mode, because we're 680 | + * executing right now. In that case, don't tell it to reschedule. If the process 681 | + * is not the current process on some CPU, then kernel mode must kick in before 682 | + * it runs, so again, don't bother rescheduling it. It should be obvious why this 683 | + * function doesn't apply on a UP system. 684 | + */ 685 | if ((cpu != smp_processor_id()) && task_curr(p)) 686 | smp_send_reschedule(cpu); 687 | preempt_enable(); 688 | @@ -661,9 +966,7 @@ static inline unsigned long source_load( 689 | return min(rq->cpu_load, load_now); 690 | } 691 | 692 | -/* 693 | - * Return a high guess at the load of a migration-target cpu 694 | - */ 695 | +/* Return a high guess at the load of a migration-target cpu */ 696 | static inline unsigned long target_load(int cpu) 697 | { 698 | runqueue_t *rq = cpu_rq(cpu); 699 | @@ -672,7 +975,7 @@ static inline unsigned long target_load( 700 | return max(rq->cpu_load, load_now); 701 | } 702 | 703 | -#endif 704 | +#endif /* CONFIG_SMP */ 705 | 706 | /* 707 | * wake_idle() is useful especially on SMT architectures to wake a 708 | @@ -689,16 +992,28 @@ static int wake_idle(int cpu, task_t *p) 709 | struct sched_domain *sd; 710 | int i; 711 | 712 | + /* if the task is already on an idle CPU, leave it there */ 713 | if (idle_cpu(cpu)) 714 | return cpu; 715 | 716 | + /* don't change CPUs if the scheduler domain does not support WAKE_IDLE */ 717 | sd = rq->sd; 718 | if (!(sd->flags & SD_WAKE_IDLE)) 719 | return cpu; 720 | 721 | + /* 722 | + * First, put the &'ed value of the scheduler domain span 723 | + * and the online CPU map into tmp. Then, & tmp with the 724 | + * cpus that p is allowed to run on. That gives a list 725 | + * of potential CPUs in the map tmp. 726 | + */ 727 | cpus_and(tmp, sd->span, cpu_online_map); 728 | cpus_and(tmp, tmp, p->cpus_allowed); 729 | 730 | + /* 731 | + * cycle through the cpu map tmp, made above, 732 | + * and send the task to the first idle CPU. 
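(Illustrative aside, not part of the patch: a simplified model of the cpumask filtering that wake_idle() performs, using plain 64-bit masks instead of cpumask_t. domain_span, online, and allowed stand in for sd->span, cpu_online_map, and p->cpus_allowed.)

#include <stdio.h>
#include <stdint.h>

/* return the first idle CPU the task may run on, or -1 to keep the old CPU */
static int pick_idle_cpu(uint64_t domain_span, uint64_t online,
                         uint64_t allowed, const int *cpu_is_idle, int ncpus)
{
        uint64_t tmp = domain_span & online & allowed;  /* candidate CPUs */

        for (int cpu = 0; cpu < ncpus; cpu++)
                if ((tmp & (1ULL << cpu)) && cpu_is_idle[cpu])
                        return cpu;
        return -1;
}

int main(void)
{
        int idle[4] = { 0, 0, 1, 1 };    /* CPUs 2 and 3 are idle              */
        /* task may run anywhere; domain spans CPUs 0-3; CPU 3 is offline      */
        int cpu = pick_idle_cpu(0xFULL, 0x7ULL, 0xFULL, idle, 4);
        printf("wake on CPU %d\n", cpu); /* prints 2 */
        return 0;
}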
733 | + */ 734 | for_each_cpu_mask(i, tmp) { 735 | if (idle_cpu(i)) 736 | return i; 737 | @@ -739,26 +1054,35 @@ static int try_to_wake_up(task_t * p, un 738 | int new_cpu; 739 | #endif 740 | 741 | + /* 742 | + * lock the task's runqueue, disabling interrupts, 743 | + * then check to see if the task is in one of the 744 | + * states we wish to wake it from. If not, get out. 745 | + */ 746 | rq = task_rq_lock(p, &flags); 747 | old_state = p->state; 748 | if (!(old_state & state)) 749 | goto out; 750 | 751 | + /* the task is already awake if it is in a prio array! */ 752 | if (p->array) 753 | goto out_running; 754 | - 755 | + 756 | cpu = task_cpu(p); 757 | this_cpu = smp_processor_id(); 758 | 759 | #ifdef CONFIG_SMP 760 | + /* if the task is running but was interrupted, we just need to activate it */ 761 | if (unlikely(task_running(rq, p))) 762 | goto out_activate; 763 | 764 | new_cpu = cpu; 765 | 766 | + /* if the task's CPU is this CPU or this CPU is not one it is allowed on... */ 767 | if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 768 | goto out_set_cpu; 769 | 770 | + /* grab the load on the source and target CPUs */ 771 | load = source_load(cpu); 772 | this_load = target_load(this_cpu); 773 | 774 | @@ -809,8 +1133,10 @@ out_set_cpu: 775 | /* might preempt at this point */ 776 | rq = task_rq_lock(p, &flags); 777 | old_state = p->state; 778 | + /* If the state of p is not one we wish to wake from, get out */ 779 | if (!(old_state & state)) 780 | goto out; 781 | + /* if p is in a prio array, it is already running */ 782 | if (p->array) 783 | goto out_running; 784 | 785 | @@ -852,6 +1178,7 @@ out: 786 | return success; 787 | } 788 | 789 | +/* just an exported convenience function for try_to_wake_up() */ 790 | int fastcall wake_up_process(task_t * p) 791 | { 792 | return try_to_wake_up(p, TASK_STOPPED | 793 | @@ -899,7 +1226,8 @@ void fastcall sched_fork(task_t *p) 794 | p->time_slice = (current->time_slice + 1) >> 1; 795 | /* 796 | * The remainder of the first timeslice might be recovered by 797 | - * the parent if the child exits early enough. 798 | + * the parent if the child exits early enough. Set first_time_slice 799 | + * in order to indicate that p's timeslice is reclaimable. 800 | */ 801 | p->first_time_slice = 1; 802 | current->time_slice >>= 1; 803 | @@ -930,6 +1258,7 @@ void fastcall wake_up_forked_process(tas 804 | unsigned long flags; 805 | runqueue_t *rq = task_rq_lock(current, &flags); 806 | 807 | + /* The freshly forked process should not already be running! */ 808 | BUG_ON(p->state != TASK_RUNNING); 809 | 810 | /* 811 | @@ -943,14 +1272,17 @@ void fastcall wake_up_forked_process(tas 812 | p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * 813 | CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 814 | 815 | + /* Start off with an interactive credit of 0. */ 816 | p->interactive_credit = 0; 817 | 818 | + /* Set an initial priority and CPU. The initial CPU is the current CPU. */ 819 | p->prio = effective_prio(p); 820 | set_task_cpu(p, smp_processor_id()); 821 | 822 | + /* If the task is not already on a runqueue prio array, put it on one. */ 823 | if (unlikely(!current->array)) 824 | __activate_task(p, rq); 825 | - else { 826 | + else { /* Otherwise just situate it in the runqueue it's in. 
*/ 827 | p->prio = current->prio; 828 | list_add_tail(&p->run_list, ¤t->run_list); 829 | p->array = current->array; 830 | @@ -975,6 +1307,10 @@ void fastcall sched_exit(task_t * p) 831 | runqueue_t *rq; 832 | 833 | local_irq_save(flags); 834 | + /* 835 | + * if the exiting child was only on its first time slice, 836 | + * give it back to the parent 837 | + */ 838 | if (p->first_time_slice) { 839 | p->parent->time_slice += p->time_slice; 840 | if (unlikely(p->parent->time_slice > MAX_TIMESLICE)) 841 | @@ -1055,6 +1391,10 @@ task_t * context_switch(runqueue_t *rq, 842 | struct mm_struct *mm = next->mm; 843 | struct mm_struct *oldmm = prev->active_mm; 844 | 845 | + /* 846 | + * If the new task doesn't have an mm, make it the same 847 | + * as the old task's. 848 | + */ 849 | if (unlikely(!mm)) { 850 | next->active_mm = oldmm; 851 | atomic_inc(&oldmm->mm_count); 852 | @@ -1062,6 +1402,13 @@ task_t * context_switch(runqueue_t *rq, 853 | } else 854 | switch_mm(oldmm, mm, next); 855 | 856 | + /* 857 | + * If the previous task does not have an mm, 858 | + * set its active_mm field to NULL, warn, and 859 | + * then set the runqueue's previous mm to the previous 860 | + * task's active_mm for use in making good cache hotness 861 | + * decisions in the future. 862 | + */ 863 | if (unlikely(!prev->mm)) { 864 | prev->active_mm = NULL; 865 | WARN_ON(rq->prev_mm); 866 | @@ -1177,9 +1524,15 @@ static int find_idlest_cpu(struct task_s 867 | min_cpu = UINT_MAX; 868 | min_load = ULONG_MAX; 869 | 870 | + /* set mask to a map created by 871 | + * 1) getting a bitmap of online CPUs in the right scheduler domain 872 | + * 2) & the map from step 1 with p's allowed CPU 873 | + * The result is a map of CPUs that p could potentially run on. 874 | + */ 875 | cpus_and(mask, sd->span, cpu_online_map); 876 | cpus_and(mask, mask, p->cpus_allowed); 877 | 878 | + /* cycle through each CPU looking for the one with the lowest load */ 879 | for_each_cpu_mask(i, mask) { 880 | load = target_load(i); 881 | 882 | @@ -1226,11 +1579,13 @@ void fastcall wake_up_forked_thread(task 883 | 884 | /* 885 | * Find the largest domain that this CPU is part of that 886 | - * is willing to balance on clone: 887 | + * is willing to balance on clone; that is, a domain willing 888 | + * to accept cloned tasks onto its CPUs. 889 | */ 890 | for_each_domain(this_cpu, tmp) 891 | if (tmp->flags & SD_BALANCE_CLONE) 892 | sd = tmp; 893 | + /* If a domain was found, choose its idlest CPU, otherwise just use this CPU */ 894 | if (sd) 895 | cpu = find_idlest_cpu(p, this_cpu, sd); 896 | else 897 | @@ -1256,7 +1611,9 @@ lock_again: 898 | /* 899 | * We decrease the sleep average of forking parents 900 | * and children as well, to keep max-interactive tasks 901 | - * from forking tasks that are max-interactive. 902 | + * from forking tasks that are max-interactive. This is similar 903 | + * to what we do when new processes are forked 904 | + * (in wake_up_forked_process()) 905 | */ 906 | current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * 907 | PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 908 | @@ -1341,10 +1698,16 @@ void sched_balance_exec(void) 909 | if (this_rq()->nr_running <= 1) 910 | goto out; 911 | 912 | + /* 913 | + * Find the largest domain this CPU belongs to that is willing to 914 | + * balance on exec. 915 | + */ 916 | for_each_domain(this_cpu, tmp) 917 | if (tmp->flags & SD_BALANCE_EXEC) 918 | sd = tmp; 919 | 920 | + /* If a domain was found, find its idlest CPU and migrate there 921 | + * Otherwise, just stay on this CPU. 
922 | if (sd) { 923 | new_cpu = find_idlest_cpu(current, this_cpu, sd); 924 | if (new_cpu != this_cpu) { 925 | @@ -1385,10 +1748,11 @@ void pull_task(runqueue_t *src_rq, prio_ 926 | set_task_cpu(p, this_cpu); 927 | this_rq->nr_running++; 928 | enqueue_task(p, this_array); 929 | - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 930 | + /* account for differences in timestamp between CPUs */ 931 | + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 932 | + this_rq->timestamp_last_tick; 933 | /* 934 | - * Note that idle threads have a prio of MAX_PRIO, for this test 935 | + * Note that idle threads have a prio of MAX_PRIO, causing this test 936 | * to be always true for them. 937 | */ 938 | if (TASK_PREEMPTS_CURR(p, this_rq)) 939 | @@ -1514,17 +1878,21 @@ find_busiest_group(struct sched_domain * 940 | 941 | max_load = this_load = total_load = total_pwr = 0; 942 | 943 | + /* go through each group, done with a do loop since this is a circular linked list */ 944 | do { 945 | cpumask_t tmp; 946 | unsigned long load; 947 | int local_group; 948 | int i, nr_cpus = 0; 949 | 950 | + /* is the current CPU in the group we're looking at? */ 951 | local_group = cpu_isset(this_cpu, group->cpumask); 952 | 953 | /* Tally up the load of all CPUs in the group */ 954 | avg_load = 0; 955 | + /* make a map, tmp, of CPUs in this group and online */ 956 | cpus_and(tmp, group->cpumask, cpu_online_map); 957 | + /* if this group doesn't contain any online CPUs, move on */ 958 | if (unlikely(cpus_empty(tmp))) 959 | goto nextgroup; 960 | 961 | @@ -1539,6 +1907,10 @@ find_busiest_group(struct sched_domain * 962 | avg_load += load; 963 | } 964 | 965 | + /* 966 | + * This check is redundant since it can never be true, and has 967 | + * apparently been fixed in Linux 2.6.10rc3 968 | + */ 969 | if (!nr_cpus) 970 | goto nextgroup; 971 | 972 | @@ -1551,7 +1923,7 @@ find_busiest_group(struct sched_domain * 973 | if (local_group) { 974 | this_load = avg_load; 975 | this = group; 976 | - goto nextgroup; 977 | + goto nextgroup; /* pointless goto since it goes there anyway */ 978 | } else if (avg_load > max_load) { 979 | max_load = avg_load; 980 | busiest = group; 981 | @@ -1573,7 +1945,7 @@ nextgroup: 982 | * We're trying to get all the cpus to the average_load, so we don't 983 | * want to push ourselves above the average load, nor do we wish to 984 | * reduce the max loaded cpu below the average load, as either of these 985 | - * actions would just result in more rebalancing later, and ping-pong 986 | + * actions would just result in more rebalancing later, and ping-ponging 987 | * tasks around. Thus we look for the minimum possible imbalance. 988 | * Negative imbalances (*we* are more loaded than anyone else) will 989 | * be counted as no imbalance for these purposes -- we can't fix that 990 | @@ -1985,6 +2357,7 @@ void scheduler_tick(int user_ticks, int 991 | runqueue_t *rq = this_rq(); 992 | task_t *p = current; 993 | 994 | + /* update last tick timestamp to now */ 995 | rq->timestamp_last_tick = sched_clock(); 996 | 997 | if (rcu_pending(cpu)) 998 | @@ -1998,24 +2371,36 @@ void scheduler_tick(int user_ticks, int 999 | cpustat->softirq += sys_ticks; 1000 | sys_ticks = 0; 1001 | } 1002 | - 1003 | + 1004 | + /* if the current task is the idle task... */ 1005 | if (p == rq->idle) { 1006 | + /* If at least one task is waiting on i/o, then 1007 | + * the time since the last tick was spent waiting 1008 | + * on I/O, and that is why we're idle. Otherwise, we just 1009 | + * have nothing to do. 
Update cpustat accordingly. 1010 | + */ 1011 | if (atomic_read(&rq->nr_iowait) > 0) 1012 | cpustat->iowait += sys_ticks; 1013 | else 1014 | cpustat->idle += sys_ticks; 1015 | + /* wake up a priority sleeper since we're idle for one reason or another */ 1016 | if (wake_priority_sleeper(rq)) 1017 | goto out; 1018 | + /* if we couldn't wake anything up, then try to rebalance */ 1019 | rebalance_tick(cpu, rq, IDLE); 1020 | + /* leave since we were idle and did what we could */ 1021 | return; 1022 | } 1023 | + 1024 | if (TASK_NICE(p) > 0) 1025 | cpustat->nice += user_ticks; 1026 | else 1027 | cpustat->user += user_ticks; 1028 | cpustat->system += sys_ticks; 1029 | 1030 | - /* Task might have expired already, but not scheduled off yet */ 1031 | + /* Task might have expired already, but not scheduled off yet. 1032 | + * Possible since we're in a timer interrupt right now. 1033 | + */ 1034 | if (p->array != rq->active) { 1035 | set_tsk_need_resched(p); 1036 | goto out; 1037 | } 1038 | @@ -2044,22 +2429,34 @@ void scheduler_tick(int user_ticks, int 1038 | } 1039 | goto out_unlock; 1040 | } 1041 | + /* if the task is out of time */ 1042 | if (!--p->time_slice) { 1043 | + /* dequeue it from the active prio array */ 1044 | dequeue_task(p, rq->active); 1045 | + /* reschedule it */ 1046 | set_tsk_need_resched(p); 1047 | + /* recalculate its priority */ 1048 | p->prio = effective_prio(p); 1049 | + /* give it a new timeslice */ 1050 | p->time_slice = task_timeslice(p); 1051 | + /* 1052 | + * This can't be its first timeslice since it just ran out 1053 | + * of one. Remember that tasks that exit on their first timeslice 1054 | + * can give part of their timeslice back to the parent task. 1055 | + */ 1056 | p->first_time_slice = 0; 1057 | 1058 | if (!rq->expired_timestamp) 1059 | rq->expired_timestamp = jiffies; 1060 | + /* if the task is not interactive or there is something starving on the expired list */ 1061 | if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { 1062 | + /* enqueue the task on the expired list */ 1063 | enqueue_task(p, rq->expired); 1064 | if (p->static_prio < rq->best_expired_prio) 1065 | rq->best_expired_prio = p->static_prio; 1066 | - } else 1067 | + } else /* otherwise put it back on the active list */ 1068 | enqueue_task(p, rq->active); 1069 | - } else { 1070 | + } else { /* task is not out of time */ 1071 | /* 1072 | * Prevent a too long timeslice allowing a task to monopolize 1073 | * the CPU. We do this by splitting up the timeslice into 1074 | @@ -2088,12 +2485,22 @@ void scheduler_tick(int user_ticks, int 1075 | } 1076 | } 1077 | out_unlock: 1078 | + /* we are done messing with this runqueue so unlock it */ 1079 | spin_unlock(&rq->lock); 1080 | out: 1081 | + /* see if we need to do some rebalancing */ 1082 | rebalance_tick(cpu, rq, NOT_IDLE); 1083 | } 1084 | 1085 | #ifdef CONFIG_SCHED_SMT 1086 | +/* 1087 | + * If there are other idle virtual processors associated with the given cpu, 1088 | + * and they have runnable tasks, try to wake them up. This is called in 1089 | + * schedule(), when the current CPU has no runnable tasks and idle rebalancing 1090 | + * fails to add any runnable tasks. This is because on SMT, tasks can be sleeping 1091 | + * in order to give other sibling processors with higher priority tasks full 1092 | + * access to cache. 
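(Illustrative aside, not part of the patch: a reduced model of the timeslice-expiry path scheduler_tick() takes, as described in the comments above: decrement time_slice, refill it from the static priority, and requeue to the expired array unless the task is interactive and nothing on the expired array is starving. The struct task type and constants are stand-ins for this sketch.)

#include <stdio.h>

#define MIN_TIMESLICE 10                 /* roughly 10ms in ticks for this sketch */
#define MAX_TIMESLICE 200

struct task { int static_prio; int time_slice; int interactive; };

/* static_prio 100..139 mapped linearly onto MAX..MIN timeslice, like BASE_TIMESLICE */
static int task_timeslice(const struct task *p)
{
        return MIN_TIMESLICE + (MAX_TIMESLICE - MIN_TIMESLICE) *
               (139 - p->static_prio) / 39;
}

/* returns 1 if the task should go to the expired array, 0 if it stays active */
static int tick(struct task *p, int expired_starving)
{
        if (--p->time_slice > 0)
                return 0;                        /* still has time left         */

        p->time_slice = task_timeslice(p);       /* refill for its next round   */
        if (p->interactive && !expired_starving)
                return 0;                        /* reinserted into the active array */
        return 1;                                /* goes to the expired array   */
}

int main(void)
{
        struct task p = { 120, 1, 1 };           /* nice 0, interactive, 1 tick left */
        printf("to expired: %d, new slice: %d\n", tick(&p, 0), p.time_slice);
        return 0;
}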
1093 | + */ 1094 | static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) 1095 | { 1096 | int i; 1097 | @@ -2205,12 +2612,12 @@ asmlinkage void __sched schedule(void) 1098 | } 1099 | 1100 | need_resched: 1101 | - preempt_disable(); 1102 | - prev = current; 1103 | - rq = this_rq(); 1104 | + preempt_disable(); /* do not allow this algorithm to be preempted */ 1105 | + prev = current; /* whatever task is running now will be the previous task */ 1106 | + rq = this_rq(); /* get the runqueue for the processor that needs scheduling */ 1107 | 1108 | release_kernel_lock(prev); 1109 | - now = sched_clock(); 1110 | + now = sched_clock(); /* get the current time in nanoseconds */ 1111 | if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) 1112 | run_time = now - prev->timestamp; 1113 | else 1114 | @@ -2226,10 +2633,6 @@ need_resched: 1115 | 1116 | spin_lock_irq(&rq->lock); 1117 | 1118 | - /* 1119 | - * if entering off of a kernel preemption go straight 1120 | - * to picking the next task. 1121 | - */ 1122 | switch_count = &prev->nivcsw; 1123 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 1124 | switch_count = &prev->nvcsw; 1125 | @@ -2241,6 +2644,11 @@ need_resched: 1126 | } 1127 | 1128 | cpu = smp_processor_id(); 1129 | + /* 1130 | + * If there are no runnable tasks in the runqueue, try to do an idle balance. 1131 | + * If nothing is runnable after that, just switch to idle. No need to swap arrays 1132 | + * since there is nothing runnable in the expired array or the active one. 1133 | + */ 1134 | if (unlikely(!rq->nr_running)) { 1135 | idle_balance(cpu, rq); 1136 | if (!rq->nr_running) { 1137 | @@ -2251,6 +2659,7 @@ need_resched: 1138 | } 1139 | } 1140 | 1141 | + /* If there are no runnable tasks in the active prio array, swap arrays. */ 1142 | array = rq->active; 1143 | if (unlikely(!array->nr_active)) { 1144 | /* 1145 | @@ -2263,15 +2672,27 @@ need_resched: 1146 | rq->best_expired_prio = MAX_PRIO; 1147 | } 1148 | 1149 | + /* find the first priority level with tasks in it, and grab the first task in it */ 1150 | idx = sched_find_first_bit(array->bitmap); 1151 | queue = array->queue + idx; 1152 | next = list_entry(queue->next, task_t, run_list); 1153 | 1154 | + /* 1155 | + * If there is a dependent sleeper, a task sleeping on a sibling virtual 1156 | + * CPU on SMT systems, just switch to idle and let dependent_sleeper() wake 1157 | + * up the dependent task. 1158 | + */ 1159 | if (dependent_sleeper(cpu, rq, next)) { 1160 | next = rq->idle; 1161 | goto switch_tasks; 1162 | } 1163 | 1164 | + /* 1165 | + * If the next task is not an RT task and has been woken up, 1166 | + * give it a new priority calculated with a longer sleep time 1167 | + * as a boost. If the task is a normal first-time wakeup 1168 | + * (next->activated == 1), weigh down the bonus. 
1169 | + */ 1170 | if (!rt_task(next) && next->activated > 0) { 1171 | unsigned long long delta = now - next->timestamp; 1172 | 1173 | @@ -2283,13 +2704,23 @@ need_resched: 1174 | recalc_task_prio(next, next->timestamp + delta); 1175 | enqueue_task(next, array); 1176 | } 1177 | + /* clear the next task's activated status */ 1178 | next->activated = 0; 1179 | + 1180 | + /* make the switch to whatever next task was selected */ 1181 | switch_tasks: 1182 | prefetch(next); 1183 | clear_tsk_need_resched(prev); 1184 | RCU_qsctr(task_cpu(prev))++; 1185 | 1186 | + /* subtract running time from previous task's sleep_avg */ 1187 | prev->sleep_avg -= run_time; 1188 | + 1189 | + /* 1190 | + * If the previous task's sleep average is 0 or lower now, 1191 | + * set it to 0 and then drop its interactive credit since 1192 | + * it obviously wasn't sleeping much and is thus less I/O bound. 1193 | + */ 1194 | if ((long)prev->sleep_avg <= 0) { 1195 | prev->sleep_avg = 0; 1196 | if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) 1197 | @@ -2297,6 +2728,7 @@ switch_tasks: 1198 | } 1199 | prev->timestamp = now; 1200 | 1201 | + /* make the actual context switch if prev and next are not the same */ 1202 | if (likely(prev != next)) { 1203 | next->timestamp = now; 1204 | rq->nr_switches++; 1205 | @@ -2313,6 +2745,10 @@ switch_tasks: 1206 | 1207 | reacquire_kernel_lock(current); 1208 | preempt_enable_no_resched(); 1209 | + /* 1210 | + * Since preemption was disabled this whole time, check to see if kernel 1211 | + * preemption was requested (reschedule requested) and reschedule if so. 1212 | + */ 1213 | if (test_thread_flag(TIF_NEED_RESCHED)) 1214 | goto need_resched; 1215 | } 1216 | @@ -2331,12 +2767,18 @@ asmlinkage void __sched preempt_schedule 1217 | 1218 | /* 1219 | * If there is a non-zero preempt_count or interrupts are disabled, 1220 | - * we do not want to preempt the current task. Just return.. 1221 | + * we do not want to preempt the current task. Just return. 1222 | */ 1223 | if (unlikely(ti->preempt_count || irqs_disabled())) 1224 | return; 1225 | 1226 | need_resched: 1227 | + /* 1228 | + * Set preempt count to indicate that we are preempting, reschedule, 1229 | + * and then clear the preempt count as rescheduling has happened. 1230 | + * Only needs to reschedule once no matter how many times the reschedule 1231 | + * was requested. 1232 | + */ 1233 | ti->preempt_count = PREEMPT_ACTIVE; 1234 | schedule(); 1235 | ti->preempt_count = 0; 1236 | @@ -2350,6 +2792,7 @@ need_resched: 1237 | EXPORT_SYMBOL(preempt_schedule); 1238 | #endif /* CONFIG_PREEMPT */ 1239 | 1240 | +/* exported call for trying to wake up a task */ 1241 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) 1242 | { 1243 | task_t *p = curr->task; 1244 | @@ -2372,11 +2815,17 @@ static void __wake_up_common(wait_queue_ 1245 | { 1246 | struct list_head *tmp, *next; 1247 | 1248 | + /* go through each task in the wait queue */ 1249 | list_for_each_safe(tmp, next, &q->task_list) { 1250 | wait_queue_t *curr; 1251 | unsigned flags; 1252 | curr = list_entry(tmp, wait_queue_t, task_list); 1253 | flags = curr->flags; 1254 | + /* 1255 | + * Try to wake up the task, and if it was exclusive and there are more 1256 | + * exclusive tasks in the wait queue, then quit. Don't want to wake up 1257 | + * more than one exclusive task at a time. 
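(Illustrative aside, not part of the patch: the exclusive-wakeup rule described above, modeled in plain C. Waiters are woken in order, but the loop stops once nr_exclusive exclusive waiters have been woken; struct waiter and try_wake() are hypothetical stand-ins for the wait_queue_t machinery.)

#include <stdio.h>

struct waiter { const char *name; int exclusive; };

/* stand-in for curr->func(): pretend every wakeup attempt succeeds */
static int try_wake(const struct waiter *w)
{
        printf("woke %s\n", w->name);
        return 1;
}

static void wake_up_common(struct waiter *q, int n, int nr_exclusive)
{
        for (int i = 0; i < n; i++) {
                /* stop after waking nr_exclusive exclusive waiters */
                if (try_wake(&q[i]) && q[i].exclusive && !--nr_exclusive)
                        break;
        }
}

int main(void)
{
        struct waiter q[] = {
                { "A (non-exclusive)", 0 },
                { "B (exclusive)",     1 },
                { "C (exclusive)",     1 },   /* not woken: only 1 exclusive wakeup allowed */
        };
        wake_up_common(q, 3, 1);
        return 0;
}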
1258 | + */ 1259 | if (curr->func(curr, mode, sync, key) && 1260 | (flags & WQ_FLAG_EXCLUSIVE) && 1261 | !--nr_exclusive) 1262 | @@ -2428,9 +2877,11 @@ void fastcall __wake_up_sync(wait_queue_ 1263 | unsigned long flags; 1264 | int sync = 1; 1265 | 1266 | + /* obviously, leave if there is no wait queue */ 1267 | if (unlikely(!q)) 1268 | return; 1269 | 1270 | + /* if there are no exclusive tasks, don't do sync */ 1271 | if (unlikely(!nr_exclusive)) 1272 | sync = 0; 1273 | 1274 | @@ -2585,6 +3036,7 @@ void set_user_nice(task_t *p, long nice) 1275 | if (array) 1276 | dequeue_task(p, array); 1277 | 1278 | + /* set the new static_prio and just adjust the dynamic prio instead of recalculating */ 1279 | old_prio = p->prio; 1280 | new_prio = NICE_TO_PRIO(nice); 1281 | delta = new_prio - old_prio; 1282 | @@ -2743,6 +3195,7 @@ static int setscheduler(pid_t pid, int p 1283 | */ 1284 | rq = task_rq_lock(p, &flags); 1285 | 1286 | + /* makes sure the policy is sane */ 1287 | if (policy < 0) 1288 | policy = p->policy; 1289 | else { 1290 | @@ -2910,6 +3363,7 @@ asmlinkage long sys_sched_setaffinity(pi 1291 | if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) 1292 | return -EFAULT; 1293 | 1294 | + /* don't allow CPU hotplugging while we do this - obvious consequences */ 1295 | lock_cpu_hotplug(); 1296 | read_lock(&tasklist_lock); 1297 | 1298 | @@ -3397,9 +3851,9 @@ static void __migrate_task(struct task_s 1299 | */ 1300 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick 1301 | + rq_dest->timestamp_last_tick; 1302 | - deactivate_task(p, rq_src); 1303 | - activate_task(p, rq_dest, 0); 1304 | - if (TASK_PREEMPTS_CURR(p, rq_dest)) 1305 | + deactivate_task(p, rq_src); /* off the runqueue it is on */ 1306 | + activate_task(p, rq_dest, 0); /* on the runqueue it should be on */ 1307 | + if (TASK_PREEMPTS_CURR(p, rq_dest)) /* perhaps preempt dest cpu's current task */ 1308 | resched_task(rq_dest->curr); 1309 | } 1310 | 1311 | @@ -3421,25 +3875,33 @@ static int migration_thread(void * data) 1312 | BUG_ON(rq->migration_thread != current); 1313 | 1314 | set_current_state(TASK_INTERRUPTIBLE); 1315 | + 1316 | + /* basically, just keep trying to be helpful in one way or another 1317 | + * until we're told to die... 1318 | + */ 1319 | while (!kthread_should_stop()) { 1320 | struct list_head *head; 1321 | migration_req_t *req; 1322 | 1323 | + /* can I freeze the current thread for you? */ 1324 | if (current->flags & PF_FREEZE) 1325 | refrigerator(PF_FREEZE); 1326 | 1327 | spin_lock_irq(&rq->lock); 1328 | 1329 | + /* Is this CPU offline? If so, I'll just go die. */ 1330 | if (cpu_is_offline(cpu)) { 1331 | spin_unlock_irq(&rq->lock); 1332 | goto wait_to_die; 1333 | } 1334 | 1335 | + /* does my runqueue need to be balanced? */ 1336 | if (rq->active_balance) { 1337 | active_load_balance(rq, cpu); 1338 | rq->active_balance = 0; 1339 | } 1340 | 1341 | + /* anything need to be migrated? If not, schedule me out. 
*/ 1342 | head = &rq->migration_queue; 1343 | 1344 | if (list_empty(head)) { 1345 | @@ -3448,6 +3910,7 @@ static int migration_thread(void * data) 1346 | set_current_state(TASK_INTERRUPTIBLE); 1347 | continue; 1348 | } 1349 | + /* do some migration */ 1350 | req = list_entry(head->next, migration_req_t, list); 1351 | list_del_init(head->next); 1352 | 1353 | @@ -3466,9 +3929,11 @@ static int migration_thread(void * data) 1354 | 1355 | complete(&req->done); 1356 | } 1357 | + /* migration thread suicide */ 1358 | __set_current_state(TASK_RUNNING); 1359 | return 0; 1360 | 1361 | + /* migration thread hospice... with no CPU, time is running out for us... */ 1362 | wait_to_die: 1363 | /* Wait for kthread_stop */ 1364 | set_current_state(TASK_INTERRUPTIBLE); 1365 | @@ -3481,7 +3946,7 @@ wait_to_die: 1366 | } 1367 | 1368 | #ifdef CONFIG_HOTPLUG_CPU 1369 | -/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */ 1370 | +/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */ 1371 | static void migrate_all_tasks(int src_cpu) 1372 | { 1373 | struct task_struct *tsk, *t; 1374 | @@ -3729,6 +4194,7 @@ static void __init arch_init_sched_domai 1375 | struct sched_group *node = &sched_group_nodes[i]; 1376 | int j; 1377 | 1378 | + /* get mask of node cpus that are possible */ 1379 | cpus_and(nodemask, tmp, cpu_possible_map); 1380 | 1381 | if (cpus_empty(nodemask)) 1382 | @@ -3744,6 +4210,7 @@ static void __init arch_init_sched_domai 1383 | cpu_set(j, cpu->cpumask); 1384 | cpu->cpu_power = SCHED_LOAD_SCALE; 1385 | 1386 | + /* set up circular linked list */ 1387 | if (!first_cpu) 1388 | first_cpu = cpu; 1389 | if (last_cpu) 1390 | --------------------------------------------------------------------------------
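(Illustrative aside, not part of the patch: the "set up circular linked list" comment above refers to chaining sched_group structures so that the last group points back to the first. Below is a minimal sketch of that first/last pointer pattern, with struct group standing in for struct sched_group; allocation checks and cleanup are omitted for brevity.)

#include <stdio.h>
#include <stdlib.h>

struct group { int id; struct group *next; };

/* build a circular singly linked list using the first/last pattern above */
static struct group *build_ring(int n)
{
        struct group *first = NULL, *last = NULL;

        for (int i = 0; i < n; i++) {
                struct group *g = malloc(sizeof(*g));
                g->id = i;
                g->next = NULL;
                if (!first)
                        first = g;
                if (last)
                        last->next = g;
                last = g;
        }
        if (last)
                last->next = first;      /* close the ring */
        return first;
}

int main(void)
{
        struct group *g = build_ring(3);
        /* walking the ring revisits group 0 after the last group */
        for (int i = 0; i < 4; i++, g = g->next)
                printf("group %d\n", g->id);
        return 0;
}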