vcs.maemo.org Git - kernel-bfs/blob - kernel-power-2.6.28/debian/patches/bfs-330-to-350.patch

   1 - Major overhaul of queued changes.
   2
   3 - Microoptimise multiplications/divisions to be shifts where suitable.
   4
   5 - Change ISO calculations to have their own lock so as to not grab the grq
   6 lock during a scheduler tick.
   7
   8 - Change all deadline accounting to use nanosecond values.
   9
  10 - Introduce local niffies variable which is updated from any runqueue using the
  11 TSC clock whenever the grq lock is taken. Use niffies to compare deadlines to.
  12 This will give much more granular deadlines when jiffies are low resolution
  13 such as 100Hz, and rarely will tasks have the same deadlines now.
  14
  15 - Drop the "skip_clock_update" concept as we update the niffies each time we
  16 update the rq clocks, thus we want to update it more often.
  17
  18 - Rework try_preempt.
  19
  20 - Bypass rechecking deadline when we know that prev will run again in schedule.
  21
  22 - Check to see if prev can run on an idle CPU when being descheduled as may
  23 happen when a task must use a certain CPU for affinity reasons.
  24
  25 - Decrease maximum rr_interval possible to 1000 (one second) as there is no
  26 demonstrable advantage to higher values, and may overflow with other changes
  27 being introduced.
  28
  29 - Check for when tasks are scheduled within a tick to see if they're in the
  30 1st or 2nd half of the tick. Use this to decide how far into last tick a task
  31 can run. This will make the greatest difference on lower Hz values with small
  32 rr intervals.
  33
  34 - Change the test for exhausted time slice to 100us as rescheduling with less
  35 time available than this will either greatly overrun its quota or reschedule
  36 very quickly.
  37
  38 - Change SCHED_BATCH tasks to refill timeslices and reset deadline every time
  39 they're descheduled as they've been flagged as latency insensitive, likely
  40 fully CPU bound tasks. This should decrease the impact running batch tasks
  41 has on other tasks.
  42
  43 - Microoptimise before context switch in schedule()
  44
  45 - Add a local last_task variable to each runqueue which keeps a copy of the
  46 last non-idle task that ran on this CPU. Use this value to determine that a
  47 task is still cache warm on this CPU even if it has run elsewhere in the
  48 meantime. This improves throughput on relatively idle systems with >2 logical
  49 CPUs.
  50
  51 - Remove the first_time_slice concept as it wasn't contributing on any
  52 meaningful level but was adding to overhead, especially on sched_exit.
  53
  54 - Check that when a task forks and loses timeslice to its child that it isn't
  55 due to be descheduled and ensure that a child inherits the proper variables
  56 from its parent across fork.
  57
  58 - Fix try_preempt which may have been picking a higher priority runqueue on
  59 SMP if it had a later deadline. Remove the test for equal deadline as
  60 nanosecond deadline resolution means this will never happen.
  61
  62 - Random other minor cleanups.
  63
  64 -ck
  65 ---
  66
  67  include/linux/sched.h                 |    4
  68  kernel/sched_bfs.c                    |  466 +++++++++++++++++++++-------------
  69  kernel/sysctl.c                       |    4
  70  4 files changed, 293 insertions(+), 183 deletions(-)
  71
  72 Index: linux-2.6.35-bfs/include/linux/sched.h
  73 ===================================================================
  74 --- linux-2.6.35-bfs.orig/include/linux/sched.h 2010-09-25 08:18:08.792894602 +1000
  75 +++ linux-2.6.35-bfs/include/linux/sched.h      2010-09-25 08:20:25.822886826 +1000
  76 @@ -1118,7 +1118,7 @@ struct task_struct {
  77         int prio, static_prio, normal_prio;
  78         unsigned int rt_priority;
  79  #ifdef CONFIG_SCHED_BFS
  80 -       int time_slice, first_time_slice;
  81 -       unsigned long deadline;
  82 +       int time_slice;
  83 +       u64 deadline;
  84         struct list_head run_list;
  85         u64 last_ran;
  86 @@ -1547,7 +1547,7 @@ static inline void tsk_cpus_current(stru
  87
  88  static inline void print_scheduler_version(void)
  89  {
  90 -       printk(KERN_INFO"BFS CPU scheduler v0.330 by Con Kolivas ported by ToAsTcfh.\n");
  91 +       printk(KERN_INFO"BFS CPU scheduler v0.350 by Con Kolivas ported by ToAsTcfh.\n");
  92  }
  93
  94  static inline int iso_task(struct task_struct *p)
  95 Index: linux-2.6.35-bfs/kernel/sched_bfs.c
  96 ===================================================================
  97 --- linux-2.6.35-bfs.orig/kernel/sched_bfs.c    2010-09-25 08:18:08.804894864 +1000
  98 +++ linux-2.6.35-bfs/kernel/sched_bfs.c 2010-09-25 08:20:25.827886935 +1000
  99 @@ -106,10 +106,19 @@
 100  #define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
 101  #define SCHED_PRIO(p)          ((p)+MAX_RT_PRIO)
 102
 103 -/* Some helpers for converting to/from various scales.*/
 104 +/*
  105 + * Some helpers for converting to/from various scales. Use shifts (by 10
  106 + * for ~1000, by 20 for ~1000000) as cheap approximations of powers of ten.
 107 + */
 108  #define JIFFIES_TO_NS(TIME)    ((TIME) * (1000000000 / HZ))
 109 -#define MS_TO_NS(TIME)         ((TIME) * 1000000)
 110 -#define MS_TO_US(TIME)         ((TIME) * 1000)
 111 +#define HALF_JIFFY_NS          (1000000000 / HZ / 2)
 112 +#define HALF_JIFFY_US          (1000000 / HZ / 2)
 113 +#define MS_TO_NS(TIME)         ((TIME) << 20)
 114 +#define MS_TO_US(TIME)         ((TIME) << 10)
 115 +#define NS_TO_MS(TIME)         ((TIME) >> 20)
 116 +#define NS_TO_US(TIME)         ((TIME) >> 10)
 117 +
 118 +#define RESCHED_US     (100) /* Reschedule if less than this many us left */
 119
 120  /*
 121   * This is the time all tasks within the same priority round robin.
 122 @@ -140,8 +149,9 @@ static inline unsigned long timeslice(vo
 123  }
 124
 125  /*
 126 - * The global runqueue data that all CPUs work off. All data is protected
 127 - * by grq.lock.
 128 + * The global runqueue data that all CPUs work off. Data is protected either
 129 + * by the global grq lock, or the discrete lock that precedes the data in this
 130 + * struct.
 131   */
 132  struct global_rq {
 133         raw_spinlock_t lock;
 134 @@ -150,17 +160,17 @@ struct global_rq {
 135         unsigned long long nr_switches;
 136         struct list_head queue[PRIO_LIMIT];
 137         DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
 138 -       int iso_ticks;
 139 -       int iso_refractory;
 140  #ifdef CONFIG_SMP
 141         unsigned long qnr; /* queued not running */
 142         cpumask_t cpu_idle_map;
 143         int idle_cpus;
 144  #endif
 145 -#if BITS_PER_LONG < 64
 146 -       unsigned long jiffies;
 147 -       u64 jiffies_64;
 148 -#endif
 149 +       /* Nanosecond jiffies */
 150 +       u64 niffies;
 151 +
 152 +       raw_spinlock_t iso_lock;
 153 +       int iso_ticks;
 154 +       int iso_refractory;
 155  };
 156
 157  /* There can be only one */
 158 @@ -176,8 +186,8 @@ struct rq {
 159         u64 nohz_stamp;
 160         unsigned char in_nohz_recently;
 161  #endif
 162 +       struct task_struct *last_task;
 163  #endif
 164 -       unsigned int skip_clock_update;
 165
 166         struct task_struct *curr, *idle;
 167         struct mm_struct *prev_mm;
 168 @@ -213,9 +223,11 @@ struct rq {
 169         /* See if all cache siblings are idle */
 170         cpumask_t cache_siblings;
 171  #endif
 172 +       u64 last_niffy; /* Last time this RQ updated grq.niffies */
 173  #endif
 174 +       u64 clock, old_clock, last_tick;
 175 +       int dither;
 176
 177 -       u64 clock;
 178  #ifdef CONFIG_SCHEDSTATS
 179
 180         /* latency stats */
 181 @@ -290,12 +290,4 @@ struct root_domain {
 182  static struct root_domain def_root_domain;
 183  #endif
 184
 185 -static inline int cpu_of(struct rq *rq)
 186 -{
 187 -#ifdef CONFIG_SMP
 188 -       return rq->cpu;
 189 -#else
 190 -       return 0;
 191 -#endif
 192 -}
 193
 194 @@ -310,17 +313,65 @@ static inline int cpu_of(struct rq *rq)
 195  #define for_each_domain(cpu, __sd) \
 196         for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
 197
 198 +static inline void update_rq_clock(struct rq *rq);
 199 +
 200  #ifdef CONFIG_SMP
 201  #define cpu_rq(cpu)            (&per_cpu(runqueues, (cpu)))
 202  #define this_rq()              (&__get_cpu_var(runqueues))
 203  #define task_rq(p)             cpu_rq(task_cpu(p))
 204  #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 205 +static inline int cpu_of(struct rq *rq)
 206 +{
 207 +       return rq->cpu;
 208 +}
 209 +
 210 +/*
 211 + * Niffies are a globally increasing nanosecond counter. Whenever a runqueue
 212 + * clock is updated with the grq.lock held, it is an opportunity to update the
 213 + * niffies value. Any CPU can update it by adding how much its clock has
 214 + * increased since it last updated niffies, minus any added niffies by other
 215 + * CPUs.
 216 + */
 217 +static inline void update_clocks(struct rq *rq)
 218 +{
 219 +       s64 ndiff;
 220 +
 221 +       update_rq_clock(rq);
 222 +       ndiff = rq->clock - rq->old_clock;
 223 +       /* old_clock is only updated when we are updating niffies */
 224 +       rq->old_clock = rq->clock;
 225 +       ndiff -= grq.niffies - rq->last_niffy;
 226 +       /*
 227 +        * Sanity check should sched_clock return bogus values or be limited to
 228 +        * just jiffy resolution. Some time will always have passed.
 229 +        */
 230 +       if (unlikely(ndiff < 1 || ndiff > MS_TO_NS(rr_interval)))
 231 +               ndiff = 1;
 232 +       grq.niffies += ndiff;
 233 +       rq->last_niffy = grq.niffies;
 234 +}
 235  #else /* CONFIG_SMP */
 236  static struct rq *uprq;
 237  #define cpu_rq(cpu)    (uprq)
 238  #define this_rq()      (uprq)
 239  #define task_rq(p)     (uprq)
 240  #define cpu_curr(cpu)  ((uprq)->curr)
 241 +static inline int cpu_of(struct rq *rq)
 242 +{
 243 +       return 0;
 244 +}
 245 +
 246 +static inline void update_clocks(struct rq *rq)
 247 +{
 248 +       s64 ndiff;
 249 +
 250 +       update_rq_clock(rq);
 251 +       ndiff = rq->clock - rq->old_clock;
 252 +       rq->old_clock = rq->clock;
 253 +       if (unlikely(ndiff < 1 || ndiff > MS_TO_US(rr_interval)))
 254 +               ndiff = 1;
 255 +       grq.niffies += ndiff;
 256 +}
 257  #endif
 258  #define raw_rq()       (&__raw_get_cpu_var(runqueues))
 259
 260 @@ -335,13 +386,13 @@ static struct rq *uprq;
 261
 262  /*
 263   * All common locking functions performed on grq.lock. rq->clock is local to
 264 - * the cpu accessing it so it can be modified just with interrupts disabled,
 265 - * but looking up task_rq must be done under grq.lock to be safe.
 266 + * the CPU accessing it so it can be modified just with interrupts disabled
 267 + * when we're not updating niffies.
 268 + * Looking up task_rq must be done under grq.lock to be safe.
 269   */
 270 -static inline void update_rq_clock(struct rq *rq)
 271 +static inline void update_rq_clock(struct rq *rq)
 272  {
 273 -       if (!rq->skip_clock_update)
 274 -               rq->clock = sched_clock_cpu(cpu_of(rq));
 275 +       rq->clock = sched_clock_cpu(cpu_of(rq));
 276  }
 277
 278  static inline int task_running(struct task_struct *p)
 279 @@ -370,8 +421,8 @@ static inline void grq_lock_irq(void)
 280  static inline void time_lock_grq(struct rq *rq)
 281         __acquires(grq.lock)
 282  {
 283 -       update_rq_clock(rq);
 284         grq_lock();
 285 +       update_clocks(rq);
 286  }
 287
 288  static inline void grq_unlock_irq(void)
 289 @@ -405,7 +456,7 @@ static inline struct rq
 290         __acquires(grq.lock)
 291  {
 292         struct rq *rq = task_grq_lock(p, flags);
 293 -       update_rq_clock(rq);
 294 +       update_clocks(rq);
 295         return rq;
 296  }
 297
 298 @@ -420,7 +471,7 @@ static inline void time_task_grq_lock_ir
 299         __acquires(grq.lock)
 300  {
 301         struct rq *rq = task_grq_lock_irq(p);
 302 -       update_rq_clock(rq);
 303 +       update_clocks(rq);
 304  }
 305
 306  static inline void task_grq_unlock_irq(void)
 307 @@ -515,33 +566,6 @@ static inline void finish_lock_switch(st
 308  }
 309  #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 310
 311 -/*
 312 - * In order to have a monotonic clock that does not wrap we have a 64 bit
 313 - * unsigned long that's protected by grq.lock used in place of jiffies on
 314 - * 32 bit builds.
 315 - */
 316 -#if BITS_PER_LONG < 64
 317 -static inline void update_gjiffies(void)
 318 -{
 319 -       if (grq.jiffies != jiffies) {
 320 -               grq_lock();
 321 -               grq.jiffies = jiffies;
 322 -               grq.jiffies_64++;
 323 -               grq_unlock();
 324 -       }
 325 -}
 326 -
 327 -#define gjiffies (grq.jiffies_64)
 328 -
 329 -#else /* BITS_PER_LONG < 64 */
 330 -static inline void update_gjiffies(void)
 331 -{
 332 -}
 333 -
 334 -#define gjiffies jiffies
 335 -
 336 -#endif /* BITS_PER_LONG < 64 */
 337 -
 338  static inline int deadline_before(u64 deadline, u64 time)
 339  {
 340         return (deadline < time);
 341 @@ -574,17 +598,6 @@ static void dequeue_task(struct task_str
 342  }
 343
 344  /*
 345 - * When a task is freshly forked, the first_time_slice flag is set to say
 346 - * it has taken time_slice from its parent and if it exits on this first
 347 - * time_slice it can return its time_slice back to the parent.
 348 - */
 349 -static inline void reset_first_time_slice(struct task_struct *p)
 350 -{
 351 -       if (unlikely(p->first_time_slice))
 352 -               p->first_time_slice = 0;
 353 -}
 354 -
 355 -/*
 356   * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
 357   * an idle task, we ensure none of the following conditions are met.
 358   */
 359 @@ -646,11 +659,11 @@ static inline int task_prio_ratio(struct
 360  /*
 361   * task_timeslice - all tasks of all priorities get the exact same timeslice
 362   * length. CPU distribution is handled by giving different deadlines to
 363 - * tasks of different priorities.
 364 + * tasks of different priorities. Use 128 as the base value for fast shifts.
 365   */
 366  static inline int task_timeslice(struct task_struct *p)
 367  {
 368 -       return (rr_interval * task_prio_ratio(p) / 100);
 369 +       return (rr_interval * task_prio_ratio(p) / 128);
 370  }
 371
 372  #ifdef CONFIG_SMP
 373 @@ -702,6 +715,15 @@ static int suitable_idle_cpus(struct tas
 374
 375  static void resched_task(struct task_struct *p);
 376
 377 +/*
 378 + * last_task stores the last non-idle task scheduled on the local rq for
 379 + * cache warmth testing.
 380 + */
 381 +static inline void set_last_task(struct rq *rq, struct task_struct *p)
 382 +{
 383 +       rq->last_task = p;
 384 +}
 385 +
 386  #define CPUIDLE_CACHE_BUSY     (1)
 387  #define CPUIDLE_DIFF_CPU       (2)
 388  #define CPUIDLE_THREAD_BUSY    (4)
 389 @@ -724,6 +746,9 @@ static void resched_task(struct task_str
 390   * Other node, other CPU, idle cache, idle threads.
 391   * Other node, other CPU, busy cache, idle threads.
 392   * Other node, other CPU, busy threads.
 393 + *
 394 + * If p was the last task running on this rq, then regardless of where
 395 + * it has been running since then, it is cache warm on this rq.
 396   */
 397  static void resched_best_idle(struct task_struct *p)
 398  {
 399 @@ -756,11 +781,14 @@ static void resched_best_idle(struct tas
 400                 tmp_rq = cpu_rq(cpu_tmp);
 401
 402                 if (rq->cpu_locality[cpu_tmp]) {
  403 +                       /* rq->last_task may be NULL; test before comparing */
 404 +                       if (rq->last_task && p != rq->last_task) {
 405  #ifdef CONFIG_NUMA
 406 -                       if (rq->cpu_locality[cpu_tmp] > 1)
 407 -                               ranking |= CPUIDLE_DIFF_NODE;
 408 +                               if (rq->cpu_locality[cpu_tmp] > 1)
 409 +                                       ranking |= CPUIDLE_DIFF_NODE;
 410  #endif
 411 -                       ranking |= CPUIDLE_DIFF_CPU;
 412 +                               ranking |= CPUIDLE_DIFF_CPU;
 413 +                       }
 414                 }
 415  #ifdef CONFIG_SCHED_MC
 416                 if (!(tmp_rq->cache_idle(cpu_tmp)))
 417 @@ -802,6 +830,11 @@ static inline void resched_suitable_idle
 418  static inline int
 419  cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p)
 420  {
  421 +       /* rq->last_task may be NULL; test before comparing */
 422 +       if (likely(rq->last_task)) {
 423 +               if (rq->last_task == p)
 424 +                       return 0;
 425 +       }
 426         return rq->cpu_locality[cpu_of(task_rq)] * task_timeslice(p);
 427  }
 428  #else /* CONFIG_SMP */
 429 @@ -840,6 +873,10 @@ cache_distance(struct rq *task_rq, struc
 430  {
 431         return 0;
 432  }
 433 +
 434 +static inline void set_last_task(struct rq *rq, struct task_struct *p)
 435 +{
 436 +}
 437  #endif /* CONFIG_SMP */
 438
 439  /*
 440 @@ -887,7 +924,7 @@ static int effective_prio(struct task_st
 441   */
 442  static void activate_task(struct task_struct *p, struct rq *rq)
 443  {
 444 -       update_rq_clock(rq);
 445 +       update_clocks(rq);
 446
 447         /*
 448          * Sleep time is in units of nanosecs, so shift by 20 to get a
 449 @@ -1157,8 +1194,28 @@ EXPORT_SYMBOL_GPL(kick_process);
 450  #endif
 451
 452  #define rq_idle(rq)    ((rq)->rq_prio == PRIO_LIMIT)
 453 -#define task_idle(p)   ((p)->prio == PRIO_LIMIT)
 454
 455 +/*
 456 + * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
 457 + * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or
 458 + * between themselves, they cooperatively multitask. An idle rq scores as
 459 + * prio PRIO_LIMIT so it is always preempted.
 460 + */
 461 +static inline int
 462 +can_preempt(struct task_struct *p, int prio, unsigned long deadline,
 463 +           unsigned int policy)
 464 +{
 465 +       /* Better static priority RT task or better policy preemption */
 466 +       if (p->prio < prio)
 467 +               return 1;
 468 +       if (p->prio > prio)
 469 +               return 0;
 470 +       /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */
 471 +       if (!deadline_before(p->deadline, deadline))
 472 +               return 0;
 473 +       return 1;
 474 +}
 475 +#ifdef CONFIG_SMP
 476  #ifdef CONFIG_HOTPLUG_CPU
 477  /*
 478   * Check to see if there is a task that is affined only to offline CPUs but
 479 @@ -1178,14 +1235,20 @@ static inline int online_cpus(struct tas
 480  #endif
 481
 482  /*
 483 - * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
 484 - * basis of earlier deadlines. SCHED_BATCH, ISO and IDLEPRIO don't preempt
 485 - * between themselves, they cooperatively multitask. An idle rq scores as
 486 - * prio PRIO_LIMIT so it is always preempted. latest_deadline and
 487 - * highest_prio_rq are initialised only to silence the compiler. When
 488 - * all else is equal, still prefer this_rq.
 489 + * Check to see if p can run on cpu, and if not, whether there are any online
 490 + * CPUs it can run on instead.
 491 + */
 492 +static inline int needs_other_cpu(struct task_struct *p, int cpu)
 493 +{
 494 +       if (unlikely(!cpu_isset(cpu, p->cpus_allowed) && online_cpus(p)))
 495 +               return 1;
 496 +       return 0;
 497 +}
 498 +
 499 +/*
 500 + * latest_deadline and highest_prio_rq are initialised only to silence the
 501 + * compiler. When all else is equal, still prefer this_rq.
 502   */
 503 -#ifdef CONFIG_SMP
 504  static void try_preempt(struct task_struct *p, struct rq *this_rq)
 505  {
 506         struct rq *highest_prio_rq = this_rq;
 507 @@ -1193,6 +1256,10 @@ static void try_preempt(struct task_stru
 508         int highest_prio;
 509         cpumask_t tmp;
 510
 511 +       /* IDLEPRIO tasks never preempt anything */
 512 +       if (p->policy == SCHED_IDLEPRIO)
 513 +               return;
 514 +
 515         if (suitable_idle_cpus(p)) {
 516                 resched_best_idle(p);
 517                 return;
 518 @@ -1219,30 +1286,32 @@ static void try_preempt(struct task_stru
 519                 offset_deadline = rq->rq_deadline -
 520                                   cache_distance(this_rq, rq, p);
 521
 522 -               if (rq_prio > highest_prio ||
 523 -                   (deadline_after(offset_deadline, latest_deadline) ||
 524 -                   (offset_deadline == latest_deadline && this_rq == rq))) {
 525 +               if (rq_prio > highest_prio || (rq_prio == highest_prio &&
 526 +                   deadline_after(offset_deadline, latest_deadline))) {
 527                         latest_deadline = offset_deadline;
 528                         highest_prio = rq_prio;
 529                         highest_prio_rq = rq;
 530                 }
 531         }
 532
 533 -       if (p->prio > highest_prio || (p->prio == highest_prio &&
 534 -           p->policy == SCHED_NORMAL &&
 535 -           !deadline_before(p->deadline, latest_deadline)))
 536 +       if (!can_preempt(p, highest_prio, highest_prio_rq->rq_deadline,
 537 +           highest_prio_rq->rq_policy))
 538                 return;
 539
 540 -       /* p gets to preempt highest_prio_rq->curr */
 541         resched_task(highest_prio_rq->curr);
 542 -       highest_prio_rq->skip_clock_update = 1;
 543  }
 544  #else /* CONFIG_SMP */
 545 +static inline int needs_other_cpu(struct task_struct *p, int cpu)
 546 +{
 547 +       return 0;
 548 +}
 549 +
 550  static void try_preempt(struct task_struct *p, struct rq *this_rq)
 551  {
 552 -       if (p->prio < uprq->rq_prio ||
 553 -           (p->prio == uprq->rq_prio && p->policy == SCHED_NORMAL &&
 554 -            deadline_before(p->deadline, uprq->rq_deadline)))
 555 +       if (p->policy == SCHED_IDLEPRIO)
 556 +               return;
 557 +       if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline,
 558 +           uprq->rq_policy))
 559                 resched_task(uprq->curr);
 560  }
 561  #endif /* CONFIG_SMP */
 562 @@ -1352,12 +1421,15 @@ int wake_up_state(struct task_struct *p,
 563         return try_to_wake_up(p, state, 0);
 564  }
 565
 566 +static void time_slice_expired(struct task_struct *p);
 567 +
 568  /*
 569   * Perform scheduler related setup for a newly forked process p.
 570   * p is forked by current.
 571   */
 572  void sched_fork(struct task_struct *p, int clone_flags)
 573  {
 574 +       struct task_struct *curr;
 575         int cpu = get_cpu();
 576         struct rq *rq;
 577
 578 @@ -1396,10 +1468,11 @@ void sched_fork(struct task_struct *p, i
 579                 p->sched_reset_on_fork = 0;
 580         }
 581
 582 +       curr = current;
 583         /*
 584          * Make sure we do not leak PI boosting priority to the child.
 585          */
 586 -       p->prio = current->normal_prio;
 587 +       p->prio = curr->normal_prio;
 588
 589         INIT_LIST_HEAD(&p->run_list);
 590  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 591 @@ -1420,18 +1493,26 @@ void sched_fork(struct task_struct *p, i
 592          * total amount of pending timeslices in the system doesn't change,
 593          * resulting in more scheduling fairness. If it's negative, it won't
 594          * matter since that's the same as being 0. current's time_slice is
 595 -        * actually in rq_time_slice when it's running.
 596 +        * actually in rq_time_slice when it's running, as is its last_ran
 597 +        * value. rq->rq_deadline is only modified within schedule() so it
 598 +        * is always equal to current->deadline.
 599          */
 600 -       rq = task_grq_lock_irq(current);
 601 -       if (likely(rq->rq_time_slice > 0)) {
 602 +       rq = task_grq_lock_irq(curr);
 603 +       if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
 604                 rq->rq_time_slice /= 2;
 605 +               p->time_slice = rq->rq_time_slice;
 606 +       } else {
 607                 /*
 608 -                * The remainder of the first timeslice might be recovered by
 609 -                * the parent if the child exits early enough.
 610 +                * Forking task has run out of timeslice. Reschedule it and
 611 +                * start its child with a new time slice and deadline. The
 612 +                * child will end up running first because its deadline will
 613 +                * be slightly earlier.
 614                  */
 615 -               p->first_time_slice = 1;
 616 +               rq->rq_time_slice = 0;
 617 +               set_tsk_need_resched(curr);
 618 +               time_slice_expired(p);
 619         }
 620 -       p->time_slice = rq->rq_time_slice;
 621 +       p->last_ran = rq->rq_last_ran;
 622         task_grq_unlock_irq();
 623  out:
 624         put_cpu();
 625 @@ -1470,40 +1551,9 @@ void wake_up_new_task(struct task_struct
 626         task_grq_unlock(&flags);
 627  }
 628
 629 -/*
 630 - * Potentially available exiting-child timeslices are
 631 - * retrieved here - this way the parent does not get
 632 - * penalised for creating too many threads.
 633 - *
 634 - * (this cannot be used to 'generate' timeslices
 635 - * artificially, because any timeslice recovered here
 636 - * was given away by the parent in the first place.)
 637 - */
 638 +/* Nothing to do here */
 639  void sched_exit(struct task_struct *p)
 640  {
 641 -       struct task_struct *parent;
 642 -       unsigned long flags;
 643 -       struct rq *rq;
 644 -
 645 -       if (unlikely(p->first_time_slice)) {
 646 -               int *par_tslice, *p_tslice;
 647 -
 648 -               parent = p->parent;
 649 -               par_tslice = &parent->time_slice;
 650 -               p_tslice = &p->time_slice;
 651 -
 652 -               rq = task_grq_lock(parent, &flags);
 653 -               /* The real time_slice of the "curr" task is on the rq var.*/
 654 -               if (p == rq->curr)
 655 -                       p_tslice = &rq->rq_time_slice;
 656 -               else if (parent == task_rq(parent)->curr)
 657 -                       par_tslice = &rq->rq_time_slice;
 658 -
 659 -               *par_tslice += *p_tslice;
 660 -               if (unlikely(*par_tslice > timeslice()))
 661 -                       *par_tslice = timeslice();
 662 -               task_grq_unlock(&flags);
 663 -       }
 664  }
 665
 666  #ifdef CONFIG_PREEMPT_NOTIFIERS
 667 @@ -1981,7 +2031,7 @@ update_cpu_clock(struct rq *rq, struct t
 668                 else if (unlikely(time_diff > JIFFIES_TO_NS(1)))
 669                         time_diff = JIFFIES_TO_NS(1);
 670
 671 -               rq->rq_time_slice -= time_diff / 1000;
 672 +               rq->rq_time_slice -= NS_TO_US(time_diff);
 673         }
 674         rq->rq_last_ran = rq->timekeep_clock = rq->clock;
 675  }
 676 @@ -1997,7 +2047,7 @@ static u64 do_task_delta_exec(struct tas
 677         u64 ns = 0;
 678
 679         if (p == rq->curr) {
 680 -               update_rq_clock(rq);
 681 +               update_clocks(rq);
 682                 ns = rq->clock - rq->rq_last_ran;
 683                 if (unlikely((s64)ns < 0))
 684                         ns = 0;
 685 @@ -2171,10 +2221,22 @@ void account_idle_ticks(unsigned long ti
 686  }
 687  #endif
 688
 689 +static inline void grq_iso_lock(void)
 690 +       __acquires(grq.iso_lock)
 691 +{
 692 +       raw_spin_lock(&grq.iso_lock);
 693 +}
 694 +
 695 +static inline void grq_iso_unlock(void)
 696 +       __releases(grq.iso_lock)
 697 +{
 698 +       raw_spin_unlock(&grq.iso_lock);
 699 +}
 700 +
 701  /*
 702   * Functions to test for when SCHED_ISO tasks have used their allocated
 703   * quota as real time scheduling and convert them back to SCHED_NORMAL.
 704 - * Where possible, the data is tested lockless, to avoid grabbing grq_lock
 705 + * Where possible, the data is tested lockless, to avoid grabbing iso_lock
 706   * because the occasional inaccurate result won't matter. However the
 707   * tick data is only ever modified under lock. iso_refractory is only simply
 708   * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
 709 @@ -2209,21 +2271,21 @@ static unsigned int test_ret_isorefracto
 710
 711  static void iso_tick(void)
 712  {
 713 -       grq_lock();
 714 +       grq_iso_lock();
 715         grq.iso_ticks += 100;
 716 -       grq_unlock();
 717 +       grq_iso_unlock();
 718  }
 719
 720  /* No SCHED_ISO task was running so decrease rq->iso_ticks */
 721  static inline void no_iso_tick(void)
 722  {
 723         if (grq.iso_ticks) {
 724 -               grq_lock();
 725 +               grq_iso_lock();
 726                 grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
 727                 if (unlikely(grq.iso_refractory && grq.iso_ticks <
 728                     ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
 729                         clear_iso_refractory();
 730 -               grq_unlock();
 731 +               grq_iso_unlock();
 732         }
 733  }
 734
 735 @@ -2262,10 +2324,23 @@ static void task_running_tick(struct rq
 736         }
 737
 738         /* SCHED_FIFO tasks never run out of timeslice. */
 739 -       if (rq_idle(rq) || rq->rq_time_slice > 0 || rq->rq_policy == SCHED_FIFO)
 740 +       if (rq->rq_policy == SCHED_FIFO)
 741                 return;
 742 +       /*
 743 +        * Tasks that were scheduled in the first half of a tick are not
 744 +        * allowed to run into the 2nd half of the next tick if they will
 745 +        * run out of time slice in the interim. Otherwise, if they have
 746 +        * less than 100us of time slice left they will be rescheduled.
 747 +        */
 748 +       if (rq->dither) {
 749 +               if (rq->rq_time_slice > HALF_JIFFY_US)
 750 +                       return;
 751 +               else
 752 +                       rq->rq_time_slice = 0;
 753 +       } else if (rq->rq_time_slice >= RESCHED_US)
 754 +                       return;
 755
 756 -       /* p->time_slice <= 0. We only modify task_struct under grq lock */
 757 +       /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
 758         p = rq->curr;
 759         requeue_task(p);
 760         grq_lock();
 761 @@ -2286,13 +2361,14 @@ void scheduler_tick(void)
 762         struct rq *rq = cpu_rq(cpu);
 763
 764         sched_clock_tick();
 765 +       /* grq lock not grabbed, so only update rq clock */
 766         update_rq_clock(rq);
 767         update_cpu_clock(rq, rq->curr, 1);
 768 -       update_gjiffies();
 769         if (!rq_idle(rq))
 770                 task_running_tick(rq);
 771         else
 772                 no_iso_tick();
 773 +       rq->last_tick = rq->clock;
 774         perf_event_task_tick(rq->curr);
 775  }
 776
 777 @@ -2354,7 +2430,7 @@ EXPORT_SYMBOL(sub_preempt_count);
 778  #endif
 779
 780  /*
 781 - * Deadline is "now" in gjiffies + (offset by priority). Setting the deadline
 782 + * Deadline is "now" in niffies + (offset by priority). Setting the deadline
 783   * is the key to everything. It distributes cpu fairly amongst tasks of the
 784   * same nice value, it proportions cpu according to nice level, it means the
 785   * task that last woke up the longest ago has the earliest deadline, thus
 786 @@ -2364,7 +2440,7 @@ EXPORT_SYMBOL(sub_preempt_count);
 787   */
 788  static inline int prio_deadline_diff(int user_prio)
 789  {
 790 -       return (prio_ratios[user_prio] * rr_interval * HZ / (1000 * 100)) ? : 1;
 791 +       return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
 792  }
 793
 794  static inline int task_deadline_diff(struct task_struct *p)
 795 @@ -2377,25 +2453,33 @@ static inline int static_deadline_diff(i
 796         return prio_deadline_diff(USER_PRIO(static_prio));
 797  }
 798
 799 -static inline int longest_deadline_diff(void)
 800 +static inline int ms_longest_deadline_diff(void)
 801  {
 802 -       return prio_deadline_diff(39);
 803 +       return NS_TO_MS(prio_deadline_diff(39));
 804  }
 805
 806  /*
 807   * The time_slice is only refilled when it is empty and that is when we set a
 808   * new deadline.
 809   */
 810 -static inline void time_slice_expired(struct task_struct *p)
 811 +static void time_slice_expired(struct task_struct *p)
 812  {
 813 -       reset_first_time_slice(p);
 814         p->time_slice = timeslice();
 815 -       p->deadline = gjiffies + task_deadline_diff(p);
 816 +       p->deadline = grq.niffies + task_deadline_diff(p);
 817  }
 818
 819 +/*
 820 + * Timeslices below RESCHED_US are considered as good as expired as there's no
 821 + * point rescheduling when there's so little time left. SCHED_BATCH tasks
 822 + * have been flagged as not latency sensitive and likely to be fully CPU
 823 + * bound so every time they're rescheduled they have their time_slice
 824 + * refilled, but get a new later deadline to have little effect on
 825 + * SCHED_NORMAL tasks.
 826 +
 827 + */
 828  static inline void check_deadline(struct task_struct *p)
 829  {
 830 -       if (p->time_slice <= 0)
 831 +       if (p->time_slice < RESCHED_US || batch_task(p))
 832                 time_slice_expired(p);
 833  }
 834
 835 @@ -2433,7 +2517,7 @@ retry:
 836         queue = grq.queue + idx;
 837         list_for_each_entry(p, queue, run_list) {
 838                 /* Make sure cpu affinity is ok */
 839 -               if (online_cpus(p) && !cpu_isset(cpu, p->cpus_allowed))
 840 +               if (needs_other_cpu(p, cpu))
 841                         continue;
 842                 if (idx < MAX_RT_PRIO) {
 843                         /* We found an rt task */
 844 @@ -2560,12 +2644,14 @@ need_resched_nonpreemptible:
 845         deactivate = 0;
 846         schedule_debug(prev);
 847
 848 -       local_irq_disable();
 849 -       update_rq_clock(rq);
 850 +       grq_lock_irq();
 851 +       update_clocks(rq);
 852         update_cpu_clock(rq, prev, 0);
 853 -       rq->skip_clock_update = 0;
 854 +       if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
 855 +               rq->dither = 0;
 856 +       else
 857 +               rq->dither = 1;
 858
 859 -       grq_lock();
 860         clear_tsk_need_resched(prev);
 861
 862         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 863 @@ -2581,36 +2667,54 @@ need_resched_nonpreemptible:
 864                 prev->time_slice = rq->rq_time_slice;
 865                 prev->deadline = rq->rq_deadline;
 866                 check_deadline(prev);
 867 -               return_task(prev, deactivate);
 868 -               /* Task changed affinity off this cpu */
 869 -               if (unlikely(!cpus_intersects(prev->cpus_allowed,
 870 -                   cpumask_of_cpu(cpu)))) {
 871 -                       if (online_cpus(prev))
 872 +               prev->last_ran = rq->clock;
 873 +
 874 +               /* Task changed affinity off this CPU */
 875 +               if (needs_other_cpu(prev, cpu))
 876 +                       resched_suitable_idle(prev);
 877 +               else if (!deactivate) {
 878 +                       if (!queued_notrunning()) {
 879 +                               /*
 880 +                               * We now know prev is the only thing that is
 881 +                               * awaiting CPU so we can bypass rechecking for
 882 +                               * the earliest deadline task and just run it
 883 +                               * again.
 884 +                               */
 885 +                               grq_unlock_irq();
 886 +                               goto rerun_prev_unlocked;
 887 +                       } else {
 888 +                               /*
 889 +                                * If prev got kicked off by a task that has to
 890 +                                * run on this CPU for affinity reasons then
 891 +                                * there may be an idle CPU it can go to.
 892 +                                */
 893                                 resched_suitable_idle(prev);
 894                         }
 895 +               }
 896 +               return_task(prev, deactivate);
 897         }
 898
 899 -       if (likely(queued_notrunning())) {
 900 -               next = earliest_deadline_task(rq, idle);
 901 -       } else {
 902 +       if (unlikely(!queued_notrunning())) {
 903 +               /*
 904 +                * This CPU is now truly idle as opposed to when idle is
 905 +                * scheduled as a high priority task in its own right.
 906 +                */
 907                 next = idle;
 908                 schedstat_inc(rq, sched_goidle);
 909 -       }
 910 -
 911 -       prefetch(next);
 912 -       prefetch_stack(next);
 913 -
 914 -       if (task_idle(next))
 915                 set_cpuidle_map(cpu);
 916 -       else
 917 +       } else {
 918 +               next = earliest_deadline_task(rq, idle);
 919 +               prefetch(next);
 920 +               prefetch_stack(next);
 921                 clear_cpuidle_map(cpu);
 922 -
 923 -       prev->last_ran = rq->clock;
 924 +       }
 925
 926         if (likely(prev != next)) {
 927                 sched_info_switch(prev, next);
 928                 perf_event_task_sched_out(prev, next);
 929
 930 +               if (prev != idle)
 931 +                       set_last_task(rq, prev);
 932                 set_rq_task(rq, next);
 933                 grq.nr_switches++;
 934                 prev->oncpu = 0;
 935 @@ -2629,6 +2733,7 @@ need_resched_nonpreemptible:
 936         } else
 937                 grq_unlock_irq();
 938
 939 +rerun_prev_unlocked:
 940         if (unlikely(reacquire_kernel_lock(current) < 0)) {
 941                 prev = rq->curr;
 942                 switch_count = &prev->nivcsw;
 943 @@ -3324,8 +3429,9 @@ int task_prio(const struct task_struct *
 944         if (prio <= 0)
 945                 goto out;
 946
 947 -       delta = p->deadline - gjiffies;
 948 -       delta = delta * 40 / longest_deadline_diff();
 949 +       /* Convert to ms to avoid overflows */
 950 +       delta = NS_TO_MS(p->deadline - grq.niffies);
 951 +       delta = delta * 40 / ms_longest_deadline_diff();
 952         if (delta > 0 && delta <= 80)
 953                 prio += delta;
 954         if (idleprio_task(p))
 955 @@ -3533,7 +3639,7 @@ recheck:
 956                 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 957                 goto recheck;
 958         }
 959 -       update_rq_clock(rq);
 960 +       update_clocks(rq);
 961         p->sched_reset_on_fork = reset_on_fork;
 962
 963         queued = task_queued(p);
 964 @@ -4835,7 +4941,7 @@ migration_call(struct notifier_block *nf
 965                 __setscheduler(idle, rq, SCHED_NORMAL, 0);
 966                 idle->prio = PRIO_LIMIT;
 967                 set_rq_task(rq, idle);
 968 -               update_rq_clock(rq);
 969 +               update_clocks(rq);
 970                 grq_unlock_irq();
 971                 break;
 972
 973 @@ -6531,12 +6637,14 @@ void __init sched_init(void)
 974         int i;
 975         struct rq *rq;
 976
 977 -       prio_ratios[0] = 100;
 978 +       prio_ratios[0] = 128;
 979         for (i = 1 ; i < PRIO_RANGE ; i++)
 980                 prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
 981
 982         raw_spin_lock_init(&grq.lock);
 983         grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
 984 +       grq.niffies = 0;
 985 +       raw_spin_lock_init(&grq.iso_lock);
 986         grq.iso_ticks = grq.iso_refractory = 0;
 987  #ifdef CONFIG_SMP
 988         init_defrootdomain();
 989 @@ -6549,7 +6657,9 @@ void __init sched_init(void)
 990                 rq = cpu_rq(i);
 991                 rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
 992                               rq->iowait_pc = rq->idle_pc = 0;
 993 +               rq->dither = 0;
 994  #ifdef CONFIG_SMP
 995 +               rq->last_niffy = 0;
 996                 rq->sd = NULL;
 997                 rq->rd = NULL;
 998                 rq->online = 0;
 999 Index: linux-2.6.35-bfs/kernel/sysctl.c
1000 ===================================================================
1001 --- linux-2.6.35-bfs.orig/kernel/sysctl.c       2010-09-25 08:18:30.147361076 +1000
1002 +++ linux-2.6.35-bfs/kernel/sysctl.c    2010-09-25 08:20:25.823886848 +1000
1003 @@ -119,7 +119,7 @@ static int __maybe_unused one_hundred =
1004  #ifdef CONFIG_SCHED_BFS
1005  extern int rr_interval;
1006  extern int sched_iso_cpu;
1007 -static int __read_mostly five_thousand = 5000;
1008 +static int __read_mostly one_thousand = 1000;
1009  #endif
1010  #ifdef CONFIG_PRINTK
1011  static int ten_thousand = 10000;
1012 @@ -794,7 +794,7 @@ static struct ctl_table kern_table[] = {
1013                 .mode           = 0644,
1014                 .proc_handler   = &proc_dointvec_minmax,
1015                 .extra1         = &one,
1016 -               .extra2         = &five_thousand,
1017 +               .extra2         = &one_thousand,
1018         },
1019         {
1020                 .procname       = "iso_cpu",