2 Documentation/scheduler/sched-BFS.txt | 9 -
3 include/linux/sched.h | 5
4 kernel/sched_bfs.c | 286 ++++++++++++++++++----------------
5 3 files changed, 155 insertions(+), 145 deletions(-)
7 Index: kernel-2.6.28/include/linux/sched.h
8 ===================================================================
9 --- kernel-2.6.28.orig/include/linux/sched.h
10 +++ kernel-2.6.28/include/linux/sched.h
11 @@ -1152,9 +1152,6 @@ struct task_struct {
14 cpumask_t cpus_allowed;
15 -#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_SCHED_BFS)
16 - cpumask_t unplugged_mask;
19 #ifdef CONFIG_PREEMPT_RCU
20 int rcu_read_lock_nesting;
21 @@ -1422,7 +1419,7 @@ static inline void tsk_cpus_current(stru
23 static inline void print_scheduler_version(void)
25 - printk(KERN_INFO"BFS CPU scheduler v0.318 by Con Kolivas ported by ToAsTcfh.\n");
26 + printk(KERN_INFO"BFS CPU scheduler v0.330 by Con Kolivas ported by ToAsTcfh.\n");
29 static inline int iso_task(struct task_struct *p)
30 Index: kernel-2.6.28/kernel/sched_bfs.c
31 ===================================================================
32 --- kernel-2.6.28.orig/kernel/sched_bfs.c
33 +++ kernel-2.6.28/kernel/sched_bfs.c
34 @@ -172,6 +172,11 @@ struct global_rq {
36 unsigned long qnr; /* queued not running */
37 cpumask_t cpu_idle_map;
40 +#if BITS_PER_LONG < 64
41 + unsigned long jiffies;
46 @@ -188,6 +193,7 @@ struct rq {
47 unsigned char in_nohz_recently;
50 + unsigned int skip_clock_update;
52 struct task_struct *curr, *idle;
53 struct mm_struct *prev_mm;
54 @@ -198,6 +204,7 @@ struct rq {
58 + int rq_running; /* There is a task running */
60 /* Accurate timekeeping data */
62 @@ -331,7 +338,8 @@ static struct rq *uprq;
64 static inline void update_rq_clock(struct rq *rq)
66 - rq->clock = sched_clock_cpu(cpu_of(rq));
67 + if (!rq->skip_clock_update)
68 + rq->clock = sched_clock_cpu(cpu_of(rq));
71 static inline int task_running(struct task_struct *p)
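Illustrative aside (not part of the patch): skip_clock_update appears to let a runqueue that has just been told to reschedule skip one redundant clock update; the flag is set in try_preempt() after resched_task() and cleared again in schedule() (see the later hunks). A minimal userspace sketch of the pattern; struct rq and fake_sched_clock() here are stand-ins, not the kernel API:

#include <stdio.h>

/* Stand-ins for the kernel's per-runqueue clock; illustrative only. */
struct rq {
	unsigned long long clock;
	int skip_clock_update;
};

static unsigned long long fake_sched_clock(void)
{
	static unsigned long long now;
	return now += 1000;		/* pretend time advances 1us per call */
}

/* Mirrors the patched update_rq_clock(): do nothing if an imminent
 * reschedule has already been signalled for this runqueue. */
static void update_rq_clock(struct rq *rq)
{
	if (!rq->skip_clock_update)
		rq->clock = fake_sched_clock();
}

int main(void)
{
	struct rq rq = { 0, 0 };

	update_rq_clock(&rq);		/* normal path: clock advances */
	printf("clock=%llu\n", rq.clock);

	rq.skip_clock_update = 1;	/* set when resched_task() was just sent */
	update_rq_clock(&rq);		/* skipped: clock unchanged */
	printf("clock=%llu\n", rq.clock);

	rq.skip_clock_update = 0;	/* cleared at the top of schedule() */
	update_rq_clock(&rq);
	printf("clock=%llu\n", rq.clock);
	return 0;
}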
72 @@ -506,6 +514,43 @@ static inline void finish_lock_switch(st
73 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
76 + * In order to have a monotonic clock that does not wrap, we keep a 64 bit
77 + * counter protected by grq.lock and use it in place of jiffies on 32 bit builds.
80 +#if BITS_PER_LONG < 64
81 +static inline void update_gjiffies(void)
83 + if (grq.jiffies != jiffies) {
85 + grq.jiffies = jiffies;
91 +#define gjiffies (grq.jiffies_64)
93 +#else /* BITS_PER_LONG < 64 */
94 +static inline void update_gjiffies(void)
98 +#define gjiffies jiffies
100 +#endif /* BITS_PER_LONG < 64 */
102 +static inline int deadline_before(u64 deadline, u64 time)
104 + return (deadline < time);
107 +static inline int deadline_after(u64 deadline, u64 time)
109 + return (deadline > time);
113 * A task that is queued but not running will be on the grq run list.
114 * A task that is not running or queued will not be on the grq run list.
115 * A task that is currently running will have ->oncpu set but not on the
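Illustrative aside (not part of the patch): on 32-bit builds jiffies wraps, so the scheduler keeps a grq.lock-protected 64-bit tick count (gjiffies) and can compare deadlines with plain 64-bit less-than/greater-than via deadline_before()/deadline_after() instead of the wrap-aware time_before()/time_after() macros. A standalone sketch of the idea, assuming the counter is advanced by the number of ticks elapsed (the dropped lines of update_gjiffies() may do this differently):

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins: a 32-bit tick counter that wraps (like jiffies
 * on a 32-bit kernel) and a 64-bit shadow that never wraps (like gjiffies). */
static uint32_t jiffies32;	/* wraps at 2^32 */
static uint64_t gjiffies;	/* monotonic; protected by a lock in the kernel */

/* Advance the 64-bit counter by however far the 32-bit one moved,
 * handling wraparound with unsigned subtraction. */
static void update_gjiffies(void)
{
	static uint32_t last;

	gjiffies += (uint32_t)(jiffies32 - last);
	last = jiffies32;
}

/* With a non-wrapping clock, deadline comparison is a plain compare. */
static int deadline_before(uint64_t deadline, uint64_t time) { return deadline < time; }
static int deadline_after(uint64_t deadline, uint64_t time)  { return deadline > time; }

int main(void)
{
	jiffies32 = 0xfffffff0u;		/* just before the 32-bit wrap */
	update_gjiffies();
	uint64_t dl = gjiffies + 100;		/* deadline 100 ticks in the future */

	jiffies32 += 0x80;			/* 128 ticks pass, crossing the wrap */
	update_gjiffies();

	printf("gjiffies=%llu expired=%d after=%d\n",
	       (unsigned long long)gjiffies,
	       deadline_before(dl, gjiffies), deadline_after(dl, gjiffies));
	return 0;
}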
116 @@ -628,21 +673,28 @@ static inline int queued_notrunning(void
120 - * The cpu_idle_map stores a bitmap of all the cpus currently idle to
121 - * allow easy lookup of whether any suitable idle cpus are available.
122 + * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
123 + * allow easy lookup of whether any suitable idle CPUs are available.
124 + * It's cheaper to maintain a binary yes/no flag (the idle_cpus variable, saying
125 + * whether any CPU is idle) than to do a full bitmask check when we are busy.
127 static inline void set_cpuidle_map(unsigned long cpu)
129 cpu_set(cpu, grq.cpu_idle_map);
133 static inline void clear_cpuidle_map(unsigned long cpu)
135 cpu_clear(cpu, grq.cpu_idle_map);
136 + if (cpus_empty(grq.cpu_idle_map))
140 static int suitable_idle_cpus(struct task_struct *p)
142 + if (!grq.idle_cpus)
144 return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map));
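Illustrative aside (not part of the patch): set_cpuidle_map()/clear_cpuidle_map() keep cpu_idle_map current, and the new grq.idle_cpus flag lets suitable_idle_cpus() bail out with a single load when no CPU is idle instead of intersecting two cpumasks. A small userspace sketch of the same idea, with a plain bitmask standing in for cpumask_t:

#include <stdio.h>

/* Illustrative stand-in for grq state: a bitmask of idle CPUs plus a
 * cheap yes/no summary flag, as in the patch's grq.idle_cpus. */
static unsigned long cpu_idle_map;
static int idle_cpus;

static void set_cpuidle_map(unsigned int cpu)
{
	cpu_idle_map |= 1UL << cpu;
	idle_cpus = 1;				/* at least one CPU is now idle */
}

static void clear_cpuidle_map(unsigned int cpu)
{
	cpu_idle_map &= ~(1UL << cpu);
	if (!cpu_idle_map)			/* mirrors the cpus_empty() check */
		idle_cpus = 0;
}

/* Fast path: one flag test when the system is busy; only fall back to the
 * mask intersection when some CPU really is idle. */
static int suitable_idle_cpus(unsigned long allowed_mask)
{
	if (!idle_cpus)
		return 0;
	return (allowed_mask & cpu_idle_map) != 0;
}

int main(void)
{
	set_cpuidle_map(2);
	printf("%d\n", suitable_idle_cpus(1UL << 2));	/* 1: CPU2 idle and allowed */
	clear_cpuidle_map(2);
	printf("%d\n", suitable_idle_cpus(~0UL));	/* 0: nothing idle, flag short-circuits */
	return 0;
}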
147 @@ -1107,6 +1159,25 @@ void kick_process(struct task_struct *p)
148 #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT)
149 #define task_idle(p) ((p)->prio == PRIO_LIMIT)
151 +#ifdef CONFIG_HOTPLUG_CPU
153 + * Check to see if there is a task that is affined only to offline CPUs but
154 + * still wants runtime. This happens to kernel threads during suspend/halt and
155 + * disabling of CPUs.
157 +static inline int online_cpus(struct task_struct *p)
159 + return (likely(cpus_intersects(cpu_online_map, p->cpus_allowed)));
161 +#else /* CONFIG_HOTPLUG_CPU */
162 +/* All available CPUs are always online without hotplug. */
163 +static inline int online_cpus(struct task_struct *p)
171 * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
172 * basis of earlier deadlines. SCHED_BATCH, ISO and IDLEPRIO don't preempt
173 @@ -1128,7 +1199,11 @@ static void try_preempt(struct task_stru
177 - cpus_and(tmp, cpu_online_map, p->cpus_allowed);
178 + if (online_cpus(p))
179 + cpus_and(tmp, cpu_online_map, p->cpus_allowed);
181 + (cpumask_copy(&tmp, &cpu_online_map));
186 @@ -1146,7 +1221,7 @@ static void try_preempt(struct task_stru
187 cache_distance(this_rq, rq, p);
189 if (rq_prio > highest_prio ||
190 - (time_after(offset_deadline, latest_deadline) ||
191 + (deadline_after(offset_deadline, latest_deadline) ||
192 (offset_deadline == latest_deadline && this_rq == rq))) {
193 latest_deadline = offset_deadline;
194 highest_prio = rq_prio;
195 @@ -1155,21 +1230,21 @@ static void try_preempt(struct task_stru
198 if (p->prio > highest_prio || (p->prio == highest_prio &&
199 - p->policy == SCHED_NORMAL && !time_before(p->deadline, latest_deadline)))
201 + p->policy == SCHED_NORMAL &&
202 + !deadline_before(p->deadline, latest_deadline)))
205 /* p gets to preempt highest_prio_rq->curr */
206 resched_task(highest_prio_rq->curr);
208 + highest_prio_rq->skip_clock_update = 1;
210 #else /* CONFIG_SMP */
211 static void try_preempt(struct task_struct *p, struct rq *this_rq)
213 if (p->prio < uprq->rq_prio ||
214 (p->prio == uprq->rq_prio && p->policy == SCHED_NORMAL &&
215 - time_before(p->deadline, uprq->rq_deadline)))
216 + deadline_before(p->deadline, uprq->rq_deadline)))
217 resched_task(uprq->curr);
220 #endif /* CONFIG_SMP */
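Illustrative aside (not part of the patch): on SMP, try_preempt() scans the allowed online CPUs for the runqueue running the lowest-priority task (among equals, the one with the latest offset deadline) and preempts it only if the waking task has better priority or, at equal priority, an earlier deadline. A simplified standalone sketch of that selection; struct fake_rq and pick_preempt_target() are invented names, and the cache_distance() deadline offsets are omitted:

#include <stdint.h>
#include <stdio.h>

struct fake_rq {
	int prio;		/* priority of the currently running task (lower is better) */
	uint64_t deadline;	/* its virtual deadline, gjiffies based */
};

static int deadline_after(uint64_t a, uint64_t b) { return a > b; }

/* Returns the index of the runqueue to preempt, or -1 if the newcomer
 * should not preempt anything. */
static int pick_preempt_target(const struct fake_rq *rqs, int n,
			       int new_prio, uint64_t new_deadline)
{
	int highest_prio = -1;
	uint64_t latest_deadline = 0;
	int victim = -1;

	/* Find the runqueue running the lowest-priority task; among equals,
	 * the one whose deadline is furthest in the future. */
	for (int i = 0; i < n; i++) {
		if (rqs[i].prio > highest_prio ||
		    (rqs[i].prio == highest_prio &&
		     deadline_after(rqs[i].deadline, latest_deadline))) {
			highest_prio = rqs[i].prio;
			latest_deadline = rqs[i].deadline;
			victim = i;
		}
	}

	/* Mirror the final test: the newcomer only preempts if it has a
	 * numerically lower (better) priority, or equal priority and an
	 * earlier deadline than the chosen victim. */
	if (new_prio > highest_prio ||
	    (new_prio == highest_prio && !(new_deadline < latest_deadline)))
		return -1;
	return victim;
}

int main(void)
{
	struct fake_rq rqs[] = {
		{ 120, 1000 },	/* CPU0: nice-0 task, nearer deadline */
		{ 120, 4000 },	/* CPU1: nice-0 task, later deadline -> better victim */
		{ 100,  500 },	/* CPU2: higher-priority task, leave alone */
	};
	printf("preempt CPU%d\n", pick_preempt_target(rqs, 3, 120, 800));	/* expect CPU1 */
	return 0;
}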
222 @@ -1335,9 +1410,9 @@ void wake_up_new_task(struct task_struct
225 rq = task_grq_lock(p, &flags);
226 + p->state = TASK_RUNNING;
228 - BUG_ON(p->state != TASK_RUNNING);
229 - /* Unnecessary but small chance that the parent changed cpus */
230 + /* Unnecessary but small chance that the parent changed CPU */
231 set_task_cpu(p, task_cpu(parent));
232 activate_task(p, rq);
233 trace_mark(kernel_sched_wakeup_new,
234 @@ -2015,15 +2090,16 @@ static void clear_iso_refractory(void)
236 * Test if SCHED_ISO tasks have run longer than their alloted period as RT
237 * tasks and set the refractory flag if necessary. There is 10% hysteresis
238 - * for unsetting the flag.
239 + * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a divide.
242 static unsigned int test_ret_isorefractory(struct rq *rq)
244 if (likely(!grq.iso_refractory)) {
245 - if (grq.iso_ticks / ISO_PERIOD > sched_iso_cpu)
246 + if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu)
247 set_iso_refractory();
249 - if (grq.iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100))
250 + if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))
251 clear_iso_refractory();
253 return grq.iso_refractory;
254 @@ -2042,8 +2118,8 @@ static inline void no_iso_tick(void)
257 grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
258 - if (unlikely(grq.iso_refractory && grq.iso_ticks /
259 - ISO_PERIOD < (sched_iso_cpu * 90 / 100)))
260 + if (unlikely(grq.iso_refractory && grq.iso_ticks <
261 + ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
262 clear_iso_refractory();
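Illustrative aside (not part of the patch): the refractory test is rearranged so grq.iso_ticks is never divided on the fast path; both sides are multiplied by ISO_PERIOD, and the 90% hysteresis threshold becomes *115/128 (~89.8%), which compiles to a multiply and shift rather than a divide by 100. A quick standalone comparison of the two thresholds (the percentages below are made up for illustration):

#include <stdio.h>

int main(void)
{
	/* Compare the old 90% threshold with the shift-friendly 115/128 one
	 * for a few sysctl iso_cpu percentages. */
	for (int iso_cpu = 10; iso_cpu <= 100; iso_cpu += 30) {
		int old_thresh = iso_cpu * 90 / 100;	/* needs a divide by 100 */
		int new_thresh = iso_cpu * 115 / 128;	/* divide by power of two -> shift */

		printf("iso_cpu=%3d%%  90%%=%3d  115/128=%3d\n",
		       iso_cpu, old_thresh, new_thresh);
	}

	/* The comparison itself is also rearranged so iso_ticks is never divided:
	 *   old: iso_ticks / ISO_PERIOD < threshold
	 *   new: iso_ticks < ISO_PERIOD * threshold
	 * Both express the same bound when iso_ticks is a multiple of ISO_PERIOD;
	 * otherwise the new form clears the refractory flag slightly less eagerly. */
	return 0;
}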
265 @@ -2110,6 +2186,7 @@ void scheduler_tick(void)
268 update_cpu_clock(rq, rq->curr, 1);
271 task_running_tick(rq);
273 @@ -2175,7 +2252,7 @@ EXPORT_SYMBOL(sub_preempt_count);
277 - * Deadline is "now" in jiffies + (offset by priority). Setting the deadline
278 + * Deadline is "now" in gjiffies + (offset by priority). Setting the deadline
279 * is the key to everything. It distributes cpu fairly amongst tasks of the
280 * same nice value, it proportions cpu according to nice level, it means the
281 * task that last woke up the longest ago has the earliest deadline, thus
282 @@ -2211,7 +2288,7 @@ static inline void time_slice_expired(st
284 reset_first_time_slice(p);
285 p->time_slice = timeslice();
286 - p->deadline = jiffies + task_deadline_diff(p);
287 + p->deadline = gjiffies + task_deadline_diff(p);
290 static inline void check_deadline(struct task_struct *p)
291 @@ -2237,22 +2314,16 @@ static inline void check_deadline(struct
293 * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
294 * selected by the earliest deadline.
295 - * Once deadlines are expired (jiffies has passed it) tasks are chosen in FIFO
296 - * order. Note that very few tasks will be FIFO for very long because they
297 - * only end up that way if they sleep for long or if if there are enough fully
298 - * cpu bound tasks to push the load to ~8 higher than the number of CPUs for
302 task_struct *earliest_deadline_task(struct rq *rq, struct task_struct *idle)
304 unsigned long dl, earliest_deadline = 0; /* Initialise to silence compiler */
305 - struct task_struct *p, *edt;
306 + struct task_struct *p, *edt = idle;
307 unsigned int cpu = cpu_of(rq);
308 struct list_head *queue;
313 idx = find_next_bit(grq.prio_bitmap, PRIO_LIMIT, idx);
314 if (idx >= PRIO_LIMIT)
315 @@ -2260,7 +2331,7 @@ retry:
316 queue = grq.queue + idx;
317 list_for_each_entry(p, queue, run_list) {
318 /* Make sure cpu affinity is ok */
319 - if (!cpu_isset(cpu, p->cpus_allowed))
320 + if (online_cpus(p) && !cpu_isset(cpu, p->cpus_allowed))
322 if (idx < MAX_RT_PRIO) {
323 /* We found an rt task */
324 @@ -2271,21 +2342,12 @@ retry:
325 dl = p->deadline + cache_distance(task_rq(p), rq, p);
328 - * Look for tasks with old deadlines and pick them in FIFO
329 - * order, taking the first one found.
331 - if (time_is_before_jiffies(dl)) {
337 * No rt tasks. Find the earliest deadline task. Now we're in
338 * O(n) territory. This is what we silenced the compiler for:
339 * edt will always start as idle.
342 - time_before(dl, earliest_deadline)) {
343 + deadline_before(dl, earliest_deadline)) {
344 earliest_deadline = dl;
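Illustrative aside (not part of the patch): with edt initialised to idle, earliest_deadline_task() can simply return whatever the scan found; the old FIFO handling of already-expired deadlines goes away, and tasks whose allowed CPUs are all offline become eligible anywhere via the online_cpus() check. A toy model of just the deadline scan (no RT priority bitmap, no cache_distance offsets); struct and function names here are invented:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_task {
	const char *name;
	unsigned long allowed_cpus;	/* bitmask of CPUs it may run on */
	uint64_t deadline;
};

static int deadline_before(uint64_t a, uint64_t b) { return a < b; }

static const struct toy_task *
pick_earliest_deadline(const struct toy_task *tasks, size_t n,
		       unsigned int cpu, const struct toy_task *idle)
{
	const struct toy_task *edt = idle;	/* start from idle, as in the patch */
	uint64_t earliest = 0;
	int found = 0;

	for (size_t i = 0; i < n; i++) {
		if (!(tasks[i].allowed_cpus & (1UL << cpu)))
			continue;		/* affinity forbids this CPU */
		if (!found || deadline_before(tasks[i].deadline, earliest)) {
			earliest = tasks[i].deadline;
			edt = &tasks[i];
			found = 1;
		}
	}
	return edt;				/* idle if nothing was runnable here */
}

int main(void)
{
	const struct toy_task idle = { "idle", ~0UL, ~0ULL };
	const struct toy_task tasks[] = {
		{ "A", 0x1, 5000 },	/* allowed on CPU0 only */
		{ "B", 0x3, 3000 },	/* earliest on CPU0, allowed on CPU0/1 */
		{ "C", 0x2, 1000 },	/* earliest overall but CPU1 only */
	};

	printf("CPU0 picks %s\n", pick_earliest_deadline(tasks, 3, 0, &idle)->name);	/* B */
	printf("CPU1 picks %s\n", pick_earliest_deadline(tasks, 3, 1, &idle)->name);	/* C */
	return 0;
}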
347 @@ -2358,6 +2420,10 @@ static inline void set_rq_task(struct rq
348 rq->rq_last_ran = p->last_ran;
349 rq->rq_policy = p->policy;
350 rq->rq_prio = p->prio;
352 + rq->rq_running = 1;
354 + rq->rq_running = 0;
357 static void reset_rq_task(struct rq *rq, struct task_struct *p)
358 @@ -2395,6 +2461,7 @@ need_resched_nonpreemptible:
361 update_cpu_clock(rq, prev, 0);
362 + rq->skip_clock_update = 0;
365 clear_tsk_need_resched(prev);
366 @@ -2415,8 +2482,10 @@ need_resched_nonpreemptible:
367 return_task(prev, deactivate);
368 /* Task changed affinity off this cpu */
369 if (unlikely(!cpus_intersects(prev->cpus_allowed,
370 - cpumask_of_cpu(cpu))))
371 - resched_suitable_idle(prev);
372 + cpumask_of_cpu(cpu)))) {
373 + if (online_cpus(prev))
374 + resched_suitable_idle(prev);
378 if (likely(queued_notrunning())) {
379 @@ -2461,7 +2530,7 @@ need_resched_nonpreemptible:
380 goto need_resched_nonpreemptible;
381 preempt_enable_no_resched();
382 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
386 EXPORT_SYMBOL(schedule);
388 @@ -2829,7 +2898,7 @@ void rt_mutex_setprio(struct task_struct
390 BUG_ON(prio < 0 || prio > MAX_PRIO);
392 - rq = time_task_grq_lock(p, &flags);
393 + rq = task_grq_lock(p, &flags);
396 queued = task_queued(p);
397 @@ -2976,7 +3045,8 @@ int task_prio(const struct task_struct *
401 - delta = (p->deadline - jiffies) * 40 / longest_deadline_diff();
402 + delta = p->deadline - gjiffies;
403 + delta = delta * 40 / longest_deadline_diff();
404 if (delta > 0 && delta <= 80)
406 if (idleprio_task(p))
407 @@ -3126,7 +3196,7 @@ recheck:
408 if (policy == SCHED_NORMAL)
410 if (policy != SCHED_IDLEPRIO)
415 if (policy == SCHED_IDLEPRIO)
416 @@ -3794,9 +3864,6 @@ void init_idle(struct task_struct *idle,
417 rq->curr = rq->idle = idle;
419 set_cpuidle_map(cpu);
420 -#ifdef CONFIG_HOTPLUG_CPU
421 - idle->unplugged_mask = CPU_MASK_NONE;
423 grq_unlock_irqrestore(&flags);
425 /* Set the preempt count _outside_ the spinlocks! */
426 @@ -3975,8 +4042,11 @@ int set_cpus_allowed_ptr(struct task_str
428 if (task_running(p)) {
429 /* Task is running on the wrong cpu now, reschedule it. */
430 - set_tsk_need_resched(p);
432 + if (rq == this_rq()) {
433 + set_tsk_need_resched(p);
438 set_task_cpu(p, any_online_cpu(*new_mask));
440 @@ -3993,7 +4063,24 @@ out:
441 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
443 #ifdef CONFIG_HOTPLUG_CPU
444 -/* Schedules idle task to be the next runnable task on current CPU.
446 + * Reschedule a task if it's on a dead CPU.
448 +void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
450 + unsigned long flags;
451 + struct rq *rq, *dead_rq;
453 + dead_rq = cpu_rq(dead_cpu);
454 + rq = task_grq_lock(p, &flags);
455 + if (rq == dead_rq && task_running(p))
457 + task_grq_unlock(&flags);
462 + * Schedules idle task to be the next runnable task on current CPU.
463 * It does so by boosting its priority to highest possible.
464 * Used by CPU offline code.
466 @@ -4011,7 +4098,7 @@ void sched_idle_next(void)
467 * Strictly not necessary since rest of the CPUs are stopped by now
468 * and interrupts disabled on the current cpu.
470 - time_grq_lock(rq, &flags);
471 + grq_lock_irqsave(&flags);
473 __setscheduler(idle, rq, SCHED_FIFO, MAX_RT_PRIO - 1);
475 @@ -4295,7 +4382,7 @@ migration_call(struct notifier_block *nf
476 struct task_struct *idle;
477 int cpu = (long)hcpu;
480 + struct rq *rq = cpu_rq(cpu);
484 @@ -4306,14 +4393,12 @@ migration_call(struct notifier_block *nf
486 case CPU_ONLINE_FROZEN:
487 /* Update our root-domain */
489 grq_lock_irqsave(&flags);
491 BUG_ON(!cpu_isset(cpu, rq->rd->span));
496 grq_unlock_irqrestore(&flags);
499 @@ -4325,11 +4410,9 @@ migration_call(struct notifier_block *nf
501 case CPU_DEAD_FROZEN:
502 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
505 /* Idle task back to normal (off runqueue, low prio) */
508 return_task(idle, 1);
509 idle->static_prio = MAX_PRIO;
510 __setscheduler(idle, rq, SCHED_NORMAL, 0);
511 @@ -4342,7 +4425,7 @@ migration_call(struct notifier_block *nf
514 case CPU_DYING_FROZEN:
516 + /* Update our root-domain */
517 grq_lock_irqsave(&flags);
519 BUG_ON(!cpu_isset(cpu, rq->rd->span));
520 @@ -5752,7 +5835,7 @@ static int cache_cpu_idle(unsigned long
521 void __init sched_init_smp(void)
523 struct sched_domain *sd;
524 - int cpu, i, cpu_scale;
527 cpumask_t non_isolated_cpus;
529 @@ -5784,16 +5867,11 @@ void __init sched_init_smp(void)
532 * Assume that every added cpu gives us slightly less overall latency
533 - * allowing us to increase the base rr_interval, but in a non linear
535 + * allowing us to increase the base rr_interval, non-linearly and with
538 - cpu_scale = ilog2(num_online_cpus());
539 - rr_interval *= 100;
540 - for (i = 0; i < cpu_scale; i++) {
544 - rr_interval /= 100;
545 + cpus = num_online_cpus();
546 + rr_interval = rr_interval * (4 * cpus + 4) / (cpus + 6);
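Illustrative aside (not part of the patch): the ilog2 loop is replaced by a closed-form scaling, rr_interval * (4*cpus + 4) / (cpus + 6), which grows from ~1.14x at one CPU towards an asymptote of 4x as the CPU count rises. A tiny table computed the same way, assuming a base rr_interval of 6 ms (the port's default may differ):

#include <stdio.h>

int main(void)
{
	int base_rr = 6;	/* assumed base rr_interval in ms; the port may differ */

	for (int cpus = 1; cpus <= 64; cpus *= 2) {
		int scaled = base_rr * (4 * cpus + 4) / (cpus + 6);

		printf("%2d CPUs -> rr_interval %2d ms\n", cpus, scaled);
	}
	return 0;
}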
550 @@ -5874,8 +5952,12 @@ void __init sched_init(void)
551 prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
553 spin_lock_init(&grq.lock);
554 + grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
555 + grq.iso_ticks = grq.iso_refractory = 0;
557 init_defrootdomain();
558 + grq.qnr = grq.idle_cpus = 0;
559 + cpumask_clear(&grq.cpu_idle_map);
561 uprq = &per_cpu(runqueues, 0);
563 @@ -5994,7 +6076,6 @@ void normalize_rt_tasks(void)
565 spin_lock_irqsave(&p->pi_lock, flags);
566 rq = __task_grq_lock(p);
567 - update_rq_clock(rq);
569 queued = task_queued(p);
571 @@ -6103,6 +6184,10 @@ cputime_t task_stime(struct task_struct
575 +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
579 inline cputime_t task_gtime(struct task_struct *p)