From c8e511560ce9bdd38d3e8bbfe35a63a9b4114172 Mon Sep 17 00:00:00 2001
From: Peter Hunt
Date: Mon, 29 Aug 2011 09:44:10 +0000
Subject: [PATCH] BFS prerequisites and tidy up of resched functions

---
 .../debian/patches/bfs/bfs-implement_prereqs.patch | 169 +++++
 .../debian/patches/bfs/bfs-setup_prereqs.patch     | 703 ++++++++++++++++++++
 .../debian/patches/bfs/bfs-tidy_up_resched.patch   |  84 +++
 .../tick_sched-set_inidle_unconditionally.patch    |  30 +
 kernel-bfs-2.6.28/debian/patches/series            |   4 +
 5 files changed, 990 insertions(+)
 create mode 100644 kernel-bfs-2.6.28/debian/patches/bfs/bfs-implement_prereqs.patch
 create mode 100644 kernel-bfs-2.6.28/debian/patches/bfs/bfs-setup_prereqs.patch
 create mode 100644 kernel-bfs-2.6.28/debian/patches/bfs/bfs-tidy_up_resched.patch
 create mode 100644 kernel-bfs-2.6.28/debian/patches/bfs/tick_sched-set_inidle_unconditionally.patch

diff --git a/kernel-bfs-2.6.28/debian/patches/bfs/bfs-implement_prereqs.patch b/kernel-bfs-2.6.28/debian/patches/bfs/bfs-implement_prereqs.patch
new file mode 100644
index 0000000..23f3e9f
--- /dev/null
+++ b/kernel-bfs-2.6.28/debian/patches/bfs/bfs-implement_prereqs.patch
@@ -0,0 +1,169 @@
+--- linux-2.6.28.orig/kernel/sched_bfs.c	2011-06-18 18:46:15.009219526 +0200
++++ linux-2.6.28.new/kernel/sched_bfs.c	2011-06-18 19:06:34.087896512 +0200
+@@ -229,9 +229,6 @@ struct rq {
+ 	struct sched_info rq_sched_info;
+ 
+ 	/* sys_sched_yield() stats */
+-	unsigned int yld_exp_empty;
+-	unsigned int yld_act_empty;
+-	unsigned int yld_both_empty;
+ 	unsigned int yld_count;
+ 
+ 	/* schedule() stats */
+@@ -2585,6 +2582,46 @@ void __wake_up_locked(wait_queue_head_t
+ 	__wake_up_common(q, mode, 1, 0, NULL);
+ }
+ 
++void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
++{
++	__wake_up_common(q, mode, 1, 0, key);
++}
++
++/**
++ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
++ * @q: the waitqueue
++ * @mode: which threads
++ * @nr_exclusive: how many wake-one or wake-many threads to wake up
++ * @key: opaque value to be passed to wakeup targets
++ *
++ * The sync wakeup differs that the waker knows that it will schedule
++ * away soon, so while the target thread will be woken up, it will not
++ * be migrated to another CPU - ie. the two threads are 'synchronised'
++ * with each other. This can prevent needless bouncing between CPUs.
++ *
++ * On UP it can prevent extra preemption.
++ *
++ * It may be assumed that this function implies a write memory barrier before
++ * changing the task state if and only if any tasks are woken up.
++ */
++void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
++			int nr_exclusive, void *key)
++{
++	unsigned long flags;
++	int sync = 1;
++
++	if (unlikely(!q))
++		return;
++
++	if (unlikely(!nr_exclusive))
++		sync = 0;
++
++	spin_lock_irqsave(&q->lock, flags);
++	__wake_up_common(q, mode, nr_exclusive, sync, key);
++	spin_unlock_irqrestore(&q->lock, flags);
++}
++EXPORT_SYMBOL_GPL(__wake_up_sync_key);
++
+ /**
+  * __wake_up_sync - wake up threads blocked on a waitqueue.
+  * @q: the waitqueue
+@@ -2615,6 +2652,18 @@ void __wake_up_sync(wait_queue_head_t *q
+ }
+ EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
+ 
++/**
++ * complete: - signals a single thread waiting on this completion
++ * @x:  holds the state of this particular completion
++ *
++ * This will wake up a single thread waiting on this completion. Threads will be
++ * awakened in the same order in which they were queued.
++ *
++ * See also complete_all(), wait_for_completion() and related routines.
++ *
++ * It may be assumed that this function implies a write memory barrier before
++ * changing the task state if and only if any tasks are woken up.
++ */
+ void complete(struct completion *x)
+ {
+ 	unsigned long flags;
+@@ -2626,6 +2675,15 @@ void complete(struct completion *x)
+ }
+ EXPORT_SYMBOL(complete);
+ 
++/**
++ * complete_all: - signals all threads waiting on this completion
++ * @x:  holds the state of this particular completion
++ *
++ * This will wake up all threads waiting on this particular completion event.
++ *
++ * It may be assumed that this function implies a write memory barrier before
++ * changing the task state if and only if any tasks are woken up.
++ */
+ void complete_all(struct completion *x)
+ {
+ 	unsigned long flags;
+@@ -2677,12 +2735,31 @@ wait_for_common(struct completion *x, lo
+ 	return timeout;
+ }
+ 
++/**
++ * wait_for_completion: - waits for completion of a task
++ * @x:  holds the state of this particular completion
++ *
++ * This waits to be signaled for completion of a specific task. It is NOT
++ * interruptible and there is no timeout.
++ *
++ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
++ * and interrupt capability. Also see complete().
++ */
+ void __sched wait_for_completion(struct completion *x)
+ {
+ 	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+ }
+ EXPORT_SYMBOL(wait_for_completion);
+ 
++/**
++ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
++ * @x:  holds the state of this particular completion
++ * @timeout:  timeout value in jiffies
++ *
++ * This waits for either a completion of a specific task to be signaled or for a
++ * specified timeout to expire. The timeout is in jiffies. It is not
++ * interruptible.
++ */
+ unsigned long __sched
+ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+ {
+@@ -2690,6 +2767,13 @@ wait_for_completion_timeout(struct compl
+ }
+ EXPORT_SYMBOL(wait_for_completion_timeout);
+ 
++/**
++ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
++ * @x:  holds the state of this particular completion
++ *
++ * This waits for completion of a specific task to be signaled. It is
++ * interruptible.
++ */
+ int __sched wait_for_completion_interruptible(struct completion *x)
+ {
+ 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+@@ -2699,6 +2783,14 @@ int __sched wait_for_completion_interrup
+ }
+ EXPORT_SYMBOL(wait_for_completion_interruptible);
+ 
++/**
++ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
++ * @x:  holds the state of this particular completion
++ * @timeout:  timeout value in jiffies
++ *
++ * This waits for either a completion of a specific task to be signaled or for a
++ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
++ */
+ unsigned long __sched
+ wait_for_completion_interruptible_timeout(struct completion *x,
+ 					  unsigned long timeout)
+ {
+@@ -2707,6 +2799,13 @@ wait_for_completion_interruptible_timeou
+ }
+ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+ 
++/**
++ * wait_for_completion_killable: - waits for completion of a task (killable)
++ * @x:  holds the state of this particular completion
++ *
++ * This waits to be signaled for completion of a specific task. It can be
++ * interrupted by a kill signal.
++ */
+ int __sched wait_for_completion_killable(struct completion *x)
+ {
+ 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
diff --git a/kernel-bfs-2.6.28/debian/patches/bfs/bfs-setup_prereqs.patch b/kernel-bfs-2.6.28/debian/patches/bfs/bfs-setup_prereqs.patch
new file mode 100644
index 0000000..a95045e
--- /dev/null
+++ b/kernel-bfs-2.6.28/debian/patches/bfs/bfs-setup_prereqs.patch
@@ -0,0 +1,703 @@
+diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
+index 994b4d3..1e1dc49 100644
+--- a/arch/arm/kernel/process.c
++++ b/arch/arm/kernel/process.c
+@@ -373,7 +373,7 @@ void release_thread(struct task_struct *dead_task)
+ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+ 
+ int
+-copy_thread(int nr, unsigned long clone_flags, unsigned long stack_start,
++copy_thread(unsigned long clone_flags, unsigned long stack_start,
+ 	unsigned long stk_sz, struct task_struct *p, struct pt_regs *regs)
+ {
+ 	struct thread_info *thread = task_thread_info(p);
+diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
+index bc84e12..99bf9b0 100644
+--- a/drivers/char/tty_io.c
++++ b/drivers/char/tty_io.c
+@@ -2682,7 +2682,7 @@ void __do_SAK(struct tty_struct *tty)
+ 	/* Kill the entire session */
+ 	do_each_pid_task(session, PIDTYPE_SID, p) {
+ 		printk(KERN_NOTICE "SAK: killed process %d"
+-			" (%s): task_session_nr(p)==tty->session\n",
++			" (%s): task_session(p)==tty->session\n",
+ 			task_pid_nr(p), p->comm);
+ 		send_sig(SIGKILL, p, 1);
+ 	} while_each_pid_task(session, PIDTYPE_SID, p);
+@@ -2692,7 +2692,7 @@ void __do_SAK(struct tty_struct *tty)
+ 	do_each_thread(g, p) {
+ 		if (p->signal->tty == tty) {
+ 			printk(KERN_NOTICE "SAK: killed process %d"
+-				" (%s): task_session_nr(p)==tty->session\n",
++				" (%s): task_session(p)==tty->session\n",
+ 				task_pid_nr(p), p->comm);
+ 			send_sig(SIGKILL, p, 1);
+ 			continue;
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 011db2f..802d144 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -538,25 +539,8 @@ struct signal_struct {
+ 
+ 	struct list_head cpu_timers[3];
+ 
+-	/* job control IDs */
+-
+-	/*
+-	 * pgrp and session fields are deprecated.
+-	 * use the task_session_Xnr and task_pgrp_Xnr routines below
+-	 */
+-
+-	union {
+-		pid_t pgrp __deprecated;
+-		pid_t __pgrp;
+-	};
+-
+ 	struct pid *tty_old_pgrp;
+ 
+-	union {
+-		pid_t session __deprecated;
+-		pid_t __session;
+-	};
+-
+ 	/* boolean value for session group leader */
+ 	int leader;
+ 
+@@ -1453,16 +1437,6 @@ static inline int rt_task(struct task_struct *p)
+ 	return rt_prio(p->prio);
+ }
+ 
+-static inline void set_task_session(struct task_struct *tsk, pid_t session)
+-{
+-	tsk->signal->__session = session;
+-}
+-
+-static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp)
+-{
+-	tsk->signal->__pgrp = pgrp;
+-}
+-
+ static inline struct pid *task_pid(struct task_struct *task)
+ {
+ 	return task->pids[PIDTYPE_PID].pid;
+@@ -1473,6 +1447,11 @@ static inline struct pid *task_tgid(struct task_struct *task)
+ 	return task->group_leader->pids[PIDTYPE_PID].pid;
+ }
+ 
++/*
++ * Without tasklist or rcu lock it is not safe to dereference
++ * the result of task_pgrp/task_session even if task == current,
++ * we can race with another thread doing sys_setsid/sys_setpgid.
++ */
+ static inline struct pid *task_pgrp(struct task_struct *task)
+ {
+ 	return task->group_leader->pids[PIDTYPE_PGID].pid;
+@@ -1498,17 +1477,23 @@ struct pid_namespace;
+  *
+  * see also pid_nr() etc in include/linux/pid.h
+  */
++pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
++			struct pid_namespace *ns);
+ 
+ static inline pid_t task_pid_nr(struct task_struct *tsk)
+ {
+ 	return tsk->pid;
+ }
+ 
+-pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
++static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
++					struct pid_namespace *ns)
++{
++	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
++}
+ 
+ static inline pid_t task_pid_vnr(struct task_struct *tsk)
+ {
+-	return pid_vnr(task_pid(tsk));
++	return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
+ }
+ 
+ 
+@@ -1525,31 +1510,34 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk)
+ }
+ 
+ 
+-static inline pid_t task_pgrp_nr(struct task_struct *tsk)
++static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
++					struct pid_namespace *ns)
+ {
+-	return tsk->signal->__pgrp;
++	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
+ }
+ 
+-pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
+-
+ static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
+ {
+-	return pid_vnr(task_pgrp(tsk));
++	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
+ }
+ 
+ 
+-static inline pid_t task_session_nr(struct task_struct *tsk)
++static inline pid_t task_session_nr_ns(struct task_struct *tsk,
++					struct pid_namespace *ns)
+ {
+-	return tsk->signal->__session;
++	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
+ }
+ 
+-pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
+-
+ static inline pid_t task_session_vnr(struct task_struct *tsk)
+ {
+-	return pid_vnr(task_session(tsk));
++	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
+ }
+ 
++/* obsolete, do not use */
++static inline pid_t task_pgrp_nr(struct task_struct *tsk)
++{
++	return task_pgrp_nr_ns(tsk, &init_pid_ns);
++}
+ 
+ /**
+  * pid_alive - check that a task structure is not stale
+@@ -1949,7 +1937,8 @@ extern void mm_release(struct task_struct *, struct mm_struct *);
+ /* Allocate a new mm structure and copy contents from tsk->mm */
+ extern struct mm_struct *dup_mm(struct task_struct *tsk);
+ 
+-extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
++extern int copy_thread(unsigned long, unsigned long, unsigned long,
++			struct task_struct *, struct pt_regs *);
+ extern void flush_thread(void);
+ extern void exit_thread(void);
+ 
+diff --git a/include/linux/wait.h b/include/linux/wait.h
+index a210ede..0d2eeb0 100644
+--- a/include/linux/wait.h
++++ b/include/linux/wait.h
+@@ -135,8 +135,11 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
+ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ 			int nr_exclusive, int sync, void *key);
+ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
+-extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode);
+-extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
++void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
++void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
++			void *key);
++void __wake_up_locked(wait_queue_head_t *q, unsigned int mode);
++void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
+ void __wake_up_bit(wait_queue_head_t *, void *, int);
+ int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned);
+ int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned);
+diff --git a/kernel/exit.c b/kernel/exit.c
+index efd30cc..ca734c6f 100644
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -362,16 +362,12 @@ static void reparent_to_kthreadd(void)
+ void __set_special_pids(struct pid *pid)
+ {
+ 	struct task_struct *curr = current->group_leader;
+-	pid_t nr = pid_nr(pid);
+ 
+-	if (task_session(curr) != pid) {
++	if (task_session(curr) != pid)
+ 		change_pid(curr, PIDTYPE_SID, pid);
+-		set_task_session(curr, nr);
+-	}
+-	if (task_pgrp(curr) != pid) {
++
++	if (task_pgrp(curr) != pid)
+ 		change_pid(curr, PIDTYPE_PGID, pid);
+-		set_task_pgrp(curr, nr);
+-	}
+ }
+ 
+ static void set_special_pids(struct pid *pid)
+@@ -815,33 +811,44 @@ static void ptrace_exit_finish(struct task_struct *parent,
+ 	}
+ }
+ 
+-static void reparent_thread(struct task_struct *p, struct task_struct *father)
++/* Returns nonzero if the child should be released. */
++static int reparent_thread(struct task_struct *p, struct task_struct *father)
+ {
++	int dead;
++
+ 	if (p->pdeath_signal)
+ 		/* We already hold the tasklist_lock here. */
+ 		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+ 
+ 	list_move_tail(&p->sibling, &p->real_parent->children);
+ 
++	if (task_detached(p))
++		return 0;
+ 	/* If this is a threaded reparent there is no need to
+ 	 * notify anyone anything has happened.
+ 	 */
+ 	if (same_thread_group(p->real_parent, father))
+-		return;
++		return 0;
+ 
+ 	/* We don't want people slaying init. */
+-	if (!task_detached(p))
+-		p->exit_signal = SIGCHLD;
++	p->exit_signal = SIGCHLD;
+ 
+ 	/* If we'd notified the old parent about this child's death,
+ 	 * also notify the new parent.
+ 	 */
+-	if (!ptrace_reparented(p) &&
+-	    p->exit_state == EXIT_ZOMBIE &&
+-	    !task_detached(p) && thread_group_empty(p))
++	dead = 0;
++	if (!p->ptrace &&
++	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
+ 		do_notify_parent(p, p->exit_signal);
++		if (task_detached(p)) {
++			p->exit_state = EXIT_DEAD;
++			dead = 1;
++		}
++	}
+ 
+ 	kill_orphaned_pgrp(p, father);
++
++	return dead;
+ }
+ 
+ /*
+@@ -901,7 +908,8 @@ static void forget_original_parent(struct task_struct *father)
+ 			BUG_ON(p->ptrace);
+ 			p->parent = p->real_parent;
+ 		}
+-		reparent_thread(p, father);
++		if (reparent_thread(p, father))
++			list_add(&p->ptrace_entry, &ptrace_dead);
+ 	}
+ 
+ 	write_unlock_irq(&tasklist_lock);
+@@ -1420,6 +1428,18 @@ static int wait_task_zombie(struct task_struct *p, int options,
+ 	return retval;
+ }
+ 
++static int *task_stopped_code(struct task_struct *p, bool ptrace)
++{
++	if (ptrace) {
++		if (task_is_stopped_or_traced(p))
++			return &p->exit_code;
++	} else {
++		if (p->signal->flags & SIGNAL_STOP_STOPPED)
++			return &p->signal->group_exit_code;
++	}
++	return NULL;
++}
++
+ /*
+  * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
+  * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+@@ -1430,7 +1450,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
+ 			int options, struct siginfo __user *infop,
+ 			int __user *stat_addr, struct rusage __user *ru)
+ {
+-	int retval, exit_code, why;
++	int retval, exit_code, *p_code, why;
+ 	uid_t uid = 0; /* unneeded, required by compiler */
+ 	pid_t pid;
+ 
+@@ -1440,22 +1460,16 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
+ 	exit_code = 0;
+ 	spin_lock_irq(&p->sighand->siglock);
+ 
+-	if (unlikely(!task_is_stopped_or_traced(p)))
+-		goto unlock_sig;
+-
+-	if (!ptrace && p->signal->group_stop_count > 0)
+-		/*
+-		 * A group stop is in progress and this is the group leader.
+-		 * We won't report until all threads have stopped.
+-		 */
++	p_code = task_stopped_code(p, ptrace);
++	if (unlikely(!p_code))
+ 		goto unlock_sig;
+ 
+-	exit_code = p->exit_code;
++	exit_code = *p_code;
+ 	if (!exit_code)
+ 		goto unlock_sig;
+ 
+ 	if (!unlikely(options & WNOWAIT))
+-		p->exit_code = 0;
++		*p_code = 0;
+ 
+ 	/* don't need the RCU readlock here as we're holding a spinlock */
+ 	uid = __task_cred(p)->uid;
+@@ -1611,7 +1625,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
+ 	 */
+ 	*notask_error = 0;
+ 
+-	if (task_is_stopped_or_traced(p))
++	if (task_stopped_code(p, ptrace))
+ 		return wait_task_stopped(ptrace, p, options,
+ 					infop, stat_addr, ru);
+ 
+@@ -1811,7 +1825,7 @@ asmlinkage long sys_wait4(pid_t upid, in
+ 		pid = find_get_pid(-upid);
+ 	} else if (upid == 0) {
+ 		type = PIDTYPE_PGID;
+-		pid = get_pid(task_pgrp(current));
++		pid = get_task_pid(current, PIDTYPE_PGID);
+ 	} else /* upid > 0 */ {
+ 		type = PIDTYPE_PID;
+ 		pid = find_get_pid(upid);
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 4854c2c..cf9f156 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1120,7 +1120,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
+ 		goto bad_fork_cleanup_mm;
+ 	if ((retval = copy_io(clone_flags, p)))
+ 		goto bad_fork_cleanup_namespaces;
+-	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
++	retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
+ 	if (retval)
+ 		goto bad_fork_cleanup_io;
+ 
+@@ -1258,8 +1258,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
+ 			p->signal->leader_pid = pid;
+ 			tty_kref_put(p->signal->tty);
+ 			p->signal->tty = tty_kref_get(current->signal->tty);
+-			set_task_pgrp(p, task_pgrp_nr(current));
+-			set_task_session(p, task_session_nr(current));
+ 			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
+ 			attach_pid(p, PIDTYPE_SID, task_session(current));
+ 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
+diff --git a/kernel/pid.c b/kernel/pid.c
+index 1b3586f..8582d4e 100644
+--- a/kernel/pid.c
++++ b/kernel/pid.c
+@@ -403,6 +403,8 @@ struct pid *get_task_pid(struct task_str
+ {
+ 	struct pid *pid;
+ 	rcu_read_lock();
++	if (type != PIDTYPE_PID)
++		task = task->group_leader;
+ 	pid = get_pid(task->pids[type].pid);
+ 	rcu_read_unlock();
+ 	return pid;
+@@ -450,11 +452,24 @@ pid_t pid_vnr(struct pid *pid)
+ }
+ EXPORT_SYMBOL_GPL(pid_vnr);
+ 
+-pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
++pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
++			struct pid_namespace *ns)
+ {
+-	return pid_nr_ns(task_pid(tsk), ns);
++	pid_t nr = 0;
++
++	rcu_read_lock();
++	if (!ns)
++		ns = current->nsproxy->pid_ns;
++	if (likely(pid_alive(task))) {
++		if (type != PIDTYPE_PID)
++			task = task->group_leader;
++		nr = pid_nr_ns(task->pids[type].pid, ns);
++	}
++	rcu_read_unlock();
++
++	return nr;
+ }
+-EXPORT_SYMBOL(task_pid_nr_ns);
++EXPORT_SYMBOL(__task_pid_nr_ns);
+ 
+ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+ {
+@@ -462,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+ }
+ EXPORT_SYMBOL(task_tgid_nr_ns);
+ 
+-pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+-{
+-	return pid_nr_ns(task_pgrp(tsk), ns);
+-}
+-EXPORT_SYMBOL(task_pgrp_nr_ns);
+-
+-pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+-{
+-	return pid_nr_ns(task_session(tsk), ns);
+-}
+-EXPORT_SYMBOL(task_session_nr_ns);
+-
+ /*
+  * Used by proc to find the first pid that is greater then or equal to nr.
+  *
+diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
+index 476607f..0df323a 100644
+--- a/kernel/posix-cpu-timers.c
++++ b/kernel/posix-cpu-timers.c
+@@ -1371,7 +1372,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
+ 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
+ 			return 1;
+ 	}
+-	return 0;
++
++	return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY;
+ }
+ 
+ /*
+@@ -1419,19 +1421,19 @@ void run_posix_cpu_timers(struct task_struct *tsk)
+ 	 * timer call will interfere.
+ 	 */
+ 	list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
+-		int firing;
++		int cpu_firing;
++
+ 		spin_lock(&timer->it_lock);
+ 		list_del_init(&timer->it.cpu.entry);
+-		firing = timer->it.cpu.firing;
++		cpu_firing = timer->it.cpu.firing;
+ 		timer->it.cpu.firing = 0;
+ 		/*
+ 		 * The firing flag is -1 if we collided with a reset
+ 		 * of the timer, which already reported this
+ 		 * almost-firing as an overrun. So don't generate an event.
+ 		 */
+-		if (likely(firing >= 0)) {
++		if (likely(cpu_firing >= 0))
+ 			cpu_timer_fire(timer);
+-		}
+ 		spin_unlock(&timer->it_lock);
+ 	}
+ }
+diff --git a/kernel/sched.c b/kernel/sched.c
+index f1e8560..b0cdc3a 100644
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -618,9 +618,6 @@ struct rq {
+ 	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+ 
+ 	/* sys_sched_yield() stats */
+-	unsigned int yld_exp_empty;
+-	unsigned int yld_act_empty;
+-	unsigned int yld_both_empty;
+ 	unsigned int yld_count;
+ 
+ 	/* schedule() stats */
+@@ -2750,7 +2747,40 @@ unsigned long nr_iowait(void)
+ 	return sum;
+ }
+ 
+-unsigned long nr_active(void)
++/* Variables and functions for calc_load */
++static atomic_long_t calc_load_tasks;
++static unsigned long calc_load_update;
++unsigned long avenrun[3];
++EXPORT_SYMBOL(avenrun);
++
++/**
++ * get_avenrun - get the load average array
++ * @loads:	pointer to dest load array
++ * @offset:	offset to add
++ * @shift:	shift count to shift the result left
++ *
++ * These values are estimates at best, so no need for locking.
++ */
++void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
++{
++	loads[0] = (avenrun[0] + offset) << shift;
++	loads[1] = (avenrun[1] + offset) << shift;
++	loads[2] = (avenrun[2] + offset) << shift;
++}
++
++static unsigned long
++calc_load(unsigned long load, unsigned long exp, unsigned long active)
++{
++	load *= exp;
++	load += active * (FIXED_1 - exp);
++	return load >> FSHIFT;
++}
++
++/*
++ * calc_load - update the avenrun load estimates 10 ticks after the
++ * CPUs have updated calc_load_tasks.
++ */
++void calc_global_load(void)
+ {
+ 	unsigned long i, running = 0, uninterruptible = 0;
+ 
+@@ -4781,11 +4811,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+ 	__wake_up_common(q, mode, 1, 0, NULL);
+ }
+ 
++void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
++{
++	__wake_up_common(q, mode, 1, 0, key);
++}
++
+ /**
+- * __wake_up_sync - wake up threads blocked on a waitqueue.
++ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+  * @q: the waitqueue
+  * @mode: which threads
+  * @nr_exclusive: how many wake-one or wake-many threads to wake up
++ * @key: opaque value to be passed to wakeup targets
+  *
+  * The sync wakeup differs that the waker knows that it will schedule
+  * away soon, so while the target thread will be woken up, it will not
+@@ -4794,8 +4830,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+  *
+  * On UP it can prevent extra preemption.
+  */
+-void
+-__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
++void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
++			int nr_exclusive, void *key)
+ {
+ 	unsigned long flags;
+ 	int sync = 1;
+@@ -4807,9 +4843,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+ 		sync = 0;
+ 
+ 	spin_lock_irqsave(&q->lock, flags);
+-	__wake_up_common(q, mode, nr_exclusive, sync, NULL);
++	__wake_up_common(q, mode, nr_exclusive, sync, key);
+ 	spin_unlock_irqrestore(&q->lock, flags);
+ }
++EXPORT_SYMBOL_GPL(__wake_up_sync_key);
++
++/*
++ * __wake_up_sync - see __wake_up_sync_key()
++ */
++void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
++{
++	__wake_up_sync_key(q, mode, nr_exclusive, NULL);
++}
+ EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
+ 
+ /**
+diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
+index 16eeba4e..bdf57bc 100644
+--- a/kernel/sched_debug.c
++++ b/kernel/sched_debug.c
+@@ -287,9 +287,6 @@ static void print_cpu(struct seq_file *m, int cpu)
+ #ifdef CONFIG_SCHEDSTATS
+ #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+ 
+-	P(yld_exp_empty);
+-	P(yld_act_empty);
+-	P(yld_both_empty);
+ 	P(yld_count);
+ 
+ 	P(sched_switch);
+@@ -314,7 +311,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
+ 	u64 now = ktime_to_ns(ktime_get());
+ 	int cpu;
+ 
+-	SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
++	SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
+ 		init_utsname()->release,
+ 		(int)strcspn(init_utsname()->version, " "),
+ 		init_utsname()->version);
+diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
+index a8f93dd..32d2bd4 100644
+--- a/kernel/sched_stats.h
++++ b/kernel/sched_stats.h
+@@ -4,7 +4,7 @@
+  * bump this up when changing the output format or the meaning of an existing
+  * format, so that tools can adapt (or abort)
+  */
+-#define SCHEDSTAT_VERSION 14
++#define SCHEDSTAT_VERSION 15
+ 
+ static int show_schedstat(struct seq_file *seq, void *v)
+ {
+@@ -26,9 +26,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
+ 
+ 		/* runqueue-specific stats */
+ 		seq_printf(seq,
+-		    "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu",
+-		    cpu, rq->yld_both_empty,
+-		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
++		    "cpu%d %u %u %u %u %u %u %llu %llu %lu",
++		    cpu, rq->yld_count,
+ 		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
+ 		    rq->ttwu_count, rq->ttwu_local,
+ 		    rq->rq_cpu_time,
+diff --git a/kernel/sys.c b/kernel/sys.c
+index 37f458e..742cefa 100644
+--- a/kernel/sys.c
++++ b/kernel/sys.c
+@@ -1013,10 +1013,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
+ 	if (err)
+ 		goto out;
+ 
+-	if (task_pgrp(p) != pgrp) {
++	if (task_pgrp(p) != pgrp)
+ 		change_pid(p, PIDTYPE_PGID, pgrp);
+-		set_task_pgrp(p, pid_nr(pgrp));
+-	}
+ 
+ 	err = 0;
+ out:
+diff --git a/kernel/workqueue.c b/kernel/workqueue.c
+index 1f0c509..6e9b6d1 100644
+--- a/kernel/workqueue.c
++++ b/kernel/workqueue.c
+@@ -972,17 +972,19 @@ undo:
+ 
+ #ifdef CONFIG_SMP
+ struct work_for_cpu {
+-	struct work_struct work;
++	struct completion completion;
+ 	long (*fn)(void *);
+ 	void *arg;
+ 	long ret;
+ };
+ 
+-static void do_work_for_cpu(struct work_struct *w)
++static int do_work_for_cpu(void *_wfc)
+ {
+-	struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
++	struct work_for_cpu *wfc = _wfc;
+ 
+ 	wfc->ret = wfc->fn(wfc->arg);
++	complete(&wfc->completion);
++	return 0;
+ }
+ 
+ /**
+@@ -996,20 +998,19 @@ static void do_work_for_cpu(struct work_
+  */
+ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+ {
+-	struct work_for_cpu wfc;
+-
+-	INIT_WORK(&wfc.work, do_work_for_cpu);
+-	wfc.fn = fn;
+-	wfc.arg = arg;
+-	get_online_cpus();
+-	if (unlikely(!cpu_online(cpu)))
+-		wfc.ret = -EINVAL;
+-	else {
+-		schedule_work_on(cpu, &wfc.work);
+-		flush_work(&wfc.work);
+-	}
+-	put_online_cpus();
+-
++	struct task_struct *sub_thread;
++	struct work_for_cpu wfc = {
++		.completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
++		.fn = fn,
++		.arg = arg,
++	};
++
++	sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
++	if (IS_ERR(sub_thread))
++		return PTR_ERR(sub_thread);
++	kthread_bind(sub_thread, cpu);
++	wake_up_process(sub_thread);
++	wait_for_completion(&wfc.completion);
+ 	return wfc.ret;
+ }
+ EXPORT_SYMBOL_GPL(work_on_cpu);
diff --git a/kernel-bfs-2.6.28/debian/patches/bfs/bfs-tidy_up_resched.patch b/kernel-bfs-2.6.28/debian/patches/bfs/bfs-tidy_up_resched.patch
new file mode 100644
index 0000000..d1ff566
--- /dev/null
+++ b/kernel-bfs-2.6.28/debian/patches/bfs/bfs-tidy_up_resched.patch
@@ -0,0 +1,84 @@
+--- linux-2.6.28/kernel/sched_bfs.c	2011-06-17 23:09:25.884488799 +0200
++++ linux-2.6.28.new/kernel/sched_bfs.c	2011-06-17 23:15:51.483825482 +0200
+@@ -2459,7 +2459,7 @@ need_resched_nonpreemptible:
+ 	if (unlikely(reacquire_kernel_lock(current) < 0))
+ 		goto need_resched_nonpreemptible;
+ 	preempt_enable_no_resched();
+-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
++	if (need_resched())
+ 		goto need_resched;
+ }
+ EXPORT_SYMBOL(schedule);
+@@ -2491,7 +2491,7 @@ asmlinkage void __sched preempt_schedule
+ 		 * between schedule and now.
+ 		 */
+ 		barrier();
+-	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
++	} while (need_resched());
+ }
+ EXPORT_SYMBOL(preempt_schedule);
+ 
+@@ -2520,7 +2520,7 @@ asmlinkage void __sched preempt_schedule
+ 		 * between schedule and now.
+ 		 */
+ 		barrier();
+-	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
++	} while (need_resched());
+ }
+ 
+ #endif /* CONFIG_PREEMPT */
+@@ -3489,6 +3489,11 @@ asmlinkage long sys_sched_yield(void)
+ 	return 0;
+ }
+ 
++static inline int should_resched(void)
++{
++	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
++}
++
+ static void __cond_resched(void)
+ {
+ 	/* NOT a real fix but will make voluntary preempt work. A foolish thing. */
+@@ -3511,8 +3516,7 @@ static void __cond_resched(void)
+ 
+ int __sched _cond_resched(void)
+ {
+-	if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
+-			system_state == SYSTEM_RUNNING) {
++	if (should_resched()) {
+ 		__cond_resched();
+ 		return 1;
+ 	}
+@@ -3530,12 +3534,12 @@ EXPORT_SYMBOL(_cond_resched);
+  */
+ int cond_resched_lock(spinlock_t *lock)
+ {
+-	int resched = need_resched() && system_state == SYSTEM_RUNNING;
++	int resched = should_resched();
+ 	int ret = 0;
+ 
+ 	if (spin_needbreak(lock) || resched) {
+ 		spin_unlock(lock);
+-		if (resched && need_resched())
++		if (resched)
+ 			__cond_resched();
+ 		else
+ 			cpu_relax();
+@@ -3550,7 +3554,7 @@ int __sched cond_resched_softirq(void)
+ {
+ 	BUG_ON(!in_softirq());
+ 
+-	if (need_resched() && system_state == SYSTEM_RUNNING) {
++	if (should_resched()) {
+ 		local_bh_enable();
+ 		__cond_resched();
+ 		local_bh_disable();
+@@ -3919,7 +3923,7 @@ void wake_up_idle_cpu(int cpu)
+ 	 * lockless. The worst case is that the other CPU runs the
+ 	 * idle task through an additional NOOP schedule()
+ 	 */
+-	set_tsk_thread_flag(idle, TIF_NEED_RESCHED);
++	set_tsk_need_resched(idle);
+ 
+ 	/* NEED_RESCHED must be visible before we test polling */
+ 	smp_mb();
diff --git a/kernel-bfs-2.6.28/debian/patches/bfs/tick_sched-set_inidle_unconditionally.patch b/kernel-bfs-2.6.28/debian/patches/bfs/tick_sched-set_inidle_unconditionally.patch
new file mode 100644
index 0000000..a20d728
--- /dev/null
+++ b/kernel-bfs-2.6.28/debian/patches/bfs/tick_sched-set_inidle_unconditionally.patch
@@ -0,0 +1,30 @@
+--- linux-2.6.28/kernel/time/tick-sched.c	2011-06-20 00:00:22.673390790 +0200
++++ linux-2.6.28.new/kernel/time/tick-sched.c	2011-06-20 12:12:30.374080397 +0200
+@@ -231,6 +231,13 @@ void tick_nohz_stop_sched_tick(int inidl
+ 	if (!inidle && !ts->inidle)
+ 		goto end;
+ 
++	/*
++	 * Set ts->inidle unconditionally. Even if the system did not
++	 * switch to NOHZ mode the cpu frequency governors rely on the
++	 * update of the idle time accounting in tick_nohz_start_idle().
++	 */
++	ts->inidle = 1;
++
+ 	now = tick_nohz_start_idle(ts);
+ 
+ 	/*
+@@ -248,12 +255,10 @@ void tick_nohz_stop_sched_tick(int inidl
+ 	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
+ 		goto end;
+ 
+-	ts->inidle = 1;
+-
+ 	if (need_resched())
+ 		goto end;
+ 
+-	if (unlikely(local_softirq_pending())) {
++	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
+ 		static int ratelimit;
+ 
+ 		if (ratelimit < 10) {
diff --git a/kernel-bfs-2.6.28/debian/patches/series b/kernel-bfs-2.6.28/debian/patches/series
index 1ae8215..5b3ae99 100644
--- a/kernel-bfs-2.6.28/debian/patches/series
+++ b/kernel-bfs-2.6.28/debian/patches/series
@@ -50,7 +50,11 @@ class10sd_dto14_fix.diff
 #################################
 # BFS Patches
+bfs/tick_sched-set_inidle_unconditionally.patch
+bfs/bfs-setup_prereqs.patch
 bfs/bfs.patch
+bfs/bfs-implement_prereqs.patch
+bfs/bfs-tidy_up_resched.patch
 bfs/bfs-316-to-318.patch
 bfs/bfs-318-to-330.patch
 bfs/sched_reset_on_fork.diff
-- 
1.7.9.5