BFS 401 update to hierarchical tree based penalty patch (that can also group threads...

author tigerite <peter_j_hunt@hotmail.com>

Wed, 25 May 2011 20:50:40 +0000 (22:50 +0200)

committer tigerite <peter_j_hunt@hotmail.com>

Wed, 25 May 2011 20:50:40 +0000 (22:50 +0200)
author tigerite <peter_j_hunt@hotmail.com>
Wed, 25 May 2011 20:50:40 +0000 (22:50 +0200)
committer tigerite <peter_j_hunt@hotmail.com>
Wed, 25 May 2011 20:50:40 +0000 (22:50 +0200)
diff --git a/kernel-bfs-2.6.28/debian/patches/bfs401-penalise_fork_depth_account_threads.patch b/kernel-bfs-2.6.28/debian/patches/bfs401-penalise_fork_depth_account_threads.patch

new file mode 100644 (file)

index 0000000..764ff6c
--- /dev/null
+++ b/kernel-bfs-2.6.28/debian/patches/bfs401-penalise_fork_depth_account_threads.patch
@@ -0,0 +1,284 @@
+Make it possible to have interactivity and responsiveness at very high load
+levels by making deadlines offset by the fork depth from init. This has a
+similar effect to 'nice'ing loads that are fork heavy. 'make' is a perfect
+example of this and will, with fork_depth_penalty enabled, be felt as much
+at 'make -j24' as it normally would be with just 'make'.
+
+Note that this drastically affects CPU distribution, and also has the
+indirect side effect of partitioning CPU entitlement to different users as
+well. No assumption as to CPU distribution should be made based on past
+behaviour.
+
+This is achieved by separating out forks to new processes vs new threads.
+When a new process is detected, its fork depth is inherited from its parent
+across fork() and then is incremented by one. That fork_depth is then used
+to cause a relative offset of its deadline.
+
+The fork_depth_penalty feature is disabled by default in this patch and can
+be optionally enabled.
+
+Threads are kept at the same fork_depth as their parent process, and can
+optionally have their CPU entitlement all managed as one process together
+by enabling the group_thread_accounting feature. This feature is disabled
+by default in this patch, as many desktop applications such as firefox,
+amarok, etc. are multithreaded. Disabling this feature and enabling the
+fork_depth_penalty feature (also disabled by default) favours CPU towards
+desktop applications.
+
+Extensive testing is required to ensure this does not cause regressions in
+common workloads.
+
+There are two sysctls to enable/disable these features.
+
+They are in /proc/sys/kernel/
+
+group_thread_accounting - groups CPU accounting by threads
+fork_depth_penalty - penalises according to depth of forking from init
+
+-ck
+
+---
+ include/linux/sched.h |    7 +++
+ kernel/sched_bfs.c    |   88 ++++++++++++++++++++++++++++++++++++++++++++++----
+ kernel/sysctl.c       |   20 +++++++++++
+ 3 files changed, 108 insertions(+), 7 deletions(-)
+
+Index: linux-2.6.36-rc7-ck1/include/linux/sched.h
+===================================================================
+--- linux-2.6.36-rc7-ck1.orig/include/linux/sched.h    2010-10-08 09:39:38.016240768 +1100
++++ linux-2.6.36-rc7-ck1/include/linux/sched.h 2010-10-08 09:39:53.575007838 +1100
+@@ -1187,10 +1187,15 @@ struct task_struct {
+       unsigned int rt_priority;
+ #ifdef CONFIG_SCHED_BFS
+       int time_slice;
+-      u64 deadline;
++      /* Virtual deadline in niffies, and when the deadline was set */
++      u64 deadline, deadline_niffy;
+       struct list_head run_list;
+       u64 last_ran;
+       u64 sched_time; /* sched_clock time spent running */
++      /* Number of threads currently requesting CPU time */
++      unsigned long threads_running;
++      /* Depth of forks from init */
++      int fork_depth;
+ #ifdef CONFIG_SMP
+       int sticky; /* Soft affined flag */
+ #endif
+Index: linux-2.6.36-rc7-ck1/kernel/sched_bfs.c
+===================================================================
+--- linux-2.6.36-rc7-ck1.orig/kernel/sched_bfs.c       2010-10-08 09:39:37.918242270 +1100
++++ linux-2.6.36-rc7-ck1/kernel/sched_bfs.c    2010-10-08 11:16:01.382198622 +1100
+@@ -139,6 +139,15 @@ int rr_interval __read_mostly = 6;
+ int sched_iso_cpu __read_mostly = 70;
+ 
+ /*
++ * group_thread_accounting - sysctl to decide whether to treat whole thread
++ * groups as a single entity for the purposes of CPU distribution.
++ */
++int group_thread_accounting __read_mostly;
++
++/* fork_depth_penalty - Whether to penalise CPU according to fork depth. */
++int fork_depth_penalty __read_mostly;
++
++/*
+  * The relative length of deadline for each priority(nice) level.
+  */
+ static int prio_ratios[PRIO_RANGE] __read_mostly;
+@@ -661,11 +670,29 @@ static int isoprio_suitable(void)
+       return !grq.iso_refractory;
+ }
+ 
++static inline u64 __task_deadline_diff(struct task_struct *p);
++static inline u64 task_deadline_diff(struct task_struct *p);
++
+ /*
+  * Adding to the global runqueue. Enter with grq locked.
+  */
+ static void enqueue_task(struct task_struct *p)
+ {
++      s64 max_tdd = task_deadline_diff(p);
++
++      /*
++       * When queueing this task again, make sure it doesn't carry an
++       * old deadline from when the thread group was being penalised:
++       * cap the deadline to the highest it could be, based on the
++       * current number of threads running.
++       */
++      if (group_thread_accounting) {
++              max_tdd += p->group_leader->threads_running *
++                         __task_deadline_diff(p);
++      }
++      if (p->deadline - p->deadline_niffy > max_tdd)
++              p->deadline = p->deadline_niffy + max_tdd;
++
+       if (!rt_task(p)) {
+               /* Check it hasn't gotten rt from PI */
+               if ((idleprio_task(p) && idleprio_suitable(p)) ||
+@@ -967,10 +994,13 @@ static int effective_prio(struct task_st
+ }
+ 
+ /*
+- * activate_task - move a task to the runqueue. Enter with grq locked.
++ * activate_task - move a task to the runqueue. Enter with grq locked. The
++ * number of threads running is stored in the group_leader struct.
+  */
+ static void activate_task(struct task_struct *p, struct rq *rq)
+ {
++      unsigned long *threads_running = &p->group_leader->threads_running;
++
+       update_clocks(rq);
+ 
+       /*
+@@ -987,6 +1017,14 @@ static void activate_task(struct task_st
+       p->prio = effective_prio(p);
+       if (task_contributes_to_load(p))
+               grq.nr_uninterruptible--;
++      /*
++       * Adjust deadline according to number of running threads within
++       * this thread group. This ends up distributing CPU to the thread
++       * group as a single entity.
++       */
++      ++*threads_running;
++      if (*threads_running > 1 && group_thread_accounting)
++              p->deadline += __task_deadline_diff(p);
+       enqueue_task(p);
+       grq.nr_running++;
+       inc_qnr();
+@@ -998,9 +1036,14 @@ static void activate_task(struct task_st
+  */
+ static inline void deactivate_task(struct task_struct *p)
+ {
++      unsigned long *threads_running = &p->group_leader->threads_running;
++
+       if (task_contributes_to_load(p))
+               grq.nr_uninterruptible++;
+       grq.nr_running--;
++      --*threads_running;
++      if (*threads_running > 0 && group_thread_accounting)
++              p->deadline -= __task_deadline_diff(p);
+ }
+ 
+ #ifdef CONFIG_SMP
+@@ -1635,6 +1678,10 @@ void wake_up_new_task(struct task_struct
+       parent = p->parent;
+       /* Unnecessary but small chance that the parent changed CPU */
+       set_task_cpu(p, task_cpu(parent));
++      if (!(clone_flags & CLONE_THREAD)) {
++              p->fork_depth++;
++              p->threads_running = 0;
++      }
+       activate_task(p, rq);
+       trace_mark(kernel_sched_wakeup_new,
+               "pid %d state %ld ## rq %p task %p rq->curr %p",
+@@ -2524,11 +2571,20 @@ static inline u64 prio_deadline_diff(int
+       return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+ }
+ 
+-static inline u64 task_deadline_diff(struct task_struct *p)
++static inline u64 __task_deadline_diff(struct task_struct *p)
+ {
+       return prio_deadline_diff(TASK_USER_PRIO(p));
+ }
+ 
++static inline u64 task_deadline_diff(struct task_struct *p)
++{
++      u64 pdd = __task_deadline_diff(p);
++
++      if (fork_depth_penalty && p->fork_depth > 1)
++              pdd *= p->fork_depth;
++      return pdd;
++}
++
+ static inline u64 static_deadline_diff(int static_prio)
+ {
+       return prio_deadline_diff(USER_PRIO(static_prio));
+@@ -2545,8 +2601,24 @@ static inline int ms_longest_deadline_di
+  */
+ static void time_slice_expired(struct task_struct *p)
+ {
++      u64 tdd = task_deadline_diff(p);
++
++      /*
++       * We proportionately increase the deadline according to how many
++       * threads are running. This effectively makes a thread group have
++       * the same CPU as one task, no matter how many threads are running.
++       * time_slice_expired can be called when there may be none running
++       * when p is deactivated so we must explicitly test for more than 1.
++       */
++      if (group_thread_accounting) {
++              unsigned long *threads_running = &p->group_leader->threads_running;
++
++              if (*threads_running > 1)
++                      tdd += *threads_running * __task_deadline_diff(p);
++      }
+       p->time_slice = timeslice();
+-      p->deadline = grq.niffies + task_deadline_diff(p);
++      p->deadline_niffy = grq.niffies;
++      p->deadline = grq.niffies + tdd;
+ }
+ 
+ /*
+@@ -3513,7 +3585,7 @@ SYSCALL_DEFINE1(nice, int, increment)
+  *
+  * This is the priority value as seen by users in /proc.
+  * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
+- * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
++ * from 0 (SCHED_ISO) upwards (to nice +19 SCHED_IDLEPRIO).
+  */
+ int task_prio(const struct task_struct *p)
+ {
+@@ -3525,8 +3597,12 @@ int task_prio(const struct task_struct *
+ 
+       /* Convert to ms to avoid overflows */
+       delta = NS_TO_MS(p->deadline - grq.niffies);
+-      delta = delta * 40 / ms_longest_deadline_diff();
+-      if (delta > 0 && delta <= 80)
++      if (fork_depth_penalty)
++              delta *= 4;
++      else
++              delta *= 40;
++      delta /= ms_longest_deadline_diff();
++      if (delta > 0)
+               prio += delta;
+       if (idleprio_task(p))
+               prio += 40;
+Index: linux-2.6.36-rc7-ck1/kernel/sysctl.c
+===================================================================
+--- linux-2.6.36-rc7-ck1.orig/kernel/sysctl.c  2010-10-08 09:39:11.603648964 +1100
++++ linux-2.6.36-rc7-ck1/kernel/sysctl.c       2010-10-08 09:39:53.579007778 +1100
+@@ -121,6 +121,8 @@ static int __maybe_unused one_hundred = 
+ #ifdef CONFIG_SCHED_BFS
+ extern int rr_interval;
+ extern int sched_iso_cpu;
++extern int group_thread_accounting;
++extern int fork_depth_penalty;
+ static int __read_mostly one_thousand = 1000;
+ #endif
+ /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
+@@ -834,6 +836,24 @@ static struct ctl_table kern_table[] = {
+               .extra1         = &zero,
+               .extra2         = &one_hundred,
+       },
++      {
++              .procname       = "group_thread_accounting",
++              .data           = &group_thread_accounting,
++              .maxlen         = sizeof (int),
++              .mode           = 0644,
++              .proc_handler   = &proc_dointvec_minmax,
++              .extra1         = &zero,
++              .extra2         = &one,
++      },
++      {
++              .procname       = "fork_depth_penalty",
++              .data           = &fork_depth_penalty,
++              .maxlen         = sizeof (int),
++              .mode           = 0644,
++              .proc_handler   = &proc_dointvec_minmax,
++              .extra1         = &zero,
++              .extra2         = &one,
++      },
+ #endif
+ #if defined(CONFIG_S390) && defined(CONFIG_SMP)
+       {
+v
+vv
diff --git a/kernel-bfs-2.6.28/debian/patches/series b/kernel-bfs-2.6.28/debian/patches/series

index 82aeba8..bf4a4ad 100644 (file)
--- a/kernel-bfs-2.6.28/debian/patches/series
+++ b/kernel-bfs-2.6.28/debian/patches/series
@@ -28,11 +28,11 @@ bfs-318-to-330.patch
  sched_reset_on_fork.diff
  bfs-330-to-350.patch
  bfs-350-to-357.patch
-#bfs357-penalise_fork_depth_account_threads.patch
  bfs-357-to-360.patch
  bfs-360-to-363.patch
  bfs-363-to-400.patch
  bfs-400-to-401.patch
+bfs401-penalise_fork_depth_account_threads.patch
  voltage_scaling_1.diff
  voltage_scaling_0.diff
  arm-proc-v7.diff
author	tigerite <peter_j_hunt@hotmail.com>
	Wed, 25 May 2011 20:50:40 +0000 (22:50 +0200)
committer	tigerite <peter_j_hunt@hotmail.com>
	Wed, 25 May 2011 20:50:40 +0000 (22:50 +0200)
kernel-bfs-2.6.28/debian/patches/bfs401-penalise_fork_depth_account_threads.patch	[new file with mode: 0644]	patch \| blob
kernel-bfs-2.6.28/debian/patches/series		patch \| blob \| history