1 Make it possible to have interactivity and responsiveness at very high load
2 levels by offsetting deadlines according to the fork depth from init. This
3 has a similar effect to 'nice'ing loads that are fork-heavy. 'make' is a
4 perfect example of this and will, with fork_depth_penalty enabled, be felt as
5 much at 'make -j24' as it normally would be with just 'make'.
7 Note that this drastically affects CPU distribution, and also has the
8 indirect side effect of partitioning CPU entitlement between different users
9 as well. No assumption as to CPU distribution should be made based on past
12 This is achieved by distinguishing forks that create new processes from
13 those that create new threads. When a new process is detected, its fork
14 depth is inherited from its parent across fork() and then incremented by
15 one. That fork_depth is then used to cause a relative offset of its deadline.
17 This feature is enabled in this patch by default and can be optionally
20 Threads are kept at the same fork_depth as their parent process, and can
21 optionally have all their CPU entitlement managed together as one process
22 by enabling the group_thread_accounting feature. This feature is disabled
23 by default in this patch, as many desktop applications such as firefox,
24 amarok, etc. are multithreaded. By disabling this feature and enabling the
25 fork_depth_penalty feature (default) it favours CPU towards desktop
28 Extensive testing is required to ensure this does not cause regressions in
31 There are two sysctls to enable/disable these features.
33 They are in /proc/sys/kernel/
35 group_thread_accounting - groups CPU accounting by threads
36 fork_depth_penalty - penalises according to depth of forking from init
41 include/linux/sched.h | 7 +++
42 kernel/sched_bfs.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++----
43 kernel/sysctl.c | 20 +++++++++++
44 3 files changed, 108 insertions(+), 7 deletions(-)
46 Index: linux-2.6.36-rc7-ck1/include/linux/sched.h
47 ===================================================================
48 --- linux-2.6.36-rc7-ck1.orig/include/linux/sched.h 2010-10-08 09:39:38.016240768 +1100
49 +++ linux-2.6.36-rc7-ck1/include/linux/sched.h 2010-10-08 09:39:53.575007838 +1100
50 @@ -1187,10 +1187,15 @@ struct task_struct {
51 unsigned int rt_priority;
52 #ifdef CONFIG_SCHED_BFS
55 + /* Virtual deadline in niffies, and when the deadline was set */
56 + u64 deadline, deadline_niffy;
57 struct list_head run_list;
59 u64 sched_time; /* sched_clock time spent running */
60 + /* Number of threads currently requesting CPU time */
61 + unsigned long threads_running;
62 + /* Depth of forks from init */
65 unsigned long rt_timeout;
66 #else /* CONFIG_SCHED_BFS */
67 Index: linux-2.6.36-rc7-ck1/kernel/sched_bfs.c
68 ===================================================================
69 --- linux-2.6.36-rc7-ck1.orig/kernel/sched_bfs.c 2010-10-08 09:39:37.918242270 +1100
70 +++ linux-2.6.36-rc7-ck1/kernel/sched_bfs.c 2010-10-08 11:16:01.382198622 +1100
71 @@ -139,6 +139,15 @@ int rr_interval __read_mostly = 6;
72 int sched_iso_cpu __read_mostly = 70;
75 + * group_thread_accounting - sysctl to decide whether to treat whole thread
76 + * groups as a single entity for the purposes of CPU distribution.
78 +int group_thread_accounting __read_mostly;
80 +/* fork_depth_penalty - Whether to penalise CPU according to fork depth. */
81 +int fork_depth_penalty __read_mostly = 1;
84 * The relative length of deadline for each priority(nice) level.
86 static int prio_ratios[PRIO_RANGE] __read_mostly;
87 @@ -661,11 +670,29 @@ static int isoprio_suitable(void)
88 return !grq.iso_refractory;
91 +static inline u64 __task_deadline_diff(struct task_struct *p);
92 +static inline u64 task_deadline_diff(struct task_struct *p);
95 * Adding to the global runqueue. Enter with grq locked.
97 static void enqueue_task(struct task_struct *p)
99 + s64 max_tdd = task_deadline_diff(p);
102 + * Make sure that when we're queueing this task again that it
103 + * doesn't have any old deadlines from when the thread group was
104 + * being penalised and cap the deadline to the highest it could
105 + * be, based on the current number of threads running.
107 + if (group_thread_accounting) {
108 + max_tdd += p->group_leader->threads_running *
109 + __task_deadline_diff(p);
111 + if (p->deadline - p->deadline_niffy > max_tdd)
112 + p->deadline = p->deadline_niffy + max_tdd;
115 /* Check it hasn't gotten rt from PI */
116 if ((idleprio_task(p) && idleprio_suitable(p)) ||
117 @@ -967,10 +994,13 @@ static int effective_prio(struct task_st
121 - * activate_task - move a task to the runqueue. Enter with grq locked.
122 + * activate_task - move a task to the runqueue. Enter with grq locked. The
123 + * number of threads running is stored in the group_leader struct.
125 static void activate_task(struct task_struct *p, struct rq *rq)
127 + unsigned long *threads_running = &p->group_leader->threads_running;
132 @@ -987,6 +1017,14 @@ static void activate_task(struct task_st
133 p->prio = effective_prio(p);
134 if (task_contributes_to_load(p))
135 grq.nr_uninterruptible--;
137 + * Adjust deadline according to number of running threads within
138 + * this thread group. This ends up distributing CPU to the thread
139 + * group as a single entity.
141 + ++*threads_running;
142 + if (*threads_running > 1 && group_thread_accounting)
143 + p->deadline += __task_deadline_diff(p);
147 @@ -998,9 +1036,14 @@ static void activate_task(struct task_st
149 static inline void deactivate_task(struct task_struct *p)
151 + unsigned long *threads_running = &p->group_leader->threads_running;
153 if (task_contributes_to_load(p))
154 grq.nr_uninterruptible++;
156 + --*threads_running;
157 + if (*threads_running > 0 && group_thread_accounting)
158 + p->deadline -= __task_deadline_diff(p);
162 @@ -1635,6 +1678,10 @@ void wake_up_new_task(struct task_struct
164 /* Unnecessary but small chance that the parent changed CPU */
165 set_task_cpu(p, task_cpu(parent));
166 + if (!(clone_flags & CLONE_THREAD)) {
168 + p->threads_running = 0;
170 activate_task(p, rq);
171 trace_sched_wakeup_new(p, 1);
172 if (!(clone_flags & CLONE_VM) && rq->curr == parent &&
173 @@ -2524,11 +2571,20 @@ static inline u64 prio_deadline_diff(int
174 return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
177 -static inline u64 task_deadline_diff(struct task_struct *p)
178 +static inline u64 __task_deadline_diff(struct task_struct *p)
180 return prio_deadline_diff(TASK_USER_PRIO(p));
183 +static inline u64 task_deadline_diff(struct task_struct *p)
185 + u64 pdd = __task_deadline_diff(p);
187 + if (fork_depth_penalty && p->fork_depth > 1)
188 + pdd *= p->fork_depth;
192 static inline u64 static_deadline_diff(int static_prio)
194 return prio_deadline_diff(USER_PRIO(static_prio));
195 @@ -2545,8 +2601,24 @@ static inline int ms_longest_deadline_di
197 static void time_slice_expired(struct task_struct *p)
199 + u64 tdd = task_deadline_diff(p);
202 + * We proportionately increase the deadline according to how many
203 + * threads are running. This effectively makes a thread group have
204 + * the same CPU as one task, no matter how many threads are running.
205 + * time_slice_expired can be called when there may be none running
206 + * when p is deactivated so we must explicitly test for more than 1.
208 + if (group_thread_accounting) {
209 + unsigned long *threads_running = &p->group_leader->threads_running;
211 + if (*threads_running > 1)
212 + tdd += *threads_running * __task_deadline_diff(p);
214 p->time_slice = timeslice();
215 - p->deadline = grq.niffies + task_deadline_diff(p);
216 + p->deadline_niffy = grq.niffies;
217 + p->deadline = grq.niffies + tdd;
221 @@ -3513,7 +3585,7 @@ SYSCALL_DEFINE1(nice, int, increment)
223 * This is the priority value as seen by users in /proc.
224 * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
225 - * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
226 + * from 0 (SCHED_ISO) upwards (to nice +19 SCHED_IDLEPRIO).
228 int task_prio(const struct task_struct *p)
230 @@ -3525,8 +3597,12 @@ int task_prio(const struct task_struct *
232 /* Convert to ms to avoid overflows */
233 delta = NS_TO_MS(p->deadline - grq.niffies);
234 - delta = delta * 40 / ms_longest_deadline_diff();
235 - if (delta > 0 && delta <= 80)
236 + if (fork_depth_penalty)
240 + delta /= ms_longest_deadline_diff();
243 if (idleprio_task(p))
245 Index: linux-2.6.36-rc7-ck1/kernel/sysctl.c
246 ===================================================================
247 --- linux-2.6.36-rc7-ck1.orig/kernel/sysctl.c 2010-10-08 09:39:11.603648964 +1100
248 +++ linux-2.6.36-rc7-ck1/kernel/sysctl.c 2010-10-08 09:39:53.579007778 +1100
249 @@ -121,6 +121,8 @@ static int __maybe_unused one_hundred =
250 #ifdef CONFIG_SCHED_BFS
251 extern int rr_interval;
252 extern int sched_iso_cpu;
253 +extern int group_thread_accounting;
254 +extern int fork_depth_penalty;
255 static int __read_mostly one_thousand = 1000;
258 @@ -834,6 +836,24 @@ static struct ctl_table kern_table[] = {
260 .extra2 = &one_hundred,
263 + .procname = "group_thread_accounting",
264 + .data = &group_thread_accounting,
265 + .maxlen = sizeof (int),
267 + .proc_handler = &proc_dointvec_minmax,
272 + .procname = "fork_depth_penalty",
273 + .data = &fork_depth_penalty,
274 + .maxlen = sizeof (int),
276 + .proc_handler = &proc_dointvec_minmax,
281 #if defined(CONFIG_S390) && defined(CONFIG_SMP)