From d73dbd41b47af50f7652082b5b8a4594836d6ed9 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Tue, 12 Apr 2011 14:32:21 +0000 Subject: [PATCH] this time with the patches actually included (and properly identified!) --- ...fs357-penalise_fork_depth_account_threads.patch | 282 ++++++++++++++++++++ .../debian/patches/cpufreq_earlyload.diff | 25 ++ .../patch_swap_notify_core_support_2.6.28.diff | 220 +++++++++++++++ .../debian/patches/wl12xx-rx-fix.diff | 15 ++ 4 files changed, 542 insertions(+) create mode 100644 kernel-bfs-2.6.28/debian/patches/bfs357-penalise_fork_depth_account_threads.patch create mode 100644 kernel-bfs-2.6.28/debian/patches/cpufreq_earlyload.diff create mode 100644 kernel-bfs-2.6.28/debian/patches/patch_swap_notify_core_support_2.6.28.diff create mode 100644 kernel-bfs-2.6.28/debian/patches/wl12xx-rx-fix.diff diff --git a/kernel-bfs-2.6.28/debian/patches/bfs357-penalise_fork_depth_account_threads.patch b/kernel-bfs-2.6.28/debian/patches/bfs357-penalise_fork_depth_account_threads.patch new file mode 100644 index 0000000..077ee5e --- /dev/null +++ b/kernel-bfs-2.6.28/debian/patches/bfs357-penalise_fork_depth_account_threads.patch @@ -0,0 +1,282 @@ +Make it possible to have interactivity and responsiveness at very high load +levels by making deadlines offset by the fork depth from init. This has a +similar effect to 'nice'ing loads that are fork heavy. 'make' is a perfect +example of this and will, with fork_depth_penalty enabled, be felt as much +at 'make -j24' as it normally would be with just 'make'. + +Note that this drastically affects CPU distribution, and also has the +indirect side effect of partitioning CPU entitlement to different users as +well. No assumption as to CPU distribution should be made based on past +behaviour. + +This is achieved by separating out forks to new processes vs new threads. +When a new process is detected, its fork depth is inherited from its parent +across fork() and then is incremented by one. That fork_depth is then used +to cause a relative offset of its deadline. + +This feature is enabled in this patch by default and can be optionally +disabled. + +Threads are kept at the same fork_depth as their parent process, and can +optionally have their CPU entitlement all managed as one process together +by enabling the group_thread_accounting feature. This feature is disabled +by default in this patch, as many desktop applications such as firefox, +amarok, etc are multithreaded. By disabling this feature and enabling the +fork_depth_penalty feature (default) it favours CPU towards desktop +applications. + +Extensive testing is required to ensure this does not cause regressions in +common workloads. + +There are two sysctls to enable/disable these features. 
+ +They are in /proc/sys/kernel/ + +group_thread_accounting - groups CPU accounting by threads +fork_depth_penalty - penalises according to depth of forking from init + +-ck + +--- + include/linux/sched.h | 7 +++ + kernel/sched_bfs.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++---- + kernel/sysctl.c | 20 +++++++++++ + 3 files changed, 108 insertions(+), 7 deletions(-) + +Index: linux-2.6.36-rc7-ck1/include/linux/sched.h +=================================================================== +--- linux-2.6.36-rc7-ck1.orig/include/linux/sched.h 2010-10-08 09:39:38.016240768 +1100 ++++ linux-2.6.36-rc7-ck1/include/linux/sched.h 2010-10-08 09:39:53.575007838 +1100 +@@ -1187,10 +1187,15 @@ struct task_struct { + unsigned int rt_priority; + #ifdef CONFIG_SCHED_BFS + int time_slice; +- u64 deadline; ++ /* Virtual deadline in niffies, and when the deadline was set */ ++ u64 deadline, deadline_niffy; + struct list_head run_list; + u64 last_ran; + u64 sched_time; /* sched_clock time spent running */ ++ /* Number of threads currently requesting CPU time */ ++ unsigned long threads_running; ++ /* Depth of forks from init */ ++ int fork_depth; + + unsigned long rt_timeout; + #else /* CONFIG_SCHED_BFS */ +Index: linux-2.6.36-rc7-ck1/kernel/sched_bfs.c +=================================================================== +--- linux-2.6.36-rc7-ck1.orig/kernel/sched_bfs.c 2010-10-08 09:39:37.918242270 +1100 ++++ linux-2.6.36-rc7-ck1/kernel/sched_bfs.c 2010-10-08 11:16:01.382198622 +1100 +@@ -139,6 +139,15 @@ int rr_interval __read_mostly = 6; + int sched_iso_cpu __read_mostly = 70; + + /* ++ * group_thread_accounting - sysctl to decide whether to treat whole thread ++ * groups as a single entity for the purposes of CPU distribution. ++ */ ++int group_thread_accounting __read_mostly; ++ ++/* fork_depth_penalty - Whether to penalise CPU according to fork depth. */ ++int fork_depth_penalty __read_mostly = 1; ++ ++/* + * The relative length of deadline for each priority(nice) level. + */ + static int prio_ratios[PRIO_RANGE] __read_mostly; +@@ -661,11 +670,29 @@ static int isoprio_suitable(void) + return !grq.iso_refractory; + } + ++static inline u64 __task_deadline_diff(struct task_struct *p); ++static inline u64 task_deadline_diff(struct task_struct *p); ++ + /* + * Adding to the global runqueue. Enter with grq locked. + */ + static void enqueue_task(struct task_struct *p) + { ++ s64 max_tdd = task_deadline_diff(p); ++ ++ /* ++ * Make sure that when we're queueing this task again that it ++ * doesn't have any old deadlines from when the thread group was ++ * being penalised and cap the deadline to the highest it could ++ * be, based on the current number of threads running. ++ */ ++ if (group_thread_accounting) { ++ max_tdd += p->group_leader->threads_running * ++ __task_deadline_diff(p); ++ } ++ if (p->deadline - p->deadline_niffy > max_tdd) ++ p->deadline = p->deadline_niffy + max_tdd; ++ + if (!rt_task(p)) { + /* Check it hasn't gotten rt from PI */ + if ((idleprio_task(p) && idleprio_suitable(p)) || +@@ -967,10 +994,13 @@ static int effective_prio(struct task_st + } + + /* +- * activate_task - move a task to the runqueue. Enter with grq locked. ++ * activate_task - move a task to the runqueue. Enter with grq locked. The ++ * number of threads running is stored in the group_leader struct. 
+ */ + static void activate_task(struct task_struct *p, struct rq *rq) + { ++ unsigned long *threads_running = &p->group_leader->threads_running; ++ + update_clocks(rq); + + /* +@@ -987,6 +1017,14 @@ static void activate_task(struct task_st + p->prio = effective_prio(p); + if (task_contributes_to_load(p)) + grq.nr_uninterruptible--; ++ /* ++ * Adjust deadline according to number of running threads within ++ * this thread group. This ends up distributing CPU to the thread ++ * group as a single entity. ++ */ ++ ++*threads_running; ++ if (*threads_running > 1 && group_thread_accounting) ++ p->deadline += __task_deadline_diff(p); + enqueue_task(p); + grq.nr_running++; + inc_qnr(); +@@ -998,9 +1036,14 @@ static void activate_task(struct task_st + */ + static inline void deactivate_task(struct task_struct *p) + { ++ unsigned long *threads_running = &p->group_leader->threads_running; ++ + if (task_contributes_to_load(p)) + grq.nr_uninterruptible++; + grq.nr_running--; ++ --*threads_running; ++ if (*threads_running > 0 && group_thread_accounting) ++ p->deadline -= __task_deadline_diff(p); + } + + #ifdef CONFIG_SMP +@@ -1635,6 +1678,10 @@ void wake_up_new_task(struct task_struct + parent = p->parent; + /* Unnecessary but small chance that the parent changed CPU */ + set_task_cpu(p, task_cpu(parent)); ++ if (!(clone_flags & CLONE_THREAD)) { ++ p->fork_depth++; ++ p->threads_running = 0; ++ } + activate_task(p, rq); + trace_sched_wakeup_new(p, 1); + if (!(clone_flags & CLONE_VM) && rq->curr == parent && +@@ -2524,11 +2571,20 @@ static inline u64 prio_deadline_diff(int + return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); + } + +-static inline u64 task_deadline_diff(struct task_struct *p) ++static inline u64 __task_deadline_diff(struct task_struct *p) + { + return prio_deadline_diff(TASK_USER_PRIO(p)); + } + ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ u64 pdd = __task_deadline_diff(p); ++ ++ if (fork_depth_penalty && p->fork_depth > 1) ++ pdd *= p->fork_depth; ++ return pdd; ++} ++ + static inline u64 static_deadline_diff(int static_prio) + { + return prio_deadline_diff(USER_PRIO(static_prio)); +@@ -2545,8 +2601,24 @@ static inline int ms_longest_deadline_di + */ + static void time_slice_expired(struct task_struct *p) + { ++ u64 tdd = task_deadline_diff(p); ++ ++ /* ++ * We proportionately increase the deadline according to how many ++ * threads are running. This effectively makes a thread group have ++ * the same CPU as one task, no matter how many threads are running. ++ * time_slice_expired can be called when there may be none running ++ * when p is deactivated so we must explicitly test for more than 1. ++ */ ++ if (group_thread_accounting) { ++ unsigned long *threads_running = &p->group_leader->threads_running; ++ ++ if (*threads_running > 1) ++ tdd += *threads_running * __task_deadline_diff(p); ++ } + p->time_slice = timeslice(); +- p->deadline = grq.niffies + task_deadline_diff(p); ++ p->deadline_niffy = grq.niffies; ++ p->deadline = grq.niffies + tdd; + } + + /* +@@ -3513,7 +3585,7 @@ SYSCALL_DEFINE1(nice, int, increment) + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -100. Normal tasks are centered around 1, value goes +- * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ * from 0 (SCHED_ISO) upwards (to nice +19 SCHED_IDLEPRIO). 
+ */ + int task_prio(const struct task_struct *p) + { +@@ -3525,8 +3597,12 @@ int task_prio(const struct task_struct * + + /* Convert to ms to avoid overflows */ + delta = NS_TO_MS(p->deadline - grq.niffies); +- delta = delta * 40 / ms_longest_deadline_diff(); +- if (delta > 0 && delta <= 80) ++ if (fork_depth_penalty) ++ delta *= 4; ++ else ++ delta *= 40; ++ delta /= ms_longest_deadline_diff(); ++ if (delta > 0) + prio += delta; + if (idleprio_task(p)) + prio += 40; +Index: linux-2.6.36-rc7-ck1/kernel/sysctl.c +=================================================================== +--- linux-2.6.36-rc7-ck1.orig/kernel/sysctl.c 2010-10-08 09:39:11.603648964 +1100 ++++ linux-2.6.36-rc7-ck1/kernel/sysctl.c 2010-10-08 09:39:53.579007778 +1100 +@@ -121,6 +121,8 @@ static int __maybe_unused one_hundred = + #ifdef CONFIG_SCHED_BFS + extern int rr_interval; + extern int sched_iso_cpu; ++extern int group_thread_accounting; ++extern int fork_depth_penalty; + static int __read_mostly one_thousand = 1000; + #endif + #ifdef CONFIG_PRINTK +@@ -834,6 +836,24 @@ static struct ctl_table kern_table[] = { + .extra1 = &zero, + .extra2 = &one_hundred, + }, ++ { ++ .procname = "group_thread_accounting", ++ .data = &group_thread_accounting, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "fork_depth_penalty", ++ .data = &fork_depth_penalty, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, + #endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { diff --git a/kernel-bfs-2.6.28/debian/patches/cpufreq_earlyload.diff b/kernel-bfs-2.6.28/debian/patches/cpufreq_earlyload.diff new file mode 100644 index 0000000..561b8b5 --- /dev/null +++ b/kernel-bfs-2.6.28/debian/patches/cpufreq_earlyload.diff @@ -0,0 +1,25 @@ +From 30b1d92216154c9da2c9c33b6add9c458f98df44 Mon Sep 17 00:00:00 2001 +From: Alistair Buxton +Date: Wed, 8 Sep 2010 11:51:21 +0100 +Subject: [PATCH] Fix CPU frequency driver so that it loads *before* the things that use it. 
+ +Signed-off-by: Alistair Buxton +--- + arch/arm/plat-omap/cpu-omap.c | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +diff --git a/arch/arm/plat-omap/cpu-omap.c b/arch/arm/plat-omap/cpu-omap.c +index 3974680..033a2bb 100644 +--- a/arch/arm/plat-omap/cpu-omap.c ++++ b/arch/arm/plat-omap/cpu-omap.c +@@ -188,7 +188,7 @@ static int __init omap_cpufreq_init(void) + return cpufreq_register_driver(&omap_driver); + } + +-late_initcall(omap_cpufreq_init); ++arch_initcall(omap_cpufreq_init); + + /* + * if ever we want to remove this, upon cleanup call: +-- +1.7.0.4 diff --git a/kernel-bfs-2.6.28/debian/patches/patch_swap_notify_core_support_2.6.28.diff b/kernel-bfs-2.6.28/debian/patches/patch_swap_notify_core_support_2.6.28.diff new file mode 100644 index 0000000..2a66eb1 --- /dev/null +++ b/kernel-bfs-2.6.28/debian/patches/patch_swap_notify_core_support_2.6.28.diff @@ -0,0 +1,220 @@ +diff -uprN linux-2.6.28/block/genhd.c linux-2.6.28.new/block/genhd.c +--- linux-2.6.28/block/genhd.c 2011-03-13 15:15:43.815647000 +0100 ++++ linux-2.6.28.new/block/genhd.c 2011-03-15 10:13:35.145764805 +0100 +@@ -1129,6 +1129,8 @@ struct gendisk *alloc_disk_node(int mino + disk->part_tbl->part[0] = &disk->part0; + + disk->minors = minors; ++ disk->flags |= ++ (GENHD_FL_REMAP_SWAPPED_PAGES | GENHD_FL_NOTIFY_REMAPPED_ONLY); + rand_initialize_disk(disk); + disk_to_dev(disk)->class = &block_class; + disk_to_dev(disk)->type = &disk_type; +diff -uprN linux-2.6.28/include/linux/blkdev.h linux-2.6.28.new/include/linux/blkdev.h +--- linux-2.6.28/include/linux/blkdev.h 2011-02-01 09:54:54.519982520 +0100 ++++ linux-2.6.28.new/include/linux/blkdev.h 2011-02-01 10:15:39.369903561 +0100 +@@ -1068,6 +1068,8 @@ struct block_device_operations { + int (*media_changed) (struct gendisk *); + int (*revalidate_disk) (struct gendisk *); + int (*getgeo)(struct block_device *, struct hd_geometry *); ++ /* this callback is with swap_lock and sometimes page table lock held */ ++ void (*swap_slot_free_notify) (struct block_device *, unsigned long); + struct module *owner; + }; + +diff -uprN linux-2.6.28/include/linux/genhd.h linux-2.6.28.new/include/linux/genhd.h +--- linux-2.6.28/include/linux/genhd.h 2011-03-13 15:23:58.275368057 +0100 ++++ linux-2.6.28.new/include/linux/genhd.h 2011-03-15 10:14:01.575121499 +0100 +@@ -113,6 +113,8 @@ struct hd_struct { + #define GENHD_FL_UP 16 + #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 + #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ ++#define GENHD_FL_REMAP_SWAPPED_PAGES 128 ++#define GENHD_FL_NOTIFY_REMAPPED_ONLY 256 + + #define BLK_SCSI_MAX_CMDS (256) + #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) +diff -uprN linux-2.6.28/mm/swapfile.c linux-2.6.28.new/mm/swapfile.c +--- linux-2.6.28/mm/swapfile.c 2011-02-01 09:54:31.434289623 +0100 ++++ linux-2.6.28.new/mm/swapfile.c 2011-03-15 10:14:50.998178343 +0100 +@@ -270,10 +270,23 @@ out: + return NULL; + } + ++static void swap_entry_update(struct swap_info_struct *p, unsigned long offset) ++{ ++ if (offset < p->lowest_bit) ++ p->lowest_bit = offset; ++ if (offset > p->highest_bit) ++ p->highest_bit = offset; ++ if (p->prio > swap_info[swap_list.next].prio) ++ swap_list.next = p - swap_info; ++ nr_swap_pages++; ++ p->inuse_pages--; ++} ++ + static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) + { + int count = p->swap_map[offset]; + unsigned old; ++ struct gendisk *disk; + + if (count >= SWAP_MAP_MAX) + return count; +@@ -283,28 +296,40 @@ static int swap_entry_free(struct swap_i + if 
(count) + return count; + +- spin_lock(&p->remap_lock); ++ disk = p->bdev->bd_disk; + +- if (offset < p->lowest_bit) +- p->lowest_bit = offset; +- if (offset > p->highest_bit) +- p->highest_bit = offset; +- if (p->prio > swap_info[swap_list.next].prio) +- swap_list.next = p - swap_info; +- nr_swap_pages++; +- p->inuse_pages--; ++ if (p->swap_remap) { ++ spin_lock(&p->remap_lock); ++ swap_entry_update(p, offset); ++ } ++ else { ++ swap_entry_update(p, offset); ++ if (disk->fops->swap_slot_free_notify) ++ disk->fops->swap_slot_free_notify(p->bdev, offset); ++ return 0; ++ } + + /* Re-map the page number */ + old = p->swap_remap[offset] & 0x7FFFFFFF; + /* Zero means it was not re-mapped */ +- if (!old) +- goto out; ++ if (!old) { ++ /* Skip notify if flag is set or the page is used */ ++ if ((disk->flags & GENHD_FL_NOTIFY_REMAPPED_ONLY) || ++ (p->swap_remap[offset] & 0x80000000)) ++ goto out; ++ ++ old = offset; ++ goto notify; ++ } + /* Clear the re-mapping */ + p->swap_remap[offset] &= 0x80000000; + /* Mark the re-mapped page as unused */ + p->swap_remap[old] &= 0x7FFFFFFF; + /* Record how many free pages there are */ + p->gaps_exist += 1; ++notify: ++ if (disk->fops->swap_slot_free_notify) ++ disk->fops->swap_slot_free_notify(p->bdev, old); + out: + spin_unlock(&p->remap_lock); + return 0; +@@ -1110,6 +1135,8 @@ sector_t map_swap_page(struct swap_info_ + struct swap_extent *start_se = se; + unsigned old; + ++ if (!sis->swap_remap) ++ goto out; + /* + * Instead of using the offset we are given, re-map it to the next + * sequential position. +@@ -1159,7 +1186,7 @@ sector_t map_swap_page(struct swap_info_ + offset = old; + } + spin_unlock(&sis->remap_lock); +- ++out: + for ( ; ; ) { + struct list_head *lh; + +@@ -1517,8 +1544,10 @@ SYSCALL_DEFINE1(swapoff, const char __us + p->flags = 0; + spin_unlock(&swap_lock); + mutex_unlock(&swapon_mutex); +- kfree(p->gap_pool_arr); +- vfree(p->swap_remap); ++ if (p->swap_remap) { ++ kfree(p->gap_pool_arr); ++ vfree(p->swap_remap); ++ } + vfree(swap_map); + inode = mapping->host; + if (S_ISBLK(inode->i_mode)) { +@@ -1832,15 +1861,17 @@ SYSCALL_DEFINE2(swapon, const char __use + error = -ENOMEM; + goto bad_swap; + } +- swap_remap = vmalloc(maxpages * sizeof(unsigned)); +- if (!swap_remap) { +- error = -ENOMEM; +- goto bad_swap; ++ if (p->bdev->bd_disk->flags & GENHD_FL_REMAP_SWAPPED_PAGES) { ++ swap_remap = vmalloc(maxpages * sizeof(unsigned)); ++ if (!swap_remap) { ++ error = -ENOMEM; ++ goto bad_swap; ++ } ++ memset(swap_remap, 0, maxpages * sizeof(unsigned)); + } + + error = 0; + memset(swap_map, 0, maxpages * sizeof(short)); +- memset(swap_remap, 0, maxpages * sizeof(unsigned)); + for (i = 0; i < swap_header->info.nr_badpages; i++) { + int page_nr = swap_header->info.badpages[i]; + if (page_nr <= 0 || page_nr >= swap_header->info.last_page) +@@ -1872,13 +1903,15 @@ SYSCALL_DEFINE2(swapon, const char __use + goto bad_swap; + } + +- p->gap_pool_arr = kmalloc(sizeof(struct swap_gap_node)* +- SWAP_GAP_TREE_SIZE, GFP_KERNEL); +- if (!p->gap_pool_arr) { +- error = -ENOMEM; +- goto bad_swap; ++ if (swap_remap) { ++ p->gap_pool_arr = kmalloc(sizeof(struct swap_gap_node)* ++ SWAP_GAP_TREE_SIZE, GFP_KERNEL); ++ if (!p->gap_pool_arr) { ++ error = -ENOMEM; ++ goto bad_swap; ++ } ++ p->gaps_tree = RB_ROOT; + } +- p->gaps_tree = RB_ROOT; + + mutex_lock(&swapon_mutex); + spin_lock(&swap_lock); +@@ -1889,11 +1922,13 @@ SYSCALL_DEFINE2(swapon, const char __use + p->prio = --least_priority; + p->swap_map = swap_map; + p->swap_remap = swap_remap; +- p->gap_next = 1; +- 
p->gap_end = p->max - 1; +- p->gaps_exist = p->max - 1; +- spin_lock_init(&p->remap_lock); +- mutex_init(&p->remap_mutex); ++ if (swap_remap) { ++ p->gap_next = 1; ++ p->gap_end = p->max - 1; ++ p->gaps_exist = p->max - 1; ++ spin_lock_init(&p->remap_lock); ++ mutex_init(&p->remap_mutex); ++ } + p->flags = SWP_ACTIVE; + nr_swap_pages += nr_good_pages; + total_swap_pages += nr_good_pages; +@@ -1932,7 +1967,8 @@ bad_swap_2: + p->swap_file = NULL; + p->flags = 0; + spin_unlock(&swap_lock); +- vfree(swap_remap); ++ if (swap_remap) ++ vfree(swap_remap); + vfree(swap_map); + if (swap_file) + filp_close(swap_file, NULL); diff --git a/kernel-bfs-2.6.28/debian/patches/wl12xx-rx-fix.diff b/kernel-bfs-2.6.28/debian/patches/wl12xx-rx-fix.diff new file mode 100644 index 0000000..5eb85aa --- /dev/null +++ b/kernel-bfs-2.6.28/debian/patches/wl12xx-rx-fix.diff @@ -0,0 +1,15 @@ +diff -ru a/drivers/net/wireless/wl12xx/wl1251_rx.c b/drivers/net/wireless/wl12xx/wl1251_rx.c +--- a/drivers/net/wireless/wl12xx/wl1251_rx.c 2011-01-22 17:33:06.966780985 +1100 ++++ b/drivers/net/wireless/wl12xx/wl1251_rx.c 2011-01-22 17:17:36.992422987 +1100 +@@ -185,8 +185,8 @@ + rx_buffer = skb_put(skb, length); + wl1251_spi_mem_read(wl, rx_packet_ring_addr, rx_buffer, length); + +- /* The actual lenght doesn't include the target's alignment */ +- skb->len = desc->length - PLCP_HEADER_LENGTH; ++ /* The actual length doesn't include the target's alignment */ ++ skb_trim(skb, desc->length - PLCP_HEADER_LENGTH); + + fc = (u16 *)skb->data; + + -- 1.7.9.5
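
The deadline arithmetic added by the bfs357 patch above is compact enough to restate outside the kernel. The sketch below is a minimal userspace model, assuming only the formulas visible in the sched_bfs.c hunks (prio_deadline_diff(), the fork-depth multiplier in task_deadline_diff(), and the thread-group term from time_slice_expired()). The prio_ratios initialisation (128 at the top user priority, roughly +10% per nice level) and the helper names init_prio_ratios()/deadline_offset() are assumptions for illustration only and are not part of this diff.

#include <stdio.h>
#include <stdint.h>

#define PRIO_RANGE  40                      /* nice -20 .. +19 */
#define MS_TO_NS(x) ((x) * 1000000ULL)

static int rr_interval = 6;                 /* default from sched_bfs.c */
static int fork_depth_penalty = 1;          /* sysctl default: enabled */
static int group_thread_accounting = 0;     /* sysctl default: disabled */

static int prio_ratios[PRIO_RANGE];

/* Assumed BFS initialisation: 128 for nice -20, ~10% more per level. */
static void init_prio_ratios(void)
{
    int i;

    prio_ratios[0] = 128;
    for (i = 1; i < PRIO_RANGE; i++)
        prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
}

/* prio_deadline_diff() as in the patch: ratio * rr_interval * 1ms/128. */
static uint64_t prio_deadline_diff(int user_prio)
{
    return prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128);
}

/*
 * Combined effect of task_deadline_diff() and the group adjustment in
 * time_slice_expired(): scale the base offset by fork depth, then add
 * one base offset per extra runnable thread when group accounting is on.
 */
static uint64_t deadline_offset(int user_prio, int fork_depth,
                                unsigned long threads_running)
{
    uint64_t pdd = prio_deadline_diff(user_prio);
    uint64_t tdd = pdd;

    if (fork_depth_penalty && fork_depth > 1)
        tdd *= fork_depth;
    if (group_thread_accounting && threads_running > 1)
        tdd += threads_running * pdd;
    return tdd;
}

int main(void)
{
    init_prio_ratios();
    /* A nice-0 task (user prio 20), three forks from init, 4 threads. */
    printf("deadline offset: %llu ns\n",
           (unsigned long long)deadline_offset(20, 3, 4));
    return 0;
}

With the defaults modelled here, a process three forks from init gets a deadline offset three times that of a depth-one process at the same priority, which is the "make -j24 feels like plain make" effect described in the patch header; writing 0 or 1 to /proc/sys/kernel/fork_depth_penalty and /proc/sys/kernel/group_thread_accounting toggles the corresponding terms.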