Changes to series and rx51_defconfig file for BFQ
[kernel-bfs] / kernel-bfs-2.6.28 / debian / patches / block-switch-from-BFQ-for-2.6.28-to-BFQ-v1.patch
1 diff -uprN linux-2.6.28/block/bfq-cgroup.c linux-2.6.28.new/block/bfq-cgroup.c
2 --- linux-2.6.28/block/bfq-cgroup.c     2011-06-13 21:41:23.758108202 +0200
3 +++ linux-2.6.28.new/block/bfq-cgroup.c 2011-06-13 21:41:41.631928601 +0200
4 @@ -6,10 +6,13 @@
5   *
6   * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
7   *                   Paolo Valente <paolo.valente@unimore.it>
8 + *
9 + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
10   */
11  
12  #ifdef CONFIG_CGROUP_BFQIO
13  static struct bfqio_cgroup bfqio_root_cgroup = {
14 +       .weight = BFQ_DEFAULT_GRP_WEIGHT,
15         .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
16         .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
17  };
18 @@ -17,6 +20,8 @@ static struct bfqio_cgroup bfqio_root_cg
19  static inline void bfq_init_entity(struct bfq_entity *entity,
20                                    struct bfq_group *bfqg)
21  {
22 +       entity->weight = entity->new_weight;
23 +       entity->orig_weight = entity->new_weight;
24         entity->ioprio = entity->new_ioprio;
25         entity->ioprio_class = entity->new_ioprio_class;
26         entity->parent = bfqg->my_entity;
27 @@ -54,6 +59,8 @@ static inline void bfq_group_init_entity
28  {
29         struct bfq_entity *entity = &bfqg->entity;
30  
31 +       entity->weight = entity->new_weight = bgrp->weight;
32 +       entity->orig_weight = entity->new_weight;
33         entity->ioprio = entity->new_ioprio = bgrp->ioprio;
34         entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
35         entity->ioprio_changed = 1;
36 @@ -301,6 +308,9 @@ static struct bfq_group *__bfq_cic_chang
37  
38                 if (entity->sched_data != &bfqg->sched_data) {
39                         cic_set_bfqq(cic, NULL, 0);
40 +                       bfq_log_bfqq(bfqd, async_bfqq,
41 +                                    "cic_change_group: %p %d",
42 +                                    async_bfqq, async_bfqq->ref);
43                         bfq_put_queue(async_bfqq);
44                 }
45         }
46 @@ -453,6 +463,7 @@ static void bfq_disconnect_groups(struct
47         struct hlist_node *pos, *n;
48         struct bfq_group *bfqg;
49  
50 +       bfq_log(bfqd, "disconnect_groups beginning");
51         hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) {
52                 hlist_del(&bfqg->bfqd_node);
53  
54 @@ -466,6 +477,9 @@ static void bfq_disconnect_groups(struct
55                  * to the list.
56                  */
57                 rcu_assign_pointer(bfqg->bfqd, NULL);
58 +
59 +               bfq_log(bfqd, "disconnect_groups: put async for group %p",
60 +                       bfqg);
61                 bfq_put_async_queues(bfqd, bfqg);
62         }
63  }
64 @@ -475,6 +489,8 @@ static inline void bfq_free_root_group(s
65         struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
66         struct bfq_group *bfqg = bfqd->root_group;
67  
68 +       bfq_put_async_queues(bfqd, bfqg);
69 +
70         spin_lock_irq(&bgrp->lock);
71         hlist_del_rcu(&bfqg->group_node);
72         spin_unlock_irq(&bgrp->lock);
73 @@ -529,6 +545,7 @@ static u64 bfqio_cgroup_##__VAR##_read(s
74         return ret;                                                     \
75  }
76  
77 +SHOW_FUNCTION(weight);
78  SHOW_FUNCTION(ioprio);
79  SHOW_FUNCTION(ioprio_class);
80  #undef SHOW_FUNCTION
81 @@ -551,9 +568,9 @@ static int bfqio_cgroup_##__VAR##_write(
82         bgrp = cgroup_to_bfqio(cgroup);                                 \
83                                                                         \
84         spin_lock_irq(&bgrp->lock);                                     \
85 -       bgrp->__VAR = (unsigned char)val;                               \
86 +       bgrp->__VAR = (unsigned short)val;                              \
87         hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) {  \
88 -               bfqg->entity.new_##__VAR = (unsigned char)val;          \
89 +               bfqg->entity.new_##__VAR = (unsigned short)val;         \
90                 smp_wmb();                                              \
91                 bfqg->entity.ioprio_changed = 1;                        \
92         }                                                               \
93 @@ -564,12 +581,18 @@ static int bfqio_cgroup_##__VAR##_write(
94         return 0;                                                       \
95  }
96  
97 +STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
98  STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
99  STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
100  #undef STORE_FUNCTION
101  
102  static struct cftype bfqio_files[] = {
103         {
104 +               .name = "weight",
105 +               .read_u64 = bfqio_cgroup_weight_read,
106 +               .write_u64 = bfqio_cgroup_weight_write,
107 +       },
108 +       {
109                 .name = "ioprio",
110                 .read_u64 = bfqio_cgroup_ioprio_read,
111                 .write_u64 = bfqio_cgroup_ioprio_write,
112 @@ -697,6 +720,8 @@ struct cgroup_subsys bfqio_subsys = {
113  static inline void bfq_init_entity(struct bfq_entity *entity,
114                                    struct bfq_group *bfqg)
115  {
116 +       entity->weight = entity->new_weight;
117 +       entity->orig_weight = entity->new_weight;
118         entity->ioprio = entity->new_ioprio;
119         entity->ioprio_class = entity->new_ioprio_class;
120         entity->sched_data = &bfqg->sched_data;
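
The hunks above expose the new per-group "weight" attribute through the existing SHOW_FUNCTION/STORE_FUNCTION macros. For readability, here is a rough sketch of what STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT) expands to, reconstructed from the macro body shown in the hunk above; the local declarations and the range check are assumptions, not text copied from the patch:

static int bfqio_cgroup_weight_write(struct cgroup *cgroup,
                                     struct cftype *cftype, u64 val)
{
        struct bfqio_cgroup *bgrp;
        struct bfq_group *bfqg;
        struct hlist_node *n;

        /* assumed: reject values outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT] */
        if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
                return -EINVAL;

        bgrp = cgroup_to_bfqio(cgroup);

        spin_lock_irq(&bgrp->lock);
        /* store the new weight and propagate it to every group entity */
        bgrp->weight = (unsigned short)val;
        hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) {
                bfqg->entity.new_weight = (unsigned short)val;
                smp_wmb();
                bfqg->entity.ioprio_changed = 1;
        }
        spin_unlock_irq(&bgrp->lock);

        return 0;
}
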
121 diff -uprN linux-2.6.28/block/bfq-iosched.c linux-2.6.28.new/block/bfq-iosched.c
122 --- linux-2.6.28/block/bfq-iosched.c    2011-06-13 21:41:23.766107399 +0200
123 +++ linux-2.6.28.new/block/bfq-iosched.c        2011-06-13 21:42:46.519829365 +0200
124 @@ -7,23 +7,36 @@
125   * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
126   *                   Paolo Valente <paolo.valente@unimore.it>
127   *
128 - * BFQ is a proportional share disk scheduling algorithm based on CFQ,
129 - * that uses the B-WF2Q+ internal scheduler to assign budgets (i.e.,
130 - * slices in the service domain) to the tasks accessing the disk.  It
131 - * has been introduced in [1], where the interested reader can find an
132 - * accurate description of the algorithm, the guarantees it provides
133 - * and their formal proofs.  With respect to the algorithm presented
134 - * in the paper, this implementation adds a timeout to limit the maximum
135 - * time a queue can spend to complete its assigned budget, and a
136 - * hierarchical extension, based on H-WF2Q+.
137 + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
138 + *
139 + * BFQ is a proportional share disk scheduling algorithm based on the
140 + * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
141 + * measured in number of sectors, to tasks instead of time slices.
142 + * The disk is not granted to the active task for a given time slice,
143 + * but until it has exhausted its assigned budget.  This change from
144 + * the time to the service domain allows BFQ to distribute the disk
145 + * bandwidth among tasks as desired, without any distortion due to
146 + * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
147 + * internal scheduler, called B-WF2Q+, to schedule tasks according to
148 + * their budgets.  Thanks to this accurate scheduler, BFQ can afford
149 + * to assign high budgets to disk-bound non-seeky tasks (to boost the
150 + * throughput), and yet guarantee low latencies to interactive and
151 + * soft real-time applications.
152 + *
153 + * BFQ has been introduced in [1], where the interested reader can
154 + * find an accurate description of the algorithm, the bandwidth
155 + * distribution and latency guarantees it provides, plus formal proofs
156 + * of all the properties.  With respect to the algorithm presented in
157 + * the paper, this implementation adds several little heuristics, and
158 + * a hierarchical extension, based on H-WF2Q+.
159   *
160   * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
161   * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
162   * complexity derives from the one introduced with EEVDF in [3].
163   *
164   * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
165 - *     with Deterministic Guarantees on Bandwidth Distribution,'' to be
166 - *     published.
167 + *     with Deterministic Guarantees on Bandwidth Distribution,'' to appear
168 + *     in IEEE Transactions on Computers.
169   *
170   *     http://algo.ing.unimo.it/people/paolo/disk_sched/bfq.pdf
171   *
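
For a sense of scale for the budgets discussed above: they are measured in 512-byte sectors, so the default maximum budget set below (bfq_default_max_budget = 16 * 1024 sectors) corresponds to 8 MiB of service per budget.
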
172 @@ -40,6 +53,7 @@
173   *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
174   */
175  #include <linux/module.h>
176 +#include <linux/slab.h>
177  #include <linux/blkdev.h>
178  #include <linux/cgroup.h>
179  #include <linux/elevator.h>
180 @@ -47,26 +61,33 @@
181  #include <linux/ioprio.h>
182  #include "bfq.h"
183  
184 -/* Max nr of dispatches in one round of service. */
185 +/* Max number of dispatches in one round of service. */
186  static const int bfq_quantum = 4;
187  
188 -/* Expiration time of each request (jiffies). */
189 +/* Expiration time of sync (0) and async (1) requests, in jiffies. */
190  static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
191  
192  /* Maximum backwards seek, in KiB. */
193  static const int bfq_back_max = 16 * 1024;
194  
195 -/* Penalty of a backwards seek. */
196 +/* Penalty of a backwards seek, in number of sectors. */
197  static const int bfq_back_penalty = 2;
198  
199 -/* Idling period duration (jiffies). */
200 +/* Idling period duration, in jiffies. */
201  static int bfq_slice_idle = HZ / 125;
202  
203 -/* Default maximum budget values (sectors). */
204 -static const int bfq_max_budget = 16 * 1024;
205 +/* Default maximum budget values, in sectors and number of requests. */
206 +static const int bfq_default_max_budget = 16 * 1024;
207  static const int bfq_max_budget_async_rq = 4;
208  
209 -/* Default timeout values (jiffies), approximating CFQ defaults. */
210 +/*
211 + * Async to sync throughput distribution is controlled as follows:
212 + * when an async request is served, the entity is charged the number
213 + * of sectors of the request, multiplied by the factor below.
214 + */
215 +static const int bfq_async_charge_factor = 10;
216 +
217 +/* Default timeout values, in jiffies, approximating CFQ defaults. */
218  static const int bfq_timeout_sync = HZ / 8;
219  static int bfq_timeout_async = HZ / 25;
220  
221 @@ -84,8 +105,7 @@ static DEFINE_SPINLOCK(bfq_ioc_gone_lock
222  #define BFQ_HW_QUEUE_THRESHOLD 4
223  #define BFQ_HW_QUEUE_SAMPLES   32
224  
225 -/* Budget feedback step. */
226 -#define BFQ_BUDGET_STEP                128
227 +#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > (8 * 1024))
228  
229  /* Min samples used for peak rate estimation (for autotuning). */
230  #define BFQ_PEAK_RATE_SAMPLES  32
231 @@ -104,15 +124,10 @@ static DEFINE_SPINLOCK(bfq_ioc_gone_lock
232  #include "bfq-sched.c"
233  #include "bfq-cgroup.c"
234  
235 -static inline int bfq_class_idle(struct bfq_queue *bfqq)
236 -{
237 -       return bfqq->entity.ioprio_class == IOPRIO_CLASS_IDLE;
238 -}
239 +#define bfq_class_idle(bfqq)   ((bfqq)->entity.ioprio_class ==\
240 +                                IOPRIO_CLASS_IDLE)
241  
242 -static inline int bfq_sample_valid(int samples)
243 -{
244 -       return samples > 80;
245 -}
246 +#define bfq_sample_valid(samples)      ((samples) > 80)
247  
248  /*
249   * We regard a request as SYNC, if either it's a read or has the SYNC bit
250 @@ -180,7 +195,7 @@ static struct request *bfq_choose_req(st
251         last = bfqd->last_position;
252  
253         /*
254 -        * by definition, 1KiB is 2 sectors
255 +        * By definition, 1KiB is 2 sectors.
256          */
257         back_max = bfqd->bfq_back_max * 2;
258  
259 @@ -282,17 +297,24 @@ static void bfq_del_rq_rb(struct request
260                 bfq_del_bfqq_busy(bfqd, bfqq, 1);
261  }
262  
263 +/* see the definition of bfq_async_charge_factor for details */
264 +static inline unsigned long bfq_serv_to_charge(struct request *rq,
265 +                                              struct bfq_queue *bfqq)
266 +{
267 +       return rq->hard_nr_sectors *
268 +               (1 + ((!bfq_bfqq_sync(bfqq)) * bfq_async_charge_factor));
269 +}
270 +
271  /**
272   * bfq_updated_next_req - update the queue after a new next_rq selection.
273   * @bfqd: the device data the queue belongs to.
274   * @bfqq: the queue to update.
275   *
276 - * Whenever the first request of a queue changes we try to allocate it
277 - * enough service (if it has grown), or to anticipate its finish time
278 - * (if it has shrinked), to reduce the time it has to wait, still taking
279 - * into account the queue budget.  We try to avoid the queue having not
280 - * enough service allocated for its first request, thus having to go
281 - * through two dispatch rounds to actually dispatch the request.
282 + * If the first request of a queue changes we make sure that the queue
283 + * has enough budget to serve at least its first request (if the
284 + * request has grown).  We do this because if the queue has not enough
285 + * budget for its first request, it has to go through two dispatch
286 + * rounds to actually get it dispatched.
287   */
288  static void bfq_updated_next_req(struct bfq_data *bfqd,
289                                  struct bfq_queue *bfqq)
290 @@ -300,7 +322,7 @@ static void bfq_updated_next_req(struct 
291         struct bfq_entity *entity = &bfqq->entity;
292         struct bfq_service_tree *st = bfq_entity_service_tree(entity);
293         struct request *next_rq = bfqq->next_rq;
294 -       bfq_service_t new_budget;
295 +       unsigned long new_budget;
296  
297         if (next_rq == NULL)
298                 return;
299 @@ -315,9 +337,10 @@ static void bfq_updated_next_req(struct 
300         BUG_ON(entity->tree != &st->active);
301         BUG_ON(entity == entity->sched_data->active_entity);
302  
303 -       new_budget = max(bfqq->max_budget, next_rq->hard_nr_sectors);
304 +       new_budget = max_t(unsigned long, bfqq->max_budget,
305 +                          bfq_serv_to_charge(next_rq, bfqq));
306         entity->budget = new_budget;
307 -       bfq_log_bfqq(bfqd, bfqq, "budget=%lu", new_budget);
308 +       bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
309         bfq_activate_bfqq(bfqd, bfqq);
310  }
311  
312 @@ -328,6 +351,7 @@ static void bfq_add_rq_rb(struct request
313         struct bfq_data *bfqd = bfqq->bfqd;
314         struct request *__alias, *next_rq;
315  
316 +       bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
317         bfqq->queued[rq_is_sync(rq)]++;
318         bfqd->queued++;
319  
320 @@ -339,15 +363,33 @@ static void bfq_add_rq_rb(struct request
321                 bfq_dispatch_insert(bfqd->queue, __alias);
322  
323         /*
324 -        * check if this request is a better next-serve candidate
325 +        * Check if this request is a better next-serve candidate.
326          */
327         next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq);
328         BUG_ON(next_rq == NULL);
329         bfqq->next_rq = next_rq;
330  
331         if (!bfq_bfqq_busy(bfqq)) {
332 -               entity->budget = max(bfqq->max_budget,
333 -                                    next_rq->hard_nr_sectors);
334 +               entity->budget = max_t(unsigned long, bfqq->max_budget,
335 +                                      bfq_serv_to_charge(next_rq, bfqq));
336 +
337 +               /*
338 +                * If the queue is not being boosted and has been idle
339 +                * for enough time, start a boosting period
340 +                */
341 +               if (bfqd->low_latency && bfqq->high_weight_budget == 0) {
342 +                       if (bfqq->last_activation_time + BFQ_MIN_ACT_INTERVAL <
343 +                           jiffies_to_msecs(jiffies)) {
344 +                               bfqq->high_weight_budget = BFQ_BOOST_BUDGET;
345 +                               entity->ioprio_changed = 1;
346 +                               bfq_log_bfqq(bfqd, bfqq,
347 +                                            "wboost starting at %lu msec",
348 +                                            bfqq->last_activation_time);
349 +                       }
350 +                       bfqq->last_activation_time =
351 +                               jiffies_to_msecs(jiffies);
352 +               }
353 +
354                 bfq_add_bfqq_busy(bfqd, bfqq);
355         } else
356                 bfq_updated_next_req(bfqd, bfqq);
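
The hunk above is where weight boosting (the low_latency heuristic) starts: a queue that becomes busy after having been idle for more than BFQ_MIN_ACT_INTERVAL msec is granted high_weight_budget = BFQ_BOOST_BUDGET, and the boost is then consumed in the dispatch path (see the __bfq_dispatch_requests hunk further down). A condensed restatement of the lifecycle, with the two constants taken from bfq.h (not shown in this patch):

if (bfqd->low_latency && bfqq->high_weight_budget == 0) {
        if (bfqq->last_activation_time + BFQ_MIN_ACT_INTERVAL <
            jiffies_to_msecs(jiffies)) {
                /* idle long enough: start a boosting period */
                bfqq->high_weight_budget = BFQ_BOOST_BUDGET;
                bfqq->entity.ioprio_changed = 1;
        }
        bfqq->last_activation_time = jiffies_to_msecs(jiffies);
}
/* The boost is consumed in __bfq_dispatch_requests: the remaining
 * high_weight_budget is decreased by each charged service, and dropped
 * to zero once BFQ_BOOST_TIMEOUT msec have elapsed since activation. */
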
357 @@ -446,7 +488,7 @@ static void bfq_merged_requests(struct r
358                                 struct request *next)
359  {
360         /*
361 -        * reposition in fifo if next is older than rq
362 +        * Reposition in fifo if next is older than rq.
363          */
364         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
365             time_before(next->start_time, rq->start_time))
366 @@ -475,10 +517,7 @@ static int bfq_allow_merge(struct reques
367                 return 0;
368  
369         bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio));
370 -       if (bfqq == RQ_BFQQ(rq))
371 -               return 1;
372 -
373 -       return 0;
374 +       return bfqq == RQ_BFQQ(rq);
375  }
376  
377  static void __bfq_set_active_queue(struct bfq_data *bfqd,
378 @@ -489,9 +528,10 @@ static void __bfq_set_active_queue(struc
379                 bfq_mark_bfqq_budget_new(bfqq);
380                 bfq_clear_bfqq_fifo_expire(bfqq);
381  
382 -               bfqq->budgets_assigned = (bfqq->budgets_assigned*7 + 256) / 8;
383 +               bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
384  
385 -               bfq_log_bfqq(bfqd, bfqq, "active");
386 +               bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
387 +                            bfqq->entity.budget);
388         }
389  
390         bfqd->active_queue = bfqq;
391 @@ -509,7 +549,26 @@ static struct bfq_queue *bfq_set_active_
392         return bfqq;
393  }
394  
395 -#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
396 +/*
397 + * If enough samples have been computed, return the current max budget
398 + * stored in bfqd, which is dynamically updated according to the
399 + * estimated disk peak rate; otherwise return the default max budget
400 + */
401 +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
402 +{
403 +       return bfqd->budgets_assigned < 194 ? bfq_default_max_budget :
404 +               bfqd->bfq_max_budget;
405 +}
406 +
407 +/*
408 + * Return min budget, which is a fraction of the current or default
409 + * max budget (trying with 1/32)
410 + */
411 +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
412 +{
413 +       return bfqd->budgets_assigned < 194 ? bfq_default_max_budget / 32 :
414 +               bfqd->bfq_max_budget / 32;
415 +}
416  
417  static void bfq_arm_slice_timer(struct bfq_data *bfqd)
418  {
419 @@ -531,19 +590,30 @@ static void bfq_arm_slice_timer(struct b
420         bfq_mark_bfqq_wait_request(bfqq);
421  
422         /*
423 -        * we don't want to idle for seeks, but we do want to allow
424 +        * We don't want to idle for seeks, but we do want to allow
425          * fair distribution of slice time for a process doing back-to-back
426 -        * seeks. so allow a little bit of time for him to submit a new rq
427 +        * seeks. So allow a little bit of time for him to submit a new rq.
428 +        *
429 +        * To prevent processes with (partly) seeky workloads from
430 +        * being too ill-treated, grant them a small fraction of the
431 +        * assigned budget before reducing the waiting time to
432 +        * BFQ_MIN_TT. In practice, this helps reduce latency.
433          */
434         sl = bfqd->bfq_slice_idle;
435 -       if (bfq_sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
436 +       if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
437 +           bfqq->entity.service > bfq_max_budget(bfqd) / 8)
438                 sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
439  
440         bfqd->last_idling_start = ktime_get();
441         mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
442 -       bfq_log(bfqd, "arm idle: %lu", sl);
443 +       bfq_log(bfqd, "arm idle: %lu ms", sl);
444  }
445  
446 +/*
447 + * Set the maximum time for the active queue to consume its
448 + * budget. This prevents seeky processes from lowering the disk
449 + * throughput (always guaranteed with a time slice scheme as in CFQ).
450 + */
451  static void bfq_set_budget_timeout(struct bfq_data *bfqd)
452  {
453         struct bfq_queue *bfqq = bfqd->active_queue;
454 @@ -552,7 +622,8 @@ static void bfq_set_budget_timeout(struc
455  
456         bfq_clear_bfqq_budget_new(bfqq);
457         bfqq->budget_timeout = jiffies +
458 -               bfqd->bfq_timeout[!!bfq_bfqq_sync(bfqq)];
459 +               bfqd->bfq_timeout[!!bfq_bfqq_sync(bfqq)] *
460 +               (bfqq->entity.weight / bfqq->entity.orig_weight);
461  }
462  
463  /*
464 @@ -572,7 +643,7 @@ static void bfq_dispatch_insert(struct r
465  }
466  
467  /*
468 - * return expired entry, or NULL to just start from scratch in rbtree
469 + * Return expired entry, or NULL to just start from scratch in rbtree.
470   */
471  static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
472  {
473 @@ -597,7 +668,7 @@ static struct request *bfq_check_fifo(st
474         return rq;
475  }
476  
477 -static inline bfq_service_t bfq_bfqq_budget_left(struct bfq_queue *bfqq)
478 +static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
479  {
480         struct bfq_entity *entity = &bfqq->entity;
481         return entity->budget - entity->service;
482 @@ -616,98 +687,115 @@ static void __bfq_bfqq_expire(struct bfq
483  }
484  
485  /**
486 - * bfq_default_budget - return the default budget for @bfqq on @bfqd.
487 - * @bfqd: the device descriptor.
488 - * @bfqq: the queue to consider.
489 - *
490 - * We use 3/4 of the @bfqd maximum budget as the default value
491 - * for the max_budget field of the queues.  This lets the feedback
492 - * mechanism to start from some middle ground, then the behavior
493 - * of the task will drive the heuristics towards high values, if
494 - * it behaves as a greedy sequential reader, or towards small values
495 - * if it shows a more intermittent behavior.
496 - */
497 -static bfq_service_t bfq_default_budget(struct bfq_data *bfqd,
498 -                                       struct bfq_queue *bfqq)
499 -{
500 -       bfq_service_t budget;
501 -
502 -       /*
503 -        * When we need an estimate of the peak rate we need to avoid
504 -        * to give budgets that are too short due to previous measurements.
505 -        * So, in the first 10 assignments use a ``safe'' budget value.
506 -        */
507 -       if (bfqq->budgets_assigned < 194 && bfqd->bfq_user_max_budget == 0)
508 -               budget = bfq_max_budget;
509 -       else
510 -               budget = bfqd->bfq_max_budget;
511 -
512 -       return budget - budget / 4;
513 -}
514 -
515 -static inline bfq_service_t bfq_min_budget(struct bfq_data *bfqd,
516 -                                          struct bfq_queue *bfqq)
517 -{
518 -       return bfqd->bfq_max_budget / 2;
519 -}
520 -
521 -/**
522   * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
523   * @bfqd: device data.
524   * @bfqq: queue to update.
525   * @reason: reason for expiration.
526   *
527 - * Handle the feedback on @bfqq budget.  This is driven by the following
528 - * principles:
529 - *   - async queues get always the maximum budget value (their ability to
530 - *     dispatch is limited by @bfqd->bfq_max_budget_async_rq).
531 - *   - If @bfqq has been too idle we decrease its budget, as it is likely
532 - *     to be more interested in latency than in throughput.
533 - *   - If @bfqq took too much to consume its budget it is likely to be
534 - *     seeky, so reset the budget to the default, in order to have all
535 - *     the seeky queues to be charged for the same service, trying to
536 - *     achieve fairness at least in the time domain among them.
537 - *   - If @bfqq exhausted its budget treat it as a greedy reader, in
538 - *     order to run it at full speed.
539 - *   - If @bfqq expired due to lack of requests leave its budget untouched.
540 + * Handle the feedback on @bfqq budget.  See the body for detailed
541 + * comments.
542   */
543  static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
544                                      struct bfq_queue *bfqq,
545                                      enum bfqq_expiration reason)
546  {
547         struct request *next_rq;
548 -       bfq_service_t budget, min_budget;
549 +       unsigned long budget, min_budget;
550  
551         budget = bfqq->max_budget;
552 -       min_budget = bfq_min_budget(bfqd, bfqq);
553 +       min_budget = bfq_min_budget(bfqd);
554  
555         BUG_ON(bfqq != bfqd->active_queue);
556  
557 +       bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
558 +               bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
559 +       bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
560 +               budget, bfq_min_budget(bfqd));
561 +       bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
562 +               bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
563 +
564         if (bfq_bfqq_sync(bfqq)) {
565                 switch (reason) {
566 +               /*
567 +                * Caveat: in all the following cases we trade latency
568 +                * for throughput.
569 +                */
570                 case BFQ_BFQQ_TOO_IDLE:
571 -                       if (budget > min_budget + BFQ_BUDGET_STEP)
572 -                               budget -= BFQ_BUDGET_STEP;
573 -                       else
574 -                               budget = min_budget;
575 +                       /*
576 +                        * This is the only case where we may reduce
577 +                        * the budget: if there are no requests of the
578 +                        * process still waiting for completion, then
579 +                        * we assume (tentatively) that the timer has
580 +                        * expired because the batch of requests of
581 +                        * the process could have been served with a
582 +                        * smaller budget.  Hence, betting that
583 +                        * process will behave in the same way when it
584 +                        * becomes backlogged again, we reduce its
585 +                        * next budget.  As long as we guess right,
586 +                        * this budget cut reduces the latency
587 +                        * experienced by the process.
588 +                        *
589 +                        * However, if there are still outstanding
590 +                        * requests, then the process may have not yet
591 +                        * issued its next request just because it is
592 +                        * still waiting for the completion of some of
593 +                        * the still outstanding ones.  So in this
594 +                        * subcase we do not reduce its budget, on the
595 +                        * contrary we increase it to possibly boost
596 +                        * the throughput, as discussed in the
597 +                        * comments to the BUDGET_TIMEOUT case.
598 +                        */
599 +                       if (bfqq->dispatched > 0) /* still outstanding reqs */
600 +                               budget = min(budget * 2, bfqd->bfq_max_budget);
601 +                       else {
602 +                               if (budget > 5 * min_budget)
603 +                                       budget -= 4 * min_budget;
604 +                               else
605 +                                       budget = min_budget;
606 +                       }
607                         break;
608                 case BFQ_BFQQ_BUDGET_TIMEOUT:
609 -                       budget = bfq_default_budget(bfqd, bfqq);
610 +                       /*
611 +                        * We double the budget here because: 1) it
612 +                        * gives the chance to boost the throughput if
613 +                        * this is not a seeky process (which may have
614 +                        * bumped into this timeout because of, e.g.,
615 +                        * ZBR), 2) together with charge_full_budget
616 +                        * it helps give seeky processes higher
617 +                        * timestamps, and hence be served less
618 +                        * frequently.
619 +                        */
620 +                       budget = min(budget * 2, bfqd->bfq_max_budget);
621                         break;
622                 case BFQ_BFQQ_BUDGET_EXHAUSTED:
623 -                       budget = min(budget + 8 * BFQ_BUDGET_STEP,
624 -                                    bfqd->bfq_max_budget);
625 +                       /*
626 +                        * The process still has backlog, and did not
627 +                        * let either the budget timeout or the disk
628 +                        * idling timeout expire. Hence it is not
629 +                        * seeky, has a short thinktime and may be
630 +                        * happy with a higher budget too. So
631 +                        * definitely increase the budget of this good
632 +                        * candidate to boost the disk throughput.
633 +                        */
634 +                       budget = min(budget * 4, bfqd->bfq_max_budget);
635                         break;
636                 case BFQ_BFQQ_NO_MORE_REQUESTS:
637 +                      /*
638 +                       * Leave the budget unchanged.
639 +                       */
640                 default:
641                         return;
642                 }
643 -       } else
644 +       } else /* async queue */
645 +           /* async queues get always the maximum possible budget
646 +            * (their ability to dispatch is limited by
647 +            * @bfqd->bfq_max_budget_async_rq).
648 +            */
649                 budget = bfqd->bfq_max_budget;
650  
651         bfqq->max_budget = budget;
652  
653 -       if (bfqq->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
654 +       if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
655             bfqq->max_budget > bfqd->bfq_max_budget)
656                 bfqq->max_budget = bfqd->bfq_max_budget;
657  
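
In short, the feedback implemented by the switch above adjusts the queue's max_budget as follows; the function below is only a condensed restatement of the sync-queue branch for readability (async queues simply get bfqd->bfq_max_budget), not code from the patch:

static unsigned long feedback_sketch(unsigned long budget,
                                     unsigned long min_budget,
                                     unsigned long max_budget,
                                     int dispatched,
                                     enum bfqq_expiration reason)
{
        switch (reason) {
        case BFQ_BFQQ_TOO_IDLE:
                if (dispatched > 0)     /* requests still outstanding */
                        return min(budget * 2, max_budget);
                /* no outstanding requests: bet on a smaller budget */
                return budget > 5 * min_budget ?
                        budget - 4 * min_budget : min_budget;
        case BFQ_BFQQ_BUDGET_TIMEOUT:
                return min(budget * 2, max_budget);
        case BFQ_BFQQ_BUDGET_EXHAUSTED:
                return min(budget * 4, max_budget);
        case BFQ_BFQQ_NO_MORE_REQUESTS:
        default:
                return budget;          /* leave it unchanged */
        }
}
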
658 @@ -719,30 +807,40 @@ static void __bfq_bfqq_recalc_budget(str
659          */
660         next_rq = bfqq->next_rq;
661         if (next_rq != NULL)
662 -               bfqq->entity.budget = max(bfqq->max_budget,
663 -                                         next_rq->hard_nr_sectors);
664 -       bfq_log_bfqq(bfqd, bfqq, "budget=%lu (%d)", bfqq->entity.budget,
665 -                    bfq_bfqq_sync(bfqq));
666 +               bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
667 +                                           bfq_serv_to_charge(next_rq, bfqq));
668 +       else
669 +               bfqq->entity.budget = bfqq->max_budget;
670 +
671 +       bfq_log_bfqq(bfqd, bfqq, "head sect: %lu, new budget %lu",
672 +                       next_rq != NULL ? next_rq->hard_nr_sectors : 0,
673 +                       bfqq->entity.budget);
674  }
675  
676 -static bfq_service_t bfq_calc_max_budget(u64 peak_rate, u64 timeout)
677 +static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
678  {
679 -       bfq_service_t max_budget;
680 +       unsigned long max_budget;
681  
682         /*
683          * The max_budget calculated when autotuning is equal to the
684 -        * amount of sectors transfered in 0.75 * timeout_sync at the
685 +        * amount of sectors transfered in timeout_sync at the
686          * estimated peak rate.
687          */
688 -       max_budget = (bfq_service_t)(peak_rate * 1000 *
689 +       max_budget = (unsigned long)(peak_rate * 1000 *
690                                      timeout >> BFQ_RATE_SHIFT);
691 -       max_budget -= max_budget / 4;
692  
693         return max_budget;
694  }
695  
696 +/*
697 + * In addition to updating the peak rate, checks whether the process
698 + * is "slow", and returns 1 if so. This slow flag is used, in addition
699 + * to the budget timeout, to reduce the amount of service provided to
700 + * seeky processes, and hence reduce their chances to lower the
701 + * throughput. See the code for more details.
702 + */
703  static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
704 -                               int compensate)
705 +                               int compensate, enum bfqq_expiration reason)
706  {
707         u64 bw, usecs, expected, timeout;
708         ktime_t delta;
709 @@ -775,10 +873,21 @@ static int bfq_update_peak_rate(struct b
710          * the peak rate estimation.
711          */
712         if (usecs > 20000) {
713 -               if (bw > bfqd->peak_rate) {
714 -                       bfqd->peak_rate = bw;
715 +               if (bw > bfqd->peak_rate ||
716 +                  (!BFQQ_SEEKY(bfqq) &&
717 +                   reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
718 +                       bfq_log(bfqd, "measured bw =%llu", bw);
719 +                       /*
720 +                        * To smooth oscillations use a low-pass filter with
721 +                        * alpha=7/8, i.e.,
722 +                        * new_rate = (7/8) * old_rate + (1/8) * bw
723 +                        */
724 +                       do_div(bw, 8);
725 +                       bfqd->peak_rate *= 7;
726 +                       do_div(bfqd->peak_rate, 8);
727 +                       bfqd->peak_rate += bw;
728                         update = 1;
729 -                       bfq_log(bfqd, "peak_rate=%llu", bw);
730 +                       bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
731                 }
732  
733                 update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
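
The update above is a simple low-pass filter with alpha = 7/8: new_rate = (7 * old_rate + measured_bw) / 8 (the patch splits the division with do_div() because peak_rate is a u64 and the code must also build on 32-bit). For example, with an old estimate of 8000 and a measured bw of 16000, the new estimate is 7000 + 2000 = 9000. A minimal equivalent sketch, ignoring the do_div() detail and integer-rounding differences:

static inline u64 peak_rate_filter(u64 old_rate, u64 measured_bw)
{
        /* new_rate = 7/8 * old_rate + 1/8 * measured_bw */
        return (old_rate * 7 + measured_bw) / 8;
}
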
734 @@ -790,11 +899,22 @@ static int bfq_update_peak_rate(struct b
735                     update && bfqd->bfq_user_max_budget == 0) {
736                         bfqd->bfq_max_budget =
737                                 bfq_calc_max_budget(bfqd->peak_rate, timeout);
738 -                       bfq_log(bfqd, "max_budget=%lu", bfqd->bfq_max_budget);
739 +                       bfq_log(bfqd, "new max_budget=%lu",
740 +                               bfqd->bfq_max_budget);
741                 }
742         }
743  
744         /*
745 +        * If the process has been served for a too short time
746 +        * interval to let its possible sequential accesses prevail over
747 +        * the initial seek time needed to move the disk head on the
748 +        * first sector it requested, then give the process a chance
749 +        * and for the moment return false.
750 +        */
751 +       if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
752 +               return 0;
753 +
754 +       /*
755          * A process is considered ``slow'' (i.e., seeky, so that we
756          * cannot treat it fairly in the service domain, as it would
757          * slow down too much the other processes) if, when a slice
758 @@ -804,26 +924,46 @@ static int bfq_update_peak_rate(struct b
759          */
760         expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
761  
762 -       return expected > bfqq->entity.budget;
763 +       /*
764 +        * Caveat: processes doing IO in the slower disk zones will
765 +        * tend to be slow(er) even if not seeky. And the estimated
766 +        * peak rate will actually be an average over the disk
767 +        * surface. Hence, to not be too harsh with unlucky processes,
768 +        * we keep a budget/3 margin of safety before declaring a
769 +        * process slow.
770 +        */
771 +       return expected > (4 * bfqq->entity.budget) / 3;
772  }
773  
774 -/*
775 +/**
776   * bfq_bfqq_expire - expire a queue.
777   * @bfqd: device owning the queue.
778   * @bfqq: the queue to expire.
779   * @compensate: if true, compensate for the time spent idling.
780   * @reason: the reason causing the expiration.
781   *
782 - * The behavior is the following: when a queue expires because it has
783 - * been idling for too much we sync its finish time with the service
784 - * received and decrease its budget.  If @bfqq expires due to budget
785 - * exhaustion we increase its budget and sync its finish time.
786 - * If @bfqq expires due to budget timeout we do not sync its finish time
787 - * to avoid seeky queues to take too much disk time; instead we charge
788 - * it the maximum budget value.  Using the max budget value for all the
789 - * queues that expire due to budget timeout has the effect of using the
790 - * WF2Q+ scheduler to assign timeslices to those queues, without violating
791 - * the service domain guarantees for well-behaved queues.
792 + *
793 + * If the process associated to the queue is slow (i.e., seeky), or in
794 + * case of budget timeout, or, finally, if it is async, we
795 + * artificially charge it an entire budget (independently of the
796 + * actual service it received). As a consequence, the queue will get
797 + * higher timestamps than the correct ones upon reactivation, and
798 + * hence it will be rescheduled as if it had received more service
799 + * than what it actually received. In the end, this class of processes
800 + * will receive less service in proportion to how slowly they consume
801 + * their budgets (and hence how seriously they tend to lower the
802 + * throughput).
803 + *
804 + * In contrast, when a queue expires because it has been idling for
805 + * too much or because it exhausted its budget, we do not touch the
806 + * amount of service it has received. Hence when the queue will be
807 + * reactivated and its timestamps updated, the latter will be in sync
808 + * with the actual service received by the queue until expiration.
809 + *
810 + * Charging a full budget to the first type of queues and the exact
811 + * service to the others has the effect of using the WF2Q+ policy to
812 + * schedule the former on a timeslice basis, without violating the
813 + * service domain guarantees of the latter.
814   */
815  static void bfq_bfqq_expire(struct bfq_data *bfqd,
816                             struct bfq_queue *bfqq,
817 @@ -831,27 +971,42 @@ static void bfq_bfqq_expire(struct bfq_d
818                             enum bfqq_expiration reason)
819  {
820         int slow;
821 +       BUG_ON(bfqq != bfqd->active_queue);
822  
823 -       slow = bfq_update_peak_rate(bfqd, bfqq, compensate);
824 +       /* Update disk peak rate for autotuning and check whether the
825 +        * process is slow (see bfq_update_peak_rate).
826 +        */
827 +       slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
828  
829         /*
830 -        * Treat slow (i.e., seeky) traffic as timed out, to not favor
831 -        * it over sequential traffic (a seeky queue consumes less budget,
832 -        * so it would receive smaller timestamps wrt a sequential one
833 -        * when an idling timer fires).
834 +        * As above explained, 'punish' slow (i.e., seeky), timed-out
835 +        * and async queues, to favor sequential sync workloads.
836 +        *
837 +        * Processes doing IO in the slower disk zones will tend to be
838 +        * slow(er) even if not seeky. Hence, since the estimated peak
839 +        * rate is actually an average over the disk surface, these
840 +        * processes may timeout just for bad luck. To avoid punishing
841 +        * them we do not charge a full budget to a process that
842 +        * succeeded in consuming at least 2/3 of its budget.
843          */
844 -       if (slow && reason == BFQ_BFQQ_TOO_IDLE)
845 -               reason = BFQ_BFQQ_BUDGET_TIMEOUT;
846 -
847 -       if (reason == BFQ_BFQQ_BUDGET_TIMEOUT || !bfq_bfqq_sync(bfqq))
848 +       if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
849 +                    bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3))
850                 bfq_bfqq_charge_full_budget(bfqq);
851  
852 -       bfq_log_bfqq(bfqd, bfqq, "expire (%d, %d)", reason, slow);
853 +       bfq_log_bfqq(bfqd, bfqq,
854 +               "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
855 +               bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
856  
857 +       /* Increase, decrease or leave budget unchanged according to reason */
858         __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
859         __bfq_bfqq_expire(bfqd, bfqq);
860  }
861  
862 +/*
863 + * Budget timeout is not implemented through a dedicated timer, but
864 + * just checked on request arrivals and completions, as well as on
865 + * idle timer expirations.
866 + */
867  static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
868  {
869         if (bfq_bfqq_budget_new(bfqq))
870 @@ -864,6 +1019,22 @@ static int bfq_bfqq_budget_timeout(struc
871  }
872  
873  /*
874 + * If we expire a queue that is waiting for the arrival of a new
875 + * request, we may prevent the fictitious timestamp backshifting that
876 + * allows the guarantees of the queue to be preserved (see [1] for
877 + * this tricky aspect). Hence we return true only if this condition
878 + * does not hold, or if the queue is slow enough to deserve only to be
879 + * kicked off for preserving a high throughput.
880 +*/
881 +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
882 +{
883 +       return (!bfq_bfqq_wait_request(bfqq) ||
884 +               bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3)
885 +               &&
886 +               bfq_bfqq_budget_timeout(bfqq);
887 +}
888 +
889 +/*
890   * Select a queue for service.  If we have a current active queue,
891   * check whether to continue servicing it, or retrieve and set a new one.
892   */
893 @@ -877,10 +1048,10 @@ static struct bfq_queue *bfq_select_queu
894         if (bfqq == NULL)
895                 goto new_queue;
896  
897 -       if (bfq_bfqq_budget_timeout(bfqq)) {
898 -               bfq_bfqq_charge_full_budget(bfqq);
899 +       bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
900 +
901 +       if (bfq_may_expire_for_budg_timeout(bfqq))
902                 goto expire;
903 -       }
904  
905         next_rq = bfqq->next_rq;
906         /*
907 @@ -888,7 +1059,8 @@ static struct bfq_queue *bfq_select_queu
908          * serve them, keep the queue, otherwise expire it.
909          */
910         if (next_rq != NULL) {
911 -               if (next_rq->hard_nr_sectors > bfq_bfqq_budget_left(bfqq)) {
912 +               if (bfq_serv_to_charge(next_rq, bfqq) >
913 +                       bfq_bfqq_budget_left(bfqq)) {
914                         reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
915                         goto expire;
916                 } else
917 @@ -896,9 +1068,8 @@ static struct bfq_queue *bfq_select_queu
918         }
919  
920         /*
921 -        * No requests pending.  If the active queue still has requests in
922 -        * flight or is idling for a new request, allow either of these
923 -        * conditions to happen (or time out) before selecting a new queue.
924 +        * No requests pending.  If the active queue still has
925 +        * requests in flight or is idling for a new request, then keep it.
926          */
927         if (timer_pending(&bfqd->idle_slice_timer) ||
928             (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq))) {
929 @@ -911,6 +1082,7 @@ expire:
930         bfq_bfqq_expire(bfqd, bfqq, 0, reason);
931  new_queue:
932         bfqq = bfq_set_active_queue(bfqd);
933 +       bfq_log(bfqd, "select_queue: new queue returned (possibly NULL)");
934  keep_queue:
935         return bfqq;
936  }
937 @@ -929,13 +1101,15 @@ static int __bfq_dispatch_requests(struc
938  
939         do {
940                 struct request *rq;
941 +               unsigned long service_to_charge;
942  
943                 /* Follow expired path, else get first next available. */
944                 rq = bfq_check_fifo(bfqq);
945                 if (rq == NULL)
946                         rq = bfqq->next_rq;
947 +               service_to_charge = bfq_serv_to_charge(rq, bfqq);
948  
949 -               if (rq->hard_nr_sectors > bfq_bfqq_budget_left(bfqq)) {
950 +               if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
951                         /*
952                          * Expire the queue for budget exhaustion, and
953                          * make sure that the next act_budget is enough
954 @@ -947,9 +1121,41 @@ static int __bfq_dispatch_requests(struc
955                 }
956  
957                 /* Finally, insert request into driver dispatch list. */
958 -               bfq_bfqq_served(bfqq, rq->hard_nr_sectors);
959 +               bfq_bfqq_served(bfqq, service_to_charge);
960                 bfq_dispatch_insert(bfqd->queue, rq);
961  
962 +               if (bfqq->high_weight_budget > 0) { /* queue is being boosted */
963 +                       struct bfq_entity *entity = &bfqq->entity;
964 +
965 +                       bfq_log_bfqq(bfqd, bfqq, "busy period dur %llu msec, "
966 +                                    "old highwbudg %lu",
967 +                                    jiffies_to_msecs(jiffies) -
968 +                                    bfqq->last_activation_time,
969 +                                    bfqq->high_weight_budget);
970 +                       /*
971 +                        * Decrease the budget for weight boosting by
972 +                        * the just received service, or, if too much
973 +                        * time has elapsed from the beginning of this
974 +                        * boosting period, stop it
975 +                        */
976 +                       if (jiffies_to_msecs(jiffies) -
977 +                           bfqq->last_activation_time <= BFQ_BOOST_TIMEOUT
978 +                           &&
979 +                           bfqq->high_weight_budget > service_to_charge)
980 +                               bfqq->high_weight_budget -= service_to_charge;
981 +                       else
982 +                               bfqq->high_weight_budget = 0;
983 +                       entity->ioprio_changed = 1;
984 +                       __bfq_entity_update_weight_prio(
985 +                               bfq_entity_service_tree(entity),
986 +                               entity);
987 +               }
988 +
989 +               bfq_log_bfqq(bfqd, bfqq, "dispatched %lu sec req (%llu), "
990 +                               "budg left %lu",
991 +                               rq->hard_nr_sectors, rq->hard_sector,
992 +                               bfq_bfqq_budget_left(bfqq));
993 +
994                 dispatched++;
995  
996                 if (bfqd->active_cic == NULL) {
997 @@ -961,6 +1167,8 @@ static int __bfq_dispatch_requests(struc
998                         break;
999         } while (dispatched < max_dispatch);
1000  
1001 +       bfq_log_bfqq(bfqd, bfqq, "dispatched %d reqs", dispatched);
1002 +
1003         if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
1004             dispatched >= bfqd->bfq_max_budget_async_rq) ||
1005             bfq_class_idle(bfqq)))
1006 @@ -1009,7 +1217,7 @@ static int bfq_forced_dispatch(struct bf
1007                 st = bfq_entity_service_tree(&bfqq->entity);
1008  
1009                 dispatched += __bfq_forced_dispatch_bfqq(bfqq);
1010 -               bfqq->max_budget = bfq_default_budget(bfqd, bfqq);
1011 +               bfqq->max_budget = bfq_max_budget(bfqd);
1012  
1013                 bfq_forget_idle(st);
1014         }
1015 @@ -1025,6 +1233,7 @@ static int bfq_dispatch_requests(struct 
1016         struct bfq_queue *bfqq;
1017         int dispatched;
1018  
1019 +       bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
1020         if (bfqd->busy_queues == 0)
1021                 return 0;
1022  
1023 @@ -1056,9 +1265,11 @@ static int bfq_dispatch_requests(struct 
1024                 BUG_ON(timer_pending(&bfqd->idle_slice_timer));
1025  
1026                 dispatched += __bfq_dispatch_requests(bfqd, bfqq, max_dispatch);
1027 +               bfq_log_bfqq(bfqd, bfqq, "total dispatched increased to %d "
1028 +                            "(max_disp %d)", dispatched, max_dispatch);
1029         }
1030  
1031 -       bfq_log(bfqd, "dispatched=%d", dispatched);
1032 +       bfq_log(bfqd, "final total dispatched=%d", dispatched);
1033         return dispatched;
1034  }
1035  
1036 @@ -1074,6 +1285,7 @@ static void bfq_put_queue(struct bfq_que
1037  
1038         BUG_ON(atomic_read(&bfqq->ref) <= 0);
1039  
1040 +       bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
1041         if (!atomic_dec_and_test(&bfqq->ref))
1042                 return;
1043  
1044 @@ -1083,7 +1295,7 @@ static void bfq_put_queue(struct bfq_que
1045         BUG_ON(bfq_bfqq_busy(bfqq));
1046         BUG_ON(bfqd->active_queue == bfqq);
1047  
1048 -       bfq_log_bfqq(bfqd, bfqq, "freed");
1049 +       bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
1050  
1051         kmem_cache_free(bfq_pool, bfqq);
1052  }
1053 @@ -1095,6 +1307,7 @@ static void bfq_exit_bfqq(struct bfq_dat
1054                 bfq_schedule_dispatch(bfqd);
1055         }
1056  
1057 +       bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
1058         bfq_put_queue(bfqq);
1059  }
1060  
1061 @@ -1116,7 +1329,7 @@ static void bfq_init_prio_data(struct bf
1062                 printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
1063         case IOPRIO_CLASS_NONE:
1064                 /*
1065 -                * no prio set, inherit CPU scheduling settings
1066 +                * No prio set, inherit CPU scheduling settings.
1067                  */
1068                 bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
1069                 bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
1070 @@ -1139,8 +1352,8 @@ static void bfq_init_prio_data(struct bf
1071         bfqq->entity.ioprio_changed = 1;
1072  
1073         /*
1074 -        * keep track of original prio settings in case we have to temporarily
1075 -        * elevate the priority of this queue
1076 +        * Keep track of original prio settings in case we have to temporarily
1077 +        * elevate the priority of this queue.
1078          */
1079         bfqq->org_ioprio = bfqq->entity.new_ioprio;
1080         bfqq->org_ioprio_class = bfqq->entity.new_ioprio_class;
1081 @@ -1167,6 +1380,9 @@ static void bfq_changed_ioprio(struct io
1082                                          GFP_ATOMIC);
1083                 if (new_bfqq != NULL) {
1084                         cic->cfqq[ASYNC] = new_bfqq;
1085 +                       bfq_log_bfqq(bfqd, bfqq,
1086 +                                    "changed_ioprio: bfqq %p %d",
1087 +                                    bfqq, bfqq->ref);
1088                         bfq_put_queue(bfqq);
1089                 }
1090         }
1091 @@ -1233,9 +1449,13 @@ retry:
1092                                 bfq_mark_bfqq_idle_window(bfqq);
1093                         bfq_mark_bfqq_sync(bfqq);
1094                 }
1095 -               bfqq->max_budget = bfq_default_budget(bfqd, bfqq);
1096 +               /* Tentative initial value to trade off between thr and lat */
1097 +               bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
1098                 bfqq->pid = current->pid;
1099  
1100 +               bfqq->last_activation_time = 0;
1101 +               bfqq->high_weight_budget = 0;
1102 +
1103                 bfq_log_bfqq(bfqd, bfqq, "allocated");
1104         }
1105  
1106 @@ -1285,14 +1505,17 @@ static struct bfq_queue *bfq_get_queue(s
1107         }
1108  
1109         /*
1110 -        * pin the queue now that it's allocated, scheduler exit will prune it
1111 +        * Pin the queue now that it's allocated, scheduler exit will prune it.
1112          */
1113         if (!is_sync && *async_bfqq == NULL) {
1114                 atomic_inc(&bfqq->ref);
1115 +               bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
1116 +                            bfqq, bfqq->ref);
1117                 *async_bfqq = bfqq;
1118         }
1119  
1120         atomic_inc(&bfqq->ref);
1121 +       bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
1122         return bfqq;
1123  }
1124  
1125 @@ -1309,35 +1532,35 @@ static void bfq_update_io_thinktime(stru
1126  
1127  static void bfq_update_io_seektime(struct bfq_data *bfqd,
1128                                    struct bfq_queue *bfqq,
1129 -                                  struct cfq_io_context *cic,
1130                                    struct request *rq)
1131  {
1132         sector_t sdist;
1133         u64 total;
1134  
1135 -       if (cic->last_request_pos < rq->sector)
1136 -               sdist = rq->sector - cic->last_request_pos;
1137 +       if (bfqq->last_request_pos < rq->sector)
1138 +               sdist = rq->sector - bfqq->last_request_pos;
1139         else
1140 -               sdist = cic->last_request_pos - rq->sector;
1141 +               sdist = bfqq->last_request_pos - rq->sector;
1142  
1143         /*
1144          * Don't allow the seek distance to get too large from the
1145          * odd fragment, pagein, etc.
1146          */
1147 -       if (cic->seek_samples == 0) /* first request, not really a seek */
1148 +       if (bfqq->seek_samples == 0) /* first request, not really a seek */
1149                 sdist = 0;
1150 -       else if (cic->seek_samples <= 60) /* second&third seek */
1151 -               sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1152 +       else if (bfqq->seek_samples <= 60) /* second & third seek */
1153 +               sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
1154         else
1155 -               sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
1156 +               sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
1157  
1158 -       cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1159 -       cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1160 -       total = cic->seek_total + (cic->seek_samples/2);
1161 -       do_div(total, cic->seek_samples);
1162 -       cic->seek_mean = (sector_t)total;
1163 +       bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
1164 +       bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
1165 +       total = bfqq->seek_total + (bfqq->seek_samples/2);
1166 +       do_div(total, bfqq->seek_samples);
1167 +       bfqq->seek_mean = (sector_t)total;
1168  
1169 -       bfq_log_bfqq(bfqd, bfqq, "dist=%lu mean=%lu", sdist, cic->seek_mean);
1170 +       bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
1171 +                       (u64)bfqq->seek_mean);
1172  }
1173  
1174  /*
1175 @@ -1357,7 +1580,7 @@ static void bfq_update_idle_window(struc
1176         enable_idle = bfq_bfqq_idle_window(bfqq);
1177  
1178         if (atomic_read(&cic->ioc->nr_tasks) == 0 ||
1179 -           bfqd->bfq_slice_idle == 0 || (bfqd->hw_tag && CIC_SEEKY(cic)))
1180 +           bfqd->bfq_slice_idle == 0 || (bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
1181                 enable_idle = 0;
1182         else if (bfq_sample_valid(cic->ttime_samples)) {
1183                 if (cic->ttime_mean > bfqd->bfq_slice_idle)
1184 @@ -1365,14 +1588,13 @@ static void bfq_update_idle_window(struc
1185                 else
1186                         enable_idle = 1;
1187         }
1188 +       bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
1189 +               enable_idle);
1190  
1191         if (enable_idle)
1192                 bfq_mark_bfqq_idle_window(bfqq);
1193         else
1194                 bfq_clear_bfqq_idle_window(bfqq);
1195 -
1196 -       bfq_log_bfqq(bfqd, bfqq, "idle_window=%d (%d)",
1197 -                    enable_idle, CIC_SEEKY(cic));
1198  }
1199  
1200  /*
1201 @@ -1388,20 +1610,37 @@ static void bfq_rq_enqueued(struct bfq_d
1202                 bfqq->meta_pending++;
1203  
1204         bfq_update_io_thinktime(bfqd, cic);
1205 -       bfq_update_io_seektime(bfqd, bfqq, cic, rq);
1206 -       bfq_update_idle_window(bfqd, bfqq, cic);
1207 +       bfq_update_io_seektime(bfqd, bfqq, rq);
1208 +       if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
1209 +           !BFQQ_SEEKY(bfqq))
1210 +               bfq_update_idle_window(bfqd, bfqq, cic);
1211 +
1212 +       bfq_log_bfqq(bfqd, bfqq,
1213 +                    "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
1214 +                    bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
1215 +                    (u64)bfqq->seek_mean);
1216  
1217 -       cic->last_request_pos = rq->sector + rq->nr_sectors;
1218 +       bfqq->last_request_pos = rq->sector + rq->nr_sectors;
1219  
1220 -       if (bfqq == bfqd->active_queue && bfq_bfqq_wait_request(bfqq)) {
1221 -               /*
1222 -                * If we are waiting for a request for this queue, let it rip
1223 -                * immediately and flag that we must not expire this queue
1224 -                * just now.
1225 -                */
1226 -               bfq_clear_bfqq_wait_request(bfqq);
1227 -               del_timer(&bfqd->idle_slice_timer);
1228 -               blk_start_queueing(bfqd->queue);
1229 +       if (bfqq == bfqd->active_queue) {
1230 +               if (bfq_bfqq_wait_request(bfqq)) {
1231 +                       /*
1232 +                        * If we are waiting for a request for this queue, let
1233 +                        * it rip immediately and flag that we must not expire
1234 +                        * this queue just now.
1235 +                        */
1236 +                       bfq_clear_bfqq_wait_request(bfqq);
1237 +                       del_timer(&bfqd->idle_slice_timer);
1238 +                       /*
1239 +                        * Here we can safely expire the queue, in
1240 +                        * case of budget timeout, without wasting
1241 +                        * guarantees
1242 +                        */
1243 +                       if (bfq_bfqq_budget_timeout(bfqq))
1244 +                               bfq_bfqq_expire(bfqd, bfqq, 0,
1245 +                                               BFQ_BFQQ_BUDGET_TIMEOUT);
1246 +                       blk_start_queueing(bfqd->queue);
1247 +               }
1248         }
1249  }
1250  
1251 @@ -1410,6 +1649,7 @@ static void bfq_insert_request(struct re
1252         struct bfq_data *bfqd = q->elevator->elevator_data;
1253         struct bfq_queue *bfqq = RQ_BFQQ(rq);
1254  
1255 +       assert_spin_locked(bfqd->queue->queue_lock);
1256         bfq_init_prio_data(bfqq, RQ_CIC(rq)->ioc);
1257  
1258         bfq_add_rq_rb(rq);
1259 @@ -1447,7 +1687,8 @@ static void bfq_completed_request(struct
1260         struct bfq_data *bfqd = bfqq->bfqd;
1261         const int sync = rq_is_sync(rq);
1262  
1263 -       bfq_log_bfqq(bfqd, bfqq, "complete");
1264 +       bfq_log_bfqq(bfqd, bfqq, "completed %lu sects req (%d)",
1265 +                       rq->hard_nr_sectors, sync);
1266  
1267         bfq_update_hw_tag(bfqd);
1268  
1269 @@ -1470,7 +1711,7 @@ static void bfq_completed_request(struct
1270                 if (bfq_bfqq_budget_new(bfqq))
1271                         bfq_set_budget_timeout(bfqd);
1272  
1273 -               if (bfq_bfqq_budget_timeout(bfqq))
1274 +               if (bfq_may_expire_for_budg_timeout(bfqq))
1275                         bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
1276                 else if (sync && bfqd->rq_in_driver == 0 &&
1277                          RB_EMPTY_ROOT(&bfqq->sort_list))
1278 @@ -1489,7 +1730,7 @@ static void bfq_prio_boost(struct bfq_qu
1279  {
1280         if (has_fs_excl()) {
1281                 /*
1282 -                * boost idle prio on transactions that would lock out other
1283 +                * Boost idle prio on transactions that would lock out other
1284                  * users of the filesystem
1285                  */
1286                 if (bfq_class_idle(bfqq))
1287 @@ -1498,7 +1739,7 @@ static void bfq_prio_boost(struct bfq_qu
1288                         bfqq->entity.new_ioprio = IOPRIO_NORM;
1289         } else {
1290                 /*
1291 -                * check if we need to unboost the queue
1292 +                * Check if we need to unboost the queue
1293                  */
1294                 if (bfqq->entity.new_ioprio_class != bfqq->org_ioprio_class)
1295                         bfqq->entity.new_ioprio_class = bfqq->org_ioprio_class;
1296 @@ -1527,7 +1768,7 @@ static int bfq_may_queue(struct request_
1297         /*
1298          * Don't force setup of a queue from here, as a call to may_queue
1299          * does not necessarily imply that a request actually will be queued.
1300 -        * so just lookup a possibly existing queue, or return 'may queue'
1301 +        * So just lookup a possibly existing queue, or return 'may queue'
1302          * if that fails.
1303          */
1304         cic = bfq_cic_lookup(bfqd, tsk->io_context);
1305 @@ -1546,7 +1787,7 @@ static int bfq_may_queue(struct request_
1306  }
1307  
1308  /*
1309 - * queue lock held here
1310 + * Queue lock held here.
1311   */
1312  static void bfq_put_request(struct request *rq)
1313  {
1314 @@ -1563,6 +1804,8 @@ static void bfq_put_request(struct reque
1315                 rq->elevator_private = NULL;
1316                 rq->elevator_private2 = NULL;
1317  
1318 +               bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
1319 +                            bfqq, bfqq->ref);
1320                 bfq_put_queue(bfqq);
1321         }
1322  }
1323 @@ -1603,6 +1846,7 @@ static int bfq_set_request(struct reques
1324  
1325         bfqq->allocated[rw]++;
1326         atomic_inc(&bfqq->ref);
1327 +       bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
1328  
1329         spin_unlock_irqrestore(q->queue_lock, flags);
1330  
1331 @@ -1634,7 +1878,8 @@ static void bfq_kick_queue(struct work_s
1332  }
1333  
1334  /*
1335 - * Timer running if the active_queue is currently idling inside its time slice
1336 + * Handler of the expiration of the timer running if the active_queue
1337 + * is idling inside its time slice.
1338   */
1339  static void bfq_idle_slice_timer(unsigned long data)
1340  {
1341 @@ -1643,8 +1888,6 @@ static void bfq_idle_slice_timer(unsigne
1342         unsigned long flags;
1343         enum bfqq_expiration reason;
1344  
1345 -       bfq_log(bfqd, "slice_timer expired");
1346 -
1347         spin_lock_irqsave(bfqd->queue->queue_lock, flags);
1348  
1349         bfqq = bfqd->active_queue;
1350 @@ -1657,8 +1900,14 @@ static void bfq_idle_slice_timer(unsigne
1351          * we just expire a queue too early.
1352          */
1353         if (bfqq != NULL) {
1354 +               bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
1355                 reason = BFQ_BFQQ_TOO_IDLE;
1356                 if (bfq_bfqq_budget_timeout(bfqq))
1357 +                       /*
1358 +                        * Also here the queue can be safely expired
1359 +                        * for budget timeout without wasting
1360 +                        * guarantees
1361 +                        */
1362                         reason = BFQ_BFQQ_BUDGET_TIMEOUT;
1363  
1364                 bfq_bfqq_expire(bfqd, bfqq, 1, reason);
1365 @@ -1681,8 +1930,11 @@ static inline void __bfq_put_async_bfqq(
1366         struct bfq_group *root_group = bfqd->root_group;
1367         struct bfq_queue *bfqq = *bfqq_ptr;
1368  
1369 +       bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
1370         if (bfqq != NULL) {
1371                 bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
1372 +               bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
1373 +                            bfqq, bfqq->ref);
1374                 bfq_put_queue(bfqq);
1375                 *bfqq_ptr = NULL;
1376         }
1377 @@ -1705,7 +1957,7 @@ static void bfq_put_async_queues(struct 
1378         __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
1379  }
1380  
1381 -static void bfq_exit_queue(elevator_t *e)
1382 +static void bfq_exit_queue(struct elevator_queue *e)
1383  {
1384         struct bfq_data *bfqd = e->elevator_data;
1385         struct request_queue *q = bfqd->queue;
1386 @@ -1772,7 +2024,7 @@ static void *bfq_init_queue(struct reque
1387  
1388         bfqd->hw_tag = 1;
1389  
1390 -       bfqd->bfq_max_budget = bfq_max_budget;
1391 +       bfqd->bfq_max_budget = bfq_default_max_budget;
1392  
1393         bfqd->bfq_quantum = bfq_quantum;
1394         bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
1395 @@ -1784,6 +2036,8 @@ static void *bfq_init_queue(struct reque
1396         bfqd->bfq_timeout[ASYNC] = bfq_timeout_async;
1397         bfqd->bfq_timeout[SYNC] = bfq_timeout_sync;
1398  
1399 +       bfqd->low_latency = true;
1400 +
1401         return bfqd;
1402  }
1403  
1404 @@ -1819,16 +2073,19 @@ static ssize_t bfq_var_show(unsigned int
1405         return sprintf(page, "%d\n", var);
1406  }
1407  
1408 -static ssize_t bfq_var_store(unsigned int *var, const char *page, size_t count)
1409 +static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
1410  {
1411 -       char *p = (char *)page;
1412 +       unsigned long new_val;
1413 +       int ret = strict_strtoul(page, 10, &new_val);
1414 +
1415 +       if (ret == 0)
1416 +               *var = new_val;
1417  
1418 -       *var = simple_strtoul(p, &p, 10);
1419         return count;
1420  }
1421  
1422  #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                           \
1423 -static ssize_t __FUNC(elevator_t *e, char *page)                       \
1424 +static ssize_t __FUNC(struct elevator_queue *e, char *page)            \
1425  {                                                                      \
1426         struct bfq_data *bfqd = e->elevator_data;                       \
1427         unsigned int __data = __VAR;                                    \
1428 @@ -1846,13 +2103,15 @@ SHOW_FUNCTION(bfq_max_budget_show, bfqd-
1429  SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
1430  SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[SYNC], 1);
1431  SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[ASYNC], 1);
1432 +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
1433  #undef SHOW_FUNCTION
1434  
1435  #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                        \
1436 -static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)   \
1437 +static ssize_t                                                         \
1438 +__FUNC(struct elevator_queue *e, const char *page, size_t count)       \
1439  {                                                                      \
1440         struct bfq_data *bfqd = e->elevator_data;                       \
1441 -       unsigned int __data;                                            \
1442 +       unsigned long __data;                                           \
1443         int ret = bfq_var_store(&__data, (page), count);                \
1444         if (__data < (MIN))                                             \
1445                 __data = (MIN);                                         \
1446 @@ -1879,21 +2138,21 @@ STORE_FUNCTION(bfq_timeout_async_store, 
1447                 INT_MAX, 1);
1448  #undef STORE_FUNCTION
1449  
1450 -static inline bfq_service_t bfq_estimated_max_budget(struct bfq_data *bfqd)
1451 +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
1452  {
1453         u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[SYNC]);
1454  
1455         if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
1456                 return bfq_calc_max_budget(bfqd->peak_rate, timeout);
1457         else
1458 -               return bfq_max_budget;
1459 +               return bfq_default_max_budget;
1460  }
1461  
1462 -static ssize_t bfq_max_budget_store(elevator_t *e, const char *page,
1463 -                                   size_t count)
1464 +static ssize_t bfq_max_budget_store(struct elevator_queue *e,
1465 +                                   const char *page, size_t count)
1466  {
1467         struct bfq_data *bfqd = e->elevator_data;
1468 -       unsigned int __data;
1469 +       unsigned long __data;
1470         int ret = bfq_var_store(&__data, (page), count);
1471  
1472         if (__data == 0)
1473 @@ -1909,11 +2168,11 @@ static ssize_t bfq_max_budget_store(elev
1474         return ret;
1475  }
1476  
1477 -static ssize_t bfq_timeout_sync_store(elevator_t *e, const char *page,
1478 -                                     size_t count)
1479 +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
1480 +                                     const char *page, size_t count)
1481  {
1482         struct bfq_data *bfqd = e->elevator_data;
1483 -       unsigned int __data;
1484 +       unsigned long __data;
1485         int ret = bfq_var_store(&__data, (page), count);
1486  
1487         if (__data < 1)
1488 @@ -1928,6 +2187,20 @@ static ssize_t bfq_timeout_sync_store(el
1489         return ret;
1490  }
1491  
1492 +static ssize_t bfq_low_latency_store(struct elevator_queue *e,
1493 +                                    const char *page, size_t count)
1494 +{
1495 +       struct bfq_data *bfqd = e->elevator_data;
1496 +       unsigned long __data;
1497 +       int ret = bfq_var_store(&__data, (page), count);
1498 +
1499 +       if (__data > 1)
1500 +               __data = 1;
1501 +       bfqd->low_latency = __data;
1502 +
1503 +       return ret;
1504 +}
1505 +
1506  #define BFQ_ATTR(name) \
1507         __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
1508  
1509 @@ -1942,12 +2215,13 @@ static struct elv_fs_entry bfq_attrs[] =
1510         BFQ_ATTR(max_budget_async_rq),
1511         BFQ_ATTR(timeout_sync),
1512         BFQ_ATTR(timeout_async),
1513 +       BFQ_ATTR(low_latency),
1514         __ATTR_NULL
1515  };
1516  
1517  static struct elevator_type iosched_bfq = {
1518         .ops = {
1519 -               .elevator_merge_fn =            bfq_merge,
1520 +               .elevator_merge_fn =            bfq_merge,
1521                 .elevator_merged_fn =           bfq_merged_request,
1522                 .elevator_merge_req_fn =        bfq_merged_requests,
1523                 .elevator_allow_merge_fn =      bfq_allow_merge,
1524 @@ -1974,7 +2248,7 @@ static struct elevator_type iosched_bfq 
1525  static int __init bfq_init(void)
1526  {
1527         /*
1528 -        * can be 0 on HZ < 1000 setups
1529 +        * Can be 0 on HZ < 1000 setups.
1530          */
1531         if (bfq_slice_idle == 0)
1532                 bfq_slice_idle = 1;
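
The seek statistics that the hunks above move from cfq_io_context into bfq_queue form a decaying average: each new distance is clamped, folded into seek_total/seek_samples with a 7/8 weight, and seek_mean is their rounded quotient. Below is a minimal user-space sketch of that update, reusing the field names and clamping constants of bfq_update_io_seektime() above; the request positions in main() are made up for illustration only.

#include <stdio.h>

/* Per-queue seek statistics, mirroring the fields this patch adds to
 * struct bfq_queue (seek_samples, seek_total, seek_mean). */
struct seek_stats {
	unsigned int samples;
	unsigned long long total;	/* scaled by 256, like seek_total */
	unsigned long long mean;	/* in sectors */
};

static unsigned long long clamp_dist(unsigned long long sdist,
				     unsigned long long limit)
{
	return sdist < limit ? sdist : limit;
}

/* Same update as bfq_update_io_seektime() above, outside the kernel. */
static void update_seek(struct seek_stats *s, unsigned long long last_pos,
			unsigned long long new_pos)
{
	unsigned long long sdist = new_pos > last_pos ? new_pos - last_pos
						      : last_pos - new_pos;

	if (s->samples == 0)		/* first request, not really a seek */
		sdist = 0;
	else if (s->samples <= 60)	/* clamp early samples hard */
		sdist = clamp_dist(sdist, s->mean * 4 + 2*1024*1024);
	else
		sdist = clamp_dist(sdist, s->mean * 4 + 2*1024*64);

	/* exponentially weighted running average with 7/8 decay */
	s->samples = (7 * s->samples + 256) / 8;
	s->total   = (7 * s->total + 256ULL * sdist) / 8;
	s->mean    = (s->total + s->samples / 2) / s->samples;
}

int main(void)
{
	/* made-up request positions (sectors): a sequential run, then jumps */
	unsigned long long pos[] = { 0, 8, 16, 500000, 500008, 1000000 };
	struct seek_stats s = { 0, 0, 0 };
	int i;

	for (i = 1; i < 6; i++)
		update_seek(&s, pos[i - 1], pos[i]);

	printf("samples=%u mean=%llu sectors\n", s.samples, s.mean);
	return 0;
}

Because the first sample never counts as a seek and early samples are clamped against four times the running mean, a queue only ends up flagged seeky after a sustained run of large jumps, which is what the idle-window logic above keys on.
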
1533 diff -uprN linux-2.6.28/block/bfq-sched.c linux-2.6.28.new/block/bfq-sched.c
1534 --- linux-2.6.28/block/bfq-sched.c      2011-06-13 21:41:23.766107399 +0200
1535 +++ linux-2.6.28.new/block/bfq-sched.c  2011-06-13 21:41:41.639927001 +0200
1536 @@ -28,7 +28,7 @@ static int bfq_update_next_active(struct
1537                 return 0;
1538  
1539         /*
1540 -        * NOTE: this can be improved in may ways, such as returning
1541 +        * NOTE: this can be improved in many ways, such as returning
1542          * 1 (and thus propagating upwards the update) only when the
1543          * budget changes, or caching the bfqq that will be scheduled
1544          * next from this subtree.  By now we worry more about
1545 @@ -85,20 +85,33 @@ static inline void bfq_check_next_active
1546   *
1547   * Return @a > @b, dealing with wrapping correctly.
1548   */
1549 -static inline int bfq_gt(bfq_timestamp_t a, bfq_timestamp_t b)
1550 +static inline int bfq_gt(u64 a, u64 b)
1551  {
1552         return (s64)(a - b) > 0;
1553  }
1554  
1555 +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
1556 +{
1557 +       struct bfq_queue *bfqq = NULL;
1558 +
1559 +       BUG_ON(entity == NULL);
1560 +
1561 +       if (entity->my_sched_data == NULL)
1562 +               bfqq = container_of(entity, struct bfq_queue, entity);
1563 +
1564 +       return bfqq;
1565 +}
1566 +
1567 +
1568  /**
1569   * bfq_delta - map service into the virtual time domain.
1570   * @service: amount of service.
1571 - * @weight: scale factor.
1572 + * @weight: scale factor (weight of an entity or weight sum).
1573   */
1574 -static inline bfq_timestamp_t bfq_delta(bfq_service_t service,
1575 -                                       bfq_weight_t weight)
1576 +static inline u64 bfq_delta(unsigned long service,
1577 +                                       unsigned long weight)
1578  {
1579 -       bfq_timestamp_t d = (bfq_timestamp_t)service << WFQ_SERVICE_SHIFT;
1580 +       u64 d = (u64)service << WFQ_SERVICE_SHIFT;
1581  
1582         do_div(d, weight);
1583         return d;
1584 @@ -110,11 +123,25 @@ static inline bfq_timestamp_t bfq_delta(
1585   * @service: the service to be charged to the entity.
1586   */
1587  static inline void bfq_calc_finish(struct bfq_entity *entity,
1588 -                                  bfq_service_t service)
1589 +                                  unsigned long service)
1590  {
1591 +       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1592 +
1593         BUG_ON(entity->weight == 0);
1594  
1595 -       entity->finish = entity->start + bfq_delta(service, entity->weight);
1596 +       entity->finish = entity->start +
1597 +               bfq_delta(service, entity->weight);
1598 +
1599 +       if (bfqq != NULL) {
1600 +               bfq_log_bfqq(bfqq->bfqd, bfqq,
1601 +                       "calc_finish: serv %lu, w %d, hi-budg %lu",
1602 +                       service, entity->weight,
1603 +                       bfqq->high_weight_budget);
1604 +               bfq_log_bfqq(bfqq->bfqd, bfqq,
1605 +                       "calc_finish: start %llu, finish %llu, delta %llu",
1606 +                       entity->start, entity->finish,
1607 +                       bfq_delta(service, entity->weight));
1608 +       }
1609  }
1610  
1611  /**
1612 @@ -150,18 +177,6 @@ static inline void bfq_extract(struct rb
1613         rb_erase(&entity->rb_node, root);
1614  }
1615  
1616 -static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
1617 -{
1618 -       struct bfq_queue *bfqq = NULL;
1619 -
1620 -       BUG_ON(entity == NULL);
1621 -
1622 -       if (entity->my_sched_data == NULL)
1623 -               bfqq = container_of(entity, struct bfq_queue, entity);
1624 -
1625 -       return bfqq;
1626 -}
1627 -
1628  /**
1629   * bfq_idle_extract - extract an entity from the idle tree.
1630   * @st: the service tree of the owning @entity.
1631 @@ -325,12 +340,26 @@ static void bfq_active_insert(struct bfq
1632   * bfq_ioprio_to_weight - calc a weight from an ioprio.
1633   * @ioprio: the ioprio value to convert.
1634   */
1635 -static bfq_weight_t bfq_ioprio_to_weight(int ioprio)
1636 +static unsigned short bfq_ioprio_to_weight(int ioprio)
1637  {
1638         WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
1639         return IOPRIO_BE_NR - ioprio;
1640  }
1641  
1642 +/**
1643 + * bfq_weight_to_ioprio - calc an ioprio from a weight.
1644 + * @weight: the weight value to convert.
1645 + *
1646 + * To preserve as much as possible the old only-ioprio user interface,
1647 + * 0 is used as an escape ioprio value for weights (numerically) equal
1648 + * to or larger than IOPRIO_BE_NR.
1649 + */
1650 +static unsigned short bfq_weight_to_ioprio(int weight)
1651 +{
1652 +       WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
1653 +       return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
1654 +}
1655 +
1656  static inline void bfq_get_entity(struct bfq_entity *entity)
1657  {
1658         struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1659 @@ -339,6 +368,8 @@ static inline void bfq_get_entity(struct
1660         if (bfqq != NULL) {
1661                 sd = entity->sched_data;
1662                 atomic_inc(&bfqq->ref);
1663 +               bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
1664 +                            bfqq, bfqq->ref);
1665         }
1666  }
1667  
1668 @@ -437,6 +468,8 @@ static void bfq_forget_entity(struct bfq
1669         st->wsum -= entity->weight;
1670         if (bfqq != NULL) {
1671                 sd = entity->sched_data;
1672 +               bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
1673 +                            bfqq, bfqq->ref);
1674                 bfq_put_queue(bfqq);
1675         }
1676  }
1677 @@ -478,6 +511,62 @@ static void bfq_forget_idle(struct bfq_s
1678                 bfq_put_idle_entity(st, first_idle);
1679  }
1680  
1681 +static struct bfq_service_tree *
1682 +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
1683 +                        struct bfq_entity *entity)
1684 +{
1685 +       struct bfq_service_tree *new_st = old_st;
1686 +
1687 +       if (entity->ioprio_changed) {
1688 +               int new_boost_coeff = 1;
1689 +               struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1690 +
1691 +               if (bfqq != NULL) {
1692 +                       new_boost_coeff +=
1693 +                               bfqq->high_weight_budget * BFQ_BOOST_COEFF /
1694 +                               BFQ_BOOST_BUDGET;
1695 +                       bfq_log_bfqq(bfqq->bfqd, bfqq,
1696 +                               "update_w_prio: wght %d, hi-budg %lu, coef %d",
1697 +                               entity->weight, bfqq->high_weight_budget,
1698 +                               new_boost_coeff);
1699 +               }
1700 +
1701 +               BUG_ON(old_st->wsum < entity->weight);
1702 +               old_st->wsum -= entity->weight;
1703 +
1704 +               if (entity->new_weight != entity->orig_weight) {
1705 +                       entity->orig_weight = entity->new_weight;
1706 +                       entity->ioprio =
1707 +                               bfq_weight_to_ioprio(entity->orig_weight);
1708 +               } else if (entity->new_ioprio != entity->ioprio) {
1709 +                       entity->ioprio = entity->new_ioprio;
1710 +                       entity->orig_weight =
1711 +                                       bfq_ioprio_to_weight(entity->ioprio);
1712 +               } else
1713 +                       entity->new_weight = entity->orig_weight =
1714 +                               bfq_ioprio_to_weight(entity->ioprio);
1715 +
1716 +               entity->ioprio_class = entity->new_ioprio_class;
1717 +               entity->ioprio_changed = 0;
1718 +
1719 +               /*
1720 +                * NOTE: here we may be changing the weight too early,
1721 +                * this will cause unfairness.  The correct approach
1722 +                * would have required additional complexity to defer
1723 +                * weight changes to the proper time instants (i.e.,
1724 +                * when entity->finish <= old_st->vtime).
1725 +                */
1726 +               new_st = bfq_entity_service_tree(entity);
1727 +               entity->weight = entity->orig_weight * new_boost_coeff;
1728 +               new_st->wsum += entity->weight;
1729 +
1730 +               if (new_st != old_st)
1731 +                       entity->start = new_st->vtime;
1732 +       }
1733 +
1734 +       return new_st;
1735 +}
1736 +
1737  /**
1738   * bfq_bfqq_served - update the scheduler status after selection for service.
1739   * @bfqq: the queue being served.
1740 @@ -487,7 +576,7 @@ static void bfq_forget_idle(struct bfq_s
1741   * are synchronized every time a new bfqq is selected for service.  By now,
1742   * we keep it to better check consistency.
1743   */
1744 -static void bfq_bfqq_served(struct bfq_queue *bfqq, bfq_service_t served)
1745 +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
1746  {
1747         struct bfq_entity *entity = &bfqq->entity;
1748         struct bfq_service_tree *st;
1749 @@ -496,13 +585,13 @@ static void bfq_bfqq_served(struct bfq_q
1750                 st = bfq_entity_service_tree(entity);
1751  
1752                 entity->service += served;
1753 -
1754                 WARN_ON_ONCE(entity->service > entity->budget);
1755                 BUG_ON(st->wsum == 0);
1756  
1757                 st->vtime += bfq_delta(served, st->wsum);
1758                 bfq_forget_idle(st);
1759         }
1760 +       bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu sects", served);
1761  }
1762  
1763  /**
1764 @@ -515,42 +604,13 @@ static void bfq_bfqq_served(struct bfq_q
1765   * budget.  In this way we should obtain a sort of time-domain
1766   * fairness among all the seeky/slow queues.
1767   */
1768 -static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
1769 +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
1770  {
1771         struct bfq_entity *entity = &bfqq->entity;
1772  
1773 -       bfq_bfqq_served(bfqq, entity->budget - entity->service);
1774 -}
1775 +       bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
1776  
1777 -static struct bfq_service_tree *
1778 -__bfq_entity_update_prio(struct bfq_service_tree *old_st,
1779 -                        struct bfq_entity *entity)
1780 -{
1781 -       struct bfq_service_tree *new_st = old_st;
1782 -
1783 -       if (entity->ioprio_changed) {
1784 -               entity->ioprio = entity->new_ioprio;
1785 -               entity->ioprio_class = entity->new_ioprio_class;
1786 -               entity->ioprio_changed = 0;
1787 -
1788 -               old_st->wsum -= entity->weight;
1789 -               entity->weight = bfq_ioprio_to_weight(entity->ioprio);
1790 -
1791 -               /*
1792 -                * NOTE: here we may be changing the weight too early,
1793 -                * this will cause unfairness.  The correct approach
1794 -                * would have required additional complexity to defer
1795 -                * weight changes to the proper time instants (i.e.,
1796 -                * when entity->finish <= old_st->vtime).
1797 -                */
1798 -               new_st = bfq_entity_service_tree(entity);
1799 -               new_st->wsum += entity->weight;
1800 -
1801 -               if (new_st != old_st)
1802 -                       entity->start = new_st->vtime;
1803 -       }
1804 -
1805 -       return new_st;
1806 +       bfq_bfqq_served(bfqq, entity->budget - entity->service);
1807  }
1808  
1809  /**
1810 @@ -607,7 +667,7 @@ static void __bfq_activate_entity(struct
1811                 entity->on_st = 1;
1812         }
1813  
1814 -       st = __bfq_entity_update_prio(st, entity);
1815 +       st = __bfq_entity_update_weight_prio(st, entity);
1816         bfq_calc_finish(entity, entity->budget);
1817         bfq_active_insert(st, entity);
1818  }
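
The bfq-sched.c hunks above drop the bfq_timestamp_t/bfq_weight_t/bfq_service_t typedefs in favour of plain u64/unsigned long and fold weight boosting into __bfq_entity_update_weight_prio(): while a queue still has high_weight_budget to spend, its effective weight becomes orig_weight * (1 + high_weight_budget * BFQ_BOOST_COEFF / BFQ_BOOST_BUDGET), and bfq_delta() maps served sectors into virtual time as (service << WFQ_SERVICE_SHIFT) / weight. The stand-alone sketch below walks through that arithmetic; it assumes WFQ_SERVICE_SHIFT is 22 (its usual value in bfq.h) and uses illustrative weight and budget values.

#include <stdio.h>

#define WFQ_SERVICE_SHIFT 22		/* assumed, as in bfq.h */
#define BFQ_BOOST_COEFF   10		/* from the bfq.h hunk below */
#define BFQ_BOOST_BUDGET  49152

/* service -> virtual time, as in bfq_delta() (plain division instead
 * of do_div(), since this is a user-space sketch) */
static unsigned long long bfq_delta(unsigned long service, unsigned long weight)
{
	return ((unsigned long long)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	unsigned long orig_weight = 4;		/* ioprio 4 -> weight 8 - 4 */
	unsigned long high_weight_budget = 24576; /* half a boost budget left */

	/* boost coefficient, as in __bfq_entity_update_weight_prio() */
	unsigned long coeff = 1 + high_weight_budget * BFQ_BOOST_COEFF
				/ BFQ_BOOST_BUDGET;
	unsigned long weight = orig_weight * coeff;

	/* finish = start + bfq_delta(budget, weight), as in bfq_calc_finish() */
	unsigned long long start = 0, budget = 16384;

	printf("coeff=%lu weight=%lu finish=%llu\n",
	       coeff, weight, start + bfq_delta(budget, weight));
	return 0;
}

A larger effective weight shrinks bfq_delta(), so a boosted queue accumulates virtual time more slowly and is picked ahead of equally backlogged peers until its high_weight_budget is used up.
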
1819 diff -uprN linux-2.6.28/block/bfq.h linux-2.6.28.new/block/bfq.h
1820 --- linux-2.6.28/block/bfq.h    2011-06-13 21:41:23.766107399 +0200
1821 +++ linux-2.6.28.new/block/bfq.h        2011-06-13 21:41:41.639927001 +0200
1822 @@ -1,5 +1,5 @@
1823  /*
1824 - * BFQ: data structures and common functions prototypes.
1825 + * BFQ-v1-r1 for 2.6.28: data structures and common functions prototypes.
1826   *
1827   * Based on ideas and code from CFQ:
1828   * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
1829 @@ -21,12 +21,22 @@
1830  
1831  #define BFQ_IOPRIO_CLASSES     3
1832  
1833 -#define BFQ_DEFAULT_GRP_IOPRIO 4
1834 +#define BFQ_MIN_WEIGHT 1
1835 +#define BFQ_MAX_WEIGHT 1000
1836 +
1837 +#define BFQ_DEFAULT_GRP_WEIGHT 10
1838 +#define BFQ_DEFAULT_GRP_IOPRIO 0
1839  #define BFQ_DEFAULT_GRP_CLASS  IOPRIO_CLASS_BE
1840  
1841 -typedef u64 bfq_timestamp_t;
1842 -typedef unsigned long bfq_weight_t;
1843 -typedef unsigned long bfq_service_t;
1844 +/* Constants used in weight boosting (in turn used to reduce latencies): */
1845 +/* max factor by which the weight of a boosted queue is multiplied */
1846 +#define BFQ_BOOST_COEFF        10
1847 +/* max number of sectors that can be served during a boosting period */
1848 +#define BFQ_BOOST_BUDGET       49152
1849 +/* max duration of a boosting period, msec */
1850 +#define BFQ_BOOST_TIMEOUT      6000
1851 +/* min idle period after which boosting may be reactivated for a queue, msec */
1852 +#define BFQ_MIN_ACT_INTERVAL   20000
1853  
1854  struct bfq_entity;
1855  
1856 @@ -51,8 +61,8 @@ struct bfq_service_tree {
1857         struct bfq_entity *first_idle;
1858         struct bfq_entity *last_idle;
1859  
1860 -       bfq_timestamp_t vtime;
1861 -       bfq_weight_t wsum;
1862 +       u64 vtime;
1863 +       unsigned long wsum;
1864  };
1865  
1866  /**
1867 @@ -92,17 +102,19 @@ struct bfq_sched_data {
1868   *             this entity; used for O(log N) lookups into active trees.
1869   * @service: service received during the last round of service.
1870   * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
1871 - * @weight: weight of the queue, calculated as IOPRIO_BE_NR - @ioprio.
1872 + * @weight: weight of the queue
1873   * @parent: parent entity, for hierarchical scheduling.
1874   * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
1875   *                 associated scheduler queue, %NULL on leaf nodes.
1876   * @sched_data: the scheduler queue this entity belongs to.
1877   * @ioprio: the ioprio in use.
1878 - * @new_ioprio: when an ioprio change is requested, the new ioprio value
1879 + * @new_weight: when a weight change is requested, the new weight value.
1880 + * @orig_weight: original weight, used to implement weight boosting
1881 + * @new_ioprio: when an ioprio change is requested, the new ioprio value.
1882   * @ioprio_class: the ioprio_class in use.
1883   * @new_ioprio_class: when an ioprio_class change is requested, the new
1884   *                    ioprio_class value.
1885 - * @ioprio_changed: flag, true when the user requested an ioprio or
1886 + * @ioprio_changed: flag, true when the user requested a weight, ioprio or
1887   *                  ioprio_class change.
1888   *
1889   * A bfq_entity is used to represent either a bfq_queue (leaf node in the
1890 @@ -111,36 +123,39 @@ struct bfq_sched_data {
1891   * hierarchy.  Non-leaf entities have also their own sched_data, stored
1892   * in @my_sched_data.
1893   *
1894 - * Each entity stores independently its priority values; this would allow
1895 - * different weights on different devices, but this functionality is not
1896 - * exported to userspace by now.  Priorities are updated lazily, first
1897 - * storing the new values into the new_* fields, then setting the
1898 - * @ioprio_changed flag.  As soon as there is a transition in the entity
1899 - * state that allows the priority update to take place the effective and
1900 - * the requested priority values are synchronized.
1901 - *
1902 - * The weight value is calculated from the ioprio to export the same
1903 - * interface as CFQ.  When dealing with ``well-behaved'' queues (i.e.,
1904 - * queues that do not spend too much time to consume their budget and
1905 - * have true sequential behavior, and when there are no external factors
1906 - * breaking anticipation) the relative weights at each level of the
1907 - * cgroups hierarchy should be guaranteed.
1908 - * All the fields are protected by the queue lock of the containing bfqd.
1909 + * Each entity stores independently its priority values; this would
1910 + * allow different weights on different devices, but this
1911 + * functionality is not exported to userspace by now.  Priorities and
1912 + * weights are updated lazily, first storing the new values into the
1913 + * new_* fields, then setting the @ioprio_changed flag.  As soon as
1914 + * there is a transition in the entity state that allows the priority
1915 + * update to take place the effective and the requested priority
1916 + * values are synchronized.
1917 + *
1918 + * Unless cgroups are used, the weight value is calculated from the
1919 + * ioprio to export the same interface as CFQ.  When dealing with
1920 + * ``well-behaved'' queues (i.e., queues that do not spend too much
1921 + * time to consume their budget and have true sequential behavior, and
1922 + * when there are no external factors breaking anticipation) the
1923 + * relative weights at each level of the cgroups hierarchy should be
1924 + * guaranteed.  All the fields are protected by the queue lock of the
1925 + * containing bfqd.
1926   */
1927  struct bfq_entity {
1928         struct rb_node rb_node;
1929  
1930         int on_st;
1931  
1932 -       bfq_timestamp_t finish;
1933 -       bfq_timestamp_t start;
1934 +       u64 finish;
1935 +       u64 start;
1936  
1937         struct rb_root *tree;
1938  
1939 -       bfq_timestamp_t min_start;
1940 +       u64 min_start;
1941  
1942 -       bfq_service_t service, budget;
1943 -       bfq_weight_t weight;
1944 +       unsigned long service, budget;
1945 +       unsigned short weight, new_weight;
1946 +       unsigned short orig_weight;
1947  
1948         struct bfq_entity *parent;
1949  
1950 @@ -168,6 +183,7 @@ struct bfq_group;
1951   *                   completed requests .
1952   * @hw_tag_samples: nr of samples used to calculate hw_tag.
1953   * @hw_tag: flag set to one if the driver is showing a queueing behavior.
1954 + * @budgets_assigned: number of budgets assigned.
1955   * @idle_slice_timer: timer set when idling for the next sequential request
1956   *                    from the queue under service.
1957   * @unplug_work: delayed work to restart dispatching on the request queue.
1958 @@ -216,6 +232,8 @@ struct bfq_data {
1959         int hw_tag_samples;
1960         int hw_tag;
1961  
1962 +       int budgets_assigned;
1963 +
1964         struct timer_list idle_slice_timer;
1965         struct work_struct unplug_work;
1966  
1967 @@ -228,7 +246,7 @@ struct bfq_data {
1968         ktime_t last_idling_start;
1969         int peak_rate_samples;
1970         u64 peak_rate;
1971 -       bfq_service_t bfq_max_budget;
1972 +       unsigned long bfq_max_budget;
1973  
1974         struct list_head cic_list;
1975         struct hlist_head group_list;
1976 @@ -244,6 +262,8 @@ struct bfq_data {
1977         unsigned int bfq_user_max_budget;
1978         unsigned int bfq_max_budget_async_rq;
1979         unsigned int bfq_timeout[2];
1980 +
1981 +       bool low_latency;
1982  };
1983  
1984  /**
1985 @@ -260,12 +280,17 @@ struct bfq_data {
1986   * @max_budget: maximum budget allowed from the feedback mechanism.
1987   * @budget_timeout: budget expiration (in jiffies).
1988   * @dispatched: number of requests on the dispatch list or inside driver.
1989 - * @budgets_assigned: number of budgets assigned.
1990   * @org_ioprio: saved ioprio during boosted periods.
1991   * @org_ioprio_class: saved ioprio_class during boosted periods.
1992   * @flags: status flags.
1993   * @bfqq_list: node for active/idle bfqq list inside our bfqd.
1994 + * @seek_samples: number of seeks sampled
1995 + * @seek_total: sum of the distances of the seeks sampled
1996 + * @seek_mean: mean seek distance
1997 + * @last_request_pos: position of the last request enqueued
1998   * @pid: pid of the process owning the queue, used for logging purposes.
1999 + * @last_activation_time: time of the last (idle -> backlogged) transition
2000 + * @high_weight_budget: number of sectors left to serve with boosted weight
2001   *
2002   * A bfq_queue is a leaf request queue; it can be associated to an io_context
2003   * or more (if it is an async one).  @cgroup holds a reference to the
2004 @@ -287,11 +312,10 @@ struct bfq_queue {
2005  
2006         struct bfq_entity entity;
2007  
2008 -       bfq_service_t max_budget;
2009 +       unsigned long max_budget;
2010         unsigned long budget_timeout;
2011  
2012         int dispatched;
2013 -       int budgets_assigned;
2014  
2015         unsigned short org_ioprio;
2016         unsigned short org_ioprio_class;
2017 @@ -300,7 +324,15 @@ struct bfq_queue {
2018  
2019         struct list_head bfqq_list;
2020  
2021 +       unsigned int seek_samples;
2022 +       u64 seek_total;
2023 +       sector_t seek_mean;
2024 +       sector_t last_request_pos;
2025 +
2026         pid_t pid;
2027 +
2028 +       u64 last_activation_time;
2029 +       unsigned long high_weight_budget;
2030  };
2031  
2032  enum bfqq_state_flags {
2033 @@ -400,6 +432,7 @@ struct bfq_group {
2034  /**
2035   * struct bfqio_cgroup - bfq cgroup data structure.
2036   * @css: subsystem state for bfq in the containing cgroup.
2037 + * @weight: cgroup weight.
2038   * @ioprio: cgroup ioprio.
2039   * @ioprio_class: cgroup ioprio_class.
2040   * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
2041 @@ -411,7 +444,7 @@ struct bfq_group {
2042  struct bfqio_cgroup {
2043         struct cgroup_subsys_state css;
2044  
2045 -       unsigned short ioprio, ioprio_class;
2046 +       unsigned short weight, ioprio, ioprio_class;
2047  
2048         spinlock_t lock;
2049         struct hlist_head group_data;