Changes to series and rx51_defconfig file for BFQ
[kernel-bfs] / kernel-bfs-2.6.28 / debian / patches / block-switch-from-BFQ-for-2.6.28-to-BFQ-v1.patch
1 diff -uprN linux-2.6.28/block/bfq-cgroup.c linux-2.6.28.new/block/bfq-cgroup.c
2 --- linux-2.6.28/block/bfq-cgroup.c     2011-06-13 21:41:23.758108202 +0200
3 +++ linux-2.6.28.new/block/bfq-cgroup.c 2011-06-13 21:41:41.631928601 +0200
4 @@ -6,10 +6,13 @@
5   *
6   * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
7   *                   Paolo Valente <paolo.valente@unimore.it>
8 + *
9 + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
10   */
11  
12  #ifdef CONFIG_CGROUP_BFQIO
13  static struct bfqio_cgroup bfqio_root_cgroup = {
14 +       .weight = BFQ_DEFAULT_GRP_WEIGHT,
15         .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
16         .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
17  };
18 @@ -17,6 +20,8 @@ static struct bfqio_cgroup bfqio_root_cg
19  static inline void bfq_init_entity(struct bfq_entity *entity,
20                                    struct bfq_group *bfqg)
21  {
22 +       entity->weight = entity->new_weight;
23 +       entity->orig_weight = entity->new_weight;
24         entity->ioprio = entity->new_ioprio;
25         entity->ioprio_class = entity->new_ioprio_class;
26         entity->parent = bfqg->my_entity;
27 @@ -54,6 +59,8 @@ static inline void bfq_group_init_entity
28  {
29         struct bfq_entity *entity = &bfqg->entity;
30  
31 +       entity->weight = entity->new_weight = bgrp->weight;
32 +       entity->orig_weight = entity->new_weight;
33         entity->ioprio = entity->new_ioprio = bgrp->ioprio;
34         entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
35         entity->ioprio_changed = 1;
36 @@ -301,6 +308,9 @@ static struct bfq_group *__bfq_cic_chang
37  
38                 if (entity->sched_data != &bfqg->sched_data) {
39                         cic_set_bfqq(cic, NULL, 0);
40 +                       bfq_log_bfqq(bfqd, async_bfqq,
41 +                                    "cic_change_group: %p %d",
42 +                                    async_bfqq, async_bfqq->ref);
43                         bfq_put_queue(async_bfqq);
44                 }
45         }
46 @@ -453,6 +463,7 @@ static void bfq_disconnect_groups(struct
47         struct hlist_node *pos, *n;
48         struct bfq_group *bfqg;
49  
50 +       bfq_log(bfqd, "disconnect_groups beginning");
51         hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) {
52                 hlist_del(&bfqg->bfqd_node);
53  
54 @@ -466,6 +477,9 @@ static void bfq_disconnect_groups(struct
55                  * to the list.
56                  */
57                 rcu_assign_pointer(bfqg->bfqd, NULL);
58 +
59 +               bfq_log(bfqd, "disconnect_groups: put async for group %p",
60 +                       bfqg);
61                 bfq_put_async_queues(bfqd, bfqg);
62         }
63  }
64 @@ -475,6 +489,8 @@ static inline void bfq_free_root_group(s
65         struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
66         struct bfq_group *bfqg = bfqd->root_group;
67  
68 +       bfq_put_async_queues(bfqd, bfqg);
69 +
70         spin_lock_irq(&bgrp->lock);
71         hlist_del_rcu(&bfqg->group_node);
72         spin_unlock_irq(&bgrp->lock);
73 @@ -529,6 +545,7 @@ static u64 bfqio_cgroup_##__VAR##_read(s
74         return ret;                                                     \
75  }
76  
77 +SHOW_FUNCTION(weight);
78  SHOW_FUNCTION(ioprio);
79  SHOW_FUNCTION(ioprio_class);
80  #undef SHOW_FUNCTION
81 @@ -551,9 +568,9 @@ static int bfqio_cgroup_##__VAR##_write(
82         bgrp = cgroup_to_bfqio(cgroup);                                 \
83                                                                         \
84         spin_lock_irq(&bgrp->lock);                                     \
85 -       bgrp->__VAR = (unsigned char)val;                               \
86 +       bgrp->__VAR = (unsigned short)val;                              \
87         hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) {  \
88 -               bfqg->entity.new_##__VAR = (unsigned char)val;          \
89 +               bfqg->entity.new_##__VAR = (unsigned short)val;         \
90                 smp_wmb();                                              \
91                 bfqg->entity.ioprio_changed = 1;                        \
92         }                                                               \
93 @@ -564,12 +581,18 @@ static int bfqio_cgroup_##__VAR##_write(
94         return 0;                                                       \
95  }
96  
97 +STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
98  STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
99  STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
100  #undef STORE_FUNCTION
101  
102  static struct cftype bfqio_files[] = {
103         {
104 +               .name = "weight",
105 +               .read_u64 = bfqio_cgroup_weight_read,
106 +               .write_u64 = bfqio_cgroup_weight_write,
107 +       },
108 +       {
109                 .name = "ioprio",
110                 .read_u64 = bfqio_cgroup_ioprio_read,
111                 .write_u64 = bfqio_cgroup_ioprio_write,
112 @@ -697,6 +720,8 @@ struct cgroup_subsys bfqio_subsys = {
113  static inline void bfq_init_entity(struct bfq_entity *entity,
114                                    struct bfq_group *bfqg)
115  {
116 +       entity->weight = entity->new_weight;
117 +       entity->orig_weight = entity->new_weight;
118         entity->ioprio = entity->new_ioprio;
119         entity->ioprio_class = entity->new_ioprio_class;
120         entity->sched_data = &bfqg->sched_data;
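
The hunks above expose the new per-group "weight" attribute through the existing SHOW_FUNCTION/STORE_FUNCTION macros. For readability, here is a rough sketch of what STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT) expands to, reconstructed from the macro body shown in the hunk above; the local declarations and the range check are assumptions, not text copied from the patch:

static int bfqio_cgroup_weight_write(struct cgroup *cgroup,
                                     struct cftype *cftype, u64 val)
{
        struct bfqio_cgroup *bgrp;
        struct bfq_group *bfqg;
        struct hlist_node *n;

        /* assumed: reject values outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT] */
        if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
                return -EINVAL;

        bgrp = cgroup_to_bfqio(cgroup);

        spin_lock_irq(&bgrp->lock);
        /* store the new weight and propagate it to every group entity */
        bgrp->weight = (unsigned short)val;
        hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) {
                bfqg->entity.new_weight = (unsigned short)val;
                smp_wmb();
                bfqg->entity.ioprio_changed = 1;
        }
        spin_unlock_irq(&bgrp->lock);

        return 0;
}
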
121 diff -uprN linux-2.6.28/block/bfq-iosched.c linux-2.6.28.new/block/bfq-iosched.c
122 --- linux-2.6.28/block/bfq-iosched.c    2011-06-13 21:41:23.766107399 +0200
123 +++ linux-2.6.28.new/block/bfq-iosched.c        2011-06-13 21:42:46.519829365 +0200
124 @@ -7,23 +7,36 @@
125   * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
126   *                   Paolo Valente <paolo.valente@unimore.it>
127   *
128 - * BFQ is a proportional share disk scheduling algorithm based on CFQ,
129 - * that uses the B-WF2Q+ internal scheduler to assign budgets (i.e.,
130 - * slices in the service domain) to the tasks accessing the disk.  It
131 - * has been introduced in [1], where the interested reader can find an
132 - * accurate description of the algorithm, the guarantees it provides
133 - * and their formal proofs.  With respect to the algorithm presented
134 - * in the paper, this implementation adds a timeout to limit the maximum
135 - * time a queue can spend to complete its assigned budget, and a
136 - * hierarchical extension, based on H-WF2Q+.
137 + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
138 + *
139 + * BFQ is a proportional share disk scheduling algorithm based on the
140 + * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
141 + * measured in number of sectors, to tasks instead of time slices.
142 + * The disk is not granted to the active task for a given time slice,
143 + * but until it has exhausted its assigned budget.  This change from
144 + * the time to the service domain allows BFQ to distribute the disk
145 + * bandwidth among tasks as desired, without any distortion due to
146 + * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
147 + * internal scheduler, called B-WF2Q+, to schedule tasks according to
148 + * their budgets.  Thanks to this accurate scheduler, BFQ can afford
149 + * to assign high budgets to disk-bound non-seeky tasks (to boost the
150 + * throughput), and yet guarantee low latencies to interactive and
151 + * soft real-time applications.
152 + *
153 + * BFQ has been introduced in [1], where the interested reader can
154 + * find an accurate description of the algorithm, the bandwidth
155 + * distribution and latency guarantees it provides, plus formal proofs
156 + * of all the properties.  With respect to the algorithm presented in
157 + * the paper, this implementation adds several little heuristics, and
158 + * a hierarchical extension, based on H-WF2Q+.
159   *
160   * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
161   * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
162   * complexity derives from the one introduced with EEVDF in [3].
163   *
164   * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
165 - *     with Deterministic Guarantees on Bandwidth Distribution,'' to be
166 - *     published.
167 + *     with Deterministic Guarantees on Bandwidth Distribution,'' to appear
168 + *     in IEEE Transactions on Computers.
169   *
170   *     http://algo.ing.unimo.it/people/paolo/disk_sched/bfq.pdf
171   *
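
For a sense of scale for the budgets discussed above: they are measured in 512-byte sectors, so the default maximum budget set below (bfq_default_max_budget = 16 * 1024 sectors) corresponds to 8 MiB of service per budget.
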
172 @@ -40,6 +53,7 @@
173   *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
174   */
175  #include <linux/module.h>
176 +#include <linux/slab.h>
177  #include <linux/blkdev.h>
178  #include <linux/cgroup.h>
179  #include <linux/elevator.h>
180 @@ -47,26 +61,33 @@
181  #include <linux/ioprio.h>
182  #include "bfq.h"
183  
184 -/* Max nr of dispatches in one round of service. */
185 +/* Max number of dispatches in one round of service. */
186  static const int bfq_quantum = 4;
187  
188 -/* Expiration time of each request (jiffies). */
189 +/* Expiration time of sync (0) and async (1) requests, in jiffies. */
190  static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
191  
192  /* Maximum backwards seek, in KiB. */
193  static const int bfq_back_max = 16 * 1024;
194  
195 -/* Penalty of a backwards seek. */
196 +/* Penalty of a backwards seek, in number of sectors. */
197  static const int bfq_back_penalty = 2;
198  
199 -/* Idling period duration (jiffies). */
200 +/* Idling period duration, in jiffies. */
201  static int bfq_slice_idle = HZ / 125;
202  
203 -/* Default maximum budget values (sectors). */
204 -static const int bfq_max_budget = 16 * 1024;
205 +/* Default maximum budget values, in sectors and number of requests. */
206 +static const int bfq_default_max_budget = 16 * 1024;
207  static const int bfq_max_budget_async_rq = 4;
208  
209 -/* Default timeout values (jiffies), approximating CFQ defaults. */
210 +/*
211 + * Async to sync throughput distribution is controlled as follows:
212 + * when an async request is served, the entity is charged the number
213 + * of sectors of the request, multiplied by the factor below.
214 + */
215 +static const int bfq_async_charge_factor = 10;
216 +
217 +/* Default timeout values, in jiffies, approximating CFQ defaults. */
218  static const int bfq_timeout_sync = HZ / 8;
219  static int bfq_timeout_async = HZ / 25;
220  
221 @@ -84,8 +105,7 @@ static DEFINE_SPINLOCK(bfq_ioc_gone_lock
222  #define BFQ_HW_QUEUE_THRESHOLD 4
223  #define BFQ_HW_QUEUE_SAMPLES   32
224  
225 -/* Budget feedback step. */
226 -#define BFQ_BUDGET_STEP                128
227 +#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > (8 * 1024))
228  
229  /* Min samples used for peak rate estimation (for autotuning). */
230  #define BFQ_PEAK_RATE_SAMPLES  32
231 @@ -104,15 +124,10 @@ static DEFINE_SPINLOCK(bfq_ioc_gone_lock
232  #include "bfq-sched.c"
233  #include "bfq-cgroup.c"
234  
235 -static inline int bfq_class_idle(struct bfq_queue *bfqq)
236 -{
237 -       return bfqq->entity.ioprio_class == IOPRIO_CLASS_IDLE;
238 -}
239 +#define bfq_class_idle(bfqq)   ((bfqq)->entity.ioprio_class ==\
240 +                                IOPRIO_CLASS_IDLE)
241  
242 -static inline int bfq_sample_valid(int samples)
243 -{
244 -       return samples > 80;
245 -}
246 +#define bfq_sample_valid(samples)      ((samples) > 80)
247  
248  /*
249   * We regard a request as SYNC, if either it's a read or has the SYNC bit
250 @@ -180,7 +195,7 @@ static struct request *bfq_choose_req(st
251         last = bfqd->last_position;
252  
253         /*
254 -        * by definition, 1KiB is 2 sectors
255 +        * By definition, 1KiB is 2 sectors.
256          */
257         back_max = bfqd->bfq_back_max * 2;
258  
259 @@ -282,17 +297,24 @@ static void bfq_del_rq_rb(struct request
260                 bfq_del_bfqq_busy(bfqd, bfqq, 1);
261  }
262  
263 +/* see the definition of bfq_async_charge_factor for details */
264 +static inline unsigned long bfq_serv_to_charge(struct request *rq,
265 +                                              struct bfq_queue *bfqq)
266 +{
267 +       return rq->hard_nr_sectors *
268 +               (1 + ((!bfq_bfqq_sync(bfqq)) * bfq_async_charge_factor));
269 +}
270 +
271  /**
272   * bfq_updated_next_req - update the queue after a new next_rq selection.
273   * @bfqd: the device data the queue belongs to.
274   * @bfqq: the queue to update.
275   *
276 - * Whenever the first request of a queue changes we try to allocate it
277 - * enough service (if it has grown), or to anticipate its finish time
278 - * (if it has shrinked), to reduce the time it has to wait, still taking
279 - * into account the queue budget.  We try to avoid the queue having not
280 - * enough service allocated for its first request, thus having to go
281 - * through two dispatch rounds to actually dispatch the request.
282 + * If the first request of a queue changes we make sure that the queue
283 + * has enough budget to serve at least its first request (if the
284 + * request has grown).  We do this because if the queue has not enough
285 + * budget for its first request, it has to go through two dispatch
286 + * rounds to actually get it dispatched.
287   */
288  static void bfq_updated_next_req(struct bfq_data *bfqd,
289                                  struct bfq_queue *bfqq)
290 @@ -300,7 +322,7 @@ static void bfq_updated_next_req(struct 
291         struct bfq_entity *entity = &bfqq->entity;
292         struct bfq_service_tree *st = bfq_entity_service_tree(entity);
293         struct request *next_rq = bfqq->next_rq;
294 -       bfq_service_t new_budget;
295 +       unsigned long new_budget;
296  
297         if (next_rq == NULL)
298                 return;
299 @@ -315,9 +337,10 @@ static void bfq_updated_next_req(struct 
300         BUG_ON(entity->tree != &st->active);
301         BUG_ON(entity == entity->sched_data->active_entity);
302  
303 -       new_budget = max(bfqq->max_budget, next_rq->hard_nr_sectors);
304 +       new_budget = max_t(unsigned long, bfqq->max_budget,
305 +                          bfq_serv_to_charge(next_rq, bfqq));
306         entity->budget = new_budget;
307 -       bfq_log_bfqq(bfqd, bfqq, "budget=%lu", new_budget);
308 +       bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
309         bfq_activate_bfqq(bfqd, bfqq);
310  }
311  
312 @@ -328,6 +351,7 @@ static void bfq_add_rq_rb(struct request
313         struct bfq_data *bfqd = bfqq->bfqd;
314         struct request *__alias, *next_rq;
315  
316 +       bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
317         bfqq->queued[rq_is_sync(rq)]++;
318         bfqd->queued++;
319  
320 @@ -339,15 +363,33 @@ static void bfq_add_rq_rb(struct request
321                 bfq_dispatch_insert(bfqd->queue, __alias);
322  
323         /*
324 -        * check if this request is a better next-serve candidate
325 +        * Check if this request is a better next-serve candidate.
326          */
327         next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq);
328         BUG_ON(next_rq == NULL);
329         bfqq->next_rq = next_rq;
330  
331         if (!bfq_bfqq_busy(bfqq)) {
332 -               entity->budget = max(bfqq->max_budget,
333 -                                    next_rq->hard_nr_sectors);
334 +               entity->budget = max_t(unsigned long, bfqq->max_budget,
335 +                                      bfq_serv_to_charge(next_rq, bfqq));
336 +
337 +               /*
338 +                * If the queue is not being boosted and has been idle
339 +                * for enough time, start a boosting period
340 +                */
341 +               if (bfqd->low_latency && bfqq->high_weight_budget == 0) {
342 +                       if (bfqq->last_activation_time + BFQ_MIN_ACT_INTERVAL <
343 +                           jiffies_to_msecs(jiffies)) {
344 +                               bfqq->high_weight_budget = BFQ_BOOST_BUDGET;
345 +                               entity->ioprio_changed = 1;
346 +                               bfq_log_bfqq(bfqd, bfqq,
347 +                                            "wboost starting at %lu msec",
348 +                                            bfqq->last_activation_time);
349 +                       }
350 +                       bfqq->last_activation_time =
351 +                               jiffies_to_msecs(jiffies);
352 +               }
353 +
354                 bfq_add_bfqq_busy(bfqd, bfqq);
355         } else
356                 bfq_updated_next_req(bfqd, bfqq);
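
The hunk above is where weight boosting (the low_latency heuristic) starts: a queue that becomes busy after having been idle for more than BFQ_MIN_ACT_INTERVAL msec is granted high_weight_budget = BFQ_BOOST_BUDGET, and the boost is then consumed in the dispatch path (see the __bfq_dispatch_requests hunk further down). A condensed restatement of the lifecycle, with the two constants taken from bfq.h (not shown in this patch):

if (bfqd->low_latency && bfqq->high_weight_budget == 0) {
        if (bfqq->last_activation_time + BFQ_MIN_ACT_INTERVAL <
            jiffies_to_msecs(jiffies)) {
                /* idle long enough: start a boosting period */
                bfqq->high_weight_budget = BFQ_BOOST_BUDGET;
                bfqq->entity.ioprio_changed = 1;
        }
        bfqq->last_activation_time = jiffies_to_msecs(jiffies);
}
/* The boost is consumed in __bfq_dispatch_requests: the remaining
 * high_weight_budget is decreased by each charged service, and dropped
 * to zero once BFQ_BOOST_TIMEOUT msec have elapsed since activation. */
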
357 @@ -446,7 +488,7 @@ static void bfq_merged_requests(struct r
358                                 struct request *next)
359  {
360         /*
361 -        * reposition in fifo if next is older than rq
362 +        * Reposition in fifo if next is older than rq.
363          */
364         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
365             time_before(next->start_time, rq->start_time))
366 @@ -475,10 +517,7 @@ static int bfq_allow_merge(struct reques
367                 return 0;
368  
369         bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio));
370 -       if (bfqq == RQ_BFQQ(rq))
371 -               return 1;
372 -
373 -       return 0;
374 +       return bfqq == RQ_BFQQ(rq);
375  }
376  
377  static void __bfq_set_active_queue(struct bfq_data *bfqd,
378 @@ -489,9 +528,10 @@ static void __bfq_set_active_queue(struc
379                 bfq_mark_bfqq_budget_new(bfqq);
380                 bfq_clear_bfqq_fifo_expire(bfqq);
381  
382 -               bfqq->budgets_assigned = (bfqq->budgets_assigned*7 + 256) / 8;
383 +               bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
384  
385 -               bfq_log_bfqq(bfqd, bfqq, "active");
386 +               bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
387 +                            bfqq->entity.budget);
388         }
389  
390         bfqd->active_queue = bfqq;
391 @@ -509,7 +549,26 @@ static struct bfq_queue *bfq_set_active_
392         return bfqq;
393  }
394  
395 -#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
396 +/*
397 + * If enough samples have been computed, return the current max budget
398 + * stored in bfqd, which is dynamically updated according to the
399 + * estimated disk peak rate; otherwise return the default max budget
400 + */
401 +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
402 +{
403 +       return bfqd->budgets_assigned < 194 ? bfq_default_max_budget :
404 +               bfqd->bfq_max_budget;
405 +}
406 +
407 +/*
408 + * Return min budget, which is a fraction of the current or default
409 + * max budget (trying with 1/32)
410 + */
411 +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
412 +{
413 +       return bfqd->budgets_assigned < 194 ? bfq_default_max_budget / 32 :
414 +               bfqd->bfq_max_budget / 32;
415 +}
416  
417  static void bfq_arm_slice_timer(struct bfq_data *bfqd)
418  {
419 @@ -531,19 +590,30 @@ static void bfq_arm_slice_timer(struct b
420         bfq_mark_bfqq_wait_request(bfqq);
421  
422         /*
423 -        * we don't want to idle for seeks, but we do want to allow
424 +        * We don't want to idle for seeks, but we do want to allow
425          * fair distribution of slice time for a process doing back-to-back
426 -        * seeks. so allow a little bit of time for him to submit a new rq
427 +        * seeks. So allow a little bit of time for him to submit a new rq.
428 +        *
429 +        * To prevent processes with (partly) seeky workloads from
430 +        * being too ill-treated, grant them a small fraction of the
431 +        * assigned budget before reducing the waiting time to
432 +        * BFQ_MIN_TT. In practice, this helps reduce latency.
433          */
434         sl = bfqd->bfq_slice_idle;
435 -       if (bfq_sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
436 +       if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
437 +           bfqq->entity.service > bfq_max_budget(bfqd) / 8)
438                 sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
439  
440         bfqd->last_idling_start = ktime_get();
441         mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
442 -       bfq_log(bfqd, "arm idle: %lu", sl);
443 +       bfq_log(bfqd, "arm idle: %lu ms", sl);
444  }
445  
446 +/*
447 + * Set the maximum time for the active queue to consume its
448 + * budget. This prevents seeky processes from lowering the disk
449 + * throughput (always guaranteed with a time slice scheme as in CFQ).
450 + */
451  static void bfq_set_budget_timeout(struct bfq_data *bfqd)
452  {
453         struct bfq_queue *bfqq = bfqd->active_queue;
454 @@ -552,7 +622,8 @@ static void bfq_set_budget_timeout(struc
455  
456         bfq_clear_bfqq_budget_new(bfqq);
457         bfqq->budget_timeout = jiffies +
458 -               bfqd->bfq_timeout[!!bfq_bfqq_sync(bfqq)];
459 +               bfqd->bfq_timeout[!!bfq_bfqq_sync(bfqq)] *
460 +               (bfqq->entity.weight / bfqq->entity.orig_weight);
461  }
462  
463  /*
464 @@ -572,7 +643,7 @@ static void bfq_dispatch_insert(struct r
465  }
466  
467  /*
468 - * return expired entry, or NULL to just start from scratch in rbtree
469 + * Return expired entry, or NULL to just start from scratch in rbtree.
470   */
471  static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
472  {
473 @@ -597,7 +668,7 @@ static struct request *bfq_check_fifo(st
474         return rq;
475  }
476  
477 -static inline bfq_service_t bfq_bfqq_budget_left(struct bfq_queue *bfqq)
478 +static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
479  {
480         struct bfq_entity *entity = &bfqq->entity;
481         return entity->budget - entity->service;
482 @@ -616,98 +687,115 @@ static void __bfq_bfqq_expire(struct bfq
483  }
484  
485  /**
486 - * bfq_default_budget - return the default budget for @bfqq on @bfqd.
487 - * @bfqd: the device descriptor.
488 - * @bfqq: the queue to consider.
489 - *
490 - * We use 3/4 of the @bfqd maximum budget as the default value
491 - * for the max_budget field of the queues.  This lets the feedback
492 - * mechanism to start from some middle ground, then the behavior
493 - * of the task will drive the heuristics towards high values, if
494 - * it behaves as a greedy sequential reader, or towards small values
495 - * if it shows a more intermittent behavior.
496 - */
497 -static bfq_service_t bfq_default_budget(struct bfq_data *bfqd,
498 -                                       struct bfq_queue *bfqq)
499 -{
500 -       bfq_service_t budget;
501 -
502 -       /*
503 -        * When we need an estimate of the peak rate we need to avoid
504 -        * to give budgets that are too short due to previous measurements.
505 -        * So, in the first 10 assignments use a ``safe'' budget value.
506 -        */
507 -       if (bfqq->budgets_assigned < 194 && bfqd->bfq_user_max_budget == 0)
508 -               budget = bfq_max_budget;
509 -       else
510 -               budget = bfqd->bfq_max_budget;
511 -
512 -       return budget - budget / 4;
513 -}
514 -
515 -static inline bfq_service_t bfq_min_budget(struct bfq_data *bfqd,
516 -                                          struct bfq_queue *bfqq)
517 -{
518 -       return bfqd->bfq_max_budget / 2;
519 -}
520 -
521 -/**
522   * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
523   * @bfqd: device data.
524   * @bfqq: queue to update.
525   * @reason: reason for expiration.
526   *
527 - * Handle the feedback on @bfqq budget.  This is driven by the following
528 - * principles:
529 - *   - async queues get always the maximum budget value (their ability to
530 - *     dispatch is limited by @bfqd->bfq_max_budget_async_rq).
531 - *   - If @bfqq has been too idle we decrease its budget, as it is likely
532 - *     to be more interested in latency than in throughput.
533 - *   - If @bfqq took too much to consume its budget it is likely to be
534 - *     seeky, so reset the budget to the default, in order to have all
535 - *     the seeky queues to be charged for the same service, trying to
536 - *     achieve fairness at least in the time domain among them.
537 - *   - If @bfqq exhausted its budget treat it as a greedy reader, in
538 - *     order to run it at full speed.
539 - *   - If @bfqq expired due to lack of requests leave its budget untouched.
540 + * Handle the feedback on @bfqq budget.  See the body for detailed
541 + * comments.
542   */
543  static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
544                                      struct bfq_queue *bfqq,
545                                      enum bfqq_expiration reason)
546  {
547         struct request *next_rq;
548 -       bfq_service_t budget, min_budget;
549 +       unsigned long budget, min_budget;
550  
551         budget = bfqq->max_budget;
552 -       min_budget = bfq_min_budget(bfqd, bfqq);
553 +       min_budget = bfq_min_budget(bfqd);
554  
555         BUG_ON(bfqq != bfqd->active_queue);
556  
557 +       bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
558 +               bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
559 +       bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
560 +               budget, bfq_min_budget(bfqd));
561 +       bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
562 +               bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
563 +
564         if (bfq_bfqq_sync(bfqq)) {
565                 switch (reason) {
566 +               /*
567 +                * Caveat: in all the following cases we trade latency
568 +                * for throughput.
569 +                */
570                 case BFQ_BFQQ_TOO_IDLE:
571 -                       if (budget > min_budget + BFQ_BUDGET_STEP)
572 -                               budget -= BFQ_BUDGET_STEP;
573 -                       else
574 -                               budget = min_budget;
575 +                       /*
576 +                        * This is the only case where we may reduce
577 +                        * the budget: if there are no requests of the
578 +                        * process still waiting for completion, then
579 +                        * we assume (tentatively) that the timer has
580 +                        * expired because the batch of requests of
581 +                        * the process could have been served with a
582 +                        * smaller budget.  Hence, betting that
583 +                        * process will behave in the same way when it
584 +                        * becomes backlogged again, we reduce its
585 +                        * next budget.  As long as we guess right,
586 +                        * this budget cut reduces the latency
587 +                        * experienced by the process.
588 +                        *
589 +                        * However, if there are still outstanding
590 +                        * requests, then the process may have not yet
591 +                        * issued its next request just because it is
592 +                        * still waiting for the completion of some of
593 +                        * the still outstanding ones.  So in this
594 +                        * subcase we do not reduce its budget, on the
595 +                        * contrary we increase it to possibly boost
596 +                        * the throughput, as discussed in the
597 +                        * comments to the BUDGET_TIMEOUT case.
598 +                        */
599 +                       if (bfqq->dispatched > 0) /* still outstanding reqs */
600 +                               budget = min(budget * 2, bfqd->bfq_max_budget);
601 +                       else {
602 +                               if (budget > 5 * min_budget)
603 +                                       budget -= 4 * min_budget;
604 +                               else
605 +                                       budget = min_budget;
606 +                       }
607                         break;
608                 case BFQ_BFQQ_BUDGET_TIMEOUT:
609 -                       budget = bfq_default_budget(bfqd, bfqq);
610 +                       /*
611 +                        * We double the budget here because: 1) it
612 +                        * gives the chance to boost the throughput if
613 +                        * this is not a seeky process (which may have
614 +                        * bumped into this timeout because of, e.g.,
615 +                        * ZBR), 2) together with charge_full_budget
616 +                        * it helps give seeky processes higher
617 +                        * timestamps, and hence be served less
618 +                        * frequently.
619 +                        */
620 +                       budget = min(budget * 2, bfqd->bfq_max_budget);
621                         break;
622                 case BFQ_BFQQ_BUDGET_EXHAUSTED:
623 -                       budget = min(budget + 8 * BFQ_BUDGET_STEP,
624 -                                    bfqd->bfq_max_budget);
625 +                       /*
626 +                        * The process still has backlog, and did not
627 +                        * let either the budget timeout or the disk
628 +                        * idling timeout expire. Hence it is not
629 +                        * seeky, has a short thinktime and may be
630 +                        * happy with a higher budget too. So
631 +                        * definitely increase the budget of this good
632 +                        * candidate to boost the disk throughput.
633 +                        */
634 +                       budget = min(budget * 4, bfqd->bfq_max_budget);
635                         break;
636                 case BFQ_BFQQ_NO_MORE_REQUESTS:
637 +                      /*
638 +                       * Leave the budget unchanged.
639 +                       */
640                 default:
641                         return;
642                 }
643 -       } else
644 +       } else /* async queue */
645 +           /* async queues get always the maximum possible budget
646 +            * (their ability to dispatch is limited by
647 +            * @bfqd->bfq_max_budget_async_rq).
648 +            */
649                 budget = bfqd->bfq_max_budget;
650  
651         bfqq->max_budget = budget;
652  
653 -       if (bfqq->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
654 +       if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
655             bfqq->max_budget > bfqd->bfq_max_budget)
656                 bfqq->max_budget = bfqd->bfq_max_budget;
657  
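
In short, the feedback implemented by the switch above adjusts the queue's max_budget as follows; the function below is only a condensed restatement of the sync-queue branch for readability (async queues simply get bfqd->bfq_max_budget), not code from the patch:

static unsigned long feedback_sketch(unsigned long budget,
                                     unsigned long min_budget,
                                     unsigned long max_budget,
                                     int dispatched,
                                     enum bfqq_expiration reason)
{
        switch (reason) {
        case BFQ_BFQQ_TOO_IDLE:
                if (dispatched > 0)     /* requests still outstanding */
                        return min(budget * 2, max_budget);
                /* no outstanding requests: bet on a smaller budget */
                return budget > 5 * min_budget ?
                        budget - 4 * min_budget : min_budget;
        case BFQ_BFQQ_BUDGET_TIMEOUT:
                return min(budget * 2, max_budget);
        case BFQ_BFQQ_BUDGET_EXHAUSTED:
                return min(budget * 4, max_budget);
        case BFQ_BFQQ_NO_MORE_REQUESTS:
        default:
                return budget;          /* leave it unchanged */
        }
}
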
658 @@ -719,30 +807,40 @@ static void __bfq_bfqq_recalc_budget(str
659          */
660         next_rq = bfqq->next_rq;
661         if (next_rq != NULL)
662 -               bfqq->entity.budget = max(bfqq->max_budget,
663 -                                         next_rq->hard_nr_sectors);
664 -       bfq_log_bfqq(bfqd, bfqq, "budget=%lu (%d)", bfqq->entity.budget,
665 -                    bfq_bfqq_sync(bfqq));
666 +               bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
667 +                                           bfq_serv_to_charge(next_rq, bfqq));
668 +       else
669 +               bfqq->entity.budget = bfqq->max_budget;
670 +
671 +       bfq_log_bfqq(bfqd, bfqq, "head sect: %lu, new budget %lu",
672 +                       next_rq != NULL ? next_rq->hard_nr_sectors : 0,
673 +                       bfqq->entity.budget);
674  }
675  
676 -static bfq_service_t bfq_calc_max_budget(u64 peak_rate, u64 timeout)
677 +static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
678  {
679 -       bfq_service_t max_budget;
680 +       unsigned long max_budget;
681  
682         /*
683          * The max_budget calculated when autotuning is equal to the
684 -        * amount of sectors transfered in 0.75 * timeout_sync at the
685 +        * amount of sectors transfered in timeout_sync at the
686          * estimated peak rate.
687          */
688 -       max_budget = (bfq_service_t)(peak_rate * 1000 *
689 +       max_budget = (unsigned long)(peak_rate * 1000 *
690                                      timeout >> BFQ_RATE_SHIFT);
691 -       max_budget -= max_budget / 4;
692  
693         return max_budget;
694  }
695  
696 +/*
697 + * In addition to updating the peak rate, checks whether the process
698 + * is "slow", and returns 1 if so. This slow flag is used, in addition
699 + * to the budget timeout, to reduce the amount of service provided to
700 + * seeky processes, and hence reduce their chances to lower the
701 + * throughput. See the code for more details.
702 + */
703  static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
704 -                               int compensate)
705 +                               int compensate, enum bfqq_expiration reason)
706  {
707         u64 bw, usecs, expected, timeout;
708         ktime_t delta;
709 @@ -775,10 +873,21 @@ static int bfq_update_peak_rate(struct b
710          * the peak rate estimation.
711          */
712         if (usecs > 20000) {
713 -               if (bw > bfqd->peak_rate) {
714 -                       bfqd->peak_rate = bw;
715 +               if (bw > bfqd->peak_rate ||
716 +                  (!BFQQ_SEEKY(bfqq) &&
717 +                   reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
718 +                       bfq_log(bfqd, "measured bw =%llu", bw);
719 +                       /*
720 +                        * To smooth oscillations use a low-pass filter with
721 +                        * alpha=7/8, i.e.,
722 +                        * new_rate = (7/8) * old_rate + (1/8) * bw
723 +                        */
724 +                       do_div(bw, 8);
725 +                       bfqd->peak_rate *= 7;
726 +                       do_div(bfqd->peak_rate, 8);
727 +                       bfqd->peak_rate += bw;
728                         update = 1;
729 -                       bfq_log(bfqd, "peak_rate=%llu", bw);
730 +                       bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
731                 }
732  
733                 update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
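
The update above is a simple low-pass filter with alpha = 7/8: new_rate = (7 * old_rate + measured_bw) / 8 (the patch splits the division with do_div() because peak_rate is a u64 and the code must also build on 32-bit). For example, with an old estimate of 8000 and a measured bw of 16000, the new estimate is 7000 + 2000 = 9000. A minimal equivalent sketch, ignoring the do_div() detail and integer-rounding differences:

static inline u64 peak_rate_filter(u64 old_rate, u64 measured_bw)
{
        /* new_rate = 7/8 * old_rate + 1/8 * measured_bw */
        return (old_rate * 7 + measured_bw) / 8;
}
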
734 @@ -790,11 +899,22 @@ static int bfq_update_peak_rate(struct b
735                     update && bfqd->bfq_user_max_budget == 0) {
736                         bfqd->bfq_max_budget =
737                                 bfq_calc_max_budget(bfqd->peak_rate, timeout);
738 -                       bfq_log(bfqd, "max_budget=%lu", bfqd->bfq_max_budget);
739 +                       bfq_log(bfqd, "new max_budget=%lu",
740 +                               bfqd->bfq_max_budget);
741                 }
742         }
743  
744         /*
745 +        * If the process has been served for a too short time
746 +        * interval to let its possible sequential accesses prevail over
747 +        * the initial seek time needed to move the disk head on the
748 +        * first sector it requested, then give the process a chance
749 +        * and for the moment return false.
750 +        */
751 +       if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
752 +               return 0;
753 +
754 +       /*
755          * A process is considered ``slow'' (i.e., seeky, so that we
756          * cannot treat it fairly in the service domain, as it would
757          * slow down too much the other processes) if, when a slice
758 @@ -804,26 +924,46 @@ static int bfq_update_peak_rate(struct b
759          */
760         expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
761  
762 -       return expected > bfqq->entity.budget;
763 +       /*
764 +        * Caveat: processes doing IO in the slower disk zones will
765 +        * tend to be slow(er) even if not seeky. And the estimated
766 +        * peak rate will actually be an average over the disk
767 +        * surface. Hence, to not be too harsh with unlucky processes,
768 +        * we keep a budget/3 margin of safety before declaring a
769 +        * process slow.
770 +        */
771 +       return expected > (4 * bfqq->entity.budget) / 3;
772  }
773  
774 -/*
775 +/**
776   * bfq_bfqq_expire - expire a queue.
777   * @bfqd: device owning the queue.
778   * @bfqq: the queue to expire.
779   * @compensate: if true, compensate for the time spent idling.
780   * @reason: the reason causing the expiration.
781   *
782 - * The behavior is the following: when a queue expires because it has
783 - * been idling for too much we sync its finish time with the service
784 - * received and decrease its budget.  If @bfqq expires due to budget
785 - * exhaustion we increase its budget and sync its finish time.
786 - * If @bfqq expires due to budget timeout we do not sync its finish time
787 - * to avoid seeky queues to take too much disk time; instead we charge
788 - * it the maximum budget value.  Using the max budget value for all the
789 - * queues that expire due to budget timeout has the effect of using the
790 - * WF2Q+ scheduler to assign timeslices to those queues, without violating
791 - * the service domain guarantees for well-behaved queues.
792 + *
793 + * If the process associated to the queue is slow (i.e., seeky), or in
794 + * case of budget timeout, or, finally, if it is async, we
795 + * artificially charge it an entire budget (independently of the
796 + * actual service it received). As a consequence, the queue will get
797 + * higher timestamps than the correct ones upon reactivation, and
798 + * hence it will be rescheduled as if it had received more service
799 + * than what it actually received. In the end, this class of processes
800 + * will receive less service in proportion to how slowly they consume
801 + * their budgets (and hence how seriously they tend to lower the
802 + * throughput).
803 + *
804 + * In contrast, when a queue expires because it has been idling for
805 + * too much or because it exhausted its budget, we do not touch the
806 + * amount of service it has received. Hence when the queue will be
807 + * reactivated and its timestamps updated, the latter will be in sync
808 + * with the actual service received by the queue until expiration.
809 + *
810 + * Charging a full budget to the first type of queues and the exact
811 + * service to the others has the effect of using the WF2Q+ policy to
812 + * schedule the former on a timeslice basis, without violating the
813 + * service domain guarantees of the latter.
814   */
815  static void bfq_bfqq_expire(struct bfq_data *bfqd,
816                             struct bfq_queue *bfqq,
817 @@ -831,27 +971,42 @@ static void bfq_bfqq_expire(struct bfq_d
818                             enum bfqq_expiration reason)
819  {
820         int slow;
821 +       BUG_ON(bfqq != bfqd->active_queue);
822  
823 -       slow = bfq_update_peak_rate(bfqd, bfqq, compensate);
824 +       /* Update disk peak rate for autotuning and check whether the
825 +        * process is slow (see bfq_update_peak_rate).
826 +        */
827 +       slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
828  
829         /*
830 -        * Treat slow (i.e., seeky) traffic as timed out, to not favor
831 -        * it over sequential traffic (a seeky queue consumes less budget,
832 -        * so it would receive smaller timestamps wrt a sequential one
833 -        * when an idling timer fires).
834 +        * As above explained, 'punish' slow (i.e., seeky), timed-out
835 +        * and async queues, to favor sequential sync workloads.
836 +        *
837 +        * Processes doing IO in the slower disk zones will tend to be
838 +        * slow(er) even if not seeky. Hence, since the estimated peak
839 +        * rate is actually an average over the disk surface, these
840 +        * processes may timeout just for bad luck. To avoid punishing
841 +        * them we do not charge a full budget to a process that
842 +        * succeeded in consuming at least 2/3 of its budget.
843          */
844 -       if (slow && reason == BFQ_BFQQ_TOO_IDLE)
845 -               reason = BFQ_BFQQ_BUDGET_TIMEOUT;
846 -
847 -       if (reason == BFQ_BFQQ_BUDGET_TIMEOUT || !bfq_bfqq_sync(bfqq))
848 +       if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
849 +                    bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3))
850                 bfq_bfqq_charge_full_budget(bfqq);
851  
852 -       bfq_log_bfqq(bfqd, bfqq, "expire (%d, %d)", reason, slow);
853 +       bfq_log_bfqq(bfqd, bfqq,
854 +               "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
855 +               bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
856  
857 +       /* Increase, decrease or leave budget unchanged according to reason */
858         __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
859         __bfq_bfqq_expire(bfqd, bfqq);
860  }
861  
862 +/*
863 + * Budget timeout is not implemented through a dedicated timer, but
864 + * just checked on request arrivals and completions, as well as on
865 + * idle timer expirations.
866 + */
867  static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
868  {
869         if (bfq_bfqq_budget_new(bfqq))
870 @@ -864,6 +1019,22 @@ static int bfq_bfqq_budget_timeout(struc
871  }
872  
873  /*
874 + * If we expire a queue that is waiting for the arrival of a new
875 + * request, we may prevent the fictitious timestamp backshifting that
876 + * allows the guarantees of the queue to be preserved (see [1] for
877 + * this tricky aspect). Hence we return true only if this condition
878 + * does not hold, or if the queue is slow enough to deserve only to be
879 + * kicked off for preserving a high throughput.
880 +*/
881 +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
882 +{
883 +       return (!bfq_bfqq_wait_request(bfqq) ||
884 +               bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3)
885 +               &&
886 +               bfq_bfqq_budget_timeout(bfqq);
887 +}
888 +
889 +/*
890   * Select a queue for service.  If we have a current active queue,
891   * check whether to continue servicing it, or retrieve and set a new one.
892   */
893 @@ -877,10 +1048,10 @@ static struct bfq_queue *bfq_select_queu
894         if (bfqq == NULL)
895                 goto new_queue;
896  
897 -       if (bfq_bfqq_budget_timeout(bfqq)) {
898 -               bfq_bfqq_charge_full_budget(bfqq);
899 +       bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
900 +
901 +       if (bfq_may_expire_for_budg_timeout(bfqq))
902                 goto expire;
903 -       }
904  
905         next_rq = bfqq->next_rq;
906         /*
907 @@ -888,7 +1059,8 @@ static struct bfq_queue *bfq_select_queu
908          * serve them, keep the queue, otherwise expire it.
909          */
910         if (next_rq != NULL) {
911 -               if (next_rq->hard_nr_sectors > bfq_bfqq_budget_left(bfqq)) {
912 +               if (bfq_serv_to_charge(next_rq, bfqq) >
913 +                       bfq_bfqq_budget_left(bfqq)) {
914                         reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
915                         goto expire;
916                 } else
917 @@ -896,9 +1068,8 @@ static struct bfq_queue *bfq_select_queu
918         }
919  
920         /*
921 -        * No requests pending.  If the active queue still has requests in
922 -        * flight or is idling for a new request, allow either of these
923 -        * conditions to happen (or time out) before selecting a new queue.
924 +        * No requests pending.  If the active queue still has
925 +        * requests in flight or is idling for a new request, then keep it.
926          */
927         if (timer_pending(&bfqd->idle_slice_timer) ||
928             (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq))) {
929 @@ -911,6 +1082,7 @@ expire:
930         bfq_bfqq_expire(bfqd, bfqq, 0, reason);
931  new_queue:
932         bfqq = bfq_set_active_queue(bfqd);
933 +       bfq_log(bfqd, "select_queue: new queue returned (possibly NULL)");
934  keep_queue:
935         return bfqq;
936  }
937 @@ -929,13 +1101,15 @@ static int __bfq_dispatch_requests(struc
938  
939         do {
940                 struct request *rq;
941 +               unsigned long service_to_charge;
942  
943                 /* Follow expired path, else get first next available. */
944                 rq = bfq_check_fifo(bfqq);
945                 if (rq == NULL)
946                         rq = bfqq->next_rq;
947 +               service_to_charge = bfq_serv_to_charge(rq, bfqq);
948  
949 -               if (rq->hard_nr_sectors > bfq_bfqq_budget_left(bfqq)) {
950 +               if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
951                         /*
952                          * Expire the queue for budget exhaustion, and
953                          * make sure that the next act_budget is enough
954 @@ -947,9 +1121,41 @@ static int __bfq_dispatch_requests(struc
955                 }
956  
957                 /* Finally, insert request into driver dispatch list. */
958 -               bfq_bfqq_served(bfqq, rq->hard_nr_sectors);
959 +               bfq_bfqq_served(bfqq, service_to_charge);
960                 bfq_dispatch_insert(bfqd->queue, rq);
961  
962 +               if (bfqq->high_weight_budget > 0) { /* queue is being boosted */
963 +                       struct bfq_entity *entity = &bfqq->entity;
964 +
965 +                       bfq_log_bfqq(bfqd, bfqq, "busy period dur %llu msec, "
966 +                                    "old highwbudg %lu",
967 +                                    jiffies_to_msecs(jiffies) -
968 +                                    bfqq->last_activation_time,
969 +                                    bfqq->high_weight_budget);
970 +                       /*
971 +                        * Decrease the budget for weight boosting by
972 +                        * the just received service, or, if too much
973 +                        * time has elapsed from the beginning of this
974 +                        * boosting period, stop it
975 +                        */
976 +                       if (jiffies_to_msecs(jiffies) -
977 +                           bfqq->last_activation_time <= BFQ_BOOST_TIMEOUT
978 +                           &&
979 +                           bfqq->high_weight_budget > service_to_charge)
980 +                               bfqq->high_weight_budget -= service_to_charge;
981 +                       else
982 +                               bfqq->high_weight_budget = 0;
983 +                       entity->ioprio_changed = 1;
984 +                       __bfq_entity_update_weight_prio(
985 +                               bfq_entity_service_tree(entity),
986 +                               entity);
987 +               }
988 +
989 +               bfq_log_bfqq(bfqd, bfqq, "dispatched %lu sec req (%llu), "
990 +                               "budg left %lu",
991 +                               rq->hard_nr_sectors, rq->hard_sector,
992 +                               bfq_bfqq_budget_left(bfqq));
993 +
994                 dispatched++;
995  
996                 if (bfqd->active_cic == NULL) {
997 @@ -961,6 +1167,8 @@ static int __bfq_dispatch_requests(struc
998                         break;
999         } while (dispatched < max_dispatch);
1000  
1001 +       bfq_log_bfqq(bfqd, bfqq, "dispatched %d reqs", dispatched);
1002 +
1003         if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
1004             dispatched >= bfqd->bfq_max_budget_async_rq) ||
1005             bfq_class_idle(bfqq)))
1006 @@ -1009,7 +1217,7 @@ static int bfq_forced_dispatch(struct bf
1007                 st = bfq_entity_service_tree(&bfqq->entity);
1008  
1009                 dispatched += __bfq_forced_dispatch_bfqq(bfqq);
1010 -               bfqq->max_budget = bfq_default_budget(bfqd, bfqq);
1011 +               bfqq->max_budget = bfq_max_budget(bfqd);
1012  
1013                 bfq_forget_idle(st);
1014         }
1015 @@ -1025,6 +1233,7 @@ static int bfq_dispatch_requests(struct 
1016         struct bfq_queue *bfqq;
1017         int dispatched;
1018  
1019 +       bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
1020         if (bfqd->busy_queues == 0)
1021                 return 0;
1022  
1023 @@ -1056,9 +1265,11 @@ static int bfq_dispatch_requests(struct 
1024                 BUG_ON(timer_pending(&bfqd->idle_slice_timer));
1025  
1026                 dispatched += __bfq_dispatch_requests(bfqd, bfqq, max_dispatch);
1027 +               bfq_log_bfqq(bfqd, bfqq, "total dispatched increased to %d "
1028 +                            "(max_disp %d)", dispatched, max_dispatch);
1029         }
1030  
1031 -       bfq_log(bfqd, "dispatched=%d", dispatched);
1032 +       bfq_log(bfqd, "final total dispatched=%d", dispatched);
1033         return dispatched;
1034  }
1035  
1036 @@ -1074,6 +1285,7 @@ static void bfq_put_queue(struct bfq_que
1037  
1038         BUG_ON(atomic_read(&bfqq->ref) <= 0);
1039  
1040 +       bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
1041         if (!atomic_dec_and_test(&bfqq->ref))
1042                 return;
1043  
1044 @@ -1083,7 +1295,7 @@ static void bfq_put_queue(struct bfq_que
1045         BUG_ON(bfq_bfqq_busy(bfqq));
1046         BUG_ON(bfqd->active_queue == bfqq);
1047  
1048 -       bfq_log_bfqq(bfqd, bfqq, "freed");
1049 +       bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
1050  
1051         kmem_cache_free(bfq_pool, bfqq);
1052  }
1053 @@ -1095,6 +1307,7 @@ static void bfq_exit_bfqq(struct bfq_dat
1054                 bfq_schedule_dispatch(bfqd);
1055         }
1056  
1057 +       bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
1058         bfq_put_queue(bfqq);
1059  }
1060  
1061 @@ -1116,7 +1329,7 @@ static void bfq_init_prio_data(struct bf
1062                 printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
1063         case IOPRIO_CLASS_NONE:
1064                 /*
1065 -                * no prio set, inherit CPU scheduling settings
1066 +                * No prio set, inherit CPU scheduling settings.
1067                  */
1068                 bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
1069                 bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
1070 @@ -1139,8 +1352,8 @@ static void bfq_init_prio_data(struct bf
1071         bfqq->entity.ioprio_changed = 1;
1072  
1073         /*
1074 -        * keep track of original prio settings in case we have to temporarily
1075 -        * elevate the priority of this queue
1076 +        * Keep track of original prio settings in case we have to temporarily
1077 +        * elevate the priority of this queue.
1078          */
1079         bfqq->org_ioprio = bfqq->entity.new_ioprio;
1080         bfqq->org_ioprio_class = bfqq->entity.new_ioprio_class;
1081 @@ -1167,6 +1380,9 @@ static void bfq_changed_ioprio(struct io
1082                                          GFP_ATOMIC);
1083                 if (new_bfqq != NULL) {
1084                         cic->cfqq[ASYNC] = new_bfqq;
1085 +                       bfq_log_bfqq(bfqd, bfqq,
1086 +                                    "changed_ioprio: bfqq %p %d",
1087 +                                    bfqq, bfqq->ref);
1088                         bfq_put_queue(bfqq);
1089                 }
1090         }
1091 @@ -1233,9 +1449,13 @@ retry:
1092                                 bfq_mark_bfqq_idle_window(bfqq);
1093                         bfq_mark_bfqq_sync(bfqq);
1094                 }
1095 -               bfqq->max_budget = bfq_default_budget(bfqd, bfqq);
1096 +               /* Tentative initial value to trade off between thr and lat */
1097 +               bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
1098                 bfqq->pid = current->pid;
1099  
1100 +               bfqq->last_activation_time = 0;
1101 +               bfqq->high_weight_budget = 0;
1102 +
1103                 bfq_log_bfqq(bfqd, bfqq, "allocated");
1104         }
1105  
1106 @@ -1285,14 +1505,17 @@ static struct bfq_queue *bfq_get_queue(s
1107         }
1108  
1109         /*
1110 -        * pin the queue now that it's allocated, scheduler exit will prune it
1111 +        * Pin the queue now that it's allocated, scheduler exit will prune it.
1112          */
1113         if (!is_sync && *async_bfqq == NULL) {
1114                 atomic_inc(&bfqq->ref);
1115 +               bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
1116 +                            bfqq, bfqq->ref);
1117                 *async_bfqq = bfqq;
1118         }
1119  
1120         atomic_inc(&bfqq->ref);
1121 +       bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
1122         return bfqq;
1123  }
1124  
1125 @@ -1309,35 +1532,35 @@ static void bfq_update_io_thinktime(stru
1126  
1127  static void bfq_update_io_seektime(struct bfq_data *bfqd,
1128                                    struct bfq_queue *bfqq,
1129 -                                  struct cfq_io_context *cic,
1130                                    struct request *rq)
1131  {
1132         sector_t sdist;
1133         u64 total;
1134  
1135 -       if (cic->last_request_pos < rq->sector)
1136 -               sdist = rq->sector - cic->last_request_pos;
1137 +       if (bfqq->last_request_pos < rq->sector)
1138 +               sdist = rq->sector - bfqq->last_request_pos;
1139         else
1140 -               sdist = cic->last_request_pos - rq->sector;
1141 +               sdist = bfqq->last_request_pos - rq->sector;
1142  
1143         /*
1144          * Don't allow the seek distance to get too large from the
1145          * odd fragment, pagein, etc.
1146          */
1147 -       if (cic->seek_samples == 0) /* first request, not really a seek */
1148 +       if (bfqq->seek_samples == 0) /* first request, not really a seek */
1149                 sdist = 0;
1150 -       else if (cic->seek_samples <= 60) /* second&third seek */
1151 -               sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1152 +       else if (bfqq->seek_samples <= 60) /* second & third seek */
1153 +               sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
1154         else
1155 -               sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
1156 +               sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
1157  
1158 -       cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1159 -       cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1160 -       total = cic->seek_total + (cic->seek_samples/2);
1161 -       do_div(total, cic->seek_samples);
1162 -       cic->seek_mean = (sector_t)total;
1163 +       bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
1164 +       bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
1165 +       total = bfqq->seek_total + (bfqq->seek_samples/2);
1166 +       do_div(total, bfqq->seek_samples);
1167 +       bfqq->seek_mean = (sector_t)total;
1168  
1169 -       bfq_log_bfqq(bfqd, bfqq, "dist=%lu mean=%lu", sdist, cic->seek_mean);
1170 +       bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
1171 +                       (u64)bfqq->seek_mean);
1172  }
1173  
1174  /*
1175 @@ -1357,7 +1580,7 @@ static void bfq_update_idle_window(struc
1176         enable_idle = bfq_bfqq_idle_window(bfqq);
1177  
1178         if (atomic_read(&cic->ioc->nr_tasks) == 0 ||
1179 -           bfqd->bfq_slice_idle == 0 || (bfqd->hw_tag && CIC_SEEKY(cic)))
1180 +           bfqd->bfq_slice_idle == 0 || (bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
1181                 enable_idle = 0;
1182         else if (bfq_sample_valid(cic->ttime_samples)) {
1183                 if (cic->ttime_mean > bfqd->bfq_slice_idle)
1184 @@ -1365,14 +1588,13 @@ static void bfq_update_idle_window(struc
1185                 else
1186                         enable_idle = 1;
1187         }
1188 +       bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
1189 +               enable_idle);
1190  
1191         if (enable_idle)
1192                 bfq_mark_bfqq_idle_window(bfqq);
1193         else
1194                 bfq_clear_bfqq_idle_window(bfqq);
1195 -
1196 -       bfq_log_bfqq(bfqd, bfqq, "idle_window=%d (%d)",
1197 -                    enable_idle, CIC_SEEKY(cic));
1198  }
1199  
1200  /*
1201 @@ -1388,20 +1610,37 @@ static void bfq_rq_enqueued(struct bfq_d
1202                 bfqq->meta_pending++;
1203  
1204         bfq_update_io_thinktime(bfqd, cic);
1205 -       bfq_update_io_seektime(bfqd, bfqq, cic, rq);
1206 -       bfq_update_idle_window(bfqd, bfqq, cic);
1207 +       bfq_update_io_seektime(bfqd, bfqq, rq);
1208 +       if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
1209 +           !BFQQ_SEEKY(bfqq))
1210 +               bfq_update_idle_window(bfqd, bfqq, cic);
1211 +
1212 +       bfq_log_bfqq(bfqd, bfqq,
1213 +                    "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
1214 +                    bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
1215 +                    (u64)bfqq->seek_mean);
1216  
1217 -       cic->last_request_pos = rq->sector + rq->nr_sectors;
1218 +       bfqq->last_request_pos = rq->sector + rq->nr_sectors;
1219  
1220 -       if (bfqq == bfqd->active_queue && bfq_bfqq_wait_request(bfqq)) {
1221 -               /*
1222 -                * If we are waiting for a request for this queue, let it rip
1223 -                * immediately and flag that we must not expire this queue
1224 -                * just now.
1225 -                */
1226 -               bfq_clear_bfqq_wait_request(bfqq);
1227 -               del_timer(&bfqd->idle_slice_timer);
1228 -               blk_start_queueing(bfqd->queue);
1229 +       if (bfqq == bfqd->active_queue) {
1230 +               if (bfq_bfqq_wait_request(bfqq)) {
1231 +                       /*
1232 +                        * If we are waiting for a request for this queue, let
1233 +                        * it rip immediately and flag that we must not expire
1234 +                        * this queue just now.
1235 +                        */
1236 +                       bfq_clear_bfqq_wait_request(bfqq);
1237 +                       del_timer(&bfqd->idle_slice_timer);
1238 +                       /*
1239 +                        * Here we can safely expire the queue, in
1240 +                        * case of budget timeout, without wasting
1241 +                        * guarantees
1242 +                        */
1243 +                       if (bfq_bfqq_budget_timeout(bfqq))
1244 +                               bfq_bfqq_expire(bfqd, bfqq, 0,
1245 +                                               BFQ_BFQQ_BUDGET_TIMEOUT);
1246 +                       blk_start_queueing(bfqd->queue);
1247 +               }
1248         }
1249  }
1250  
1251 @@ -1410,6 +1649,7 @@ static void bfq_insert_request(struct re
1252         struct bfq_data *bfqd = q->elevator->elevator_data;
1253         struct bfq_queue *bfqq = RQ_BFQQ(rq);
1254  
1255 +       assert_spin_locked(bfqd->queue->queue_lock);
1256         bfq_init_prio_data(bfqq, RQ_CIC(rq)->ioc);
1257  
1258         bfq_add_rq_rb(rq);
1259 @@ -1447,7 +1687,8 @@ static void bfq_completed_request(struct
1260         struct bfq_data *bfqd = bfqq->bfqd;
1261         const int sync = rq_is_sync(rq);
1262  
1263 -       bfq_log_bfqq(bfqd, bfqq, "complete");
1264 +       bfq_log_bfqq(bfqd, bfqq, "completed %lu sects req (%d)",
1265 +                       rq->hard_nr_sectors, sync);
1266  
1267         bfq_update_hw_tag(bfqd);
1268  
1269 @@ -1470,7 +1711,7 @@ static void bfq_completed_request(struct
1270                 if (bfq_bfqq_budget_new(bfqq))
1271                         bfq_set_budget_timeout(bfqd);
1272  
1273 -               if (bfq_bfqq_budget_timeout(bfqq))
1274 +               if (bfq_may_expire_for_budg_timeout(bfqq))
1275                         bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
1276                 else if (sync && bfqd->rq_in_driver == 0 &&
1277                          RB_EMPTY_ROOT(&bfqq->sort_list))
1278 @@ -1489,7 +1730,7 @@ static void bfq_prio_boost(struct bfq_qu
1279  {
1280         if (has_fs_excl()) {
1281                 /*
1282 -                * boost idle prio on transactions that would lock out other
1283 +                * Boost idle prio on transactions that would lock out other
1284                  * users of the filesystem
1285                  */
1286                 if (bfq_class_idle(bfqq))
1287 @@ -1498,7 +1739,7 @@ static void bfq_prio_boost(struct bfq_qu
1288                         bfqq->entity.new_ioprio = IOPRIO_NORM;
1289         } else {
1290                 /*
1291 -                * check if we need to unboost the queue
1292 +                * Check if we need to unboost the queue
1293                  */
1294                 if (bfqq->entity.new_ioprio_class != bfqq->org_ioprio_class)
1295                         bfqq->entity.new_ioprio_class = bfqq->org_ioprio_class;
1296 @@ -1527,7 +1768,7 @@ static int bfq_may_queue(struct request_
1297         /*
1298          * Don't force setup of a queue from here, as a call to may_queue
1299          * does not necessarily imply that a request actually will be queued.
1300 -        * so just lookup a possibly existing queue, or return 'may queue'
1301 +        * So just lookup a possibly existing queue, or return 'may queue'
1302          * if that fails.
1303          */
1304         cic = bfq_cic_lookup(bfqd, tsk->io_context);
1305 @@ -1546,7 +1787,7 @@ static int bfq_may_queue(struct request_
1306  }
1307  
1308  /*
1309 - * queue lock held here
1310 + * Queue lock held here.
1311   */
1312  static void bfq_put_request(struct request *rq)
1313  {
1314 @@ -1563,6 +1804,8 @@ static void bfq_put_request(struct reque
1315                 rq->elevator_private = NULL;
1316                 rq->elevator_private2 = NULL;
1317  
1318 +               bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
1319 +                            bfqq, bfqq->ref);
1320                 bfq_put_queue(bfqq);
1321         }
1322  }
1323 @@ -1603,6 +1846,7 @@ static int bfq_set_request(struct reques
1324  
1325         bfqq->allocated[rw]++;
1326         atomic_inc(&bfqq->ref);
1327 +       bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
1328  
1329         spin_unlock_irqrestore(q->queue_lock, flags);
1330  
1331 @@ -1634,7 +1878,8 @@ static void bfq_kick_queue(struct work_s
1332  }
1333  
1334  /*
1335 - * Timer running if the active_queue is currently idling inside its time slice
1336 + * Handler of the expiration of the timer running if the active_queue
1337 + * is idling inside its time slice.
1338   */
1339  static void bfq_idle_slice_timer(unsigned long data)
1340  {
1341 @@ -1643,8 +1888,6 @@ static void bfq_idle_slice_timer(unsigne
1342         unsigned long flags;
1343         enum bfqq_expiration reason;
1344  
1345 -       bfq_log(bfqd, "slice_timer expired");
1346 -
1347         spin_lock_irqsave(bfqd->queue->queue_lock, flags);
1348  
1349         bfqq = bfqd->active_queue;
1350 @@ -1657,8 +1900,14 @@ static void bfq_idle_slice_timer(unsigne
1351          * we just expire a queue too early.
1352          */
1353         if (bfqq != NULL) {
1354 +               bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
1355                 reason = BFQ_BFQQ_TOO_IDLE;
1356                 if (bfq_bfqq_budget_timeout(bfqq))
1357 +                       /*
1358 +                        * Also here the queue can be safely expired
1359 +                        * for budget timeout without wasting
1360 +                        * guarantees
1361 +                        */
1362                         reason = BFQ_BFQQ_BUDGET_TIMEOUT;
1363  
1364                 bfq_bfqq_expire(bfqd, bfqq, 1, reason);
1365 @@ -1681,8 +1930,11 @@ static inline void __bfq_put_async_bfqq(
1366         struct bfq_group *root_group = bfqd->root_group;
1367         struct bfq_queue *bfqq = *bfqq_ptr;
1368  
1369 +       bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
1370         if (bfqq != NULL) {
1371                 bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
1372 +               bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
1373 +                            bfqq, bfqq->ref);
1374                 bfq_put_queue(bfqq);
1375                 *bfqq_ptr = NULL;
1376         }
1377 @@ -1705,7 +1957,7 @@ static void bfq_put_async_queues(struct 
1378         __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
1379  }
1380  
1381 -static void bfq_exit_queue(elevator_t *e)
1382 +static void bfq_exit_queue(struct elevator_queue *e)
1383  {
1384         struct bfq_data *bfqd = e->elevator_data;
1385         struct request_queue *q = bfqd->queue;
1386 @@ -1772,7 +2024,7 @@ static void *bfq_init_queue(struct reque
1387  
1388         bfqd->hw_tag = 1;
1389  
1390 -       bfqd->bfq_max_budget = bfq_max_budget;
1391 +       bfqd->bfq_max_budget = bfq_default_max_budget;
1392  
1393         bfqd->bfq_quantum = bfq_quantum;
1394         bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
1395 @@ -1784,6 +2036,8 @@ static void *bfq_init_queue(struct reque
1396         bfqd->bfq_timeout[ASYNC] = bfq_timeout_async;
1397         bfqd->bfq_timeout[SYNC] = bfq_timeout_sync;
1398  
1399 +       bfqd->low_latency = true;
1400 +
1401         return bfqd;
1402  }
1403  
1404 @@ -1819,16 +2073,19 @@ static ssize_t bfq_var_show(unsigned int
1405         return sprintf(page, "%d\n", var);
1406  }
1407  
1408 -static ssize_t bfq_var_store(unsigned int *var, const char *page, size_t count)
1409 +static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
1410  {
1411 -       char *p = (char *)page;
1412 +       unsigned long new_val;
1413 +       int ret = strict_strtoul(page, 10, &new_val);
1414 +
1415 +       if (ret == 0)
1416 +               *var = new_val;
1417  
1418 -       *var = simple_strtoul(p, &p, 10);
1419         return count;
1420  }
1421  
1422  #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                           \
1423 -static ssize_t __FUNC(elevator_t *e, char *page)                       \
1424 +static ssize_t __FUNC(struct elevator_queue *e, char *page)            \
1425  {                                                                      \
1426         struct bfq_data *bfqd = e->elevator_data;                       \
1427         unsigned int __data = __VAR;                                    \
1428 @@ -1846,13 +2103,15 @@ SHOW_FUNCTION(bfq_max_budget_show, bfqd-
1429  SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
1430  SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[SYNC], 1);
1431  SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[ASYNC], 1);
1432 +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
1433  #undef SHOW_FUNCTION
1434  
1435  #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                        \
1436 -static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)   \
1437 +static ssize_t                                                         \
1438 +__FUNC(struct elevator_queue *e, const char *page, size_t count)       \
1439  {                                                                      \
1440         struct bfq_data *bfqd = e->elevator_data;                       \
1441 -       unsigned int __data;                                            \
1442 +       unsigned long __data;                                           \
1443         int ret = bfq_var_store(&__data, (page), count);                \
1444         if (__data < (MIN))                                             \
1445                 __data = (MIN);                                         \
1446 @@ -1879,21 +2138,21 @@ STORE_FUNCTION(bfq_timeout_async_store, 
1447                 INT_MAX, 1);
1448  #undef STORE_FUNCTION
1449  
1450 -static inline bfq_service_t bfq_estimated_max_budget(struct bfq_data *bfqd)
1451 +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
1452  {
1453         u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[SYNC]);
1454  
1455         if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
1456                 return bfq_calc_max_budget(bfqd->peak_rate, timeout);
1457         else
1458 -               return bfq_max_budget;
1459 +               return bfq_default_max_budget;
1460  }
1461  
1462 -static ssize_t bfq_max_budget_store(elevator_t *e, const char *page,
1463 -                                   size_t count)
1464 +static ssize_t bfq_max_budget_store(struct elevator_queue *e,
1465 +                                   const char *page, size_t count)
1466  {
1467         struct bfq_data *bfqd = e->elevator_data;
1468 -       unsigned int __data;
1469 +       unsigned long __data;
1470         int ret = bfq_var_store(&__data, (page), count);
1471  
1472         if (__data == 0)
1473 @@ -1909,11 +2168,11 @@ static ssize_t bfq_max_budget_store(elev
1474         return ret;
1475  }
1476  
1477 -static ssize_t bfq_timeout_sync_store(elevator_t *e, const char *page,
1478 -                                     size_t count)
1479 +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
1480 +                                     const char *page, size_t count)
1481  {
1482         struct bfq_data *bfqd = e->elevator_data;
1483 -       unsigned int __data;
1484 +       unsigned long __data;
1485         int ret = bfq_var_store(&__data, (page), count);
1486  
1487         if (__data < 1)
1488 @@ -1928,6 +2187,20 @@ static ssize_t bfq_timeout_sync_store(el
1489         return ret;
1490  }
1491  
1492 +static ssize_t bfq_low_latency_store(struct elevator_queue *e,
1493 +                                    const char *page, size_t count)
1494 +{
1495 +       struct bfq_data *bfqd = e->elevator_data;
1496 +       unsigned long __data;
1497 +       int ret = bfq_var_store(&__data, (page), count);
1498 +
1499 +       if (__data > 1)
1500 +               __data = 1;
1501 +       bfqd->low_latency = __data;
1502 +
1503 +       return ret;
1504 +}
1505 +
1506  #define BFQ_ATTR(name) \
1507         __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
1508  
1509 @@ -1942,12 +2215,13 @@ static struct elv_fs_entry bfq_attrs[] =
1510         BFQ_ATTR(max_budget_async_rq),
1511         BFQ_ATTR(timeout_sync),
1512         BFQ_ATTR(timeout_async),
1513 +       BFQ_ATTR(low_latency),
1514         __ATTR_NULL
1515  };
1516  
1517  static struct elevator_type iosched_bfq = {
1518         .ops = {
1519 -               .elevator_merge_fn =            bfq_merge,
1520 +               .elevator_merge_fn =            bfq_merge,
1521                 .elevator_merged_fn =           bfq_merged_request,
1522                 .elevator_merge_req_fn =        bfq_merged_requests,
1523                 .elevator_allow_merge_fn =      bfq_allow_merge,
1524 @@ -1974,7 +2248,7 @@ static struct elevator_type iosched_bfq 
1525  static int __init bfq_init(void)
1526  {
1527         /*
1528 -        * can be 0 on HZ < 1000 setups
1529 +        * Can be 0 on HZ < 1000 setups.
1530          */
1531         if (bfq_slice_idle == 0)
1532                 bfq_slice_idle = 1;
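
The seek statistics that the hunks above move from cfq_io_context into bfq_queue form a decaying average: each new distance is clamped, folded into seek_total/seek_samples with a 7/8 weight, and seek_mean is their rounded quotient. Below is a minimal user-space sketch of that update, reusing the field names and clamping constants of bfq_update_io_seektime() above; the request positions in main() are made up for illustration only.

#include <stdio.h>

/* Per-queue seek statistics, mirroring the fields this patch adds to
 * struct bfq_queue (seek_samples, seek_total, seek_mean). */
struct seek_stats {
	unsigned int samples;
	unsigned long long total;	/* scaled by 256, like seek_total */
	unsigned long long mean;	/* in sectors */
};

static unsigned long long clamp_dist(unsigned long long sdist,
				     unsigned long long limit)
{
	return sdist < limit ? sdist : limit;
}

/* Same update as bfq_update_io_seektime() above, outside the kernel. */
static void update_seek(struct seek_stats *s, unsigned long long last_pos,
			unsigned long long new_pos)
{
	unsigned long long sdist = new_pos > last_pos ? new_pos - last_pos
						      : last_pos - new_pos;

	if (s->samples == 0)		/* first request, not really a seek */
		sdist = 0;
	else if (s->samples <= 60)	/* clamp early samples hard */
		sdist = clamp_dist(sdist, s->mean * 4 + 2*1024*1024);
	else
		sdist = clamp_dist(sdist, s->mean * 4 + 2*1024*64);

	/* exponentially weighted running average with 7/8 decay */
	s->samples = (7 * s->samples + 256) / 8;
	s->total   = (7 * s->total + 256ULL * sdist) / 8;
	s->mean    = (s->total + s->samples / 2) / s->samples;
}

int main(void)
{
	/* made-up request positions (sectors): a sequential run, then jumps */
	unsigned long long pos[] = { 0, 8, 16, 500000, 500008, 1000000 };
	struct seek_stats s = { 0, 0, 0 };
	int i;

	for (i = 1; i < 6; i++)
		update_seek(&s, pos[i - 1], pos[i]);

	printf("samples=%u mean=%llu sectors\n", s.samples, s.mean);
	return 0;
}

Because the first sample never counts as a seek and early samples are clamped against four times the running mean, a queue only ends up flagged seeky after a sustained run of large jumps, which is what the idle-window logic above keys on.
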
1533 diff -uprN linux-2.6.28/block/bfq-sched.c linux-2.6.28.new/block/bfq-sched.c
1534 --- linux-2.6.28/block/bfq-sched.c      2011-06-13 21:41:23.766107399 +0200
1535 +++ linux-2.6.28.new/block/bfq-sched.c  2011-06-13 21:41:41.639927001 +0200
1536 @@ -28,7 +28,7 @@ static int bfq_update_next_active(struct
1537                 return 0;
1538  
1539         /*
1540 -        * NOTE: this can be improved in may ways, such as returning
1541 +        * NOTE: this can be improved in many ways, such as returning
1542          * 1 (and thus propagating upwards the update) only when the
1543          * budget changes, or caching the bfqq that will be scheduled
1544          * next from this subtree.  By now we worry more about
1545 @@ -85,20 +85,33 @@ static inline void bfq_check_next_active
1546   *
1547   * Return @a > @b, dealing with wrapping correctly.
1548   */
1549 -static inline int bfq_gt(bfq_timestamp_t a, bfq_timestamp_t b)
1550 +static inline int bfq_gt(u64 a, u64 b)
1551  {
1552         return (s64)(a - b) > 0;
1553  }
1554  
1555 +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
1556 +{
1557 +       struct bfq_queue *bfqq = NULL;
1558 +
1559 +       BUG_ON(entity == NULL);
1560 +
1561 +       if (entity->my_sched_data == NULL)
1562 +               bfqq = container_of(entity, struct bfq_queue, entity);
1563 +
1564 +       return bfqq;
1565 +}
1566 +
1567 +
1568  /**
1569   * bfq_delta - map service into the virtual time domain.
1570   * @service: amount of service.
1571 - * @weight: scale factor.
1572 + * @weight: scale factor (weight of an entity or weight sum).
1573   */
1574 -static inline bfq_timestamp_t bfq_delta(bfq_service_t service,
1575 -                                       bfq_weight_t weight)
1576 +static inline u64 bfq_delta(unsigned long service,
1577 +                                       unsigned long weight)
1578  {
1579 -       bfq_timestamp_t d = (bfq_timestamp_t)service << WFQ_SERVICE_SHIFT;
1580 +       u64 d = (u64)service << WFQ_SERVICE_SHIFT;
1581  
1582         do_div(d, weight);
1583         return d;
1584 @@ -110,11 +123,25 @@ static inline bfq_timestamp_t bfq_delta(
1585   * @service: the service to be charged to the entity.
1586   */
1587  static inline void bfq_calc_finish(struct bfq_entity *entity,
1588 -                                  bfq_service_t service)
1589 +                                  unsigned long service)
1590  {
1591 +       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1592 +
1593         BUG_ON(entity->weight == 0);
1594  
1595 -       entity->finish = entity->start + bfq_delta(service, entity->weight);
1596 +       entity->finish = entity->start +
1597 +               bfq_delta(service, entity->weight);
1598 +
1599 +       if (bfqq != NULL) {
1600 +               bfq_log_bfqq(bfqq->bfqd, bfqq,
1601 +                       "calc_finish: serv %lu, w %d, hi-budg %lu",
1602 +                       service, entity->weight,
1603 +                       bfqq->high_weight_budget);
1604 +               bfq_log_bfqq(bfqq->bfqd, bfqq,
1605 +                       "calc_finish: start %llu, finish %llu, delta %llu",
1606 +                       entity->start, entity->finish,
1607 +                       bfq_delta(service, entity->weight));
1608 +       }
1609  }
1610  
1611  /**
1612 @@ -150,18 +177,6 @@ static inline void bfq_extract(struct rb
1613         rb_erase(&entity->rb_node, root);
1614  }
1615  
1616 -static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
1617 -{
1618 -       struct bfq_queue *bfqq = NULL;
1619 -
1620 -       BUG_ON(entity == NULL);
1621 -
1622 -       if (entity->my_sched_data == NULL)
1623 -               bfqq = container_of(entity, struct bfq_queue, entity);
1624 -
1625 -       return bfqq;
1626 -}
1627 -
1628  /**
1629   * bfq_idle_extract - extract an entity from the idle tree.
1630   * @st: the service tree of the owning @entity.
1631 @@ -325,12 +340,26 @@ static void bfq_active_insert(struct bfq
1632   * bfq_ioprio_to_weight - calc a weight from an ioprio.
1633   * @ioprio: the ioprio value to convert.
1634   */
1635 -static bfq_weight_t bfq_ioprio_to_weight(int ioprio)
1636 +static unsigned short bfq_ioprio_to_weight(int ioprio)
1637  {
1638         WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
1639         return IOPRIO_BE_NR - ioprio;
1640  }
1641  
1642 +/**
1643 + * bfq_weight_to_ioprio - calc an ioprio from a weight.
1644 + * @weight: the weight value to convert.
1645 + *
1646 + * To preserve as much as possible the old only-ioprio user interface,
1647 + * 0 is used as an escape ioprio value for weights (numerically) equal
1648 + * to or larger than IOPRIO_BE_NR.
1649 + */
1650 +static unsigned short bfq_weight_to_ioprio(int weight)
1651 +{
1652 +       WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
1653 +       return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
1654 +}
1655 +
1656  static inline void bfq_get_entity(struct bfq_entity *entity)
1657  {
1658         struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1659 @@ -339,6 +368,8 @@ static inline void bfq_get_entity(struct
1660         if (bfqq != NULL) {
1661                 sd = entity->sched_data;
1662                 atomic_inc(&bfqq->ref);
1663 +               bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
1664 +                            bfqq, bfqq->ref);
1665         }
1666  }
1667  
1668 @@ -437,6 +468,8 @@ static void bfq_forget_entity(struct bfq
1669         st->wsum -= entity->weight;
1670         if (bfqq != NULL) {
1671                 sd = entity->sched_data;
1672 +               bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
1673 +                            bfqq, bfqq->ref);
1674                 bfq_put_queue(bfqq);
1675         }
1676  }
1677 @@ -478,6 +511,62 @@ static void bfq_forget_idle(struct bfq_s
1678                 bfq_put_idle_entity(st, first_idle);
1679  }
1680  
1681 +static struct bfq_service_tree *
1682 +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
1683 +                        struct bfq_entity *entity)
1684 +{
1685 +       struct bfq_service_tree *new_st = old_st;
1686 +
1687 +       if (entity->ioprio_changed) {
1688 +               int new_boost_coeff = 1;
1689 +               struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1690 +
1691 +               if (bfqq != NULL) {
1692 +                       new_boost_coeff +=
1693 +                               bfqq->high_weight_budget * BFQ_BOOST_COEFF /
1694 +                               BFQ_BOOST_BUDGET;
1695 +                       bfq_log_bfqq(bfqq->bfqd, bfqq,
1696 +                               "update_w_prio: wght %d, hi-budg %lu, coef %d",
1697 +                               entity->weight, bfqq->high_weight_budget,
1698 +                               new_boost_coeff);
1699 +               }
1700 +
1701 +               BUG_ON(old_st->wsum < entity->weight);
1702 +               old_st->wsum -= entity->weight;
1703 +
1704 +               if (entity->new_weight != entity->orig_weight) {
1705 +                       entity->orig_weight = entity->new_weight;
1706 +                       entity->ioprio =
1707 +                               bfq_weight_to_ioprio(entity->orig_weight);
1708 +               } else if (entity->new_ioprio != entity->ioprio) {
1709 +                       entity->ioprio = entity->new_ioprio;
1710 +                       entity->orig_weight =
1711 +                                       bfq_ioprio_to_weight(entity->ioprio);
1712 +               } else
1713 +                       entity->new_weight = entity->orig_weight =
1714 +                               bfq_ioprio_to_weight(entity->ioprio);
1715 +
1716 +               entity->ioprio_class = entity->new_ioprio_class;
1717 +               entity->ioprio_changed = 0;
1718 +
1719 +               /*
1720 +                * NOTE: here we may be changing the weight too early,
1721 +                * this will cause unfairness.  The correct approach
1722 +                * would have required additional complexity to defer
1723 +                * weight changes to the proper time instants (i.e.,
1724 +                * when entity->finish <= old_st->vtime).
1725 +                */
1726 +               new_st = bfq_entity_service_tree(entity);
1727 +               entity->weight = entity->orig_weight * new_boost_coeff;
1728 +               new_st->wsum += entity->weight;
1729 +
1730 +               if (new_st != old_st)
1731 +                       entity->start = new_st->vtime;
1732 +       }
1733 +
1734 +       return new_st;
1735 +}
1736 +
1737  /**
1738   * bfq_bfqq_served - update the scheduler status after selection for service.
1739   * @bfqq: the queue being served.
1740 @@ -487,7 +576,7 @@ static void bfq_forget_idle(struct bfq_s
1741   * are synchronized every time a new bfqq is selected for service.  By now,
1742   * we keep it to better check consistency.
1743   */
1744 -static void bfq_bfqq_served(struct bfq_queue *bfqq, bfq_service_t served)
1745 +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
1746  {
1747         struct bfq_entity *entity = &bfqq->entity;
1748         struct bfq_service_tree *st;
1749 @@ -496,13 +585,13 @@ static void bfq_bfqq_served(struct bfq_q
1750                 st = bfq_entity_service_tree(entity);
1751  
1752                 entity->service += served;
1753 -
1754                 WARN_ON_ONCE(entity->service > entity->budget);
1755                 BUG_ON(st->wsum == 0);
1756  
1757                 st->vtime += bfq_delta(served, st->wsum);
1758                 bfq_forget_idle(st);
1759         }
1760 +       bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu sects", served);
1761  }
1762  
1763  /**
1764 @@ -515,42 +604,13 @@ static void bfq_bfqq_served(struct bfq_q
1765   * budget.  In this way we should obtain a sort of time-domain
1766   * fairness among all the seeky/slow queues.
1767   */
1768 -static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
1769 +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
1770  {
1771         struct bfq_entity *entity = &bfqq->entity;
1772  
1773 -       bfq_bfqq_served(bfqq, entity->budget - entity->service);
1774 -}
1775 +       bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
1776  
1777 -static struct bfq_service_tree *
1778 -__bfq_entity_update_prio(struct bfq_service_tree *old_st,
1779 -                        struct bfq_entity *entity)
1780 -{
1781 -       struct bfq_service_tree *new_st = old_st;
1782 -
1783 -       if (entity->ioprio_changed) {
1784 -               entity->ioprio = entity->new_ioprio;
1785 -               entity->ioprio_class = entity->new_ioprio_class;
1786 -               entity->ioprio_changed = 0;
1787 -
1788 -               old_st->wsum -= entity->weight;
1789 -               entity->weight = bfq_ioprio_to_weight(entity->ioprio);
1790 -
1791 -               /*
1792 -                * NOTE: here we may be changing the weight too early,
1793 -                * this will cause unfairness.  The correct approach
1794 -                * would have required additional complexity to defer
1795 -                * weight changes to the proper time instants (i.e.,
1796 -                * when entity->finish <= old_st->vtime).
1797 -                */
1798 -               new_st = bfq_entity_service_tree(entity);
1799 -               new_st->wsum += entity->weight;
1800 -
1801 -               if (new_st != old_st)
1802 -                       entity->start = new_st->vtime;
1803 -       }
1804 -
1805 -       return new_st;
1806 +       bfq_bfqq_served(bfqq, entity->budget - entity->service);
1807  }
1808  
1809  /**
1810 @@ -607,7 +667,7 @@ static void __bfq_activate_entity(struct
1811                 entity->on_st = 1;
1812         }
1813  
1814 -       st = __bfq_entity_update_prio(st, entity);
1815 +       st = __bfq_entity_update_weight_prio(st, entity);
1816         bfq_calc_finish(entity, entity->budget);
1817         bfq_active_insert(st, entity);
1818  }
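
The bfq-sched.c hunks above drop the bfq_timestamp_t/bfq_weight_t/bfq_service_t typedefs in favour of plain u64/unsigned long and fold weight boosting into __bfq_entity_update_weight_prio(): while a queue still has high_weight_budget to spend, its effective weight becomes orig_weight * (1 + high_weight_budget * BFQ_BOOST_COEFF / BFQ_BOOST_BUDGET), and bfq_delta() maps served sectors into virtual time as (service << WFQ_SERVICE_SHIFT) / weight. The stand-alone sketch below walks through that arithmetic; it assumes WFQ_SERVICE_SHIFT is 22 (its usual value in bfq.h) and uses illustrative weight and budget values.

#include <stdio.h>

#define WFQ_SERVICE_SHIFT 22		/* assumed, as in bfq.h */
#define BFQ_BOOST_COEFF   10		/* from the bfq.h hunk below */
#define BFQ_BOOST_BUDGET  49152

/* service -> virtual time, as in bfq_delta() (plain division instead
 * of do_div(), since this is a user-space sketch) */
static unsigned long long bfq_delta(unsigned long service, unsigned long weight)
{
	return ((unsigned long long)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	unsigned long orig_weight = 4;		/* ioprio 4 -> weight 8 - 4 */
	unsigned long high_weight_budget = 24576; /* half a boost budget left */

	/* boost coefficient, as in __bfq_entity_update_weight_prio() */
	unsigned long coeff = 1 + high_weight_budget * BFQ_BOOST_COEFF
				/ BFQ_BOOST_BUDGET;
	unsigned long weight = orig_weight * coeff;

	/* finish = start + bfq_delta(budget, weight), as in bfq_calc_finish() */
	unsigned long long start = 0, budget = 16384;

	printf("coeff=%lu weight=%lu finish=%llu\n",
	       coeff, weight, start + bfq_delta(budget, weight));
	return 0;
}

A larger effective weight shrinks bfq_delta(), so a boosted queue accumulates virtual time more slowly and is picked ahead of equally backlogged peers until its high_weight_budget is used up.
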
1819 diff -uprN linux-2.6.28/block/bfq.h linux-2.6.28.new/block/bfq.h
1820 --- linux-2.6.28/block/bfq.h    2011-06-13 21:41:23.766107399 +0200
1821 +++ linux-2.6.28.new/block/bfq.h        2011-06-13 21:41:41.639927001 +0200
1822 @@ -1,5 +1,5 @@
1823  /*
1824 - * BFQ: data structures and common functions prototypes.
1825 + * BFQ-v1-r1 for 2.6.28: data structures and common functions prototypes.
1826   *
1827   * Based on ideas and code from CFQ:
1828   * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
1829 @@ -21,12 +21,22 @@
1830  
1831  #define BFQ_IOPRIO_CLASSES     3
1832  
1833 -#define BFQ_DEFAULT_GRP_IOPRIO 4
1834 +#define BFQ_MIN_WEIGHT 1
1835 +#define BFQ_MAX_WEIGHT 1000
1836 +
1837 +#define BFQ_DEFAULT_GRP_WEIGHT 10
1838 +#define BFQ_DEFAULT_GRP_IOPRIO 0
1839  #define BFQ_DEFAULT_GRP_CLASS  IOPRIO_CLASS_BE
1840  
1841 -typedef u64 bfq_timestamp_t;
1842 -typedef unsigned long bfq_weight_t;
1843 -typedef unsigned long bfq_service_t;
1844 +/* Constants used in weight boosting (in turn used to reduce latencies): */
1845 +/* max factor by which the weight of a boosted queue is multiplied */
1846 +#define BFQ_BOOST_COEFF        10
1847 +/* max number of sectors that can be served during a boosting period */
1848 +#define BFQ_BOOST_BUDGET       49152
1849 +/* max duration of a boosting period, msec */
1850 +#define BFQ_BOOST_TIMEOUT      6000
1851 +/* min idle period after which boosting may be reactivated for a queue, msec */
1852 +#define BFQ_MIN_ACT_INTERVAL   20000
1853  
1854  struct bfq_entity;
1855  
1856 @@ -51,8 +61,8 @@ struct bfq_service_tree {
1857         struct bfq_entity *first_idle;
1858         struct bfq_entity *last_idle;
1859  
1860 -       bfq_timestamp_t vtime;
1861 -       bfq_weight_t wsum;
1862 +       u64 vtime;
1863 +       unsigned long wsum;
1864  };
1865  
1866  /**
1867 @@ -92,17 +102,19 @@ struct bfq_sched_data {
1868   *             this entity; used for O(log N) lookups into active trees.
1869   * @service: service received during the last round of service.
1870   * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
1871 - * @weight: weight of the queue, calculated as IOPRIO_BE_NR - @ioprio.
1872 + * @weight: weight of the queue
1873   * @parent: parent entity, for hierarchical scheduling.
1874   * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
1875   *                 associated scheduler queue, %NULL on leaf nodes.
1876   * @sched_data: the scheduler queue this entity belongs to.
1877   * @ioprio: the ioprio in use.
1878 - * @new_ioprio: when an ioprio change is requested, the new ioprio value
1879 + * @new_weight: when a weight change is requested, the new weight value.
1880 + * @orig_weight: original weight, used to implement weight boosting
1881 + * @new_ioprio: when an ioprio change is requested, the new ioprio value.
1882   * @ioprio_class: the ioprio_class in use.
1883   * @new_ioprio_class: when an ioprio_class change is requested, the new
1884   *                    ioprio_class value.
1885 - * @ioprio_changed: flag, true when the user requested an ioprio or
1886 + * @ioprio_changed: flag, true when the user requested a weight, ioprio or
1887   *                  ioprio_class change.
1888   *
1889   * A bfq_entity is used to represent either a bfq_queue (leaf node in the
1890 @@ -111,36 +123,39 @@ struct bfq_sched_data {
1891   * hierarchy.  Non-leaf entities have also their own sched_data, stored
1892   * in @my_sched_data.
1893   *
1894 - * Each entity stores independently its priority values; this would allow
1895 - * different weights on different devices, but this functionality is not
1896 - * exported to userspace by now.  Priorities are updated lazily, first
1897 - * storing the new values into the new_* fields, then setting the
1898 - * @ioprio_changed flag.  As soon as there is a transition in the entity
1899 - * state that allows the priority update to take place the effective and
1900 - * the requested priority values are synchronized.
1901 - *
1902 - * The weight value is calculated from the ioprio to export the same
1903 - * interface as CFQ.  When dealing with ``well-behaved'' queues (i.e.,
1904 - * queues that do not spend too much time to consume their budget and
1905 - * have true sequential behavior, and when there are no external factors
1906 - * breaking anticipation) the relative weights at each level of the
1907 - * cgroups hierarchy should be guaranteed.
1908 - * All the fields are protected by the queue lock of the containing bfqd.
1909 + * Each entity stores independently its priority values; this would
1910 + * allow different weights on different devices, but this
1911 + * functionality is not exported to userspace by now.  Priorities and
1912 + * weights are updated lazily, first storing the new values into the
1913 + * new_* fields, then setting the @ioprio_changed flag.  As soon as
1914 + * there is a transition in the entity state that allows the priority
1915 + * update to take place the effective and the requested priority
1916 + * values are synchronized.
1917 + *
1918 + * Unless cgroups are used, the weight value is calculated from the
1919 + * ioprio to export the same interface as CFQ.  When dealing with
1920 + * ``well-behaved'' queues (i.e., queues that do not spend too much
1921 + * time to consume their budget and have true sequential behavior, and
1922 + * when there are no external factors breaking anticipation) the
1923 + * relative weights at each level of the cgroups hierarchy should be
1924 + * guaranteed.  All the fields are protected by the queue lock of the
1925 + * containing bfqd.
1926   */
1927  struct bfq_entity {
1928         struct rb_node rb_node;
1929  
1930         int on_st;
1931  
1932 -       bfq_timestamp_t finish;
1933 -       bfq_timestamp_t start;
1934 +       u64 finish;
1935 +       u64 start;
1936  
1937         struct rb_root *tree;
1938  
1939 -       bfq_timestamp_t min_start;
1940 +       u64 min_start;
1941  
1942 -       bfq_service_t service, budget;
1943 -       bfq_weight_t weight;
1944 +       unsigned long service, budget;
1945 +       unsigned short weight, new_weight;
1946 +       unsigned short orig_weight;
1947  
1948         struct bfq_entity *parent;
1949  
1950 @@ -168,6 +183,7 @@ struct bfq_group;
1951   *                   completed requests .
1952   * @hw_tag_samples: nr of samples used to calculate hw_tag.
1953   * @hw_tag: flag set to one if the driver is showing a queueing behavior.
1954 + * @budgets_assigned: number of budgets assigned.
1955   * @idle_slice_timer: timer set when idling for the next sequential request
1956   *                    from the queue under service.
1957   * @unplug_work: delayed work to restart dispatching on the request queue.
1958 @@ -216,6 +232,8 @@ struct bfq_data {
1959         int hw_tag_samples;
1960         int hw_tag;
1961  
1962 +       int budgets_assigned;
1963 +
1964         struct timer_list idle_slice_timer;
1965         struct work_struct unplug_work;
1966  
1967 @@ -228,7 +246,7 @@ struct bfq_data {
1968         ktime_t last_idling_start;
1969         int peak_rate_samples;
1970         u64 peak_rate;
1971 -       bfq_service_t bfq_max_budget;
1972 +       unsigned long bfq_max_budget;
1973  
1974         struct list_head cic_list;
1975         struct hlist_head group_list;
1976 @@ -244,6 +262,8 @@ struct bfq_data {
1977         unsigned int bfq_user_max_budget;
1978         unsigned int bfq_max_budget_async_rq;
1979         unsigned int bfq_timeout[2];
1980 +
1981 +       bool low_latency;
1982  };
1983  
1984  /**
1985 @@ -260,12 +280,17 @@ struct bfq_data {
1986   * @max_budget: maximum budget allowed from the feedback mechanism.
1987   * @budget_timeout: budget expiration (in jiffies).
1988   * @dispatched: number of requests on the dispatch list or inside driver.
1989 - * @budgets_assigned: number of budgets assigned.
1990   * @org_ioprio: saved ioprio during boosted periods.
1991   * @org_ioprio_class: saved ioprio_class during boosted periods.
1992   * @flags: status flags.
1993   * @bfqq_list: node for active/idle bfqq list inside our bfqd.
1994 + * @seek_samples: number of seeks sampled
1995 + * @seek_total: sum of the distances of the seeks sampled
1996 + * @seek_mean: mean seek distance
1997 + * @last_request_pos: position of the last request enqueued
1998   * @pid: pid of the process owning the queue, used for logging purposes.
1999 + * @last_activation_time: time of the last (idle -> backlogged) transition
2000 + * @high_weight_budget: number of sectors left to serve with boosted weight
2001   *
2002   * A bfq_queue is a leaf request queue; it can be associated to an io_context
2003   * or more (if it is an async one).  @cgroup holds a reference to the
2004 @@ -287,11 +312,10 @@ struct bfq_queue {
2005  
2006         struct bfq_entity entity;
2007  
2008 -       bfq_service_t max_budget;
2009 +       unsigned long max_budget;
2010         unsigned long budget_timeout;
2011  
2012         int dispatched;
2013 -       int budgets_assigned;
2014  
2015         unsigned short org_ioprio;
2016         unsigned short org_ioprio_class;
2017 @@ -300,7 +324,15 @@ struct bfq_queue {
2018  
2019         struct list_head bfqq_list;
2020  
2021 +       unsigned int seek_samples;
2022 +       u64 seek_total;
2023 +       sector_t seek_mean;
2024 +       sector_t last_request_pos;
2025 +
2026         pid_t pid;
2027 +
2028 +       u64 last_activation_time;
2029 +       unsigned long high_weight_budget;
2030  };
2031  
2032  enum bfqq_state_flags {
2033 @@ -400,6 +432,7 @@ struct bfq_group {
2034  /**
2035   * struct bfqio_cgroup - bfq cgroup data structure.
2036   * @css: subsystem state for bfq in the containing cgroup.
2037 + * @weight: cgroup weight.
2038   * @ioprio: cgroup ioprio.
2039   * @ioprio_class: cgroup ioprio_class.
2040   * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
2041 @@ -411,7 +444,7 @@ struct bfq_group {
2042  struct bfqio_cgroup {
2043         struct cgroup_subsys_state css;
2044  
2045 -       unsigned short ioprio, ioprio_class;
2046 +       unsigned short weight, ioprio, ioprio_class;
2047  
2048         spinlock_t lock;
2049         struct hlist_head group_data;