diff -uprN linux-2.6.28/block/bfq-cgroup.c linux-2.6.28.new/block/bfq-cgroup.c
--- linux-2.6.28/block/bfq-cgroup.c 2011-06-13 21:41:23.758108202 +0200
+++ linux-2.6.28.new/block/bfq-cgroup.c 2011-06-13 21:41:41.631928601 +0200
* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
* Paolo Valente <paolo.valente@unimore.it>
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
#ifdef CONFIG_CGROUP_BFQIO
static struct bfqio_cgroup bfqio_root_cgroup = {
+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
.ioprio = BFQ_DEFAULT_GRP_IOPRIO,
.ioprio_class = BFQ_DEFAULT_GRP_CLASS,
@@ -17,6 +20,8 @@ static struct bfqio_cgroup bfqio_root_cg
static inline void bfq_init_entity(struct bfq_entity *entity,
struct bfq_group *bfqg)
+ entity->weight = entity->new_weight;
+ entity->orig_weight = entity->new_weight;
entity->ioprio = entity->new_ioprio;
entity->ioprio_class = entity->new_ioprio_class;
entity->parent = bfqg->my_entity;
@@ -54,6 +59,8 @@ static inline void bfq_group_init_entity
struct bfq_entity *entity = &bfqg->entity;
+ entity->weight = entity->new_weight = bgrp->weight;
+ entity->orig_weight = entity->new_weight;
entity->ioprio = entity->new_ioprio = bgrp->ioprio;
entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
entity->ioprio_changed = 1;
@@ -301,6 +308,9 @@ static struct bfq_group *__bfq_cic_chang
if (entity->sched_data != &bfqg->sched_data) {
cic_set_bfqq(cic, NULL, 0);
+ bfq_log_bfqq(bfqd, async_bfqq,
+ "cic_change_group: %p %d",
+ async_bfqq, async_bfqq->ref);
bfq_put_queue(async_bfqq);
@@ -453,6 +463,7 @@ static void bfq_disconnect_groups(struct
struct hlist_node *pos, *n;
struct bfq_group *bfqg;
+ bfq_log(bfqd, "disconnect_groups beginning");
hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) {
hlist_del(&bfqg->bfqd_node);
@@ -466,6 +477,9 @@ static void bfq_disconnect_groups(struct
rcu_assign_pointer(bfqg->bfqd, NULL);
+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
bfq_put_async_queues(bfqd, bfqg);
@@ -475,6 +489,8 @@ static inline void bfq_free_root_group(s
struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
struct bfq_group *bfqg = bfqd->root_group;
+ bfq_put_async_queues(bfqd, bfqg);
spin_lock_irq(&bgrp->lock);
hlist_del_rcu(&bfqg->group_node);
spin_unlock_irq(&bgrp->lock);
@@ -529,6 +545,7 @@ static u64 bfqio_cgroup_##__VAR##_read(s
+SHOW_FUNCTION(weight);
SHOW_FUNCTION(ioprio);
SHOW_FUNCTION(ioprio_class);
@@ -551,9 +568,9 @@ static int bfqio_cgroup_##__VAR##_write(
bgrp = cgroup_to_bfqio(cgroup); \
spin_lock_irq(&bgrp->lock); \
- bgrp->__VAR = (unsigned char)val; \
+ bgrp->__VAR = (unsigned short)val; \
hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) { \
- bfqg->entity.new_##__VAR = (unsigned char)val; \
+ bfqg->entity.new_##__VAR = (unsigned short)val; \
bfqg->entity.ioprio_changed = 1; \
@@ -564,12 +581,18 @@ static int bfqio_cgroup_##__VAR##_write(
+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
#undef STORE_FUNCTION
static struct cftype bfqio_files[] = {
+ .read_u64 = bfqio_cgroup_weight_read,
+ .write_u64 = bfqio_cgroup_weight_write,
.read_u64 = bfqio_cgroup_ioprio_read,
.write_u64 = bfqio_cgroup_ioprio_write,
@@ -697,6 +720,8 @@ struct cgroup_subsys bfqio_subsys = {
static inline void bfq_init_entity(struct bfq_entity *entity,
struct bfq_group *bfqg)
+ entity->weight = entity->new_weight;
+ entity->orig_weight = entity->new_weight;
entity->ioprio = entity->new_ioprio;
entity->ioprio_class = entity->new_ioprio_class;
entity->sched_data = &bfqg->sched_data;
diff -uprN linux-2.6.28/block/bfq-iosched.c linux-2.6.28.new/block/bfq-iosched.c
--- linux-2.6.28/block/bfq-iosched.c 2011-06-13 21:41:23.766107399 +0200
+++ linux-2.6.28.new/block/bfq-iosched.c 2011-06-13 21:42:46.519829365 +0200
* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
* Paolo Valente <paolo.valente@unimore.it>
- * BFQ is a proportional share disk scheduling algorithm based on CFQ,
- * that uses the B-WF2Q+ internal scheduler to assign budgets (i.e.,
- * slices in the service domain) to the tasks accessing the disk. It
- * has been introduced in [1], where the interested reader can find an
- * accurate description of the algorithm, the guarantees it provides
- * and their formal proofs. With respect to the algorithm presented
- * in the paper, this implementation adds a timeout to limit the maximum
- * time a queue can spend to complete its assigned budget, and a
- * hierarchical extension, based on H-WF2Q+.
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
+ * BFQ is a proportional share disk scheduling algorithm based on the
+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
+ * measured in number of sectors, to tasks instead of time slices.
+ * The disk is not granted to the active task for a given time slice,
+ * but until it has exhausted its assigned budget. This change from
+ * the time to the service domain allows BFQ to distribute the disk
+ * bandwidth among tasks as desired, without any distortion due to
+ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
+ * internal scheduler, called B-WF2Q+, to schedule tasks according to
+ * their budgets. Thanks to this accurate scheduler, BFQ can afford
+ * to assign high budgets to disk-bound non-seeky tasks (to boost the
+ * throughput), and yet guarantee low latencies to interactive and
+ * soft real-time applications.
+ * BFQ has been introduced in [1], where the interested reader can
+ * find an accurate description of the algorithm, the bandwidth
+ * distribution and latency guarantees it provides, plus formal proofs
+ * of all the properties. With respect to the algorithm presented in
+ * the paper, this implementation adds several little heuristics, and
+ * a hierarchical extension, based on H-WF2Q+.
* B-WF2Q+ is based on WF2Q+, that is described in [2], together with
* H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
* complexity derives from the one introduced with EEVDF in [3].
* [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
- * with Deterministic Guarantees on Bandwidth Distribution,'' to be
+ * with Deterministic Guarantees on Bandwidth Distribution,'' to appear
+ * in IEEE Transactions on Computers.
* http://algo.ing.unimo.it/people/paolo/disk_sched/bfq.pdf
* http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
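The switch from the time domain to the service domain described above can be made concrete with a toy computation: with equal budgets in sectors, two equally weighted readers receive the same number of sectors per round even when one of them reads from a slow inner zone, which is exactly the ZBR distortion that time slices cannot avoid. A minimal userspace sketch, with purely illustrative transfer rates:

#include <stdio.h>

/* Toy model: equal budgets in the service domain yield a 1:1 sector
 * split even when one task reads from a slow (inner) disk zone. */
int main(void)
{
	long budget = 16 * 1024;	/* sectors per round, both tasks */
	double rate_outer = 100e6;	/* bytes/s, fast zone (illustrative) */
	double rate_inner = 50e6;	/* bytes/s, slow zone (illustrative) */

	double t_fast = budget * 512 / rate_outer;
	double t_slow = budget * 512 / rate_inner;

	printf("both tasks get %ld sectors per round\n", budget);
	printf("fast-zone task holds the disk %.3fs, slow-zone task %.3fs\n",
	       t_fast, t_slow);
	/* A time-slice scheduler would instead give the fast-zone task
	 * roughly twice as many sectors in the same wall-clock time. */
	return 0;
}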
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ioprio.h>
-/* Max nr of dispatches in one round of service. */
+/* Max number of dispatches in one round of service. */
static const int bfq_quantum = 4;
-/* Expiration time of each request (jiffies). */
+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* Maximum backwards seek, in KiB. */
static const int bfq_back_max = 16 * 1024;
-/* Penalty of a backwards seek. */
+/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;
-/* Idling period duration (jiffies). */
+/* Idling period duration, in jiffies. */
static int bfq_slice_idle = HZ / 125;
-/* Default maximum budget values (sectors). */
-static const int bfq_max_budget = 16 * 1024;
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
static const int bfq_max_budget_async_rq = 4;
-/* Default timeout values (jiffies), approximating CFQ defaults. */
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
+static const int bfq_async_charge_factor = 10;
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout_sync = HZ / 8;
static int bfq_timeout_async = HZ / 25;
@@ -84,8 +105,7 @@ static DEFINE_SPINLOCK(bfq_ioc_gone_lock
#define BFQ_HW_QUEUE_THRESHOLD 4
#define BFQ_HW_QUEUE_SAMPLES 32
-/* Budget feedback step. */
-#define BFQ_BUDGET_STEP 128
+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > (8 * 1024))
/* Min samples used for peak rate estimation (for autotuning). */
#define BFQ_PEAK_RATE_SAMPLES 32
@@ -104,15 +124,10 @@ static DEFINE_SPINLOCK(bfq_ioc_gone_lock
#include "bfq-sched.c"
#include "bfq-cgroup.c"
-static inline int bfq_class_idle(struct bfq_queue *bfqq)
- return bfqq->entity.ioprio_class == IOPRIO_CLASS_IDLE;
+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
-static inline int bfq_sample_valid(int samples)
- return samples > 80;
+#define bfq_sample_valid(samples) ((samples) > 80)
* We regard a request as SYNC, if either it's a read or has the SYNC bit
@@ -180,7 +195,7 @@ static struct request *bfq_choose_req(st
last = bfqd->last_position;
- * by definition, 1KiB is 2 sectors
+ * By definition, 1KiB is 2 sectors.
back_max = bfqd->bfq_back_max * 2;
@@ -282,17 +297,24 @@ static void bfq_del_rq_rb(struct request
bfq_del_bfqq_busy(bfqd, bfqq, 1);
+/* see the definition of bfq_async_charge_factor for details */
+static inline unsigned long bfq_serv_to_charge(struct request *rq,
+ struct bfq_queue *bfqq)
+ return rq->hard_nr_sectors *
+ (1 + ((!bfq_bfqq_sync(bfqq)) * bfq_async_charge_factor));
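A quick standalone check of the charging rule implemented by bfq_serv_to_charge() above (a userspace mirror, not the kernel function; sync stands in for bfq_bfqq_sync()): with bfq_async_charge_factor = 10, a 64-sector sync request is charged 64 sectors, while an async one is charged 64 * 11 = 704.

#include <stdio.h>

static const int bfq_async_charge_factor = 10;

/* Standalone mirror of bfq_serv_to_charge(): async requests are
 * charged (factor + 1) times their size in sectors. */
static unsigned long serv_to_charge(unsigned long nr_sectors, int sync)
{
	return nr_sectors * (1 + (!sync) * bfq_async_charge_factor);
}

int main(void)
{
	printf("sync 64-sector rq  -> charge %lu\n", serv_to_charge(64, 1));
	printf("async 64-sector rq -> charge %lu\n", serv_to_charge(64, 0));
	return 0;
}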
* bfq_updated_next_req - update the queue after a new next_rq selection.
* @bfqd: the device data the queue belongs to.
* @bfqq: the queue to update.
- * Whenever the first request of a queue changes we try to allocate it
- * enough service (if it has grown), or to anticipate its finish time
- * (if it has shrinked), to reduce the time it has to wait, still taking
- * into account the queue budget. We try to avoid the queue having not
- * enough service allocated for its first request, thus having to go
- * through two dispatch rounds to actually dispatch the request.
+ * If the first request of a queue changes we make sure that the queue
+ * has enough budget to serve at least its first request (if the
+ * request has grown). We do this because if the queue has not enough
+ * budget for its first request, it has to go through two dispatch
+ * rounds to actually get it dispatched.
static void bfq_updated_next_req(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
@@ -300,7 +322,7 @@ static void bfq_updated_next_req(struct
struct bfq_entity *entity = &bfqq->entity;
struct bfq_service_tree *st = bfq_entity_service_tree(entity);
struct request *next_rq = bfqq->next_rq;
- bfq_service_t new_budget;
+ unsigned long new_budget;
@@ -315,9 +337,10 @@ static void bfq_updated_next_req(struct
BUG_ON(entity->tree != &st->active);
BUG_ON(entity == entity->sched_data->active_entity);
- new_budget = max(bfqq->max_budget, next_rq->hard_nr_sectors);
+ new_budget = max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(next_rq, bfqq));
entity->budget = new_budget;
- bfq_log_bfqq(bfqd, bfqq, "budget=%lu", new_budget);
+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
bfq_activate_bfqq(bfqd, bfqq);
@@ -328,6 +351,7 @@ static void bfq_add_rq_rb(struct request
struct bfq_data *bfqd = bfqq->bfqd;
struct request *__alias, *next_rq;
+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
bfqq->queued[rq_is_sync(rq)]++;
@@ -339,15 +363,33 @@ static void bfq_add_rq_rb(struct request
bfq_dispatch_insert(bfqd->queue, __alias);
- * check if this request is a better next-serve candidate
+ * Check if this request is a better next-serve candidate.
next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq);
BUG_ON(next_rq == NULL);
bfqq->next_rq = next_rq;
if (!bfq_bfqq_busy(bfqq)) {
- entity->budget = max(bfqq->max_budget,
- next_rq->hard_nr_sectors);
+ entity->budget = max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(next_rq, bfqq));
+ * If the queue is not being boosted and has been idle
+ * for enough time, start a boosting period
+ if (bfqd->low_latency && bfqq->high_weight_budget == 0) {
+ if (bfqq->last_activation_time + BFQ_MIN_ACT_INTERVAL <
+ jiffies_to_msecs(jiffies)) {
+ bfqq->high_weight_budget = BFQ_BOOST_BUDGET;
+ entity->ioprio_changed = 1;
+ bfq_log_bfqq(bfqd, bfqq,
+ "wboost starting at %lu msec",
+ bfqq->last_activation_time);
+ bfqq->last_activation_time =
+ jiffies_to_msecs(jiffies);
bfq_add_bfqq_busy(bfqd, bfqq);
bfq_updated_next_req(bfqd, bfqq);
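The boosting trigger above compares the current time in msec against last_activation_time + BFQ_MIN_ACT_INTERVAL. Both BFQ_MIN_ACT_INTERVAL and BFQ_BOOST_BUDGET are defined elsewhere in the patch (not visible in this hunk), so the sketch below assumes illustrative values only to show the triggering logic:

#include <stdio.h>

/* Assumed values, for illustration only; the real constants are not
 * visible in the hunk above. */
#define BFQ_MIN_ACT_INTERVAL	2000UL		/* msec, assumed */
#define BFQ_BOOST_BUDGET	(16UL * 1024)	/* sectors, assumed */

struct toy_queue {
	unsigned long last_activation_time;	/* msec */
	unsigned long high_weight_budget;	/* sectors */
};

/* Sketch of the activation check: start a boost period only if the
 * queue has been idle longer than BFQ_MIN_ACT_INTERVAL. */
static void maybe_boost(struct toy_queue *q, unsigned long now_msec)
{
	if (q->high_weight_budget == 0 &&
	    q->last_activation_time + BFQ_MIN_ACT_INTERVAL < now_msec)
		q->high_weight_budget = BFQ_BOOST_BUDGET;
	q->last_activation_time = now_msec;
}

int main(void)
{
	struct toy_queue q = { .last_activation_time = 0 };

	maybe_boost(&q, 5000);	/* idle 5s -> boosted */
	printf("after long idle: high_weight_budget=%lu\n",
	       q.high_weight_budget);

	q.high_weight_budget = 0;
	maybe_boost(&q, 5500);	/* idle 0.5s -> not boosted */
	printf("after short idle: high_weight_budget=%lu\n",
	       q.high_weight_budget);
	return 0;
}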
@@ -446,7 +488,7 @@ static void bfq_merged_requests(struct r
struct request *next)
- * reposition in fifo if next is older than rq
+ * Reposition in fifo if next is older than rq.
if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
time_before(next->start_time, rq->start_time))
@@ -475,10 +517,7 @@ static int bfq_allow_merge(struct reques
bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio));
- if (bfqq == RQ_BFQQ(rq))
+ return bfqq == RQ_BFQQ(rq);
static void __bfq_set_active_queue(struct bfq_data *bfqd,
@@ -489,9 +528,10 @@ static void __bfq_set_active_queue(struc
bfq_mark_bfqq_budget_new(bfqq);
bfq_clear_bfqq_fifo_expire(bfqq);
- bfqq->budgets_assigned = (bfqq->budgets_assigned*7 + 256) / 8;
+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
- bfq_log_bfqq(bfqd, bfqq, "active");
+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
+ bfqq->entity.budget);
bfqd->active_queue = bfqq;
@@ -509,7 +549,26 @@ static struct bfq_queue *bfq_set_active_
-#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
+ * If enough samples have been computed, return the current max budget
+ * stored in bfqd, which is dynamically updated according to the
+ * estimated disk peak rate; otherwise return the default max budget
+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
+ return bfqd->budgets_assigned < 194 ? bfq_default_max_budget :
+ bfqd->bfq_max_budget;
+ * Return min budget, which is a fraction of the current or default
+ * max budget (trying with 1/32)
+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
+ return bfqd->budgets_assigned < 194 ? bfq_default_max_budget / 32 :
+ bfqd->bfq_max_budget / 32;
static void bfq_arm_slice_timer(struct bfq_data *bfqd)
@@ -531,19 +590,30 @@ static void bfq_arm_slice_timer(struct b
bfq_mark_bfqq_wait_request(bfqq);
- * we don't want to idle for seeks, but we do want to allow
+ * We don't want to idle for seeks, but we do want to allow
* fair distribution of slice time for a process doing back-to-back
- * seeks. so allow a little bit of time for him to submit a new rq
+ * seeks. So allow a little bit of time for him to submit a new rq.
+ * To prevent processes with (partly) seeky workloads from
+ * being too ill-treated, grant them a small fraction of the
+ * assigned budget before reducing the waiting time to
+ * BFQ_MIN_TT. This happened to help reduce latency.
sl = bfqd->bfq_slice_idle;
- if (bfq_sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
+ bfqq->entity.service > bfq_max_budget(bfqd) / 8)
sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
bfqd->last_idling_start = ktime_get();
mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
- bfq_log(bfqd, "arm idle: %lu", sl);
+ bfq_log(bfqd, "arm idle: %lu ms", sl);
+ * Set the maximum time for the active queue to consume its
+ * budget. This prevents seeky processes from lowering the disk
+ * throughput (always guaranteed with a time slice scheme as in CFQ).
static void bfq_set_budget_timeout(struct bfq_data *bfqd)
struct bfq_queue *bfqq = bfqd->active_queue;
@@ -552,7 +622,8 @@ static void bfq_set_budget_timeout(struc
bfq_clear_bfqq_budget_new(bfqq);
bfqq->budget_timeout = jiffies +
- bfqd->bfq_timeout[!!bfq_bfqq_sync(bfqq)];
+ bfqd->bfq_timeout[!!bfq_bfqq_sync(bfqq)] *
+ (bfqq->entity.weight / bfqq->entity.orig_weight);
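Note that the scaling factor entity->weight / entity->orig_weight is an integer division, so an unboosted queue (weight == orig_weight) keeps the plain timeout while a boosted queue gets a proportionally longer one. A minimal check of the arithmetic, assuming HZ = 1000:

#include <stdio.h>

int main(void)
{
	unsigned long timeout_sync = 1000 / 8;	/* HZ/8 with HZ=1000 */
	unsigned long orig_weight = 4;

	/* Integer ratio weight/orig_weight scales the budget timeout. */
	for (unsigned long weight = 4; weight <= 12; weight += 4)
		printf("weight %lu/%lu -> budget timeout %lu jiffies\n",
		       weight, orig_weight,
		       timeout_sync * (weight / orig_weight));
	return 0;
}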
@@ -572,7 +643,7 @@ static void bfq_dispatch_insert(struct r
- * return expired entry, or NULL to just start from scratch in rbtree
+ * Return expired entry, or NULL to just start from scratch in rbtree.
static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
@@ -597,7 +668,7 @@ static struct request *bfq_check_fifo(st
-static inline bfq_service_t bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
struct bfq_entity *entity = &bfqq->entity;
return entity->budget - entity->service;
@@ -616,98 +687,115 @@ static void __bfq_bfqq_expire(struct bfq
- * bfq_default_budget - return the default budget for @bfqq on @bfqd.
- * @bfqd: the device descriptor.
- * @bfqq: the queue to consider.
- * We use 3/4 of the @bfqd maximum budget as the default value
- * for the max_budget field of the queues. This lets the feedback
- * mechanism to start from some middle ground, then the behavior
- * of the task will drive the heuristics towards high values, if
- * it behaves as a greedy sequential reader, or towards small values
- * if it shows a more intermittent behavior.
-static bfq_service_t bfq_default_budget(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
- bfq_service_t budget;
- * When we need an estimate of the peak rate we need to avoid
- * to give budgets that are too short due to previous measurements.
- * So, in the first 10 assignments use a ``safe'' budget value.
- if (bfqq->budgets_assigned < 194 && bfqd->bfq_user_max_budget == 0)
- budget = bfq_max_budget;
- budget = bfqd->bfq_max_budget;
- return budget - budget / 4;
-static inline bfq_service_t bfq_min_budget(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
- return bfqd->bfq_max_budget / 2;
* __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
* @bfqd: device data.
* @bfqq: queue to update.
* @reason: reason for expiration.
- * Handle the feedback on @bfqq budget. This is driven by the following
- * - async queues get always the maximum budget value (their ability to
- * dispatch is limited by @bfqd->bfq_max_budget_async_rq).
- * - If @bfqq has been too idle we decrease its budget, as it is likely
- * to be more interested in latency than in throughput.
- * - If @bfqq took too much to consume its budget it is likely to be
- * seeky, so reset the budget to the default, in order to have all
- * the seeky queues to be charged for the same service, trying to
- * achieve fairness at least in the time domain among them.
- * - If @bfqq exhausted its budget treat it as a greedy reader, in
- * order to run it at full speed.
- * - If @bfqq expired due to lack of requests leave its budget untouched.
+ * Handle the feedback on @bfqq budget. See the body for detailed
static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
enum bfqq_expiration reason)
struct request *next_rq;
- bfq_service_t budget, min_budget;
+ unsigned long budget, min_budget;
budget = bfqq->max_budget;
- min_budget = bfq_min_budget(bfqd, bfqq);
+ min_budget = bfq_min_budget(bfqd);
BUG_ON(bfqq != bfqd->active_queue);
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
+ budget, bfq_min_budget(bfqd));
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
if (bfq_bfqq_sync(bfqq)) {
+ * Caveat: in all the following cases we trade latency
case BFQ_BFQQ_TOO_IDLE:
- if (budget > min_budget + BFQ_BUDGET_STEP)
- budget -= BFQ_BUDGET_STEP;
- budget = min_budget;
+ * This is the only case where we may reduce
+ * the budget: if there is no request of the
+ * process still waiting for completion, then
+ * we assume (tentatively) that the timer has
+ * expired because the batch of requests of
+ * the process could have been served with a
+ * smaller budget. Hence, betting that
+ * process will behave in the same way when it
+ * becomes backlogged again, we reduce its
+ * next budget. As long as we guess right,
+ * this budget cut reduces the latency
+ * experienced by the process.
+ * However, if there are still outstanding
+ * requests, then the process may have not yet
+ * issued its next request just because it is
+ * still waiting for the completion of some of
+ * the still outstanding ones. So in this
+ * subcase we do not reduce its budget, on the
+ * contrary we increase it to possibly boost
+ * the throughput, as discussed in the
+ * comments to the BUDGET_TIMEOUT case.
+ if (bfqq->dispatched > 0) /* still outstanding reqs */
+ budget = min(budget * 2, bfqd->bfq_max_budget);
+ if (budget > 5 * min_budget)
+ budget -= 4 * min_budget;
+ budget = min_budget;
case BFQ_BFQQ_BUDGET_TIMEOUT:
- budget = bfq_default_budget(bfqd, bfqq);
+ * We double the budget here because: 1) it
+ * gives the chance to boost the throughput if
+ * this is not a seeky process (which may have
+ * bumped into this timeout because of, e.g.,
+ * ZBR), 2) together with charge_full_budget
+ * it helps give seeky processes higher
+ * timestamps, and hence be served less
+ budget = min(budget * 2, bfqd->bfq_max_budget);
case BFQ_BFQQ_BUDGET_EXHAUSTED:
- budget = min(budget + 8 * BFQ_BUDGET_STEP,
- bfqd->bfq_max_budget);
+ * The process still has backlog, and did not
+ * let either the budget timeout or the disk
+ * idling timeout expire. Hence it is not
+ * seeky, has a short thinktime and may be
+ * happy with a higher budget too. So
+ * definitely increase the budget of this good
+ * candidate to boost the disk throughput.
+ budget = min(budget * 4, bfqd->bfq_max_budget);
case BFQ_BFQQ_NO_MORE_REQUESTS:
+ * Leave the budget unchanged.
+ } else /* async queue */
+ /* async queues get always the maximum possible budget
+ * (their ability to dispatch is limited by
+ * @bfqd->bfq_max_budget_async_rq).
budget = bfqd->bfq_max_budget;
bfqq->max_budget = budget;
- if (bfqq->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
bfqq->max_budget > bfqd->bfq_max_budget)
bfqq->max_budget = bfqd->bfq_max_budget;
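Taken together, the sync cases above implement multiplicative feedback on max_budget: x2 on a budget timeout (or on an idle timeout with requests still outstanding), x4 on budget exhaustion, a cut of 4 * min_budget when the queue went idle with nothing in flight, and no change when it simply ran out of requests, everything clamped to bfq_max_budget. A standalone sketch of those update rules (the else-branches elided in the hunk are assumed; values illustrative):

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* Sketch of the sync-queue feedback rules in __bfq_bfqq_recalc_budget(). */
static unsigned long recalc(unsigned long budget, unsigned long min_budget,
			    unsigned long max_budget, enum reason r,
			    int dispatched)
{
	switch (r) {
	case TOO_IDLE:
		if (dispatched > 0)		/* still outstanding reqs */
			return min_ul(budget * 2, max_budget);
		if (budget > 5 * min_budget)	/* cut the budget */
			return budget - 4 * min_budget;
		return min_budget;
	case BUDGET_TIMEOUT:
		return min_ul(budget * 2, max_budget);
	case BUDGET_EXHAUSTED:
		return min_ul(budget * 4, max_budget);
	case NO_MORE_REQUESTS:
	default:
		return budget;			/* leave unchanged */
	}
}

int main(void)
{
	unsigned long max = 16 * 1024, min = max / 32, b = (2 * max) / 3;

	b = recalc(b, min, max, BUDGET_EXHAUSTED, 0);	/* greedy reader */
	printf("after exhaustion: %lu\n", b);
	b = recalc(b, min, max, TOO_IDLE, 0);		/* went idle, no reqs */
	printf("after idling:     %lu\n", b);
	return 0;
}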
@@ -719,30 +807,40 @@ static void __bfq_bfqq_recalc_budget(str
next_rq = bfqq->next_rq;
- bfqq->entity.budget = max(bfqq->max_budget,
- next_rq->hard_nr_sectors);
- bfq_log_bfqq(bfqd, bfqq, "budget=%lu (%d)", bfqq->entity.budget,
- bfq_bfqq_sync(bfqq));
+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(next_rq, bfqq));
+ bfqq->entity.budget = bfqq->max_budget;
+ bfq_log_bfqq(bfqd, bfqq, "head sect: %lu, new budget %lu",
+ next_rq != NULL ? next_rq->hard_nr_sectors : 0,
+ bfqq->entity.budget);
-static bfq_service_t bfq_calc_max_budget(u64 peak_rate, u64 timeout)
+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
- bfq_service_t max_budget;
+ unsigned long max_budget;
* The max_budget calculated when autotuning is equal to the
- * amount of sectors transfered in 0.75 * timeout_sync at the
+ * amount of sectors transferred in timeout_sync at the
* estimated peak rate.
- max_budget = (bfq_service_t)(peak_rate * 1000 *
+ max_budget = (unsigned long)(peak_rate * 1000 *
timeout >> BFQ_RATE_SHIFT);
- max_budget -= max_budget / 4;
+ * In addition to updating the peak rate, checks whether the process
+ * is "slow", and returns 1 if so. This slow flag is used, in addition
+ * to the budget timeout, to reduce the amount of service provided to
+ * seeky processes, and hence reduce their chances to lower the
+ * throughput. See the code for more details.
static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ int compensate, enum bfqq_expiration reason)
u64 bw, usecs, expected, timeout;
@@ -775,10 +873,21 @@ static int bfq_update_peak_rate(struct b
* the peak rate estimation.
- if (bw > bfqd->peak_rate) {
- bfqd->peak_rate = bw;
+ if (bw > bfqd->peak_rate ||
+ (!BFQQ_SEEKY(bfqq) &&
+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
+ bfq_log(bfqd, "measured bw =%llu", bw);
+ * To smooth oscillations use a low-pass filter with
+ * new_rate = (7/8) * old_rate + (1/8) * bw
+ bfqd->peak_rate *= 7;
+ do_div(bfqd->peak_rate, 8);
+ bfqd->peak_rate += bw;
- bfq_log(bfqd, "peak_rate=%llu", bw);
+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
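With the 7/8-1/8 filter above, one outlier sample moves the stored rate by only an eighth of the difference: starting from old_rate = 800 and measuring bw = 1600 gives 800 * 7/8 + 1600/8 = 900. A sketch of the filter as stated in the comment (the hunk appears to elide the line that pre-scales bw by 1/8, so the division is made explicit here):

#include <stdio.h>

/* new_rate = (7/8) * old_rate + (1/8) * bw, as described above. */
static unsigned long long lowpass(unsigned long long old_rate,
				  unsigned long long bw)
{
	return (old_rate * 7 + bw) / 8;
}

int main(void)
{
	unsigned long long rate = 800;		/* arbitrary units */
	unsigned long long samples[] = { 1600, 1600, 400 };

	for (int i = 0; i < 3; i++) {
		rate = lowpass(rate, samples[i]);
		printf("sample %llu -> peak_rate %llu\n", samples[i], rate);
	}
	return 0;
}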
@@ -790,11 +899,22 @@ static int bfq_update_peak_rate(struct b
update && bfqd->bfq_user_max_budget == 0) {
bfqd->bfq_max_budget =
bfq_calc_max_budget(bfqd->peak_rate, timeout);
- bfq_log(bfqd, "max_budget=%lu", bfqd->bfq_max_budget);
+ bfq_log(bfqd, "new max_budget=%lu",
+ bfqd->bfq_max_budget);
+ * If the process has been served for a too short time
+ * interval to let its possible sequential accesses prevail on
+ * the initial seek time needed to move the disk head on the
+ * first sector it requested, then give the process a chance
+ * and for the moment return false.
+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
* A process is considered ``slow'' (i.e., seeky, so that we
* cannot treat it fairly in the service domain, as it would
* slow down too much the other processes) if, when a slice
@@ -804,26 +924,46 @@ static int bfq_update_peak_rate(struct b
expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
- return expected > bfqq->entity.budget;
+ * Caveat: processes doing IO in the slower disk zones will
+ * tend to be slow(er) even if not seeky. And the estimated
+ * peak rate will actually be an average over the disk
+ * surface. Hence, to not be too harsh with unlucky processes,
+ * we keep a budget/3 margin of safety before declaring a
+ return expected > (4 * bfqq->entity.budget) / 3;
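Mechanically, the final test computes the service the queue would receive in one full budget timeout at its just-measured rate, and flags it when that figure exceeds 4/3 of the assigned budget; the extra third is the safety margin for slow-zone processes described in the comment. A standalone restatement with the BFQ_RATE_SHIFT scaling factored out (values illustrative):

#include <stdio.h>

/* Mirror of the final comparison: expected = rate * timeout, flagged
 * when expected > 4/3 * budget (the 1/3 is the safety margin). */
static int flagged(unsigned long long rate_sect_per_ms,
		   unsigned long long timeout_ms, unsigned long budget)
{
	unsigned long long expected = rate_sect_per_ms * timeout_ms;

	return expected > (4ULL * budget) / 3;
}

int main(void)
{
	/* Budget 4096 sectors, budget timeout 125 ms. */
	printf("rate 20 sect/ms: %d\n", flagged(20, 125, 4096)); /* 2500 */
	printf("rate 60 sect/ms: %d\n", flagged(60, 125, 4096)); /* 7500 */
	return 0;
}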
* bfq_bfqq_expire - expire a queue.
* @bfqd: device owning the queue.
* @bfqq: the queue to expire.
* @compensate: if true, compensate for the time spent idling.
* @reason: the reason causing the expiration.
- * The behavior is the following: when a queue expires because it has
- * been idling for too much we sync its finish time with the service
- * received and decrease its budget. If @bfqq expires due to budget
- * exhaustion we increase its budget and sync its finish time.
- * If @bfqq expires due to budget timeout we do not sync its finish time
- * to avoid seeky queues to take too much disk time; instead we charge
- * it the maximum budget value. Using the max budget value for all the
- * queues that expire due to budget timeout has the effect of using the
- * WF2Q+ scheduler to assign timeslices to those queues, without violating
- * the service domain guarantees for well-behaved queues.
+ * If the process associated to the queue is slow (i.e., seeky), or in
+ * case of budget timeout, or, finally, if it is async, we
+ * artificially charge it an entire budget (independently of the
+ * actual service it received). As a consequence, the queue will get
+ * higher timestamps than the correct ones upon reactivation, and
+ * hence it will be rescheduled as if it had received more service
+ * than what it actually received. In the end, this class of processes
+ * will receive less service in proportion to how slowly they consume
+ * their budgets (and hence how seriously they tend to lower the
+ * In contrast, when a queue expires because it has been idling for
+ * too much or because it exhausted its budget, we do not touch the
+ * amount of service it has received. Hence when the queue will be
+ * reactivated and its timestamps updated, the latter will be in sync
+ * with the actual service received by the queue until expiration.
+ * Charging a full budget to the first type of queues and the exact
+ * service to the others has the effect of using the WF2Q+ policy to
+ * schedule the former on a timeslice basis, without violating the
+ * service domain guarantees of the latter.
static void bfq_bfqq_expire(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
@@ -831,27 +971,42 @@ static void bfq_bfqq_expire(struct bfq_d
enum bfqq_expiration reason)
+ BUG_ON(bfqq != bfqd->active_queue);
- slow = bfq_update_peak_rate(bfqd, bfqq, compensate);
+ /* Update disk peak rate for autotuning and check whether the
+ * process is slow (see bfq_update_peak_rate).
+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
- * Treat slow (i.e., seeky) traffic as timed out, to not favor
- * it over sequential traffic (a seeky queue consumes less budget,
- * so it would receive smaller timestamps wrt a sequential one
- * when an idling timer fires).
+ * As above explained, 'punish' slow (i.e., seeky), timed-out
+ * and async queues, to favor sequential sync workloads.
+ * Processes doing IO in the slower disk zones will tend to be
+ * slow(er) even if not seeky. Hence, since the estimated peak
+ * rate is actually an average over the disk surface, these
+ * processes may timeout just for bad luck. To avoid punishing
+ * them we do not charge a full budget to a process that
+ * succeeded in consuming at least 2/3 of its budget.
- if (slow && reason == BFQ_BFQQ_TOO_IDLE)
- reason = BFQ_BFQQ_BUDGET_TIMEOUT;
- if (reason == BFQ_BFQQ_BUDGET_TIMEOUT || !bfq_bfqq_sync(bfqq))
+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
bfq_bfqq_charge_full_budget(bfqq);
- bfq_log_bfqq(bfqd, bfqq, "expire (%d, %d)", reason, slow);
+ bfq_log_bfqq(bfqd, bfqq,
+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
+ /* Increase, decrease or leave budget unchanged according to reason */
__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
__bfq_bfqq_expire(bfqd, bfqq);
+ * Budget timeout is not implemented through a dedicated timer, but
+ * just checked on request arrivals and completions, as well as on
+ * idle timer expirations.
static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
if (bfq_bfqq_budget_new(bfqq))
@@ -864,6 +1019,22 @@ static int bfq_bfqq_budget_timeout(struc
+ * If we expire a queue that is waiting for the arrival of a new
+ * request, we may prevent the fictitious timestamp backshifting that
+ * allows the guarantees of the queue to be preserved (see [1] for
+ * this tricky aspect). Hence we return true only if this condition
+ * does not hold, or if the queue is slow enough to deserve only to be
+ * kicked off for preserving a high throughput.
+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
+ return (!bfq_bfqq_wait_request(bfqq) ||
+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) &&
+ bfq_bfqq_budget_timeout(bfqq);
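Restated, the predicate above expires a timed-out queue unless it is idling for a new request and has already consumed more than 2/3 of its budget; a standalone version with the sub-conditions named:

#include <stdio.h>

/* Restatement of bfq_may_expire_for_budg_timeout(): expire on budget
 * timeout unless the queue is waiting for a request AND has already
 * consumed more than 2/3 of its budget (i.e., it made good use of it). */
static int may_expire(int waiting, unsigned long budget_left,
		      unsigned long budget, int timed_out)
{
	return (!waiting || budget_left >= budget / 3) && timed_out;
}

int main(void)
{
	printf("waiting, 10%% left: %d\n", may_expire(1, 100, 1000, 1));
	printf("waiting, 50%% left: %d\n", may_expire(1, 500, 1000, 1));
	printf("not waiting:       %d\n", may_expire(0, 100, 1000, 1));
	return 0;
}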
* Select a queue for service. If we have a current active queue,
* check whether to continue servicing it, or retrieve and set a new one.
@@ -877,10 +1048,10 @@ static struct bfq_queue *bfq_select_queu
- if (bfq_bfqq_budget_timeout(bfqq)) {
- bfq_bfqq_charge_full_budget(bfqq);
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
+ if (bfq_may_expire_for_budg_timeout(bfqq))
next_rq = bfqq->next_rq;
@@ -888,7 +1059,8 @@ static struct bfq_queue *bfq_select_queu
* serve them, keep the queue, otherwise expire it.
if (next_rq != NULL) {
- if (next_rq->hard_nr_sectors > bfq_bfqq_budget_left(bfqq)) {
+ if (bfq_serv_to_charge(next_rq, bfqq) >
+ bfq_bfqq_budget_left(bfqq)) {
reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
@@ -896,9 +1068,8 @@ static struct bfq_queue *bfq_select_queu
- * No requests pending. If the active queue still has requests in
- * flight or is idling for a new request, allow either of these
- * conditions to happen (or time out) before selecting a new queue.
+ * No requests pending. If the active queue still has
+ * requests in flight or is idling for a new request, then keep it.
if (timer_pending(&bfqd->idle_slice_timer) ||
(bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq))) {
@@ -911,6 +1082,7 @@ expire:
bfq_bfqq_expire(bfqd, bfqq, 0, reason);
bfqq = bfq_set_active_queue(bfqd);
+ bfq_log(bfqd, "select_queue: new queue returned (possibly NULL)");
@@ -929,13 +1101,15 @@ static int __bfq_dispatch_requests(struc
+ unsigned long service_to_charge;
/* Follow expired path, else get first next available. */
rq = bfq_check_fifo(bfqq);
+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
- if (rq->hard_nr_sectors > bfq_bfqq_budget_left(bfqq)) {
+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
* Expire the queue for budget exhaustion, and
* make sure that the next act_budget is enough
@@ -947,9 +1121,41 @@ static int __bfq_dispatch_requests(struc
/* Finally, insert request into driver dispatch list. */
- bfq_bfqq_served(bfqq, rq->hard_nr_sectors);
+ bfq_bfqq_served(bfqq, service_to_charge);
bfq_dispatch_insert(bfqd->queue, rq);
+ if (bfqq->high_weight_budget > 0) { /* queue is being boosted */
+ struct bfq_entity *entity = &bfqq->entity;
+ bfq_log_bfqq(bfqd, bfqq, "busy period dur %llu msec, "
+ "old highwbudg %lu",
+ jiffies_to_msecs(jiffies) -
+ bfqq->last_activation_time,
+ bfqq->high_weight_budget);
+ * Decrease the budget for weight boosting by
+ * the just received service, or, if too much
+ * time has elapsed from the beginning of this
+ * boosting period, stop it
+ if (jiffies_to_msecs(jiffies) -
+ bfqq->last_activation_time <= BFQ_BOOST_TIMEOUT &&
+ bfqq->high_weight_budget > service_to_charge)
+ bfqq->high_weight_budget -= service_to_charge;
+ bfqq->high_weight_budget = 0;
+ entity->ioprio_changed = 1;
+ __bfq_entity_update_weight_prio(
+ bfq_entity_service_tree(entity),
+ bfq_log_bfqq(bfqd, bfqq, "dispatched %lu sec req (%llu), "
+ rq->hard_nr_sectors, rq->hard_sector,
+ bfq_bfqq_budget_left(bfqq));
if (bfqd->active_cic == NULL) {
@@ -961,6 +1167,8 @@ static int __bfq_dispatch_requests(struc
} while (dispatched < max_dispatch);
+ bfq_log_bfqq(bfqd, bfqq, "dispatched %d reqs", dispatched);
if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
dispatched >= bfqd->bfq_max_budget_async_rq) ||
bfq_class_idle(bfqq)))
@@ -1009,7 +1217,7 @@ static int bfq_forced_dispatch(struct bf
st = bfq_entity_service_tree(&bfqq->entity);
dispatched += __bfq_forced_dispatch_bfqq(bfqq);
- bfqq->max_budget = bfq_default_budget(bfqd, bfqq);
+ bfqq->max_budget = bfq_max_budget(bfqd);
bfq_forget_idle(st);
@@ -1025,6 +1233,7 @@ static int bfq_dispatch_requests(struct
struct bfq_queue *bfqq;
+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
if (bfqd->busy_queues == 0)
@@ -1056,9 +1265,11 @@ static int bfq_dispatch_requests(struct
BUG_ON(timer_pending(&bfqd->idle_slice_timer));
dispatched += __bfq_dispatch_requests(bfqd, bfqq, max_dispatch);
+ bfq_log_bfqq(bfqd, bfqq, "total dispatched increased to %d "
+ "(max_disp %d)", dispatched, max_dispatch);
- bfq_log(bfqd, "dispatched=%d", dispatched);
+ bfq_log(bfqd, "final total dispatched=%d", dispatched);
@@ -1074,6 +1285,7 @@ static void bfq_put_queue(struct bfq_que
BUG_ON(atomic_read(&bfqq->ref) <= 0);
+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
if (!atomic_dec_and_test(&bfqq->ref))
@@ -1083,7 +1295,7 @@ static void bfq_put_queue(struct bfq_que
BUG_ON(bfq_bfqq_busy(bfqq));
BUG_ON(bfqd->active_queue == bfqq);
- bfq_log_bfqq(bfqd, bfqq, "freed");
+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
kmem_cache_free(bfq_pool, bfqq);
@@ -1095,6 +1307,7 @@ static void bfq_exit_bfqq(struct bfq_dat
bfq_schedule_dispatch(bfqd);
+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
bfq_put_queue(bfqq);
@@ -1116,7 +1329,7 @@ static void bfq_init_prio_data(struct bf
printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
case IOPRIO_CLASS_NONE:
- * no prio set, inherit CPU scheduling settings
+ * No prio set, inherit CPU scheduling settings.
bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
@@ -1139,8 +1352,8 @@ static void bfq_init_prio_data(struct bf
bfqq->entity.ioprio_changed = 1;
- * keep track of original prio settings in case we have to temporarily
- * elevate the priority of this queue
+ * Keep track of original prio settings in case we have to temporarily
+ * elevate the priority of this queue.
bfqq->org_ioprio = bfqq->entity.new_ioprio;
bfqq->org_ioprio_class = bfqq->entity.new_ioprio_class;
@@ -1167,6 +1380,9 @@ static void bfq_changed_ioprio(struct io
if (new_bfqq != NULL) {
cic->cfqq[ASYNC] = new_bfqq;
+ bfq_log_bfqq(bfqd, bfqq,
+ "changed_ioprio: bfqq %p %d",
bfq_put_queue(bfqq);
@@ -1233,9 +1449,13 @@ retry:
bfq_mark_bfqq_idle_window(bfqq);
bfq_mark_bfqq_sync(bfqq);
- bfqq->max_budget = bfq_default_budget(bfqd, bfqq);
+ /* Tentative initial value to trade off between thr and lat */
+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
bfqq->pid = current->pid;
+ bfqq->last_activation_time = 0;
+ bfqq->high_weight_budget = 0;
bfq_log_bfqq(bfqd, bfqq, "allocated");
@@ -1285,14 +1505,17 @@ static struct bfq_queue *bfq_get_queue(s
- * pin the queue now that it's allocated, scheduler exit will prune it
+ * Pin the queue now that it's allocated, scheduler exit will prune it.
if (!is_sync && *async_bfqq == NULL) {
atomic_inc(&bfqq->ref);
+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
atomic_inc(&bfqq->ref);
+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
@@ -1309,35 +1532,35 @@ static void bfq_update_io_thinktime(stru
static void bfq_update_io_seektime(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
- struct cfq_io_context *cic,
- if (cic->last_request_pos < rq->sector)
- sdist = rq->sector - cic->last_request_pos;
+ if (bfqq->last_request_pos < rq->sector)
+ sdist = rq->sector - bfqq->last_request_pos;
- sdist = cic->last_request_pos - rq->sector;
+ sdist = bfqq->last_request_pos - rq->sector;
* Don't allow the seek distance to get too large from the
* odd fragment, pagein, etc.
- if (cic->seek_samples == 0) /* first request, not really a seek */
+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
- else if (cic->seek_samples <= 60) /* second&third seek */
- sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
+ else if (bfqq->seek_samples <= 60) /* second & third seek */
+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
- sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
- cic->seek_samples = (7*cic->seek_samples + 256) / 8;
- cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
- total = cic->seek_total + (cic->seek_samples/2);
- do_div(total, cic->seek_samples);
- cic->seek_mean = (sector_t)total;
+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
+ total = bfqq->seek_total + (bfqq->seek_samples/2);
+ do_div(total, bfqq->seek_samples);
+ bfqq->seek_mean = (sector_t)total;
- bfq_log_bfqq(bfqd, bfqq, "dist=%lu mean=%lu", sdist, cic->seek_mean);
+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
+ (u64)bfqq->seek_mean);
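The seek statistics above use the same 7/8 exponential decay as the peak-rate filter, with the sample count saturating at 256, each distance entering the running total with weight 256, and a half-sample added before the final division for rounding. A standalone mirror of the update:

#include <stdio.h>

struct seek_stats {
	unsigned int samples;
	unsigned long long total;
	unsigned long long mean;
};

/* Mirror of the seek-mean update: samples saturate at 256, and each
 * distance enters the running total with weight 256. */
static void update_seek(struct seek_stats *s, unsigned long long sdist)
{
	s->samples = (7 * s->samples + 256) / 8;
	s->total = (7 * s->total + 256ULL * sdist) / 8;
	s->mean = (s->total + s->samples / 2) / s->samples;
}

int main(void)
{
	struct seek_stats s = { 0, 0, 0 };
	unsigned long long dists[] = { 8, 8, 100000, 8 };

	/* One large outlier raises the mean, then decays away. */
	for (int i = 0; i < 4; i++) {
		update_seek(&s, dists[i]);
		printf("dist %-6llu -> mean %llu\n", dists[i], s.mean);
	}
	return 0;
}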
@@ -1357,7 +1580,7 @@ static void bfq_update_idle_window(struc
enable_idle = bfq_bfqq_idle_window(bfqq);
if (atomic_read(&cic->ioc->nr_tasks) == 0 ||
- bfqd->bfq_slice_idle == 0 || (bfqd->hw_tag && CIC_SEEKY(cic)))
+ bfqd->bfq_slice_idle == 0 || (bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
else if (bfq_sample_valid(cic->ttime_samples)) {
if (cic->ttime_mean > bfqd->bfq_slice_idle)
@@ -1365,14 +1588,13 @@ static void bfq_update_idle_window(struc
+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
bfq_mark_bfqq_idle_window(bfqq);
bfq_clear_bfqq_idle_window(bfqq);
- bfq_log_bfqq(bfqd, bfqq, "idle_window=%d (%d)",
- enable_idle, CIC_SEEKY(cic));
@@ -1388,20 +1610,37 @@ static void bfq_rq_enqueued(struct bfq_d
bfqq->meta_pending++;
bfq_update_io_thinktime(bfqd, cic);
- bfq_update_io_seektime(bfqd, bfqq, cic, rq);
- bfq_update_idle_window(bfqd, bfqq, cic);
+ bfq_update_io_seektime(bfqd, bfqq, rq);
+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
+ !BFQQ_SEEKY(bfqq))
+ bfq_update_idle_window(bfqd, bfqq, cic);
+ bfq_log_bfqq(bfqd, bfqq,
+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
- cic->last_request_pos = rq->sector + rq->nr_sectors;
+ bfqq->last_request_pos = rq->sector + rq->nr_sectors;
- if (bfqq == bfqd->active_queue && bfq_bfqq_wait_request(bfqq)) {
- * If we are waiting for a request for this queue, let it rip
- * immediately and flag that we must not expire this queue
- bfq_clear_bfqq_wait_request(bfqq);
- del_timer(&bfqd->idle_slice_timer);
- blk_start_queueing(bfqd->queue);
+ if (bfqq == bfqd->active_queue) {
+ if (bfq_bfqq_wait_request(bfqq)) {
+ * If we are waiting for a request for this queue, let
+ * it rip immediately and flag that we must not expire
+ * this queue just now.
+ bfq_clear_bfqq_wait_request(bfqq);
+ del_timer(&bfqd->idle_slice_timer);
+ * Here we can safely expire the queue, in
+ * case of budget timeout, without wasting
+ if (bfq_bfqq_budget_timeout(bfqq))
+ bfq_bfqq_expire(bfqd, bfqq, 0,
+ BFQ_BFQQ_BUDGET_TIMEOUT);
+ blk_start_queueing(bfqd->queue);
@@ -1410,6 +1649,7 @@ static void bfq_insert_request(struct re
struct bfq_data *bfqd = q->elevator->elevator_data;
struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ assert_spin_locked(bfqd->queue->queue_lock);
bfq_init_prio_data(bfqq, RQ_CIC(rq)->ioc);
@@ -1447,7 +1687,8 @@ static void bfq_completed_request(struct
struct bfq_data *bfqd = bfqq->bfqd;
const int sync = rq_is_sync(rq);
- bfq_log_bfqq(bfqd, bfqq, "complete");
+ bfq_log_bfqq(bfqd, bfqq, "completed %lu sects req (%d)",
+ rq->hard_nr_sectors, sync);
bfq_update_hw_tag(bfqd);
@@ -1470,7 +1711,7 @@ static void bfq_completed_request(struct
if (bfq_bfqq_budget_new(bfqq))
bfq_set_budget_timeout(bfqd);
- if (bfq_bfqq_budget_timeout(bfqq))
+ if (bfq_may_expire_for_budg_timeout(bfqq))
bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
else if (sync && bfqd->rq_in_driver == 0 &&
RB_EMPTY_ROOT(&bfqq->sort_list))
@@ -1489,7 +1730,7 @@ static void bfq_prio_boost(struct bfq_qu
if (has_fs_excl()) {
- * boost idle prio on transactions that would lock out other
+ * Boost idle prio on transactions that would lock out other
* users of the filesystem
if (bfq_class_idle(bfqq))
@@ -1498,7 +1739,7 @@ static void bfq_prio_boost(struct bfq_qu
bfqq->entity.new_ioprio = IOPRIO_NORM;
- * check if we need to unboost the queue
+ * Check if we need to unboost the queue
if (bfqq->entity.new_ioprio_class != bfqq->org_ioprio_class)
bfqq->entity.new_ioprio_class = bfqq->org_ioprio_class;
@@ -1527,7 +1768,7 @@ static int bfq_may_queue(struct request_
* Don't force setup of a queue from here, as a call to may_queue
* does not necessarily imply that a request actually will be queued.
- * so just lookup a possibly existing queue, or return 'may queue'
+ * So just lookup a possibly existing queue, or return 'may queue'
cic = bfq_cic_lookup(bfqd, tsk->io_context);
@@ -1546,7 +1787,7 @@ static int bfq_may_queue(struct request_
- * queue lock held here
+ * Queue lock held here.
static void bfq_put_request(struct request *rq)
@@ -1563,6 +1804,8 @@ static void bfq_put_request(struct reque
rq->elevator_private = NULL;
rq->elevator_private2 = NULL;
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
bfq_put_queue(bfqq);
@@ -1603,6 +1846,7 @@ static int bfq_set_request(struct reques
bfqq->allocated[rw]++;
atomic_inc(&bfqq->ref);
+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
spin_unlock_irqrestore(q->queue_lock, flags);
@@ -1634,7 +1878,8 @@ static void bfq_kick_queue(struct work_s
- * Timer running if the active_queue is currently idling inside its time slice
+ * Handler of the expiration of the timer running if the active_queue
+ * is idling inside its time slice.
static void bfq_idle_slice_timer(unsigned long data)
@@ -1643,8 +1888,6 @@ static void bfq_idle_slice_timer(unsigne
unsigned long flags;
enum bfqq_expiration reason;
- bfq_log(bfqd, "slice_timer expired");
spin_lock_irqsave(bfqd->queue->queue_lock, flags);
bfqq = bfqd->active_queue;
@@ -1657,8 +1900,14 @@ static void bfq_idle_slice_timer(unsigne
* we just expire a queue too early.
+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
reason = BFQ_BFQQ_TOO_IDLE;
if (bfq_bfqq_budget_timeout(bfqq))
+ * Also here the queue can be safely expired
+ * for budget timeout without wasting
reason = BFQ_BFQQ_BUDGET_TIMEOUT;
bfq_bfqq_expire(bfqd, bfqq, 1, reason);
@@ -1681,8 +1930,11 @@ static inline void __bfq_put_async_bfqq(
struct bfq_group *root_group = bfqd->root_group;
struct bfq_queue *bfqq = *bfqq_ptr;
+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
bfq_put_queue(bfqq);
@@ -1705,7 +1957,7 @@ static void bfq_put_async_queues(struct
__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
-static void bfq_exit_queue(elevator_t *e)
+static void bfq_exit_queue(struct elevator_queue *e)
struct bfq_data *bfqd = e->elevator_data;
struct request_queue *q = bfqd->queue;
@@ -1772,7 +2024,7 @@ static void *bfq_init_queue(struct reque
- bfqd->bfq_max_budget = bfq_max_budget;
+ bfqd->bfq_max_budget = bfq_default_max_budget;
bfqd->bfq_quantum = bfq_quantum;
bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
@@ -1784,6 +2036,8 @@ static void *bfq_init_queue(struct reque
bfqd->bfq_timeout[ASYNC] = bfq_timeout_async;
bfqd->bfq_timeout[SYNC] = bfq_timeout_sync;
+ bfqd->low_latency = true;
@@ -1819,16 +2073,19 @@ static ssize_t bfq_var_show(unsigned int
return sprintf(page, "%d\n", var);
-static ssize_t bfq_var_store(unsigned int *var, const char *page, size_t count)
+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
- char *p = (char *)page;
+ unsigned long new_val;
+ int ret = strict_strtoul(page, 10, &new_val);
- *var = simple_strtoul(p, &p, 10);
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
-static ssize_t __FUNC(elevator_t *e, char *page) \
+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
struct bfq_data *bfqd = e->elevator_data; \
unsigned int __data = __VAR; \
@@ -1846,13 +2103,15 @@ SHOW_FUNCTION(bfq_max_budget_show, bfqd-
SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[SYNC], 1);
SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[ASYNC], 1);
+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
-static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \
+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
struct bfq_data *bfqd = e->elevator_data; \
- unsigned int __data; \
+ unsigned long __data; \
int ret = bfq_var_store(&__data, (page), count); \
if (__data < (MIN)) \
@@ -1879,21 +2138,21 @@ STORE_FUNCTION(bfq_timeout_async_store,
#undef STORE_FUNCTION
-static inline bfq_service_t bfq_estimated_max_budget(struct bfq_data *bfqd)
+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[SYNC]);
if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
return bfq_calc_max_budget(bfqd->peak_rate, timeout);
- return bfq_max_budget;
+ return bfq_default_max_budget;
-static ssize_t bfq_max_budget_store(elevator_t *e, const char *page,
+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
+ const char *page, size_t count)
struct bfq_data *bfqd = e->elevator_data;
- unsigned int __data;
+ unsigned long __data;
int ret = bfq_var_store(&__data, (page), count);
@@ -1909,11 +2168,11 @@ static ssize_t bfq_max_budget_store(elev
-static ssize_t bfq_timeout_sync_store(elevator_t *e, const char *page,
+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
+ const char *page, size_t count)
struct bfq_data *bfqd = e->elevator_data;
- unsigned int __data;
+ unsigned long __data;
int ret = bfq_var_store(&__data, (page), count);
@@ -1928,6 +2187,20 @@ static ssize_t bfq_timeout_sync_store(el
+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
+ const char *page, size_t count)
+ struct bfq_data *bfqd = e->elevator_data;
+ unsigned long __data;
+ int ret = bfq_var_store(&__data, (page), count);
+ bfqd->low_latency = __data;
#define BFQ_ATTR(name) \
__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
@@ -1942,12 +2215,13 @@ static struct elv_fs_entry bfq_attrs[] =
BFQ_ATTR(max_budget_async_rq),
BFQ_ATTR(timeout_sync),
BFQ_ATTR(timeout_async),
+ BFQ_ATTR(low_latency),
static struct elevator_type iosched_bfq = {
- .elevator_merge_fn = bfq_merge,
+ .elevator_merge_fn = bfq_merge,
.elevator_merged_fn = bfq_merged_request,
.elevator_merge_req_fn = bfq_merged_requests,
.elevator_allow_merge_fn = bfq_allow_merge,
@@ -1974,7 +2248,7 @@ static struct elevator_type iosched_bfq
static int __init bfq_init(void)
- * can be 0 on HZ < 1000 setups
+ * Can be 0 on HZ < 1000 setups.
if (bfq_slice_idle == 0)
diff -uprN linux-2.6.28/block/bfq-sched.c linux-2.6.28.new/block/bfq-sched.c
--- linux-2.6.28/block/bfq-sched.c 2011-06-13 21:41:23.766107399 +0200
+++ linux-2.6.28.new/block/bfq-sched.c 2011-06-13 21:41:41.639927001 +0200
@@ -28,7 +28,7 @@ static int bfq_update_next_active(struct
- * NOTE: this can be improved in may ways, such as returning
+ * NOTE: this can be improved in many ways, such as returning
* 1 (and thus propagating upwards the update) only when the
* budget changes, or caching the bfqq that will be scheduled
* next from this subtree. By now we worry more about
@@ -85,20 +85,33 @@ static inline void bfq_check_next_active
* Return @a > @b, dealing with wrapping correctly.
-static inline int bfq_gt(bfq_timestamp_t a, bfq_timestamp_t b)
+static inline int bfq_gt(u64 a, u64 b)
return (s64)(a - b) > 0;
+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
+ struct bfq_queue *bfqq = NULL;
+ BUG_ON(entity == NULL);
+ if (entity->my_sched_data == NULL)
+ bfqq = container_of(entity, struct bfq_queue, entity);
* bfq_delta - map service into the virtual time domain.
* @service: amount of service.
- * @weight: scale factor.
+ * @weight: scale factor (weight of an entity or weight sum).
-static inline bfq_timestamp_t bfq_delta(bfq_service_t service,
- bfq_weight_t weight)
+static inline u64 bfq_delta(unsigned long service,
+ unsigned long weight)
- bfq_timestamp_t d = (bfq_timestamp_t)service << WFQ_SERVICE_SHIFT;
+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
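The rest of the body is elided above; conceptually the delta is the service scaled up by WFQ_SERVICE_SHIFT and divided by the weight, so doubling the weight halves the virtual-time delta and makes the entity progress slower in virtual time. A worked example, assuming WFQ_SERVICE_SHIFT = 22 (the constant is defined elsewhere and is an assumption here):

#include <stdio.h>

#define WFQ_SERVICE_SHIFT 22	/* assumed; defined elsewhere in BFQ */

/* Virtual-time delta: service scaled up, then divided by the weight
 * (an entity's weight, or the tree's weight sum for the vtime). */
static unsigned long long bfq_delta(unsigned long service,
				    unsigned long weight)
{
	unsigned long long d = (unsigned long long)service << WFQ_SERVICE_SHIFT;

	return d / weight;
}

int main(void)
{
	/* Same service, double the weight -> half the virtual time. */
	printf("serv 1024, weight 4: %llu\n", bfq_delta(1024, 4));
	printf("serv 1024, weight 8: %llu\n", bfq_delta(1024, 8));
	return 0;
}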
@@ -110,11 +123,25 @@ static inline bfq_timestamp_t bfq_delta(
* @service: the service to be charged to the entity.
static inline void bfq_calc_finish(struct bfq_entity *entity,
- bfq_service_t service)
+ unsigned long service)
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
BUG_ON(entity->weight == 0);
- entity->finish = entity->start + bfq_delta(service, entity->weight);
+ entity->finish = entity->start +
+ bfq_delta(service, entity->weight);
+ if (bfqq != NULL) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "calc_finish: serv %lu, w %lu, hi-budg %lu",
+ service, entity->weight,
+ bfqq->high_weight_budget);
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "calc_finish: start %llu, finish %llu, delta %llu",
+ entity->start, entity->finish,
+ bfq_delta(service, entity->weight));
@@ -150,18 +177,6 @@ static inline void bfq_extract(struct rb
rb_erase(&entity->rb_node, root);
-static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
- struct bfq_queue *bfqq = NULL;
- BUG_ON(entity == NULL);
- if (entity->my_sched_data == NULL)
- bfqq = container_of(entity, struct bfq_queue, entity);
* bfq_idle_extract - extract an entity from the idle tree.
* @st: the service tree of the owning @entity.
@@ -325,12 +340,26 @@ static void bfq_active_insert(struct bfq
* bfq_ioprio_to_weight - calc a weight from an ioprio.
* @ioprio: the ioprio value to convert.
-static bfq_weight_t bfq_ioprio_to_weight(int ioprio)
+static unsigned short bfq_ioprio_to_weight(int ioprio)
WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
return IOPRIO_BE_NR - ioprio;
+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
+ * @weight: the weight value to convert.
+ * To preserve as much as possible the old only-ioprio user interface,
+ * 0 is used as an escape ioprio value for weights (numerically) equal or
+ * larger than IOPRIO_BE_NR
+static unsigned short bfq_weight_to_ioprio(int weight)
+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
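With IOPRIO_BE_NR = 8, the two conversions above give weight = 8 - ioprio (ioprio 0 maps to the highest weight) and collapse any weight >= 8 to the escape ioprio 0. A quick round-trip table:

#include <stdio.h>

#define IOPRIO_BE_NR 8

static unsigned short ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

/* Weights >= IOPRIO_BE_NR have no exact ioprio; 0 is the escape value. */
static unsigned short weight_to_ioprio(int weight)
{
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	for (int p = 0; p < IOPRIO_BE_NR; p++)
		printf("ioprio %d <-> weight %u (back to ioprio %u)\n",
		       p, ioprio_to_weight(p),
		       weight_to_ioprio(ioprio_to_weight(p)));
	printf("weight 100 -> ioprio %u (escape)\n", weight_to_ioprio(100));
	return 0;
}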
static inline void bfq_get_entity(struct bfq_entity *entity)
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
@@ -339,6 +368,8 @@ static inline void bfq_get_entity(struct
sd = entity->sched_data;
atomic_inc(&bfqq->ref);
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
@@ -437,6 +468,8 @@ static void bfq_forget_entity(struct bfq
st->wsum -= entity->weight;
sd = entity->sched_data;
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
bfq_put_queue(bfqq);
@@ -478,6 +511,62 @@ static void bfq_forget_idle(struct bfq_s
bfq_put_idle_entity(st, first_idle);
+static struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+ struct bfq_entity *entity)
+ struct bfq_service_tree *new_st = old_st;
+ if (entity->ioprio_changed) {
+ int new_boost_coeff = 1;
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ if (bfqq != NULL) {
+ new_boost_coeff +=
+ bfqq->high_weight_budget * BFQ_BOOST_COEFF /
+ BFQ_BOOST_BUDGET;
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "update_w_prio: wght %u, hi-budg %lu, coef %d",
+ entity->weight, bfqq->high_weight_budget,
+ new_boost_coeff);
+ BUG_ON(old_st->wsum < entity->weight);
+ old_st->wsum -= entity->weight;
+ if (entity->new_weight != entity->orig_weight) {
+ entity->orig_weight = entity->new_weight;
+ entity->ioprio =
+ bfq_weight_to_ioprio(entity->orig_weight);
+ } else if (entity->new_ioprio != entity->ioprio) {
+ entity->ioprio = entity->new_ioprio;
+ entity->orig_weight =
+ bfq_ioprio_to_weight(entity->ioprio);
+ } else
+ entity->new_weight = entity->orig_weight =
+ bfq_ioprio_to_weight(entity->ioprio);
+ entity->ioprio_class = entity->new_ioprio_class;
+ entity->ioprio_changed = 0;
+ * NOTE: here we may be changing the weight too early;
+ * this will cause unfairness. The correct approach
+ * would have required additional complexity to defer
+ * weight changes to the proper time instants (i.e.,
+ * when entity->finish <= old_st->vtime).
+ new_st = bfq_entity_service_tree(entity);
+ entity->weight = entity->orig_weight * new_boost_coeff;
+ new_st->wsum += entity->weight;
+ if (new_st != old_st)
+ entity->start = new_st->vtime;
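[Editor's illustration, not part of the patch: the boost coefficient computed above, isolated into a standalone sketch using the constants from bfq.h further down (BFQ_BOOST_COEFF = 10, BFQ_BOOST_BUDGET = 49152). A queue with its full boosting budget left gets its weight multiplied by 11, and the multiplier decays linearly to 1 as the budget is consumed.]

    #include <stdio.h>

    #define BFQ_BOOST_COEFF  10
    #define BFQ_BOOST_BUDGET 49152

    static int boost_coeff(unsigned long high_weight_budget)
    {
            return 1 + high_weight_budget * BFQ_BOOST_COEFF / BFQ_BOOST_BUDGET;
    }

    int main(void)
    {
            printf("%d\n", boost_coeff(BFQ_BOOST_BUDGET));     /* 11 */
            printf("%d\n", boost_coeff(BFQ_BOOST_BUDGET / 2)); /* 6 */
            printf("%d\n", boost_coeff(0));                    /* 1: no boost */
            return 0;
    }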
* bfq_bfqq_served - update the scheduler status after selection for service.
* @bfqq: the queue being served.
@@ -487,7 +576,7 @@ static void bfq_forget_idle(struct bfq_s
* are synchronized every time a new bfqq is selected for service. By now,
* we keep it to better check consistency.
-static void bfq_bfqq_served(struct bfq_queue *bfqq, bfq_service_t served)
+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
struct bfq_entity *entity = &bfqq->entity;
struct bfq_service_tree *st;
@@ -496,13 +585,13 @@ static void bfq_bfqq_served(struct bfq_q
st = bfq_entity_service_tree(entity);
entity->service += served;
WARN_ON_ONCE(entity->service > entity->budget);
BUG_ON(st->wsum == 0);
st->vtime += bfq_delta(served, st->wsum);
bfq_forget_idle(st);
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
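[Editor's worked example, not part of the patch, with invented numbers and the WFQ_SERVICE_SHIFT scaling omitted for readability: suppose two backlogged queues with weights 3 and 1, so st->wsum = 4. Serving 8 sectors advances the global st->vtime by 8/4 = 2, while the served queue's own finish timestamp, set by bfq_calc_finish(), advances by 8/weight. Because the heavier queue's finish timestamps grow more slowly relative to vtime, it is selected more often, and over time the two queues receive 3/4 and 1/4 of the service, i.e. shares proportional to weight/wsum.]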
@@ -515,42 +604,13 @@ static void bfq_bfqq_served(struct bfq_q
* budget. In this way we should obtain a sort of time-domain
* fairness among all the seeky/slow queues.
-static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
struct bfq_entity *entity = &bfqq->entity;
- bfq_bfqq_served(bfqq, entity->budget - entity->service);
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
-static struct bfq_service_tree *
-__bfq_entity_update_prio(struct bfq_service_tree *old_st,
- struct bfq_entity *entity)
- struct bfq_service_tree *new_st = old_st;
- if (entity->ioprio_changed) {
- entity->ioprio = entity->new_ioprio;
- entity->ioprio_class = entity->new_ioprio_class;
- entity->ioprio_changed = 0;
- old_st->wsum -= entity->weight;
- entity->weight = bfq_ioprio_to_weight(entity->ioprio);
- * NOTE: here we may be changing the weight too early,
- * this will cause unfairness. The correct approach
- * would have required additional complexity to defer
- * weight changes to the proper time instants (i.e.,
- * when entity->finish <= old_st->vtime).
- new_st = bfq_entity_service_tree(entity);
- new_st->wsum += entity->weight;
- if (new_st != old_st)
- entity->start = new_st->vtime;
+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
@@ -607,7 +667,7 @@ static void __bfq_activate_entity(struct
- st = __bfq_entity_update_prio(st, entity);
+ st = __bfq_entity_update_weight_prio(st, entity);
bfq_calc_finish(entity, entity->budget);
bfq_active_insert(st, entity);
diff -uprN linux-2.6.28/block/bfq.h linux-2.6.28.new/block/bfq.h
--- linux-2.6.28/block/bfq.h 2011-06-13 21:41:23.766107399 +0200
+++ linux-2.6.28.new/block/bfq.h 2011-06-13 21:41:41.639927001 +0200
- * BFQ: data structures and common functions prototypes.
+ * BFQ-v1-r1 for 2.6.35: data structures and common functions prototypes.
* Based on ideas and code from CFQ:
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
#define BFQ_IOPRIO_CLASSES 3
-#define BFQ_DEFAULT_GRP_IOPRIO 4
+#define BFQ_MIN_WEIGHT 1
+#define BFQ_MAX_WEIGHT 1000
+#define BFQ_DEFAULT_GRP_WEIGHT 10
+#define BFQ_DEFAULT_GRP_IOPRIO 0
#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
-typedef u64 bfq_timestamp_t;
-typedef unsigned long bfq_weight_t;
-typedef unsigned long bfq_service_t;
+/* Constants used in weight boosting (in turn used to reduce latencies): */
+/* max factor by which the weight of a boosted queue is multiplied */
+#define BFQ_BOOST_COEFF 10
+/* max number of sectors that can be served during a boosting period */
+#define BFQ_BOOST_BUDGET 49152
+/* max duration of a boosting period, msec */
+#define BFQ_BOOST_TIMEOUT 6000
+/* min idle period after which boosting may be reactivated for a queue, msec */
+#define BFQ_MIN_ACT_INTERVAL 20000
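[Editor's illustration, not part of the patch: hypothetical helpers showing how the four constants above bound a weight-boosting period, per their comments. Boosting lasts until either the sector budget or the timeout is exhausted, and may be re-armed only after a long enough idle period; the helper names are invented for this sketch.]

    #include <stdio.h>
    #include <stdbool.h>

    #define BFQ_BOOST_COEFF      10
    #define BFQ_BOOST_BUDGET     49152
    #define BFQ_BOOST_TIMEOUT    6000
    #define BFQ_MIN_ACT_INTERVAL 20000

    static bool boost_still_active(unsigned long sectors_left,
                                   unsigned long elapsed_ms)
    {
            return sectors_left > 0 && elapsed_ms < BFQ_BOOST_TIMEOUT;
    }

    static bool boost_may_restart(unsigned long idle_ms)
    {
            return idle_ms >= BFQ_MIN_ACT_INTERVAL;
    }

    int main(void)
    {
            printf("%d\n", boost_still_active(1000, 5999)); /* 1: within limits */
            printf("%d\n", boost_still_active(0, 100));     /* 0: budget gone */
            printf("%d\n", boost_may_restart(25000));       /* 1: idle enough */
            return 0;
    }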
@@ -51,8 +61,8 @@ struct bfq_service_tree {
struct bfq_entity *first_idle;
struct bfq_entity *last_idle;
- bfq_timestamp_t vtime;
- bfq_weight_t wsum;
+ u64 vtime;
+ unsigned long wsum;
@@ -92,17 +102,19 @@ struct bfq_sched_data {
* this entity; used for O(log N) lookups into active trees.
* @service: service received during the last round of service.
* @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
- * @weight: weight of the queue, calculated as IOPRIO_BE_NR - @ioprio.
+ * @weight: weight of the queue.
* @parent: parent entity, for hierarchical scheduling.
* @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
* associated scheduler queue, %NULL on leaf nodes.
* @sched_data: the scheduler queue this entity belongs to.
* @ioprio: the ioprio in use.
- * @new_ioprio: when an ioprio change is requested, the new ioprio value
+ * @new_weight: when a weight change is requested, the new weight value.
+ * @orig_weight: original weight, used to implement weight boosting.
+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
* @ioprio_class: the ioprio_class in use.
* @new_ioprio_class: when an ioprio_class change is requested, the new
* ioprio_class value.
- * @ioprio_changed: flag, true when the user requested an ioprio or
+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
* ioprio_class change.
* A bfq_entity is used to represent either a bfq_queue (leaf node in the
@@ -111,36 +123,39 @@ struct bfq_sched_data {
* hierarchy. Non-leaf entities have also their own sched_data, stored
* in @my_sched_data.
- * Each entity stores independently its priority values; this would allow
- * different weights on different devices, but this functionality is not
- * exported to userspace by now. Priorities are updated lazily, first
- * storing the new values into the new_* fields, then setting the
- * @ioprio_changed flag. As soon as there is a transition in the entity
- * state that allows the priority update to take place the effective and
- * the requested priority values are synchronized.
- * The weight value is calculated from the ioprio to export the same
- * interface as CFQ. When dealing with ``well-behaved'' queues (i.e.,
- * queues that do not spend too much time to consume their budget and
- * have true sequential behavior, and when there are no external factors
- * breaking anticipation) the relative weights at each level of the
- * cgroups hierarchy should be guaranteed.
- * All the fields are protected by the queue lock of the containing bfqd.
+ * Each entity stores independently its priority values; this would
+ * allow different weights on different devices, but this
+ * functionality is not exported to userspace yet. Priorities and
+ * weights are updated lazily, first storing the new values into the
+ * new_* fields, then setting the @ioprio_changed flag. As soon as
+ * there is a transition in the entity state that allows the priority
+ * update to take place, the effective and the requested priority
+ * values are synchronized.
+ * Unless cgroups are used, the weight value is calculated from the
+ * ioprio to export the same interface as CFQ. When dealing with
+ * ``well-behaved'' queues (i.e., queues that do not spend too much
+ * time consuming their budget and have true sequential behavior, and
+ * when there are no external factors breaking anticipation) the
+ * relative weights at each level of the cgroups hierarchy should be
+ * guaranteed. All the fields are protected by the queue lock of the
+ * containing bfqd.
struct rb_node rb_node;
- bfq_timestamp_t finish;
- bfq_timestamp_t start;
+ u64 finish;
+ u64 start;
struct rb_root *tree;
- bfq_timestamp_t min_start;
+ u64 min_start;
- bfq_service_t service, budget;
- bfq_weight_t weight;
+ unsigned long service, budget;
+ unsigned short weight, new_weight;
+ unsigned short orig_weight;
struct bfq_entity *parent;
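[Editor's illustration, not part of the patch: a simplified userspace sketch of the lazy-update protocol described in the comment above. Writers store only into the new_* fields and raise the flag; the scheduler applies the change at the next safe state transition, as __bfq_entity_update_weight_prio() does in bfq-sched.c. Types and the two helper names are invented for this sketch, and the real code also resyncs ioprio and weight through the conversion helpers.]

    #include <stdio.h>

    struct entity {
            unsigned short weight, new_weight, orig_weight;
            int ioprio_changed;
    };

    static void request_weight_change(struct entity *e, unsigned short w)
    {
            e->new_weight = w;      /* not effective yet */
            e->ioprio_changed = 1;  /* the flag also covers weight changes */
    }

    static void sync_entity(struct entity *e)
    {
            if (!e->ioprio_changed)
                    return;
            e->weight = e->orig_weight = e->new_weight;
            e->ioprio_changed = 0;
    }

    int main(void)
    {
            struct entity e = { .weight = 8, .new_weight = 8, .orig_weight = 8 };

            request_weight_change(&e, 500);
            printf("before sync: %u\n", e.weight); /* still 8 */
            sync_entity(&e);
            printf("after sync:  %u\n", e.weight); /* 500 */
            return 0;
    }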
@@ -168,6 +183,7 @@ struct bfq_group;
* completed requests.
* @hw_tag_samples: nr of samples used to calculate hw_tag.
* @hw_tag: flag set to one if the driver is showing a queueing behavior.
+ * @budgets_assigned: number of budgets assigned.
* @idle_slice_timer: timer set when idling for the next sequential request
* from the queue under service.
* @unplug_work: delayed work to restart dispatching on the request queue.
@@ -216,6 +232,8 @@ struct bfq_data {
+ int budgets_assigned;
struct timer_list idle_slice_timer;
struct work_struct unplug_work;
@@ -228,7 +246,7 @@ struct bfq_data {
ktime_t last_idling_start;
int peak_rate_samples;
- bfq_service_t bfq_max_budget;
+ unsigned long bfq_max_budget;
struct list_head cic_list;
struct hlist_head group_list;
@@ -244,6 +262,8 @@ struct bfq_data {
unsigned int bfq_user_max_budget;
unsigned int bfq_max_budget_async_rq;
unsigned int bfq_timeout[2];
1986 * @max_budget: maximum budget allowed from the feedback mechanism.
1987 * @budget_timeout: budget expiration (in jiffies).
1988 * @dispatched: number of requests on the dispatch list or inside driver.
1989 - * @budgets_assigned: number of budgets assigned.
1990 * @org_ioprio: saved ioprio during boosted periods.
1991 * @org_ioprio_class: saved ioprio_class during boosted periods.
1992 * @flags: status flags.
1993 * @bfqq_list: node for active/idle bfqq list inside our bfqd.
1994 + * @seek_samples: number of seeks sampled
1995 + * @seek_total: sum of the distances of the seeks sampled
1996 + * @seek_mean: mean seek distance
1997 + * @last_request_pos: position of the last request enqueued
1998 * @pid: pid of the process owning the queue, used for logging purposes.
1999 + * @last_activation_time: time of the last (idle -> backlogged) transition
2000 + * @high_weight_budget: number of sectors left to serve with boosted weight
2002 * A bfq_queue is a leaf request queue; it can be associated to an io_context
2003 * or more (if it is an async one). @cgroup holds a reference to the
2004 @@ -287,11 +312,10 @@ struct bfq_queue {
@@ -287,11 +312,10 @@ struct bfq_queue {
struct bfq_entity entity;
- bfq_service_t max_budget;
+ unsigned long max_budget;
unsigned long budget_timeout;
- int budgets_assigned;
unsigned short org_ioprio;
unsigned short org_ioprio_class;
@@ -300,7 +324,15 @@ struct bfq_queue {
struct list_head bfqq_list;
+ unsigned int seek_samples;
+ u64 seek_total;
+ sector_t seek_mean;
+ sector_t last_request_pos;
+ u64 last_activation_time;
+ unsigned long high_weight_budget;
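[Editor's illustration, not part of the patch: a hypothetical sketch of how seek statistics like the fields above are typically maintained, modeled on CFQ's exponentially weighted update; the patch hunk with BFQ's actual update function is not shown in this excerpt. Each new request contributes the distance from the previous one to a running mean in which old samples count 7/8 and the new one 1/8.]

    #include <stdio.h>

    typedef unsigned long long sector_t;

    struct seek_stats {
            unsigned int samples;
            unsigned long long total;
            sector_t mean;
            sector_t last_pos;
    };

    static void update_seek_mean(struct seek_stats *s, sector_t pos)
    {
            unsigned long long dist = pos > s->last_pos ? pos - s->last_pos
                                                        : s->last_pos - pos;

            /* weighted running mean: the 256 factor keeps integer precision */
            s->samples = (7 * s->samples + 256) / 8;
            s->total = (7 * s->total + 256 * dist) / 8;
            s->mean = s->total / s->samples;
            s->last_pos = pos;
    }

    int main(void)
    {
            struct seek_stats s = { 0, 0, 0, 0 };

            update_seek_mean(&s, 100);   /* first sample: distance 100 */
            update_seek_mean(&s, 4196);  /* second sample: distance 4096 */
            printf("mean seek distance: %llu\n", (unsigned long long)s.mean);
            return 0;
    }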
enum bfqq_state_flags {
@@ -400,6 +432,7 @@ struct bfq_group {
* struct bfqio_cgroup - bfq cgroup data structure.
* @css: subsystem state for bfq in the containing cgroup.
+ * @weight: cgroup weight.
* @ioprio: cgroup ioprio.
* @ioprio_class: cgroup ioprio_class.
* @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
@@ -411,7 +444,7 @@ struct bfq_group {
struct bfqio_cgroup {
struct cgroup_subsys_state css;
- unsigned short ioprio, ioprio_class;
+ unsigned short weight, ioprio, ioprio_class;
struct hlist_head group_data;