Clean up patch dir; synchronize patches with kernel-power v48
[kernel-bfs] / kernel-bfs-2.6.28 / debian / patches / bfq / iosched-bfq-02-add-bfq-scheduler.patch
1 Subject: [PATCH 02/03] block: introduce the BFQ I/O scheduler
2
3 Add the BFQ I/O scheduler.  The general structure is borrowed from CFQ,
4 as much of the code.  A (bfq_)queue is associated to each task doing I/O
5 on a device, and each time a scheduling decision has to be taken a queue
6 is selected and it is served until it expires.
7
8 The main differences are:
9     - slices are given in the service domain, we call them budgets and
10       are measured in sectors.  The maximum time a budget can take to
11       be consumed is limited by a configurable timeout, that can be
12       used to set a ``desired latency.''
13
14     - Budgets are scheduled according to a variant of WF2Q+, implemented
15       using an augmented rb-tree to take eligibility into account while
16       preserving an O(log N) overall complexity.
17
18     - ioprio values are mapped to weights using the relation
19       weight = IOPRIO_BE_NR - ioprio.
20
21     - ioprio classes are served in strict priority order, i.e., lower
22       priority queues are not served as long as there are higher priority
23       queues.  Among queues in the same classes the bandwidth is distributed
24       in proportion to the weights of each queue.
25
26     - BFQ supports full hierarchical scheduling, exporting a cgroups
27       interface.  Each node has a full scheduler, so each group can
28       be assigned its own ioprio and an ioprio_class.
29
30 Regarding what has not changed it is worth noting:
31     - the handling of cfq_io_contexts to associate queues to tasks.  Much
32       of the code has been reused just renaming it.  (There is room for
33       code sharing with CFQ but we wanted to minimize the impact of this
34       patch.)
35
36     - The handling of async queues.
37
38     - The handling of idle windows.
39
40     - The handling of merging.
41
42     - The heuristics to assert that a task is worth an idle window (with
43       minor modifications to hw_tag/CIC_SEEKY detection).
44
45 Signed-off-by: Fabio Checconi <fabio@gandalf.sssup.it>
46 Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
47 ---
48 diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
49 new file mode 100644
50 index 0000000..efb03fc
51 --- /dev/null
52 +++ b/block/bfq-cgroup.c
53 @@ -0,0 +1,743 @@
54 +/*
55 + * BFQ: CGROUPS support.
56 + *
57 + * Based on ideas and code from CFQ:
58 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
59 + *
60 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
61 + *                   Paolo Valente <paolo.valente@unimore.it>
62 + */
63 +
64 +#ifdef CONFIG_CGROUP_BFQIO
65 +static struct bfqio_cgroup bfqio_root_cgroup = {
66 +       .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
67 +       .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
68 +};
69 +
70 +static inline void bfq_init_entity(struct bfq_entity *entity,
71 +                                  struct bfq_group *bfqg)
72 +{
73 +       entity->ioprio = entity->new_ioprio;
74 +       entity->ioprio_class = entity->new_ioprio_class;
75 +       entity->parent = bfqg->my_entity;
76 +       entity->sched_data = &bfqg->sched_data;
77 +}
78 +
79 +static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
80 +{
81 +       return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
82 +                           struct bfqio_cgroup, css);
83 +}
84 +
85 +/*
86 + * Search the bfq_group for bfqd into the hash table (by now only a list)
87 + * of bgrp.  Must be called under rcu_read_lock().
88 + */
89 +static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
90 +                                           struct bfq_data *bfqd)
91 +{
92 +       struct bfq_group *bfqg;
93 +       struct hlist_node *n;
94 +       void *key;
95 +
96 +       hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) {
97 +               key = rcu_dereference(bfqg->bfqd);
98 +               if (key == bfqd)
99 +                       return bfqg;
100 +       }
101 +
102 +       return NULL;
103 +}
104 +
105 +static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
106 +                                        struct bfq_group *bfqg)
107 +{
108 +       struct bfq_entity *entity = &bfqg->entity;
109 +
110 +       entity->ioprio = entity->new_ioprio = bgrp->ioprio;
111 +       entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
112 +       entity->ioprio_changed = 1;
113 +       entity->my_sched_data = &bfqg->sched_data;
114 +}
115 +
116 +static inline void bfq_group_set_parent(struct bfq_group *bfqg,
117 +                                       struct bfq_group *parent)
118 +{
119 +       struct bfq_entity *entity;
120 +
121 +       BUG_ON(parent == NULL);
122 +       BUG_ON(bfqg == NULL);
123 +
124 +       entity = &bfqg->entity;
125 +       entity->parent = parent->my_entity;
126 +       entity->sched_data = &parent->sched_data;
127 +}
128 +
129 +/**
130 + * bfq_group_chain_alloc - allocate a chain of groups.
131 + * @bfqd: queue descriptor.
132 + * @cgroup: the leaf cgroup this chain starts from.
133 + *
134 + * Allocate a chain of groups starting from the one belonging to
135 + * @cgroup up to the root cgroup.  Stop if a cgroup on the chain
136 + * to the root has already an allocated group on @bfqd.
137 + */
138 +static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
139 +                                              struct cgroup *cgroup)
140 +{
141 +       struct bfqio_cgroup *bgrp;
142 +       struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
143 +
144 +       for (; cgroup != NULL; cgroup = cgroup->parent) {
145 +               bgrp = cgroup_to_bfqio(cgroup);
146 +
147 +               bfqg = bfqio_lookup_group(bgrp, bfqd);
148 +               if (bfqg != NULL) {
149 +                       /*
150 +                        * All the cgroups in the path from there to the
151 +                        * root must have a bfq_group for bfqd, so we don't
152 +                        * need any more allocations.
153 +                        */
154 +                       break;
155 +               }
156 +
157 +               bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
158 +               if (bfqg == NULL)
159 +                       goto cleanup;
160 +
161 +               bfq_group_init_entity(bgrp, bfqg);
162 +               bfqg->my_entity = &bfqg->entity;
163 +
164 +               if (leaf == NULL) {
165 +                       leaf = bfqg;
166 +                       prev = leaf;
167 +               } else {
168 +                       bfq_group_set_parent(prev, bfqg);
169 +                       /*
170 +                        * Build a list of allocated nodes using the bfqd
171 +                        * filed, that is still unused and will be initialized
172 +                        * only after the node will be connected.
173 +                        */
174 +                       prev->bfqd = bfqg;
175 +                       prev = bfqg;
176 +               }
177 +       }
178 +
179 +       return leaf;
180 +
181 +cleanup:
182 +       while (leaf != NULL) {
183 +               prev = leaf;
184 +               leaf = leaf->bfqd;
185 +               kfree(prev);
186 +       }
187 +
188 +       return NULL;
189 +}
190 +
191 +/**
192 + * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy.
193 + * @bfqd: the queue descriptor.
194 + * @cgroup: the leaf cgroup to start from.
195 + * @leaf: the leaf group (to be associated to @cgroup).
196 + *
197 + * Try to link a chain of groups to a cgroup hierarchy, connecting the
198 + * nodes bottom-up, so we can be sure that when we find a cgroup in the
199 + * hierarchy that already as a group associated to @bfqd all the nodes
200 + * in the path to the root cgroup have one too.
201 + *
202 + * On locking: the queue lock protects the hierarchy (there is a hierarchy
203 + * per device) while the bfqio_cgroup lock protects the list of groups
204 + * belonging to the same cgroup.
205 + */
206 +static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
207 +                                struct bfq_group *leaf)
208 +{
209 +       struct bfqio_cgroup *bgrp;
210 +       struct bfq_group *bfqg, *next, *prev = NULL;
211 +       unsigned long flags;
212 +
213 +       assert_spin_locked(bfqd->queue->queue_lock);
214 +
215 +       for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
216 +               bgrp = cgroup_to_bfqio(cgroup);
217 +               next = leaf->bfqd;
218 +
219 +               bfqg = bfqio_lookup_group(bgrp, bfqd);
220 +               BUG_ON(bfqg != NULL);
221 +
222 +               spin_lock_irqsave(&bgrp->lock, flags);
223 +
224 +               rcu_assign_pointer(leaf->bfqd, bfqd);
225 +               hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
226 +               hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
227 +
228 +               spin_unlock_irqrestore(&bgrp->lock, flags);
229 +
230 +               prev = leaf;
231 +               leaf = next;
232 +       }
233 +
234 +       BUG_ON(cgroup == NULL && leaf != NULL);
235 +       if (cgroup != NULL && prev != NULL) {
236 +               bgrp = cgroup_to_bfqio(cgroup);
237 +               bfqg = bfqio_lookup_group(bgrp, bfqd);
238 +               bfq_group_set_parent(prev, bfqg);
239 +       }
240 +}
241 +
242 +/**
243 + * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
244 + * @bfqd: queue descriptor.
245 + * @cgroup: cgroup being searched for.
246 + *
247 + * Return a group associated to @bfqd in @cgroup, allocating one if
248 + * necessary.  When a group is returned all the cgroups in the path
249 + * to the root have a group associated to @bfqd.
250 + *
251 + * If the allocation fails, return the root group: this breaks guarantees
252 + * but is a safe fallbak.  If this loss becames a problem it can be
253 + * mitigated using the equivalent weight (given by the product of the
254 + * weights of the groups in the path from @group to the root) in the
255 + * root scheduler.
256 + *
257 + * We allocate all the missing nodes in the path from the leaf cgroup
258 + * to the root and we connect the nodes only after all the allocations
259 + * have been successful.
260 + */
261 +static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
262 +                                             struct cgroup *cgroup)
263 +{
264 +       struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
265 +       struct bfq_group *bfqg;
266 +
267 +       bfqg = bfqio_lookup_group(bgrp, bfqd);
268 +       if (bfqg != NULL)
269 +               return bfqg;
270 +
271 +       bfqg = bfq_group_chain_alloc(bfqd, cgroup);
272 +       if (bfqg != NULL)
273 +               bfq_group_chain_link(bfqd, cgroup, bfqg);
274 +       else
275 +               bfqg = bfqd->root_group;
276 +
277 +       return bfqg;
278 +}
279 +
280 +/**
281 + * bfq_bfqq_move - migrate @bfqq to @bfqg.
282 + * @bfqd: queue descriptor.
283 + * @bfqq: the queue to move.
284 + * @entity: @bfqq's entity.
285 + * @bfqg: the group to move to.
286 + *
287 + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
288 + * it on the new one.  Avoid putting the entity on the old group idle tree.
289 + *
290 + * Must be called under the queue lock; the cgroup owning @bfqg must
291 + * not disappear (by now this just means that we are called under
292 + * rcu_read_lock()).
293 + */
294 +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
295 +                         struct bfq_entity *entity, struct bfq_group *bfqg)
296 +{
297 +       int busy, resume;
298 +
299 +       busy = bfq_bfqq_busy(bfqq);
300 +       resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
301 +
302 +       BUG_ON(resume && !entity->on_st);
303 +       BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue);
304 +
305 +       if (busy) {
306 +               BUG_ON(atomic_read(&bfqq->ref) < 2);
307 +
308 +               if (!resume)
309 +                       bfq_del_bfqq_busy(bfqd, bfqq, 0);
310 +               else
311 +                       bfq_deactivate_bfqq(bfqd, bfqq, 0);
312 +       }
313 +
314 +       /*
315 +        * Here we use a reference to bfqg.  We don't need a refcounter
316 +        * as the cgroup reference will not be dropped, so that its
317 +        * destroy() callback will not be invoked.
318 +        */
319 +       entity->parent = bfqg->my_entity;
320 +       entity->sched_data = &bfqg->sched_data;
321 +
322 +       if (busy && resume)
323 +               bfq_activate_bfqq(bfqd, bfqq);
324 +}
325 +
326 +/**
327 + * __bfq_cic_change_cgroup - move @cic to @cgroup.
328 + * @bfqd: the queue descriptor.
329 + * @cic: the cic to move.
330 + * @cgroup: the cgroup to move to.
331 + *
332 + * Move cic to cgroup, assuming that bfqd->queue is locked; the caller
333 + * has to make sure that the reference to cgroup is valid across the call.
334 + *
335 + * NOTE: an alternative approach might have been to store the current
336 + * cgroup in bfqq and getting a reference to it, reducing the lookup
337 + * time here, at the price of slightly more complex code.
338 + */
339 +static struct bfq_group *__bfq_cic_change_cgroup(struct bfq_data *bfqd,
340 +                                                struct cfq_io_context *cic,
341 +                                                struct cgroup *cgroup)
342 +{
343 +       struct bfq_queue *async_bfqq = cic_to_bfqq(cic, 0);
344 +       struct bfq_queue *sync_bfqq = cic_to_bfqq(cic, 1);
345 +       struct bfq_entity *entity;
346 +       struct bfq_group *bfqg;
347 +       struct bfqio_cgroup *bgrp;
348 +
349 +       bgrp = cgroup_to_bfqio(cgroup);
350 +
351 +       bfqg = bfq_find_alloc_group(bfqd, cgroup);
352 +       if (async_bfqq != NULL) {
353 +               entity = &async_bfqq->entity;
354 +
355 +               if (entity->sched_data != &bfqg->sched_data) {
356 +                       cic_set_bfqq(cic, NULL, 0);
357 +                       bfq_put_queue(async_bfqq);
358 +               }
359 +       }
360 +
361 +       if (sync_bfqq != NULL) {
362 +               entity = &sync_bfqq->entity;
363 +               if (entity->sched_data != &bfqg->sched_data)
364 +                       bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
365 +       }
366 +
367 +       return bfqg;
368 +}
369 +
370 +/**
371 + * bfq_cic_change_cgroup - move @cic to @cgroup.
372 + * @cic: the cic being migrated.
373 + * @cgroup: the destination cgroup.
374 + *
375 + * When the task owning @cic is moved to @cgroup, @cic is immediately
376 + * moved into its new parent group.
377 + */
378 +static void bfq_cic_change_cgroup(struct cfq_io_context *cic,
379 +                                 struct cgroup *cgroup)
380 +{
381 +       struct bfq_data *bfqd;
382 +       unsigned long uninitialized_var(flags);
383 +
384 +       bfqd = bfq_get_bfqd_locked(&cic->key, &flags);
385 +       if (bfqd != NULL) {
386 +               __bfq_cic_change_cgroup(bfqd, cic, cgroup);
387 +               bfq_put_bfqd_unlock(bfqd, &flags);
388 +       }
389 +}
390 +
391 +/**
392 + * bfq_cic_update_cgroup - update the cgroup of @cic.
393 + * @cic: the @cic to update.
394 + *
395 + * Make sure that @cic is enqueued in the cgroup of the current task.
396 + * We need this in addition to moving cics during the cgroup attach
397 + * phase because the task owning @cic could be at its first disk
398 + * access or we may end up in the root cgroup as the result of a
399 + * memory allocation failure and here we try to move to the right
400 + * group.
401 + *
402 + * Must be called under the queue lock.  It is safe to use the returned
403 + * value even after the rcu_read_unlock() as the migration/destruction
404 + * paths act under the queue lock too.  IOW it is impossible to race with
405 + * group migration/destruction and end up with an invalid group as:
406 + *   a) here cgroup has not yet been destroyed, nor its destroy callback
407 + *      has started execution, as current holds a reference to it,
408 + *   b) if it is destroyed after rcu_read_unlock() [after current is
409 + *      migrated to a different cgroup] its attach() callback will have
410 + *      taken care of remove all the references to the old cgroup data.
411 + */
412 +static struct bfq_group *bfq_cic_update_cgroup(struct cfq_io_context *cic)
413 +{
414 +       struct bfq_data *bfqd = cic->key;
415 +       struct bfq_group *bfqg;
416 +       struct cgroup *cgroup;
417 +
418 +       BUG_ON(bfqd == NULL);
419 +
420 +       rcu_read_lock();
421 +       cgroup = task_cgroup(current, bfqio_subsys_id);
422 +       bfqg = __bfq_cic_change_cgroup(bfqd, cic, cgroup);
423 +       rcu_read_unlock();
424 +
425 +       return bfqg;
426 +}
427 +
428 +/**
429 + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
430 + * @st: the service tree being flushed.
431 + */
432 +static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
433 +{
434 +       struct bfq_entity *entity = st->first_idle;
435 +
436 +       for (; entity != NULL; entity = st->first_idle)
437 +               __bfq_deactivate_entity(entity, 0);
438 +}
439 +
440 +/**
441 + * bfq_destroy_group - destroy @bfqg.
442 + * @bgrp: the bfqio_cgroup containing @bfqg.
443 + * @bfqg: the group being destroyed.
444 + *
445 + * Destroy @bfqg, making sure that it is not referenced from its parent.
446 + */
447 +static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
448 +{
449 +       struct bfq_data *bfqd;
450 +       struct bfq_service_tree *st;
451 +       struct bfq_entity *entity = bfqg->my_entity;
452 +       unsigned long uninitialized_var(flags);
453 +       int i;
454 +
455 +       hlist_del(&bfqg->group_node);
456 +
457 +       /*
458 +        * We may race with device destruction, take extra care when
459 +        * dereferencing bfqg->bfqd.
460 +        */
461 +       bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
462 +       if (bfqd != NULL) {
463 +               hlist_del(&bfqg->bfqd_node);
464 +               __bfq_deactivate_entity(entity, 0);
465 +               bfq_put_async_queues(bfqd, bfqg);
466 +               bfq_put_bfqd_unlock(bfqd, &flags);
467 +       }
468 +
469 +       for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
470 +               st = bfqg->sched_data.service_tree + i;
471 +
472 +               /*
473 +                * The idle tree may still contain bfq_queues belonging
474 +                * to exited task because they never migrated to a different
475 +                * cgroup from the one being destroyed now.  Noone else
476 +                * can access them so it's safe to act without any lock.
477 +                */
478 +               bfq_flush_idle_tree(st);
479 +
480 +               BUG_ON(!RB_EMPTY_ROOT(&st->active));
481 +               BUG_ON(!RB_EMPTY_ROOT(&st->idle));
482 +       }
483 +       BUG_ON(bfqg->sched_data.next_active != NULL);
484 +       BUG_ON(bfqg->sched_data.active_entity != NULL);
485 +       BUG_ON(entity->tree != NULL);
486 +
487 +       /*
488 +        * No need to defer the kfree() to the end of the RCU grace
489 +        * period: we are called from the destroy() callback of our
490 +        * cgroup, so we can be sure that noone is a) still using
491 +        * this cgroup or b) doing lookups in it.
492 +        */
493 +       kfree(bfqg);
494 +}
495 +
496 +/**
497 + * bfq_disconnect_groups - diconnect @bfqd from all its groups.
498 + * @bfqd: the device descriptor being exited.
499 + *
500 + * When the device exits we just make sure that no lookup can return
501 + * the now unused group structures.  They will be deallocated on cgroup
502 + * destruction.
503 + */
504 +static void bfq_disconnect_groups(struct bfq_data *bfqd)
505 +{
506 +       struct hlist_node *pos, *n;
507 +       struct bfq_group *bfqg;
508 +
509 +       hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) {
510 +               hlist_del(&bfqg->bfqd_node);
511 +
512 +               __bfq_deactivate_entity(bfqg->my_entity, 0);
513 +
514 +               /*
515 +                * Don't remove from the group hash, just set an
516 +                * invalid key.  No lookups can race with the
517 +                * assignment as bfqd is being destroyed; this
518 +                * implies also that new elements cannot be added
519 +                * to the list.
520 +                */
521 +               rcu_assign_pointer(bfqg->bfqd, NULL);
522 +               bfq_put_async_queues(bfqd, bfqg);
523 +       }
524 +}
525 +
526 +static inline void bfq_free_root_group(struct bfq_data *bfqd)
527 +{
528 +       struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
529 +       struct bfq_group *bfqg = bfqd->root_group;
530 +
531 +       spin_lock_irq(&bgrp->lock);
532 +       hlist_del_rcu(&bfqg->group_node);
533 +       spin_unlock_irq(&bgrp->lock);
534 +
535 +       /*
536 +        * No need to synchronize_rcu() here: since the device is gone
537 +        * there cannot be any read-side access to its root_group.
538 +        */
539 +       kfree(bfqg);
540 +}
541 +
542 +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
543 +{
544 +       struct bfq_group *bfqg;
545 +       struct bfqio_cgroup *bgrp;
546 +       int i;
547 +
548 +       bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
549 +       if (bfqg == NULL)
550 +               return NULL;
551 +
552 +       bfqg->entity.parent = NULL;
553 +       for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
554 +               bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
555 +
556 +       bgrp = &bfqio_root_cgroup;
557 +       spin_lock_irq(&bgrp->lock);
558 +       rcu_assign_pointer(bfqg->bfqd, bfqd);
559 +       hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
560 +       spin_unlock_irq(&bgrp->lock);
561 +
562 +       return bfqg;
563 +}
564 +
565 +#define SHOW_FUNCTION(__VAR)                                           \
566 +static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup,          \
567 +                                      struct cftype *cftype)           \
568 +{                                                                      \
569 +       struct bfqio_cgroup *bgrp;                                      \
570 +       u64 ret;                                                        \
571 +                                                                       \
572 +       if (!cgroup_lock_live_group(cgroup))                            \
573 +               return -ENODEV;                                         \
574 +                                                                       \
575 +       bgrp = cgroup_to_bfqio(cgroup);                                 \
576 +       spin_lock_irq(&bgrp->lock);                                     \
577 +       ret = bgrp->__VAR;                                              \
578 +       spin_unlock_irq(&bgrp->lock);                                   \
579 +                                                                       \
580 +       cgroup_unlock();                                                \
581 +                                                                       \
582 +       return ret;                                                     \
583 +}
584 +
585 +SHOW_FUNCTION(ioprio);
586 +SHOW_FUNCTION(ioprio_class);
587 +#undef SHOW_FUNCTION
588 +
589 +#define STORE_FUNCTION(__VAR, __MIN, __MAX)                            \
590 +static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup,         \
591 +                                       struct cftype *cftype,          \
592 +                                       u64 val)                        \
593 +{                                                                      \
594 +       struct bfqio_cgroup *bgrp;                                      \
595 +       struct bfq_group *bfqg;                                         \
596 +       struct hlist_node *n;                                           \
597 +                                                                       \
598 +       if (val < (__MIN) || val > (__MAX))                             \
599 +               return -EINVAL;                                         \
600 +                                                                       \
601 +       if (!cgroup_lock_live_group(cgroup))                            \
602 +               return -ENODEV;                                         \
603 +                                                                       \
604 +       bgrp = cgroup_to_bfqio(cgroup);                                 \
605 +                                                                       \
606 +       spin_lock_irq(&bgrp->lock);                                     \
607 +       bgrp->__VAR = (unsigned char)val;                               \
608 +       hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) {  \
609 +               bfqg->entity.new_##__VAR = (unsigned char)val;          \
610 +               smp_wmb();                                              \
611 +               bfqg->entity.ioprio_changed = 1;                        \
612 +       }                                                               \
613 +       spin_unlock_irq(&bgrp->lock);                                   \
614 +                                                                       \
615 +       cgroup_unlock();                                                \
616 +                                                                       \
617 +       return 0;                                                       \
618 +}
619 +
620 +STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
621 +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
622 +#undef STORE_FUNCTION
623 +
624 +static struct cftype bfqio_files[] = {
625 +       {
626 +               .name = "ioprio",
627 +               .read_u64 = bfqio_cgroup_ioprio_read,
628 +               .write_u64 = bfqio_cgroup_ioprio_write,
629 +       },
630 +       {
631 +               .name = "ioprio_class",
632 +               .read_u64 = bfqio_cgroup_ioprio_class_read,
633 +               .write_u64 = bfqio_cgroup_ioprio_class_write,
634 +       },
635 +};
636 +
637 +static int bfqio_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
638 +{
639 +       return cgroup_add_files(cgroup, subsys, bfqio_files,
640 +                               ARRAY_SIZE(bfqio_files));
641 +}
642 +
643 +static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys *subsys,
644 +                                               struct cgroup *cgroup)
645 +{
646 +       struct bfqio_cgroup *bgrp;
647 +
648 +       if (cgroup->parent != NULL) {
649 +               bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
650 +               if (bgrp == NULL)
651 +                       return ERR_PTR(-ENOMEM);
652 +       } else
653 +               bgrp = &bfqio_root_cgroup;
654 +
655 +       spin_lock_init(&bgrp->lock);
656 +       INIT_HLIST_HEAD(&bgrp->group_data);
657 +       bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
658 +       bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
659 +
660 +       return &bgrp->css;
661 +}
662 +
663 +/*
664 + * We cannot support shared io contexts, as we have no mean to support
665 + * two tasks with the same ioc in two different groups without major rework
666 + * of the main cic/bfqq data structures.  By now we allow a task to change
667 + * its cgroup only if it's the only owner of its ioc; the drawback of this
668 + * behavior is that a group containing a task that forked using CLONE_IO
669 + * will not be destroyed until the tasks sharing the ioc die.
670 + */
671 +static int bfqio_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
672 +                           struct task_struct *tsk)
673 +{
674 +       struct io_context *ioc;
675 +       int ret = 0;
676 +
677 +       /* task_lock() is needed to avoid races with exit_io_context() */
678 +       task_lock(tsk);
679 +       ioc = tsk->io_context;
680 +       if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
681 +               /*
682 +                * ioc == NULL means that the task is either too young or
683 +                * exiting: if it has still no ioc the ioc can't be shared,
684 +                * if the task is exiting the attach will fail anyway, no
685 +                * matter what we return here.
686 +                */
687 +               ret = -EINVAL;
688 +       task_unlock(tsk);
689 +
690 +       return ret;
691 +}
692 +
693 +static void bfqio_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
694 +                        struct cgroup *prev, struct task_struct *tsk)
695 +{
696 +       struct io_context *ioc;
697 +       struct cfq_io_context *cic;
698 +       struct hlist_node *n;
699 +
700 +       task_lock(tsk);
701 +       ioc = tsk->io_context;
702 +       if (ioc != NULL) {
703 +               BUG_ON(atomic_read(&ioc->refcount) == 0);
704 +               atomic_inc(&ioc->refcount);
705 +       }
706 +       task_unlock(tsk);
707 +
708 +       if (ioc == NULL)
709 +               return;
710 +
711 +       rcu_read_lock();
712 +       hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list)
713 +               bfq_cic_change_cgroup(cic, cgroup);
714 +       rcu_read_unlock();
715 +
716 +       put_io_context(ioc);
717 +}
718 +
719 +static void bfqio_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
720 +{
721 +       struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
722 +       struct hlist_node *n, *tmp;
723 +       struct bfq_group *bfqg;
724 +
725 +       /*
726 +        * Since we are destroying the cgroup, there are no more tasks
727 +        * referencing it, and all the RCU grace periods that may have
728 +        * referenced it are ended (as the destruction of the parent
729 +        * cgroup is RCU-safe); bgrp->group_data will not be accessed by
730 +        * anything else and we don't need any synchronization.
731 +        */
732 +       hlist_for_each_entry_safe(bfqg, n, tmp, &bgrp->group_data, group_node)
733 +               bfq_destroy_group(bgrp, bfqg);
734 +
735 +       BUG_ON(!hlist_empty(&bgrp->group_data));
736 +
737 +       kfree(bgrp);
738 +}
739 +
740 +struct cgroup_subsys bfqio_subsys = {
741 +       .name = "bfqio",
742 +       .create = bfqio_create,
743 +       .can_attach = bfqio_can_attach,
744 +       .attach = bfqio_attach,
745 +       .destroy = bfqio_destroy,
746 +       .populate = bfqio_populate,
747 +       .subsys_id = bfqio_subsys_id,
748 +};
749 +#else
750 +static inline void bfq_init_entity(struct bfq_entity *entity,
751 +                                  struct bfq_group *bfqg)
752 +{
753 +       entity->ioprio = entity->new_ioprio;
754 +       entity->ioprio_class = entity->new_ioprio_class;
755 +       entity->sched_data = &bfqg->sched_data;
756 +}
757 +
758 +static inline struct bfq_group *
759 +bfq_cic_update_cgroup(struct cfq_io_context *cic)
760 +{
761 +       struct bfq_data *bfqd = cic->key;
762 +       return bfqd->root_group;
763 +}
764 +
765 +static inline void bfq_bfqq_move(struct bfq_data *bfqd,
766 +                                struct bfq_queue *bfqq,
767 +                                struct bfq_entity *entity,
768 +                                struct bfq_group *bfqg)
769 +{
770 +}
771 +
772 +static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
773 +{
774 +       bfq_put_async_queues(bfqd, bfqd->root_group);
775 +}
776 +
777 +static inline void bfq_free_root_group(struct bfq_data *bfqd)
778 +{
779 +       kfree(bfqd->root_group);
780 +}
781 +
782 +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
783 +{
784 +       struct bfq_group *bfqg;
785 +       int i;
786 +
787 +       bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
788 +       if (bfqg == NULL)
789 +               return NULL;
790 +
791 +       for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
792 +               bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
793 +
794 +       return bfqg;
795 +}
796 +#endif
797 diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
798 new file mode 100644
799 index 0000000..8b91d65
800 --- /dev/null
801 +++ b/block/bfq-ioc.c
802 @@ -0,0 +1,375 @@
803 +/*
804 + * BFQ: I/O context handling.
805 + *
806 + * Based on ideas and code from CFQ:
807 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
808 + *
809 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
810 + *                   Paolo Valente <paolo.valente@unimore.it>
811 + */
812 +
813 +/**
814 + * bfq_cic_free_rcu - deferred cic freeing.
815 + * @head: RCU head of the cic to free.
816 + *
817 + * Free the cic containing @head and, if it was the last one and
818 + * the module is exiting wake up anyone waiting for its deallocation
819 + * (see bfq_exit()).
820 + */
821 +static void bfq_cic_free_rcu(struct rcu_head *head)
822 +{
823 +       struct cfq_io_context *cic;
824 +
825 +       cic = container_of(head, struct cfq_io_context, rcu_head);
826 +
827 +       kmem_cache_free(bfq_ioc_pool, cic);
828 +       elv_ioc_count_dec(bfq_ioc_count);
829 +
830 +       if (bfq_ioc_gone != NULL) {
831 +               spin_lock(&bfq_ioc_gone_lock);
832 +               if (bfq_ioc_gone != NULL &&
833 +                   !elv_ioc_count_read(bfq_ioc_count)) {
834 +                       complete(bfq_ioc_gone);
835 +                       bfq_ioc_gone = NULL;
836 +               }
837 +               spin_unlock(&bfq_ioc_gone_lock);
838 +       }
839 +}
840 +
841 +static void bfq_cic_free(struct cfq_io_context *cic)
842 +{
843 +       call_rcu(&cic->rcu_head, bfq_cic_free_rcu);
844 +}
845 +
846 +/**
847 + * cic_free_func - disconnect a cic ready to be freed.
848 + * @ioc: the io_context @cic belongs to.
849 + * @cic: the cic to be freed.
850 + *
851 + * Remove @cic from the @ioc radix tree hash and from its cic list,
852 + * deferring the deallocation of @cic to the end of the current RCU
853 + * grace period.  This assumes that __bfq_exit_single_io_context()
854 + * has already been called for @cic.
855 + */
856 +static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
857 +{
858 +       unsigned long flags;
859 +
860 +       BUG_ON(cic->dead_key == 0);
861 +
862 +       spin_lock_irqsave(&ioc->lock, flags);
863 +       radix_tree_delete(&ioc->bfq_radix_root, cic->dead_key);
864 +       hlist_del_init_rcu(&cic->cic_list);
865 +       spin_unlock_irqrestore(&ioc->lock, flags);
866 +
867 +       bfq_cic_free(cic);
868 +}
869 +
870 +static void bfq_free_io_context(struct io_context *ioc)
871 +{
872 +       /*
873 +        * ioc->refcount is zero here, or we are called from elv_unregister(),
874 +        * so no more cic's are allowed to be linked into this ioc.  So it
875 +        * should be ok to iterate over the known list, we will see all cic's
876 +        * since no new ones are added.
877 +        */
878 +       call_for_each_cic(ioc, cic_free_func);
879 +}
880 +
881 +/**
882 + * __bfq_exit_single_io_context - deassociate @cic from any running task.
883 + * @bfqd: bfq_data on which @cic is valid.
884 + * @cic: the cic being exited.
885 + *
886 + * Whenever no more tasks are using @cic or @bfqd is deallocated we
887 + * need to invalidate its entry in the radix tree hash table and to
888 + * release the queues it refers to.  Save the key used for insertion
889 + * in @cic->dead_key to remove @cic from the radix tree later and assign
890 + * %NULL to its search key to prevent future lookups to succeed on it.
891 + *
892 + * Called under the queue lock.
893 + */
894 +static void __bfq_exit_single_io_context(struct bfq_data *bfqd,
895 +                                        struct cfq_io_context *cic)
896 +{
897 +       struct io_context *ioc = cic->ioc;
898 +
899 +       list_del_init(&cic->queue_list);
900 +
901 +       /*
902 +        * Make sure key == NULL is seen for dead queues.
903 +        */
904 +       cic->dead_key = (unsigned long)cic->key;
905 +       smp_wmb();
906 +
907 +       rcu_assign_pointer(cic->key, NULL);
908 +
909 +       /*
910 +        * No write-side locking as no task is using @ioc (they're exited
911 +        * or bfqd is being deallocated.
912 +        */
913 +       if (ioc->ioc_data == cic)
914 +               rcu_assign_pointer(ioc->ioc_data, NULL);
915 +
916 +       if (cic->cfqq[ASYNC] != NULL) {
917 +               bfq_exit_bfqq(bfqd, cic->cfqq[ASYNC]);
918 +               cic->cfqq[ASYNC] = NULL;
919 +       }
920 +
921 +       if (cic->cfqq[SYNC] != NULL) {
922 +               bfq_exit_bfqq(bfqd, cic->cfqq[SYNC]);
923 +               cic->cfqq[SYNC] = NULL;
924 +       }
925 +}
926 +
927 +/**
928 + * bfq_exit_single_io_context - deassociate @cic from @ioc (unlocked version).
929 + * @ioc: the io_context @cic belongs to.
930 + * @cic: the cic being exited.
931 + *
932 + * Take the queue lock and call __bfq_exit_single_io_context() to do the
933 + * rest of the work.  We take care of possible races with bfq_exit_queue()
934 + * using bfq_get_bfqd_locked() (and abusing a little bit the RCU mechanism).
935 + */
936 +static void bfq_exit_single_io_context(struct io_context *ioc,
937 +                                      struct cfq_io_context *cic)
938 +{
939 +       struct bfq_data *bfqd;
940 +       unsigned long uninitialized_var(flags);
941 +
942 +       bfqd = bfq_get_bfqd_locked(&cic->key, &flags);
943 +       if (bfqd != NULL) {
944 +               __bfq_exit_single_io_context(bfqd, cic);
945 +               bfq_put_bfqd_unlock(bfqd, &flags);
946 +       }
947 +}
948 +
949 +/**
950 + * bfq_exit_io_context - deassociate @ioc from all cics it owns.
951 + * @ioc: the @ioc being exited.
952 + *
953 + * No more processes are using @ioc we need to clean up and put the
954 + * internal structures we have that belongs to that process.  Loop
955 + * through all its cics, locking their queues and exiting them.
956 + */
957 +static void bfq_exit_io_context(struct io_context *ioc)
958 +{
959 +       call_for_each_cic(ioc, bfq_exit_single_io_context);
960 +}
961 +
962 +static struct cfq_io_context *bfq_alloc_io_context(struct bfq_data *bfqd,
963 +                                                  gfp_t gfp_mask)
964 +{
965 +       struct cfq_io_context *cic;
966 +
967 +       cic = kmem_cache_alloc_node(bfq_ioc_pool, gfp_mask | __GFP_ZERO,
968 +                                                       bfqd->queue->node);
969 +       if (cic != NULL) {
970 +               cic->last_end_request = jiffies;
971 +               INIT_LIST_HEAD(&cic->queue_list);
972 +               INIT_HLIST_NODE(&cic->cic_list);
973 +               cic->dtor = bfq_free_io_context;
974 +               cic->exit = bfq_exit_io_context;
975 +               elv_ioc_count_inc(bfq_ioc_count);
976 +       }
977 +
978 +       return cic;
979 +}
980 +
981 +/**
982 + * bfq_drop_dead_cic - free an exited cic.
983 + * @bfqd: bfq data for the device in use.
984 + * @ioc: io_context owning @cic.
985 + * @cic: the @cic to free.
986 + *
987 + * We drop cfq io contexts lazily, so we may find a dead one.
988 + */
989 +static void bfq_drop_dead_cic(struct bfq_data *bfqd, struct io_context *ioc,
990 +                             struct cfq_io_context *cic)
991 +{
992 +       unsigned long flags;
993 +
994 +       WARN_ON(!list_empty(&cic->queue_list));
995 +
996 +       spin_lock_irqsave(&ioc->lock, flags);
997 +
998 +       BUG_ON(ioc->ioc_data == cic);
999 +
1000 +       /*
1001 +        * With shared I/O contexts two lookups may race and drop the
1002 +        * same cic more than one time: RCU guarantees that the storage
1003 +        * will not be freed too early, here we make sure that we do
1004 +        * not try to remove the cic from the hashing structures multiple
1005 +        * times.
1006 +        */
1007 +       if (!hlist_unhashed(&cic->cic_list)) {
1008 +               radix_tree_delete(&ioc->bfq_radix_root, (unsigned long)bfqd);
1009 +               hlist_del_init_rcu(&cic->cic_list);
1010 +               bfq_cic_free(cic);
1011 +       }
1012 +
1013 +       spin_unlock_irqrestore(&ioc->lock, flags);
1014 +}
1015 +
1016 +/**
1017 + * bfq_cic_lookup - search into @ioc a cic associated to @bfqd.
1018 + * @bfqd: the lookup key.
1019 + * @ioc: the io_context of the process doing I/O.
1020 + *
1021 + * If @ioc already has a cic associated to @bfqd return it, return %NULL
1022 + * otherwise.
1023 + */
1024 +static struct cfq_io_context *bfq_cic_lookup(struct bfq_data *bfqd,
1025 +                                            struct io_context *ioc)
1026 +{
1027 +       struct cfq_io_context *cic;
1028 +       unsigned long flags;
1029 +       void *k;
1030 +
1031 +       if (unlikely(ioc == NULL))
1032 +               return NULL;
1033 +
1034 +       rcu_read_lock();
1035 +
1036 +       /* We maintain a last-hit cache, to avoid browsing over the tree. */
1037 +       cic = rcu_dereference(ioc->ioc_data);
1038 +       if (cic != NULL) {
1039 +               k = rcu_dereference(cic->key);
1040 +               if (k == bfqd)
1041 +                       goto out;
1042 +       }
1043 +
1044 +       do {
1045 +               cic = radix_tree_lookup(&ioc->bfq_radix_root,
1046 +                                       (unsigned long)bfqd);
1047 +               if (cic == NULL)
1048 +                       goto out;
1049 +
1050 +               k = rcu_dereference(cic->key);
1051 +               if (unlikely(k == NULL)) {
1052 +                       rcu_read_unlock();
1053 +                       bfq_drop_dead_cic(bfqd, ioc, cic);
1054 +                       rcu_read_lock();
1055 +                       continue;
1056 +               }
1057 +
1058 +               spin_lock_irqsave(&ioc->lock, flags);
1059 +               rcu_assign_pointer(ioc->ioc_data, cic);
1060 +               spin_unlock_irqrestore(&ioc->lock, flags);
1061 +               break;
1062 +       } while (1);
1063 +
1064 +out:
1065 +       rcu_read_unlock();
1066 +
1067 +       return cic;
1068 +}
1069 +
1070 +/**
1071 + * bfq_cic_link - add @cic to @ioc.
1072 + * @bfqd: bfq_data @cic refers to.
1073 + * @ioc: io_context @cic belongs to.
1074 + * @cic: the cic to link.
1075 + * @gfp_mask: the mask to use for radix tree preallocations.
1076 + *
1077 + * Add @cic to @ioc, using @bfqd as the search key.  This enables us to
1078 + * lookup the process specific cfq io context when entered from the block
1079 + * layer.  Also adds @cic to a per-bfqd list, used when this queue is
1080 + * removed.
1081 + */
1082 +static int bfq_cic_link(struct bfq_data *bfqd, struct io_context *ioc,
1083 +                       struct cfq_io_context *cic, gfp_t gfp_mask)
1084 +{
1085 +       unsigned long flags;
1086 +       int ret;
1087 +
1088 +       ret = radix_tree_preload(gfp_mask);
1089 +       if (ret == 0) {
1090 +               cic->ioc = ioc;
1091 +
1092 +               /* No write-side locking, cic is not published yet. */
1093 +               rcu_assign_pointer(cic->key, bfqd);
1094 +
1095 +               spin_lock_irqsave(&ioc->lock, flags);
1096 +               ret = radix_tree_insert(&ioc->bfq_radix_root,
1097 +                                       (unsigned long)bfqd, cic);
1098 +               if (ret == 0)
1099 +                       hlist_add_head_rcu(&cic->cic_list, &ioc->bfq_cic_list);
1100 +               spin_unlock_irqrestore(&ioc->lock, flags);
1101 +
1102 +               radix_tree_preload_end();
1103 +
1104 +               if (ret == 0) {
1105 +                       spin_lock_irqsave(bfqd->queue->queue_lock, flags);
1106 +                       list_add(&cic->queue_list, &bfqd->cic_list);
1107 +                       spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
1108 +               }
1109 +       }
1110 +
1111 +       if (ret != 0)
1112 +               printk(KERN_ERR "bfq: cic link failed!\n");
1113 +
1114 +       return ret;
1115 +}
1116 +
1117 +/**
1118 + * bfq_ioc_set_ioprio - signal a priority change to the cics belonging to @ioc.
1119 + * @ioc: the io_context changing its priority.
1120 + */
1121 +static inline void bfq_ioc_set_ioprio(struct io_context *ioc)
1122 +{
1123 +       call_for_each_cic(ioc, bfq_changed_ioprio);
1124 +}
1125 +
1126 +/**
1127 + * bfq_get_io_context - return the @cic associated to @bfqd in @ioc.
1128 + * @bfqd: the search key.
1129 + * @gfp_mask: the mask to use for cic allocation.
1130 + *
1131 + * Setup general io context and cfq io context.  There can be several cfq
1132 + * io contexts per general io context, if this process is doing io to more
1133 + * than one device managed by cfq.
1134 + */
1135 +static struct cfq_io_context *bfq_get_io_context(struct bfq_data *bfqd,
1136 +                                                gfp_t gfp_mask)
1137 +{
1138 +       struct io_context *ioc = NULL;
1139 +       struct cfq_io_context *cic;
1140 +
1141 +       might_sleep_if(gfp_mask & __GFP_WAIT);
1142 +
1143 +       ioc = get_io_context(gfp_mask, bfqd->queue->node);
1144 +       if (ioc == NULL)
1145 +               return NULL;
1146 +
1147 +       /* Lookup for an existing cic. */
1148 +       cic = bfq_cic_lookup(bfqd, ioc);
1149 +       if (cic != NULL)
1150 +               goto out;
1151 +
1152 +       /* Alloc one if needed. */
1153 +       cic = bfq_alloc_io_context(bfqd, gfp_mask);
1154 +       if (cic == NULL)
1155 +               goto err;
1156 +
1157 +       /* Link it into the ioc's radix tree and cic list. */
1158 +       if (bfq_cic_link(bfqd, ioc, cic, gfp_mask) != 0)
1159 +               goto err_free;
1160 +
1161 +out:
1162 +       /*
1163 +        * test_and_clear_bit() implies a memory barrier, paired with
1164 +        * the wmb() in fs/ioprio.c, so the value seen for ioprio is the
1165 +        * new one.
1166 +        */
1167 +       if (unlikely(test_and_clear_bit(IOC_BFQ_IOPRIO_CHANGED,
1168 +                                       ioc->ioprio_changed)))
1169 +               bfq_ioc_set_ioprio(ioc);
1170 +
1171 +       return cic;
1172 +err_free:
1173 +       bfq_cic_free(cic);
1174 +err:
1175 +       put_io_context(ioc);
1176 +       return NULL;
1177 +}
1178 diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1179 new file mode 100644
1180 index 0000000..83e90e9
1181 --- /dev/null
1182 +++ b/block/bfq-iosched.c
1183 @@ -0,0 +1,2010 @@
1184 +/*
1185 + * BFQ, or Budget Fair Queueing, disk scheduler.
1186 + *
1187 + * Based on ideas and code from CFQ:
1188 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
1189 + *
1190 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
1191 + *                   Paolo Valente <paolo.valente@unimore.it>
1192 + *
1193 + * BFQ is a proportional share disk scheduling algorithm based on CFQ,
1194 + * that uses the B-WF2Q+ internal scheduler to assign budgets (i.e.,
1195 + * slices in the service domain) to the tasks accessing the disk.  It
1196 + * has been introduced in [1], where the interested reader can find an
1197 + * accurate description of the algorithm, the guarantees it provides
1198 + * and their formal proofs.  With respect to the algorithm presented
1199 + * in the paper, this implementation adds a timeout to limit the maximum
1200 + * time a queue can spend to complete its assigned budget, and a
1201 + * hierarchical extension, based on H-WF2Q+.
1202 + *
1203 + * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1204 + * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1205 + * complexity derives from the one introduced with EEVDF in [3].
1206 + *
1207 + * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
1208 + *     with Deterministic Guarantees on Bandwidth Distribution,'' to be
1209 + *     published.
1210 + *
1211 + *     http://algo.ing.unimo.it/people/paolo/disk_sched/bfq.pdf
1212 + *
1213 + * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1214 + *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1215 + *     Oct 1997.
1216 + *
1217 + *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1218 + *
1219 + * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1220 + *     First: A Flexible and Accurate Mechanism for Proportional Share
1221 + *     Resource Allocation,'' technical report.
1222 + *
1223 + *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1224 + */
1225 +#include <linux/module.h>
1226 +#include <linux/blkdev.h>
1227 +#include <linux/cgroup.h>
1228 +#include <linux/elevator.h>
1229 +#include <linux/rbtree.h>
1230 +#include <linux/ioprio.h>
1231 +#include "bfq.h"
1232 +
1233 +/* Max nr of dispatches in one round of service. */
1234 +static const int bfq_quantum = 4;
1235 +
1236 +/* Expiration time of each request (jiffies). */
1237 +static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1238 +
1239 +/* Maximum backwards seek, in KiB. */
1240 +static const int bfq_back_max = 16 * 1024;
1241 +
1242 +/* Penalty of a backwards seek. */
1243 +static const int bfq_back_penalty = 2;
1244 +
1245 +/* Idling period duration (jiffies). */
1246 +static int bfq_slice_idle = HZ / 125;
1247 +
1248 +/* Default maximum budget values (sectors). */
1249 +static const int bfq_max_budget = 16 * 1024;
1250 +static const int bfq_max_budget_async_rq = 4;
1251 +
1252 +/* Default timeout values (jiffies), approximating CFQ defaults. */
1253 +static const int bfq_timeout_sync = HZ / 8;
1254 +static int bfq_timeout_async = HZ / 25;
1255 +
1256 +struct kmem_cache *bfq_pool;
1257 +struct kmem_cache *bfq_ioc_pool;
1258 +
1259 +static DEFINE_PER_CPU(unsigned long, bfq_ioc_count);
1260 +static struct completion *bfq_ioc_gone;
1261 +static DEFINE_SPINLOCK(bfq_ioc_gone_lock);
1262 +
1263 +/* Below this threshold (in ms), we consider thinktime immediate. */
1264 +#define BFQ_MIN_TT             2
1265 +
1266 +/* hw_tag detection: parallel requests threshold and min samples needed. */
1267 +#define BFQ_HW_QUEUE_THRESHOLD 4
1268 +#define BFQ_HW_QUEUE_SAMPLES   32
1269 +
1270 +/* Budget feedback step. */
1271 +#define BFQ_BUDGET_STEP                128
1272 +
1273 +/* Min samples used for peak rate estimation (for autotuning). */
1274 +#define BFQ_PEAK_RATE_SAMPLES  32
1275 +
1276 +/* Shift used for peak rate fixed precision calculations. */
1277 +#define BFQ_RATE_SHIFT         16
1278 +
1279 +#define BFQ_SERVICE_TREE_INIT  ((struct bfq_service_tree)              \
1280 +                               { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1281 +
1282 +#define RQ_CIC(rq)             \
1283 +       ((struct cfq_io_context *) (rq)->elevator_private)
1284 +#define RQ_BFQQ(rq)            ((rq)->elevator_private2)
1285 +
1286 +#include "bfq-ioc.c"
1287 +#include "bfq-sched.c"
1288 +#include "bfq-cgroup.c"
1289 +
1290 +static inline int bfq_class_idle(struct bfq_queue *bfqq)
1291 +{
1292 +       return bfqq->entity.ioprio_class == IOPRIO_CLASS_IDLE;
1293 +}
1294 +
1295 +static inline int bfq_sample_valid(int samples)
1296 +{
1297 +       return samples > 80;
1298 +}
1299 +
1300 +/*
1301 + * We regard a request as SYNC, if either it's a read or has the SYNC bit
1302 + * set (in which case it could also be a direct WRITE).
1303 + */
1304 +static inline int bfq_bio_sync(struct bio *bio)
1305 +{
1306 +       if (bio_data_dir(bio) == READ || bio_sync(bio))
1307 +               return 1;
1308 +
1309 +       return 0;
1310 +}
1311 +
1312 +/*
1313 + * Scheduler run of queue, if there are requests pending and no one in the
1314 + * driver that will restart queueing.
1315 + */
1316 +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1317 +{
1318 +       if (bfqd->queued != 0) {
1319 +               bfq_log(bfqd, "schedule dispatch");
1320 +               kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1321 +       }
1322 +}
1323 +
1324 +static inline int bfq_queue_empty(struct request_queue *q)
1325 +{
1326 +       struct bfq_data *bfqd = q->elevator->elevator_data;
1327 +
1328 +       return bfqd->queued == 0;
1329 +}
1330 +
1331 +/*
1332 + * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1333 + * We choose the request that is closesr to the head right now.  Distance
1334 + * behind the head is penalized and only allowed to a certain extent.
1335 + */
1336 +static struct request *bfq_choose_req(struct bfq_data *bfqd,
1337 +                                     struct request *rq1,
1338 +                                     struct request *rq2)
1339 +{
1340 +       sector_t last, s1, s2, d1 = 0, d2 = 0;
1341 +       unsigned long back_max;
1342 +#define BFQ_RQ1_WRAP   0x01 /* request 1 wraps */
1343 +#define BFQ_RQ2_WRAP   0x02 /* request 2 wraps */
1344 +       unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1345 +
1346 +       if (rq1 == NULL || rq1 == rq2)
1347 +               return rq2;
1348 +       if (rq2 == NULL)
1349 +               return rq1;
1350 +
1351 +       if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1352 +               return rq1;
1353 +       else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1354 +               return rq2;
1355 +       if (rq_is_meta(rq1) && !rq_is_meta(rq2))
1356 +               return rq1;
1357 +       else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
1358 +               return rq2;
1359 +
1360 +       s1 = rq1->sector;
1361 +       s2 = rq2->sector;
1362 +
1363 +       last = bfqd->last_position;
1364 +
1365 +       /*
1366 +        * by definition, 1KiB is 2 sectors
1367 +        */
1368 +       back_max = bfqd->bfq_back_max * 2;
1369 +
1370 +       /*
1371 +        * Strict one way elevator _except_ in the case where we allow
1372 +        * short backward seeks which are biased as twice the cost of a
1373 +        * similar forward seek.
1374 +        */
1375 +       if (s1 >= last)
1376 +               d1 = s1 - last;
1377 +       else if (s1 + back_max >= last)
1378 +               d1 = (last - s1) * bfqd->bfq_back_penalty;
1379 +       else
1380 +               wrap |= BFQ_RQ1_WRAP;
1381 +
1382 +       if (s2 >= last)
1383 +               d2 = s2 - last;
1384 +       else if (s2 + back_max >= last)
1385 +               d2 = (last - s2) * bfqd->bfq_back_penalty;
1386 +       else
1387 +               wrap |= BFQ_RQ2_WRAP;
1388 +
1389 +       /* Found required data */
1390 +
1391 +       /*
1392 +        * By doing switch() on the bit mask "wrap" we avoid having to
1393 +        * check two variables for all permutations: --> faster!
1394 +        */
1395 +       switch (wrap) {
1396 +       case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1397 +               if (d1 < d2)
1398 +                       return rq1;
1399 +               else if (d2 < d1)
1400 +                       return rq2;
1401 +               else {
1402 +                       if (s1 >= s2)
1403 +                               return rq1;
1404 +                       else
1405 +                               return rq2;
1406 +               }
1407 +
1408 +       case BFQ_RQ2_WRAP:
1409 +               return rq1;
1410 +       case BFQ_RQ1_WRAP:
1411 +               return rq2;
1412 +       case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1413 +       default:
1414 +               /*
1415 +                * Since both rqs are wrapped,
1416 +                * start with the one that's further behind head
1417 +                * (--> only *one* back seek required),
1418 +                * since back seek takes more time than forward.
1419 +                */
1420 +               if (s1 <= s2)
1421 +                       return rq1;
1422 +               else
1423 +                       return rq2;
1424 +       }
1425 +}
1426 +
1427 +static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1428 +                                       struct bfq_queue *bfqq,
1429 +                                       struct request *last)
1430 +{
1431 +       struct rb_node *rbnext = rb_next(&last->rb_node);
1432 +       struct rb_node *rbprev = rb_prev(&last->rb_node);
1433 +       struct request *next = NULL, *prev = NULL;
1434 +
1435 +       BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1436 +
1437 +       if (rbprev != NULL)
1438 +               prev = rb_entry_rq(rbprev);
1439 +
1440 +       if (rbnext != NULL)
1441 +               next = rb_entry_rq(rbnext);
1442 +       else {
1443 +               rbnext = rb_first(&bfqq->sort_list);
1444 +               if (rbnext && rbnext != &last->rb_node)
1445 +                       next = rb_entry_rq(rbnext);
1446 +       }
1447 +
1448 +       return bfq_choose_req(bfqd, next, prev);
1449 +}
1450 +
1451 +static void bfq_del_rq_rb(struct request *rq)
1452 +{
1453 +       struct bfq_queue *bfqq = RQ_BFQQ(rq);
1454 +       struct bfq_data *bfqd = bfqq->bfqd;
1455 +       const int sync = rq_is_sync(rq);
1456 +
1457 +       BUG_ON(bfqq->queued[sync] == 0);
1458 +       bfqq->queued[sync]--;
1459 +       bfqd->queued--;
1460 +
1461 +       elv_rb_del(&bfqq->sort_list, rq);
1462 +
1463 +       if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue &&
1464 +           RB_EMPTY_ROOT(&bfqq->sort_list))
1465 +               bfq_del_bfqq_busy(bfqd, bfqq, 1);
1466 +}
1467 +
1468 +/**
1469 + * bfq_updated_next_req - update the queue after a new next_rq selection.
1470 + * @bfqd: the device data the queue belongs to.
1471 + * @bfqq: the queue to update.
1472 + *
1473 + * Whenever the first request of a queue changes we try to allocate it
1474 + * enough service (if it has grown), or to anticipate its finish time
1475 + * (if it has shrinked), to reduce the time it has to wait, still taking
1476 + * into account the queue budget.  We try to avoid the queue having not
1477 + * enough service allocated for its first request, thus having to go
1478 + * through two dispatch rounds to actually dispatch the request.
1479 + */
1480 +static void bfq_updated_next_req(struct bfq_data *bfqd,
1481 +                                struct bfq_queue *bfqq)
1482 +{
1483 +       struct bfq_entity *entity = &bfqq->entity;
1484 +       struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1485 +       struct request *next_rq = bfqq->next_rq;
1486 +       bfq_service_t new_budget;
1487 +
1488 +       if (next_rq == NULL)
1489 +               return;
1490 +
1491 +       if (bfqq == bfqd->active_queue)
1492 +               /*
1493 +                * In order not to break guarantees, budgets cannot be
1494 +                * changed after an entity has been selected.
1495 +                */
1496 +               return;
1497 +
1498 +       BUG_ON(entity->tree != &st->active);
1499 +       BUG_ON(entity == entity->sched_data->active_entity);
1500 +
1501 +       new_budget = max(bfqq->max_budget, next_rq->hard_nr_sectors);
1502 +       entity->budget = new_budget;
1503 +       bfq_log_bfqq(bfqd, bfqq, "budget=%lu", new_budget);
1504 +       bfq_activate_bfqq(bfqd, bfqq);
1505 +}
1506 +
1507 +static void bfq_add_rq_rb(struct request *rq)
1508 +{
1509 +       struct bfq_queue *bfqq = RQ_BFQQ(rq);
1510 +       struct bfq_entity *entity = &bfqq->entity;
1511 +       struct bfq_data *bfqd = bfqq->bfqd;
1512 +       struct request *__alias, *next_rq;
1513 +
1514 +       bfqq->queued[rq_is_sync(rq)]++;
1515 +       bfqd->queued++;
1516 +
1517 +       /*
1518 +        * Looks a little odd, but the first insert might return an alias,
1519 +        * if that happens, put the alias on the dispatch list.
1520 +        */
1521 +       while ((__alias = elv_rb_add(&bfqq->sort_list, rq)) != NULL)
1522 +               bfq_dispatch_insert(bfqd->queue, __alias);
1523 +
1524 +       /*
1525 +        * check if this request is a better next-serve candidate
1526 +        */
1527 +       next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq);
1528 +       BUG_ON(next_rq == NULL);
1529 +       bfqq->next_rq = next_rq;
1530 +
1531 +       if (!bfq_bfqq_busy(bfqq)) {
1532 +               entity->budget = max(bfqq->max_budget,
1533 +                                    next_rq->hard_nr_sectors);
1534 +               bfq_add_bfqq_busy(bfqd, bfqq);
1535 +       } else
1536 +               bfq_updated_next_req(bfqd, bfqq);
1537 +}
1538 +
1539 +static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1540 +{
1541 +       elv_rb_del(&bfqq->sort_list, rq);
1542 +       bfqq->queued[rq_is_sync(rq)]--;
1543 +       bfqq->bfqd->queued--;
1544 +       bfq_add_rq_rb(rq);
1545 +}
1546 +
1547 +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1548 +                                         struct bio *bio)
1549 +{
1550 +       struct task_struct *tsk = current;
1551 +       struct cfq_io_context *cic;
1552 +       struct bfq_queue *bfqq;
1553 +
1554 +       cic = bfq_cic_lookup(bfqd, tsk->io_context);
1555 +       if (cic == NULL)
1556 +               return NULL;
1557 +
1558 +       bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio));
1559 +       if (bfqq != NULL) {
1560 +               sector_t sector = bio->bi_sector + bio_sectors(bio);
1561 +
1562 +               return elv_rb_find(&bfqq->sort_list, sector);
1563 +       }
1564 +
1565 +       return NULL;
1566 +}
1567 +
1568 +static void bfq_activate_request(struct request_queue *q, struct request *rq)
1569 +{
1570 +       struct bfq_data *bfqd = q->elevator->elevator_data;
1571 +
1572 +       bfqd->rq_in_driver++;
1573 +       bfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
1574 +}
1575 +
1576 +static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1577 +{
1578 +       struct bfq_data *bfqd = q->elevator->elevator_data;
1579 +
1580 +       WARN_ON(bfqd->rq_in_driver == 0);
1581 +       bfqd->rq_in_driver--;
1582 +}
1583 +
1584 +static void bfq_remove_request(struct request *rq)
1585 +{
1586 +       struct bfq_queue *bfqq = RQ_BFQQ(rq);
1587 +       struct bfq_data *bfqd = bfqq->bfqd;
1588 +
1589 +       if (bfqq->next_rq == rq) {
1590 +               bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1591 +               bfq_updated_next_req(bfqd, bfqq);
1592 +       }
1593 +
1594 +       list_del_init(&rq->queuelist);
1595 +       bfq_del_rq_rb(rq);
1596 +
1597 +       if (rq_is_meta(rq)) {
1598 +               WARN_ON(bfqq->meta_pending == 0);
1599 +               bfqq->meta_pending--;
1600 +       }
1601 +}
1602 +
1603 +static int bfq_merge(struct request_queue *q, struct request **req,
1604 +                    struct bio *bio)
1605 +{
1606 +       struct bfq_data *bfqd = q->elevator->elevator_data;
1607 +       struct request *__rq;
1608 +
1609 +       __rq = bfq_find_rq_fmerge(bfqd, bio);
1610 +       if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1611 +               *req = __rq;
1612 +               return ELEVATOR_FRONT_MERGE;
1613 +       }
1614 +
1615 +       return ELEVATOR_NO_MERGE;
1616 +}
1617 +
1618 +static void bfq_merged_request(struct request_queue *q, struct request *req,
1619 +                              int type)
1620 +{
1621 +       if (type == ELEVATOR_FRONT_MERGE) {
1622 +               struct bfq_queue *bfqq = RQ_BFQQ(req);
1623 +
1624 +               bfq_reposition_rq_rb(bfqq, req);
1625 +       }
1626 +}
1627 +
1628 +static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1629 +                               struct request *next)
1630 +{
1631 +       /*
1632 +        * reposition in fifo if next is older than rq
1633 +        */
1634 +       if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1635 +           time_before(next->start_time, rq->start_time))
1636 +               list_move(&rq->queuelist, &next->queuelist);
1637 +
1638 +       bfq_remove_request(next);
1639 +}
1640 +
1641 +static int bfq_allow_merge(struct request_queue *q, struct request *rq,
1642 +                          struct bio *bio)
1643 +{
1644 +       struct bfq_data *bfqd = q->elevator->elevator_data;
1645 +       struct cfq_io_context *cic;
1646 +       struct bfq_queue *bfqq;
1647 +
1648 +       /* Disallow merge of a sync bio into an async request. */
1649 +       if (bfq_bio_sync(bio) && !rq_is_sync(rq))
1650 +               return 0;
1651 +
1652 +       /*
1653 +        * Lookup the bfqq that this bio will be queued with. Allow
1654 +        * merge only if rq is queued there.
1655 +        */
1656 +       cic = bfq_cic_lookup(bfqd, current->io_context);
1657 +       if (cic == NULL)
1658 +               return 0;
1659 +
1660 +       bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio));
1661 +       if (bfqq == RQ_BFQQ(rq))
1662 +               return 1;
1663 +
1664 +       return 0;
1665 +}
1666 +
1667 +static void __bfq_set_active_queue(struct bfq_data *bfqd,
1668 +                                  struct bfq_queue *bfqq)
1669 +{
1670 +       if (bfqq != NULL) {
1671 +               bfq_mark_bfqq_must_alloc(bfqq);
1672 +               bfq_mark_bfqq_budget_new(bfqq);
1673 +               bfq_clear_bfqq_fifo_expire(bfqq);
1674 +
1675 +               bfqq->budgets_assigned = (bfqq->budgets_assigned*7 + 256) / 8;
1676 +
1677 +               bfq_log_bfqq(bfqd, bfqq, "active");
1678 +       }
1679 +
1680 +       bfqd->active_queue = bfqq;
1681 +}
1682 +
1683 +/*
1684 + * Get and set a new active queue for service.
1685 + */
1686 +static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd)
1687 +{
1688 +       struct bfq_queue *bfqq;
1689 +
1690 +       bfqq = bfq_get_next_queue(bfqd);
1691 +       __bfq_set_active_queue(bfqd, bfqq);
1692 +       return bfqq;
1693 +}
1694 +
1695 +#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
1696 +
1697 +static void bfq_arm_slice_timer(struct bfq_data *bfqd)
1698 +{
1699 +       struct bfq_queue *bfqq = bfqd->active_queue;
1700 +       struct cfq_io_context *cic;
1701 +       unsigned long sl;
1702 +
1703 +       WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
1704 +
1705 +       /* Idling is disabled, either manually or by past process history. */
1706 +       if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq))
1707 +               return;
1708 +
1709 +       /* Tasks have exited, don't wait. */
1710 +       cic = bfqd->active_cic;
1711 +       if (cic == NULL || atomic_read(&cic->ioc->nr_tasks) == 0)
1712 +               return;
1713 +
1714 +       bfq_mark_bfqq_wait_request(bfqq);
1715 +
1716 +       /*
1717 +        * we don't want to idle for seeks, but we do want to allow
1718 +        * fair distribution of slice time for a process doing back-to-back
1719 +        * seeks. so allow a little bit of time for him to submit a new rq
1720 +        */
1721 +       sl = bfqd->bfq_slice_idle;
1722 +       if (bfq_sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
1723 +               sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
1724 +
1725 +       bfqd->last_idling_start = ktime_get();
1726 +       mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
1727 +       bfq_log(bfqd, "arm idle: %lu", sl);
1728 +}
1729 +
1730 +static void bfq_set_budget_timeout(struct bfq_data *bfqd)
1731 +{
1732 +       struct bfq_queue *bfqq = bfqd->active_queue;
1733 +
1734 +       bfqd->last_budget_start = ktime_get();
1735 +
1736 +       bfq_clear_bfqq_budget_new(bfqq);
1737 +       bfqq->budget_timeout = jiffies +
1738 +               bfqd->bfq_timeout[!!bfq_bfqq_sync(bfqq)];
1739 +}
1740 +
1741 +/*
1742 + * Move request from internal lists to the request queue dispatch list.
1743 + */
1744 +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
1745 +{
1746 +       struct bfq_data *bfqd = q->elevator->elevator_data;
1747 +       struct bfq_queue *bfqq = RQ_BFQQ(rq);
1748 +
1749 +       bfq_remove_request(rq);
1750 +       bfqq->dispatched++;
1751 +       elv_dispatch_sort(q, rq);
1752 +
1753 +       if (bfq_bfqq_sync(bfqq))
1754 +               bfqd->sync_flight++;
1755 +}
1756 +
1757 +/*
1758 + * return expired entry, or NULL to just start from scratch in rbtree
1759 + */
1760 +static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
1761 +{
1762 +       struct bfq_data *bfqd = bfqq->bfqd;
1763 +       struct request *rq;
1764 +       int fifo;
1765 +
1766 +       if (bfq_bfqq_fifo_expire(bfqq))
1767 +               return NULL;
1768 +
1769 +       bfq_mark_bfqq_fifo_expire(bfqq);
1770 +
1771 +       if (list_empty(&bfqq->fifo))
1772 +               return NULL;
1773 +
1774 +       fifo = bfq_bfqq_sync(bfqq);
1775 +       rq = rq_entry_fifo(bfqq->fifo.next);
1776 +
1777 +       if (time_before(jiffies, rq->start_time + bfqd->bfq_fifo_expire[fifo]))
1778 +               return NULL;
1779 +
1780 +       return rq;
1781 +}
1782 +
1783 +static inline bfq_service_t bfq_bfqq_budget_left(struct bfq_queue *bfqq)
1784 +{
1785 +       struct bfq_entity *entity = &bfqq->entity;
1786 +       return entity->budget - entity->service;
1787 +}
1788 +
1789 +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1790 +{
1791 +       BUG_ON(bfqq != bfqd->active_queue);
1792 +
1793 +       __bfq_bfqd_reset_active(bfqd);
1794 +
1795 +       if (RB_EMPTY_ROOT(&bfqq->sort_list))
1796 +               bfq_del_bfqq_busy(bfqd, bfqq, 1);
1797 +       else
1798 +               bfq_activate_bfqq(bfqd, bfqq);
1799 +}
1800 +
1801 +/**
1802 + * bfq_default_budget - return the default budget for @bfqq on @bfqd.
1803 + * @bfqd: the device descriptor.
1804 + * @bfqq: the queue to consider.
1805 + *
1806 + * We use 3/4 of the @bfqd maximum budget as the default value
1807 + * for the max_budget field of the queues.  This lets the feedback
1808 + * mechanism to start from some middle ground, then the behavior
1809 + * of the task will drive the heuristics towards high values, if
1810 + * it behaves as a greedy sequential reader, or towards small values
1811 + * if it shows a more intermittent behavior.
1812 + */
1813 +static bfq_service_t bfq_default_budget(struct bfq_data *bfqd,
1814 +                                       struct bfq_queue *bfqq)
1815 +{
1816 +       bfq_service_t budget;
1817 +
1818 +       /*
1819 +        * When we need an estimate of the peak rate we need to avoid
1820 +        * to give budgets that are too short due to previous measurements.
1821 +        * So, in the first 10 assignments use a ``safe'' budget value.
1822 +        */
1823 +       if (bfqq->budgets_assigned < 194 && bfqd->bfq_user_max_budget == 0)
1824 +               budget = bfq_max_budget;
1825 +       else
1826 +               budget = bfqd->bfq_max_budget;
1827 +
1828 +       return budget - budget / 4;
1829 +}
1830 +
1831 +static inline bfq_service_t bfq_min_budget(struct bfq_data *bfqd,
1832 +                                          struct bfq_queue *bfqq)
1833 +{
1834 +       return bfqd->bfq_max_budget / 2;
1835 +}
1836 +
1837 +/**
1838 + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
1839 + * @bfqd: device data.
1840 + * @bfqq: queue to update.
1841 + * @reason: reason for expiration.
1842 + *
1843 + * Handle the feedback on @bfqq budget.  This is driven by the following
1844 + * principles:
1845 + *   - async queues get always the maximum budget value (their ability to
1846 + *     dispatch is limited by @bfqd->bfq_max_budget_async_rq).
1847 + *   - If @bfqq has been too idle we decrease its budget, as it is likely
1848 + *     to be more interested in latency than in throughput.
1849 + *   - If @bfqq took too much to consume its budget it is likely to be
1850 + *     seeky, so reset the budget to the default, in order to have all
1851 + *     the seeky queues to be charged for the same service, trying to
1852 + *     achieve fairness at least in the time domain among them.
1853 + *   - If @bfqq exhausted its budget treat it as a greedy reader, in
1854 + *     order to run it at full speed.
1855 + *   - If @bfqq expired due to lack of requests leave its budget untouched.
1856 + */
1857 +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
1858 +                                    struct bfq_queue *bfqq,
1859 +                                    enum bfqq_expiration reason)
1860 +{
1861 +       struct request *next_rq;
1862 +       bfq_service_t budget, min_budget;
1863 +
1864 +       budget = bfqq->max_budget;
1865 +       min_budget = bfq_min_budget(bfqd, bfqq);
1866 +
1867 +       BUG_ON(bfqq != bfqd->active_queue);
1868 +
1869 +       if (bfq_bfqq_sync(bfqq)) {
1870 +               switch (reason) {
1871 +               case BFQ_BFQQ_TOO_IDLE:
1872 +                       if (budget > min_budget + BFQ_BUDGET_STEP)
1873 +                               budget -= BFQ_BUDGET_STEP;
1874 +                       else
1875 +                               budget = min_budget;
1876 +                       break;
1877 +               case BFQ_BFQQ_BUDGET_TIMEOUT:
1878 +                       budget = bfq_default_budget(bfqd, bfqq);
1879 +                       break;
1880 +               case BFQ_BFQQ_BUDGET_EXHAUSTED:
1881 +                       budget = min(budget + 8 * BFQ_BUDGET_STEP,
1882 +                                    bfqd->bfq_max_budget);
1883 +                       break;
1884 +               case BFQ_BFQQ_NO_MORE_REQUESTS:
1885 +               default:
1886 +                       return;
1887 +               }
1888 +       } else
1889 +               budget = bfqd->bfq_max_budget;
1890 +
1891 +       bfqq->max_budget = budget;
1892 +
1893 +       if (bfqq->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
1894 +           bfqq->max_budget > bfqd->bfq_max_budget)
1895 +               bfqq->max_budget = bfqd->bfq_max_budget;
1896 +
1897 +       /*
1898 +        * Make sure that we have enough budget for the next request.
1899 +        * Since the finish time of the bfqq must be kept in sync with
1900 +        * the budget, be sure to call __bfq_bfqq_expire() after the
1901 +        * update.
1902 +        */
1903 +       next_rq = bfqq->next_rq;
1904 +       if (next_rq != NULL)
1905 +               bfqq->entity.budget = max(bfqq->max_budget,
1906 +                                         next_rq->hard_nr_sectors);
1907 +       bfq_log_bfqq(bfqd, bfqq, "budget=%lu (%d)", bfqq->entity.budget,
1908 +                    bfq_bfqq_sync(bfqq));
1909 +}
1910 +
1911 +static bfq_service_t bfq_calc_max_budget(u64 peak_rate, u64 timeout)
1912 +{
1913 +       bfq_service_t max_budget;
1914 +
1915 +       /*
1916 +        * The max_budget calculated when autotuning is equal to the
1917 +        * amount of sectors transfered in 0.75 * timeout_sync at the
1918 +        * estimated peak rate.
1919 +        */
1920 +       max_budget = (bfq_service_t)(peak_rate * 1000 *
1921 +                                    timeout >> BFQ_RATE_SHIFT);
1922 +       max_budget -= max_budget / 4;
1923 +
1924 +       return max_budget;
1925 +}
1926 +
1927 +static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1928 +                               int compensate)
1929 +{
1930 +       u64 bw, usecs, expected, timeout;
1931 +       ktime_t delta;
1932 +       int update = 0;
1933 +
1934 +       if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
1935 +               return 0;
1936 +
1937 +       delta = compensate ? bfqd->last_idling_start : ktime_get();
1938 +       delta = ktime_sub(delta, bfqd->last_budget_start);
1939 +       usecs = ktime_to_us(delta);
1940 +
1941 +       /* Don't trust short/unrealistic values. */
1942 +       if (usecs < 100 || usecs >= LONG_MAX)
1943 +               return 0;
1944 +
1945 +       /*
1946 +        * Calculate the bandwidth for the last slice.  We use a 64 bit
1947 +        * value to store the peak rate, in sectors per usec in fixed
1948 +        * point math.  We do so to have enough precision in the estimate
1949 +        * and to avoid overflows.
1950 +        */
1951 +       bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
1952 +       do_div(bw, (unsigned long)usecs);
1953 +
1954 +       timeout = jiffies_to_msecs(bfqd->bfq_timeout[SYNC]);
1955 +
1956 +       /*
1957 +        * Use only long (> 20ms) intervals to filter out spikes for
1958 +        * the peak rate estimation.
1959 +        */
1960 +       if (usecs > 20000) {
1961 +               if (bw > bfqd->peak_rate) {
1962 +                       bfqd->peak_rate = bw;
1963 +                       update = 1;
1964 +                       bfq_log(bfqd, "peak_rate=%llu", bw);
1965 +               }
1966 +
1967 +               update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
1968 +
1969 +               if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
1970 +                       bfqd->peak_rate_samples++;
1971 +
1972 +               if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
1973 +                   update && bfqd->bfq_user_max_budget == 0) {
1974 +                       bfqd->bfq_max_budget =
1975 +                               bfq_calc_max_budget(bfqd->peak_rate, timeout);
1976 +                       bfq_log(bfqd, "max_budget=%lu", bfqd->bfq_max_budget);
1977 +               }
1978 +       }
1979 +
1980 +       /*
1981 +        * A process is considered ``slow'' (i.e., seeky, so that we
1982 +        * cannot treat it fairly in the service domain, as it would
1983 +        * slow down too much the other processes) if, when a slice
1984 +        * ends for whatever reason, it has received service at a
1985 +        * rate that would not be high enough to complete the budget
1986 +        * before the budget timeout expiration.
1987 +        */
1988 +       expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
1989 +
1990 +       return expected > bfqq->entity.budget;
1991 +}
1992 +
1993 +/*
1994 + * bfq_bfqq_expire - expire a queue.
1995 + * @bfqd: device owning the queue.
1996 + * @bfqq: the queue to expire.
1997 + * @compensate: if true, compensate for the time spent idling.
1998 + * @reason: the reason causing the expiration.
1999 + *
2000 + * The behavior is the following: when a queue expires because it has
2001 + * been idling for too much we sync its finish time with the service
2002 + * received and decrease its budget.  If @bfqq expires due to budget
2003 + * exhaustion we increase its budget and sync its finish time.
2004 + * If @bfqq expires due to budget timeout we do not sync its finish time
2005 + * to avoid seeky queues to take too much disk time; instead we charge
2006 + * it the maximum budget value.  Using the max budget value for all the
2007 + * queues that expire due to budget timeout has the effect of using the
2008 + * WF2Q+ scheduler to assign timeslices to those queues, without violating
2009 + * the service domain guarantees for well-behaved queues.
2010 + */
2011 +static void bfq_bfqq_expire(struct bfq_data *bfqd,
2012 +                           struct bfq_queue *bfqq,
2013 +                           int compensate,
2014 +                           enum bfqq_expiration reason)
2015 +{
2016 +       int slow;
2017 +
2018 +       slow = bfq_update_peak_rate(bfqd, bfqq, compensate);
2019 +
2020 +       /*
2021 +        * Treat slow (i.e., seeky) traffic as timed out, to not favor
2022 +        * it over sequential traffic (a seeky queue consumes less budget,
2023 +        * so it would receive smaller timestamps wrt a sequential one
2024 +        * when an idling timer fires).
2025 +        */
2026 +       if (slow && reason == BFQ_BFQQ_TOO_IDLE)
2027 +               reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2028 +
2029 +       if (reason == BFQ_BFQQ_BUDGET_TIMEOUT || !bfq_bfqq_sync(bfqq))
2030 +               bfq_bfqq_charge_full_budget(bfqq);
2031 +
2032 +       bfq_log_bfqq(bfqd, bfqq, "expire (%d, %d)", reason, slow);
2033 +
2034 +       __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2035 +       __bfq_bfqq_expire(bfqd, bfqq);
2036 +}
2037 +
2038 +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2039 +{
2040 +       if (bfq_bfqq_budget_new(bfqq))
2041 +               return 0;
2042 +
2043 +       if (time_before(jiffies, bfqq->budget_timeout))
2044 +               return 0;
2045 +
2046 +       return 1;
2047 +}
2048 +
2049 +/*
2050 + * Select a queue for service.  If we have a current active queue,
2051 + * check whether to continue servicing it, or retrieve and set a new one.
2052 + */
2053 +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
2054 +{
2055 +       struct bfq_queue *bfqq;
2056 +       struct request *next_rq;
2057 +       enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2058 +
2059 +       bfqq = bfqd->active_queue;
2060 +       if (bfqq == NULL)
2061 +               goto new_queue;
2062 +
2063 +       if (bfq_bfqq_budget_timeout(bfqq)) {
2064 +               bfq_bfqq_charge_full_budget(bfqq);
2065 +               goto expire;
2066 +       }
2067 +
2068 +       next_rq = bfqq->next_rq;
2069 +       /*
2070 +        * If bfqq has requests queued and it has enough budget left to
2071 +        * serve them, keep the queue, otherwise expire it.
2072 +        */
2073 +       if (next_rq != NULL) {
2074 +               if (next_rq->hard_nr_sectors > bfq_bfqq_budget_left(bfqq)) {
2075 +                       reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
2076 +                       goto expire;
2077 +               } else
2078 +                       goto keep_queue;
2079 +       }
2080 +
2081 +       /*
2082 +        * No requests pending.  If the active queue still has requests in
2083 +        * flight or is idling for a new request, allow either of these
2084 +        * conditions to happen (or time out) before selecting a new queue.
2085 +        */
2086 +       if (timer_pending(&bfqd->idle_slice_timer) ||
2087 +           (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq))) {
2088 +               bfqq = NULL;
2089 +               goto keep_queue;
2090 +       }
2091 +
2092 +       reason = BFQ_BFQQ_NO_MORE_REQUESTS;
2093 +expire:
2094 +       bfq_bfqq_expire(bfqd, bfqq, 0, reason);
2095 +new_queue:
2096 +       bfqq = bfq_set_active_queue(bfqd);
2097 +keep_queue:
2098 +       return bfqq;
2099 +}
2100 +
2101 +/*
2102 + * Dispatch some requests from bfqq, moving them to the request queue
2103 + * dispatch list.
2104 + */
2105 +static int __bfq_dispatch_requests(struct bfq_data *bfqd,
2106 +                                  struct bfq_queue *bfqq,
2107 +                                  int max_dispatch)
2108 +{
2109 +       int dispatched = 0;
2110 +
2111 +       BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
2112 +
2113 +       do {
2114 +               struct request *rq;
2115 +
2116 +               /* Follow expired path, else get first next available. */
2117 +               rq = bfq_check_fifo(bfqq);
2118 +               if (rq == NULL)
2119 +                       rq = bfqq->next_rq;
2120 +
2121 +               if (rq->hard_nr_sectors > bfq_bfqq_budget_left(bfqq)) {
2122 +                       /*
2123 +                        * Expire the queue for budget exhaustion, and
2124 +                        * make sure that the next act_budget is enough
2125 +                        * to serve the next request, even if it comes
2126 +                        * from the fifo expired path.
2127 +                        */
2128 +                       bfqq->next_rq = rq;
2129 +                       goto expire;
2130 +               }
2131 +
2132 +               /* Finally, insert request into driver dispatch list. */
2133 +               bfq_bfqq_served(bfqq, rq->hard_nr_sectors);
2134 +               bfq_dispatch_insert(bfqd->queue, rq);
2135 +
2136 +               dispatched++;
2137 +
2138 +               if (bfqd->active_cic == NULL) {
2139 +                       atomic_inc(&RQ_CIC(rq)->ioc->refcount);
2140 +                       bfqd->active_cic = RQ_CIC(rq);
2141 +               }
2142 +
2143 +               if (RB_EMPTY_ROOT(&bfqq->sort_list))
2144 +                       break;
2145 +       } while (dispatched < max_dispatch);
2146 +
2147 +       if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
2148 +           dispatched >= bfqd->bfq_max_budget_async_rq) ||
2149 +           bfq_class_idle(bfqq)))
2150 +               goto expire;
2151 +
2152 +       return dispatched;
2153 +
2154 +expire:
2155 +       bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
2156 +       return dispatched;
2157 +}
2158 +
2159 +static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
2160 +{
2161 +       int dispatched = 0;
2162 +
2163 +       while (bfqq->next_rq != NULL) {
2164 +               bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
2165 +               dispatched++;
2166 +       }
2167 +
2168 +       BUG_ON(!list_empty(&bfqq->fifo));
2169 +       return dispatched;
2170 +}
2171 +
2172 +/*
2173 + * Drain our current requests.  Used for barriers and when switching
2174 + * io schedulers on-the-fly.
2175 + */
2176 +static int bfq_forced_dispatch(struct bfq_data *bfqd)
2177 +{
2178 +       struct bfq_queue *bfqq, *n;
2179 +       struct bfq_service_tree *st;
2180 +       int dispatched = 0;
2181 +
2182 +       bfqq = bfqd->active_queue;
2183 +       if (bfqq != NULL)
2184 +               __bfq_bfqq_expire(bfqd, bfqq);
2185 +
2186 +       /*
2187 +        * Loop through classes, and be careful to leave the scheduler
2188 +        * in a consistent state, as feedback mechanisms and vtime
2189 +        * updates cannot be disabled during the process.
2190 +        */
2191 +       list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
2192 +               st = bfq_entity_service_tree(&bfqq->entity);
2193 +
2194 +               dispatched += __bfq_forced_dispatch_bfqq(bfqq);
2195 +               bfqq->max_budget = bfq_default_budget(bfqd, bfqq);
2196 +
2197 +               bfq_forget_idle(st);
2198 +       }
2199 +
2200 +       BUG_ON(bfqd->busy_queues != 0);
2201 +
2202 +       return dispatched;
2203 +}
2204 +
2205 +static int bfq_dispatch_requests(struct request_queue *q, int force)
2206 +{
2207 +       struct bfq_data *bfqd = q->elevator->elevator_data;
2208 +       struct bfq_queue *bfqq;
2209 +       int dispatched;
2210 +
2211 +       if (bfqd->busy_queues == 0)
2212 +               return 0;
2213 +
2214 +       if (unlikely(force))
2215 +               return bfq_forced_dispatch(bfqd);
2216 +
2217 +       dispatched = 0;
2218 +       while ((bfqq = bfq_select_queue(bfqd)) != NULL) {
2219 +               int max_dispatch;
2220 +
2221 +               max_dispatch = bfqd->bfq_quantum;
2222 +               if (bfq_class_idle(bfqq))
2223 +                       max_dispatch = 1;
2224 +
2225 +               if (!bfq_bfqq_sync(bfqq))
2226 +                       max_dispatch = bfqd->bfq_max_budget_async_rq;
2227 +
2228 +               if (bfqq->dispatched >= max_dispatch) {
2229 +                       if (bfqd->busy_queues > 1)
2230 +                               break;
2231 +                       if (bfqq->dispatched >= 4 * max_dispatch)
2232 +                               break;
2233 +               }
2234 +
2235 +               if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
2236 +                       break;
2237 +
2238 +               bfq_clear_bfqq_wait_request(bfqq);
2239 +               BUG_ON(timer_pending(&bfqd->idle_slice_timer));
2240 +
2241 +               dispatched += __bfq_dispatch_requests(bfqd, bfqq, max_dispatch);
2242 +       }
2243 +
2244 +       bfq_log(bfqd, "dispatched=%d", dispatched);
2245 +       return dispatched;
2246 +}
2247 +
2248 +/*
2249 + * Task holds one reference to the queue, dropped when task exits.  Each rq
2250 + * in-flight on this queue also holds a reference, dropped when rq is freed.
2251 + *
2252 + * Queue lock must be held here.
2253 + */
2254 +static void bfq_put_queue(struct bfq_queue *bfqq)
2255 +{
2256 +       struct bfq_data *bfqd = bfqq->bfqd;
2257 +
2258 +       BUG_ON(atomic_read(&bfqq->ref) <= 0);
2259 +
2260 +       if (!atomic_dec_and_test(&bfqq->ref))
2261 +               return;
2262 +
2263 +       BUG_ON(rb_first(&bfqq->sort_list) != NULL);
2264 +       BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
2265 +       BUG_ON(bfqq->entity.tree != NULL);
2266 +       BUG_ON(bfq_bfqq_busy(bfqq));
2267 +       BUG_ON(bfqd->active_queue == bfqq);
2268 +
2269 +       bfq_log_bfqq(bfqd, bfqq, "freed");
2270 +
2271 +       kmem_cache_free(bfq_pool, bfqq);
2272 +}
2273 +
2274 +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2275 +{
2276 +       if (bfqq == bfqd->active_queue) {
2277 +               __bfq_bfqq_expire(bfqd, bfqq);
2278 +               bfq_schedule_dispatch(bfqd);
2279 +       }
2280 +
2281 +       bfq_put_queue(bfqq);
2282 +}
2283 +
2284 +/*
2285 + * Update the entity prio values; note that the new values will not
2286 + * be used until the next (re)activation.
2287 + */
2288 +static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc)
2289 +{
2290 +       struct task_struct *tsk = current;
2291 +       int ioprio_class;
2292 +
2293 +       if (!bfq_bfqq_prio_changed(bfqq))
2294 +               return;
2295 +
2296 +       ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
2297 +       switch (ioprio_class) {
2298 +       default:
2299 +               printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
2300 +       case IOPRIO_CLASS_NONE:
2301 +               /*
2302 +                * no prio set, inherit CPU scheduling settings
2303 +                */
2304 +               bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
2305 +               bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
2306 +               break;
2307 +       case IOPRIO_CLASS_RT:
2308 +               bfqq->entity.new_ioprio = task_ioprio(ioc);
2309 +               bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
2310 +               break;
2311 +       case IOPRIO_CLASS_BE:
2312 +               bfqq->entity.new_ioprio = task_ioprio(ioc);
2313 +               bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
2314 +               break;
2315 +       case IOPRIO_CLASS_IDLE:
2316 +               bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
2317 +               bfqq->entity.new_ioprio = 7;
2318 +               bfq_clear_bfqq_idle_window(bfqq);
2319 +               break;
2320 +       }
2321 +
2322 +       bfqq->entity.ioprio_changed = 1;
2323 +
2324 +       /*
2325 +        * keep track of original prio settings in case we have to temporarily
2326 +        * elevate the priority of this queue
2327 +        */
2328 +       bfqq->org_ioprio = bfqq->entity.new_ioprio;
2329 +       bfqq->org_ioprio_class = bfqq->entity.new_ioprio_class;
2330 +       bfq_clear_bfqq_prio_changed(bfqq);
2331 +}
2332 +
2333 +static void bfq_changed_ioprio(struct io_context *ioc,
2334 +                              struct cfq_io_context *cic)
2335 +{
2336 +       struct bfq_data *bfqd;
2337 +       struct bfq_queue *bfqq, *new_bfqq;
2338 +       struct bfq_group *bfqg;
2339 +       unsigned long uninitialized_var(flags);
2340 +
2341 +       bfqd = bfq_get_bfqd_locked(&cic->key, &flags);
2342 +       if (unlikely(bfqd == NULL))
2343 +               return;
2344 +
2345 +       bfqq = cic->cfqq[ASYNC];
2346 +       if (bfqq != NULL) {
2347 +               bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
2348 +                                   sched_data);
2349 +               new_bfqq = bfq_get_queue(bfqd, bfqg, ASYNC, cic->ioc,
2350 +                                        GFP_ATOMIC);
2351 +               if (new_bfqq != NULL) {
2352 +                       cic->cfqq[ASYNC] = new_bfqq;
2353 +                       bfq_put_queue(bfqq);
2354 +               }
2355 +       }
2356 +
2357 +       bfqq = cic->cfqq[SYNC];
2358 +       if (bfqq != NULL)
2359 +               bfq_mark_bfqq_prio_changed(bfqq);
2360 +
2361 +       bfq_put_bfqd_unlock(bfqd, &flags);
2362 +}
2363 +
2364 +static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
2365 +                                             struct bfq_group *bfqg,
2366 +                                             int is_sync,
2367 +                                             struct io_context *ioc,
2368 +                                             gfp_t gfp_mask)
2369 +{
2370 +       struct bfq_queue *bfqq, *new_bfqq = NULL;
2371 +       struct cfq_io_context *cic;
2372 +
2373 +retry:
2374 +       cic = bfq_cic_lookup(bfqd, ioc);
2375 +       /* cic always exists here */
2376 +       bfqq = cic_to_bfqq(cic, is_sync);
2377 +
2378 +       if (bfqq == NULL) {
2379 +               if (new_bfqq != NULL) {
2380 +                       bfqq = new_bfqq;
2381 +                       new_bfqq = NULL;
2382 +               } else if (gfp_mask & __GFP_WAIT) {
2383 +                       /*
2384 +                        * Inform the allocator of the fact that we will
2385 +                        * just repeat this allocation if it fails, to allow
2386 +                        * the allocator to do whatever it needs to attempt to
2387 +                        * free memory.
2388 +                        */
2389 +                       spin_unlock_irq(bfqd->queue->queue_lock);
2390 +                       new_bfqq = kmem_cache_alloc_node(bfq_pool,
2391 +                                       gfp_mask | __GFP_NOFAIL | __GFP_ZERO,
2392 +                                       bfqd->queue->node);
2393 +                       spin_lock_irq(bfqd->queue->queue_lock);
2394 +                       goto retry;
2395 +               } else {
2396 +                       bfqq = kmem_cache_alloc_node(bfq_pool,
2397 +                                       gfp_mask | __GFP_ZERO,
2398 +                                       bfqd->queue->node);
2399 +                       if (bfqq == NULL)
2400 +                               goto out;
2401 +               }
2402 +
2403 +               RB_CLEAR_NODE(&bfqq->entity.rb_node);
2404 +               INIT_LIST_HEAD(&bfqq->fifo);
2405 +
2406 +               atomic_set(&bfqq->ref, 0);
2407 +               bfqq->bfqd = bfqd;
2408 +
2409 +               bfq_mark_bfqq_prio_changed(bfqq);
2410 +
2411 +               bfq_init_prio_data(bfqq, ioc);
2412 +               bfq_init_entity(&bfqq->entity, bfqg);
2413 +
2414 +               if (is_sync) {
2415 +                       if (!bfq_class_idle(bfqq))
2416 +                               bfq_mark_bfqq_idle_window(bfqq);
2417 +                       bfq_mark_bfqq_sync(bfqq);
2418 +               }
2419 +               bfqq->max_budget = bfq_default_budget(bfqd, bfqq);
2420 +               bfqq->pid = current->pid;
2421 +
2422 +               bfq_log_bfqq(bfqd, bfqq, "allocated");
2423 +       }
2424 +
2425 +       if (new_bfqq != NULL)
2426 +               kmem_cache_free(bfq_pool, new_bfqq);
2427 +
2428 +out:
2429 +       WARN_ON((gfp_mask & __GFP_WAIT) && bfqq == NULL);
2430 +       return bfqq;
2431 +}
2432 +
2433 +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
2434 +                                              struct bfq_group *bfqg,
2435 +                                              int ioprio_class, int ioprio)
2436 +{
2437 +       switch (ioprio_class) {
2438 +       case IOPRIO_CLASS_RT:
2439 +               return &bfqg->async_bfqq[0][ioprio];
2440 +       case IOPRIO_CLASS_BE:
2441 +               return &bfqg->async_bfqq[1][ioprio];
2442 +       case IOPRIO_CLASS_IDLE:
2443 +               return &bfqg->async_idle_bfqq;
2444 +       default:
2445 +               BUG();
2446 +       }
2447 +}
2448 +
2449 +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
2450 +                                      struct bfq_group *bfqg, int is_sync,
2451 +                                      struct io_context *ioc, gfp_t gfp_mask)
2452 +{
2453 +       const int ioprio = task_ioprio(ioc);
2454 +       const int ioprio_class = task_ioprio_class(ioc);
2455 +       struct bfq_queue **async_bfqq = NULL;
2456 +       struct bfq_queue *bfqq = NULL;
2457 +
2458 +       if (!is_sync) {
2459 +               async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
2460 +                                                 ioprio);
2461 +               bfqq = *async_bfqq;
2462 +       }
2463 +
2464 +       if (bfqq == NULL) {
2465 +               bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask);
2466 +               if (bfqq == NULL)
2467 +                       return NULL;
2468 +       }
2469 +
2470 +       /*
2471 +        * pin the queue now that it's allocated, scheduler exit will prune it
2472 +        */
2473 +       if (!is_sync && *async_bfqq == NULL) {
2474 +               atomic_inc(&bfqq->ref);
2475 +               *async_bfqq = bfqq;
2476 +       }
2477 +
2478 +       atomic_inc(&bfqq->ref);
2479 +       return bfqq;
2480 +}
2481 +
2482 +static void bfq_update_io_thinktime(struct bfq_data *bfqd,
2483 +                                   struct cfq_io_context *cic)
2484 +{
2485 +       unsigned long elapsed = jiffies - cic->last_end_request;
2486 +       unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
2487 +
2488 +       cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
2489 +       cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
2490 +       cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
2491 +}
2492 +
2493 +static void bfq_update_io_seektime(struct bfq_data *bfqd,
2494 +                                  struct bfq_queue *bfqq,
2495 +                                  struct cfq_io_context *cic,
2496 +                                  struct request *rq)
2497 +{
2498 +       sector_t sdist;
2499 +       u64 total;
2500 +
2501 +       if (cic->last_request_pos < rq->sector)
2502 +               sdist = rq->sector - cic->last_request_pos;
2503 +       else
2504 +               sdist = cic->last_request_pos - rq->sector;
2505 +
2506 +       /*
2507 +        * Don't allow the seek distance to get too large from the
2508 +        * odd fragment, pagein, etc.
2509 +        */
2510 +       if (cic->seek_samples == 0) /* first request, not really a seek */
2511 +               sdist = 0;
2512 +       else if (cic->seek_samples <= 60) /* second&third seek */
2513 +               sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
2514 +       else
2515 +               sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
2516 +
2517 +       cic->seek_samples = (7*cic->seek_samples + 256) / 8;
2518 +       cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
2519 +       total = cic->seek_total + (cic->seek_samples/2);
2520 +       do_div(total, cic->seek_samples);
2521 +       cic->seek_mean = (sector_t)total;
2522 +
2523 +       bfq_log_bfqq(bfqd, bfqq, "dist=%lu mean=%lu", sdist, cic->seek_mean);
2524 +}
2525 +
2526 +/*
2527 + * Disable idle window if the process thinks too long or seeks so much that
2528 + * it doesn't matter.
2529 + */
2530 +static void bfq_update_idle_window(struct bfq_data *bfqd,
2531 +                                  struct bfq_queue *bfqq,
2532 +                                  struct cfq_io_context *cic)
2533 +{
2534 +       int enable_idle;
2535 +
2536 +       /* Don't idle for async or idle io prio class. */
2537 +       if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
2538 +               return;
2539 +
2540 +       enable_idle = bfq_bfqq_idle_window(bfqq);
2541 +
2542 +       if (atomic_read(&cic->ioc->nr_tasks) == 0 ||
2543 +           bfqd->bfq_slice_idle == 0 || (bfqd->hw_tag && CIC_SEEKY(cic)))
2544 +               enable_idle = 0;
2545 +       else if (bfq_sample_valid(cic->ttime_samples)) {
2546 +               if (cic->ttime_mean > bfqd->bfq_slice_idle)
2547 +                       enable_idle = 0;
2548 +               else
2549 +                       enable_idle = 1;
2550 +       }
2551 +
2552 +       if (enable_idle)
2553 +               bfq_mark_bfqq_idle_window(bfqq);
2554 +       else
2555 +               bfq_clear_bfqq_idle_window(bfqq);
2556 +
2557 +       bfq_log_bfqq(bfqd, bfqq, "idle_window=%d (%d)",
2558 +                    enable_idle, CIC_SEEKY(cic));
2559 +}
2560 +
2561 +/*
2562 + * Called when a new fs request (rq) is added to bfqq.  Check if there's
2563 + * something we should do about it.
2564 + */
2565 +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2566 +                           struct request *rq)
2567 +{
2568 +       struct cfq_io_context *cic = RQ_CIC(rq);
2569 +
2570 +       if (rq_is_meta(rq))
2571 +               bfqq->meta_pending++;
2572 +
2573 +       bfq_update_io_thinktime(bfqd, cic);
2574 +       bfq_update_io_seektime(bfqd, bfqq, cic, rq);
2575 +       bfq_update_idle_window(bfqd, bfqq, cic);
2576 +
2577 +       cic->last_request_pos = rq->sector + rq->nr_sectors;
2578 +
2579 +       if (bfqq == bfqd->active_queue && bfq_bfqq_wait_request(bfqq)) {
2580 +               /*
2581 +                * If we are waiting for a request for this queue, let it rip
2582 +                * immediately and flag that we must not expire this queue
2583 +                * just now.
2584 +                */
2585 +               bfq_clear_bfqq_wait_request(bfqq);
2586 +               del_timer(&bfqd->idle_slice_timer);
2587 +               blk_start_queueing(bfqd->queue);
2588 +       }
2589 +}
2590 +
2591 +static void bfq_insert_request(struct request_queue *q, struct request *rq)
2592 +{
2593 +       struct bfq_data *bfqd = q->elevator->elevator_data;
2594 +       struct bfq_queue *bfqq = RQ_BFQQ(rq);
2595 +
2596 +       bfq_init_prio_data(bfqq, RQ_CIC(rq)->ioc);
2597 +
2598 +       bfq_add_rq_rb(rq);
2599 +
2600 +       list_add_tail(&rq->queuelist, &bfqq->fifo);
2601 +
2602 +       bfq_rq_enqueued(bfqd, bfqq, rq);
2603 +}
2604 +
2605 +static void bfq_update_hw_tag(struct bfq_data *bfqd)
2606 +{
2607 +       bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
2608 +                                    bfqd->rq_in_driver);
2609 +
2610 +       /*
2611 +        * This sample is valid if the number of outstanding requests
2612 +        * is large enough to allow a queueing behavior.  Note that the
2613 +        * sum is not exact, as it's not taking into account deactivated
2614 +        * requests.
2615 +        */
2616 +       if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
2617 +               return;
2618 +
2619 +       if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
2620 +               return;
2621 +
2622 +       bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
2623 +       bfqd->max_rq_in_driver = 0;
2624 +       bfqd->hw_tag_samples = 0;
2625 +}
2626 +
2627 +static void bfq_completed_request(struct request_queue *q, struct request *rq)
2628 +{
2629 +       struct bfq_queue *bfqq = RQ_BFQQ(rq);
2630 +       struct bfq_data *bfqd = bfqq->bfqd;
2631 +       const int sync = rq_is_sync(rq);
2632 +
2633 +       bfq_log_bfqq(bfqd, bfqq, "complete");
2634 +
2635 +       bfq_update_hw_tag(bfqd);
2636 +
2637 +       WARN_ON(!bfqd->rq_in_driver);
2638 +       WARN_ON(!bfqq->dispatched);
2639 +       bfqd->rq_in_driver--;
2640 +       bfqq->dispatched--;
2641 +
2642 +       if (bfq_bfqq_sync(bfqq))
2643 +               bfqd->sync_flight--;
2644 +
2645 +       if (sync)
2646 +               RQ_CIC(rq)->last_end_request = jiffies;
2647 +
2648 +       /*
2649 +        * If this is the active queue, check if it needs to be expired,
2650 +        * or if we want to idle in case it has no pending requests.
2651 +        */
2652 +       if (bfqd->active_queue == bfqq) {
2653 +               if (bfq_bfqq_budget_new(bfqq))
2654 +                       bfq_set_budget_timeout(bfqd);
2655 +
2656 +               if (bfq_bfqq_budget_timeout(bfqq))
2657 +                       bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
2658 +               else if (sync && bfqd->rq_in_driver == 0 &&
2659 +                        RB_EMPTY_ROOT(&bfqq->sort_list))
2660 +                       bfq_arm_slice_timer(bfqd);
2661 +       }
2662 +
2663 +       if (!bfqd->rq_in_driver)
2664 +               bfq_schedule_dispatch(bfqd);
2665 +}
2666 +
2667 +/*
2668 + * We temporarily boost lower priority queues if they are holding fs exclusive
2669 + * resources.  They are boosted to normal prio (CLASS_BE/4).
2670 + */
2671 +static void bfq_prio_boost(struct bfq_queue *bfqq)
2672 +{
2673 +       if (has_fs_excl()) {
2674 +               /*
2675 +                * boost idle prio on transactions that would lock out other
2676 +                * users of the filesystem
2677 +                */
2678 +               if (bfq_class_idle(bfqq))
2679 +                       bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
2680 +               if (bfqq->entity.new_ioprio > IOPRIO_NORM)
2681 +                       bfqq->entity.new_ioprio = IOPRIO_NORM;
2682 +       } else {
2683 +               /*
2684 +                * check if we need to unboost the queue
2685 +                */
2686 +               if (bfqq->entity.new_ioprio_class != bfqq->org_ioprio_class)
2687 +                       bfqq->entity.new_ioprio_class = bfqq->org_ioprio_class;
2688 +               if (bfqq->entity.new_ioprio != bfqq->org_ioprio)
2689 +                       bfqq->entity.new_ioprio = bfqq->org_ioprio;
2690 +       }
2691 +}
2692 +
2693 +static inline int __bfq_may_queue(struct bfq_queue *bfqq)
2694 +{
2695 +       if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
2696 +               bfq_clear_bfqq_must_alloc(bfqq);
2697 +               return ELV_MQUEUE_MUST;
2698 +       }
2699 +
2700 +       return ELV_MQUEUE_MAY;
2701 +}
2702 +
2703 +static int bfq_may_queue(struct request_queue *q, int rw)
2704 +{
2705 +       struct bfq_data *bfqd = q->elevator->elevator_data;
2706 +       struct task_struct *tsk = current;
2707 +       struct cfq_io_context *cic;
2708 +       struct bfq_queue *bfqq;
2709 +
2710 +       /*
2711 +        * Don't force setup of a queue from here, as a call to may_queue
2712 +        * does not necessarily imply that a request actually will be queued.
2713 +        * so just lookup a possibly existing queue, or return 'may queue'
2714 +        * if that fails.
2715 +        */
2716 +       cic = bfq_cic_lookup(bfqd, tsk->io_context);
2717 +       if (cic == NULL)
2718 +               return ELV_MQUEUE_MAY;
2719 +
2720 +       bfqq = cic_to_bfqq(cic, rw & REQ_RW_SYNC);
2721 +       if (bfqq != NULL) {
2722 +               bfq_init_prio_data(bfqq, cic->ioc);
2723 +               bfq_prio_boost(bfqq);
2724 +
2725 +               return __bfq_may_queue(bfqq);
2726 +       }
2727 +
2728 +       return ELV_MQUEUE_MAY;
2729 +}
2730 +
2731 +/*
2732 + * queue lock held here
2733 + */
2734 +static void bfq_put_request(struct request *rq)
2735 +{
2736 +       struct bfq_queue *bfqq = RQ_BFQQ(rq);
2737 +
2738 +       if (bfqq != NULL) {
2739 +               const int rw = rq_data_dir(rq);
2740 +
2741 +               BUG_ON(!bfqq->allocated[rw]);
2742 +               bfqq->allocated[rw]--;
2743 +
2744 +               put_io_context(RQ_CIC(rq)->ioc);
2745 +
2746 +               rq->elevator_private = NULL;
2747 +               rq->elevator_private2 = NULL;
2748 +
2749 +               bfq_put_queue(bfqq);
2750 +       }
2751 +}
2752 +
2753 +/*
2754 + * Allocate bfq data structures associated with this request.
2755 + */
2756 +static int bfq_set_request(struct request_queue *q, struct request *rq,
2757 +                          gfp_t gfp_mask)
2758 +{
2759 +       struct bfq_data *bfqd = q->elevator->elevator_data;
2760 +       struct cfq_io_context *cic;
2761 +       const int rw = rq_data_dir(rq);
2762 +       const int is_sync = rq_is_sync(rq);
2763 +       struct bfq_queue *bfqq;
2764 +       struct bfq_group *bfqg;
2765 +       unsigned long flags;
2766 +
2767 +       might_sleep_if(gfp_mask & __GFP_WAIT);
2768 +
2769 +       cic = bfq_get_io_context(bfqd, gfp_mask);
2770 +
2771 +       spin_lock_irqsave(q->queue_lock, flags);
2772 +
2773 +       if (cic == NULL)
2774 +               goto queue_fail;
2775 +
2776 +       bfqg = bfq_cic_update_cgroup(cic);
2777 +
2778 +       bfqq = cic_to_bfqq(cic, is_sync);
2779 +       if (bfqq == NULL) {
2780 +               bfqq = bfq_get_queue(bfqd, bfqg, is_sync, cic->ioc, gfp_mask);
2781 +               if (bfqq == NULL)
2782 +                       goto queue_fail;
2783 +
2784 +               cic_set_bfqq(cic, bfqq, is_sync);
2785 +       }
2786 +
2787 +       bfqq->allocated[rw]++;
2788 +       atomic_inc(&bfqq->ref);
2789 +
2790 +       spin_unlock_irqrestore(q->queue_lock, flags);
2791 +
2792 +       rq->elevator_private = cic;
2793 +       rq->elevator_private2 = bfqq;
2794 +
2795 +       return 0;
2796 +
2797 +queue_fail:
2798 +       if (cic != NULL)
2799 +               put_io_context(cic->ioc);
2800 +
2801 +       bfq_schedule_dispatch(bfqd);
2802 +       spin_unlock_irqrestore(q->queue_lock, flags);
2803 +
2804 +       return 1;
2805 +}
2806 +
2807 +static void bfq_kick_queue(struct work_struct *work)
2808 +{
2809 +       struct bfq_data *bfqd =
2810 +               container_of(work, struct bfq_data, unplug_work);
2811 +       struct request_queue *q = bfqd->queue;
2812 +       unsigned long flags;
2813 +
2814 +       spin_lock_irqsave(q->queue_lock, flags);
2815 +       blk_start_queueing(q);
2816 +       spin_unlock_irqrestore(q->queue_lock, flags);
2817 +}
2818 +
2819 +/*
2820 + * Timer running if the active_queue is currently idling inside its time slice
2821 + */
2822 +static void bfq_idle_slice_timer(unsigned long data)
2823 +{
2824 +       struct bfq_data *bfqd = (struct bfq_data *)data;
2825 +       struct bfq_queue *bfqq;
2826 +       unsigned long flags;
2827 +       enum bfqq_expiration reason;
2828 +
2829 +       bfq_log(bfqd, "slice_timer expired");
2830 +
2831 +       spin_lock_irqsave(bfqd->queue->queue_lock, flags);
2832 +
2833 +       bfqq = bfqd->active_queue;
2834 +       /*
2835 +        * Theoretical race here: active_queue can be NULL or different
2836 +        * from the queue that was idling if the timer handler spins on
2837 +        * the queue_lock and a new request arrives for the current
2838 +        * queue and there is a full dispatch cycle that changes the
2839 +        * active_queue.  This can hardly happen, but in the worst case
2840 +        * we just expire a queue too early.
2841 +        */
2842 +       if (bfqq != NULL) {
2843 +               reason = BFQ_BFQQ_TOO_IDLE;
2844 +               if (bfq_bfqq_budget_timeout(bfqq))
2845 +                       reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2846 +
2847 +               bfq_bfqq_expire(bfqd, bfqq, 1, reason);
2848 +       }
2849 +
2850 +       bfq_schedule_dispatch(bfqd);
2851 +
2852 +       spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
2853 +}
2854 +
2855 +static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
2856 +{
2857 +       del_timer_sync(&bfqd->idle_slice_timer);
2858 +       kblockd_flush_work(&bfqd->unplug_work);
2859 +}
2860 +
2861 +static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
2862 +                                       struct bfq_queue **bfqq_ptr)
2863 +{
2864 +       struct bfq_group *root_group = bfqd->root_group;
2865 +       struct bfq_queue *bfqq = *bfqq_ptr;
2866 +
2867 +       if (bfqq != NULL) {
2868 +               bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
2869 +               bfq_put_queue(bfqq);
2870 +               *bfqq_ptr = NULL;
2871 +       }
2872 +}
2873 +
2874 +/*
2875 + * Release all the bfqg references to its async queues.  If we are
2876 + * deallocating the group these queues may still contain requests, so
2877 + * we reparent them to the root cgroup (i.e., the only one that will
2878 + * exist for sure untill all the requests on a device are gone).
2879 + */
2880 +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
2881 +{
2882 +       int i, j;
2883 +
2884 +       for (i = 0; i < 2; i++)
2885 +               for (j = 0; j < IOPRIO_BE_NR; j++)
2886 +                       __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
2887 +
2888 +       __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
2889 +}
2890 +
2891 +static void bfq_exit_queue(elevator_t *e)
2892 +{
2893 +       struct bfq_data *bfqd = e->elevator_data;
2894 +       struct request_queue *q = bfqd->queue;
2895 +       struct bfq_queue *bfqq, *n;
2896 +       struct cfq_io_context *cic;
2897 +
2898 +       bfq_shutdown_timer_wq(bfqd);
2899 +
2900 +       spin_lock_irq(q->queue_lock);
2901 +
2902 +       while (!list_empty(&bfqd->cic_list)) {
2903 +               cic = list_entry(bfqd->cic_list.next, struct cfq_io_context,
2904 +                                queue_list);
2905 +               __bfq_exit_single_io_context(bfqd, cic);
2906 +       }
2907 +
2908 +       BUG_ON(bfqd->active_queue != NULL);
2909 +       list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
2910 +               bfq_deactivate_bfqq(bfqd, bfqq, 0);
2911 +
2912 +       bfq_disconnect_groups(bfqd);
2913 +       spin_unlock_irq(q->queue_lock);
2914 +
2915 +       bfq_shutdown_timer_wq(bfqd);
2916 +
2917 +       /* Wait for cic->key accessors to exit their grace periods. */
2918 +       synchronize_rcu();
2919 +
2920 +       BUG_ON(timer_pending(&bfqd->idle_slice_timer));
2921 +
2922 +       bfq_free_root_group(bfqd);
2923 +       kfree(bfqd);
2924 +}
2925 +
2926 +static void *bfq_init_queue(struct request_queue *q)
2927 +{
2928 +       struct bfq_group *bfqg;
2929 +       struct bfq_data *bfqd;
2930 +
2931 +       bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
2932 +       if (bfqd == NULL)
2933 +               return NULL;
2934 +
2935 +       INIT_LIST_HEAD(&bfqd->cic_list);
2936 +
2937 +       bfqd->queue = q;
2938 +
2939 +       bfqg = bfq_alloc_root_group(bfqd, q->node);
2940 +       if (bfqg == NULL) {
2941 +               kfree(bfqd);
2942 +               return NULL;
2943 +       }
2944 +
2945 +       bfqd->root_group = bfqg;
2946 +
2947 +       init_timer(&bfqd->idle_slice_timer);
2948 +       bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
2949 +       bfqd->idle_slice_timer.data = (unsigned long)bfqd;
2950 +
2951 +       INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
2952 +
2953 +       INIT_LIST_HEAD(&bfqd->active_list);
2954 +       INIT_LIST_HEAD(&bfqd->idle_list);
2955 +
2956 +       bfqd->hw_tag = 1;
2957 +
2958 +       bfqd->bfq_max_budget = bfq_max_budget;
2959 +
2960 +       bfqd->bfq_quantum = bfq_quantum;
2961 +       bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
2962 +       bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
2963 +       bfqd->bfq_back_max = bfq_back_max;
2964 +       bfqd->bfq_back_penalty = bfq_back_penalty;
2965 +       bfqd->bfq_slice_idle = bfq_slice_idle;
2966 +       bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
2967 +       bfqd->bfq_timeout[ASYNC] = bfq_timeout_async;
2968 +       bfqd->bfq_timeout[SYNC] = bfq_timeout_sync;
2969 +
2970 +       return bfqd;
2971 +}
2972 +
2973 +static void bfq_slab_kill(void)
2974 +{
2975 +       if (bfq_pool != NULL)
2976 +               kmem_cache_destroy(bfq_pool);
2977 +       if (bfq_ioc_pool != NULL)
2978 +               kmem_cache_destroy(bfq_ioc_pool);
2979 +}
2980 +
2981 +static int __init bfq_slab_setup(void)
2982 +{
2983 +       bfq_pool = KMEM_CACHE(bfq_queue, 0);
2984 +       if (bfq_pool == NULL)
2985 +               goto fail;
2986 +
2987 +       bfq_ioc_pool = kmem_cache_create("bfq_io_context",
2988 +                                        sizeof(struct cfq_io_context),
2989 +                                        __alignof__(struct cfq_io_context),
2990 +                                        0, NULL);
2991 +       if (bfq_ioc_pool == NULL)
2992 +               goto fail;
2993 +
2994 +       return 0;
2995 +fail:
2996 +       bfq_slab_kill();
2997 +       return -ENOMEM;
2998 +}
2999 +
3000 +static ssize_t bfq_var_show(unsigned int var, char *page)
3001 +{
3002 +       return sprintf(page, "%d\n", var);
3003 +}
3004 +
3005 +static ssize_t bfq_var_store(unsigned int *var, const char *page, size_t count)
3006 +{
3007 +       char *p = (char *)page;
3008 +
3009 +       *var = simple_strtoul(p, &p, 10);
3010 +       return count;
3011 +}
3012 +
3013 +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                           \
3014 +static ssize_t __FUNC(elevator_t *e, char *page)                       \
3015 +{                                                                      \
3016 +       struct bfq_data *bfqd = e->elevator_data;                       \
3017 +       unsigned int __data = __VAR;                                    \
3018 +       if (__CONV)                                                     \
3019 +               __data = jiffies_to_msecs(__data);                      \
3020 +       return bfq_var_show(__data, (page));                            \
3021 +}
3022 +SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
3023 +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
3024 +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
3025 +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
3026 +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
3027 +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
3028 +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
3029 +SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
3030 +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[SYNC], 1);
3031 +SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[ASYNC], 1);
3032 +#undef SHOW_FUNCTION
3033 +
3034 +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                        \
3035 +static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)   \
3036 +{                                                                      \
3037 +       struct bfq_data *bfqd = e->elevator_data;                       \
3038 +       unsigned int __data;                                            \
3039 +       int ret = bfq_var_store(&__data, (page), count);                \
3040 +       if (__data < (MIN))                                             \
3041 +               __data = (MIN);                                         \
3042 +       else if (__data > (MAX))                                        \
3043 +               __data = (MAX);                                         \
3044 +       if (__CONV)                                                     \
3045 +               *(__PTR) = msecs_to_jiffies(__data);                    \
3046 +       else                                                            \
3047 +               *(__PTR) = __data;                                      \
3048 +       return ret;                                                     \
3049 +}
3050 +STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
3051 +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
3052 +               INT_MAX, 1);
3053 +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
3054 +               INT_MAX, 1);
3055 +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
3056 +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
3057 +               INT_MAX, 0);
3058 +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
3059 +STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
3060 +               1, INT_MAX, 0);
3061 +STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[ASYNC], 0,
3062 +               INT_MAX, 1);
3063 +#undef STORE_FUNCTION
3064 +
3065 +static inline bfq_service_t bfq_estimated_max_budget(struct bfq_data *bfqd)
3066 +{
3067 +       u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[SYNC]);
3068 +
3069 +       if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
3070 +               return bfq_calc_max_budget(bfqd->peak_rate, timeout);
3071 +       else
3072 +               return bfq_max_budget;
3073 +}
3074 +
3075 +static ssize_t bfq_max_budget_store(elevator_t *e, const char *page,
3076 +                                   size_t count)
3077 +{
3078 +       struct bfq_data *bfqd = e->elevator_data;
3079 +       unsigned int __data;
3080 +       int ret = bfq_var_store(&__data, (page), count);
3081 +
3082 +       if (__data == 0)
3083 +               bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
3084 +       else {
3085 +               if (__data > INT_MAX)
3086 +                       __data = INT_MAX;
3087 +               bfqd->bfq_max_budget = __data;
3088 +       }
3089 +
3090 +       bfqd->bfq_user_max_budget = __data;
3091 +
3092 +       return ret;
3093 +}
3094 +
3095 +static ssize_t bfq_timeout_sync_store(elevator_t *e, const char *page,
3096 +                                     size_t count)
3097 +{
3098 +       struct bfq_data *bfqd = e->elevator_data;
3099 +       unsigned int __data;
3100 +       int ret = bfq_var_store(&__data, (page), count);
3101 +
3102 +       if (__data < 1)
3103 +               __data = 1;
3104 +       else if (__data > INT_MAX)
3105 +               __data = INT_MAX;
3106 +
3107 +       bfqd->bfq_timeout[SYNC] = msecs_to_jiffies(__data);
3108 +       if (bfqd->bfq_user_max_budget == 0)
3109 +               bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
3110 +
3111 +       return ret;
3112 +}
3113 +
3114 +#define BFQ_ATTR(name) \
3115 +       __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
3116 +
3117 +static struct elv_fs_entry bfq_attrs[] = {
3118 +       BFQ_ATTR(quantum),
3119 +       BFQ_ATTR(fifo_expire_sync),
3120 +       BFQ_ATTR(fifo_expire_async),
3121 +       BFQ_ATTR(back_seek_max),
3122 +       BFQ_ATTR(back_seek_penalty),
3123 +       BFQ_ATTR(slice_idle),
3124 +       BFQ_ATTR(max_budget),
3125 +       BFQ_ATTR(max_budget_async_rq),
3126 +       BFQ_ATTR(timeout_sync),
3127 +       BFQ_ATTR(timeout_async),
3128 +       __ATTR_NULL
3129 +};
3130 +
3131 +static struct elevator_type iosched_bfq = {
3132 +       .ops = {
3133 +               .elevator_merge_fn =            bfq_merge,
3134 +               .elevator_merged_fn =           bfq_merged_request,
3135 +               .elevator_merge_req_fn =        bfq_merged_requests,
3136 +               .elevator_allow_merge_fn =      bfq_allow_merge,
3137 +               .elevator_dispatch_fn =         bfq_dispatch_requests,
3138 +               .elevator_add_req_fn =          bfq_insert_request,
3139 +               .elevator_activate_req_fn =     bfq_activate_request,
3140 +               .elevator_deactivate_req_fn =   bfq_deactivate_request,
3141 +               .elevator_queue_empty_fn =      bfq_queue_empty,
3142 +               .elevator_completed_req_fn =    bfq_completed_request,
3143 +               .elevator_former_req_fn =       elv_rb_former_request,
3144 +               .elevator_latter_req_fn =       elv_rb_latter_request,
3145 +               .elevator_set_req_fn =          bfq_set_request,
3146 +               .elevator_put_req_fn =          bfq_put_request,
3147 +               .elevator_may_queue_fn =        bfq_may_queue,
3148 +               .elevator_init_fn =             bfq_init_queue,
3149 +               .elevator_exit_fn =             bfq_exit_queue,
3150 +               .trim =                         bfq_free_io_context,
3151 +       },
3152 +       .elevator_attrs =       bfq_attrs,
3153 +       .elevator_name =        "bfq",
3154 +       .elevator_owner =       THIS_MODULE,
3155 +};
3156 +
3157 +static int __init bfq_init(void)
3158 +{
3159 +       /*
3160 +        * can be 0 on HZ < 1000 setups
3161 +        */
3162 +       if (bfq_slice_idle == 0)
3163 +               bfq_slice_idle = 1;
3164 +
3165 +       if (bfq_timeout_async == 0)
3166 +               bfq_timeout_async = 1;
3167 +
3168 +       if (bfq_slab_setup())
3169 +               return -ENOMEM;
3170 +
3171 +       elv_register(&iosched_bfq);
3172 +
3173 +       return 0;
3174 +}
3175 +
3176 +static void __exit bfq_exit(void)
3177 +{
3178 +       DECLARE_COMPLETION_ONSTACK(all_gone);
3179 +       elv_unregister(&iosched_bfq);
3180 +       bfq_ioc_gone = &all_gone;
3181 +       /* bfq_ioc_gone's update must be visible before reading bfq_ioc_count */
3182 +       smp_wmb();
3183 +       if (elv_ioc_count_read(bfq_ioc_count) != 0)
3184 +               wait_for_completion(&all_gone);
3185 +       bfq_slab_kill();
3186 +}
3187 +
3188 +module_init(bfq_init);
3189 +module_exit(bfq_exit);
3190 +
3191 +MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
3192 +MODULE_LICENSE("GPL");
3193 +MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
3194 diff --git a/block/bfq-sched.c b/block/bfq-sched.c
3195 new file mode 100644
3196 index 0000000..ad0e629
3197 --- /dev/null
3198 +++ b/block/bfq-sched.c
3199 @@ -0,0 +1,950 @@
3200 +/*
3201 + * BFQ: Hierarchical B-WF2Q+ scheduler.
3202 + *
3203 + * Based on ideas and code from CFQ:
3204 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
3205 + *
3206 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
3207 + *                   Paolo Valente <paolo.valente@unimore.it>
3208 + */
3209 +
3210 +#ifdef CONFIG_CGROUP_BFQIO
3211 +#define for_each_entity(entity)        \
3212 +       for (; entity != NULL; entity = entity->parent)
3213 +
3214 +#define for_each_entity_safe(entity, parent) \
3215 +       for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
3216 +
3217 +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
3218 +                                                int extract);
3219 +
3220 +static int bfq_update_next_active(struct bfq_sched_data *sd)
3221 +{
3222 +       struct bfq_group *bfqg;
3223 +       struct bfq_entity *entity, *next_active;
3224 +
3225 +       if (sd->active_entity != NULL)
3226 +               /* will update/requeue at the end of service */
3227 +               return 0;
3228 +
3229 +       /*
3230 +        * NOTE: this can be improved in may ways, such as returning
3231 +        * 1 (and thus propagating upwards the update) only when the
3232 +        * budget changes, or caching the bfqq that will be scheduled
3233 +        * next from this subtree.  By now we worry more about
3234 +        * correctness than about performance...
3235 +        */
3236 +       next_active = bfq_lookup_next_entity(sd, 0);
3237 +       sd->next_active = next_active;
3238 +
3239 +       if (next_active != NULL) {
3240 +               bfqg = container_of(sd, struct bfq_group, sched_data);
3241 +               entity = bfqg->my_entity;
3242 +               if (entity != NULL)
3243 +                       entity->budget = next_active->budget;
3244 +       }
3245 +
3246 +       return 1;
3247 +}
3248 +
3249 +static inline void bfq_check_next_active(struct bfq_sched_data *sd,
3250 +                                        struct bfq_entity *entity)
3251 +{
3252 +       BUG_ON(sd->next_active != entity);
3253 +}
3254 +#else
3255 +#define for_each_entity(entity)        \
3256 +       for (; entity != NULL; entity = NULL)
3257 +
3258 +#define for_each_entity_safe(entity, parent) \
3259 +       for (parent = NULL; entity != NULL; entity = parent)
3260 +
3261 +static inline int bfq_update_next_active(struct bfq_sched_data *sd)
3262 +{
3263 +       return 0;
3264 +}
3265 +
3266 +static inline void bfq_check_next_active(struct bfq_sched_data *sd,
3267 +                                        struct bfq_entity *entity)
3268 +{
3269 +}
3270 +#endif
3271 +
3272 +/*
3273 + * Shift for timestamp calculations.  This actually limits the maximum
3274 + * service allowed in one timestamp delta (small shift values increase it),
3275 + * the maximum total weight that can be used for the queues in the system
3276 + * (big shift values increase it), and the period of virtual time wraparounds.
3277 + */
3278 +#define WFQ_SERVICE_SHIFT      22
3279 +
3280 +/**
3281 + * bfq_gt - compare two timestamps.
3282 + * @a: first ts.
3283 + * @b: second ts.
3284 + *
3285 + * Return @a > @b, dealing with wrapping correctly.
3286 + */
3287 +static inline int bfq_gt(bfq_timestamp_t a, bfq_timestamp_t b)
3288 +{
3289 +       return (s64)(a - b) > 0;
3290 +}
3291 +
3292 +/**
3293 + * bfq_delta - map service into the virtual time domain.
3294 + * @service: amount of service.
3295 + * @weight: scale factor.
3296 + */
3297 +static inline bfq_timestamp_t bfq_delta(bfq_service_t service,
3298 +                                       bfq_weight_t weight)
3299 +{
3300 +       bfq_timestamp_t d = (bfq_timestamp_t)service << WFQ_SERVICE_SHIFT;
3301 +
3302 +       do_div(d, weight);
3303 +       return d;
3304 +}
3305 +
3306 +/**
3307 + * bfq_calc_finish - assign the finish time to an entity.
3308 + * @entity: the entity to act upon.
3309 + * @service: the service to be charged to the entity.
3310 + */
3311 +static inline void bfq_calc_finish(struct bfq_entity *entity,
3312 +                                  bfq_service_t service)
3313 +{
3314 +       BUG_ON(entity->weight == 0);
3315 +
3316 +       entity->finish = entity->start + bfq_delta(service, entity->weight);
3317 +}
3318 +
3319 +/**
3320 + * bfq_entity_of - get an entity from a node.
3321 + * @node: the node field of the entity.
3322 + *
3323 + * Convert a node pointer to the relative entity.  This is used only
3324 + * to simplify the logic of some functions and not as the generic
3325 + * conversion mechanism because, e.g., in the tree walking functions,
3326 + * the check for a %NULL value would be redundant.
3327 + */
3328 +static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
3329 +{
3330 +       struct bfq_entity *entity = NULL;
3331 +
3332 +       if (node != NULL)
3333 +               entity = rb_entry(node, struct bfq_entity, rb_node);
3334 +
3335 +       return entity;
3336 +}
3337 +
3338 +/**
3339 + * bfq_extract - remove an entity from a tree.
3340 + * @root: the tree root.
3341 + * @entity: the entity to remove.
3342 + */
3343 +static inline void bfq_extract(struct rb_root *root,
3344 +                              struct bfq_entity *entity)
3345 +{
3346 +       BUG_ON(entity->tree != root);
3347 +
3348 +       entity->tree = NULL;
3349 +       rb_erase(&entity->rb_node, root);
3350 +}
3351 +
3352 +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
3353 +{
3354 +       struct bfq_queue *bfqq = NULL;
3355 +
3356 +       BUG_ON(entity == NULL);
3357 +
3358 +       if (entity->my_sched_data == NULL)
3359 +               bfqq = container_of(entity, struct bfq_queue, entity);
3360 +
3361 +       return bfqq;
3362 +}
3363 +
3364 +/**
3365 + * bfq_idle_extract - extract an entity from the idle tree.
3366 + * @st: the service tree of the owning @entity.
3367 + * @entity: the entity being removed.
3368 + */
3369 +static void bfq_idle_extract(struct bfq_service_tree *st,
3370 +                            struct bfq_entity *entity)
3371 +{
3372 +       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3373 +       struct rb_node *next;
3374 +
3375 +       BUG_ON(entity->tree != &st->idle);
3376 +
3377 +       if (entity == st->first_idle) {
3378 +               next = rb_next(&entity->rb_node);
3379 +               st->first_idle = bfq_entity_of(next);
3380 +       }
3381 +
3382 +       if (entity == st->last_idle) {
3383 +               next = rb_prev(&entity->rb_node);
3384 +               st->last_idle = bfq_entity_of(next);
3385 +       }
3386 +
3387 +       bfq_extract(&st->idle, entity);
3388 +
3389 +       if (bfqq != NULL)
3390 +               list_del(&bfqq->bfqq_list);
3391 +}
3392 +
3393 +/**
3394 + * bfq_insert - generic tree insertion.
3395 + * @root: tree root.
3396 + * @entity: entity to insert.
3397 + *
3398 + * This is used for the idle and the active tree, since they are both
3399 + * ordered by finish time.
3400 + */
3401 +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
3402 +{
3403 +       struct bfq_entity *entry;
3404 +       struct rb_node **node = &root->rb_node;
3405 +       struct rb_node *parent = NULL;
3406 +
3407 +       BUG_ON(entity->tree != NULL);
3408 +
3409 +       while (*node != NULL) {
3410 +               parent = *node;
3411 +               entry = rb_entry(parent, struct bfq_entity, rb_node);
3412 +
3413 +               if (bfq_gt(entry->finish, entity->finish))
3414 +                       node = &parent->rb_left;
3415 +               else
3416 +                       node = &parent->rb_right;
3417 +       }
3418 +
3419 +       rb_link_node(&entity->rb_node, parent, node);
3420 +       rb_insert_color(&entity->rb_node, root);
3421 +
3422 +       entity->tree = root;
3423 +}
3424 +
3425 +/**
3426 + * bfq_update_min - update the min_start field of a entity.
3427 + * @entity: the entity to update.
3428 + * @node: one of its children.
3429 + *
3430 + * This function is called when @entity may store an invalid value for
3431 + * min_start due to updates to the active tree.  The function  assumes
3432 + * that the subtree rooted at @node (which may be its left or its right
3433 + * child) has a valid min_start value.
3434 + */
3435 +static inline void bfq_update_min(struct bfq_entity *entity,
3436 +                                 struct rb_node *node)
3437 +{
3438 +       struct bfq_entity *child;
3439 +
3440 +       if (node != NULL) {
3441 +               child = rb_entry(node, struct bfq_entity, rb_node);
3442 +               if (bfq_gt(entity->min_start, child->min_start))
3443 +                       entity->min_start = child->min_start;
3444 +       }
3445 +}
3446 +
3447 +/**
3448 + * bfq_update_active_node - recalculate min_start.
3449 + * @node: the node to update.
3450 + *
3451 + * @node may have changed position or one of its children may have moved,
3452 + * this function updates its min_start value.  The left and right subtrees
3453 + * are assumed to hold a correct min_start value.
3454 + */
3455 +static inline void bfq_update_active_node(struct rb_node *node)
3456 +{
3457 +       struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
3458 +
3459 +       entity->min_start = entity->start;
3460 +       bfq_update_min(entity, node->rb_right);
3461 +       bfq_update_min(entity, node->rb_left);
3462 +}
3463 +
3464 +/**
3465 + * bfq_update_active_tree - update min_start for the whole active tree.
3466 + * @node: the starting node.
3467 + *
3468 + * @node must be the deepest modified node after an update.  This function
3469 + * updates its min_start using the values held by its children, assuming
3470 + * that they did not change, and then updates all the nodes that may have
3471 + * changed in the path to the root.  The only nodes that may have changed
3472 + * are the ones in the path or their siblings.
3473 + */
3474 +static void bfq_update_active_tree(struct rb_node *node)
3475 +{
3476 +       struct rb_node *parent;
3477 +
3478 +up:
3479 +       bfq_update_active_node(node);
3480 +
3481 +       parent = rb_parent(node);
3482 +       if (parent == NULL)
3483 +               return;
3484 +
3485 +       if (node == parent->rb_left && parent->rb_right != NULL)
3486 +               bfq_update_active_node(parent->rb_right);
3487 +       else if (parent->rb_left != NULL)
3488 +               bfq_update_active_node(parent->rb_left);
3489 +
3490 +       node = parent;
3491 +       goto up;
3492 +}
3493 +
3494 +/**
3495 + * bfq_active_insert - insert an entity in the active tree of its group/device.
3496 + * @st: the service tree of the entity.
3497 + * @entity: the entity being inserted.
3498 + *
3499 + * The active tree is ordered by finish time, but an extra key is kept
3500 + * per each node, containing the minimum value for the start times of
3501 + * its children (and the node itself), so it's possible to search for
3502 + * the eligible node with the lowest finish time in logarithmic time.
3503 + */
3504 +static void bfq_active_insert(struct bfq_service_tree *st,
3505 +                             struct bfq_entity *entity)
3506 +{
3507 +       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3508 +       struct rb_node *node = &entity->rb_node;
3509 +
3510 +       bfq_insert(&st->active, entity);
3511 +
3512 +       if (node->rb_left != NULL)
3513 +               node = node->rb_left;
3514 +       else if (node->rb_right != NULL)
3515 +               node = node->rb_right;
3516 +
3517 +       bfq_update_active_tree(node);
3518 +
3519 +       if (bfqq != NULL)
3520 +               list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
3521 +}
3522 +
3523 +/**
3524 + * bfq_ioprio_to_weight - calc a weight from an ioprio.
3525 + * @ioprio: the ioprio value to convert.
3526 + */
3527 +static bfq_weight_t bfq_ioprio_to_weight(int ioprio)
3528 +{
3529 +       WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
3530 +       return IOPRIO_BE_NR - ioprio;
3531 +}
3532 +
3533 +static inline void bfq_get_entity(struct bfq_entity *entity)
3534 +{
3535 +       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3536 +       struct bfq_sched_data *sd;
3537 +
3538 +       if (bfqq != NULL) {
3539 +               sd = entity->sched_data;
3540 +               atomic_inc(&bfqq->ref);
3541 +       }
3542 +}
3543 +
3544 +/**
3545 + * bfq_find_deepest - find the deepest node that an extraction can modify.
3546 + * @node: the node being removed.
3547 + *
3548 + * Do the first step of an extraction in an rb tree, looking for the
3549 + * node that will replace @node, and returning the deepest node that
3550 + * the following modifications to the tree can touch.  If @node is the
3551 + * last node in the tree return %NULL.
3552 + */
3553 +static struct rb_node *bfq_find_deepest(struct rb_node *node)
3554 +{
3555 +       struct rb_node *deepest;
3556 +
3557 +       if (node->rb_right == NULL && node->rb_left == NULL)
3558 +               deepest = rb_parent(node);
3559 +       else if (node->rb_right == NULL)
3560 +               deepest = node->rb_left;
3561 +       else if (node->rb_left == NULL)
3562 +               deepest = node->rb_right;
3563 +       else {
3564 +               deepest = rb_next(node);
3565 +               if (deepest->rb_right != NULL)
3566 +                       deepest = deepest->rb_right;
3567 +               else if (rb_parent(deepest) != node)
3568 +                       deepest = rb_parent(deepest);
3569 +       }
3570 +
3571 +       return deepest;
3572 +}
3573 +
3574 +/**
3575 + * bfq_active_extract - remove an entity from the active tree.
3576 + * @st: the service_tree containing the tree.
3577 + * @entity: the entity being removed.
3578 + */
3579 +static void bfq_active_extract(struct bfq_service_tree *st,
3580 +                              struct bfq_entity *entity)
3581 +{
3582 +       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3583 +       struct rb_node *node;
3584 +
3585 +       node = bfq_find_deepest(&entity->rb_node);
3586 +       bfq_extract(&st->active, entity);
3587 +
3588 +       if (node != NULL)
3589 +               bfq_update_active_tree(node);
3590 +
3591 +       if (bfqq != NULL)
3592 +               list_del(&bfqq->bfqq_list);
3593 +}
3594 +
3595 +/**
3596 + * bfq_idle_insert - insert an entity into the idle tree.
3597 + * @st: the service tree containing the tree.
3598 + * @entity: the entity to insert.
3599 + */
3600 +static void bfq_idle_insert(struct bfq_service_tree *st,
3601 +                           struct bfq_entity *entity)
3602 +{
3603 +       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3604 +       struct bfq_entity *first_idle = st->first_idle;
3605 +       struct bfq_entity *last_idle = st->last_idle;
3606 +
3607 +       if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
3608 +               st->first_idle = entity;
3609 +       if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
3610 +               st->last_idle = entity;
3611 +
3612 +       bfq_insert(&st->idle, entity);
3613 +
3614 +       if (bfqq != NULL)
3615 +               list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
3616 +}
3617 +
3618 +/**
3619 + * bfq_forget_entity - remove an entity from the wfq trees.
3620 + * @st: the service tree.
3621 + * @entity: the entity being removed.
3622 + *
3623 + * Update the device status and forget everything about @entity, putting
3624 + * the device reference to it, if it is a queue.  Entities belonging to
3625 + * groups are not refcounted.
3626 + */
3627 +static void bfq_forget_entity(struct bfq_service_tree *st,
3628 +                             struct bfq_entity *entity)
3629 +{
3630 +       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3631 +       struct bfq_sched_data *sd;
3632 +
3633 +       BUG_ON(!entity->on_st);
3634 +
3635 +       entity->on_st = 0;
3636 +       st->wsum -= entity->weight;
3637 +       if (bfqq != NULL) {
3638 +               sd = entity->sched_data;
3639 +               bfq_put_queue(bfqq);
3640 +       }
3641 +}
3642 +
3643 +/**
3644 + * bfq_put_idle_entity - release the idle tree ref of an entity.
3645 + * @st: service tree for the entity.
3646 + * @entity: the entity being released.
3647 + */
3648 +static void bfq_put_idle_entity(struct bfq_service_tree *st,
3649 +                               struct bfq_entity *entity)
3650 +{
3651 +       bfq_idle_extract(st, entity);
3652 +       bfq_forget_entity(st, entity);
3653 +}
3654 +
3655 +/**
3656 + * bfq_forget_idle - update the idle tree if necessary.
3657 + * @st: the service tree to act upon.
3658 + *
3659 + * To preserve the global O(log N) complexity we only remove one entry here;
3660 + * as the idle tree will not grow indefinitely this can be done safely.
3661 + */
3662 +static void bfq_forget_idle(struct bfq_service_tree *st)
3663 +{
3664 +       struct bfq_entity *first_idle = st->first_idle;
3665 +       struct bfq_entity *last_idle = st->last_idle;
3666 +
3667 +       if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
3668 +           !bfq_gt(last_idle->finish, st->vtime)) {
3669 +               /*
3670 +                * Forget the whole idle tree, increasing the vtime past
3671 +                * the last finish time of idle entities.
3672 +                */
3673 +               st->vtime = last_idle->finish;
3674 +       }
3675 +
3676 +       if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
3677 +               bfq_put_idle_entity(st, first_idle);
3678 +}
3679 +
3680 +/**
3681 + * bfq_bfqq_served - update the scheduler status after selection for service.
3682 + * @bfqq: the queue being served.
3683 + * @served: bytes to transfer.
3684 + *
3685 + * NOTE: this can be optimized, as the timestamps of upper level entities
3686 + * are synchronized every time a new bfqq is selected for service.  By now,
3687 + * we keep it to better check consistency.
3688 + */
3689 +static void bfq_bfqq_served(struct bfq_queue *bfqq, bfq_service_t served)
3690 +{
3691 +       struct bfq_entity *entity = &bfqq->entity;
3692 +       struct bfq_service_tree *st;
3693 +
3694 +       for_each_entity(entity) {
3695 +               st = bfq_entity_service_tree(entity);
3696 +
3697 +               entity->service += served;
3698 +
3699 +               WARN_ON_ONCE(entity->service > entity->budget);
3700 +               BUG_ON(st->wsum == 0);
3701 +
3702 +               st->vtime += bfq_delta(served, st->wsum);
3703 +               bfq_forget_idle(st);
3704 +       }
3705 +}
3706 +
3707 +/**
3708 + * bfq_bfqq_charge_full_budget - set the service to the entity budget.
3709 + * @bfqq: the queue that needs a service update.
3710 + *
3711 + * When it's not possible to be fair in the service domain, because
3712 + * a queue is not consuming its budget fast enough (the meaning of
3713 + * fast depends on the timeout parameter), we charge it a full
3714 + * budget.  In this way we should obtain a sort of time-domain
3715 + * fairness among all the seeky/slow queues.
3716 + */
3717 +static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
3718 +{
3719 +       struct bfq_entity *entity = &bfqq->entity;
3720 +
3721 +       bfq_bfqq_served(bfqq, entity->budget - entity->service);
3722 +}
3723 +
3724 +static struct bfq_service_tree *
3725 +__bfq_entity_update_prio(struct bfq_service_tree *old_st,
3726 +                        struct bfq_entity *entity)
3727 +{
3728 +       struct bfq_service_tree *new_st = old_st;
3729 +
3730 +       if (entity->ioprio_changed) {
3731 +               entity->ioprio = entity->new_ioprio;
3732 +               entity->ioprio_class = entity->new_ioprio_class;
3733 +               entity->ioprio_changed = 0;
3734 +
3735 +               old_st->wsum -= entity->weight;
3736 +               entity->weight = bfq_ioprio_to_weight(entity->ioprio);
3737 +
3738 +               /*
3739 +                * NOTE: here we may be changing the weight too early,
3740 +                * this will cause unfairness.  The correct approach
3741 +                * would have required additional complexity to defer
3742 +                * weight changes to the proper time instants (i.e.,
3743 +                * when entity->finish <= old_st->vtime).
3744 +                */
3745 +               new_st = bfq_entity_service_tree(entity);
3746 +               new_st->wsum += entity->weight;
3747 +
3748 +               if (new_st != old_st)
3749 +                       entity->start = new_st->vtime;
3750 +       }
3751 +
3752 +       return new_st;
3753 +}
3754 +
3755 +/**
3756 + * __bfq_activate_entity - activate an entity.
3757 + * @entity: the entity being activated.
3758 + *
3759 + * Called whenever an entity is activated, i.e., it is not active and one
3760 + * of its children receives a new request, or has to be reactivated due to
3761 + * budget exhaustion.  It uses the current budget of the entity (and the
3762 + * service received if @entity is active) of the queue to calculate its
3763 + * timestamps.
3764 + */
3765 +static void __bfq_activate_entity(struct bfq_entity *entity)
3766 +{
3767 +       struct bfq_sched_data *sd = entity->sched_data;
3768 +       struct bfq_service_tree *st = bfq_entity_service_tree(entity);
3769 +
3770 +       if (entity == sd->active_entity) {
3771 +               BUG_ON(entity->tree != NULL);
3772 +               /*
3773 +                * If we are requeueing the current entity we have
3774 +                * to take care of not charging to it service it has
3775 +                * not received.
3776 +                */
3777 +               bfq_calc_finish(entity, entity->service);
3778 +               entity->start = entity->finish;
3779 +               sd->active_entity = NULL;
3780 +       } else if (entity->tree == &st->active) {
3781 +               /*
3782 +                * Requeueing an entity due to a change of some
3783 +                * next_active entity below it.  We reuse the old
3784 +                * start time.
3785 +                */
3786 +               bfq_active_extract(st, entity);
3787 +       } else if (entity->tree == &st->idle) {
3788 +               /*
3789 +                * Must be on the idle tree, bfq_idle_extract() will
3790 +                * check for that.
3791 +                */
3792 +               bfq_idle_extract(st, entity);
3793 +               entity->start = bfq_gt(st->vtime, entity->finish) ?
3794 +                                      st->vtime : entity->finish;
3795 +       } else {
3796 +               /*
3797 +                * The finish time of the entity may be invalid, and
3798 +                * it is in the past for sure, otherwise the queue
3799 +                * would have been on the idle tree.
3800 +                */
3801 +               entity->start = st->vtime;
3802 +               st->wsum += entity->weight;
3803 +               bfq_get_entity(entity);
3804 +
3805 +               BUG_ON(entity->on_st);
3806 +               entity->on_st = 1;
3807 +       }
3808 +
3809 +       st = __bfq_entity_update_prio(st, entity);
3810 +       bfq_calc_finish(entity, entity->budget);
3811 +       bfq_active_insert(st, entity);
3812 +}
3813 +
3814 +/**
3815 + * bfq_activate_entity - activate an entity and its ancestors if necessary.
3816 + * @entity: the entity to activate.
3817 + *
3818 + * Activate @entity and all the entities on the path from it to the root.
3819 + */
3820 +static void bfq_activate_entity(struct bfq_entity *entity)
3821 +{
3822 +       struct bfq_sched_data *sd;
3823 +
3824 +       for_each_entity(entity) {
3825 +               __bfq_activate_entity(entity);
3826 +
3827 +               sd = entity->sched_data;
3828 +               if (!bfq_update_next_active(sd))
3829 +                       /*
3830 +                        * No need to propagate the activation to the
3831 +                        * upper entities, as they will be updated when
3832 +                        * the active entity is rescheduled.
3833 +                        */
3834 +                       break;
3835 +       }
3836 +}
3837 +
3838 +/**
3839 + * __bfq_deactivate_entity - deactivate an entity from its service tree.
3840 + * @entity: the entity to deactivate.
3841 + * @requeue: if false, the entity will not be put into the idle tree.
3842 + *
3843 + * Deactivate an entity, independently from its previous state.  If the
3844 + * entity was not on a service tree just return, otherwise if it is on
3845 + * any scheduler tree, extract it from that tree, and if necessary
3846 + * and if the caller did not specify @requeue, put it on the idle tree.
3847 + *
3848 + * Return %1 if the caller should update the entity hierarchy, i.e.,
3849 + * if the entity was under service or if it was the next_active for
3850 + * its sched_data; return %0 otherwise.
3851 + */
3852 +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
3853 +{
3854 +       struct bfq_sched_data *sd = entity->sched_data;
3855 +       struct bfq_service_tree *st = bfq_entity_service_tree(entity);
3856 +       int was_active = entity == sd->active_entity;
3857 +       int ret = 0;
3858 +
3859 +       if (!entity->on_st)
3860 +               return 0;
3861 +
3862 +       BUG_ON(was_active && entity->tree != NULL);
3863 +
3864 +       if (was_active) {
3865 +               bfq_calc_finish(entity, entity->service);
3866 +               sd->active_entity = NULL;
3867 +       } else if (entity->tree == &st->active)
3868 +               bfq_active_extract(st, entity);
3869 +       else if (entity->tree == &st->idle)
3870 +               bfq_idle_extract(st, entity);
3871 +       else if (entity->tree != NULL)
3872 +               BUG();
3873 +
3874 +       if (was_active || sd->next_active == entity)
3875 +               ret = bfq_update_next_active(sd);
3876 +
3877 +       if (!requeue || !bfq_gt(entity->finish, st->vtime))
3878 +               bfq_forget_entity(st, entity);
3879 +       else
3880 +               bfq_idle_insert(st, entity);
3881 +
3882 +       BUG_ON(sd->active_entity == entity);
3883 +       BUG_ON(sd->next_active == entity);
3884 +
3885 +       return ret;
3886 +}
3887 +
3888 +/**
3889 + * bfq_deactivate_entity - deactivate an entity.
3890 + * @entity: the entity to deactivate.
3891 + * @requeue: true if the entity can be put on the idle tree
3892 + */
3893 +static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
3894 +{
3895 +       struct bfq_sched_data *sd;
3896 +       struct bfq_entity *parent;
3897 +
3898 +       for_each_entity_safe(entity, parent) {
3899 +               sd = entity->sched_data;
3900 +
3901 +               if (!__bfq_deactivate_entity(entity, requeue))
3902 +                       /*
3903 +                        * The parent entity is still backlogged, and
3904 +                        * we don't need to update it as it is still
3905 +                        * under service.
3906 +                        */
3907 +                       break;
3908 +
3909 +               if (sd->next_active != NULL)
3910 +                       /*
3911 +                        * The parent entity is still backlogged and
3912 +                        * the budgets on the path towards the root
3913 +                        * need to be updated.
3914 +                        */
3915 +                       goto update;
3916 +
3917 +               /*
3918 +                * If we reach there the parent is no more backlogged and
3919 +                * we want to propagate the dequeue upwards.
3920 +                */
3921 +               requeue = 1;
3922 +       }
3923 +
3924 +       return;
3925 +
3926 +update:
3927 +       entity = parent;
3928 +       for_each_entity(entity) {
3929 +               __bfq_activate_entity(entity);
3930 +
3931 +               sd = entity->sched_data;
3932 +               if (!bfq_update_next_active(sd))
3933 +                       break;
3934 +       }
3935 +}
3936 +
3937 +/**
3938 + * bfq_update_vtime - update vtime if necessary.
3939 + * @st: the service tree to act upon.
3940 + *
3941 + * If necessary update the service tree vtime to have at least one
3942 + * eligible entity, skipping to its start time.  Assumes that the
3943 + * active tree of the device is not empty.
3944 + *
3945 + * NOTE: this hierarchical implementation updates vtimes quite often,
3946 + * we may end up with reactivated tasks getting timestamps after a
3947 + * vtime skip done because we needed a ->first_active entity on some
3948 + * intermediate node.
3949 + */
3950 +static void bfq_update_vtime(struct bfq_service_tree *st)
3951 +{
3952 +       struct bfq_entity *entry;
3953 +       struct rb_node *node = st->active.rb_node;
3954 +
3955 +       entry = rb_entry(node, struct bfq_entity, rb_node);
3956 +       if (bfq_gt(entry->min_start, st->vtime)) {
3957 +               st->vtime = entry->min_start;
3958 +               bfq_forget_idle(st);
3959 +       }
3960 +}
3961 +
3962 +/**
3963 + * bfq_first_active - find the eligible entity with the smallest finish time
3964 + * @st: the service tree to select from.
3965 + *
3966 + * This function searches the first schedulable entity, starting from the
3967 + * root of the tree and going on the left every time on this side there is
3968 + * a subtree with at least one eligible (start >= vtime) entity.  The path
3969 + * on the right is followed only if a) the left subtree contains no eligible
3970 + * entities and b) no eligible entity has been found yet.
3971 + */
3972 +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
3973 +{
3974 +       struct bfq_entity *entry, *first = NULL;
3975 +       struct rb_node *node = st->active.rb_node;
3976 +
3977 +       while (node != NULL) {
3978 +               entry = rb_entry(node, struct bfq_entity, rb_node);
3979 +left:
3980 +               if (!bfq_gt(entry->start, st->vtime))
3981 +                       first = entry;
3982 +
3983 +               BUG_ON(bfq_gt(entry->min_start, st->vtime));
3984 +
3985 +               if (node->rb_left != NULL) {
3986 +                       entry = rb_entry(node->rb_left,
3987 +                                        struct bfq_entity, rb_node);
3988 +                       if (!bfq_gt(entry->min_start, st->vtime)) {
3989 +                               node = node->rb_left;
3990 +                               goto left;
3991 +                       }
3992 +               }
3993 +               if (first != NULL)
3994 +                       break;
3995 +               node = node->rb_right;
3996 +       }
3997 +
3998 +       BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
3999 +       return first;
4000 +}
4001 +
4002 +/**
4003 + * __bfq_lookup_next_entity - return the first eligible entity in @st.
4004 + * @st: the service tree.
4005 + *
4006 + * Update the virtual time in @st and return the first eligible entity
4007 + * it contains.
4008 + */
4009 +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st)
4010 +{
4011 +       struct bfq_entity *entity;
4012 +
4013 +       if (RB_EMPTY_ROOT(&st->active))
4014 +               return NULL;
4015 +
4016 +       bfq_update_vtime(st);
4017 +       entity = bfq_first_active_entity(st);
4018 +       BUG_ON(bfq_gt(entity->start, st->vtime));
4019 +
4020 +       return entity;
4021 +}
4022 +
4023 +/**
4024 + * bfq_lookup_next_entity - return the first eligible entity in @sd.
4025 + * @sd: the sched_data.
4026 + * @extract: if true the returned entity will be also extracted from @sd.
4027 + *
4028 + * NOTE: since we cache the next_active entity at each level of the
4029 + * hierarchy, the complexity of the lookup can be decreased with
4030 + * absolutely no effort just returning the cached next_active value;
4031 + * we prefer to do full lookups to test the consistency of * the data
4032 + * structures.
4033 + */
4034 +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4035 +                                                int extract)
4036 +{
4037 +       struct bfq_service_tree *st = sd->service_tree;
4038 +       struct bfq_entity *entity;
4039 +       int i;
4040 +
4041 +       BUG_ON(sd->active_entity != NULL);
4042 +
4043 +       for (i = 0; i < BFQ_IOPRIO_CLASSES; i++, st++) {
4044 +               entity = __bfq_lookup_next_entity(st);
4045 +               if (entity != NULL) {
4046 +                       if (extract) {
4047 +                               bfq_check_next_active(sd, entity);
4048 +                               bfq_active_extract(st, entity);
4049 +                               sd->active_entity = entity;
4050 +                               sd->next_active = NULL;
4051 +                       }
4052 +                       break;
4053 +               }
4054 +       }
4055 +
4056 +       return entity;
4057 +}
4058 +
4059 +/*
4060 + * Get next queue for service.
4061 + */
4062 +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
4063 +{
4064 +       struct bfq_entity *entity = NULL;
4065 +       struct bfq_sched_data *sd;
4066 +       struct bfq_queue *bfqq;
4067 +
4068 +       BUG_ON(bfqd->active_queue != NULL);
4069 +
4070 +       if (bfqd->busy_queues == 0)
4071 +               return NULL;
4072 +
4073 +       sd = &bfqd->root_group->sched_data;
4074 +       for (; sd != NULL; sd = entity->my_sched_data) {
4075 +               entity = bfq_lookup_next_entity(sd, 1);
4076 +               BUG_ON(entity == NULL);
4077 +               entity->service = 0;
4078 +       }
4079 +
4080 +       bfqq = bfq_entity_to_bfqq(entity);
4081 +       BUG_ON(bfqq == NULL);
4082 +
4083 +       return bfqq;
4084 +}
4085 +
4086 +static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
4087 +{
4088 +       if (bfqd->active_cic != NULL) {
4089 +               put_io_context(bfqd->active_cic->ioc);
4090 +               bfqd->active_cic = NULL;
4091 +       }
4092 +
4093 +       bfqd->active_queue = NULL;
4094 +       del_timer(&bfqd->idle_slice_timer);
4095 +}
4096 +
4097 +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4098 +                               int requeue)
4099 +{
4100 +       struct bfq_entity *entity = &bfqq->entity;
4101 +
4102 +       if (bfqq == bfqd->active_queue)
4103 +               __bfq_bfqd_reset_active(bfqd);
4104 +
4105 +       bfq_deactivate_entity(entity, requeue);
4106 +}
4107 +
4108 +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
4109 +{
4110 +       struct bfq_entity *entity = &bfqq->entity;
4111 +
4112 +       bfq_activate_entity(entity);
4113 +}
4114 +
4115 +/*
4116 + * Called when the bfqq no longer has requests pending, remove it from
4117 + * the service tree.
4118 + */
4119 +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4120 +                             int requeue)
4121 +{
4122 +       BUG_ON(!bfq_bfqq_busy(bfqq));
4123 +       BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
4124 +
4125 +       bfq_log_bfqq(bfqd, bfqq, "del from busy");
4126 +
4127 +       bfq_clear_bfqq_busy(bfqq);
4128 +
4129 +       BUG_ON(bfqd->busy_queues == 0);
4130 +       bfqd->busy_queues--;
4131 +
4132 +       bfq_deactivate_bfqq(bfqd, bfqq, requeue);
4133 +}
4134 +
4135 +/*
4136 + * Called when an inactive queue receives a new request.
4137 + */
4138 +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
4139 +{
4140 +       BUG_ON(bfq_bfqq_busy(bfqq));
4141 +       BUG_ON(bfqq == bfqd->active_queue);
4142 +
4143 +       bfq_log_bfqq(bfqd, bfqq, "add to busy");
4144 +
4145 +       bfq_activate_bfqq(bfqd, bfqq);
4146 +
4147 +       bfq_mark_bfqq_busy(bfqq);
4148 +       bfqd->busy_queues++;
4149 +}
4150 diff --git a/block/bfq.h b/block/bfq.h
4151 new file mode 100644
4152 index 0000000..421bbe2
4153 --- /dev/null
4154 +++ b/block/bfq.h
4155 @@ -0,0 +1,514 @@
4156 +/*
4157 + * BFQ: data structures and common functions prototypes.
4158 + *
4159 + * Based on ideas and code from CFQ:
4160 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
4161 + *
4162 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
4163 + *                   Paolo Valente <paolo.valente@unimore.it>
4164 + */
4165 +
4166 +#ifndef _BFQ_H
4167 +#define _BFQ_H
4168 +
4169 +#include <linux/blktrace_api.h>
4170 +#include <linux/hrtimer.h>
4171 +#include <linux/ioprio.h>
4172 +#include <linux/rbtree.h>
4173 +
4174 +#define ASYNC                  0
4175 +#define SYNC                   1
4176 +
4177 +#define BFQ_IOPRIO_CLASSES     3
4178 +
4179 +#define BFQ_DEFAULT_GRP_IOPRIO 4
4180 +#define BFQ_DEFAULT_GRP_CLASS  IOPRIO_CLASS_BE
4181 +
4182 +typedef u64 bfq_timestamp_t;
4183 +typedef unsigned long bfq_weight_t;
4184 +typedef unsigned long bfq_service_t;
4185 +
4186 +struct bfq_entity;
4187 +
4188 +/**
4189 + * struct bfq_service_tree - per ioprio_class service tree.
4190 + * @active: tree for active entities (i.e., those backlogged).
4191 + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
4192 + * @first_idle: idle entity with minimum F_i.
4193 + * @last_idle: idle entity with maximum F_i.
4194 + * @vtime: scheduler virtual time.
4195 + * @wsum: scheduler weight sum; active and idle entities contribute to it.
4196 + *
4197 + * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
4198 + * ioprio_class has its own independent scheduler, and so its own
4199 + * bfq_service_tree.  All the fields are protected by the queue lock
4200 + * of the containing bfqd.
4201 + */
4202 +struct bfq_service_tree {
4203 +       struct rb_root active;
4204 +       struct rb_root idle;
4205 +
4206 +       struct bfq_entity *first_idle;
4207 +       struct bfq_entity *last_idle;
4208 +
4209 +       bfq_timestamp_t vtime;
4210 +       bfq_weight_t wsum;
4211 +};
4212 +
4213 +/**
4214 + * struct bfq_sched_data - multi-class scheduler.
4215 + * @active_entity: entity under service.
4216 + * @next_active: head-of-the-line entity in the scheduler.
4217 + * @service_tree: array of service trees, one per ioprio_class.
4218 + *
4219 + * bfq_sched_data is the basic scheduler queue.  It supports three
4220 + * ioprio_classes, and can be used either as a toplevel queue or as
4221 + * an intermediate queue on a hierarchical setup.
4222 + * @next_active points to the active entity of the sched_data service
4223 + * trees that will be scheduled next.
4224 + *
4225 + * The supported ioprio_classes are the same as in CFQ, in descending
4226 + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
4227 + * Requests from higher priority queues are served before all the
4228 + * requests from lower priority queues; among requests of the same
4229 + * queue requests are served according to B-WF2Q+.
4230 + * All the fields are protected by the queue lock of the containing bfqd.
4231 + */
4232 +struct bfq_sched_data {
4233 +       struct bfq_entity *active_entity;
4234 +       struct bfq_entity *next_active;
4235 +       struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
4236 +};
4237 +
4238 +/**
4239 + * struct bfq_entity - schedulable entity.
4240 + * @rb_node: service_tree member.
4241 + * @on_st: flag, true if the entity is on a tree (either the active or
4242 + *         the idle one of its service_tree).
4243 + * @finish: B-WF2Q+ finish timestamp (aka F_i).
4244 + * @start: B-WF2Q+ start timestamp (aka S_i).
4245 + * @tree: tree the entity is enqueued into; %NULL if not on a tree.
4246 + * @min_start: minimum start time of the (active) subtree rooted at
4247 + *             this entity; used for O(log N) lookups into active trees.
4248 + * @service: service received during the last round of service.
4249 + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
4250 + * @weight: weight of the queue, calculated as IOPRIO_BE_NR - @ioprio.
4251 + * @parent: parent entity, for hierarchical scheduling.
4252 + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
4253 + *                 associated scheduler queue, %NULL on leaf nodes.
4254 + * @sched_data: the scheduler queue this entity belongs to.
4255 + * @ioprio: the ioprio in use.
4256 + * @new_ioprio: when an ioprio change is requested, the new ioprio value
4257 + * @ioprio_class: the ioprio_class in use.
4258 + * @new_ioprio_class: when an ioprio_class change is requested, the new
4259 + *                    ioprio_class value.
4260 + * @ioprio_changed: flag, true when the user requested an ioprio or
4261 + *                  ioprio_class change.
4262 + *
4263 + * A bfq_entity is used to represent either a bfq_queue (leaf node in the
4264 + * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
4265 + * entity belongs to the sched_data of the parent group in the cgroup
4266 + * hierarchy.  Non-leaf entities have also their own sched_data, stored
4267 + * in @my_sched_data.
4268 + *
4269 + * Each entity stores independently its priority values; this would allow
4270 + * different weights on different devices, but this functionality is not
4271 + * exported to userspace by now.  Priorities are updated lazily, first
4272 + * storing the new values into the new_* fields, then setting the
4273 + * @ioprio_changed flag.  As soon as there is a transition in the entity
4274 + * state that allows the priority update to take place the effective and
4275 + * the requested priority values are synchronized.
4276 + *
4277 + * The weight value is calculated from the ioprio to export the same
4278 + * interface as CFQ.  When dealing with ``well-behaved'' queues (i.e.,
4279 + * queues that do not spend too much time to consume their budget and
4280 + * have true sequential behavior, and when there are no external factors
4281 + * breaking anticipation) the relative weights at each level of the
4282 + * cgroups hierarchy should be guaranteed.
4283 + * All the fields are protected by the queue lock of the containing bfqd.
4284 + */
4285 +struct bfq_entity {
4286 +       struct rb_node rb_node;
4287 +
4288 +       int on_st;
4289 +
4290 +       bfq_timestamp_t finish;
4291 +       bfq_timestamp_t start;
4292 +
4293 +       struct rb_root *tree;
4294 +
4295 +       bfq_timestamp_t min_start;
4296 +
4297 +       bfq_service_t service, budget;
4298 +       bfq_weight_t weight;
4299 +
4300 +       struct bfq_entity *parent;
4301 +
4302 +       struct bfq_sched_data *my_sched_data;
4303 +       struct bfq_sched_data *sched_data;
4304 +
4305 +       unsigned short ioprio, new_ioprio;
4306 +       unsigned short ioprio_class, new_ioprio_class;
4307 +
4308 +       int ioprio_changed;
4309 +};
4310 +
4311 +struct bfq_group;
4312 +
4313 +/**
4314 + * struct bfq_data - per device data structure.
4315 + * @queue: request queue for the managed device.
4316 + * @root_group: root bfq_group for the device.
4317 + * @busy_queues: number of bfq_queues containing requests (including the
4318 + *              queue under service, even if it is idling).
4319 + * @queued: number of queued requests.
4320 + * @rq_in_driver: number of requests dispatched and waiting for completion.
4321 + * @sync_flight: number of sync requests in the driver.
4322 + * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
4323 + *                   completed requests .
4324 + * @hw_tag_samples: nr of samples used to calculate hw_tag.
4325 + * @hw_tag: flag set to one if the driver is showing a queueing behavior.
4326 + * @idle_slice_timer: timer set when idling for the next sequential request
4327 + *                    from the queue under service.
4328 + * @unplug_work: delayed work to restart dispatching on the request queue.
4329 + * @active_queue: bfq_queue under service.
4330 + * @active_cic: cfq_io_context (cic) associated with the @active_queue.
4331 + * @last_position: on-disk position of the last served request.
4332 + * @last_budget_start: beginning of the last budget.
4333 + * @last_idling_start: beginning of the last idle slice.
4334 + * @peak_rate: peak transfer rate observed for a budget.
4335 + * @peak_rate_samples: number of samples used to calculate @peak_rate.
4336 + * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
4337 + * @cic_list: list of all the cics active on the bfq_data device.
4338 + * @group_list: list of all the bfq_groups active on the device.
4339 + * @active_list: list of all the bfq_queues active on the device.
4340 + * @idle_list: list of all the bfq_queues idle on the device.
4341 + * @bfq_quantum: max number of requests dispatched per dispatch round.
4342 + * @bfq_fifo_expire: timeout for async/sync requests; when it expires
4343 + *                   requests are served in fifo order.
4344 + * @bfq_back_penalty: weight of backward seeks wrt forward ones.
4345 + * @bfq_back_max: maximum allowed backward seek.
4346 + * @bfq_slice_idle: maximum idling time.
4347 + * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
4348 + * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
4349 + *                           async queues.
4350 + * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
4351 + *               to prevent seeky queues to impose long latencies to well
4352 + *               behaved ones (this also implies that seeky queues cannot
4353 + *               receive guarantees in the service domain; after a timeout
4354 + *               they are charged for the whole allocated budget, to try
4355 + *               to preserve a behavior reasonably fair among them, but
4356 + *               without service-domain guarantees).
4357 + *
4358 + * All the fields are protected by the @queue lock.
4359 + */
4360 +struct bfq_data {
4361 +       struct request_queue *queue;
4362 +
4363 +       struct bfq_group *root_group;
4364 +
4365 +       int busy_queues;
4366 +       int queued;
4367 +       int rq_in_driver;
4368 +       int sync_flight;
4369 +
4370 +       int max_rq_in_driver;
4371 +       int hw_tag_samples;
4372 +       int hw_tag;
4373 +
4374 +       struct timer_list idle_slice_timer;
4375 +       struct work_struct unplug_work;
4376 +
4377 +       struct bfq_queue *active_queue;
4378 +       struct cfq_io_context *active_cic;
4379 +
4380 +       sector_t last_position;
4381 +
4382 +       ktime_t last_budget_start;
4383 +       ktime_t last_idling_start;
4384 +       int peak_rate_samples;
4385 +       u64 peak_rate;
4386 +       bfq_service_t bfq_max_budget;
4387 +
4388 +       struct list_head cic_list;
4389 +       struct hlist_head group_list;
4390 +       struct list_head active_list;
4391 +       struct list_head idle_list;
4392 +
4393 +       unsigned int bfq_quantum;
4394 +       unsigned int bfq_fifo_expire[2];
4395 +       unsigned int bfq_back_penalty;
4396 +       unsigned int bfq_back_max;
4397 +       unsigned int bfq_slice_idle;
4398 +
4399 +       unsigned int bfq_user_max_budget;
4400 +       unsigned int bfq_max_budget_async_rq;
4401 +       unsigned int bfq_timeout[2];
4402 +};
4403 +
4404 +/**
4405 + * struct bfq_queue - leaf schedulable entity.
4406 + * @ref: reference counter.
4407 + * @bfqd: parent bfq_data.
4408 + * @sort_list: sorted list of pending requests.
4409 + * @next_rq: if fifo isn't expired, next request to serve.
4410 + * @queued: nr of requests queued in @sort_list.
4411 + * @allocated: currently allocated requests.
4412 + * @meta_pending: pending metadata requests.
4413 + * @fifo: fifo list of requests in sort_list.
4414 + * @entity: entity representing this queue in the scheduler.
4415 + * @max_budget: maximum budget allowed from the feedback mechanism.
4416 + * @budget_timeout: budget expiration (in jiffies).
4417 + * @dispatched: number of requests on the dispatch list or inside driver.
4418 + * @budgets_assigned: number of budgets assigned.
4419 + * @org_ioprio: saved ioprio during boosted periods.
4420 + * @org_ioprio_class: saved ioprio_class during boosted periods.
4421 + * @flags: status flags.
4422 + * @bfqq_list: node for active/idle bfqq list inside our bfqd.
4423 + * @pid: pid of the process owning the queue, used for logging purposes.
4424 + *
4425 + * A bfq_queue is a leaf request queue; it can be associated to an io_context
4426 + * or more (if it is an async one).  @cgroup holds a reference to the
4427 + * cgroup, to be sure that it does not disappear while a bfqq still
4428 + * references it (mostly to avoid races between request issuing and task
4429 + * migration followed by cgroup distruction).
4430 + * All the fields are protected by the queue lock of the containing bfqd.
4431 + */
4432 +struct bfq_queue {
4433 +       atomic_t ref;
4434 +       struct bfq_data *bfqd;
4435 +
4436 +       struct rb_root sort_list;
4437 +       struct request *next_rq;
4438 +       int queued[2];
4439 +       int allocated[2];
4440 +       int meta_pending;
4441 +       struct list_head fifo;
4442 +
4443 +       struct bfq_entity entity;
4444 +
4445 +       bfq_service_t max_budget;
4446 +       unsigned long budget_timeout;
4447 +
4448 +       int dispatched;
4449 +       int budgets_assigned;
4450 +
4451 +       unsigned short org_ioprio;
4452 +       unsigned short org_ioprio_class;
4453 +
4454 +       unsigned int flags;
4455 +
4456 +       struct list_head bfqq_list;
4457 +
4458 +       pid_t pid;
4459 +};
4460 +
4461 +enum bfqq_state_flags {
4462 +       BFQ_BFQQ_FLAG_busy = 0,         /* has requests or is under service */
4463 +       BFQ_BFQQ_FLAG_wait_request,     /* waiting for a request */
4464 +       BFQ_BFQQ_FLAG_must_alloc,       /* must be allowed rq alloc */
4465 +       BFQ_BFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
4466 +       BFQ_BFQQ_FLAG_idle_window,      /* slice idling enabled */
4467 +       BFQ_BFQQ_FLAG_prio_changed,     /* task priority has changed */
4468 +       BFQ_BFQQ_FLAG_sync,             /* synchronous queue */
4469 +       BFQ_BFQQ_FLAG_budget_new,       /* no completion with this budget */
4470 +};
4471 +
4472 +#define BFQ_BFQQ_FNS(name)                                             \
4473 +static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)                \
4474 +{                                                                      \
4475 +       (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name);                   \
4476 +}                                                                      \
4477 +static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)       \
4478 +{                                                                      \
4479 +       (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name);                  \
4480 +}                                                                      \
4481 +static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq)                \
4482 +{                                                                      \
4483 +       return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;      \
4484 +}
4485 +
4486 +BFQ_BFQQ_FNS(busy);
4487 +BFQ_BFQQ_FNS(wait_request);
4488 +BFQ_BFQQ_FNS(must_alloc);
4489 +BFQ_BFQQ_FNS(fifo_expire);
4490 +BFQ_BFQQ_FNS(idle_window);
4491 +BFQ_BFQQ_FNS(prio_changed);
4492 +BFQ_BFQQ_FNS(sync);
4493 +BFQ_BFQQ_FNS(budget_new);
4494 +#undef BFQ_BFQQ_FNS
4495 +
4496 +/* Logging facilities. */
4497 +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
4498 +       blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
4499 +
4500 +#define bfq_log(bfqd, fmt, args...) \
4501 +       blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
4502 +
4503 +/* Expiration reasons. */
4504 +enum bfqq_expiration {
4505 +       BFQ_BFQQ_TOO_IDLE = 0,          /* queue has been idling for too long */
4506 +       BFQ_BFQQ_BUDGET_TIMEOUT,        /* budget took too long to be used */
4507 +       BFQ_BFQQ_BUDGET_EXHAUSTED,      /* budget consumed */
4508 +       BFQ_BFQQ_NO_MORE_REQUESTS,      /* the queue has no more requests */
4509 +};
4510 +
4511 +#ifdef CONFIG_CGROUP_BFQIO
4512 +/**
4513 + * struct bfq_group - per (device, cgroup) data structure.
4514 + * @entity: schedulable entity to insert into the parent group sched_data.
4515 + * @sched_data: own sched_data, to contain child entities (they may be
4516 + *              both bfq_queues and bfq_groups).
4517 + * @group_node: node to be inserted into the bfqio_cgroup->group_data
4518 + *              list of the containing cgroup's bfqio_cgroup.
4519 + * @bfqd_node: node to be inserted into the @bfqd->group_list list
4520 + *             of the groups active on the same device; used for cleanup.
4521 + * @bfqd: the bfq_data for the device this group acts upon.
4522 + * @async_bfqq: array of async queues for all the tasks belonging to
4523 + *              the group, one queue per ioprio value per ioprio_class,
4524 + *              except for the idle class that has only one queue.
4525 + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
4526 + * @my_entity: pointer to @entity, %NULL for the toplevel group; used
4527 + *             to avoid too many special cases during group creation/migration.
4528 + *
4529 + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
4530 + * there is a set of bfq_groups, each one collecting the lower-level
4531 + * entities belonging to the group that are acting on the same device.
4532 + *
4533 + * Locking works as follows:
4534 + *    o @group_node is protected by the bfqio_cgroup lock, and is accessed
4535 + *      via RCU from its readers.
4536 + *    o @bfqd is protected by the queue lock, RCU is used to access it
4537 + *      from the readers.
4538 + *    o All the other fields are protected by the @bfqd queue lock.
4539 + */
4540 +struct bfq_group {
4541 +       struct bfq_entity entity;
4542 +       struct bfq_sched_data sched_data;
4543 +
4544 +       struct hlist_node group_node;
4545 +       struct hlist_node bfqd_node;
4546 +
4547 +       void *bfqd;
4548 +
4549 +       struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
4550 +       struct bfq_queue *async_idle_bfqq;
4551 +
4552 +       struct bfq_entity *my_entity;
4553 +};
4554 +
4555 +/**
4556 + * struct bfqio_cgroup - bfq cgroup data structure.
4557 + * @css: subsystem state for bfq in the containing cgroup.
4558 + * @ioprio: cgroup ioprio.
4559 + * @ioprio_class: cgroup ioprio_class.
4560 + * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
4561 + * @group_data: list containing the bfq_group belonging to this cgroup.
4562 + *
4563 + * @group_data is accessed using RCU, with @lock protecting the updates,
4564 + * @ioprio and @ioprio_class are protected by @lock.
4565 + */
4566 +struct bfqio_cgroup {
4567 +       struct cgroup_subsys_state css;
4568 +
4569 +       unsigned short ioprio, ioprio_class;
4570 +
4571 +       spinlock_t lock;
4572 +       struct hlist_head group_data;
4573 +};
4574 +#else
4575 +struct bfq_group {
4576 +       struct bfq_sched_data sched_data;
4577 +
4578 +       struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
4579 +       struct bfq_queue *async_idle_bfqq;
4580 +};
4581 +#endif
4582 +
4583 +static inline struct bfq_service_tree *
4584 +bfq_entity_service_tree(struct bfq_entity *entity)
4585 +{
4586 +       struct bfq_sched_data *sched_data = entity->sched_data;
4587 +       unsigned int idx = entity->ioprio_class - 1;
4588 +
4589 +       BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
4590 +       BUG_ON(sched_data == NULL);
4591 +
4592 +       return sched_data->service_tree + idx;
4593 +}
4594 +
4595 +static inline struct bfq_queue *cic_to_bfqq(struct cfq_io_context *cic,
4596 +                                           int is_sync)
4597 +{
4598 +       return cic->cfqq[!!is_sync];
4599 +}
4600 +
4601 +static inline void cic_set_bfqq(struct cfq_io_context *cic,
4602 +                               struct bfq_queue *bfqq, int is_sync)
4603 +{
4604 +       cic->cfqq[!!is_sync] = bfqq;
4605 +}
4606 +
4607 +static inline void call_for_each_cic(struct io_context *ioc,
4608 +                                    void (*func)(struct io_context *,
4609 +                                    struct cfq_io_context *))
4610 +{
4611 +       struct cfq_io_context *cic;
4612 +       struct hlist_node *n;
4613 +
4614 +       rcu_read_lock();
4615 +       hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list)
4616 +               func(ioc, cic);
4617 +       rcu_read_unlock();
4618 +}
4619 +
4620 +/**
4621 + * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
4622 + * @ptr: a pointer to a bfqd.
4623 + * @flags: storage for the flags to be saved.
4624 + *
4625 + * This function allows cic->key and bfqg->bfqd to be protected by the
4626 + * queue lock of the bfqd they reference; the pointer is dereferenced
4627 + * under RCU, so the storage for bfqd is assured to be safe as long
4628 + * as the RCU read side critical section does not end.  After the
4629 + * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
4630 + * sure that no other writer accessed it.  If we raced with a writer,
4631 + * the function returns NULL, with the queue unlocked, otherwise it
4632 + * returns the dereferenced pointer, with the queue locked.
4633 + */
4634 +static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
4635 +                                                  unsigned long *flags)
4636 +{
4637 +       struct bfq_data *bfqd;
4638 +
4639 +       rcu_read_lock();
4640 +       bfqd = rcu_dereference(*(struct bfq_data **)ptr);
4641 +       if (bfqd != NULL) {
4642 +               spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
4643 +               if (*ptr == bfqd)
4644 +                       goto out;
4645 +               spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
4646 +               bfqd = NULL;
4647 +       }
4648 +
4649 +out:
4650 +       rcu_read_unlock();
4651 +       return bfqd;
4652 +}
4653 +
4654 +static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
4655 +                                      unsigned long *flags)
4656 +{
4657 +       spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
4658 +}
4659 +
4660 +static void bfq_changed_ioprio(struct io_context *ioc,
4661 +                              struct cfq_io_context *cic);
4662 +static void bfq_put_queue(struct bfq_queue *bfqq);
4663 +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
4664 +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
4665 +                                      struct bfq_group *bfqg, int is_sync,
4666 +                                      struct io_context *ioc, gfp_t gfp_mask);
4667 +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
4668 +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
4669 +#endif
4670