summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-04-28 14:27:12 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-04-28 14:27:12 -0700
commit6c0029211382011af508273c4fc98a732f841d95 (patch)
tree63b47915c50542b28d1de48d0e50318afdb32dbb /block
parent16b3d0cf5bad844daaf436ad2e9061de0fe36e5c (diff)
parentf46ec84b5acbf8d7067d71a6bbdde213d4b86036 (diff)
downloadlinux-6c0029211382011af508273c4fc98a732f841d95.tar.gz
linux-6c0029211382011af508273c4fc98a732f841d95.tar.bz2
linux-6c0029211382011af508273c4fc98a732f841d95.zip
Merge tag 'for-5.13/block-2021-04-27' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe: "Pretty quiet round this time, which is nice. In detail: - Series revamping bounce buffer support (Christoph) - Dead code removal (Christoph, Bart) - Partition iteration revamp, now using xarray (Christoph) - Passthrough request scheduler improvements (Lin) - Series of BFQ improvements (Paolo) - Fix ioprio task iteration (Peter) - Various little tweaks and fixes (Tejun, Saravanan, Bhaskar, Max, Nikolay)" * tag 'for-5.13/block-2021-04-27' of git://git.kernel.dk/linux-block: (41 commits) blk-iocost: don't ignore vrate_min on QD contention blk-mq: Fix spurious debugfs directory creation during initialization bfq/mq-deadline: remove redundant check for passthrough request blk-mq: bypass IO scheduler's limit_depth for passthrough request block: Remove an obsolete comment from sg_io() block: move bio_list_copy_data to pktcdvd block: remove zero_fill_bio_iter block: add queue_to_disk() to get gendisk from request_queue block: remove an incorrect check from blk_rq_append_bio block: initialize ret in bdev_disk_changed block: Fix sys_ioprio_set(.which=IOPRIO_WHO_PGRP) task iteration block: remove disk_part_iter block: simplify diskstats_show block: simplify show_partition block: simplify printk_all_partitions block: simplify partition_overlaps block: simplify partition removal block: take bd_mutex around delete_partitions in del_gendisk block: refactor blk_drop_partitions block: move more syncing and invalidation to delete_partition ...
Diffstat (limited to 'block')
-rw-r--r--block/bfq-cgroup.c2
-rw-r--r--block/bfq-iosched.c398
-rw-r--r--block/bfq-iosched.h15
-rw-r--r--block/bfq-wf2q.c8
-rw-r--r--block/bio-integrity.c3
-rw-r--r--block/bio.c43
-rw-r--r--block/blk-core.c6
-rw-r--r--block/blk-iocost.c4
-rw-r--r--block/blk-map.c119
-rw-r--r--block/blk-mq-debugfs.c8
-rw-r--r--block/blk-mq-tag.c8
-rw-r--r--block/blk-mq.c3
-rw-r--r--block/blk-settings.c53
-rw-r--r--block/blk-sysfs.c9
-rw-r--r--block/blk-zoned.c8
-rw-r--r--block/blk.h18
-rw-r--r--block/bounce.c138
-rw-r--r--block/elevator.c3
-rw-r--r--block/genhd.c183
-rw-r--r--block/ioprio.c11
-rw-r--r--block/mq-deadline.c7
-rw-r--r--block/partitions/core.c54
-rw-r--r--block/scsi_ioctl.c6
23 files changed, 585 insertions, 522 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index b791e2041e49..e2f14508f2d6 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -547,6 +547,8 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
entity->orig_weight = entity->weight = entity->new_weight = d->weight;
entity->my_sched_data = &bfqg->sched_data;
+ entity->last_bfqq_created = NULL;
+
bfqg->my_entity = entity; /*
* the root_group's will be set to NULL
* in bfq_init_queue()
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 95586137194e..0270cd7ca165 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1012,7 +1012,7 @@ static void
bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
struct bfq_io_cq *bic, bool bfq_already_existing)
{
- unsigned int old_wr_coeff = bfqq->wr_coeff;
+ unsigned int old_wr_coeff = 1;
bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
if (bic->saved_has_short_ttime)
@@ -1033,7 +1033,13 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
bfqq->ttime = bic->saved_ttime;
bfqq->io_start_time = bic->saved_io_start_time;
bfqq->tot_idle_time = bic->saved_tot_idle_time;
- bfqq->wr_coeff = bic->saved_wr_coeff;
+ /*
+ * Restore weight coefficient only if low_latency is on
+ */
+ if (bfqd->low_latency) {
+ old_wr_coeff = bfqq->wr_coeff;
+ bfqq->wr_coeff = bic->saved_wr_coeff;
+ }
bfqq->service_from_wr = bic->saved_service_from_wr;
bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
@@ -1069,7 +1075,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
- (bfqq->weight_counter != NULL);
+ (bfqq->weight_counter != NULL) - bfqq->stable_ref;
}
/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
@@ -2622,6 +2628,11 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
return true;
}
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq);
+
+static void bfq_put_stable_ref(struct bfq_queue *bfqq);
+
/*
* Attempt to schedule a merge of bfqq with the currently in-service
* queue or with a close queue among the scheduled queues. Return
@@ -2644,11 +2655,50 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
*/
static struct bfq_queue *
bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- void *io_struct, bool request)
+ void *io_struct, bool request, struct bfq_io_cq *bic)
{
struct bfq_queue *in_service_bfqq, *new_bfqq;
/*
+ * Check delayed stable merge for rotational or non-queueing
+ * devs. For this branch to be executed, bfqq must not be
+ * currently merged with some other queue (i.e., bfqq->bic
+ * must be non null). If we considered also merged queues,
+ * then we should also check whether bfqq has already been
+ * merged with bic->stable_merge_bfqq. But this would be
+ * costly and complicated.
+ */
+ if (unlikely(!bfqd->nonrot_with_queueing)) {
+ if (bic->stable_merge_bfqq &&
+ !bfq_bfqq_just_created(bfqq) &&
+ time_is_after_jiffies(bfqq->split_time +
+ msecs_to_jiffies(200))) {
+ struct bfq_queue *stable_merge_bfqq =
+ bic->stable_merge_bfqq;
+ int proc_ref = min(bfqq_process_refs(bfqq),
+ bfqq_process_refs(stable_merge_bfqq));
+
+ /* deschedule stable merge, because done or aborted here */
+ bfq_put_stable_ref(stable_merge_bfqq);
+
+ bic->stable_merge_bfqq = NULL;
+
+ if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
+ proc_ref > 0) {
+ /* next function will take at least one ref */
+ struct bfq_queue *new_bfqq =
+ bfq_setup_merge(bfqq, stable_merge_bfqq);
+
+ bic->stably_merged = true;
+ if (new_bfqq && new_bfqq->bic)
+ new_bfqq->bic->stably_merged = true;
+ return new_bfqq;
+ } else
+ return NULL;
+ }
+ }
+
+ /*
* Do not perform queue merging if the device is non
* rotational and performs internal queueing. In fact, such a
* device reaches a high speed through internal parallelism
@@ -2789,6 +2839,17 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
}
}
+
+static void
+bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq)
+{
+ if (cur_bfqq->entity.parent &&
+ cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq)
+ cur_bfqq->entity.parent->last_bfqq_created = new_bfqq;
+ else if (cur_bfqq->bfqd && cur_bfqq->bfqd->last_bfqq_created == cur_bfqq)
+ cur_bfqq->bfqd->last_bfqq_created = new_bfqq;
+}
+
void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
/*
@@ -2806,6 +2867,8 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfqq != bfqd->in_service_queue)
bfq_del_bfqq_busy(bfqd, bfqq, false);
+ bfq_reassign_last_bfqq(bfqq, NULL);
+
bfq_put_queue(bfqq);
}
@@ -2823,6 +2886,29 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
bfq_clear_bfqq_IO_bound(bfqq);
/*
+ * The processes associated with bfqq are cooperators of the
+ * processes associated with new_bfqq. So, if bfqq has a
+ * waker, then assume that all these processes will be happy
+ * to let bfqq's waker freely inject I/O when they have no
+ * I/O.
+ */
+ if (bfqq->waker_bfqq && !new_bfqq->waker_bfqq &&
+ bfqq->waker_bfqq != new_bfqq) {
+ new_bfqq->waker_bfqq = bfqq->waker_bfqq;
+ new_bfqq->tentative_waker_bfqq = NULL;
+
+ /*
+ * If the waker queue disappears, then
+ * new_bfqq->waker_bfqq must be reset. So insert
+ * new_bfqq into the woken_list of the waker. See
+ * bfq_check_waker for details.
+ */
+ hlist_add_head(&new_bfqq->woken_list_node,
+ &new_bfqq->waker_bfqq->woken_list);
+
+ }
+
+ /*
* If bfqq is weight-raised, then let new_bfqq inherit
* weight-raising. To reduce false positives, neglect the case
* where bfqq has just been created, but has not yet made it
@@ -2879,6 +2965,9 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
*/
new_bfqq->pid = -1;
bfqq->bic = NULL;
+
+ bfq_reassign_last_bfqq(bfqq, new_bfqq);
+
bfq_release_process_ref(bfqd, bfqq);
}
@@ -2906,7 +2995,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
* We take advantage of this function to perform an early merge
* of the queues of possible cooperating processes.
*/
- new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false, bfqd->bio_bic);
if (new_bfqq) {
/*
* bic still points to bfqq, then it has not yet been
@@ -4491,9 +4580,15 @@ check_queue:
bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
bfqq->bic->bfqq[0]->next_rq ?
bfqq->bic->bfqq[0] : NULL;
+ struct bfq_queue *blocked_bfqq =
+ !hlist_empty(&bfqq->woken_list) ?
+ container_of(bfqq->woken_list.first,
+ struct bfq_queue,
+ woken_list_node)
+ : NULL;
/*
- * The next three mutually-exclusive ifs decide
+ * The next four mutually-exclusive ifs decide
* whether to try injection, and choose the queue to
* pick an I/O request from.
*
@@ -4526,7 +4621,15 @@ check_queue:
* next bfqq's I/O is brought forward dramatically,
* for it is not blocked for milliseconds.
*
- * The third if checks whether bfqq is a queue for
+ * The third if checks whether there is a queue woken
+ * by bfqq, and currently with pending I/O. Such a
+ * woken queue does not steal bandwidth from bfqq,
+ * because it remains soon without I/O if bfqq is not
+ * served. So there is virtually no risk of loss of
+ * bandwidth for bfqq if this woken queue has I/O
+ * dispatched while bfqq is waiting for new I/O.
+ *
+ * The fourth if checks whether bfqq is a queue for
* which it is better to avoid injection. It is so if
* bfqq delivers more throughput when served without
* any further I/O from other queues in the middle, or
@@ -4546,11 +4649,11 @@ check_queue:
* bfq_update_has_short_ttime(), it is rather likely
* that, if I/O is being plugged for bfqq and the
* waker queue has pending I/O requests that are
- * blocking bfqq's I/O, then the third alternative
+ * blocking bfqq's I/O, then the fourth alternative
* above lets the waker queue get served before the
* I/O-plugging timeout fires. So one may deem the
* second alternative superfluous. It is not, because
- * the third alternative may be way less effective in
+ * the fourth alternative may be way less effective in
* case of a synchronization. For two main
* reasons. First, throughput may be low because the
* inject limit may be too low to guarantee the same
@@ -4559,7 +4662,7 @@ check_queue:
* guarantees (the second alternative unconditionally
* injects a pending I/O request of the waker queue
* for each bfq_dispatch_request()). Second, with the
- * third alternative, the duration of the plugging,
+ * fourth alternative, the duration of the plugging,
* i.e., the time before bfqq finally receives new I/O,
* may not be minimized, because the waker queue may
* happen to be served only after other queues.
@@ -4577,6 +4680,14 @@ check_queue:
bfq_bfqq_budget_left(bfqq->waker_bfqq)
)
bfqq = bfqq->waker_bfqq;
+ else if (blocked_bfqq &&
+ bfq_bfqq_busy(blocked_bfqq) &&
+ blocked_bfqq->next_rq &&
+ bfq_serv_to_charge(blocked_bfqq->next_rq,
+ blocked_bfqq) <=
+ bfq_bfqq_budget_left(blocked_bfqq)
+ )
+ bfqq = blocked_bfqq;
else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
(bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
!bfq_bfqq_has_short_ttime(bfqq)))
@@ -4983,6 +5094,12 @@ void bfq_put_queue(struct bfq_queue *bfqq)
bfqg_and_blkg_put(bfqg);
}
+static void bfq_put_stable_ref(struct bfq_queue *bfqq)
+{
+ bfqq->stable_ref--;
+ bfq_put_queue(bfqq);
+}
+
static void bfq_put_cooperator(struct bfq_queue *bfqq)
{
struct bfq_queue *__bfqq, *next;
@@ -5039,6 +5156,24 @@ static void bfq_exit_icq(struct io_cq *icq)
{
struct bfq_io_cq *bic = icq_to_bic(icq);
+ if (bic->stable_merge_bfqq) {
+ struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd;
+
+ /*
+ * bfqd is NULL if scheduler already exited, and in
+ * that case this is the last time bfqq is accessed.
+ */
+ if (bfqd) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&bfqd->lock, flags);
+ bfq_put_stable_ref(bic->stable_merge_bfqq);
+ spin_unlock_irqrestore(&bfqd->lock, flags);
+ } else {
+ bfq_put_stable_ref(bic->stable_merge_bfqq);
+ }
+ }
+
bfq_exit_icq_bfqq(bic, true);
bfq_exit_icq_bfqq(bic, false);
}
@@ -5099,7 +5234,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
struct bio *bio, bool is_sync,
- struct bfq_io_cq *bic);
+ struct bfq_io_cq *bic,
+ bool respawn);
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
{
@@ -5119,7 +5255,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
bfqq = bic_to_bfqq(bic, false);
if (bfqq) {
bfq_release_process_ref(bfqd, bfqq);
- bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true);
bic_set_bfqq(bic, bfqq, false);
}
@@ -5162,6 +5298,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
/* set end request to minus infinity from now */
bfqq->ttime.last_end_request = now_ns + 1;
+ bfqq->creation_time = jiffies;
+
bfqq->io_start_time = now_ns;
bfq_mark_bfqq_IO_bound(bfqq);
@@ -5211,9 +5349,156 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
}
}
+static struct bfq_queue *
+bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_io_cq *bic,
+ struct bfq_queue *last_bfqq_created)
+{
+ struct bfq_queue *new_bfqq =
+ bfq_setup_merge(bfqq, last_bfqq_created);
+
+ if (!new_bfqq)
+ return bfqq;
+
+ if (new_bfqq->bic)
+ new_bfqq->bic->stably_merged = true;
+ bic->stably_merged = true;
+
+ /*
+ * Reusing merge functions. This implies that
+ * bfqq->bic must be set too, for
+ * bfq_merge_bfqqs to correctly save bfqq's
+ * state before killing it.
+ */
+ bfqq->bic = bic;
+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
+
+ return new_bfqq;
+}
+
+/*
+ * Many throughput-sensitive workloads are made of several parallel
+ * I/O flows, with all flows generated by the same application, or
+ * more generically by the same task (e.g., system boot). The most
+ * counterproductive action with these workloads is plugging I/O
+ * dispatch when one of the bfq_queues associated with these flows
+ * remains temporarily empty.
+ *
+ * To avoid this plugging, BFQ has been using a burst-handling
+ * mechanism for years now. This mechanism has proven effective for
+ * throughput, and not detrimental for service guarantees. The
+ * following function pushes this mechanism a little bit further,
+ * basing on the following two facts.
+ *
+ * First, all the I/O flows of a the same application or task
+ * contribute to the execution/completion of that common application
+ * or task. So the performance figures that matter are total
+ * throughput of the flows and task-wide I/O latency. In particular,
+ * these flows do not need to be protected from each other, in terms
+ * of individual bandwidth or latency.
+ *
+ * Second, the above fact holds regardless of the number of flows.
+ *
+ * Putting these two facts together, this commits merges stably the
+ * bfq_queues associated with these I/O flows, i.e., with the
+ * processes that generate these IO/ flows, regardless of how many the
+ * involved processes are.
+ *
+ * To decide whether a set of bfq_queues is actually associated with
+ * the I/O flows of a common application or task, and to merge these
+ * queues stably, this function operates as follows: given a bfq_queue,
+ * say Q2, currently being created, and the last bfq_queue, say Q1,
+ * created before Q2, Q2 is merged stably with Q1 if
+ * - very little time has elapsed since when Q1 was created
+ * - Q2 has the same ioprio as Q1
+ * - Q2 belongs to the same group as Q1
+ *
+ * Merging bfq_queues also reduces scheduling overhead. A fio test
+ * with ten random readers on /dev/nullb shows a throughput boost of
+ * 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of
+ * the total per-request processing time, the above throughput boost
+ * implies that BFQ's overhead is reduced by more than 50%.
+ *
+ * This new mechanism most certainly obsoletes the current
+ * burst-handling heuristics. We keep those heuristics for the moment.
+ */
+static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ struct bfq_io_cq *bic)
+{
+ struct bfq_queue **source_bfqq = bfqq->entity.parent ?
+ &bfqq->entity.parent->last_bfqq_created :
+ &bfqd->last_bfqq_created;
+
+ struct bfq_queue *last_bfqq_created = *source_bfqq;
+
+ /*
+ * If last_bfqq_created has not been set yet, then init it. If
+ * it has been set already, but too long ago, then move it
+ * forward to bfqq. Finally, move also if bfqq belongs to a
+ * different group than last_bfqq_created, or if bfqq has a
+ * different ioprio or ioprio_class. If none of these
+ * conditions holds true, then try an early stable merge or
+ * schedule a delayed stable merge.
+ *
+ * A delayed merge is scheduled (instead of performing an
+ * early merge), in case bfqq might soon prove to be more
+ * throughput-beneficial if not merged. Currently this is
+ * possible only if bfqd is rotational with no queueing. For
+ * such a drive, not merging bfqq is better for throughput if
+ * bfqq happens to contain sequential I/O. So, we wait a
+ * little bit for enough I/O to flow through bfqq. After that,
+ * if such an I/O is sequential, then the merge is
+ * canceled. Otherwise the merge is finally performed.
+ */
+ if (!last_bfqq_created ||
+ time_before(last_bfqq_created->creation_time +
+ bfqd->bfq_burst_interval,
+ bfqq->creation_time) ||
+ bfqq->entity.parent != last_bfqq_created->entity.parent ||
+ bfqq->ioprio != last_bfqq_created->ioprio ||
+ bfqq->ioprio_class != last_bfqq_created->ioprio_class)
+ *source_bfqq = bfqq;
+ else if (time_after_eq(last_bfqq_created->creation_time +
+ bfqd->bfq_burst_interval,
+ bfqq->creation_time)) {
+ if (likely(bfqd->nonrot_with_queueing))
+ /*
+ * With this type of drive, leaving
+ * bfqq alone may provide no
+ * throughput benefits compared with
+ * merging bfqq. So merge bfqq now.
+ */
+ bfqq = bfq_do_early_stable_merge(bfqd, bfqq,
+ bic,
+ last_bfqq_created);
+ else { /* schedule tentative stable merge */
+ /*
+ * get reference on last_bfqq_created,
+ * to prevent it from being freed,
+ * until we decide whether to merge
+ */
+ last_bfqq_created->ref++;
+ /*
+ * need to keep track of stable refs, to
+ * compute process refs correctly
+ */
+ last_bfqq_created->stable_ref++;
+ /*
+ * Record the bfqq to merge to.
+ */
+ bic->stable_merge_bfqq = last_bfqq_created;
+ }
+ }
+
+ return bfqq;
+}
+
+
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
struct bio *bio, bool is_sync,
- struct bfq_io_cq *bic)
+ struct bfq_io_cq *bic,
+ bool respawn)
{
const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
@@ -5271,7 +5556,10 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
out:
bfqq->ref++; /* get a process reference to this queue */
- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
+
+ if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn)
+ bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic);
+
rcu_read_unlock();
return bfqq;
}
@@ -5521,7 +5809,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq),
- *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+ *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true,
+ RQ_BIC(rq));
bool waiting, idle_timer_disabled = false;
if (new_bfqq) {
@@ -5627,7 +5916,48 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
spin_lock_irq(&bfqd->lock);
bfqq = bfq_init_rq(rq);
- if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
+
+ /*
+ * Reqs with at_head or passthrough flags set are to be put
+ * directly into dispatch list. Additional case for putting rq
+ * directly into the dispatch queue: the only active
+ * bfq_queues are bfqq and either its waker bfq_queue or one
+ * of its woken bfq_queues. The rationale behind this
+ * additional condition is as follows:
+ * - consider a bfq_queue, say Q1, detected as a waker of
+ * another bfq_queue, say Q2
+ * - by definition of a waker, Q1 blocks the I/O of Q2, i.e.,
+ * some I/O of Q1 needs to be completed for new I/O of Q2
+ * to arrive. A notable example of waker is journald
+ * - so, Q1 and Q2 are in any respect the queues of two
+ * cooperating processes (or of two cooperating sets of
+ * processes): the goal of Q1's I/O is doing what needs to
+ * be done so that new Q2's I/O can finally be
+ * issued. Therefore, if the service of Q1's I/O is delayed,
+ * then Q2's I/O is delayed too. Conversely, if Q2's I/O is
+ * delayed, the goal of Q1's I/O is hindered.
+ * - as a consequence, if some I/O of Q1/Q2 arrives while
+ * Q2/Q1 is the only queue in service, there is absolutely
+ * no point in delaying the service of such an I/O. The
+ * only possible result is a throughput loss
+ * - so, when the above condition holds, the best option is to
+ * have the new I/O dispatched as soon as possible
+ * - the most effective and efficient way to attain the above
+ * goal is to put the new I/O directly in the dispatch
+ * list
+ * - as an additional restriction, Q1 and Q2 must be the only
+ * busy queues for this commit to put the I/O of Q2/Q1 in
+ * the dispatch list. This is necessary, because, if also
+ * other queues are waiting for service, then putting new
+ * I/O directly in the dispatch list may evidently cause a
+ * violation of service guarantees for the other queues
+ */
+ if (!bfqq ||
+ (bfqq != bfqd->in_service_queue &&
+ bfqd->in_service_queue != NULL &&
+ bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) &&
+ (bfqq->waker_bfqq == bfqd->in_service_queue ||
+ bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) {
if (at_head)
list_add(&rq->queuelist, &bfqd->dispatch);
else
@@ -5767,7 +6097,17 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
1UL<<(BFQ_RATE_SHIFT - 10))
bfq_update_rate_reset(bfqd, NULL);
bfqd->last_completion = now_ns;
- bfqd->last_completed_rq_bfqq = bfqq;
+ /*
+ * Shared queues are likely to receive I/O at a high
+ * rate. This may deceptively let them be considered as wakers
+ * of other queues. But a false waker will unjustly steal
+ * bandwidth to its supposedly woken queue. So considering
+ * also shared queues in the waking mechanism may cause more
+ * control troubles than throughput benefits. Then do not set
+ * last_completed_rq_bfqq to bfqq if bfqq is a shared queue.
+ */
+ if (!bfq_bfqq_coop(bfqq))
+ bfqd->last_completed_rq_bfqq = bfqq;
/*
* If we are waiting to discover whether the request pattern
@@ -6124,7 +6464,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
if (bfqq)
bfq_put_queue(bfqq);
- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split);
bic_set_bfqq(bic, bfqq, is_sync);
if (split && is_sync) {
@@ -6245,8 +6585,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
if (likely(!new_queue)) {
/* If the queue was seeky for too long, break it apart. */
- if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
- bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) &&
+ !bic->stably_merged) {
+ struct bfq_queue *old_bfqq = bfqq;
/* Update bic before losing reference to bfqq */
if (bfq_bfqq_in_large_burst(bfqq))
@@ -6255,11 +6596,24 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
bfqq = bfq_split_bfqq(bic, bfqq);
split = true;
- if (!bfqq)
+ if (!bfqq) {
bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
true, is_sync,
NULL);
- else
+ bfqq->waker_bfqq = old_bfqq->waker_bfqq;
+ bfqq->tentative_waker_bfqq = NULL;
+
+ /*
+ * If the waker queue disappears, then
+ * new_bfqq->waker_bfqq must be
+ * reset. So insert new_bfqq into the
+ * woken_list of the waker. See
+ * bfq_check_waker for details.
+ */
+ if (bfqq->waker_bfqq)
+ hlist_add_head(&bfqq->woken_list_node,
+ &bfqq->waker_bfqq->woken_list);
+ } else
bfqq_already_existing = true;
}
}
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index b8e793c34ff1..99c2a3cb081e 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -197,6 +197,9 @@ struct bfq_entity {
/* flag, set if the entity is counted in groups_with_pending_reqs */
bool in_groups_with_pending_reqs;
+
+ /* last child queue of entity created (for non-leaf entities) */
+ struct bfq_queue *last_bfqq_created;
};
struct bfq_group;
@@ -230,6 +233,8 @@ struct bfq_ttime {
struct bfq_queue {
/* reference counter */
int ref;
+ /* counter of references from other queues for delayed stable merge */
+ int stable_ref;
/* parent bfq_data */
struct bfq_data *bfqd;
@@ -365,6 +370,8 @@ struct bfq_queue {
unsigned long first_IO_time; /* time of first I/O for this queue */
+ unsigned long creation_time; /* when this queue is created */
+
/* max service rate measured so far */
u32 max_service_rate;
@@ -454,6 +461,11 @@ struct bfq_io_cq {
u64 saved_last_serv_time_ns;
unsigned int saved_inject_limit;
unsigned long saved_decrease_time_jif;
+
+ /* candidate queue for a stable merge (due to close creation time) */
+ struct bfq_queue *stable_merge_bfqq;
+
+ bool stably_merged; /* non splittable if true */
};
/**
@@ -578,6 +590,9 @@ struct bfq_data {
/* bfqq owning the last completed rq */
struct bfq_queue *last_completed_rq_bfqq;
+ /* last bfqq created, among those in the root group */
+ struct bfq_queue *last_bfqq_created;
+
/* time of last transition from empty to non-empty (ns) */
u64 last_empty_occupied_ns;
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 070e34a7feb1..7a462df71f68 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1706,4 +1706,12 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues++;
+
+ /* Move bfqq to the head of the woken list of its waker */
+ if (!hlist_unhashed(&bfqq->woken_list_node) &&
+ &bfqq->woken_list_node != bfqq->waker_bfqq->woken_list.first) {
+ hlist_del_init(&bfqq->woken_list_node);
+ hlist_add_head(&bfqq->woken_list_node,
+ &bfqq->waker_bfqq->woken_list);
+ }
}
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index dfa652122a2d..4b4eb8964a6f 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -204,7 +204,6 @@ bool bio_integrity_prep(struct bio *bio)
{
struct bio_integrity_payload *bip;
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
- struct request_queue *q = bio->bi_bdev->bd_disk->queue;
void *buf;
unsigned long start, end;
unsigned int len, nr_pages;
@@ -238,7 +237,7 @@ bool bio_integrity_prep(struct bio *bio)
/* Allocate kernel buffer for protection data */
len = intervals * bi->tuple_size;
- buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
+ buf = kmalloc(len, GFP_NOIO);
status = BLK_STS_RESOURCE;
if (unlikely(buf == NULL)) {
printk(KERN_ERR "could not allocate integrity buffer\n");
diff --git a/block/bio.c b/block/bio.c
index 50e579088aca..44205dfb6b60 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -493,20 +493,20 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
}
EXPORT_SYMBOL(bio_kmalloc);
-void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
+void zero_fill_bio(struct bio *bio)
{
unsigned long flags;
struct bio_vec bv;
struct bvec_iter iter;
- __bio_for_each_segment(bv, bio, iter, start) {
+ bio_for_each_segment(bv, bio, iter) {
char *data = bvec_kmap_irq(&bv, &flags);
memset(data, 0, bv.bv_len);
flush_dcache_page(bv.bv_page);
bvec_kunmap_irq(data, &flags);
}
}
-EXPORT_SYMBOL(zero_fill_bio_iter);
+EXPORT_SYMBOL(zero_fill_bio);
/**
* bio_truncate - truncate the bio to small size of @new_size
@@ -1236,43 +1236,6 @@ void bio_copy_data(struct bio *dst, struct bio *src)
}
EXPORT_SYMBOL(bio_copy_data);
-/**
- * bio_list_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * Stops when it reaches the end of either the @src list or @dst list - that is,
- * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
- * bios).
- */
-void bio_list_copy_data(struct bio *dst, struct bio *src)
-{
- struct bvec_iter src_iter = src-&