summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
author Mike Snitzer <snitzer@redhat.com> 2020-09-29 16:27:21 -0400
committer Mike Snitzer <snitzer@redhat.com> 2020-09-29 16:31:35 -0400
commit 1471308fb5ec4335f9ae9fc65f65048dbe7c336e (patch)
tree 69aef90f47105e1c730e5277f352d3d5446a1174 /block
parent 4c07ae0ad493b7b2d3dd3e53870e594f136ce8a5 (diff)
parent 76cffccd606acffde1b91e8b029b39b5fd1a3117 (diff)
download linux-1471308fb5ec4335f9ae9fc65f65048dbe7c336e.tar.gz
linux-1471308fb5ec4335f9ae9fc65f65048dbe7c336e.tar.bz2
linux-1471308fb5ec4335f9ae9fc65f65048dbe7c336e.zip
Merge remote-tracking branch 'jens/for-5.10/block' into dm-5.10
DM depends on these block 5.10 commits:

22ada802ede8 block: use lcm_not_zero() when stacking chunk_sectors
07d098e6bbad block: allow 'chunk_sectors' to be non-power-of-2
021a24460dc2 block: add QUEUE_FLAG_NOWAIT
6abc49468eea dm: add support for REQ_NOWAIT and enable it for linear target

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Diffstat (limited to 'block')
-rw-r--r-- block/Kconfig 2
-rw-r--r-- block/bfq-iosched.c 9
-rw-r--r-- block/blk-cgroup.c 32
-rw-r--r-- block/blk-core.c 243
-rw-r--r-- block/blk-integrity.c 4
-rw-r--r-- block/blk-iocost.c 1619
-rw-r--r-- block/blk-lib.c 2
-rw-r--r-- block/blk-map.c 177
-rw-r--r-- block/blk-merge.c 201
-rw-r--r-- block/blk-mq-debugfs.c 11
-rw-r--r-- block/blk-mq-sched.c 124
-rw-r--r-- block/blk-mq-sched.h 3
-rw-r--r-- block/blk-mq-tag.c 156
-rw-r--r-- block/blk-mq-tag.h 56
-rw-r--r-- block/blk-mq.c 90
-rw-r--r-- block/blk-mq.h 76
-rw-r--r-- block/blk-settings.c 40
-rw-r--r-- block/blk-sysfs.c 277
-rw-r--r-- block/blk-throttle.c 59
-rw-r--r-- block/blk.h 25
-rw-r--r-- block/bsg-lib.c 2
-rw-r--r-- block/genhd.c 158
-rw-r--r-- block/ioctl.c 33
-rw-r--r-- block/ioprio.c 2
-rw-r--r-- block/kyber-iosched.c 6
-rw-r--r-- block/mq-deadline.c 6
-rw-r--r-- block/partitions/core.c 27
-rw-r--r-- block/scsi_ioctl.c 4
28 files changed, 2194 insertions, 1250 deletions
diff --git a/block/Kconfig b/block/Kconfig
index bbad5e8bbffe..a2297edfdde8 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -161,8 +161,6 @@ config BLK_WBT_MQ
depends on BLK_WBT
help
Enable writeback throttling by default on multiqueue devices.
- Multiqueue currently doesn't have support for IO scheduling,
- enabling this option is recommended.
config BLK_DEBUG_FS
bool "Block layer debugging information in debugfs"
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index fa98470df3f0..9e81d1052091 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4640,6 +4640,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+ if (!atomic_read(&hctx->elevator_queued))
+ return false;
+
/*
* Avoiding lock: a race on bfqd->busy_queues should cause at
* most a call to dispatch for nothing
@@ -5554,6 +5557,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
bfq_insert_request(hctx, rq, at_head);
+ atomic_inc(&hctx->elevator_queued);
}
}
@@ -5921,6 +5925,7 @@ static void bfq_finish_requeue_request(struct request *rq)
bfq_completed_request(bfqq, bfqd);
bfq_finish_requeue_request_body(bfqq);
+ atomic_dec(&rq->mq_hctx->elevator_queued);
spin_unlock_irqrestore(&bfqd->lock, flags);
} else {
@@ -6360,8 +6365,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int min_shallow;
- min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+ min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
+ sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
}
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c195365c9817..f9b55614d67d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -119,6 +119,8 @@ static void blkg_async_bio_workfn(struct work_struct *work)
async_bio_work);
struct bio_list bios = BIO_EMPTY_LIST;
struct bio *bio;
+ struct blk_plug plug;
+ bool need_plug = false;
/* as long as there are pending bios, @blkg can't go away */
spin_lock_bh(&blkg->async_bio_lock);
@@ -126,8 +128,15 @@ static void blkg_async_bio_workfn(struct work_struct *work)
bio_list_init(&blkg->async_bios);
spin_unlock_bh(&blkg->async_bio_lock);
+ /* start plug only when bio_list contains at least 2 bios */
+ if (bios.head && bios.head->bi_next) {
+ need_plug = true;
+ blk_start_plug(&plug);
+ }
while ((bio = bio_list_pop(&bios)))
submit_bio(bio);
+ if (need_plug)
+ blk_finish_plug(&plug);
}
/**
@@ -1613,16 +1622,24 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
unsigned long pflags;
+ bool clamp;
u64 now = ktime_to_ns(ktime_get());
u64 exp;
u64 delay_nsec = 0;
int tok;
while (blkg->parent) {
- if (atomic_read(&blkg->use_delay)) {
+ int use_delay = atomic_read(&blkg->use_delay);
+
+ if (use_delay) {
+ u64 this_delay;
+
blkcg_scale_delay(blkg, now);
- delay_nsec = max_t(u64, delay_nsec,
- atomic64_read(&blkg->delay_nsec));
+ this_delay = atomic64_read(&blkg->delay_nsec);
+ if (this_delay > delay_nsec) {
+ delay_nsec = this_delay;
+ clamp = use_delay > 0;
+ }
}
blkg = blkg->parent;
}
@@ -1634,10 +1651,13 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
* Let's not sleep for all eternity if we've amassed a huge delay.
* Swapping or metadata IO can accumulate 10's of seconds worth of
* delay, and we want userspace to be able to do _something_ so cap the
- * delays at 1 second. If there's 10's of seconds worth of delay then
- * the tasks will be delayed for 1 second for every syscall.
+ * delays at 0.25s. If there's 10's of seconds worth of delay then the
+ * tasks will be delayed for 0.25 second for every syscall. If
+ * blkcg_set_delay() was used as indicated by negative use_delay, the
+ * caller is responsible for regulating the range.
*/
- delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+ if (clamp)
+ delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
if (use_memdelay)
psi_memstall_enter(&pflags);
diff --git a/block/blk-core.c b/block/blk-core.c
index 10c08ac50697..4884f1e7451b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -116,8 +116,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->__sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
- rq->tag = -1;
- rq->internal_tag = -1;
+ rq->tag = BLK_MQ_NO_TAG;
+ rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
refcount_set(&rq->ref, 1);
@@ -538,11 +538,10 @@ struct request_queue *blk_alloc_queue(int node_id)
if (!q->stats)
goto fail_stats;
- q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
- q->backing_dev_info->io_pages = VM_READAHEAD_PAGES;
- q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->node = node_id;
+ atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
+
timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
laptop_mode_timer_fn, 0);
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
@@ -643,162 +642,6 @@ void blk_put_request(struct request *req)
}
EXPORT_SYMBOL(blk_put_request);
-static void blk_account_io_merge_bio(struct request *req)
-{
- if (!blk_do_io_stat(req))
- return;
-
- part_stat_lock();
- part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
- part_stat_unlock();
-}
-
-bool bio_attempt_back_merge(struct request *req, struct bio *bio,
- unsigned int nr_segs)
-{
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
-
- if (!ll_back_merge_fn(req, bio, nr_segs))
- return false;
-
- trace_block_bio_backmerge(req->q, req, bio);
- rq_qos_merge(req->q, req, bio);
-
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
- blk_rq_set_mixed_merge(req);
-
- req->biotail->bi_next = bio;
- req->biotail = bio;
- req->__data_len += bio->bi_iter.bi_size;
-
- bio_crypt_free_ctx(bio);
-
- blk_account_io_merge_bio(req);
- return true;
-}
-
-bool bio_attempt_front_merge(struct request *req, struct bio *bio,
- unsigned int nr_segs)
-{
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
-
- if (!ll_front_merge_fn(req, bio, nr_segs))
- return false;
-
- trace_block_bio_frontmerge(req->q, req, bio);
- rq_qos_merge(req->q, req, bio);
-
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
- blk_rq_set_mixed_merge(req);
-
- bio->bi_next = req->bio;
- req->bio = bio;
-
- req->__sector = bio->bi_iter.bi_sector;
- req->__data_len += bio->bi_iter.bi_size;
-
- bio_crypt_do_front_merge(req, bio);
-
- blk_account_io_merge_bio(req);
- return true;
-}
-
-bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
- struct bio *bio)
-{
- unsigned short segments = blk_rq_nr_discard_segments(req);
-
- if (segments >= queue_max_discard_segments(q))
- goto no_merge;
- if (blk_rq_sectors(req) + bio_sectors(bio) >
- blk_rq_get_max_sectors(req, blk_rq_pos(req)))
- goto no_merge;
-
- rq_qos_merge(q, req, bio);
-
- req->biotail->bi_next = bio;
- req->biotail = bio;
- req->__data_len += bio->bi_iter.bi_size;
- req->nr_phys_segments = segments + 1;
-
- blk_account_io_merge_bio(req);
- return true;
-no_merge:
- req_set_nomerge(q, req);
- return false;
-}
-
-/**
- * blk_attempt_plug_merge - try to merge with %current's plugged list
- * @q: request_queue new bio is being queued at
- * @bio: new bio being queued
- * @nr_segs: number of segments in @bio
- * @same_queue_rq: pointer to &struct request that gets filled in when
- * another request associated with @q is found on the plug list
- * (optional, may be %NULL)
- *
- * Determine whether @bio being queued on @q can be merged with a request
- * on %current's plugged list. Returns %true if merge was successful,
- * otherwise %false.
- *
- * Plugging coalesces IOs from the same issuer for the same purpose without
- * going through @q->queue_lock. As such it's more of an issuing mechanism
- * than scheduling, and the request, while may have elvpriv data, is not
- * added on the elevator at this point. In addition, we don't have
- * reliable access to the elevator outside queue lock. Only check basic
- * merging parameters without querying the elevator.
- *
- * Caller must ensure !blk_queue_nomerges(q) beforehand.
- */
-bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs, struct request **same_queue_rq)
-{
- struct blk_plug *plug;
- struct request *rq;
- struct list_head *plug_list;
-
- plug = blk_mq_plug(q, bio);
- if (!plug)
- return false;
-
- plug_list = &plug->mq_list;
-
- list_for_each_entry_reverse(rq, plug_list, queuelist) {
- bool merged = false;
-
- if (rq->q == q && same_queue_rq) {
- /*
- * Only blk-mq multiple hardware queues case checks the
- * rq in the same queue, there should be only one such
- * rq in a queue
- **/
- *same_queue_rq = rq;
- }
-
- if (rq->q != q || !blk_rq_merge_ok(rq, bio))
- continue;
-
- switch (blk_try_merge(rq, bio)) {
- case ELEVATOR_BACK_MERGE:
- merged = bio_attempt_back_merge(rq, bio, nr_segs);
- break;
- case ELEVATOR_FRONT_MERGE:
- merged = bio_attempt_front_merge(rq, bio, nr_segs);
- break;
- case ELEVATOR_DISCARD_MERGE:
- merged = bio_attempt_discard_merge(q, rq, bio);
- break;
- default:
- break;
- }
-
- if (merged)
- return true;
- }
-
- return false;
-}
-
static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{
char b[BDEVNAME_SIZE];
@@ -971,9 +814,9 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
/*
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
- * if queue is not a request based queue.
+ * if queue does not support NOWAIT.
*/
- if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q))
+ if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
goto not_supported;
if (should_fail_bio(bio))
@@ -1301,14 +1144,28 @@ EXPORT_SYMBOL(submit_bio);
* limits when retrying requests on other queues. Those requests need
* to be checked against the new queue limits again during dispatch.
*/
-static int blk_cloned_rq_check_limits(struct request_queue *q,
+static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
struct request *rq)
{
- if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) {
+ unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+
+ if (blk_rq_sectors(rq) > max_sectors) {
+ /*
+ * SCSI device does not have a good way to return if
+ * Write Same/Zero is actually supported. If a device rejects
+ * a non-read/write command (discard, write same, etc.) the
+ * low-level device driver will set the relevant queue limit to
+ * 0 to prevent blk-lib from issuing more of the offending
+ * operations. Commands queued prior to the queue limit being
+ * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
+ * errors being propagated to upper layers.
+ */
+ if (max_sectors == 0)
+ return BLK_STS_NOTSUPP;
+
printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
- __func__, blk_rq_sectors(rq),
- blk_queue_get_max_sectors(q, req_op(rq)));
- return -EIO;
+ __func__, blk_rq_sectors(rq), max_sectors);
+ return BLK_STS_IOERR;
}
/*
@@ -1321,10 +1178,10 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
if (rq->nr_phys_segments > queue_max_segments(q)) {
printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
__func__, rq->nr_phys_segments, queue_max_segments(q));
- return -EIO;
+ return BLK_STS_IOERR;
}
- return 0;
+ return BLK_STS_OK;
}
/**
@@ -1334,8 +1191,11 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
*/
blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
{
- if (blk_cloned_rq_check_limits(q, rq))
- return BLK_STS_IOERR;
+ blk_status_t ret;
+
+ ret = blk_cloned_rq_check_limits(q, rq);
+ if (ret != BLK_STS_OK)
+ return ret;
if (rq->rq_disk &&
should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
@@ -1461,10 +1321,9 @@ void blk_account_io_start(struct request *rq)
part_stat_unlock();
}
-unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
- unsigned int op)
+static unsigned long __part_start_io_acct(struct hd_struct *part,
+ unsigned int sectors, unsigned int op)
{
- struct hd_struct *part = &disk->part0;
const int sgrp = op_stat_group(op);
unsigned long now = READ_ONCE(jiffies);
@@ -1477,12 +1336,26 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
return now;
}
+
+unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
+ struct bio *bio)
+{
+ *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
+
+ return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio));
+}
+EXPORT_SYMBOL_GPL(part_start_io_acct);
+
+unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
+ unsigned int op)
+{
+ return __part_start_io_acct(&disk->part0, sectors, op);
+}
EXPORT_SYMBOL(disk_start_io_acct);
-void disk_end_io_acct(struct gendisk *disk, unsigned int op,
- unsigned long start_time)
+static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
+ unsigned long start_time)
{
- struct hd_struct *part = &disk->part0;
const int sgrp = op_stat_group(op);
unsigned long now = READ_ONCE(jiffies);
unsigned long duration = now - start_time;
@@ -1493,6 +1366,20 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op,
part_stat_local_dec(part, in_flight[op_is_write(op)]);
part_stat_unlock();
}
+
+void part_end_io_acct(struct hd_struct *part, struct bio *bio,
+ unsigned long start_time)
+{
+ __part_end_io_acct(part, bio_op(bio), start_time);
+ hd_struct_put(part);
+}
+EXPORT_SYMBOL_GPL(part_end_io_acct);
+
+void disk_end_io_acct(struct gendisk *disk, unsigned int op,
+ unsigned long start_time)
+{
+ __part_end_io_acct(&disk->part0, op, start_time);
+}
EXPORT_SYMBOL(disk_end_io_acct);
/*
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index c03705cbb9c9..2b36a8f9b813 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -408,7 +408,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size;
- disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
if (disk->queue->ksm) {
@@ -428,7 +428,7 @@ EXPORT_SYMBOL(blk_integrity_register);
*/
void blk_integrity_unregister(struct gendisk *disk)
{
- disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue);
memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity));
}
EXPORT_SYMBOL(blk_integrity_unregister);
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index d37b55db2409..b82649c1440b 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -68,7 +68,7 @@
* gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
* 12.5% each. The distribution mechanism only cares about these flattened
* shares. They're called hweights (hierarchical weights) and always add
- * upto 1 (HWEIGHT_WHOLE).
+ * upto 1 (WEIGHT_ONE).
*
* A given cgroup's vtime runs slower in inverse proportion to its hweight.
* For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
@@ -179,6 +179,8 @@
#include <linux/parser.h>
#include <linux/sched/signal.h>
#include <linux/blk-cgroup.h>
+#include <asm/local.h>
+#include <asm/local64.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-wbt.h"
@@ -215,36 +217,21 @@ enum {
MAX_PERIOD = USEC_PER_SEC,
/*
- * A cgroup's vtime can run 50% behind the device vtime, which
+ * iocg->vtime is targeted at 50% behind the device vtime, which
* serves as its IO credit buffer. Surplus weight adjustment is
* immediately canceled if the vtime margin runs below 10%.
*/
- MARGIN_PCT = 50,
- INUSE_MARGIN_PCT = 10,
+ MARGIN_MIN_PCT = 10,
+ MARGIN_LOW_PCT = 20,
+ MARGIN_TARGET_PCT = 50,
- /* Have some play in waitq timer operations */
- WAITQ_TIMER_MARGIN_PCT = 5,
+ INUSE_ADJ_STEP_PCT = 25,
- /*
- * vtime can wrap well within a reasonable uptime when vrate is
- * consistently raised. Don't trust recorded cgroup vtime if the
- * period counter indicates that it's older than 5mins.
- */
- VTIME_VALID_DUR = 300 * USEC_PER_SEC,
-
- /*
- * Remember the past three non-zero usages and use the max for
- * surplus calculation. Three slots guarantee that we remember one
- * full period usage from the last active stretch even after
- * partial deactivation and re-activation periods. Don't start
- * giving away weight before collecting two data points to prevent
- * hweight adjustments based on one partial activation period.
- */
- NR_USAGE_SLOTS = 3,
- MIN_VALID_USAGES = 2,
+ /* Have some play in timer operations */
+ TIMER_SLACK_PCT = 1,
/* 1/64k is granular enough and can easily be handled w/ u32 */
- HWEIGHT_WHOLE = 1 << 16,
+ WEIGHT_ONE = 1 << 16,
/*
* As vtime is used to calculate the cost of each IO, it needs to
@@ -275,16 +262,37 @@ enum {
/* unbusy hysterisis */
UNBUSY_THR_PCT = 75,
- /* don't let cmds which take a very long time pin lagging for too long */
- MAX_LAGGING_PERIODS = 10,
-
/*
- * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
- * donate the surplus.
+ * The effect of delay is indirect and non-linear and a huge amount of
+ * future debt can accumulate abruptly while unthrottled. Linearly scale
+ * up delay as debt is going up and then let it decay exponentially.
+ * This gives us quick ramp ups while delay is accumulating and long
+ * tails which can help reducing the frequency of debt explosions on
+ * unthrottle. The parameters are experimentally determined.
+ *
+ * The delay mechanism provides adequate protection and behavior in many
+ * cases. However, this is far from ideal and falls short on both
+ * fronts. The debtors are often throttled too harshly costing a
+ * significant level of fairness and possibly total work while the
+ * protection against their impacts on the system can be choppy and
+ * unreliable.
+ *
+ * The shortcoming primarily stems from the fact that, unlike for page
+ * cache, the kernel doesn't have well-defined back-pressure propagation
+ * mechanism and policies for anonymous memory. Fully addressing this
+ * issue will likely require substantial improvements in the area.
*/
- SURPLUS_SCALE_PCT = 125, /* * 125% */
- SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
- SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
+ MIN_DELAY_THR_PCT = 500,
+ MAX_DELAY_THR_PCT = 25000,
+ MIN_DELAY = 250,
+ MAX_DELAY = 250 * USEC_PER_MSEC,
+
+ /* halve debts if avg usage over 100ms is under 50% */
+ DFGV_USAGE_PCT = 50,
+ DFGV_PERIOD = 100 * USEC_PER_MSEC,
+
+ /* don't let cmds which take a very long time pin lagging for too long */
+ MAX_LAGGING_PERIODS = 10,
/* switch iff the conditions are met for longer than this */
AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
@@ -372,9 +380,15 @@ struct ioc_params {
u32 too_slow_vrate_pct;
};
+struct ioc_margins {
+ s64 min;
+ s64 low;
+ s64 target;
+};
+
struct ioc_missed {
- u32 nr_met;
- u32 nr_missed;
+ local_t nr_met;
+ local_t nr_missed;
u32 last_met;
u32 last_missed;
};
@@ -382,7 +396,7 @@ struct ioc_missed {
struct ioc_pcpu_stat {
struct ioc_missed missed[2];
- u64 rq_wait_ns;
+ local64_t rq_wait_ns;
u64 last_rq_wait_ns;
};
@@ -393,8 +407,9 @@ struct ioc {
bool enabled;
struct ioc_params params;
+ struct ioc_margins margins;
u32 period_us;
- u32 margin_us;
+ u32 timer_slack_ns;
u64 vrate_min;
u64 vrate_max;
@@ -405,18 +420,24 @@ struct ioc {
enum ioc_running running;
atomic64_t vtime_rate;
+ u64 vtime_base_rate;
+ s64 vtime_err;
seqcount_spinlock_t period_seqcount;
- u32 period_at; /* wallclock starttime */
+ u64 period_at; /* wallclock starttime */
u64 period_at_vtime; /* vtime starttime */
atomic64_t cur_period; /* inc'd each period */
int busy_level; /* saturation history */
- u64 inuse_margin_vtime;
bool weights_updated;
atomic_t hweight_gen; /* for lazy hweights */
+ /* debt forgiveness */
+ u64 dfgv_period_at;
+ u64 dfgv_period_rem;
+ u64 dfgv_usage_us_sum;
+
u64 autop_too_fast_at;
u64 autop_too_slow_at;
int autop_idx;
@@ -424,6 +445,17 @@ struct ioc {
bool user_cost_model:1;
};
+struct iocg_pcpu_stat {
+ local64_t abs_vusage;
+};
+
+struct iocg_stat {
+ u64 usage_us;
+ u64 wait_us;
+ u64 indebt_us;
+ u64 indelay_us;
+};
+
/* per device-cgroup pair */
struct ioc_gq {
struct blkg_policy_data pd;
@@ -443,12 +475,17 @@ struct ioc_gq {
*
* `last_inuse` remembers `inuse` while an iocg is idle to persist
* surplus adjustments.
+ *
+ * `inuse` may be adjusted dynamically during period. `saved_*` are used
+ * to determine and track adjustments.
*/
u32 cfg_weight;
u32 weight;
u32 active;
u32 inuse;
+
u32 last_inuse;
+ s64 saved_margin;
sector_t cursor; /* to detect randio */
@@ -461,14 +498,14 @@ struct ioc_gq {
* `vtime_done` is the same but progressed on completion rather
* than issue. The delta behind `vtime` represents the cost of
* currently in-flight IOs.
- *
- * `last_vtime` is used to remember `vtime` at the end of the last
- * period to calculate utilization.
*/
atomic64_t vtime;
atomic64_t done_vtime;
u64 abs_vdebt;
- u64 last_vtime;
+
+ /* current delay in effect and when it started */
+ u64 delay;
+ u64 delay_at;
/*
* The period this iocg was last active in. Used for deactivation
@@ -477,21 +514,35 @@ struct ioc_gq {
atomic64_t active_period;
struct list_head active_list;
- /* see __propagate_active_weight() and current_hweight() for details */
+ /* see __propagate_weights() and current_hweight() for details */
u64 child_active_sum;
u64 child_inuse_sum;
+ u64 child_adjusted_sum;
int hweight_gen;
u32 hweight_active;
u32 hweight_inuse;
- bool has_surplus;
+ u32 hweight_donating;
+ u32 hweight_after_donation;
+
+ struct list_head walk_list;
+ struct list_head surplus_list;
struct wait_queue_head waitq;
struct hrtimer waitq_timer;
- struct hrtimer delay_timer;
- /* usage is recorded as fractions of HWEIGHT_WHOLE */
- int usage_idx;
- u32 usages[NR_USAGE_SLOTS];
+ /* timestamp at the latest activation */
+ u64 activated_at;
+
+ /* statistics */
+ struct iocg_pcpu_stat __percpu *pcpu_stat;
+ struct iocg_stat local_stat;
+ struct iocg_stat desc_stat;
+ struct iocg_stat last_stat;
+ u64 last_stat_abs_vusage;
+ u64 usage_delta_us;
+ u64 wait_since;
+ u64 indebt_since;
+ u64 indelay_since;
/* this iocg's depth in the hierarchy and ancestors including self */
int level;
@@ -506,7 +557,7 @@ struct ioc_cgrp {
struct ioc_now {
u64 now_ns;
- u32 now;
+ u64 now;
u64 vnow;
u64 vrate;
};
@@ -656,7 +707,7 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
*/
static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
{
- return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
+ return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
}
/*
@@ -664,18 +715,56 @@ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
*/
static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
{
- return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
+ return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
}
-static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
+static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
+ u64 abs_cost, u64 cost)
{
+ struct iocg_pcpu_stat *gcs;
+
bio->bi_iocost_cost = cost;
atomic64_add(cost, &iocg->vtime);
+
+ gcs = get_cpu_ptr(iocg->pcpu_stat);
+ local64_add(abs_cost, &gcs->abs_vusage);
+ put_cpu_ptr(gcs);
+}
+
+static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
+{
+ if (lock_ioc) {
+ spin_lock_irqsave(&iocg->ioc->lock, *flags);
+ spin_lock(&iocg->waitq.lock);
+ } else {
+ spin_lock_irqsave(&iocg->waitq.lock, *flags);
+ }
+}
+
+static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
+{
+ if (unlock_ioc) {
+ spin_unlock(&iocg->waitq.lock);
+ spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
+ } else {
+ spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
+ }
}
#define CREATE_TRACE_POINTS
#include <trace/events/iocost.h>
+static void ioc_refresh_margins(struct ioc *ioc)
+{
+ struct ioc_margins *margins = &ioc->margins;
+ u32 period_us = ioc->period_us;
+ u64 vrate = ioc->vtime_base_rate;
+
+ margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
+ margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
+ margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
+}
+
/* latency Qos params changed, update period_us and all the dependent params */
static void ioc_refresh_period_us(struct ioc *ioc)
{
@@ -709,9 +798,10 @@ static void ioc_refresh_period_us(struct ioc *ioc)
/* calculate dependent params */
ioc->period_us = period_us;
- ioc->margin_us = period_us * MARGIN_PCT / 100;
- ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
- period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
+ ioc->timer_slack_ns = div64_u64(
+ (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
+ 100);
+ ioc_refresh_margins(ioc);
}
static int ioc_autop_idx(struct ioc *ioc)
@@ -738,8 +828,7 @@ static int ioc_autop_idx(