diff options
Diffstat (limited to 'block')
| -rw-r--r-- | block/Kconfig | 2 | ||||
| -rw-r--r-- | block/bfq-iosched.c | 9 | ||||
| -rw-r--r-- | block/blk-cgroup.c | 32 | ||||
| -rw-r--r-- | block/blk-core.c | 243 | ||||
| -rw-r--r-- | block/blk-integrity.c | 4 | ||||
| -rw-r--r-- | block/blk-iocost.c | 1619 | ||||
| -rw-r--r-- | block/blk-lib.c | 2 | ||||
| -rw-r--r-- | block/blk-map.c | 177 | ||||
| -rw-r--r-- | block/blk-merge.c | 201 | ||||
| -rw-r--r-- | block/blk-mq-debugfs.c | 11 | ||||
| -rw-r--r-- | block/blk-mq-sched.c | 124 | ||||
| -rw-r--r-- | block/blk-mq-sched.h | 3 | ||||
| -rw-r--r-- | block/blk-mq-tag.c | 156 | ||||
| -rw-r--r-- | block/blk-mq-tag.h | 56 | ||||
| -rw-r--r-- | block/blk-mq.c | 90 | ||||
| -rw-r--r-- | block/blk-mq.h | 76 | ||||
| -rw-r--r-- | block/blk-settings.c | 40 | ||||
| -rw-r--r-- | block/blk-sysfs.c | 277 | ||||
| -rw-r--r-- | block/blk-throttle.c | 59 | ||||
| -rw-r--r-- | block/blk.h | 25 | ||||
| -rw-r--r-- | block/bsg-lib.c | 2 | ||||
| -rw-r--r-- | block/genhd.c | 158 | ||||
| -rw-r--r-- | block/ioctl.c | 33 | ||||
| -rw-r--r-- | block/ioprio.c | 2 | ||||
| -rw-r--r-- | block/kyber-iosched.c | 6 | ||||
| -rw-r--r-- | block/mq-deadline.c | 6 | ||||
| -rw-r--r-- | block/partitions/core.c | 27 | ||||
| -rw-r--r-- | block/scsi_ioctl.c | 4 |
28 files changed, 2194 insertions, 1250 deletions
diff --git a/block/Kconfig b/block/Kconfig index bbad5e8bbffe..a2297edfdde8 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -161,8 +161,6 @@ config BLK_WBT_MQ depends on BLK_WBT help Enable writeback throttling by default on multiqueue devices. - Multiqueue currently doesn't have support for IO scheduling, - enabling this option is recommended. config BLK_DEBUG_FS bool "Block layer debugging information in debugfs" diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fa98470df3f0..9e81d1052091 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4640,6 +4640,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + if (!atomic_read(&hctx->elevator_queued)) + return false; + /* * Avoiding lock: a race on bfqd->busy_queues should cause at * most a call to dispatch for nothing @@ -5554,6 +5557,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); bfq_insert_request(hctx, rq, at_head); + atomic_inc(&hctx->elevator_queued); } } @@ -5921,6 +5925,7 @@ static void bfq_finish_requeue_request(struct request *rq) bfq_completed_request(bfqq, bfqd); bfq_finish_requeue_request_body(bfqq); + atomic_dec(&rq->mq_hctx->elevator_queued); spin_unlock_irqrestore(&bfqd->lock, flags); } else { @@ -6360,8 +6365,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) struct blk_mq_tags *tags = hctx->sched_tags; unsigned int min_shallow; - min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); + min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags); + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow); } static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c195365c9817..f9b55614d67d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -119,6 +119,8 @@ static void blkg_async_bio_workfn(struct work_struct *work) async_bio_work); struct bio_list bios = BIO_EMPTY_LIST; struct bio *bio; + struct blk_plug plug; + bool need_plug = false; /* as long as there are pending bios, @blkg can't go away */ spin_lock_bh(&blkg->async_bio_lock); @@ -126,8 +128,15 @@ static void blkg_async_bio_workfn(struct work_struct *work) bio_list_init(&blkg->async_bios); spin_unlock_bh(&blkg->async_bio_lock); + /* start plug only when bio_list contains at least 2 bios */ + if (bios.head && bios.head->bi_next) { + need_plug = true; + blk_start_plug(&plug); + } while ((bio = bio_list_pop(&bios))) submit_bio(bio); + if (need_plug) + blk_finish_plug(&plug); } /** @@ -1613,16 +1622,24 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; + bool clamp; u64 now = ktime_to_ns(ktime_get()); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { - if (atomic_read(&blkg->use_delay)) { + int use_delay = atomic_read(&blkg->use_delay); + + if (use_delay) { + u64 this_delay; + blkcg_scale_delay(blkg, now); - delay_nsec = max_t(u64, delay_nsec, - atomic64_read(&blkg->delay_nsec)); + this_delay = atomic64_read(&blkg->delay_nsec); + if (this_delay > delay_nsec) { + delay_nsec = this_delay; + clamp = use_delay > 0; + } } blkg = blkg->parent; } @@ -1634,10 +1651,13 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the - * delays at 1 second. If there's 10's of seconds worth of delay then - * the tasks will be delayed for 1 second for every syscall. + * delays at 0.25s. If there's 10's of seconds worth of delay then the + * tasks will be delayed for 0.25 second for every syscall. If + * blkcg_set_delay() was used as indicated by negative use_delay, the + * caller is responsible for regulating the range. */ - delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); + if (clamp) + delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); diff --git a/block/blk-core.c b/block/blk-core.c index 10c08ac50697..4884f1e7451b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -116,8 +116,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); - rq->tag = -1; - rq->internal_tag = -1; + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; refcount_set(&rq->ref, 1); @@ -538,11 +538,10 @@ struct request_queue *blk_alloc_queue(int node_id) if (!q->stats) goto fail_stats; - q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES; - q->backing_dev_info->io_pages = VM_READAHEAD_PAGES; - q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; q->node = node_id; + atomic_set(&q->nr_active_requests_shared_sbitmap, 0); + timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); @@ -643,162 +642,6 @@ void blk_put_request(struct request *req) } EXPORT_SYMBOL(blk_put_request); -static void blk_account_io_merge_bio(struct request *req) -{ - if (!blk_do_io_stat(req)) - return; - - part_stat_lock(); - part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); - part_stat_unlock(); -} - -bool bio_attempt_back_merge(struct request *req, struct bio *bio, - unsigned int nr_segs) -{ - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; - - if (!ll_back_merge_fn(req, bio, nr_segs)) - return false; - - trace_block_bio_backmerge(req->q, req, bio); - rq_qos_merge(req->q, req, bio); - - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) - blk_rq_set_mixed_merge(req); - - req->biotail->bi_next = bio; - req->biotail = bio; - req->__data_len += bio->bi_iter.bi_size; - - bio_crypt_free_ctx(bio); - - blk_account_io_merge_bio(req); - return true; -} - -bool bio_attempt_front_merge(struct request *req, struct bio *bio, - unsigned int nr_segs) -{ - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; - - if (!ll_front_merge_fn(req, bio, nr_segs)) - return false; - - trace_block_bio_frontmerge(req->q, req, bio); - rq_qos_merge(req->q, req, bio); - - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) - blk_rq_set_mixed_merge(req); - - bio->bi_next = req->bio; - req->bio = bio; - - req->__sector = bio->bi_iter.bi_sector; - req->__data_len += bio->bi_iter.bi_size; - - bio_crypt_do_front_merge(req, bio); - - blk_account_io_merge_bio(req); - return true; -} - -bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, - struct bio *bio) -{ - unsigned short segments = blk_rq_nr_discard_segments(req); - - if (segments >= queue_max_discard_segments(q)) - goto no_merge; - if (blk_rq_sectors(req) + bio_sectors(bio) > - blk_rq_get_max_sectors(req, blk_rq_pos(req))) - goto no_merge; - - rq_qos_merge(q, req, bio); - - req->biotail->bi_next = bio; - req->biotail = bio; - req->__data_len += bio->bi_iter.bi_size; - req->nr_phys_segments = segments + 1; - - blk_account_io_merge_bio(req); - return true; -no_merge: - req_set_nomerge(q, req); - return false; -} - -/** - * blk_attempt_plug_merge - try to merge with %current's plugged list - * @q: request_queue new bio is being queued at - * @bio: new bio being queued - * @nr_segs: number of segments in @bio - * @same_queue_rq: pointer to &struct request that gets filled in when - * another request associated with @q is found on the plug list - * (optional, may be %NULL) - * - * Determine whether @bio being queued on @q can be merged with a request - * on %current's plugged list. Returns %true if merge was successful, - * otherwise %false. - * - * Plugging coalesces IOs from the same issuer for the same purpose without - * going through @q->queue_lock. As such it's more of an issuing mechanism - * than scheduling, and the request, while may have elvpriv data, is not - * added on the elevator at this point. In addition, we don't have - * reliable access to the elevator outside queue lock. Only check basic - * merging parameters without querying the elevator. - * - * Caller must ensure !blk_queue_nomerges(q) beforehand. - */ -bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **same_queue_rq) -{ - struct blk_plug *plug; - struct request *rq; - struct list_head *plug_list; - - plug = blk_mq_plug(q, bio); - if (!plug) - return false; - - plug_list = &plug->mq_list; - - list_for_each_entry_reverse(rq, plug_list, queuelist) { - bool merged = false; - - if (rq->q == q && same_queue_rq) { - /* - * Only blk-mq multiple hardware queues case checks the - * rq in the same queue, there should be only one such - * rq in a queue - **/ - *same_queue_rq = rq; - } - - if (rq->q != q || !blk_rq_merge_ok(rq, bio)) - continue; - - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - merged = bio_attempt_back_merge(rq, bio, nr_segs); - break; - case ELEVATOR_FRONT_MERGE: - merged = bio_attempt_front_merge(rq, bio, nr_segs); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: - break; - } - - if (merged) - return true; - } - - return false; -} - static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; @@ -971,9 +814,9 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP - * if queue is not a request based queue. + * if queue does not support NOWAIT. */ - if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) + if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) goto not_supported; if (should_fail_bio(bio)) @@ -1301,14 +1144,28 @@ EXPORT_SYMBOL(submit_bio); * limits when retrying requests on other queues. Those requests need * to be checked against the new queue limits again during dispatch. */ -static int blk_cloned_rq_check_limits(struct request_queue *q, +static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, struct request *rq) { - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) { + unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + + if (blk_rq_sectors(rq) > max_sectors) { + /* + * SCSI device does not have a good way to return if + * Write Same/Zero is actually supported. If a device rejects + * a non-read/write command (discard, write same,etc.) the + * low-level device driver will set the relevant queue limit to + * 0 to prevent blk-lib from issuing more of the offending + * operations. Commands queued prior to the queue limit being + * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O + * errors being propagated to upper layers. + */ + if (max_sectors == 0) + return BLK_STS_NOTSUPP; + printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", - __func__, blk_rq_sectors(rq), - blk_queue_get_max_sectors(q, req_op(rq))); - return -EIO; + __func__, blk_rq_sectors(rq), max_sectors); + return BLK_STS_IOERR; } /* @@ -1321,10 +1178,10 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", __func__, rq->nr_phys_segments, queue_max_segments(q)); - return -EIO; + return BLK_STS_IOERR; } - return 0; + return BLK_STS_OK; } /** @@ -1334,8 +1191,11 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, */ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { - if (blk_cloned_rq_check_limits(q, rq)) - return BLK_STS_IOERR; + blk_status_t ret; + + ret = blk_cloned_rq_check_limits(q, rq); + if (ret != BLK_STS_OK) + return ret; if (rq->rq_disk && should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) @@ -1461,10 +1321,9 @@ void blk_account_io_start(struct request *rq) part_stat_unlock(); } -unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, - unsigned int op) +static unsigned long __part_start_io_acct(struct hd_struct *part, + unsigned int sectors, unsigned int op) { - struct hd_struct *part = &disk->part0; const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); @@ -1477,12 +1336,26 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, return now; } + +unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, + struct bio *bio) +{ + *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); + + return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio)); +} +EXPORT_SYMBOL_GPL(part_start_io_acct); + +unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, + unsigned int op) +{ + return __part_start_io_acct(&disk->part0, sectors, op); +} EXPORT_SYMBOL(disk_start_io_acct); -void disk_end_io_acct(struct gendisk *disk, unsigned int op, - unsigned long start_time) +static void __part_end_io_acct(struct hd_struct *part, unsigned int op, + unsigned long start_time) { - struct hd_struct *part = &disk->part0; const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; @@ -1493,6 +1366,20 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op, part_stat_local_dec(part, in_flight[op_is_write(op)]); part_stat_unlock(); } + +void part_end_io_acct(struct hd_struct *part, struct bio *bio, + unsigned long start_time) +{ + __part_end_io_acct(part, bio_op(bio), start_time); + hd_struct_put(part); +} +EXPORT_SYMBOL_GPL(part_end_io_acct); + +void disk_end_io_acct(struct gendisk *disk, unsigned int op, + unsigned long start_time) +{ + __part_end_io_acct(&disk->part0, op, start_time); +} EXPORT_SYMBOL(disk_end_io_acct); /* diff --git a/block/blk-integrity.c b/block/blk-integrity.c index c03705cbb9c9..2b36a8f9b813 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -408,7 +408,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; - disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (disk->queue->ksm) { @@ -428,7 +428,7 @@ EXPORT_SYMBOL(blk_integrity_register); */ void blk_integrity_unregister(struct gendisk *disk) { - disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); } EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index d37b55db2409..b82649c1440b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -68,7 +68,7 @@ * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, * 12.5% each. The distribution mechanism only cares about these flattened * shares. They're called hweights (hierarchical weights) and always add - * upto 1 (HWEIGHT_WHOLE). + * upto 1 (WEIGHT_ONE). * * A given cgroup's vtime runs slower in inverse proportion to its hweight. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) @@ -179,6 +179,8 @@ #include <linux/parser.h> #include <linux/sched/signal.h> #include <linux/blk-cgroup.h> +#include <asm/local.h> +#include <asm/local64.h> #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" @@ -215,36 +217,21 @@ enum { MAX_PERIOD = USEC_PER_SEC, /* - * A cgroup's vtime can run 50% behind the device vtime, which + * iocg->vtime is targeted at 50% behind the device vtime, which * serves as its IO credit buffer. Surplus weight adjustment is * immediately canceled if the vtime margin runs below 10%. */ - MARGIN_PCT = 50, - INUSE_MARGIN_PCT = 10, + MARGIN_MIN_PCT = 10, + MARGIN_LOW_PCT = 20, + MARGIN_TARGET_PCT = 50, - /* Have some play in waitq timer operations */ - WAITQ_TIMER_MARGIN_PCT = 5, + INUSE_ADJ_STEP_PCT = 25, - /* - * vtime can wrap well within a reasonable uptime when vrate is - * consistently raised. Don't trust recorded cgroup vtime if the - * period counter indicates that it's older than 5mins. - */ - VTIME_VALID_DUR = 300 * USEC_PER_SEC, - - /* - * Remember the past three non-zero usages and use the max for - * surplus calculation. Three slots guarantee that we remember one - * full period usage from the last active stretch even after - * partial deactivation and re-activation periods. Don't start - * giving away weight before collecting two data points to prevent - * hweight adjustments based on one partial activation period. - */ - NR_USAGE_SLOTS = 3, - MIN_VALID_USAGES = 2, + /* Have some play in timer operations */ + TIMER_SLACK_PCT = 1, /* 1/64k is granular enough and can easily be handled w/ u32 */ - HWEIGHT_WHOLE = 1 << 16, + WEIGHT_ONE = 1 << 16, /* * As vtime is used to calculate the cost of each IO, it needs to @@ -275,16 +262,37 @@ enum { /* unbusy hysterisis */ UNBUSY_THR_PCT = 75, - /* don't let cmds which take a very long time pin lagging for too long */ - MAX_LAGGING_PERIODS = 10, - /* - * If usage% * 1.25 + 2% is lower than hweight% by more than 3%, - * donate the surplus. + * The effect of delay is indirect and non-linear and a huge amount of + * future debt can accumulate abruptly while unthrottled. Linearly scale + * up delay as debt is going up and then let it decay exponentially. + * This gives us quick ramp ups while delay is accumulating and long + * tails which can help reducing the frequency of debt explosions on + * unthrottle. The parameters are experimentally determined. + * + * The delay mechanism provides adequate protection and behavior in many + * cases. However, this is far from ideal and falls shorts on both + * fronts. The debtors are often throttled too harshly costing a + * significant level of fairness and possibly total work while the + * protection against their impacts on the system can be choppy and + * unreliable. + * + * The shortcoming primarily stems from the fact that, unlike for page + * cache, the kernel doesn't have well-defined back-pressure propagation + * mechanism and policies for anonymous memory. Fully addressing this + * issue will likely require substantial improvements in the area. */ - SURPLUS_SCALE_PCT = 125, /* * 125% */ - SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */ - SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */ + MIN_DELAY_THR_PCT = 500, + MAX_DELAY_THR_PCT = 25000, + MIN_DELAY = 250, + MAX_DELAY = 250 * USEC_PER_MSEC, + + /* halve debts if avg usage over 100ms is under 50% */ + DFGV_USAGE_PCT = 50, + DFGV_PERIOD = 100 * USEC_PER_MSEC, + + /* don't let cmds which take a very long time pin lagging for too long */ + MAX_LAGGING_PERIODS = 10, /* switch iff the conditions are met for longer than this */ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, @@ -372,9 +380,15 @@ struct ioc_params { u32 too_slow_vrate_pct; }; +struct ioc_margins { + s64 min; + s64 low; + s64 target; +}; + struct ioc_missed { - u32 nr_met; - u32 nr_missed; + local_t nr_met; + local_t nr_missed; u32 last_met; u32 last_missed; }; @@ -382,7 +396,7 @@ struct ioc_missed { struct ioc_pcpu_stat { struct ioc_missed missed[2]; - u64 rq_wait_ns; + local64_t rq_wait_ns; u64 last_rq_wait_ns; }; @@ -393,8 +407,9 @@ struct ioc { bool enabled; struct ioc_params params; + struct ioc_margins margins; u32 period_us; - u32 margin_us; + u32 timer_slack_ns; u64 vrate_min; u64 vrate_max; @@ -405,18 +420,24 @@ struct ioc { enum ioc_running running; atomic64_t vtime_rate; + u64 vtime_base_rate; + s64 vtime_err; seqcount_spinlock_t period_seqcount; - u32 period_at; /* wallclock starttime */ + u64 period_at; /* wallclock starttime */ u64 period_at_vtime; /* vtime starttime */ atomic64_t cur_period; /* inc'd each period */ int busy_level; /* saturation history */ - u64 inuse_margin_vtime; bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ + /* debt forgivness */ + u64 dfgv_period_at; + u64 dfgv_period_rem; + u64 dfgv_usage_us_sum; + u64 autop_too_fast_at; u64 autop_too_slow_at; int autop_idx; @@ -424,6 +445,17 @@ struct ioc { bool user_cost_model:1; }; +struct iocg_pcpu_stat { + local64_t abs_vusage; +}; + +struct iocg_stat { + u64 usage_us; + u64 wait_us; + u64 indebt_us; + u64 indelay_us; +}; + /* per device-cgroup pair */ struct ioc_gq { struct blkg_policy_data pd; @@ -443,12 +475,17 @@ struct ioc_gq { * * `last_inuse` remembers `inuse` while an iocg is idle to persist * surplus adjustments. + * + * `inuse` may be adjusted dynamically during period. `saved_*` are used + * to determine and track adjustments. */ u32 cfg_weight; u32 weight; u32 active; u32 inuse; + u32 last_inuse; + s64 saved_margin; sector_t cursor; /* to detect randio */ @@ -461,14 +498,14 @@ struct ioc_gq { * `vtime_done` is the same but progressed on completion rather * than issue. The delta behind `vtime` represents the cost of * currently in-flight IOs. - * - * `last_vtime` is used to remember `vtime` at the end of the last - * period to calculate utilization. */ atomic64_t vtime; atomic64_t done_vtime; u64 abs_vdebt; - u64 last_vtime; + + /* current delay in effect and when it started */ + u64 delay; + u64 delay_at; /* * The period this iocg was last active in. Used for deactivation @@ -477,21 +514,35 @@ struct ioc_gq { atomic64_t active_period; struct list_head active_list; - /* see __propagate_active_weight() and current_hweight() for details */ + /* see __propagate_weights() and current_hweight() for details */ u64 child_active_sum; u64 child_inuse_sum; + u64 child_adjusted_sum; int hweight_gen; u32 hweight_active; u32 hweight_inuse; - bool has_surplus; + u32 hweight_donating; + u32 hweight_after_donation; + + struct list_head walk_list; + struct list_head surplus_list; struct wait_queue_head waitq; struct hrtimer waitq_timer; - struct hrtimer delay_timer; - /* usage is recorded as fractions of HWEIGHT_WHOLE */ - int usage_idx; - u32 usages[NR_USAGE_SLOTS]; + /* timestamp at the latest activation */ + u64 activated_at; + + /* statistics */ + struct iocg_pcpu_stat __percpu *pcpu_stat; + struct iocg_stat local_stat; + struct iocg_stat desc_stat; + struct iocg_stat last_stat; + u64 last_stat_abs_vusage; + u64 usage_delta_us; + u64 wait_since; + u64 indebt_since; + u64 indelay_since; /* this iocg's depth in the hierarchy and ancestors including self */ int level; @@ -506,7 +557,7 @@ struct ioc_cgrp { struct ioc_now { u64 now_ns; - u32 now; + u64 now; u64 vnow; u64 vrate; }; @@ -656,7 +707,7 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) */ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) { - return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); + return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse); } /* @@ -664,18 +715,56 @@ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) */ static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) { - return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE); + return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); } -static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) +static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, + u64 abs_cost, u64 cost) { + struct iocg_pcpu_stat *gcs; + bio->bi_iocost_cost = cost; atomic64_add(cost, &iocg->vtime); + + gcs = get_cpu_ptr(iocg->pcpu_stat); + local64_add(abs_cost, &gcs->abs_vusage); + put_cpu_ptr(gcs); +} + +static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) +{ + if (lock_ioc) { + spin_lock_irqsave(&iocg->ioc->lock, *flags); + spin_lock(&iocg->waitq.lock); + } else { + spin_lock_irqsave(&iocg->waitq.lock, *flags); + } +} + +static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags) +{ + if (unlock_ioc) { + spin_unlock(&iocg->waitq.lock); + spin_unlock_irqrestore(&iocg->ioc->lock, *flags); + } else { + spin_unlock_irqrestore(&iocg->waitq.lock, *flags); + } } #define CREATE_TRACE_POINTS #include <trace/events/iocost.h> +static void ioc_refresh_margins(struct ioc *ioc) +{ + struct ioc_margins *margins = &ioc->margins; + u32 period_us = ioc->period_us; + u64 vrate = ioc->vtime_base_rate; + + margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; + margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; + margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; +} + /* latency Qos params changed, update period_us and all the dependent params */ static void ioc_refresh_period_us(struct ioc *ioc) { @@ -709,9 +798,10 @@ static void ioc_refresh_period_us(struct ioc *ioc) /* calculate dependent params */ ioc->period_us = period_us; - ioc->margin_us = period_us * MARGIN_PCT / 100; - ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( - period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100); + ioc->timer_slack_ns = div64_u64( + (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT, + 100); + ioc_refresh_margins(ioc); } static int ioc_autop_idx(struct ioc *ioc) @@ -738,8 +828,7 @@ static int ioc_autop_idx(struct ioc *ioc) return idx; /* step up/down based on the vrate */ - vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100, - VTIME_PER_USEC); + vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); now_ns = ktime_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { @@ -847,6 +936,43 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force) return true; } +/* + * When an iocg accumulates too much vtime or gets deactivated, we throw away + * some vtime, which lowers the overall device utilization. As the exact amount + * which is being thrown away is known, we can compensate by accelerating the + * vrate accordingly so that the extra vtime generated in the current period + * matches what got lost. + */ +static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) +{ + s64 pleft = ioc->period_at + ioc->period_us - now->now; + s64 vperiod = ioc->period_us * ioc->vtime_base_rate; + s64 vcomp, vcomp_min, vcomp_max; + + lockdep_assert_held(&ioc->lock); + + /* we need some time left in this period */ + if (pleft <= 0) + goto done; + + /* + * Calculate how much vrate should be adjusted to offset the error. + * Limit the amount of adjustment and deduct the adjusted amount from + * the error. + */ + vcomp = -div64_s64(ioc->vtime_err, pleft); + vcomp_min = -(ioc->vtime_base_rate >> 1); + vcomp_max = ioc->vtime_base_rate; + vcomp = clamp(vcomp, vcomp_min, vcomp_max); |
