From ad7c3b41e86b59943a903d23c7b037d820e6270c Mon Sep 17 00:00:00 2001 From: Jinke Han Date: Mon, 8 May 2023 01:06:31 +0800 Subject: blk-throttle: Fix io statistics for cgroup v1 After commit f382fb0bcef4 ("block: remove legacy IO schedulers"), blkio.throttle.io_serviced and blkio.throttle.io_service_bytes became the only stable io stats interface of cgroup v1, and these statistics are done in the blk-throttle code. But the current code only counts the bios that are actually throttled. When the user does not set a throttle limit, the io stats for cgroup v1 show nothing. Fix it by following the statistics method of cgroup v2, so that all ios are counted accurately. Fixes: a7b36ee6ba29 ("block: move blk-throtl fast path inline") Tested-by: Andrea Righi Signed-off-by: Jinke Han Acked-by: Muchun Song Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230507170631.89607-1-hanjinke.666@bytedance.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 ++++-- block/blk-throttle.c | 6 ------ block/blk-throttle.h | 9 +++++++++ 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index cab33bd4f252..c8b28ec5dde9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -2062,6 +2062,9 @@ void blk_cgroup_bio_start(struct bio *bio) struct blkg_iostat_set *bis; unsigned long flags; + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) + return; + /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(blkcg->css.cgroup)) return; @@ -2093,8 +2096,7 @@ void blk_cgroup_bio_start(struct bio *bio) } u64_stats_update_end_irqrestore(&bis->sync, flags); - if (cgroup_subsys_on_dfl(io_cgrp_subsys)) - cgroup_rstat_updated(blkcg->css.cgroup, cpu); + cgroup_rstat_updated(blkcg->css.cgroup, cpu); put_cpu(); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9d010d867fbf..7397ff199d66 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2178,12 +2178,6 @@ bool __blk_throtl_bio(struct bio *bio) rcu_read_lock(); - if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { - blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); - blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); - } - spin_lock_irq(&q->queue_lock); throtl_update_latency_buckets(td); diff --git a/block/blk-throttle.h b/block/blk-throttle.h index ef4b7a4de987..d1ccbfe9f797 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -185,6 +185,15 @@ static inline bool blk_should_throtl(struct bio *bio) struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); int rw = bio_data_dir(bio); + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, BIO_CGROUP_ACCT); + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); + } + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); + } + /* iops limit is always counted */ if (tg->has_rules_iops[rw]) return true; -- cgit v1.2.3 From c6b7a3a26e809c9d2a51ae303764c1d2994f31cf Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 24 Jun 2023 21:01:05 +0800 Subject: blk-mq: fix two misuses on RQF_USE_SCHED Requests allocated from sched tags can't be issued via ->queue_rqs() directly, since the driver tag isn't allocated yet. This is the 1st misuse of RQF_USE_SCHED for figuring out plug->has_elevator. Requests allocated from sched tags can't be ended by blk_mq_end_request_batch() either; fix the 2nd RQF_USE_SCHED misuse in blk_mq_add_to_batch(). Without this patch, NVMe uring cmd passthrough IO workloads can easily run into hangs when a real io scheduler is used.
Fixes: dd6216bb16e8 ("blk-mq: make sure elevator callbacks aren't called for passthrough request") Reported-by: Guangwu Zhang Closes: https://lore.kernel.org/linux-block/CAGS2=YrBjpLPOKa-gzcKuuOG60AGth5794PNCDwatdnnscB9ug@mail.gmail.com/ Cc: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230624130105.1443879-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 720b5061ffe8..32e50bc0cbb0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1280,7 +1280,11 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) if (!plug->multiple_queues && last && last->q != rq->q) plug->multiple_queues = true; - if (!plug->has_elevator && (rq->rq_flags & RQF_USE_SCHED)) + /* + * Any request allocated from sched tags can't be issued to + * ->queue_rqs() directly + */ + if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; rq->rq_next = NULL; rq_list_add(&plug->mq_list, rq); -- cgit v1.2.3 From 645a829e03384a235b3760959d4ebe420a0f2027 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 May 2023 09:06:40 +0800 Subject: blk-wbt: don't create wbt sysfs entry if CONFIG_BLK_WBT is disabled sysfs entry /sys/block/[device]/queue/wbt_lat_usec will be created even if CONFIG_BLK_WBT is disabled, while read and write will always fail. It doesn't make sense to create a sysfs entry that can't be accessed, so don't create such entry. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230527010644.647900-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 143 ++++++++++++++++++++++++++++-------------------------- block/blk-wbt.h | 19 -------- 2 files changed, 74 insertions(+), 88 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a64208583853..6c1c4ba66bc0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -47,19 +47,6 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } -static ssize_t queue_var_store64(s64 *var, const char *page) -{ - int err; - s64 v; - - err = kstrtos64(page, 10, &v); - if (err < 0) - return err; - - *var = v; - return 0; -} - static ssize_t queue_requests_show(struct request_queue *q, char *page) { return queue_var_show(q->nr_requests, page); @@ -451,61 +438,6 @@ static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, return count; } -static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) -{ - if (!wbt_rq_qos(q)) - return -EINVAL; - - if (wbt_disabled(q)) - return sprintf(page, "0\n"); - - return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); -} - -static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, - size_t count) -{ - struct rq_qos *rqos; - ssize_t ret; - s64 val; - - ret = queue_var_store64(&val, page); - if (ret < 0) - return ret; - if (val < -1) - return -EINVAL; - - rqos = wbt_rq_qos(q); - if (!rqos) { - ret = wbt_init(q->disk); - if (ret) - return ret; - } - - if (val == -1) - val = wbt_default_latency_nsec(q); - else if (val >= 0) - val *= 1000ULL; - - if (wbt_get_min_lat(q) == val) - return count; - - /* - * Ensure that the queue is idled, in case the latency update - * ends up either enabling or disabling wbt completely. We can't - * have IO inflight if that happens. 
- */ - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - - wbt_set_min_lat(q, val); - - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); - - return count; -} - static ssize_t queue_wc_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) @@ -598,7 +530,6 @@ QUEUE_RW_ENTRY(queue_wc, "write_cache"); QUEUE_RO_ENTRY(queue_fua, "fua"); QUEUE_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); -QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); @@ -617,6 +548,78 @@ QUEUE_RW_ENTRY(queue_iostats, "iostats"); QUEUE_RW_ENTRY(queue_random, "add_random"); QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); +#ifdef CONFIG_BLK_WBT +static ssize_t queue_var_store64(s64 *var, const char *page) +{ + int err; + s64 v; + + err = kstrtos64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!wbt_rq_qos(q)) + return -EINVAL; + + if (wbt_disabled(q)) + return sprintf(page, "0\n"); + + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + struct rq_qos *rqos; + ssize_t ret; + s64 val; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + if (val < -1) + return -EINVAL; + + rqos = wbt_rq_qos(q); + if (!rqos) { + ret = wbt_init(q->disk); + if (ret) + return ret; + } + + if (val == -1) + val = wbt_default_latency_nsec(q); + else if (val >= 0) + val *= 1000ULL; + + if (wbt_get_min_lat(q) == val) + return count; + + /* + * Ensure that the queue is idled, in case the latency update + * ends up either enabling or disabling wbt completely. We can't + * have IO inflight if that happens. 
+ */ + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + wbt_set_min_lat(q, val); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + return count; +} + +QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); +#endif + static struct attribute *queue_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -655,7 +658,9 @@ static struct attribute *queue_attrs[] = { &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, +#ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, +#endif &queue_poll_delay_entry.attr, &queue_io_timeout_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/block/blk-wbt.h b/block/blk-wbt.h index ba6cca5849a6..8a029e138f7a 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -18,10 +18,6 @@ u64 wbt_default_latency_nsec(struct request_queue *); #else -static inline int wbt_init(struct gendisk *disk) -{ - return -EINVAL; -} static inline void wbt_disable_default(struct gendisk *disk) { } @@ -31,21 +27,6 @@ static inline void wbt_enable_default(struct gendisk *disk) static inline void wbt_set_write_cache(struct request_queue *q, bool wc) { } -static inline u64 wbt_get_min_lat(struct request_queue *q) -{ - return 0; -} -static inline void wbt_set_min_lat(struct request_queue *q, u64 val) -{ -} -static inline u64 wbt_default_latency_nsec(struct request_queue *q) -{ - return 0; -} -static inline bool wbt_disabled(struct request_queue *q) -{ - return true; -} #endif /* CONFIG_BLK_WBT */ -- cgit v1.2.3 From 71b8642e79f277459555629f2bea1a8d1fed307e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 May 2023 09:06:41 +0800 Subject: blk-wbt: remove dead code to handle wbt enable/disable with io inflight enable or disable wbt is always called with queue freezed, so that wbt can never be enabled or disabled while io is still inflight, and this behaviour should always hold to avoid io hang(There have been reported several times). Therefor, the code to handle wbt enable/diskble with io inflight is not and never will be used, hence remove such dead code. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230527010644.647900-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 53bf5aa6f9ad..21bbeb31a444 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -200,15 +200,6 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, inflight = atomic_dec_return(&rqw->inflight); - /* - * wbt got disabled with IO in flight. Wake up any potential - * waiters, we don't have to do more than that. - */ - if (unlikely(!rwb_enabled(rwb))) { - rwb_wake_all(rwb); - return; - } - /* * For discards, our limit is always the background. For writes, if * the device does write back caching, drop further down before we @@ -545,13 +536,6 @@ static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { unsigned int limit; - /* - * If we got disabled, just return UINT_MAX. This ensures that - * we'll properly inc a new IO, and dec+wakeup at the end. 
- */ - if (!rwb_enabled(rwb)) - return UINT_MAX; - if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) return rwb->wb_background; -- cgit v1.2.3 From 06257fda83ebfd1c33fb992e41dba7be4e1184d4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 May 2023 09:06:42 +0800 Subject: blk-wbt: cleanup rwb_enabled() and wbt_disabled() 'wb_normal' will set to 0 if 'min_lat_nsec' is 0, and 'min_lat_nsec' can only be set to 0 through sysfs configuration where 'WBT_STATE_OFF_MANUAL' is set together, in the meantime, they can only be cleared together through sysfs afterwards. Hence 'wb_normal != 0' is the same as 'rwb->enable_state != WBT_STATE_OFF_MANUAL'. The code is redundan, hence replace the checking of 'wb_normal' to 'enable_state' in rwb_enabled() and reuse rwb_enabled() for wbt_disabled(). Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230527010644.647900-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 21bbeb31a444..9f7c99c025f3 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -146,7 +146,7 @@ enum { static inline bool rwb_enabled(struct rq_wb *rwb) { return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && - rwb->wb_normal != 0; + rwb->enable_state != WBT_STATE_OFF_MANUAL; } static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) @@ -494,8 +494,7 @@ bool wbt_disabled(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); - return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT || - RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL; + return !rqos || !rwb_enabled(RQWB(rqos)); } u64 wbt_get_min_lat(struct request_queue *q) -- cgit v1.2.3 From eebc21d12f56c1e09a163abf91e351fa2a55a938 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 May 2023 09:06:43 +0800 Subject: blk-iocost: move wbt_enable/disable_default() out of spinlock There are following smatch warning: block/blk-wbt.c:843 wbt_init() warn: sleeping in atomic context ioc_qos_write() <- disables preempt -> wbt_enable_default() -> wbt_init() wbt_init() will be called from wbt_enable_default() if wbt is not initialized, currently this is only possible in blk_register_queue(), hence wbt_init() will never be called from iocost and this warning is false positive. However, we might support rq_qos destruction dynamically in the future, and it's better to prevent that, hence move wbt_enable_default() outside 'ioc->lock'. This is safe because queue is still freezed. 
Reported-by: Dan Carpenter Link: https://lore.kernel.org/lkml/Y+Ja5SRs886CEz7a@kadam/ Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230527010644.647900-5-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6084a9519883..9dfcf540f400 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3301,11 +3301,9 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; - wbt_disable_default(disk); } else { blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; - wbt_enable_default(disk); } if (user) { @@ -3318,6 +3316,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); + if (enable) + wbt_disable_default(disk); + else + wbt_enable_default(disk); + blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); -- cgit v1.2.3 From 6d85ebf95c44e52337ca1d07f0db4b435d1e6762 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 May 2023 09:06:44 +0800 Subject: blk-sysfs: add a new attr_group for blk_mq Currently wbt sysfs entry is created for bio based device, and wbt can be enabled for such device through sysfs while it doesn't make sense because wbt can only work for rq based device. In the meantime, there are other similar sysfs entries. Fix this by adding a new attr_group for blk_mq, and sysfs entries will only be created when the device is rq based. Suggested-by: Christoph Hellwig Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230527010644.647900-6-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6c1c4ba66bc0..afc797fb0dfc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -621,7 +621,6 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); #endif static struct attribute *queue_attrs[] = { - &queue_requests_entry.attr, &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, @@ -629,7 +628,6 @@ static struct attribute *queue_attrs[] = { &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, - &elv_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, @@ -650,7 +648,6 @@ static struct attribute *queue_attrs[] = { &queue_max_open_zones_entry.attr, &queue_max_active_zones_entry.attr, &queue_nomerges_entry.attr, - &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, &queue_random_entry.attr, @@ -658,11 +655,7 @@ static struct attribute *queue_attrs[] = { &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, -#ifdef CONFIG_BLK_WBT - &queue_wb_lat_entry.attr, -#endif &queue_poll_delay_entry.attr, - &queue_io_timeout_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &blk_throtl_sample_time_entry.attr, #endif @@ -671,16 +664,23 @@ static struct attribute *queue_attrs[] = { NULL, }; +static struct attribute *blk_mq_queue_attrs[] = { + &queue_requests_entry.attr, + &elv_iosched_entry.attr, + 
&queue_rq_affinity_entry.attr, + &queue_io_timeout_entry.attr, +#ifdef CONFIG_BLK_WBT + &queue_wb_lat_entry.attr, +#endif + NULL, +}; + static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; - if (attr == &queue_io_timeout_entry.attr && - (!q->mq_ops || !q->mq_ops->timeout)) - return 0; - if ((attr == &queue_max_open_zones_entry.attr || attr == &queue_max_active_zones_entry.attr) && !blk_queue_is_zoned(q)) @@ -689,11 +689,30 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, return attr->mode; } +static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; + + if (!queue_is_mq(q)) + return 0; + + if (attr == &queue_io_timeout_entry.attr && !q->mq_ops->timeout) + return 0; + + return attr->mode; +} + static struct attribute_group queue_attr_group = { .attrs = queue_attrs, .is_visible = queue_attr_visible, }; +static struct attribute_group blk_mq_queue_attr_group = { + .attrs = blk_mq_queue_attrs, + .is_visible = blk_mq_queue_attr_visible, +}; #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) @@ -738,6 +757,7 @@ static const struct sysfs_ops queue_sysfs_ops = { static const struct attribute_group *blk_queue_attr_groups[] = { &queue_attr_group, + &blk_mq_queue_attr_group, NULL }; -- cgit v1.2.3