From db04e18dbb0146d3c753dc05f7233350375bbc48 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 19 Aug 2020 14:34:03 +0200 Subject: block: Make request_queue.rpm_status an enum request_queue.rpm_status is assigned values of the rpm_status enum only, so reflect that in its type. Note that including <linux/pm.h> is (currently) a no-op, as it is already included through <linux/genhd.h> and <linux/bio.h>, but it is better to play it safe. Signed-off-by: Geert Uytterhoeven Acked-by: Rafael J. Wysocki Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bb5636cc17b9..0a1730b30ad2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,7 @@ #include #include #include +#include <linux/pm.h> struct module; struct scsi_ioctl_command; @@ -458,7 +459,7 @@ struct request_queue { #ifdef CONFIG_PM struct device *dev; - int rpm_status; + enum rpm_status rpm_status; unsigned int nr_pending; #endif -- cgit v1.2.3 From 611bee526b4a89d49f1b9914a770bfdc101d5fb5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 23 Aug 2020 11:10:41 +0200 Subject: block: replace bd_set_size with bd_set_nr_sectors Replace bd_set_size with a version that takes the number of sectors instead, as that fits most of the current and future callers much better. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 4ab853461dff..39025dc0397c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -375,7 +375,7 @@ void unregister_blkdev(unsigned int major, const char *name); int revalidate_disk(struct gendisk *disk); int check_disk_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); -void bd_set_size(struct block_device *bdev, loff_t size); +void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors); /* for drivers/char/raw.c: */ int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); -- cgit v1.2.3 From c2b4bb8cb3741c0bacf3683e4c1ecd04c977ada3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 23 Aug 2020 11:10:42 +0200 Subject: block: fix locking for struct block_device size updates Two different callers use two different mutexes for updating the block device size, which obviously doesn't help to actually protect against concurrent updates from the different callers. In addition one of the locks, bd_mutex is rather prone to deadlocks with other parts of the block stack that use it for high level synchronization. Switch to using a new spinlock protecting just the size updates, as that is all we need, and make sure everyone does the update through the proper helper. This fixes a bug reported with the nvme revalidating disks during a hot removal operation, which can currently deadlock on bd_mutex. 
Reported-by: Xianting Tian Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4ecf4fed171f..5accc2549d22 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -38,6 +38,7 @@ struct block_device { /* number of times partitions within this device have been opened. */ unsigned bd_part_count; int bd_invalidated; + spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; struct backing_dev_info *bd_bdi; -- cgit v1.2.3 From f3256075ba49d80835b601bfbff350a2140b2924 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Aug 2020 17:37:45 +0200 Subject: block: remove the BIO_NULL_MAPPED flag We can simply use a boolean flag in the bio_map_data data structure instead. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5accc2549d22..78b073956884 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -257,7 +257,6 @@ enum { BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ BIO_USER_MAPPED, /* contains user pages */ - BIO_NULL_MAPPED, /* contains invalid user pages */ BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ -- cgit v1.2.3 From 3310eebafe6f9a872c1f757b3d822dafae9c0cd8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Aug 2020 17:37:48 +0200 Subject: block: remove the BIO_USER_MAPPED flag Just check if there is private data, in which case the bio must have originated from bio_copy_user_iov. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 78b073956884..63a39e47fc60 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -256,7 +256,6 @@ enum { BIO_NO_PAGE_REF, /* don't put release vec pages */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ - BIO_USER_MAPPED, /* contains user pages */ BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ -- cgit v1.2.3 From bdc6a287bc98e8f32bf52c9cb2d1bdf75975f5a0 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 28 Aug 2020 10:52:55 +0800 Subject: block: Move blk_mq_bio_list_merge() into blk-merge.c Move the blk_mq_bio_list_merge() into blk-merge.c and rename it as a generic name. Reviewed-by: Christoph Hellwig Signed-off-by: Baolin Wang Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9d2d5ad367a4..21a02e0577dd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -489,8 +489,6 @@ void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_complete_request(struct request *rq); bool blk_mq_complete_request_remote(struct request *rq); -bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, - struct bio *bio, unsigned int nr_segs); bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); -- cgit v1.2.3 From 7b8917f5e29c377be1db5680249fe30e038cb3eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2020 20:02:33 +0200 Subject: block: remove 
the alignment_offset field from struct hd_struct The alignment offset is only used in slow path callers, so just calculate it on the fly. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ++--- include/linux/genhd.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0a1730b30ad2..ba1f5f5e11c6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1456,10 +1456,9 @@ static inline int bdev_alignment_offset(struct block_device *bdev) if (q->limits.misaligned) return -1; - if (bdev != bdev->bd_contains) - return bdev->bd_part->alignment_offset; - + return queue_limit_alignment_offset(&q->limits, + bdev->bd_part->start_sect); return q->limits.alignment_offset; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 39025dc0397c..bfa411c80dbb 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -65,7 +65,6 @@ struct hd_struct { struct disk_stats __percpu *dkstats; struct percpu_ref ref; - sector_t alignment_offset; unsigned int discard_alignment; struct device __dev; struct kobject *holder_dir; -- cgit v1.2.3 From 7cf34d97ab45203b975396393ded9d3867dfa8bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2020 20:02:34 +0200 Subject: block: remove the discard_alignment field from struct hd_struct The alignment offset is only used in slow path callers, so just calculate it on the fly. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- include/linux/genhd.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ba1f5f5e11c6..d0d61bc81615 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1498,8 +1498,8 @@ static inline int bdev_discard_alignment(struct block_device *bdev) struct request_queue *q = bdev_get_queue(bdev); if (bdev != bdev->bd_contains) - return bdev->bd_part->discard_alignment; - + return queue_limit_discard_alignment(&q->limits, + bdev->bd_part->start_sect); return q->limits.discard_alignment; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index bfa411c80dbb..9ea2ca31c278 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -65,7 +65,6 @@ struct hd_struct { struct disk_stats __percpu *dkstats; struct percpu_ref ref; - unsigned int discard_alignment; struct device __dev; struct kobject *holder_dir; int policy, partno; -- cgit v1.2.3 From 46d40cfad13ccbd0739019d754d46d8f93e1d5aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2020 20:02:35 +0200 Subject: block: remove an outdated comment on the bd_dev field kdev_t is long gone, so we don't need to comment that a field isn't one. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 63a39e47fc60..59d9150165c4 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,7 +20,7 @@ typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; struct block_device { - dev_t bd_dev; /* not a kdev_t - it's a search key */ + dev_t bd_dev; int bd_openers; struct inode * bd_inode; /* will die */ struct super_block * bd_super; -- cgit v1.2.3 From 1aa50d020c7148f5f0bde15ca80fe6f91a8c5a4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:44 -0400 Subject: blk-iocost: calculate iocg->usages[] from iocg->local_stat.usage_us Currently, iocg->usages[] which are used to guide inuse adjustments are calculated from vtime deltas. This, however, assumes that the hierarchical inuse weight at the time of calculation held for the entire period, which often isn't true and can lead to significant errors. Now that we have absolute usage information collected, we can derive iocg->usages[] from iocg->local_stat.usage_us so that inuse adjustment decisions are made based on actual absolute usage. The calculated usage is clamped between 1 and WEIGHT_ONE and WEIGHT_ONE is also used to signal saturation regardless of the current hierarchical inuse weight. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/trace/events/iocost.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index c2f580fd371b..a905ecc0342f 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -26,7 +26,6 @@ TRACE_EVENT(iocost_iocg_activate, __field(u64, vrate) __field(u64, last_period) __field(u64, cur_period) - __field(u64, last_vtime) __field(u64, vtime) __field(u32, weight) __field(u32, inuse) @@ -42,7 +41,6 @@ TRACE_EVENT(iocost_iocg_activate, __entry->vrate = now->vrate; __entry->last_period = last_period; __entry->cur_period = cur_period; - __entry->last_vtime = iocg->last_vtime; __entry->vtime = vtime; __entry->weight = iocg->weight; __entry->inuse = iocg->inuse; @@ -51,13 +49,12 @@ TRACE_EVENT(iocost_iocg_activate, ), TP_printk("[%s:%s] now=%llu:%llu vrate=%llu " - "period=%llu->%llu vtime=%llu->%llu " + "period=%llu->%llu vtime=%llu " "weight=%u/%u hweight=%llu/%llu", __get_str(devname), __get_str(cgroup), __entry->now, __entry->vnow, __entry->vrate, __entry->last_period, __entry->cur_period, - __entry->last_vtime, __entry->vtime, - __entry->inuse, __entry->weight, + __entry->vtime, __entry->inuse, __entry->weight, __entry->hweight_inuse, __entry->hweight_active ) ); -- cgit v1.2.3 From 065655c862fedf4b04e1b28b83ca6f338d81cf0b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:46 -0400 Subject: blk-iocost: decouple vrate adjustment from surplus transfers Budget donations are inaccurate and could take multiple periods to converge. To prevent triggering vrate adjustments while surplus transfers were catching up, vrate adjustment was suppressed if donations were increasing, which was indicated by non-zero nr_surpluses. This entangling won't be necessary with the scheduled rewrite of donation mechanism which will make it precise and immediate. 
Let's decouple the two in preparation. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/trace/events/iocost.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index a905ecc0342f..ee024fe8fef6 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -128,11 +128,9 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset, TRACE_EVENT(iocost_ioc_vrate_adj, TP_PROTO(struct ioc *ioc, u64 new_vrate, u32 *missed_ppm, - u32 rq_wait_pct, int nr_lagging, int nr_shortages, - int nr_surpluses), + u32 rq_wait_pct, int nr_lagging, int nr_shortages), - TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages, - nr_surpluses), + TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages), TP_STRUCT__entry ( __string(devname, ioc_name(ioc)) @@ -144,7 +142,6 @@ TRACE_EVENT(iocost_ioc_vrate_adj, __field(u32, rq_wait_pct) __field(int, nr_lagging) __field(int, nr_shortages) - __field(int, nr_surpluses) ), TP_fast_assign( @@ -157,15 +154,13 @@ TRACE_EVENT(iocost_ioc_vrate_adj, __entry->rq_wait_pct = rq_wait_pct; __entry->nr_lagging = nr_lagging; __entry->nr_shortages = nr_shortages; - __entry->nr_surpluses = nr_surpluses; ), - TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d surpluses=%d", + TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d", __get_str(devname), __entry->old_vrate, __entry->new_vrate, __entry->busy_level, __entry->read_missed_ppm, __entry->write_missed_ppm, - __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages, - __entry->nr_surpluses + __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages ) ); -- cgit v1.2.3 From 046037551721e8831f6718ac2149887f6bb1f802 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:55 -0400 Subject: blk-iocost: restore inuse update 
tracepoints Update and restore the inuse update tracepoints. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/trace/events/iocost.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index ee024fe8fef6..b350860d2e71 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -95,7 +95,7 @@ DECLARE_EVENT_CLASS(iocg_inuse_update, ) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_shortage, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse, @@ -105,7 +105,7 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback, old_hw_inuse, new_hw_inuse) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_transfer, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse, @@ -115,7 +115,7 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway, old_hw_inuse, new_hw_inuse) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_adjust, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse, -- cgit v1.2.3 From 9d3a39a5f1e45827b008fff1ee9cf3cac3409665 Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Mon, 24 Aug 2020 15:10:34 -0700 Subject: block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE CAP_SYS_ADMIN is too broad, and ionice fits into CAP_SYS_NICE's grouping. Retain CAP_SYS_ADMIN permission for backwards compatibility. 
Signed-off-by: Khazhismel Kumykov Reviewed-by: Bart Van Assche Acked-by: Serge Hallyn Signed-off-by: Jens Axboe --- include/uapi/linux/capability.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 395dd0df8d08..c6ca33034147 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -288,6 +288,8 @@ struct vfs_ns_cap_data { processes and setting the scheduling algorithm used by another process. */ /* Allow setting cpu affinity on other processes */ +/* Allow setting realtime ioprio class */ +/* Allow setting ioprio class on other processes */ #define CAP_SYS_NICE 23 -- cgit v1.2.3 From f4ad06f2bb8476548b08f89919ee65abc4e40212 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Sep 2020 17:57:42 +0200 Subject: block: rename bd_invalidated Replace bd_invalidated with a new BDEV_NEED_PART_SCAN flag in a bd_flags variable to better describe the condition. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 59d9150165c4..6ffa783e1633 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -19,6 +19,8 @@ struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; +#define BDEV_NEED_PART_SCAN 0 + struct block_device { dev_t bd_dev; int bd_openers; @@ -37,7 +39,7 @@ struct block_device { struct hd_struct * bd_part; /* number of times partitions within this device have been opened. 
*/ unsigned bd_part_count; - int bd_invalidated; + unsigned long bd_flags; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; struct backing_dev_info *bd_bdi; -- cgit v1.2.3 From 659e56ba864d37b7ee0a49cd432205b2a5ca815e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Sep 2020 17:57:43 +0200 Subject: block: add a new revalidate_disk_size helper revalidate_disk is a relatively awkward helper for driver use, as it first calls an optional driver method and then updates the block device size, while most callers either don't need the method call at all, or want to keep state between the caller and the called method. Add a revalidate_disk_size helper that just performs the update of the block device size from the gendisk one, and switch all drivers that do not implement ->revalidate_disk to use the new helper instead of revalidate_disk(). Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Acked-by: Song Liu Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 9ea2ca31c278..f76c8baf6b7d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -371,6 +371,7 @@ int register_blkdev(unsigned int major, const char *name); void unregister_blkdev(unsigned int major, const char *name); int revalidate_disk(struct gendisk *disk); +void revalidate_disk_size(struct gendisk *disk, bool verbose); int check_disk_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors); -- cgit v1.2.3 From b8086d3f5a0e88b1912d55a158b8a6a43ad6604b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Sep 2020 17:57:44 +0200 Subject: block: use revalidate_disk_size in set_capacity_revalidate_and_notify Only virtio_blk and xen-blkfront set the revalidate argument to true, 
and both do not implement the ->revalidate_disk method. So switch to the helper that just updates the size instead. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/genhd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f76c8baf6b7d..02a73198b289 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -313,8 +313,8 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); -extern void set_capacity_revalidate_and_notify(struct gendisk *disk, - sector_t size, bool revalidate); +void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, + bool update_bdev); extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); /* drivers/char/random.c */ -- cgit v1.2.3 From de09077c89183cbc627d9393706343662da7f5a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Sep 2020 17:57:48 +0200 Subject: block: remove revalidate_disk() Remove the now unused helper. 
Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Acked-by: Song Liu Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 02a73198b289..c618b27292fc 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -370,7 +370,6 @@ extern void blk_unregister_region(dev_t devt, unsigned long range); int register_blkdev(unsigned int major, const char *name); void unregister_blkdev(unsigned int major, const char *name); -int revalidate_disk(struct gendisk *disk); void revalidate_disk_size(struct gendisk *disk, bool verbose); int check_disk_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); -- cgit v1.2.3 From 51db1c37ee166159c5753ce8d64d6bacf113e0f0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 19 Aug 2020 23:20:19 +0800 Subject: blk-mq: Rename BLK_MQ_F_TAG_SHARED as BLK_MQ_F_TAG_QUEUE_SHARED BLK_MQ_F_TAG_SHARED actually means that tags is shared among request queues, all of which should belong to LUNs attached to same HBA. So rename it to make the point explicitly. 
[jpg: rebase a few times, add rnbd-clt.c change] Suggested-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: John Garry Tested-by: Douglas Gilbert Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 21a02e0577dd..982c4f92b63c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -378,7 +378,7 @@ struct blk_mq_ops { enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, - BLK_MQ_F_TAG_SHARED = 1 << 1, + BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for * completing IO: -- cgit v1.2.3 From 32bc15afed04bd73e29d713d8db47818d6aa89af Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 19 Aug 2020 23:20:24 +0800 Subject: blk-mq: Facilitate a shared sbitmap per tagset Some SCSI HBAs (such as HPSA, megaraid, mpt3sas, hisi_sas_v3 ..) support multiple reply queues with single hostwide tags. In addition, these drivers want to use interrupt assignment in pci_alloc_irq_vectors(PCI_IRQ_AFFINITY). However, as discussed in [0], CPU hotplug may cause in-flight IO completion to not be serviced when an interrupt is shutdown. That problem is solved in commit bf0beec0607d ("blk-mq: drain I/O when all CPUs in a hctx are offline"). However, to take advantage of that blk-mq feature, the HBA HW queues are required to be mapped to that of the blk-mq hctx's; to do that, the HBA HW queues need to be exposed to the upper layer. In making that transition, the per-SCSI command request tags are no longer unique per Scsi host - they are just unique per hctx. As such, the HBA LLDD would have to generate this tag internally, which has a certain performance overhead. However another problem is that blk-mq assumes the host may accept (Scsi_host.can_queue * #hw queue) commands. 
In commit 6eb045e092ef ("scsi: core: avoid host-wide host_busy counter for scsi_mq"), the Scsi host busy counter was removed, which would stop the LLDD being sent more than .can_queue commands; however, it should still be ensured that the block layer does not issue more than .can_queue commands to the Scsi host. To solve this problem, introduce a shared sbitmap per blk_mq_tag_set, which may be requested at init time. New flag BLK_MQ_F_TAG_HCTX_SHARED should be set when requesting the tagset to indicate whether the shared sbitmap should be used. Even when BLK_MQ_F_TAG_HCTX_SHARED is set, a full set of tags and requests are still allocated per hctx; the reason for this is that if tags and requests were only allocated for a single hctx - like hctx0 - it may break block drivers which expect a request be associated with a specific hctx, i.e. not always hctx0. This will introduce extra memory usage. This change is based on work originally from Ming Lei in [1] and from Bart's suggestion in [2]. [0] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/ [1] https://lore.kernel.org/linux-block/20190531022801.10003-1-ming.lei@redhat.com/ [2] https://lore.kernel.org/linux-block/ff77beff-5fd9-9f05-12b6-826922bace1f@huawei.com/T/#m3db0a602f095cbcbff27e9c884d6b4ae826144be Signed-off-by: John Garry Tested-by: Don Brace #SCSI resv cmds patches used Tested-by: Douglas Gilbert Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 982c4f92b63c..df7b903ce7ae 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -231,6 +231,9 @@ enum hctx_type { * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. 
+ * @__bitmap_tags: A shared tags sbitmap, used over all hctx's + * @__breserved_tags: + * A shared reserved tags sbitmap, used over all hctx's * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. * @tag_list_lock: Serializes tag_list accesses. @@ -250,6 +253,8 @@ struct blk_mq_tag_set { unsigned int flags; void *driver_data; + struct sbitmap_queue __bitmap_tags; + struct sbitmap_queue __breserved_tags; struct blk_mq_tags **tags; struct mutex tag_list_lock; @@ -384,6 +389,7 @@ enum { * completing IO: */ BLK_MQ_F_STACKING = 1 << 2, + BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, -- cgit v1.2.3 From bccf5e26d99c28980bd6ced474422a1b18402263 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 19 Aug 2020 23:20:26 +0800 Subject: blk-mq: Record nr_active_requests per queue for when using shared sbitmap The per-hctx nr_active value can no longer be used to fairly assign a share of tag depth per request queue for when using a shared sbitmap, as it does not consider that the tags are shared tags over all hctx's. For this case, record the nr_active_requests per request_queue, and make the judgement based on that value. 
Co-developed-with: Kashyap Desai Signed-off-by: John Garry Tested-by: Don Brace #SCSI resv cmds patches used Tested-by: Douglas Gilbert Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d0d61bc81615..6277aee2aeaa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -485,6 +485,8 @@ struct request_queue { struct timer_list timeout; struct work_struct timeout_work; + atomic_t nr_active_requests_shared_sbitmap; + struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); -- cgit v1.2.3 From f1b49fdc1c64db110aa1315831e5fe0f8599fa56 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 19 Aug 2020 23:20:27 +0800 Subject: blk-mq: Record active_queues_shared_sbitmap per tag_set for when using shared sbitmap For when using a shared sbitmap, no longer should the number of active request queues per hctx be relied on for when judging how to share the tag bitmap. Instead maintain the number of active request queues per tag_set, and make the judgement based on that. 
Originally-from: Kashyap Desai Signed-off-by: John Garry Tested-by: Don Brace #SCSI resv cmds patches used Tested-by: Douglas Gilbert Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + include/linux/blkdev.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index df7b903ce7ae..8279c807e1f3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -252,6 +252,7 @@ struct blk_mq_tag_set { unsigned int timeout; unsigned int flags; void *driver_data; + atomic_t active_queues_shared_sbitmap; struct sbitmap_queue __bitmap_tags; struct sbitmap_queue __breserved_tags; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6277aee2aeaa..7d82959e7b86 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -618,6 +618,7 @@ struct request_queue { #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ #define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ +#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP)) -- cgit v1.2.3 From b445547ec1bbd3e7bf4b1c142550942f70527d95 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Wed, 19 Aug 2020 23:20:28 +0800 Subject: blk-mq, elevator: Count requests per hctx to improve performance High CPU utilization on "native_queued_spin_lock_slowpath" due to lock contention is possible for mq-deadline and bfq IO schedulers when nr_hw_queues is more than one. It is because kblockd work queue can submit IO from all online CPUs (through blk_mq_run_hw_queues()) even though only one hctx has pending commands. The elevator callback .has_work for mq-deadline and bfq scheduler considers pending work if there are any IOs on request queue but it does not account hctx context. 
Add a per-hctx 'elevator_queued' count to the hctx to avoid triggering the elevator even though there are no requests queued. [jpg: Relocated atomic_dec() in dd_dispatch_request(), update commit message per Kashyap] Signed-off-by: Kashyap Desai Signed-off-by: Hannes Reinecke Signed-off-by: John Garry Tested-by: Douglas Gilbert Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8279c807e1f3..b23eeca4d677 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -139,6 +139,10 @@ struct blk_mq_hw_ctx { * shared across request queues. */ atomic_t nr_active; + /** + * @elevator_queued: Number of queued requests on hctx. + */ + atomic_t elevator_queued; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; -- cgit v1.2.3 From 384d87ef2c954fc58e6c5fd8253e4a1984f5fe02 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Sep 2020 10:58:52 +0200 Subject: block: Do not discard buffers under a mounted filesystem Discarding blocks and buffers under a mounted filesystem is hardly anything admin wants to do. Usually it will confuse the filesystem and sometimes the loss of buffer_head state (including b_private field) can even cause crashes like: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 PGD 0 P4D 0 Oops: 0002 [#1] SMP PTI CPU: 4 PID: 203778 Comm: jbd2/dm-3-8 Kdump: loaded Tainted: G O --------- - - 4.18.0-147.5.0.5.h126.eulerosv2r9.x86_64 #1 Hardware name: Huawei RH2288H V3/BC11HGSA0, BIOS 1.57 08/11/2015 RIP: 0010:jbd2_journal_grab_journal_head+0x1b/0x40 [jbd2] ... Call Trace: __jbd2_journal_insert_checkpoint+0x23/0x70 [jbd2] jbd2_journal_commit_transaction+0x155f/0x1b60 [jbd2] kjournald2+0xbd/0x270 [jbd2] So if we don't have block device open with O_EXCL already, claim the block device while we truncate buffer cache. 
This makes sure any exclusive block device user (such as filesystem) cannot operate on the device while we are discarding buffer cache. Reported-by: Ye Bin Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig [axboe: fix !CONFIG_BLOCK error in truncate_bdev_range()] Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7d82959e7b86..37ec5a73d027 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1987,11 +1987,18 @@ void bdput(struct block_device *); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, + loff_t lend); int sync_blockdev(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { } +static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode, + loff_t lstart, loff_t lend) +{ + return 0; +} static inline int sync_blockdev(struct block_device *bdev) { return 0; -- cgit v1.2.3 From 95f6f3a46fc4ee1a2b216a6b46bdf2b450f1877f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Sep 2020 16:53:29 +0200 Subject: block: add a bdev_check_media_change helper Like check_disk_change, except that it does not call ->revalidate_disk but leaves that to the caller.
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c618b27292fc..322d48a20772 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -315,7 +315,6 @@ extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, bool update_bdev); -extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; @@ -372,6 +371,7 @@ void unregister_blkdev(unsigned int major, const char *name); void revalidate_disk_size(struct gendisk *disk, bool verbose); int check_disk_change(struct block_device *bdev); +bool bdev_check_media_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors); -- cgit v1.2.3 From fec2cf607ba9305770436b1e5c485963a9f0a7bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Sep 2020 16:53:42 +0200 Subject: ide-gd: stop using the disk events mechanism ide-gd is only using the disk events mechanism to be able to force an invalidation and partition scan on opening removable media. Just open code the logic without involving the block layer.
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/ide.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/ide.h b/include/linux/ide.h index a254841bd315..62653769509f 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -490,8 +490,6 @@ enum { IDE_DFLAG_NOPROBE = BIT(9), /* need to do check_media_change() */ IDE_DFLAG_REMOVABLE = BIT(10), - /* needed for removable devices */ - IDE_DFLAG_ATTACH = BIT(11), IDE_DFLAG_FORCED_GEOM = BIT(12), /* disallow setting unmask bit */ IDE_DFLAG_NO_UNMASK = BIT(13), -- cgit v1.2.3 From b92b53079aedbfb56bbb9ea360e5119fb563a2a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Sep 2020 16:53:47 +0200 Subject: block: remove check_disk_change Remove the now unused check_disk_change helper. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 322d48a20772..1c97cf84f011 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -370,7 +370,6 @@ int register_blkdev(unsigned int major, const char *name); void unregister_blkdev(unsigned int major, const char *name); void revalidate_disk_size(struct gendisk *disk, bool verbose); -int check_disk_change(struct block_device *bdev); bool bdev_check_media_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors); -- cgit v1.2.3 From 7b26410b05f8c262688de8a689ba8e5d0c3cff01 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 31 Aug 2020 15:27:23 -0700 Subject: block: introduce part_[begin|end]_io_acct These functions can be used to enable iostat for partitions on devices like md, bcache. 
Signed-off-by: Song Liu Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 37ec5a73d027..5bd96fbab9b4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1933,6 +1933,11 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); +unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, + struct bio *bio); +void part_end_io_acct(struct hd_struct *part, struct bio *bio, + unsigned long start_time); + /** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for -- cgit v1.2.3 From 07d098e6bbad04030dab5b3e64149601fcb063ce Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 21 Sep 2020 22:32:49 -0400 Subject: block: allow 'chunk_sectors' to be non-power-of-2 It is possible, albeit more unlikely, for a block device to have a non power-of-2 for chunk_sectors (e.g. 10+2 RAID6 with 128K chunk_sectors, which results in a full-stripe size of 1280K. This causes the RAID6's io_opt to be advertised as 1280K, and a stacked device _could_ then be made to use a blocksize, aka chunk_sectors, that matches non power-of-2 io_opt of underlying RAID6 -- resulting in stacked device's chunk_sectors being a non power-of-2). Update blk_queue_chunk_sectors() and blk_max_size_offset() to accommodate drivers that need a non power-of-2 chunk_sectors. Reviewed-by: Ming Lei Reviewed-by: Martin K. 
Petersen Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5bd96fbab9b4..6e19a7aa1672 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1063,11 +1063,17 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, static inline unsigned int blk_max_size_offset(struct request_queue *q, sector_t offset) { - if (!q->limits.chunk_sectors) + unsigned int chunk_sectors = q->limits.chunk_sectors; + + if (!chunk_sectors) return q->limits.max_sectors; - return min(q->limits.max_sectors, (unsigned int)(q->limits.chunk_sectors - - (offset & (q->limits.chunk_sectors - 1)))); + if (likely(is_power_of_2(chunk_sectors))) + chunk_sectors -= offset & (chunk_sectors - 1); + else + chunk_sectors -= sector_div(offset, chunk_sectors); + + return min(q->limits.max_sectors, chunk_sectors); } static inline unsigned int blk_rq_get_max_sectors(struct request *rq, -- cgit v1.2.3 From 38430f0876fa8b9549ec434f569dce03e057c076 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Sep 2020 09:19:45 +0200 Subject: block: move the NEED_PART_SCAN flag to struct gendisk We can only scan for partitions on the whole disk, so move the flag from struct block_device to struct gendisk. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 +--- include/linux/genhd.h | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6ffa783e1633..eb20e28184ab 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -19,8 +19,6 @@ struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; -#define BDEV_NEED_PART_SCAN 0 - struct block_device { dev_t bd_dev; int bd_openers; @@ -39,7 +37,7 @@ struct block_device { struct hd_struct * bd_part; /* number of times partitions within this device have been opened. */ unsigned bd_part_count; - unsigned long bd_flags; + spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; struct backing_dev_info *bd_bdi; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 1c97cf84f011..38f23d757013 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -191,6 +191,8 @@ struct gendisk { void *private_data; int flags; + unsigned long state; +#define GD_NEED_PART_SCAN 0 struct rw_semaphore lookup_sem; struct kobject *slave_dir; -- cgit v1.2.3 From bb3247a399801ebba20bef101c89e563f5fe7f02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Sep 2020 09:19:55 +0200 Subject: PM: rewrite is_hibernate_resume_dev to not require an inode Just check the dev_t to help simplifying the code. Signed-off-by: Christoph Hellwig Acked-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Signed-off-by: Jens Axboe --- include/linux/suspend.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index cb9afad82a90..8af13ba60c7e 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -473,9 +473,9 @@ static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) { #endif /* CONFIG_HIBERNATION */ #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV -int is_hibernate_resume_dev(const struct inode *); +int is_hibernate_resume_dev(dev_t dev); #else -static inline int is_hibernate_resume_dev(const struct inode *i) { return 0; } +static inline int is_hibernate_resume_dev(dev_t dev) { return 0; } #endif /* Hibernation and suspend events */ -- cgit v1.2.3 From 21bd900572f3708e281ea25f051fc92462eb1193 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Sep 2020 09:19:56 +0200 Subject: mm: split swap_type_of swap_type_of is used for two entirely different purposes: (1) check what swap type a given device/offset corresponds to (2) find the first available swap device that can be written to Mixing both in a single function creates an unreadable mess. Create two separate functions instead, and switch both to pass a dev_t instead of a struct block_device to further simplify the code. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 661046994db4..4340a7b6e7a1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -467,7 +467,8 @@ extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern int free_swap_and_cache(swp_entry_t); -extern int swap_type_of(dev_t, sector_t, struct block_device **); +int swap_type_of(dev_t device, sector_t offset); +int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); -- cgit v1.2.3 From 1fb1a2ad75e33e646d33e42b9ed17d879d472859 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Sep 2020 09:19:58 +0200 Subject: block: mark blkdev_get static There are no users outside the core block code left now. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6e19a7aa1672..be5ef6f4ba19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1981,7 +1981,6 @@ void blkdev_show(struct seq_file *seqf, off_t offset); #define BLKDEV_MAJOR_MAX 0 #endif -int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder); struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); -- cgit v1.2.3 From 402dd2cf46b177be5bcb138b7d7fd8f38aa130e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Sep 2020 08:51:28 +0200 Subject: fs: remove the unused SB_I_MULTIROOT flag The last user of SB_I_MULTIROOT disappeared with commit f2aedb713c28 ("NFS: Add fs_context support.") Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7519ae003a08..fbd74df5ce5f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1385,7 +1385,6 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ -#define SB_I_MULTIROOT 0x00000008 /* Multiple roots to the dentry tree */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ -- cgit v1.2.3 From c2e4cd57cfa1f627b786c764d185fff85fd12be9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Sep 2020 08:51:34 +0200 Subject: block: lift setting the readahead size into the block layer Drivers shouldn't
really mess with the readahead size, as that is a VM concept. Instead set it based on the optimal I/O size by lifting the algorithm from the md driver when registering the disk. Also set bdi->io_pages there as well by applying the same scheme based on max_sectors. To ensure the limits work well for stacking drivers a new helper is added to update the readahead limits from the block limits, which is also called from disk_stack_limits. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Jan Kara Reviewed-by: Mike Snitzer Reviewed-by: Martin K. Petersen Acked-by: Coly Li Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index be5ef6f4ba19..282f5ca424f1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1140,6 +1140,7 @@ extern void blk_queue_max_zone_append_sectors(struct request_queue *q, extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); +void blk_queue_update_readahead(struct request_queue *q); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); -- cgit v1.2.3 From ed7b6b4f6e915cb0bc52d0000bcc63168867b6ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Sep 2020 08:51:35 +0200 Subject: bdi: remove BDI_CAP_CGROUP_WRITEBACK Just checking SB_I_CGROUPWB for cgroup writeback support is enough. Either the file system allocates its own bdi (e.g. btrfs), in which case it is known to support cgroup writeback, or the bdi comes from the block layer, which always supports cgroup writeback. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0b06b2d26c9a..52583b6f2ea0 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -123,7 +123,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. * - * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be * inefficient. */ @@ -233,9 +232,9 @@ int inode_congested(struct inode *inode, int cong_bits); * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest * - * cgroup writeback requires support from both the bdi and filesystem. - * Also, both memcg and iocg have to be on the default hierarchy. Test - * whether all conditions are met. + * Cgroup writeback requires support from the filesystem. Also, both memcg and + * iocg have to be on the default hierarchy. Test whether all conditions are + * met. * * Note that the test result may change dynamically on the same inode * depending on how memcg and iocg are configured. 
@@ -247,7 +246,6 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return cgroup_subsys_on_dfl(memory_cgrp_subsys) && cgroup_subsys_on_dfl(io_cgrp_subsys) && bdi_cap_account_dirty(bdi) && - (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && (inode->i_sb->s_iflags & SB_I_CGROUPWB); } -- cgit v1.2.3 From a8b456d01cd6b37191f14248f3e2bdbe5ce3a89e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Sep 2020 08:51:36 +0200 Subject: bdi: remove BDI_CAP_SYNCHRONOUS_IO BDI_CAP_SYNCHRONOUS_IO is only checked in the swap code, and used to decide if ->rw_page can be used on a block device. Just check for the method instead. The only complication is that zram needs a second set of block_device_operations as it can switch between modes that actually support ->rw_page and those that don't. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 52583b6f2ea0..860ea33571bc 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -122,9 +122,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_NO_WRITEBACK: Don't write pages back * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. - * - * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be - * inefficient.
*/ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -132,7 +129,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 #define BDI_CAP_CGROUP_WRITEBACK 0x00000020 -#define BDI_CAP_SYNCHRONOUS_IO 0x00000040 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) @@ -174,11 +170,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(int sync, long timeout); -static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi) -{ - return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO; -} - static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) { return bdi->capabilities & BDI_CAP_STABLE_WRITES; -- cgit v1.2.3 From 1cb039f3dc1619eb795c54aad0a98fdb379b4237 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Sep 2020 08:51:38 +0200 Subject: bdi: replace BDI_CAP_STABLE_WRITES with a queue and a sb flag The BDI_CAP_STABLE_WRITES is one of the few bits of information in the backing_dev_info shared between the block drivers and the writeback code. To help untangling the dependency replace it with a queue flag and a superblock flag derived from it. This also helps with the case of e.g. a file system requiring stable writes due to its own checksumming, but not forcing it on other users of the block device like the swap code. One downside is that we can't support the stable_pages_required bdi attribute in sysfs anymore. It is replaced with a queue attribute which also is writable for easier testing.
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 6 ------ include/linux/blkdev.h | 3 +++ include/linux/fs.h | 1 + 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 860ea33571bc..5da4ea3dd0cc 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -126,7 +126,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 #define BDI_CAP_NO_ACCT_WB 0x00000004 -#define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 #define BDI_CAP_CGROUP_WRITEBACK 0x00000020 @@ -170,11 +169,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(int sync, long timeout); -static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) -{ - return bdi->capabilities & BDI_CAP_STABLE_WRITES; -} - static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) { return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 282f5ca424f1..8e77f12de522 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -606,6 +606,7 @@ struct request_queue { #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_DEAD 13 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ +#define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ #define QUEUE_FLAG_WC 17 /* Write back caching */ #define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ @@ -635,6 +636,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define 
blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) +#define blk_queue_stable_writes(q) \ + test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) diff --git a/include/linux/fs.h b/include/linux/fs.h index fbd74df5ce5f..222465b7cf41 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1385,6 +1385,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ +#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ -- cgit v1.2.3 From 823423ef55f4d9c470b1edc9c5b5c93d06abfaae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Sep 2020 08:51:39 +0200 Subject: bdi: invert BDI_CAP_NO_ACCT_WB Replace BDI_CAP_NO_ACCT_WB with a positive BDI_CAP_WRITEBACK_ACCT to make the checks more obvious. Also remove the pointless bdi_cap_account_writeback wrapper that just obfuscates the check.
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5da4ea3dd0cc..b217344a2c63 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -120,17 +120,17 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting * BDI_CAP_NO_WRITEBACK: Don't write pages back - * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages + * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 -#define BDI_CAP_NO_ACCT_WB 0x00000004 +#define BDI_CAP_WRITEBACK_ACCT 0x00000004 #define BDI_CAP_STRICTLIMIT 0x00000010 #define BDI_CAP_CGROUP_WRITEBACK 0x00000020 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ - (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) + (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY) extern struct backing_dev_info noop_backing_dev_info; @@ -179,13 +179,6 @@ static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi) return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY); } -static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) -{ - /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */ - return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB | - BDI_CAP_NO_WRITEBACK)); -} - static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(inode_to_bdi(mapping->host)); -- cgit v1.2.3 From f56753ac2a90810726334df04d735e9f8f5a32d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Sep 2020 08:51:40 +0200 Subject: bdi: replace 
BDI_CAP_NO_{WRITEBACK,ACCT_DIRTY} with a single flag Replace the two negative flags that are always used together with a single positive flag that indicates the writeback capability instead of two related non-capabilities. Also remove the pointless wrappers to just check the flag. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 48 ++++++++++----------------------------------- 1 file changed, 10 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index b217344a2c63..44df4fcef65c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -110,27 +110,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); /* * Flags in backing_dev_info::capability * - * The first three flags control whether dirty pages will contribute to the - * VM's accounting and whether writepages() should be called for dirty pages - * (something that would not, for example, be appropriate for ramfs) - * - * WARNING: these flags are closely related and should not normally be - * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these -