From 880b9577855edddda1e732748e849c63199d489b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 17 Jul 2023 09:00:09 -0700 Subject: fs: distinguish between user initiated freeze and kernel initiated freeze Userspace can freeze a filesystem using the FIFREEZE ioctl or by suspending the block device; this state persists until userspace thaws the filesystem with the FITHAW ioctl or resuming the block device. Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for the fsfreeze ioctl") we only allow the first freeze command to succeed. The kernel may decide that it is necessary to freeze a filesystem for its own internal purposes, such as suspends in progress, filesystem fsck activities, or quiescing a device prior to removal. Userspace thaw commands must never break a kernel freeze, and kernel thaw commands shouldn't undo userspace's freeze command. Introduce a couple of freeze holder flags and wire it into the sb_writers state. One kernel and one userspace freeze are allowed to coexist at the same time; the filesystem will not thaw until both are lifted. I wonder if the f2fs/gfs2 code should be using a kernel freeze here, but for now we'll use FREEZE_HOLDER_USERSPACE to preserve existing behaviors. Cc: mcgrof@kernel.org Cc: jack@suse.cz Cc: hch@infradead.org Cc: ruansy.fnst@fujitsu.com Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Jan Kara --- block/bdev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index 979e28a46b98..80ea3fa3593b 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -248,9 +248,9 @@ int freeze_bdev(struct block_device *bdev) if (!sb) goto sync; if (sb->s_op->freeze_super) - error = sb->s_op->freeze_super(sb); + error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); else - error = freeze_super(sb); + error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); deactivate_super(sb); if (error) { @@ -291,9 +291,9 @@ int thaw_bdev(struct block_device *bdev) goto out; if (sb->s_op->thaw_super) - error = sb->s_op->thaw_super(sb); + error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); else - error = thaw_super(sb); + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); if (error) bdev->bd_fsfreeze_count++; else -- cgit v1.2.3 From ab6860f62bfe329e11e5b5b7295b673c9c3a62d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:19 +0200 Subject: block: simplify the disk_force_media_change interface Hard code the events to DISK_EVENT_MEDIA_CHANGE as that is the only useful use case, and drop the superfluous return value. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-9-hch@lst.de> Signed-off-by: Christian Brauner --- block/disk-events.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/disk-events.c b/block/disk-events.c index 0cfac464e6d1..6189b819b235 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -294,25 +294,18 @@ EXPORT_SYMBOL(disk_check_media_change); * @disk: the disk which will raise the event * @events: the events to raise * - * Generate uevents for the disk. If DISK_EVENT_MEDIA_CHANGE is present, - * attempt to free all dentries and inodes and invalidates all block + * Should be called when the media changes for @disk. Generates a uevent + * and attempts to free all dentries and inodes and invalidates all block * device page cache entries in that case. - * - * Returns %true if DISK_EVENT_MEDIA_CHANGE was raised, or %false if not. */ -bool disk_force_media_change(struct gendisk *disk, unsigned int events) +void disk_force_media_change(struct gendisk *disk) { - disk_event_uevent(disk, events); - - if (!(events & DISK_EVENT_MEDIA_CHANGE)) - return false; - + disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); inc_diskseq(disk); if (__invalidate_device(disk->part0, true)) pr_warn("VFS: busy inodes on changed media %s\n", disk->disk_name); set_bit(GD_NEED_PART_SCAN, &disk->state); - return true; } EXPORT_SYMBOL_GPL(disk_force_media_change); -- cgit v1.2.3 From 127a5093c79d9cdea9e6edc016ab346e6dd16c23 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:23 +0200 Subject: block: drop the "busy inodes on changed media" log message This message isn't exactly helpful, and file systems already print way more useful messages when shut down while active. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-13-hch@lst.de> Signed-off-by: Christian Brauner --- block/disk-events.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/disk-events.c b/block/disk-events.c index 6189b819b235..6b858d350477 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -281,9 +281,7 @@ bool disk_check_media_change(struct gendisk *disk) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return false; - if (__invalidate_device(disk->part0, true)) - pr_warn("VFS: busy inodes on changed media %s\n", - disk->disk_name); + __invalidate_device(disk->part0, true); set_bit(GD_NEED_PART_SCAN, &disk->state); return true; } @@ -302,9 +300,7 @@ void disk_force_media_change(struct gendisk *disk) { disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); inc_diskseq(disk); - if (__invalidate_device(disk->part0, true)) - pr_warn("VFS: busy inodes on changed media %s\n", - disk->disk_name); + __invalidate_device(disk->part0, true); set_bit(GD_NEED_PART_SCAN, &disk->state); } EXPORT_SYMBOL_GPL(disk_force_media_change); -- cgit v1.2.3 From 560e20e4bf6484a0c12f9f3c7a1aa55056948e1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:24 +0200 Subject: block: consolidate __invalidate_device and fsync_bdev We currently have two interfaces that take a block_devices and the find a mounted file systems to flush or invaldidate data on it. Both are a bit problematic because they only work for the "main" block devices that is used as s_dev for the super_block, and because they don't call into the file system at all. Merge the two into a new bdev_mark_dead helper that does both the syncing and invalidation and which is properly documented. This is in preparation of merging the functionality into the ->mark_dead holder operation so that it will work on additional block devices used by a file systems and give us a single entry point for invalidation of dead devices or media. Note that a single standalone fsync_bdev call for an obscure ioctl remains for now, but that one will also be deal with in a bit. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-14-hch@lst.de> Signed-off-by: Christian Brauner --- block/bdev.c | 33 ++++++++++++++++++++++++++++----- block/disk-events.c | 4 ++-- block/genhd.c | 3 +-- block/partitions/core.c | 5 +---- 4 files changed, 32 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index 979e28a46b98..074a9ffa9d8b 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -221,7 +221,6 @@ int fsync_bdev(struct block_device *bdev) } return sync_blockdev(bdev); } -EXPORT_SYMBOL(fsync_bdev); /** * freeze_bdev - lock a filesystem and force it into a consistent state @@ -960,12 +959,27 @@ out_path_put: } EXPORT_SYMBOL(lookup_bdev); -int __invalidate_device(struct block_device *bdev, bool kill_dirty) +/** + * bdev_mark_dead - mark a block device as dead + * @bdev: block device to operate on + * @surprise: indicate a surprise removal + * + * Tell the file system that this devices or media is dead. If @surprise is set + * to %true the device or media is already gone, if not we are preparing for an + * orderly removal. + * + * This syncs out all dirty data and writes back inodes and then invalidates any + * cached data in the inodes on the file system, the inodes themselves and the + * block device mapping. + */ +void bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb = get_super(bdev); int res = 0; if (sb) { + if (!surprise) + sync_filesystem(sb); /* * no need to lock the super, get_super holds the * read mutex so the filesystem cannot go away @@ -973,13 +987,22 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) * hold). */ shrink_dcache_sb(sb); - res = invalidate_inodes(sb, kill_dirty); + res = invalidate_inodes(sb, true); drop_super(sb); + } else { + if (!surprise) + sync_blockdev(bdev); } invalidate_bdev(bdev); - return res; } -EXPORT_SYMBOL(__invalidate_device); +#ifdef CONFIG_DASD_MODULE +/* + * Drivers should not use this directly, but the DASD driver has historically + * had a shutdown to offline mode that doesn't actually remove the gendisk + * that otherwise looks a lot like a safe device removal. + */ +EXPORT_SYMBOL_GPL(bdev_mark_dead); +#endif void sync_bdevs(bool wait) { diff --git a/block/disk-events.c b/block/disk-events.c index 6b858d350477..422db8292d09 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -281,7 +281,7 @@ bool disk_check_media_change(struct gendisk *disk) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return false; - __invalidate_device(disk->part0, true); + bdev_mark_dead(disk->part0, true); set_bit(GD_NEED_PART_SCAN, &disk->state); return true; } @@ -300,7 +300,7 @@ void disk_force_media_change(struct gendisk *disk) { disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); inc_diskseq(disk); - __invalidate_device(disk->part0, true); + bdev_mark_dead(disk->part0, true); set_bit(GD_NEED_PART_SCAN, &disk->state); } EXPORT_SYMBOL_GPL(disk_force_media_change); diff --git a/block/genhd.c b/block/genhd.c index 3d287b32d50d..afc2cb09eb94 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -647,8 +647,7 @@ void del_gendisk(struct gendisk *disk) mutex_lock(&disk->open_mutex); xa_for_each(&disk->part_tbl, idx, part) { remove_inode_hash(part->bd_inode); - fsync_bdev(part); - __invalidate_device(part, true); + bdev_mark_dead(part, false); } mutex_unlock(&disk->open_mutex); diff --git a/block/partitions/core.c b/block/partitions/core.c index 13a7341299a9..e137a87f4db0 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -281,10 +281,7 @@ static void delete_partition(struct block_device *part) * looked up any more even when openers still hold references. */ remove_inode_hash(part->bd_inode); - - fsync_bdev(part); - __invalidate_device(part, true); - + bdev_mark_dead(part, false); drop_partition(part); } -- cgit v1.2.3 From d8530de5a6e82be0ce17a5fdf727a394bcf6444c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:25 +0200 Subject: block: call into the file system for bdev_mark_dead Combine the newly merged bdev_mark_dead helper with the existing mark_dead holder operation so that all operations that invalidate a device that is dead or being removed now go through the holder ops. This allows file systems to explicitly shutdown either ASAP (for a surprise removal) or after writing back data (for an orderly removal), and do so not only for the main device. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-15-hch@lst.de> Signed-off-by: Christian Brauner --- block/bdev.c | 30 +++++++++--------------------- block/genhd.c | 44 ++++++++++++++++++++++++-------------------- 2 files changed, 33 insertions(+), 41 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index 074a9ffa9d8b..4d580083a114 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -968,31 +968,19 @@ EXPORT_SYMBOL(lookup_bdev); * to %true the device or media is already gone, if not we are preparing for an * orderly removal. * - * This syncs out all dirty data and writes back inodes and then invalidates any - * cached data in the inodes on the file system, the inodes themselves and the - * block device mapping. + * This calls into the file system, which then typicall syncs out all dirty data + * and writes back inodes and then invalidates any cached data in the inodes on + * the file system. In addition we also invalidate the block device mapping. */ void bdev_mark_dead(struct block_device *bdev, bool surprise) { - struct super_block *sb = get_super(bdev); - int res = 0; + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) + bdev->bd_holder_ops->mark_dead(bdev, surprise); + else + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_holder_lock); - if (sb) { - if (!surprise) - sync_filesystem(sb); - /* - * no need to lock the super, get_super holds the - * read mutex so the filesystem cannot go away - * under us (->put_super runs with the write lock - * hold). - */ - shrink_dcache_sb(sb); - res = invalidate_inodes(sb, true); - drop_super(sb); - } else { - if (!surprise) - sync_blockdev(bdev); - } invalidate_bdev(bdev); } #ifdef CONFIG_DASD_MODULE diff --git a/block/genhd.c b/block/genhd.c index afc2cb09eb94..cc32a0c704eb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -554,7 +554,7 @@ out_exit_elevator: } EXPORT_SYMBOL(device_add_disk); -static void blk_report_disk_dead(struct gendisk *disk) +static void blk_report_disk_dead(struct gendisk *disk, bool surprise) { struct block_device *bdev; unsigned long idx; @@ -565,10 +565,7 @@ static void blk_report_disk_dead(struct gendisk *disk) continue; rcu_read_unlock(); - mutex_lock(&bdev->bd_holder_lock); - if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) - bdev->bd_holder_ops->mark_dead(bdev); - mutex_unlock(&bdev->bd_holder_lock); + bdev_mark_dead(bdev, surprise); put_device(&bdev->bd_device); rcu_read_lock(); @@ -576,14 +573,7 @@ static void blk_report_disk_dead(struct gendisk *disk) rcu_read_unlock(); } -/** - * blk_mark_disk_dead - mark a disk as dead - * @disk: disk to mark as dead - * - * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O - * to this disk. - */ -void blk_mark_disk_dead(struct gendisk *disk) +static void __blk_mark_disk_dead(struct gendisk *disk) { /* * Fail any new I/O. @@ -603,8 +593,19 @@ void blk_mark_disk_dead(struct gendisk *disk) * Prevent new I/O from crossing bio_queue_enter(). */ blk_queue_start_drain(disk->queue); +} - blk_report_disk_dead(disk); +/** + * blk_mark_disk_dead - mark a disk as dead + * @disk: disk to mark as dead + * + * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O + * to this disk. + */ +void blk_mark_disk_dead(struct gendisk *disk) +{ + __blk_mark_disk_dead(disk); + blk_report_disk_dead(disk, true); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); @@ -641,17 +642,20 @@ void del_gendisk(struct gendisk *disk) disk_del_events(disk); /* - * Prevent new openers by unlinked the bdev inode, and write out - * dirty data before marking the disk dead and stopping all I/O. + * Prevent new openers by unlinked the bdev inode. */ mutex_lock(&disk->open_mutex); - xa_for_each(&disk->part_tbl, idx, part) { + xa_for_each(&disk->part_tbl, idx, part) remove_inode_hash(part->bd_inode); - bdev_mark_dead(part, false); - } mutex_unlock(&disk->open_mutex); - blk_mark_disk_dead(disk); + /* + * Tell the file system to write back all dirty data and shut down if + * it hasn't been notified earlier. + */ + if (!test_bit(GD_DEAD, &disk->state)) + blk_report_disk_dead(disk, false); + __blk_mark_disk_dead(disk); /* * Drop all partitions now that the disk is marked dead. -- cgit v1.2.3 From 2142b88c37a3e49fbca4a36b8674626917d9bf40 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:26 +0200 Subject: block: call into the file system for ioctl BLKFLSBUF BLKFLSBUF is a historic ioctl that is called on a file handle to a block device and syncs either the file system mounted on that block device if there is one, or otherwise the just the data on the block device. Replace the get_super based syncing with a holder operation to remove the last usage of get_super, and to also support syncing the file system if the block device is not the main block device stored in s_dev. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-16-hch@lst.de> Signed-off-by: Christian Brauner --- block/bdev.c | 16 ---------------- block/ioctl.c | 9 ++++++++- 2 files changed, 8 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index 4d580083a114..a20263fa27a4 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -206,22 +206,6 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) } EXPORT_SYMBOL(sync_blockdev_range); -/* - * Write out and wait upon all dirty data associated with this - * device. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int fsync_bdev(struct block_device *bdev) -{ - struct super_block *sb = get_super(bdev); - if (sb) { - int res = sync_filesystem(sb); - drop_super(sb); - return res; - } - return sync_blockdev(bdev); -} - /** * freeze_bdev - lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock diff --git a/block/ioctl.c b/block/ioctl.c index 3be11941fb2d..648670ddb164 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -364,7 +364,14 @@ static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd, { if (!capable(CAP_SYS_ADMIN)) return -EACCES; - fsync_bdev(bdev); + + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync) + bdev->bd_holder_ops->sync(bdev); + else + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_holder_lock); + invalidate_bdev(bdev); return 0; } -- cgit v1.2.3