diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-04 20:00:14 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-04 20:00:14 -0700 |
| commit | fa9db655d0e112c108fe838809608caf759bdf5e (patch) | |
| tree | 899a983b333871688095fd14b413c199b9a38f73 /drivers/md | |
| parent | e495274793ea602415d050452088a496abcd9e6c (diff) | |
| parent | bc792884b76f0da2f5c9a8d720e430e2de9756f5 (diff) | |
| download | linux-fa9db655d0e112c108fe838809608caf759bdf5e.tar.gz linux-fa9db655d0e112c108fe838809608caf759bdf5e.tar.bz2 linux-fa9db655d0e112c108fe838809608caf759bdf5e.zip | |
Merge tag 'for-5.20/block-2022-08-04' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
- NVMe pull requests via Christoph:
- add support for In-Band authentication (Hannes Reinecke)
- handle the persistent internal error AER (Michael Kelley)
- use in-capsule data for TCP I/O queue connect (Caleb Sander)
- remove timeout for getting RDMA-CM established event (Israel
Rukshin)
- misc cleanups (Joel Granados, Sagi Grimberg, Chaitanya Kulkarni,
Guixin Liu, Xiang wangx)
- use command_id instead of req->tag in trace_nvme_complete_rq()
(Bean Huo)
- various fixes for the new authentication code (Lukas Bulwahn,
Dan Carpenter, Colin Ian King, Chaitanya Kulkarni, Hannes
Reinecke)
- small cleanups (Liu Song, Christoph Hellwig)
- restore compat_ioctl support (Nick Bowler)
- make a nvmet-tcp workqueue lockdep-safe (Sagi Grimberg)
- enable generic interface (/dev/ngXnY) for unknown command sets
(Joel Granados, Christoph Hellwig)
- don't always build constants.o (Christoph Hellwig)
- print the command name of aborted commands (Christoph Hellwig)
- MD pull requests via Song:
- Improve raid5 lock contention, by Logan Gunthorpe.
- Misc fixes to raid5, by Logan Gunthorpe.
- Fix race condition with md_reap_sync_thread(), by Guoqing Jiang.
- Fix potential deadlock with raid5_quiesce and
raid5_get_active_stripe, by Logan Gunthorpe.
- Refactoring md_alloc(), by Christoph"
- Fix md disk_name lifetime problems, by Christoph Hellwig
- Convert prepare_to_wait() to wait_woken() api, by Logan
Gunthorpe;
- Fix sectors_to_do bitmap issue, by Logan Gunthorpe.
- Work on unifying the null_blk module parameters and configfs API
(Vincent)
- drbd bitmap IO error fix (Lars)
- Set of rnbd fixes (Guoqing, Md Haris)
- Remove experimental marker on bcache async device registration (Coly)
- Series from cleaning up the bio splitting (Christoph)
- Removal of the sx8 block driver. This hardware never really
widespread, and it didn't receive a lot of attention after the
initial merge of it back in 2005 (Christoph)
- A few fixes for s390 dasd (Eric, Jiang)
- Followup set of fixes for ublk (Ming)
- Support for UBLK_IO_NEED_GET_DATA for ublk (ZiyangZhang)
- Fixes for the dio dma alignment (Keith)
- Misc fixes and cleanups (Ming, Yu, Dan, Christophe
* tag 'for-5.20/block-2022-08-04' of git://git.kernel.dk/linux-block: (136 commits)
s390/dasd: Establish DMA alignment
s390/dasd: drop unexpected word 'for' in comments
ublk_drv: add support for UBLK_IO_NEED_GET_DATA
ublk_cmd.h: add one new ublk command: UBLK_IO_NEED_GET_DATA
ublk_drv: cleanup ublksrv_ctrl_dev_info
ublk_drv: add SET_PARAMS/GET_PARAMS control command
ublk_drv: fix ublk device leak in case that add_disk fails
ublk_drv: cancel device even though disk isn't up
block: fix leaking page ref on truncated direct io
block: ensure bio_iov_add_page can't fail
block: ensure iov_iter advances for added pages
drivers:md:fix a potential use-after-free bug
md/raid5: Ensure batch_last is released before sleeping for quiesce
md/raid5: Move stripe_request_ctx up
md/raid5: Drop unnecessary call to r5c_check_stripe_cache_usage()
md/raid5: Make is_inactive_blocked() helper
md/raid5: Refactor raid5_get_active_stripe()
block: pass struct queue_limits to the bio splitting helpers
block: move bio_allowed_max_sectors to blk-merge.c
block: move the call to get_max_io_size out of blk_bio_segment_split
...
Diffstat (limited to 'drivers/md')
| -rw-r--r-- | drivers/md/bcache/Kconfig | 2 | ||||
| -rw-r--r-- | drivers/md/dm-raid.c | 1 | ||||
| -rw-r--r-- | drivers/md/dm.c | 8 | ||||
| -rw-r--r-- | drivers/md/md-autodetect.c | 21 | ||||
| -rw-r--r-- | drivers/md/md-cluster.c | 4 | ||||
| -rw-r--r-- | drivers/md/md.c | 424 | ||||
| -rw-r--r-- | drivers/md/md.h | 19 | ||||
| -rw-r--r-- | drivers/md/raid10.c | 5 | ||||
| -rw-r--r-- | drivers/md/raid5-cache.c | 40 | ||||
| -rw-r--r-- | drivers/md/raid5-log.h | 77 | ||||
| -rw-r--r-- | drivers/md/raid5-ppl.c | 2 | ||||
| -rw-r--r-- | drivers/md/raid5.c | 727 | ||||
| -rw-r--r-- | drivers/md/raid5.h | 2 |
13 files changed, 804 insertions, 528 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index cf3e8096942a..529c9d04e9a4 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -29,7 +29,7 @@ config BCACHE_CLOSURES_DEBUG operations that get stuck. config BCACHE_ASYNC_REGISTRATION - bool "Asynchronous device registration (EXPERIMENTAL)" + bool "Asynchronous device registration" depends on BCACHE help Add a sysfs file /sys/fs/bcache/register_async. Writing registering diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1ec17c32867f..c640be453313 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3728,6 +3728,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 99642f69bfa7..28bd4a35b86b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1016,7 +1016,7 @@ static void dm_wq_requeue_work(struct work_struct *work) while (io) { struct dm_io *next = io->next; - dm_io_rewind(io, &md->queue->bio_split); + dm_io_rewind(io, &md->disk->bio_split); io->next = NULL; __dm_io_complete(io, false); @@ -1181,7 +1181,7 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector) * Does the target need to split IO even further? * - varied (per target) IO splitting is a tenet of DM; this * explains why stacked chunk_sectors based splitting via - * blk_queue_split() isn't possible here. + * bio_split_to_limits() isn't possible here. */ if (!ti->max_io_len) return len; @@ -1751,10 +1751,10 @@ static void dm_split_and_process_bio(struct mapped_device *md, is_abnormal = is_abnormal_io(bio); if (unlikely(is_abnormal)) { /* - * Use blk_queue_split() for abnormal IO (e.g. discard, etc) + * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc) * otherwise associated queue_limits won't be imposed. */ - blk_queue_split(&bio); + bio = bio_split_to_limits(bio); } init_clone_info(&ci, md, map, bio, is_abnormal); diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 2cf973722f59..91836e6de326 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -125,7 +125,6 @@ static void __init md_setup_drive(struct md_setup_args *args) char *devname = args->device_names; dev_t devices[MD_SB_DISKS + 1], mdev; struct mdu_array_info_s ainfo = { }; - struct block_device *bdev; struct mddev *mddev; int err = 0, i; char name[16]; @@ -169,24 +168,16 @@ static void __init md_setup_drive(struct md_setup_args *args) pr_info("md: Loading %s: %s\n", name, args->device_names); - bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { - pr_err("md: open failed - cannot start array %s\n", name); + mddev = md_alloc(mdev, name); + if (IS_ERR(mddev)) { + pr_err("md: md_alloc failed - cannot start array %s\n", name); return; } - err = -EIO; - if (WARN(bdev->bd_disk->fops != &md_fops, - "Opening block device %x resulted in non-md device\n", - mdev)) - goto out_blkdev_put; - - mddev = bdev->bd_disk->private_data; - err = mddev_lock(mddev); if (err) { pr_err("md: failed to lock array %s\n", name); - goto out_blkdev_put; + goto out_mddev_put; } if (!list_empty(&mddev->disks) || mddev->raid_disks) { @@ -230,8 +221,8 @@ static void __init md_setup_drive(struct md_setup_args *args) pr_warn("md: starting %s failed\n", name); out_unlock: mddev_unlock(mddev); -out_blkdev_put: - blkdev_put(bdev, FMODE_READ); +out_mddev_put: + mddev_put(mddev); } static int __init raid_setup(char *str) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 37cbcce3cc66..742b2349fea3 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -40,7 +40,7 @@ struct resync_info { /* Lock the send communication. This is done through * bit manipulation as opposed to a mutex in order to - * accomodate lock and hold. See next comment. + * accommodate lock and hold. See next comment. */ #define MD_CLUSTER_SEND_LOCK 4 /* If cluster operations (such as adding a disk) must lock the @@ -689,7 +689,7 @@ static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) /* * If resync thread run after raid1d thread, then process_metadata_update * could not continue if raid1d held reconfig_mutex (and raid1d is blocked - * since another node already got EX on Token and waitting the EX of Ack), + * since another node already got EX on Token and waiting the EX of Ack), * so let resync wake up thread in case flag is set. */ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, diff --git a/drivers/md/md.c b/drivers/md/md.c index 4df78e30b76a..afaf36b2f6ab 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -368,28 +368,6 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); -/* - * iterates through all used mddevs in the system. - * We take care to grab the all_mddevs_lock whenever navigating - * the list, and to always hold a refcount when unlocked. - * Any code which breaks out of this loop while own - * a reference to the current mddev and must mddev_put it. - */ -#define for_each_mddev(_mddev,_tmp) \ - \ - for (({ spin_lock(&all_mddevs_lock); \ - _tmp = all_mddevs.next; \ - _mddev = NULL;}); \ - ({ if (_tmp != &all_mddevs) \ - mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ - spin_unlock(&all_mddevs_lock); \ - if (_mddev) mddev_put(_mddev); \ - _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ - _tmp != &all_mddevs;}); \ - ({ spin_lock(&all_mddevs_lock); \ - _tmp = _tmp->next;}) \ - ) - /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -464,7 +442,7 @@ static void md_submit_bio(struct bio *bio) return; } - blk_queue_split(&bio); + bio = bio_split_to_limits(bio); if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) @@ -647,13 +625,17 @@ EXPORT_SYMBOL(md_flush_request); static inline struct mddev *mddev_get(struct mddev *mddev) { + lockdep_assert_held(&all_mddevs_lock); + + if (test_bit(MD_DELETED, &mddev->flags)) + return NULL; atomic_inc(&mddev->active); return mddev; } static void mddev_delayed_delete(struct work_struct *ws); -static void mddev_put(struct mddev *mddev) +void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; @@ -661,7 +643,7 @@ static void mddev_put(struct mddev *mddev) mddev->ctime == 0 && !mddev->hold_active) { /* Array is not configured at all, and not held active, * so destroy it */ - list_del_init(&mddev->all_mddevs); + set_bit(MD_DELETED, &mddev->flags); /* * Call queue_work inside the spinlock so that @@ -678,7 +660,6 @@ static void md_safemode_timeout(struct timer_list *t); void mddev_init(struct mddev *mddev) { - kobject_init(&mddev->kobj, &md_ktype); mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->bitmap_info.mutex); @@ -733,22 +714,6 @@ static dev_t mddev_alloc_unit(void) return dev; } -static struct mddev *mddev_find(dev_t unit) -{ - struct mddev *mddev; - - if (MAJOR(unit) != MD_MAJOR) - unit &= ~((1 << MdpMinorShift) - 1); - - spin_lock(&all_mddevs_lock); - mddev = mddev_find_locked(unit); - if (mddev) - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - - return mddev; -} - static struct mddev *mddev_alloc(dev_t unit) { struct mddev *new; @@ -791,6 +756,15 @@ out_free_new: return ERR_PTR(error); } +static void mddev_free(struct mddev *mddev) +{ + spin_lock(&all_mddevs_lock); + list_del(&mddev->all_mddevs); + spin_unlock(&all_mddevs_lock); + + kfree(mddev); +} + static const struct attribute_group md_redundancy_group; void mddev_unlock(struct mddev *mddev) @@ -3335,14 +3309,35 @@ rdev_size_show(struct md_rdev *rdev, char *page) return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } -static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) +static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) { /* check if two start/length pairs overlap */ - if (s1+l1 <= s2) - return 0; - if (s2+l2 <= s1) - return 0; - return 1; + if (a->data_offset + a->sectors <= b->data_offset) + return false; + if (b->data_offset + b->sectors <= a->data_offset) + return false; + return true; +} + +static bool md_rdev_overlaps(struct md_rdev *rdev) +{ + struct mddev *mddev; + struct md_rdev *rdev2; + + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev->flags)) + continue; + rdev_for_each(rdev2, mddev) { + if (rdev != rdev2 && rdev->bdev == rdev2->bdev && + md_rdevs_overlap(rdev, rdev2)) { + spin_unlock(&all_mddevs_lock); + return true; + } + } + } + spin_unlock(&all_mddevs_lock); + return false; } static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) @@ -3394,46 +3389,21 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) return -EINVAL; /* component must fit device */ rdev->sectors = sectors; - if (sectors > oldsectors && my_mddev->external) { - /* Need to check that all other rdevs with the same - * ->bdev do not overlap. 'rcu' is sufficient to walk - * the rdev lists safely. - * This check does not provide a hard guarantee, it - * just helps avoid dangerous mistakes. - */ - struct mddev *mddev; - int overlap = 0; - struct list_head *tmp; - - rcu_read_lock(); - for_each_mddev(mddev, tmp) { - struct md_rdev *rdev2; - rdev_for_each(rdev2, mddev) - if (rdev->bdev == rdev2->bdev && - rdev != rdev2 && - overlaps(rdev->data_offset, rdev->sectors, - rdev2->data_offset, - rdev2->sectors)) { - overlap = 1; - break; - } - if (overlap) { - mddev_put(mddev); - break; - } - } - rcu_read_unlock(); - if (overlap) { - /* Someone else could have slipped in a size - * change here, but doing so is just silly. - * We put oldsectors back because we *know* it is - * safe, and trust userspace not to race with - * itself - */ - rdev->sectors = oldsectors; - return -EBUSY; - } + /* + * Check that all other rdevs with the same bdev do not overlap. This + * check does not provide a hard guarantee, it just helps avoid + * dangerous mistakes. + */ + if (sectors > oldsectors && my_mddev->external && + md_rdev_overlaps(rdev)) { + /* + * Someone else could have slipped in a size change here, but + * doing so is just silly. We put oldsectors back because we + * know it is safe, and trust userspace not to race with itself. + */ + rdev->sectors = oldsectors; + return -EBUSY; } return len; } @@ -4830,6 +4800,19 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { + sector_t save_rp = mddev->reshape_position; + + mddev_unlock(mddev); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); + mddev_lock_nointr(mddev); + /* + * set RECOVERY_INTR again and restore reshape + * position in case others changed them after + * got lock, eg, reshape_position_store and + * md_check_recovery. + */ + mddev->reshape_position = save_rp; set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } @@ -5001,7 +4984,7 @@ static ssize_t sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; - if (mddev->curr_resync == 0) + if (mddev->curr_resync == MD_RESYNC_NONE) return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; @@ -5020,8 +5003,8 @@ sync_completed_show(struct mddev *mddev, char *page) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); - if (mddev->curr_resync == 1 || - mddev->curr_resync == 2) + if (mddev->curr_resync == MD_RESYNC_YIELDED || + mddev->curr_resync == MD_RESYNC_DELAYED) return sprintf(page, "delayed\n"); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || @@ -5532,11 +5515,10 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; spin_lock(&all_mddevs_lock); - if (list_empty(&mddev->all_mddevs)) { + if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } - mddev_get(mddev); spin_unlock(&all_mddevs_lock); rv = entry->show(mddev, page); @@ -5557,18 +5539,17 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, if (!capable(CAP_SYS_ADMIN)) return -EACCES; spin_lock(&all_mddevs_lock); - if (list_empty(&mddev->all_mddevs)) { + if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } - mddev_get(mddev); spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); mddev_put(mddev); return rv; } -static void md_free(struct kobject *ko) +static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); @@ -5577,15 +5558,8 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); - if (mddev->gendisk) { - del_gendisk(mddev->gendisk); - put_disk(mddev->gendisk); - } - percpu_ref_exit(&mddev->writes_pending); - - bioset_exit(&mddev->bio_set); - bioset_exit(&mddev->sync_set); - kfree(mddev); + del_gendisk(mddev->gendisk); + put_disk(mddev->gendisk); } static const struct sysfs_ops md_sysfs_ops = { @@ -5593,7 +5567,7 @@ static const struct sysfs_ops md_sysfs_ops = { .store = md_attr_store, }; static struct kobj_type md_ktype = { - .release = md_free, + .release = md_kobj_release, .sysfs_ops = &md_sysfs_ops, .default_groups = md_attr_groups, }; @@ -5604,7 +5578,6 @@ static void mddev_delayed_delete(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - kobject_del(&mddev->kobj); kobject_put(&mddev->kobj); } @@ -5623,7 +5596,7 @@ int mddev_init_writes_pending(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_init_writes_pending); -static int md_alloc(dev_t dev, char *name) +struct mddev *md_alloc(dev_t dev, char *name) { /* * If dev is zero, name is the name of a device to allocate with @@ -5651,8 +5624,8 @@ static int md_alloc(dev_t dev, char *name) mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); if (IS_ERR(mddev)) { - mutex_unlock(&disks_mutex); - return PTR_ERR(mddev); + error = PTR_ERR(mddev); + goto out_unlock; } partitioned = (MAJOR(mddev->unit) != MD_MAJOR); @@ -5670,7 +5643,7 @@ static int md_alloc(dev_t dev, char *name) strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); error = -EEXIST; - goto out_unlock_disks_mutex; + goto out_free_mddev; } spin_unlock(&all_mddevs_lock); } @@ -5683,7 +5656,7 @@ static int md_alloc(dev_t dev, char *name) error = -ENOMEM; disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) - goto out_unlock_disks_mutex; + goto out_free_mddev; disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; @@ -5704,25 +5677,45 @@ static int md_alloc(dev_t dev, char *name) mddev->gendisk = disk; error = add_disk(disk); if (error) - goto out_cleanup_disk; + goto out_put_disk; + kobject_init(&mddev->kobj, &md_ktype); error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); - if (error) - goto out_del_gendisk; + if (error) { + /* + * The disk is already live at this point. Clear the hold flag + * and let mddev_put take care of the deletion, as it isn't any + * different from a normal close on last release now. + */ + mddev->hold_active = 0; + mutex_unlock(&disks_mutex); + mddev_put(mddev); + return ERR_PTR(error); + } kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); - goto out_unlock_disks_mutex; + mutex_unlock(&disks_mutex); + return mddev; -out_del_gendisk: - del_gendisk(disk); -out_cleanup_disk: +out_put_disk: put_disk(disk); -out_unlock_disks_mutex: +out_free_mddev: + mddev_free(mddev); +out_unlock: mutex_unlock(&disks_mutex); + return ERR_PTR(error); +} + +static int md_alloc_and_put(dev_t dev, char *name) +{ + struct mddev *mddev = md_alloc(dev, name); + + if (IS_ERR(mddev)) + return PTR_ERR(mddev); mddev_put(mddev); - return error; + return 0; } static void md_probe(dev_t dev) @@ -5730,7 +5723,7 @@ static void md_probe(dev_t dev) if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) return; if (create_on_open) - md_alloc(dev, NULL); + md_alloc_and_put(dev, NULL); } static int add_named_array(const char *val, const struct kernel_param *kp) @@ -5752,12 +5745,12 @@ static int add_named_array(const char *val, const struct kernel_param *kp) return -E2BIG; strscpy(buf, val, len+1); if (strncmp(buf, "md_", 3) == 0) - return md_alloc(0, buf); + return md_alloc_and_put(0, buf); if (strncmp(buf, "md", 2) == 0 && isdigit(buf[2]) && kstrtoul(buf+2, 10, &devnum) == 0 && devnum <= MINORMASK) - return md_alloc(MKDEV(MD_MAJOR, devnum), NULL); + return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); return -EINVAL; } @@ -6197,6 +6190,7 @@ static void __md_stop_writes(struct mddev *mddev) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } @@ -6244,11 +6238,11 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - md_bitmap_destroy(mddev); mddev_detach(mddev); /* Ensure ->event_work is done */ if (mddev->event_work.func) flush_workqueue(md_misc_wq); + md_bitmap_destroy(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; spin_unlock(&mddev->lock); @@ -6497,9 +6491,8 @@ static void autorun_devices(int part) break; } - md_probe(dev); - mddev = mddev_find(dev); - if (!mddev) + mddev = md_alloc(dev, NULL); + if (IS_ERR(mddev)) break; if (mddev_lock(mddev)) @@ -7782,45 +7775,33 @@ out_unlock: static int md_open(struct block_device *bdev, fmode_t mode) { - /* - * Succeed if we can lock the mddev, which confirms that - * it isn't being stopped right now. - */ - struct mddev *mddev = mddev_find(bdev->bd_dev); + struct mddev *mddev; int err; + spin_lock(&all_mddevs_lock); + mddev = mddev_get(bdev->bd_disk->private_data); + spin_unlock(&all_mddevs_lock); if (!mddev) return -ENODEV; - if (mddev->gendisk != bdev->bd_disk) { - /* we are racing with mddev_put which is discarding this - * bd_disk. - */ - mddev_put(mddev); - /* Wait until bdev->bd_disk is definitely gone */ - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); - return -EBUSY; - } - BUG_ON(mddev != bdev->bd_disk->private_data); - - if ((err = mutex_lock_interruptible(&mddev->open_mutex))) + err = mutex_lock_interruptible(&mddev->open_mutex); + if (err) goto out; - if (test_bit(MD_CLOSING, &mddev->flags)) { - mutex_unlock(&mddev->open_mutex); - err = -ENODEV; - goto out; - } + err = -ENODEV; + if (test_bit(MD_CLOSING, &mddev->flags)) + goto out_unlock; - err = 0; atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); bdev_check_media_change(bdev); - out: - if (err) - mddev_put(mddev); + return 0; + +out_unlock: + mutex_unlock(&mddev->open_mutex); +out: + mddev_put(mddev); return err; } @@ -7844,6 +7825,17 @@ static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) return ret; } +static void md_free_disk(struct gendisk *disk) +{ + struct mddev *mddev = disk->private_data; + + percpu_ref_exit(&mddev->writes_pending); + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); + + mddev_free(mddev); +} + const struct block_device_operations md_fops = { .owner = THIS_MODULE, @@ -7857,6 +7849,7 @@ const struct block_device_operations md_fops = .getgeo = md_getgeo, .check_events = md_check_events, .set_read_only = md_set_read_only, + .free_disk = md_free_disk, }; static int md_thread(void *arg) @@ -8018,16 +8011,26 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) max_sectors = mddev->dev_sectors; resync = mddev->curr_resync; - if (resync <= 3) { + if (resync < MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) /* Still cleaning up */ resync = max_sectors; - } else if (resync > max_sectors) + } else if (resync > max_sectors) { resync = max_sectors; - else + } else { resync -= atomic_read(&mddev->recovery_active); + if (resync < MD_RESYNC_ACTIVE) { + /* + * Resync has started, but the subtraction has + * yielded one of the special values. Force it + * to active to ensure the status reports an + * active resync. + */ + resync = MD_RESYNC_ACTIVE; + } + } - if (resync == 0) { + if (resync == MD_RESYNC_NONE) { if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { struct md_rdev *rdev; @@ -8051,7 +8054,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) } return 0; } - if (resync < 3) { + if (resync < MD_RESYNC_ACTIVE) { seq_printf(seq, "\tresync=DELAYED"); return 1; } @@ -8152,6 +8155,8 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) if (!l--) { mddev = list_entry(tmp, struct mddev, all_mddevs); mddev_get(mddev); + if (!mddev_get(mddev)) + continue; spin_unlock(&all_mddevs_lock); return mddev; } @@ -8165,25 +8170,35 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct list_head *tmp; struct mddev *next_mddev, *mddev = v; + struct mddev *to_put = NULL; ++*pos; if (v == (void*)2) return NULL; spin_lock(&all_mddevs_lock); - if (v == (void*)1) + if (v == (void*)1) { tmp = all_mddevs.next; - else + } else { + to_put = mddev; + tmp = mddev->all_mddevs.next; + } + + for (;;) { + if (tmp == &all_mddevs) { + next_mddev = (void*)2; + *pos = 0x10000; + break; + } + next_mddev = list_entry(tmp, struct mddev, all_mddevs); + if (mddev_get(next_mddev)) + break; + mddev = next_mddev; tmp = mddev->all_mddevs.next; - if (tmp != &all_mddevs) - next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); - else { - next_mddev = (void*)2; - *pos = 0x10000; } spin_unlock(&all_mddevs_lock); - if (v != (void*)1) + if (to_put) mddev_put(mddev); return next_mddev; @@ -8682,7 +8697,6 @@ void md_do_sync(struct md_thread *thread) unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; - struct list_head *tmp; sector_t last_check; int skipped = 0; struct md_rdev *rdev; @@ -8729,13 +8743,7 @@ void md_do_sync(struct md_thread *thread) mddev->last_sync_action = action ?: desc; - /* we overload curr_resync somewhat here. - * 0 == not engaged in resync at all - * 2 == checking that there is no conflict with another sync - * 1 == like 2, but have yielded to allow conflicting resync to - * commence - * other == active in resync - this many blocks - * + /* * Before starting a resync we must have set curr_resync to * 2, and then checked that every "conflicting" array has curr_resync * less than ours. When we find one that is the same or higher @@ -8747,24 +8755,29 @@ void md_do_sync(struct md_thread *thread) do { int mddev2_minor = -1; - mddev->curr_resync = 2; + mddev->curr_resync = MD_RESYNC_DELAYED; try_again: if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; - for_each_mddev(mddev2, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev2->flags)) + continue; if (mddev2 == mddev) continue; if (!mddev->parallel_resync && mddev2->curr_resync && match_mddev_units(mddev, mddev2)) { DEFINE_WAIT(wq); - if (mddev < mddev2 && mddev->curr_resync == 2) { + if (mddev < mddev2 && + mddev->curr_resync == MD_RESYNC_DELAYED) { /* arbitrarily yield */ - mddev->curr_resync = 1; + mddev->curr_resync = MD_RESYNC_YIELDED; wake_up(&resync_wait); } - if (mddev > mddev2 && mddev->curr_resync == 1) + if (mddev > mddev2 && + mddev->curr_resync == MD_RESYNC_YIELDED) /* no need to wait here, we can wait the next * time 'round when curr_resync == 2 */ @@ -8782,7 +8795,8 @@ void md_do_sync(struct md_thread *thread) desc, mdname(mddev), mdname(mddev2)); } - mddev_put(mddev2); + spin_unlock(&all_mddevs_lock); + if (signal_pending(current)) flush_signals(current); schedule(); @@ -8792,7 +8806,8 @@ void md_do_sync(struct md_thread *thread) finish_wait(&resync_wait, &wq); } } - } while (mddev->curr_resync < 2); + spin_unlock(&all_mddevs_lock); + } while (mddev->curr_resync < MD_RESYNC_DELAYED); j = 0; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { @@ -8876,7 +8891,7 @@ void md_do_sync(struct md_thread *thread) desc, mdname(mddev)); mddev->curr_resync = j; } else - mddev->curr_resync = 3; /* no longer delayed */ + mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ mddev->curr_resync_completed = j; sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(); @@ -9011,14 +9026,14 @@ void md_do_sync(struct md_thread *thread) if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - mddev->curr_resync > 3) { + mddev->curr_resync >= MD_RESYNC_ACTIVE) { mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev-> |
