author    Linus Torvalds <torvalds@linux-foundation.org>  2022-08-04 20:00:14 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2022-08-04 20:00:14 -0700
commit    fa9db655d0e112c108fe838809608caf759bdf5e (patch)
tree      899a983b333871688095fd14b413c199b9a38f73 /drivers/md
parent    e495274793ea602415d050452088a496abcd9e6c (diff)
parent    bc792884b76f0da2f5c9a8d720e430e2de9756f5 (diff)
Merge tag 'for-5.20/block-2022-08-04' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:

 - NVMe pull requests via Christoph:
     - add support for In-Band authentication (Hannes Reinecke)
     - handle the persistent internal error AER (Michael Kelley)
     - use in-capsule data for TCP I/O queue connect (Caleb Sander)
     - remove timeout for getting RDMA-CM established event (Israel Rukshin)
     - misc cleanups (Joel Granados, Sagi Grimberg, Chaitanya Kulkarni, Guixin Liu, Xiang wangx)
     - use command_id instead of req->tag in trace_nvme_complete_rq() (Bean Huo)
     - various fixes for the new authentication code (Lukas Bulwahn, Dan Carpenter, Colin Ian King, Chaitanya Kulkarni, Hannes Reinecke)
     - small cleanups (Liu Song, Christoph Hellwig)
     - restore compat_ioctl support (Nick Bowler)
     - make a nvmet-tcp workqueue lockdep-safe (Sagi Grimberg)
     - enable generic interface (/dev/ngXnY) for unknown command sets (Joel Granados, Christoph Hellwig)
     - don't always build constants.o (Christoph Hellwig)
     - print the command name of aborted commands (Christoph Hellwig)

 - MD pull requests via Song:
     - Improve raid5 lock contention, by Logan Gunthorpe.
     - Misc fixes to raid5, by Logan Gunthorpe.
     - Fix race condition with md_reap_sync_thread(), by Guoqing Jiang.
     - Fix potential deadlock with raid5_quiesce and raid5_get_active_stripe, by Logan Gunthorpe.
     - Refactoring of md_alloc(), by Christoph Hellwig.
     - Fix md disk_name lifetime problems, by Christoph Hellwig.
     - Convert prepare_to_wait() to wait_woken() API, by Logan Gunthorpe.
     - Fix sectors_to_do bitmap issue, by Logan Gunthorpe.

 - Work on unifying the null_blk module parameters and configfs API (Vincent)

 - drbd bitmap IO error fix (Lars)

 - Set of rnbd fixes (Guoqing, Md Haris)

 - Remove experimental marker on bcache async device registration (Coly)

 - Series cleaning up the bio splitting (Christoph)

 - Removal of the sx8 block driver. This hardware was never really widespread, and it didn't receive a lot of attention after its initial merge back in 2005 (Christoph)

 - A few fixes for s390 dasd (Eric, Jiang)

 - Followup set of fixes for ublk (Ming)

 - Support for UBLK_IO_NEED_GET_DATA for ublk (ZiyangZhang)

 - Fixes for the dio dma alignment (Keith)

 - Misc fixes and cleanups (Ming, Yu, Dan, Christophe)

* tag 'for-5.20/block-2022-08-04' of git://git.kernel.dk/linux-block: (136 commits)
  s390/dasd: Establish DMA alignment
  s390/dasd: drop unexpected word 'for' in comments
  ublk_drv: add support for UBLK_IO_NEED_GET_DATA
  ublk_cmd.h: add one new ublk command: UBLK_IO_NEED_GET_DATA
  ublk_drv: cleanup ublksrv_ctrl_dev_info
  ublk_drv: add SET_PARAMS/GET_PARAMS control command
  ublk_drv: fix ublk device leak in case that add_disk fails
  ublk_drv: cancel device even though disk isn't up
  block: fix leaking page ref on truncated direct io
  block: ensure bio_iov_add_page can't fail
  block: ensure iov_iter advances for added pages
  drivers:md:fix a potential use-after-free bug
  md/raid5: Ensure batch_last is released before sleeping for quiesce
  md/raid5: Move stripe_request_ctx up
  md/raid5: Drop unnecessary call to r5c_check_stripe_cache_usage()
  md/raid5: Make is_inactive_blocked() helper
  md/raid5: Refactor raid5_get_active_stripe()
  block: pass struct queue_limits to the bio splitting helpers
  block: move bio_allowed_max_sectors to blk-merge.c
  block: move the call to get_max_io_size out of blk_bio_segment_split
  ...
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/Kconfig   |    2
-rw-r--r--  drivers/md/dm-raid.c        |    1
-rw-r--r--  drivers/md/dm.c             |    8
-rw-r--r--  drivers/md/md-autodetect.c  |   21
-rw-r--r--  drivers/md/md-cluster.c     |    4
-rw-r--r--  drivers/md/md.c             |  424
-rw-r--r--  drivers/md/md.h             |   19
-rw-r--r--  drivers/md/raid10.c         |    5
-rw-r--r--  drivers/md/raid5-cache.c    |   40
-rw-r--r--  drivers/md/raid5-log.h      |   77
-rw-r--r--  drivers/md/raid5-ppl.c      |    2
-rw-r--r--  drivers/md/raid5.c          |  727
-rw-r--r--  drivers/md/raid5.h          |    2
13 files changed, 804 insertions, 528 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index cf3e8096942a..529c9d04e9a4 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -29,7 +29,7 @@ config BCACHE_CLOSURES_DEBUG
operations that get stuck.
config BCACHE_ASYNC_REGISTRATION
- bool "Asynchronous device registration (EXPERIMENTAL)"
+ bool "Asynchronous device registration"
depends on BCACHE
help
Add a sysfs file /sys/fs/bcache/register_async. Writing registering
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1ec17c32867f..c640be453313 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3728,6 +3728,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
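The dm-raid change mirrors a fix applied in md.c later in this diff: md_unregister_thread() now runs before md_reap_sync_thread(), so the sync thread has fully exited before its state is reaped. A minimal userspace sketch of the same ordering using pthreads (all names here are illustrative, not kernel API):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_bool stop_requested;   /* analogous to MD_RECOVERY_INTR */
	static int sync_progress;            /* state only the worker writes */

	static void *sync_worker(void *arg)
	{
		(void)arg;
		while (!atomic_load(&stop_requested))
			sync_progress++;     /* simulated resync work */
		return NULL;
	}

	int main(void)
	{
		pthread_t thr;

		pthread_create(&thr, NULL, sync_worker, NULL);

		atomic_store(&stop_requested, true); /* set_bit(MD_RECOVERY_INTR, ...) */
		pthread_join(thr, NULL);             /* md_unregister_thread(): wait for exit */

		/* Only now is it safe to "reap": the worker can no longer touch state. */
		printf("reaped after %d units of work\n", sync_progress);
		return 0;
	}

Reaping before the join would be the analogue of the race being fixed: the teardown could run while the worker still dereferences the state it is tearing down.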
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 99642f69bfa7..28bd4a35b86b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1016,7 +1016,7 @@ static void dm_wq_requeue_work(struct work_struct *work)
while (io) {
struct dm_io *next = io->next;
- dm_io_rewind(io, &md->queue->bio_split);
+ dm_io_rewind(io, &md->disk->bio_split);
io->next = NULL;
__dm_io_complete(io, false);
@@ -1181,7 +1181,7 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector)
* Does the target need to split IO even further?
* - varied (per target) IO splitting is a tenet of DM; this
* explains why stacked chunk_sectors based splitting via
- * blk_queue_split() isn't possible here.
+ * bio_split_to_limits() isn't possible here.
*/
if (!ti->max_io_len)
return len;
@@ -1751,10 +1751,10 @@ static void dm_split_and_process_bio(struct mapped_device *md,
is_abnormal = is_abnormal_io(bio);
if (unlikely(is_abnormal)) {
/*
- * Use blk_queue_split() for abnormal IO (e.g. discard, etc)
+ * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
* otherwise associated queue_limits won't be imposed.
*/
- blk_queue_split(&bio);
+ bio = bio_split_to_limits(bio);
}
init_clone_info(&ci, md, map, bio, is_abnormal);
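Note the calling-convention change here: blk_queue_split(&bio) updated the caller's pointer in place, while bio_split_to_limits(bio) returns the bio that should be processed, so the result must be assigned back. A rough userspace analogue of the split-and-resubmit idea, with an invented struct req standing in for a bio:

	#include <stdio.h>

	struct req {
		unsigned long start;   /* first sector */
		unsigned long nsect;   /* length in sectors */
	};

	/*
	 * Split 'r' against a size limit: trim it to the largest allowed
	 * front piece and return the remainder (zero-length if none), which
	 * the caller resubmits, mirroring how the block layer chains the
	 * tail of an oversized bio back onto the queue.
	 */
	static struct req split_to_limit(struct req *r, unsigned long max_sectors)
	{
		struct req rest = { 0, 0 };

		if (r->nsect > max_sectors) {
			rest.start = r->start + max_sectors;
			rest.nsect = r->nsect - max_sectors;
			r->nsect = max_sectors;
		}
		return rest;
	}

	int main(void)
	{
		struct req r = { .start = 0, .nsect = 1000 }, rest;

		do {
			rest = split_to_limit(&r, 256);
			printf("process sectors [%lu, %lu)\n", r.start, r.start + r.nsect);
			r = rest;
		} while (r.nsect);
		return 0;
	}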
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index 2cf973722f59..91836e6de326 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -125,7 +125,6 @@ static void __init md_setup_drive(struct md_setup_args *args)
char *devname = args->device_names;
dev_t devices[MD_SB_DISKS + 1], mdev;
struct mdu_array_info_s ainfo = { };
- struct block_device *bdev;
struct mddev *mddev;
int err = 0, i;
char name[16];
@@ -169,24 +168,16 @@ static void __init md_setup_drive(struct md_setup_args *args)
pr_info("md: Loading %s: %s\n", name, args->device_names);
- bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL);
- if (IS_ERR(bdev)) {
- pr_err("md: open failed - cannot start array %s\n", name);
+ mddev = md_alloc(mdev, name);
+ if (IS_ERR(mddev)) {
+ pr_err("md: md_alloc failed - cannot start array %s\n", name);
return;
}
- err = -EIO;
- if (WARN(bdev->bd_disk->fops != &md_fops,
- "Opening block device %x resulted in non-md device\n",
- mdev))
- goto out_blkdev_put;
-
- mddev = bdev->bd_disk->private_data;
-
err = mddev_lock(mddev);
if (err) {
pr_err("md: failed to lock array %s\n", name);
- goto out_blkdev_put;
+ goto out_mddev_put;
}
if (!list_empty(&mddev->disks) || mddev->raid_disks) {
@@ -230,8 +221,8 @@ static void __init md_setup_drive(struct md_setup_args *args)
pr_warn("md: starting %s failed\n", name);
out_unlock:
mddev_unlock(mddev);
-out_blkdev_put:
- blkdev_put(bdev, FMODE_READ);
+out_mddev_put:
+ mddev_put(mddev);
}
static int __init raid_setup(char *str)
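md_setup_drive() now obtains the mddev straight from md_alloc(), which returns either a live object or an errno encoded in the pointer, and every exit path balances it with mddev_put(). The ERR_PTR convention used here is easy to model outside the kernel; this standalone sketch re-derives the macros rather than including kernel headers:

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Minimal re-implementation of the kernel's ERR_PTR convention:
	 * errno values live in the top page of the address space, so one
	 * pointer can carry either an object or an error code. */
	#define MAX_ERRNO	4095
	#define ERR_PTR(err)	((void *)(long)(err))
	#define PTR_ERR(ptr)	((long)(ptr))
	#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

	struct array { char name[16]; };

	static struct array *array_alloc(int fail)
	{
		if (fail)
			return ERR_PTR(-ENOMEM);
		return calloc(1, sizeof(struct array));
	}

	int main(void)
	{
		struct array *a = array_alloc(1);

		if (IS_ERR(a)) {
			printf("alloc failed: %ld\n", PTR_ERR(a));
			a = array_alloc(0);
		}
		if (!IS_ERR(a))
			free(a);
		return 0;
	}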
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 37cbcce3cc66..742b2349fea3 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -40,7 +40,7 @@ struct resync_info {
/* Lock the send communication. This is done through
* bit manipulation as opposed to a mutex in order to
- * accomodate lock and hold. See next comment.
+ * accommodate lock and hold. See next comment.
*/
#define MD_CLUSTER_SEND_LOCK 4
/* If cluster operations (such as adding a disk) must lock the
@@ -689,7 +689,7 @@ static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
/*
* If resync thread run after raid1d thread, then process_metadata_update
* could not continue if raid1d held reconfig_mutex (and raid1d is blocked
- * since another node already got EX on Token and waitting the EX of Ack),
+ * since another node already got EX on Token and waiting the EX of Ack),
* so let resync wake up thread in case flag is set.
*/
if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4df78e30b76a..afaf36b2f6ab 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -368,28 +368,6 @@ EXPORT_SYMBOL_GPL(md_new_event);
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
-/*
- * iterates through all used mddevs in the system.
- * We take care to grab the all_mddevs_lock whenever navigating
- * the list, and to always hold a refcount when unlocked.
- * Any code which breaks out of this loop while own
- * a reference to the current mddev and must mddev_put it.
- */
-#define for_each_mddev(_mddev,_tmp) \
- \
- for (({ spin_lock(&all_mddevs_lock); \
- _tmp = all_mddevs.next; \
- _mddev = NULL;}); \
- ({ if (_tmp != &all_mddevs) \
- mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
- spin_unlock(&all_mddevs_lock); \
- if (_mddev) mddev_put(_mddev); \
- _mddev = list_entry(_tmp, struct mddev, all_mddevs); \
- _tmp != &all_mddevs;}); \
- ({ spin_lock(&all_mddevs_lock); \
- _tmp = _tmp->next;}) \
- )
-
/* Rather than calling directly into the personality make_request function,
* IO requests come here first so that we can check if the device is
* being suspended pending a reconfiguration.
@@ -464,7 +442,7 @@ static void md_submit_bio(struct bio *bio)
return;
}
- blk_queue_split(&bio);
+ bio = bio_split_to_limits(bio);
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
@@ -647,13 +625,17 @@ EXPORT_SYMBOL(md_flush_request);
static inline struct mddev *mddev_get(struct mddev *mddev)
{
+ lockdep_assert_held(&all_mddevs_lock);
+
+ if (test_bit(MD_DELETED, &mddev->flags))
+ return NULL;
atomic_inc(&mddev->active);
return mddev;
}
static void mddev_delayed_delete(struct work_struct *ws);
-static void mddev_put(struct mddev *mddev)
+void mddev_put(struct mddev *mddev)
{
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
@@ -661,7 +643,7 @@ static void mddev_put(struct mddev *mddev)
mddev->ctime == 0 && !mddev->hold_active) {
/* Array is not configured at all, and not held active,
* so destroy it */
- list_del_init(&mddev->all_mddevs);
+ set_bit(MD_DELETED, &mddev->flags);
/*
* Call queue_work inside the spinlock so that
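Together these two hunks change the mddev lifetime rules: mddev_get() must now be called under all_mddevs_lock and fails once MD_DELETED is set, and the final mddev_put() marks the device deleted instead of unlinking it, so a racing lookup either takes a whole reference or cleanly gets NULL. A compact single-lock model of that handshake (obj, obj_get and obj_put are made-up names, not kernel API):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdlib.h>

	static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

	struct obj {
		int refs;
		bool deleted;	/* plays the role of MD_DELETED */
	};

	/* Must be called with table_lock held, like mddev_get(). */
	static struct obj *obj_get(struct obj *o)
	{
		if (o->deleted)
			return NULL;	/* dying: refuse to hand out a new ref */
		o->refs++;
		return o;
	}

	static void obj_put(struct obj *o)
	{
		pthread_mutex_lock(&table_lock);
		if (--o->refs == 0) {
			o->deleted = true;	/* mark under the lock, free after */
			pthread_mutex_unlock(&table_lock);
			free(o);
			return;
		}
		pthread_mutex_unlock(&table_lock);
	}

	int main(void)
	{
		struct obj *o = calloc(1, sizeof(*o));

		o->refs = 1;
		pthread_mutex_lock(&table_lock);
		struct obj *ref = obj_get(o);	/* succeeds: refs == 2 */
		pthread_mutex_unlock(&table_lock);
		obj_put(ref);
		obj_put(o);	/* last put: marked deleted, then freed */
		return 0;
	}

The essential property is that the flag is set and tested under the same lock that guards the lookup, which is what lets the md_open() hunk later in this diff drop its old gendisk-vs-mddev consistency checks.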
@@ -678,7 +660,6 @@ static void md_safemode_timeout(struct timer_list *t);
void mddev_init(struct mddev *mddev)
{
- kobject_init(&mddev->kobj, &md_ktype);
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->bitmap_info.mutex);
@@ -733,22 +714,6 @@ static dev_t mddev_alloc_unit(void)
return dev;
}
-static struct mddev *mddev_find(dev_t unit)
-{
- struct mddev *mddev;
-
- if (MAJOR(unit) != MD_MAJOR)
- unit &= ~((1 << MdpMinorShift) - 1);
-
- spin_lock(&all_mddevs_lock);
- mddev = mddev_find_locked(unit);
- if (mddev)
- mddev_get(mddev);
- spin_unlock(&all_mddevs_lock);
-
- return mddev;
-}
-
static struct mddev *mddev_alloc(dev_t unit)
{
struct mddev *new;
@@ -791,6 +756,15 @@ out_free_new:
return ERR_PTR(error);
}
+static void mddev_free(struct mddev *mddev)
+{
+ spin_lock(&all_mddevs_lock);
+ list_del(&mddev->all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+
+ kfree(mddev);
+}
+
static const struct attribute_group md_redundancy_group;
void mddev_unlock(struct mddev *mddev)
@@ -3335,14 +3309,35 @@ rdev_size_show(struct md_rdev *rdev, char *page)
return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
}
-static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
+static bool md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
{
/* check if two start/length pairs overlap */
- if (s1+l1 <= s2)
- return 0;
- if (s2+l2 <= s1)
- return 0;
- return 1;
+ if (a->data_offset + a->sectors <= b->data_offset)
+ return false;
+ if (b->data_offset + b->sectors <= a->data_offset)
+ return false;
+ return true;
+}
+
+static bool md_rdev_overlaps(struct md_rdev *rdev)
+{
+ struct mddev *mddev;
+ struct md_rdev *rdev2;
+
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
+ if (test_bit(MD_DELETED, &mddev->flags))
+ continue;
+ rdev_for_each(rdev2, mddev) {
+ if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
+ md_rdevs_overlap(rdev, rdev2)) {
+ spin_unlock(&all_mddevs_lock);
+ return true;
+ }
+ }
+ }
+ spin_unlock(&all_mddevs_lock);
+ return false;
}
static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
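md_rdevs_overlap() is the standard half-open interval test: [a, a+la) and [b, b+lb) are disjoint exactly when one range ends at or before the other begins. Pulled out of the kernel into a standalone file with a few sanity checks:

	#include <assert.h>
	#include <stdbool.h>
	#include <stdio.h>

	typedef unsigned long long sector_t;

	static bool ranges_overlap(sector_t a, sector_t la, sector_t b, sector_t lb)
	{
		if (a + la <= b)	/* first range ends before second begins */
			return false;
		if (b + lb <= a)	/* second range ends before first begins */
			return false;
		return true;
	}

	int main(void)
	{
		assert(!ranges_overlap(0, 100, 100, 50)); /* touching, half-open: no overlap */
		assert( ranges_overlap(0, 101, 100, 50)); /* one sector shared */
		assert( ranges_overlap(50, 10, 0, 1000)); /* containment */
		puts("ok");
		return 0;
	}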
@@ -3394,46 +3389,21 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
return -EINVAL; /* component must fit device */
rdev->sectors = sectors;
- if (sectors > oldsectors && my_mddev->external) {
- /* Need to check that all other rdevs with the same
- * ->bdev do not overlap. 'rcu' is sufficient to walk
- * the rdev lists safely.
- * This check does not provide a hard guarantee, it
- * just helps avoid dangerous mistakes.
- */
- struct mddev *mddev;
- int overlap = 0;
- struct list_head *tmp;
-
- rcu_read_lock();
- for_each_mddev(mddev, tmp) {
- struct md_rdev *rdev2;
- rdev_for_each(rdev2, mddev)
- if (rdev->bdev == rdev2->bdev &&
- rdev != rdev2 &&
- overlaps(rdev->data_offset, rdev->sectors,
- rdev2->data_offset,
- rdev2->sectors)) {
- overlap = 1;
- break;
- }
- if (overlap) {
- mddev_put(mddev);
- break;
- }
- }
- rcu_read_unlock();
- if (overlap) {
- /* Someone else could have slipped in a size
- * change here, but doing so is just silly.
- * We put oldsectors back because we *know* it is
- * safe, and trust userspace not to race with
- * itself
- */
- rdev->sectors = oldsectors;
- return -EBUSY;
- }
+ /*
+ * Check that all other rdevs with the same bdev do not overlap. This
+ * check does not provide a hard guarantee, it just helps avoid
+ * dangerous mistakes.
+ */
+ if (sectors > oldsectors && my_mddev->external &&
+ md_rdev_overlaps(rdev)) {
+ /*
+ * Someone else could have slipped in a size change here, but
+ * doing so is just silly. We put oldsectors back because we
+ * know it is safe, and trust userspace not to race with itself.
+ */
+ rdev->sectors = oldsectors;
+ return -EBUSY;
}
return len;
}
@@ -4830,6 +4800,19 @@ action_store(struct mddev *mddev, const char *page, size_t len)
if (work_pending(&mddev->del_work))
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
+ sector_t save_rp = mddev->reshape_position;
+
+ mddev_unlock(mddev);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
+ mddev_lock_nointr(mddev);
+ /*
+ * set RECOVERY_INTR again and restore reshape
+ * position in case others changed them after
+ * got lock, eg, reshape_position_store and
+ * md_check_recovery.
+ */
+ mddev->reshape_position = save_rp;
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
}
@@ -5001,7 +4984,7 @@ static ssize_t
sync_speed_show(struct mddev *mddev, char *page)
{
unsigned long resync, dt, db;
- if (mddev->curr_resync == 0)
+ if (mddev->curr_resync == MD_RESYNC_NONE)
return sprintf(page, "none\n");
resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
dt = (jiffies - mddev->resync_mark) / HZ;
@@ -5020,8 +5003,8 @@ sync_completed_show(struct mddev *mddev, char *page)
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n");
- if (mddev->curr_resync == 1 ||
- mddev->curr_resync == 2)
+ if (mddev->curr_resync == MD_RESYNC_YIELDED ||
+ mddev->curr_resync == MD_RESYNC_DELAYED)
return sprintf(page, "delayed\n");
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
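These sysfs hunks belong to a series replacing the overloaded magic values of mddev->curr_resync (0 = none, 1 = yielded, 2 = delayed, 3 and up = active progress) with named constants. The enum values below match the ones the diff uses; the describe() helper is only an illustration of how a single integer can carry both a state code and a sector count:

	#include <stdio.h>

	enum resync_state {
		RESYNC_NONE = 0,	/* not resyncing */
		RESYNC_YIELDED = 1,	/* yielded to a conflicting resync */
		RESYNC_DELAYED = 2,	/* waiting for a conflicting resync */
		RESYNC_ACTIVE = 3,	/* smallest "really running" value */
	};

	static const char *describe(unsigned long long curr_resync)
	{
		static char buf[64];

		if (curr_resync == RESYNC_NONE)
			return "none";
		if (curr_resync < RESYNC_ACTIVE)
			return "delayed";	/* yielded and delayed both report this */
		snprintf(buf, sizeof(buf), "active at sector %llu", curr_resync);
		return buf;
	}

	int main(void)
	{
		printf("%s\n", describe(RESYNC_NONE));
		printf("%s\n", describe(RESYNC_DELAYED));
		printf("%s\n", describe(123456));
		return 0;
	}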
@@ -5532,11 +5515,10 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
if (!entry->show)
return -EIO;
spin_lock(&all_mddevs_lock);
- if (list_empty(&mddev->all_mddevs)) {
+ if (!mddev_get(mddev)) {
spin_unlock(&all_mddevs_lock);
return -EBUSY;
}
- mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
rv = entry->show(mddev, page);
@@ -5557,18 +5539,17 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
spin_lock(&all_mddevs_lock);
- if (list_empty(&mddev->all_mddevs)) {
+ if (!mddev_get(mddev)) {
spin_unlock(&all_mddevs_lock);
return -EBUSY;
}
- mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
rv = entry->store(mddev, page, length);
mddev_put(mddev);
return rv;
}
-static void md_free(struct kobject *ko)
+static void md_kobj_release(struct kobject *ko)
{
struct mddev *mddev = container_of(ko, struct mddev, kobj);
@@ -5577,15 +5558,8 @@ static void md_free(struct kobject *ko)
if (mddev->sysfs_level)
sysfs_put(mddev->sysfs_level);
- if (mddev->gendisk) {
- del_gendisk(mddev->gendisk);
- put_disk(mddev->gendisk);
- }
- percpu_ref_exit(&mddev->writes_pending);
-
- bioset_exit(&mddev->bio_set);
- bioset_exit(&mddev->sync_set);
- kfree(mddev);
+ del_gendisk(mddev->gendisk);
+ put_disk(mddev->gendisk);
}
static const struct sysfs_ops md_sysfs_ops = {
@@ -5593,7 +5567,7 @@ static const struct sysfs_ops md_sysfs_ops = {
.store = md_attr_store,
};
static struct kobj_type md_ktype = {
- .release = md_free,
+ .release = md_kobj_release,
.sysfs_ops = &md_sysfs_ops,
.default_groups = md_attr_groups,
};
@@ -5604,7 +5578,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
- kobject_del(&mddev->kobj);
kobject_put(&mddev->kobj);
}
@@ -5623,7 +5596,7 @@ int mddev_init_writes_pending(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
-static int md_alloc(dev_t dev, char *name)
+struct mddev *md_alloc(dev_t dev, char *name)
{
/*
* If dev is zero, name is the name of a device to allocate with
@@ -5651,8 +5624,8 @@ static int md_alloc(dev_t dev, char *name)
mutex_lock(&disks_mutex);
mddev = mddev_alloc(dev);
if (IS_ERR(mddev)) {
- mutex_unlock(&disks_mutex);
- return PTR_ERR(mddev);
+ error = PTR_ERR(mddev);
+ goto out_unlock;
}
partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
@@ -5670,7 +5643,7 @@ static int md_alloc(dev_t dev, char *name)
strcmp(mddev2->gendisk->disk_name, name) == 0) {
spin_unlock(&all_mddevs_lock);
error = -EEXIST;
- goto out_unlock_disks_mutex;
+ goto out_free_mddev;
}
spin_unlock(&all_mddevs_lock);
}
@@ -5683,7 +5656,7 @@ static int md_alloc(dev_t dev, char *name)
error = -ENOMEM;
disk = blk_alloc_disk(NUMA_NO_NODE);
if (!disk)
- goto out_unlock_disks_mutex;
+ goto out_free_mddev;
disk->major = MAJOR(mddev->unit);
disk->first_minor = unit << shift;
@@ -5704,25 +5677,45 @@ static int md_alloc(dev_t dev, char *name)
mddev->gendisk = disk;
error = add_disk(disk);
if (error)
- goto out_cleanup_disk;
+ goto out_put_disk;
+ kobject_init(&mddev->kobj, &md_ktype);
error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
- if (error)
- goto out_del_gendisk;
+ if (error) {
+ /*
+ * The disk is already live at this point. Clear the hold flag
+ * and let mddev_put take care of the deletion, as it isn't any
+ * different from a normal close on last release now.
+ */
+ mddev->hold_active = 0;
+ mutex_unlock(&disks_mutex);
+ mddev_put(mddev);
+ return ERR_PTR(error);
+ }
kobject_uevent(&mddev->kobj, KOBJ_ADD);
mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
- goto out_unlock_disks_mutex;
+ mutex_unlock(&disks_mutex);
+ return mddev;
-out_del_gendisk:
- del_gendisk(disk);
-out_cleanup_disk:
+out_put_disk:
put_disk(disk);
-out_unlock_disks_mutex:
+out_free_mddev:
+ mddev_free(mddev);
+out_unlock:
mutex_unlock(&disks_mutex);
+ return ERR_PTR(error);
+}
+
+static int md_alloc_and_put(dev_t dev, char *name)
+{
+ struct mddev *mddev = md_alloc(dev, name);
+
+ if (IS_ERR(mddev))
+ return PTR_ERR(mddev);
mddev_put(mddev);
- return error;
+ return 0;
}
static void md_probe(dev_t dev)
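The restructured md_alloc() is a textbook goto-unwind: each failure jumps to the label that releases exactly what has been acquired so far, and the one case that cannot be unwound (kobject_add() failing after add_disk() made the disk live) is routed through the normal release path via mddev_put() instead. The shape of the pattern, reduced to standalone C with invented resources:

	#include <stdio.h>
	#include <stdlib.h>

	struct device {
		void *slot;	/* first resource acquired */
		void *disk;	/* second resource acquired */
	};

	static struct device *dev_alloc(int fail_at)
	{
		struct device *dev = calloc(1, sizeof(*dev));

		if (!dev)
			goto out;

		dev->slot = (fail_at == 1) ? NULL : malloc(32);
		if (!dev->slot)
			goto out_free_dev;

		dev->disk = (fail_at == 2) ? NULL : malloc(32);
		if (!dev->disk)
			goto out_free_slot;	/* unwind only what succeeded */

		return dev;

	out_free_slot:
		free(dev->slot);
	out_free_dev:
		free(dev);
	out:
		return NULL;
	}

	int main(void)
	{
		struct device *dev = dev_alloc(0);

		printf("alloc %s\n", dev ? "succeeded" : "failed");
		if (dev) {
			free(dev->disk);
			free(dev->slot);
			free(dev);
		}
		return 0;
	}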
@@ -5730,7 +5723,7 @@ static void md_probe(dev_t dev)
if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
return;
if (create_on_open)
- md_alloc(dev, NULL);
+ md_alloc_and_put(dev, NULL);
}
static int add_named_array(const char *val, const struct kernel_param *kp)
@@ -5752,12 +5745,12 @@ static int add_named_array(const char *val, const struct kernel_param *kp)
return -E2BIG;
strscpy(buf, val, len+1);
if (strncmp(buf, "md_", 3) == 0)
- return md_alloc(0, buf);
+ return md_alloc_and_put(0, buf);
if (strncmp(buf, "md", 2) == 0 &&
isdigit(buf[2]) &&
kstrtoul(buf+2, 10, &devnum) == 0 &&
devnum <= MINORMASK)
- return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
+ return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
return -EINVAL;
}
@@ -6197,6 +6190,7 @@ static void __md_stop_writes(struct mddev *mddev)
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
@@ -6244,11 +6238,11 @@ static void mddev_detach(struct mddev *mddev)
static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
- md_bitmap_destroy(mddev);
mddev_detach(mddev);
/* Ensure ->event_work is done */
if (mddev->event_work.func)
flush_workqueue(md_misc_wq);
+ md_bitmap_destroy(mddev);
spin_lock(&mddev->lock);
mddev->pers = NULL;
spin_unlock(&mddev->lock);
@@ -6497,9 +6491,8 @@ static void autorun_devices(int part)
break;
}
- md_probe(dev);
- mddev = mddev_find(dev);
- if (!mddev)
+ mddev = md_alloc(dev, NULL);
+ if (IS_ERR(mddev))
break;
if (mddev_lock(mddev))
@@ -7782,45 +7775,33 @@ out_unlock:
static int md_open(struct block_device *bdev, fmode_t mode)
{
- /*
- * Succeed if we can lock the mddev, which confirms that
- * it isn't being stopped right now.
- */
- struct mddev *mddev = mddev_find(bdev->bd_dev);
+ struct mddev *mddev;
int err;
+ spin_lock(&all_mddevs_lock);
+ mddev = mddev_get(bdev->bd_disk->private_data);
+ spin_unlock(&all_mddevs_lock);
if (!mddev)
return -ENODEV;
- if (mddev->gendisk != bdev->bd_disk) {
- /* we are racing with mddev_put which is discarding this
- * bd_disk.
- */
- mddev_put(mddev);
- /* Wait until bdev->bd_disk is definitely gone */
- if (work_pending(&mddev->del_work))
- flush_workqueue(md_misc_wq);
- return -EBUSY;
- }
- BUG_ON(mddev != bdev->bd_disk->private_data);
-
- if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
+ err = mutex_lock_interruptible(&mddev->open_mutex);
+ if (err)
goto out;
- if (test_bit(MD_CLOSING, &mddev->flags)) {
- mutex_unlock(&mddev->open_mutex);
- err = -ENODEV;
- goto out;
- }
+ err = -ENODEV;
+ if (test_bit(MD_CLOSING, &mddev->flags))
+ goto out_unlock;
- err = 0;
atomic_inc(&mddev->openers);
mutex_unlock(&mddev->open_mutex);
bdev_check_media_change(bdev);
- out:
- if (err)
- mddev_put(mddev);
+ return 0;
+
+out_unlock:
+ mutex_unlock(&mddev->open_mutex);
+out:
+ mddev_put(mddev);
return err;
}
@@ -7844,6 +7825,17 @@ static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
return ret;
}
+static void md_free_disk(struct gendisk *disk)
+{
+ struct mddev *mddev = disk->private_data;
+
+ percpu_ref_exit(&mddev->writes_pending);
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
+
+ mddev_free(mddev);
+}
+
const struct block_device_operations md_fops =
{
.owner = THIS_MODULE,
@@ -7857,6 +7849,7 @@ const struct block_device_operations md_fops =
.getgeo = md_getgeo,
.check_events = md_check_events,
.set_read_only = md_set_read_only,
+ .free_disk = md_free_disk,
};
static int md_thread(void *arg)
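Hooking md_free_disk() into block_device_operations moves the final mddev teardown (biosets, writes_pending and the structure itself) to the moment the gendisk is released, rather than tying it to the sysfs kobject. The callback-on-last-release shape, in miniature (struct disk and the hook name here are simplified stand-ins, not the kernel types):

	#include <stdio.h>
	#include <stdlib.h>

	struct disk {
		void (*free_disk)(struct disk *);	/* like ->free_disk above */
		int openers;
		void *private_data;
	};

	static void my_free_disk(struct disk *d)
	{
		printf("freeing private data\n");
		free(d->private_data);
	}

	static void disk_put(struct disk *d)
	{
		if (--d->openers == 0) {
			if (d->free_disk)
				d->free_disk(d);	/* last ref: run the hook */
			free(d);
		}
	}

	int main(void)
	{
		struct disk *d = calloc(1, sizeof(*d));

		d->free_disk = my_free_disk;
		d->private_data = malloc(64);
		d->openers = 2;

		disk_put(d);	/* still open elsewhere */
		disk_put(d);	/* last close: hook runs, then disk freed */
		return 0;
	}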
@@ -8018,16 +8011,26 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync;
- if (resync <= 3) {
+ if (resync < MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
/* Still cleaning up */
resync = max_sectors;
- } else if (resync > max_sectors)
+ } else if (resync > max_sectors) {
resync = max_sectors;
- else
+ } else {
resync -= atomic_read(&mddev->recovery_active);
+ if (resync < MD_RESYNC_ACTIVE) {
+ /*
+ * Resync has started, but the subtraction has
+ * yielded one of the special values. Force it
+ * to active to ensure the status reports an
+ * active resync.
+ */
+ resync = MD_RESYNC_ACTIVE;
+ }
+ }
- if (resync == 0) {
+ if (resync == MD_RESYNC_NONE) {
if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
struct md_rdev *rdev;
@@ -8051,7 +8054,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
}
return 0;
}
- if (resync < 3) {
+ if (resync < MD_RESYNC_ACTIVE) {
seq_printf(seq, "\tresync=DELAYED");
return 1;
}
@@ -8152,6 +8155,8 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
if (!l--) {
mddev = list_entry(tmp, struct mddev, all_mddevs);
- mddev_get(mddev);
+ if (!mddev_get(mddev))
+ continue;
spin_unlock(&all_mddevs_lock);
return mddev;
}
@@ -8165,25 +8170,35 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct list_head *tmp;
struct mddev *next_mddev, *mddev = v;
+ struct mddev *to_put = NULL;
++*pos;
if (v == (void*)2)
return NULL;
spin_lock(&all_mddevs_lock);
- if (v == (void*)1)
+ if (v == (void*)1) {
tmp = all_mddevs.next;
- else
+ } else {
+ to_put = mddev;
+ tmp = mddev->all_mddevs.next;
+ }
+
+ for (;;) {
+ if (tmp == &all_mddevs) {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ break;
+ }
+ next_mddev = list_entry(tmp, struct mddev, all_mddevs);
+ if (mddev_get(next_mddev))
+ break;
+ mddev = next_mddev;
tmp = mddev->all_mddevs.next;
- if (tmp != &all_mddevs)
- next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
- else {
- next_mddev = (void*)2;
- *pos = 0x10000;
}
spin_unlock(&all_mddevs_lock);
- if (v != (void*)1)
+ if (to_put)
mddev_put(mddev);
return next_mddev;
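md_seq_next() now spells out what the removed for_each_mddev() macro hid: walk forward under all_mddevs_lock until a reference can be taken, skipping entries whose mddev_get() fails because they are already marked MD_DELETED, and drop the previous element's reference only after unlocking. A simplified single-threaded model of that cursor (the table and all names are invented for illustration):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	struct node {
		const char *name;
		bool deleted;
		int refs;
	};

	static struct node table[] = {
		{ "md0", false, 1 },
		{ "md1", true,  1 },	/* dying: must be skipped */
		{ "md2", false, 1 },
	};
	#define N (sizeof(table) / sizeof(table[0]))

	/* Advance the cursor: return the next live node with a reference
	 * taken, or NULL at the end, dropping the previous node's reference
	 * afterwards, as md_seq_next() does once it leaves the lock. */
	static struct node *walk_next(size_t *pos, struct node *prev)
	{
		struct node *next = NULL;

		while (*pos < N) {
			struct node *cand = &table[(*pos)++];
			if (cand->deleted)
				continue;	/* mddev_get() would return NULL */
			cand->refs++;
			next = cand;
			break;
		}
		if (prev)
			prev->refs--;	/* mddev_put(prev) */
		return next;
	}

	int main(void)
	{
		size_t pos = 0;
		struct node *n = NULL;

		while ((n = walk_next(&pos, n)))
			printf("visit %s\n", n->name);
		return 0;
	}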
@@ -8682,7 +8697,6 @@ void md_do_sync(struct md_thread *thread)
unsigned long update_time;
sector_t mark_cnt[SYNC_MARKS];
int last_mark,m;
- struct list_head *tmp;
sector_t last_check;
int skipped = 0;
struct md_rdev *rdev;
@@ -8729,13 +8743,7 @@ void md_do_sync(struct md_thread *thread)
mddev->last_sync_action = action ?: desc;
- /* we overload curr_resync somewhat here.
- * 0 == not engaged in resync at all
- * 2 == checking that there is no conflict with another sync
- * 1 == like 2, but have yielded to allow conflicting resync to
- * commence
- * other == active in resync - this many blocks
- *
+ /*
* Before starting a resync we must have set curr_resync to
* 2, and then checked that every "conflicting" array has curr_resync
* less than ours. When we find one that is the same or higher
@@ -8747,24 +8755,29 @@ void md_do_sync(struct md_thread *thread)
do {
int mddev2_minor = -1;
- mddev->curr_resync = 2;
+ mddev->curr_resync = MD_RESYNC_DELAYED;
try_again:
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
goto skip;
- for_each_mddev(mddev2, tmp) {
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
+ if (test_bit(MD_DELETED, &mddev2->flags))
+ continue;
if (mddev2 == mddev)
continue;
if (!mddev->parallel_resync
&& mddev2->curr_resync
&& match_mddev_units(mddev, mddev2)) {
DEFINE_WAIT(wq);
- if (mddev < mddev2 && mddev->curr_resync == 2) {
+ if (mddev < mddev2 &&
+ mddev->curr_resync == MD_RESYNC_DELAYED) {
/* arbitrarily yield */
- mddev->curr_resync = 1;
+ mddev->curr_resync = MD_RESYNC_YIELDED;
wake_up(&resync_wait);
}
- if (mddev > mddev2 && mddev->curr_resync == 1)
+ if (mddev > mddev2 &&
+ mddev->curr_resync == MD_RESYNC_YIELDED)
/* no need to wait here, we can wait the next
* time 'round when curr_resync == 2
*/
@@ -8782,7 +8795,8 @@ void md_do_sync(struct md_thread *thread)
desc, mdname(mddev),
mdname(mddev2));
}
- mddev_put(mddev2);
+ spin_unlock(&all_mddevs_lock);
+
if (signal_pending(current))
flush_signals(current);
schedule();
@@ -8792,7 +8806,8 @@ void md_do_sync(struct md_thread *thread)
finish_wait(&resync_wait, &wq);
}
}
- } while (mddev->curr_resync < 2);
+ spin_unlock(&all_mddevs_lock);
+ } while (mddev->curr_resync < MD_RESYNC_DELAYED);
j = 0;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@@ -8876,7 +8891,7 @@ void md_do_sync(struct md_thread *thread)
desc, mdname(mddev));
mddev->curr_resync = j;
} else
- mddev->curr_resync = 3; /* no longer delayed */
+ mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
mddev->curr_resync_completed = j;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
md_new_event();
@@ -9011,14 +9026,14 @@ void md_do_sync(struct md_thread *thread)
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- mddev->curr_resync > 3) {
+ mddev->curr_resync >= MD_RESYNC_ACTIVE) {
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->