diff options
| -rw-r--r-- | drivers/md/dm-raid.c | 4 | ||||
| -rw-r--r-- | drivers/md/md-bitmap.c | 93 | ||||
| -rw-r--r-- | drivers/md/md-bitmap.h | 8 | ||||
| -rw-r--r-- | drivers/md/md-cluster.c | 17 | ||||
| -rw-r--r-- | drivers/md/md-multipath.c | 4 | ||||
| -rw-r--r-- | drivers/md/md.c | 226 | ||||
| -rw-r--r-- | drivers/md/md.h | 37 | ||||
| -rw-r--r-- | drivers/md/raid1-10.c | 63 | ||||
| -rw-r--r-- | drivers/md/raid1.c | 36 | ||||
| -rw-r--r-- | drivers/md/raid1.h | 2 | ||||
| -rw-r--r-- | drivers/md/raid10.c | 179 | ||||
| -rw-r--r-- | drivers/md/raid10.h | 2 | ||||
| -rw-r--r-- | drivers/md/raid5-cache.c | 22 | ||||
| -rw-r--r-- | drivers/md/raid5.c | 68 | ||||
| -rw-r--r-- | drivers/md/raid5.h | 2 | ||||
| -rw-r--r-- | lib/raid6/neon.h | 22 | ||||
| -rw-r--r-- | lib/raid6/neon.uc | 1 | ||||
| -rw-r--r-- | lib/raid6/recov_neon.c | 8 | ||||
| -rw-r--r-- | lib/raid6/recov_neon_inner.c | 1 |
19 files changed, 508 insertions, 287 deletions
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c8821fcb8299..8846bf510a35 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3750,11 +3750,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, * canceling read-auto mode */ mddev->ro = 0; - if (!mddev->suspended && mddev->sync_thread) + if (!mddev->suspended) md_wakeup_thread(mddev->sync_thread); } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (!mddev->suspended && mddev->thread) + if (!mddev->suspended) md_wakeup_thread(mddev->thread); return 0; diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index bc8d7565171d..1ff712889a3b 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -54,14 +54,7 @@ __acquires(bitmap->lock) { unsigned char *mappage; - if (page >= bitmap->pages) { - /* This can happen if bitmap_start_sync goes beyond - * End-of-device while looking for a whole page. - * It is harmless. - */ - return -EINVAL; - } - + WARN_ON_ONCE(page >= bitmap->pages); if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ return 0; @@ -1023,7 +1016,6 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) return set; } - /* this gets called when the md device is ready to unplug its underlying * (slave) device queues -- before we let any writes go down, we need to * sync the dirty pages of the bitmap file to disk */ @@ -1033,8 +1025,7 @@ void md_bitmap_unplug(struct bitmap *bitmap) int dirty, need_write; int writing = 0; - if (!bitmap || !bitmap->storage.filemap || - test_bit(BITMAP_STALE, &bitmap->flags)) + if (!md_bitmap_enabled(bitmap)) return; /* look at each page to see if there are any set bits that need to be @@ -1063,6 +1054,35 @@ void md_bitmap_unplug(struct bitmap *bitmap) } EXPORT_SYMBOL(md_bitmap_unplug); +struct bitmap_unplug_work { + struct work_struct work; + struct bitmap *bitmap; + struct completion *done; +}; + +static void md_bitmap_unplug_fn(struct work_struct *work) +{ + struct bitmap_unplug_work *unplug_work = + container_of(work, struct bitmap_unplug_work, work); + + md_bitmap_unplug(unplug_work->bitmap); + complete(unplug_work->done); +} + +void md_bitmap_unplug_async(struct bitmap *bitmap) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct bitmap_unplug_work unplug_work; + + INIT_WORK_ONSTACK(&unplug_work.work, md_bitmap_unplug_fn); + unplug_work.bitmap = bitmap; + unplug_work.done = &done; + + queue_work(md_bitmap_wq, &unplug_work.work); + wait_for_completion(&done); +} +EXPORT_SYMBOL(md_bitmap_unplug_async); + static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); /* * bitmap_init_from_disk -- called at bitmap_create time to initialize * the in-memory bitmap from the on-disk bitmap -- also, sets up the @@ -1241,11 +1261,28 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, sector_t offset, sector_t *blocks, int create); +static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout, + bool force) +{ + struct md_thread *thread; + + rcu_read_lock(); + thread = rcu_dereference(mddev->thread); + + if (!thread) + goto out; + + if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT) + thread->timeout = timeout; + +out: + rcu_read_unlock(); +} + /* * bitmap daemon -- periodically wakes up to clean bits and flush pages * out to disk */ - void md_bitmap_daemon_work(struct mddev *mddev) { struct bitmap *bitmap; @@ -1269,7 +1306,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) bitmap->daemon_lastrun = jiffies; if (bitmap->allclean) { - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); goto done; } bitmap->allclean = 1; @@ -1366,8 +1403,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) done: if (bitmap->allclean == 0) - mddev->thread->timeout = - mddev->bitmap_info.daemon_sleep; + mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); mutex_unlock(&mddev->bitmap_info.mutex); } @@ -1387,6 +1423,14 @@ __acquires(bitmap->lock) sector_t csize; int err; + if (page >= bitmap->pages) { + /* + * This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page or + * user set a huge number to sysfs bitmap_set_bits. + */ + return NULL; + } err = md_bitmap_checkpage(bitmap, page, create, 0); if (bitmap->bp[page].hijacked || @@ -1820,8 +1864,7 @@ void md_bitmap_destroy(struct mddev *mddev) mddev->bitmap = NULL; /* disconnect from the md device */ spin_unlock(&mddev->lock); mutex_unlock(&mddev->bitmap_info.mutex); - if (mddev->thread) - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); md_bitmap_free(bitmap); } @@ -1964,7 +2007,7 @@ int md_bitmap_load(struct mddev *mddev) /* Kick recovery in case any bits were set */ set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); - mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; + mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); md_wakeup_thread(mddev->thread); md_bitmap_update_sb(bitmap); @@ -2469,17 +2512,11 @@ timeout_store(struct mddev *mddev, const char *buf, size_t len) timeout = MAX_SCHEDULE_TIMEOUT-1; if (timeout < 1) timeout = 1; + mddev->bitmap_info.daemon_sleep = timeout; - if (mddev->thread) { - /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then - * the bitmap is all clean and we don't need to - * adjust the timeout right now - */ - if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { - mddev->thread->timeout = timeout; - md_wakeup_thread(mddev->thread); - } - } + mddev_set_timeout(mddev, timeout, false); + md_wakeup_thread(mddev->thread); + return len; } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index cfd7395de8fd..8a3788c9bfef 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -264,6 +264,7 @@ void md_bitmap_sync_with_cluster(struct mddev *mddev, sector_t new_lo, sector_t new_hi); void md_bitmap_unplug(struct bitmap *bitmap); +void md_bitmap_unplug_async(struct bitmap *bitmap); void md_bitmap_daemon_work(struct mddev *mddev); int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, @@ -273,6 +274,13 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); void md_bitmap_free(struct bitmap *bitmap); void md_bitmap_wait_behind_writes(struct mddev *mddev); + +static inline bool md_bitmap_enabled(struct bitmap *bitmap) +{ + return bitmap && bitmap->storage.filemap && + !test_bit(BITMAP_STALE, &bitmap->flags); +} + #endif #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 10e0c5381d01..3d9fd74233df 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -75,14 +75,14 @@ struct md_cluster_info { sector_t suspend_hi; int suspend_from; /* the slot which broadcast suspend_lo/hi */ - struct md_thread *recovery_thread; + struct md_thread __rcu *recovery_thread; unsigned long recovery_map; /* communication loc resources */ struct dlm_lock_resource *ack_lockres; struct dlm_lock_resource *message_lockres; struct dlm_lock_resource *token_lockres; struct dlm_lock_resource *no_new_dev_lockres; - struct md_thread *recv_thread; + struct md_thread __rcu *recv_thread; struct completion newdisk_completion; wait_queue_head_t wait; unsigned long state; @@ -362,8 +362,8 @@ static void __recover_slot(struct mddev *mddev, int slot) set_bit(slot, &cinfo->recovery_map); if (!cinfo->recovery_thread) { - cinfo->recovery_thread = md_register_thread(recover_bitmaps, - mddev, "recover"); + rcu_assign_pointer(cinfo->recovery_thread, + md_register_thread(recover_bitmaps, mddev, "recover")); if (!cinfo->recovery_thread) { pr_warn("md-cluster: Could not create recovery thread\n"); return; @@ -526,11 +526,15 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { int got_lock = 0; + struct md_thread *thread; struct md_cluster_info *cinfo = mddev->cluster_info; mddev->good_device_nr = le32_to_cpu(msg->raid_slot); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); - wait_event(mddev->thread->wqueue, + + /* daemaon thread must exist */ + thread = rcu_dereference_protected(mddev->thread, true); + wait_event(thread->wqueue, (got_lock = mddev_trylock(mddev)) || test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)); md_reload_sb(mddev, mddev->good_device_nr); @@ -889,7 +893,8 @@ static int join(struct mddev *mddev, int nodes) } /* Initiate the communication resources */ ret = -ENOMEM; - cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv"); + rcu_assign_pointer(cinfo->recv_thread, + md_register_thread(recv_daemon, mddev, "cluster_recv")); if (!cinfo->recv_thread) { pr_err("md-cluster: cannot allocate memory for recv_thread!\n"); goto err; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 66edf5e72bd6..92c45be203d7 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -400,8 +400,8 @@ static int multipath_run (struct mddev *mddev) if (ret) goto out_free_conf; - mddev->thread = md_register_thread(multipathd, mddev, - "multipath"); + rcu_assign_pointer(mddev->thread, + md_register_thread(multipathd, mddev, "multipath")); if (!mddev->thread) goto out_free_conf; diff --git a/drivers/md/md.c b/drivers/md/md.c index ca0de7ddd943..cf3733c90c47 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -70,11 +70,7 @@ #include "md-bitmap.h" #include "md-cluster.h" -/* pers_list is a list of registered personalities protected - * by pers_lock. - * pers_lock does extra service to protect accesses to - * mddev->thread when the mutex cannot be held. - */ +/* pers_list is a list of registered personalities protected by pers_lock. */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); @@ -87,23 +83,13 @@ static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; -static struct workqueue_struct *md_rdev_misc_wq; +struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); - -enum md_ro_state { - MD_RDWR, - MD_RDONLY, - MD_AUTO_READ, - MD_MAX_STATE -}; - -static bool md_is_rdwr(struct mddev *mddev) -{ - return (mddev->ro == MD_RDWR); -} +static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); +static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* * Default number of read corrections we'll attempt on an rdev @@ -360,10 +346,6 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); -static bool is_md_suspended(struct mddev *mddev) -{ - return percpu_ref_is_dying(&mddev->active_io); -} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -457,13 +439,19 @@ static void md_submit_bio(struct bio *bio) */ void mddev_suspend(struct mddev *mddev) { - WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); - lockdep_assert_held(&mddev->reconfig_mutex); + struct md_thread *thread = rcu_dereference_protected(mddev->thread, + lockdep_is_held(&mddev->reconfig_mutex)); + + WARN_ON_ONCE(thread && current == thread->tsk); if (mddev->suspended++) return; wake_up(&mddev->sb_wait); set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); percpu_ref_kill(&mddev->active_io); + + if (mddev->pers->prepare_suspend) + mddev->pers->prepare_suspend(mddev); + wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); mddev->pers->quiesce(mddev, 1); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); @@ -655,9 +643,11 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->delete_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); + INIT_LIST_HEAD(&mddev->deleting); timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); @@ -759,6 +749,24 @@ static void mddev_free(struct mddev *mddev) static const struct attribute_group md_redundancy_group; +static void md_free_rdev(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct md_rdev *tmp; + + mutex_lock(&mddev->delete_mutex); + if (list_empty(&mddev->deleting)) + goto out; + + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { + list_del_init(&rdev->same_set); + kobject_del(&rdev->kobj); + export_rdev(rdev, mddev); + } +out: + mutex_unlock(&mddev->delete_mutex); +} + void mddev_unlock(struct mddev *mddev) { if (mddev->to_remove) { @@ -800,13 +808,10 @@ void mddev_unlock(struct mddev *mddev) } else mutex_unlock(&mddev->reconfig_mutex); - /* As we've dropped the mutex we need a spinlock to - * make sure the thread doesn't disappear - */ - spin_lock(&pers_lock); + md_free_rdev(mddev); + md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); - spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -2440,13 +2445,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return err; } -static void rdev_delayed_delete(struct work_struct *ws) -{ - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); - kobject_del(&rdev->kobj); - kobject_put(&rdev->kobj); -} - void md_autodetect_dev(dev_t dev); /* just for claiming the bdev */ @@ -2467,6 +2465,8 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) static void md_kick_rdev_from_array(struct md_rdev *rdev) { + struct mddev *mddev = rdev->mddev; + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); @@ -2480,15 +2480,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) rdev->sysfs_unack_badblocks = NULL; rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; - /* We need to delay this, otherwise we can deadlock when - * writing to 'remove' to "dev/state". We also need - * to delay it due to rcu usage. - */ + synchronize_rcu(); - INIT_WORK(&rdev->del_work, rdev_delayed_delete); - kobject_get(&rdev->kobj); - queue_work(md_rdev_misc_wq, &rdev->del_work); - export_rdev(rdev, rdev->mddev); + + /* + * kobject_del() will wait for all in progress writers to be done, where + * reconfig_mutex is held, hence it can't be called under + * reconfig_mutex and it's delayed to mddev_unlock(). + */ + mutex_lock(&mddev->delete_mutex); + list_add(&rdev->same_set, &mddev->deleting); + mutex_unlock(&mddev->delete_mutex); } static void export_array(struct mddev *mddev) @@ -3556,6 +3558,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + struct kernfs_node *kn = NULL; ssize_t rv; struct mddev *mddev = rdev->mddev; @@ -3563,6 +3566,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + if (entry->store == state_store && cmd_match(page, "remove")) + kn = sysfs_break_active_protection(kobj, attr); + rv = mddev ? mddev_lock(mddev) : -ENODEV; if (!rv) { if (rdev->mddev == NULL) @@ -3571,6 +3578,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, rv = entry->store(rdev, page, length); mddev_unlock(mddev); } + + if (kn) + sysfs_unbreak_active_protection(kn); + return rv; } @@ -3796,8 +3807,9 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) static ssize_t safe_delay_show(struct mddev *mddev, char *page) { - int msec = (mddev->safemode_delay*1000)/HZ; - return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); + unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; + + return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); } static ssize_t safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) @@ -3809,7 +3821,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) return -EINVAL; } - if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) return -EINVAL; if (msec == 0) mddev->safemode_delay = 0; @@ -4479,6 +4491,8 @@ max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; + if (n > INT_MAX) + return -EINVAL; atomic_set(&mddev->max_corr_read_errors, n); return len; } @@ -4493,20 +4507,6 @@ null_show(struct mddev *mddev, char *page) return -EINVAL; } -/* need to ensure rdev_delayed_delete() has completed */ -static void flush_rdev_wq(struct mddev *mddev) -{ - struct md_rdev *rdev; - - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (work_pending(&rdev->del_work)) { - flush_workqueue(md_rdev_misc_wq); - break; - } - rcu_read_unlock(); -} - static ssize_t new_dev_store(struct mddev *mddev, const char *buf, size_t len) { @@ -4534,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - flush_rdev_wq(mddev); err = mddev_lock(mddev); if (err) return err; @@ -4806,11 +4805,21 @@ action_store(struct mddev *mddev, const char *page, size_t len) return -EINVAL; err = mddev_lock(mddev); if (!err) { - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { err = -EBUSY; - else { + } else if (mddev->reshape_position == MaxSector || + mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev)) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = mddev->pers->start_reshape(mddev); + } else { + /* + * If reshape is still in progress, and + * md_check_recovery() can continue to reshape, + * don't restart reshape because data can be + * corrupted for raid456. + */ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } mddev_unlock(mddev); } @@ -5594,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name) * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); - flush_workqueue(md_rdev_misc_wq); mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); @@ -6271,10 +6279,12 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (mddev->sync_thread) - /* Thread might be blocked waiting for metadata update - * which will now never happen */ - wake_up_process(mddev->sync_thread->tsk); + + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) return -EBUSY; @@ -6335,10 +6345,12 @@ static int do_md_stop(struct mddev *mddev, int mode, } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (mddev->sync_thread) - /* Thread might be blocked waiting for metadata update - * which will now never happen */ - wake_up_process(mddev->sync_thread->tsk); + + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); mddev_unlock(mddev); wait_event(resync_wait, (mddev->sync_thread == NULL && @@ -6735,7 +6747,6 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) if (info->state & (1<<MD_DISK_SYNC) && info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - set_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); } else rdev->raid_disk = -1; @@ -7557,9 +7568,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, } - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) - flush_rdev_wq(mddev); - if (cmd == HOT_REMOVE_DISK) /* need to ensure recovery thread has run */ wait_event_interruptible_timeout(mddev->sb_wait, @@ -7888,13 +7896,29 @@ static int md_thread(void *arg) return 0; } -void md_wakeup_thread(struct md_thread *thread) +static void md_wakeup_thread_directly(struct md_thread __rcu *thread) { - if (thread) { - pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - set_bit(THREAD_WAKEUP, &thread->flags); - wake_up(&thread->wqueue); + struct md_thread *t; + + rcu_read_lock(); + t = rcu_dereference(thread); + if (t) + wake_up_process(t->tsk); + rcu_read_unlock(); +} + +void md_wakeup_thread(struct md_thread __rcu *thread) +{ + struct md_thread *t; + + rcu_read_lock(); + t = rcu_dereference(thread); + if (t) { + pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); + set_bit(THREAD_WAKEUP, &t->flags); + wake_up(&t->wqueue); } + rcu_read_unlock(); } EXPORT_SYMBOL(md_wakeup_thread); @@ -7924,22 +7948,15 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *), } EXPORT_SYMBOL(md_register_thread); -void md_unregister_thread(struct md_thread **threadp) +void md_unregister_thread(struct md_thread __rcu **threadp) { - struct md_thread *thread; + struct md_thread *thread = rcu_dereference_protected(*threadp, true); - /* - * Locking ensures that mddev_unlock does not wake_up a - * non-existent thread - */ - spin_lock(&pers_lock); - thread = *threadp; - if (!thread) { - spin_unlock(&pers_lock); + if (!thread) return; - } - *threadp = NULL; - spin_unlock(&pers_lock); + + rcu_assign_pointer(*threadp, NULL); + synchronize_rcu(); pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); @@ -9102,6 +9119,7 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&mddev->lock); wake_up(&resync_wait); + wake_up(&mddev->sb_wait); md_wakeup_thread(mddev->thread); return; } @@ -9204,9 +9222,8 @@ static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - mddev->sync_thread = md_register_thread(md_do_sync, - mddev, - "resync"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "resync")); if (!mddev->sync_thread) { pr_warn("%s: could not start resync thread...\n", mdname(mddev)); @@ -9621,9 +9638,10 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); - if (!md_rdev_misc_wq) - goto err_rdev_misc_wq; + md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, + 0); + if (!md_bitmap_wq) + goto err_bitmap_wq; ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) @@ -9643,8 +9661,8 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_rdev_misc_wq); -err_rdev_misc_wq: + destroy_workqueue(md_bitmap_wq); +err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); @@ -9940,8 +9958,8 @@ static __exit void md_exit(void) } spin_unlock(&all_mddevs_lock); - destroy_workqueue(md_rdev_misc_wq); destroy_workqueue(md_misc_wq); + destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); } diff --git a/drivers/md/md.h b/drivers/md/md.h index fd8f260ed5f8..bfd2306bc750 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -122,8 +122,6 @@ struct md_rdev { struct serial_in_rdev *serial; /* used for raid1 io serialization */ - struct work_struct del_work; /* used for delayed sysfs removal */ - struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ @@ -367,8 +365,8 @@ struct mddev { int new_chunk_sectors; int reshape_backwards; - struct md_thread *thread; /* management thread */ - struct md_thread *sync_thread; /* doing resync or reconstruct */ + struct md_thread __rcu *thread; /* management thread */ + struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ /* 'last_sync_action' is initialized to "none". It is set when a * sync operation (i.e "data-check", "requested-resync", "resync", @@ -531,6 +529,14 @@ struct mddev { unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ + /* + * Temporarily store rdev that will be finally removed when + * reconfig_mutex is unlocked. + */ + struct list_head deleting; + /* Protect the deleting list */ + struct mutex delete_mutex; + bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; @@ -555,6 +561,23 @@ enum recovery_flags { MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static inline bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + +static inline bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} + static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); @@ -614,6 +637,7 @@ struct md_personality int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); + void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour @@ -734,8 +758,8 @@ extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); -extern void md_unregister_thread(struct md_thread **threadp); -extern void md_wakeup_thread(struct md_thread *thread); +extern void md_unregister_thread(struct md_thread __rcu **threadp); +extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern int mddev_init_writes_pending(struct mddev *mddev); @@ -828,6 +852,7 @@ struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; +extern struct workqueue_struct *md_bitmap_wq; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index cd349e69ed77..169ebe296f2d 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -21,6 +21,7 @@ #define IO_MADE_GOOD ((struct bio *)2) #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) +#define MAX_PLUG_BIO 32 /* for managing resync I/O pages */ struct resync_pages { @@ -31,6 +32,7 @@ struct resync_pages { struct raid1_plug_cb { struct blk_plug_cb cb; struct bio_list pending; + unsigned int count; }; static void rbio_pool_free(void *rbio, void *data) @@ -110,3 +112,64 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, size -= len; } while (idx++ < RESYNC_PAGES && size > 0); } + + +static inline void raid1_submit_write(struct bio *bio) +{ + struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev; + + bio->bi_next = NULL; + bio_set_dev(bio, rdev->bdev); + if (test_bit(Faulty, &rdev->flags)) + bio_io_error(bio); + else if (unlikely(bio_op(bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(bio->bi_bdev))) + /* Just ignore it */ + bio_endio(bio); + else + submit_bio_noacct(bio); +} + +static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, + blk_plug_cb_fn unplug, int copies) +{ + struct raid1_plug_cb *plug = NULL; + struct blk_plug_cb *cb; + + /* + * If bitmap is not enabled, it's safe to submit the io directly, and + * this can get optimal performance. + */ + if (!md_bitmap_enabled(mddev->bitmap)) { + raid1_submit_write(bio); + return true; + } + + cb = blk_check_plugged(unplug, mddev, sizeof(*plug)); + if (!cb) + return false; + + plug = container_of(cb, struct raid1_plug_cb, cb); + bio_list_add(&plug->pending, bio); + if (++plug->count / MAX_PLUG_BIO >= copies) { + list_del(&cb->list); + cb->callback(cb, false); + } + + + return true; +} + +/* + * current->bio_list will be set under submit_bio() context, in this case bitmap + * io will be added to the list and wait for current io submission to finish, + * while current io submission must wait for bitmap io to be done. In order to + * avoid such deadlock, submit bitmap io asynchronously. + */ +static inline void raid1_prepare_flush_writes(struct bitmap *bitmap) +{ + if (current->bio_list) + md_bitmap_unplug_async(bit |
