From 3de13550a20fd570441c0f854abe58c6cc46c0bc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 15:22:12 +0200 Subject: raid6: neon: add missing prototypes The raid6 syndrome functions are generated for different sizes and have no generic prototype, while in the inner functions have a prototype in a header that cannot be included from the correct file. In both cases, the compiler warns about missing prototypes: lib/raid6/recov_neon_inner.c:27:6: warning: no previous prototype for '__raid6_2data_recov_neon' [-Wmissing-prototypes] lib/raid6/recov_neon_inner.c:77:6: warning: no previous prototype for '__raid6_datap_recov_neon' [-Wmissing-prototypes] lib/raid6/neon1.c:56:6: warning: no previous prototype for 'raid6_neon1_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon1.c:86:6: warning: no previous prototype for 'raid6_neon1_xor_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon2.c:56:6: warning: no previous prototype for 'raid6_neon2_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon2.c:97:6: warning: no previous prototype for 'raid6_neon2_xor_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon4.c:56:6: warning: no previous prototype for 'raid6_neon4_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon4.c:119:6: warning: no previous prototype for 'raid6_neon4_xor_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon8.c:56:6: warning: no previous prototype for 'raid6_neon8_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon8.c:163:6: warning: no previous prototype for 'raid6_neon8_xor_syndrome_real' [-Wmissing-prototypes] Add a new header file that contains the prototypes for both to avoid the warnings. Signed-off-by: Arnd Bergmann Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230517132220.937200-1-arnd@kernel.org --- lib/raid6/neon.h | 22 ++++++++++++++++++++++ lib/raid6/neon.uc | 1 + lib/raid6/recov_neon.c | 8 +------- lib/raid6/recov_neon_inner.c | 1 + 4 files changed, 25 insertions(+), 7 deletions(-) create mode 100644 lib/raid6/neon.h diff --git a/lib/raid6/neon.h b/lib/raid6/neon.h new file mode 100644 index 000000000000..2ca41ee9b499 --- /dev/null +++ b/lib/raid6/neon.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only + +void raid6_neon1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon1_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon2_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon4_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon8_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul); + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul); + + diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc index b7c68030da4f..355270af0cd6 100644 --- a/lib/raid6/neon.uc +++ b/lib/raid6/neon.uc @@ -25,6 +25,7 @@ */ #include +#include "neon.h" typedef uint8x16_t unative_t; diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c index d6fba8bf8c0a..1bfc14174d4d 100644 --- a/lib/raid6/recov_neon.c +++ b/lib/raid6/recov_neon.c @@ -8,6 +8,7 @@ #ifdef __KERNEL__ #include +#include "neon.h" #else #define kernel_neon_begin() #define kernel_neon_end() @@ -19,13 +20,6 @@ static int raid6_has_neon(void) return cpu_has_neon(); } -void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, - uint8_t *dq, const uint8_t *pbmul, - const uint8_t *qmul); - -void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, - const uint8_t *qmul); - static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, int failb, void **ptrs) { diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c index 90eb80d43790..f9e7e8f5a151 100644 --- a/lib/raid6/recov_neon_inner.c +++ b/lib/raid6/recov_neon_inner.c @@ -5,6 +5,7 @@ */ #include +#include "neon.h" #ifdef CONFIG_ARM /* -- cgit v1.2.3 From 301867b1c16805aebbc306aafa6ecdc68b73c7e5 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 15 May 2023 21:48:05 +0800 Subject: md/raid10: check slab-out-of-bounds in md_bitmap_get_counter If we write a large number to md/bitmap_set_bits, md_bitmap_checkpage() will return -EINVAL because 'page >= bitmap->pages', but the return value was not checked immediately in md_bitmap_get_counter() in order to set *blocks value and slab-out-of-bounds occurs. Move check of 'page >= bitmap->pages' to md_bitmap_get_counter() and return directly if true. Fixes: ef4256733506 ("md/bitmap: optimise scanning of empty bitmaps.") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230515134808.3936750-2-linan666@huaweicloud.com --- drivers/md/md-bitmap.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index bc8d7565171d..358a06495902 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -54,14 +54,7 @@ __acquires(bitmap->lock) { unsigned char *mappage; - if (page >= bitmap->pages) { - /* This can happen if bitmap_start_sync goes beyond - * End-of-device while looking for a whole page. - * It is harmless. - */ - return -EINVAL; - } - + WARN_ON_ONCE(page >= bitmap->pages); if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ return 0; @@ -1387,6 +1380,14 @@ __acquires(bitmap->lock) sector_t csize; int err; + if (page >= bitmap->pages) { + /* + * This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page or + * user set a huge number to sysfs bitmap_set_bits. + */ + return NULL; + } err = md_bitmap_checkpage(bitmap, page, create, 0); if (bitmap->bp[page].hijacked || -- cgit v1.2.3 From 46038b30b308c3ebf49e79548f109d00a8d74b31 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 12 May 2023 09:56:06 +0800 Subject: md/raid5: don't allow replacement while reshape is in progress If reshape is interrupted(for example, echo frozen to sync_action), then rdev replacement can be set. It's safe because reshape is always prior to resync in md_check_recovery(). However, if system reboots, then kernel will complain cannot handle concurrent replacement and reshape and this array is not able to assemble anymore. Fix this problem by don't allow replacement until reshape is done. Reported-by: Peter Neuwirth Link: https://lore.kernel.org/linux-raid/e2f96772-bfbc-f43b-6da1-f520e5164536@online.de/ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230512015610.821290-2-yukuai1@huaweicloud.com --- drivers/md/raid5.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4739ed891e75..5950932323fc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8377,6 +8377,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) p = conf->disks + disk; tmp = rdev_mdlock_deref(mddev, p->rdev); if (test_bit(WantReplacement, &tmp->flags) && + mddev->reshape_position == MaxSector && p->replacement == NULL) { clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); -- cgit v1.2.3 From 873f50ece41aad5c4f788a340960c53774b5526e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 12 May 2023 09:56:07 +0800 Subject: md: fix data corruption for raid456 when reshape restart while grow up Currently, if reshape is interrupted, echo "reshape" to sync_action will restart reshape from scratch, for example: echo frozen > sync_action echo reshape > sync_action This will corrupt data before reshape_position if the array is growing, fix the problem by continue reshape from reshape_position. Reported-by: Peter Neuwirth Link: https://lore.kernel.org/linux-raid/e2f96772-bfbc-f43b-6da1-f520e5164536@online.de/ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230512015610.821290-3-yukuai1@huaweicloud.com --- drivers/md/md.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ca0de7ddd943..b7f83784710b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4806,11 +4806,21 @@ action_store(struct mddev *mddev, const char *page, size_t len) return -EINVAL; err = mddev_lock(mddev); if (!err) { - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { err = -EBUSY; - else { + } else if (mddev->reshape_position == MaxSector || + mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev)) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = mddev->pers->start_reshape(mddev); + } else { + /* + * If reshape is still in progress, and + * md_check_recovery() can continue to reshape, + * don't restart reshape because data can be + * corrupted for raid456. + */ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } mddev_unlock(mddev); } -- cgit v1.2.3 From 431e61257d631157e1d374f1368febf37aa59f7c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 12 May 2023 09:56:08 +0800 Subject: md: export md_is_rdwr() and is_md_suspended() The two apis will be used later to fix a deadlock in raid456, there are no functional changes. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230512015610.821290-4-yukuai1@huaweicloud.com --- drivers/md/md.c | 16 ---------------- drivers/md/md.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index b7f83784710b..fb060e381ae7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -93,18 +93,6 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); -enum md_ro_state { - MD_RDWR, - MD_RDONLY, - MD_AUTO_READ, - MD_MAX_STATE -}; - -static bool md_is_rdwr(struct mddev *mddev) -{ - return (mddev->ro == MD_RDWR); -} - /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error @@ -360,10 +348,6 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); -static bool is_md_suspended(struct mddev *mddev) -{ - return percpu_ref_is_dying(&mddev->active_io); -} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. diff --git a/drivers/md/md.h b/drivers/md/md.h index fd8f260ed5f8..da173d6bf726 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -555,6 +555,23 @@ enum recovery_flags { MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static inline bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + +static inline bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} + static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); -- cgit v1.2.3 From 3e00777d51572bdd75cef29c9c31106b52d7cc8f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 12 May 2023 09:56:09 +0800 Subject: md: add a new api prepare_suspend() in md_personality There are no functional changes, the new api will be used later to do special handling for raid456 in md_suspend(). Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230512015610.821290-5-yukuai1@huaweicloud.com --- drivers/md/md.c | 4 ++++ drivers/md/md.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index fb060e381ae7..2f29d4e365c5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -448,6 +448,10 @@ void mddev_suspend(struct mddev *mddev) wake_up(&mddev->sb_wait); set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); percpu_ref_kill(&mddev->active_io); + + if (mddev->pers->prepare_suspend) + mddev->pers->prepare_suspend(mddev); + wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); mddev->pers->quiesce(mddev, 1); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); diff --git a/drivers/md/md.h b/drivers/md/md.h index da173d6bf726..1eec65cf783c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -631,6 +631,7 @@ struct md_personality int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); + void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour -- cgit v1.2.3 From 868bba54a3bcbfc34314e963d5a7e66717f39d4e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 12 May 2023 09:56:10 +0800 Subject: md/raid5: fix a deadlock in the case that reshape is interrupted If reshape is in progress and io across reshape_position is issued, such io will wait for reshape to make progress(see details in the case that make_stripe_request() return STRIPE_SCHEDULE_AND_RETRY). It has been reported several times that if system reboot while growing raid5 to raid6, array assemble will hang infinitely([1, 2]). This is because following deadlock is triggered: 1) a normal io is waiting for reshape to progress, this io can be from system-udevd or mdadm. 2) while assemble, mdadm tries to suspend the array, hence 'reconfig_mutex' is held and mddev_suspend() must wait for normal io to be done. 3) daemon thread can't start reshape because 'reconfig_mutex' can't be held. 1) and 3) is unbreakable because they're foundation design. In order to break 2), following is possible solutions that I can think of: a) Let mddev_suspend() fail is not a good option, because this will break many scenarios since mddev_suspend() doesn't fail before. b) Fail the io that is waiting for reshape to make progress from mddev_suspend(). c) Return false for the io that is waiting for reshape to make progress from raid5_make_request(), and these io will wait for suspend to be done in md_handle_request(), where 'active_io' is not grabbed. c) sounds better than b), however, b) is used because it's easy and straightforward, and it's verified that mdadm can assemble in this case. On the other hand, c) breaks the logic that mddev_suspend() will wait for submitted io to be completely handled. Fix the problem by checking reshape in mddev_suspend(), if reshape can't make progress and there are still some io waiting for reshape, fail those io. [1] https://lore.kernel.org/all/CAFig2csUV2QiomUhj_t3dPOgV300dbQ6XtM9ygKPdXJFSH__Nw@mail.gmail.com/ [2] https://lore.kernel.org/all/CAO2ABipzbw6QL5eNa44CQHjiVa-LTvS696Mh9QaTw+qsUKFUCw@mail.gmail.com/ Reported-by: Jove Reported-by: David Gilmour Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230512015610.821290-6-yukuai1@huaweicloud.com --- drivers/md/md.c | 1 + drivers/md/raid5.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2f29d4e365c5..36af585b0e96 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9100,6 +9100,7 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&mddev->lock); wake_up(&resync_wait); + wake_up(&mddev->sb_wait); md_wakeup_thread(mddev->thread); return; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5950932323fc..01c55f24ab09 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5966,6 +5966,19 @@ out: return ret; } +static bool reshape_inprogress(struct mddev *mddev) +{ + return test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery) && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery); +} + +static bool reshape_disabled(struct mddev *mddev) +{ + return is_md_suspended(mddev) || !md_is_rdwr(mddev); +} + static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -5997,7 +6010,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, if (ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { spin_unlock_irq(&conf->device_lock); - return STRIPE_SCHEDULE_AND_RETRY; + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } } spin_unlock_irq(&conf->device_lock); @@ -6076,6 +6090,15 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, out_release: raid5_release_stripe(sh); +out: + if (ret == STRIPE_SCHEDULE_AND_RETRY && !reshape_inprogress(mddev) && + reshape_disabled(mddev)) { + bi->bi_status = BLK_STS_IOERR; + ret = STRIPE_FAIL; + pr_err("md/raid456:%s: io failed across reshape position while reshape can't make progress.\n", + mdname(mddev)); + } + return ret; } @@ -9044,6 +9067,22 @@ static int raid5_start(struct mddev *mddev) return r5l_start(conf->log); } +static void raid5_prepare_suspend(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + wait_event(mddev->sb_wait, !reshape_inprogress(mddev) || + percpu_ref_is_zero(&mddev->active_io)); + if (percpu_ref_is_zero(&mddev->active_io)) + return; + + /* + * Reshape is not in progress, and array is suspended, io that is + * waiting for reshpape can never be done. + */ + wake_up(&conf->wait_for_overlap); +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -9064,6 +9103,7 @@ static struct md_personality raid6_personality = .check_reshape = raid6_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9088,6 +9128,7 @@ static struct md_personality raid5_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9113,6 +9154,7 @@ static struct md_personality raid4_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy, -- cgit v1.2.3 From 6beb489b2eed25978523f379a605073f99240c50 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 22 May 2023 15:25:33 +0800 Subject: md/raid10: fix overflow of md/safe_mode_delay There is no input check when echo md/safe_mode_delay in safe_delay_store(). And msec might also overflow when HZ < 1000 in safe_delay_show(), Fix it by checking overflow in safe_delay_store() and use unsigned long conversion in safe_delay_show(). Fixes: 72e02075a33f ("md: factor out parsing of fixed-point numbers") Signed-off-by: Li Nan Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230522072535.1523740-2-linan666@huaweicloud.com --- drivers/md/md.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 36af585b0e96..2fc8d25f6e80 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3784,8 +3784,9 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) static ssize_t safe_delay_show(struct mddev *mddev, char *page) { - int msec = (mddev->safemode_delay*1000)/HZ; - return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); + unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; + + return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); } static ssize_t safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) @@ -3797,7 +3798,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) return -EINVAL; } - if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) return -EINVAL; if (msec == 0) mddev->safemode_delay = 0; -- cgit v1.2.3 From f8b20a405428803bd9881881d8242c9d72c6b2b2 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 22 May 2023 15:25:34 +0800 Subject: md/raid10: fix wrong setting of max_corr_read_errors There is no input check when echo md/max_read_errors and overflow might occur. Add check of input number. Fixes: 1e50915fe0bb ("raid: improve MD/raid10 handling of correctable read errors.") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230522072535.1523740-3-linan666@huaweicloud.com --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2fc8d25f6e80..49c83ed08937 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4468,6 +4468,8 @@ max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; + if (n > INT_MAX) + return -EINVAL; atomic_set(&mddev->max_corr_read_errors, n); return len; } -- cgit v1.2.3 From 3ce94ce5d05ae89190a23f6187f64d8f4b2d3782 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 09:27:27 +0800 Subject: md: fix duplicate filename for rdev Commit 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs") delays the deletion of rdev, however, this introduces a window that rdev can be added again while the deletion is not done yet, and sysfs will complain about duplicate filename. Follow up patches try to fix this problem by flushing workqueue, however, flush_rdev_wq() is just dead code, the progress in md_kick_rdev_from_array(): 1) list_del_rcu(&rdev->same_set); 2) synchronize_rcu(); 3) queue_work(md_rdev_misc_wq, &rdev->del_work); So in flush_rdev_wq(), if rdev is found in the list, work_pending() can never pass, in the meantime, if work is queued, then rdev can never be found in the list. flush_rdev_wq() can be replaced by flush_workqueue() directly, however, this approach is not good: - the workqueue is global, this synchronization for all raid disks is not necessary. - flush_workqueue can't be called under 'reconfig_mutex', there is still a small window between flush_workqueue() and mddev_lock() that other contexts can queue new work, hence the problem is not solved completely. sysfs already has apis to support delete itself through writer, and these apis, specifically sysfs_break/unbreak_active_protection(), is used to support deleting rdev synchronously. Therefore, the above commit can be reverted, and sysfs duplicate filename can be avoided. A new mdadm regression test is proposed as well([1]). [1] https://lore.kernel.org/linux-raid/20230428062845.1975462-1-yukuai1@huaweicloud.com/ Fixes: 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523012727.3042247-1-yukuai1@huaweicloud.com --- drivers/md/md.c | 86 +++++++++++++++++++++++++++++---------------------------- drivers/md/md.h | 10 +++++-- 2 files changed, 52 insertions(+), 44 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 49c83ed08937..724c7414b241 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -87,11 +87,11 @@ static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; -static struct workqueue_struct *md_rdev_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); +static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); /* * Default number of read corrections we'll attempt on an rdev @@ -643,9 +643,11 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->delete_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); + INIT_LIST_HEAD(&mddev->deleting); timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); @@ -747,6 +749,24 @@ static void mddev_free(struct mddev *mddev) static const struct attribute_group md_redundancy_group; +static void md_free_rdev(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct md_rdev *tmp; + + mutex_lock(&mddev->delete_mutex); + if (list_empty(&mddev->deleting)) + goto out; + + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { + list_del_init(&rdev->same_set); + kobject_del(&rdev->kobj); + export_rdev(rdev, mddev); + } +out: + mutex_unlock(&mddev->delete_mutex); +} + void mddev_unlock(struct mddev *mddev) { if (mddev->to_remove) { @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev) } else mutex_unlock(&mddev->reconfig_mutex); + md_free_rdev(mddev); + /* As we've dropped the mutex we need a spinlock to * make sure the thread doesn't disappear */ @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return err; } -static void rdev_delayed_delete(struct work_struct *ws) -{ - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); - kobject_del(&rdev->kobj); - kobject_put(&rdev->kobj); -} - void md_autodetect_dev(dev_t dev); /* just for claiming the bdev */ @@ -2455,6 +2470,8 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) static void md_kick_rdev_from_array(struct md_rdev *rdev) { + struct mddev *mddev = rdev->mddev; + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); @@ -2468,15 +2485,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) rdev->sysfs_unack_badblocks = NULL; rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; - /* We need to delay this, otherwise we can deadlock when - * writing to 'remove' to "dev/state". We also need - * to delay it due to rcu usage. - */ + synchronize_rcu(); - INIT_WORK(&rdev->del_work, rdev_delayed_delete); - kobject_get(&rdev->kobj); - queue_work(md_rdev_misc_wq, &rdev->del_work); - export_rdev(rdev, rdev->mddev); + + /* + * kobject_del() will wait for all in progress writers to be done, where + * reconfig_mutex is held, hence it can't be called under + * reconfig_mutex and it's delayed to mddev_unlock(). + */ + mutex_lock(&mddev->delete_mutex); + list_add(&rdev->same_set, &mddev->deleting); + mutex_unlock(&mddev->delete_mutex); } static void export_array(struct mddev *mddev) @@ -3544,6 +3563,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + struct kernfs_node *kn = NULL; ssize_t rv; struct mddev *mddev = rdev->mddev; @@ -3551,6 +3571,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + if (entry->store == state_store && cmd_match(page, "remove")) + kn = sysfs_break_active_protection(kobj, attr); + rv = mddev ? mddev_lock(mddev) : -ENODEV; if (!rv) { if (rdev->mddev == NULL) @@ -3559,6 +3583,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, rv = entry->store(rdev, page, length); mddev_unlock(mddev); } + + if (kn) + sysfs_unbreak_active_protection(kn); + return rv; } @@ -4484,20 +4512,6 @@ null_show(struct mddev *mddev, char *page) return -EINVAL; } -/* need to ensure rdev_delayed_delete() has completed */ -static void flush_rdev_wq(struct mddev *mddev) -{ - struct md_rdev *rdev; - - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (work_pending(&rdev->del_work)) { - flush_workqueue(md_rdev_misc_wq); - break; - } - rcu_read_unlock(); -} - static ssize_t new_dev_store(struct mddev *mddev, const char *buf, size_t len) { @@ -4525,7 +4539,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - flush_rdev_wq(mddev); err = mddev_lock(mddev); if (err) return err; @@ -5595,7 +5608,6 @@ struct mddev *md_alloc(dev_t dev, char *name) * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); - flush_workqueue(md_rdev_misc_wq); mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); @@ -7558,9 +7570,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, } - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) - flush_rdev_wq(mddev); - if (cmd == HOT_REMOVE_DISK) /* need to ensure recovery thread has run */ wait_event_interruptible_timeout(mddev->sb_wait, @@ -9623,10 +9632,6 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); - if (!md_rdev_misc_wq) - goto err_rdev_misc_wq; - ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; @@ -9645,8 +9650,6 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_rdev_misc_wq); -err_rdev_misc_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); @@ -9942,7 +9945,6 @@ static __exit void md_exit(void) } spin_unlock(&all_mddevs_lock); - destroy_workqueue(md_rdev_misc_wq); destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 1eec65cf783c..7156fc05f834 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -122,8 +122,6 @@ struct md_rdev { struct serial_in_rdev *serial; /* used for raid1 io serialization */ - struct work_struct del_work; /* used for delayed sysfs removal */ - struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ @@ -531,6 +529,14 @@ struct mddev { unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ + /* + * Temporarily store rdev that will be finally removed when + * reconfig_mutex is unlocked. + */ + struct list_head deleting; + /* Protect the deleting list */ + struct mutex delete_mutex; + bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; -- cgit v1.2.3 From e5e9b9cb71a09d86d5e8d147e6a6457e1f8887b5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 10:10:13 +0800 Subject: md: factor out a helper to wake up md_thread directly md_wakeup_thread() can't wakeup md_thread->tsk if md_thread->run is still in progress, and in some cases md_thread->tsk need to be woke up directly, like md_set_readonly() and do_md_stop(). Commit 9dfbdafda3b3 ("md: unlock mddev before reap sync_thread in action_store") introduce a new scenario where unregister sync_thread is not protected by 'reconfig_mutex', this can cause null-ptr-deference in theroy: t1: md_set_readonly t2: action_store md_unregister_thread // 'reconfig_mutex' is not held // 'reconfig_mutex' is held by caller if (mddev->sync_thread) thread = *threadp *threadp = NULL wake_up_process(mddev->sync_thread->tsk) // null-ptr-deference Fix this problem by factoring out a helper to wake up md_thread directly, so that 'sync_thread' won't be accessed multiple times from the reader side. This helper also prepare to protect md_thread with rcu. Noted that later patches is going to fix that unregister sync_thread is not protected by 'reconfig_mutex' from action_store(). Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523021017.3048783-2-yukuai1@huaweicloud.com --- drivers/md/md.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 724c7414b241..9d54de3441ef 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -92,6 +92,7 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); +static void md_wakeup_thread_directly(struct md_thread *thread); /* * Default number of read corrections we'll attempt on an rdev @@ -6284,10 +6285,12 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (mddev->sync_thread) - /* Thread might be blocked waiting for metadata update - * which will now never happen */ - wake_up_process(mddev->sync_thread->tsk); + + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) return -EBUSY; @@ -6348,10 +6351,12 @@ static int do_md_stop(struct mddev *mddev, int mode, } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (mddev->sync_thread) - /* Thread might be blocked waiting for metadata update - * which will now never happen */ - wake_up_process(mddev->sync_thread->tsk); + + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); mddev_unlock(mddev); wait_event(resync_wait, (mddev->sync_thread == NULL && @@ -7898,6 +7903,12 @@ static int md_thread(void *arg) return 0; } +static void md_wakeup_thread_directly(struct md_thread *thread) +{ + if (thread) + wake_up_process(thread->tsk); +} + void md_wakeup_thread(struct md_thread *thread) { if (thread) { -- cgit v1.2.3 From 955a257d69e44cea09b0375b8f2f3d4d9fcf7b4e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 10:10:14 +0800 Subject: dm-raid: remove useless checking in raid_message() md_wakeup_thread() handle the case that pass in md_thread is NULL, there is no need to check this. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523021017.3048783-3-yukuai1@huaweicloud.com --- drivers/md/dm-raid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c8821fcb8299..8846bf510a35 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3750,11 +3750,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, * canceling read-auto mode */ mddev->ro = 0; - if (!mddev->suspended && mddev->sync_thread) + if (!mddev->suspended) md_wakeup_thread(mddev->sync_thread); } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (!mddev->suspended && mddev->thread) + if (!mddev->suspended) md_wakeup_thread(mddev->thread); return 0; -- cgit v1.2.3 From c333673a78307abe6b1f6998809288fcd86740ed Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 10:10:15 +0800 Subject: md/bitmap: always wake up md_thread in timeout_store md_wakeup_thread() can handle the case that pass in md_thread is NULL, the only difference is that md_wakeup_thread() will be called when current timeout is 'MAX_SCHEDULE_TIMEOUT', this should not matter because timeout_store() is not hot path, and the daemon process is woke up more than demand from other context already. Prepare to factor out a helper to set timeout. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523021017.3048783-4-yukuai1@huaweicloud.com --- drivers/md/md-bitmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 358a06495902..4b5ba81c53be 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2476,11 +2476,11 @@ timeout_store(struct mddev *mddev, const char *buf, size_t len) * the bitmap is all clean and we don't need to * adjust the timeout right now */ - if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { + if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) mddev->thread->timeout = timeout; - md_wakeup_thread(mddev->thread); - } } + + md_wakeup_thread(mddev->thread); return len; } -- cgit v1.2.3 From 4eeb6535cd51100460ec8873bb68addef17b3e81 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 10:10:16 +0800 Subject: md/bitmap: factor out a helper to set timeout Register/unregister 'mddev->thread' are both under 'reconfig_mutex', however, some context didn't hold the mutex to access mddev->thread, which can cause null-ptr-deference: 1) md_bitmap_daemon_work() can be called from md_check_recovery() where 'reconfig_mutex' is not held, deference 'mddev->thread' might cause null-ptr-deference, because md_unregister_thread() reset the pointer before stopping the thread. 2) timeout_store() access 'mddev->thread' multiple times, null-ptr-deference can be triggered if 'mddev->thread' is reset in the middle. This patch factor out a helper to set timeout, the new helper always check if 'mddev->thread' is null first, so that problem 1 can be fixed. Now that this helper only access 'mddev->thread' once, but it's possible that 'mddev->thread' can be freed while this helper is still in progress, hence the problem is not fixed yet. Follow up patches will fix this by protecting md_thread with rcu. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523021017.3048783-5-yukuai1@huaweicloud.com --- drivers/md/md-bitmap.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 4b5ba81c53be..23522df41ca5 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1234,11 +1234,22 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, sector_t offset, sector_t *blocks, int create); +static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout, + bool force) +{ + struct md_thread *thread = mddev->thread; + + if (!thread) + return; + + if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT) + thread->timeout = timeout; +} + /* * bitmap daemon -- periodically wakes up to clean bits and flush pages * out to disk */ - void md_bitmap_daemon_work(struct mddev *mddev) { struct bitmap *bitmap; @@ -1262,7 +1273,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) bitmap->daemon_lastrun = jiffies; if (bitmap->allclean) { - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); goto done; } bitmap->allclean = 1; @@ -1359,8 +1370,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) done: if (bitmap->allclean == 0) - mddev->thread->timeout = - mddev->bitmap_info.daemon_sleep; + mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); mutex_unlock(&mddev->bitmap_info.mutex); } @@ -1821,8 +1831,7 @@ void md_bitmap_destroy(struct mddev *mddev) mddev->bitmap = NULL; /* disconnect from the md device */ spin_unlock(&mddev->lock); mutex_unlock(&mddev->bitmap_info.mutex); - if (mddev->thread) - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); md_bitmap_free(bitmap); } @@ -1965,7 +1974,7 @@ int md_bitmap_load(struct mddev *mddev) /* Kick recovery in case any bits were set */ set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); - mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; + mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); md_wakeup_thread(mddev->thread); md_bitmap_update_sb(bitmap); @@ -2470,17 +2479,11 @@ timeout_store(struct mddev *mddev, const char *buf, size_t len) timeout = MAX_SCHEDULE_TIMEOUT-1; if (timeout < 1) timeout = 1; - mddev->bitmap_info.daemon_sleep = timeout; - if (mddev->thread) { - /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then - * the bitmap is all clean and we don't need to - * adjust the timeout right now - */ - if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) - mddev->thread->timeout = timeout; - } + mddev->bitmap_info.daemon_sleep = timeout; + mddev_set_timeout(mddev, timeout, false); md_wakeup_thread(mddev->thread); + return len; } -- cgit v1.2.3 From 4469315439827290923fce4f3f672599cabeb366 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 10:10:17 +0800 Subject: md: protect md_thread with rcu Currently, there are many places that md_thread can be accessed without protection, following are known scenarios that can cause null-ptr-dereference or uaf: 1) sync_thread that is allocated and started from md_start_sync() 2) mddev->thread can be accessed directly from timeout_store() and md_bitmap_daemon_work() 3) md_unregister_thread() from action_store(). Currently, a global spinlock 'pers_lock' is borrowed to protect 'mddev->thread' in some places, this problem can be fixed likewise, however, use a global lock for all the cases is not good. Fix this problem by protecting all md_thread with rcu. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523021017.3048783-6-yukuai1@huaweicloud.com --- drivers/md/md-bitmap.c | 10 +++++-- drivers/md/md-cluster.c | 17 +++++++----- drivers/md/md-multipath.c | 4 +-- drivers/md/md.c | 69 ++++++++++++++++++++++------------------------- drivers/md/md.h | 8 +++--- drivers/md/raid1.c | 7 ++--- drivers/md/raid1.h | 2 +- drivers/md/raid10.c | 20 ++++++++------ drivers/md/raid10.h | 2 +- drivers/md/raid5-cache.c | 22 ++++++++------- drivers/md/raid5.c | 15 ++++++----- drivers/md/raid5.h | 2 +- 12 files changed, 97 insertions(+), 81 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 23522df41ca5..ad5a3456cd8a 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1237,13 +1237,19 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout, bool force) { - struct md_thread *thread = mddev->thread; + struct md_thread *thread; + + rcu_read_lock(); + thread = rcu_dereference(mddev->thread); if (!thread) - return; + goto out; if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT) thread->timeout = timeout; + +out: + rcu_read_unlock(); } /* diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 10e0c5381d01..3d9fd74233df 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -75,14 +75,14 @@ struct md_cluster_info { sector_t suspend_hi; int suspend_from; /* the slot which broadcast suspend_lo/hi */ - struct md_thread *recovery_thread; + struct md_thread __rcu *recovery_thread; unsigned long recovery_map; /* communication loc resources */ struct dlm_lock_resource *ack_lockres; struct dlm_lock_resource *message_lockres; struct dlm_lock_resource *token_lockres; struct dlm_lock_resource *no_new_dev_lockres; - struct md_thread *recv_thread; + struct md_thread __rcu *recv_thread; struct completion newdisk_completion; wait_queue_head_t wait; unsigned long state; @@ -362,8 +362,8 @@ static void __recover_slot(struct mddev *mddev, int slot) set_bit(slot, &cinfo->recovery_map); if (!cinfo->recovery_thread) { - cinfo->recovery_thread = md_register_thread(recover_bitmaps, - mddev, "recover"); + rcu_assign_pointer(cinfo->recovery_thread, + md_register_thread(recover_bitmaps, mddev, "recover")); if (!cinfo->recovery_thread) { pr_warn("md-cluster: Could not create recovery thread\n"); return; @@ -526,11 +526,15 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { int got_lock = 0; + struct md_thread *thread; struct md_cluster_info *cinfo = mddev->cluster_info; mddev->good_device_nr = le32_to_cpu(msg->raid_slot); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); - wait_event(mddev->thread->wqueue, + + /* daemaon thread must exist */ + thread = rcu_dereference_protected(mddev->thread, true); + wait_event(thread->wqueue, (got_lock = mddev_trylock(mddev)) || test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)); md_reload_sb(mddev, mddev->good_device_nr); @@ -889,7 +893,8 @@ static int join(struct mddev *mddev, int nodes) } /* Initiate the communication resources */ ret = -ENOMEM; - cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv"); + rcu_assign_pointer(cinfo->recv_thread, + md_register_thread(recv_daemon, mddev, "cluster_recv")); if (!cinfo->recv_thread) { pr_err("md-cluster: cannot allocate memory for recv_thread!\n"); goto err; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 66edf5e72bd6..92c45be203d7 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -400,8 +400,8 @@ static int multipath_run (struct mddev *mddev) if (ret) goto out_free_conf; - mddev->thread = md_register_thread(multipathd, mddev, - "multipath"); + rcu_assign_pointer(mddev->thread, + md_register_thread(multipathd, mddev, "multipath")); if (!mddev->thread) goto out_free_conf; diff --git a/drivers/md/md.c b/drivers/md/md.c index 9d54de3441ef..a6ede3d1c99e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -70,11 +70,7 @@ #include "md-bitmap.h" #include "md-cluster.h" -/* pers_list is a list of registered personalities protected - * by pers_lock. - * pers_lock does extra service to protect accesses to - * mddev->thread when the mutex cannot be held. - */ +/* pers_list is a list of registered personalities protected by pers_lock. */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); @@ -92,7 +88,7 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); -static void md_wakeup_thread_directly(struct md_thread *thread); +static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* * Default number of read corrections we'll attempt on an rdev @@ -442,8 +438,10 @@ static void md_submit_bio(struct bio *bio) */ void mddev_suspend(struct mddev *mddev) { - WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); - lockdep_assert_held(&mddev->reconfig_mutex); + struct md_thread *thread = rcu_dereference_protected(mddev->thread, + lockdep_is_held(&mddev->reconfig_mutex)); + + WARN_ON_ONCE(thread && current == thread->tsk); if (mddev->suspended++) return; wake_up(&mddev->sb_wait); @@ -811,13 +809,8 @@ void mddev_unlock(struct mddev *mddev) md_free_rdev(mddev); - /* As we've dropped the mutex we need a spinlock to - * make sure the thread doesn't disappear - */ - spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); - spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -7903,19 +7896,29 @@ static int md_thread(void *arg) return 0; } -static void md_wakeup_thread_directly(struct md_thread *thread) +static void md_wakeup_thread_directly(struct md_thread __rcu *thread) { - if (thread) - wake_up_process(thread->tsk); + struct md_thread *t; + + rcu_read_lock(); + t = rcu_dereference(thread); + if (t) + wake_up_process(t->tsk); + rcu_read_unlock(); } -void md_wakeup_thread(struct md_thread *thread) +void md_wakeup_thread(struct md_thread __rcu *thread) { - if (thread) { - pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - set_bit(THREAD_WAKEUP, &thread->flags); - wake_up(&thread->wqueue); + struct md_thread *t; + + rcu_read_lock(); + t = rcu_dereference(thread); + if (t) { + pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); + set_bit(THREAD_WAKEUP, &t->flags); + wake_up(&t->wqueue); } + rcu_read_unlock(); } EXPORT_SYMBOL(md_wakeup_thread); @@ -7945,22 +7948,15 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *), } EXPORT_SYMBOL(md_register_thread); -void md_unregister_thread(struct md_thread **threadp) +void md_unregister_thread(struct md_thread __rcu **threadp) { - struct md_thread *thread; + struct md_thread *thread = rcu_dereference_protected(*threadp, true); - /* - * Locking ensures that mddev_unlock does not wake_up a - * non-existent thread - */ - spin_lock(&pers_lock); - thread = *threadp; - if (!thread) { - spin_unlock(&pers_lock); + if (!thread) return; - } - *threadp = NULL; - spin_unlock(&pers_lock); + + rcu_assign_pointer(*threadp, NULL); + synchronize_rcu(); pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); @@ -9226,9 +9222,8 @@ static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - mddev->sync_thread = md_register_thread(md_do_sync, - mddev, - "resync"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "resync")); if (!mddev->sync_thread) { pr_warn("%s: could not start resync thread...\n", mdname(mddev)); diff --git a/drivers/md/md.h b/drivers/md/md.h index 7156fc05f834..a50122165fa1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -365,8 +365,8 @@ struct mddev { int new_chunk_sectors; int reshape_backwards; - struct md_thread *thread; /* management thread */ - struct md_thread *sync_thread; /* doing resync or reconstruct */ + struct md_thread __rcu *thread; /* management thread */ + struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ /* 'last_sync_action' is initialized to "none". It is set when a * sync operation (i.e "data-check", "requested-resync", "resync", @@ -758,8 +758,8 @@ extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); -extern void md_unregister_thread(struct md_thread **threadp); -extern void md_wakeup_thread(struct md_thread *thread); +extern void md_unregister_thread(struct md_thread __rcu **threadp); +extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern int mddev_init_writes_pending(struct mddev *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3570da63969b..220f6ce761a5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3087,7 +3087,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) } err = -ENOMEM; - conf->thread = md_register_thread(raid1d, mddev, "raid1"); + rcu_assign_pointer(conf->thread, + md_register_thread(raid1d, mddev, "raid1")); if (!conf->thread) goto abort; @@ -3180,8 +3181,8 @@ static int raid1_run(struct mddev *mddev) /* * Ok, everything is just fine now */ - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); mddev->private = conf; set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index ebb6788820e7..468f189da7a0 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -130,7 +130,7 @@ struct r1conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; /* Keep track of cluster resync window to send to other * nodes. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 381c21f7fb06..0ae7e52983fa 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -982,6 +982,7 @@ static void lower_barrier(struct r10conf *conf) static bool stop_waiting_barrier(struct r10conf *conf) { struct bio_list *bio_list = current->bio_list; + struct md_thread *thread; /* barrier is dropped */ if (!conf->barrier) @@ -997,12 +998,14 @@ static bool stop_waiting_barrier(struct r10conf *conf) (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1]))) return true; + /* daemon thread must exist while handling io */ + thread = rcu_dereference_protected(conf->mddev->thread, true); /* * move on if io is issued from raid10d(), nr_pending is not released * from original io(see handle_read_error()). All raise barrier is * blocked until this io is done. */ - if (conf->mddev->thread->tsk == current) { + if (thread->tsk == current) { WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0); return true; } @@ -4107,7 +4110,8 @@ static struct r10conf *setup_conf(struct mddev *mddev) atomic_set(&conf->nr_pending, 0); err = -ENOMEM; - conf->thread = md_register_thread(raid10d, mddev, "raid10"); + rcu_assign_pointer(conf->thread, + md_register_thread(raid10d, mddev, "raid10")); if (!conf->thread) goto out; @@ -4152,8 +4156,8 @@ static int raid10_run(struct mddev *mddev) if (!conf) goto out; - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); if (mddev_is_clustered(conf->mddev)) { int fc, fo; @@ -4296,8 +4300,8 @@ static int raid10_run(struct mddev *mddev) clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) goto out_free_conf; } @@ -4698,8 +4702,8 @@ out: set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) { ret = -EAGAIN; goto abort; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 8c072ce0bc54..63e48b11b552 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -100,7 +100,7 @@ struct r10conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; /* * Keep track of cluster resync window to send to other nodes. diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 852b265c5db4..47ba7d9e81e1 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -120,7 +120,7 @@ struct r5l_log { struct bio_set bs; mempool_t meta_pool; - struct md_thread *reclaim_thread; + struct md_thread __rcu *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be * reclaimed. if it's 0, reclaim spaces * used by io_units which are in @@ -1576,17 +1576,18 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) void r5l_quiesce(struct r5l_log *log, int quiesce) { - struct mddev *mddev; + struct mddev *mddev = log->rdev->mddev; + struct md_thread *thread = rcu_dereference_protected( + log->reclaim_thread, lockdep_is_held(&mddev->reconfig_mutex)); if (quiesce) { /* make sure r5l_write_super_and_discard_space exits */ - mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); - kthread_park(log->reclaim_thread->tsk); + kthread_park(thread->tsk); r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); } else - kthread_unpark(log->reclaim_thread->tsk); + kthread_unpark(thread->tsk); } bool r5l_log_disk_error(struct r5conf *conf) @@ -3063,6 +3064,7 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { struct r5l_log *log; + struct md_thread *thread; int ret; pr_debug("md/raid:%s: using device %pg as journal\n", @@ -3121,11 +3123,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->tree_lock); INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); - log->reclaim_thread = md_register_thread(r5l_reclaim_thread, - log->rdev->mddev, "reclaim"); - if (!log->reclaim_thread) + thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, + "reclaim"); + if (!thread) goto reclaim_thread; - log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + + thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + rcu_assign_pointer(log->reclaim_thread, thread); init_waitqueue_head(&log->iounit_wait); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 01c55f24ab09..7e2bbcfef325 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7731,7 +7731,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) } sprintf(pers_name, "raid%d", mddev->new_level); - conf->thread = md_register_thread(raid5d, mddev, pers_name); + rcu_assign_pointer(conf->thread, + md_register_thread(raid5d, mddev, pers_name)); if (!conf->thread) { pr_warn("md/raid:%s: couldn't allocate thread.\n", mdname(mddev)); @@ -7954,8 +7955,8 @@ static int raid5_run(struct mddev *mddev) } conf->min_offset_diff = min_offset_diff; - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); mddev->private = conf; for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; @@ -8052,8 +8053,8 @@ static int raid5_run(struct mddev *mddev) clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) goto abort; } @@ -8631,8 +8632,8 @@ static int raid5_start_reshape(struct mddev *mddev) clear_bit(MD_RECOVERY_DONE, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) { mddev->recovery = 0; spin_lock_irq(&conf->device_lock); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index e873938a6125..f19707189a7b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -679,7 +679,7 @@ struct r5conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; struct r5worker_group *worker_groups; int group_cnt; -- cgit v1.2.3 From 75aa7a1b8f85b03971df1d0f5b1a3a9edf020dff Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:34:10 +0800 Subject: md/raid5: don't start reshape when recovery or replace is in progress When recovery is interrupted (reboot, etc.) check for MD_RECOVERY_RUNNING is not enough to tell recovery is in progress. Also check recovery_cp before starting reshape. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529133410.2125914-1-yukuai1@huaweicloud.com --- drivers/md/raid5.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7e2bbcfef325..f8bc74e16811 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8525,6 +8525,7 @@ static int raid5_start_reshape(struct mddev *mddev) struct r5conf *conf = mddev->private; struct md_rdev *rdev; int spares = 0; + int i; unsigned long flags; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -8536,6 +8537,13 @@ static int raid5_start_reshape(struct mddev *mddev) if (has_failed(conf)) return -EINVAL; + /* raid5 can't handle concurrent reshape and recovery */ + if (mddev->recovery_cp < MaxSector) + return -EBUSY; + for (i = 0; i < conf->raid_disks; i++) + if (rdev_mdlock_deref(mddev, conf->disks[i].replacement)) + return -EBUSY; + rdev_for_each(rdev, mddev) { if (!test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) -- cgit v1.2.3 From 34817a2441747b48e444cb0e05d84e14bc9443da Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 27 May 2023 15:22:15 +0800 Subject: md/raid10: fix null-ptr-deref of mreplace in raid10_sync_request There are two check of 'mreplace' in raid10_sync_request(). In the first check, 'need_replace' will be set and 'mreplace' will be used later if no-Faulty 'mreplace' exists, In the second check, 'mreplace' will be set to NULL if it is Faulty, but 'need_replace' will not be changed accordingly. null-ptr-deref occurs if Faulty is set between two check. Fix it by merging two checks into one. And replace 'need_replace' with 'mreplace' because their values are always the same. Fixes: ee37d7314a32 ("md/raid10: Fix raid10 replace hang when new added disk faulty") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230527072218.2365857-2-linan666@huaweicloud.com --- drivers/md/raid10.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0ae7e52983fa..51552280b325 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3441,7 +3441,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int must_sync; int any_working; int need_recover = 0; - int need_replace = 0; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; @@ -3453,11 +3452,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, !test_bit(Faulty, &mrdev->flags) && !test_bit(In_sync, &mrdev->flags)) need_recover = 1; - if (mreplace != NULL && - !test_bit(Faulty, &mreplace->flags)) - need_replace = 1; + if (mreplace && test_bit(Faulty, &mreplace->flags)) + mreplace = NULL; - if (!need_recover && !need_replace) { + if (!need_recover && !mreplace) { rcu_read_unlock(); continue; } @@ -3473,8 +3471,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, rcu_read_unlock(); continue; } - if (mreplace && test_bit(Faulty, &mreplace->flags)) - mreplace = NULL; /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it