From fad9c80e6371ee04a3fa5728efe20b88d8e4cccd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 May 2023 22:51:01 +0200 Subject: maple_tree: fix a few documentation issues The documentation of mt_next() claims that it starts the search at the provided index. That's incorrect as it starts the search after the provided index. The documentation of mt_find() is slightly confusing. "Handles locking" is not really helpful as it does not explain how the "locking" works. Also the documentation of index talks about a range, while in reality the index is updated on a succesful search to the index of the found entry plus one. Fix similar issues for mt_find_after() and mt_prev(). Reword the confusing "Note: Will not return the zero entry." comment on mt_for_each() and document @__index correctly. Link: https://lkml.kernel.org/r/87ttw2n556.ffs@tglx Signed-off-by: Thomas Gleixner Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox Cc: Shanker Donthineni Signed-off-by: Andrew Morton --- lib/maple_tree.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4dd73cf936a6..f512bb9766aa 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5750,7 +5750,11 @@ EXPORT_SYMBOL_GPL(mas_next_range); * @index: The start index * @max: The maximum index to check * - * Return: The entry at @index or higher, or %NULL if nothing is found. + * Takes RCU read lock internally to protect the search, which does not + * protect the returned pointer after dropping RCU read lock. + * See also: Documentation/core-api/maple_tree.rst + * + * Return: The entry higher than @index or %NULL if nothing is found. */ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) { @@ -5856,7 +5860,11 @@ EXPORT_SYMBOL_GPL(mas_prev_range); * @index: The start index * @min: The minimum index to check * - * Return: The entry at @index or lower, or %NULL if nothing is found. + * Takes RCU read lock internally to protect the search, which does not + * protect the returned pointer after dropping RCU read lock. + * See also: Documentation/core-api/maple_tree.rst + * + * Return: The entry before @index or %NULL if nothing is found. */ void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) { @@ -6468,9 +6476,15 @@ EXPORT_SYMBOL(mtree_destroy); * mt_find() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search - * @max: The maximum value to check + * @max: The maximum value of the search range + * + * Takes RCU read lock internally to protect the search, which does not + * protect the returned pointer after dropping RCU read lock. + * See also: Documentation/core-api/maple_tree.rst * - * Handles locking. @index will be incremented to one beyond the range. + * In case that an entry is found @index is updated to point to the next + * possible entry independent whether the found entry is occupying a + * single index or a range if indices. * * Return: The entry at or after the @index or %NULL */ @@ -6528,7 +6542,9 @@ EXPORT_SYMBOL(mt_find); * @index: Pointer which contains the start location of the search * @max: The maximum value to check * - * Handles locking, detects wrapping on index == 0 + * Same as mt_find() except that it checks @index for 0 before + * searching. If @index == 0, the search is aborted. This covers a wrap + * around of @index to 0 in an iterator loop. * * Return: The entry at or after the @index or %NULL */ -- cgit v1.2.3 From d6e8d0dc19a3ebea185cd8e99f2e960d81b153ad Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 28 Jun 2023 15:36:54 +0800 Subject: maple_tree: add test for mas_wr_modify() fast path Patch series "Optimize the fast path of mas_store()", v4. Add fast paths for mas_wr_append() and mas_wr_slot_store() respectively. The newly added fast path of mas_wr_append() is used in fork() and how much it benefits fork() depends on how many VMAs are duplicated. Thanks Liam for the review. This patch (of 4): Add tests for all cases of mas_wr_append() and mas_wr_slot_store(). Link: https://lkml.kernel.org/r/20230628073657.75314-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20230628073657.75314-2-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'lib') diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 8d4c92cbdd0c..3207c2107918 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1157,6 +1157,71 @@ static noinline void __init check_ranges(struct maple_tree *mt) MT_BUG_ON(mt, !mt_height(mt)); mtree_destroy(mt); + /* Check in-place modifications */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + /* Append to the start of last range */ + mt_set_non_kernel(50); + for (i = 0; i <= 500; i++) { + val = i * 5 + 1; + val2 = val + 4; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* Append to the last range without touching any boundaries */ + for (i = 0; i < 10; i++) { + val = val2 + 5; + val2 = val + 4; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* Append to the end of last range */ + val = val2; + for (i = 0; i < 10; i++) { + val += 5; + MT_BUG_ON(mt, mtree_test_store_range(mt, val, ULONG_MAX, + xa_mk_value(val)) != 0); + } + + /* Overwriting the range and over a part of the next range */ + for (i = 10; i < 30; i += 2) { + val = i * 5 + 1; + val2 = val + 5; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* Overwriting a part of the range and over the next range */ + for (i = 50; i < 70; i += 2) { + val2 = i * 5; + val = val2 - 5; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* + * Expand the range, only partially overwriting the previous and + * next ranges + */ + for (i = 100; i < 130; i += 3) { + val = i * 5 - 5; + val2 = i * 5 + 1; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* + * Expand the range, only partially overwriting the previous and + * next ranges, in RCU mode + */ + mt_set_in_rcu(mt); + for (i = 150; i < 180; i += 3) { + val = i * 5 - 5; + val2 = i * 5 + 1; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + MT_BUG_ON(mt, !mt_height(mt)); + mt_validate(mt); + mt_set_non_kernel(0); + mtree_destroy(mt); + /* Test rebalance gaps */ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); mt_set_non_kernel(50); -- cgit v1.2.3 From 23e9dde0b246d47e4a1942ea50bf7fef63e2d41a Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 28 Jun 2023 15:36:56 +0800 Subject: maple_tree: optimize mas_wr_append(), also improve duplicating VMAs When the new range can be completely covered by the original last range without touching the boundaries on both sides, two new entries can be appended to the end as a fast path. We update the original last pivot at the end, and the newly appended two entries will not be accessed before this, so it is also safe in RCU mode. This is useful for sequential insertion, which is what we do in dup_mmap(). Enabling BENCH_FORK in test_maple_tree and just running bench_forking() gives the following time-consuming numbers: before: after: 17,874.83 msec 15,738.38 msec It shows about a 12% performance improvement for duplicating VMAs. Link: https://lkml.kernel.org/r/20230628073657.75314-4-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index f512bb9766aa..3e0c91c01781 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4267,10 +4267,10 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) * * Return: True if appended, false otherwise */ -static inline bool mas_wr_append(struct ma_wr_state *wr_mas) +static inline bool mas_wr_append(struct ma_wr_state *wr_mas, + unsigned char new_end) { unsigned char end = wr_mas->node_end; - unsigned char new_end = end + 1; struct ma_state *mas = wr_mas->mas; unsigned char node_pivots = mt_pivots[wr_mas->type]; @@ -4282,16 +4282,27 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas) ma_set_meta(wr_mas->node, maple_leaf_64, 0, new_end); } - if (mas->last == wr_mas->r_max) { - /* Append to end of range */ - rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->entry); - wr_mas->pivots[end] = mas->index - 1; - mas->offset = new_end; + if (new_end == wr_mas->node_end + 1) { + if (mas->last == wr_mas->r_max) { + /* Append to end of range */ + rcu_assign_pointer(wr_mas->slots[new_end], + wr_mas->entry); + wr_mas->pivots[end] = mas->index - 1; + mas->offset = new_end; + } else { + /* Append to start of range */ + rcu_assign_pointer(wr_mas->slots[new_end], + wr_mas->content); + wr_mas->pivots[end] = mas->last; + rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry); + } } else { - /* Append to start of range */ + /* Append to the range without touching any boundaries. */ rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->content); - wr_mas->pivots[end] = mas->last; - rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry); + wr_mas->pivots[end + 1] = mas->last; + rcu_assign_pointer(wr_mas->slots[end + 1], wr_mas->entry); + wr_mas->pivots[end] = mas->index - 1; + mas->offset = end + 1; } if (!wr_mas->content || !wr_mas->entry) @@ -4338,7 +4349,7 @@ static inline void mas_wr_modify(struct ma_wr_state *wr_mas) goto slow_path; /* Attempt to append */ - if (new_end == wr_mas->node_end + 1 && mas_wr_append(wr_mas)) + if (mas_wr_append(wr_mas, new_end)) return; if (new_end == wr_mas->node_end && mas_wr_slot_store(wr_mas)) -- cgit v1.2.3 From 64891ba3e51fb841b0af70db029038eb93bd5a43 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 28 Jun 2023 15:36:57 +0800 Subject: maple_tree: add a fast path case in mas_wr_slot_store() When expanding a range in two directions, only partially overwriting the previous and next ranges, the number of entries will not be increased, so we can just update the pivots as a fast path. However, it may introduce potential risks in RCU mode, because it updates two pivots. We only enable it in non-RCU mode. Link: https://lkml.kernel.org/r/20230628073657.75314-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3e0c91c01781..7fad4a7a0b05 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4168,23 +4168,35 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; + void __rcu **slots = wr_mas->slots; bool gap = false; - if (wr_mas->offset_end - offset != 1) - return false; - - gap |= !mt_slot_locked(mas->tree, wr_mas->slots, offset); - gap |= !mt_slot_locked(mas->tree, wr_mas->slots, offset + 1); + gap |= !mt_slot_locked(mas->tree, slots, offset); + gap |= !mt_slot_locked(mas->tree, slots, offset + 1); - if (mas->index == wr_mas->r_min) { - /* Overwriting the range and over a part of the next range. */ - rcu_assign_pointer(wr_mas->slots[offset], wr_mas->entry); - wr_mas->pivots[offset] = mas->last; - } else { - /* Overwriting a part of the range and over the next range */ - rcu_assign_pointer(wr_mas->slots[offset + 1], wr_mas->entry); + if (wr_mas->offset_end - offset == 1) { + if (mas->index == wr_mas->r_min) { + /* Overwriting the range and a part of the next one */ + rcu_assign_pointer(slots[offset], wr_mas->entry); + wr_mas->pivots[offset] = mas->last; + } else { + /* Overwriting a part of the range and the next one */ + rcu_assign_pointer(slots[offset + 1], wr_mas->entry); + wr_mas->pivots[offset] = mas->index - 1; + mas->offset++; /* Keep mas accurate. */ + } + } else if (!mt_in_rcu(mas->tree)) { + /* + * Expand the range, only partially overwriting the previous and + * next ranges + */ + gap |= !mt_slot_locked(mas->tree, slots, offset + 2); + rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; + wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ + } else { + return false; } trace_ma_write(__func__, mas, 0, wr_mas->entry); -- cgit v1.2.3 From d695c30a8ca07ac7e2138435b461b36289d5656e Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:38 +0800 Subject: maple_tree: don't use MAPLE_ARANGE64_META_MAX to indicate no gap Patch series "Improve the validation for maple tree and some cleanup", v2. This patch (of 7): Do not use a special offset to indicate that there is no gap. When there is no gap, offset can point to any valid slots because its gap is 0. Link: https://lkml.kernel.org/r/20230711035444.526-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20230711035444.526-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/maple_tree.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7fad4a7a0b05..da2c8542ad2f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1610,8 +1610,6 @@ ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, * mas_max_gap() - find the largest gap in a non-leaf node and set the slot. * @mas: The maple state. * - * If the metadata gap is set to MAPLE_ARANGE64_META_MAX, there is no gap. - * * Return: The gap value. */ static inline unsigned long mas_max_gap(struct ma_state *mas) @@ -1628,9 +1626,6 @@ static inline unsigned long mas_max_gap(struct ma_state *mas) node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node, mt); - if (offset == MAPLE_ARANGE64_META_MAX) - return 0; - gaps = ma_gaps(node, mt); return gaps[offset]; } @@ -1662,10 +1657,7 @@ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode, pmt); - if (meta_offset == MAPLE_ARANGE64_META_MAX) - meta_gap = 0; - else - meta_gap = pgaps[meta_offset]; + meta_gap = pgaps[meta_offset]; pgaps[offset] = new; @@ -1678,7 +1670,6 @@ ascend: ma_set_meta_gap(pnode, pmt, offset); } else if (new < meta_gap) { - meta_offset = 15; new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); ma_set_meta_gap(pnode, pmt, meta_offset); } @@ -2076,7 +2067,7 @@ static inline void mab_mas_cp(struct maple_big_node *b_node, end = j - 1; if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) { unsigned long max_gap = 0; - unsigned char offset = 15; + unsigned char offset = 0; gaps = ma_gaps(node, mt); do { -- cgit v1.2.3 From f8e5eac8abe3d26106e5470c735058f04f60f61e Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:39 +0800 Subject: maple_tree: make mas_validate_gaps() to check metadata Make mas_validate_gaps() check whether the offset in the metadata points to the largest gap. By the way, simplify this function. Add the verification that gaps beyond the node limit are zero. Link: https://lkml.kernel.org/r/20230711035444.526-4-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Tested-by: Geert Uytterhoeven Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 78 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 36 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index da2c8542ad2f..9ce78e5e6084 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6957,15 +6957,16 @@ EXPORT_SYMBOL_GPL(mt_dump); static void mas_validate_gaps(struct ma_state *mas) { struct maple_enode *mte = mas->node; - struct maple_node *p_mn; + struct maple_node *p_mn, *node = mte_to_node(mte); + enum maple_type mt = mte_node_type(mas->node); unsigned long gap = 0, max_gap = 0; unsigned long p_end, p_start = mas->min; - unsigned char p_slot; + unsigned char p_slot, offset; unsigned long *gaps = NULL; - unsigned long *pivots = ma_pivots(mte_to_node(mte), mte_node_type(mte)); - int i; + unsigned long *pivots = ma_pivots(node, mt); + unsigned int i; - if (ma_is_dense(mte_node_type(mte))) { + if (ma_is_dense(mt)) { for (i = 0; i < mt_slot_count(mte); i++) { if (mas_get_slot(mas, i)) { if (gap > max_gap) @@ -6978,52 +6979,59 @@ static void mas_validate_gaps(struct ma_state *mas) goto counted; } - gaps = ma_gaps(mte_to_node(mte), mte_node_type(mte)); + gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { - p_end = mas_logical_pivot(mas, pivots, i, mte_node_type(mte)); + p_end = mas_logical_pivot(mas, pivots, i, mt); if (!gaps) { - if (mas_get_slot(mas, i)) { - gap = 0; - goto not_empty; - } - - gap += p_end - p_start + 1; + if (!mas_get_slot(mas, i)) + gap = p_end - p_start + 1; } else { void *entry = mas_get_slot(mas, i); gap = gaps[i]; - if (!entry) { - if (gap != p_end - p_start + 1) { - pr_err("%p[%u] -> %p %lu != %lu - %lu + 1\n", - mas_mn(mas), i, - mas_get_slot(mas, i), gap, - p_end, p_start); - mt_dump(mas->tree, mt_dump_hex); - - MT_BUG_ON(mas->tree, - gap != p_end - p_start + 1); - } - } else { - if (gap > p_end - p_start + 1) { - pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n", - mas_mn(mas), i, gap, p_end, p_start, - p_end - p_start + 1); - MT_BUG_ON(mas->tree, - gap > p_end - p_start + 1); - } + MT_BUG_ON(mas->tree, !entry); + + if (gap > p_end - p_start + 1) { + pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n", + mas_mn(mas), i, gap, p_end, p_start, + p_end - p_start + 1); + MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); } } if (gap > max_gap) max_gap = gap; -not_empty: + p_start = p_end + 1; if (p_end >= mas->max) break; } counted: + if (mt == maple_arange_64) { + offset = ma_meta_gap(node, mt); + if (offset > i) { + pr_err("gap offset %p[%u] is invalid\n", node, offset); + MT_BUG_ON(mas->tree, 1); + } + + if (gaps[offset] != max_gap) { + pr_err("gap %p[%u] is not the largest gap %lu\n", + node, offset, max_gap); + MT_BUG_ON(mas->tree, 1); + } + + MT_BUG_ON(mas->tree, !gaps); + for (i++ ; i < mt_slot_count(mte); i++) { + if (gaps[i] != 0) { + pr_err("gap %p[%u] beyond node limit != 0\n", + node, i); + MT_BUG_ON(mas->tree, 1); + } + } + } + if (mte_is_root(mte)) return; @@ -7033,10 +7041,8 @@ counted: if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); + MT_BUG_ON(mas->tree, 1); } - - MT_BUG_ON(mas->tree, - ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap); } static void mas_validate_parent_slot(struct ma_state *mas) -- cgit v1.2.3 From e93fda5a1ab7a0c6143ae8a6f231c9f5f3c417b1 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:40 +0800 Subject: maple_tree: fix mas_validate_child_slot() to check last missed slot Don't break the loop before checking the last slot. Also here check if non-leaf nodes are missing children. Link: https://lkml.kernel.org/r/20230711035444.526-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/maple_tree.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 9ce78e5e6084..af8fb75ad688 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7093,11 +7093,12 @@ static void mas_validate_child_slot(struct ma_state *mas) for (i = 0; i < mt_slots[type]; i++) { child = mas_slot(mas, slots, i); - if (!pivots[i] || pivots[i] == mas->max) - break; - if (!child) - break; + if (!child) { + pr_err("Non-leaf node lacks child at %p[%u]\n", + mas_mn(mas), i); + MT_BUG_ON(mas->tree, 1); + } if (mte_parent_slot(child) != i) { pr_err("Slot error at %p[%u]: child %p has pslot %u\n", @@ -7112,6 +7113,9 @@ static void mas_validate_child_slot(struct ma_state *mas) mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); } + + if (i < mt_pivots[type] && pivots[i] == mas->max) + break; } } -- cgit v1.2.3 From 33af39d0244ce4944ab16728f7b04df9dfc6d365 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:41 +0800 Subject: maple_tree: make mas_validate_limits() check root node and node limit Update mas_validate_limits() to check root node, check node limit pivot if there is enough room for it to exist and check data_end. Remove the check for child existence as it is done in mas_validate_child_slot(). Link: https://lkml.kernel.org/r/20230711035444.526-6-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Tested-by: Geert Uytterhoeven Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index af8fb75ad688..31ac4f2c4426 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7120,7 +7120,9 @@ static void mas_validate_child_slot(struct ma_state *mas) } /* - * Validate all pivots are within mas->min and mas->max. + * Validate all pivots are within mas->min and mas->max, check metadata ends + * where the maximum ends and ensure there is no slots or pivots set outside of + * the end of the data. */ static void mas_validate_limits(struct ma_state *mas) { @@ -7130,26 +7132,15 @@ static void mas_validate_limits(struct ma_state *mas) void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mas_mn(mas), type); - /* all limits are fine here. */ - if (mte_is_root(mas->node)) - return; - for (i = 0; i < mt_slots[type]; i++) { unsigned long piv; piv = mas_safe_pivot(mas, pivots, i, type); - if (!piv && (i != 0)) - break; - - if (!mte_is_leaf(mas->node)) { - void *entry = mas_slot(mas, slots, i); - - if (!entry) - pr_err("%p[%u] cannot be null\n", - mas_mn(mas), i); - - MT_BUG_ON(mas->tree, !entry); + if (!piv && (i != 0)) { + pr_err("Missing node limit pivot at %p[%u]", + mas_mn(mas), i); + MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { @@ -7172,6 +7163,13 @@ static void mas_validate_limits(struct ma_state *mas) if (piv == mas->max) break; } + + if (mas_data_end(mas) != i) { + pr_err("node%p: data_end %u != the last slot offset %u\n", + mas_mn(mas), mas_data_end(mas), i); + MT_BUG_ON(mas->tree, 1); + } + for (i += 1; i < mt_slots[type]; i++) { void *entry = mas_slot(mas, slots, i); -- cgit v1.2.3 From a489539e33c29b469bcd023a32c99078c2597c7c Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:42 +0800 Subject: maple_tree: update mt_validate() Instead of using mas_first_entry() to find the leftmost leaf, use a simple loop instead. Remove an unneeded check for root node. To make the error message more accurate, check pivots first and then slots, because checking slots depend on the node limit pivot to break the loop. Link: https://lkml.kernel.org/r/20230711035444.526-7-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Tested-by: Geert Uytterhoeven Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 31ac4f2c4426..e08ef44926c6 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7244,21 +7244,20 @@ void mt_validate(struct maple_tree *mt) if (!mas_searchable(&mas)) goto done; - mas_first_entry(&mas, mas_mn(&mas), ULONG_MAX, mte_node_type(mas.node)); + while (!mte_is_leaf(mas.node)) + mas_descend(&mas); + while (!mas_is_none(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); - if (!mte_is_root(mas.node)) { - end = mas_data_end(&mas); - if (MAS_WARN_ON(&mas, - (end < mt_min_slot_count(mas.node)) && - (mas.max != ULONG_MAX))) { - pr_err("Invalid size %u of %p\n", end, - mas_mn(&mas)); - } + end = mas_data_end(&mas); + if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && + (mas.max != ULONG_MAX))) { + pr_err("Invalid size %u of %p\n", end, mas_mn(&mas)); } + mas_validate_parent_slot(&mas); - mas_validate_child_slot(&mas); mas_validate_limits(&mas); + mas_validate_child_slot(&mas); if (mt_is_alloc(mt)) mas_validate_gaps(&mas); mas_dfs_postorder(&mas, ULONG_MAX); -- cgit v1.2.3 From 29b2681f1aa95cff6ec0afdeac0b2cab659a5564 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:43 +0800 Subject: maple_tree: replace mas_logical_pivot() with mas_safe_pivot() Replace mas_logical_pivot() with mas_safe_pivot() and drop mas_logical_pivot() since it won't be used anymore. We can do this since now all nodes will have node limit pivot (if it is not full node). Link: https://lkml.kernel.org/r/20230711035444.526-8-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/maple_tree.c | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index e08ef44926c6..4f3209ca0e3b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -728,33 +728,6 @@ mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) return mas->min; } -/* - * mas_logical_pivot() - Get the logical pivot of a given offset. - * @mas: The maple state - * @pivots: The pointer to the maple node pivots - * @offset: The offset into the pivot array - * @type: The maple node type - * - * When there is no value at a pivot (beyond the end of the data), then the - * pivot is actually @mas->max. - * - * Return: the logical pivot of a given @offset. - */ -static inline unsigned long -mas_logical_pivot(struct ma_state *mas, unsigned long *pivots, - unsigned char offset, enum maple_type type) -{ - unsigned long lpiv = mas_safe_pivot(mas, pivots, offset, type); - - if (likely(lpiv)) - return lpiv; - - if (likely(offset)) - return mas->max; - - return lpiv; -} - /* * mte_set_pivot() - Set a pivot to a value in an encoded maple node. * @mn: The encoded maple node @@ -2202,7 +2175,7 @@ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, goto b_end; /* Handle new range ending before old range ends */ - piv = mas_logical_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); + piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); if (piv > mas->last) { if (piv == ULONG_MAX) mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type); @@ -4935,7 +4908,7 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) min = mas_safe_min(mas, pivots, offset); data_end = ma_data_end(node, type, pivots, mas->max); for (; offset <= data_end; offset++) { - pivot = mas_logical_pivot(mas, pivots, offset, type); + pivot = mas_safe_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) @@ -6981,7 +6954,7 @@ static void mas_validate_gaps(struct ma_state *mas) gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { - p_end = mas_logical_pivot(mas, pivots, i, mt); + p_end = mas_safe_pivot(mas, pivots, i, mt); if (!gaps) { if (!mas_get_slot(mas, i)) -- cgit v1.2.3 From 6783bd4b5f72b483cf492dc09500548b495670b5 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:44 +0800 Subject: maple_tree: drop mas_first_entry() The internal function mas_first_entry() is no longer used, so drop it. Link: https://lkml.kernel.org/r/20230711035444.526-9-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/maple_tree.c | 72 -------------------------------------------------------- 1 file changed, 72 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4f3209ca0e3b..cef47ce8eddd 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6636,78 +6636,6 @@ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, offset); } - -/* - * mas_first_entry() - Go the first leaf and find the first entry. - * @mas: the maple state. - * @limit: the maximum index to check. - * @*r_start: Pointer to set to the range start. - * - * Sets mas->offset to the offset of the entry, r_start to the range minimum. - * - * Return: The first entry or MAS_NONE. - */ -static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, - unsigned long limit, enum maple_type mt) - -{ - unsigned long max; - unsigned long *pivots; - void __rcu **slots; - void *entry = NULL; - - mas->index = mas->min; - if (mas->index > limit) - goto none; - - max = mas->max; - mas->offset = 0; - while (likely(!ma_is_leaf(mt))) { - MAS_WARN_ON(mas, mte_dead_node(mas->node)); - slots = ma_slots(mn, mt); - entry = mas_slot(mas, slots, 0); - pivots = ma_pivots(mn, mt); - if (unlikely(ma_dead_node(mn))) - return NULL; - max = pivots[0]; - mas->node = entry; - mn = mas_mn(mas); - mt = mte_node_type(mas->node); - } - MAS_WARN_ON(mas, mte_dead_node(mas->node)); - - mas->max = max; - slots = ma_slots(mn, mt); - entry = mas_slot(mas, slots, 0); - if (unlikely(ma_dead_node(mn))) - return NULL; - - /* Slot 0 or 1 must be set */ - if (mas->index > limit) - goto none; - - if (likely(entry)) - return entry; - - mas->offset = 1; - entry = mas_slot(mas, slots, 1); - pivots = ma_pivots(mn, mt); - if (unlikely(ma_dead_node(mn))) - return NULL; - - mas->index = pivots[0] + 1; - if (mas->index > limit) - goto none; - - if (likely(entry)) - return entry; - -none: - if (likely(!ma_dead_node(mn))) - mas->node = MAS_NONE; - return NULL; -} - /* Depth first search, post-order */ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { -- cgit v1.2.3 From efb78fa86e95832b78ca0ba60f3706788a818938 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Fri, 14 Jul 2023 11:52:38 +1000 Subject: lib/test_meminit: allocate pages up to order MAX_ORDER test_pages() tests the page allocator by calling alloc_pages() with different orders up to order 10. However, different architectures and platforms support different maximum contiguous allocation sizes. The default maximum allocation order (MAX_ORDER) is 10, but architectures can use CONFIG_ARCH_FORCE_MAX_ORDER to override this. On platforms where this is less than 10, test_meminit() will blow up with a WARN(). This is expected, so let's not do that. Replace the hardcoded "10" with the MAX_ORDER macro so that we test allocations up to the expected platform limit. Link: https://lkml.kernel.org/r/20230714015238.47931-1-ajd@linux.ibm.com Fixes: 5015a300a522 ("lib: introduce test_meminit module") Signed-off-by: Andrew Donnellan Reviewed-by: Alexander Potapenko Cc: Xiaoke Wang Cc: Signed-off-by: Andrew Morton --- lib/test_meminit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_meminit.c b/lib/test_meminit.c index 60e1984c060f..0ae35223d773 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -93,7 +93,7 @@ static int __init test_pages(int *total_failures) int failures = 0, num_tests = 0; int i; - for (i = 0; i < 10; i++) + for (i = 0; i <= MAX_ORDER; i++) num_tests += do_alloc_pages_order(i, &failures); REPORT_FAILURES_IN_FN(); -- cgit v1.2.3 From 4445e58264aea8ec6bb1287add79606f0e3f3988 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sat, 15 Jul 2023 17:39:20 +0300 Subject: maple_tree: mtree_insert*: fix typo in kernel-doc description Replace "Insert and entry at a give index" with "Insert an entry at a given index" Link: https://lkml.kernel.org/r/20230715143920.994812-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index cef47ce8eddd..616ec7f3be81 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6274,7 +6274,7 @@ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, EXPORT_SYMBOL(mtree_store); /** - * mtree_insert_range() - Insert an entry at a give range if there is no value. + * mtree_insert_range() - Insert an entry at a given range if there is no value. * @mt: The maple tree * @first: The start of the range * @last: The end of the range @@ -6310,7 +6310,7 @@ retry: EXPORT_SYMBOL(mtree_insert_range); /** - * mtree_insert() - Insert an entry at a give index if there is no value. + * mtree_insert() - Insert an entry at a given index if there is no value. * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store -- cgit v1.2.3 From 4ae6944d15727c50ff1c0bb3fe38b9b412520d85 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sat, 15 Jul 2023 11:40:38 +0300 Subject: maple_tree: mtree_insert: fix typo in kernel-doc description of GFP flags Replace FGP_FLAGS with GFP_FLAGS Link: https://lkml.kernel.org/r/20230715084038.987955-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 616ec7f3be81..b6b3973897f0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6314,7 +6314,7 @@ EXPORT_SYMBOL(mtree_insert_range); * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store - * @gfp: The FGP_FLAGS to use for allocations. + * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. -- cgit v1.2.3 From 19a462f06eb5a78e0c3ebe4fd4fbdc71620b8788 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Jul 2023 15:55:51 -0400 Subject: maple_tree: Be more strict about locking Use lockdep to check the write path in the maple tree holds the lock in write mode. Introduce mt_write_lock_is_held() to check if the lock is held for writing. Update the necessary checks for rcu_dereference_protected() to use the new write lock check. Link: https://lkml.kernel.org/r/20230714195551.894800-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Linus Torvalds Cc: Oliver Sang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b6b3973897f0..3b6f8c8dac65 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -777,6 +777,12 @@ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) } } +static inline bool mt_write_locked(const struct maple_tree *mt) +{ + return mt_external_lock(mt) ? mt_write_lock_is_held(mt) : + lockdep_is_held(&mt->ma_lock); +} + static inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_lock_is_held(mt) : @@ -792,7 +798,7 @@ static inline void *mt_slot(const struct maple_tree *mt, static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, unsigned char offset) { - return rcu_dereference_protected(slots[offset], mt_locked(mt)); + return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. @@ -835,7 +841,7 @@ static inline void *mas_root(struct ma_state *mas) static inline void *mt_root_locked(struct maple_tree *mt) { - return rcu_dereference_protected(mt->ma_root, mt_locked(mt)); + return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt)); } /* -- cgit v1.2.3 From 361c678be709f67a5b609ec3666ff5fec7eb8baf Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:43 -0400 Subject: maple_tree: add benchmarking for mas_for_each Patch series "Reduce preallocations for maple tree", v3. Initial work on preallocations showed no regression in performance during testing, but recently some users (both on [1] and off [android] list) have reported that preallocating the worst-case number of nodes has caused some slow down. This patch set addresses the number of allocations in a few ways. During munmap() most munmap() operations will remove a single VMA, so leverage the fact that the maple tree can place a single pointer at range 0 - 0 without allocating. This is done by changing the index of the VMAs to be indexed by the count, starting at 0. Re-introduce the entry argument to mas_preallocate() so that a more intelligent guess of the node count can be made. Implement the more intelligent guess of the node count, although there is more work to be done. During development of v2 of this patch set, I also noticed that the number of nodes being allocated for a rebalance was beyond what could possibly be needed. This is addressed in patch 0008. This patch (of 15): Add a way to test the speed of mas_for_each() to the testing code. Link: https://lkml.kernel.org/r/20230724183157.3939892-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230724183157.3939892-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'lib') diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 3207c2107918..9c4cf5fb2b7f 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -44,6 +44,7 @@ atomic_t maple_tree_tests_passed; /* #define BENCH_WALK */ /* #define BENCH_MT_FOR_EACH */ /* #define BENCH_FORK */ +/* #define BENCH_MAS_FOR_EACH */ #ifdef __KERNEL__ #define mt_set_non_kernel(x) do {} while (0) @@ -1770,6 +1771,37 @@ static noinline void __init bench_mt_for_each(struct maple_tree *mt) } #endif +#if defined(BENCH_MAS_FOR_EACH) +static noinline void __init bench_mas_for_each(struct maple_tree *mt) +{ + int i, count = 1000000; + unsigned long max = 2500; + void *entry; + MA_STATE(mas, mt, 0, 0); + + for (i = 0; i < max; i += 5) { + int gap = 4; + + if (i % 30 == 0) + gap = 3; + mtree_store_range(mt, i, i + gap, xa_mk_value(i), GFP_KERNEL); + } + + rcu_read_lock(); + for (i = 0; i < count; i++) { + unsigned long j = 0; + + mas_for_each(&mas, entry, max) { + MT_BUG_ON(mt, entry != xa_mk_value(j)); + j += 5; + } + mas_set(&mas, 0); + } + rcu_read_unlock(); + +} +#endif + /* check_forking - simulate the kernel forking sequence with the tree. */ static noinline void __init check_forking(struct maple_tree *mt) { @@ -3498,6 +3530,13 @@ static int __init maple_tree_seed(void) mtree_destroy(&tree); goto skip; #endif +#if defined(BENCH_MAS_FOR_EACH) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_mas_for_each(&tree); + mtree_destroy(&tree); + goto skip; +#endif mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_iteration(&tree); -- cgit v1.2.3 From 8c314f3b55fbc42284ea1367bb2807f2accad8ae Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:44 -0400 Subject: maple_tree: add benchmarking for mas_prev() Add some benchmarking functions in testing for mas_prev(). This is useful to ensure there are no regressions added during modifications. Link: https://lkml.kernel.org/r/20230724183157.3939892-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'lib') diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 9c4cf5fb2b7f..0674aebd4423 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -45,6 +45,7 @@ atomic_t maple_tree_tests_passed; /* #define BENCH_MT_FOR_EACH */ /* #define BENCH_FORK */ /* #define BENCH_MAS_FOR_EACH */ +/* #define BENCH_MAS_PREV */ #ifdef __KERNEL__ #define mt_set_non_kernel(x) do {} while (0) @@ -1801,7 +1802,36 @@ static noinline void __init bench_mas_for_each(struct maple_tree *mt) } #endif +#if defined(BENCH_MAS_PREV) +static noinline void __init bench_mas_prev(struct maple_tree *mt) +{ + int i, count = 1000000; + unsigned long max = 2500; + void *entry; + MA_STATE(mas, mt, 0, 0); + + for (i = 0; i < max; i += 5) { + int gap = 4; + + if (i % 30 == 0) + gap = 3; + mtree_store_range(mt, i, i + gap, xa_mk_value(i), GFP_KERNEL); + } + + rcu_read_lock(); + for (i = 0; i < count; i++) { + unsigned long j = 2495; + + mas_set(&mas, ULONG_MAX); + while ((entry = mas_prev(&mas, 0)) != NULL) { + MT_BUG_ON(mt, entry != xa_mk_value(j)); + j -= 5; + } + } + rcu_read_unlock(); +} +#endif /* check_forking - simulate the kernel forking sequence with the tree. */ static noinline void __init check_forking(struct maple_tree *mt) { @@ -3537,6 +3567,13 @@ static int __init maple_tree_seed(void) mtree_destroy(&tree); goto skip; #endif +#if defined(BENCH_MAS_PREV) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_mas_prev(&tree); + mtree_destroy(&tree); + goto skip; +#endif mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_iteration(&tree); -- cgit v1.2.3 From da0892547b101df6e13255b378380d077975368d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:49 -0400 Subject: maple_tree: re-introduce entry to mas_preallocate() arguments The current preallocation strategy is to preallocate the absolute worst-case allocation for a tree modification. The entry (or NULL) is needed to know how many nodes are needed to write to the tree. Start by adding the argument to the mas_preallocate() definition. Link: https://lkml.kernel.org/r/20230724183157.3939892-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3b6f8c8dac65..0d7e30c7d999 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5529,11 +5529,12 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state + * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ -int mas_preallocate(struct ma_state *mas, gfp_t gfp) +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { int ret; -- cgit v1.2.3 From c108df767fb786586274b2435473885151d6f360 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:50 -0400 Subject: maple_tree: adjust node allocation on mas_rebalance() mas_rebalance() is called to rebalance an insufficient node into a single node or two sufficient nodes. The preallocation estimate is always too many in this case as the height of the tree will never grow and there is no possibility to have a three way split in this case, so revise the node allocation count. Link: https://lkml.kernel.org/r/20230724183157.3939892-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0d7e30c7d999..494f884ef17f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3136,7 +3136,7 @@ static inline int mas_rebalance(struct ma_state *mas, * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ - mas_node_count(mas, 1 + empty_count * 3); + mas_node_count(mas, empty_count * 2 - 1); if (mas_is_err(mas)) return 0; -- cgit v1.2.3 From a7496ad529dfd96e37219849bddcda121d133536 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:53 -0400 Subject: maple_tree: move mas_wr_end_piv() below mas_wr_extend_null() Relocate it and call mas_wr_extend_null() from within mas_wr_end_piv(). Extending the NULL may affect the end pivot value so call mas_wr_endtend_null() from within mas_wr_end_piv() to keep it all together. Link: https://lkml.kernel.org/r/20230724183157.3939892-12-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 494f884ef17f..db61cdd8a649 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4180,18 +4180,6 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) return true; } -static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) -{ - while ((wr_mas->offset_end < wr_mas->node_end) && - (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) - wr_mas->offset_end++; - - if (wr_mas->offset_end < wr_mas->node_end) - wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; - else - wr_mas->end_piv = wr_mas->mas->max; -} - static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; @@ -4228,6 +4216,21 @@ static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) } } +static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) +{ + while ((wr_mas->offset_end < wr_mas->node_end) && + (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) + wr_mas->offset_end++; + + if (wr_mas->offset_end < wr_mas->node_end) + wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; + else + wr_mas->end_piv = wr_mas->mas->max; + + if (!wr_mas->entry) + mas_wr_extend_null(wr_mas); +} + static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; @@ -4371,10 +4374,6 @@ static inline void *mas_wr_store_entry(struct ma_wr_state *wr_mas) /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); - - if (!wr_mas->entry) - mas_wr_extend_null(wr_mas); - /* New root for a single pointer */ if (unlikely(!mas->index && mas->last == ULONG_MAX)) { mas_new_root(mas, wr_mas->entry); -- cgit v1.2.3 From 17983dc617837a588a52848ab4034d8efa6c1fa6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:55 -0400 Subject: maple_tree: refine mas_preallocate() node calculations Calculate the number of nodes based on the pending write action instead of assuming the worst case. This addresses a performance regression introduced in platforms that have longer allocation timing. Link: https://lkml.kernel.org/r/20230724183157.3939892-14-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index db61cdd8a649..4a111785360f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5535,9 +5535,51 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); */ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { + MA_WR_STATE(wr_mas, mas, entry); + unsigned char node_size; + int request = 1; int ret; - mas_node_count_gfp(mas, 1 + mas_mt_height(mas) * 3, gfp); + + if (unlikely(!mas->index && mas->last == ULONG_MAX)) + goto ask_now; + + mas_wr_store_setup(&wr_mas); + wr_mas.content = mas_start(mas); + /* Root expand */ + if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) + goto ask_now; + + if (unlikely(!mas_wr_walk(&wr_mas))) { + /* Spanning store, use worst case for now */ + request = 1 + mas_mt_height(mas) * 3; + goto ask_now; + } + + /* At this point, we are at the leaf node that needs to be altered. */ + /* Exact fit, no nodes needed. */ + if (wr_mas.r_min == mas->index && wr_mas.r_max == mas->last) + return 0; + + mas_wr_end_piv(&wr_mas); + node_size = mas_wr_new_end(&wr_mas); + if (node_size >= mt_slots[wr_mas.type]) { + /* Split, worst case for now. */ + request = 1 + mas_mt_height(mas) * 2; + goto ask_now; + } + + /* New root needs a singe node */ + if (unlikely(mte_is_root(mas->node))) + goto ask_now; + + /* Potential spanning rebalance collapsing a node, use worst-case */ + if (node_size - 1 <= mt_min_slots[wr_mas.type]) + request = mas_mt_height(mas) * 2 - 1; + + /* node store, slot store needs one node */ +ask_now: + mas_node_count_gfp(mas, request, gfp); mas->mas_flags |= MA_STATE_PREALLOC; if (likely(!mas_is_err(mas))) return 0; -- cgit v1.2.3 From fec29364348fec535c55708b1f4025b321aba572 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:56 -0400 Subject: maple_tree: reduce resets during store setup mas_prealloc() may walk partially down the tree before finding that a split or spanning store is needed. When the write occurs, relax the logic on resetting the walk so that partial walks will not restart, but walks that have gone too far (a store that affects beyond the current node) should be restarted. Link: https://lkml.kernel.org/r/20230724183157.3939892-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4a111785360f..a3d602cfd030 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5424,19 +5424,34 @@ static inline void mte_destroy_walk(struct maple_enode *enode, static void mas_wr_store_setup(struct ma_wr_state *wr_mas) { + if (mas_is_start(wr_mas->mas)) + return; + if (unlikely(mas_is_paused(wr_mas->mas))) - mas_reset(wr_mas->mas); + goto reset; - if (!mas_is_start(wr_mas->mas)) { - if (mas_is_none(wr_mas->mas)) { - mas_reset(wr_mas->mas); - } else { - wr_mas->r_max = wr_mas->mas->max; - wr_mas->type = mte_node_type(wr_mas->mas->node); - if (mas_is_span_wr(wr_mas)) - mas_reset(wr_mas->mas); - } - } + if (unlikely(mas_is_none(wr_mas->mas))) + goto reset; + + /* + * A less strict version of mas_is_span_wr() where we allow spanning + * writes within this node. This is to stop partial walks in + * mas_prealloc() from being reset. + */ + if (wr_mas->mas->last > wr_mas->mas->max) + goto reset; + + if (wr_mas->entry) + return; + + if (mte_is_leaf(wr_mas->mas->node) && + wr_mas->mas->last == wr_mas->mas->max) + goto reset; + + return; + +reset: + mas_reset(wr_mas->mas); } /* Interface */ -- cgit v1.2.3 From 83d97f620f611ab3fbf2de585bf34bd9dab513c2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:46 -0400 Subject: maple_tree: add hex output to maple_arange64 dump Patch series "maple_tree: Change replacement strategy". The maple tree marks nodes dead as soon as they are going to be replaced. This could be problematic when used in the RCU context since the writer may be starved of CPU time by the readers. This patch set addresses the issue by switching the data replacement strategy to one that will only mark data as dead once the new data is available. This series changes the ordering of the node replacement so that the new data is live before the old data is marked 'dead'. When readers hit 'dead' nodes, they will restart from the top of the tree and end up in the new data. In more complex scenarios, the replacement strategy means a subtree is built and graphed into the tree leaving some nodes to point to the old parent. The view of tasks into the old data will either remain with the old data, or see the new data once the old data is marked 'dead'. Iterators will see the 'dead' node and restart on their own and switch to the new data. There is no risk of the reader seeing old data in these cases. The 'dead' subtree of data is then fully marked dead, but reused nodes will still point to the dead nodes until the parent pointer is updated. Walking up to a 'dead' node will cause a re-walk from the top of the tree and enter the new data area where old data is not reachable. Once the parent pointers are fully up to date in the active data, the 'dead' subtree is iterated to collect entirely 'dead' subtrees, and dead nodes (nodes that partially contained reused data). This patch (of 6): When dumping the tree, honour formatting request to output hex for the maple node type arange64. Link: https://lkml.kernel.org/r/20230804165951.2661157-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230804165951.2661157-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index a3d602cfd030..880ce0fcdcac 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6833,11 +6833,27 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, int i; pr_cont(" contents: "); - for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) - pr_cont("%lu ", node->gap[i]); + for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { + switch (format) { + case mt_dump_hex: + pr_cont("%lx ", node->gap[i]); + break; + default: + case mt_dump_dec: + pr_cont("%lu ", node->gap[i]); + } + } pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); - for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) - pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { + switch (format) { + case mt_dump_hex: + pr_cont("%p %lX ", node->slot[i], node->pivot[i]); + break; + default: + case mt_dump_dec: + pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + } + } pr_cont("%p\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; -- cgit v1.2.3 From 72bcf4aa86ece2b49fbdc7fe83d3a05c7ebcfc97 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:47 -0400 Subject: maple_tree: reorder replacement of nodes to avoid live lock Replacing nodes may cause a live lock-up if CPU resources are saturated by write operations on the tree by continuously retrying on dead nodes. To avoid the continuous retry scenario, ensure the new node is inserted into the tree prior to marking the old data as dead. This will define a window where old and new data is swapped. When reusing lower level nodes, ensure the parent pointer is updated after the parent is marked dead. This ensures that the child is still reachable from the top of the tree, but walking up to a dead node will result in a single retry that will start a fresh walk from the top down through the new node. Link: https://lkml.kernel.org/r/20230804165951.2661157-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 880ce0fcdcac..0d4573a8d134 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1756,6 +1756,36 @@ static inline void mas_replace(struct ma_state *mas, bool advanced) } } +/* + * mas_replace_node() - Replace a node by putting it in the tree, marking it + * dead, and freeing it. + * the parent encoding to locate the maple node in the tree. + * @mas - the ma_state with @mas->node pointing to the new node. + * @old_enode - The old maple encoded node. + */ +static inline void mas_replace_node(struct ma_state *mas, + struct maple_enode *old_enode) + __must_hold(mas->tree->ma_lock) +{ + if (mte_is_root(mas->node)) { + mas_mn(mas)->parent = ma_parent_ptr( + ((unsigned long)mas->tree | MA_ROOT_PARENT)); + rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); + mas_set_height(mas); + } else { + unsigned char offset = 0; + void __rcu **slots = NULL; + + offset = mte_parent_slot(mas->node); + slots = ma_slots(mte_parent(mas->node), + mas_parent_type(mas, mas->node)); + rcu_assign_pointer(slots[offset], mas->node); + } + + mte_set_node_dead(old_enode); + mas_free(mas, old_enode); +} + /* * mas_new_child() - Find the new child of a node. * @mas: the maple state @@ -3176,7 +3206,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end { enum maple_type mt = mte_node_type(mas->node); struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; - struct maple_enode *eparent; + struct maple_enode *eparent, *old_eparent; unsigned char offset, tmp, split = mt_slots[mt] / 2; void __rcu **l_slots, **slots; unsigned long *l_pivs, *pivs, gap; @@ -3218,7 +3248,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end l_mas.max = l_pivs[split]; mas->min = l_mas.max + 1; - eparent = mt_mk_node(mte_parent(l_mas.node), + old_eparent = mt_mk_node(mte_parent(l_mas.node), mas_parent_type(&l_mas, l_mas.node)); tmp += end; if (!in_rcu) { @@ -3234,7 +3264,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end memcpy(node, newnode, sizeof(struct maple_node)); ma_set_meta(node, mt, 0, tmp - 1); - mte_set_pivot(eparent, mte_parent_slot(l_mas.node), + mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node), l_pivs[split]); /* Remove data from l_pivs. */ @@ -3242,6 +3272,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); ma_set_meta(left, mt, 0, split); + eparent = old_eparent; goto done; } @@ -3266,7 +3297,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end parent = mas_pop_node(mas); slots = ma_slots(parent, mt); pivs = ma_pivots(parent, mt); - memcpy(parent, mte_to_node(eparent), sizeof(struct maple_node)); + memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node)); rcu_assign_pointer(slots[offset], mas->node); rcu_assign_pointer(slots[offset - 1], l_mas.node); pivs[offset - 1] = l_mas.max; @@ -3278,8 +3309,10 @@ done: mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); mas_ascend(mas); - if (in_rcu) - mas_replace(mas, false); + if (in_rcu) { + mas_replace_node(mas, old_eparent); + mas_adopt_children(mas, mas->node); + } mas_update_gap(mas); } @@ -3596,11 +3629,13 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char end) { struct maple_node *node; + struct maple_enode *old_enode; unsigned char b_end = b_node->b_end; enum maple_type b_type = b_node->type; + old_enode = wr_mas->mas->node; if ((b_end < mt_min_slots[b_type]) && - (!mte_is_root(wr_mas->mas->node)) && + (!mte_is_root(old_enode)) && (mas_mt_height(wr_mas->mas) > 1)) return mas_rebalance(wr_mas->mas, b_node); @@ -3618,7 +3653,7 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, node->parent = mas_mn(wr_mas->mas)->parent; wr_mas->mas->node = mt_mk_node(node, b_type); mab_mas_cp(b_node, 0, b_end, wr_mas->mas, false); - mas_replace(wr_mas->mas, false); + mas_replace_node(wr_mas->mas, old_enode); reuse_node: mas_update_gap(wr_mas->mas); return 1; @@ -4117,9 +4152,10 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, done: mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end); if (in_rcu) { - mte_set_node_dead(mas->node); + struct maple_enode *old_enode = mas->node; + mas->node = mt_mk_node(newnode, wr_mas->type); - mas_replace(mas, false); + mas_replace_node(mas, old_enode); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } -- cgit v1.2.3 From 1238f6a226dc27ec34d229b71b02f0d6c46bbf11 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:48 -0400 Subject: maple_tree: introduce mas_put_in_tree() mas_replace() has a single user that takes a flag which is now always true. Replace this function with mas_put_in_tree() to better align with mas_replace_node(). Inline the remaining logic into the only caller; mas_wmb_replace(). Link: https://lkml.kernel.org/r/20230804165951.2661157-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton