summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.mailmap2
-rw-r--r--Documentation/mm/zsmalloc.rst135
-rw-r--r--fs/dax.c37
-rw-r--r--fs/fs-writeback.c17
-rw-r--r--fs/nilfs2/btree.c1
-rw-r--r--fs/nilfs2/direct.c1
-rw-r--r--fs/nilfs2/segment.c3
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/nilfs2/the_nilfs.c12
-rw-r--r--fs/userfaultfd.c6
-rw-r--r--include/linux/mm_types.h3
-rw-r--r--kernel/fork.c3
-rw-r--r--lib/maple_tree.c306
-rw-r--r--mm/backing-dev.c12
-rw-r--r--mm/huge_memory.c19
-rw-r--r--mm/hugetlb.c14
-rw-r--r--mm/khugepaged.c4
-rw-r--r--mm/memory.c16
-rw-r--r--mm/mempolicy.c104
-rw-r--r--mm/mmap.c3
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swapfile.c3
-rw-r--r--mm/vmalloc.c8
-rw-r--r--tools/mm/page_owner_sort.c2
-rw-r--r--tools/testing/radix-tree/maple.c16
26 files changed, 476 insertions, 257 deletions
diff --git a/.mailmap b/.mailmap
index e42486317d18..86d0760c15eb 100644
--- a/.mailmap
+++ b/.mailmap
@@ -232,6 +232,8 @@ Johan Hovold <johan@kernel.org> <johan@hovoldconsulting.com>
John Crispin <john@phrozen.org> <blogic@openwrt.org>
John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
John Stultz <johnstul@us.ibm.com>
+<jon.toppins+linux@gmail.com> <jtoppins@cumulusnetworks.com>
+<jon.toppins+linux@gmail.com> <jtoppins@redhat.com>
Jordan Crouse <jordan@cosmicpenguin.net> <jcrouse@codeaurora.org>
<josh@joshtriplett.org> <josh@freedesktop.org>
<josh@joshtriplett.org> <josh@kernel.org>
diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst
index 64d127bfc221..a3c26d587752 100644
--- a/Documentation/mm/zsmalloc.rst
+++ b/Documentation/mm/zsmalloc.rst
@@ -39,13 +39,12 @@ With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via
# cat /sys/kernel/debug/zsmalloc/zram0/classes
- class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage
+ class size 10% 20% 30% 40% 50% 60% 70% 80% 90% 99% 100% obj_allocated obj_used pages_used pages_per_zspage freeable
...
...
- 9 176 0 1 186 129 8 4
- 10 192 1 0 2880 2872 135 3
- 11 208 0 1 819 795 42 2
- 12 224 0 1 219 159 12 4
+ 30 512 0 12 4 1 0 1 0 0 1 0 414 3464 3346 433 1 14
+ 31 528 2 7 2 2 1 0 1 0 0 2 117 4154 3793 536 4 44
+ 32 544 6 3 4 1 2 1 0 0 0 1 260 4170 3965 556 2 26
...
...
@@ -54,10 +53,28 @@ class
index
size
object size zspage stores
-almost_empty
- the number of ZS_ALMOST_EMPTY zspages(see below)
-almost_full
- the number of ZS_ALMOST_FULL zspages(see below)
+10%
+ the number of zspages with usage ratio less than 10% (see below)
+20%
+ the number of zspages with usage ratio between 10% and 20%
+30%
+ the number of zspages with usage ratio between 20% and 30%
+40%
+ the number of zspages with usage ratio between 30% and 40%
+50%
+ the number of zspages with usage ratio between 40% and 50%
+60%
+ the number of zspages with usage ratio between 50% and 60%
+70%
+ the number of zspages with usage ratio between 60% and 70%
+80%
+ the number of zspages with usage ratio between 70% and 80%
+90%
+ the number of zspages with usage ratio between 80% and 90%
+99%
+ the number of zspages with usage ratio between 90% and 99%
+100%
+ the number of zspages with usage ratio 100%
obj_allocated
the number of objects allocated
obj_used
@@ -66,19 +83,14 @@ pages_used
the number of pages allocated for the class
pages_per_zspage
the number of 0-order pages to make a zspage
+freeable
+ the approximate number of pages class compaction can free
-We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where
-
-* n = number of allocated objects
-* N = total number of objects zspage can store
-* f = fullness_threshold_frac(ie, 4 at the moment)
-
-Similarly, we assign zspage to:
-
-* ZS_ALMOST_FULL when n > N / f
-* ZS_EMPTY when n == 0
-* ZS_FULL when n == N
-
+Each zspage maintains inuse counter which keeps track of the number of
+objects stored in the zspage. The inuse counter determines the zspage's
+"fullness group" which is calculated as the ratio of the "inuse" objects to
+the total number of objects the zspage can hold (objs_per_zspage). The
+closer the inuse counter is to objs_per_zspage, the better.
Internals
=========
@@ -94,10 +106,10 @@ of objects that each zspage can store.
For instance, consider the following size classes:::
- class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
+ class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
...
- 94 1536 0 0 0 0 0 3 0
- 100 1632 0 0 0 0 0 2 0
+ 94 1536 0 .... 0 0 0 0 3 0
+ 100 1632 0 .... 0 0 0 0 2 0
...
@@ -134,10 +146,11 @@ reduces memory wastage.
Let's take a closer look at the bottom of `/sys/kernel/debug/zsmalloc/zramX/classes`:::
- class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
+ class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
+
...
- 202 3264 0 0 0 0 0 4 0
- 254 4096 0 0 0 0 0 1 0
+ 202 3264 0 .. 0 0 0 0 4 0
+ 254 4096 0 .. 0 0 0 0 1 0
...
Size class #202 stores objects of size 3264 bytes and has a maximum of 4 pages
@@ -151,40 +164,42 @@ efficient storage of large objects.
For zspage chain size of 8, huge class watermark becomes 3632 bytes:::
- class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
+ class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
+
...
- 202 3264 0 0 0 0 0 4 0
- 211 3408 0 0 0 0 0 5 0
- 217 3504 0 0 0 0 0 6 0
- 222 3584 0 0 0 0 0 7 0
- 225 3632 0 0 0 0 0 8 0
- 254 4096 0 0 0 0 0 1 0
+ 202 3264 0 .. 0 0 0 0 4 0
+ 211 3408 0 .. 0 0 0 0 5 0
+ 217 3504 0 .. 0 0 0 0 6 0
+ 222 3584 0 .. 0 0 0 0 7 0
+ 225 3632 0 .. 0 0 0 0 8 0
+ 254 4096 0 .. 0 0 0 0 1 0
...
For zspage chain size of 16, huge class watermark becomes 3840 bytes:::
- class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
+ class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
+
...
- 202 3264 0 0 0 0 0 4 0
- 206 3328 0 0 0 0 0 13 0
- 207 3344 0 0 0 0 0 9 0
- 208 3360 0 0 0 0 0 14 0
- 211 3408 0 0 0 0 0 5 0
- 212 3424 0 0 0 0 0 16 0
- 214 3456 0 0 0 0 0 11 0
- 217 3504 0 0 0 0 0 6 0
- 219 3536 0 0 0 0 0 13 0
- 222 3584 0 0 0 0 0 7 0
- 223 3600 0 0 0 0 0 15 0
- 225 3632 0 0 0 0 0 8 0
- 228 3680 0 0 0 0 0 9 0
- 230 3712 0 0 0 0 0 10 0
- 232 3744 0 0 0 0 0 11 0
- 234 3776 0 0 0 0 0 12 0
- 235 3792 0 0 0 0 0 13 0
- 236 3808 0 0 0 0 0 14 0
- 238 3840 0 0 0 0 0 15 0
- 254 4096 0 0 0 0 0 1 0
+ 202 3264 0 .. 0 0 0 0 4 0
+ 206 3328 0 .. 0 0 0 0 13 0
+ 207 3344 0 .. 0 0 0 0 9 0
+ 208 3360 0 .. 0 0 0 0 14 0
+ 211 3408 0 .. 0 0 0 0 5 0
+ 212 3424 0 .. 0 0 0 0 16 0
+ 214 3456 0 .. 0 0 0 0 11 0
+ 217 3504 0 .. 0 0 0 0 6 0
+ 219 3536 0 .. 0 0 0 0 13 0
+ 222 3584 0 .. 0 0 0 0 7 0
+ 223 3600 0 .. 0 0 0 0 15 0
+ 225 3632 0 .. 0 0 0 0 8 0
+ 228 3680 0 .. 0 0 0 0 9 0
+ 230 3712 0 .. 0 0 0 0 10 0
+ 232 3744 0 .. 0 0 0 0 11 0
+ 234 3776 0 .. 0 0 0 0 12 0
+ 235 3792 0 .. 0 0 0 0 13 0
+ 236 3808 0 .. 0 0 0 0 14 0
+ 238 3840 0 .. 0 0 0 0 15 0
+ 254 4096 0 .. 0 0 0 0 1 0
...
Overall the combined zspage chain size effect on zsmalloc pool configuration:::
@@ -214,9 +229,10 @@ zram as a build artifacts storage (Linux kernel compilation).
zsmalloc classes stats:::
- class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
+ class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
+
...
- Total 13 51 413836 412973 159955 3
+ Total 13 .. 51 413836 412973 159955 3
zram mm_stat:::
@@ -227,9 +243,10 @@ zram as a build artifacts storage (Linux kernel compilation).
zsmalloc classes stats:::
- class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
+ class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
+
...
- Total 18 87 414852 412978 156666 0
+ Total 18 .. 87 414852 412978 156666 0
zram mm_stat:::
diff --git a/fs/dax.c b/fs/dax.c
index 5d2e9b10030e..2ababb89918d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -781,6 +781,33 @@ out:
return ret;
}
+static int __dax_clear_dirty_range(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ XA_STATE(xas, &mapping->i_pages, start);
+ unsigned int scanned = 0;
+ void *entry;
+
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, entry, end) {
+ entry = get_unlocked_entry(&xas, 0);
+ xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ put_unlocked_entry(&xas, entry, WAKE_NEXT);
+
+ if (++scanned % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
+ }
+ xas_unlock_irq(&xas);
+
+ return 0;
+}
+
/*
* Delete DAX entry at @index from @mapping. Wait for it
* to be unlocked before deleting it.
@@ -1440,6 +1467,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
* written by write(2) is visible in mmap.
*/
if (iomap->flags & IOMAP_F_NEW || cow) {
+ /*
+ * Filesystem allows CoW on non-shared extents. The src extents
+ * may have been mmapped with dirty mark before. To be able to
+ * invalidate its dax entries, we need to clear the dirty mark
+ * in advance.
+ */
+ if (cow)
+ __dax_clear_dirty_range(iomi->inode->i_mapping,
+ pos >> PAGE_SHIFT,
+ (end - 1) >> PAGE_SHIFT);
invalidate_inode_pages2_range(iomi->inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 195dc23e0d83..1db3e3c24b43 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -978,6 +978,16 @@ restart:
continue;
}
+ /*
+ * If wb_tryget fails, the wb has been shutdown, skip it.
+ *
+ * Pin @wb so that it stays on @bdi->wb_list. This allows
+ * continuing iteration from @wb after dropping and
+ * regrabbing rcu read lock.
+ */
+ if (!wb_tryget(wb))
+ continue;
+
/* alloc failed, execute synchronously using on-stack fallback */
work = &fallback_work;
*work = *base_work;
@@ -986,13 +996,6 @@ restart:
work->done = &fallback_work_done;
wb_queue_work(wb, work);
-
- /*
- * Pin @wb so that it stays on @bdi->wb_list. This allows
- * continuing iteration from @wb after dropping and
- * regrabbing rcu read lock.
- */
- wb_get(wb);
last_wb = wb;
rcu_read_unlock();
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 2681a449edc1..13592e82eaf6 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -2219,6 +2219,7 @@ static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
/* on-disk format */
binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
binfo->bi_dat.bi_level = level;
+ memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad));
return 0;
}
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index a35f2795b242..4c85914f2abc 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -314,6 +314,7 @@ static int nilfs_direct_assign_p(struct nilfs_bmap *direct,
binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
binfo->bi_dat.bi_level = 0;
+ memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad));
return 0;
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 19446a8243d7..6ad41390fa74 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2609,11 +2609,10 @@ static int nilfs_segctor_thread(void *arg)
goto loop;
end_thread:
- spin_unlock(&sci->sc_state_lock);
-
/* end sync. */
sci->sc_task = NULL;
wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
+ spin_unlock(&sci->sc_state_lock);
return 0;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1422b8ba24ed..77f1e5778d1c 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -482,6 +482,7 @@ static void nilfs_put_super(struct super_block *sb)
up_write(&nilfs->ns_sem);
}
+ nilfs_sysfs_delete_device_group(nilfs);
iput(nilfs->ns_sufile);
iput(nilfs->ns_cpfile);
iput(nilfs->ns_dat);
@@ -1105,6 +1106,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
nilfs_put_root(fsroot);
failed_unload:
+ nilfs_sysfs_delete_device_group(nilfs);
iput(nilfs->ns_sufile);
iput(nilfs->ns_cpfile);
iput(nilfs->ns_dat);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 3a4c9c150cbf..2894152a6b25 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -87,7 +87,6 @@ void destroy_nilfs(struct the_nilfs *nilfs)
{
might_sleep();
if (nilfs_init(nilfs)) {
- nilfs_sysfs_delete_device_group(nilfs);
brelse(nilfs->ns_sbh[0]);
brelse(nilfs->ns_sbh[1]);
}
@@ -305,6 +304,10 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
goto failed;
}
+ err = nilfs_sysfs_create_device_group(sb);
+ if (unlikely(err))
+ goto sysfs_error;
+
if (valid_fs)
goto skip_recovery;
@@ -366,6 +369,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
goto failed;
failed_unload:
+ nilfs_sysfs_delete_device_group(nilfs);
+
+ sysfs_error:
iput(nilfs->ns_cpfile);
iput(nilfs->ns_sufile);
iput(nilfs->ns_dat);
@@ -697,10 +703,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
if (err)
goto failed_sbh;
- err = nilfs_sysfs_create_device_group(sb);
- if (err)
- goto failed_sbh;
-
set_nilfs_init(nilfs);
err = 0;
out:
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8395605790f6..3b2a41c330e6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1977,8 +1977,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EFAULT;
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
- /* Ignore unsupported features (userspace built against newer kernel) */
- features = uffdio_api.features & UFFD_API_FEATURES;
+ features = uffdio_api.features;
+ ret = -EINVAL;
+ if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+ goto err_out;
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
goto err_out;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 092f842a854f..3fc9e680f174 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -800,7 +800,8 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};
-#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
+#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
+ MT_FLAGS_USE_RCU)
extern struct mm_struct init_mm;
/* Pointer magic because the dynamic array size confuses some compilers. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 2bde99037652..9051bc07b600 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -683,6 +683,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto out;
+ mt_clear_in_rcu(vmi.mas.tree);
for_each_vma(old_vmi, mpnt) {
struct file *file;
@@ -766,6 +767,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
+ if (!retval)
+ mt_set_in_rcu(vmi.mas.tree);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index ae37a167e25d..5577e6da6fe3 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -185,7 +185,7 @@ static void mt_free_rcu(struct rcu_head *head)
*/
static void ma_free_rcu(struct maple_node *node)
{
- node->parent = ma_parent_ptr(node);
+ WARN_ON(node->parent != ma_parent_ptr(node));
call_rcu(&node->rcu, mt_free_rcu);
}
@@ -539,11 +539,14 @@ static inline struct maple_node *mte_parent(const struct maple_enode *enode)
*/
static inline bool ma_dead_node(const struct maple_node *node)
{
- struct maple_node *parent = (void *)((unsigned long)
- node->parent & ~MAPLE_NODE_MASK);
+ struct maple_node *parent;
+ /* Do not reorder reads from the node prior to the parent check */
+ smp_rmb();
+ parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK);
return (parent == node);
}
+
/*
* mte_dead_node() - check if the @enode is dead.
* @enode: The encoded maple node
@@ -555,6 +558,8 @@ static inline bool mte_dead_node(const struct maple_enode *enode)
struct maple_node *parent, *node;
node = mte_to_node(enode);
+ /* Do not reorder reads from the node prior to the parent check */
+ smp_rmb();
parent = mte_parent(enode);
return (parent == node);
}
@@ -625,6 +630,8 @@ static inline unsigned int mas_alloc_req(const struct ma_state *mas)
* @node - the maple node
* @type - the node type
*
+ * In the event of a dead node, this array may be %NULL
+ *
* Return: A pointer to the maple node pivots
*/
static inline unsigned long *ma_pivots(struct maple_node *node,
@@ -817,6 +824,11 @@ static inline void *mt_slot(const struct maple_tree *mt,
return rcu_dereference_check(slots[offset], mt_locked(mt));
}
+static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots,
+ unsigned char offset)
+{
+ return rcu_dereference_protected(slots[offset], mt_locked(mt));
+}
/*
* mas_slot_locked() - Get the slot value when holding the maple tree lock.
* @mas: The maple state
@@ -828,7 +840,7 @@ static inline void *mt_slot(const struct maple_tree *mt,
static inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots,
unsigned char offset)
{
- return rcu_dereference_protected(slots[offset], mt_locked(mas->tree));
+ return mt_slot_locked(mas->tree, slots, offset);
}
/*
@@ -900,6 +912,45 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt,
}
/*
+ * mt_clear_meta() - clear the metadata information of a node, if it exists
+ * @mt: The maple tree
+ * @mn: The maple node
+ * @type: The maple node type
+ * @offset: The offset of the highest sub-gap in this node.
+ * @end: The end of the data in this node.
+ */
+static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn,
+ enum maple_type type)
+{
+ struct maple_metadata *meta;
+ unsigned long *pivots;
+ void __rcu **slots;
+ void *next;
+
+ switch (type) {
+ case maple_range_64:
+ pivots = mn->mr64.pivot;
+ if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) {
+ slots = mn->mr64.slot;
+ next = mt_slot_locked(mt, slots,
+ MAPLE_RANGE64_SLOTS - 1);
+ if (unlikely((mte_to_node(next) &&
+ mte_node_type(next))))
+ return; /* no metadata, could be node */
+ }
+ fallthrough;
+ case maple_arange_64:
+ meta = ma_meta(mn, type);
+ break;
+ default:
+ return;
+ }
+
+ meta->gap = 0;
+ meta->end = 0;
+}
+
+/*
* ma_meta_end() - Get the data end of a node from the metadata
* @mn: The maple node
* @mt: The maple node type
@@ -1096,8 +1147,11 @@ static int mas_ascend(struct ma_state *mas)
a_type = mas_parent_enum(mas, p_enode);
a_node = mte_parent(p_enode);
a_slot = mte_parent_slot(p_enode);
- pivots = ma_pivots(a_node, a_type);
a_enode = mt_mk_node(a_node, a_type);
+ pivots = ma_pivots(a_node, a_type);
+
+ if (unlikely(ma_dead_node(a_node)))
+ return 1;
if (!set_min && a_slot) {
set_min = true;
@@ -1249,26 +1303,21 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
node = mas->alloc;
node->request_count = 0;
while (requested) {
- max_req = MAPLE_ALLOC_SLOTS;
- if (node->node_count) {
- unsigned int offset = node->node_count;
-
- slots = (void **)&node->slot[offset];
- max_req -= offset;
- } else {
- slots = (void **)&node->slot;
- }
-
+ max_req = MAPLE_ALLOC_SLOTS - node->node_count;
+ slots = (void **)&node->slot[node->node_count];
max_req = min(requested, max_req);
count = mt_alloc_bulk(gfp, max_req, slots);
if (!count)
goto nomem_bulk;
+ if (node->node_count == 0) {
+ node->slot[0]->node_count = 0;
+ node->slot[0]->request_count = 0;
+ }
+
node->node_count += count;
allocated += count;
node = node->slot[0];
- node->node_count = 0;
- node->request_count = 0;
requested -= count;
}
mas->alloc->total = allocated;
@@ -1354,12 +1403,16 @@ static inline struct maple_enode *mas_start(struct ma_state *mas)
mas->max = ULONG_MAX;
mas->depth = 0;
+retry:
root = mas_root(mas);
/* Tree with nodes */
if (likely(xa_is_node(root))) {
mas->depth = 1;
mas->node = mte_safe_root(root);
mas->offset = 0;
+ if (mte_dead_node(mas->node))
+ goto retry;
+
return NULL;
}
@@ -1401,6 +1454,9 @@ static inline unsigned char ma_data_end(struct maple_node *node,
{
unsigned char offset;
+ if (!pivots)
+ return 0;
+
if (type == maple_arange_64)
return ma_meta_end(node, type);
@@ -1436,6 +1492,9 @@ static inline unsigned char mas_data_end(struct ma_state *mas)
return ma_meta_end(node, type);
pivots = ma_pivots(node, type);
+ if (unlikely(ma_dead_node(node)))
+ return 0;
+
offset = mt_pivots[type] - 1;
if (likely(!pivots[offset]))
return ma_meta_end(node, type);
@@ -1724,8 +1783,10 @@ static inline void mas_replace(struct ma_state *mas, bool advanced)
rcu_assign_pointer(slots[offset], mas->node);
}
- if (!advanced)
+ if (!advanced) {
+ mte_set_node_dead(old_enode);
mas_free(mas, old_enode);
+ }
}
/*
@@ -3659,10 +3720,9 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry)
slot++;
mas->depth = 1;
mas_set_height(mas);
-
+ ma_set_meta(node, maple_leaf_64, 0, slot);
/* swap the new root into the tree */
rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
- ma_set_meta(node, maple_leaf_64, 0, slot);
return slot;
}
@@ -3875,18 +3935,13 @@ static inline void *mtree_lookup_walk(struct ma_state *mas)
end = ma_data_end(node, type, pivots, max);
if (unlikely(ma_dead_node(node)))
goto dead_node;
-
- if (pivots[offset] >= mas->index)
- goto next;
-
do {
- offset++;
- } while ((offset < end) && (pivots[offset] < mas->index));
-
- if (likely(offset > end))
- max = pivots[offset];
+ if (pivots[offset] >= mas->index) {
+ max = pivots[offset];
+ break;
+ }
+ } while (++offset < end);
-next:
slots = ma_slots(node, type);
next = mt_slot(mas->tree, slots, offset);
if (unlikely(ma_dead_node(node)))
@@ -4164,6 +4219,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas)
done:
mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end);
if (in_rcu) {
+ mte_set_node_dead(mas->node);
mas->node = mt_mk_node(newnode, wr_mas->type);
mas_replace(mas, false);
} else {
@@ -4505,6 +4561,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min)
node = mas_mn(mas);
slots = ma_slots(node, mt);
pivots = ma_pivots(node, mt);
+ if (unlikely(ma_dead_node(node)))
+ return 1;
+
mas->max = pivots[offset];
if (offset)
mas->min = pivots[offset - 1] + 1;
@@ -4526,6 +4585,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min)
slots = ma_slots(node, mt);
pivots = ma_pivots(node, mt);
offset = ma_data_end(node, mt, pivots, mas->max);
+ if (unlikely(ma_dead_node(node)))
+ return 1;
+
if (offset)
mas->min = pivots[offset - 1] + 1;
@@ -4574,6 +4636,7 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
struct maple_enode *enode;
int level = 0;
unsigned char offset;
+ unsigned char node_end;
enum maple_type mt;
void __rcu **slots;
@@ -4597,7 +4660,11 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
node = mas_mn(mas);
mt = mte_node_type(mas->node);
pivots = ma_pivots(node, mt);
- } while (unlikely(offset == ma_data_end(node, mt, pivots, mas->max)));
+ node_end = ma_data_end(node, mt, pivots, mas->max);
+ if (unlikely(ma_dead_node(node)))
+ return 1;
+
+ } while (unlikely(offset == node_end));
slots = ma_slots(node, mt);
pivot = mas_safe_pivot(mas, pivots, ++offset, mt);
@@ -4613,6 +4680,9 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
mt = mte_node_type(mas->node);
slots = ma_slots(node, mt);
pivots = ma_pivots(node, mt);
+ if (unlikely(ma_dead_node(node)))
+ return 1;
+
offset = 0;
pivot = pivots[0];
}
@@ -4659,11 +4729,14 @@ static inline void *mas_next_nentry(struct ma_state *mas,
return NULL;
}
- pivots = ma_pivots(node, type);
slots = ma_slots(node, type);
- mas->index = mas_safe_min(mas, pivots, mas->offset);
+ pivots = ma_pivots(node, type);
count = ma_data_end(node, type, pivots, mas->max);
- if (ma_dead_node(node))
+ if (unlikely(ma_dead_node(node)))
+ return NULL;
+
+ mas->index = mas_safe_min(mas, pivots, mas->offset);
+ if (unlikely(ma_dead_node(node)))
return NULL;
if (mas->index > max)
@@ -4817,6 +4890,11 @@ retry:
slots = ma_slots(mn, mt);
pivots = ma_pivots(mn, mt);
+ if (unlikely(ma_dead_node(mn))) {
+ mas_rewalk(mas, index);
+ goto retry;
+ }
+
if (offset == mt_pivots[mt])
pivot = mas->max;
else
@@ -5400,24 +5478,26 @@ no_gap:
}