From b5e7b59c3480f355910f9d2c6ece5857922a5e54 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 Sep 2021 09:47:57 +1000 Subject: NFS: change nfs_access_get_cached to only report the mask Currently the nfs_access_get_cached family of functions report a 'struct nfs_access_entry' as the result, with both .mask and .cred set. However the .cred is never used. This is probably good and there is no guarantee that it won't be freed before use. Change to only report the 'mask' - as this is all that is used or needed. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- include/linux/nfs_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 05f249f20f55..f33559acbcc2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -533,8 +533,8 @@ extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags); extern void nfs_access_zap_cache(struct inode *inode); -extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, - bool may_block); +extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, + u32 *mask, bool may_block); /* * linux/fs/nfs/symlink.c -- cgit v1.2.3 From 73fbb3fa647bdb5b60469af8101c741ece03a825 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 Sep 2021 09:47:57 +1000 Subject: NFS: pass cred explicitly for access tests Storing the 'struct cred *' in nfs_access_entry is problematic. An active 'cred' can keep a 'struct key *' active, and a quota is imposed on the number of such keys that a user can maintain. Cached 'nfs_access_entry' structs have indefinite lifetime, and having these keep 'struct key's alive imposes on that quota. So a future patch will remove the ->cred ref from nfs_access_entry. To prepare, change various functions to not assume there is a 'cred' in the nfs_access_entry, but to pass the cred around explicitly. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- include/linux/nfs_fs.h | 2 +- include/linux/nfs_xdr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f33559acbcc2..c33b1b792bc9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -396,7 +396,7 @@ extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fa extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int); -extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); +extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *, const struct cred *); extern void nfs_access_set_mask(struct nfs_access_entry *, u32); extern int nfs_permission(struct user_namespace *, struct inode *, int); extern int nfs_open(struct inode *, struct file *); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 967a0098f0a9..9102bdaa3faa 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1737,7 +1737,7 @@ struct nfs_rpc_ops { struct nfs_fh *, struct nfs_fattr *); int (*lookupp) (struct inode *, struct nfs_fh *, struct nfs_fattr *); - int (*access) (struct inode *, struct nfs_access_entry *); + int (*access) (struct inode *, struct nfs_access_entry *, const struct cred *); int (*readlink)(struct inode *, struct page *, unsigned int, unsigned int); int (*create) (struct inode *, struct dentry *, -- cgit v1.2.3 From 6238aec83f3fb12132f964937e5bbcf248fea8f9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 Sep 2021 09:47:57 +1000 Subject: NFS: don't store 'struct cred *' in struct nfs_access_entry Storing the 'struct cred *' in nfs_access_entry is problematic. An active 'cred' can keep a 'struct key *' active, and a quota is imposed on the number of such keys that a user can maintain. Cached 'nfs_access_entry' structs have indefinite lifetime, and having these keep 'struct key's alive imposes on that quota. So remove the 'struct cred *' and replace it with the fields we need: kuid_t, kgid_t, and struct group_info * This makes the 'struct nfs_access_entry' 64 bits larger. New function "access_cmp" is introduced which is identical to cred_fscmp() except that the second arg is an 'nfs_access_entry', rather than a 'cred' Fixes: b68572e07c58 ("NFS: change access cache to use 'struct cred'.") Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- include/linux/nfs_fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index c33b1b792bc9..450e393d4bf2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -61,7 +61,9 @@ struct nfs_access_entry { struct rb_node rb_node; struct list_head lru; - const struct cred * cred; + kuid_t fsuid; + kgid_t fsgid; + struct group_info *group_info; __u32 mask; struct rcu_head rcu_head; }; -- cgit v1.2.3 From 1ab5be4ac5b1c9ce39ce1037c45b68d2ce6eede0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Dec 2021 15:36:54 -0500 Subject: NFSv4: Add some support for case insensitive filesystems Add capabilities to allow the NFS client to recognise when it is dealing with case insensitive and case preserving filesystems. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 2 ++ include/linux/nfs_xdr.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 2a9acbfe00f0..f24fc67af42d 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -271,6 +271,8 @@ struct nfs_server { #define NFS_CAP_ACLS (1U << 3) #define NFS_CAP_ATOMIC_OPEN (1U << 4) #define NFS_CAP_LGOPEN (1U << 5) +#define NFS_CAP_CASE_INSENSITIVE (1U << 6) +#define NFS_CAP_CASE_PRESERVING (1U << 7) #define NFS_CAP_POSIX_LOCK (1U << 14) #define NFS_CAP_UIDGID_NOMAP (1U << 15) #define NFS_CAP_STATEID_NFSV41 (1U << 16) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9102bdaa3faa..ddff92396a3f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1194,6 +1194,8 @@ struct nfs4_server_caps_res { u32 has_links; u32 has_symlinks; u32 fh_expire_type; + u32 case_insensitive; + u32 case_preserving; }; #define NFS4_PATHNAME_MAXCOMPONENTS 512 -- cgit v1.2.3 From 8a59bb93b7e3cca389af44781a429ac12ac49be6 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:30 -0500 Subject: NFSv4 store server support for fs_location attribute Define and store if server returns it supports fs_locations attribute as a capability. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index f24fc67af42d..8c08b356c8ca 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -289,5 +289,5 @@ struct nfs_server { #define NFS_CAP_COPY_NOTIFY (1U << 27) #define NFS_CAP_XATTR (1U << 28) #define NFS_CAP_READ_PLUS (1U << 29) - +#define NFS_CAP_FS_LOCATIONS (1U << 30) #endif -- cgit v1.2.3 From 1976b2b31462151403c9fc110204fcc2a77bdfd1 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 12 Jan 2022 10:27:38 -0500 Subject: NFSv4.1 query for fs_location attr on a new file system Query the server for other possible trunkable locations for a given file system on a 4.1+ mount. v2: -- added missing static to nfs4_discover_trunking, reported by the kernel test robot Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ddff92396a3f..728cb0c1f0b6 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1797,6 +1797,7 @@ struct nfs_rpc_ops { struct nfs_server *(*create_server)(struct fs_context *); struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); + int (*discover_trunking)(struct nfs_server *, struct nfs_fh *); }; /* -- cgit v1.2.3 From 09f5e7dc7ad705289e1b1ec065439aa3c42951c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Dec 2021 13:19:52 +0100 Subject: perf: Fix perf_event_read_local() time Time readers that cannot take locks (due to NMI etc..) currently make use of perf_event::shadow_ctx_time, which, for that event gives: time' = now + (time - timestamp) or, alternatively arranged: time' = time + (now - timestamp) IOW, the progression of time since the last time the shadow_ctx_time was updated. There's problems with this: A) the shadow_ctx_time is per-event, even though the ctx_time it reflects is obviously per context. The direct concequence of this is that the context needs to iterate all events all the time to keep the shadow_ctx_time in sync. B) even with the prior point, the context itself might not be active meaning its time should not advance to begin with. C) shadow_ctx_time isn't consistently updated when ctx_time is There are 3 users of this stuff, that suffer differently from this: - calc_timer_values() - perf_output_read() - perf_event_update_userpage() /* A */ - perf_event_read_local() /* A,B */ In particular, perf_output_read() doesn't suffer at all, because it's sample driven and hence only relevant when the event is actually running. This same was supposed to be true for perf_event_update_userpage(), after all self-monitoring implies the context is active *HOWEVER*, as per commit f79256532682 ("perf/core: fix userpage->time_enabled of inactive events") this goes wrong when combined with counter overcommit, in that case those events that do not get scheduled when the context becomes active (task events typically) miss out on the EVENT_TIME update and ENABLED time is inflated (for a little while) with the time the context was inactive. Once the event gets rotated in, this gets corrected, leading to a non-monotonic timeflow. perf_event_read_local() made things even worse, it can request time at any point, suffering all the problems perf_event_update_userpage() does and more. Because while perf_event_update_userpage() is limited by the context being active, perf_event_read_local() users have no such constraint. Therefore, completely overhaul things and do away with perf_event::shadow_ctx_time. Instead have regular context time updates keep track of this offset directly and provide perf_event_time_now() to complement perf_event_time(). perf_event_time_now() will, in adition to being context wide, also take into account if the context is active. For inactive context, it will not advance time. This latter property means the cgroup perf_cgroup_info context needs to grow addition state to track this. Additionally, since all this is strictly per-cpu, we can use barrier() to order context activity vs context time. Fixes: 7d9285e82db5 ("perf/bpf: Extend the perf_event_read_local() interface, a.k.a. "bpf: perf event change needed for subsequent bpf helpers"") Signed-off-by: Peter Zijlstra (Intel) Tested-by: Song Liu Tested-by: Namhyung Kim Link: https://lkml.kernel.org/r/YcB06DasOBtU0b00@hirez.programming.kicks-ass.net --- include/linux/perf_event.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 117f230bcdfd..733649184b27 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -693,18 +693,6 @@ struct perf_event { u64 total_time_running; u64 tstamp; - /* - * timestamp shadows the actual context timing but it can - * be safely used in NMI interrupt context. It reflects the - * context time as it was when the event was last scheduled in, - * or when ctx_sched_in failed to schedule the event because we - * run out of PMC. - * - * ctx_time already accounts for ctx->timestamp. Therefore to - * compute ctx_time for a sample, simply add perf_clock(). - */ - u64 shadow_ctx_time; - struct perf_event_attr attr; u16 header_size; u16 id_header_size; @@ -852,6 +840,7 @@ struct perf_event_context { */ u64 time; u64 timestamp; + u64 timeoffset; /* * These fields let us detect when two contexts have both @@ -934,6 +923,8 @@ struct bpf_perf_event_data_kern { struct perf_cgroup_info { u64 time; u64 timestamp; + u64 timeoffset; + int active; }; struct perf_cgroup { -- cgit v1.2.3 From a06247c6804f1a7c86a2e5398a4c1f1db1471848 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 11 Jan 2022 15:23:09 -0800 Subject: psi: Fix uaf issue when psi trigger is destroyed while being polled With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak. Fixes: 0e94682b73bf ("psi: introduce psi monitor") Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Suggested-by: Linus Torvalds Analyzed-by: Eric Biggers Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Eric Biggers Acked-by: Johannes Weiner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220111232309.1786347-1-surenb@google.com --- include/linux/psi.h | 2 +- include/linux/psi_types.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi.h b/include/linux/psi.h index a70ca833c6d7..f8ce53bfdb2a 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -33,7 +33,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to); struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res); -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); +void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 516c0fe836fd..1a3cef26d129 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -141,9 +141,6 @@ struct psi_trigger { * events to one per window */ u64 last_event_time; - - /* Refcounting to prevent premature destruction */ - struct kref refcount; }; struct psi_group { -- cgit v1.2.3 From 0e3872499de1a1230cef5221607d71aa09264bd5 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Fri, 7 Jan 2022 17:52:54 +0800 Subject: kernel/sched: Remove dl_boosted flag comment since commit 2279f540ea7d ("sched/deadline: Fix priority inheritance with multiple scheduling classes"), we should not keep it here. Signed-off-by: Hui Su Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Link: https://lore.kernel.org/r/20220107095254.GA49258@localhost.localdomain --- include/linux/sched.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7a1f16df66e3..7f8b4495e243 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -614,10 +614,6 @@ struct sched_dl_entity { * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * - * @dl_boosted tells if we are boosted due to DI. If so we are - * outside bandwidth enforcement mechanism (but only until we - * exit the critical section); - * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * -- cgit v1.2.3 From 47934e06b65637c88a762d9c98329ae6e3238888 Mon Sep 17 00:00:00 2001 From: Congyu Liu Date: Tue, 18 Jan 2022 14:20:13 -0500 Subject: net: fix information leakage in /proc/net/ptype In one net namespace, after creating a packet socket without binding it to a device, users in other net namespaces can observe the new `packet_type` added by this packet socket by reading `/proc/net/ptype` file. This is minor information leakage as packet socket is namespace aware. Add a net pointer in `packet_type` to keep the net namespace of of corresponding packet socket. In `ptype_seq_show`, this net pointer must be checked when it is not NULL. Fixes: 2feb27dbe00c ("[NETNS]: Minor information leak via /proc/net/ptype file.") Signed-off-by: Congyu Liu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3213c7227b59..e490b84732d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2548,6 +2548,7 @@ struct packet_type { struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); + struct net *af_packet_net; void *af_packet_priv; struct list_head list; }; -- cgit v1.2.3 From e2f08207c558bc0bc8abaa557cdb29bad776ac7b Mon Sep 17 00:00:00 2001 From: Moshe Tal Date: Thu, 20 Jan 2022 11:55:50 +0200 Subject: ethtool: Fix link extended state for big endian The link extended sub-states are assigned as enum that is an integer size but read from a union as u8, this is working for small values on little endian systems but for big endian this always give 0. Fix the variable in the union to match the enum size. Fixes: ecc31c60240b ("ethtool: Add link extended state") Signed-off-by: Moshe Tal Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: Gal Pressman Reviewed-by: Amit Cohen Signed-off-by: David S. Miller --- include/linux/ethtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index a26f37a27167..11efc45de66a 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -111,7 +111,7 @@ struct ethtool_link_ext_state_info { enum ethtool_link_ext_substate_bad_signal_integrity bad_signal_integrity; enum ethtool_link_ext_substate_cable_issue cable_issue; enum ethtool_link_ext_substate_module module; - u8 __link_ext_substate; + u32 __link_ext_substate; }; }; -- cgit v1.2.3 From 5298d4bfe80f6ae6ae2777bcd1357b0022d98573 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jan 2022 07:56:14 +0100 Subject: unicode: clean up the Kconfig symbol confusion Turn the CONFIG_UNICODE symbol into a tristate that generates some always built in code and remove the confusing CONFIG_UNICODE_UTF8_DATA symbol. Note that a lot of the IS_ENABLED() checks could be turned from cpp statements into normal ifs, but this change is intended to be fairly mechanic, so that should be cleaned up later. Fixes: 2b3d04787012 ("unicode: Add utf8-data module") Reported-by: Linus Torvalds Reviewed-by: Eric Biggers Signed-off-by: Christoph Hellwig Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c8510da6cc6d..fdac22d16c2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1490,7 +1490,7 @@ struct super_block { #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif -- cgit v1.2.3 From a37d9a17f099072fe4d3a9048b0321978707a918 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 20 Jan 2022 23:53:04 +0200 Subject: fsnotify: invalidate dcache before IN_DELETE event Apparently, there are some applications that use IN_DELETE event as an invalidation mechanism and expect that if they try to open a file with the name reported with the delete event, that it should not contain the content of the deleted file. Commit 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of d_delete()") moved the fsnotify delete hook before d_delete() so fsnotify will have access to a positive dentry. This allowed a race where opening the deleted file via cached dentry is now possible after receiving the IN_DELETE event. To fix the regression, create a new hook fsnotify_delete() that takes the unlinked inode as an argument and use a helper d_delete_notify() to pin the inode, so we can pass it to fsnotify_delete() after d_delete(). Backporting hint: this regression is from v5.3. Although patch will apply with only trivial conflicts to v5.4 and v5.10, it won't build, because fsnotify_delete() implementation is different in each of those versions (see fsnotify_link()). A follow up patch will fix the fsnotify_unlink/rmdir() calls in pseudo filesystem that do not need to call d_delete(). Link: https://lore.kernel.org/r/20220120215305.282577-1-amir73il@gmail.com Reported-by: Ivan Delalande Link: https://lore.kernel.org/linux-fsdevel/YeNyzoDM5hP5LtGW@visor/ Fixes: 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of d_delete()") Cc: stable@vger.kernel.org # v5.3+ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify.h | 49 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 3a2d7dc3c607..bb8467cd11ae 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -224,6 +224,43 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, dir, &new_dentry->d_name, 0); } +/* + * fsnotify_delete - @dentry was unlinked and unhashed + * + * Caller must make sure that dentry->d_name is stable. + * + * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode + * as this may be called after d_delete() and old_dentry may be negative. + */ +static inline void fsnotify_delete(struct inode *dir, struct inode *inode, + struct dentry *dentry) +{ + __u32 mask = FS_DELETE; + + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + + fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, &dentry->d_name, + 0); +} + +/** + * d_delete_notify - delete a dentry and call fsnotify_delete() + * @dentry: The dentry to delete + * + * This helper is used to guaranty that the unlinked inode cannot be found + * by lookup of this name after fsnotify_delete() event has been delivered. + */ +static inline void d_delete_notify(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + ihold(inode); + d_delete(dentry); + fsnotify_delete(dir, inode, dentry); + iput(inode); +} + /* * fsnotify_unlink - 'name' was unlinked * @@ -231,10 +268,10 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, */ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry) { - /* Expected to be called before d_delete() */ - WARN_ON_ONCE(d_is_negative(dentry)); + if (WARN_ON_ONCE(d_is_negative(dentry))) + return; - fsnotify_dirent(dir, dentry, FS_DELETE); + fsnotify_delete(dir, d_inode(dentry), dentry); } /* @@ -258,10 +295,10 @@ static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry) */ static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry) { - /* Expected to be called before d_delete() */ - WARN_ON_ONCE(d_is_negative(dentry)); + if (WARN_ON_ONCE(d_is_negative(dentry))) + return; - fsnotify_dirent(dir, dentry, FS_DELETE | FS_ISDIR); + fsnotify_delete(dir, d_inode(dentry), dentry); } /* -- cgit v1.2.3 From 9daf0a4d32d60a57f2a2533bdf4c178be7fdff7f Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 16 Jan 2022 04:59:36 -0800 Subject: quota: cleanup double word in comment Remove the second 'handle'. Link: https://lore.kernel.org/r/20220116125936.389767-1-trix@redhat.com Signed-off-by: Tom Rix Signed-off-by: Jan Kara --- include/linux/quota.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/quota.h b/include/linux/quota.h index 18ebd39c9487..fd692b4a41d5 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -91,7 +91,7 @@ extern bool qid_valid(struct kqid qid); * * When there is no mapping defined for the user-namespace, type, * qid tuple an invalid kqid is returned. Callers are expected to - * test for and handle handle invalid kqids being returned. + * test for and handle invalid kqids being returned. * Invalid kqids may be tested for using qid_valid(). */ static inline struct kqid make_kqid(struct user_namespace *from, -- cgit v1.2.3 From 945c37ed564770c78dfe6b9f08bed57a1b4e60ef Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 10 Jan 2022 20:43:28 +0800 Subject: usb: roles: fix include/linux/usb/role.h compile issue when CONFIG_USB_ROLE_SWITCH is not defined, add usb_role_switch_find_by_fwnode() definition which return NULL. Fixes: c6919d5e0cd1 ("usb: roles: Add usb_role_switch_find_by_fwnode()") Signed-off-by: Linyu Yuan Link: https://lore.kernel.org/r/1641818608-25039-1-git-send-email-quic_linyyuan@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/role.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/role.h b/include/linux/usb/role.h index 031f148ab373..b5deafd91f67 100644 --- a/include/linux/usb/role.h +++ b/include/linux/usb/role.h @@ -91,6 +91,12 @@ fwnode_usb_role_switch_get(struct fwnode_handle *node) static inline void usb_role_switch_put(struct usb_role_switch *sw) { } +static inline struct usb_role_switch * +usb_role_switch_find_by_fwnode(const struct fwnode_handle *fwnode) +{ + return NULL; +} + static inline struct usb_role_switch * usb_role_switch_register(struct device *parent, const struct usb_role_switch_desc *desc) -- cgit v1.2.3 From 33569ef3c754a82010f266b7b938a66a3ccf90a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Wed, 19 Jan 2022 11:47:51 +0100 Subject: PM: hibernate: Remove register_nosave_region_late() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is an unused wrapper forcing kmalloc allocation for registering nosave regions. Also, rename __register_nosave_region() to register_nosave_region() now that there is no need for disambiguation. Signed-off-by: Amadeusz Sławiński Reviewed-by: Cezary Rojewski Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 5785d909c321..3e8ecdebe601 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -430,15 +430,7 @@ struct platform_hibernation_ops { #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ -extern void __register_nosave_region(unsigned long b, unsigned long e, int km); -static inline void __init register_nosave_region(unsigned long b, unsigned long e) -{ - __register_nosave_region(b, e, 0); -} -static inline void __init register_nosave_region_late(unsigned long b, unsigned long e) -{ - __register_nosave_region(b, e, 1); -} +extern void register_nosave_region(unsigned long b, unsigned long e); extern int swsusp_page_is_forbidden(struct page *); extern void swsusp_set_page_free(struct page *); extern void swsusp_unset_page_free(struct page *); @@ -458,7 +450,6 @@ int pfn_is_nosave(unsigned long pfn); int hibernate_quiet_exec(int (*func)(void *data), void *data); #else /* CONFIG_HIBERNATION */ static inline void register_nosave_region(unsigned long b, unsigned long e) {} -static inline void register_nosave_region_late(unsigned long b, unsigned long e) {} static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} static inline void swsusp_unset_page_free(struct page *p) {} -- cgit v1.2.3 From ebb7fb1557b1d03b906b668aa2164b51e6b7d19a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 26 Jan 2022 09:19:20 -0800 Subject: xfs, iomap: limit individual ioend chain lengths in writeback Trond Myklebust reported soft lockups in XFS IO completion such as this: watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106] CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1 Workqueue: xfs-conv/md127 xfs_end_io [xfs] RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20 Call Trace: wake_up_page_bit+0x8a/0x110 iomap_finish_ioend+0xd7/0x1c0 iomap_finish_ioends+0x7f/0xb0 xfs_end_ioend+0x6b/0x100 [xfs] xfs_end_io+0xb9/0xe0 [xfs] process_one_work+0x1a7/0x360 worker_thread+0x1fa/0x390 kthread+0x116/0x130 ret_from_fork+0x35/0x40 Ioends are processed as an atomic completion unit when all the chained bios in the ioend have completed their IO. Logically contiguous ioends can also be merged and completed as a single, larger unit. Both of these things can be problematic as both the bio chains per ioend and the size of the merged ioends processed as a single completion are both unbound. If we have a large sequential dirty region in the page cache, write_cache_pages() will keep feeding us sequential pages and we will keep mapping them into ioends and bios until we get a dirty page at a non-sequential file offset. These large sequential runs can will result in bio and ioend chaining to optimise the io patterns. The pages iunder writeback are pinned within these chains until the submission chaining is broken, allowing the entire chain to be completed. This can result in huge chains being processed in IO completion context. We get deep bio chaining if we have large contiguous physical extents. We will keep adding pages to the current bio until it is full, then we'll chain a new bio to keep adding pages for writeback. Hence we can build bio chains that map millions of pages and tens of gigabytes of RAM if the page cache contains big enough contiguous dirty file regions. This long bio chain pins those pages until the final bio in the chain completes and the ioend can iterate all the chained bios and complete them. OTOH, if we have a physically fragmented file, we end up submitting one ioend per physical fragment that each have a small bio or bio chain attached to them. We do not chain these at IO submission time, but instead we chain them at completion time based on file offset via iomap_ioend_try_merge(). Hence we can end up with unbound ioend chains being built via completion merging. XFS can then do COW remapping or unwritten extent conversion on that merged chain, which involves walking an extent fragment at a time and running a transaction to modify the physical extent information. IOWs, we merge all the discontiguous ioends together into a contiguous file range, only to then process them individually as discontiguous extents. This extent manipulation is computationally expensive and can run in a tight loop, so merging logically contiguous but physically discontigous ioends gains us nothing except for hiding the fact the fact we broke the ioends up into individual physical extents at submission and then need to loop over those individual physical extents at completion. Hence we need to have mechanisms to limit ioend sizes and to break up completion processing of large merged ioend chains: 1. bio chains per ioend need to be bound in length. Pure overwrites go straight to iomap_finish_ioend() in softirq context with the exact bio chain attached to the ioend by submission. Hence the only way to prevent long holdoffs here is to bound ioend submission sizes because we can't reschedule in softirq context. 2. iomap_finish_ioends() has to handle unbound merged ioend chains correctly. This relies on any one call to iomap_finish_ioend() being bound in runtime so that cond_resched() can be issued regularly as the long ioend chain is processed. i.e. this relies on mechanism #1 to limit individual ioend sizes to work correctly. 3. filesystems have to loop over the merged ioends to process physical extent manipulations. This means they can loop internally, and so we break merging at physical extent boundaries so the filesystem can easily insert reschedule points between individual extent manipulations. Signed-off-by: Dave Chinner Reported-and-tested-by: Trond Myklebust Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/iomap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b55bd49e55f5..97a3a2edb585 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -263,9 +263,11 @@ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ + u32 io_folios; /* folios added to ioend */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ + sector_t io_sector; /* start sector of ioend */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! */ }; -- cgit v1.2.3 From d7e4f8545b497b3f5687e592f1c355cbaee64c8c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 26 Jan 2022 13:04:26 +0800 Subject: pid: Introduce helper task_is_in_init_pid_ns() Currently the kernel uses open code in multiple places to check if a task is in the root PID namespace with the kind of format: if (task_active_pid_ns(current) == &init_pid_ns) do_something(); This patch creates a new helper function, task_is_in_init_pid_ns(), it returns true if a passed task is in the root PID namespace, otherwise returns false. So it will be used to replace open codes. Suggested-by: Suzuki K Poulose Signed-off-by: Leo Yan Reviewed-by: Leon Romanovsky Acked-by: Suzuki K Poulose Acked-by: Balbir Singh Signed-off-by: Jakub Kicinski --- include/linux/pid_namespace.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c7e627503d2..07481bb87d4e 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -86,4 +86,9 @@ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pid_idr_init(void); +static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) +{ + return task_active_pid_ns(tsk) == &init_pid_ns; +} + #endif /* _LINUX_PID_NS_H */ -- cgit v1.2.3 From 364df53c081d93fcfd6b91085ff2650c7f17b3c7 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 27 Jan 2022 17:13:01 +0800 Subject: net: socket: rename SKB_DROP_REASON_SOCKET_FILTER Rename SKB_DROP_REASON_SOCKET_FILTER, which is used as the reason of skb drop out of socket filter before it's part of a released kernel. It will be used for more protocols than just TCP in future series. Signed-off-by: Menglong Dong Reviewed-by: David Ahern Link: https://lore.kernel.org/all/20220127091308.91401-2-imagedong@tencent.com/ Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bf11e1fbd69b..8a636e678902 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -318,7 +318,7 @@ enum skb_drop_reason { SKB_DROP_REASON_NO_SOCKET, SKB_DROP_REASON_PKT_TOO_SMALL, SKB_DROP_REASON_TCP_CSUM, - SKB_DROP_REASON_TCP_FILTER, + SKB_DROP_REASON_SOCKET_FILTER, SKB_DROP_REASON_UDP_CSUM, SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From 7f5056b9e7b71149bf11073f00a57fa1ac2921a9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 26 Jan 2022 15:35:14 -0500 Subject: security, lsm: dentry_init_security() Handle multi LSM registration A ceph user has reported that ceph is crashing with kernel NULL pointer dereference. Following is the backtrace. /proc/version: Linux version 5.16.2-arch1-1 (linux@archlinux) (gcc (GCC) 11.1.0, GNU ld (GNU Binutils) 2.36.1) #1 SMP PREEMPT Thu, 20 Jan 2022 16:18:29 +0000 distro / arch: Arch Linux / x86_64 SELinux is not enabled ceph cluster version: 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) relevant dmesg output: [ 30.947129] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 30.947206] #PF: supervisor read access in kernel mode [ 30.947258] #PF: error_code(0x0000) - not-present page [ 30.947310] PGD 0 P4D 0 [ 30.947342] Oops: 0000 [#1] PREEMPT SMP PTI [ 30.947388] CPU: 5 PID: 778 Comm: touch Not tainted 5.16.2-arch1-1 #1 86fbf2c313cc37a553d65deb81d98e9dcc2a3659 [ 30.947486] Hardware name: Gigabyte Technology Co., Ltd. B365M DS3H/B365M DS3H, BIOS F5 08/13/2019 [ 30.947569] RIP: 0010:strlen+0x0/0x20 [ 30.947616] Code: b6 07 38 d0 74 16 48 83 c7 01 84 c0 74 05 48 39 f7 75 ec 31 c0 31 d2 89 d6 89 d7 c3 48 89 f8 31 d2 89 d6 89 d7 c3 0 f 1f 40 00 <80> 3f 00 74 12 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 31 ff [ 30.947782] RSP: 0018:ffffa4ed80ffbbb8 EFLAGS: 00010246 [ 30.947836] RAX: 0000000000000000 RBX: ffffa4ed80ffbc60 RCX: 0000000000000000 [ 30.947904] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 30.947971] RBP: ffff94b0d15c0ae0 R08: 0000000000000000 R09: 0000000000000000 [ 30.948040] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 30.948106] R13: 0000000000000001 R14: ffffa4ed80ffbc60 R15: 0000000000000000 [ 30.948174] FS: 00007fc7520f0740(0000) GS:ffff94b7ced40000(0000) knlGS:0000000000000000 [ 30.948252] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 30.948308] CR2: 0000000000000000 CR3: 0000000104a40001 CR4: 00000000003706e0 [ 30.948376] Call Trace: [ 30.948404] [ 30.948431] ceph_security_init_secctx+0x7b/0x240 [ceph 49f9c4b9bf5be8760f19f1747e26da33920bce4b] [ 30.948582] ceph_atomic_open+0x51e/0x8a0 [ceph 49f9c4b9bf5be8760f19f1747e26da33920bce4b] [ 30.948708] ? get_cached_acl+0x4d/0xa0 [ 30.948759] path_openat+0x60d/0x1030 [ 30.948809] do_filp_open+0xa5/0x150 [ 30.948859] do_sys_openat2+0xc4/0x190 [ 30.948904] __x64_sys_openat+0x53/0xa0 [ 30.948948] do_syscall_64+0x5c/0x90 [ 30.948989] ? exc_page_fault+0x72/0x180 [ 30.949034] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 30.949091] RIP: 0033:0x7fc7521e25bb [ 30.950849] Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0 75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 0 0 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 91 00 00 00 48 8b 54 24 28 64 48 2b 14 25 Core of the problem is that ceph checks for return code from security_dentry_init_security() and if return code is 0, it assumes everything is fine and continues to call strlen(name), which crashes. Typically SELinux LSM returns 0 and sets name to "security.selinux" and it is not a problem. Or if selinux is not compiled in or disabled, it returns -EOPNOTSUP and ceph deals with it. But somehow in this configuration, 0 is being returned and "name" is not being initialized and that's creating the problem. Our suspicion is that BPF LSM is registering a hook for dentry_init_security() and returns hook default of 0. LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry,...) I have not been able to reproduce it just by doing CONFIG_BPF_LSM=y. Stephen has tested the patch though and confirms it solves the problem for him. dentry_init_security() is written in such a way that it expects only one LSM to register the hook. Atleast that's the expectation with current code. If another LSM returns a hook and returns default, it will simply return 0 as of now and that will break ceph. Hence, suggestion is that change semantics of this hook a bit. If there are no LSMs or no LSM is taking ownership and initializing security context, then return -EOPNOTSUP. Also allow at max one LSM to initialize security context. This hook can't deal with multiple LSMs trying to init security context. This patch implements this new behavior. Reported-by: Stephen Muth Tested-by: Stephen Muth Suggested-by: Casey Schaufler Acked-by: Casey Schaufler Reviewed-by: Serge Hallyn Cc: Jeff Layton Cc: Christian Brauner Cc: Paul Moore Cc: # 5.16.0 Signed-off-by: Vivek Goyal Reviewed-by: Jeff Layton Acked-by: Paul Moore Acked-by: Christian Brauner Signed-off-by: James Morris --- include/linux/lsm_hook_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index df8de62f4710..f0c7b352340a 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -82,7 +82,7 @@ LSM_HOOK(int, 0, sb_add_mnt_opt, const char *option, const char *val, int len, void **mnt_opts) LSM_HOOK(int, 0, move_mount, const struct path *from_path, const struct path *to_path) -LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry, +LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, void **ctx, u32 *ctxlen) LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, -- cgit v1.2.3 From e45c47d1f94e0cc7b6b079fdb4bcce2995e2adc4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 28 Jan 2022 10:58:39 -0500 Subject: block: add bio_start_io_acct_time() to control start_time bio_start_io_acct_time() interface is like bio_start_io_acct() that allows start_time to be passed in. This gives drivers the ability to defer starting accounting until after IO is issued (but possibily not entirely due to bio splitting). Reviewed-by: Christoph Hellwig Signed-off-by: Mike Snitzer Link: https://lore.kernel.org/r/20220128155841.39644-2-snitzer@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9c95df26fc26..f35aea98bc35 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1258,6 +1258,7 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); +void bio_start_io_acct_time(struct bio *bio, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev); -- cgit v1.2.3 From 6cb917411e028dcb66ce8f5db1b47361b78d7d3f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 29 Jan 2022 13:40:52 -0800 Subject: include/linux/sysctl.h: fix register_sysctl_mount_point() return type The CONFIG_SYSCTL=n stub returns the wrong type. Fixes: ee9efac48a082 ("sysctl: add helper to register a sysctl mount point") Reported-by: kernel test robot Acked-by: Luis Chamberlain Cc: Tong Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 180adf7da785..6353d6db69b2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -265,7 +265,7 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * return NULL; } -static inline struct sysctl_header *register_sysctl_mount_point(const char *path) +static inline struct ctl_table_header *register_sysctl_mount_point(const char *path) { return NULL; } -- cgit v1.2.3 From 536f4217ced62b671bd759f6b549621a5654a70f Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 29 Jan 2022 13:41:04 -0800 Subject: mm: page->mapping folio->mapping should have the same offset As with the other members of folio, the offset of page->mapping and folio->mapping must be the same. The compile-time check was inadvertently removed during development. Add it back. [willy@infradead.org: changelog redo] Link: https://lkml.kernel.org/r/20220104011734.21714-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9db36dc5d4cf..5140e5feb486 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -261,6 +261,7 @@ static_assert(sizeof(struct page) == sizeof(struct folio)); static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) FOLIO_MATCH(flags, flags); FOLIO_MATCH(lru, lru); +FOLIO_MATCH(mapping, mapping); FOLIO_MATCH(compound_head, lru); FOLIO_MATCH(index, index); FOLIO_MATCH(private, private); -- cgit v1.2.3 From 27fe73394a1c6d0b07fa4d95f1bca116d1cc66e9 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Sat, 29 Jan 2022 13:41:14 -0800 Subject: mm, kasan: use compare-exchange operation to set KASAN page tag It has been reported that the tag setting operation on newly-allocated pages can cause the page flags to be corrupted when performed concurrently with other flag updates as a result of the use of non-atomic operations. Fix the problem by using a compare-exchange loop to update the tag. Link: https://lkml.kernel.org/r/20220120020148.1632253-1-pcc@google.com Link: https://linux-review.googlesource.com/id/I456b24a2b9067d93968d43b4bb3351c0cec63101 Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc") Signed-off-by: Peter Collingbourne Reviewed-by: Andrey Konovalov Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e1a84b1e6787..213cc569b192 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1506,11 +1506,18 @@ static inline u8 page_kasan_tag(const struct page *page) static inline void page_kasan_tag_set(struct page *page, u8 tag) { - if (kasan_enabled()) { - tag ^= 0xff; - page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); - page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; - } + unsigned long old_flags, flags; + + if (!kasan_enabled()) + return; + + tag ^= 0xff; + old_flags = READ_ONCE(page->flags); + do { + flags = old_flags; + flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); + flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) -- cgit v1.2.3 From 51e50fbd3efc6064c30ed73a5e009018b36e290a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 29 Jan 2022 13:41:17 -0800 Subject: psi: fix "no previous prototype" warnings when CONFIG_CGROUPS=n When CONFIG_CGROUPS is disabled psi code generates the following warnings: kernel/sched/psi.c:1112:21: warning: no previous prototype for 'psi_trigger_create' [-Wmissing-prototypes] 1112 | struct psi_trigger *psi_trigger_create(struct psi_group *group, | ^~~~~~~~~~~~~~~~~~ kernel/sched/psi.c:1182:6: warning: no previous prototype for 'psi_trigger_destroy' [-Wmissing-prototypes] 1182 | void psi_trigger_destroy(struct psi_trigger *t) | ^~~~~~~~~~~~~~~~~~~ kernel/sched/psi.c:1249:10: warning: no previous prototype for 'psi_trigger_poll' [-Wmissing-prototypes] 1249 | __poll_t psi_trigger_poll(void **trigger_ptr, | ^~~~~~~~~~~~~~~~ Change the declarations of these functions in the header to provide the prototypes even when they are unused. Link: https://lkml.kernel.org/r/20220119223940.787748-2-surenb@google.com Fixes: 0e94682b73bf ("psi: introduce psi monitor") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/psi.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi.h b/include/linux/psi.h index f8ce53bfdb2a..7f7d1d88c3bb 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -25,18 +25,17 @@ void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); - -#ifdef CONFIG_CGROUPS -int psi_cgroup_alloc(struct cgroup *cgrp); -void psi_cgroup_free(struct cgroup *cgrp); -void cgroup_move_task(struct task_struct *p, struct css_set *to); - struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgrp); +void psi_cgroup_free(struct cgroup *cgrp); +void cgroup_move_task(struct task_struct *p, struct css_set *to); #endif #else /* CONFIG_PSI */ -- cgit v1.2.3 From ef9989afda73332df566852d6e9ca695c05f10ce Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:22 +0000 Subject: kvm: add guest_state_{enter,exit}_irqoff() When transitioning to/from guest mode, it is necessary to inform lockdep, tracing, and RCU in a specific order, similar to the requirements for transitions to/from user mode. Additionally, it is necessary to perform vtime accounting for a window around running the guest, with RCU enabled, such that timer interrupts taken from the guest can be accounted as guest time. Most architectures don't handle all the necessary pieces, and a have a number of common bugs, including unsafe usage of RCU during the window between guest_enter() and guest_exit(). On x86, this was dealt with across commits: 87fa7f3e98a1310e ("x86/kvm: Move context tracking where it belongs") 0642391e2139a2c1 ("x86/kvm/vmx: Add hardirq tracing to guest enter/exit") 9fc975e9efd03e57 ("x86/kvm/svm: Add hardirq tracing on guest enter/exit") 3ebccdf373c21d86 ("x86/kvm/vmx: Move guest enter/exit into .noinstr.text") 135961e0a7d555fc ("x86/kvm/svm: Move guest enter/exit into .noinstr.text") 160457140187c5fb ("KVM: x86: Defer vtime accounting 'til after IRQ handling") bc908e091b326467 ("KVM: x86: Consolidate guest enter/exit logic to common helpers") ... but those fixes are specific to x86, and as the resulting logic (while correct) is split across generic helper functions and x86-specific helper functions, it is difficult to see that the entry/exit accounting is balanced. This patch adds generic helpers which architectures can use to handle guest entry/exit consistently and correctly. The guest_{enter,exit}() helpers are split into guest_timing_{enter,exit}() to perform vtime accounting, and guest_context_{enter,exit}() to perform the necessary context tracking and RCU management. The existing guest_{enter,exit}() heleprs are left as wrappers of these. Atop this, new guest_state_enter_irqoff() and guest_state_exit_irqoff() helpers are added to handle the ordering of lockdep, tracing, and RCU manageent. These are inteneded to mirror exit_to_user_mode() and enter_from_user_mode(). Subsequent patches will migrate architectures over to the new helpers, following a sequence: guest_timing_enter_irqoff(); guest_state_enter_irqoff(); < run the vcpu > guest_state_exit_irqoff(); < take any pending IRQs > guest_timing_exit_irqoff(); This sequences handles all of the above correctly, and more clearly balances the entry and exit portions, making it easier to understand. The existing helpers are marked as deprecated, and will be removed once all architectures have been converted. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Marc Zyngier Reviewed-by: Paolo Bonzini Reviewed-by: Nicolas Saenz Julienne Message-Id: <20220201132926.3301912-2-mark.rutland@arm.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 112 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f079820f52b5..b3810976a27f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -368,8 +370,11 @@ struct kvm_vcpu { u64 last_used_slot_gen; }; -/* must be called with irqs disabled */ -static __always_inline void guest_enter_irqoff(void) +/* + * Start accounting time towards a guest. + * Must be called before entering guest context. + */ +static __always_inline void guest_timing_enter_irqoff(void) { /* * This is running in ioctl context so its safe to assume that it's the @@ -378,7 +383,18 @@ static __always_inline void guest_enter_irqoff(void) instrumentation_begin(); vtime_account_guest_enter(); instrumentation_end(); +} +/* + * Enter guest context and enter an RCU extended quiescent state. + * + * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is + * unsafe to use any code which may directly or indirectly use RCU, tracing + * (including IRQ flag tracing), or lockdep. All code in this period must be + * non-instrumentable. + */ +static __always_inline void guest_context_enter_irqoff(void) +{ /* * KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode @@ -394,16 +410,79 @@ static __always_inline void guest_enter_irqoff(void) } } -static __always_inline void guest_exit_irqoff(void) +/* + * Deprecated. Architectures should move to guest_timing_enter_irqoff() and + * guest_state_enter_irqoff(). + */ +static __always_inline void guest_enter_irqoff(void) +{ + guest_timing_enter_irqoff(); + guest_context_enter_irqoff(); +} + +/** + * guest_state_enter_irqoff - Fixup state when entering a guest + * + * Entry to a guest will enable interrupts, but the kernel state is interrupts + * disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Tell lockdep that interrupts are enabled + * + * Invoked from architecture specific code before entering a guest. + * Must be called with interrupts disabled and the caller must be + * non-instrumentable. + * The caller has to invoke guest_timing_enter_irqoff() before this. + * + * Note: this is analogous to exit_to_user_mode(). + */ +static __always_inline void guest_state_enter_irqoff(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + guest_context_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +/* + * Exit guest context and exit an RCU extended quiescent state. + * + * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is + * unsafe to use any code which may directly or indirectly use RCU, tracing + * (including IRQ flag tracing), or lockdep. All code in this period must be + * non-instrumentable. + */ +static __always_inline void guest_context_exit_irqoff(void) { context_tracking_guest_exit(); +} +/* + * Stop accounting time towards a guest. + * Must be called after exiting guest context. + */ +static __always_inline void guest_timing_exit_irqoff(void) +{ instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_guest_exit(); instrumentation_end(); } +/* + * Deprecated. Architectures should move to guest_state_exit_irqoff() and + * guest_timing_exit_irqoff(). + */ +static __always_inline void guest_exit_irqoff(void) +{ + guest_context_exit_irqoff(); + guest_timing_exit_irqoff(); +} + static inline void guest_exit(void) { unsigned long flags; @@ -413,6 +492,33 @@ static inline void guest_exit(void) local_irq_restore(flags); } +/** + * guest_state_exit_irqoff - Establish state when returning from guest mode + * + * Entry from a guest disables interrupts, but guest mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + * + * Invoked from architecture specific code after exiting a guest. + * Must be invoked with interrupts disabled and the caller must be + * non-instrumentable. + * The caller has to invoke guest_timing_exit_irqoff() after this. + * + * Note: this is analogous to enter_from_user_mode(). + */ +static __always_inline void guest_state_exit_irqoff(void) +{ + lockdep_hardirqs_off(CALLER_ADDR0); + guest_context_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { /* -- cgit v1.2.3 From bee9f65523218e3baeeecde9295c8fbe9bc08e0a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jan 2022 16:02:50 +0000 Subject: netfs, cachefiles: Add a method to query presence of data in the cache Add a netfs_cache_ops method by which a network filesystem can ask the cache about what data it has available and where so that it can make a multipage read more efficient. Signed-off-by: David Howells cc: linux-cachefs@redhat.com Acked-by: Jeff Layton Reviewed-by: Rohith Surabattula Signed-off-by: Steve French --- include/linux/netfs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index b46c39d98bbd..614f22213e21 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -244,6 +244,13 @@ struct netfs_cache_ops { int (*prepare_write)(struct netfs_cache_resources *cres, loff_t *_start, size_t *_len, loff_t i_size, bool no_space_allocated_yet); + + /* Query the occupancy of the cache in a region, returning where the + * next chunk of data starts and how long it is. + */ + int (*query_occupancy)(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len); }; struct readahead_control; -- cgit v1.2.3 From 6d5c900eb64107001e91e1f46bddc254dded8a59 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 Jan 2022 09:22:41 -0800 Subject: net/mlx5e: Use struct_group() for memcpy() region In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Use struct_group() in struct vlan_ethhdr around members h_dest and h_source, so they can be referenced together. This will allow memcpy() and sizeof() to more easily reason about sizes, improve readability, and avoid future warnings about writing beyond the end of h_dest. "pahole" shows no size nor member offset changes to struct vlan_ethhdr. "objdump -d" shows no object code changes. Fixes: 34802a42b352 ("net/mlx5e: Do not modify the TX SKB") Signed-off-by: Kees Cook Signed-off-by: Saeed Mahameed --- include/linux/if_vlan.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 8420fe504927..2be4dd7e90a9 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -46,8 +46,10 @@ struct vlan_hdr { * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_ethhdr { - unsigned char h_dest[ETH_ALEN]; - unsigned char h_source[ETH_ALEN]; + struct_group(addrs, + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + ); __be16 h_vlan_proto; __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; -- cgit v1.2.3 From 1148836fd3226c20de841084aba24184d4fbbe77 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 2 Feb 2022 14:55:29 +0100 Subject: Revert "fbdev: Garbage collect fbdev scrolling acceleration, part 1 (from TODO list)" This reverts commit b3ec8cdf457e5e63d396fe1346cc788cf7c1b578. Revert the second (of 2) commits which disabled scrolling acceleration in fbcon/fbdev. It introduced a regression for fbdev-supported graphic cards because of the performance penalty by doing screen scrolling by software instead of using the existing graphic card 2D hardware acceleration. Console scrolling acceleration was disabled by dropping code which checked at runtime the driver hardware capabilities for the BINFO_HWACCEL_COPYAREA or FBINFO_HWACCEL_FILLRECT flags and if set, it enabled scrollmode SCROLL_MOVE which uses hardware acceleration to move screen contents. After dropping those checks scrollmode was hard-wired to SCROLL_REDRAW instead, which forces all graphic cards to redraw every character at the new screen position when scrolling. This change effectively disabled all hardware-based scrolling acceleration for ALL drivers, because now all kind of 2D