From 571d91dcadfa3cef499010b4eddb9b58b0da4d24 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:19 -0700 Subject: perf: Add branch stack counters Currently, the additional information of a branch entry is stored in a u64 space. With more and more information added, the space is running out. For example, the information of occurrences of events will be added for each branch. Two places were suggested to append the counters. https://lore.kernel.org/lkml/20230802215814.GH231007@hirez.programming.kicks-ass.net/ One place is right after the flags of each branch entry. It changes the existing struct perf_branch_entry. The later ARCH specific implementation has to be really careful to consistently pick the right struct. The other place is right after the entire struct perf_branch_stack. The disadvantage is that the pointer of the extra space has to be recorded. The common interface perf_sample_save_brstack() has to be updated. The latter is much straightforward, and should be easily understood and maintained. It is implemented in the patch. Add a new branch sample type, PERF_SAMPLE_BRANCH_COUNTERS, to indicate the event which is recorded in the branch info. The "u64 counters" may store the occurrences of several events. The information regarding the number of events/counters and the width of each counter should be exposed via sysfs as a reference for the perf tool. Define the branch_counter_nr and branch_counter_width ABI here. The support will be implemented later in the Intel-specific patch. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-1-kan.liang@linux.intel.com --- include/uapi/linux/perf_event.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 39c6a250dd1b..4461f380425b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + PERF_SAMPLE_BRANCH_COUNTERS_SHIFT = 19, /* save occurrences of events on a branch */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -235,6 +237,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -982,6 +986,12 @@ enum perf_event_type { * { u64 nr; * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX * { u64 from, to, flags } lbr[nr]; + * # + * # The format of the counters is decided by the + * # "branch_counter_nr" and "branch_counter_width", + * # which are defined in the ABI. + * # + * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi -- cgit v1.2.3 From 33744916196b4ed7a50f6f47af7c3ad46b730ce6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:23 -0700 Subject: perf/x86/intel: Support branch counters logging The branch counters logging (A.K.A LBR event logging) introduces a per-counter indication of precise event occurrences in LBRs. It can provide a means to attribute exposed retirement latency to combinations of events across a block of instructions. It also provides a means of attributing Timed LBR latencies to events. The feature is first introduced on SRF/GRR. It is an enhancement of the ARCH LBR. It adds new fields in the LBR_INFO MSRs to log the occurrences of events on the GP counters. The information is displayed by the order of counters. The design proposed in this patch requires that the events which are logged must be in a group with the event that has LBR. If there are more than one LBR group, the counters logging information only from the current group (overflowed) are stored for the perf tool, otherwise the perf tool cannot know which and when other groups are scheduled especially when multiplexing is triggered. The user can ensure it uses the maximum number of counters that support LBR info (4 by now) by making the group large enough. The HW only logs events by the order of counters. The order may be different from the order of enabling which the perf tool can understand. When parsing the information of each branch entry, convert the counter order to the enabled order, and store the enabled order in the extension space. Unconditionally reset LBRs for an LBR event group when it's deleted. The logged counter information is only valid for the current LBR group. If another LBR group is scheduled later, the information from the stale LBRs would be otherwise wrongly interpreted. Add a sanity check in intel_pmu_hw_config(). Disable the feature if other counter filters (inv, cmask, edge, in_tx) are set or LBR call stack mode is enabled. (For the LBR call stack mode, we cannot simply flush the LBR, since it will break the call stack. Also, there is no obvious usage with the call stack mode for now.) Only applying the PERF_SAMPLE_BRANCH_COUNTERS doesn't require any branch stack setup. Expose the maximum number of supported counters and the width of the counters into the sysfs. The perf tool can use the information to parse the logged counters in each branch. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-5-kan.liang@linux.intel.com --- include/uapi/linux/perf_event.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 4461f380425b..3a64499b0f5d 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1437,6 +1437,9 @@ struct perf_branch_entry { reserved:31; }; +/* Size of used info bits in struct perf_branch_entry */ +#define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 + union perf_sample_weight { __u64 full; #if defined(__LITTLE_ENDIAN_BITFIELD) -- cgit v1.2.3 From 155addf0814a92d08fce26a11b27e3315cdba977 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 3 Nov 2023 19:49:00 -0700 Subject: bpf: Use named fields for certain bpf uapi structs Martin and Vadim reported a verifier failure with bpf_dynptr usage. The issue is mentioned but Vadim workarounded the issue with source change ([1]). The below describes what is the issue and why there is a verification failure. int BPF_PROG(skb_crypto_setup) { struct bpf_dynptr algo, key; ... bpf_dynptr_from_mem(..., ..., 0, &algo); ... } The bpf program is using vmlinux.h, so we have the following definition in vmlinux.h: struct bpf_dynptr { long: 64; long: 64; }; Note that in uapi header bpf.h, we have struct bpf_dynptr { long: 64; long: 64; } __attribute__((aligned(8))); So we lost alignment information for struct bpf_dynptr by using vmlinux.h. Let us take a look at a simple program below: $ cat align.c typedef unsigned long long __u64; struct bpf_dynptr_no_align { __u64 :64; __u64 :64; }; struct bpf_dynptr_yes_align { __u64 :64; __u64 :64; } __attribute__((aligned(8))); void bar(void *, void *); int foo() { struct bpf_dynptr_no_align a; struct bpf_dynptr_yes_align b; bar(&a, &b); return 0; } $ clang --target=bpf -O2 -S -emit-llvm align.c Look at the generated IR file align.ll: ... %a = alloca %struct.bpf_dynptr_no_align, align 1 %b = alloca %struct.bpf_dynptr_yes_align, align 8 ... The compiler dictates the alignment for struct bpf_dynptr_no_align is 1 and the alignment for struct bpf_dynptr_yes_align is 8. So theoretically compiler could allocate variable %a with alignment 1 although in reallity the compiler may choose a different alignment by considering other local variables. In [1], the verification failure happens because variable 'algo' is allocated on the stack with alignment 4 (fp-28). But the verifer wants its alignment to be 8. To fix the issue, the RFC patch ([1]) tried to add '__attribute__((aligned(8)))' to struct bpf_dynptr plus other similar structs. Andrii suggested that we could directly modify uapi struct with named fields like struct 'bpf_iter_num': struct bpf_iter_num { /* opaque iterator state; having __u64 here allows to preserve correct * alignment requirements in vmlinux.h, generated from BTF */ __u64 __opaque[1]; } __attribute__((aligned(8))); Indeed, adding named fields for those affected structs in this patch can preserve alignment when bpf program references them in vmlinux.h. With this patch, the verification failure in [1] can also be resolved. [1] https://lore.kernel.org/bpf/1b100f73-7625-4c1f-3ae5-50ecf84d3ff0@linux.dev/ [2] https://lore.kernel.org/bpf/20231103055218.2395034-1-yonghong.song@linux.dev/ Cc: Vadim Fedorenko Cc: Martin KaFai Lau Suggested-by: Andrii Nakryiko Signed-off-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231104024900.1539182-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0f6cdf52b1da..095ca7238ac2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7151,40 +7151,31 @@ struct bpf_spin_lock { }; struct bpf_timer { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_dynptr { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_list_head { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_list_node { - __u64 :64; - __u64 :64; - __u64 :64; + __u64 __opaque[3]; } __attribute__((aligned(8))); struct bpf_rb_root { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_rb_node { - __u64 :64; - __u64 :64; - __u64 :64; - __u64 :64; + __u64 __opaque[4]; } __attribute__((aligned(8))); struct bpf_refcount { - __u32 :32; + __u32 __opaque[1]; } __attribute__((aligned(4))); struct bpf_sysctl { -- cgit v1.2.3 From b8e3a87a627b575896e448021e5c2f8a3bc19931 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Wed, 8 Nov 2023 03:23:34 -0800 Subject: bpf: Add crosstask check to __bpf_get_stack Currently get_perf_callchain only supports user stack walking for the current task. Passing the correct *crosstask* param will return 0 frames if the task passed to __bpf_get_stack isn't the current one instead of a single incorrect frame/address. This change passes the correct *crosstask* param but also does a preemptive check in __bpf_get_stack if the task is current and returns -EOPNOTSUPP if it is not. This issue was found using bpf_get_task_stack inside a BPF iterator ("iter/task"), which iterates over all tasks. bpf_get_task_stack works fine for fetching kernel stacks but because get_perf_callchain relies on the caller to know if the requested *task* is the current one (via *crosstask*) it was failing in a confusing way. It might be possible to get user stacks for all tasks utilizing something like access_process_vm but that requires the bpf program calling bpf_get_task_stack to be sleepable and would therefore be a breaking change. Fixes: fa28dcb82a38 ("bpf: Introduce helper bpf_get_task_stack()") Signed-off-by: Jordan Rome Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231108112334.3433136-1-jordalgo@meta.com --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 095ca7238ac2..7cf8bcf9f6a2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4517,6 +4517,8 @@ union bpf_attr { * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. + * Note: the user stack will only be populated if the *task* is + * the current task; all other tasks will return -EOPNOTSUPP. * To achieve this, the helper needs *task*, which is a valid * pointer to **struct task_struct**. To store the stacktrace, the * bpf program provides *buf* with a nonnegative *size*. @@ -4528,6 +4530,7 @@ union bpf_attr { * * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. + * The *task* must be the current task. * **BPF_F_USER_BUILD_ID** * Collect buildid+offset instead of ips for user stack, * only valid if **BPF_F_USER_STACK** is also specified. -- cgit v1.2.3 From 89ef42088b3ba884a007ad10bd89ce8a81b9dedd Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Thu, 9 Nov 2023 15:59:00 +0200 Subject: ASoC: SOF: Add support for configuring PDM interface from topology MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we only support configuration for number of channels and sample rate. Reviewed-by: Péter Ujfalusi Reviewed-by: Iuliana Prodan Signed-off-by: Daniel Baluta Link: https://lore.kernel.org/r/20231109135900.88310-3-daniel.baluta@oss.nxp.com Signed-off-by: Mark Brown --- include/uapi/sound/sof/tokens.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h index 453cab2a1209..0fb39780f9bd 100644 --- a/include/uapi/sound/sof/tokens.h +++ b/include/uapi/sound/sof/tokens.h @@ -213,4 +213,8 @@ #define SOF_TKN_AMD_ACPI2S_CH 1701 #define SOF_TKN_AMD_ACPI2S_TDM_MODE 1702 +/* MICFIL PDM */ +#define SOF_TKN_IMX_MICFIL_RATE 2000 +#define SOF_TKN_IMX_MICFIL_CH 2001 + #endif -- cgit v1.2.3 From f3b8788cde61b02f1e6c202f8fac4360e6adbafc Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:46 -0700 Subject: LSM: Identify modules by more than name Create a struct lsm_id to contain identifying information about Linux Security Modules (LSMs). At inception this contains the name of the module and an identifier associated with the security module. Change the security_add_hooks() interface to use this structure. Change the individual modules to maintain their own struct lsm_id and pass it to security_add_hooks(). The values are for LSM identifiers are defined in a new UAPI header file linux/lsm.h. Each existing LSM has been updated to include it's LSMID in the lsm_id. The LSM ID values are sequential, with the oldest module LSM_ID_CAPABILITY being the lowest value and the existing modules numbered in the order they were included in the main line kernel. This is an arbitrary convention for assigning the values, but none better presents itself. The value 0 is defined as being invalid. The values 1-99 are reserved for any special case uses which may arise in the future. This may include attributes of the LSM infrastructure itself, possibly related to namespacing or network attribute management. A special range is identified for such attributes to help reduce confusion for developers unfamiliar with LSMs. LSM attribute values are defined for the attributes presented by modules that are available today. As with the LSM IDs, The value 0 is defined as being invalid. The values 1-99 are reserved for any special case uses which may arise in the future. Cc: linux-security-module Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: Mickael Salaun Reviewed-by: John Johansen Signed-off-by: Kees Cook Nacked-by: Tetsuo Handa [PM: forward ported beyond v6.6 due merge window changes] Signed-off-by: Paul Moore --- include/uapi/linux/lsm.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 include/uapi/linux/lsm.h (limited to 'include/uapi') diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h new file mode 100644 index 000000000000..f27c9a9cc376 --- /dev/null +++ b/include/uapi/linux/lsm.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Linux Security Modules (LSM) - User space API + * + * Copyright (C) 2022 Casey Schaufler + * Copyright (C) 2022 Intel Corporation + */ + +#ifndef _UAPI_LINUX_LSM_H +#define _UAPI_LINUX_LSM_H + +/* + * ID tokens to identify Linux Security Modules (LSMs) + * + * These token values are used to uniquely identify specific LSMs + * in the kernel as well as in the kernel's LSM userspace API. + * + * A value of zero/0 is considered undefined and should not be used + * outside the kernel. Values 1-99 are reserved for potential + * future use. + */ +#define LSM_ID_UNDEF 0 +#define LSM_ID_CAPABILITY 100 +#define LSM_ID_SELINUX 101 +#define LSM_ID_SMACK 102 +#define LSM_ID_TOMOYO 103 +#define LSM_ID_IMA 104 +#define LSM_ID_APPARMOR 105 +#define LSM_ID_YAMA 106 +#define LSM_ID_LOADPIN 107 +#define LSM_ID_SAFESETID 108 +#define LSM_ID_LOCKDOWN 109 +#define LSM_ID_BPF 110 +#define LSM_ID_LANDLOCK 111 + +/* + * LSM_ATTR_XXX definitions identify different LSM attributes + * which are used in the kernel's LSM userspace API. Support + * for these attributes vary across the different LSMs. None + * are required. + * + * A value of zero/0 is considered undefined and should not be used + * outside the kernel. Values 1-99 are reserved for potential + * future use. + */ +#define LSM_ATTR_UNDEF 0 +#define LSM_ATTR_CURRENT 100 +#define LSM_ATTR_EXEC 101 +#define LSM_ATTR_FSCREATE 102 +#define LSM_ATTR_KEYCREATE 103 +#define LSM_ATTR_PREV 104 +#define LSM_ATTR_SOCKCREATE 105 + +#endif /* _UAPI_LINUX_LSM_H */ -- cgit v1.2.3 From a04a1198088a1378d0389c250cc684f649bcc91e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:49 -0700 Subject: LSM: syscalls for current process attributes Create a system call lsm_get_self_attr() to provide the security module maintained attributes of the current process. Create a system call lsm_set_self_attr() to set a security module maintained attribute of the current process. Historically these attributes have been exposed to user space via entries in procfs under /proc/self/attr. The attribute value is provided in a lsm_ctx structure. The structure identifies the size of the attribute, and the attribute value. The format of the attribute value is defined by the security module. A flags field is included for LSM specific information. It is currently unused and must be 0. The total size of the data, including the lsm_ctx structure and any padding, is maintained as well. struct lsm_ctx { __u64 id; __u64 flags; __u64 len; __u64 ctx_len; __u8 ctx[]; }; Two new LSM hooks are used to interface with the LSMs. security_getselfattr() collects the lsm_ctx values from the LSMs that support the hook, accounting for space requirements. security_setselfattr() identifies which LSM the attribute is intended for and passes it along. Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: John Johansen Signed-off-by: Paul Moore --- include/uapi/linux/lsm.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h index f27c9a9cc376..eeda59a77c02 100644 --- a/include/uapi/linux/lsm.h +++ b/include/uapi/linux/lsm.h @@ -9,6 +9,36 @@ #ifndef _UAPI_LINUX_LSM_H #define _UAPI_LINUX_LSM_H +#include +#include + +/** + * struct lsm_ctx - LSM context information + * @id: the LSM id number, see LSM_ID_XXX + * @flags: LSM specific flags + * @len: length of the lsm_ctx struct, @ctx and any other data or padding + * @ctx_len: the size of @ctx + * @ctx: the LSM context value + * + * The @len field MUST be equal to the size of the lsm_ctx struct + * plus any additional padding and/or data placed after @ctx. + * + * In all cases @ctx_len MUST be equal to the length of @ctx. + * If @ctx is a string value it should be nul terminated with + * @ctx_len equal to `strlen(@ctx) + 1`. Binary values are + * supported. + * + * The @flags and @ctx fields SHOULD only be interpreted by the + * LSM specified by @id; they MUST be set to zero/0 when not used. + */ +struct lsm_ctx { + __u64 id; + __u64 flags; + __u64 len; + __u64 ctx_len; + __u8 ctx[]; +}; + /* * ID tokens to identify Linux Security Modules (LSMs) * @@ -51,4 +81,10 @@ #define LSM_ATTR_PREV 104 #define LSM_ATTR_SOCKCREATE 105 +/* + * LSM_FLAG_XXX definitions identify special handling instructions + * for the API. + */ +#define LSM_FLAG_SINGLE 0x0001 + #endif /* _UAPI_LINUX_LSM_H */ -- cgit v1.2.3 From 5f42375904b08890f2e8e7cd955c5bf0c2c0d05a Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:51 -0700 Subject: LSM: wireup Linux Security Module syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wireup lsm_get_self_attr, lsm_set_self_attr and lsm_list_modules system calls. Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Acked-by: Geert Uytterhoeven Acked-by: Arnd Bergmann Cc: linux-api@vger.kernel.org Reviewed-by: Mickaël Salaün [PM: forward ported beyond v6.6 due merge window changes] Signed-off-by: Paul Moore --- include/uapi/asm-generic/unistd.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 756b013fb832..55cc0bcfb58d 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -829,8 +829,15 @@ __SYSCALL(__NR_futex_wait, sys_futex_wait) #define __NR_futex_requeue 456 __SYSCALL(__NR_futex_requeue, sys_futex_requeue) +#define __NR_lsm_get_self_attr 457 +__SYSCALL(__NR_lsm_get_self_attr, sys_lsm_get_self_attr) +#define __NR_lsm_set_self_attr 458 +__SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr) +#define __NR_lsm_list_modules 459 +__SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) + #undef __NR_syscalls -#define __NR_syscalls 457 +#define __NR_syscalls 460 /* * 32 bit systems traditionally used different -- cgit v1.2.3 From edd71f8e266c7ba15eedfec338864e53ddde1c25 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 18 Oct 2023 17:41:41 -0400 Subject: lsm: drop LSM_ID_IMA When IMA becomes a proper LSM we will reintroduce an appropriate LSM ID, but drop it from the userspace API for now in an effort to put an end to debates around the naming of the LSM ID macro. Reviewed-by: Roberto Sassu Signed-off-by: Paul Moore --- include/uapi/linux/lsm.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h index eeda59a77c02..f0386880a78e 100644 --- a/include/uapi/linux/lsm.h +++ b/include/uapi/linux/lsm.h @@ -54,14 +54,13 @@ struct lsm_ctx { #define LSM_ID_SELINUX 101 #define LSM_ID_SMACK 102 #define LSM_ID_TOMOYO 103 -#define LSM_ID_IMA 104 -#define LSM_ID_APPARMOR 105 -#define LSM_ID_YAMA 106 -#define LSM_ID_LOADPIN 107 -#define LSM_ID_SAFESETID 108 -#define LSM_ID_LOCKDOWN 109 -#define LSM_ID_BPF 110 -#define LSM_ID_LANDLOCK 111 +#define LSM_ID_APPARMOR 104 +#define LSM_ID_YAMA 105 +#define LSM_ID_LOADPIN 106 +#define LSM_ID_SAFESETID 107 +#define LSM_ID_LOCKDOWN 108 +#define LSM_ID_BPF 109 +#define LSM_ID_LANDLOCK 110 /* * LSM_ATTR_XXX definitions identify different LSM attributes -- cgit v1.2.3 From 48f996d4adf15a0a0af8b8184d3ec6042a684ea4 Mon Sep 17 00:00:00 2001 From: Chandramohan Akula Date: Mon, 23 Oct 2023 07:03:23 -0700 Subject: RDMA/bnxt_re: Remove roundup_pow_of_two depth for all hardware queue resources Rounding up the queue depth to power of two is not a hardware requirement. In order to optimize the per connection memory usage, removing drivers implementation which round up to the queue depths to the power of 2. Implements a mask to maintain backward compatibility with older library. Signed-off-by: Chandramohan Akula Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1698069803-1787-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- include/uapi/rdma/bnxt_re-abi.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index 6e7c67a0cca3..a1b896d6d940 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -54,6 +54,7 @@ enum { BNXT_RE_UCNTX_CMASK_HAVE_MODE = 0x02ULL, BNXT_RE_UCNTX_CMASK_WC_DPI_ENABLED = 0x04ULL, BNXT_RE_UCNTX_CMASK_DBR_PACING_ENABLED = 0x08ULL, + BNXT_RE_UCNTX_CMASK_POW2_DISABLED = 0x10ULL, }; enum bnxt_re_wqe_mode { @@ -62,6 +63,14 @@ enum bnxt_re_wqe_mode { BNXT_QPLIB_WQE_MODE_INVALID = 0x02, }; +enum { + BNXT_RE_COMP_MASK_REQ_UCNTX_POW2_SUPPORT = 0x01, +}; + +struct bnxt_re_uctx_req { + __aligned_u64 comp_mask; +}; + struct bnxt_re_uctx_resp { __u32 dev_id; __u32 max_qp; -- cgit v1.2.3 From bb58b90b1a8f753b582055adaf448214a8e22c31 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:50 -0700 Subject: KVM: Introduce KVM_SET_USER_MEMORY_REGION2 Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional information can be supplied without setting userspace up to fail. The padding in the new kvm_userspace_memory_region2 structure will be used to pass a file descriptor in addition to the userspace_addr, i.e. allow userspace to point at a file descriptor and map memory into a guest that is NOT mapped into host userspace. Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" without a new ioctl(), but as Paolo pointed out, adding a new ioctl() makes detection of bad flags a bit more robust, e.g. if the new fd field is guarded only by a flag and not a new ioctl(), then a userspace bug (setting a "bad" flag) would generate out-of-bounds access instead of an -EINVAL error. Cc: Jarkko Sakkinen Reviewed-by: Paolo Bonzini Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-9-seanjc@google.com> Acked-by: Kai Huang Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 211b86de35ac..308cc70bd6ab 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -95,6 +95,16 @@ struct kvm_userspace_memory_region { __u64 userspace_addr; /* start of the userspace allocated memory */ }; +/* for KVM_SET_USER_MEMORY_REGION2 */ +struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 userspace_addr; + __u64 pad[16]; +}; + /* * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for * userspace, other bits are reserved for kvm internal use which are defined @@ -1201,6 +1211,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 +#define KVM_CAP_USER_MEMORY2 231 #ifdef KVM_CAP_IRQ_ROUTING @@ -1483,6 +1494,8 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) +#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \ + struct kvm_userspace_memory_region2) /* enable ucontrol for s390 */ struct kvm_s390_ucas_mapping { -- cgit v1.2.3 From 16f95f3b95caded251a0440051e44a2fbe9e5f55 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:21:51 -0700 Subject: KVM: Add KVM_EXIT_MEMORY_FAULT exit to report faults to userspace Add a new KVM exit type to allow userspace to handle memory faults that KVM cannot resolve, but that userspace *may* be able to handle (without terminating the guest). KVM will initially use KVM_EXIT_MEMORY_FAULT to report implicit conversions between private and shared memory. With guest private memory, there will be two kind of memory conversions: - explicit conversion: happens when the guest explicitly calls into KVM to map a range (as private or shared) - implicit conversion: happens when the guest attempts to access a gfn that is configured in the "wrong" state (private vs. shared) On x86 (first architecture to support guest private memory), explicit conversions will be reported via KVM_EXIT_HYPERCALL+KVM_HC_MAP_GPA_RANGE, but reporting KVM_EXIT_HYPERCALL for implicit conversions is undesriable as there is (obviously) no hypercall, and there is no guarantee that the guest actually intends to convert between private and shared, i.e. what KVM thinks is an implicit conversion "request" could actually be the result of a guest code bug. KVM_EXIT_MEMORY_FAULT will be used to report memory faults that appear to be implicit conversions. Note! To allow for future possibilities where KVM reports KVM_EXIT_MEMORY_FAULT and fills run->memory_fault on _any_ unresolved fault, KVM returns "-EFAULT" (-1 with errno == EFAULT from userspace's perspective), not '0'! Due to historical baggage within KVM, exiting to userspace with '0' from deep callstacks, e.g. in emulation paths, is infeasible as doing so would require a near-complete overhaul of KVM, whereas KVM already propagates -errno return codes to userspace even when the -errno originated in a low level helper. Report the gpa+size instead of a single gfn even though the initial usage is expected to always report single pages. It's entirely possible, likely even, that KVM will someday support sub-page granularity faults, e.g. Intel's sub-page protection feature allows for additional protections at 128-byte granularity. Link: https://lore.kernel.org/all/20230908222905.1321305-5-amoorthy@google.com Link: https://lore.kernel.org/all/ZQ3AmLO2SYv3DszH@google.com Cc: Anish Moorthy Cc: David Matlack Suggested-by: Sean Christopherson Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Message-Id: <20231027182217.3615211-10-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 308cc70bd6ab..59010a685007 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -275,6 +275,7 @@ struct kvm_xen_exit { #define KVM_EXIT_RISCV_CSR 36 #define KVM_EXIT_NOTIFY 37 #define KVM_EXIT_LOONGARCH_IOCSR 38 +#define KVM_EXIT_MEMORY_FAULT 39 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -528,6 +529,12 @@ struct kvm_run { #define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) __u32 flags; } notify; + /* KVM_EXIT_MEMORY_FAULT */ + struct { + __u64 flags; + __u64 gpa; + __u64 size; + } memory_fault; /* Fix the size of the union. */ char padding[256]; }; @@ -1212,6 +1219,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 #define KVM_CAP_USER_MEMORY2 231 +#define KVM_CAP_MEMORY_FAULT_INFO 232 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 5a475554db1e476a14216e742ea2bdb77362d5d5 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:21:55 -0700 Subject: KVM: Introduce per-page memory attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In confidential computing usages, whether a page is private or shared is necessary information for KVM to perform operations like page fault handling, page zapping etc. There are other potential use cases for per-page memory attributes, e.g. to make memory read-only (or no-exec, or exec-only, etc.) without having to modify memslots. Introduce the KVM_SET_MEMORY_ATTRIBUTES ioctl, advertised by KVM_CAP_MEMORY_ATTRIBUTES, to allow userspace to set the per-page memory attributes to a guest memory range. Use an xarray to store the per-page attributes internally, with a naive, not fully optimized implementation, i.e. prioritize correctness over performance for the initial implementation. Use bit 3 for the PRIVATE attribute so that KVM can use bits 0-2 for RWX attributes/protections in the future, e.g. to give userspace fine-grained control over read, write, and execute protections for guest memory. Provide arch hooks for handling attribute changes before and after common code sets the new attributes, e.g. x86 will use the "pre" hook to zap all relevant mappings, and the "post" hook to track whether or not hugepages can be used to map the range. To simplify the implementation wrap the entire sequence with kvm_mmu_invalidate_{begin,end}() even though the operation isn't strictly guaranteed to be an invalidation. For the initial use case, x86 *will* always invalidate memory, and preventing arch code from creating new mappings while the attributes are in flux makes it much easier to reason about the correctness of consuming attributes. It's possible that future usages may not require an invalidation, e.g. if KVM ends up supporting RWX protections and userspace grants _more_ protections, but again opt for simplicity and punt optimizations to if/when they are needed. Suggested-by: Sean Christopherson Link: https://lore.kernel.org/all/Y2WB48kD0J4VGynX@google.com Cc: Fuad Tabba Cc: Xu Yilun Cc: Mickaël Salaün Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 59010a685007..e8d167e54980 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1220,6 +1220,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 #define KVM_CAP_USER_MEMORY2 231 #define KVM_CAP_MEMORY_FAULT_INFO 232 +#define KVM_CAP_MEMORY_ATTRIBUTES 233 #ifdef KVM_CAP_IRQ_ROUTING @@ -2288,4 +2289,16 @@ struct kvm_s390_zpci_op { /* flags for kvm_s390_zpci_op->u.reg_aen.flags */ #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) +/* Available with KVM_CAP_MEMORY_ATTRIBUTES */ +#define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes) + +struct kvm_memory_attributes { + __u64 address; + __u64 size; + __u64 attributes; + __u64 flags; +}; + +#define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From 07afe1ba288c04280622fa002ed385f1ac0b6fe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Thu, 7 Sep 2023 03:09:08 +0200 Subject: batman-adv: mcast: implement multicast packet reception and forwarding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement functionality to receive and forward a new TVLV capable multicast packet type. The new batman-adv multicast packet type allows to contain several originator destination addresses within a TVLV. Routers on the way will potentially split the batman-adv multicast packet and adjust its tracker TVLV contents. Routing decisions are still based on the selected BATMAN IV or BATMAN V routing algorithm. So this new batman-adv multicast packet type retains the same loop-free properties. Also a new OGM multicast TVLV flag is introduced to signal to other nodes that we are capable of handling a batman-adv multicast packet and multicast tracker TVLV. And that all of our hard interfaces have an MTU of at least 1280 bytes (IPv6 minimum MTU), as a simple solution for now to avoid MTU issues while forwarding. Signed-off-by: Linus Lüssing Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 45 +++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 9204e4494b25..6e25753015df 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -116,6 +116,9 @@ enum batadv_icmp_packettype { * only need routable IPv4 multicast packets we signed up for explicitly * @BATADV_MCAST_WANT_NO_RTR6: we have no IPv6 multicast router and therefore * only need routable IPv6 multicast packets we signed up for explicitly + * @BATADV_MCAST_HAVE_MC_PTYPE_CAPA: we can parse, receive and forward + * batman-adv multicast packets with a multicast tracker TVLV. And all our + * hard interfaces have an MTU of at least 1280 bytes. */ enum batadv_mcast_flags { BATADV_MCAST_WANT_ALL_UNSNOOPABLES = 1UL << 0, @@ -123,6 +126,7 @@ enum batadv_mcast_flags { BATADV_MCAST_WANT_ALL_IPV6 = 1UL << 2, BATADV_MCAST_WANT_NO_RTR4 = 1UL << 3, BATADV_MCAST_WANT_NO_RTR6 = 1UL << 4, + BATADV_MCAST_HAVE_MC_PTYPE_CAPA = 1UL << 5, }; /* tt data subtypes */ @@ -174,14 +178,16 @@ enum batadv_bla_claimframe { * @BATADV_TVLV_TT: translation table tvlv * @BATADV_TVLV_ROAM: roaming advertisement tvlv * @BATADV_TVLV_MCAST: multicast capability tvlv + * @BATADV_TVLV_MCAST_TRACKER: multicast tracker tvlv */ enum batadv_tvlv_type { - BATADV_TVLV_GW = 0x01, - BATADV_TVLV_DAT = 0x02, - BATADV_TVLV_NC = 0x03, - BATADV_TVLV_TT = 0x04, - BATADV_TVLV_ROAM = 0x05, - BATADV_TVLV_MCAST = 0x06, + BATADV_TVLV_GW = 0x01, + BATADV_TVLV_DAT = 0x02, + BATADV_TVLV_NC = 0x03, + BATADV_TVLV_TT = 0x04, + BATADV_TVLV_ROAM = 0x05, + BATADV_TVLV_MCAST = 0x06, + BATADV_TVLV_MCAST_TRACKER = 0x07, }; #pragma pack(2) @@ -487,6 +493,25 @@ struct batadv_bcast_packet { */ }; +/** + * struct batadv_mcast_packet - multicast packet for network payload + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header + * @reserved: reserved byte for alignment + * @tvlv_len: length of the appended tvlv buffer (in bytes) + */ +struct batadv_mcast_packet { + __u8 packet_type; + __u8 version; + __u8 ttl; + __u8 reserved; + __be16 tvlv_len; + /* "4 bytes boundary + 2 bytes" long to make the payload after the + * following ethernet header again 4 bytes boundary aligned + */ +}; + /** * struct batadv_coded_packet - network coded packet * @packet_type: batman-adv packet type, part of the general header @@ -628,6 +653,14 @@ struct batadv_tvlv_mcast_data { __u8 reserved[3]; }; +/** + * struct batadv_tvlv_mcast_tracker - payload of a multicast tracker tvlv + * @num_dests: number of subsequent destination originator MAC addresses + */ +struct batadv_tvlv_mcast_tracker { + __be16 num_dests; +}; + #pragma pack() #endif /* _UAPI_LINUX_BATADV_PACKET_H_ */ -- cgit v1.2.3 From a7800aa80ea4d5356b8474c2302812e9d4926fa6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Nov 2023 05:42:34 -0500 Subject: KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce an ioctl(), KVM_CREATE_GUEST_MEMFD, to allow creating file-based memory that is tied to a specific KVM virtual machine and whose primary purpose is to serve guest memory. A guest-first memory subsystem allows for optimizations and enhancements that are kludgy or outright infeasible to implement/support in a generic memory subsystem. With guest_memfd, guest protections and mapping sizes are fully decoupled from host userspace mappings. E.g. KVM currently doesn't support mapping memory as writable in the guest without it also being writable in host userspace, as KVM's ABI uses VMA protections to define the allow guest protection. Userspace can fudge this by establishing two mappings, a writable mapping for the guest and readable one for itself, but that’s suboptimal on multiple fronts. Similarly, KVM currently requires the guest mapping size to be a strict subset of the host userspace mapping size, e.g. KVM doesn’t support creating a 1GiB guest mapping unless userspace also has a 1GiB guest mapping. Decoupling the mappings sizes would allow userspace to precisely map only what is needed without impacting guest performance, e.g. to harden against unintentional accesses to guest memory. Decoupling guest and userspace mappings may also allow for a cleaner alternative to high-granularity mappings for HugeTLB, which has reached a bit of an impasse and is unlikely to ever be merged. A guest-first memory subsystem also provides clearer line of sight to things like a dedicated memory pool (for slice-of-hardware VMs) and elimination of "struct page" (for offload setups where userspace _never_ needs to mmap() guest memory). More immediately, being able to map memory into KVM guests without mapping said memory into the host is critical for Confidential VMs (CoCo VMs), the initial use case for guest_memfd. While AMD's SEV and Intel's TDX prevent untrusted software from reading guest private data by encrypting guest memory with a key that isn't usable by the untrusted host, projects such as Protected KVM (pKVM) provide confidentiality and integrity *without* relying on memory encryption. And with SEV-SNP and TDX, accessing guest private memory can be fatal to the host, i.e. KVM must be prevent host userspace from accessing guest memory irrespective of hardware behavior. Attempt #1 to support CoCo VMs was to add a VMA flag to mark memory as being mappable only by KVM (or a similarly enlightened kernel subsystem). That approach was abandoned largely due to it needing to play games with PROT_NONE to prevent userspace from accessing guest memory. Attempt #2 to was to usurp PG_hwpoison to prevent the host from mapping guest private memory into userspace, but that approach failed to meet several requirements for software-based CoCo VMs, e.g. pKVM, as the kernel wouldn't easily be able to enforce a 1:1 page:guest association, let alone a 1:1 pfn:gfn mapping. And using PG_hwpoison does not work for memory that isn't backed by 'struct page', e.g. if devices gain support for exposing encrypted memory regions to guests. Attempt #3 was to extend the memfd() syscall and wrap shmem to provide dedicated file-based guest memory. That approach made it as far as v10 before feedback from Hugh Dickins and Christian Brauner (and others) led to it demise. Hugh's objection was that piggybacking shmem made no sense for KVM's use case as KVM didn't actually *want* the features provided by shmem. I.e. KVM was using memfd() and shmem to avoid having to manage memory directly, not because memfd() and shmem were the optimal solution, e.g. things like read/write/mmap in shmem were dead weight. Christian pointed out flaws with implementing a partial overlay (wrapping only _some_ of shmem), e.g. poking at inode_operations or super_operations would show shmem stuff, but address_space_operations and file_operations would show KVM's overlay. Paraphrashing heavily, Christian suggested KVM stop being lazy and create a proper API. Link: https://lore.kernel.org/all/20201020061859.18385-1-kirill.shutemov@linux.intel.com Link: https://lore.kernel.org/all/20210416154106.23721-1-kirill.shutemov@linux.intel.com Link: https://lore.kernel.org/all/20210824005248.200037-1-seanjc@google.com Link: https://lore.kernel.org/all/20211111141352.26311-1-chao.p.peng@linux.intel.com Link: https://lore.kernel.org/all/20221202061347.1070246-1-chao.p.peng@linux.intel.com Link: https://lore.kernel.org/all/ff5c5b97-acdf-9745-ebe5-c6609dd6322e@google.com Link: https://lore.kernel.org/all/20230418-anfallen-irdisch-6993a61be10b@brauner Link: https://lore.kernel.org/all/ZEM5Zq8oo+xnApW9@google.com Link: https://lore.kernel.org/linux-mm/20230306191944.GA15773@monkey Link: https://lore.kernel.org/linux-mm/ZII1p8ZHlHaQ3dDl@casper.infradead.org Cc: Fuad Tabba Cc: Vishal Annapurve Cc: Ackerley Tng Cc: Jarkko Sakkinen Cc: Maciej Szmigiero Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Quentin Perret Cc: Michael Roth Cc: Wang Cc: Liam Merwick Cc: Isaku Yamahata Co-developed-by: Kirill A. Shutemov Signed-off-by: Kirill A. Shutemov Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Co-developed-by: Chao Peng Signed-off-by: Chao Peng Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Co-developed-by: Isaku Yamahata Signed-off-by: Isaku Yamahata Co-developed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-17-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e8d167e54980..2802d10aa88c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -102,7 +102,10 @@ struct kvm_userspace_memory_region2 { __u64 guest_phys_addr; __u64 memory_size; __u64 userspace_addr; - __u64 pad[16]; + __u64 guest_memfd_offset; + __u32 guest_memfd; + __u32 pad1; + __u64 pad2[14]; }; /* @@ -112,6 +115,7 @@ struct kvm_userspace_memory_region2 { */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) +#define KVM_MEM_GUEST_MEMFD (1UL << 2) /* for KVM_IRQ_LINE */ struct kvm_irq_level { @@ -1221,6 +1225,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_USER_MEMORY2 231 #define KVM_CAP_MEMORY_FAULT_INFO 232 #define KVM_CAP_MEMORY_ATTRIBUTES 233 +#define KVM_CAP_GUEST_MEMFD 234 #ifdef KVM_CAP_IRQ_ROUTING @@ -2301,4 +2306,12 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) +#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) + +struct kvm_create_guest_memfd { + __u64 size; + __u64 flags; + __u64 reserved[6]; +}; + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From 8dd2eee9d526c30fccfe75da7ec5365c6476e510 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:02 -0700 Subject: KVM: x86/mmu: Handle page fault for private memory Add support for resolving page faults on guest private memory for VMs that differentiate between "shared" and "private" memory. For such VMs, KVM_MEM_GUEST_MEMFD memslots can include both fd-based private memory and hva-based shared memory, and KVM needs to map in the "correct" variant, i.e. KVM needs to map the gfn shared/private as appropriate based on the current state of the gfn's KVM_MEMORY_ATTRIBUTE_PRIVATE flag. For AMD's SEV-SNP and Intel's TDX, the guest effectively gets to request shared vs. private via a bit in the guest page tables, i.e. what the guest wants may conflict with the current memory attributes. To support such "implicit" conversion requests, exit to user with KVM_EXIT_MEMORY_FAULT to forward the request to userspace. Add a new flag for memory faults, KVM_MEMORY_EXIT_FLAG_PRIVATE, to communicate whether the guest wants to map memory as shared vs. private. Like KVM_MEMORY_ATTRIBUTE_PRIVATE, use bit 3 for flagging private memory so that KVM can use bits 0-2 for capturing RWX behavior if/when userspace needs such information, e.g. a likely user of KVM_EXIT_MEMORY_FAULT is to exit on missing mappings when handling guest page fault VM-Exits. In that case, userspace will want to know RWX information in order to correctly/precisely resolve the fault. Note, private memory *must* be backed by guest_memfd, i.e. shared mappings always come from the host userspace page tables, and private mappings always come from a guest_memfd instance. Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-21-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2802d10aa88c..8eb10f560c69 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -535,6 +535,7 @@ struct kvm_run { } notify; /* KVM_EXIT_MEMORY_FAULT */ struct { +#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) __u64 flags; __u64 gpa; __u64 size; -- cgit v1.2.3 From 89ea60c2c7b5838bf192c50062d5720cd6ab8662 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:05 -0700 Subject: KVM: x86: Add support for "protected VMs" that can utilize private memory Add a new x86 VM type, KVM_X86_SW_PROTECTED_VM, to serve as a development and testing vehicle for Confidential (CoCo) VMs, and potentially to even become a "real" product in the distant future, e.g. a la pKVM. The private memory support in KVM x86 is aimed at AMD's SEV-SNP and Intel's TDX, but those technologies are extremely complex (understatement), difficult to debug, don't support running as nested guests, and require hardware that's isn't universally accessible. I.e. relying SEV-SNP or TDX for maintaining guest private memory isn't a realistic option. At the very least, KVM_X86_SW_PROTECTED_VM will enable a variety of selftests for guest_memfd and private memory support without requiring unique hardware. Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Message-Id: <20231027182217.3615211-24-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 8eb10f560c69..e9cb2df67a1d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1227,6 +1227,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_MEMORY_FAULT_INFO 232 #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 +#define KVM_CAP_VM_TYPES 235 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 5f99f312bd3bedb3b266b0d26376a8c500cdc97f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:06:00 -0800 Subject: bpf: add register bounds sanity checks and sanitization Add simple sanity checks that validate well-formed ranges (min <= max) across u64, s64, u32, and s32 ranges. Also for cases when the value is constant (either 64-bit or 32-bit), we validate that ranges and tnums are in agreement. These bounds checks are performed at the end of BPF_ALU/BPF_ALU64 operations, on conditional jumps, and for LDX instructions (where subreg zero/sign extension is probably the most important to check). This covers most of the interesting cases. Also, we validate the sanity of the return register when manually adjusting it for some special helpers. By default, sanity violation will trigger a warning in verifier log and resetting register bounds to "unbounded" ones. But to aid development and debugging, BPF_F_TEST_SANITY_STRICT flag is added, which will trigger hard failure of verification with -EFAULT on register bounds violations. This allows selftests to catch such issues. veristat will also gain a CLI option to enable this behavior. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231112010609.848406-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7cf8bcf9f6a2..8a5855fcee69 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1200,6 +1200,9 @@ enum bpf_perf_event_type { */ #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_SANITY_STRICT (1U << 7) + /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. */ -- cgit v1.2.3 From 5d33213fac5929a2e7766c88d78779fd443b0fe8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 3 Nov 2023 10:39:24 +0300 Subject: media: v4l2-subdev: Fix a 64bit bug The problem is this line here from subdev_do_ioctl(). client_cap->capabilities &= ~V4L2_SUBDEV_CLIENT_CAP_STREAMS; The "client_cap->capabilities" variable is a u64. The AND operation is supposed to clear out the V4L2_SUBDEV_CLIENT_CAP_STREAMS flag. But because it's a 32 bit variable it accidentally clears out the high 32 bits as well. Currently we only use the first bit and none of the upper bits so this doesn't affect runtime behavior. Fixes: f57fa2959244 ("media: v4l2-subdev: Add new ioctl for client capabilities") Signed-off-by: Dan Carpenter Reviewed-by: Tomi Valkeinen Signed-off-by: Hans Verkuil --- include/uapi/linux/v4l2-subdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/v4l2-subdev.h b/include/uapi/linux/v4l2-subdev.h index 4a195b68f28f..b383c2fe0cf3 100644 --- a/include/uapi/linux/v4l2-subdev.h +++ b/include/uapi/linux/v4l2-subdev.h @@ -239,7 +239,7 @@ struct v4l2_subdev_routing { * set (which is the default), the 'stream' fields will be forced to 0 by the * kernel. */ - #define V4L2_SUBDEV_CLIENT_CAP_STREAMS (1U << 0) + #define V4L2_SUBDEV_CLIENT_CAP_STREAMS (1ULL << 0) /** * struct v4l2_subdev_client_capability - Capabilities of the client accessing -- cgit v1.2.3 From c6e9dba3be5ef3b701b29b143609561915e5d0e9 Mon Sep 17 00:00:00 2001 From: Alce Lafranque Date: Tue, 14 Nov 2023 11:36:57 -0600 Subject: vxlan: add support for flowlabel inherit By default, VXLAN encapsulation over IPv6 sets the flow label to 0, with an option for a fixed value. This commits add the ability to inherit the flow label from the inner packet, like for other tunnel implementations. This enables devices using only L3 headers for ECMP to correctly balance VXLAN-encapsulated IPv6 packets. ``` $ ./ip/ip link add dummy1 type dummy $ ./ip/ip addr add 2001:db8::2/64 dev dummy1 $ ./ip/ip link set up dev dummy1 $ ./ip/ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2 $ ./ip/ip link set up dev vxlan1 $ ./ip/ip addr add 2001:db8:1::2/64 dev vxlan1 $ ./ip/ip link set arp off dev vxlan1 $ ping -q 2001:db8:1::1 & $ tshark -d udp.port==8472,vxlan -Vpni dummy1 -c1 [...] Internet Protocol Version 6, Src: 2001:db8::2, Dst: 2001:db8::1 0110 .... = Version: 6 .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT) .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0) .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0) .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb [...] Virtual eXtensible Local Area Network Flags: 0x0800, VXLAN Network ID (VNI) Group Policy ID: 0 VXLAN Network Identifier (VNI): 100 [...] Internet Protocol Version 6, Src: 2001:db8:1::2, Dst: 2001:db8:1::1 0110 .... = Version: 6 .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT) .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0) .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0) .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb ``` Signed-off-by: Alce Lafranque Co-developed-by: Vincent Bernat Signed-off-by: Vincent Bernat Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 29ff80da2775..8181ef23a7a2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -856,6 +856,7 @@ enum { IFLA_VXLAN_DF, IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ IFLA_VXLAN_LOCALBYPASS, + IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) @@ -873,6 +874,13 @@ enum ifla_vxlan_df { VXLAN_DF_MAX = __VXLAN_DF_END - 1, }; +enum ifla_vxlan_label_policy { + VXLAN_LABEL_FIXED = 0, + VXLAN_LABEL_INHERIT = 1, + __VXLAN_LABEL_END, + VXLAN_LABEL_MAX = __VXLAN_LABEL_END - 1, +}; + /* GENEVE section */ enum { IFLA_GENEVE_UNSPEC, -- cgit v1.2.3 From ff8867af01daa7ea770bebf5f91199b7434b74e5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 09:14:04 -0800 Subject: bpf: rename BPF_F_TEST_SANITY_STRICT to BPF_F_TEST_REG_INVARIANTS Rename verifier internal flag BPF_F_TEST_SANITY_STRICT to more neutral BPF_F_TEST_REG_INVARIANTS. This is a follow up to [0]. A few selftests and veristat need to be adjusted in the same patch as well. [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231112010609.848406-5-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231117171404.225508-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8a5855fcee69..7a5498242eaa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1201,7 +1201,7 @@ enum bpf_perf_event_type { #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) /* The verifier internal test flag. Behavior is undefined */ -#define BPF_F_TEST_SANITY_STRICT (1U << 7) +#define BPF_F_TEST_REG_INVARIANTS (1U << 7) /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. -- cgit v1.2.3 From 98d2b43081972abeb5bb5a087bc3e3197531c46e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 25 Oct 2023 16:01:59 +0200 Subject: add unique mount ID If a mount is released then its mnt_id can immediately be reused. This is bad news for user interfaces that want to uniquely identify a mount. Implementing a unique mount ID is trivial (use a 64bit counter). Unfortunately userspace assumes 32bit size and would overflow after the counter reaches 2^32. Introduce a new 64bit ID alongside the old one. Initialize the counter to 2^32, this guarantees that the old and new IDs are never mixed up. Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20231025140205.3586473-2-mszeredi@redhat.com Reviewed-by: Ian Kent Signed-off-by: Christian Brauner --- include/uapi/linux/stat.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 7cab2c65d3d7..2f2ee82d5517 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -154,6 +154,7 @@ struct statx { #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ +#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ -- cgit v1.2.3 From acec05fb78abb74fdab2195bfca9a6d38a732643 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:35 +0100 Subject: net_tstamp: Add TIMESTAMPING SOFTWARE and HARDWARE mask Timestamping software or hardware flags are often used as a group, therefore adding these masks will easier future use. I did not use SOF_TIMESTAMPING_SYS_HARDWARE flag as it is deprecated and not use at all. Signed-off-by: Kory Maincent Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/net_tstamp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index a2c66b3d7f0f..df8091998c8d 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -48,6 +48,14 @@ enum { SOF_TIMESTAMPING_TX_SCHED | \ SOF_TIMESTAMPING_TX_ACK) +#define SOF_TIMESTAMPING_SOFTWARE_MASK (SOF_TIMESTAMPING_RX_SOFTWARE | \ + SOF_TIMESTAMPING_TX_SOFTWARE | \ + SOF_TIMESTAMPING_SOFTWARE) + +#define SOF_TIMESTAMPING_HARDWARE_MASK (SOF_TIMESTAMPING_RX_HARDWARE | \ + SOF_TIMESTAMPING_TX_HARDWARE | \ + SOF_TIMESTAMPING_RAW_HARDWARE) + /** *