From 74523c06ae20b83c5508a98af62393ac34913362 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 6 Nov 2023 20:57:23 -0800 Subject: bpf: Add __bpf_dynptr_data* for in kernel use Different types of bpf dynptr have different internal data storage. Specifically, SKB and XDP type of dynptr may have non-continuous data. Therefore, it is not always safe to directly access dynptr->data. Add __bpf_dynptr_data and __bpf_dynptr_data_rw to replace direct access to dynptr->data. Update bpf_verify_pkcs7_signature to use __bpf_dynptr_data instead of dynptr->data. Signed-off-by: Song Liu Signed-off-by: Andrii Nakryiko Acked-by: Vadim Fedorenko Link: https://lore.kernel.org/bpf/20231107045725.2278852-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4825d3cdb29..eb84caf133df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1222,6 +1222,8 @@ enum bpf_dynptr_type { int bpf_dynptr_check_size(u32 size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); -- cgit v1.2.3 From 790ce3cfefb1b768dccd4eee324ddef0f0ce3db4 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 7 Nov 2023 00:56:37 -0800 Subject: bpf: Move GRAPH_{ROOT,NODE}_MASK macros into btf_field_type enum This refactoring patch removes the unused BPF_GRAPH_NODE_OR_ROOT btf_field_type and moves BPF_GRAPH_{NODE,ROOT} macros into the btf_field_type enum. Further patches in the series will use BPF_GRAPH_NODE, so let's move this useful definition out of btf.c. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20231107085639.3016113-5-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb84caf133df..4001d11be151 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -186,8 +186,8 @@ enum btf_field_type { BPF_LIST_NODE = (1 << 6), BPF_RB_ROOT = (1 << 7), BPF_RB_NODE = (1 << 8), - BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | - BPF_RB_NODE | BPF_RB_ROOT, + BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE, + BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), }; -- cgit v1.2.3 From 155addf0814a92d08fce26a11b27e3315cdba977 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 3 Nov 2023 19:49:00 -0700 Subject: bpf: Use named fields for certain bpf uapi structs Martin and Vadim reported a verifier failure with bpf_dynptr usage. The issue is mentioned but Vadim workarounded the issue with source change ([1]). The below describes what is the issue and why there is a verification failure. int BPF_PROG(skb_crypto_setup) { struct bpf_dynptr algo, key; ... bpf_dynptr_from_mem(..., ..., 0, &algo); ... } The bpf program is using vmlinux.h, so we have the following definition in vmlinux.h: struct bpf_dynptr { long: 64; long: 64; }; Note that in uapi header bpf.h, we have struct bpf_dynptr { long: 64; long: 64; } __attribute__((aligned(8))); So we lost alignment information for struct bpf_dynptr by using vmlinux.h. Let us take a look at a simple program below: $ cat align.c typedef unsigned long long __u64; struct bpf_dynptr_no_align { __u64 :64; __u64 :64; }; struct bpf_dynptr_yes_align { __u64 :64; __u64 :64; } __attribute__((aligned(8))); void bar(void *, void *); int foo() { struct bpf_dynptr_no_align a; struct bpf_dynptr_yes_align b; bar(&a, &b); return 0; } $ clang --target=bpf -O2 -S -emit-llvm align.c Look at the generated IR file align.ll: ... %a = alloca %struct.bpf_dynptr_no_align, align 1 %b = alloca %struct.bpf_dynptr_yes_align, align 8 ... The compiler dictates the alignment for struct bpf_dynptr_no_align is 1 and the alignment for struct bpf_dynptr_yes_align is 8. So theoretically compiler could allocate variable %a with alignment 1 although in reallity the compiler may choose a different alignment by considering other local variables. In [1], the verification failure happens because variable 'algo' is allocated on the stack with alignment 4 (fp-28). But the verifer wants its alignment to be 8. To fix the issue, the RFC patch ([1]) tried to add '__attribute__((aligned(8)))' to struct bpf_dynptr plus other similar structs. Andrii suggested that we could directly modify uapi struct with named fields like struct 'bpf_iter_num': struct bpf_iter_num { /* opaque iterator state; having __u64 here allows to preserve correct * alignment requirements in vmlinux.h, generated from BTF */ __u64 __opaque[1]; } __attribute__((aligned(8))); Indeed, adding named fields for those affected structs in this patch can preserve alignment when bpf program references them in vmlinux.h. With this patch, the verification failure in [1] can also be resolved. [1] https://lore.kernel.org/bpf/1b100f73-7625-4c1f-3ae5-50ecf84d3ff0@linux.dev/ [2] https://lore.kernel.org/bpf/20231103055218.2395034-1-yonghong.song@linux.dev/ Cc: Vadim Fedorenko Cc: Martin KaFai Lau Suggested-by: Andrii Nakryiko Signed-off-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231104024900.1539182-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0f6cdf52b1da..095ca7238ac2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7151,40 +7151,31 @@ struct bpf_spin_lock { }; struct bpf_timer { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_dynptr { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_list_head { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_list_node { - __u64 :64; - __u64 :64; - __u64 :64; + __u64 __opaque[3]; } __attribute__((aligned(8))); struct bpf_rb_root { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_rb_node { - __u64 :64; - __u64 :64; - __u64 :64; - __u64 :64; + __u64 __opaque[4]; } __attribute__((aligned(8))); struct bpf_refcount { - __u32 :32; + __u32 __opaque[1]; } __attribute__((aligned(4))); struct bpf_sysctl { -- cgit v1.2.3 From 689b097a06bafb461ec162fc3b3ecc9765cea67b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 6 Nov 2023 03:18:02 +0000 Subject: compiler-gcc: Suppress -Wmissing-prototypes warning for all supported GCC The kernel supports a minimum GCC version of 5.1.0 for building. However, the "__diag_ignore_all" directive only suppresses the "-Wmissing-prototypes" warning for GCC versions >= 8.0.0. As a result, when building the kernel with older GCC versions, warnings may be triggered. The example below illustrates the warnings reported by the kernel test robot using GCC 7.5.0: compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0 All warnings (new ones prefixed by >>): kernel/bpf/helpers.c:1893:19: warning: no previous prototype for 'bpf_obj_new_impl' [-Wmissing-prototypes] __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) ^~~~~~~~~~~~~~~~ kernel/bpf/helpers.c:1907:19: warning: no previous prototype for 'bpf_percpu_obj_new_impl' [-Wmissing-prototypes] __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) [...] To address this, we should also suppress the "-Wmissing-prototypes" warning for older GCC versions. "#pragma GCC diagnostic push" is supported as of GCC 4.6, and both "-Wmissing-prototypes" and "-Wmissing-declarations" are supported for all the GCC versions that we currently support. Therefore, it is reasonable to suppress these warnings for all supported GCC versions. With this adjustment, it's important to note that after implementing "__diag_ignore_all", it will effectively suppress warnings for all the supported GCC versions. In the future, if you wish to suppress warnings that are only supported on higher GCC versions, it is advisable to explicitly use "__diag_ignore" to specify the GCC version you are targeting. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311031651.A7crZEur-lkp@intel.com/ Suggested-by: Arnd Bergmann Signed-off-by: Yafang Shao Cc: Kumar Kartikeya Dwivedi Cc: Arnd Bergmann Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231106031802.4188-1-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 2ceba3fe4ec1..aebb65bf95a7 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -136,7 +136,7 @@ #endif #define __diag_ignore_all(option, comment) \ - __diag_GCC(8, ignore, option) + __diag(__diag_GCC_ignore option) /* * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size" -- cgit v1.2.3 From b8e3a87a627b575896e448021e5c2f8a3bc19931 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Wed, 8 Nov 2023 03:23:34 -0800 Subject: bpf: Add crosstask check to __bpf_get_stack Currently get_perf_callchain only supports user stack walking for the current task. Passing the correct *crosstask* param will return 0 frames if the task passed to __bpf_get_stack isn't the current one instead of a single incorrect frame/address. This change passes the correct *crosstask* param but also does a preemptive check in __bpf_get_stack if the task is current and returns -EOPNOTSUPP if it is not. This issue was found using bpf_get_task_stack inside a BPF iterator ("iter/task"), which iterates over all tasks. bpf_get_task_stack works fine for fetching kernel stacks but because get_perf_callchain relies on the caller to know if the requested *task* is the current one (via *crosstask*) it was failing in a confusing way. It might be possible to get user stacks for all tasks utilizing something like access_process_vm but that requires the bpf program calling bpf_get_task_stack to be sleepable and would therefore be a breaking change. Fixes: fa28dcb82a38 ("bpf: Introduce helper bpf_get_task_stack()") Signed-off-by: Jordan Rome Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231108112334.3433136-1-jordalgo@meta.com --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 095ca7238ac2..7cf8bcf9f6a2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4517,6 +4517,8 @@ union bpf_attr { * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. + * Note: the user stack will only be populated if the *task* is + * the current task; all other tasks will return -EOPNOTSUPP. * To achieve this, the helper needs *task*, which is a valid * pointer to **struct task_struct**. To store the stacktrace, the * bpf program provides *buf* with a nonnegative *size*. @@ -4528,6 +4530,7 @@ union bpf_attr { * * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. + * The *task* must be the current task. * **BPF_F_USER_BUILD_ID** * Collect buildid+offset instead of ips for user stack, * only valid if **BPF_F_USER_STACK** is also specified. -- cgit v1.2.3 From 07afe1ba288c04280622fa002ed385f1ac0b6fe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Thu, 7 Sep 2023 03:09:08 +0200 Subject: batman-adv: mcast: implement multicast packet reception and forwarding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement functionality to receive and forward a new TVLV capable multicast packet type. The new batman-adv multicast packet type allows to contain several originator destination addresses within a TVLV. Routers on the way will potentially split the batman-adv multicast packet and adjust its tracker TVLV contents. Routing decisions are still based on the selected BATMAN IV or BATMAN V routing algorithm. So this new batman-adv multicast packet type retains the same loop-free properties. Also a new OGM multicast TVLV flag is introduced to signal to other nodes that we are capable of handling a batman-adv multicast packet and multicast tracker TVLV. And that all of our hard interfaces have an MTU of at least 1280 bytes (IPv6 minimum MTU), as a simple solution for now to avoid MTU issues while forwarding. Signed-off-by: Linus Lüssing Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 45 +++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 9204e4494b25..6e25753015df 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -116,6 +116,9 @@ enum batadv_icmp_packettype { * only need routable IPv4 multicast packets we signed up for explicitly * @BATADV_MCAST_WANT_NO_RTR6: we have no IPv6 multicast router and therefore * only need routable IPv6 multicast packets we signed up for explicitly + * @BATADV_MCAST_HAVE_MC_PTYPE_CAPA: we can parse, receive and forward + * batman-adv multicast packets with a multicast tracker TVLV. And all our + * hard interfaces have an MTU of at least 1280 bytes. */ enum batadv_mcast_flags { BATADV_MCAST_WANT_ALL_UNSNOOPABLES = 1UL << 0, @@ -123,6 +126,7 @@ enum batadv_mcast_flags { BATADV_MCAST_WANT_ALL_IPV6 = 1UL << 2, BATADV_MCAST_WANT_NO_RTR4 = 1UL << 3, BATADV_MCAST_WANT_NO_RTR6 = 1UL << 4, + BATADV_MCAST_HAVE_MC_PTYPE_CAPA = 1UL << 5, }; /* tt data subtypes */ @@ -174,14 +178,16 @@ enum batadv_bla_claimframe { * @BATADV_TVLV_TT: translation table tvlv * @BATADV_TVLV_ROAM: roaming advertisement tvlv * @BATADV_TVLV_MCAST: multicast capability tvlv + * @BATADV_TVLV_MCAST_TRACKER: multicast tracker tvlv */ enum batadv_tvlv_type { - BATADV_TVLV_GW = 0x01, - BATADV_TVLV_DAT = 0x02, - BATADV_TVLV_NC = 0x03, - BATADV_TVLV_TT = 0x04, - BATADV_TVLV_ROAM = 0x05, - BATADV_TVLV_MCAST = 0x06, + BATADV_TVLV_GW = 0x01, + BATADV_TVLV_DAT = 0x02, + BATADV_TVLV_NC = 0x03, + BATADV_TVLV_TT = 0x04, + BATADV_TVLV_ROAM = 0x05, + BATADV_TVLV_MCAST = 0x06, + BATADV_TVLV_MCAST_TRACKER = 0x07, }; #pragma pack(2) @@ -487,6 +493,25 @@ struct batadv_bcast_packet { */ }; +/** + * struct batadv_mcast_packet - multicast packet for network payload + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header + * @reserved: reserved byte for alignment + * @tvlv_len: length of the appended tvlv buffer (in bytes) + */ +struct batadv_mcast_packet { + __u8 packet_type; + __u8 version; + __u8 ttl; + __u8 reserved; + __be16 tvlv_len; + /* "4 bytes boundary + 2 bytes" long to make the payload after the + * following ethernet header again 4 bytes boundary aligned + */ +}; + /** * struct batadv_coded_packet - network coded packet * @packet_type: batman-adv packet type, part of the general header @@ -628,6 +653,14 @@ struct batadv_tvlv_mcast_data { __u8 reserved[3]; }; +/** + * struct batadv_tvlv_mcast_tracker - payload of a multicast tracker tvlv + * @num_dests: number of subsequent destination originator MAC addresses + */ +struct batadv_tvlv_mcast_tracker { + __be16 num_dests; +}; + #pragma pack() #endif /* _UAPI_LINUX_BATADV_PACKET_H_ */ -- cgit v1.2.3 From 4aea6a6d61cd6e3df9ed98345638abad1b1e5276 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Thu, 30 Mar 2023 15:05:38 -0700 Subject: net/mlx5: Query maximum frequency adjustment of the PTP hardware clock Some mlx5 devices do not support the default advertised maximum frequency adjustment value for the PTP hardware clock that is set by the driver. These devices need to be queried when initializing the clock functionality in order to get the maximum supported frequency adjustment value. This value can be greater than the minimum supported frequency adjustment across mlx5 devices (50 million ppb). Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6f3631425f38..ce2e71cd6d2a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10103,7 +10103,10 @@ enum { struct mlx5_ifc_mtutc_reg_bits { u8 reserved_at_0[0x5]; u8 freq_adj_units[0x3]; - u8 reserved_at_8[0x14]; + u8 reserved_at_8[0x3]; + u8 log_max_freq_adjustment[0x5]; + + u8 reserved_at_10[0xc]; u8 operation[0x4]; u8 freq_adjustment[0x20]; -- cgit v1.2.3 From 67420501e8681ae18f9f0ea0a69cd2f432100e70 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:05:57 -0800 Subject: bpf: generalize reg_set_min_max() to handle non-const register comparisons Generalize bounds adjustment logic of reg_set_min_max() to handle not just register vs constant case, but in general any register vs any register cases. For most of the operations it's trivial extension based on range vs range comparison logic, we just need to properly pick min/max of a range to compare against min/max of the other range. For BPF_JSET we keep the original capabilities, just make sure JSET is integrated in the common framework. This is manifested in the internal-only BPF_JSET + BPF_X "opcode" to allow for simpler and more uniform rev_opcode() handling. See the code for details. This allows to reuse the same code exactly both for TRUE and FALSE branches without explicitly handling both conditions with custom code. Note also that now we don't need a special handling of BPF_JEQ/BPF_JNE case none of the registers are constants. This is now just a normal generic case handled by reg_set_min_max(). To make tnum handling cleaner, tnum_with_subreg() helper is added, as that's a common operator when dealing with 32-bit subregister bounds. This keeps the overall logic much less noisy when it comes to tnums. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231112010609.848406-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 1c3948a1d6ad..3c13240077b8 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -106,6 +106,10 @@ int tnum_sbin(char *str, size_t size, struct tnum a); struct tnum tnum_subreg(struct tnum a); /* Returns the tnum with the lower 32-bit subreg cleared */ struct tnum tnum_clear_subreg(struct tnum a); +/* Returns the tnum with the lower 32-bit subreg in *reg* set to the lower + * 32-bit subreg in *subreg* + */ +struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg); /* Returns the tnum with the lower 32-bit subreg set to value */ struct tnum tnum_const_subreg(struct tnum a, u32 value); /* Returns true if 32-bit subreg @a is a known constant*/ -- cgit v1.2.3 From 5f99f312bd3bedb3b266b0d26376a8c500cdc97f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:06:00 -0800 Subject: bpf: add register bounds sanity checks and sanitization Add simple sanity checks that validate well-formed ranges (min <= max) across u64, s64, u32, and s32 ranges. Also for cases when the value is constant (either 64-bit or 32-bit), we validate that ranges and tnums are in agreement. These bounds checks are performed at the end of BPF_ALU/BPF_ALU64 operations, on conditional jumps, and for LDX instructions (where subreg zero/sign extension is probably the most important to check). This covers most of the interesting cases. Also, we validate the sanity of the return register when manually adjusting it for some special helpers. By default, sanity violation will trigger a warning in verifier log and resetting register bounds to "unbounded" ones. But to aid development and debugging, BPF_F_TEST_SANITY_STRICT flag is added, which will trigger hard failure of verification with -EFAULT on register bounds violations. This allows selftests to catch such issues. veristat will also gain a CLI option to enable this behavior. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231112010609.848406-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + include/uapi/linux/bpf.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 24213a99cc79..402b6bc44a1b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -602,6 +602,7 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ + bool test_sanity_strict; /* fail verification on sanity violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7cf8bcf9f6a2..8a5855fcee69 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1200,6 +1200,9 @@ enum bpf_perf_event_type { */ #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_SANITY_STRICT (1U << 7) + /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. */ -- cgit v1.2.3 From 3185d57cfcd34fadbe28f4ed57a6cb5122277ece Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 14 Nov 2023 11:42:02 +0100 Subject: indirect_call_wrapper: Fix typo in INDIRECT_CALL_$NR kerneldoc Fix a small typo in the kerneldoc comment of the INDIRECT_CALL_$NR macro. Signed-off-by: Tobias Klauser Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231114104202.4680-1-tklauser@distanz.ch Signed-off-by: Paolo Abeni --- include/linux/indirect_call_wrapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index c1c76a70a6ce..adb83a42a6b9 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -11,7 +11,7 @@ * @__VA_ARGS__: arguments for @f * * Avoid retpoline overhead for known builtin, checking @f vs each of them and - * eventually invoking directly the builtin function. The functions are check + * eventually invoking directly the builtin function. The functions are checked * in the given order. Fallback to the indirect call. */ #define INDIRECT_CALL_1(f, f1, ...) \ -- cgit v1.2.3 From c6e9dba3be5ef3b701b29b143609561915e5d0e9 Mon Sep 17 00:00:00 2001 From: Alce Lafranque Date: Tue, 14 Nov 2023 11:36:57 -0600 Subject: vxlan: add support for flowlabel inherit By default, VXLAN encapsulation over IPv6 sets the flow label to 0, with an option for a fixed value. This commits add the ability to inherit the flow label from the inner packet, like for other tunnel implementations. This enables devices using only L3 headers for ECMP to correctly balance VXLAN-encapsulated IPv6 packets. ``` $ ./ip/ip link add dummy1 type dummy $ ./ip/ip addr add 2001:db8::2/64 dev dummy1 $ ./ip/ip link set up dev dummy1 $ ./ip/ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2 $ ./ip/ip link set up dev vxlan1 $ ./ip/ip addr add 2001:db8:1::2/64 dev vxlan1 $ ./ip/ip link set arp off dev vxlan1 $ ping -q 2001:db8:1::1 & $ tshark -d udp.port==8472,vxlan -Vpni dummy1 -c1 [...] Internet Protocol Version 6, Src: 2001:db8::2, Dst: 2001:db8::1 0110 .... = Version: 6 .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT) .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0) .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0) .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb [...] Virtual eXtensible Local Area Network Flags: 0x0800, VXLAN Network ID (VNI) Group Policy ID: 0 VXLAN Network Identifier (VNI): 100 [...] Internet Protocol Version 6, Src: 2001:db8:1::2, Dst: 2001:db8:1::1 0110 .... = Version: 6 .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT) .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0) .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0) .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb ``` Signed-off-by: Alce Lafranque Co-developed-by: Vincent Bernat Signed-off-by: Vincent Bernat Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 11 +++++++++++ include/net/vxlan.h | 33 +++++++++++++++++---------------- include/uapi/linux/if_link.h | 8 ++++++++ 3 files changed, 36 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index f346b4efbc30..2d746f4c9a0a 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -416,6 +416,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, return 0; } +static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, + const struct sk_buff *skb) +{ + __be16 payload_protocol = skb_protocol(skb, true); + + if (payload_protocol == htons(ETH_P_IPV6)) + return ip6_flowlabel((const struct ipv6hdr *)iph); + else + return 0; +} + static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 6a9f8a5f387c..33ba6fc151cf 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -210,22 +210,23 @@ struct vxlan_rdst { }; struct vxlan_config { - union vxlan_addr remote_ip; - union vxlan_addr saddr; - __be32 vni; - int remote_ifindex; - int mtu; - __be16 dst_port; - u16 port_min; - u16 port_max; - u8 tos; - u8 ttl; - __be32 label; - u32 flags; - unsigned long age_interval; - unsigned int addrmax; - bool no_share; - enum ifla_vxlan_df df; + union vxlan_addr remote_ip; + union vxlan_addr saddr; + __be32 vni; + int remote_ifindex; + int mtu; + __be16 dst_port; + u16 port_min; + u16 port_max; + u8 tos; + u8 ttl; + __be32 label; + enum ifla_vxlan_label_policy label_policy; + u32 flags; + unsigned long age_interval; + unsigned int addrmax; + bool no_share; + enum ifla_vxlan_df df; }; enum { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 29ff80da2775..8181ef23a7a2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -856,6 +856,7 @@ enum { IFLA_VXLAN_DF, IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ IFLA_VXLAN_LOCALBYPASS, + IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) @@ -873,6 +874,13 @@ enum ifla_vxlan_df { VXLAN_DF_MAX = __VXLAN_DF_END - 1, }; +enum ifla_vxlan_label_policy { + VXLAN_LABEL_FIXED = 0, + VXLAN_LABEL_INHERIT = 1, + __VXLAN_LABEL_END, + VXLAN_LABEL_MAX = __VXLAN_LABEL_END - 1, +}; + /* GENEVE section */ enum { IFLA_GENEVE_UNSPEC, -- cgit v1.2.3 From 96fa96e198f9707285003075fbbce7db6a485112 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Nov 2023 11:39:18 +0000 Subject: net: linkmode: add linkmode_fill() helper Add a linkmode_fill() helper, which will allow us to convert phylink's open coded bitmap_fill() operations. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/linkmode.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index 7303b4bc2ce0..287f590ed56b 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -10,6 +10,11 @@ static inline void linkmode_zero(unsigned long *dst) bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); } +static inline void linkmode_fill(unsigned long *dst) +{ + bitmap_fill(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + static inline void linkmode_copy(unsigned long *dst, const unsigned long *src) { bitmap_copy(dst, src, __ETHTOOL_LINK_MODE_MASK_NBITS); -- cgit v1.2.3 From ff8867af01daa7ea770bebf5f91199b7434b74e5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 09:14:04 -0800 Subject: bpf: rename BPF_F_TEST_SANITY_STRICT to BPF_F_TEST_REG_INVARIANTS Rename verifier internal flag BPF_F_TEST_SANITY_STRICT to more neutral BPF_F_TEST_REG_INVARIANTS. This is a follow up to [0]. A few selftests and veristat need to be adjusted in the same patch as well. [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231112010609.848406-5-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231117171404.225508-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 402b6bc44a1b..52a4012b8255 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -602,7 +602,7 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ - bool test_sanity_strict; /* fail verification on sanity violations */ + bool test_reg_invariants; /* fail verification on register invariants violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8a5855fcee69..7a5498242eaa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1201,7 +1201,7 @@ enum bpf_perf_event_type { #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) /* The verifier internal test flag. Behavior is undefined */ -#define BPF_F_TEST_SANITY_STRICT (1U << 7) +#define BPF_F_TEST_REG_INVARIANTS (1U << 7) /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. -- cgit v1.2.3 From 446e2305827b76e8081057ce56bbd2703b4da8a9 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:29 +0100 Subject: net: Convert PHYs hwtstamp callback to use kernel_hwtstamp_config The PHYs hwtstamp callback are still getting the timestamp config from ifreq and using copy_from/to_user. Get rid of these functions by using timestamp configuration in parameter. This also allow to move on to kernel_hwtstamp_config and be similar to net devices using the new ndo_hwstamp_get/set. This adds the possibility to manipulate the timestamp configuration from the kernel which was not possible with the copy_from/to_user. Signed-off-by: Kory Maincent Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mii_timestamper.h | 4 +++- include/linux/phy.h | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h index fa940bbaf8ae..26b04f73f214 100644 --- a/include/linux/mii_timestamper.h +++ b/include/linux/mii_timestamper.h @@ -9,6 +9,7 @@ #include #include #include +#include struct phy_device; @@ -51,7 +52,8 @@ struct mii_timestamper { struct sk_buff *skb, int type); int (*hwtstamp)(struct mii_timestamper *mii_ts, - struct ifreq *ifreq); + struct kernel_hwtstamp_config *kernel_config, + struct netlink_ext_ack *extack); void (*link_state)(struct mii_timestamper *mii_ts, struct phy_device *phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index 3cc52826f18e..e5f1f41e399c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1560,9 +1560,11 @@ static inline bool phy_has_txtstamp(struct phy_device *phydev) return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp; } -static inline int phy_hwtstamp(struct phy_device *phydev, struct ifreq *ifr) +static inline int phy_hwtstamp(struct phy_device *phydev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) { - return phydev->mii_ts->hwtstamp(phydev->mii_ts, ifr); + return phydev->mii_ts->hwtstamp(phydev->mii_ts, cfg, extack); } static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb, -- cgit v1.2.3 From b8768dc4077712915f045ba1b198f521493c7914 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 14 Nov 2023 12:28:31 +0100 Subject: net: ethtool: Refactor identical get_ts_info implementations. The vlan, macvlan and the bonding drivers call their "real" device driver in order to report the time stamping capabilities. Provide a core ethtool helper function to avoid copy/paste in the stack. Signed-off-by: Richard Cochran Signed-off-by: Kory Maincent Reviewed-by: Florian Fainelli Reviewed-by: Jay Vosburgh Signed-off-by: David S. Miller --- include/linux/ethtool.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 689028257fcc..c2bb74143eda 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1043,6 +1043,14 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, return -EINVAL; } +/** + * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer. + * @dev: pointer to net_device structure + * @info: buffer to hold the result + * Returns zero on success, non-zero otherwise. + */ +int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info *info); + /** * ethtool_sprintf - Write formatted string to ethtool string data * @data: Pointer to a pointer to the start of string to update -- cgit v1.2.3 From 011dd3b3f83f9c89605c640424e05845b84f2dad Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:33 +0100 Subject: net: Make dev_set_hwtstamp_phylib accessible Make the dev_set_hwtstamp_phylib function accessible in prevision to use it from ethtool to reset the tstamp current configuration. Reviewed-by: Florian Fainelli Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a16c9cc063fe..2d840d7056f2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3942,6 +3942,9 @@ int generic_hwtstamp_get_lower(struct net_device *dev, int generic_hwtstamp_set_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg, struct netlink_ext_ack *extack); +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata); unsigned int dev_get_flags(const struct net_device *); int __dev_change_flags(struct net_device *dev, unsigned int flags, -- cgit v1.2.3 From acec05fb78abb74fdab2195bfca9a6d38a732643 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:35 +0100 Subject: net_tstamp: Add TIMESTAMPING SOFTWARE and HARDWARE mask Timestamping software or hardware flags are often used as a group, therefore adding these masks will easier future use. I did not use SOF_TIMESTAMPING_SYS_HARDWARE flag as it is deprecated and not use at all. Signed-off-by: Kory Maincent Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/net_tstamp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index a2c66b3d7f0f..df8091998c8d 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -48,6 +48,14 @@ enum { SOF_TIMESTAMPING_TX_SCHED | \ SOF_TIMESTAMPING_TX_ACK) +#define SOF_TIMESTAMPING_SOFTWARE_MASK (SOF_TIMESTAMPING_RX_SOFTWARE | \ + SOF_TIMESTAMPING_TX_SOFTWARE | \ + SOF_TIMESTAMPING_SOFTWARE) + +#define SOF_TIMESTAMPING_HARDWARE_MASK (SOF_TIMESTAMPING_RX_HARDWARE | \ + SOF_TIMESTAMPING_TX_HARDWARE | \ + SOF_TIMESTAMPING_RAW_HARDWARE) + /** * struct so_timestamping - SO_TIMESTAMPING parameter * -- cgit v1.2.3 From 11d55be06df0aedf19b05ab61c2d26b31a3c7e64 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:36 +0100 Subject: net: ethtool: Add a command to expose current time stamping layer Time stamping on network packets may happen either in the MAC or in the PHY, but not both. In preparation for making the choice selectable, expose both the current layers via ethtool. In accordance with the kernel implementation as it stands, the current layer will always read as "phy" when a PHY time stamping device is present. Future patches will allow changing the current layer administratively. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/uapi/linux/ethtool_netlink.h | 14 ++++++++++++++ include/uapi/linux/net_tstamp.h | 10 ++++++++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 73e2c10dc2cc..cb51136328cf 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -57,6 +57,7 @@ enum { ETHTOOL_MSG_PLCA_GET_STATUS, ETHTOOL_MSG_MM_GET, ETHTOOL_MSG_MM_SET, + ETHTOOL_MSG_TS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -109,6 +110,7 @@ enum { ETHTOOL_MSG_PLCA_NTF, ETHTOOL_MSG_MM_GET_REPLY, ETHTOOL_MSG_MM_NTF, + ETHTOOL_MSG_TS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -975,6 +977,18 @@ enum { ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) }; +/* TS LAYER */ + +enum { + ETHTOOL_A_TS_UNSPEC, + ETHTOOL_A_TS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_TS_LAYER, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_TS_CNT, + ETHTOOL_A_TS_MAX = (__ETHTOOL_A_TS_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index df8091998c8d..4551fb3d7720 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -13,6 +13,16 @@ #include #include /* for SO_TIMESTAMPING */ +/* Layer of the TIMESTAMPING provider */ +enum timestamping_layer { + NO_TIMESTAMPING, + SOFTWARE_TIMESTAMPING, + MAC_TIMESTAMPING, + PHY_TIMESTAMPING, + + __TIMESTAMPING_COUNT, +}; + /* SO_TIMESTAMPING flags */ enum { SOF_TIMESTAMPING_TX_HARDWARE = (1<<0), -- cgit v1.2.3 From d905f9c753295ee5a30af265f4b724f10050e7d3 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:38 +0100 Subject: net: ethtool: Add a command to list available time stamping layers Introduce a new netlink message that lists all available time stamping layers on a given interface. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/uapi/linux/ethtool_netlink.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index cb51136328cf..62b885d44d06 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -58,6 +58,7 @@ enum { ETHTOOL_MSG_MM_GET, ETHTOOL_MSG_MM_SET, ETHTOOL_MSG_TS_GET, + ETHTOOL_MSG_TS_LIST_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -111,6 +112,7 @@ enum { ETHTOOL_MSG_MM_GET_REPLY, ETHTOOL_MSG_MM_NTF, ETHTOOL_MSG_TS_GET_REPLY, + ETHTOOL_MSG_TS_LIST_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -989,6 +991,18 @@ enum { ETHTOOL_A_TS_MAX = (__ETHTOOL_A_TS_CNT - 1) }; +/* TS LIST LAYER */ + +enum { + ETHTOOL_A_TS_LIST_UNSPEC, + ETHTOOL_A_TS_LIST_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_TS_LIST_LAYER, /* array, u32 */ + + /* add new constants above here */ + __ETHTOOL_A_TS_LIST_CNT, + ETHTOOL_A_TS_LIST_MAX = (__ETHTOOL_A_TS_LIST_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 -- cgit v1.2.3 From 51bdf3165f012827644c474a6d905baa3de3f1ea Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:40 +0100 Subject: net: Replace hwtstamp_source by timestamping layer Replace hwtstamp_source which is only used by the kernel_hwtstamp_config structure by the more widely use timestamp_layer structure. This is done to prepare the support of selectable timestamping source. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/linux/net_tstamp.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index eb01c37e71e0..bb289c2ad376 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -5,11 +5,6 @@ #include -enum hwtstamp_source { - HWTSTAMP_SOURCE_NETDEV, - HWTSTAMP_SOURCE_PHYLIB, -}; - /** * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config * @@ -20,8 +15,8 @@ enum hwtstamp_source { * a legacy implementation of a lower driver * @copied_to_user: request was passed to a legacy implementation which already * copied the ioctl request back to user space - * @source: indication whether timestamps should come from the netdev or from - * an attached phylib PHY + * @source: indication whether timestamps should come from software, the netdev + * or from an attached phylib PHY * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config @@ -33,7 +28,7 @@ struct kernel_hwtstamp_config { int rx_filter; struct ifreq *ifr; bool copied_to_user; - enum hwtstamp_source source; + enum timestamping_layer source; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, -- cgit v1.2.3 From 0f7f463d4821a4f52fa5c0a961389e651d50c384 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:41 +0100 Subject: net: Change the API of PHY default timestamp to MAC Change the API to select MAC default time stamping instead of the PHY. Indeed the PHY is closer to the wire therefore theoretically it has less delay than the MAC timestamping but the reality is different. Due to lower time stamping clock frequency, latency in the MDIO bus and no PHC hardware synchronization between different PHY, the PHY PTP is often less precise than the MAC. The exception is for PHY designed specially for PTP case but these devices are not very widespread. For not breaking the compatibility I introduce a default_timestamp flag in phy_device that is set by the phy driver to know we are using the old API behavior. The phy_set_timestamp function is called at each call of phy_attach_direct. In case of MAC driver using phylink this function is called when the interface is turned up. Then if the interface goes down and up again the last choice of timestamp will be overwritten by the default choice. A solution could be to cache the timestamp status but it can bring other issues. In case of SFP, if we change the module, it doesn't make sense to blindly re-set the timestamp back to PHY, if the new module has a PHY with mediocre timestamping capabilities. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ include/linux/phy.h | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2d840d7056f2..f020d2790c12 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -2074,6 +2075,8 @@ enum netdev_ml_priv_type { * * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. + * @ts_layer: Tracks which network device + * performs packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2435,6 +2438,8 @@ struct net_device { #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin *dpll_pin; #endif + + enum timestamping_layer ts_layer; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/include/linux/phy.h b/include/linux/phy.h index e5f1f41e399c..317def2a7843 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -604,6 +604,8 @@ struct macsec_ops; * handling shall be postponed until PHY has resumed * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume + * @default_timestamp: Flag indicating whether we are using the phy + * timestamp as the default one * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics @@ -667,6 +669,8 @@ struct phy_device { unsigned irq_suspended:1; unsigned irq_rerun:1; + unsigned default_timestamp:1; + int rate_matching; enum phy_state state; -- cgit v1.2.3 From 152c75e1d00200edc4da1beb67dd099a462ea86b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:43 +0100 Subject: net: ethtool: ts: Let the active time stamping layer be selectable Now that the current timestamp is saved in a variable lets add the ETHTOOL_MSG_TS_SET ethtool netlink socket to make it selectable. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/uapi/linux/ethtool_netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 62b885d44d06..df6c4fcc62c1 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -59,6 +59,7 @@ enum { ETHTOOL_MSG_MM_SET, ETHTOOL_MSG_TS_GET, ETHTOOL_MSG_TS_LIST_GET, + ETHTOOL_MSG_TS_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, -- cgit v1.2.3 From db840d389bad60ce6f3aadc1079da13e7e993a16 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:16 -0800 Subject: bpf: move verbose_linfo() into kernel/bpf/log.c verifier.c is huge. Let's try to move out parts that are logging-related into log.c, as we previously did with bpf_log() and other related stuff. This patch moves line info verbose output routines: it's pretty self-contained and isolated code, so there is no problem with this. Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 52a4012b8255..d896f3db6a22 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -680,6 +680,10 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); +__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...); + static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; -- cgit v1.2.3 From 42feb6620accded89cad5f455665e21281813d79 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:17 -0800 Subject: bpf: move verifier state printing code to kernel/bpf/log.c Move a good chunk of code from verifier.c to log.c: verifier state verbose printing logic. This is an important and very much logging/debugging oriented code. It fits the overlall log.c's focus on verifier logging, and moving it allows to keep growing it without unnecessarily adding to verifier.c code that otherwise contains a core verification logic. There are not many shared dependencies between this code and the rest of verifier.c code, except a few single-line helpers for various register type checks and a bit of state "scratching" helpers. We move all such trivial helpers into include/bpf/bpf_verifier.h as static inlines. No functional changes in this patch. Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 72 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d896f3db6a22..39edc76f436e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -783,4 +783,76 @@ static inline bool bpf_type_has_unsafe_modifiers(u32 type) return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; } +static inline bool type_is_ptr_alloc_obj(u32 type) +{ + return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; +} + +static inline bool type_is_non_owning_ref(u32 type) +{ + return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; +} + +static inline bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + type = base_type(type); + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + +static inline bool type_is_sk_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_XDP_SOCK; +} + +static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) +{ + env->scratched_regs |= 1U << regno; +} + +static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) +{ + env->scratched_stack_slots |= 1ULL << spi; +} + +static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) +{ + return (env->scratched_regs >> regno) & 1; +} + +static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) +{ + return (env->scratched_stack_slots >> regno) & 1; +} + +static inline bool verifier_state_scratched(const struct bpf_verifier_env *env) +{ + return env->scratched_regs || env->scratched_stack_slots; +} + +static inline void mark_verifier_state_clean(struct bpf_verifier_env *env) +{ + env->scratched_regs = 0U; + env->scratched_stack_slots = 0ULL; +} + +/* Used for printing the entire verifier state. */ +static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env) +{ + env->scratched_regs = ~0U; + env->scratched_stack_slots = ~0ULL; +} + +const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); +const char *dynptr_type_str(enum bpf_dynptr_type type); +const char *iter_type_str(const struct btf *btf, u32 btf_id); +const char *iter_state_str(enum bpf_iter_state state); + +void print_verifier_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state, bool print_all); +void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state); + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From 289354f21b2c3fac93e956efd45f256a88a4d997 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Nov 2023 18:38:05 -0800 Subject: net: partial revert of the "Make timestamping selectable: series Revert following commits: commit acec05fb78ab ("net_tstamp: Add TIMESTAMPING SOFTWARE and HARDWARE mask") commit 11d55be06df0 ("net: ethtool: Add a command to expose current time stamping layer") commit bb8645b00ced ("netlink: specs: Introduce new netlink command to get current timestamp") commit d905f9c75329 ("net: ethtool: Add a command to list available time stamping layers") commit aed5004ee7a0 ("netlink: specs: Introduce new netlink command to list available time stamping layers") commit 51bdf3165f01 ("net: Replace hwtstamp_source by timestamping layer") commit 0f7f463d4821 ("net: Change the API of PHY default timestamp to MAC") commit 091fab122869 ("net: ethtool: ts: Update GET_TS to reply the current selected timestamp") commit 152c75e1d002 ("net: ethtool: ts: Let the active time stamping layer be selectable") commit ee60ea6be0d3 ("netlink: specs: Introduce time stamping set command") They need more time for reviews. Link: https://lore.kernel.org/all/20231118183529.6e67100c@kernel.org/ Signed-off-by: Jakub Kicinski --- include/linux/net_tstamp.h | 11 ++++++++--- include/linux/netdevice.h | 5 ----- include/linux/phy.h | 4 ---- include/uapi/linux/ethtool_netlink.h | 29 ----------------------------- include/uapi/linux/net_tstamp.h | 18 ------------------ 5 files changed, 8 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index bb289c2ad376..eb01c37e71e0 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -5,6 +5,11 @@ #include +enum hwtstamp_source { + HWTSTAMP_SOURCE_NETDEV, + HWTSTAMP_SOURCE_PHYLIB, +}; + /** * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config * @@ -15,8 +20,8 @@ * a legacy implementation of a lower driver * @copied_to_user: request was passed to a legacy implementation which already * copied the ioctl request back to user space - * @source: indication whether timestamps should come from software, the netdev - * or from an attached phylib PHY + * @source: indication whether timestamps should come from the netdev or from + * an attached phylib PHY * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config @@ -28,7 +33,7 @@ struct kernel_hwtstamp_config { int rx_filter; struct ifreq *ifr; bool copied_to_user; - enum timestamping_layer source; + enum hwtstamp_source source; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f020d2790c12..2d840d7056f2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include @@ -2075,8 +2074,6 @@ enum netdev_ml_priv_type { * * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. - * @ts_layer: Tracks which network device - * performs packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2438,8 +2435,6 @@ struct net_device { #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin *dpll_pin; #endif - - enum timestamping_layer ts_layer; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/include/linux/phy.h b/include/linux/phy.h index 317def2a7843..e5f1f41e399c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -604,8 +604,6 @@ struct macsec_ops; * handling shall be postponed until PHY has resumed * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume - * @default_timestamp: Flag indicating whether we are using the phy - * timestamp as the default one * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics @@ -669,8 +667,6 @@ struct phy_device { unsigned irq_suspended:1; unsigned irq_rerun:1; - unsigned default_timestamp:1; - int rate_matching; enum phy_state state; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index df6c4fcc62c1..73e2c10dc2cc 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -57,9 +57,6 @@ enum { ETHTOOL_MSG_PLCA_GET_STATUS, ETHTOOL_MSG_MM_GET, ETHTOOL_MSG_MM_SET, - ETHTOOL_MSG_TS_GET, - ETHTOOL_MSG_TS_LIST_GET, - ETHTOOL_MSG_TS_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -112,8 +109,6 @@ enum { ETHTOOL_MSG_PLCA_NTF, ETHTOOL_MSG_MM_GET_REPLY, ETHTOOL_MSG_MM_NTF, - ETHTOOL_MSG_TS_GET_REPLY, - ETHTOOL_MSG_TS_LIST_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -980,30 +975,6 @@ enum { ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) }; -/* TS LAYER */ - -enum { - ETHTOOL_A_TS_UNSPEC, - ETHTOOL_A_TS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_TS_LAYER, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_TS_CNT, - ETHTOOL_A_TS_MAX = (__ETHTOOL_A_TS_CNT - 1) -}; - -/* TS LIST LAYER */ - -enum { - ETHTOOL_A_TS_LIST_UNSPEC, - ETHTOOL_A_TS_LIST_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_TS_LIST_LAYER, /* array, u32 */ - - /* add new constants above here */ - __ETHTOOL_A_TS_LIST_CNT, - ETHTOOL_A_TS_LIST_MAX = (__ETHTOOL_A_TS_LIST_CNT - 1) -}; - /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 4551fb3d7720..a2c66b3d7f0f 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -13,16 +13,6 @@ #include #include /* for SO_TIMESTAMPING */ -/* Layer of the TIMESTAMPING provider */ -enum timestamping_layer { - NO_TIMESTAMPING, - SOFTWARE_TIMESTAMPING, - MAC_TIMESTAMPING, - PHY_TIMESTAMPING, - - __TIMESTAMPING_COUNT, -}; - /* SO_TIMESTAMPING flags */ enum { SOF_TIMESTAMPING_TX_HARDWARE = (1<<0), @@ -58,14 +48,6 @@ enum { SOF_TIMESTAMPING_TX_SCHED | \ SOF_TIMESTAMPING_TX_ACK) -#define SOF_TIMESTAMPING_SOFTWARE_MASK (SOF_TIMESTAMPING_RX_SOFTWARE | \ - SOF_TIMESTAMPING_TX_SOFTWARE | \ - SOF_TIMESTAMPING_SOFTWARE) - -#define SOF_TIMESTAMPING_HARDWARE_MASK (SOF_TIMESTAMPING_RX_HARDWARE | \ - SOF_TIMESTAMPING_TX_HARDWARE | \ - SOF_TIMESTAMPING_RAW_HARDWARE) - /** * struct so_timestamping - SO_TIMESTAMPING parameter * -- cgit v1.2.3 From ac40916a3f7243efbe6e129ebf495b5c33a3adfe Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 15 Nov 2023 20:01:08 +0800 Subject: rtnetlink: introduce nlmsg_new_large and use it in rtnl_getlink if a PF has 256 or more VFs, ip link command will allocate an order 3 memory or more, and maybe trigger OOM due to memory fragment, the VFs needed memory size is computed in rtnl_vfinfo_size. so introduce nlmsg_new_large which calls netlink_alloc_large_skb in which vmalloc is used for large memory, to avoid the failure of allocating memory ip invoked oom-killer: gfp_mask=0xc2cc0(GFP_KERNEL|__GFP_NOWARN|\ __GFP_COMP|__GFP_NOMEMALLOC), order=3, oom_score_adj=0 CPU: 74 PID: 204414 Comm: ip Kdump: loaded Tainted: P OE Call Trace: dump_stack+0x57/0x6a dump_header+0x4a/0x210 oom_kill_process+0xe4/0x140 out_of_memory+0x3e8/0x790 __alloc_pages_slowpath.constprop.116+0x953/0xc50 __alloc_pages_nodemask+0x2af/0x310 kmalloc_large_node+0x38/0xf0 __kmalloc_node_track_caller+0x417/0x4d0 __kmalloc_reserve.isra.61+0x2e/0x80 __alloc_skb+0x82/0x1c0 rtnl_getlink+0x24f/0x370 rtnetlink_rcv_msg+0x12c/0x350 netlink_rcv_skb+0x50/0x100 netlink_unicast+0x1b2/0x280 netlink_sendmsg+0x355/0x4a0 sock_sendmsg+0x5b/0x60 ____sys_sendmsg+0x1ea/0x250 ___sys_sendmsg+0x88/0xd0 __sys_sendmsg+0x5e/0xa0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f95a65a5b70 Cc: Yunsheng Lin Signed-off-by: Li RongQing Link: https://lore.kernel.org/r/20231115120108.3711-1-lirongqing@baidu.com Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 1 + include/net/netlink.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 75d7de34c908..abe91ed6b9aa 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -351,5 +351,6 @@ bool netlink_ns_cap