From fa96b24204af42274ec13dfb2f2e6990d7510e55 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 5 Aug 2022 14:48:14 -0700 Subject: btf: Add a new kfunc flag which allows to mark a function to be sleepable This allows to declare a kfunc as sleepable and prevents its use in a non sleepable program. Signed-off-by: Benjamin Tissoires Co-developed-by: Yosry Ahmed Signed-off-by: Yosry Ahmed Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220805214821.1058337-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index cdb376d53238..976cbdd2981f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -49,6 +49,7 @@ * for this case. */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ +#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ struct btf; struct btf_member; -- cgit v1.2.3 From c8996c98f703b09afe77a1d247dae691c9849dc1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Aug 2022 08:08:02 +0200 Subject: bpf: Add BPF-helper for accessing CLOCK_TAI Commit 3dc6ffae2da2 ("timekeeping: Introduce fast accessor to clock tai") introduced a fast and NMI-safe accessor for CLOCK_TAI. Especially in time sensitive networks (TSN), where all nodes are synchronized by Precision Time Protocol (PTP), it's helpful to have the possibility to generate timestamps based on CLOCK_TAI instead of CLOCK_MONOTONIC. With a BPF helper for TAI in place, it becomes very convenient to correlate activity across different machines in the network. Use cases for such a BPF helper include functionalities such as Tx launch time (e.g. ETF and TAPRIO Qdiscs) and timestamping. Note: CLOCK_TAI is nothing new per se, only the NMI-safe variant of it is. Signed-off-by: Jesper Dangaard Brouer [Kurt: Wrote changelog and renamed helper] Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220809060803.5773-2-kurt@linutronix.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 20c26aed7896..a627a02cf8ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2349,6 +2349,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; -- cgit v1.2.3 From 4dd48c6f1f83290d4bc61b43e61d86f8bc6c310e Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 10 Aug 2022 08:59:03 +0200 Subject: bpf: add destructive kfunc flag Add KF_DESTRUCTIVE flag for destructive functions. Functions with this flag set will require CAP_SYS_BOOT capabilities. Signed-off-by: Artem Savkov Link: https://lore.kernel.org/r/20220810065905.475418-2-asavkov@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 976cbdd2981f..ad93c2d9cc1c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -49,7 +49,8 @@ * for this case. */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ -#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ +#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ +#define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ struct btf; struct btf_member; -- cgit v1.2.3 From 0630f64d25a0f0a8c6a9ce9fde8750b3b561e6f5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 15 Aug 2022 12:07:47 -0700 Subject: net: phy: broadcom: Implement suspend/resume for AC131 and BCM5241 Implement the suspend/resume procedure for the Broadcom AC131 and BCM5241 type of PHYs (10/100 only) by entering the standard power down followed by the proprietary standby mode in the auxiliary mode 4 shadow register. On resume, the PHY software reset is enough to make it come out of standby mode so we can utilize brcm_fet_config_init() as the resume hook. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6ff567ece34a..9e77165f3ef6 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -293,6 +293,7 @@ #define MII_BRCM_FET_SHDW_MC_FAME 0x4000 /* Force Auto MDIX enable */ #define MII_BRCM_FET_SHDW_AUXMODE4 0x1a /* Auxiliary mode 4 */ +#define MII_BRCM_FET_SHDW_AM4_STANDBY 0x0008 /* Standby enable */ #define MII_BRCM_FET_SHDW_AM4_LED_MASK 0x0003 #define MII_BRCM_FET_SHDW_AM4_LED_MODE1 0x0001 -- cgit v1.2.3 From e34cfee65ec891a319ce79797dda18083af33a76 Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Wed, 17 Aug 2022 14:43:24 +0800 Subject: stmmac: intel: remove unused 'has_crossts' flag The 'has_crossts' flag was not used anywhere in the stmmac driver, removing it from both header file and dwmac-intel driver. Signed-off-by: Wong Vee Khee Reviewed-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220817064324.10025-1-veekhee@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 8df475db88c0..fb2e88614f5d 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -257,7 +257,6 @@ struct plat_stmmacenet_data { u8 vlan_fail_q; unsigned int eee_usecs_rate; struct pci_dev *pdev; - bool has_crossts; int int_snapshot_num; int ext_snapshot_num; bool int_snapshot_en; -- cgit v1.2.3 From 24426654ed3ae83d1127511891fb782c54f49203 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:17:17 -0700 Subject: bpf: net: Avoid sk_setsockopt() taking sk lock when called from bpf Most of the code in bpf_setsockopt(SOL_SOCKET) are duplicated from the sk_setsockopt(). The number of supported optnames are increasing ever and so as the duplicated code. One issue in reusing sk_setsockopt() is that the bpf prog has already acquired the sk lock. This patch adds a has_current_bpf_ctx() to tell if the sk_setsockopt() is called from a bpf prog. The bpf prog calling bpf_setsockopt() is either running in_task() or in_serving_softirq(). Both cases have the current->bpf_ctx initialized. Thus, the has_current_bpf_ctx() only needs to test !!current->bpf_ctx. This patch also adds sockopt_{lock,release}_sock() helpers for sk_setsockopt() to use. These helpers will test has_current_bpf_ctx() before acquiring/releasing the lock. They are in EXPORT_SYMBOL for the ipv6 module to use in a latter patch. Note on the change in sock_setbindtodevice(). sockopt_lock_sock() is done in sock_setbindtodevice() instead of doing the lock_sock in sock_bindtoindex(..., lock_sk = true). Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061717.4175589-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a627a02cf8ab..39bd36359c1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1966,6 +1966,15 @@ static inline bool unprivileged_ebpf_enabled(void) return !sysctl_unprivileged_bpf_disabled; } +/* Not all bpf prog type has the bpf_ctx. + * For the bpf prog type that has initialized the bpf_ctx, + * this function can be used to decide if a kernel function + * is called by a bpf program. + */ +static inline bool has_current_bpf_ctx(void) +{ + return !!current->bpf_ctx; +} #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2175,6 +2184,10 @@ static inline bool unprivileged_ebpf_enabled(void) return false; } +static inline bool has_current_bpf_ctx(void) +{ + return false; +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, -- cgit v1.2.3 From 5e61fe157a27afc7c0d4f7bcbceefdca536c015f Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 17 Aug 2022 14:32:52 +0200 Subject: net: phy: Introduce QUSGMII PHY mode The QUSGMII mode is a derivative of Cisco's USXGMII standard. This standard is pretty similar to SGMII, but allows for faster speeds, and has the build-in bits for Quad and Octa variants (like QSGMII). The main difference with SGMII/QSGMII is that USXGMII/QUSGMII re-uses the preamble to carry various information, named 'Extensions'. As of today, the USXGMII standard only mentions the "PCH" extension, which is used to convey timestamps, allowing in-band signaling of PTP timestamps without having to modify the frame itself. This commit adds support for that mode. When no extension is in use, it behaves exactly like QSGMII, although it's not compatible with QSGMII. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 87638c55d844..9eeab9b9a74c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -115,6 +115,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_25GBASER: 25G BaseR * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN + * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. @@ -152,6 +153,7 @@ typedef enum { PHY_INTERFACE_MODE_USXGMII, /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, + PHY_INTERFACE_MODE_QUSGMII, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -267,6 +269,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "10gbase-kr"; case PHY_INTERFACE_MODE_100BASEX: return "100base-x"; + case PHY_INTERFACE_MODE_QUSGMII: + return "qusgmii"; default: return "unknown"; } -- cgit v1.2.3 From c04ade27cb7b952b6b9b9a0efa0a6129cc63f2ae Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 17 Aug 2022 14:32:54 +0200 Subject: net: phy: Add helper to derive the number of ports from a phy mode Some phy modes such as QSGMII multiplex several MAC<->PHY links on one single physical interface. QSGMII used to be the only one supported, but other modes such as QUSGMII also carry multiple links. This helper allows getting the number of links that are multiplexed on a given interface. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9eeab9b9a74c..7c49ab95441b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -968,6 +968,8 @@ struct phy_fixup { const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); +int phy_interface_num_ports(phy_interface_t interface); + /* A structure for mapping a particular speed and duplex * combination to a particular SUPPORTED and ADVERTISED value */ -- cgit v1.2.3 From 1202cdd665315c525b5237e96e0bedc76d7e754f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 17 Aug 2022 17:43:21 -0700 Subject: Remove DECnet support from kernel DECnet is an obsolete network protocol that receives more attention from kernel janitors than users. It belongs in computer protocol history museum not in Linux kernel. It has been "Orphaned" in kernel since 2010. The iproute2 support for DECnet was dropped in 5.0 release. The documentation link on Sourceforge says it is abandoned there as well. Leave the UAPI alone to keep userspace programs compiling. This means that there is still an empty neighbour table for AF_DECNET. The table of /proc/sys/net entries was updated to match current directories and reformatted to be alphabetical. Signed-off-by: Stephen Hemminger Acked-by: David Ahern Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ---- include/linux/netfilter.h | 5 ----- include/linux/netfilter_defs.h | 8 -------- 3 files changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a3cb93c3dcc..64e8662632f8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1837,7 +1837,6 @@ enum netdev_ml_priv_type { * @tipc_ptr: TIPC specific data * @atalk_ptr: AppleTalk link * @ip_ptr: IPv4 specific data - * @dn_ptr: DECnet specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering @@ -2133,9 +2132,6 @@ struct net_device { #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif -#if IS_ENABLED(CONFIG_DECNET) - struct dn_dev __rcu *dn_ptr; -#endif #if IS_ENABLED(CONFIG_AX25) void *ax25_ptr; #endif diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index c2c6f332fb90..d8817d381c14 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -243,11 +243,6 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; -#if IS_ENABLED(CONFIG_DECNET) - case NFPROTO_DECNET: - hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); - break; -#endif default: WARN_ON_ONCE(1); break; diff --git a/include/linux/netfilter_defs.h b/include/linux/netfilter_defs.h index 8dddfb151f00..a5f7bef1b3a4 100644 --- a/include/linux/netfilter_defs.h +++ b/include/linux/netfilter_defs.h @@ -7,14 +7,6 @@ /* in/out/forward only */ #define NF_ARP_NUMHOOKS 3 -/* max hook is NF_DN_ROUTE (6), also see uapi/linux/netfilter_decnet.h */ -#define NF_DN_NUMHOOKS 7 - -#if IS_ENABLED(CONFIG_DECNET) -/* Largest hook number + 1, see uapi/linux/netfilter_decnet.h */ -#define NF_MAX_HOOKS NF_DN_NUMHOOKS -#else #define NF_MAX_HOOKS NF_INET_NUMHOOKS -#endif #endif -- cgit v1.2.3 From 0ba985024ae7db226776725d9aa436b5c1c9fca2 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:16 +0300 Subject: flow_dissector: Make 'bpf_flow_dissect' return the bpf program retcode Let 'bpf_flow_dissect' callers know the BPF program's retcode and act accordingly. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-2-shmulik.ladkani@gmail.com --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..87921996175c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1460,8 +1460,8 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, unsigned int key_count); struct bpf_flow_dissector; -bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags); +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, -- cgit v1.2.3 From dea6a4e17013382b20717664ebf3d7cc405e0952 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:51 -0700 Subject: bpf: Introduce cgroup_{common,current}_func_proto Split cgroup_base_func_proto into the following: * cgroup_common_func_proto - common helpers for all cgroup hooks * cgroup_current_func_proto - common helpers for all cgroup hooks running in the process context (== have meaningful 'current'). Move bpf_{g,s}et_retval and other cgroup-related helpers into kernel/bpf/cgroup.c so they closer to where they are being used. Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220823222555.523590-2-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2bd1b5f8de9b..57e9e109257e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -414,6 +414,11 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); + +const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); +const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } @@ -444,6 +449,18 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + +static inline const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( -- cgit v1.2.3 From bed89185af0de0d417e29ca1798df50f161b0231 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:52 -0700 Subject: bpf: Use cgroup_{common,current}_func_proto in more hooks The following hooks are per-cgroup hooks but they are not using cgroup_{common,current}_func_proto, fix it: * BPF_PROG_TYPE_CGROUP_SKB (cg_skb) * BPF_PROG_TYPE_CGROUP_SOCK_ADDR (cg_sock_addr) * BPF_PROG_TYPE_CGROUP_SOCK (cg_sock) * BPF_PROG_TYPE_LSM+BPF_LSM_CGROUP Also: * move common func_proto's into cgroup func_proto handlers * make sure bpf_{g,s}et_retval are not accessible from recvmsg, getpeername and getsockname (return/errno is ignored in these places) * as a side effect, expose get_current_pid_tgid, get_current_comm_proto, get_current_ancestor_cgroup_id, get_cgroup_classid to more cgroup hooks Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 39bd36359c1e..99fc7a64564f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2375,6 +2375,7 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; -- cgit v1.2.3 From c205cc7534a97f2d6fbd2a23a94ed7c036c6e2aa Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 21 Aug 2022 13:18:58 +0800 Subject: net: skb: prevent the split of kfree_skb_reason() by gcc Sometimes, gcc will optimize the function by spliting it to two or more functions. In this case, kfree_skb_reason() is splited to kfree_skb_reason and kfree_skb_reason.part.0. However, the function/tracepoint trace_kfree_skb() in it needs the return address of kfree_skb_reason(). This split makes the call chains becomes: kfree_skb_reason() -> kfree_skb_reason.part.0 -> trace_kfree_skb() which makes the return address that passed to trace_kfree_skb() be kfree_skb(). Therefore, introduce '__fix_address', which is the combination of '__noclone' and 'noinline', and apply it to kfree_skb_reason() to prevent to from being splited or made inline. (Is it better to simply apply '__noclone oninline' to kfree_skb_reason? I'm thinking maybe other functions have the same problems) Meanwhile, wrap 'skb_unref()' with 'unlikely()', as the compiler thinks it is likely return true and splits kfree_skb_reason(). Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/compiler_attributes.h | 7 +++++++ include/linux/skbuff.h | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 445e80517cab..fc93c9488c76 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -371,4 +371,11 @@ */ #define __weak __attribute__((__weak__)) +/* + * Used by functions that use '__builtin_return_address'. These function + * don't want to be splited or made inline, which can make + * the '__builtin_return_address' get unexpected address. + */ +#define __fix_address noinline __noclone + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..b51d07a727c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1195,7 +1195,8 @@ static inline bool skb_unref(struct sk_buff *skb) return true; } -void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); +void __fix_address +kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason -- cgit v1.2.3 From 9d9d00ac29d0ef7ce426964de46fa6b380357d0a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 03:31:25 +0200 Subject: bpf: Fix reference state management for synchronous callbacks Currently, verifier verifies callback functions (sync and async) as if they will be executed once, (i.e. it explores execution state as if the function was being called once). The next insn to explore is set to start of subprog and the exit from nested frame is handled using curframe > 0 and prepare_func_exit. In case of async callback it uses a customized variant of push_stack simulating a kind of branch to set up custom state and execution context for the async callback. While this approach is simple and works when callback really will be executed only once, it is unsafe for all of our current helpers which are for_each style, i.e. they execute the callback multiple times. A callback releasing acquired references of the caller may do so multiple times, but currently verifier sees it as one call inside the frame, which then returns to caller. Hence, it thinks it released some reference that the cb e.g. got access through callback_ctx (register filled inside cb from spilled typed register on stack). Similarly, it may see that an acquire call is unpaired inside the callback, so the caller will copy the reference state of callback and then will have to release the register with new ref_obj_ids. But again, the callback may execute multiple times, but the verifier will only account for acquired references for a single symbolic execution of the callback, which will cause leaks. Note that for async callback case, things are different. While currently we have bpf_timer_set_callback which only executes it once, even for multiple executions it would be safe, as reference state is NULL and check_reference_leak would force program to release state before BPF_EXIT. The state is also unaffected by analysis for the caller frame. Hence async callback is safe. Since we want the reference state to be accessible, e.g. for pointers loaded from stack through callback_ctx's PTR_TO_STACK, we still have to copy caller's reference_state to callback's bpf_func_state, but we enforce that whatever references it adds to that reference_state has been released before it hits BPF_EXIT. This requires introducing a new callback_ref member in the reference state to distinguish between caller vs callee references. Hence, check_reference_leak now errors out if it sees we are in callback_fn and we have not released callback_ref refs. Since there can be multiple nested callbacks, like frame 0 -> cb1 -> cb2 etc. we need to also distinguish between whether this particular ref belongs to this callback frame or parent, and only error for our own, so we store state->frameno (which is always non-zero for callbacks). In short, callbacks can read parent reference_state, but cannot mutate it, to be able to use pointers acquired by the caller. They must only undo their changes (by releasing their own acquired_refs before BPF_EXIT) on top of caller reference_state before returning (at which point the caller and callback state will match anyway, so no need to copy it back to caller). Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823013125.24938-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2e3bad8640dc..1fdddbf3546b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -212,6 +212,17 @@ struct bpf_reference_state { * is used purely to inform the user of a reference leak. */ int insn_idx; + /* There can be a case like: + * main (frame 0) + * cb (frame 1) + * func (frame 3) + * cb (frame 4) + * Hence for frame 4, if callback_ref just stored boolean, it would be + * impossible to distinguish nested callback refs. Hence store the + * frameno and compare that to callback_ref in check_reference_leak when + * exiting a callback function. + */ + int callback_ref; }; /* state of the program: -- cgit v1.2.3 From ea5cba269fb1fe22b84f4d01bb3d56320e6ffa3e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Aug 2022 11:26:23 +0200 Subject: wifi: cfg80211/mac80211: check EHT capability size correctly For AP/non-AP the EHT MCS/NSS subfield size differs, the 4-octet subfield is only used for 20 MHz-only non-AP STA. Pass an argument around everywhere to be able to parse it properly. Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 55e6f4ad0ca6..6f70394417ac 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2886,7 +2886,8 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) /* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ static inline u8 ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, - const struct ieee80211_eht_cap_elem_fixed *eht_cap) + const struct ieee80211_eht_cap_elem_fixed *eht_cap, + bool from_ap) { u8 count = 0; @@ -2907,7 +2908,10 @@ ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) count += 3; - return count ? count : 4; + if (count) + return count; + + return from_ap ? 3 : 4; } /* 802.11be EHT PPE Thresholds */ @@ -2943,7 +2947,8 @@ ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info) } static inline bool -ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len) +ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len, + bool from_ap) { const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data; u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed); @@ -2952,7 +2957,8 @@ ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len) return false; needed += ieee80211_eht_mcs_nss_size((const void *)he_capa, - (const void *)data); + (const void *)data, + from_ap); if (len < needed) return false; -- cgit v1.2.3 From d4ccaf58a8472123ac97e6db03932c375b5c45ba Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Wed, 24 Aug 2022 16:31:13 -0700 Subject: bpf: Introduce cgroup iter Cgroup_iter is a type of bpf_iter. It walks over cgroups in four modes: - walking a cgroup's descendants in pre-order. - walking a cgroup's descendants in post-order. - walking a cgroup's ancestors. - process only the given cgroup. When attaching cgroup_iter, one can set a cgroup to the iter_link created from attaching. This cgroup is passed as a file descriptor or cgroup id and serves as the starting point of the walk. If no cgroup is specified, the starting point will be the root cgroup v2. For walking descendants, one can specify the order: either pre-order or post-order. For walking ancestors, the walk starts at the specified cgroup and ends at the root. One can also terminate the walk early by returning 1 from the iter program. Note that because walking cgroup hierarchy holds cgroup_mutex, the iter program is called with cgroup_mutex held. Currently only one session is supported, which means, depending on the volume of data bpf program intends to send to user space, the number of cgroups that can be walked is limited. For example, given the current buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can be walked is 512. This is a limitation of cgroup_iter. If the output data is larger than the kernel buffer size, after all data in the kernel buffer is consumed by user space, the subsequent read() syscall will signal EOPNOTSUPP. In order to work around, the user may have to update their program to reduce the volume of data sent to output. For example, skip some uninteresting cgroups. In future, we may extend bpf_iter flags to allow customizing buffer size. Acked-by: Yonghong Song Acked-by: Tejun Heo Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220824233117.1312810-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 99fc7a64564f..9c1674973e03 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -48,6 +48,7 @@ struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; +struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1730,7 +1731,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + /* for map_elem iter */ struct bpf_map *map; + + /* for cgroup iter */ + struct { + struct cgroup *start; /* starting cgroup */ + enum bpf_cgroup_iter_order order; + } cgroup; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, -- cgit v1.2.3 From b9030780971b56c0c455c3b66244efd96608846d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Aug 2022 16:32:43 +0200 Subject: netdev: Use try_cmpxchg in napi_if_scheduled_mark_missed Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in napi_if_scheduled_mark_missed. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. Cc: Eric Dumazet Cc: Paolo Abeni Signed-off-by: Uros Bizjak Link: https://lore.kernel.org/r/20220822143243.2798-1-ubizjak@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 64e8662632f8..8839abc1f941 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -546,8 +546,8 @@ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) { unsigned long val, new; + val = READ_ONCE(n->state); do { - val = READ_ONCE(n->state); if (val & NAPIF_STATE_DISABLE) return true; @@ -555,7 +555,7 @@ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) return false; new = val | NAPIF_STATE_MISSED; - } while (cmpxchg(&n->state, val, new) != val); + } while (!try_cmpxchg(&n->state, &val, new)); return true; } -- cgit v1.2.3 From 9c5d03d362519f36cd551aec596388f895c93d2d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Aug 2022 17:18:30 -0700 Subject: genetlink: start to validate reserved header bytes We had historically not checked that genlmsghdr.reserved is 0 on input which prevents us from using those precious bytes in the future. One use case would be to extend the cmd field, which is currently just 8 bits wide and 256 is not a lot of commands for some core families. To make sure that new families do the right thing by default put the onus of opting out of validation on existing families. Signed-off-by: Jakub Kicinski Acked-by: Paul Moore (NetLabel) Signed-off-by: David S. Miller --- include/linux/genl_magic_func.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 939b1a8f571b..4a4b387181ad 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -294,6 +294,7 @@ static struct genl_family ZZZ_genl_family __ro_after_init = { .ops = ZZZ_genl_ops, .n_ops = ARRAY_SIZE(ZZZ_genl_ops), .mcgrps = ZZZ_genl_mcgrps, + .resv_start_op = 42, /* drbd is currently the only user */ .n_mcgrps = ARRAY_SIZE(ZZZ_genl_mcgrps), .module = THIS_MODULE, }; -- cgit v1.2.3 From e8013f8edaa30e17fd583a326daff1fa86b75c82 Mon Sep 17 00:00:00 2001 From: Casper Andersson Date: Thu, 25 Aug 2022 11:28:35 +0200 Subject: ethernet: Add helpers to recognize addresses mapped to IP multicast IP multicast must sometimes be discriminated from non-IP multicast, e.g. when determining the forwarding behavior of a given group in the presence of multicast router ports on an offloaded bridge. Therefore, provide helpers to identify these groups. Signed-off-by: Casper Andersson Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 92b10e67d5f8..a541f0c4f146 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -428,6 +428,28 @@ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, return true; } +static inline bool ether_addr_is_ipv4_mcast(const u8 *addr) +{ + u8 base[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; + u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 }; + + return ether_addr_equal_masked(addr, base, mask); +} + +static inline bool ether_addr_is_ipv6_mcast(const u8 *addr) +{ + u8 base[ETH_ALEN] = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; + u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }; + + return ether_addr_equal_masked(addr, base, mask); +} + +static inline bool ether_addr_is_ip_mcast(const u8 *addr) +{ + return ether_addr_is_ipv4_mcast(addr) || + ether_addr_is_ipv6_mcast(addr); +} + /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address -- cgit v1.2.3 From 690252f19f0e486abb8590b3a7a03d4e065d93d4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Aug 2022 20:09:31 -0700 Subject: netlink: add support for ext_ack missing attributes There is currently no way to report via extack in a structured way that an attribute is missing. This leads to families resorting to string messages. Add a pair of attributes - @offset and @type for machine-readable way of reporting missing attributes. The @offset points to the nest which should have contained the attribute, @type is the expected nla_type. The offset will be skipped if the attribute is missing at the message level rather than inside a nest. User space should be able to figure out which attribute enum (AKA attribute space AKA attribute set) the nest pointed to by @offset is using. Reviewed-by: Johannes Berg Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/netlink.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index bda1c385cffb..1619221c415c 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -71,6 +71,8 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute + * @miss_type: attribute type which was missing + * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length */ @@ -78,6 +80,8 @@ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; + const struct nlattr *miss_nest; + u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; }; @@ -126,6 +130,15 @@ struct netlink_ext_ack { #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) +#define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) { \ + __extack->miss_nest = (nest); \ + __extack->miss_type = (type); \ + } \ +} while (0) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { -- cgit v1.2.3 From 45dca15759643806660e9285e6af8a1ba3c76c82 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Aug 2022 20:09:32 -0700 Subject: netlink: add helpers for extack attr presence checking Being able to check attribute presence and set extack if not on one line is handy, add helpers. Reviewed-by: Johannes Berg Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/netlink.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 1619221c415c..d51e041d2242 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -139,6 +139,17 @@ struct netlink_ext_ack { } \ } while (0) +#define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \ + struct nlattr **__tb = (tb); \ + u32 __attr = (type); \ + int __retval; \ + \ + __retval = !__tb[__attr]; \ + if (__retval) \ + NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \ + __retval; \ +}) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { -- cgit v1.2.3 From f9cad07b840ec8a8eb54928908d694b6e262631c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 30 Aug 2022 18:32:47 +0300 Subject: thunderbolt: Show link type for XDomain connections too Following what we do for routers already, extend this to XDomain connections as well. This will show in sysfs whether the link is in USB4 or Thunderbolt mode. Signed-off-by: Mika Westerberg Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 9f442d73f3df..90cd08ab2f5d 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -187,6 +187,7 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir); * @device_name: Name of the device (or %NULL if not known) * @link_speed: Speed of the link in Gb/s * @link_width: Width of the link (1 or 2) + * @link_usb4: Downstream link is USB4 * @is_unplugged: The XDomain is unplugged * @needs_uuid: If the XDomain does not have @remote_uuid it will be * queried first @@ -234,6 +235,7 @@ struct tb_xdomain { const char *device_name; unsigned int link_speed; unsigned int link_width; + bool link_usb4; bool is_unplugged; bool needs_uuid; struct ida service_ids; -- cgit v1.2.3 From 977f1aa5e4d1942bc8012bf3f0e695008979ff4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Aug 2022 18:44:27 +0000 Subject: net: bql: add more documentation Add some documentation for netdev_tx_sent_queue() and netdev_tx_completed_queue() Stating that netdev_tx_completed_queue() must be called once per TX completion round is apparently not obvious for everybody. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eec35f9b6616..cfe41c053e11 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3353,6 +3353,16 @@ static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_qu #endif } +/** + * netdev_tx_sent_queue - report the number of bytes queued to a given tx queue + * @dev_queue: network device queue + * @bytes: number of bytes queued to the device queue + * + * Report the number of bytes queued for sending/completion to the network + * device hardware queue. @bytes should be a good approximation and should + * exactly match netdev_completed_queue() @bytes. + * This is typically called once per packet, from ndo_start_xmit(). + */ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes) { @@ -3398,13 +3408,14 @@ static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue, } /** - * netdev_sent_queue - report the number of bytes queued to hardware - * @dev: network device - * @bytes: number of bytes queued to the hardware device queue + * netdev_sent_queue - report the number of bytes queued to hardware + * @dev: network device + * @bytes: number of bytes queued to the hardware device queue * - * Report the number of bytes queued for sending/completion to the network - * device hardware queue. @bytes should be a good approximation and should - * exactly match netdev_completed_queue() @bytes + * Report the number of bytes queued for sending/completion to the network + * device hardware queue#0. @bytes should be a good approximation and should + * exactly match netdev_completed_queue() @bytes. + * This is typically called once per packet, from ndo_start_xmit(). */ static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes) { @@ -3419,6 +3430,15 @@ static inline bool __netdev_sent_queue(struct net_device *dev, xmit_more); } +/** + * netdev_tx_completed_queue - report number of packets/bytes at TX completion. + * @dev_queue: network device queue + * @pkts: number of packets (currently ignored) + * @bytes: number of bytes dequeued from the device queue + * + * Must be called at most once per TX completion round (and not per + * individual packet), so that BQL can adjust its limits appropriately. + */ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, unsigned int pkts, unsigned int bytes) { -- cgit v1.2.3 From c3f760ef128789252e7c4f10d3c1721422dceba9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 31 Aug 2022 17:00:58 -0700 Subject: net: remove netif_tx_napi_add() All callers are now gone. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cfe41c053e11..f0068c1ff1df 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2569,8 +2569,6 @@ netif_napi_add_tx_weight(struct net_device *dev, netif_napi_add_weight(dev, napi, poll, weight); } -#define netif_tx_napi_add netif_napi_add_tx_weight - /** * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only * @dev: network device -- cgit v1.2.3 From dc84dbbcc97bfb47e0f2b175d816e601b2890c91 Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Wed, 31 Aug 2022 11:19:06 +0800 Subject: bpf, tnums: Warn against the usage of tnum_in(tnum_range(), ...) Commit a657182a5c51 ("bpf: Don't use tnum_range on array range checking for poke descriptors") has shown that using tnum_range() as argument to tnum_in() can lead to misleading code that looks like tight bound check when in fact the actual allowed range is much wider. Document such behavior to warn against its usage in general, and suggest some scenario where result can be trusted. Signed-off-by: Shung-Hsi Yu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/984b37f9fdf7ac36831d2137415a4a915744c1b6.1661462653.git.daniel@iogearbox.net Link: https://www.openwall.com/lists/oss-security/2022/08/26/1 Link: https://lore.kernel.org/bpf/20220831031907.16133-3-shung-hsi.yu@suse.com Link: https://lore.kernel.org/bpf/20220831031907.16133-2-shung-hsi.yu@suse.com --- include/linux/tnum.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 498dbcedb451..1c3948a1d6ad 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -21,7 +21,12 @@ struct tnum { struct tnum tnum_const(u64 value); /* A completely unknown value */ extern const struct tnum tnum_unknown; -/* A value that's unknown except that @min <= value <= @max */ +/* An unknown value that is a superset of @min <= value <= @max. + * + * Could include values outside the range of [@min, @max]. + * For example tnum_range(0, 2) is represented by {0, 1, 2, *3*}, + * rather than the intended set of {0, 1, 2}. + */ struct tnum tnum_range(u64 min, u64 max); /* Arithmetic and logical ops */ @@ -73,7 +78,18 @@ static inline bool tnum_is_unknown(struct tnum a) */ bool tnum_is_aligned(struct tnum a, u64 size); -/* Returns true if @b represents a subset of @a. */ +/* Returns true if @b represents a subset of @a. + * + * Note that using tnum_range() as @a requires extra cautions as tnum_in() may + * return true unexpectedly due to tnum limited ability to represent tight + * range, e.g. + * + * tnum_in(tnum_range(0, 2), tnum_const(3)) == true + * + * As a rule of thumb, if @a is explicitly coded rather than coming from + * reg->var_off, it should be in form of tnum_const(), tnum_range(0, 2**n - 1), + * or tnum_range(2**n, 2**(n+1) - 1). + */ bool tnum_in(struct tnum a, struct tnum b); /* Formatting functions. These have snprintf-like semantics: they will write -- cgit v1.2.3 From 4ff09db1b79b98b4a2a7511571c640b76cab3beb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:02 -0700 Subject: bpf: net: Change sk_getsockopt() to take the sockptr_t argument This patch changes sk_getsockopt() to take the sockptr_t argument such that it can be used by bpf_getsockopt(SOL_SOCKET) in a latter patch. security_socket_getpeersec_stream() is not changed. It stays with the __user ptr (optval.user and optlen.user) to avoid changes to other security hooks. bpf_getsockopt(SOL_SOCKET) also does not support SO_PEERSEC. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002802.2888419-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +-- include/linux/sockptr.h | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index a5f21dc3c432..527ae1d64e27 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -900,8 +900,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); -int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, - unsigned int len); +int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index d45902fb4cad..bae5e2369b4f 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -64,6 +64,11 @@ static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, return 0; } +static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size) +{ + return copy_to_sockptr_offset(dst, 0, src, size); +} + static inline void *memdup_sockptr(sockptr_t src, size_t len) { void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); -- cgit v1.2.3 From 728f064cd7ebea8c182e99e6f152c8b4a0a6b071 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:28 -0700 Subject: bpf: net: Change do_ip_getsockopt() to take the sockptr_t argument Similar to the earlier patch that changes sk_getsockopt() to take the sockptr_t argument. This patch also changes do_ip_getsockopt() to take the sockptr_t argument such that a latter patch can make bpf_getsockopt(SOL_IP) to reuse do_ip_getsockopt(). Note on the change in ip_mc_gsfget(). This function is to return an array of sockaddr_storage in optval. This function is shared between ip_get_mcast_msfilter() and compat_ip_get_mcast_msfilter(). However, the sockaddr_storage is stored at different offset of the optval because of the difference between group_filter and compat_group_filter. Thus, a new 'ss_offset' argument is added to ip_mc_gsfget(). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002828.2890585-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/igmp.h | 4 ++-- include/linux/mroute.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 93c262ecbdc9..78890143f079 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -118,9 +118,9 @@ extern int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex); extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, - struct ip_msfilter __user *optval, int __user *optlen); + sockptr_t optval, sockptr_t optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p); + sockptr_t optval, size_t offset); extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 6cbbfe94348c..80b8400ab8b2 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -17,7 +17,7 @@ static inline int ip_mroute_opt(int opt) } int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); -int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); @@ -29,8 +29,8 @@ static inline int ip_mroute_setsockopt(struct sock *sock, int optname, return -ENOPROTOOPT; } -static inline int ip_mroute_getsockopt(struct sock *sock, int optname, - char __user *optval, int __user *optlen) +static inline int ip_mroute_getsockopt(struct sock *sk, int optname, + sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } -- cgit v1.2.3 From 6dadbe4bac68309eb46ab0f30e8ff47a789df49a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:53 -0700 Subject: bpf: net: Change do_ipv6_getsockopt() to take the sockptr_t argument Similar to the earlier patch that changes sk_getsockopt() to take the sockptr_t argument . This patch also changes do_ipv6_getsockopt() to take the sockptr_t argument such that a latter patch can make bpf_getsockopt(SOL_IPV6) to reuse do_ipv6_getsockopt(). Note on the change in ip6_mc_msfget(). This function is to return an array of sockaddr_storage in optval. This function is shared between ipv6_get_msfilter() and compat_ipv6_get_msfilter(). However, the sockaddr_storage is stored at different offset of the optval because of the difference between group_filter and compat_group_filter. Thus, a new 'ss_offset' argument is added to ip6_mc_msfget(). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002853.2892532-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/mroute6.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index bc351a85ce9b..8f2b307fb124 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -27,7 +27,7 @@ struct sock; #ifdef CONFIG_IPV6_MROUTE extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); -extern int ip6_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); extern int ip6_mr_input(struct sk_buff *skb); extern int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg); extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); @@ -42,7 +42,7 @@ static inline int ip6_mroute_setsockopt(struct sock *sock, int optname, static inline int ip6_mroute_getsockopt(struct sock *sock, - int optname, char __user *optval, int __user *optlen) + int optname, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } -- cgit v1.2.3 From 4a502cf4d77e12119e7061a05d5789cd3129d185 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 2 Sep 2022 10:32:03 +0200 Subject: net: pcs: add new PCS driver for altera TSE PCS The Altera Triple Speed Ethernet has a SGMII/1000BaseC PCS that can be integrated in several ways. It can either be part of the TSE MAC's address space, accessed through 32 bits accesses on the mapped mdio device 0, or through a dedicated 16 bits register set. This driver allows using the TSE PCS outside of altera TSE's driver, since it can be used standalone by other MACs. Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- include/linux/pcs-altera-tse.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/linux/pcs-altera-tse.h (limited to 'include/linux') diff --git a/include/linux/pcs-altera-tse.h b/include/linux/pcs-altera-tse.h new file mode 100644 index 000000000000..92ab9f08e835 --- /dev/null +++ b/include/linux/pcs-altera-tse.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022 Bootlin + * + * Maxime Chevallier + */ + +#ifndef __LINUX_PCS_ALTERA_TSE_H +#define __LINUX_PCS_ALTERA_TSE_H + +struct phylink_pcs; +struct net_device; + +struct phylink_pcs *alt_tse_pcs_create(struct net_device *ndev, + void __iomem *pcs_base, int reg_width); + +#endif /* __LINUX_PCS_ALTERA_TSE_H */ -- cgit v1.2.3 From 8a2dd123f12f69e5373d3103da2c97fc36223e0c Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Mon, 29 Aug 2022 12:06:04 +0300 Subject: RDMA/mlx5: Move function mlx5_core_query_ib_ppcnt() to mlx5_ib This patch doesn't change any functionality, but move one function to mlx5_ib because it is not used by mlx5_core. The actual fix is in the next patch. Reviewed-by: Roi Dayan Signed-off-by: Chris Mi Link: https://lore.kernel.org/r/fd47b9138412bd94ed30f838026cbb4cf3878150.1661763871.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 76d7661e3e63..432bd202e2fe 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1079,8 +1079,6 @@ int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); -int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, - u8 port_num, void *out, size_t sz); int mlx5_init_rl_table(struct mlx5_core_dev *dev); void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 05ad5d4581c3c1cc724fe50d4652833fb9f3037b Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 2 Sep 2022 18:02:39 -0400 Subject: net: phy: Add 1000BASE-KX interface mode Add 1000BASE-KX interface mode. This 1G backplane ethernet as described in clause 70. Clause 73 autonegotiation is mandatory, and only full duplex operation is supported. Although at the PMA level this interface mode is identical to 1000BASE-X, it uses a different form of in-band autonegation. This justifies a separate interface mode, since the interface mode (along with the MLO_AN_* autonegotiation mode) sets the type of autonegotiation which will be used on a link. This results in more than just electrical differences between the link modes. With regard to 1000BASE-X, 1000BASE-KX holds a similar position to SGMII: same signaling, but different autonegotiation. PCS drivers (which typically handle in-band autonegotiation) may only support 1000BASE-X, and not 1000BASE-KX. Similarly, the phy mode is used to configure serdes phys with phy_set_mode_ext. Due to the different electrical standards (SFI or XFI vs Clause 70), they will likely want to use different configuration. Adding a phy interface mode for 1000BASE-KX helps simplify configuration in these areas. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7c49ab95441b..337230c135f7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -116,6 +116,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII + * @PHY_INTERFACE_MODE_1000BASEKX: 1000Base-KX - with Clause 73 AN * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. @@ -154,6 +155,7 @@ typedef enum { /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, PHY_INTERFACE_MODE_QUSGMII, + PHY_INTERFACE_MODE_1000BASEKX, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -251,6 +253,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "trgmii"; case PHY_INTERFACE_MODE_1000BASEX: return "1000base-x"; + case PHY_INTERFACE_MODE_1000BASEKX: + return "1000base-kx"; case PHY_INTERFACE_MODE_2500BASEX: return "2500base-x"; case PHY_INTERFACE_MODE_5GBASER: -- cgit v1.2.3 From 7c8199e24fa09d2344ae0204527d55d7803e8409 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:43 -0700 Subject: bpf: Introduce any context BPF specific memory allocator. Tracing BPF programs can attach to kprobe and fentry. Hence they run in unknown context where calling plain kmalloc() might not be safe. Front-end kmalloc() with minimal per-cpu cache of free elements. Refill this cache asynchronously from irq_work. BPF programs always run with migration disabled. It's safe to allocate from cache of the current cpu with irqs disabled. Free-ing is always done into bucket of the current cpu as well. irq_work trims extra free elements from buckets with kfree and refills them with kmalloc, so global kmalloc logic takes care of freeing objects allocated by one cpu and freed on another. struct bpf_mem_alloc supports two modes: - When size != 0 create kmem_cache and bpf_mem_cache for each cpu. This is typical bpf hash map use case when all elements have equal size. - When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on kmalloc/kfree. Max allocation size is 4096 in this case. This is bpf_dynptr and bpf_kptr use case. bpf_mem_alloc/bpf_mem_free are bpf specific 'wrappers' of kmalloc/kfree. bpf_mem_cache_alloc/bpf_mem_cache_free are 'wrappers' of kmem_cache_alloc/kmem_cache_free. The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-2-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/linux/bpf_mem_alloc.h (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h new file mode 100644 index 000000000000..804733070f8d --- /dev/null +++ b/include/linux/bpf_mem_alloc.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#ifndef _BPF_MEM_ALLOC_H +#define _BPF_MEM_ALLOC_H +#include + +struct bpf_mem_cache; +struct bpf_mem_caches; + +struct bpf_mem_alloc { + struct bpf_mem_caches __percpu *caches; + struct bpf_mem_cache __percpu *cache; +}; + +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +void bpf_mem_alloc_destroy(struct bpf_mem_alloc