From 537cf4e3cc2f6cc9088dcd6162de573f603adc29 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 20 Nov 2020 12:53:39 +0100 Subject: xsk: Fix umem cleanup bug at socket destruct Fix a bug that is triggered when a partially setup socket is destroyed. For a fully setup socket, a socket that has been bound to a device, the cleanup of the umem is performed at the end of the buffer pool's cleanup work queue item. This has to be performed in a work queue, and not in RCU cleanup, as it is doing a vunmap that cannot execute in interrupt context. However, when a socket has only been partially set up so that a umem has been created but the buffer pool has not, the code erroneously directly calls the umem cleanup function instead of using a work queue, and this leads to a BUG_ON() in vunmap(). As there in this case is no buffer pool, we cannot use its work queue, so we need to introduce a work queue for the umem and schedule this for the cleanup. So in the case there is no pool, we are going to use the umem's own work queue to schedule the cleanup. But if there is a pool, the cleanup of the umem is still being performed by the pool's work queue, as it is important that the umem is cleaned up after the pool. Fixes: e5e1a4bc916d ("xsk: Fix possible memory leak at socket close") Reported-by: Marek Majtyka Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Tested-by: Marek Majtyka Link: https://lore.kernel.org/bpf/1605873219-21629-1-git-send-email-magnus.karlsson@gmail.com --- include/net/xdp_sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 1a9559c0cbdd..4f4e93bf814c 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -31,6 +31,7 @@ struct xdp_umem { struct page **pgs; int id; struct list_head xsk_dma_list; + struct work_struct work; }; struct xsk_map { -- cgit v1.2.3 From 36ccdf85829a7dd6936dba5d02fa50138471f0d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 23 Nov 2020 18:56:00 +0100 Subject: net, xsk: Avoid taking multiple skbuff references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 642e450b6b59 ("xsk: Do not discard packet when NETDEV_TX_BUSY") addressed the problem that packets were discarded from the Tx AF_XDP ring, when the driver returned NETDEV_TX_BUSY. Part of the fix was bumping the skbuff reference count, so that the buffer would not be freed by dev_direct_xmit(). A reference count larger than one means that the skbuff is "shared", which is not the case. If the "shared" skbuff is sent to the generic XDP receive path, netif_receive_generic_xdp(), and pskb_expand_head() is entered the BUG_ON(skb_shared(skb)) will trigger. This patch adds a variant to dev_direct_xmit(), __dev_direct_xmit(), where a user can select the skbuff free policy. This allows AF_XDP to avoid bumping the reference count, but still keep the NETDEV_TX_BUSY behavior. Fixes: 642e450b6b59 ("xsk: Do not discard packet when NETDEV_TX_BUSY") Reported-by: Yonghong Song Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201123175600.146255-1-bjorn.topel@gmail.com --- include/linux/netdevice.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 964b494b0e8d..76775abf259d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2813,9 +2813,21 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); + int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); -int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); + +static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +{ + int ret; + + ret = __dev_direct_xmit(skb, queue_id); + if (!dev_xmit_complete(ret)) + kfree_skb(skb); + return ret; +} + int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); -- cgit v1.2.3 From 3c78e9e0d33a27ab8050e4492c03c6a1f8d0ed6b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Nov 2020 23:50:07 +0100 Subject: netfilter: nftables_offload: set address type in control dissector This patch adds nft_flow_rule_set_addr_type() to set the address type from the nft_payload expression accordingly. If the address type is not set in the control dissector then a rule that matches either on source or destination IP address does not work. After this patch, nft hardware offload generates the flow dissector configuration as tc-flower does to match on an IP address. This patch has been also tested functionally to make sure packets are filtered out by the NIC. This is also getting the code aligned with the existing netfilter flow offload infrastructure which is also setting the control dissector. Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_offload.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index ea7d1d78b92d..bddd34c5bd79 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -37,6 +37,7 @@ void nft_offload_update_dependency(struct nft_offload_ctx *ctx, struct nft_flow_key { struct flow_dissector_key_basic basic; + struct flow_dissector_key_control control; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; @@ -62,6 +63,9 @@ struct nft_flow_rule { #define NFT_OFFLOAD_F_ACTION (1 << 0) +void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, + enum flow_dissector_key_id addr_type); + struct nft_rule; struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule); void nft_flow_rule_destroy(struct nft_flow_rule *flow); -- cgit v1.2.3 From a5d45bc0dc50f9dd83703510e9804d813a9cac32 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Nov 2020 23:50:17 +0100 Subject: netfilter: nftables_offload: build mask based from the matching bytes Userspace might match on prefix bytes of header fields if they are on the byte boundary, this requires that the mask is adjusted accordingly. Use NFT_OFFLOAD_MATCH_EXACT() for meta since prefix byte matching is not allowed for this type of selector. The bitwise expression might be optimized out by userspace, hence the kernel needs to infer the prefix from the number of payload bytes to match on. This patch adds nft_payload_offload_mask() to calculate the bitmask to match on the prefix. Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_offload.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index bddd34c5bd79..1d34fe154fe0 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -78,6 +78,9 @@ int nft_flow_rule_offload_commit(struct net *net); offsetof(struct nft_flow_key, __base.__field); \ (__reg)->len = __len; \ (__reg)->key = __key; \ + +#define NFT_OFFLOAD_MATCH_EXACT(__key, __base, __field, __len, __reg) \ + NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg) \ memset(&(__reg)->mask, 0xff, (__reg)->len); int nft_chain_offload_priority(struct nft_base_chain *basechain); -- cgit v1.2.3 From 2867e1eac61016f59b3d730e3f7aa488e186e917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 30 Nov 2020 19:37:05 +0100 Subject: inet_ecn: Fix endianness of checksum update when setting ECT(1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When adding support for propagating ECT(1) marking in IP headers it seems I suffered from endianness-confusion in the checksum update calculation: In fact the ECN field is in the *lower* bits of the first 16-bit word of the IP header when calculating in network byte order. This means that the addition performed to update the checksum field was wrong; let's fix that. Fixes: b723748750ec ("tunnel: Propagate ECT(1) when decapsulating as recommended by RFC6040") Reported-by: Jonathan Morton Tested-by: Pete Heist Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20201130183705.17540-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- include/net/inet_ecn.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index e1eaf1780288..563457fec557 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -107,7 +107,7 @@ static inline int IP_ECN_set_ect1(struct iphdr *iph) if ((iph->tos & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; - check += (__force u16)htons(0x100); + check += (__force u16)htons(0x1); iph->check = (__force __sum16)(check + (check>=0xFFFF)); iph->tos ^= INET_ECN_MASK; -- cgit v1.2.3 From d421e466c2373095f165ddd25cbabd6c5b077928 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Wed, 2 Dec 2020 20:39:46 -0800 Subject: net/mlx5: DR, Proper handling of unsupported Connect-X6DX SW steering STEs format for Connect-X5 and Connect-X6DX different. Currently, on Connext-X6DX the SW steering would break at some point when building STEs w/o giving a proper error message. Fix this by checking the STE format of the current device when initializing domain: add mlx5_ifc definitions for Connect-X6DX SW steering, read FW capability to get the current format version, and check this version when domain is being created. Fixes: 26d688e33f88 ("net/mlx5: DR, Add Steering entry (STE) utilities") Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a092346c7b2d..233352447b1a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1223,6 +1223,11 @@ enum mlx5_fc_bulk_alloc_bitmask { #define MLX5_FC_BULK_NUM_FCS(fc_enum) (MLX5_FC_BULK_SIZE_FACTOR * (fc_enum)) +enum { + MLX5_STEERING_FORMAT_CONNECTX_5 = 0, + MLX5_STEERING_FORMAT_CONNECTX_6DX = 1, +}; + struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x30]; u8 vhca_id[0x10]; @@ -1521,7 +1526,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 general_obj_types[0x40]; - u8 reserved_at_440[0x20]; + u8 reserved_at_440[0x4]; + u8 steering_format_version[0x4]; + u8 create_qp_start_hint[0x18]; u8 reserved_at_460[0x3]; u8 log_max_uctx[0x5]; -- cgit v1.2.3