From 94623f579ce338b5fa61b5acaa5beb8aa657fb9e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 3 Apr 2023 13:54:37 +0200
Subject: netfilter: br_netfilter: fix recent physdev match breakage

Recent attempt to ensure PREROUTING hook is executed again when a
decrypted ipsec packet received on a bridge passes through the network
stack a second time broke the physdev match in INPUT hook.

We can't discard the nf_bridge info strct from sabotage_in hook, as
this is needed by the physdev match.

Keep the struct around and handle this with another conditional instead.

Fixes: 2b272bb558f1 ("netfilter: br_netfilter: disable sabotage_in hook after first suppression")
Reported-and-tested-by: Farid BENAMROUCHE <fariouche@yahoo.fr>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h          |  1 +
 net/bridge/br_netfilter_hooks.c | 17 +++++++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ff7ad331fb82..1ec3530c8191 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -294,6 +294,7 @@ struct nf_bridge_info {
 	u8			pkt_otherhost:1;
 	u8			in_prerouting:1;
 	u8			bridged_dnat:1;
+	u8			sabotage_in_done:1;
 	__u16			frag_max_size;
 	struct net_device	*physindev;
 
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 638a4d5359db..4bc6761517bb 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -868,12 +868,17 @@ static unsigned int ip_sabotage_in(void *priv,
 {
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
-	if (nf_bridge && !nf_bridge->in_prerouting &&
-	    !netif_is_l3_master(skb->dev) &&
-	    !netif_is_l3_slave(skb->dev)) {
-		nf_bridge_info_free(skb);
-		state->okfn(state->net, state->sk, skb);
-		return NF_STOLEN;
+	if (nf_bridge) {
+		if (nf_bridge->sabotage_in_done)
+			return NF_ACCEPT;
+
+		if (!nf_bridge->in_prerouting &&
+		    !netif_is_l3_master(skb->dev) &&
+		    !netif_is_l3_slave(skb->dev)) {
+			nf_bridge->sabotage_in_done = 1;
+			state->okfn(state->net, state->sk, skb);
+			return NF_STOLEN;
+		}
 	}
 
 	return NF_ACCEPT;
-- 
cgit v1.2.3


From af0acf22aea359e04412237d68787401f96bb583 Mon Sep 17 00:00:00 2001
From: Chen Aotian <chenaotian2@163.com>
Date: Thu, 6 Apr 2023 12:01:51 +0800
Subject: netfilter: nf_tables: Modify nla_memdup's flag to GFP_KERNEL_ACCOUNT

For memory alloc that store user data from nla[NFTA_OBJ_USERDATA],
use GFP_KERNEL_ACCOUNT is more suitable.

Fixes: 33758c891479 ("memcg: enable accounting for nft objects")
Signed-off-by: Chen Aotian <chenaotian2@163.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6004d4b24451..cd52109e674a 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -7052,7 +7052,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
 	}
 
 	if (nla[NFTA_OBJ_USERDATA]) {
-		obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL);
+		obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT);
 		if (obj->udata == NULL)
 			goto err_userdata;
 
-- 
cgit v1.2.3


From 3037933448f60f9acb705997eae62013ecb81e0d Mon Sep 17 00:00:00 2001
From: Gwangun Jung <exsociety@gmail.com>
Date: Thu, 13 Apr 2023 19:35:54 +0900
Subject: net: sched: sch_qfq: prevent slab-out-of-bounds in qfq_activate_agg

If the TCA_QFQ_LMAX value is not offered through nlattr, lmax is determined by the MTU value of the network device.
The MTU of the loopback device can be set up to 2^31-1.
As a result, it is possible to have an lmax value that exceeds QFQ_MIN_LMAX.

Due to the invalid lmax value, an index is generated that exceeds the QFQ_MAX_INDEX(=24) value, causing out-of-bounds read/write errors.

The following reports a oob access:

[   84.582666] BUG: KASAN: slab-out-of-bounds in qfq_activate_agg.constprop.0 (net/sched/sch_qfq.c:1027 net/sched/sch_qfq.c:1060 net/sched/sch_qfq.c:1313)
[   84.583267] Read of size 4 at addr ffff88810f676948 by task ping/301
[   84.583686]
[   84.583797] CPU: 3 PID: 301 Comm: ping Not tainted 6.3.0-rc5 #1
[   84.584164] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
[   84.584644] Call Trace:
[   84.584787]  <TASK>
[   84.584906] dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1))
[   84.585108] print_report (mm/kasan/report.c:320 mm/kasan/report.c:430)
[   84.585570] kasan_report (mm/kasan/report.c:538)
[   84.585988] qfq_activate_agg.constprop.0 (net/sched/sch_qfq.c:1027 net/sched/sch_qfq.c:1060 net/sched/sch_qfq.c:1313)
[   84.586599] qfq_enqueue (net/sched/sch_qfq.c:1255)
[   84.587607] dev_qdisc_enqueue (net/core/dev.c:3776)
[   84.587749] __dev_queue_xmit (./include/net/sch_generic.h:186 net/core/dev.c:3865 net/core/dev.c:4212)
[   84.588763] ip_finish_output2 (./include/net/neighbour.h:546 net/ipv4/ip_output.c:228)
[   84.589460] ip_output (net/ipv4/ip_output.c:430)
[   84.590132] ip_push_pending_frames (./include/net/dst.h:444 net/ipv4/ip_output.c:126 net/ipv4/ip_output.c:1586 net/ipv4/ip_output.c:1606)
[   84.590285] raw_sendmsg (net/ipv4/raw.c:649)
[   84.591960] sock_sendmsg (net/socket.c:724 net/socket.c:747)
[   84.592084] __sys_sendto (net/socket.c:2142)
[   84.593306] __x64_sys_sendto (net/socket.c:2150)
[   84.593779] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[   84.593902] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[   84.594070] RIP: 0033:0x7fe568032066
[   84.594192] Code: 0e 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 41 89 ca 64 8b 04 25 18 00 00 00 85 c09[ 84.594796] RSP: 002b:00007ffce388b4e8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c

Code starting with the faulting instruction
===========================================
[   84.595047] RAX: ffffffffffffffda RBX: 00007ffce388cc70 RCX: 00007fe568032066
[   84.595281] RDX: 0000000000000040 RSI: 00005605fdad6d10 RDI: 0000000000000003
[   84.595515] RBP: 00005605fdad6d10 R08: 00007ffce388eeec R09: 0000000000000010
[   84.595749] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000040
[   84.595984] R13: 00007ffce388cc30 R14: 00007ffce388b4f0 R15: 0000001d00000001
[   84.596218]  </TASK>
[   84.596295]
[   84.596351] Allocated by task 291:
[   84.596467] kasan_save_stack (mm/kasan/common.c:46)
[   84.596597] kasan_set_track (mm/kasan/common.c:52)
[   84.596725] __kasan_kmalloc (mm/kasan/common.c:384)
[   84.596852] __kmalloc_node (./include/linux/kasan.h:196 mm/slab_common.c:967 mm/slab_common.c:974)
[   84.596979] qdisc_alloc (./include/linux/slab.h:610 ./include/linux/slab.h:731 net/sched/sch_generic.c:938)
[   84.597100] qdisc_create (net/sched/sch_api.c:1244)
[   84.597222] tc_modify_qdisc (net/sched/sch_api.c:1680)
[   84.597357] rtnetlink_rcv_msg (net/core/rtnetlink.c:6174)
[   84.597495] netlink_rcv_skb (net/netlink/af_netlink.c:2574)
[   84.597627] netlink_unicast (net/netlink/af_netlink.c:1340 net/netlink/af_netlink.c:1365)
[   84.597759] netlink_sendmsg (net/netlink/af_netlink.c:1942)
[   84.597891] sock_sendmsg (net/socket.c:724 net/socket.c:747)
[   84.598016] ____sys_sendmsg (net/socket.c:2501)
[   84.598147] ___sys_sendmsg (net/socket.c:2557)
[   84.598275] __sys_sendmsg (./include/linux/file.h:31 net/socket.c:2586)
[   84.598399] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
[   84.598520] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[   84.598688]
[   84.598744] The buggy address belongs to the object at ffff88810f674000
[   84.598744]  which belongs to the cache kmalloc-8k of size 8192
[   84.599135] The buggy address is located 2664 bytes to the right of
[   84.599135]  allocated 7904-byte region [ffff88810f674000, ffff88810f675ee0)
[   84.599544]
[   84.599598] The buggy address belongs to the physical page:
[   84.599777] page:00000000e638567f refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10f670
[   84.600074] head:00000000e638567f order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0
[   84.600330] flags: 0x200000000010200(slab|head|node=0|zone=2)
[   84.600517] raw: 0200000000010200 ffff888100043180 dead000000000122 0000000000000000
[   84.600764] raw: 0000000000000000 0000000080020002 00000001ffffffff 0000000000000000
[   84.601009] page dumped because: kasan: bad access detected
[   84.601187]
[   84.601241] Memory state around the buggy address:
[   84.601396]  ffff88810f676800: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   84.601620]  ffff88810f676880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   84.601845] >ffff88810f676900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   84.602069]                                               ^
[   84.602243]  ffff88810f676980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   84.602468]  ffff88810f676a00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   84.602693] ==================================================================
[   84.602924] Disabling lock debugging due to kernel taint

Fixes: 3015f3d2a3cd ("pkt_sched: enable QFQ to support TSO/GSO")
Reported-by: Gwangun Jung <exsociety@gmail.com>
Signed-off-by: Gwangun Jung <exsociety@gmail.com>
Acked-by: Jamal Hadi Salim<jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_qfq.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index cf5ebe43b3b4..02098a02943e 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -421,15 +421,16 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	} else
 		weight = 1;
 
-	if (tb[TCA_QFQ_LMAX]) {
+	if (tb[TCA_QFQ_LMAX])
 		lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
-		if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
-			pr_notice("qfq: invalid max length %u\n", lmax);
-			return -EINVAL;
-		}
-	} else
+	else
 		lmax = psched_mtu(qdisc_dev(sch));
 
+	if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
+		pr_notice("qfq: invalid max length %u\n", lmax);
+		return -EINVAL;
+	}
+
 	inv_w = ONE_FP / weight;
 	weight = ONE_FP / inv_w;
 
-- 
cgit v1.2.3


From c730fce7c70cfce831f4bdc9e49880ba1f61a092 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 14 Apr 2023 17:47:55 +0200
Subject: s390/bpf: Fix bpf_arch_text_poke() with new_addr == NULL

Thomas Richter reported a crash in linux-next with a backtrace similar
to the following one:

	 [<0000000000000000>] 0x0
	([<000000000031a182>] bpf_trace_run4+0xc2/0x218)
	 [<00000000001d59f4>] __bpf_trace_sched_switch+0x1c/0x28
	 [<0000000000c44a3a>] __schedule+0x43a/0x890
	 [<0000000000c44ef8>] schedule+0x68/0x110
	 [<0000000000c4e5ca>] do_nanosleep+0xa2/0x168
	 [<000000000026e7fe>] hrtimer_nanosleep+0xf6/0x1c0
	 [<000000000026eb6e>] __s390x_sys_nanosleep+0xb6/0xf0
	 [<0000000000c3b81c>] __do_syscall+0x1e4/0x208
	 [<0000000000c50510>] system_call+0x70/0x98
	Last Breaking-Event-Address:
	 [<000003ff7fda1814>] bpf_prog_65e887c70a835bbf_on_switch+0x1a4/0x1f0

The problem is that bpf_arch_text_poke() with new_addr == NULL is
susceptible to the following race condition:

	T1                 T2
        -----------------  -------------------
	plt.target = NULL
	                   entry: brcl 0xf,plt
	entry.mask = 0
	                   lgrl %r1,plt.target
	                   br %r1

Fix by setting PLT target to the instruction following `brcl 0xf,plt`
instead of 0. This way T2 will simply resume the execution of the eBPF
program, which is the desired effect of passing new_addr == NULL.

Fixes: f1d5df84cd8c ("s390/bpf: Implement bpf_arch_text_poke()")
Reported-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Link: https://lore.kernel.org/bpf/20230414154755.184502-1-iii@linux.ibm.com
---
 arch/s390/net/bpf_jit_comp.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index d0846ba818ee..6b1876e4ad3f 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -539,7 +539,7 @@ static void bpf_jit_plt(void *plt, void *ret, void *target)
 {
 	memcpy(plt, bpf_plt, BPF_PLT_SIZE);
 	*(void **)((char *)plt + (bpf_plt_ret - bpf_plt)) = ret;
-	*(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target;
+	*(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target ?: ret;
 }
 
 /*
@@ -2010,7 +2010,9 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 	} __packed insn;
 	char expected_plt[BPF_PLT_SIZE];
 	char current_plt[BPF_PLT_SIZE];
+	char new_plt[BPF_PLT_SIZE];
 	char *plt;
+	char *ret;
 	int err;
 
 	/* Verify the branch to be patched. */
@@ -2032,12 +2034,15 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		err = copy_from_kernel_nofault(current_plt, plt, BPF_PLT_SIZE);
 		if (err < 0)
 			return err;
-		bpf_jit_plt(expected_plt, (char *)ip + 6, old_addr);
+		ret = (char *)ip + 6;
+		bpf_jit_plt(expected_plt, ret, old_addr);
 		if (memcmp(current_plt, expected_plt, BPF_PLT_SIZE))
 			return -EINVAL;
 		/* Adjust the call address. */
+		bpf_jit_plt(new_plt, ret, new_addr);
 		s390_kernel_write(plt + (bpf_plt_target - bpf_plt),
-				  &new_addr, sizeof(void *));
+				  new_plt + (bpf_plt_target - bpf_plt),
+				  sizeof(void *));
 	}
 
 	/* Adjust the mask of the branch. */
-- 
cgit v1.2.3


From 853618d5886bf94812f31228091cd37d308230f7 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Fri, 14 Apr 2023 14:08:35 +0800
Subject: virtio_net: bugfix overflow inside xdp_linearize_page()

Here we copy the data from the original buf to the new page. But we
not check that it may be overflow.

As long as the size received(including vnethdr) is greater than 3840
(PAGE_SIZE -VIRTIO_XDP_HEADROOM). Then the memcpy will overflow.

And this is completely possible, as long as the MTU is large, such
as 4096. In our test environment, this will cause crash. Since crash is
caused by the written memory, it is meaningless, so I do not include it.

Fixes: 72979a6c3590 ("virtio_net: xdp, add slowpath case for non contiguous buffers")
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 2396c28c0122..ea1bd4bb326d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -814,8 +814,13 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 				       int page_off,
 				       unsigned int *len)
 {
-	struct page *page = alloc_page(GFP_ATOMIC);
+	int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	struct page *page;
+
+	if (page_off + *len + tailroom > PAGE_SIZE)
+		return NULL;
 
+	page = alloc_page(GFP_ATOMIC);
 	if (!page)
 		return NULL;
 
@@ -823,7 +828,6 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 	page_off += *len;
 
 	while (--*num_buf) {
-		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 		unsigned int buflen;
 		void *buf;
 		int off;
-- 
cgit v1.2.3


From a80bb8e7233b2ad6ff119646b6e33fb3edcec37b Mon Sep 17 00:00:00 2001
From: Ding Hui <dinghui@sangfor.com.cn>
Date: Fri, 14 Apr 2023 23:23:06 +0800
Subject: sfc: Fix use-after-free due to selftest_work

There is a use-after-free scenario that is:

When the NIC is down, user set mac address or vlan tag to VF,
the xxx_set_vf_mac() or xxx_set_vf_vlan() will invoke efx_net_stop()
and efx_net_open(), since netif_running() is false, the port will not
start and keep port_enabled false, but selftest_work is scheduled
in efx_net_open().

If we remove the device before selftest_work run, the efx_stop_port()
will not be called since the NIC is down, and then efx is freed,
we will soon get a UAF in run_timer_softirq() like this:

[ 1178.907941] ==================================================================
[ 1178.907948] BUG: KASAN: use-after-free in run_timer_softirq+0xdea/0xe90
[ 1178.907950] Write of size 8 at addr ff11001f449cdc80 by task swapper/47/0
[ 1178.907950]
[ 1178.907953] CPU: 47 PID: 0 Comm: swapper/47 Kdump: loaded Tainted: G           O     --------- -t - 4.18.0 #1
[ 1178.907954] Hardware name: SANGFOR X620G40/WI2HG-208T1061A, BIOS SPYH051032-U01 04/01/2022
[ 1178.907955] Call Trace:
[ 1178.907956]  <IRQ>
[ 1178.907960]  dump_stack+0x71/0xab
[ 1178.907963]  print_address_description+0x6b/0x290
[ 1178.907965]  ? run_timer_softirq+0xdea/0xe90
[ 1178.907967]  kasan_report+0x14a/0x2b0
[ 1178.907968]  run_timer_softirq+0xdea/0xe90
[ 1178.907971]  ? init_timer_key+0x170/0x170
[ 1178.907973]  ? hrtimer_cancel+0x20/0x20
[ 1178.907976]  ? sched_clock+0x5/0x10
[ 1178.907978]  ? sched_clock_cpu+0x18/0x170
[ 1178.907981]  __do_softirq+0x1c8/0x5fa
[ 1178.907985]  irq_exit+0x213/0x240
[ 1178.907987]  smp_apic_timer_interrupt+0xd0/0x330
[ 1178.907989]  apic_timer_interrupt+0xf/0x20
[ 1178.907990]  </IRQ>
[ 1178.907991] RIP: 0010:mwait_idle+0xae/0x370

If the NIC is not actually brought up, there is no need to schedule
selftest_work, so let's move invoking efx_selftest_async_start()
into efx_start_all(), and it will be canceled by broughting down.

Fixes: dd40781e3a4e ("sfc: Run event/IRQ self-test asynchronously when interface is brought up")
Fixes: e340be923012 ("sfc: add ndo_set_vf_mac() function for EF10")
Debugged-by: Huang Cun <huangcun@sangfor.com.cn>
Cc: Donglin Peng <pengdonglin@sangfor.com.cn>
Suggested-by: Martin Habets <habetsm.xilinx@gmail.com>
Signed-off-by: Ding Hui <dinghui@sangfor.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/efx.c        | 1 -
 drivers/net/ethernet/sfc/efx_common.c | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 884d8d168862..1eceffa02b55 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -541,7 +541,6 @@ int efx_net_open(struct net_device *net_dev)
 	else
 		efx->state = STATE_NET_UP;
 
-	efx_selftest_async_start(efx);
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/sfc/efx_common.c b/drivers/net/ethernet/sfc/efx_common.c
index cc30524c2fe4..361687de308d 100644
--- a/drivers/net/ethernet/sfc/efx_common.c
+++ b/drivers/net/ethernet/sfc/efx_common.c
@@ -544,6 +544,8 @@ void efx_start_all(struct efx_nic *efx)
 	/* Start the hardware monitor if there is one */
 	efx_start_monitor(efx);
 
+	efx_selftest_async_start(efx);
+
 	/* Link state detection is normally event-driven; we have
 	 * to poll now because we could have missed a change
 	 */
-- 
cgit v1.2.3


From 338469d677e5d426f5ada88761f16f6d2c7c1981 Mon Sep 17 00:00:00 2001
From: Pedro Tammela <pctammela@mojatatu.com>
Date: Sat, 15 Apr 2023 12:33:09 -0300
Subject: net/sched: clear actions pointer in miss cookie init fail

Palash reports a UAF when using a modified version of syzkaller[1].

When 'tcf_exts_miss_cookie_base_alloc()' fails in 'tcf_exts_init_ex()'
a call to 'tcf_exts_destroy()' is made to free up the tcf_exts
resources.
In flower, a call to '__fl_put()' when 'tcf_exts_init_ex()' fails is made;
Then calling 'tcf_exts_destroy()', which triggers an UAF since the
already freed tcf_exts action pointer is lingering in the struct.

Before the offending patch, this was not an issue since there was no
case where the tcf_exts action pointer could linger. Therefore, restore
the old semantic by clearing the action pointer in case of a failure to
initialize the miss_cookie.

[1] https://github.com/cmu-pasta/linux-kernel-enriched-corpus

v1->v2: Fix compilation on configs without tc actions (kernel test robot)

Fixes: 80cd22c35c90 ("net/sched: cls_api: Support hardware miss to tc action")
Reported-by: Palash Oswal <oswalpalash@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 2a6b6be0811b..35785a36c802 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3235,6 +3235,9 @@ int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action,
 
 err_miss_alloc:
 	tcf_exts_destroy(exts);
+#ifdef CONFIG_NET_CLS_ACT
+	exts->actions = NULL;
+#endif
 	return err;
 }
 EXPORT_SYMBOL(tcf_exts_init_ex);
-- 
cgit v1.2.3


From c55c0e91c813589dc55bea6bf9a9fbfaa10ae41d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 17 Apr 2023 10:21:36 +0200
Subject: netfilter: nf_tables: fix ifdef to also consider nf_tables=m

nftables can be built as a module, so fix the preprocessor conditional
accordingly.

Fixes: 478b360a47b7 ("netfilter: nf_tables: fix nf_trace always-on with XT_TRACE=n")
Reported-by: Florian Fainelli <f.fainelli@gmail.com>
Reported-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1ec3530c8191..dbcaac8b6966 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4713,7 +4713,7 @@ static inline void nf_reset_ct(struct sk_buff *skb)
 
 static inline void nf_reset_trace(struct sk_buff *skb)
 {
-#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
 	skb->nf_trace = 0;
 #endif
 }
@@ -4733,7 +4733,7 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
 	dst->_nfct = src->_nfct;
 	nf_conntrack_get(skb_nfct(src));
 #endif
-#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
 	if (copy)
 		dst->nf_trace = src->nf_trace;
 #endif
-- 
cgit v1.2.3


From 8485d093b076e59baff424552e8aecfc5bd2d261 Mon Sep 17 00:00:00 2001
From: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Date: Fri, 24 Mar 2023 18:16:38 +0100
Subject: i40e: fix accessing vsi->active_filters without holding lock

Fix accessing vsi->active_filters without holding the mac_filter_hash_lock.
Move vsi->active_filters = 0 inside critical section and
move clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state) after the critical
section to ensure the new filters from other threads can be added only after
filters cleaning in the critical section is finished.

Fixes: 278e7d0b9d68 ("i40e: store MAC/VLAN filters in a hash with the MAC Address as key")
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 228cd502bb48..6313575a4b6c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -14133,15 +14133,15 @@ static int i40e_add_vsi(struct i40e_vsi *vsi)
 		vsi->id = ctxt.vsi_number;
 	}
 
-	vsi->active_filters = 0;
-	clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state);
 	spin_lock_bh(&vsi->mac_filter_hash_lock);
+	vsi->active_filters = 0;
 	/* If macvlan filters already exist, force them to get loaded */
 	hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) {
 		f->state = I40E_FILTER_NEW;
 		f_count++;
 	}
 	spin_unlock_bh(&vsi->mac_filter_hash_lock);
+	clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state);
 
 	if (f_count) {
 		vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED;
-- 
cgit v1.2.3


From c86c00c6935505929cc9adb29ddb85e48c71f828 Mon Sep 17 00:00:00 2001
From: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Date: Mon, 3 Apr 2023 07:13:18 +0200
Subject: i40e: fix i40e_setup_misc_vector() error handling

Add error handling of i40e_setup_misc_vector() in i40e_rebuild().
In case interrupt vectors setup fails do not re-open vsi-s and
do not bring up vf-s, we have no interrupts to serve a traffic
anyway.

Fixes: 41c445ff0f48 ("i40e: main driver core")
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 6313575a4b6c..7c30abd0dfc2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11059,8 +11059,11 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired)
 					     pf->hw.aq.asq_last_status));
 	}
 	/* reinit the misc interrupt */
-	if (pf->flags & I40E_FLAG_MSIX_ENABLED)
+	if (pf->flags & I40E_FLAG_MSIX_ENABLED) {
 		ret = i40e_setup_misc_vector(pf);
+		if (ret)
+			goto end_unlock;
+	}
 
 	/* Add a filter to drop all Flow control frames from any VSI from being
 	 * transmitted. By doing so we stop a malicious VF from sending out
-- 
cgit v1.2.3


From 1a2bd3bd72e978304cdc0a7385e8048e8242225d Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Fri, 14 Apr 2023 09:26:14 -0700
Subject: ice: document RDMA devlink parameters

Commit e523af4ee560 ("net/ice: Add support for enable_iwarp and enable_roce
devlink param") added support for the enable_roce and enable_iwarp
parameters in the ice driver. It didn't document these parameters in the
ice devlink documentation file. Add this documentation, including a note
about the mutual exclusion between the two modes.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://lore.kernel.org/r/20230414162614.571861-1-jacob.e.keller@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/devlink/ice.rst | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Documentation/networking/devlink/ice.rst b/Documentation/networking/devlink/ice.rst
index 10f282c2117c..2f60e34ab926 100644
--- a/Documentation/networking/devlink/ice.rst
+++ b/Documentation/networking/devlink/ice.rst
@@ -7,6 +7,21 @@ ice devlink support
 This document describes the devlink features implemented by the ``ice``
 device driver.
 
+Parameters
+==========
+
+.. list-table:: Generic parameters implemented
+
+   * - Name
+     - Mode
+     - Notes
+   * - ``enable_roce``
+     - runtime
+     - mutually exclusive with ``enable_iwarp``
+   * - ``enable_iwarp``
+     - runtime
+     - mutually exclusive with ``enable_roce``
+
 Info versions
 =============
 
-- 
cgit v1.2.3


From d46fc894147cf98dd6e8210aa99ed46854191840 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 17 Apr 2023 12:14:29 +0200
Subject: netfilter: nf_tables: validate catch-all set elements

catch-all set element might jump/goto to chain that uses expressions
that require validation.

Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 +++
 net/netfilter/nf_tables_api.c     | 64 +++++++++++++++++++++++++++++++++++----
 net/netfilter/nft_lookup.c        | 36 +++-------------------
 3 files changed, 66 insertions(+), 38 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 9430128aae99..1b8e305bb54a 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1085,6 +1085,10 @@ struct nft_chain {
 };
 
 int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
+int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
+			 const struct nft_set_iter *iter,
+			 struct nft_set_elem *elem);
+int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
 
 enum nft_chain_types {
 	NFT_CHAIN_T_DEFAULT = 0,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index cd52109e674a..98043e83af71 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3447,6 +3447,64 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
 	return 0;
 }
 
+int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
+			 const struct nft_set_iter *iter,
+			 struct nft_set_elem *elem)
+{
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	struct nft_ctx *pctx = (struct nft_ctx *)ctx;
+	const struct nft_data *data;
+	int err;
+
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+		return 0;
+
+	data = nft_set_ext_data(ext);
+	switch (data->verdict.code) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		pctx->level++;
+		err = nft_chain_validate(ctx, data->verdict.chain);
+		if (err < 0)
+			return err;
+		pctx->level--;
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+struct nft_set_elem_catchall {
+	struct list_head	list;
+	struct rcu_head		rcu;
+	void			*elem;
+};
+
+int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+	u8 genmask = nft_genmask_next(ctx->net);
+	struct nft_set_elem_catchall *catchall;
+	struct nft_set_elem elem;
+	struct nft_set_ext *ext;
+	int ret = 0;
+
+	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+		if (!nft_set_elem_active(ext, genmask))
+			continue;
+
+		elem.priv = catchall->elem;
+		ret = nft_setelem_validate(ctx, set, NULL, &elem);
+		if (ret < 0)
+			return ret;
+	}
+
+	return ret;
+}
+
 static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
 					     const struct nft_chain *chain,
 					     const struct nlattr *nla);
@@ -4759,12 +4817,6 @@ err_set_name:
 	return err;
 }
 
-struct nft_set_elem_catchall {
-	struct list_head	list;
-	struct rcu_head		rcu;
-	void			*elem;
-};
-
 static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
 				     struct nft_set *set)
 {
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index cae5a6724163..cecf8ab90e58 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -199,37 +199,6 @@ nla_put_failure:
 	return -1;
 }
 
-static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
-				       struct nft_set *set,
-				       const struct nft_set_iter *iter,
-				       struct nft_set_elem *elem)
-{
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
-	struct nft_ctx *pctx = (struct nft_ctx *)ctx;
-	const struct nft_data *data;
-	int err;
-
-	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
-	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
-		return 0;
-
-	data = nft_set_ext_data(ext);
-	switch (data->verdict.code) {
-	case NFT_JUMP:
-	case NFT_GOTO:
-		pctx->level++;
-		err = nft_chain_validate(ctx, data->verdict.chain);
-		if (err < 0)
-			return err;
-		pctx->level--;
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
 static int nft_lookup_validate(const struct nft_ctx *ctx,
 			       const struct nft_expr *expr,
 			       const struct nft_data **d)
@@ -245,9 +214,12 @@ static int nft_lookup_validate(const struct nft_ctx *ctx,
 	iter.skip	= 0;
 	iter.count	= 0;
 	iter.err	= 0;
-	iter.fn		= nft_lookup_validate_setelem;
+	iter.fn		= nft_setelem_validate;
 
 	priv->set->ops->walk(ctx, priv->set, &iter);
+	if (!iter.err)
+		iter.err = nft_set_catchall_validate(ctx, priv->set);
+
 	if (iter.err < 0)
 		return iter.err;
 
-- 
cgit v1.2.3


From e50b9b9e8610d47b7c22529443e45a16b1ea3a15 Mon Sep 17 00:00:00 2001
From: Duoming Zhou <duoming@zju.edu.cn>
Date: Sat, 15 Apr 2023 16:12:27 +0800
Subject: cxgb4: fix use after free bugs caused by circular dependency problem

The flower_stats_timer can schedule flower_stats_work and
flower_stats_work can also arm the flower_stats_timer. The
process is shown below:

----------- timer schedules work ------------
ch_flower_stats_cb() //timer handler
  schedule_work(&adap->flower_stats_work);

----------- work arms timer ------------
ch_flower_stats_handler() //workqueue callback function
  mod_timer(&adap->flower_stats_timer, ...);

When the cxgb4 device is detaching, the timer and workqueue
could still be rearmed. The process is shown below:

  (cleanup routine)           | (timer and workqueue routine)
remove_one()                  |
  free_some_resources()       | ch_flower_stats_cb() //timer
    cxgb4_cleanup_tc_flower() |   schedule_work()
      del_timer_sync()        |
                              | ch_flower_stats_handler() //workqueue
                              |   mod_timer()
      cancel_work_sync()      |
  kfree(adapter) //FREE       | ch_flower_stats_cb() //timer
                              |   adap->flower_stats_work //USE

This patch changes del_timer_sync() to timer_shutdown_sync(),
which could prevent rearming of the timer from the workqueue.

Fixes: e0f911c81e93 ("cxgb4: fetch stats for offloaded tc flower flows")
Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>
Link: https://lore.kernel.org/r/20230415081227.7463-1-duoming@zju.edu.cn
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index dd9be229819a..d3541159487d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -1135,7 +1135,7 @@ void cxgb4_cleanup_tc_flower(struct adapter *adap)
 		return;
 
 	if (adap->flower_stats_timer.function)
-		del_timer_sync(&adap->flower_stats_timer);
+		timer_shutdown_sync(&adap->flower_stats_timer);
 	cancel_work_sync(&adap->flower_stats_work);
 	rhashtable_destroy(&adap->flower_tbl);
 	adap->tc_flower_initialized = false;
-- 
cgit v1.2.3


From d4eb7e39929a3b1ff30fb751b4859fc2410702a0 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 17 Apr 2023 17:50:28 +0200
Subject: netfilter: nf_tables: tighten netlink attribute requirements for
 catch-all elements

If NFT_SET_ELEM_CATCHALL is set on, then userspace provides no set element
key. Otherwise, bail out with -EINVAL.

Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 98043e83af71..e48ab8dfb541 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6108,7 +6108,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	if (err < 0)
 		return err;
 
-	if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+	if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) ||
+	    (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY]))
 		return -EINVAL;
 
 	if (flags != 0) {
-- 
cgit v1.2.3


From e8b51a1a15d5a3cce231e0669f6a161dc5bb9b75 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Sun, 16 Apr 2023 23:58:18 -0700
Subject: bnxt_en: Do not initialize PTP on older P3/P4 chips

The driver does not support PTP on these older chips and it is assuming
that firmware on these older chips will not return the
PORT_MAC_PTP_QCFG_RESP_FLAGS_HWRM_ACCESS flag in __bnxt_hwrm_ptp_qcfg(),
causing the function to abort quietly.

But newer firmware now sets this flag and so __bnxt_hwrm_ptp_qcfg()
will proceed further.  Eventually it will fail in bnxt_ptp_init() ->
bnxt_map_ptp_regs() because there is no code to support the older chips.
The driver will then complain:

"PTP initialization failed.\n"

Fix it so that we abort quietly earlier without going through the
unnecessary steps and alarming the user with the warning log.

Fixes: ae5c42f0b92c ("bnxt_en: Get PTP hardware capability from firmware")
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index c23e3b397bcf..ef97a4190b39 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7627,7 +7627,7 @@ static int __bnxt_hwrm_ptp_qcfg(struct bnxt *bp)
 	u8 flags;
 	int rc;
 
-	if (bp->hwrm_spec_code < 0x10801) {
+	if (bp->hwrm_spec_code < 0x10801 || !BNXT_CHIP_P5_THOR(bp)) {
 		rc = -ENODEV;
 		goto no_ptp;
 	}
-- 
cgit v1.2.3


From 4f4e54b1041e60694117893cd986831153a3e719 Mon Sep 17 00:00:00 2001
From: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Date: Sun, 16 Apr 2023 23:58:19 -0700
Subject: bnxt_en: Fix a possible NULL pointer dereference in unload path

In the driver unload path, the driver currently checks the valid
BNXT_FLAG_ROCE_CAP flag in bnxt_rdma_aux_device_uninit() before
proceeding.  This is flawed because the flag may not be set initially
during driver load.  It may be set later after the NVRAM setting is
changed followed by a firmware reset.  Relying on the
BNXT_FLAG_ROCE_CAP flag may crash in bnxt_rdma_aux_device_uninit() if
the aux device was never initialized:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
PGD 8ae6aa067 P4D 0
Oops: 0000 [#1] SMP NOPTI
CPU: 39 PID: 42558 Comm: rmmod Kdump: loaded Tainted: G           OE    --------- -  - 4.18.0-348.el8.x86_64 #1
Hardware name: Dell Inc. PowerEdge R750/0WT8Y6, BIOS 1.5.4 12/17/2021
RIP: 0010:device_del+0x1b/0x410
Code: 89 a5 50 03 00 00 4c 89 a5 58 03 00 00 eb 89 0f 1f 44 00 00 41 56 41 55 41 54 4c 8d a7 80 00 00 00 55 53 48 89 fb 48 83 ec 18 <48> 8b 2f 4c 89 e7 65 48 8b 04 25 28 00 00 00 48 89 44 24 10 31 c0
RSP: 0018:ff7f82bf469a7dc8 EFLAGS: 00010292
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000206 RDI: 0000000000000000
RBP: ff31b7cd114b0ac0 R08: 0000000000000000 R09: ffffffff935c3400
R10: ff31b7cd45bc3440 R11: 0000000000000001 R12: 0000000000000080
R13: ffffffffc1069f40 R14: 0000000000000000 R15: 0000000000000000
FS:  00007fc9903ce740(0000) GS:ff31b7d4ffac0000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000992fee004 CR4: 0000000000773ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
 bnxt_rdma_aux_device_uninit+0x1f/0x30 [bnxt_en]
 bnxt_remove_one+0x2f/0x1f0 [bnxt_en]
 pci_device_remove+0x3b/0xc0
 device_release_driver_internal+0x103/0x1f0
 driver_detach+0x54/0x88
 bus_remove_driver+0x77/0xc9
 pci_unregister_driver+0x2d/0xb0
 bnxt_exit+0x16/0x2c [bnxt_en]
 __x64_sys_delete_module+0x139/0x280
 do_syscall_64+0x5b/0x1a0
 entry_SYSCALL_64_after_hwframe+0x65/0xca
RIP: 0033:0x7fc98f3af71b

Fix this by modifying the check inside bnxt_rdma_aux_device_uninit()
to check for bp->aux_priv instead.  We also need to make some changes
in bnxt_rdma_aux_device_init() to make sure that bp->aux_priv is set
only when the aux device is fully initialized.

Fixes: d80d88b0dfff ("bnxt_en: Add auxiliary driver support")
Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
index e7b5e28ee29f..852eb449ccae 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
@@ -304,7 +304,7 @@ void bnxt_rdma_aux_device_uninit(struct bnxt *bp)
 	struct auxiliary_device *adev;
 
 	/* Skip if no auxiliary device init was done. */
-	if (!(bp->flags & BNXT_FLAG_ROCE_CAP))
+	if (!bp->aux_priv)
 		return;
 
 	aux_priv = bp->aux_priv;
@@ -324,6 +324,7 @@ static void bnxt_aux_dev_release(struct device *dev)
 	bp->edev = NULL;
 	kfree(aux_priv->edev);
 	kfree(aux_priv);
+	bp->aux_priv = NULL;
 }
 
 static void bnxt_set_edev_info(struct bnxt_en_dev *edev, struct bnxt *bp)
@@ -359,19 +360,18 @@ void bnxt_rdma_aux_device_init(struct bnxt *bp)
 	if (!(bp->flags & BNXT_FLAG_ROCE_CAP))
 		return;
 
-	bp->aux_priv = kzalloc(sizeof(*bp->aux_priv), GFP_KERNEL);
-	if (!bp->aux_priv)
+	aux_priv = kzalloc(sizeof(*bp->aux_priv), GFP_KERNEL);
+	if (!aux_priv)
 		goto exit;
 
-	bp->aux_priv->id = ida_alloc(&bnxt_aux_dev_ids, GFP_KERNEL);
-	if (bp->aux_priv->id < 0) {
+	aux_priv->id = ida_alloc(&bnxt_aux_dev_ids, GFP_KERNEL);
+	if (aux_priv->id < 0) {
 		netdev_warn(bp->dev,
 			    "ida alloc failed for ROCE auxiliary device\n");
-		kfree(bp->aux_priv);
+		kfree(aux_priv);
 		goto exit;
 	}
 
-	aux_priv = bp->aux_priv;
 	aux_dev = &aux_priv->aux_dev;
 	aux_dev->id = aux_priv->id;
 	aux_dev->name = "rdma";
@@ -380,10 +380,11 @@ void bnxt_rdma_aux_device_init(struct bnxt *bp)
 
 	rc = auxiliary_device_init(aux_dev);
 	if (rc) {
-		ida_free(&bnxt_aux_dev_ids, bp->aux_priv->id);
-		kfree(bp->aux_priv);
+		ida_free(&bnxt_aux_dev_ids, aux_priv->id);
+		kfree(aux_priv);
 		goto exit;
 	}
+	bp->aux_priv = aux_priv;
 
 	/* From this point, all cleanup will happen via the .release callback &
 	 * any error unwinding will need to include a call to
-- 
cgit v1.2.3


From c0e73276f0fcbbd3d4736ba975d7dc7a48791b0c Mon Sep 17 00:00:00 2001
From: Nikita Zhandarovich <n.zhandarovich@fintech.ru>
Date: Mon, 17 Apr 2023 05:07:18 -0700
Subject: mlxfw: fix null-ptr-deref in mlxfw_mfa2_tlv_next()

Function mlxfw_mfa2_tlv_multi_get() returns NULL if 'tlv' in
question does not pass checks in mlxfw_mfa2_tlv_payload_get(). This
behaviour may lead to NULL pointer dereference in 'multi->total_len'.
Fix this issue by testing mlxfw_mfa2_tlv_multi_get()'s return value
against NULL.

Found by Linux Verification Center (linuxtesting.org) with static
analysis tool SVACE.

Fixes: 410ed13cae39 ("Add the mlxfw module for Mellanox firmware flash process")
Co-developed-by: Natalia Petrova <n.petrova@fintech.ru>
Signed-off-by: Nikita Zhandarovich <n.zhandarovich@fintech.ru>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://lore.kernel.org/r/20230417120718.52325-1-n.zhandarovich@fintech.ru
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c
index 017d68f1e123..972c571b4158 100644
--- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c
@@ -31,6 +31,8 @@ mlxfw_mfa2_tlv_next(const struct mlxfw_mfa2_file *mfa2_file,
 
 	if (tlv->type == MLXFW_MFA2_TLV_MULTI_PART) {
 		multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, tlv);
+		if (!multi)
+			return NULL;
 		tlv_len = NLA_ALIGN(tlv_len + be16_to_cpu(multi->total_len));
 	}
 
-- 
cgit v1.2.3


From 8267fc71abb2dc47338570e56dd3473a58313fce Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 17 Apr 2023 23:53:22 +0200
Subject: veth: take into account peer device for NETDEV_XDP_ACT_NDO_XMIT
 xdp_features flag

For veth pairs, NETDEV_XDP_ACT_NDO_XMIT is supported by the current
device if the peer one is running a XDP program or if it has GRO enabled.
Fix the xdp_features flags reporting considering peer device and not
current one for NETDEV_XDP_ACT_NDO_XMIT.

Fixes: fccca038f300 ("veth: take into account device reconfiguration for xdp_features flag")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/4f1ca6f6f6b42ae125bfdb5c7782217c83968b2e.1681767806.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/veth.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index e1b38fbf1dd9..4b3c6647edc6 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1262,11 +1262,12 @@ static void veth_set_xdp_features(struct net_device *dev)
 
 	peer = rtnl_dereference(priv->peer);
 	if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) {
+		struct veth_priv *priv_peer = netdev_priv(peer);
 		xdp_features_t val = NETDEV_XDP_ACT_BASIC |
 				     NETDEV_XDP_ACT_REDIRECT |
 				     NETDEV_XDP_ACT_RX_SG;
 
-		if (priv->_xdp_prog || veth_gro_requested(dev))
+		if (priv_peer->_xdp_prog || veth_gro_requested(peer))
 			val |= NETDEV_XDP_ACT_NDO_XMIT |
 			       NETDEV_XDP_ACT_NDO_XMIT_SG;
 		xdp_set_features_flag(dev, val);
@@ -1504,19 +1505,23 @@ static int veth_set_features(struct net_device *dev,
 {
 	netdev_features_t changed = features ^ dev->features;
 	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer;
 	int err;
 
 	if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
 		return 0;
 
+	peer = rtnl_dereference(priv->peer);
 	if (features & NETIF_F_GRO) {
 		err = veth_napi_enable(dev);
 		if (err)
 			return err;
 
-		xdp_features_set_redirect_target(dev, true);
+		if (peer)
+			xdp_features_set_redirect_target(peer, true);
 	} else {
-		xdp_features_clear_redirect_target(dev);
+		if (peer)
+			xdp_features_clear_redirect_target(peer);
 		veth_napi_del(dev);
 	}
 	return 0;
@@ -1598,13 +1603,13 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 			peer->max_mtu = max_mtu;
 		}
 
-		xdp_features_set_redirect_target(dev, true);
+		xdp_features_set_redirect_target(peer, true);
 	}
 
 	if (old_prog) {
 		if (!prog) {
-			if (!veth_gro_requested(dev))
-				xdp_features_clear_redirect_target(dev);
+			if (peer && !veth_gro_requested(dev))
+				xdp_features_clear_redirect_target(peer);
 
 			if (dev->flags & IFF_UP)
 				veth_disable_xdp(dev);
-- 
cgit v1.2.3


From c484fcc058bada604d7e4e5228d4affb646ddbc2 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Mon, 17 Apr 2023 09:12:16 +0300
Subject: bonding: Fix memory leak when changing bond type to Ethernet

When a net device is put administratively up, its 'IFF_UP' flag is set
(if not set already) and a 'NETDEV_UP' notification is emitted, which
causes the 8021q driver to add VLAN ID 0 on the device. The reverse
happens when a net device is put administratively down.

When changing the type of a bond to Ethernet, its 'IFF_UP' flag is
incorrectly cleared, resulting in the kernel skipping the above process
and VLAN ID 0 being leaked [1].

Fix by restoring the flag when changing the type to Ethernet, in a
similar fashion to the restoration of the 'IFF_SLAVE' flag.

The issue can be reproduced using the script in [2], with example out
before and after the fix in [3].

[1]
unreferenced object 0xffff888103479900 (size 256):
  comm "ip", pid 329, jiffies 4294775225 (age 28.561s)
  hex dump (first 32 bytes):
    00 a0 0c 15 81 88 ff ff 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffff81a6051a>] kmalloc_trace+0x2a/0xe0
    [<ffffffff8406426c>] vlan_vid_add+0x30c/0x790
    [<ffffffff84068e21>] vlan_device_event+0x1491/0x21a0
    [<ffffffff81440c8e>] notifier_call_chain+0xbe/0x1f0
    [<ffffffff8372383a>] call_netdevice_notifiers_info+0xba/0x150
    [<ffffffff837590f2>] __dev_notify_flags+0x132/0x2e0
    [<ffffffff8375ad9f>] dev_change_flags+0x11f/0x180
    [<ffffffff8379af36>] do_setlink+0xb96/0x4060
    [<ffffffff837adf6a>] __rtnl_newlink+0xc0a/0x18a0
    [<ffffffff837aec6c>] rtnl_newlink+0x6c/0xa0
    [<ffffffff837ac64e>] rtnetlink_rcv_msg+0x43e/0xe00
    [<ffffffff839a99e0>] netlink_rcv_skb+0x170/0x440
    [<ffffffff839a738f>] netlink_unicast+0x53f/0x810
    [<ffffffff839a7fcb>] netlink_sendmsg+0x96b/0xe90
    [<ffffffff8369d12f>] ____sys_sendmsg+0x30f/0xa70
    [<ffffffff836a6d7a>] ___sys_sendmsg+0x13a/0x1e0
unreferenced object 0xffff88810f6a83e0 (size 32):
  comm "ip", pid 329, jiffies 4294775225 (age 28.561s)
  hex dump (first 32 bytes):
    a0 99 47 03 81 88 ff ff a0 99 47 03 81 88 ff ff  ..G.......G.....
    81 00 00 00 01 00 00 00 cc cc cc cc cc cc cc cc  ................
  backtrace:
    [<ffffffff81a6051a>] kmalloc_trace+0x2a/0xe0
    [<ffffffff84064369>] vlan_vid_add+0x409/0x790
    [<ffffffff84068e21>] vlan_device_event+0x1491/0x21a0
    [<ffffffff81440c8e>] notifier_call_chain+0xbe/0x1f0
    [<ffffffff8372383a>] call_netdevice_notifiers_info+0xba/0x150
    [<ffffffff837590f2>] __dev_notify_flags+0x132/0x2e0
    [<ffffffff8375ad9f>] dev_change_flags+0x11f/0x180
    [<ffffffff8379af36>] do_setlink+0xb96/0x4060
    [<ffffffff837adf6a>] __rtnl_newlink+0xc0a/0x18a0
    [<ffffffff837aec6c>] rtnl_newlink+0x6c/0xa0
    [<ffffffff837ac64e>] rtnetlink_rcv_msg+0x43e/0xe00
    [<ffffffff839a99e0>] netlink_rcv_skb+0x170/0x440
    [<ffffffff839a738f>] netlink_unicast+0x53f/0x810
    [<ffffffff839a7fcb>] netlink_sendmsg+0x96b/0xe90
    [<ffffffff8369d12f>] ____sys_sendmsg+0x30f/0xa70
    [<ffffffff836a6d7a>] ___sys_sendmsg+0x13a/0x1e0

[2]
ip link add name t-nlmon type nlmon
ip link add name t-dummy type dummy
ip link add name t-bond type bond mode active-backup

ip link set dev t-bond up
ip link set dev t-nlmon master t-bond
ip link set dev t-nlmon nomaster
ip link show dev t-bond
ip link set dev t-dummy master t-bond
ip link show dev t-bond

ip link del dev t-bond
ip link del dev t-dummy
ip link del dev t-nlmon

[3]
Before:

12: t-bond: <NO-CARRIER,BROADCAST,MULTICAST,MASTER,UP> mtu 1500 qdisc noqueue state DOWN mode DEFAULT group default qlen 1000
    link/netlink
12: t-bond: <BROADCAST,MULTICAST,MASTER,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 46:57:39:a4:46:a2 brd ff:ff:ff:ff:ff:ff

After:

12: t-bond: <NO-CARRIER,BROADCAST,MULTICAST,MASTER,UP> mtu 1500 qdisc noqueue state DOWN mode DEFAULT group default qlen 1000
    link/netlink
12: t-bond: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 66:48:7b:74:b6:8a brd ff:ff:ff:ff:ff:ff

Fixes: e36b9d16c6a6 ("bonding: clean muticast addresses when device changes type")
Fixes: 75c78500ddad ("bonding: remap muticast addresses without using dev_close() and dev_open()")
Fixes: 9ec7eb60dcbc ("bonding: restore IFF_MASTER/SLAVE flags on bond enslave ether type change")
Reported-by: Mirsad Goran Todorovac <mirsad.todorovac@alu.unizg.hr>
Link: https://lore.kernel.org/netdev/78a8a03b-6070-3e6b-5042-f848dab16fb8@alu.unizg.hr/
Tested-by: Mirsad Goran Todorovac <mirsad.todorovac@alu.unizg.hr>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8cc9a74789b7..7a7d584f378a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1777,14 +1777,15 @@ void bond_lower_state_changed(struct slave *slave)
 
 /* The bonding driver uses ether_setup() to convert a master bond device
  * to ARPHRD_ETHER, that resets the target netdevice's flags so we always
- * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE if it was set
+ * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE and IFF_UP
+ * if they were set
  */
 static void bond_ether_setup(struct net_device *bond_dev)
 {
-	unsigned int slave_flag = bond_dev->flags & IFF_SLAVE;
+	unsigned int flags = bond_dev->flags & (IFF_SLAVE | IFF_UP);
 
 	ether_setup(bond_dev);
-	bond_dev->flags |= IFF_MASTER | slave_flag;
+	bond_dev->flags |= IFF_MASTER | flags;
 	bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 }
 
-- 
cgit v1.2.3


From 6f4833383e8514ea796d094e05c24889b8997fde Mon Sep 17 00:00:00 2001
From: Seiji Nishikawa <snishika@redhat.com>
Date: Mon, 17 Apr 2023 21:21:27 +0900
Subject: net: vmxnet3: Fix NULL pointer dereference in
 vmxnet3_rq_rx_complete()

When vmxnet3_rq_create() fails to allocate rq->data_ring.base due to page
allocation failure, subsequent call to vmxnet3_rq_rx_complete() can result in
NULL pointer dereference.

To fix this bug, check not only that rxDataRingUsed is true but also that
adapter->rxdataring_enabled is true before calling memcpy() in
vmxnet3_rq_rx_complete().

[1728352.477993] ethtool: page allocation failure: order:9, mode:0x6000c0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
...
[1728352.478009] Call Trace:
[1728352.478028]  dump_stack+0x41/0x60
[1728352.478035]  warn_alloc.cold.120+0x7b/0x11b
[1728352.478038]  ? _cond_resched+0x15/0x30
[1728352.478042]  ? __alloc_pages_direct_compact+0x15f/0x170
[1728352.478043]  __alloc_pages_slowpath+0xcd3/0xd10
[1728352.478047]  __alloc_pages_nodemask+0x2e2/0x320
[1728352.478049]  __dma_direct_alloc_pages.constprop.25+0x8a/0x120
[1728352.478053]  dma_direct_alloc+0x5a/0x2a0
[1728352.478056]  vmxnet3_rq_create.part.57+0x17c/0x1f0 [vmxnet3]
...
[1728352.478188] vmxnet3 0000:0b:00.0 ens192: rx data ring will be disabled
...
[1728352.515347] BUG: unable to handle kernel NULL pointer dereference at 0000000000000034
...
[1728352.515440] RIP: 0010:memcpy_orig+0x54/0x130
...
[1728352.515655] Call Trace:
[1728352.515665]  <IRQ>
[1728352.515672]  vmxnet3_rq_rx_complete+0x419/0xef0 [vmxnet3]
[1728352.515690]  vmxnet3_poll_rx_only+0x31/0xa0 [vmxnet3]
...

Signed-off-by: Seiji Nishikawa <snishika@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index da488cbb0542..f2b76ee866a4 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -1504,7 +1504,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 				goto rcd_done;
 			}
 
-			if (rxDataRingUsed) {
+			if (rxDataRingUsed && adapter->rxdataring_enabled) {
 				size_t sz;
 
 				BUG_ON(rcd->len > rq->data_ring.desc_size);
-- 
cgit v1.2.3


From 4e006c7a6dac0ead4c1bf606000aa90a372fc253 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 17 Apr 2023 09:00:52 -0400
Subject: net: rpl: fix rpl header size calculation

This patch fixes a missing 8 byte for the header size calculation. The
ipv6_rpl_srh_size() is used to check a skb_pull() on skb->data which
points to skb_transport_header(). Currently we only check on the
calculated addresses fields using CmprI and CmprE fields, see:

https://www.rfc-editor.org/rfc/rfc6554#section-3

there is however a missing 8 byte inside the calculation which stands
for the fields before the addresses field. Those 8 bytes are represented
by sizeof(struct ipv6_rpl_sr_hdr) expression.

Fixes: 8610c7c6e3bd ("net: ipv6: add support for rpl sr exthdr")
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Reported-by: maxpl0it <maxpl0it@protonmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/rpl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c
index 488aec9e1a74..d1876f192225 100644
--- a/net/ipv6/rpl.c
+++ b/net/ipv6/rpl.c
@@ -32,7 +32,8 @@ static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i)
 size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri,
 			 unsigned char cmpre)
 {
-	return (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre);
+	return sizeof(struct ipv6_rpl_sr_hdr) + (n * IPV6_PFXTAIL_LEN(cmpri)) +
+		IPV6_PFXTAIL_LEN(cmpre);
 }
 
 void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr,
-- 
cgit v1.2.3


From 2a6a870e44dd88f1a6a2893c65ef756a9edfb4c7 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 17 Apr 2023 16:00:40 +0200
Subject: mptcp: stops worker on unaccepted sockets at listener close

This is a partial revert of the blamed commit, with a relevant
change: mptcp_subflow_queue_clean() now just change the msk
socket status and stop the worker, so that the UaF issue addressed
by the blamed commit is not re-introduced.

The above prevents the mptcp worker from running concurrently with
inet_csk_listen_stop(), as such race would trigger a warning, as
reported by Christoph:

RSP: 002b:00007f784fe09cd8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
WARNING: CPU: 0 PID: 25807 at net/ipv4/inet_connection_sock.c:1387 inet_csk_listen_stop+0x664/0x870 net/ipv4/inet_connection_sock.c:1387
RAX: ffffffffffffffda RBX: 00000000006bc050 RCX: 00007f7850afd6a9
RDX: 0000000000000000 RSI: 0000000020000340 RDI: 0000000000000004
Modules linked in:
RBP: 0000000000000002 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006bc05c
R13: fffffffffffffea8 R14: 00000000006bc050 R15: 000000000001fe40

 </TASK>
CPU: 0 PID: 25807 Comm: syz-executor.7 Not tainted 6.2.0-g778e54711659 #7
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
RIP: 0010:inet_csk_listen_stop+0x664/0x870 net/ipv4/inet_connection_sock.c:1387
RAX: 0000000000000000 RBX: ffff888100dfbd40 RCX: 0000000000000000
RDX: ffff8881363aab80 RSI: ffffffff81c494f4 RDI: 0000000000000005
RBP: ffff888126dad080 R08: 0000000000000005 R09: 0000000000000000
R10: 0000000000000001 R11: 0000000000000000 R12: ffff888100dfe040
R13: 0000000000000001 R14: 0000000000000000 R15: ffff888100dfbdd8
FS:  00007f7850a2c800(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000001b32d26000 CR3: 000000012fdd8006 CR4: 0000000000770ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
 <TASK>
 __tcp_close+0x5b2/0x620 net/ipv4/tcp.c:2875
 __mptcp_close_ssk+0x145/0x3d0 net/mptcp/protocol.c:2427
 mptcp_destroy_common+0x8a/0x1c0 net/mptcp/protocol.c:3277
 mptcp_destroy+0x41/0x60 net/mptcp/protocol.c:3304
 __mptcp_destroy_sock+0x56/0x140 net/mptcp/protocol.c:2965
 __mptcp_close+0x38f/0x4a0 net/mptcp/protocol.c:3057
 mptcp_close+0x24/0xe0 net/mptcp/protocol.c:3072
 inet_release+0x53/0xa0 net/ipv4/af_inet.c:429
 __sock_release+0x4e/0xf0 net/socket.c:651
 sock_close+0x15/0x20 net/socket.c:1393
 __fput+0xff/0x420 fs/file_table.c:321
 task_work_run+0x8b/0xe0 kernel/task_work.c:179
 resume_user_mode_work include/linux/resume_user_mode.h:49 [inline]
 exit_to_user_mode_loop kernel/entry/common.c:171 [inline]
 exit_to_user_mode_prepare+0x113/0x120 kernel/entry/common.c:203
 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline]
 syscall_exit_to_user_mode+0x1d/0x40 kernel/entry/common.c:296
 do_syscall_64+0x46/0x90 arch/x86/entry/common.c:86
 entry_SYSCALL_64_after_hwframe+0x72/0xdc
RIP: 0033:0x7f7850af70dc
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 00007f7850af70dc
RDX: 00007f7850a2c800 RSI: 0000000000000002 RDI: 0000000000000003
RBP: 00000000006bd980 R08: 0000000000000000 R09: 00000000000018a0
R10: 00000000316338a4 R11: 0000000000000293 R12: 0000000000211e31
R13: 00000000006bc05c R14: 00007f785062c000 R15: 0000000000211af0

Fixes: 0a3f4f1f9c27 ("mptcp: fix UaF in listener shutdown")
Cc: stable@vger.kernel.org
Reported-by: Christoph Paasch <cpaasch@apple.com>
Link: https://github.com/multipath-tcp/mptcp_net-next/issues/371
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/protocol.c |  6 ++++-
 net/mptcp/protocol.h |  1 +
 net/mptcp/subflow.c  | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 06c5872e3b00..5181fb91595b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2365,8 +2365,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		mptcp_subflow_drop_ctx(ssk);
 	} else {
 		/* otherwise tcp will dispose of the ssk and subflow ctx */
-		if (ssk->sk_state == TCP_LISTEN)
+		if (ssk->sk_state == TCP_LISTEN) {
+			tcp_set_state(ssk, TCP_CLOSE);
+			mptcp_subflow_queue_clean(sk, ssk);
+			inet_csk_listen_stop(ssk);
 			mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
+		}
 
 		__tcp_close(ssk, 0);
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 339a6f072989..3a2db1b862dd 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -629,6 +629,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		     struct mptcp_subflow_context *subflow);
 void __mptcp_subflow_send_ack(struct sock *ssk);
 void mptcp_subflow_reset(struct sock *ssk);
+void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk);
 void mptcp_sock_graft(struct sock *sk, struct socket *parent);
 struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
 bool __mptcp_close(struct sock *sk, long timeout);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index d34588850545..bf5e5c72b5ee 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1819,6 +1819,78 @@ static void subflow_state_change(struct sock *sk)
 	}
 }
 
+void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
+{
+	struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
+	struct mptcp_sock *msk, *next, *head = NULL;
+	struct request_sock *req;
+
+	/* build a list of all unaccepted mptcp sockets */
+	spin_lock_bh(&queue->rskq_lock);
+	for (req = queue->rskq_accept_head; req; req = req->dl_next) {
+		struct mptcp_subflow_context *subflow;
+		struct sock *ssk = req->sk;
+
+		if (!sk_is_mptcp(ssk))
+			continue;
+
+		subflow = mptcp_subflow_ctx(ssk);
+		if (!subflow || !subflow->conn)
+			continue;
+
+		/* skip if already in list */
+		msk = mptcp_sk(subflow->conn);
+		if (msk->dl_next || msk == head)
+			continue;
+
+		sock_hold(subflow->conn);
+		msk->dl_next = head;
+		head = msk;
+	}
+	spin_unlock_bh(&queue->rskq_lock);
+	if (!head)
+		return;
+
+	/* can't acquire the msk socket lock under the subflow one,
+	 * or will cause ABBA deadlock
+	 */
+	release_sock(listener_ssk);
+
+	for (msk = head; msk; msk = next) {
+		struct sock *sk = (struct sock *)msk;
+
+		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+		next = msk->dl_next;
+		msk->dl_next = NULL;
+
+		/* prevent the stack from later re-schedule the worker for
+		 * this socket
+		 */
+		inet_sk_state_store(sk, TCP_CLOSE);
+		release_sock(sk);
+
+		/* lockdep will report a false positive ABBA deadlock
+		 * between cancel_work_sync and the listener socket.
+		 * The involved locks belong to different sockets WRT
+		 * the existing AB chain.
+		 * Using a per socket key is problematic as key
+		 * deregistration requires process context and must be
+		 * performed at socket disposal time, in atomic
+		 * context.
+		 * Just tell lockdep to consider the listener socket
+		 * released here.
+		 */
+		mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
+		mptcp_cancel_work(sk);
+		mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+
+		sock_put(sk);
+	}
+
+	/* we are still under the listener msk socket lock */
+	lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
+}
+
 static int subflow_ulp_init(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-- 
cgit v1.2.3


From 63740448a32eb662e05894425b47bcc5814136f4 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 17 Apr 2023 16:00:41 +0200
Subject: mptcp: fix accept vs worker race

The mptcp worker and mptcp_accept() can race, as reported by Christoph:

refcount_t: addition on 0; use-after-free.
WARNING: CPU: 1 PID: 14351 at lib/refcount.c:25 refcount_warn_saturate+0x105/0x1b0 lib/refcount.c:25
Modules linked in:
CPU: 1 PID: 14351 Comm: syz-executor.2 Not tainted 6.3.0-rc1-gde5e8fd0123c #11
Hardware name: QEM