From 1c4e97dd2d3c9a3e84f7e26346aa39bc426d3249 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Mar 2024 12:01:21 -0800 Subject: tcp: Fix NEW_SYN_RECV handling in inet_twsk_purge() inet_twsk_purge() uses rcu to find TIME_WAIT and NEW_SYN_RECV objects to purge. These objects use SLAB_TYPESAFE_BY_RCU semantic and need special care. We need to use refcount_inc_not_zero(&sk->sk_refcnt). Reuse the existing correct logic I wrote for TIME_WAIT, because both structures have common locations for sk_state, sk_family, and netns pointer. If after the refcount_inc_not_zero() the object fields longer match the keys, use sock_gen_put(sk) to release the refcount. Then we can call inet_twsk_deschedule_put() for TIME_WAIT, inet_csk_reqsk_queue_drop_and_put() for NEW_SYN_RECV sockets, with BH disabled. Then we need to restart the loop because we had drop rcu_read_lock(). Fixes: 740ea3c4a0b2 ("tcp: Clean up kernel listener's reqsk in inet_twsk_purge()") Link: https://lore.kernel.org/netdev/CANn89iLvFuuihCtt9PME2uS1WJATnf5fKjDToa1WzVnRzHnPfg@mail.gmail.com/T/#u Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240308200122.64357-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_timewait_sock.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5befa4de5b24..e8de45d34d56 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -263,12 +263,12 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); +/* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) { - struct inet_timewait_sock *tw; - struct sock *sk; struct hlist_nulls_node *node; unsigned int slot; + struct sock *sk; for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; @@ -277,38 +277,35 @@ restart_rcu: rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { - if (sk->sk_state != TCP_TIME_WAIT) { - /* A kernel listener socket might not hold refcnt for net, - * so reqsk_timer_handler() could be fired after net is - * freed. Userspace listener and reqsk never exist here. - */ - if (unlikely(sk->sk_state == TCP_NEW_SYN_RECV && - hashinfo->pernet)) { - struct request_sock *req = inet_reqsk(sk); - - inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); - } + int state = inet_sk_state_load(sk); + if ((1 << state) & ~(TCPF_TIME_WAIT | + TCPF_NEW_SYN_RECV)) continue; - } - tw = inet_twsk(sk); - if ((tw->tw_family != family) || - refcount_read(&twsk_net(tw)->ns.count)) + if (sk->sk_family != family || + refcount_read(&sock_net(sk)->ns.count)) continue; - if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt))) + if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (unlikely((tw->tw_family != family) || - refcount_read(&twsk_net(tw)->ns.count))) { - inet_twsk_put(tw); + if (unlikely(sk->sk_family != family || + refcount_read(&sock_net(sk)->ns.count))) { + sock_gen_put(sk); goto restart; } rcu_read_unlock(); local_bh_disable(); - inet_twsk_deschedule_put(tw); + if (state == TCP_TIME_WAIT) { + inet_twsk_deschedule_put(inet_twsk(sk)); + } else { + struct request_sock *req = inet_reqsk(sk); + + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, + req); + } local_bh_enable(); goto restart_rcu; } -- cgit v1.2.3 From 2a750d6a5b365265dbda33330a6188547ddb5c24 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 8 Mar 2024 12:01:22 -0800 Subject: rds: tcp: Fix use-after-free of net in reqsk_timer_handler(). syzkaller reported a warning of netns tracker [0] followed by KASAN splat [1] and another ref tracker warning [1]. syzkaller could not find a repro, but in the log, the only suspicious sequence was as follows: 18:26:22 executing program 1: r0 = socket$inet6_mptcp(0xa, 0x1, 0x106) ... connect$inet6(r0, &(0x7f0000000080)={0xa, 0x4001, 0x0, @loopback}, 0x1c) (async) The notable thing here is 0x4001 in connect(), which is RDS_TCP_PORT. So, the scenario would be: 1. unshare(CLONE_NEWNET) creates a per netns tcp listener in rds_tcp_listen_init(). 2. syz-executor connect()s to it and creates a reqsk. 3. syz-executor exit()s immediately. 4. netns is dismantled. [0] 5. reqsk timer is fired, and UAF happens while freeing reqsk. [1] 6. listener is freed after RCU grace period. [2] Basically, reqsk assumes that the listener guarantees netns safety until all reqsk timers are expired by holding the listener's refcount. However, this was not the case for kernel sockets. Commit 740ea3c4a0b2 ("tcp: Clean up kernel listener's reqsk in inet_twsk_purge()") fixed this issue only for per-netns ehash. Let's apply the same fix for the global ehash. [0]: ref_tracker: net notrefcnt@0000000065449cc3 has 1/1 users at sk_alloc (./include/net/net_namespace.h:337 net/core/sock.c:2146) inet6_create (net/ipv6/af_inet6.c:192 net/ipv6/af_inet6.c:119) __sock_create (net/socket.c:1572) rds_tcp_listen_init (net/rds/tcp_listen.c:279) rds_tcp_init_net (net/rds/tcp.c:577) ops_init (net/core/net_namespace.c:137) setup_net (net/core/net_namespace.c:340) copy_net_ns (net/core/net_namespace.c:497) create_new_namespaces (kernel/nsproxy.c:110) unshare_nsproxy_namespaces (kernel/nsproxy.c:228 (discriminator 4)) ksys_unshare (kernel/fork.c:3429) __x64_sys_unshare (kernel/fork.c:3496) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) ... WARNING: CPU: 0 PID: 27 at lib/ref_tracker.c:179 ref_tracker_dir_exit (lib/ref_tracker.c:179) [1]: BUG: KASAN: slab-use-after-free in inet_csk_reqsk_queue_drop (./include/net/inet_hashtables.h:180 net/ipv4/inet_connection_sock.c:952 net/ipv4/inet_connection_sock.c:966) Read of size 8 at addr ffff88801b370400 by task swapper/0/0 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1)) print_report (mm/kasan/report.c:378 mm/kasan/report.c:488) kasan_report (mm/kasan/report.c:603) inet_csk_reqsk_queue_drop (./include/net/inet_hashtables.h:180 net/ipv4/inet_connection_sock.c:952 net/ipv4/inet_connection_sock.c:966) reqsk_timer_handler (net/ipv4/inet_connection_sock.c:979 net/ipv4/inet_connection_sock.c:1092) call_timer_fn (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/timer.h:127 kernel/time/timer.c:1701) __run_timers.part.0 (kernel/time/timer.c:1752 kernel/time/timer.c:2038) run_timer_softirq (kernel/time/timer.c:2053) __do_softirq (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/irq.h:142 kernel/softirq.c:554) irq_exit_rcu (kernel/softirq.c:427 kernel/softirq.c:632 kernel/softirq.c:644) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1076 (discriminator 14)) Allocated by task 258 on cpu 0 at 83.612050s: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:68) __kasan_slab_alloc (mm/kasan/common.c:343) kmem_cache_alloc (mm/slub.c:3813 mm/slub.c:3860 mm/slub.c:3867) copy_net_ns (./include/linux/slab.h:701 net/core/net_namespace.c:421 net/core/net_namespace.c:480) create_new_namespaces (kernel/nsproxy.c:110) unshare_nsproxy_namespaces (kernel/nsproxy.c:228 (discriminator 4)) ksys_unshare (kernel/fork.c:3429) __x64_sys_unshare (kernel/fork.c:3496) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) Freed by task 27 on cpu 0 at 329.158864s: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:68) kasan_save_free_info (mm/kasan/generic.c:643) __kasan_slab_free (mm/kasan/common.c:265) kmem_cache_free (mm/slub.c:4299 mm/slub.c:4363) cleanup_net (net/core/net_namespace.c:456 net/core/net_namespace.c:446 net/core/net_namespace.c:639) process_one_work (kernel/workqueue.c:2638) worker_thread (kernel/workqueue.c:2700 kernel/workqueue.c:2787) kthread (kernel/kthread.c:388) ret_from_fork (arch/x86/kernel/process.c:153) ret_from_fork_asm (arch/x86/entry/entry_64.S:250) The buggy address belongs to the object at ffff88801b370000 which belongs to the cache net_namespace of size 4352 The buggy address is located 1024 bytes inside of freed 4352-byte region [ffff88801b370000, ffff88801b371100) [2]: WARNING: CPU: 0 PID: 95 at lib/ref_tracker.c:228 ref_tracker_free (lib/ref_tracker.c:228 (discriminator 1)) Modules linked in: Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:ref_tracker_free (lib/ref_tracker.c:228 (discriminator 1)) ... Call Trace: __sk_destruct (./include/net/net_namespace.h:353 net/core/sock.c:2204) rcu_core (./arch/x86/include/asm/preempt.h:26 kernel/rcu/tree.c:2165 kernel/rcu/tree.c:2433) __do_softirq (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/irq.h:142 kernel/softirq.c:554) irq_exit_rcu (kernel/softirq.c:427 kernel/softirq.c:632 kernel/softirq.c:644) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1076 (discriminator 14)) Reported-by: syzkaller Suggested-by: Eric Dumazet Fixes: 467fa15356ac ("RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns.") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240308200122.64357-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_minisocks.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 52040b0e2616..f0761f060a83 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -398,10 +398,6 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family) /* Even if tw_refcount == 1, we must clean up kernel reqsk */ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family); } else if (!purged_once) { - /* The last refcount is decremented in tcp_sk_exit_batch() */ - if (refcount_read(&net->ipv4.tcp_death_row.tw_refcount) == 1) - continue; - inet_twsk_purge(&tcp_hashinfo, family); purged_once = true; } -- cgit v1.2.3 From 584c2a9184a33a40fceee838f856de3cffa19be3 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Mon, 11 Mar 2024 12:38:29 -0400 Subject: soc: fsl: qbman: Always disable interrupts when taking cgr_lock smp_call_function_single disables IRQs when executing the callback. To prevent deadlocks, we must disable IRQs when taking cgr_lock elsewhere. This is already done by qman_update_cgr and qman_delete_cgr; fix the other lockers. Fixes: 96f413f47677 ("soc/fsl/qbman: fix issue in qman_delete_cgr_safe()") CC: stable@vger.kernel.org Signed-off-by: Sean Anderson Reviewed-by: Camelia Groza Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/soc/fsl/qbman/qman.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c index 739e4eee6b75..1bf1f1ea67f0 100644 --- a/drivers/soc/fsl/qbman/qman.c +++ b/drivers/soc/fsl/qbman/qman.c @@ -1456,11 +1456,11 @@ static void qm_congestion_task(struct work_struct *work) union qm_mc_result *mcr; struct qman_cgr *cgr; - spin_lock(&p->cgr_lock); + spin_lock_irq(&p->cgr_lock); qm_mc_start(&p->p); qm_mc_commit(&p->p, QM_MCC_VERB_QUERYCONGESTION); if (!qm_mc_result_timeout(&p->p, &mcr)) { - spin_unlock(&p->cgr_lock); + spin_unlock_irq(&p->cgr_lock); dev_crit(p->config->dev, "QUERYCONGESTION timeout\n"); qman_p_irqsource_add(p, QM_PIRQ_CSCI); return; @@ -1476,7 +1476,7 @@ static void qm_congestion_task(struct work_struct *work) list_for_each_entry(cgr, &p->cgr_cbs, node) if (cgr->cb && qman_cgrs_get(&c, cgr->cgrid)) cgr->cb(p, cgr, qman_cgrs_get(&rr, cgr->cgrid)); - spin_unlock(&p->cgr_lock); + spin_unlock_irq(&p->cgr_lock); qman_p_irqsource_add(p, QM_PIRQ_CSCI); } @@ -2440,7 +2440,7 @@ int qman_create_cgr(struct qman_cgr *cgr, u32 flags, preempt_enable(); cgr->chan = p->config->channel; - spin_lock(&p->cgr_lock); + spin_lock_irq(&p->cgr_lock); if (opts) { struct qm_mcc_initcgr local_opts = *opts; @@ -2477,7 +2477,7 @@ int qman_create_cgr(struct qman_cgr *cgr, u32 flags, qman_cgrs_get(&p->cgrs[1], cgr->cgrid)) cgr->cb(p, cgr, 1); out: - spin_unlock(&p->cgr_lock); + spin_unlock_irq(&p->cgr_lock); put_affine_portal(); return ret; } -- cgit v1.2.3 From fbec4e7fed89b579f2483041fabf9650fb0dd6bc Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Mon, 11 Mar 2024 12:38:30 -0400 Subject: soc: fsl: qbman: Use raw spinlock for cgr_lock smp_call_function always runs its callback in hard IRQ context, even on PREEMPT_RT, where spinlocks can sleep. So we need to use a raw spinlock for cgr_lock to ensure we aren't waiting on a sleeping task. Although this bug has existed for a while, it was not apparent until commit ef2a8d5478b9 ("net: dpaa: Adjust queue depth on rate change") which invokes smp_call_function_single via qman_update_cgr_safe every time a link goes up or down. Fixes: 96f413f47677 ("soc/fsl/qbman: fix issue in qman_delete_cgr_safe()") CC: stable@vger.kernel.org Reported-by: Vladimir Oltean Closes: https://lore.kernel.org/all/20230323153935.nofnjucqjqnz34ej@skbuf/ Reported-by: Steffen Trumtrar Closes: https://lore.kernel.org/linux-arm-kernel/87wmsyvclu.fsf@pengutronix.de/ Signed-off-by: Sean Anderson Reviewed-by: Camelia Groza Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/soc/fsl/qbman/qman.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c index 1bf1f1ea67f0..7e9074519ad2 100644 --- a/drivers/soc/fsl/qbman/qman.c +++ b/drivers/soc/fsl/qbman/qman.c @@ -991,7 +991,7 @@ struct qman_portal { /* linked-list of CSCN handlers. */ struct list_head cgr_cbs; /* list lock */ - spinlock_t cgr_lock; + raw_spinlock_t cgr_lock; struct work_struct congestion_work; struct work_struct mr_work; char irqname[MAX_IRQNAME]; @@ -1281,7 +1281,7 @@ static int qman_create_portal(struct qman_portal *portal, /* if the given mask is NULL, assume all CGRs can be seen */ qman_cgrs_fill(&portal->cgrs[0]); INIT_LIST_HEAD(&portal->cgr_cbs); - spin_lock_init(&portal->cgr_lock); + raw_spin_lock_init(&portal->cgr_lock); INIT_WORK(&portal->congestion_work, qm_congestion_task); INIT_WORK(&portal->mr_work, qm_mr_process_task); portal->bits = 0; @@ -1456,11 +1456,14 @@ static void qm_congestion_task(struct work_struct *work) union qm_mc_result *mcr; struct qman_cgr *cgr; - spin_lock_irq(&p->cgr_lock); + /* + * FIXME: QM_MCR_TIMEOUT is 10ms, which is too long for a raw spinlock! + */ + raw_spin_lock_irq(&p->cgr_lock); qm_mc_start(&p->p); qm_mc_commit(&p->p, QM_MCC_VERB_QUERYCONGESTION); if (!qm_mc_result_timeout(&p->p, &mcr)) { - spin_unlock_irq(&p->cgr_lock); + raw_spin_unlock_irq(&p->cgr_lock); dev_crit(p->config->dev, "QUERYCONGESTION timeout\n"); qman_p_irqsource_add(p, QM_PIRQ_CSCI); return; @@ -1476,7 +1479,7 @@ static void qm_congestion_task(struct work_struct *work) list_for_each_entry(cgr, &p->cgr_cbs, node) if (cgr->cb && qman_cgrs_get(&c, cgr->cgrid)) cgr->cb(p, cgr, qman_cgrs_get(&rr, cgr->cgrid)); - spin_unlock_irq(&p->cgr_lock); + raw_spin_unlock_irq(&p->cgr_lock); qman_p_irqsource_add(p, QM_PIRQ_CSCI); } @@ -2440,7 +2443,7 @@ int qman_create_cgr(struct qman_cgr *cgr, u32 flags, preempt_enable(); cgr->chan = p->config->channel; - spin_lock_irq(&p->cgr_lock); + raw_spin_lock_irq(&p->cgr_lock); if (opts) { struct qm_mcc_initcgr local_opts = *opts; @@ -2477,7 +2480,7 @@ int qman_create_cgr(struct qman_cgr *cgr, u32 flags, qman_cgrs_get(&p->cgrs[1], cgr->cgrid)) cgr->cb(p, cgr, 1); out: - spin_unlock_irq(&p->cgr_lock); + raw_spin_unlock_irq(&p->cgr_lock); put_affine_portal(); return ret; } @@ -2512,7 +2515,7 @@ int qman_delete_cgr(struct qman_cgr *cgr) return -EINVAL; memset(&local_opts, 0, sizeof(struct qm_mcc_initcgr)); - spin_lock_irqsave(&p->cgr_lock, irqflags); + raw_spin_lock_irqsave(&p->cgr_lock, irqflags); list_del(&cgr->node); /* * If there are no other CGR objects for this CGRID in the list, @@ -2537,7 +2540,7 @@ int qman_delete_cgr(struct qman_cgr *cgr) /* add back to the list */ list_add(&cgr->node, &p->cgr_cbs); release_lock: - spin_unlock_irqrestore(&p->cgr_lock, irqflags); + raw_spin_unlock_irqrestore(&p->cgr_lock, irqflags); put_affine_portal(); return ret; } @@ -2577,9 +2580,9 @@ static int qman_update_cgr(struct qman_cgr *cgr, struct qm_mcc_initcgr *opts) if (!p) return -EINVAL; - spin_lock_irqsave(&p->cgr_lock, irqflags); + raw_spin_lock_irqsave(&p->cgr_lock, irqflags); ret = qm_modify_cgr(cgr, 0, opts); - spin_unlock_irqrestore(&p->cgr_lock, irqflags); + raw_spin_unlock_irqrestore(&p->cgr_lock, irqflags); put_affine_portal(); return ret; } -- cgit v1.2.3 From e642921dfeed1e15e73f78f2c3b6746f72b6deb2 Mon Sep 17 00:00:00 2001 From: Linu Cherian Date: Tue, 12 Mar 2024 12:36:22 +0530 Subject: octeontx2-af: Use matching wake_up API variant in CGX command interface Use wake_up API instead of wake_up_interruptible, since wait_event_timeout API is used for waiting on command completion. Fixes: 1463f382f58d ("octeontx2-af: Add support for CGX link management") Signed-off-by: Linu Cherian Signed-off-by: Sunil Goutham Signed-off-by: Subbaraya Sundeep Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 6c70c8498690..3c0f55b3e48e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -1338,7 +1338,7 @@ static irqreturn_t cgx_fwi_event_handler(int irq, void *data) /* Release thread waiting for completion */ lmac->cmd_pend = false; - wake_up_interruptible(&lmac->wq_cmd_cmplt); + wake_up(&lmac->wq_cmd_cmplt); break; case CGX_EVT_ASYNC: if (cgx_event_is_linkevent(event)) -- cgit v1.2.3 From 343041b59b7810f9cdca371f445dd43b35c740b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Mar 2024 20:46:28 +0000 Subject: net/sched: taprio: proper TCA_TAPRIO_TC_ENTRY_INDEX check taprio_parse_tc_entry() is not correctly checking TCA_TAPRIO_TC_ENTRY_INDEX attribute: int tc; // Signed value tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); if (tc >= TC_QOPT_MAX_QUEUE) { NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); return -ERANGE; } syzbot reported that it could fed arbitary negative values: UBSAN: shift-out-of-bounds in net/sched/sch_taprio.c:1722:18 shift exponent -2147418108 is negative CPU: 0 PID: 5066 Comm: syz-executor367 Not tainted 6.8.0-rc7-syzkaller-00136-gc8a5c731fd12 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2e0 lib/dump_stack.c:106 ubsan_epilogue lib/ubsan.c:217 [inline] __ubsan_handle_shift_out_of_bounds+0x3c7/0x420 lib/ubsan.c:386 taprio_parse_tc_entry net/sched/sch_taprio.c:1722 [inline] taprio_parse_tc_entries net/sched/sch_taprio.c:1768 [inline] taprio_change+0xb87/0x57d0 net/sched/sch_taprio.c:1877 taprio_init+0x9da/0xc80 net/sched/sch_taprio.c:2134 qdisc_create+0x9d4/0x1190 net/sched/sch_api.c:1355 tc_modify_qdisc+0xa26/0x1e40 net/sched/sch_api.c:1776 rtnetlink_rcv_msg+0x885/0x1040 net/core/rtnetlink.c:6617 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543 netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1367 netlink_sendmsg+0xa3b/0xd70 net/netlink/af_netlink.c:1908 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2584 ___sys_sendmsg net/socket.c:2638 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2667 do_syscall_64+0xf9/0x240 entry_SYSCALL_64_after_hwframe+0x6f/0x77 RIP: 0033:0x7f1b2dea3759 Code: 48 83 c4 28 c3 e8 d7 19 00 00 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffd4de452f8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f1b2def0390 RCX: 00007f1b2dea3759 RDX: 0000000000000000 RSI: 00000000200007c0 RDI: 0000000000000004 RBP: 0000000000000003 R08: 0000555500000000 R09: 0000555500000000 R10: 0000555500000000 R11: 0000000000000246 R12: 00007ffd4de45340 R13: 00007ffd4de45310 R14: 0000000000000001 R15: 00007ffd4de45340 Fixes: a54fc09e4cba ("net/sched: taprio: allow user input of per-tc max SDU") Reported-and-tested-by: syzbot+a340daa06412d6028918@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Cc: Vladimir Oltean Reviewed-by: Michal Kubiak Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index c5de70efdc86..a0d54b422186 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -997,7 +997,8 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { - [TCA_TAPRIO_TC_ENTRY_INDEX] = { .type = NLA_U32 }, + [TCA_TAPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, + TC_QOPT_MAX_QUEUE), [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, -- cgit v1.2.3 From d7d75124965aee23e5e4421d78376545cf070b0a Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 12 Mar 2024 12:52:38 +0200 Subject: devlink: Fix devlink parallel commands processing Commit 870c7ad4a52b ("devlink: protect devlink->dev by the instance lock") added devlink instance locking inside a loop that iterates over all the registered devlink instances on the machine in the pre-doit phase. This can lead to serialization of devlink commands over different devlink instances. For example: While the first devlink instance is executing firmware flash, all commands to other devlink instances on the machine are forced to wait until the first devlink finishes. Therefore, in the pre-doit phase, take the devlink instance lock only for the devlink instance the command is targeting. Devlink layer is taking a reference on the devlink instance, ensuring the devlink->dev pointer is valid. This reference taking was introduced by commit a380687200e0 ("devlink: take device reference for devlink object"). Without this commit, it would not be safe to access devlink->dev lockless. Fixes: 870c7ad4a52b ("devlink: protect devlink->dev by the instance lock") Signed-off-by: Shay Drory Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/devlink/netlink.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 499885c8b9ca..593605c1b1ef 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -193,12 +193,13 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs, devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); devlinks_xa_for_each_registered_get(net, index, devlink) { - devl_dev_lock(devlink, dev_lock); - if (devl_is_registered(devlink) && - strcmp(devlink->dev->bus->name, busname) == 0 && - strcmp(dev_name(devlink->dev), devname) == 0) - return devlink; - devl_dev_unlock(devlink, dev_lock); + if (strcmp(devlink->dev->bus->name, busname) == 0 && + strcmp(dev_name(devlink->dev), devname) == 0) { + devl_dev_lock(devlink, dev_lock); + if (devl_is_registered(devlink)) + return devlink; + devl_dev_unlock(devlink, dev_lock); + } devlink_put(devlink); } -- cgit v1.2.3 From 04d9d1fc428ac9f581d55118d67e0cb546701feb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 8 Mar 2024 12:16:23 -0800 Subject: tcp: Fix refcnt handling in __inet_hash_connect(). syzbot reported a warning in sk_nulls_del_node_init_rcu(). The commit 66b60b0c8c4a ("dccp/tcp: Unhash sk from ehash for tb2 alloc failure after check_estalblished().") tried to fix an issue that an unconnected socket occupies an ehash entry when bhash2 allocation fails. In such a case, we need to revert changes done by check_established(), which does not hold refcnt when inserting socket into ehash. So, to revert the change, we need to __sk_nulls_add_node_rcu() instead of sk_nulls_add_node_rcu(). Otherwise, sock_put() will cause refcnt underflow and leak the socket. [0]: WARNING: CPU: 0 PID: 23948 at include/net/sock.h:799 sk_nulls_del_node_init_rcu+0x166/0x1a0 include/net/sock.h:799 Modules linked in: CPU: 0 PID: 23948 Comm: syz-executor.2 Not tainted 6.8.0-rc6-syzkaller-00159-gc055fc00c07b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 RIP: 0010:sk_nulls_del_node_init_rcu+0x166/0x1a0 include/net/sock.h:799 Code: e8 7f 71 c6 f7 83 fb 02 7c 25 e8 35 6d c6 f7 4d 85 f6 0f 95 c0 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc e8 1b 6d c6 f7 90 <0f> 0b 90 eb b2 e8 10 6d c6 f7 4c 89 e7 be 04 00 00 00 e8 63 e7 d2 RSP: 0018:ffffc900032d7848 EFLAGS: 00010246 RAX: ffffffff89cd0035 RBX: 0000000000000001 RCX: 0000000000040000 RDX: ffffc90004de1000 RSI: 000000000003ffff RDI: 0000000000040000 RBP: 1ffff1100439ac26 R08: ffffffff89ccffe3 R09: 1ffff1100439ac28 R10: dffffc0000000000 R11: ffffed100439ac29 R12: ffff888021cd6140 R13: dffffc0000000000 R14: ffff88802a9bf5c0 R15: ffff888021cd6130 FS: 00007f3b823f16c0(0000) GS:ffff8880b9400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3b823f0ff8 CR3: 000000004674a000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __inet_hash_connect+0x140f/0x20b0 net/ipv4/inet_hashtables.c:1139 dccp_v6_connect+0xcb9/0x1480 net/dccp/ipv6.c:956 __inet_stream_connect+0x262/0xf30 net/ipv4/af_inet.c:678 inet_stream_connect+0x65/0xa0 net/ipv4/af_inet.c:749 __sys_connect_file net/socket.c:2048 [inline] __sys_connect+0x2df/0x310 net/socket.c:2065 __do_sys_connect net/socket.c:2075 [inline] __se_sys_connect net/socket.c:2072 [inline] __x64_sys_connect+0x7a/0x90 net/socket.c:2072 do_syscall_64+0xf9/0x240 entry_SYSCALL_64_after_hwframe+0x6f/0x77 RIP: 0033:0x7f3b8167dda9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f3b823f10c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 00007f3b817abf80 RCX: 00007f3b8167dda9 RDX: 000000000000001c RSI: 0000000020000040 RDI: 0000000000000003 RBP: 00007f3b823f1120 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001 R13: 000000000000000b R14: 00007f3b817abf80 R15: 00007ffd3beb57b8 Reported-by: syzbot+12c506c1aae251e70449@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=12c506c1aae251e70449 Fixes: 66b60b0c8c4a ("dccp/tcp: Unhash sk from ehash for tb2 alloc failure after check_estalblished().") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240308201623.65448-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv4/inet_hashtables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 7498af320164..cf88eca5f1b4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1135,7 +1135,7 @@ error: sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(lock); - sk_nulls_del_node_init_rcu(sk); + __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); sk->sk_hash = 0; -- cgit v1.2.3 From e127ce7699c1e05279ee5ee61f00893e7bfa9671 Mon Sep 17 00:00:00 2001 From: William Tu Date: Sat, 9 Mar 2024 20:31:47 +0200 Subject: vmxnet3: Fix missing reserved tailroom Use rbi->len instead of rcd->len for non-dataring packet. Found issue: XDP_WARN: xdp_update_frame_from_buff(line:278): Driver BUG: missing reserved tailroom WARNING: CPU: 0 PID: 0 at net/core/xdp.c:586 xdp_warn+0xf/0x20 CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W O 6.5.1 #1 RIP: 0010:xdp_warn+0xf/0x20 ... ? xdp_warn+0xf/0x20 xdp_do_redirect+0x15f/0x1c0 vmxnet3_run_xdp+0x17a/0x400 [vmxnet3] vmxnet3_process_xdp+0xe4/0x760 [vmxnet3] ? vmxnet3_tq_tx_complete.isra.0+0x21e/0x2c0 [vmxnet3] vmxnet3_rq_rx_complete+0x7ad/0x1120 [vmxnet3] vmxnet3_poll_rx_only+0x2d/0xa0 [vmxnet3] __napi_poll+0x20/0x180 net_rx_action+0x177/0x390 Reported-by: Martin Zaharinov Tested-by: Martin Zaharinov Link: https://lore.kernel.org/netdev/74BF3CC8-2A3A-44FF-98C2-1E20F110A92E@gmail.com/ Fixes: 54f00cce1178 ("vmxnet3: Add XDP support.") Signed-off-by: William Tu Link: https://lore.kernel.org/r/20240309183147.28222-1-witu@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/vmxnet3/vmxnet3_xdp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/vmxnet3/vmxnet3_xdp.c b/drivers/net/vmxnet3/vmxnet3_xdp.c index 80ddaff759d4..a6c787454a1a 100644 --- a/drivers/net/vmxnet3/vmxnet3_xdp.c +++ b/drivers/net/vmxnet3/vmxnet3_xdp.c @@ -382,12 +382,12 @@ vmxnet3_process_xdp(struct vmxnet3_adapter *adapter, page = rbi->page; dma_sync_single_for_cpu(&adapter->pdev->dev, page_pool_get_dma_addr(page) + - rq->page_pool->p.offset, rcd->len, + rq->page_pool->p.offset, rbi->len, page_pool_get_dma_dir(rq->page_pool)); - xdp_init_buff(&xdp, rbi->len, &rq->xdp_rxq); + xdp_init_buff(&xdp, PAGE_SIZE, &rq->xdp_rxq); xdp_prepare_buff(&xdp, page_address(page), rq->page_pool->p.offset, - rcd->len, false); + rbi->len, false); xdp_buff_clear_frags_flag(&xdp); xdp_prog = rcu_dereference(rq->adapter->xdp_bpf_prog); -- cgit v1.2.3 From ddbec99f58571301679addbc022256970ca3eac6 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Wed, 13 Mar 2024 00:27:19 +0900 Subject: hsr: Fix uninit-value access in hsr_get_node() KMSAN reported the following uninit-value access issue [1]: ===================================================== BUG: KMSAN: uninit-value in hsr_get_node+0xa2e/0xa40 net/hsr/hsr_framereg.c:246 hsr_get_node+0xa2e/0xa40 net/hsr/hsr_framereg.c:246 fill_frame_info net/hsr/hsr_forward.c:577 [inline] hsr_forward_skb+0xe12/0x30e0 net/hsr/hsr_forward.c:615 hsr_dev_xmit+0x1a1/0x270 net/hsr/hsr_device.c:223 __netdev_start_xmit include/linux/netdevice.h:4940 [inline] netdev_start_xmit include/linux/netdevice.h:4954 [inline] xmit_one net/core/dev.c:3548 [inline] dev_hard_start_xmit+0x247/0xa10 net/core/dev.c:3564 __dev_queue_xmit+0x33b8/0x5130 net/core/dev.c:4349 dev_queue_xmit include/linux/netdevice.h:3134 [inline] packet_xmit+0x9c/0x6b0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3087 [inline] packet_sendmsg+0x8b1d/0x9f30 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] __sys_sendto+0x735/0xa10 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1c0 net/socket.c:2199 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x6d/0x140 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x318/0x740 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6334 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2787 packet_alloc_skb net/packet/af_packet.c:2936 [inline] packet_snd net/packet/af_packet.c:3030 [inline] packet_sendmsg+0x70e8/0x9f30 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] __sys_sendto+0x735/0xa10 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1c0 net/socket.c:2199 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x6d/0x140 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 1 PID: 5033 Comm: syz-executor334 Not tainted 6.7.0-syzkaller-00562-g9f8413c4a66f #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 ===================================================== If the packet type ID field in the Ethernet header is either ETH_P_PRP or ETH_P_HSR, but it is not followed by an HSR tag, hsr_get_skb_sequence_nr() reads an invalid value as a sequence number. This causes the above issue. This patch fixes the issue by returning NULL if the Ethernet header is not followed by an HSR tag. Fixes: f266a683a480 ("net/hsr: Better frame dispatch") Reported-and-tested-by: syzbot+2ef3a8ce8e91b5a50098@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2ef3a8ce8e91b5a50098 [1] Signed-off-by: Shigeru Yoshida Link: https://lore.kernel.org/r/20240312152719.724530-1-syoshida@redhat.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_framereg.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 6d14d935ee82..26329db09210 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -228,6 +228,10 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, */ if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { + /* Check if skb contains hsr_ethhdr */ + if (skb->mac_len < sizeof(struct hsr_ethhdr)) + return NULL; + /* Use the existing sequence_nr from the tag as starting point * for filtering duplicate frames. */ -- cgit v1.2.3 From 6b2536462fd48b49563aef0555517cb91047c5f5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 Mar 2024 23:37:17 +0000 Subject: rxrpc: Fix use of changed alignment param to page_frag_alloc_align() Commit 411c5f36805c ("mm/page_alloc: modify page_frag_alloc_align() to accept align as an argument") changed the way page_frag_alloc_align() worked, but it didn't fix AF_RXRPC as that use of that allocator function hadn't been merged yet at the time. Now, when the AFS filesystem is used, this results in: WARNING: CPU: 4 PID: 379 at include/linux/gfp.h:323 rxrpc_alloc_data_txbuf+0x9d/0x2b0 [rxrpc] Fix this by using __page_frag_alloc_align() instead. Note that it might be better to use an order-based alignment rather than a mask-based alignment. Fixes: 49489bb03a50 ("rxrpc: Do zerocopy using MSG_SPLICE_PAGES and page frags") Signed-off-by: David Howells Reported-by: Marc Dionne cc: Yunsheng Lin cc: Alexander Duyck cc: Michael S. Tsirkin cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: Paolo Abeni --- net/rxrpc/txbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index b2a82ab756c2..e0679658d9de 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -33,8 +33,8 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ total = hoff + sizeof(*whdr) + data_size; mutex_lock(&call->conn->tx_data_alloc_lock); - buf = page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp, - ~(data_align - 1) & ~(L1_CACHE_BYTES - 1)); + buf = __page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp, + ~(data_align - 1) & ~(L1_CACHE_BYTES - 1)); mutex_unlock(&call->conn->tx_data_alloc_lock); if (!buf) { kfree(txb); -- cgit v1.2.3 From 89e4354110ca64bf4949cca83b55149bc80733bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 Mar 2024 23:37:18 +0000 Subject: rxrpc: Fix error check on ->alloc_txbuf() rxrpc_alloc_*_txbuf() and ->alloc_txbuf() return NULL to indicate no memory, but rxrpc_send_data() uses IS_ERR(). Fix rxrpc_send_data() to check for NULL only and set -ENOMEM if it sees that. Fixes: 49489bb03a50 ("rxrpc: Do zerocopy using MSG_SPLICE_PAGES and page frags") Signed-off-by: David Howells Reported-by: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: Paolo Abeni --- net/rxrpc/sendmsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 6f765768c49c..894b8fa68e5e 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -349,8 +349,8 @@ reload: */ remain = more ? INT_MAX : msg_data_left(msg); txb = call->conn->security->alloc_txbuf(call, remain, sk->sk_allocation); - if (IS_ERR(txb)) { - ret = PTR_ERR(txb); + if (!txb) { + ret = -ENOMEM; goto maybe_error; } } -- cgit v1.2.3 From 1c6368679979019f884557b1843e1dedffac231c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 12 Mar 2024 20:23:29 -0700 Subject: docs: networking: fix indentation errors in multi-pf-netdev Stephen reports new warnings in the docs: Documentation/networking/multi-pf-netdev.rst:94: ERROR: Unexpected indentation. Documentation/networking/multi-pf-netdev.rst:106: ERROR: Unexpected indentation. Fixes: 77d9ec3f6c8c ("Documentation: networking: Add description for multi-pf netdev") Reported-by: Stephen Rothwell Link: https://lore.kernel.org/all/20240312153304.0ef1b78e@canb.auug.org.au/ Signed-off-by: Jakub Kicinski Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20240313032329.3919036-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/networking/multi-pf-netdev.rst | 58 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/Documentation/networking/multi-pf-netdev.rst b/Documentation/networking/multi-pf-netdev.rst index be8e4bcadf11..268819225866 100644 --- a/Documentation/networking/multi-pf-netdev.rst +++ b/Documentation/networking/multi-pf-netdev.rst @@ -87,35 +87,35 @@ all using the same instance under "priv->mdev". Observability ============= -The relation between PF, irq, napi, and queue can be observed via netlink spec: - -$ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump queue-get --json='{"ifindex": 13}' -[{'id': 0, 'ifindex': 13, 'napi-id': 539, 'type': 'rx'}, - {'id': 1, 'ifindex': 13, 'napi-id': 540, 'type': 'rx'}, - {'id': 2, 'ifindex': 13, 'napi-id': 541, 'type': 'rx'}, - {'id': 3, 'ifindex': 13, 'napi-id': 542, 'type': 'rx'}, - {'id': 4, 'ifindex': 13, 'napi-id': 543, 'type': 'rx'}, - {'id': 0, 'ifindex': 13, 'napi-id': 539, 'type': 'tx'}, - {'id': 1, 'ifindex': 13, 'napi-id': 540, 'type': 'tx'}, - {'id': 2, 'ifindex': 13, 'napi-id': 541, 'type': 'tx'}, - {'id': 3, 'ifindex': 13, 'napi-id': 542, 'type': 'tx'}, - {'id': 4, 'ifindex': 13, 'napi-id': 543, 'type': 'tx'}] - -$ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump napi-get --json='{"ifindex": 13}' -[{'id': 543, 'ifindex': 13, 'irq': 42}, - {'id': 542, 'ifindex': 13, 'irq': 41}, - {'id': 541, 'ifindex': 13, 'irq': 40}, - {'id': 540, 'ifindex': 13, 'irq': 39}, - {'id': 539, 'ifindex': 13, 'irq': 36}] - -Here you can clearly observe our channels distribution policy: - -$ ls /proc/irq/{36,39,40,41,42}/mlx5* -d -1 -/proc/irq/36/mlx5_comp1@pci:0000:08:00.0 -/proc/irq/39/mlx5_comp1@pci:0000:09:00.0 -/proc/irq/40/mlx5_comp2@pci:0000:08:00.0 -/proc/irq/41/mlx5_comp2@pci:0000:09:00.0 -/proc/irq/42/mlx5_comp3@pci:0000:08:00.0 +The relation between PF, irq, napi, and queue can be observed via netlink spec:: + + $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump queue-get --json='{"ifindex": 13}' + [{'id': 0, 'ifindex': 13, 'napi-id': 539, 'type': 'rx'}, + {'id': 1, 'ifindex': 13, 'napi-id': 540, 'type': 'rx'}, + {'id': 2, 'ifindex': 13, 'napi-id': 541, 'type': 'rx'}, + {'id': 3, 'ifindex': 13, 'napi-id': 542, 'type': 'rx'}, + {'id': 4, 'ifindex': 13, 'napi-id': 543, 'type': 'rx'}, + {'id': 0, 'ifindex': 13, 'napi-id': 539, 'type': 'tx'}, + {'id': 1, 'ifindex': 13, 'napi-id': 540, 'type': 'tx'}, + {'id': 2, 'ifindex': 13, 'napi-id': 541, 'type': 'tx'}, + {'id': 3, 'ifindex': 13, 'napi-id': 542, 'type': 'tx'}, + {'id': 4, 'ifindex': 13, 'napi-id': 543, 'type': 'tx'}] + + $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump napi-get --json='{"ifindex": 13}' + [{'id': 543, 'ifindex': 13, 'irq': 42}, + {'id': 542, 'ifindex': 13, 'irq': 41}, + {'id': 541, 'ifindex': 13, 'irq': 40}, + {'id': 540, 'ifindex': 13, 'irq': 39}, + {'id': 539, 'ifindex': 13, 'irq': 36}] + +Here you can clearly observe our channels distribution policy:: + + $ ls /proc/irq/{36,39,40,41,42}/mlx5* -d -1 + /proc/irq/36/mlx5_comp1@pci:0000:08:00.0 + /proc/irq/39/mlx5_comp1@pci:0000:09:00.0 + /proc/irq/40/mlx5_comp2@pci:0000:08:00.0 + /proc/irq/41/mlx5_comp2@pci:0000:09:00.0 + /proc/irq/42/mlx5_comp3@pci:0000:08:00.0 Steering ======== -- cgit v1.2.3 From e30cef001da259e8df354b813015d0e5acc08740 Mon Sep 17 00:00:00 2001 From: Duanqiang Wen Date: Wed, 13 Mar 2024 16:06:34 +0800 Subject: net: txgbe: fix clk_name exceed MAX_DEV_ID limits txgbe register clk which name is i2c_designware.pci_dev_id(), clk_name will be stored in clk_lookup_alloc. If PCIe bus number is larger than 0x39, clk_name size will be larger than 20 bytes. It exceeds clk_lookup_alloc MAX_DEV_ID limits. So the driver shortened clk_name. Fixes: b63f20485e43 ("net: txgbe: Register fixed rate clock") Signed-off-by: Duanqiang Wen Reviewed-by: Michal Kubiak Link: https://lore.kernel.org/r/20240313080634.459523-1-duanqiangwen@net-swift.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c index 93295916b1d2..5b5d5e4310d1 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c @@ -571,7 +571,7 @@ static int txgbe_clock_register(struct txgbe *txgbe) char clk_name[32]; struct clk *clk; - snprintf(clk_name, sizeof(clk_name), "i2c_designware.%d", + snprintf(clk_name, sizeof(clk_name), "i2c_dw.%d", pci_dev_id(pdev)); clk = clk_register_fixed_rate(NULL, clk_name, NULL, 0, 156250000); -- cgit v1.2.3 From e54e09c05c00120cbe817bdb037088035be4bd79 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Mar 2024 09:55:45 -0600 Subject: net: remove {revc,send}msg_copy_msghdr() from exports The only user of these was io_uring, and it's not using them anymore. Make them static and remove them from the socket header file. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/1b6089d3-c1cf-464a-abd3-b0f0b6bb2523@kernel.dk Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 7 ------- net/socket.c | 14 +++++++------- tools/perf/trace/beauty/include/linux/socket.h | 7 ------- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index cfcb7e2c3813..139c330ccf2c 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -422,13 +422,6 @@ extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags); -extern int sendmsg_copy_msghdr(struct msghdr *msg, - struct user_msghdr __user *umsg, unsigned flags, - struct iovec **iov); -extern int recvmsg_copy_msghdr(struct msghdr *msg, - struct user_msghdr __user *umsg, unsigned flags, - struct sockaddr __user **uaddr, - struct iovec **iov); extern int __copy_msghdr(struct msghdr *kmsg, struct user_msghdr *umsg, struct sockaddr __user **save_addr); diff --git a/net/socket.c b/net/socket.c index 7e9c8fc9a5b4..e5f3af49a8b6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2600,9 +2600,9 @@ out: return err; } -int sendmsg_copy_msghdr(struct msghdr *msg, - struct user_msghdr __user *umsg, unsigned flags, - struct iovec **iov) +static int sendmsg_copy_msghdr(struct msghdr *msg, + struct user_msghdr __user *umsg, unsigned flags, + struct iovec **iov) { int err; @@ -2753,10 +2753,10 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, return __sys_sendmmsg(fd, mmsg, vlen, flags, true); } -int recvmsg_copy_msghdr(struct msghdr *msg, - struct user_msghdr __user *umsg, unsigned flags, - struct sockaddr __user **uaddr, - struct iovec **iov) +static int recvmsg_copy_msghdr(struct msghdr *msg, + struct user_msghdr __user *umsg, unsigned flags, + struct sockaddr __user **uaddr, + struct iovec **iov) { ssize_t err; diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index cfcb7e2c3813..139c330ccf2c 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -422,13 +422,6 @@ extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags); -extern int sendmsg_copy_msghdr(struct msghdr *msg, - struct user_msghdr __user *umsg, unsigned flags, - struct iovec **iov); -extern int recvmsg_copy_msghdr(struct msghdr *msg, - struct user_msghdr __user *umsg, unsigned flags, - struct sockaddr __user **uaddr, - struct iovec **iov); extern int __copy_msghdr(struct msghdr *kmsg, struct user_msghdr *umsg, struct sockaddr __user **save_addr); -- cgit v1.2.3 From f1b85ef15a99f06ed48871ce933d591127d2dcc0 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Wed, 13 Mar 2024 22:50:18 +0000 Subject: net: mediatek: mtk_eth_soc: clear MAC_MCR_FORCE_LINK only when MAC is up Clearing bit MAC_MCR_FORCE_LINK which forces the link down too early can result in MAC ending up in a broken/blocked state. Fix this by handling this bit in the .mac_link_up and .mac_link_down calls instead of in .mac_finish. Fixes: b8fc9f30821e ("net: ethernet: mediatek: Add basic PHYLINK support") Suggested-by: Mason-cw Chang Signed-off-by: Daniel Golle Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index de123350bd46..caa13b9cedff 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -677,8 +677,7 @@ static int mtk_mac_finish(struct phylink_config *config, unsigned int mode, mcr_cur = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); mcr_new = mcr_cur; mcr_new |= MAC_MCR_IPG_CFG | MAC_MCR_FORCE_MODE | - MAC_MCR_BACKOFF_EN | MAC_MCR_BACKPR_EN | MAC_MCR_FORCE_LINK | - MAC_MCR_RX_FIFO_CLR_DIS; + MAC_MCR_BACKOFF_EN | MAC_MCR_BACKPR_EN | MAC_MCR_RX_FIFO_CLR_DIS; /* Only update control register when needed! */ if (mcr_new != mcr_cur) @@ -694,7 +693,7 @@ static void mtk_mac_link_down(struct phylink_config *config, unsigned int mode, phylink_config); u32 mcr = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); - mcr &= ~(MAC_MCR_TX_EN | MAC_MCR_RX_EN); + mcr &= ~(MAC_MCR_TX_EN | MAC_MCR_RX_EN | MAC_MCR_FORCE_LINK); mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); } @@ -803,7 +802,7 @@ static void mtk_mac_link_up(struct phylink_config *config, if (rx_pause) mcr |= MAC_MCR_FORCE_RX_FC; - mcr |= MAC_MCR_TX_EN | MAC_MCR_RX_EN; + mcr |= MAC_MCR_TX_EN | MAC_MCR_RX_EN | MAC_MCR_FORCE_LINK; mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); } -- cgit v1.2.3 From ea80e3ed09ab2c2b75724faf5484721753e92c31 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Wed, 13 Mar 2024 22:50:40 +0000 Subject: net: ethernet: mtk_eth_soc: fix PPE hanging issue A patch to resolve an issue was found in MediaTek's GPL-licensed SDK: In the mtk_ppe_stop() function, the PPE scan mode is not disabled before disabling the PPE. This can potentially lead to a hang during the process of disabling the PPE. Without this patch, the PPE may experience a hang during the reboot test. Link: https://git01.mediatek.com/plugins/gitiles/openwrt/feeds/mtk-openwrt-feeds/+/b40da332dfe763932a82f9f62a4709457a15dd6c Fixes: ba37b7caf1ed ("net: ethernet: mtk_eth_soc: add support for initializing the PPE") Suggested-by: Bc-bocun Chen Signed-off-by: Daniel Golle Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_ppe.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c index b2a5d9c3733d..6ce0db3a1a92 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe.c @@ -994,7 +994,7 @@ void mtk_ppe_start(struct mtk_ppe *ppe) MTK_PPE_KEEPALIVE_DISABLE) | FIELD_PREP(MTK_PPE_TB_CFG_HASH_MODE, 1) | FIELD_PREP(MTK_PPE_TB_CFG_SCAN_MODE, - MTK_PPE_SCAN_MODE_KEEPALIVE_AGE) | + MTK_PPE_SCAN_MODE_CHECK_AGE) | FIELD_PREP(MTK_PPE_TB_CFG_ENTRY_NUM, MTK_PPE_ENTRIES_SHIFT); if (mtk_is_netsys_v2_or_greater(ppe->eth)) @@ -1090,17 +1090,21 @@ int mtk_ppe_stop(struct mtk_ppe *ppe) mtk_ppe_cache_enable(ppe, false); - /* disable offload engine */ - ppe_clear(ppe, MTK_PPE_GLO_CFG, MTK_PPE_GLO_CFG_EN); - ppe_w32(ppe, MTK_PPE_FLOW_CFG, 0); - /* disable aging */ val = MTK_PPE_TB_CFG_AGE_NON_L4 | MTK_PPE_TB_CFG_AGE_UNBIND | MTK_PPE_TB_CFG_AGE_TCP | MTK_PPE_TB_CFG_AGE_UDP | - MTK_PPE_TB_CFG_AGE_TCP_FIN; + MTK_PPE_TB_CFG_AGE_TCP_FIN | + MTK_PPE_TB_CFG_SCAN_MODE; ppe_clear(ppe, MTK_PPE_TB_CFG, val); - return mtk_ppe_wait_busy(ppe); + if (mtk_ppe_wait_busy(ppe)) + return -ETIMEDOUT; + + /* disable offload engine */ + ppe_clear(ppe, MTK_PPE_GLO_CFG, MTK_PPE_GLO_CFG_EN); + ppe_w32(ppe, MTK_PPE_FLOW_CFG, 0); + + return 0; } -- cgit v1.2.3 From badc9e33c79541cd62fc8238c2eb564d41feac56 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Thu, 14 Mar 2024 13:33:46 +0100 Subject: net: wan: fsl_qmc_hdlc: Fix module compilation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fsl_qmc_driver does not compile as module: error: ‘qmc_hdlc_driver’ undeclared here (not in a function); 405 | MODULE_DEVICE_TABLE(of, qmc_hdlc_driver); | ^~~~~~~~~~~~~~~ Fix the typo. Fixes: b40f00ecd463 ("net: wan: Add support for QMC HDLC") Reported-by: Michael Ellerman Closes: https://lore.kernel.org/linux-kernel/87ttl93f7i.fsf@mail.lhotse/ Signed-off-by: Herve Codina Signed-off-by: David S. Miller --- drivers/net/wan/fsl_qmc_hdlc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c index 960371df470a..f69b1f579a0c 100644 --- a/drivers/net/wan/fsl_qmc_hdlc.c +++ b/drivers/net/wan/fsl_qmc_hdlc.c @@ -780,7 +780,7 @@ static const struct of_device_id qmc_hdlc_id_table[] = { { .compatible = "fsl,qmc-hdlc" }, {} /* sentinel */ }; -MODULE_DEVICE_TABLE(of, qmc_hdlc_driver); +MODULE_DEVICE_TABLE(of, qmc_hdlc_id_table); static struct platform_driver qmc_hdlc_driver = { .driver = { -- cgit v1.2.3 From 6ebfad33161afacb3e1e59ed1c2feefef70f9f97 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Mar 2024 14:18:16 +0000 Subject: packet: annotate data-races around ignore_outgoing ignore_outgoing is read locklessly from dev_queue_xmit_nit() and packet_getsockopt() Add appropriate READ_ONCE()/WRITE_ONCE() annotations. syzbot reported: BUG: KCSAN: data-race in dev_queue_xmit_nit / packet_setsockopt write to 0xffff888107804542 of 1 bytes by task 22618 on cpu 0: packet_setsockopt+0xd83/0xfd0 net/packet/af_packet.c:4003 do_sock_setsockopt net/socket.c:2311 [inline] __sys_setsockopt+0x1d8/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2340 do_syscall_64+0xd3/0x1d0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 read to 0xffff888107804542 of 1 bytes by task 27 on cpu 1: dev_queue_xmit_nit+0x82/0x620 net/core/dev.c:2248 xmit_one net/core/dev.c:3527 [inline] dev_hard_start_xmit+0xcc/0x3f0 net/core/dev.c:3547 __dev_queue_xmit+0xf24/0x1dd0 net/core/dev.c:4335 dev_queue_xmit include/linux/netdevice.h:3091 [inline] batadv_send_skb_packet+0x264/0x300 net/batman-adv/send.c:108 batadv_send_broadcast_skb+0x24/0x30 net/batman-adv/send.c:127 batadv_iv_ogm_send_to_if net/batman-adv/bat_iv_ogm.c:392 [inline] batadv_iv_ogm_emit net/batman-adv/bat_iv_ogm.c:420 [inline] batadv_iv_send_outstanding_bat_ogm_packet+0x3f0/0x4b0 net/batman-adv/bat_iv_ogm.c:1700 process_one_work kernel/workqueue.c:3254 [inline] process_scheduled_works+0x465/0x990 kernel/workqueue.c:3335 worker_thread+0x526/0x730 kernel/workqueue.c:3416 kthread+0x1d1/0x210 kernel/kthread.c:388 ret_from_fork+0x4b/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:243 value changed: 0x00 -> 0x01 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 27 Comm: kworker/u8:1 Tainted: G W 6.8.0-syzkaller-08073-g480e035fc4c7 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024 Workqueue: bat_events batadv_iv_send_outstanding_bat_ogm_packet Fixes: fa788d986a3a ("packet: add sockopt to ignore outgoing packets") Reported-by: syzbot+c669c1136495a2e7c31f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/CANn89i+Z7MfbkBLOv=p7KZ7=K1rKHO4P1OL5LYDCtBiyqsa9oQ@mail.gmail.com/T/#t Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/packet/af_packet.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 0766a245816b..722787c32755 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2245,7 +2245,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { - if (ptype->ignore_outgoing) + if (READ_ONCE(ptype->ignore_outgoing)) continue; /* Never send packets back to the socket diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 61270826b9ac..7cfc7d301508 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4000,7 +4000,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, if (val < 0 || val > 1) return -EINVAL; - po->prot_hook.ignore_outgoing = !!val; + WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val); return 0; } case PACKET_TX_HAS_OFF: @@ -4134,7 +4134,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, 0); break; case PACKET_IGNORE_OUTGOING: - val = po->prot_hook.ignore_outgoing; + val = READ_ONCE(po->prot_hook.ignore_outgoing); break; case PACKET_ROLLOVER_STATS: if (!po->rollover) -- cgit v1.2.3 From c3198822c6cb9fb588e446540485669cc81c5d34 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 8 Mar 2024 17:26:00 +0200 Subject: net: esp: fix bad handling of pages from page_pool When the skb is reorganized during esp_output (!esp->inline), the pages coming from the original skb fragments are supposed to be released back to the system through put_page. But if the skb fragment pages are originating from a page_pool, calling put_page on them will trigger a page_pool leak which will eventually result in a crash. This leak can be easily observed when using CONFIG_DEBUG_VM and doing ipsec + gre (non offloaded) forwarding: BUG: Bad page state in process ksoftirqd/16 pfn:1451b6 page:00000000de2b8d32 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1451b6000 pfn:0x1451b6 flags: 0x200000000000000(node=0|zone=2) page_type: 0xffffffff() raw: 0200000000000000 dead000000000040 ffff88810d23c000 0000000000000000 raw: 00000001451b6000 0000000000000001 00000000ffffffff 0000000000000000 page dumped because: page_pool leak Modules linked in: ip_gre gre mlx5_ib mlx5_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_uverbs ib_core overlay zram zsmalloc fuse [last unloaded: mlx5_core] CPU: 16 PID: 96 Comm: ksoftirqd/16 Not tainted 6.8.0-rc4+ #22 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x36/0x50 bad_page+0x70/0xf0 free_unref_page_prepare+0x27a/0x460 free_unref_page+0x38/0x120 esp_ssg_unref.isra.0+0x15f/0x200 esp_output_tail+0x66d/0x780 esp_xmit+0x2c5/0x360 validate_xmit_xfrm+0x313/0x370 ? validate_xmit_skb+0x1d/0x330 validate_xmit_skb_list+0x4c/0x70 sch_direct_xmit+0x23e/0x350 __dev_queue_xmit+0x337/0xba0 ? nf_hook_slow+0x3f/0xd0 ip_finish_output2+0x25e/0x580 iptunnel_xmit+0x19b/0x240 ip_tunnel_xmit+0x5fb/0xb60 ipgre_xmit+0x14d/0x280 [ip_gre] dev_hard_start_xmit+0xc3/0x1c0 __dev_queue_xmit+0x208/0xba0 ? nf_hook_slow+0x3f/0xd0 ip_finish_output2+0x1ca/0x580 ip_sublist_rcv_finish+0x32/0x40 ip_sublist_rcv+0x1b2/0x1f0 ? ip_rcv_finish_core.constprop.0+0x460/0x460 ip_list_rcv+0x103/0x130 __netif_receive_skb_list_core+0x181/0x1e0 netif_receive_skb_list_internal+0x1b3/0x2c0 napi_gro_receive+0xc8/0x200 gro_cell_poll+0x52/0x90 __napi_poll+0x25/0x1a0 net_rx_action+0x28e/0x300 __do_softirq+0xc3/0x276 ? sort_range+0x20/0x20 run_ksoftirqd+0x1e/0x30 smpboot_thread_fn+0xa6/0x130 kthread+0xcd/0x100 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x31/0x50 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork_asm+0x11/0x20 The suggested fix is to introduce a new wrapper (skb_page_unref) that covers page refcounting for page_pool pages as well. Cc: stable@vger.kernel.org Fixes: 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling") Reported-and-tested-by: Anatoli N.Chechelnickiy Reported-by: Ian Kumlien Link: https://lore.kernel.org/netdev/CAA85sZvvHtrpTQRqdaOx6gd55zPAVsqMYk_Lwh4Md5knTq7AyA@mail.gmail.com Signed-off-by: Dragos Tatulea Reviewed-by: Mina Almasry Reviewed-by: Jakub Kicinski Acked-by: Ilias Apalodimas Signed-off-by: Steffen Klassert --- include/linux/skbuff.h | 10 ++++++++++ net/ipv4/esp4.c | 8 ++++---- net/ipv6/esp6.c | 8 ++++---- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3023bc2be6a1..b49a7d6591e8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3523,6 +3523,16 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, struct bpf_prog *prog); bool napi_pp_put_page(struct page *page, bool napi_safe); +static inline void +skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe) +{ +#ifdef CONFIG_PAGE_POOL + if (skb->pp_recycle && napi_pp_put_page(page, napi_safe)) + return; +#endif + put_page(page); +} + static inline void napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) { diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 4dd9e5040672..d33d12421814 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -95,7 +95,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) { struct crypto_aead *aead = x->data; int extralen = 0; @@ -114,7 +114,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - put_page(sg_page(sg)); + skb_page_unref(skb, sg_page(sg), false); } #ifdef CONFIG_INET_ESPINTCP @@ -260,7 +260,7 @@ static void esp_output_done(void *data, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); kfree(tmp); if (xo && (xo->flags & XFRM_DEV_RESUME)) { @@ -639,7 +639,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } if (sg != dsg) - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 6e6efe026cdc..7371886d4f9f 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -112,7 +112,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) { struct crypto_aead *aead = x->data; int extralen = 0; @@ -131,7 +131,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - put_page(sg_page(sg)); + skb_page_unref(skb, sg_page(sg), false); } #ifdef CONFIG_INET6_ESPINTCP @@ -294,7 +294,7 @@ static void esp_output_done(void *data, int err)