summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2025-02-20 13:18:54 +0000
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2025-03-07 18:26:57 +0100
commitc31a732fac46b00b95b78fcc9c37cb48dd6f2e0c (patch)
tree326e899b574cd0132357e0acb5b358f74a878845
parented5efd1aef64fe33349a622fd149f07e4586e980 (diff)
downloadlinux-c31a732fac46b00b95b78fcc9c37cb48dd6f2e0c.tar.gz
linux-c31a732fac46b00b95b78fcc9c37cb48dd6f2e0c.tar.bz2
linux-c31a732fac46b00b95b78fcc9c37cb48dd6f2e0c.zip
net: better track kernel sockets lifetime
[ Upstream commit 5c70eb5c593d64d93b178905da215a9fd288a4b5 ] While kernel sockets are dismantled during pernet_operations->exit(), their freeing can be delayed by any tx packets still held in qdisc or device queues, due to skb_set_owner_w() prior calls. This then trigger the following warning from ref_tracker_dir_exit() [1] To fix this, make sure that kernel sockets own a reference on net->passive. Add sk_net_refcnt_upgrade() helper, used whenever a kernel socket is converted to a refcounted one. [1] [ 136.263918][ T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at [ 136.263918][ T35] sk_alloc+0x2b3/0x370 [ 136.263918][ T35] inet6_create+0x6ce/0x10f0 [ 136.263918][ T35] __sock_create+0x4c0/0xa30 [ 136.263918][ T35] inet_ctl_sock_create+0xc2/0x250 [ 136.263918][ T35] igmp6_net_init+0x39/0x390 [ 136.263918][ T35] ops_init+0x31e/0x590 [ 136.263918][ T35] setup_net+0x287/0x9e0 [ 136.263918][ T35] copy_net_ns+0x33f/0x570 [ 136.263918][ T35] create_new_namespaces+0x425/0x7b0 [ 136.263918][ T35] unshare_nsproxy_namespaces+0x124/0x180 [ 136.263918][ T35] ksys_unshare+0x57d/0xa70 [ 136.263918][ T35] __x64_sys_unshare+0x38/0x40 [ 136.263918][ T35] do_syscall_64+0xf3/0x230 [ 136.263918][ T35] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 136.263918][ T35] [ 136.343488][ T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at [ 136.343488][ T35] sk_alloc+0x2b3/0x370 [ 136.343488][ T35] inet6_create+0x6ce/0x10f0 [ 136.343488][ T35] __sock_create+0x4c0/0xa30 [ 136.343488][ T35] inet_ctl_sock_create+0xc2/0x250 [ 136.343488][ T35] ndisc_net_init+0xa7/0x2b0 [ 136.343488][ T35] ops_init+0x31e/0x590 [ 136.343488][ T35] setup_net+0x287/0x9e0 [ 136.343488][ T35] copy_net_ns+0x33f/0x570 [ 136.343488][ T35] create_new_namespaces+0x425/0x7b0 [ 136.343488][ T35] unshare_nsproxy_namespaces+0x124/0x180 [ 136.343488][ T35] ksys_unshare+0x57d/0xa70 [ 136.343488][ T35] __x64_sys_unshare+0x38/0x40 [ 136.343488][ T35] do_syscall_64+0xf3/0x230 [ 136.343488][ T35] entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 0cafd77dcd03 ("net: add a refcount tracker for kernel sockets") Reported-by: syzbot+30a19e01a97420719891@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67b72aeb.050a0220.14d86d.0283.GAE@google.com/T/#u Signed-off-by: Eric Dumazet <edumazet@google.com> Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com> Link: https://patch.msgid.link/20250220131854.4048077-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Sasha Levin <sashal@kernel.org>
-rw-r--r--include/net/sock.h1
-rw-r--r--net/core/sock.c27
-rw-r--r--net/mptcp/subflow.c5
-rw-r--r--net/netlink/af_netlink.c10
-rw-r--r--net/rds/tcp.c8
-rw-r--r--net/smc/af_smc.c5
-rw-r--r--net/sunrpc/svcsock.c5
-rw-r--r--net/sunrpc/xprtsock.c8
8 files changed, 30 insertions, 39 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index 691ca7695d1d..d3efb581c2ff 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1750,6 +1750,7 @@ static inline bool sock_allow_reclassification(const struct sock *csk)
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot, int kern);
void sk_free(struct sock *sk);
+void sk_net_refcnt_upgrade(struct sock *sk);
void sk_destruct(struct sock *sk);
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
void sk_free_unlock_clone(struct sock *sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index be84885f9290..9d5dd99cc581 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2233,6 +2233,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
get_net_track(net, &sk->ns_tracker, priority);
sock_inuse_add(net, 1);
} else {
+ net_passive_inc(net);
__netns_tracker_alloc(net, &sk->ns_tracker,
false, priority);
}
@@ -2257,6 +2258,7 @@ EXPORT_SYMBOL(sk_alloc);
static void __sk_destruct(struct rcu_head *head)
{
struct sock *sk = container_of(head, struct sock, sk_rcu);
+ struct net *net = sock_net(sk);
struct sk_filter *filter;
if (sk->sk_destruct)
@@ -2288,14 +2290,28 @@ static void __sk_destruct(struct rcu_head *head)
put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
- if (likely(sk->sk_net_refcnt))
- put_net_track(sock_net(sk), &sk->ns_tracker);
- else
- __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
-
+ if (likely(sk->sk_net_refcnt)) {
+ put_net_track(net, &sk->ns_tracker);
+ } else {
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ }
sk_prot_free(sk->sk_prot_creator, sk);
}
+void sk_net_refcnt_upgrade(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ WARN_ON_ONCE(sk->sk_net_refcnt);
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ sk->sk_net_refcnt = 1;
+ get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
+}
+EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
+
void sk_destruct(struct sock *sk)
{
bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
@@ -2392,6 +2408,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
* is not properly dismantling its kernel sockets at netns
* destroy time.
*/
+ net_passive_inc(sock_net(newsk));
__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
false, priority);
}
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index fd021cf8286e..dfcbef9c4624 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1772,10 +1772,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
* needs it.
* Update ns_tracker to current stack trace and refcounted tracker.
*/
- __netns_tracker_free(net, &sf->sk->ns_tracker, false);
- sf->sk->sk_net_refcnt = 1;
- get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
- sock_inuse_add(net, 1);
+ sk_net_refcnt_upgrade(sf->sk);
err = tcp_set_ulp(sf->sk, "mptcp");
if (err)
goto err_free;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f4e7b5e4bb59..e88a1ac160bc 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -795,16 +795,6 @@ static int netlink_release(struct socket *sock)
sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
- /* Because struct net might disappear soon, do not keep a pointer. */
- if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) {
- __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
- /* Because of deferred_put_nlk_sk and use of work queue,
- * it is possible netns will be freed before this socket.
- */
- sock_net_set(sk, &init_net);
- __netns_tracker_alloc(&init_net, &sk->ns_tracker,
- false, GFP_KERNEL);
- }
call_rcu(&nlk->rcu, deferred_put_nlk_sk);
return 0;
}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 0581c53e6517..3cc2f303bf78 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -504,12 +504,8 @@ bool rds_tcp_tune(struct socket *sock)
release_sock(sk);
return false;
}
- /* Update ns_tracker to current stack trace and refcounted tracker */
- __netns_tracker_free(net, &sk->ns_tracker, false);
-
- sk->sk_net_refcnt = 1;
- netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL);
- sock_inuse_add(net, 1);
+ sk_net_refcnt_upgrade(sk);
+ put_net(net);
}
rtn = net_generic(net, rds_tcp_netid);
if (rtn->sndbuf_size > 0) {
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index ebc41a7b13db..ba834cefb177 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -3334,10 +3334,7 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family)
* which need net ref.
*/
sk = smc->clcsock->sk;
- __netns_tracker_free(net, &sk->ns_tracker, false);
- sk->sk_net_refcnt = 1;
- get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
- sock_inuse_add(net, 1);
+ sk_net_refcnt_upgrade(sk);
return 0;
}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index cb3bd12f5818..72e5a01df3d3 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1541,10 +1541,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
newlen = error;
if (protocol == IPPROTO_TCP) {
- __netns_tracker_free(net, &sock->sk->ns_tracker, false);
- sock->sk->sk_net_refcnt = 1;
- get_net_track(net, &sock->sk->ns_tracker, GFP_KERNEL);
- sock_inuse_add(net, 1);
+ sk_net_refcnt_upgrade(sock->sk);
if ((error = kernel_listen(sock, 64)) < 0)
goto bummer;
}
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 6b80b2aaf763..83cc095846d3 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1941,12 +1941,8 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt,
goto out;
}
- if (protocol == IPPROTO_TCP) {
- __netns_tracker_free(xprt->xprt_net, &sock->sk->ns_tracker, false);
- sock->sk->sk_net_refcnt = 1;
- get_net_track(xprt->xprt_net, &sock->sk->ns_tracker, GFP_KERNEL);
- sock_inuse_add(xprt->xprt_net, 1);
- }
+ if (protocol == IPPROTO_TCP)
+ sk_net_refcnt_upgrade(sock->sk);
filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
if (IS_ERR(filp))