From 52bd2d62ce6758d811edcbd2256eb9ea7f6a56cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:50 -0800 Subject: net: better skb->sender_cpu and skb->napi_id cohabitation skb->sender_cpu and skb->napi_id share a common storage, and we had various bugs about this. We had to call skb_sender_cpu_clear() in some places to not leave a prior skb->napi_id and fool netdev_pick_tx() As suggested by Alexei, we could split the space so that these errors can not happen. 0 value being reserved as the common (not initialized) value, let's reserve [1 .. NR_CPUS] range for valid sender_cpu, and [NR_CPUS+1 .. ~0U] for valid napi_id. This will allow proper busy polling support over tunnels. Signed-off-by: Eric Dumazet Suggested-by: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/dev.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ae00b894e675..2582c24a75c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -182,7 +182,7 @@ EXPORT_SYMBOL(dev_base_lock); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); -static unsigned int napi_gen_id; +static unsigned int napi_gen_id = NR_CPUS; static DEFINE_HASHTABLE(napi_hash, 8); static seqcount_t devnet_rename_seq; @@ -3021,7 +3021,9 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, int queue_index = 0; #ifdef CONFIG_XPS - if (skb->sender_cpu == 0) + u32 sender_cpu = skb->sender_cpu - 1; + + if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif @@ -4676,25 +4678,22 @@ EXPORT_SYMBOL_GPL(napi_by_id); void napi_hash_add(struct napi_struct *napi) { - if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + return; - spin_lock(&napi_hash_lock); + spin_lock(&napi_hash_lock); - /* 0 is not a valid id, we also skip an 
id that is taken - * we expect both events to be extremely rare - */ - napi->napi_id = 0; - while (!napi->napi_id) { - napi->napi_id = ++napi_gen_id; - if (napi_by_id(napi->napi_id)) - napi->napi_id = 0; - } + /* 0..NR_CPUS range is reserved for sender_cpu use */ + do { + if (unlikely(++napi_gen_id < NR_CPUS + 1)) + napi_gen_id = NR_CPUS + 1; + } while (napi_by_id(napi_gen_id)); + napi->napi_id = napi_gen_id; - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); - spin_unlock(&napi_hash_lock); - } + spin_unlock(&napi_hash_lock); } EXPORT_SYMBOL_GPL(napi_hash_add); -- cgit v1.2.3 From 02d62e86fe892c59a1259d089d4d16ac76977a37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:52 -0800 Subject: net: un-inline sk_busy_loop() There is really little gain from inlining this big function. We'll soon make it even bigger in following patches. This means we no longer need to export napi_by_id() Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 2582c24a75c6..74a816b299df 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include #include @@ -4663,7 +4664,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) +static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; @@ -4674,7 +4675,52 @@ struct napi_struct *napi_by_id(unsigned int napi_id) return NULL; } -EXPORT_SYMBOL_GPL(napi_by_id); + +#if defined(CONFIG_NET_RX_BUSY_POLL) +bool sk_busy_loop(struct sock *sk, int nonblock) +{ + unsigned long end_time = !nonblock ? 
sk_busy_loop_end_time(sk) : 0; + const struct net_device_ops *ops; + struct napi_struct *napi; + int rc = false; + + /* + * rcu read lock for napi hash + * bh so we don't race with net_rx_action + */ + rcu_read_lock_bh(); + + napi = napi_by_id(sk->sk_napi_id); + if (!napi) + goto out; + + ops = napi->dev->netdev_ops; + if (!ops->ndo_busy_poll) + goto out; + + do { + rc = ops->ndo_busy_poll(napi); + + if (rc == LL_FLUSH_FAILED) + break; /* permanent failure */ + + if (rc > 0) + /* local bh are disabled so it is ok to use _BH */ + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); + cpu_relax(); + + } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && + !need_resched() && !busy_loop_timeout(end_time)); + + rc = !skb_queue_empty(&sk->sk_receive_queue); +out: + rcu_read_unlock_bh(); + return rc; +} +EXPORT_SYMBOL(sk_busy_loop); + +#endif /* CONFIG_NET_RX_BUSY_POLL */ void napi_hash_add(struct napi_struct *napi) { -- cgit v1.2.3 From 2a028ecb76497d05e5cd4e3e8b09d965cac2e3f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:53 -0800 Subject: net: allow BH servicing in sk_busy_loop() Instead of blocking BH in whole sk_busy_loop(), block them only around ->ndo_busy_poll() calls. This has many benefits. 1) allow tunneled traffic to use busy poll as well as native traffic. Tunnel handlers usually call netif_rx() and depend on net_rx_action() being run (from softirq handler) 2) allow RFS/RPS being used (sending IPI to other cpus if needed) 3) use the 'let's burn cpu cycles' budget to do useful work (like TX completions, timers, RCU callbacks...) 4) reduce BH latencies, making busy poll a better citizen. Tested: Tested with SIT tunnel lpaa5:~# echo 0 >/proc/sys/net/core/busy_read lpaa5:~# ./netperf -H 2002:af6:786::1 -t TCP_RR MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:786::1 () port 0 AF_INET6 : first burst 0 Local /Remote Socket Size Request Resp. Elapsed Trans. 
Send Recv Size Size Time Rate bytes Bytes bytes bytes secs. per sec 16384 87380 1 1 10.00 37373.93 16384 87380 Now enable busy poll on both hosts lpaa5:~# echo 70 >/proc/sys/net/core/busy_read lpaa6:~# echo 70 >/proc/sys/net/core/busy_read lpaa5:~# ./netperf -H 2002:af6:786::1 -t TCP_RR MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:786::1 () port 0 AF_INET6 : first burst 0 Local /Remote Socket Size Request Resp. Elapsed Trans. Send Recv Size Size Time Rate bytes Bytes bytes bytes secs. per sec 16384 87380 1 1 10.00 58314.77 16384 87380 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 74a816b299df..2002eec2617d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4684,11 +4684,7 @@ bool sk_busy_loop(struct sock *sk, int nonblock) struct napi_struct *napi; int rc = false; - /* - * rcu read lock for napi hash - * bh so we don't race with net_rx_action - */ - rcu_read_lock_bh(); + rcu_read_lock(); napi = napi_by_id(sk->sk_napi_id); if (!napi) @@ -4699,23 +4695,23 @@ bool sk_busy_loop(struct sock *sk, int nonblock) goto out; do { + local_bh_disable(); rc = ops->ndo_busy_poll(napi); + if (rc > 0) + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); + local_bh_enable(); if (rc == LL_FLUSH_FAILED) break; /* permanent failure */ - if (rc > 0) - /* local bh are disabled so it is ok to use _BH */ - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); cpu_relax(); - } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && !need_resched() && !busy_loop_timeout(end_time)); rc = !skb_queue_empty(&sk->sk_receive_queue); out: - rcu_read_unlock_bh(); + rcu_read_unlock(); return rc; } EXPORT_SYMBOL(sk_busy_loop); -- cgit v1.2.3 From ce6aea93f7510437dde625b77a7a2f4d20b72660 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 
06:30:54 -0800 Subject: net: network drivers no longer need to implement ndo_busy_poll() Instead of having to implement complex ndo_busy_poll() method, drivers can simply rely on NAPI poll logic. Busy polling gains are mainly coming from polling itself, not on exact details on how we poll the device. ndo_busy_poll() if implemented can avoid touching napi state, but it adds extra synchronization between normal napi->poll() and busy poll handler, slowing down the common path (non busy polling) with extra atomic operations. In practice few drivers ever got busy poll because of the complexity. We could go one step further, and make busy polling available for all NAPI drivers, but this would require that all netif_napi_del() calls are done in process context so that we can call synchronize_rcu(). Full audit would be required. Before this is done, a driver still needs to call : - skb_mark_napi_id() for each skb provided to the stack. - napi_hash_add() and napi_hash_del() to allocate a napi_id per napi struct. - Make sure RCU grace period is respected after napi_hash_del() before memory containing napi structure is freed. Followup patch implements busy poll for mlx5 driver as an example. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 2002eec2617d..93009610aee8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4677,10 +4677,11 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) } #if defined(CONFIG_NET_RX_BUSY_POLL) +#define BUSY_POLL_BUDGET 8 bool sk_busy_loop(struct sock *sk, int nonblock) { unsigned long end_time = !nonblock ? 
sk_busy_loop_end_time(sk) : 0; - const struct net_device_ops *ops; + int (*busy_poll)(struct napi_struct *dev); struct napi_struct *napi; int rc = false; @@ -4690,13 +4691,27 @@ bool sk_busy_loop(struct sock *sk, int nonblock) if (!napi) goto out; - ops = napi->dev->netdev_ops; - if (!ops->ndo_busy_poll) - goto out; + /* Note: ndo_busy_poll method is optional in linux-4.5 */ + busy_poll = napi->dev->netdev_ops->ndo_busy_poll; do { + rc = 0; local_bh_disable(); - rc = ops->ndo_busy_poll(napi); + if (busy_poll) { + rc = busy_poll(napi); + } else if (napi_schedule_prep(napi)) { + void *have = netpoll_poll_lock(napi); + + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + rc = napi->poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi); + if (rc == BUSY_POLL_BUDGET) { + napi_complete_done(napi, rc); + napi_schedule(napi); + } + } + netpoll_poll_unlock(have); + } if (rc > 0) NET_ADD_STATS_BH(sock_net(sk), LINUX_MIB_BUSYPOLLRXPACKETS, rc); -- cgit v1.2.3 From 93f93a4404159ecf7e9148f5ad0718ec702ac4cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:59 -0800 Subject: net: move skb_mark_napi_id() into core networking stack We would like to automatically provide busy polling support to all NAPI drivers, without them having to implement anything. skb_mark_napi_id() can be called from napi_gro_receive() and napi_get_frags(). Few drivers are still calling skb_mark_napi_id() because they use netif_receive_skb(). They should eventually call napi_gro_receive() instead. I will leave this to drivers maintainers. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 93009610aee8..83b48747928c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4356,6 +4356,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); @@ -4390,6 +4391,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); napi->skb = skb; + skb_mark_napi_id(skb, napi); } return skb; } -- cgit v1.2.3 From d64b5e85bfe2fe4c790abcbd16d9ae32391ddd7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:00 -0800 Subject: net: add netif_tx_napi_add() netif_tx_napi_add() is a variant of netif_napi_add() It should be used by drivers that use a napi structure to exclusively poll TX. We do not want to add this kind of napi in napi_hash[] in following patches, adding generic busy polling to all NAPI drivers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 83b48747928c..ff58a8bc5e3c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4737,7 +4737,8 @@ EXPORT_SYMBOL(sk_busy_loop); void napi_hash_add(struct napi_struct *napi) { - if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || + test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) return; spin_lock(&napi_hash_lock); -- cgit v1.2.3 From 6180d9de61a5c461f9e3efef5417a844701dbbb2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:01 -0800 Subject: net: move napi_hash[] into read mostly section We do not often add/delete a napi context. 
Moving napi_hash[] into read mostly section avoids potential false sharing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ff58a8bc5e3c..02dfbd91a8e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -184,7 +184,7 @@ EXPORT_SYMBOL(dev_base_lock); static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; -static DEFINE_HASHTABLE(napi_hash, 8); +static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static seqcount_t devnet_rename_seq; -- cgit v1.2.3 From 34cbe27e811c591c854a39c0dee1b461bb796953 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:02 -0800 Subject: net: napi_hash_del() returns a boolean status napi_hash_del() will soon be used from both drivers (if they want) or core networking stack. Callers are responsible for ensuring an RCU grace period is respected before freeing the napi structure: napi_hash_del() can signal if this RCU grace period is needed or not. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 02dfbd91a8e4..59dddac1c2e7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4760,14 +4760,18 @@ EXPORT_SYMBOL_GPL(napi_hash_add); /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ -void napi_hash_del(struct napi_struct *napi) +bool napi_hash_del(struct napi_struct *napi) { + bool rcu_sync_needed = false; + spin_lock(&napi_hash_lock); - if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { + rcu_sync_needed = true; hlist_del_rcu(&napi->napi_hash_node); - + } spin_unlock(&napi_hash_lock); + return rcu_sync_needed; } EXPORT_SYMBOL_GPL(napi_hash_del); -- cgit v1.2.3 From 93d05d4a320cb16712bb3d57a9658f395d8cecb9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:03 -0800 Subject: net: provide generic busy polling to all NAPI drivers NAPI drivers no longer need to observe a particular protocol to benefit from busy polling (CONFIG_NET_RX_BUSY_POLL=y) napi_hash_add() and napi_hash_del() are automatically called from core networking stack, respectively from netif_napi_add() and netif_napi_del() This patch depends on free_netdev() and netif_napi_del() being called from process context, which seems to be the norm. Drivers might still prefer to call napi_hash_del() on their own, since they might combine all the rcu grace periods into a single one, knowing their NAPI structures lifetime, while core networking stack has no idea of a possible combining. Once this patch proves to not bring serious regressions, we will cleanup drivers to either remove napi_hash_del() or provide appropriate rcu grace periods combining. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 59dddac1c2e7..41cef3e3f558 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4807,6 +4807,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); + napi_hash_add(napi); } EXPORT_SYMBOL(netif_napi_add); @@ -4826,8 +4827,12 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); +/* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { + might_sleep(); + if (napi_hash_del(napi)) + synchronize_net(); list_del_init(&napi->dev_list); napi_free_frags(napi); @@ -7227,11 +7232,13 @@ EXPORT_SYMBOL(alloc_netdev_mqs); * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. * If this is the last reference then it will be freed. + * Must be called in process context. */ void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; + might_sleep(); netif_free_tx_queues(dev); #ifdef CONFIG_SYSFS kvfree(dev->_rx); -- cgit v1.2.3 From e2f9dc3bd213792ac006e83f50a5453f23b8c354 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Nov 2015 12:11:23 -0800 Subject: net: avoid NULL deref in napi_get_frags() napi_alloc_skb() can return NULL. We should not crash should this happen. Fixes: 93f93a440415 ("net: move skb_mark_napi_id() into core networking stack") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 41cef3e3f558..5df6cbce727c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4390,8 +4390,10 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); - napi->skb = skb; - skb_mark_napi_id(skb, napi); + if (skb) { + napi->skb = skb; + skb_mark_napi_id(skb, napi); + } } return skb; } -- cgit v1.2.3 From b811580d91e9c0945b0a923dcec3e10cce04ac30 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Nov 2015 12:24:22 -0800 Subject: net: IPv6 fib lookup tracepoint Add tracepoint to show fib6 table lookups and result. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/net-traces.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core') diff --git a/net/core/net-traces.c b/net/core/net-traces.c index adef015b2f41..92da5e4ceb4f 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -32,6 +32,10 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); +#endif EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); -- cgit v1.2.3 From 1ce0bf50ae2233c7115a18c0c623662d177b434c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 26 Nov 2015 13:55:39 +0800 Subject: net: Generalise wq_has_sleeper helper The memory barrier in the helper wq_has_sleeper is needed by just about every user of waitqueue_active. This patch generalises it by making it take a wait_queue_head_t directly. The existing helper is renamed to skwq_has_sleeper. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/sock.c | 8 ++++---- net/core/stream.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 1e4dd54bfb5a..2769bd3a4d7c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2283,7 +2283,7 @@ static void sock_def_wakeup(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } @@ -2294,7 +2294,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, POLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); @@ -2306,7 +2306,7 @@ static void sock_def_readable(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); @@ -2324,7 +2324,7 @@ static void sock_def_write_space(struct sock *sk) */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); diff --git a/net/core/stream.c b/net/core/stream.c index d70f77a0c889..8ff9d63b4265 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -35,7 +35,7 @@ void sk_stream_write_space(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) -- cgit v1.2.3 From b03804e7c3ad41c265c0ca21ddb306b252b4f99f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Dec 2015 12:12:03 +0100 Subject: net: 
Check CHANGEUPPER notifier return value switchdev drivers reflect the newly requested topology to hardware when CHANGEUPPER is received, after software links were already formed. However, the operation can fail and user will not be notified, as the return value of the notifier is not checked. Add this check and rollback software links if necessary. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5df6cbce727c..939cd1b1da15 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5490,8 +5490,12 @@ static int __netdev_upper_dev_link(struct net_device *dev, goto rollback_lower_mesh; } - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, - &changeupper_info.info); + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + goto rollback_lower_mesh; + return 0; rollback_lower_mesh: -- cgit v1.2.3 From 6dffb0447c25476f499d205dfceb1972e8dae919 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:10 +0100 Subject: net: propagate upper priv via netdev_master_upper_dev_link Eliminate netdev_master_upper_dev_link_private and pass priv directly as a parameter of netdev_master_upper_dev_link. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/dev.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 939cd1b1da15..27d052bb78bc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5421,7 +5421,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *private) + void *upper_priv) { struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; @@ -5452,7 +5452,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; @@ -5557,6 +5557,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device + * @upper_priv: upper device private * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices @@ -5565,20 +5566,13 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * counts are adjusted and the function returns zero. 
*/ int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + void *upper_priv) { - return __netdev_upper_dev_link(dev, upper_dev, true, NULL); + return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv); } EXPORT_SYMBOL(netdev_master_upper_dev_link); -int netdev_master_upper_dev_link_private(struct net_device *dev, - struct net_device *upper_dev, - void *private) -{ - return __netdev_upper_dev_link(dev, upper_dev, true, private); -} -EXPORT_SYMBOL(netdev_master_upper_dev_link_private); - /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device -- cgit v1.2.3 From 29bf24afb29042f568fa67b1b0eee46796725ed2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:11 +0100 Subject: net: add possibility to pass information about upper device via notifier Sometimes the drivers and other code would find it handy to know some internal information about upper device being changed. So allow upper-code to pass information down to notifier listeners during linking. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/dev.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 27d052bb78bc..8ed886663c6d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5421,7 +5421,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *upper_priv) + void *upper_priv, void *upper_info) { struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; @@ -5445,6 +5445,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, changeupper_info.upper_dev = upper_dev; changeupper_info.master = master; changeupper_info.linking = true; + changeupper_info.upper_info = upper_info; ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, &changeupper_info.info); @@ -5549,7 +5550,7 @@ rollback_mesh: int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, false, NULL); + return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -5558,6 +5559,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * @dev: device * @upper_dev: new upper device * @upper_priv: upper device private + * @upper_info: upper info to be passed down via notifier * * Adds a link to device which is upper to this one. 
In this case, only * one master upper device can be linked, although other non-master devices @@ -5567,9 +5569,10 @@ EXPORT_SYMBOL(netdev_upper_dev_link); */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv) + void *upper_priv, void *upper_info) { - return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv); + return __netdev_upper_dev_link(dev, upper_dev, true, + upper_priv, upper_info); } EXPORT_SYMBOL(netdev_master_upper_dev_link); -- cgit v1.2.3 From 04d482660a07039fc4e9a42bb3517db236d98f96 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:15 +0100 Subject: net: introduce change lower state notifier When a lower device like bonding slave, team/bridge port, etc changes its state, it is useful for others to notice this change. Currently this is implemented specifically for bonding as NETDEV_BONDING_INFO notifier. This patch aims to replace this specific usage and make this more generic to be used for all upper-lower devices. Introduce NETDEV_CHANGELOWERSTATE netdev notifier type and netdev_lower_state_changed() helper. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 8ed886663c6d..d1706e88fbeb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5756,6 +5756,26 @@ int dev_get_nest_level(struct net_device *dev, } EXPORT_SYMBOL(dev_get_nest_level); +/** + * netdev_lower_state_changed - Dispatch event about lower device state change + * @lower_dev: device + * @lower_state_info: state to dispatch + * + * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. + * The caller must hold the RTNL lock. 
+ */ +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info) +{ + struct netdev_notifier_changelowerstate_info changelowerstate_info; + + ASSERT_RTNL(); + changelowerstate_info.lower_state_info = lower_state_info; + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, + &changelowerstate_info.info); +} +EXPORT_SYMBOL(netdev_lower_state_changed); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; -- cgit v1.2.3 From b618aaa91b5870e7bd139987ac4b7bf0851142d0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Dec 2015 15:01:31 +0100 Subject: net: constify netif_is_* helpers net_device param As suggested by Eric, these helpers should have const dev param. Suggested-by: Eric Dumazet Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d1706e88fbeb..e5c395473eba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5734,7 +5734,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(struct net_device *dev)) + bool (*type_check)(const struct net_device *dev)) { struct net_device *lower = NULL; struct list_head *iter; -- cgit v1.2.3 From ea3793ee29d3621faf857fa8ef5425e9ff9a756d Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Sun, 6 Dec 2015 21:11:34 +0000 Subject: core: enable more fine-grained datagram reception control The __skb_recv_datagram routine in core/datagram.c provides a general skb reception facility supposed to be utilized by protocol modules providing datagram sockets. It encompasses both the actual recvmsg code and a surrounding 'sleep until data is available' loop. 
This is inconvenient if a protocol module has to use additional locking in order to maintain some per-socket state the generic datagram socket code is unaware of (as the af_unix code does). The patch below moves the recvmsg proper code into a new __skb_try_recv_datagram routine which doesn't sleep and renames wait_for_more_packets to __skb_wait_for_more_packets, both routines being exported interfaces. The original __skb_recv_datagram routine is reimplemented on top of these two functions such that its user-visible behaviour remains unchanged. Signed-off-by: Rainer Weikusat Signed-off-by: David S. Miller --- net/core/datagram.c | 77 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 29 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index d62af69ad844..7daff66d3d0b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -83,8 +83,8 @@ static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int syn /* * Wait for the last received packet to be different from skb */ -static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, - const struct sk_buff *skb) +int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, + const struct sk_buff *skb) { int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); @@ -130,6 +130,7 @@ out_noerr: error = 1; goto out; } +EXPORT_SYMBOL(__skb_wait_for_more_packets); static struct sk_buff *skb_set_peeked(struct sk_buff *skb) { @@ -161,13 +162,15 @@ done: } /** - * __skb_recv_datagram - Receive a datagram skbuff + * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket * @flags: MSG_ flags * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. 
Returns an offset * within an skb where data actually starts * @err: error code returned + * @last: set to last peeked message to inform the wait function + * what to look for when peeking * * Get a datagram skbuff, understands the peeking, nonblocking wakeups * and possible races. This replaces identical code in packet, raw and @@ -175,9 +178,11 @@ done: * the long standing peek and read race for datagram sockets. If you * alter this routine remember it must be re-entrant. * - * This function will lock the socket if a skb is returned, so the caller - * needs to unlock the socket in that case (usually by calling - * skb_free_datagram) + * This function will lock the socket if a skb is returned, so + * the caller needs to unlock the socket in that case (usually by + * calling skb_free_datagram). Returns NULL with *err set to + * -EAGAIN if no data was available or to some other value if an + * error was detected. * * * It does not lock socket since today. This function is * * free of race conditions. This measure should/can improve @@ -191,13 +196,13 @@ done: * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ -struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, - int *peeked, int *off, int *err) +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, + int *peeked, int *off, int *err, + struct sk_buff **last) { struct sk_buff_head *queue = &sk->sk_receive_queue; - struct sk_buff *skb, *last; + struct sk_buff *skb; unsigned long cpu_flags; - long timeo; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() */ @@ -206,8 +211,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, if (error) goto no_packet; - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. 
@@ -217,10 +220,10 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, */ int _off = *off; - last = (struct sk_buff *)queue; + *last = (struct sk_buff *)queue; spin_lock_irqsave(&queue->lock, cpu_flags); skb_queue_walk(queue, skb) { - last = skb; + *last = skb; *peeked = skb->peeked; if (flags & MSG_PEEK) { if (_off >= skb->len && (skb->len || _off || @@ -231,8 +234,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, skb = skb_set_peeked(skb); error = PTR_ERR(skb); - if (IS_ERR(skb)) - goto unlock_err; + if (IS_ERR(skb)) { + spin_unlock_irqrestore(&queue->lock, + cpu_flags); + goto no_packet; + } atomic_inc(&skb->users); } else @@ -242,25 +248,38 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, *off = _off; return skb; } + spin_unlock_irqrestore(&queue->lock, cpu_flags); + } while (sk_can_busy_loop(sk) && + sk_busy_loop(sk, flags & MSG_DONTWAIT)); - if (sk_can_busy_loop(sk) && - sk_busy_loop(sk, flags & MSG_DONTWAIT)) - continue; + error = -EAGAIN; - /* User doesn't want to wait */ - error = -EAGAIN; - if (!timeo) - goto no_packet; +no_packet: + *err = error; + return NULL; +} +EXPORT_SYMBOL(__skb_try_recv_datagram); - } while (!wait_for_more_packets(sk, err, &timeo, last)); +struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, + int *peeked, int *off, int *err) +{ + struct sk_buff *skb, *last; + long timeo; - return NULL; + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + do { + skb = __skb_try_recv_datagram(sk, flags, peeked, off, err, + &last); + if (skb) + return skb; + + if (*err != EAGAIN) + break; + } while (timeo && + !__skb_wait_for_more_packets(sk, err, &timeo, last)); -unlock_err: - spin_unlock_irqrestore(&queue->lock, cpu_flags); -no_packet: - *err = error; return NULL; } EXPORT_SYMBOL(__skb_recv_datagram); -- cgit v1.2.3 From 760a4322470e3990b14e09bfe80c9c75c77f33dd Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Tue, 8 Dec 2015 14:47:56 +0000 
Subject: net: Fix inverted test in __skb_recv_datagram As the kernel generally uses negated error numbers, *err needs to be compared with -EAGAIN (d'oh). Signed-off-by: Rainer Weikusat Fixes: ea3793ee29d3 ("core: enable more fine-grained datagram reception control") Signed-off-by: David S. Miller --- net/core/datagram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 7daff66d3d0b..fa9dc6450b08 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -275,7 +275,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, if (skb) return skb; - if (*err != EAGAIN) + if (*err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, err, &timeo, last)); -- cgit v1.2.3 From 297dbde19cf6a0ccb6fd4396c6220a5912ed61e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:51 -0500 Subject: netprio_cgroup: limit the maximum css->id to USHRT_MAX netprio builds per-netdev contiguous priomap array which is indexed by css->id. The array is allocated using kzalloc() effectively limiting the maximum ID supported to some thousand range. This patch caps the maximum supported css->id to USHRT_MAX which should be way above what is actually useable. This allows reducing sock->sk_cgrp_prioidx to u16 from u32. The freed up part will be used to overload the cgroup related fields. sock->sk_cgrp_prioidx's position is swapped with sk_mark so that the two cgroup related fields are adjacent. Signed-off-by: Tejun Heo Acked-by: Daniel Wagner Cc: Daniel Borkmann CC: Neil Horman Signed-off-by: David S. 
Miller --- net/core/netprio_cgroup.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/core') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index cbd0a199bf52..2b9159b7a28a 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -27,6 +27,12 @@ #include +/* + * netprio allocates per-net_device priomap array which is indexed by + * css->id. Limiting css ID to 16bits doesn't lose anything. + */ +#define NETPRIO_ID_MAX USHRT_MAX + #define PRIOMAP_MIN_SZ 128 /* @@ -144,6 +150,9 @@ static int cgrp_css_online(struct cgroup_subsys_state *css) struct net_device *dev; int ret = 0; + if (css->id > NETPRIO_ID_MAX) + return -ENOSPC; + if (!parent_css) return 0; -- cgit v1.2.3 From 2a56a1fec290bf0bc4676bbf4efdb3744953a3e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:52 -0500 Subject: net: wrap sock->sk_cgrp_prioidx and ->sk_classid inside a struct Introduce sock->sk_cgrp_data which is a struct sock_cgroup_data. ->sk_cgrp_prioidx and ->sk_classid are moved into it. The struct and its accessors are defined in cgroup-defs.h. This is to prepare for overloading the fields with a cgroup pointer. This patch mostly performs equivalent conversions but the following are noteworthy. * Equality test before updating classid is removed from sock_update_classid(). This shouldn't make any noticeable difference and a similar test will be implemented on the helper side later. * sock_update_netprioidx() now takes struct sock_cgroup_data and can be moved to netprio_cgroup.h without causing include dependency loop. Moved. * The dummy version of sock_update_netprioidx() converted to a static inline function while at it. Signed-off-by: Tejun Heo Signed-off-by: David S.
Miller --- net/core/dev.c | 3 ++- net/core/netclassid_cgroup.c | 4 ++-- net/core/netprio_cgroup.c | 3 ++- net/core/scm.c | 4 ++-- net/core/sock.c | 15 ++------------- 5 files changed, 10 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e5c395473eba..8f705fcedb94 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2929,7 +2929,8 @@ static void skb_update_prio(struct sk_buff *skb) struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); if (!skb->priority && skb->sk && map) { - unsigned int prioidx = skb->sk->sk_cgrp_prioidx; + unsigned int prioidx = + sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 2e4df84c34a1..e60ded46b3ac 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -62,8 +62,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) struct socket *sock = sock_from_file(file, &err); if (sock) - sock->sk->sk_classid = (u32)(unsigned long)v; - + sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, + (unsigned long)v); return 0; } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 2b9159b7a28a..de42aa7f6c77 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -223,7 +223,8 @@ static int update_netprio(const void *v, struct file *file, unsigned n) int err; struct socket *sock = sock_from_file(file, &err); if (sock) - sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; + sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, + (unsigned long)v); return 0; } diff --git a/net/core/scm.c b/net/core/scm.c index 8a1741b14302..14596fb37172 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -289,8 +289,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) /* Bump the usage count and install the file. 
*/ sock = sock_from_file(fp[i], &err); if (sock) { - sock_update_netprioidx(sock->sk); - sock_update_classid(sock->sk); + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); } fd_install(new_fd, get_file(fp[i])); } diff --git a/net/core/sock.c b/net/core/sock.c index 7965ef487375..947741dc43fa 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1393,17 +1393,6 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) -void sock_update_netprioidx(struct sock *sk) -{ - if (in_interrupt()) - return; - - sk->sk_cgrp_prioidx = task_netprioidx(current); -} -EXPORT_SYMBOL_GPL(sock_update_netprioidx); -#endif - /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace @@ -1432,8 +1421,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); - sock_update_classid(sk); - sock_update_netprioidx(sk); + sock_update_classid(&sk->sk_cgrp_data); + sock_update_netprioidx(&sk->sk_cgrp_data); } return sk; -- cgit v1.2.3 From bd1060a1d67128bb8fbe2e1384c518912cbe54e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:53 -0500 Subject: sock, cgroup: add sock->sk_cgroup In cgroup v1, dealing with cgroup membership was difficult because the number of membership associations was unbound. As a result, cgroup v1 grew several controllers whose primary purpose is either tagging membership or pull in configuration knobs from other subsystems so that cgroup membership test can be avoided. net_cls and net_prio controllers are examples of the latter. They allow configuring network-specific attributes from cgroup side so that network subsystem can avoid testing cgroup membership; unfortunately, these are not only cumbersome but also problematic. Both net_cls and net_prio aren't properly hierarchical. 
Both inherit configuration from the parent on creation but there's no interaction afterwards. An ancestor doesn't restrict the behavior in its subtree in any way and configuration changes aren't propagated downwards. Especially when combined with cgroup delegation, this is problematic because delegatees can mess up whatever network configuration is implemented at the system level. net_prio would allow the delegatees to set whatever priority value regardless of CAP_NET_ADMIN and net_cls the same for classid. While it is possible to solve these issues from controller side by implementing hierarchical allowable ranges in both controllers, it would involve quite a bit of complexity in the controllers and further obfuscate network configuration as it becomes even more difficult to tell what's actually being configured looking from the network side. While not much can be done for v1 at this point, as membership handling is sane on cgroup v2, it'd be better to make cgroup matching behave like other network matches and classifiers than introducing further complications. In preparation, this patch updates sock->sk_cgrp_data handling so that it points to the v2 cgroup that sock was created in until either net_prio or net_cls is used. Once either of the two is used, sock->sk_cgrp_data reverts to its previous role of carrying prioidx and classid. This is to avoid adding yet another cgroup related field to struct sock. As the mode switching can happen at most once per boot, the switching mechanism is aimed at lowering hot path overhead. It may leak a finite, likely small, number of cgroup refs and report spurious prioidx or classid on switching; however, dynamic updates of prioidx and classid have always been racy and lossy - socks between creation and fd installation are never updated, config changes don't update existing sockets at all, and prioidx may index with dead and recycled cgroup IDs. Non-critical inaccuracies from small race windows won't make any noticeable difference.
This patch doesn't make use of the pointer yet. The following patch will implement netfilter match for cgroup2 membership. v2: Use sock_cgroup_data to avoid inflating struct sock w/ another cgroup specific field. v3: Add comments explaining why sock_data_prioidx() and sock_data_classid() use different fallback values. Signed-off-by: Tejun Heo Cc: Daniel Borkmann Cc: Daniel Wagner CC: Neil Horman Signed-off-by: David S. Miller --- net/core/netclassid_cgroup.c | 7 ++++++- net/core/netprio_cgroup.c | 7 ++++++- net/core/sock.c | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index e60ded46b3ac..04257a0e3534 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -61,9 +61,12 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) int err; struct socket *sock = sock_from_file(file, &err); - if (sock) + if (sock) { + spin_lock(&cgroup_sk_update_lock); sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, (unsigned long)v); + spin_unlock(&cgroup_sk_update_lock); + } return 0; } @@ -98,6 +101,8 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, { struct cgroup_cls_state *cs = css_cls_state(css); + cgroup_sk_alloc_disable(); + cs->classid = (u32)value; update_classid(css, (void *)(unsigned long)cs->classid); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index de42aa7f6c77..053d60c33395 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -209,6 +209,8 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (!dev) return -ENODEV; + cgroup_sk_alloc_disable(); + rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); @@ -222,9 +224,12 @@ static int update_netprio(const void *v, struct file *file, unsigned n) { int err; struct socket *sock = sock_from_file(file, &err); - if (sock) + if (sock) { + spin_lock(&cgroup_sk_update_lock); 
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); + spin_unlock(&cgroup_sk_update_lock); + } return 0; } diff --git a/net/core/sock.c b/net/core/sock.c index 947741dc43fa..1278d7b7bd9a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1363,6 +1363,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); + cgroup_sk_alloc(&sk->sk_cgrp_data); } return sk; @@ -1385,6 +1386,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) owner = prot->owner; slab = prot->slab; + cgroup_sk_free(&sk->sk_cgrp_data); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); -- cgit v1.2.3 From 6ff64f6f9242d7e50f3e99cb280f69d1927a5fa6 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Dec 2015 16:03:35 +0100 Subject: switchdev: Pass original device to port netdev driver switchdev drivers need to know the netdev on which the switchdev op was invoked. For example, the STP state of a VLAN interface configured on top of a port can change while being member in a bridge. In this case, the underlying driver should only change the STP state of that particular VLAN and not of all the VLANs configured on the port. However, current switchdev infrastructure only passes the port netdev down to the driver. Solve that by passing the original device down to the driver as part of the required switchdev object / attribute. This doesn't entail any change in current switchdev drivers. It simply enables those supporting stacked devices to know the originating device and act accordingly. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 1 + net/core/rtnetlink.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f88a62ab019d..bca8c350e7f3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -471,6 +471,7 @@ static ssize_t phys_switch_id_show(struct device *dev, if (dev_isalive(netdev)) { struct switchdev_attr attr = { + .orig_dev = netdev, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 34ba7a08876d..d8b0113d3eec 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1027,6 +1027,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct switchdev_attr attr = { + .orig_dev = dev, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; -- cgit v1.2.3 From 53692b1de419c1b59106909c7f6b4dd3dbc768ac Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:41 -0800 Subject: sctp: Rename NETIF_F_SCTP_CSUM to NETIF_F_SCTP_CRC The SCTP checksum is really a CRC and is very different from the standard 1's complement checksum that serves as the checksum for IP protocols. This offload interface is also very different. Rename NETIF_F_SCTP_CSUM to NETIF_F_SCTP_CRC to highlight these differences. The term CSUM should be reserved in the stack to refer to the standard 1's complement IP checksum. Signed-off-by: Tom Herbert Signed-off-by: David S.
Miller --- net/core/ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 29edf74846fc..4a0cab85d67d 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -87,7 +87,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", - [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", + [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", [NETIF_F_RXHASH_BIT] = "rx-hashing", @@ -235,7 +235,7 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) switch (eth_cmd) { case ETHTOOL_GTXCSUM: case ETHTOOL_STXCSUM: - return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM; + return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CRC; case ETHTOOL_GRXCSUM: case ETHTOOL_SRXCSUM: return NETIF_F_RXCSUM; -- cgit v1.2.3 From a188222b6ed29404ac2d4232d35d1fe0e77af370 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:43 -0800 Subject: net: Rename NETIF_F_ALL_CSUM to NETIF_F_CSUM_MASK The name NETIF_F_ALL_CSUM is a misnomer. This does not correspond to the set of features for offloading all checksums. This is a mask of the checksum offload related feature bits. It is incorrect to set both NETIF_F_HW_CSUM and NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM at the same time for features of a device. This patch: - Changes instances of NETIF_F_ALL_CSUM to NETIF_F_CSUM_MASK (where NETIF_F_ALL_CSUM is being used as a mask). - Changes bonding, sfc/efx, ipvlan, macvlan, vlan, and team drivers to use NETIF_F_HW_CSUM in features list instead of NETIF_F_ALL_CSUM. Signed-off-by: Tom Herbert Signed-off-by: David S.
Miller --- net/core/dev.c | 10 +++++----- net/core/ethtool.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 8f705fcedb94..5a3b5a404642 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2645,7 +2645,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { - features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_CSUM_MASK; } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } @@ -2792,7 +2792,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && + if (!(features & NETIF_F_CSUM_MASK) && skb_checksum_help(skb)) goto out_kfree_skb; } @@ -7572,15 +7572,15 @@ netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { if (mask & NETIF_F_GEN_CSUM) - mask |= NETIF_F_ALL_CSUM; + mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; - all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; + all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. 
*/ if (all & NETIF_F_GEN_CSUM) - all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); return all; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4a0cab85d67d..09948a726347 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -235,7 +235,7 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) switch (eth_cmd) { case ETHTOOL_GTXCSUM: case ETHTOOL_STXCSUM: - return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CRC; + return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC; case ETHTOOL_GRXCSUM: case ETHTOOL_SRXCSUM: return NETIF_F_RXCSUM; -- cgit v1.2.3 From c8cd0989bd151fda87bbf10887b3df18021284bc Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:44 -0800 Subject: net: Eliminate NETIF_F_GEN_CSUM and NETIF_F_V[46]_CSUM These netif flags are unnecessary convolutions. It is more straightforward to just use NETIF_F_HW_CSUM, NETIF_F_IP_CSUM, and NETIF_F_IPV6_CSUM directly. This patch also: - Cleans up can_checksum_protocol - Simplifies netdev_intersect_features Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++++------ net/core/pktgen.c | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5a3b5a404642..45b013f27625 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6467,9 +6467,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, /* UFO needs SG and checksumming */ if (features & NETIF_F_UFO) { /* maybe split UFO into V4 and V6? 
*/ - if (!((features & NETIF_F_GEN_CSUM) || - (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) - == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + if (!(features & NETIF_F_HW_CSUM) && + ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != + (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { netdev_dbg(dev, "Dropping NETIF_F_UFO since no checksum offload features.\n"); features &= ~NETIF_F_UFO; @@ -7571,7 +7571,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { - if (mask & NETIF_F_GEN_CSUM) + if (mask & NETIF_F_HW_CSUM) mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; @@ -7579,8 +7579,8 @@ netdev_features_t netdev_increment_features(netdev_features_t all, all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ - if (all & NETIF_F_GEN_CSUM) - all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); + if (all & NETIF_F_HW_CSUM) + all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index de8d5cc5eb24..2be144498bcf 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2898,7 +2898,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, if (!(pkt_dev->flags & F_UDPCSUM)) { skb->ip_summed = CHECKSUM_NONE; - } else if (odev->features & NETIF_F_V4_CSUM) { + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; udp4_hwcsum(skb, iph->saddr, iph->daddr); @@ -3032,7 +3032,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, if (!(pkt_dev->flags & F_UDPCSUM)) { skb->ip_summed = CHECKSUM_NONE; - } else if (odev->features & NETIF_F_V6_CSUM) { + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM)) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); -- cgit v1.2.3 
From 6ae23ad36253a8033c5714c52b691b84456487c5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:46 -0800 Subject: net: Add driver helper functions to determine checksum offloadability Add skb_csum_offload_chk driver helper function to determine if a device with limited checksum offload capabilities is able to offload the checksum for a given packet. This patch includes: - The skb_csum_offload_chk function. Returns true if checksum is offloadable, else false. Optionally, in the case that the checksum is not offloadable, the function can call skb_checksum_help to resolve the checksum. skb_csum_offload_chk also returns whether the checksum refers to an encapsulated checksum. - Definition of skb_csum_offl_spec structure that caller uses to indicate rules about what it can offload (e.g. IPv4/v6, TCP/UDP only, whether encapsulated checksums can be offloaded, whether checksum with IPv6 extension headers can be offloaded). - Ancillary functions called skb_csum_offload_chk_help, skb_csum_off_chk_help_cmn, skb_csum_off_chk_help_cmn_v4_only. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 45b013f27625..914b4a24c654 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -138,6 +138,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2471,6 +2472,141 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +/* skb_csum_offload_check - Driver helper function to determine if a device + * with limited checksum offload capabilities is able to offload the checksum + * for a given packet. + * + * Arguments: + * skb - sk_buff for the packet in question + * spec - contains the description of what device can offload + * csum_encapped - returns true if the checksum being offloaded is + * encpasulated.
That is it is checksum for the transport header + * in the inner headers. + * checksum_help - when set indicates that helper function should + * call skb_checksum_help if offload checks fail + * + * Returns: + * true: Packet has passed the checksum checks and should be offloadable to + * the device (a driver may still need to check for additional + * restrictions of its device) + * false: Checksum is not offloadable. If checksum_help was set then + * skb_checksum_help was called to resolve checksum for non-GSO + * packets and when IP protocol is not SCTP + */ +bool __skb_csum_offload_chk(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec, + bool *csum_encapped, + bool csum_help) +{ + struct iphdr *iph; + struct ipv6hdr *ipv6; + void *nhdr; + int protocol; + u8 ip_proto; + + if (skb->protocol == htons(ETH_P_8021Q) || + skb->protocol == htons(ETH_P_8021AD)) { + if (!spec->vlan_okay) + goto need_help; + } + + /* We check whether the checksum refers to a transport layer checksum in + * the outermost header or an encapsulated transport layer checksum that + * corresponds to the inner headers of th