summary | refs | log | tree | commit | diff
path: root/net/netfilter/ipvs
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2022-12-13 15:47:48 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2022-12-13 15:47:48 -0800
commit 7e68dd7d07a28faa2e6574dd6b9dbd90cdeaae91 (patch)
tree ae0427c5a3b905f24b3a44b510a9bcf35d9b67a3 /net/netfilter/ipvs
parent 1ca06f1c1acecbe02124f14a37cce347b8c1a90c (diff)
parent 7c4a6309e27f411743817fe74a832ec2d2798a4b (diff)
download linux-7e68dd7d07a28faa2e6574dd6b9dbd90cdeaae91.tar.gz
linux-7e68dd7d07a28faa2e6574dd6b9dbd90cdeaae91.tar.bz2
linux-7e68dd7d07a28faa2e6574dd6b9dbd90cdeaae91.zip
Merge tag 'net-next-6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "Core: - Allow live renaming when an interface is up - Add retpoline wrappers for tc, improving considerably the performances of complex queue discipline configurations - Add inet drop monitor support - A few GRO performance improvements - Add infrastructure for atomic dev stats, addressing long standing data races - De-duplicate common code between OVS and conntrack offloading infrastructure - A bunch of UBSAN_BOUNDS/FORTIFY_SOURCE improvements - Netfilter: introduce packet parser for tunneled packets - Replace IPVS timer-based estimators with kthreads to scale up the workload with the number of available CPUs - Add the helper support for connection-tracking OVS offload BPF: - Support for user defined BPF objects: the use case is to allocate own objects, build own object hierarchies and use the building blocks to build own data structures flexibly, for example, linked lists in BPF - Make cgroup local storage available to non-cgroup attached BPF programs - Avoid unnecessary deadlock detection and failures wrt BPF task storage helpers - A relevant bunch of BPF verifier fixes and improvements - Veristat tool improvements to support custom filtering, sorting, and replay of results - Add LLVM disassembler as default library for dumping JITed code - Lots of new BPF documentation for various BPF maps - Add bpf_rcu_read_{,un}lock() support for sleepable programs - Add RCU grace period chaining to BPF to wait for the completion of access from both sleepable and non-sleepable BPF programs - Add support storing struct task_struct objects as kptrs in maps - Improve helper UAPI by explicitly defining BPF_FUNC_xxx integer values - Add libbpf *_opts API-variants for bpf_*_get_fd_by_id() functions Protocols: - TCP: implement Protective Load Balancing across switch links - TCP: allow dynamically disabling TCP-MD5 static key, reverting back to fast[er]-path - UDP: Introduce optional per-netns hash lookup table - IPv6: simplify and cleanup 
sockets disposal - Netlink: support different type policies for each generic netlink operation - MPTCP: add MSG_FASTOPEN and FastOpen listener side support - MPTCP: add netlink notification support for listener sockets events - SCTP: add VRF support, allowing sctp sockets binding to VRF devices - Add bridging MAC Authentication Bypass (MAB) support - Extensions for Ethernet VPN bridging implementation to better support multicast scenarios - More work for Wi-Fi 7 support, comprising conversion of all the existing drivers to internal TX queue usage - IPSec: introduce a new offload type (packet offload) allowing complete header processing and crypto offloading - IPSec: extended ack support for more descriptive XFRM error reporting - RXRPC: increase SACK table size and move processing into a per-local endpoint kernel thread, reducing considerably the required locking - IEEE 802154: synchronous send frame and extended filtering support, initial support for scanning available 15.4 networks - Tun: bump the link speed from 10Mbps to 10Gbps - Tun/VirtioNet: implement UDP segmentation offload support Driver API: - PHY/SFP: improve power level switching between standard level 1 and the higher power levels - New API for netdev <-> devlink_port linkage - PTP: convert existing drivers to new frequency adjustment implementation - DSA: add support for rx offloading - Autoload DSA tagging driver when dynamically changing protocol - Add new PCP and APPTRUST attributes to Data Center Bridging - Add configuration support for 800Gbps link speed - Add devlink port function attribute to enable/disable RoCE and migratable - Extend devlink-rate to support strict prioriry and weighted fair queuing - Add devlink support to directly reading from region memory - New device tree helper to fetch MAC address from nvmem - New big TCP helper to simplify temporary header stripping New hardware / drivers: - Ethernet: - Marvel Octeon CNF95N and CN10KB Ethernet Switches - Marvel Prestera AC5X Ethernet 
Switch - WangXun 10 Gigabit NIC - Motorcomm yt8521 Gigabit Ethernet - Microchip ksz9563 Gigabit Ethernet Switch - Microsoft Azure Network Adapter - Linux Automation 10Base-T1L adapter - PHY: - Aquantia AQR112 and AQR412 - Motorcomm YT8531S - PTP: - Orolia ART-CARD - WiFi: - MediaTek Wi-Fi 7 (802.11be) devices - RealTek rtw8821cu, rtw8822bu, rtw8822cu and rtw8723du USB devices - Bluetooth: - Broadcom BCM4377/4378/4387 Bluetooth chipsets - Realtek RTL8852BE and RTL8723DS - Cypress.CYW4373A0 WiFi + Bluetooth combo device Drivers: - CAN: - gs_usb: bus error reporting support - kvaser_usb: listen only and bus error reporting support - Ethernet NICs: - Intel (100G): - extend action skbedit to RX queue mapping - implement devlink-rate support - support direct read from memory - nVidia/Mellanox (mlx5): - SW steering improvements, increasing rules update rate - Support for enhanced events compression - extend H/W offload packet manipulation capabilities - implement IPSec packet offload mode - nVidia/Mellanox (mlx4): - better big TCP support - Netronome Ethernet NICs (nfp): - IPsec offload support - add support for multicast filter - Broadcom: - RSS and PTP support improvements - AMD/SolarFlare: - netlink extened ack improvements - add basic flower matches to offload, and related stats - Virtual NICs: - ibmvnic: introduce affinity hint support - small / embedded: - FreeScale fec: add initial XDP support - Marvel mv643xx_eth: support MII/GMII/RGMII modes for Kirkwood - TI am65-cpsw: add suspend/resume support - Mediatek MT7986: add RX wireless wthernet dispatch support - Realtek 8169: enable GRO software interrupt coalescing per default - Ethernet high-speed switches: - Microchip (sparx5): - add support for Sparx5 TC/flower H/W offload via VCAP - Mellanox mlxsw: - add 802.1X and MAC Authentication Bypass offload support - add ip6gre support - Embedded Ethernet switches: - Mediatek (mtk_eth_soc): - improve PCS implementation, add DSA untag support - enable flow offload support 
- Renesas: - add rswitch R-Car Gen4 gPTP support - Microchip (lan966x): - add full XDP support - add TC H/W offload via VCAP - enable PTP on bridge interfaces - Microchip (ksz8): - add MTU support for KSZ8 series - Qualcomm 802.11ax WiFi (ath11k): - support configuring channel dwell time during scan - MediaTek WiFi (mt76): - enable Wireless Ethernet Dispatch (WED) offload support - add ack signal support - enable coredump support - remain_on_channel support - Intel WiFi (iwlwifi): - enable Wi-Fi 7 Extremely High Throughput (EHT) PHY capabilities - 320 MHz channels support - RealTek WiFi (rtw89): - new dynamic header firmware format support - wake-over-WLAN support" * tag 'net-next-6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2002 commits) ipvs: fix type warning in do_div() on 32 bit net: lan966x: Remove a useless test in lan966x_ptp_add_trap() net: ipa: add IPA v4.7 support dt-bindings: net: qcom,ipa: Add SM6350 compatible bnxt: Use generic HBH removal helper in tx path IPv6/GRO: generic helper to remove temporary HBH/jumbo header in driver selftests: forwarding: Add bridge MDB test selftests: forwarding: Rename bridge_mdb test bridge: mcast: Support replacement of MDB port group entries bridge: mcast: Allow user space to specify MDB entry routing protocol bridge: mcast: Allow user space to add (*, G) with a source list and filter mode bridge: mcast: Add support for (*, G) with a source list and filter mode bridge: mcast: Avoid arming group timer when (S, G) corresponds to a source bridge: mcast: Add a flag for user installed source entries bridge: mcast: Expose __br_multicast_del_group_src() bridge: mcast: Expose br_multicast_new_group_src() bridge: mcast: Add a centralized error path bridge: mcast: Place netlink policy before validation functions bridge: mcast: Split (*, G) and (S, G) addition into different functions bridge: mcast: Do not derive entry type from its filter mode ...
Diffstat (limited to 'net/netfilter/ipvs')
-rw-r--r-- net/netfilter/ipvs/ip_vs_core.c 40
-rw-r--r-- net/netfilter/ipvs/ip_vs_ctl.c 452
-rw-r--r-- net/netfilter/ipvs/ip_vs_est.c 883
3 files changed, 1201 insertions, 174 deletions
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 51ad557a525b..2fcc26507d69 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -132,21 +132,21 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s = this_cpu_ptr(dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
svc = rcu_dereference(dest->svc);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -168,21 +168,21 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s = this_cpu_ptr(dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
svc = rcu_dereference(dest->svc);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -200,17 +200,17 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
s = this_cpu_ptr(cp->dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -2448,6 +2448,10 @@ static void __exit ip_vs_cleanup(void)
ip_vs_conn_cleanup();
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
+ /* common rcu_barrier() used by:
+ * - ip_vs_control_cleanup()
+ */
+ rcu_barrier();
pr_info("ipvs unloaded.\n");
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 988222fff9f0..c9f598505642 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -49,8 +49,7 @@
MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);
-/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
-static DEFINE_MUTEX(__ip_vs_mutex);
+DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */
/* sysctl variables */
@@ -241,6 +240,47 @@ static void defense_work_handler(struct work_struct *work)
}
#endif
+static void est_reload_work_handler(struct work_struct *work)
+{
+ struct netns_ipvs *ipvs =
+ container_of(work, struct netns_ipvs, est_reload_work.work);
+ int genid_done = atomic_read(&ipvs->est_genid_done);
+ unsigned long delay = HZ / 10; /* repeat startups after failure */
+ bool repeat = false;
+ int genid;
+ int id;
+
+ mutex_lock(&ipvs->est_mutex);
+ genid = atomic_read(&ipvs->est_genid);
+ for (id = 0; id < ipvs->est_kt_count; id++) {
+ struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
+
+ /* netns clean up started, abort delayed work */
+ if (!ipvs->enable)
+ goto unlock;
+ if (!kd)
+ continue;
+ /* New config ? Stop kthread tasks */
+ if (genid != genid_done)
+ ip_vs_est_kthread_stop(kd);
+ if (!kd->task && !ip_vs_est_stopped(ipvs)) {
+ /* Do not start kthreads above 0 in calc phase */
+ if ((!id || !ipvs->est_calc_phase) &&
+ ip_vs_est_kthread_start(ipvs, kd) < 0)
+ repeat = true;
+ }
+ }
+
+ atomic_set(&ipvs->est_genid_done, genid);
+
+ if (repeat)
+ queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
+ delay);
+
+unlock:
+ mutex_unlock(&ipvs->est_mutex);
+}
+
int
ip_vs_use_count_inc(void)
{
@@ -471,7 +511,7 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
static void ip_vs_service_free(struct ip_vs_service *svc)
{
- free_percpu(svc->stats.cpustats);
+ ip_vs_stats_release(&svc->stats);
kfree(svc);
}
@@ -483,17 +523,14 @@ static void ip_vs_service_rcu_free(struct rcu_head *head)
ip_vs_service_free(svc);
}
-static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
+static void __ip_vs_svc_put(struct ip_vs_service *svc)
{
if (atomic_dec_and_test(&svc->refcnt)) {
IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port));
- if (do_delay)
- call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
- else
- ip_vs_service_free(svc);
+ call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
}
}
@@ -780,14 +817,22 @@ out:
return dest;
}
+static void ip_vs_dest_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest *dest;
+
+ dest = container_of(head, struct ip_vs_dest, rcu_head);
+ ip_vs_stats_release(&dest->stats);
+ ip_vs_dest_put_and_free(dest);
+}
+
static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
__ip_vs_dst_cache_reset(dest);
- __ip_vs_svc_put(svc, false);
- free_percpu(dest->stats.cpustats);
- ip_vs_dest_put_and_free(dest);
+ __ip_vs_svc_put(svc);
+ call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
}
/*
@@ -811,12 +856,22 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
}
}
+static void ip_vs_stats_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_stats_rcu *rs = container_of(head,
+ struct ip_vs_stats_rcu,
+ rcu_head);
+
+ ip_vs_stats_release(&rs->s);
+ kfree(rs);
+}
+
static void
ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
{
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
- spin_lock_bh(&src->lock);
+ spin_lock(&src->lock);
IP_VS_SHOW_STATS_COUNTER(conns);
IP_VS_SHOW_STATS_COUNTER(inpkts);
@@ -826,7 +881,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
ip_vs_read_estimator(dst, src);
- spin_unlock_bh(&src->lock);
+ spin_unlock(&src->lock);
}
static void
@@ -847,7 +902,7 @@ ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
- spin_lock_bh(&stats->lock);
+ spin_lock(&stats->lock);
/* get current counters as zero point, rates are zeroed */
@@ -861,7 +916,48 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
ip_vs_zero_estimator(stats);
- spin_unlock_bh(&stats->lock);
+ spin_unlock(&stats->lock);
+}
+
+/* Allocate fields after kzalloc */
+int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
+{
+ int i;
+
+ spin_lock_init(&s->lock);
+ s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!s->cpustats)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);
+
+ u64_stats_init(&cs->syncp);
+ }
+ return 0;
+}
+
+struct ip_vs_stats *ip_vs_stats_alloc(void)
+{
+ struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s && ip_vs_stats_init_alloc(s) >= 0)
+ return s;
+ kfree(s);
+ return NULL;
+}
+
+void ip_vs_stats_release(struct ip_vs_stats *stats)
+{
+ free_percpu(stats->cpustats);
+}
+
+void ip_vs_stats_free(struct ip_vs_stats *stats)
+{
+ if (stats) {
+ ip_vs_stats_release(stats);
+ kfree(stats);
+ }
}
/*
@@ -923,7 +1019,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
if (old_svc != svc) {
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
- __ip_vs_svc_put(old_svc, true);
+ __ip_vs_svc_put(old_svc);
}
}
@@ -942,7 +1038,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
spin_unlock_bh(&dest->dst_lock);
if (add) {
- ip_vs_start_estimator(svc->ipvs, &dest->stats);
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
sched = rcu_dereference_protected(svc->scheduler, 1);
@@ -963,14 +1058,13 @@ static int
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
- unsigned int atype, i;
+ unsigned int atype;
+ int ret;
EnterFunction(2);
#ifdef CONFIG_IP_VS_IPV6
if (udest->af == AF_INET6) {
- int ret;
-
atype = ipv6_addr_type(&udest->addr.in6);
if ((!(atype & IPV6_ADDR_UNICAST) ||
atype & IPV6_ADDR_LINKLOCAL) &&
@@ -992,15 +1086,13 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
if (dest == NULL)
return -ENOMEM;
- dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!dest->stats.cpustats)
+ ret = ip_vs_stats_init_alloc(&dest->stats);
+ if (ret < 0)
goto err_alloc;
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ip_vs_dest_stats;
- ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
- u64_stats_init(&ip_vs_dest_stats->syncp);
- }
+ ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+ if (ret < 0)
+ goto err_stats;
dest->af = udest->af;
dest->protocol = svc->protocol;
@@ -1017,15 +1109,17 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
INIT_HLIST_NODE(&dest->d_list);
spin_lock_init(&dest->dst_lock);
- spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
LeaveFunction(2);
return 0;
+err_stats:
+ ip_vs_stats_release(&dest->stats);
+
err_alloc:
kfree(dest);
- return -ENOMEM;
+ return ret;
}
@@ -1087,14 +1181,18 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
+ ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+ if (ret < 0)
+ goto err;
__ip_vs_update_dest(svc, dest, udest, 1);
- ret = 0;
} else {
/*
* Allocate and initialize the dest structure
*/
ret = ip_vs_new_dest(svc, udest);
}
+
+err:
LeaveFunction(2);
return ret;
@@ -1284,7 +1382,7 @@ static int
ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
- int ret = 0, i;
+ int ret = 0;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
@@ -1344,18 +1442,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
ret = -ENOMEM;
goto out_err;
}
- svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!svc->stats.cpustats) {
- ret = -ENOMEM;
+ ret = ip_vs_stats_init_alloc(&svc->stats);
+ if (ret < 0)
goto out_err;
- }
-
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ip_vs_stats;
- ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
- u64_stats_init(&ip_vs_stats->syncp);
- }
-
/* I'm the first user of the service */
atomic_set(&svc->refcnt, 0);
@@ -1372,7 +1461,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
INIT_LIST_HEAD(&svc->destinations);
spin_lock_init(&svc->sched_lock);
- spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
if (sched) {
@@ -1382,6 +1470,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
sched = NULL;
}
+ ret = ip_vs_start_estimator(ipvs, &svc->stats);
+ if (ret < 0)
+ goto out_err;
+
/* Bind the ct retriever */
RCU_INIT_POINTER(svc->pe, pe);
pe = NULL;
@@ -1394,8 +1486,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
if (svc->pe && svc->pe->conn_out)
atomic_inc(&ipvs->conn_out_counter);
- ip_vs_start_estimator(ipvs, &svc->stats);
-
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
ipvs->num_services++;
@@ -1406,8 +1496,15 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
ip_vs_svc_hash(svc);
*svc_p = svc;
- /* Now there is a service - full throttle */
- ipvs->enable = 1;
+
+ if (!ipvs->enable) {
+ /* Now there is a service - full throttle */
+ ipvs->enable = 1;
+
+ /* Start estimation for first time */
+ ip_vs_est_reload_start(ipvs);
+ }
+
return 0;
@@ -1571,7 +1668,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
/*
* Free the service if nobody refers to it
*/
- __ip_vs_svc_put(svc, true);
+ __ip_vs_svc_put(svc);
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -1761,7 +1858,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
}
}
- ip_vs_zero_stats(&ipvs->tot_stats);
+ ip_vs_zero_stats(&ipvs->tot_stats->s);
return 0;
}
@@ -1843,6 +1940,148 @@ proc_do_sync_ports(struct ctl_table *table, int write,
return rc;
}
+static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ cpumask_var_t *valp = table->data;
+ cpumask_var_t newmask;
+ int ret;
+
+ if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = cpulist_parse(buffer, newmask);
+ if (ret)
+ goto out;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ if (!ipvs->est_cpulist_valid) {
+ if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ ipvs->est_cpulist_valid = 1;
+ }
+ cpumask_and(newmask, newmask, &current->cpus_mask);
+ cpumask_copy(*valp, newmask);
+ /* est_max_threads may depend on cpulist size */
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+ ipvs->est_calc_phase = 1;
+ ip_vs_est_reload_start(ipvs);
+
+unlock:
+ mutex_unlock(&ipvs->est_mutex);
+
+out:
+ free_cpumask_var(newmask);
+ return ret;
+}
+
+static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer,
+ size_t size)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ cpumask_var_t *valp = table->data;
+ struct cpumask *mask;
+ int ret;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ if (ipvs->est_cpulist_valid)
+ mask = *valp;
+ else
+ mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
+ ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+
+ mutex_unlock(&ipvs->est_mutex);
+
+ return ret;
+}
+
+static int ipvs_proc_est_cpulist(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ /* Ignore both read and write(append) if *ppos not 0 */
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write) {
+ /* proc_sys_call_handler() appends terminator */
+ ret = ipvs_proc_est_cpumask_set(table, buffer);
+ if (ret >= 0)
+ *ppos += *lenp;
+ } else {
+ /* proc_sys_call_handler() allocates 1 byte for terminator */
+ ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
+ if (ret >= 0) {
+ *lenp = ret;
+ *ppos += *lenp;
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+static int ipvs_proc_est_nice(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ int *valp = table->data;
+ int val = *valp;
+ int ret;
+
+ struct ctl_table tmp_table = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (write && ret >= 0) {
+ if (val < MIN_NICE || val > MAX_NICE) {
+ ret = -EINVAL;
+ } else {
+ mutex_lock(&ipvs->est_mutex);
+ if (*valp != val) {
+ *valp = val;
+ ip_vs_est_reload_start(ipvs);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+ }
+ }
+ return ret;
+}
+
+static int ipvs_proc_run_estimation(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ int *valp = table->data;
+ int val = *valp;
+ int ret;
+
+ struct ctl_table tmp_table = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (write && ret >= 0) {
+ mutex_lock(&ipvs->est_mutex);
+ if (*valp != val) {
+ *valp = val;
+ ip_vs_est_reload_start(ipvs);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+ }
+ return ret;
+}
+
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
* Do not change order or insert new entries without
@@ -2017,7 +2256,19 @@ static struct ctl_table vs_vars[] = {
.procname = "run_estimation",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = ipvs_proc_run_estimation,
+ },
+ {
+ .procname = "est_cpulist",
+ .maxlen = NR_CPUS, /* unused */
+ .mode = 0644,
+ .proc_handler = ipvs_proc_est_cpulist,
+ },
+ {
+ .procname = "est_nice",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipvs_proc_est_nice,
},
#ifdef CONFIG_IP_VS_DEBUG
{
@@ -2255,7 +2506,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
seq_puts(seq,
" Conns Packets Packets Bytes Bytes\n");
- ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
+ ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s);
seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
(unsigned long long)show.conns,
(unsigned long long)show.inpkts,
@@ -2279,7 +2530,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_single_net(seq);
- struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
+ struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s;
struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
struct ip_vs_kstats kstats;
int i;
@@ -2296,13 +2547,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
u64 conns, inpkts, outpkts, inbytes, outbytes;
do {
- start = u64_stats_fetch_begin_irq(&u->syncp);
- conns = u->cnt.conns;
- inpkts = u->cnt.inpkts;
- outpkts = u->cnt.outpkts;
- inbytes = u->cnt.inbytes;
- outbytes = u->cnt.outbytes;
- } while (u64_stats_fetch_retry_irq(&u->syncp, start));
+ start = u64_stats_fetch_begin(&u->syncp);
+ conns = u64_stats_read(&u->cnt.conns);
+ inpkts = u64_stats_read(&u->cnt.inpkts);
+ outpkts = u64_stats_read(&u->cnt.outpkts);
+ inbytes = u64_stats_read(&u->cnt.inbytes);
+ outbytes = u64_stats_read(&u->cnt.outbytes);
+ } while (u64_stats_fetch_retry(&u->syncp, start));
seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
i, (u64)conns, (u64)inpkts,
@@ -4027,13 +4278,17 @@ static void ip_vs_genl_unregister(void)
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
struct net *net = ipvs->net;
- int idx;
struct ctl_table *tbl;
+ int idx, ret;
atomic_set(&ipvs->dropentry, 0);
spin_lock_init(&ipvs->dropentry_lock);
spin_lock_init(&ipvs->droppacket_lock);
spin_lock_init(&ipvs->securetcp_lock);
+ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+ INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
+ expire_nodest_conn_handler);
+ ipvs->est_stopped = 0;
if (!net_eq(net, &init_net)) {
tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
@@ -4094,31 +4349,44 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
ipvs->sysctl_run_estimation = 1;
+ tbl[idx].extra2 = ipvs;
tbl[idx++].data = &ipvs->sysctl_run_estimation;
+
+ ipvs->est_cpulist_valid = 0;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_est_cpulist;
+
+ ipvs->sysctl_est_nice = IPVS_EST_NICE;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_est_nice;
+
#ifdef CONFIG_IP_VS_DEBUG
/* Global sysctls must be ro in non-init netns */
if (!net_eq(net, &init_net))
tbl[idx++].mode = 0444;
#endif
+ ret = -ENOMEM;
ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
- if (ipvs->sysctl_hdr == NULL) {
- if (!net_eq(net, &init_net))
- kfree(tbl);
- return -ENOMEM;
- }
- ip_vs_start_estimator(ipvs, &ipvs->tot_stats);
+ if (!ipvs->sysctl_hdr)
+ goto err;
ipvs->sysctl_tbl = tbl;
+
+ ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
+ if (ret < 0)
+ goto err;
+
/* Schedule defense work */
- INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
queue_delayed_work(system_long_wq, &ipvs->defense_work,
DEFENSE_TIMER_PERIOD);
- /* Init delayed work for expiring no dest conn */
- INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
- expire_nodest_conn_handler);
-
return 0;
+
+err:
+ unregister_net_sysctl_table(ipvs->sysctl_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(tbl);
+ return ret;
}
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
@@ -4129,7 +4397,10 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
- ip_vs_stop_estimator(ipvs, &ipvs->tot_stats);
+ ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+
+ if (ipvs->est_cpulist_valid)
+ free_cpumask_var(ipvs->sysctl_est_cpulist);
if (!net_eq(net, &init_net))
kfree(ipvs->sysctl_tbl);
@@ -4151,7 +4422,8 @@ static struct notifier_block ip_vs_dst_notifier = {
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
- int i, idx;
+ int ret = -ENOMEM;
+ int idx;
/* Initialize rs_table */
for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
@@ -4164,18 +4436,14 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
atomic_set(&ipvs->nullsvc_counter, 0);
atomic_set(&ipvs->conn_out_counter, 0);
- /* procfs stats */
- ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!ipvs->tot_stats.cpustats)
- return -ENOMEM;
-
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ipvs_tot_stats;
- ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
- u64_stats_init(&ipvs_tot_stats->syncp);
- }
+ INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
- spin_lock_init(&ipvs->tot_stats.lock);
+ /* procfs stats */
+ ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL);
+ if (!ipvs->tot_stats)
+ goto out;
+ if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
+ goto err_tot_stats;
#ifdef CONFIG_PROC_FS
if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
@@ -4190,7 +4458,8 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
goto err_percpu;
#endif
- if (ip_vs_control_net_init_sysctl(ipvs))
+ ret = ip_vs_control_net_init_sysctl(ipvs);
+ if (ret < 0)
goto err;
return 0;
@@ -4207,20 +4476,26 @@ err_stats:
err_vs:
#endif
- free_percpu(ipvs->tot_stats.cpustats);
- return -ENOMEM;
+ ip_vs_stats_release(&ipvs->tot_stats->s);
+
+err_tot_stats:
+ kfree(ipvs->tot_stats);
+
+out:
+ return ret;
}
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
ip_vs_trash_cleanup(ipvs);
ip_vs_control_net_cleanup_sysctl(ipvs);
+ cancel_delayed_work_sync(&ipvs->est_reload_work);
#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
remove_proc_entry("ip_vs", ipvs->net->proc_net);
#endif
- free_percpu(ipvs->tot_stats.cpustats);
+ call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
}
int __init ip_vs_register_nl_ioctl(void)
@@ -4280,5 +4555,6 @@ void ip_vs_control_cleanup(void)
{
EnterFunction(2);
unregister_netdevice_notifier(&ip_vs_dst_notifier);
+ /* relying on common rcu_barrier() in ip_vs_cleanup() */
LeaveFunction(2);
}
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 9a1a7af6a186..ce2a1549b304 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -30,9 +30,6 @@
long interval, it is easy to implement a user level daemon which
periodically reads those statistical counters and measure rate.
- Currently, the measurement is activated by slow timer handler. Hope
- this measurement will not introduce too much load.
-
We measure rate during the last 8 seconds every 2 seconds:
avgrate = avgrate*(1-W) + rate*W
@@ -47,68 +44,79 @@
to 32-bit values for conns, packets, bps, cps and pps.
* A lot of code is taken from net/core/gen_estimator.c
- */
-
-/*
- * Make a summary from each cpu
+ KEY POINTS:
+ - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
+ - kthreads read the cpustats to update the estimators (svcs, dests, total)
+ - the states of estimators can be read (get stats) or modified (zero stats)
+ from processes
+