summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jakub Kicinski <kuba@kernel.org> 2023-12-15 17:56:29 -0800
committer Jakub Kicinski <kuba@kernel.org> 2023-12-15 17:56:29 -0800
commit 358105ab92fc588aee0f37402f5705b031dc6f6f (patch)
tree dcf0343765740f75ebd2b1af6a36fca7c85d491f
parent 758a8d5b6a64ad63a8c0728f68dd3e21481013db (diff)
parent 207184853dbdb62d8b02c7a141d3297e94e33451 (diff)
download linux-358105ab92fc588aee0f37402f5705b031dc6f6f.tar.gz
linux-358105ab92fc588aee0f37402f5705b031dc6f6f.tar.bz2
linux-358105ab92fc588aee0f37402f5705b031dc6f6f.zip
Merge branch 'tcp-dccp-refine-source-port-selection'
Eric Dumazet says:

====================
tcp/dccp: refine source port selection

This patch series leverages IP_LOCAL_PORT_RANGE option
to no longer favor even source port selection at connect() time.

This should lower time taken by connect() for hosts having
many active connections to the same destination.
====================

Link: https://lore.kernel.org/r/20231214192939.1962891-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r-- include/net/ip.h | 2
-rw-r--r-- net/ipv4/inet_connection_sock.c | 21
-rw-r--r-- net/ipv4/inet_hashtables.c | 27
3 files changed, 33 insertions, 17 deletions
diff --git a/include/net/ip.h b/include/net/ip.h
index b31be912489a..de0c69c57e3c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -356,7 +356,7 @@ static inline void inet_get_local_port_range(const struct net *net, int *low, in
*low = range & 0xffff;
*high = range >> 16;
}
-void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
+bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
#ifdef CONFIG_SYSCTL
static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 70be0f6fe879..bd325b029dd1 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -117,16 +117,25 @@ bool inet_rcv_saddr_any(const struct sock *sk)
return !sk->sk_rcv_saddr;
}
-void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
+/**
+ * inet_sk_get_local_port_range - fetch ephemeral ports range
+ * @sk: socket
+ * @low: pointer to low port
+ * @high: pointer to high port
+ *
+ * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range)
+ * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option.
+ * Returns true if IP_LOCAL_PORT_RANGE was set on this socket.
+ */
+bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
{
- const struct inet_sock *inet = inet_sk(sk);
- const struct net *net = sock_net(sk);
int lo, hi, sk_lo, sk_hi;
+ bool local_range = false;
u32 sk_range;
- inet_get_local_port_range(net, &lo, &hi);
+ inet_get_local_port_range(sock_net(sk), &lo, &hi);
- sk_range = READ_ONCE(inet->local_port_range);
+ sk_range = READ_ONCE(inet_sk(sk)->local_port_range);
if (unlikely(sk_range)) {
sk_lo = sk_range & 0xffff;
sk_hi = sk_range >> 16;
@@ -135,10 +144,12 @@ void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
lo = sk_lo;
if (lo <= sk_hi && sk_hi <= hi)
hi = sk_hi;
+ local_range = true;
}
*low = lo;
*high = hi;
+ return local_range;
}
EXPORT_SYMBOL(inet_sk_get_local_port_range);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index a532f749e477..9ff201bc4e6d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -1012,7 +1012,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
bool tb_created = false;
u32 remaining, offset;
int ret, i, low, high;
- int l3mdev;
+ bool local_ports;
+ int step, l3mdev;
u32 index;
if (port) {
@@ -1024,10 +1025,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
l3mdev = inet_sk_bound_l3mdev(sk);
- inet_sk_get_local_port_range(sk, &low, &high);
+ local_ports = inet_sk_get_local_port_range(sk, &low, &high);
+ step = local_ports ? 1 : 2;
+
high++; /* [32768, 60999] -> [32768, 61000[ */
remaining = high - low;
- if (likely(remaining > 1))
+ if (!local_ports && remaining > 1)
remaining &= ~1U;
get_random_sleepable_once(table_perturb,
@@ -1040,10 +1043,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
/* In first pass we try ports of @low parity.
* inet_csk_get_port() does the opposite choice.
*/
- offset &= ~1U;
+ if (!local_ports)
+ offset &= ~1U;
other_parity_scan:
port = low + offset;
- for (i = 0; i < remaining; i += 2, port += 2) {
+ for (i = 0; i < remaining; i += step, port += step) {
if (unlikely(port >= high))
port -= remaining;
if (inet_is_local_reserved_port(net, port))
@@ -1083,10 +1087,11 @@ next_port:
cond_resched();
}
- offset++;
- if ((offset & 1) && remaining > 1)
- goto other_parity_scan;
-
+ if (!local_ports) {
+ offset++;
+ if ((offset & 1) && remaining > 1)
+ goto other_parity_scan;
+ }
return -EADDRNOTAVAIL;
ok:
@@ -1109,8 +1114,8 @@ ok:
* on low contention the randomness is maximal and on high contention
* it may be inexistent.
*/
- i = max_t(int, i, get_random_u32_below(8) * 2);
- WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
+ i = max_t(int, i, get_random_u32_below(8) * step);
+ WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port);