author    David S. Miller <davem@davemloft.net>  2015-10-03 04:32:52 -0700
committer David S. Miller <davem@davemloft.net>  2015-10-03 04:32:52 -0700
commit    c3fc7ac9a0b978ee8538058743d21feef25f7b33
tree      0caf05649d27830ba0f9548704abbb1ec4b5bb91
parent    f6d3125fa3c2f55ddf7cf69365c41089de6cfae6
parent    e994b2f0fb9229aeff5eea9541320bd7b2ca8714
Merge branch 'tcp-lockless-listener'
Eric Dumazet says:
====================
tcp/dccp: lockless listener
TCP listener refactoring: this is becoming interesting!
This patch series takes the steps needed to use the normal TCP/DCCP ehash
table to store SYN_RECV requests, instead of the private per-listener
hash table we had until now.
SYNACK skbs are now attached to their syn_recv request socket,
so that we no longer heavily modify the listener's sk_wmem_alloc.
The listener lock is no longer held in the fast path, including
SYNCOOKIE mode.
During my tests, my server was able to process 3,500,000
SYN packets per second on one listener and still had available
cpu cycles.
That is about 2 to 3 orders of magnitude more than what we had with older kernels.
This effort started two years ago and I am pleased to see it meet expectations.
We'll probably extend SO_REUSEPORT to add proper cpu/numa affinities,
so that heavy-duty TCP servers can get proper siloing thanks to multi-queue
NICs.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
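
The core mechanism described in the cover letter above is that SYN_RECV request
sockets now live in the shared ehash table and are found by an ordinary lookup;
the receive path then reaches the listener through the request's back-pointer
instead of taking the listener lock and walking a private SYN table. The
following is a minimal userspace sketch of that dispatch under simplified
assumptions: the sock/request_sock types and the check_req()/child_process()
helpers are illustrative stand-ins, not the kernel's real structures or APIs.

#include <stdio.h>

/* Userspace model of the new receive-path dispatch: a shared-hash lookup may
 * now return a request socket (NEW_SYN_RECV), and the listener is reached via
 * the request's back-pointer.  All names below are simplified stand-ins. */
enum sk_state { S_ESTABLISHED, S_LISTEN, S_NEW_SYN_RECV };

struct sock {
	enum sk_state state;
	int refcnt;
};

struct request_sock {
	struct sock sk;        /* a request sock is looked up like any sock */
	struct sock *listener; /* plays the role of req->rsk_listener */
};

static void sock_hold(struct sock *sk)        { sk->refcnt++; }
static void reqsk_put(struct request_sock *r) { r->sk.refcnt--; }

/* Stand-in for tcp_check_req()/dccp_check_req(): pretend the packet completes
 * the handshake and a child socket exists. */
static struct sock child_stub = { .state = S_ESTABLISHED, .refcnt = 1 };
static struct sock *check_req(struct sock *lsk, struct request_sock *req)
{
	(void)lsk; (void)req;
	return &child_stub;
}

static int child_process(struct sock *lsk, struct sock *child)
{
	(void)lsk; (void)child;
	return 0; /* packet accepted by the child */
}

/* Shape of the new branch in the rcv path: no listener lock is taken. */
static int rcv_dispatch(struct sock *sk)
{
	if (sk->state == S_NEW_SYN_RECV) {
		struct request_sock *req = (struct request_sock *)sk;
		struct sock *nsk = NULL;

		sk = req->listener;
		if (sk->state == S_LISTEN)
			nsk = check_req(sk, req);
		if (!nsk) {            /* invalid: drop the request and packet */
			reqsk_put(req);
			return -1;
		}
		if (nsk == sk) {       /* still the listener: hold it, drop req */
			sock_hold(sk);
			reqsk_put(req);
		} else if (child_process(sk, nsk)) {
			return -1;     /* child rejected the packet */
		} else {
			return 0;      /* fully handled by the new child */
		}
	}
	/* ... normal processing on sk continues here ... */
	return 0;
}

int main(void)
{
	struct sock listener = { .state = S_LISTEN, .refcnt = 1 };
	struct request_sock req = {
		.sk = { .state = S_NEW_SYN_RECV, .refcnt = 1 },
		.listener = &listener,
	};

	printf("dispatch returned %d\n", rcv_dispatch(&req.sk));
	return 0;
}

The design point is that the only shared state written on this path is the
request socket's own refcount; the listener is consulted read-mostly, which is
what lets a single listener absorb millions of SYN packets per second.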
-rw-r--r--  include/net/inet6_connection_sock.h |   9
-rw-r--r--  include/net/inet_connection_sock.h  |   9
-rw-r--r--  include/net/inet_hashtables.h       |   1
-rw-r--r--  include/net/request_sock.h          | 132
-rw-r--r--  include/net/tcp.h                   |  10
-rw-r--r--  net/core/request_sock.c             |  84
-rw-r--r--  net/dccp/ipv4.c                     |  64
-rw-r--r--  net/dccp/ipv6.c                     |  72
-rw-r--r--  net/ipv4/inet_connection_sock.c     | 145
-rw-r--r--  net/ipv4/inet_diag.c                |  96
-rw-r--r--  net/ipv4/inet_hashtables.c          |  14
-rw-r--r--  net/ipv4/syncookies.c               |   4
-rw-r--r--  net/ipv4/tcp_fastopen.c             |   4
-rw-r--r--  net/ipv4/tcp_input.c                |  30
-rw-r--r--  net/ipv4/tcp_ipv4.c                 | 159
-rw-r--r--  net/ipv4/tcp_minisocks.c            |   2
-rw-r--r--  net/ipv4/tcp_output.c               |  22
-rw-r--r--  net/ipv6/inet6_connection_sock.c    |  67
-rw-r--r--  net/ipv6/tcp_ipv6.c                 | 117
-rw-r--r--  net/sched/sch_fq.c                  |  12
20 files changed, 307 insertions, 746 deletions
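
One simplification visible in the diffstat above is that include/net/request_sock.h
and net/core/request_sock.c shrink considerably: the old listen_sock (with its
split qlen_inc/qlen_dec counters, hash_rnd and max_qlen_log) is replaced by two
atomics embedded directly in request_sock_queue, and the "queue is full" test
becomes a plain comparison against sk_max_ack_backlog. Below is a small C11
sketch of that accounting; it borrows the kernel's helper names but is a
userspace illustration under simplified assumptions, not the kernel code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Queue length and "young" (never-retransmitted) request counts are plain
 * atomics, bumped when a request is hashed and decremented when it is
 * unlinked or times out for the first time. */
struct request_sock_queue {
	atomic_int qlen;
	atomic_int young;
};

static void reqsk_queue_added(struct request_sock_queue *q)
{
	atomic_fetch_add(&q->young, 1);
	atomic_fetch_add(&q->qlen, 1);
}

/* In the kernel the "young" decision comes from req->num_timeout == 0;
 * here a bool parameter stands in for that check. */
static void reqsk_queue_removed(struct request_sock_queue *q, bool was_young)
{
	if (was_young)
		atomic_fetch_sub(&q->young, 1);
	atomic_fetch_sub(&q->qlen, 1);
}

static int reqsk_queue_len(struct request_sock_queue *q)
{
	return atomic_load(&q->qlen);
}

/* The old test was "qlen >> max_qlen_log"; the new one compares directly
 * against the listener's configured backlog. */
static bool reqsk_queue_is_full(struct request_sock_queue *q, int max_ack_backlog)
{
	return reqsk_queue_len(q) >= max_ack_backlog;
}

int main(void)
{
	struct request_sock_queue q = { 0 };
	int backlog = 2;

	reqsk_queue_added(&q);
	reqsk_queue_added(&q);
	printf("qlen=%d full=%d\n", reqsk_queue_len(&q),
	       reqsk_queue_is_full(&q, backlog) ? 1 : 0);
	reqsk_queue_removed(&q, true);
	printf("qlen=%d full=%d\n", reqsk_queue_len(&q),
	       reqsk_queue_is_full(&q, backlog) ? 1 : 0);
	return 0;
}

Dropping max_qlen_log also drops the rounding of the backlog to a power of two,
which is why inet_csk_reqsk_queue_is_full() now compares the length against
sk_max_ack_backlog directly.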
diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index 79b2a4c09ca6..064cfbe639d0 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -28,15 +28,6 @@ int inet6_csk_bind_conflict(const struct sock *sk, struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, const struct request_sock *req, u8 proto); -struct request_sock *inet6_csk_search_req(struct sock *sk, - const __be16 rport, - const struct in6_addr *raddr, - const struct in6_addr *laddr, - const int iif); - -void inet6_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - const unsigned long timeout); - void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index ee54f21a8113..3208a65d1c28 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -258,10 +258,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk, struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); -struct request_sock *inet_csk_search_req(struct sock *sk, - const __be16 rport, - const __be32 raddr, - const __be32 laddr); int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax); int inet_csk_get_port(struct sock *sk, unsigned short snum); @@ -282,8 +278,7 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk, void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout); -static inline void inet_csk_reqsk_queue_added(struct sock *sk, - const unsigned long timeout) +static inline void inet_csk_reqsk_queue_added(struct sock *sk) { reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue); } @@ -300,7 +295,7 @@ static inline int inet_csk_reqsk_queue_young(const struct sock *sk) static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { - return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue); + return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog; } void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3fb778d7c875..6683ada25fef 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -205,6 +205,7 @@ void inet_put_port(struct sock *sk); void inet_hashinfo_init(struct inet_hashinfo *h); +int inet_ehash_insert(struct sock *sk, struct sock *osk); void __inet_hash_nolisten(struct sock *sk, struct sock *osk); void __inet_hash(struct sock *sk, struct sock *osk); void inet_hash(struct sock *sk); diff --git a/include/net/request_sock.h b/include/net/request_sock.h index d2544de329bd..bae6936d75c4 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -69,6 +69,16 @@ struct request_sock { u32 peer_secid; }; +static inline struct request_sock *inet_reqsk(struct sock *sk) +{ + return (struct request_sock *)sk; +} + +static inline struct sock *req_to_sk(struct request_sock *req) +{ + return (struct sock *)req; +} + static inline struct request_sock * reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener) { @@ -78,6 +88,8 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener) req->rsk_ops = ops; sock_hold(sk_listener); req->rsk_listener = sk_listener; + req_to_sk(req)->sk_prot = sk_listener->sk_prot; + 
sk_node_init(&req_to_sk(req)->sk_node); req->saved_syn = NULL; /* Following is temporary. It is coupled with debugging * helpers in reqsk_put() & reqsk_free() @@ -87,16 +99,6 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener) return req; } -static inline struct request_sock *inet_reqsk(struct sock *sk) -{ - return (struct request_sock *)sk; -} - -static inline struct sock *req_to_sk(struct request_sock *req) -{ - return (struct sock *)req; -} - static inline void reqsk_free(struct request_sock *req) { /* temporary debugging */ @@ -117,25 +119,6 @@ static inline void reqsk_put(struct request_sock *req) extern int sysctl_max_syn_backlog; -/** struct listen_sock - listen state - * - * @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs - */ -struct listen_sock { - int qlen_inc; /* protected by listener lock */ - int young_inc;/* protected by listener lock */ - - /* following fields can be updated by timer */ - atomic_t qlen_dec; /* qlen = qlen_inc - qlen_dec */ - atomic_t young_dec; - - u32 max_qlen_log ____cacheline_aligned_in_smp; - u32 synflood_warned; - u32 hash_rnd; - u32 nr_table_entries; - struct request_sock *syn_table[0]; -}; - /* * For a TCP Fast Open listener - * lock - protects the access to all the reqsk, which is co-owned by @@ -169,43 +152,29 @@ struct fastopen_queue { * @rskq_accept_head - FIFO head of established children * @rskq_accept_tail - FIFO tail of established children * @rskq_defer_accept - User waits for some data after accept() - * @syn_wait_lock - serializer - * - * %syn_wait_lock is necessary only to avoid proc interface having to grab the main - * lock sock while browsing the listening hash (otherwise it's deadlock prone). * */ struct request_sock_queue { + spinlock_t rskq_lock; + u8 rskq_defer_accept; + + u32 synflood_warned; + atomic_t qlen; + atomic_t young; + struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; - u8 rskq_defer_accept; - struct listen_sock *listen_opt; struct fastopen_queue fastopenq; /* Check max_qlen != 0 to determine * if TFO is enabled. 
*/ - - /* temporary alignment, our goal is to get rid of this lock */ - spinlock_t syn_wait_lock ____cacheline_aligned_in_smp; }; -int reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries); +void reqsk_queue_alloc(struct request_sock_queue *queue); -void __reqsk_queue_destroy(struct request_sock_queue *queue); -void reqsk_queue_destroy(struct request_sock_queue *queue); void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset); -static inline struct request_sock * - reqsk_queue_yank_acceptq(struct request_sock_queue *queue) -{ - struct request_sock *req = queue->rskq_accept_head; - - queue->rskq_accept_head = NULL; - return req; -} - -static inline int reqsk_queue_empty(struct request_sock_queue *queue) +static inline bool reqsk_queue_empty(const struct request_sock_queue *queue) { return queue->rskq_accept_head == NULL; } @@ -215,6 +184,7 @@ static inline void reqsk_queue_add(struct request_sock_queue *queue, struct sock *parent, struct sock *child) { + spin_lock(&queue->rskq_lock); req->sk = child; sk_acceptq_added(parent); @@ -225,68 +195,48 @@ static inline void reqsk_queue_add(struct request_sock_queue *queue, queue->rskq_accept_tail = req; req->dl_next = NULL; + spin_unlock(&queue->rskq_lock); } -static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue) +static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue, + struct sock *parent) { - struct request_sock *req = queue->rskq_accept_head; - - WARN_ON(req == NULL); - - queue->rskq_accept_head = req->dl_next; - if (queue->rskq_accept_head == NULL) - queue->rskq_accept_tail = NULL; + struct request_sock *req; + spin_lock_bh(&queue->rskq_lock); + req = queue->rskq_accept_head; + if (req) { + sk_acceptq_removed(parent); + queue->rskq_accept_head = req->dl_next; + if (queue->rskq_accept_head == NULL) + queue->rskq_accept_tail = NULL; + } + spin_unlock_bh(&queue->rskq_lock); return req; } static inline void reqsk_queue_removed(struct request_sock_queue *queue, const struct request_sock *req) { - struct listen_sock *lopt = queue->listen_opt; - if (req->num_timeout == 0) - atomic_inc(&lopt->young_dec); - atomic_inc(&lopt->qlen_dec); + atomic_dec(&queue->young); + atomic_dec(&queue->qlen); } static inline void reqsk_queue_added(struct request_sock_queue *queue) { - struct listen_sock *lopt = queue->listen_opt; - - lopt->young_inc++; - lopt->qlen_inc++; -} - -static inline int listen_sock_qlen(const struct listen_sock *lopt) -{ - return lopt->qlen_inc - atomic_read(&lopt->qlen_dec); -} - -static inline int listen_sock_young(const struct listen_sock *lopt) -{ - return lopt->young_inc - atomic_read(&lopt->young_dec); + atomic_inc(&queue->young); + atomic_inc(&queue->qlen); } static inline int reqsk_queue_len(const struct request_sock_queue *queue) { - const struct listen_sock *lopt = queue->listen_opt; - - return lopt ? 
listen_sock_qlen(lopt) : 0; + return atomic_read(&queue->qlen); } static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) { - return listen_sock_young(queue->listen_opt); + return atomic_read(&queue->young); } -static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) -{ - return reqsk_queue_len(queue) >> queue->listen_opt->max_qlen_log; -} - -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout); - #endif /* _REQUEST_SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 2c7dfe52f473..a6be56d5f0e3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -462,7 +462,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc); + struct tcp_fastopen_cookie *foc, + bool attach_req); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); @@ -1618,7 +1619,6 @@ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, - TCP_SEQ_STATE_OPENREQ, TCP_SEQ_STATE_ESTABLISHED, }; @@ -1637,7 +1637,6 @@ struct tcp_iter_state { enum tcp_seq_states state; struct sock *syn_wait_sk; int bucket, offset, sbucket, num; - kuid_t uid; loff_t last_pos; }; @@ -1717,9 +1716,8 @@ struct tcp_request_sock_ops { __u32 (*init_seq)(const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, struct tcp_fastopen_cookie *foc); - void (*queue_hash_add)(struct sock *sk, struct request_sock *req, - const unsigned long timeout); + u16 queue_mapping, struct tcp_fastopen_cookie *foc, + bool attach_req); }; #ifdef CONFIG_SYN_COOKIES diff --git a/net/core/request_sock.c b/net/core/request_sock.c index e22cfa4ed25f..15c853806518 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -37,28 +37,9 @@ int sysctl_max_syn_backlog = 256; EXPORT_SYMBOL(sysctl_max_syn_backlog); -int reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries) +void reqsk_queue_alloc(struct request_sock_queue *queue) { - size_t lopt_size = sizeof(struct listen_sock); - struct listen_sock *lopt = NULL; - - nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); - nr_table_entries = max_t(u32, nr_table_entries, 8); - nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); - lopt_size += nr_table_entries * sizeof(struct request_sock *); - - if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) - lopt = kzalloc(lopt_size, GFP_KERNEL | - __GFP_NOWARN | - __GFP_NORETRY); - if (!lopt) - lopt = vzalloc(lopt_size); - if (!lopt) - return -ENOMEM; - - get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); - spin_lock_init(&queue->syn_wait_lock); + spin_lock_init(&queue->rskq_lock); spin_lock_init(&queue->fastopenq.lock); queue->fastopenq.rskq_rst_head = NULL; @@ -67,67 +48,6 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, queue->fastopenq.max_qlen = 0; queue->rskq_accept_head = NULL; - lopt->nr_table_entries = nr_table_entries; - lopt->max_qlen_log = ilog2(nr_table_entries); - - spin_lock_bh(&queue->syn_wait_lock); - queue->listen_opt = lopt; - spin_unlock_bh(&queue->syn_wait_lock); - - return 0; -} - -void __reqsk_queue_destroy(struct request_sock_queue *queue) -{ - /* 
This is an error recovery path only, no locking needed */ - kvfree(queue->listen_opt); -} - -static inline struct listen_sock *reqsk_queue_yank_listen_sk( - struct request_sock_queue *queue) -{ - struct listen_sock *lopt; - - spin_lock_bh(&queue->syn_wait_lock); - lopt = queue->listen_opt; - queue->listen_opt = NULL; - spin_unlock_bh(&queue->syn_wait_lock); - - return lopt; -} - -void reqsk_queue_destroy(struct request_sock_queue *queue) -{ - /* make all the listen_opt local to us */ - struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - - if (listen_sock_qlen(lopt) != 0) { - unsigned int i; - - for (i = 0; i < lopt->nr_table_entries; i++) { - struct request_sock *req; - - spin_lock_bh(&queue->syn_wait_lock); - while ((req = lopt->syn_table[i]) != NULL) { - lopt->syn_table[i] = req->dl_next; - /* Because of following del_timer_sync(), - * we must release the spinlock here - * or risk a dead lock. - */ - spin_unlock_bh(&queue->syn_wait_lock); - atomic_inc(&lopt->qlen_dec); - if (del_timer_sync(&req->rsk_timer)) - reqsk_put(req); - reqsk_put(req); - spin_lock_bh(&queue->syn_wait_lock); - } - spin_unlock_bh(&queue->syn_wait_lock); - } - } - - if (WARN_ON(listen_sock_qlen(lopt) != 0)) - pr_err("qlen %u\n", listen_sock_qlen(lopt)); - kvfree(lopt); } /* diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 5b7818c63cec..8910c9567719 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -444,36 +444,6 @@ put_and_exit: } EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); -static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct sock *nsk; - /* Find possible connection requests. */ - struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport, - iph->saddr, iph->daddr); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, - iph->saddr, dh->dccph_sport, - iph->daddr, dh->dccph_dport, - inet_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -705,18 +675,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v4_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dh, skb->len)) goto reset; @@ -724,7 +682,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) reset: dccp_v4_ctl_send_reset(sk, skb); -discard: kfree_skb(skb); return 0; } @@ -868,6 +825,27 @@ static int dccp_v4_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (sk->sk_state == DCCP_LISTEN) + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v4_ctl_send_reset(sk, skb); + goto discard_it; + } else { + return 0; + } + } /* * RFC 4340, sec. 
9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e8753aa3b7a4..1361a3f45df7 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -290,37 +290,6 @@ static struct request_sock_ops dccp6_request_sock_ops = { .syn_ack_timeout = dccp_syn_ack_timeout, }; -static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct ipv6hdr *iph = ipv6_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr, - &iph->daddr, inet6_iif(skb)); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, - &iph->saddr, dh->dccph_sport, - &iph->daddr, ntohs(dh->dccph_dport), - inet6_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct request_sock *req; @@ -398,7 +367,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_v6_send_response(sk, req)) goto drop_and_free; - inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); return 0; drop_and_free: @@ -641,24 +610,6 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v6_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - if (opt_skb != NULL) - __kfree_skb(opt_skb); - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) goto reset; @@ -732,6 +683,27 @@ static int dccp_v6_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (sk->sk_state == DCCP_LISTEN) + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v6_ctl_send_reset(sk, skb); + goto discard_it; + } else { + return 0; + } + } /* * RFC 4340, sec. 
9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e1527882a578..89eedfbd4ad5 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -330,10 +330,9 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) if (error) goto out_err; } - req = reqsk_queue_remove(queue); + req = reqsk_queue_remove(queue, sk); newsk = req->sk; - sk_acceptq_removed(sk); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { spin_lock_bh(&queue->fastopenq.lock); @@ -477,65 +476,12 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); -} - #if IS_ENABLED(CONFIG_IPV6) #define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else #define AF_INET_FAMILY(fam) true #endif -/* Note: this is temporary : - * req sock will no longer be in listener hash table -*/ -struct request_sock *inet_csk_search_req(struct sock *sk, - const __be16 rport, - const __be32 raddr, - const __be32 laddr) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - ireq->ir_rmt_addr == raddr && - ireq->ir_loc_addr == laddr && - AF_INET_FAMILY(req->rsk_ops->family)) { - atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet_csk_search_req); - -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk, timeout); -} -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); - /* Only thing we need from tcp.h */ extern int sysctl_tcp_synack_retries; @@ -572,26 +518,20 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); -/* return true if req was found in the syn_table[] */ +/* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock_queue *queue, struct request_sock *req) { - struct listen_sock *lopt = queue->listen_opt; - struct request_sock **prev; - bool found = false; + struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; + spinlock_t *lock; + bool found; - spin_lock(&queue->syn_wait_lock); + lock = inet_ehash_lockp(hashinfo, req->rsk_hash); - for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; - prev = &(*prev)->dl_next) { - if (*prev == req) { - *prev = req->dl_next; - found = true; - break; - } - } + spin_lock(lock); + found = __sk_nulls_del_node_init_rcu(req_to_sk(req)); + spin_unlock(lock); - spin_unlock(&queue->syn_wait_lock); if 
(timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; @@ -612,15 +552,12 @@ static void reqsk_timer_handler(unsigned long data) struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct listen_sock *lopt = queue->listen_opt; int qlen, expire = 0, resend = 0; int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN || !lopt) { - reqsk_put(req); - return; - } + if (sk_listener->sk_state != TCP_LISTEN) + goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; thresh = max_retries; @@ -641,9 +578,9 @@ static void reqsk_timer_handler(unsigned long data) * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ - qlen = listen_sock_qlen(lopt); - if (qlen >> (lopt->max_qlen_log - 1)) { - int young = listen_sock_young(lopt) << 1; + qlen = reqsk_queue_len(queue); + if ((qlen << 1) > sk_listener->sk_max_ack_backlog) { + int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { if (qlen < young) @@ -665,41 +602,41 @@ static void reqsk_timer_handler(unsigned long data) unsigned long timeo; if (req->num_timeout++ == 0) - atomic_inc(&lopt->young_dec); + atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer_pinned(&req->rsk_timer, jiffies + timeo); return; } +drop: inet_csk_reqsk_queue_drop(sk_listener, req); reqsk_put(req); } -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout) +static void reqsk_queue_hash_req(struct request_sock *req, + unsigned long timeout) { - struct listen_sock *lopt = queue->listen_opt; - req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); mod_timer_pinned(&req->rsk_timer, jiffies + timeout); - req->rsk_hash = hash; + inet_ehash_insert(req_to_sk(req), NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); - atomic_set(&req->rsk_refcnt, 2); + atomic_set(&req->rsk_refcnt, 2 + 1); +} - spin_lock(&queue->syn_wait_lock); - req->dl_next = lopt->syn_table[hash]; - lopt->syn_table[hash] = req; - spin_unlock(&queue->syn_wait_lock); +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout) +{ + reqsk_queue_hash_req(req, timeout); + inet_csk_reqsk_queue_added(sk); } -EXPORT_SYMBOL(reqsk_queue_hash_req); +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); /** * inet_csk_clone_lock - clone an inet socket, and lock its clone @@ -792,12 +729,10 @@ EXPORT_SYMBOL(inet_csk_prepare_forced_close); int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { - struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + struct inet_sock *inet = inet_sk(sk); - if (rc != 0) - return rc; + reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_max_ack_backlog = 0; sk->sk_ack_backlog = 0; @@ -819,7 +754,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) } sk->sk_state = TCP_CLOSE; - __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); @@ -832,11 +766,7 @@ void inet_csk_listen_stop(struct sock *sk) |
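
With the private SYN table gone, the accept FIFO is the only queue the listener
still owns, and the series gives it a dedicated rskq_lock so that
reqsk_queue_add() (run when a handshake completes) and reqsk_queue_remove()
(run from accept()) synchronize on that small lock alone. The fragment below is
a hedged pthread-based userspace model of that pair; the field names follow
rskq_accept_head/tail, but the mutex stands in for the kernel spinlock and the
parent-socket accounting is omitted.

#include <pthread.h>
#include <stdio.h>

/* Userspace model of the accept FIFO protected by its own small lock,
 * mirroring reqsk_queue_add()/reqsk_queue_remove() after this series. */
struct request_sock {
	struct request_sock *dl_next;
	int id;
};

struct request_sock_queue {
	pthread_mutex_t rskq_lock;
	struct request_sock *rskq_accept_head;
	struct request_sock *rskq_accept_tail;
};

static void reqsk_queue_add(struct request_sock_queue *q,
			    struct request_sock *req)
{
	pthread_mutex_lock(&q->rskq_lock);
	req->dl_next = NULL;
	if (q->rskq_accept_tail)
		q->rskq_accept_tail->dl_next = req;
	else
		q->rskq_accept_head = req;
	q->rskq_accept_tail = req;
	pthread_mutex_unlock(&q->rskq_lock);
}

/* Like the new reqsk_queue_remove(): take the lock, pop the head, and return
 * NULL on an empty queue instead of assuming a non-empty head as the old
 * WARN_ON() path did. */
static struct request_sock *reqsk_queue_remove(struct request_sock_queue *q)
{
	struct request_sock *req;

	pthread_mutex_lock(&q->rskq_lock);
	req = q->rskq_accept_head;
	if (req) {
		q->rskq_accept_head = req->dl_next;
		if (!q->rskq_accept_head)
			q->rskq_accept_tail = NULL;
	}
	pthread_mutex_unlock(&q->rskq_lock);
	return req;
}

int main(void)
{
	struct request_sock_queue q = {
		.rskq_lock = PTHREAD_MUTEX_INITIALIZER,
	};
	struct request_sock a = { .id = 1 }, b = { .id = 2 };
	struct request_sock *req;

	reqsk_queue_add(&q, &a);
	reqsk_queue_add(&q, &b);
	while ((req = reqsk_queue_remove(&q)))
		printf("accepted request %d\n", req->id);
	return 0;
}

Keeping this producer/consumer pair on its own lock is what allows the
handshake-completion path to enqueue children without contending on the
listener socket lock held by accept().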