diff options
| -rw-r--r-- | include/net/tcp.h | 4 | ||||
| -rw-r--r-- | net/ipv4/tcp.c | 128 | ||||
| -rw-r--r-- | net/mptcp/options.c | 30 | ||||
| -rw-r--r-- | net/mptcp/pm.c | 3 | ||||
| -rw-r--r-- | net/mptcp/pm_netlink.c | 6 | ||||
| -rw-r--r-- | net/mptcp/protocol.c | 998 | ||||
| -rw-r--r-- | net/mptcp/protocol.h | 72 | ||||
| -rw-r--r-- | net/mptcp/subflow.c | 33 |
8 files changed, 776 insertions, 498 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h index d643ee4e4249..d5d22c411918 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -322,6 +322,7 @@ void tcp_shutdown(struct sock *sk, int how); int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); +void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); @@ -329,6 +330,8 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags); +struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, + struct page *page, int offset, size_t *size); ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tcp_send_mss(struct sock *sk, int *size_goal, int flags); @@ -392,6 +395,7 @@ void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); +void __tcp_close(struct sock *sk, long timeout); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b2bc3d7fe9e8..b285b338a019 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -954,7 +954,7 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags) * importantly be able to generate EPOLLOUT for Edge Trigger epoll() * users. */ -static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) +void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) { if (skb && !skb->len) { tcp_unlink_write_queue(skb, sk); @@ -964,6 +964,68 @@ static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) } } +struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, + struct page *page, int offset, size_t *size) +{ + struct sk_buff *skb = tcp_write_queue_tail(sk); + struct tcp_sock *tp = tcp_sk(sk); + bool can_coalesce; + int copy, i; + + if (!skb || (copy = size_goal - skb->len) <= 0 || + !tcp_skb_can_collapse_to(skb)) { +new_segment: + if (!sk_stream_memory_free(sk)) + return NULL; + + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, + tcp_rtx_and_write_queues_empty(sk)); + if (!skb) + return NULL; + +#ifdef CONFIG_TLS_DEVICE + skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); +#endif + skb_entail(sk, skb); + copy = size_goal; + } + + if (copy > *size) + copy = *size; + + i = skb_shinfo(skb)->nr_frags; + can_coalesce = skb_can_coalesce(skb, i, page, offset); + if (!can_coalesce && i >= sysctl_max_skb_frags) { + tcp_mark_push(tp, skb); + goto new_segment; + } + if (!sk_wmem_schedule(sk, copy)) + return NULL; + + if (can_coalesce) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, copy); + } + + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk_wmem_queued_add(sk, copy); + sk_mem_charge(sk, copy); + skb->ip_summed = CHECKSUM_PARTIAL; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); + TCP_SKB_CB(skb)->end_seq += copy; + tcp_skb_pcount_set(skb, 0); + + *size = copy; + return skb; +} + ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -999,60 +1061,13 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, goto out_err; while (size > 0) { - struct sk_buff *skb = tcp_write_queue_tail(sk); - int copy, i; - bool can_coalesce; + struct sk_buff *skb; + size_t copy = size; - if (!skb || (copy = size_goal - skb->len) <= 0 || - !tcp_skb_can_collapse_to(skb)) { -new_segment: - if (!sk_stream_memory_free(sk)) - goto wait_for_space; - - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - tcp_rtx_and_write_queues_empty(sk)); - if (!skb) - goto wait_for_space; - -#ifdef CONFIG_TLS_DEVICE - skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); -#endif - skb_entail(sk, skb); - copy = size_goal; - } - - if (copy > size) - copy = size; - - i = skb_shinfo(skb)->nr_frags; - can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= sysctl_max_skb_frags) { - tcp_mark_push(tp, skb); - goto new_segment; - } - if (!sk_wmem_schedule(sk, copy)) + skb = tcp_build_frag(sk, size_goal, flags, page, offset, ©); + if (!skb) goto wait_for_space; - if (can_coalesce) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); - } else { - get_page(page); - skb_fill_page_desc(skb, i, page, offset, copy); - } - - if (!(flags & MSG_NO_SHARED_FRAGS)) - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; - - skb->len += copy; - skb->data_len += copy; - skb->truesize += copy; - sk_wmem_queued_add(sk, copy); - sk_mem_charge(sk, copy); - skb->ip_summed = CHECKSUM_PARTIAL; - WRITE_ONCE(tp->write_seq, tp->write_seq + copy); - TCP_SKB_CB(skb)->end_seq += copy; - tcp_skb_pcount_set(skb, 0); - if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; @@ -2405,13 +2420,12 @@ bool tcp_check_oom(struct sock *sk, int shift) return too_many_orphans || out_of_socket_memory; } -void tcp_close(struct sock *sk, long timeout) +void __tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; int data_was_unread = 0; int state; - lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == TCP_LISTEN) { @@ -2575,6 +2589,12 @@ adjudge_to_death: out: bh_unlock_sock(sk); local_bh_enable(); +} + +void tcp_close(struct sock *sk, long timeout) +{ + lock_sock(sk); + __tcp_close(sk, timeout); release_sock(sk); sock_put(sk); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index a044dd43411d..f2d1e27a2bc1 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -492,7 +492,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool ret = false; mpext = skb ? mptcp_get_ext(skb) : NULL; - snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable); + snd_data_fin_enable = mptcp_data_fin_enabled(msk); if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size; @@ -809,11 +809,14 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) return cur_ack; } -static void update_una(struct mptcp_sock *msk, - struct mptcp_options_received *mp_opt) +static void ack_update_msk(struct mptcp_sock *msk, + const struct sock *ssk, + struct mptcp_options_received *mp_opt) { u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); - u64 write_seq = READ_ONCE(msk->write_seq); + u64 new_wnd_end, wnd_end, old_wnd_end = atomic64_read(&msk->wnd_end); + u64 snd_nxt = READ_ONCE(msk->snd_nxt); + struct sock *sk = (struct sock *)msk; /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way @@ -822,15 +825,28 @@ static void update_una(struct mptcp_sock *msk, new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore. */ - if (after64(new_snd_una, write_seq)) + if (after64(new_snd_una, snd_nxt)) new_snd_una = old_snd_una; + new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; + + while (after64(new_wnd_end, old_wnd_end)) { + wnd_end = old_wnd_end; + old_wnd_end = atomic64_cmpxchg(&msk->wnd_end, wnd_end, + new_wnd_end); + if (old_wnd_end == wnd_end) { + if (mptcp_send_head(sk)) + mptcp_schedule_work(sk); + break; + } + } + while (after64(new_snd_una, old_snd_una)) { snd_una = old_snd_una; old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, new_snd_una); if (old_snd_una == snd_una) { - mptcp_data_acked((struct sock *)msk); + mptcp_data_acked(sk); break; } } @@ -930,7 +946,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) * monodirectional flows will stuck */ if (mp_opt.use_ack) - update_una(msk, &mp_opt); + ack_update_msk(msk, sk, &mp_opt); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index e19e1525ecbb..f9c88e2abb8e 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -89,8 +89,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, return false; msk->pm.status |= BIT(new_status); - if (schedule_work(&msk->work)) - sock_hold((struct sock *)msk); + mptcp_schedule_work((struct sock *)msk); return true; } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 446ef8f07734..f8a9d82a0ea8 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -416,14 +416,13 @@ void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - long timeout = 0; if (msk->pm.rm_id != subflow->remote_id) continue; spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); msk->pm.add_addr_accepted--; @@ -452,14 +451,13 @@ void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - long timeout = 0; if (rm_id != subflow->local_id) continue; spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); msk->pm.local_addr_used--; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index eaa61e227492..8df013daea88 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -21,6 +21,7 @@ #include <net/transp_v6.h> #endif #include <net/mptcp.h> +#include <net/xfrm.h> #include "protocol.h" #include "mib.h" @@ -41,6 +42,9 @@ struct mptcp_skb_cb { static struct percpu_counter mptcp_sockets_allocated; +static void __mptcp_destroy_sock(struct sock *sk); +static void __mptcp_check_send_data_fin(struct sock *sk); + /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. @@ -53,6 +57,12 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) return msk->subflow; } +/* Returns end sequence number of the receiver's advertised window */ +static u64 mptcp_wnd_end(const struct mptcp_sock *msk) +{ + return atomic64_read(&msk->wnd_end); +} + static bool mptcp_is_tcpsk(struct sock *sk) { struct socket *sock = sk->sk_socket; @@ -102,6 +112,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) msk->subflow = ssock; subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); + sock_hold(ssock->sk); subflow->request_mptcp = 1; /* accept() will wait on first subflow sk_wq, and we always wakes up @@ -169,6 +180,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) if (after64(seq, max_seq)) { /* out of window */ mptcp_drop(sk, skb); + pr_debug("oow by %ld", (unsigned long)seq - (unsigned long)max_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } @@ -323,6 +335,19 @@ static void mptcp_stop_timer(struct sock *sk) mptcp_sk(sk)->timer_ival = 0; } +static void mptcp_close_wake_up(struct sock *sk) +{ + if (sock_flag(sk, SOCK_DEAD)) + return; + + sk->sk_state_change(sk); + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); +} + static void mptcp_check_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -341,20 +366,14 @@ static void mptcp_check_data_fin_ack(struct sock *sk) switch (sk->sk_state) { case TCP_FIN_WAIT1: inet_sk_state_store(sk, TCP_FIN_WAIT2); - sk->sk_state_change(sk); break; case TCP_CLOSING: case TCP_LAST_ACK: inet_sk_state_store(sk, TCP_CLOSE); - sk->sk_state_change(sk); break; } - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + mptcp_close_wake_up(sk); } } @@ -388,13 +407,27 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } -static void mptcp_check_data_fin(struct sock *sk) +static void mptcp_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + tcp_send_ack(ssk); + release_sock(ssk); + } +} + +static bool mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; + bool ret = false; if (__mptcp_check_fallback(msk) || !msk->first) - return; + return ret; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. @@ -410,8 +443,6 @@ static void mptcp_check_data_fin(struct sock *sk) */ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { - struct mptcp_subflow_context *subflow; - WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); @@ -428,7 +459,6 @@ static void mptcp_check_data_fin(struct sock *sk) break; case TCP_FIN_WAIT2: inet_sk_state_store(sk, TCP_CLOSE); - // @@ Close subflows now? break; default: /* Other states not expected */ @@ -436,23 +466,12 @@ static void mptcp_check_data_fin(struct sock *sk) break; } + ret = true; mptcp_set_timeout(sk, NULL); - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - lock_sock(ssk); - tcp_send_ack(ssk); - release_sock(ssk); - } - - sk->sk_state_change(sk); - - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + mptcp_send_ack(msk); + mptcp_close_wake_up(sk); } + return ret; } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, @@ -620,9 +639,8 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) * this is not a good place to change state. Let the workqueue * do it. */ - if (mptcp_pending_data_fin(sk, NULL) && - schedule_work(&msk->work)) - sock_hold(sk); + if (mptcp_pending_data_fin(sk, NULL)) + mptcp_schedule_work(sk); } spin_unlock_bh(&sk->sk_lock.slock); @@ -692,6 +710,10 @@ static void mptcp_reset_timer(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; + /* prevent rescheduling on close */ + if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) + return; + /* should never be called with mptcp level timer cleared */ tout = READ_ONCE(mptcp_sk(sk)->timer_ival); if (WARN_ON_ONCE(!tout)) @@ -699,23 +721,33 @@ static void mptcp_reset_timer(struct sock *sk) sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); } +bool mptcp_schedule_work(struct sock *sk) +{ + if (inet_sk_state_load(sk) != TCP_CLOSE && + schedule_work(&mptcp_sk(sk)->work)) { + /* each subflow already holds a reference to the sk, and the + * workqueue is invoked by a subflow, so sk can't go away here. + */ + sock_hold(sk); + return true; + } + return false; +} + void mptcp_data_acked(struct sock *sk) { mptcp_reset_timer(sk); - if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || - (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && - schedule_work(&mptcp_sk(sk)->work)) - sock_hold(sk); + if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) || + mptcp_send_head(sk) || + (inet_sk_state_load(sk) != TCP_ESTABLISHED))) + mptcp_schedule_work(sk); } void mptcp_subflow_eof(struct sock *sk) { - struct mptcp_sock *msk = mptcp_sk(sk); - - if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) && - schedule_work(&msk->work)) - sock_hold(sk); + if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) + mptcp_schedule_work(sk); } static void mptcp_check_for_eof(struct mptcp_sock *msk) @@ -726,8 +758,10 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) mptcp_for_each_subflow(msk, subflow) receivers += !subflow->rx_eof; + if (receivers) + return; - if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { /* hopefully temporary hack: propagate shutdown status * to msk, when all subflows agree on it */ @@ -737,6 +771,19 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); } + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + inet_sk_state_store(sk, TCP_CLOSE_WAIT); + break; + case TCP_FIN_WAIT1: + /* fallback sockets skip TCP_CLOSING - TCP will take care */ + inet_sk_state_store(sk, TCP_CLOSE); + break; + default: + return; + } + mptcp_close_wake_up(sk); } static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) @@ -783,6 +830,7 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, const struct mptcp_data_frag *df) { return df && pfrag->page == df->page && + pfrag->size - pfrag->offset > 0 && df->data_seq + df->data_len == msk->write_seq; } @@ -801,20 +849,6 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) put_page(dfrag->page); } -static bool mptcp_is_writeable(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - if (!sk_stream_is_writeable((struct sock *)msk)) - return false; - - mptcp_for_each_subflow(msk, subflow) { - if (sk_stream_is_writeable(subflow->tcp_sock)) - return true; - } - return false; -} - static void mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -826,13 +860,16 @@ static void mptcp_clean_una(struct sock *sk) * plain TCP */ if (__mptcp_check_fallback(msk)) - atomic64_set(&msk->snd_una, msk->write_seq); + atomic64_set(&msk->snd_una, msk->snd_nxt); + snd_una = atomic64_read(&msk->snd_una); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; + if (WARN_ON_ONCE(dfrag == msk->first_pending)) + break; dfrag_clear(sk, dfrag); cleaned = true; } @@ -841,12 +878,13 @@ static void mptcp_clean_una(struct sock *sk) if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; - if (WARN_ON_ONCE(delta > dfrag->data_len)) + if (WARN_ON_ONCE(delta > dfrag->already_sent)) goto out; dfrag->data_seq += delta; dfrag->offset += delta; dfrag->data_len -= delta; + dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); cleaned = true; @@ -864,13 +902,8 @@ static void mptcp_clean_una_wakeup(struct sock *sk) mptcp_clean_una(sk); /* Only wake up writers if a subflow is ready */ - if (mptcp_is_writeable(msk)) { - set_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); - - /* set SEND_SPACE before sk_stream_write_space clears - * NOSPACE - */ + if (sk_stream_is_writeable(sk)) { + clear_bit(MPTCP_NOSPACE, &msk->flags); sk_stream_write_space(sk); } } @@ -880,12 +913,23 @@ static void mptcp_clean_una_wakeup(struct sock *sk) */ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + bool first = true; + if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), pfrag, sk->sk_allocation))) return true; - sk->sk_prot->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (first) + tcp_enter_memory_pressure(ssk); + sk_stream_moderate_sndbuf(ssk); + first = false; + } return false; } @@ -901,149 +945,109 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, dfrag->data_seq = msk->write_seq; dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); dfrag->offset = offset + sizeof(struct mptcp_data_frag); + dfrag->already_sent = 0; dfrag->page = pfrag->page; return dfrag; } +struct mptcp_sendmsg_info { + int mss_now; + int size_goal; + u16 limit; + u16 sent; + unsigned int flags; +}; + +static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, + int avail_size) +{ + u64 window_end = mptcp_wnd_end(msk); + + if (__mptcp_check_fallback(msk)) + return avail_size; + + if (!before64(data_seq + avail_size, window_end)) { + u64 allowed_size = window_end - data_seq; + + return min_t(unsigned int, allowed_size, avail_size); + } + + return avail_size; +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, - struct msghdr *msg, struct mptcp_data_frag *dfrag, - long *timeo, int *pmss_now, - int *ps_goal) + struct mptcp_data_frag *dfrag, + struct mptcp_sendmsg_info *info) { - int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0; - bool dfrag_collapsed, can_collapse = false; + u64 data_seq = dfrag->data_seq + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); + bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; - bool retransmission = !!dfrag; struct sk_buff *skb, *tail; - struct page_frag *pfrag; - struct page *page; - u64 *write_seq; - size_t psize; - - /* use the mptcp page cache so that we can easily move the data - * from one substream to another, but do per subflow memory accounting - * Note: pfrag is used only !retransmission, but the compiler if - * fooled into a warning if we don't init here - */ - pfrag = sk_page_frag(sk); - if (!retransmission) { - write_seq = &msk->write_seq; - page = pfrag->page; - } else { - write_seq = &dfrag->data_seq; - page = dfrag->page; - } + bool can_collapse = false; + int avail_size; + size_t ret; + + pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d", + msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); - /* compute copy limit */ - mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); - *pmss_now = mss_now; - *ps_goal = size_goal; - avail_size = size_goal; + /* compute send limit */ + info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); + avail_size = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb) { - mpext = skb_ext_find(skb, SKB_EXT_MPTCP); - /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later * queue management operation, to avoid breaking the ext <-> * SSN association set here */ - can_collapse = (size_goal - skb->len > 0) && - mptcp_skb_can_collapse_to(*write_seq, skb, mpext); + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + can_collapse = (info->size_goal - skb->len > 0) && + mptcp_skb_can_collapse_to(data_seq, skb, mpext); if (!can_collapse) TCP_SKB_CB(skb)->eor = 1; else - avail_size = size_goal - skb->len; + avail_size = info->size_goal - skb->len; } - if (!retransmission) { - /* reuse tail pfrag, if possible, or carve a new one from the - * page allocator - */ - dfrag = mptcp_rtx_tail(sk); - offset = pfrag->offset; - dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); - if (!dfrag_collapsed) { - dfrag = mptcp_carve_data_frag(msk, pfrag, offset); - offset = dfrag->offset; - frag_truesize = dfrag->overhead; - } - psize = min_t(size_t, pfrag->size - offset, avail_size); - - /* Copy to page */ - pr_debug("left=%zu", msg_data_left(msg)); - psize = copy_page_from_iter(pfrag->page, offset, - min_t(size_t, msg_data_left(msg), - psize), - &msg->msg_iter); - pr_debug("left=%zu", msg_data_left(msg)); - if (!psize) - return -EINVAL; - - if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) { - iov_iter_revert(&msg->msg_iter, psize); - return -ENOMEM; - } - } else { - offset = dfrag->offset; - psize = min_t(size_t, dfrag->data_len, avail_size); + /* Zero window and all data acked? Probe. */ + avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); + if (avail_size == 0) { + if (skb || atomic64_read(&msk->snd_una) != msk->snd_nxt) + return 0; + zero_window_probe = true; + data_seq = atomic64_read(&msk->snd_una) - 1; + avail_size = 1; } - /* tell the TCP stack to delay the push so that we can safely - * access the skb after the sendpages call - */ - ret = do_tcp_sendpages(ssk, page, offset, psize, - msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT); - if (ret <= 0) { - if (!retransmission) - iov_iter_revert(&msg->msg_iter, psize); - return ret; - } + if (WARN_ON_ONCE(info->sent > info->limit || + info->limit > dfrag->data_len)) + return 0; - frag_truesize += ret; - if (!retransmission) { - if (unlikely(ret < psize)) - iov_iter_revert(&msg->msg_iter, psize - ret); - - /* send successful, keep track of sent data for mptcp-level - * retransmission - */ - dfrag->data_len += ret; - if (!dfrag_collapsed) { - get_page(dfrag->page); - list_add_tail(&dfrag->list, &msk->rtx_queue); - sk_wmem_queued_add(sk, frag_truesize); - } else { - sk_wmem_queued_add(sk, ret); - } - - /* charge data on mptcp rtx queue to the master socket - * Note: we charge such data both to sk and ssk - */ - sk->sk_forward_alloc -= frag_truesize; + ret = info->limit - info->sent; + tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page, + dfrag->offset + info->sent, &ret); + if (!tail) { + tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); + return -ENOMEM; } - /* if the tail skb extension is still the cached one, collapsing - * really happened. Note: we can't check for 'same skb' as the sk_buff - * hdr on tail can be transmitted, freed and re-allocated by the - * do_tcp_sendpages() call + /* if the tail skb is still the cached one, collapsing really happened. */ - tail = tcp_write_queue_tail(ssk); - if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) { + if (skb == tail) { WARN_ON_ONCE(!can_collapse); mpext->data_len += ret; + WARN_ON_ONCE(zero_window_probe); goto out; } - skb = tcp_write_queue_tail(ssk); - mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext); + mpext = __skb_ext_set(tail, SKB_EXT_MPTCP, msk->cached_ext); msk->cached_ext = NULL; memset(mpext, 0, sizeof(*mpext)); - mpext->data_seq = *write_seq; + mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = ret; mpext->use_map = 1; @@ -1053,12 +1057,14 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); + if (zero_window_probe) { + mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + mpext->frozen = 1; + ret = 0; + tcp_push_pending_frames(ssk); + } out: - if (!retransmission) - pfrag->offset += frag_truesize; - WRITE_ONCE(*write_seq, *write_seq + ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; - return ret; } @@ -1066,17 +1072,25 @@ static void mptcp_nospace(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - clear_bit(MPTCP_SEND_SPACE, &msk->flags); + set_bit(MPTCP_NOSPACE, &msk->flags); smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool ssk_writeable = sk_stream_is_writeable(ssk); struct socket *sock = READ_ONCE(ssk->sk_socket); + if (ssk_writeable || !sock) + continue; + /* enables ssk->write_space() callbacks */ - if (sock) - set_bit(SOCK_NOSPACE, &sock->flags); + set_bit(SOCK_NOSPACE, &sock->flags); } + + /* mptcp_data_acked() could run just before we set the NOSPACE bit, + * so explicitly check for snd_una value + */ + mptcp_clean_una((struct sock *)msk); } static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) @@ -1180,21 +1194,86 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, return NULL; } -static void ssk_check_wmem(struct mptcp_sock *msk) +static void mptcp_push_release(struct sock *sk, struct sock *ssk, + struct mptcp_sendmsg_info *info) { - if (unlikely(!mptcp_is_writeable(msk))) - mptcp_nospace(msk); + mptcp_set_timeout(sk, ssk); + tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); + release_sock(ssk); +} + +static void mptcp_push_pending(struct sock *sk, unsigned int flags) +{ + struct sock *prev_ssk = NULL, *ssk = NULL; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info = { + .flags = flags, + }; + struct mptcp_data_frag *dfrag; + int len, copied = 0; + u32 sndbuf; + + while ((dfrag = mptcp_send_head(sk))) { + info.sent = dfrag->already_sent; + info.limit = dfrag->data_len; + len = dfrag->data_len - dfrag->already_sent; + while (len > 0) { + int ret = 0; + + prev_ssk = ssk; + __mptcp_flush_join_list(msk); + ssk = mptcp_subflow_get_send(msk, &sndbuf); + + /* do auto tuning */ + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && + sndbuf > READ_ONCE(sk->sk_sndbuf)) + WRITE_ONCE(sk->sk_sndbuf, sndbuf); + + /* try to keep the subflow socket lock across + * consecutive xmit on the same socket + */ + if (ssk != prev_ssk && prev_ssk) + mptcp_push_release(sk, prev_ssk, &info); + if (!ssk) + goto out; + + if (ssk != prev_ssk || !prev_ssk) + lock_sock(ssk); + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) { + mptcp_push_release(sk, ssk, &info); + goto out; + } + + info.sent += ret; + dfrag->already_sent += ret; + msk->snd_nxt += ret; + msk->snd_burst -= ret; + copied += ret; + len -= ret; + } + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + } + + /* at this point we held the socket lock for the last subflow we used */ + if (ssk) + mptcp_push_release(sk, ssk, &info); + +out: + /* start the timer, if it's not pending */ + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + if (copied) + __mptcp_check_send_data_fin(sk); } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { - int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; size_t copied = 0; - struct sock *ssk; - u32 sndbuf; - bool tx_ok; + int ret = 0; long timeo; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) @@ -1211,130 +1290,92 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } pfrag = sk_page_frag(sk); -restart: mp |
