-rw-r--r--  include/net/tcp.h        |   4
-rw-r--r--  net/ipv4/tcp.c           | 128
-rw-r--r--  net/mptcp/options.c      |  30
-rw-r--r--  net/mptcp/pm.c           |   3
-rw-r--r--  net/mptcp/pm_netlink.c   |   6
-rw-r--r--  net/mptcp/protocol.c     | 998
-rw-r--r--  net/mptcp/protocol.h     |  72
-rw-r--r--  net/mptcp/subflow.c      |  33
8 files changed, 776 insertions(+), 498 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d643ee4e4249..d5d22c411918 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -322,6 +322,7 @@ void tcp_shutdown(struct sock *sk, int how);
int tcp_v4_early_demux(struct sk_buff *skb);
int tcp_v4_rcv(struct sk_buff *skb);
+void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb);
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
@@ -329,6 +330,8 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
size_t size, int flags);
+struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+ struct page *page, int offset, size_t *size);
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
size_t size, int flags);
int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
@@ -392,6 +395,7 @@ void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_metrics_init(void);
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
+void __tcp_close(struct sock *sk, long timeout);
void tcp_close(struct sock *sk, long timeout);
void tcp_init_sock(struct sock *sk);
void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb);
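
The exports above exist so that MPTCP can build skb frags directly on a subflow's write queue (see mptcp_sendmsg_frag() further down). A rough usage sketch, not part of the patch: example_push_page() and its parameters are invented for illustration, the caller is assumed to hold the socket lock, as do_tcp_sendpages() and the MPTCP code below do.

/* Illustration only: a hypothetical caller of the newly exported helpers. */
static int example_push_page(struct sock *ssk, struct page *page,
			     int offset, size_t len, int flags)
{
	int mss_now, size_goal;
	struct sk_buff *skb;
	size_t copy = len;

	mss_now = tcp_send_mss(ssk, &size_goal, flags);

	/* append up to 'copy' bytes of the page to the tail skb, or a new one */
	skb = tcp_build_frag(ssk, size_goal, flags, page, offset, &copy);
	if (!skb) {
		/* drop a possibly empty tail skb left behind on failure */
		tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
		return -ENOMEM;
	}

	/* 'copy' now holds the number of bytes actually queued */
	tcp_push(ssk, flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
	return copy;
}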
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b2bc3d7fe9e8..b285b338a019 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -954,7 +954,7 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
* importantly be able to generate EPOLLOUT for Edge Trigger epoll()
* users.
*/
-static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
{
if (skb && !skb->len) {
tcp_unlink_write_queue(skb, sk);
@@ -964,6 +964,68 @@ static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
}
}
+struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+ struct page *page, int offset, size_t *size)
+{
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool can_coalesce;
+ int copy, i;
+
+ if (!skb || (copy = size_goal - skb->len) <= 0 ||
+ !tcp_skb_can_collapse_to(skb)) {
+new_segment:
+ if (!sk_stream_memory_free(sk))
+ return NULL;
+
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
+ tcp_rtx_and_write_queues_empty(sk));
+ if (!skb)
+ return NULL;
+
+#ifdef CONFIG_TLS_DEVICE
+ skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
+ skb_entail(sk, skb);
+ copy = size_goal;
+ }
+
+ if (copy > *size)
+ copy = *size;
+
+ i = skb_shinfo(skb)->nr_frags;
+ can_coalesce = skb_can_coalesce(skb, i, page, offset);
+ if (!can_coalesce && i >= sysctl_max_skb_frags) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ if (!sk_wmem_schedule(sk, copy))
+ return NULL;
+
+ if (can_coalesce) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, copy);
+ }
+
+ if (!(flags & MSG_NO_SHARED_FRAGS))
+ skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+ sk_wmem_queued_add(sk, copy);
+ sk_mem_charge(sk, copy);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
+ TCP_SKB_CB(skb)->end_seq += copy;
+ tcp_skb_pcount_set(skb, 0);
+
+ *size = copy;
+ return skb;
+}
+
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
@@ -999,60 +1061,13 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
goto out_err;
while (size > 0) {
- struct sk_buff *skb = tcp_write_queue_tail(sk);
- int copy, i;
- bool can_coalesce;
+ struct sk_buff *skb;
+ size_t copy = size;
- if (!skb || (copy = size_goal - skb->len) <= 0 ||
- !tcp_skb_can_collapse_to(skb)) {
-new_segment:
- if (!sk_stream_memory_free(sk))
- goto wait_for_space;
-
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
- tcp_rtx_and_write_queues_empty(sk));
- if (!skb)
- goto wait_for_space;
-
-#ifdef CONFIG_TLS_DEVICE
- skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
-#endif
- skb_entail(sk, skb);
- copy = size_goal;
- }
-
- if (copy > size)
- copy = size;
-
- i = skb_shinfo(skb)->nr_frags;
- can_coalesce = skb_can_coalesce(skb, i, page, offset);
- if (!can_coalesce && i >= sysctl_max_skb_frags) {
- tcp_mark_push(tp, skb);
- goto new_segment;
- }
- if (!sk_wmem_schedule(sk, copy))
+ skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
+ if (!skb)
goto wait_for_space;
- if (can_coalesce) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
- } else {
- get_page(page);
- skb_fill_page_desc(skb, i, page, offset, copy);
- }
-
- if (!(flags & MSG_NO_SHARED_FRAGS))
- skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-
- skb->len += copy;
- skb->data_len += copy;
- skb->truesize += copy;
- sk_wmem_queued_add(sk, copy);
- sk_mem_charge(sk, copy);
- skb->ip_summed = CHECKSUM_PARTIAL;
- WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
- TCP_SKB_CB(skb)->end_seq += copy;
- tcp_skb_pcount_set(skb, 0);
-
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
@@ -2405,13 +2420,12 @@ bool tcp_check_oom(struct sock *sk, int shift)
return too_many_orphans || out_of_socket_memory;
}
-void tcp_close(struct sock *sk, long timeout)
+void __tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
int data_was_unread = 0;
int state;
- lock_sock(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
if (sk->sk_state == TCP_LISTEN) {
@@ -2575,6 +2589,12 @@ adjudge_to_death:
out:
bh_unlock_sock(sk);
local_bh_enable();
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+ lock_sock(sk);
+ __tcp_close(sk, timeout);
release_sock(sk);
sock_put(sk);
}
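
With the split above, tcp_close() is just lock_sock() + __tcp_close() + release_sock() + sock_put(); an upper layer that already owns the socket lock, as MPTCP does when tearing down a subflow, can call __tcp_close() directly. A sketch of that calling convention follows; example_close_locked() is invented for illustration and is not part of the patch.

/* Illustration only: a caller that needs its own teardown under the same lock. */
static void example_close_locked(struct sock *ssk, long timeout)
{
	lock_sock(ssk);
	/* ... tear down upper-layer state attached to ssk ... */
	__tcp_close(ssk, timeout);
	release_sock(ssk);
	sock_put(ssk);	/* __tcp_close() does not drop the caller's reference */
}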
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index a044dd43411d..f2d1e27a2bc1 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -492,7 +492,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
bool ret = false;
mpext = skb ? mptcp_get_ext(skb) : NULL;
- snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable);
+ snd_data_fin_enable = mptcp_data_fin_enabled(msk);
if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
unsigned int map_size;
@@ -809,11 +809,14 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
return cur_ack;
}
-static void update_una(struct mptcp_sock *msk,
- struct mptcp_options_received *mp_opt)
+static void ack_update_msk(struct mptcp_sock *msk,
+ const struct sock *ssk,
+ struct mptcp_options_received *mp_opt)
{
u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una);
- u64 write_seq = READ_ONCE(msk->write_seq);
+ u64 new_wnd_end, wnd_end, old_wnd_end = atomic64_read(&msk->wnd_end);
+ u64 snd_nxt = READ_ONCE(msk->snd_nxt);
+ struct sock *sk = (struct sock *)msk;
/* avoid ack expansion on update conflict, to reduce the risk of
* wrongly expanding to a future ack sequence number, which is way
@@ -822,15 +825,28 @@ static void update_una(struct mptcp_sock *msk,
new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
/* ACK for data not even sent yet? Ignore. */
- if (after64(new_snd_una, write_seq))
+ if (after64(new_snd_una, snd_nxt))
new_snd_una = old_snd_una;
+ new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
+
+ while (after64(new_wnd_end, old_wnd_end)) {
+ wnd_end = old_wnd_end;
+ old_wnd_end = atomic64_cmpxchg(&msk->wnd_end, wnd_end,
+ new_wnd_end);
+ if (old_wnd_end == wnd_end) {
+ if (mptcp_send_head(sk))
+ mptcp_schedule_work(sk);
+ break;
+ }
+ }
+
while (after64(new_snd_una, old_snd_una)) {
snd_una = old_snd_una;
old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una,
new_snd_una);
if (old_snd_una == snd_una) {
- mptcp_data_acked((struct sock *)msk);
+ mptcp_data_acked(sk);
break;
}
}
@@ -930,7 +946,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
* monodirectional flows will stuck
*/
if (mp_opt.use_ack)
- update_una(msk, &mp_opt);
+ ack_update_msk(msk, sk, &mp_opt);
/* Zero-data-length packets are dropped by the caller and not
* propagated to the MPTCP layer, so the skb extension does not
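
Both msk->snd_una and the new msk->wnd_end are advanced with the same lock-free pattern: loop on atomic64_cmpxchg() and only ever move the value forward, so concurrent updates from different subflows cannot push it backwards. Below is a minimal userspace illustration of that pattern using C11 atomics; advance_seq() and seq_after64() are invented names, not kernel code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool seq_after64(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;	/* wrap-safe "a is after b" */
}

/* advance *cur toward new_val, never backwards, despite concurrent writers */
static void advance_seq(_Atomic uint64_t *cur, uint64_t new_val)
{
	uint64_t old = atomic_load(cur);

	while (seq_after64(new_val, old)) {
		/* on failure 'old' is reloaded and the condition is re-checked */
		if (atomic_compare_exchange_weak(cur, &old, new_val))
			break;
	}
}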
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index e19e1525ecbb..f9c88e2abb8e 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -89,8 +89,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
return false;
msk->pm.status |= BIT(new_status);
- if (schedule_work(&msk->work))
- sock_hold((struct sock *)msk);
+ mptcp_schedule_work((struct sock *)msk);
return true;
}
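
mptcp_schedule_work(), added in the net/mptcp/protocol.c hunks below, centralizes the open-coded "schedule_work() then sock_hold()" pairs: it refuses to queue work once the msk is TCP_CLOSE and takes a socket reference only when the work item was actually queued, so the handler drops exactly one reference when it runs. A sketch of a handler that follows that contract; example_worker() is illustrative only, the real handler (mptcp_worker()) is not shown in this diff.

/* Illustration of the reference contract; not the actual MPTCP worker. */
static void example_worker(struct work_struct *work)
{
	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
	struct sock *sk = (struct sock *)msk;

	lock_sock(sk);
	/* ... handle deferred events: data fin, eof, retransmissions ... */
	release_sock(sk);
	sock_put(sk);	/* pairs with sock_hold() in mptcp_schedule_work() */
}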
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 446ef8f07734..f8a9d82a0ea8 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -416,14 +416,13 @@ void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
- long timeout = 0;
if (msk->pm.rm_id != subflow->remote_id)
continue;
spin_unlock_bh(&msk->pm.lock);
mptcp_subflow_shutdown(sk, ssk, how);
- __mptcp_close_ssk(sk, ssk, subflow, timeout);
+ __mptcp_close_ssk(sk, ssk, subflow);
spin_lock_bh(&msk->pm.lock);
msk->pm.add_addr_accepted--;
@@ -452,14 +451,13 @@ void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id)
list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
- long timeout = 0;
if (rm_id != subflow->local_id)
continue;
spin_unlock_bh(&msk->pm.lock);
mptcp_subflow_shutdown(sk, ssk, how);
- __mptcp_close_ssk(sk, ssk, subflow, timeout);
+ __mptcp_close_ssk(sk, ssk, subflow);
spin_lock_bh(&msk->pm.lock);
msk->pm.local_addr_used--;
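
In both removal loops above, msk->pm.lock (a spinlock) is released before shutting down and closing the subflow: mptcp_subflow_shutdown() and __mptcp_close_ssk() take the subflow's socket lock and may sleep, which is not allowed under a spinlock. The pattern, pulled out as a sketch; example_remove_subflow() is an invented helper, not part of the patch.

/* Illustration of the lock ordering used by the two loops above. */
static void example_remove_subflow(struct sock *sk, struct mptcp_sock *msk,
				   struct mptcp_subflow_context *subflow)
{
	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

	spin_unlock_bh(&msk->pm.lock);		/* cannot sleep under pm.lock */
	mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN);
	__mptcp_close_ssk(sk, ssk, subflow);	/* new three-argument form */
	spin_lock_bh(&msk->pm.lock);
}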
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index eaa61e227492..8df013daea88 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,6 +21,7 @@
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
+#include <net/xfrm.h>
#include "protocol.h"
#include "mib.h"
@@ -41,6 +42,9 @@ struct mptcp_skb_cb {
static struct percpu_counter mptcp_sockets_allocated;
+static void __mptcp_destroy_sock(struct sock *sk);
+static void __mptcp_check_send_data_fin(struct sock *sk);
+
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
* completed yet or has failed, return the subflow socket.
* Otherwise return NULL.
@@ -53,6 +57,12 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
return msk->subflow;
}
+/* Returns end sequence number of the receiver's advertised window */
+static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
+{
+ return atomic64_read(&msk->wnd_end);
+}
+
static bool mptcp_is_tcpsk(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
@@ -102,6 +112,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
msk->subflow = ssock;
subflow = mptcp_subflow_ctx(ssock->sk);
list_add(&subflow->node, &msk->conn_list);
+ sock_hold(ssock->sk);
subflow->request_mptcp = 1;
/* accept() will wait on first subflow sk_wq, and we always wakes up
@@ -169,6 +180,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
if (after64(seq, max_seq)) {
/* out of window */
mptcp_drop(sk, skb);
+ pr_debug("oow by %ld", (unsigned long)seq - (unsigned long)max_seq);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
return;
}
@@ -323,6 +335,19 @@ static void mptcp_stop_timer(struct sock *sk)
mptcp_sk(sk)->timer_ival = 0;
}
+static void mptcp_close_wake_up(struct sock *sk)
+{
+ if (sock_flag(sk, SOCK_DEAD))
+ return;
+
+ sk->sk_state_change(sk);
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ sk->sk_state == TCP_CLOSE)
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ else
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+}
+
static void mptcp_check_data_fin_ack(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -341,20 +366,14 @@ static void mptcp_check_data_fin_ack(struct sock *sk)
switch (sk->sk_state) {
case TCP_FIN_WAIT1:
inet_sk_state_store(sk, TCP_FIN_WAIT2);
- sk->sk_state_change(sk);
break;
case TCP_CLOSING:
case TCP_LAST_ACK:
inet_sk_state_store(sk, TCP_CLOSE);
- sk->sk_state_change(sk);
break;
}
- if (sk->sk_shutdown == SHUTDOWN_MASK ||
- sk->sk_state == TCP_CLOSE)
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
- else
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ mptcp_close_wake_up(sk);
}
}
@@ -388,13 +407,27 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
-static void mptcp_check_data_fin(struct sock *sk)
+static void mptcp_send_ack(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+ tcp_send_ack(ssk);
+ release_sock(ssk);
+ }
+}
+
+static bool mptcp_check_data_fin(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
u64 rcv_data_fin_seq;
+ bool ret = false;
if (__mptcp_check_fallback(msk) || !msk->first)
- return;
+ return ret;
/* Need to ack a DATA_FIN received from a peer while this side
* of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
@@ -410,8 +443,6 @@ static void mptcp_check_data_fin(struct sock *sk)
*/
if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
- struct mptcp_subflow_context *subflow;
-
WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
WRITE_ONCE(msk->rcv_data_fin, 0);
@@ -428,7 +459,6 @@ static void mptcp_check_data_fin(struct sock *sk)
break;
case TCP_FIN_WAIT2:
inet_sk_state_store(sk, TCP_CLOSE);
- // @@ Close subflows now?
break;
default:
/* Other states not expected */
@@ -436,23 +466,12 @@ static void mptcp_check_data_fin(struct sock *sk)
break;
}
+ ret = true;
mptcp_set_timeout(sk, NULL);
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- lock_sock(ssk);
- tcp_send_ack(ssk);
- release_sock(ssk);
- }
-
- sk->sk_state_change(sk);
-
- if (sk->sk_shutdown == SHUTDOWN_MASK ||
- sk->sk_state == TCP_CLOSE)
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
- else
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ mptcp_send_ack(msk);
+ mptcp_close_wake_up(sk);
}
+ return ret;
}
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
@@ -620,9 +639,8 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
* this is not a good place to change state. Let the workqueue
* do it.
*/
- if (mptcp_pending_data_fin(sk, NULL) &&
- schedule_work(&msk->work))
- sock_hold(sk);
+ if (mptcp_pending_data_fin(sk, NULL))
+ mptcp_schedule_work(sk);
}
spin_unlock_bh(&sk->sk_lock.slock);
@@ -692,6 +710,10 @@ static void mptcp_reset_timer(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned long tout;
+ /* prevent rescheduling on close */
+ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
+ return;
+
/* should never be called with mptcp level timer cleared */
tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
if (WARN_ON_ONCE(!tout))
@@ -699,23 +721,33 @@ static void mptcp_reset_timer(struct sock *sk)
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}
+bool mptcp_schedule_work(struct sock *sk)
+{
+ if (inet_sk_state_load(sk) != TCP_CLOSE &&
+ schedule_work(&mptcp_sk(sk)->work)) {
+ /* each subflow already holds a reference to the sk, and the
+ * workqueue is invoked by a subflow, so sk can't go away here.
+ */
+ sock_hold(sk);
+ return true;
+ }
+ return false;
+}
+
void mptcp_data_acked(struct sock *sk)
{
mptcp_reset_timer(sk);
- if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
- (inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
- schedule_work(&mptcp_sk(sk)->work))
- sock_hold(sk);
+ if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
+ mptcp_send_head(sk) ||
+ (inet_sk_state_load(sk) != TCP_ESTABLISHED)))
+ mptcp_schedule_work(sk);
}
void mptcp_subflow_eof(struct sock *sk)
{
- struct mptcp_sock *msk = mptcp_sk(sk);
-
- if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) &&
- schedule_work(&msk->work))
- sock_hold(sk);
+ if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags))
+ mptcp_schedule_work(sk);
}
static void mptcp_check_for_eof(struct mptcp_sock *msk)
@@ -726,8 +758,10 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk)
mptcp_for_each_subflow(msk, subflow)
receivers += !subflow->rx_eof;
+ if (receivers)
+ return;
- if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ if (!(sk->sk_shutdown & RCV_SHUTDOWN)) {
/* hopefully temporary hack: propagate shutdown status
* to msk, when all subflows agree on it
*/
@@ -737,6 +771,19 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk)
set_bit(MPTCP_DATA_READY, &msk->flags);
sk->sk_data_ready(sk);
}
+
+ switch (sk->sk_state) {
+ case TCP_ESTABLISHED:
+ inet_sk_state_store(sk, TCP_CLOSE_WAIT);
+ break;
+ case TCP_FIN_WAIT1:
+ /* fallback sockets skip TCP_CLOSING - TCP will take care */
+ inet_sk_state_store(sk, TCP_CLOSE);
+ break;
+ default:
+ return;
+ }
+ mptcp_close_wake_up(sk);
}
static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
@@ -783,6 +830,7 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
const struct mptcp_data_frag *df)
{
return df && pfrag->page == df->page &&
+ pfrag->size - pfrag->offset > 0 &&
df->data_seq + df->data_len == msk->write_seq;
}
@@ -801,20 +849,6 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
put_page(dfrag->page);
}
-static bool mptcp_is_writeable(struct mptcp_sock *msk)
-{
- struct mptcp_subflow_context *subflow;
-
- if (!sk_stream_is_writeable((struct sock *)msk))
- return false;
-
- mptcp_for_each_subflow(msk, subflow) {
- if (sk_stream_is_writeable(subflow->tcp_sock))
- return true;
- }
- return false;
-}
-
static void mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -826,13 +860,16 @@ static void mptcp_clean_una(struct sock *sk)
* plain TCP
*/
if (__mptcp_check_fallback(msk))
- atomic64_set(&msk->snd_una, msk->write_seq);
+ atomic64_set(&msk->snd_una, msk->snd_nxt);
+
snd_una = atomic64_read(&msk->snd_una);
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
break;
+ if (WARN_ON_ONCE(dfrag == msk->first_pending))
+ break;
dfrag_clear(sk, dfrag);
cleaned = true;
}
@@ -841,12 +878,13 @@ static void mptcp_clean_una(struct sock *sk)
if (dfrag && after64(snd_una, dfrag->data_seq)) {
u64 delta = snd_una - dfrag->data_seq;
- if (WARN_ON_ONCE(delta > dfrag->data_len))
+ if (WARN_ON_ONCE(delta > dfrag->already_sent))
goto out;
dfrag->data_seq += delta;
dfrag->offset += delta;
dfrag->data_len -= delta;
+ dfrag->already_sent -= delta;
dfrag_uncharge(sk, delta);
cleaned = true;
@@ -864,13 +902,8 @@ static void mptcp_clean_una_wakeup(struct sock *sk)
mptcp_clean_una(sk);
/* Only wake up writers if a subflow is ready */
- if (mptcp_is_writeable(msk)) {
- set_bit(MPTCP_SEND_SPACE, &msk->flags);
- smp_mb__after_atomic();
-
- /* set SEND_SPACE before sk_stream_write_space clears
- * NOSPACE
- */
+ if (sk_stream_is_writeable(sk)) {
+ clear_bit(MPTCP_NOSPACE, &msk->flags);
sk_stream_write_space(sk);
}
}
@@ -880,12 +913,23 @@ static void mptcp_clean_una_wakeup(struct sock *sk)
*/
static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ bool first = true;
+
if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
pfrag, sk->sk_allocation)))
return true;
- sk->sk_prot->enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (first)
+ tcp_enter_memory_pressure(ssk);
+ sk_stream_moderate_sndbuf(ssk);
+ first = false;
+ }
return false;
}
@@ -901,149 +945,109 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
dfrag->data_seq = msk->write_seq;
dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
dfrag->offset = offset + sizeof(struct mptcp_data_frag);
+ dfrag->already_sent = 0;
dfrag->page = pfrag->page;
return dfrag;
}
+struct mptcp_sendmsg_info {
+ int mss_now;
+ int size_goal;
+ u16 limit;
+ u16 sent;
+ unsigned int flags;
+};
+
+static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
+ int avail_size)
+{
+ u64 window_end = mptcp_wnd_end(msk);
+
+ if (__mptcp_check_fallback(msk))
+ return avail_size;
+
+ if (!before64(data_seq + avail_size, window_end)) {
+ u64 allowed_size = window_end - data_seq;
+
+ return min_t(unsigned int, allowed_size, avail_size);
+ }
+
+ return avail_size;
+}
+
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
- struct msghdr *msg, struct mptcp_data_frag *dfrag,
- long *timeo, int *pmss_now,
- int *ps_goal)
+ struct mptcp_data_frag *dfrag,
+ struct mptcp_sendmsg_info *info)
{
- int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
- bool dfrag_collapsed, can_collapse = false;
+ u64 data_seq = dfrag->data_seq + info->sent;
struct mptcp_sock *msk = mptcp_sk(sk);
+ bool zero_window_probe = false;
struct mptcp_ext *mpext = NULL;
- bool retransmission = !!dfrag;
struct sk_buff *skb, *tail;
- struct page_frag *pfrag;
- struct page *page;
- u64 *write_seq;
- size_t psize;
-
- /* use the mptcp page cache so that we can easily move the data
- * from one substream to another, but do per subflow memory accounting
- * Note: pfrag is used only !retransmission, but the compiler if
- * fooled into a warning if we don't init here
- */
- pfrag = sk_page_frag(sk);
- if (!retransmission) {
- write_seq = &msk->write_seq;
- page = pfrag->page;
- } else {
- write_seq = &dfrag->data_seq;
- page = dfrag->page;
- }
+ bool can_collapse = false;
+ int avail_size;
+ size_t ret;
+
+ pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d",
+ msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
- /* compute copy limit */
- mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
- *pmss_now = mss_now;
- *ps_goal = size_goal;
- avail_size = size_goal;
+ /* compute send limit */
+ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
+ avail_size = info->size_goal;
skb = tcp_write_queue_tail(ssk);
if (skb) {
- mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
-
/* Limit the write to the size available in the
* current skb, if any, so that we create at most a new skb.
* Explicitly tells TCP internals to avoid collapsing on later
* queue management operation, to avoid breaking the ext <->
* SSN association set here
*/
- can_collapse = (size_goal - skb->len > 0) &&
- mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
+ mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+ can_collapse = (info->size_goal - skb->len > 0) &&
+ mptcp_skb_can_collapse_to(data_seq, skb, mpext);
if (!can_collapse)
TCP_SKB_CB(skb)->eor = 1;
else
- avail_size = size_goal - skb->len;
+ avail_size = info->size_goal - skb->len;
}
- if (!retransmission) {
- /* reuse tail pfrag, if possible, or carve a new one from the
- * page allocator
- */
- dfrag = mptcp_rtx_tail(sk);
- offset = pfrag->offset;
- dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
- if (!dfrag_collapsed) {
- dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
- offset = dfrag->offset;
- frag_truesize = dfrag->overhead;
- }
- psize = min_t(size_t, pfrag->size - offset, avail_size);
-
- /* Copy to page */
- pr_debug("left=%zu", msg_data_left(msg));
- psize = copy_page_from_iter(pfrag->page, offset,
- min_t(size_t, msg_data_left(msg),
- psize),
- &msg->msg_iter);
- pr_debug("left=%zu", msg_data_left(msg));
- if (!psize)
- return -EINVAL;
-
- if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) {
- iov_iter_revert(&msg->msg_iter, psize);
- return -ENOMEM;
- }
- } else {
- offset = dfrag->offset;
- psize = min_t(size_t, dfrag->data_len, avail_size);
+ /* Zero window and all data acked? Probe. */
+ avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
+ if (avail_size == 0) {
+ if (skb || atomic64_read(&msk->snd_una) != msk->snd_nxt)
+ return 0;
+ zero_window_probe = true;
+ data_seq = atomic64_read(&msk->snd_una) - 1;
+ avail_size = 1;
}
- /* tell the TCP stack to delay the push so that we can safely
- * access the skb after the sendpages call
- */
- ret = do_tcp_sendpages(ssk, page, offset, psize,
- msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
- if (ret <= 0) {
- if (!retransmission)
- iov_iter_revert(&msg->msg_iter, psize);
- return ret;
- }
+ if (WARN_ON_ONCE(info->sent > info->limit ||
+ info->limit > dfrag->data_len))
+ return 0;
- frag_truesize += ret;
- if (!retransmission) {
- if (unlikely(ret < psize))
- iov_iter_revert(&msg->msg_iter, psize - ret);
-
- /* send successful, keep track of sent data for mptcp-level
- * retransmission
- */
- dfrag->data_len += ret;
- if (!dfrag_collapsed) {
- get_page(dfrag->page);
- list_add_tail(&dfrag->list, &msk->rtx_queue);
- sk_wmem_queued_add(sk, frag_truesize);
- } else {
- sk_wmem_queued_add(sk, ret);
- }
-
- /* charge data on mptcp rtx queue to the master socket
- * Note: we charge such data both to sk and ssk
- */
- sk->sk_forward_alloc -= frag_truesize;
+ ret = info->limit - info->sent;
+ tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page,
+ dfrag->offset + info->sent, &ret);
+ if (!tail) {
+ tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
+ return -ENOMEM;
}
- /* if the tail skb extension is still the cached one, collapsing
- * really happened. Note: we can't check for 'same skb' as the sk_buff
- * hdr on tail can be transmitted, freed and re-allocated by the
- * do_tcp_sendpages() call
+ /* if the tail skb is still the cached one, collapsing really happened.
*/
- tail = tcp_write_queue_tail(ssk);
- if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
+ if (skb == tail) {
WARN_ON_ONCE(!can_collapse);
mpext->data_len += ret;
+ WARN_ON_ONCE(zero_window_probe);
goto out;
}
- skb = tcp_write_queue_tail(ssk);
- mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
+ mpext = __skb_ext_set(tail, SKB_EXT_MPTCP, msk->cached_ext);
msk->cached_ext = NULL;
memset(mpext, 0, sizeof(*mpext));
- mpext->data_seq = *write_seq;
+ mpext->data_seq = data_seq;
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
mpext->data_len = ret;
mpext->use_map = 1;
@@ -1053,12 +1057,14 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
mpext->data_seq, mpext->subflow_seq, mpext->data_len,
mpext->dsn64);
+ if (zero_window_probe) {
+ mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
+ mpext->frozen = 1;
+ ret = 0;
+ tcp_push_pending_frames(ssk);
+ }
out:
- if (!retransmission)
- pfrag->offset += frag_truesize;
- WRITE_ONCE(*write_seq, *write_seq + ret);
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
-
return ret;
}
@@ -1066,17 +1072,25 @@ static void mptcp_nospace(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
- clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+ set_bit(MPTCP_NOSPACE, &msk->flags);
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool ssk_writeable = sk_stream_is_writeable(ssk);
struct socket *sock = READ_ONCE(ssk->sk_socket);
+ if (ssk_writeable || !sock)
+ continue;
+
/* enables ssk->write_space() callbacks */
- if (sock)
- set_bit(SOCK_NOSPACE, &sock->flags);
+ set_bit(SOCK_NOSPACE, &sock->flags);
}
+
+ /* mptcp_data_acked() could run just before we set the NOSPACE bit,
+ * so explicitly check for snd_una value
+ */
+ mptcp_clean_una((struct sock *)msk);
}
static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
@@ -1180,21 +1194,86 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
return NULL;
}
-static void ssk_check_wmem(struct mptcp_sock *msk)
+static void mptcp_push_release(struct sock *sk, struct sock *ssk,
+ struct mptcp_sendmsg_info *info)
{
- if (unlikely(!mptcp_is_writeable(msk)))
- mptcp_nospace(msk);
+ mptcp_set_timeout(sk, ssk);
+ tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
+ release_sock(ssk);
+}
+
+static void mptcp_push_pending(struct sock *sk, unsigned int flags)
+{
+ struct sock *prev_ssk = NULL, *ssk = NULL;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_sendmsg_info info = {
+ .flags = flags,
+ };
+ struct mptcp_data_frag *dfrag;
+ int len, copied = 0;
+ u32 sndbuf;
+
+ while ((dfrag = mptcp_send_head(sk))) {
+ info.sent = dfrag->already_sent;
+ info.limit = dfrag->data_len;
+ len = dfrag->data_len - dfrag->already_sent;
+ while (len > 0) {
+ int ret = 0;
+
+ prev_ssk = ssk;
+ __mptcp_flush_join_list(msk);
+ ssk = mptcp_subflow_get_send(msk, &sndbuf);
+
+ /* do auto tuning */
+ if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
+ sndbuf > READ_ONCE(sk->sk_sndbuf))
+ WRITE_ONCE(sk->sk_sndbuf, sndbuf);
+
+ /* try to keep the subflow socket lock across
+ * consecutive xmit on the same socket
+ */
+ if (ssk != prev_ssk && prev_ssk)
+ mptcp_push_release(sk, prev_ssk, &info);
+ if (!ssk)
+ goto out;
+
+ if (ssk != prev_ssk || !prev_ssk)
+ lock_sock(ssk);
+
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ if (ret <= 0) {
+ mptcp_push_release(sk, ssk, &info);
+ goto out;
+ }
+
+ info.sent += ret;
+ dfrag->already_sent += ret;
+ msk->snd_nxt += ret;
+ msk->snd_burst -= ret;
+ copied += ret;
+ len -= ret;
+ }
+ WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ }
+
+ /* at this point we held the socket lock for the last subflow we used */
+ if (ssk)
+ mptcp_push_release(sk, ssk, &info);
+
+out:
+ /* start the timer, if it's not pending */
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
+ if (copied)
+ __mptcp_check_send_data_fin(sk);
}
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
- int mss_now = 0, size_goal = 0, ret = 0;
struct mptcp_sock *msk = mptcp_sk(sk);
struct page_frag *pfrag;
size_t copied = 0;
- struct sock *ssk;
- u32 sndbuf;
- bool tx_ok;
+ int ret = 0;
long timeo;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
@@ -1211,130 +1290,92 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
pfrag = sk_page_frag(sk);
-restart:
mp