summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTrond Myklebust <trond.myklebust@hammerspace.com>2019-11-18 10:55:55 +0100
committerTrond Myklebust <trond.myklebust@hammerspace.com>2019-11-18 10:55:55 +0100
commit4e121fcae809a94aa952407bd74b0757b858ce19 (patch)
tree6dfb8b2850d47a93f6b8310673e2a7aa8f4c992a
parentf751c5452594f6ef77b39c78f9888275e60d0770 (diff)
parenta52c23b8b207d676d6cdf531af482a79fa622b9d (diff)
downloadlinux-4e121fcae809a94aa952407bd74b0757b858ce19.tar.gz
linux-4e121fcae809a94aa952407bd74b0757b858ce19.tar.bz2
linux-4e121fcae809a94aa952407bd74b0757b858ce19.zip
Merge tag 'nfs-rdma-for-5.5-1' of git://git.linux-nfs.org/projects/anna/linux-nfs
NFSoRDMA Client Updates for Linux 5.5 New Features: - New tracepoints for congestion control and Local Invalidate WRs Bugfixes and Cleanups: - Eliminate log noise in call_reserveresult - Fix unstable connections after a reconnect - Clean up some code duplication - Close race between waking a sender and posting a receive - Fix MR list corruption, and clean up MR usage - Remove unused rpcrdma_sendctx fields - Try to avoid DMA mapping pages if it is too costly - Wake pending tasks if connection fails - Replace some dprintk()s with tracepoints
-rw-r--r--include/trace/events/rpcrdma.h198
-rw-r--r--include/trace/events/sunrpc.h93
-rw-r--r--net/sunrpc/clnt.c14
-rw-r--r--net/sunrpc/xprt.c22
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c2
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c53
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c413
-rw-r--r--net/sunrpc/xprtrdma/transport.c33
-rw-r--r--net/sunrpc/xprtrdma/verbs.c194
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h18
10 files changed, 646 insertions, 394 deletions
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index a13830616107..69a8278e5cef 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -85,6 +85,44 @@ DECLARE_EVENT_CLASS(xprtrdma_rxprt,
), \
TP_ARGS(r_xprt))
+DECLARE_EVENT_CLASS(xprtrdma_connect_class,
+ TP_PROTO(
+ const struct rpcrdma_xprt *r_xprt,
+ int rc
+ ),
+
+ TP_ARGS(r_xprt, rc),
+
+ TP_STRUCT__entry(
+ __field(const void *, r_xprt)
+ __field(int, rc)
+ __field(int, connect_status)
+ __string(addr, rpcrdma_addrstr(r_xprt))
+ __string(port, rpcrdma_portstr(r_xprt))
+ ),
+
+ TP_fast_assign(
+ __entry->r_xprt = r_xprt;
+ __entry->rc = rc;
+ __entry->connect_status = r_xprt->rx_ep.rep_connected;
+ __assign_str(addr, rpcrdma_addrstr(r_xprt));
+ __assign_str(port, rpcrdma_portstr(r_xprt));
+ ),
+
+ TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connect status=%d",
+ __get_str(addr), __get_str(port), __entry->r_xprt,
+ __entry->rc, __entry->connect_status
+ )
+);
+
+#define DEFINE_CONN_EVENT(name) \
+ DEFINE_EVENT(xprtrdma_connect_class, xprtrdma_##name, \
+ TP_PROTO( \
+ const struct rpcrdma_xprt *r_xprt, \
+ int rc \
+ ), \
+ TP_ARGS(r_xprt, rc))
+
DECLARE_EVENT_CLASS(xprtrdma_rdch_event,
TP_PROTO(
const struct rpc_task *task,
@@ -333,47 +371,81 @@ TRACE_EVENT(xprtrdma_cm_event,
)
);
-TRACE_EVENT(xprtrdma_disconnect,
+TRACE_EVENT(xprtrdma_inline_thresh,
TP_PROTO(
- const struct rpcrdma_xprt *r_xprt,
- int status
+ const struct rpcrdma_xprt *r_xprt
),
- TP_ARGS(r_xprt, status),
+ TP_ARGS(r_xprt),
TP_STRUCT__entry(
__field(const void *, r_xprt)
- __field(int, status)
- __field(int, connected)
+ __field(unsigned int, inline_send)
+ __field(unsigned int, inline_recv)
+ __field(unsigned int, max_send)
+ __field(unsigned int, max_recv)
__string(addr, rpcrdma_addrstr(r_xprt))
__string(port, rpcrdma_portstr(r_xprt))
),
TP_fast_assign(
+ const struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+
__entry->r_xprt = r_xprt;
- __entry->status = status;
- __entry->connected = r_xprt->rx_ep.rep_connected;
+ __entry->inline_send = ep->rep_inline_send;
+ __entry->inline_recv = ep->rep_inline_recv;
+ __entry->max_send = ep->rep_max_inline_send;
+ __entry->max_recv = ep->rep_max_inline_recv;
__assign_str(addr, rpcrdma_addrstr(r_xprt));
__assign_str(port, rpcrdma_portstr(r_xprt));
),
- TP_printk("peer=[%s]:%s r_xprt=%p: status=%d %sconnected",
- __get_str(addr), __get_str(port),
- __entry->r_xprt, __entry->status,
- __entry->connected == 1 ? "still " : "dis"
+ TP_printk("peer=[%s]:%s r_xprt=%p neg send/recv=%u/%u, calc send/recv=%u/%u",
+ __get_str(addr), __get_str(port), __entry->r_xprt,
+ __entry->inline_send, __entry->inline_recv,
+ __entry->max_send, __entry->max_recv
)
);
-DEFINE_RXPRT_EVENT(xprtrdma_conn_start);
-DEFINE_RXPRT_EVENT(xprtrdma_conn_tout);
+DEFINE_CONN_EVENT(connect);
+DEFINE_CONN_EVENT(disconnect);
+
DEFINE_RXPRT_EVENT(xprtrdma_create);
DEFINE_RXPRT_EVENT(xprtrdma_op_destroy);
DEFINE_RXPRT_EVENT(xprtrdma_remove);
DEFINE_RXPRT_EVENT(xprtrdma_reinsert);
-DEFINE_RXPRT_EVENT(xprtrdma_reconnect);
DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc);
DEFINE_RXPRT_EVENT(xprtrdma_op_close);
-DEFINE_RXPRT_EVENT(xprtrdma_op_connect);
+DEFINE_RXPRT_EVENT(xprtrdma_op_setport);
+
+TRACE_EVENT(xprtrdma_op_connect,
+ TP_PROTO(
+ const struct rpcrdma_xprt *r_xprt,
+ unsigned long delay
+ ),
+
+ TP_ARGS(r_xprt, delay),
+
+ TP_STRUCT__entry(
+ __field(const void *, r_xprt)
+ __field(unsigned long, delay)
+ __string(addr, rpcrdma_addrstr(r_xprt))
+ __string(port, rpcrdma_portstr(r_xprt))
+ ),
+
+ TP_fast_assign(
+ __entry->r_xprt = r_xprt;
+ __entry->delay = delay;
+ __assign_str(addr, rpcrdma_addrstr(r_xprt));
+ __assign_str(port, rpcrdma_portstr(r_xprt));
+ ),
+
+ TP_printk("peer=[%s]:%s r_xprt=%p delay=%lu",
+ __get_str(addr), __get_str(port), __entry->r_xprt,
+ __entry->delay
+ )
+);
+
TRACE_EVENT(xprtrdma_op_set_cto,
TP_PROTO(
@@ -532,6 +604,8 @@ DEFINE_WRCH_EVENT(write);
DEFINE_WRCH_EVENT(reply);
TRACE_DEFINE_ENUM(rpcrdma_noch);
+TRACE_DEFINE_ENUM(rpcrdma_noch_pullup);
+TRACE_DEFINE_ENUM(rpcrdma_noch_mapped);
TRACE_DEFINE_ENUM(rpcrdma_readch);
TRACE_DEFINE_ENUM(rpcrdma_areadch);
TRACE_DEFINE_ENUM(rpcrdma_writech);
@@ -540,6 +614,8 @@ TRACE_DEFINE_ENUM(rpcrdma_replych);
#define xprtrdma_show_chunktype(x) \
__print_symbolic(x, \
{ rpcrdma_noch, "inline" }, \
+ { rpcrdma_noch_pullup, "pullup" }, \
+ { rpcrdma_noch_mapped, "mapped" }, \
{ rpcrdma_readch, "read list" }, \
{ rpcrdma_areadch, "*read list" }, \
{ rpcrdma_writech, "write list" }, \
@@ -667,9 +743,8 @@ TRACE_EVENT(xprtrdma_post_send,
__entry->client_id = rqst->rq_task->tk_client ?
rqst->rq_task->tk_client->cl_clid : -1;
__entry->req = req;
- __entry->num_sge = req->rl_sendctx->sc_wr.num_sge;
- __entry->signaled = req->rl_sendctx->sc_wr.send_flags &
- IB_SEND_SIGNALED;
+ __entry->num_sge = req->rl_wr.num_sge;
+ __entry->signaled = req->rl_wr.send_flags & IB_SEND_SIGNALED;
__entry->status = status;
),
@@ -735,6 +810,31 @@ TRACE_EVENT(xprtrdma_post_recvs,
)
);
+TRACE_EVENT(xprtrdma_post_linv,
+ TP_PROTO(
+ const struct rpcrdma_req *req,
+ int status
+ ),
+
+ TP_ARGS(req, status),
+
+ TP_STRUCT__entry(
+ __field(const void *, req)
+ __field(int, status)
+ __field(u32, xid)
+ ),
+
+ TP_fast_assign(
+ __entry->req = req;
+ __entry->status = status;
+ __entry->xid = be32_to_cpu(req->rl_slot.rq_xid);
+ ),
+
+ TP_printk("req=%p xid=0x%08x status=%d",
+ __entry->req, __entry->xid, __entry->status
+ )
+);
+
/**
** Completion events
**/
@@ -1021,66 +1121,32 @@ DEFINE_REPLY_EVENT(xprtrdma_reply_hdr);
TRACE_EVENT(xprtrdma_fixup,
TP_PROTO(
const struct rpc_rqst *rqst,
- int len,
- int hdrlen
+ unsigned long fixup
),
- TP_ARGS(rqst, len, hdrlen),
+ TP_ARGS(rqst, fixup),
TP_STRUCT__entry(
__field(unsigned int, task_id)
__field(unsigned int, client_id)
- __field(const void *, base)
- __field(int, len)
- __field(int, hdrlen)
- ),
-
- TP_fast_assign(
- __entry->task_id = rqst->rq_task->tk_pid;
- __entry->client_id = rqst->rq_task->tk_client->cl_clid;
- __entry->base = rqst->rq_rcv_buf.head[0].iov_base;
- __entry->len = len;
- __entry->hdrlen = hdrlen;
- ),
-
- TP_printk("task:%u@%u base=%p len=%d hdrlen=%d",
- __entry->task_id, __entry->client_id,
- __entry->base, __entry->len, __entry->hdrlen
- )
-);
-
-TRACE_EVENT(xprtrdma_fixup_pg,
- TP_PROTO(
- const struct rpc_rqst *rqst,
- int pageno,
- const void *pos,
- int len,
- int curlen
- ),
-
- TP_ARGS(rqst, pageno, pos, len, curlen),
-
- TP_STRUCT__entry(
- __field(unsigned int, task_id)
- __field(unsigned int, client_id)
- __field(const void *, pos)
- __field(int, pageno)
- __field(int, len)
- __field(int, curlen)
+ __field(unsigned long, fixup)
+ __field(size_t, headlen)
+ __field(unsigned int, pagelen)
+ __field(size_t, taillen)
),
TP_fast_assign(
__entry->task_id = rqst->rq_task->tk_pid;
__entry->client_id = rqst->rq_task->tk_client->cl_clid;
- __entry->pos = pos;
- __entry->pageno = pageno;
- __entry->len = len;
- __entry->curlen = curlen;
+ __entry->fixup = fixup;
+ __entry->headlen = rqst->rq_rcv_buf.head[0].iov_len;
+ __entry->pagelen = rqst->rq_rcv_buf.page_len;
+ __entry->taillen = rqst->rq_rcv_buf.tail[0].iov_len;
),
- TP_printk("task:%u@%u pageno=%d pos=%p len=%d curlen=%d",
- __entry->task_id, __entry->client_id,
- __entry->pageno, __entry->pos, __entry->len, __entry->curlen
+ TP_printk("task:%u@%u fixup=%lu xdr=%zu/%u/%zu",
+ __entry->task_id, __entry->client_id, __entry->fixup,
+ __entry->headlen, __entry->pagelen, __entry->taillen
)
);
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index ffa3c51dbb1a..378233fe5ac7 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -777,6 +777,99 @@ TRACE_EVENT(xprt_ping,
__get_str(addr), __get_str(port), __entry->status)
);
+DECLARE_EVENT_CLASS(xprt_writelock_event,
+ TP_PROTO(
+ const struct rpc_xprt *xprt, const struct rpc_task *task
+ ),
+
+ TP_ARGS(xprt, task),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, task_id)
+ __field(unsigned int, client_id)
+ __field(unsigned int, snd_task_id)
+ ),
+
+ TP_fast_assign(
+ if (task) {
+ __entry->task_id = task->tk_pid;
+ __entry->client_id = task->tk_client ?
+ task->tk_client->cl_clid : -1;
+ } else {
+ __entry->task_id = -1;
+ __entry->client_id = -1;
+ }
+ __entry->snd_task_id = xprt->snd_task ?
+ xprt->snd_task->tk_pid : -1;
+ ),
+
+ TP_printk("task:%u@%u snd_task:%u",
+ __entry->task_id, __entry->client_id,
+ __entry->snd_task_id)
+);
+
+#define DEFINE_WRITELOCK_EVENT(name) \
+ DEFINE_EVENT(xprt_writelock_event, xprt_##name, \
+ TP_PROTO( \
+ const struct rpc_xprt *xprt, \
+ const struct rpc_task *task \
+ ), \
+ TP_ARGS(xprt, task))
+
+DEFINE_WRITELOCK_EVENT(reserve_xprt);
+DEFINE_WRITELOCK_EVENT(release_xprt);
+
+DECLARE_EVENT_CLASS(xprt_cong_event,
+ TP_PROTO(
+ const struct rpc_xprt *xprt, const struct rpc_task *task
+ ),
+
+ TP_ARGS(xprt, task),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, task_id)
+ __field(unsigned int, client_id)
+ __field(unsigned int, snd_task_id)
+ __field(unsigned long, cong)
+ __field(unsigned long, cwnd)
+ __field(bool, wait)
+ ),
+
+ TP_fast_assign(
+ if (task) {
+ __entry->task_id = task->tk_pid;
+ __entry->client_id = task->tk_client ?
+ task->tk_client->cl_clid : -1;
+ } else {
+ __entry->task_id = -1;
+ __entry->client_id = -1;
+ }
+ __entry->snd_task_id = xprt->snd_task ?
+ xprt->snd_task->tk_pid : -1;
+ __entry->cong = xprt->cong;
+ __entry->cwnd = xprt->cwnd;
+ __entry->wait = test_bit(XPRT_CWND_WAIT, &xprt->state);
+ ),
+
+ TP_printk("task:%u@%u snd_task:%u cong=%lu cwnd=%lu%s",
+ __entry->task_id, __entry->client_id,
+ __entry->snd_task_id, __entry->cong, __entry->cwnd,
+ __entry->wait ? " (wait)" : "")
+);
+
+#define DEFINE_CONG_EVENT(name) \
+ DEFINE_EVENT(xprt_cong_event, xprt_##name, \
+ TP_PROTO( \
+ const struct rpc_xprt *xprt, \
+ const struct rpc_task *task \
+ ), \
+ TP_ARGS(xprt, task))
+
+DEFINE_CONG_EVENT(reserve_cong);
+DEFINE_CONG_EVENT(release_cong);
+DEFINE_CONG_EVENT(get_cong);
+DEFINE_CONG_EVENT(put_cong);
+
TRACE_EVENT(xs_stream_read_data,
TP_PROTO(struct rpc_xprt *xprt, ssize_t err, size_t total),
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 5baf9b9be2e8..a3379765605d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1679,8 +1679,6 @@ call_reserveresult(struct rpc_task *task)
return;
}
- printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n",
- __func__, status);
rpc_call_rpcerror(task, -EIO);
return;
}
@@ -1689,11 +1687,8 @@ call_reserveresult(struct rpc_task *task)
* Even though there was an error, we may have acquired
* a request slot somehow. Make sure not to leak it.
*/
- if (task->tk_rqstp) {
- printk(KERN_ERR "%s: status=%d, request allocated anyway\n",
- __func__, status);
+ if (task->tk_rqstp)
xprt_release(task);
- }
switch (status) {
case -ENOMEM:
@@ -1702,14 +1697,9 @@ call_reserveresult(struct rpc_task *task)
case -EAGAIN: /* woken up; retry */
task->tk_action = call_retry_reserve;
return;
- case -EIO: /* probably a shutdown */
- break;
default:
- printk(KERN_ERR "%s: unrecognized error %d, exiting\n",
- __func__, status);
- break;
+ rpc_call_rpcerror(task, status);
}
- rpc_call_rpcerror(task, status);
}
/*
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 41df4c507193..1aafe8d3f3f4 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -205,20 +205,20 @@ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
if (task == xprt->snd_task)
- return 1;
+ goto out_locked;
goto out_sleep;
}
if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
goto out_unlock;
xprt->snd_task = task;
+out_locked:
+ trace_xprt_reserve_xprt(xprt, task);
return 1;
out_unlock:
xprt_clear_locked(xprt);
out_sleep:
- dprintk("RPC: %5u failed to lock transport %p\n",
- task->tk_pid, xprt);
task->tk_status = -EAGAIN;
if (RPC_IS_SOFT(task))
rpc_sleep_on_timeout(&xprt->sending, task, NULL,
@@ -269,23 +269,22 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
if (task == xprt->snd_task)
- return 1;
+ goto out_locked;
goto out_sleep;
}
if (req == NULL) {
xprt->snd_task = task;
- return 1;
+ goto out_locked;
}
if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
goto out_unlock;
if (!xprt_need_congestion_window_wait(xprt)) {
xprt->snd_task = task;
- return 1;
+ goto out_locked;
}
out_unlock:
xprt_clear_locked(xprt);
out_sleep:
- dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
task->tk_status = -EAGAIN;
if (RPC_IS_SOFT(task))
rpc_sleep_on_timeout(&xprt->sending, task, NULL,
@@ -293,6 +292,9 @@ out_sleep:
else
rpc_sleep_on(&xprt->sending, task, NULL);
return 0;
+out_locked:
+ trace_xprt_reserve_cong(xprt, task);
+ return 1;
}
EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
@@ -357,6 +359,7 @@ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
xprt_clear_locked(xprt);
__xprt_lock_write_next(xprt);
}
+ trace_xprt_release_xprt(xprt, task);
}
EXPORT_SYMBOL_GPL(xprt_release_xprt);
@@ -374,6 +377,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
xprt_clear_locked(xprt);
__xprt_lock_write_next_cong(xprt);
}
+ trace_xprt_release_cong(xprt, task);
}
EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
@@ -395,8 +399,7 @@ __xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
if (req->rq_cong)
return 1;
- dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n",
- req->rq_task->tk_pid, xprt->cong, xprt->cwnd);
+ trace_xprt_get_cong(xprt, req->rq_task);
if (RPCXPRT_CONGESTED(xprt)) {
xprt_set_congestion_window_wait(xprt);
return 0;
@@ -418,6 +421,7 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
req->rq_cong = 0;
xprt->cong -= RPC_CWNDSCALE;
xprt_test_and_clear_congestion_window_wait(xprt);
+ trace_xprt_put_cong(xprt, req->rq_task);
__xprt_lock_write_next_cong(xprt);
}
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index b458bf53ca69..9d02eae353c6 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -79,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
*p = xdr_zero;
if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN,
- &rqst->rq_snd_buf, rpcrdma_noch))
+ &rqst->rq_snd_buf, rpcrdma_noch_pullup))
return -EIO;
trace_xprtrdma_cb_reply(rqst);
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 30065a28628c..523722be6a16 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -36,8 +36,8 @@
* connect worker from running concurrently.
*
* When the underlying transport disconnects, MRs that are in flight
- * are flushed and are likely unusable. Thus all flushed MRs are
- * destroyed. New MRs are created on demand.
+ * are flushed and are likely unusable. Thus all MRs are destroyed.
+ * New MRs are created on demand.
*/
#include <linux/sunrpc/rpc_rdma.h>
@@ -88,8 +88,10 @@ void frwr_release_mr(struct rpcrdma_mr *mr)
kfree(mr);
}
-static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+static void frwr_mr_recycle(struct rpcrdma_mr *mr)
{
+ struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
+
trace_xprtrdma_mr_recycle(mr);
if (mr->mr_dir != DMA_NONE) {
@@ -107,32 +109,6 @@ static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
frwr_release_mr(mr);
}
-/* MRs are dynamically allocated, so simply clean up and release the MR.
- * A replacement MR will subsequently be allocated on demand.
- */
-static void
-frwr_mr_recycle_worker(struct work_struct *work)
-{
- struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr,
- mr_recycle);
-
- frwr_mr_recycle(mr->mr_xprt, mr);
-}
-
-/* frwr_recycle - Discard MRs
- * @req: request to reset
- *
- * Used after a reconnect. These MRs could be in flight, we can't
- * tell. Safe thing to do is release them.
- */
-void frwr_recycle(struct rpcrdma_req *req)
-{
- struct rpcrdma_mr *mr;
-
- while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
- frwr_mr_recycle(mr->mr_xprt, mr);
-}
-
/* frwr_reset - Place MRs back on the free list
* @req: request to reset
*
@@ -166,9 +142,6 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
struct ib_mr *frmr;
int rc;
- /* NB: ib_alloc_mr and device drivers typically allocate
- * memory with GFP_KERNEL.
- */
frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
if (IS_ERR(frmr))
goto out_mr_err;
@@ -180,7 +153,6 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
mr->frwr.fr_mr = frmr;
mr->mr_dir = DMA_NONE;
INIT_LIST_HEAD(&mr->mr_list);
- INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
init_completion(&mr->frwr.fr_linv_done);
sg_init_table(sg, depth);
@@ -424,7 +396,7 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
struct ib_send_wr *post_wr;
struct rpcrdma_mr *mr;
- post_wr = &req->rl_sendctx->sc_wr;
+ post_wr = &req->rl_wr;
list_for_each_entry(mr, &req->rl_registered, mr_list) {
struct rpcrdma_frwr *frwr;
@@ -440,9 +412,6 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
post_wr = &frwr->fr_regwr.wr;
}
- /* If ib_post_send fails, the next ->send_request for
- * @req will queue these MRs for recovery.
- */
return ib_post_send(ia->ri_id->qp, post_wr, NULL);
}
@@ -468,7 +437,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
if (wc->status != IB_WC_SUCCESS)
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
else
rpcrdma_mr_put(mr);
}
@@ -570,7 +539,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
bad_wr = NULL;
rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
- trace_xprtrdma_post_send(req, rc);
/* The final LOCAL_INV WR in the chain is supposed to
* do the wake. If it was never posted, the wake will
@@ -583,6 +551,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
+ trace_xprtrdma_post_linv(req, rc);
while (bad_wr) {
frwr = container_of(bad_wr, struct rpcrdma_frwr,
fr_invwr);
@@ -590,7 +559,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
bad_wr = bad_wr->next;
list_del_init(&mr->mr_list);
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
}
}
@@ -673,18 +642,18 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
bad_wr = NULL;
rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
- trace_xprtrdma_post_send(req, rc);
if (!rc)
return;
/* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
+ trace_xprtrdma_post_linv(req, rc);
while (bad_wr) {
frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
mr = container_of(frwr, struct rpcrdma_mr, frwr);
bad_wr = bad_wr->next;
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
}
/* The final LOCAL_INV WR in the chain is supposed to
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index b86b5fd62d9f..aec3beb93b25 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -78,8 +78,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
size += rpcrdma_segment_maxsz * sizeof(__be32);
size += sizeof(__be32); /* list discriminator */
- dprintk("RPC: %s: max call header size = %u\n",
- __func__, size);
return size;
}
@@ -100,8 +98,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
size += sizeof(__be32); /* list discriminator */
- dprintk("RPC: %s: max reply header size = %u\n",
- __func__, size);
return size;
}
@@ -363,8 +359,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
out_getmr_err:
trace_xprtrdma_nomrs(req);
xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
- if (r_xprt->rx_ep.rep_connected != -ENODEV)
- schedule_work(&r_xprt->rx_buf.rb_refresh_worker);
+ rpcrdma_mrs_refresh(r_xprt);
return ERR_PTR(-EAGAIN);
}
@@ -393,7 +388,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
unsigned int pos;
int nsegs;
- if (rtype == rpcrdma_noch)
+ if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
goto done;
pos = rqst->rq_snd_buf.head[0].iov_len;
@@ -565,6 +560,7 @@ static void rpcrdma_sendctx_done(struct kref *kref)
*/
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
{
+ struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf;
struct ib_sge *sge;
if (!sc->sc_unmap_count)
@@ -576,7 +572,7 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
*/
for (sge = &sc->sc_sges[2]; sc->sc_unmap_count;
++sge, --sc->sc_unmap_count)
- ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length,
+ ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length,
DMA_TO_DEVICE);
kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
@@ -589,149 +585,228 @@ static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
{
struct rpcrdma_sendctx *sc = req->rl_sendctx;
struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
- struct ib_sge *sge = sc->sc_sges;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
- goto out_regbuf;
+ return false;
sge->addr = rdmab_addr(rb);
sge->length = len;
sge->lkey = rdmab_lkey(rb);
ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
DMA_TO_DEVICE);
- sc->sc_wr.num_sge++;
return true;
-
-out_regbuf:
- pr_err("rpcrdma: failed to DMA map a Send buffer\n");
- return false;
}
-/* Prepare the Send SGEs. The head and tail iovec, and each entry
- * in the page list, gets its own SGE.
+/* The head iovec is straightforward, as it is usually already
+ * DMA-mapped. Sync the content that has changed.
*/
-static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req,
- struct xdr_buf *xdr,
- enum rpcrdma_chunktype rtype)
+static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, unsigned int len)
{
struct rpcrdma_sendctx *sc = req->rl_sendctx;
- unsigned int sge_no, page_base, len, remaining;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
struct rpcrdma_regbuf *rb = req->rl_sendbuf;
- struct ib_sge *sge = sc->sc_sges;
- struct page *page, **ppages;
- /* The head iovec is straightforward, as it is already
- * DMA-mapped. Sync the content that has changed.
- */
if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
- goto out_regbuf;
- sc->sc_device = rdmab_device(rb);
- sge_no = 1;
- sge[sge_no].addr = rdmab_addr(rb);
- sge[sge_no].length = xdr->head[0].iov_len;
- sge[sge_no].lkey = rdmab_lkey(rb);
- ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr,
- sge[sge_no].length, DMA_TO_DEVICE);
-
- /* If there is a Read chunk, the page list is being handled
- * via explicit RDMA, and thus is skipped here. However, the
- * tail iovec may include an XDR pad for the page list, as
- * well as additional content, and may not reside in the
- * same page as the head iovec.
- */
- if (rtype == rpcrdma_readch) {
- len = xdr->tail[0].iov_len;
+ return false;
- /* Do not include the tail if it is only an XDR pad */
- if (len < 4)
- goto out;
+ sge->addr = rdmab_addr(rb);
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
- page = virt_to_page(xdr->tail[0].iov_base);
- page_base = offset_in_page(xdr->tail[0].iov_base);
+ ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
+ DMA_TO_DEVICE);
+ return true;
+}
- /* If the content in the page list is an odd length,
- * xdr_write_pages() has added a pad at the beginning
- * of the tail iovec. Force the tail's non-pad content
- * to land at the next XDR position in the Send message.
- */
- page_base += len & 3;
- len -= len & 3;
- goto map_tail;
- }
+/* If there is a page list present, DMA map and prepare an
+ * SGE for each page to be sent.
+ */
+static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ struct rpcrdma_sendctx *sc = req->rl_sendctx;
+ struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+ unsigned int page_base, len, remaining;
+ struct page **ppages;
+ struct ib_sge *sge;
- /* If there is a page list present, temporarily DMA map
- * and prepare an SGE for each page to be sent.
- */
- if (xdr->page_len) {
- ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
- page_base = offset_in_page(xdr->page_base);
- remaining = xdr->page_len;
- while (remaining) {
- sge_no++;
- if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
- goto out_mapping_overflow;
-
- len = min_t(u32, PAGE_SIZE - page_base, remaining);
- sge[sge_no].addr =
- ib_dma_map_page(rdmab_device(rb), *ppages,
- page_base, len, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdmab_device(rb),
- sge[sge_no].addr))
- goto out_mapping_err;
- sge[sge_no].length = len;
- sge[sge_no].lkey = rdmab_lkey(rb);
-
- sc->sc_unmap_count++;
- ppages++;
- remaining -= len;
- page_base = 0;
- }
- }
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_base = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ sge = &sc->sc_sges[req->rl_wr.num_sge++];
+ len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+ sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages,
+ page_base, len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+ goto out_mapping_err;
- /* The tail iovec is not always constructed in the same
- * page where the head iovec resides (see, for example,
- * gss_wrap_req_priv). To neatly accommodate that case,
- * DMA map it separately.
- */
- if (xdr->tail[0].iov_len) {
- page = virt_to_page(xdr->tail[0].iov_base);
- page_base = offset_in_page(xdr->tail[0].iov_base);
- len = xdr->tail[0].iov_len;
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
-map_tail:
- sge_no++;
- sge[sge_no].addr =
- ib_dma_map_page(rdmab_device(rb), page, page_base, len,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdmab_device(rb), sge[sge_no].addr))
- goto out_mapping_err;
- sge[sge_no].length = len;
- sge[sge_no].lkey = rdmab_lkey(rb);
sc->sc_unmap_count++;
+ ppages++;
+ remaining -= len;
+ page_base = 0;
}
-out:
- sc->sc_wr.num_sge += sge_no;
- if (sc->sc_unmap_count)
- kref_get(&req->rl_kref);
return true;
-out_regbuf:
- pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+out_mapping_err:
+ trace_xprtrdma_dma_maperr(sge->addr);
return false;
+}
-out_mapping_overflow:
- rpcrdma_sendctx_unmap(sc);
- pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
- return false;
+/* The tail iovec may include an XDR pad for the page list,
+ * as well as additional content, and may not reside in the
+ * same page as the head iovec.
+ */
+static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
+ struct xdr_buf *xdr,
+ unsigned int page_base, unsigned int len)
+{
+ struct rpcrdma_sendctx *sc = req->rl_sendctx;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
+ struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+ struct page *page = virt_to_page(xdr->tail[0].iov_base);
+
+ sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+ goto out_mapping_err;
+
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
+ ++sc->sc_unmap_count;
+ return true;
out_mapping_err:
- rpcrdma_sendctx_unmap(sc);
- trace_xprtrdma_dma_maperr(sge[sge_no].addr);
+ trace_xprtrdma_dma_maperr(sge->addr);
return false;
}
+/* Copy the tail to the end of the head buffer.
+ */
+static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ unsigned char *dst;
+
+ dst = (unsigned char *)xdr->head[0].iov_base;
+ dst += xdr->head[0].iov_len + xdr->page_len;
+ memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
+ r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
+}
+
+/* Copy pagelist content into the head buffer.
+ */
+static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf