summaryrefslogtreecommitdiff
path: root/net/sunrpc
diff options
context:
space:
mode:
authorTrond Myklebust <trond.myklebust@primarydata.com>2016-07-24 17:09:02 -0400
committerTrond Myklebust <trond.myklebust@primarydata.com>2016-07-24 17:09:02 -0400
commit1592c4d62a89bbca895c568d65ce290dfbc36ecc (patch)
tree6b979bc02ded2ea7e644c34e6939ffbbb7ee001d /net/sunrpc
parent668f455dac57050e33a43ff5fe006f6cd947fc65 (diff)
parentf0445670bd81cae9f46399d98fef5cd1622d9776 (diff)
downloadlinux-1592c4d62a89bbca895c568d65ce290dfbc36ecc.tar.gz
linux-1592c4d62a89bbca895c568d65ce290dfbc36ecc.tar.bz2
linux-1592c4d62a89bbca895c568d65ce290dfbc36ecc.zip
Merge branch 'nfs-rdma'
Diffstat (limited to 'net/sunrpc')
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c2
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c2
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c12
-rw-r--r--net/sunrpc/svc.c8
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c378
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c369
-rw-r--r--net/sunrpc/xprtrdma/physical_ops.c122
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c274
-rw-r--r--net/sunrpc/xprtrdma/transport.c40
-rw-r--r--net/sunrpc/xprtrdma/verbs.c242
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h118
12 files changed, 718 insertions, 851 deletions
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 813a3cdfb573..23c8e7c39656 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1018,6 +1018,8 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
auth->au_flags = 0;
auth->au_ops = &authgss_ops;
auth->au_flavor = flavor;
+ if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
+ auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
atomic_set(&auth->au_count, 1);
kref_init(&gss_auth->kref);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 65427492b1c9..60595835317a 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = {
.qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_INTEGRITY,
.name = "krb5i",
+ .datatouch = true,
},
[2] = {
.pseudoflavor = RPC_AUTH_GSS_KRB5P,
.qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_PRIVACY,
.name = "krb5p",
+ .datatouch = true,
},
};
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 7063d856a598..5fec3abbe19b 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
}
EXPORT_SYMBOL(gss_pseudoflavor_to_service);
+bool
+gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor)
+{
+ int i;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
+ return gm->gm_pfs[i].datatouch;
+ }
+ return false;
+}
+
char *
gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
{
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index cc9852897395..c5b0cb4f4056 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
*statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
/* Encode reply */
- if (test_bit(RQ_DROPME, &rqstp->rq_flags)) {
+ if (*statp == rpc_drop_reply ||
+ test_bit(RQ_DROPME, &rqstp->rq_flags)) {
if (procp->pc_release)
procp->pc_release(rqstp, NULL, rqstp->rq_resp);
goto dropit;
}
+ if (*statp == rpc_autherr_badcred) {
+ if (procp->pc_release)
+ procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+ goto err_bad_auth;
+ }
if (*statp == rpc_success &&
(xdr = procp->pc_encode) &&
!xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index dc9f3b513a05..ef19fa42c50f 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
rpcrdma-y := transport.o rpc_rdma.o verbs.o \
- fmr_ops.o frwr_ops.o physical_ops.o \
+ fmr_ops.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
module.o
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 6326ebe8b595..21cb3b150b37 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -19,13 +19,6 @@
* verb (fmr_op_unmap).
*/
-/* Transport recovery
- *
- * After a transport reconnect, fmr_op_map re-uses the MR already
- * allocated for the RPC, but generates a fresh rkey then maps the
- * MR again. This process is synchronous.
- */
-
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -35,62 +28,132 @@
/* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES (64)
-static struct workqueue_struct *fmr_recovery_wq;
-
-#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND)
+/* Access mode of externally registered pages */
+enum {
+ RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ,
+};
-int
-fmr_alloc_recovery_wq(void)
+bool
+fmr_is_supported(struct rpcrdma_ia *ia)
{
- fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
- return !fmr_recovery_wq ? -ENOMEM : 0;
+ if (!ia->ri_device->alloc_fmr) {
+ pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
+ ia->ri_device->name);
+ return false;
+ }
+ return true;
}
-void
-fmr_destroy_recovery_wq(void)
+static int
+fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
{
- struct workqueue_struct *wq;
+ static struct ib_fmr_attr fmr_attr = {
+ .max_pages = RPCRDMA_MAX_FMR_SGES,
+ .max_maps = 1,
+ .page_shift = PAGE_SHIFT
+ };
- if (!fmr_recovery_wq)
- return;
+ mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
+ sizeof(u64), GFP_KERNEL);
+ if (!mw->fmr.fm_physaddrs)
+ goto out_free;
- wq = fmr_recovery_wq;
- fmr_recovery_wq = NULL;
- destroy_workqueue(wq);
+ mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
+ sizeof(*mw->mw_sg), GFP_KERNEL);
+ if (!mw->mw_sg)
+ goto out_free;
+
+ sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
+
+ mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
+ &fmr_attr);
+ if (IS_ERR(mw->fmr.fm_mr))
+ goto out_fmr_err;
+
+ return 0;
+
+out_fmr_err:
+ dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
+ PTR_ERR(mw->fmr.fm_mr));
+
+out_free:
+ kfree(mw->mw_sg);
+ kfree(mw->fmr.fm_physaddrs);
+ return -ENOMEM;
}
static int
__fmr_unmap(struct rpcrdma_mw *mw)
{
LIST_HEAD(l);
+ int rc;
- list_add(&mw->fmr.fmr->list, &l);
- return ib_unmap_fmr(&l);
+ list_add(&mw->fmr.fm_mr->list, &l);
+ rc = ib_unmap_fmr(&l);
+ list_del_init(&mw->fmr.fm_mr->list);
+ return rc;
}
-/* Deferred reset of a single FMR. Generate a fresh rkey by
- * replacing the MR. There's no recovery if this fails.
- */
static void
-__fmr_recovery_worker(struct work_struct *work)
+fmr_op_release_mr(struct rpcrdma_mw *r)
{
- struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
- mw_work);
- struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ LIST_HEAD(unmap_list);
+ int rc;
- __fmr_unmap(mw);
- rpcrdma_put_mw(r_xprt, mw);
- return;
+ /* Ensure MW is not on any rl_registered list */
+ if (!list_empty(&r->mw_list))
+ list_del(&r->mw_list);
+
+ kfree(r->fmr.fm_physaddrs);
+ kfree(r->mw_sg);
+
+ /* In case this one was left mapped, try to unmap it
+ * to prevent dealloc_fmr from failing with EBUSY
+ */
+ rc = __fmr_unmap(r);
+ if (rc)
+ pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
+ r, rc);
+
+ rc = ib_dealloc_fmr(r->fmr.fm_mr);
+ if (rc)
+ pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
+ r, rc);
+
+ kfree(r);
}
-/* A broken MR was discovered in a context that can't sleep.
- * Defer recovery to the recovery worker.
+/* Reset of a single FMR.
*/
static void
-__fmr_queue_recovery(struct rpcrdma_mw *mw)
+fmr_op_recover_mr(struct rpcrdma_mw *mw)
{
- INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
- queue_work(fmr_recovery_wq, &mw->mw_work);
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ int rc;
+
+ /* ORDER: invalidate first */
+ rc = __fmr_unmap(mw);
+
+ /* ORDER: then DMA unmap */
+ ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (rc)
+ goto out_release;
+
+ rpcrdma_put_mw(r_xprt, mw);
+ r_xprt->rx_stats.mrs_recovered++;
+ return;
+
+out_release:
+ pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
+ r_xprt->rx_stats.mrs_orphaned++;
+
+ spin_lock(&r_xprt->rx_buf.rb_mwlock);
+ list_del(&mw->mw_all);
+ spin_unlock(&r_xprt->rx_buf.rb_mwlock);
+
+ fmr_op_release_mr(mw);
}
static int
@@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
}
-static int
-fmr_op_init(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
- struct ib_fmr_attr fmr_attr = {
- .max_pages = RPCRDMA_MAX_FMR_SGES,
- .max_maps = 1,
- .page_shift = PAGE_SHIFT
- };
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
- struct rpcrdma_mw *r;
- int i, rc;
-
- spin_lock_init(&buf->rb_mwlock);
- INIT_LIST_HEAD(&buf->rb_mws);
- INIT_LIST_HEAD(&buf->rb_all);
-
- i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
- i += 2; /* head + tail */
- i *= buf->rb_max_requests; /* one set for each RPC slot */
- dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
-
- rc = -ENOMEM;
- while (i--) {
- r = kzalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
- goto out;
-
- r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
- sizeof(u64), GFP_KERNEL);
- if (!r->fmr.physaddrs)
- goto out_free;
-
- r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
- if (IS_ERR(r->fmr.fmr))
- goto out_fmr_err;
-
- r->mw_xprt = r_xprt;
- list_add(&r->mw_list, &buf->rb_mws);
- list_add(&r->mw_all, &buf->rb_all);
- }
- return 0;
-
-out_fmr_err:
- rc = PTR_ERR(r->fmr.fmr);
- dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
- kfree(r->fmr.physaddrs);
-out_free:
- kfree(r);
-out:
- return rc;
-}
-
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
static int
fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing)
+ int nsegs, bool writing, struct rpcrdma_mw **out)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct ib_device *device = ia->ri_device;
- enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg;
int len, pageoff, i, rc;
struct rpcrdma_mw *mw;
+ u64 *dma_pages;
- mw = seg1->rl_mw;
- seg1->rl_mw = NULL;
- if (!mw) {
- mw = rpcrdma_get_mw(r_xprt);
- if (!mw)
- return -ENOMEM;
- } else {
- /* this is a retransmit; generate a fresh rkey */
- rc = __fmr_unmap(mw);
- if (rc)
- return rc;
- }
+ mw = rpcrdma_get_mw(r_xprt);
+ if (!mw)
+ return -ENOBUFS;
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
@@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (nsegs > RPCRDMA_MAX_FMR_SGES)
nsegs = RPCRDMA_MAX_FMR_SGES;
for (i = 0; i < nsegs;) {
- rpcrdma_map_one(device, seg, direction);
- mw->fmr.physaddrs[i] = seg->mr_dma;
+ if (seg->mr_page)
+ sg_set_page(&mw->mw_sg[i],
+ seg->mr_page,
+ seg->mr_len,
+ offset_in_page(seg->mr_offset));
+ else
+ sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
+ seg->mr_len);
len += seg->mr_len;
++seg;
++i;
@@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
-
- rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs,
- i, seg1->mr_dma);
+ mw->mw_nents = i;
+ mw->mw_dir = rpcrdma_data_dir(writing);
+ if (i == 0)
+ goto out_dmamap_err;
+
+ if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir))
+ goto out_dmamap_err;
+
+ for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
+ dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
+ rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
+ dma_pages[0]);
if (rc)
goto out_maperr;
- seg1->rl_mw = mw;
- seg1->mr_rkey = mw->fmr.fmr->rkey;
- seg1->mr_base = seg1->mr_dma + pageoff;
- seg1->mr_nsegs = i;
- seg1->mr_len = len;
- return i;
+ mw->mw_handle = mw->fmr.fm_mr->rkey;
+ mw->mw_length = len;
+ mw->mw_offset = dma_pages[0] + pageoff;
-out_maperr:
- dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
- __func__, len, (unsigned long long)seg1->mr_dma,
- pageoff, i, rc);
- while (i--)
- rpcrdma_unmap_one(device, --seg);
- return rc;
-}
+ *out = mw;
+ return mw->mw_nents;
-static void
-__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- int nsegs = seg->mr_nsegs;
+out_dmamap_err:
+ pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
+ mw->mw_sg, mw->mw_nents);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
- while (nsegs--)
- rpcrdma_unmap_one(device, seg++);
+out_maperr:
+ pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
+ len, (unsigned long long)dma_pages[0],
+ pageoff, mw->mw_nents, rc);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
}
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
* previously mapped memory regions.
+ *
+ * Caller ensures that req->rl_registered is not empty.
*/
static void
fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
- struct rpcrdma_mr_seg *seg;
- unsigned int i, nchunks;
- struct rpcrdma_mw *mw;
+ struct rpcrdma_mw *mw, *tmp;
LIST_HEAD(unmap_list);
int rc;
@@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* ORDER: Invalidate all of the req's MRs first
*
* ib_unmap_fmr() is slow, so use a single call instead
- * of one call per mapped MR.
+ * of one call per mapped FMR.
*/
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
-
- list_add(&mw->fmr.fmr->list, &unmap_list);
-
- i += seg->mr_nsegs;
- }
+ list_for_each_entry(mw, &req->rl_registered, mw_list)
+ list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
rc = ib_unmap_fmr(&unmap_list);
if (rc)
- pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc);
+ goto out_reset;
/* ORDER: Now DMA unmap all of the req's MRs, and return
* them to the free MW list.
*/
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
+ list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ list_del_init(&mw->mw_list);
+ list_del_init(&mw->fmr.fm_mr->list);
+ ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ rpcrdma_put_mw(r_xprt, mw);
+ }
- __fmr_dma_unmap(r_xprt, seg);
- rpcrdma_put_mw(r_xprt, seg->rl_mw);
+ return;
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
- seg->rl_mw = NULL;
- }
+out_reset:
+ pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
- req->rl_nchunks = 0;
+ list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ list_del_init(&mw->fmr.fm_mr->list);
+ fmr_op_recover_mr(mw);
+ }
}
/* Use a slow, safe mechanism to invalidate all memory regions
* that were registered for "req".
- *
- * In the asynchronous case, DMA unmapping occurs first here
- * because the rpcrdma_mr_seg is released immediately after this
- * call. It's contents won't be available in __fmr_dma_unmap later.
- * FIXME.
*/
static void
fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
bool sync)
{
- struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
- unsigned int i;
-
- for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
-
- if (sync) {
- /* ORDER */
- __fmr_unmap(mw);
- __fmr_dma_unmap(r_xprt, seg);
- rpcrdma_put_mw(r_xprt, mw);
- } else {
- __fmr_dma_unmap(r_xprt, seg);
- __fmr_queue_recovery(mw);
- }
-
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
- seg->rl_mw = NULL;
- }
-}
-
-static void
-fmr_op_destroy(struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_mw *r;
- int rc;
-
- while (!list_empty(&buf->rb_all)) {
- r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
- list_del(&r->mw_all);
- kfree(r->fmr.physaddrs);
- rc = ib_dealloc_fmr(r->fmr.fmr);
- if (rc)
- dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
- __func__, rc);
+ while (!list_empty(&req->rl_registered)) {
+ mw = list_first_entry(&req->rl_registered,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
- kfree(r);
+ if (sync)
+ fmr_op_recover_mr(mw);
+ else
+ rpcrdma_defer_mr_recovery(mw);
}
}
@@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
.ro_unmap_sync = fmr_op_unmap_sync,
.ro_unmap_safe = fmr_op_unmap_safe,
+ .ro_recover_mr = fmr_op_recover_mr,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
- .ro_init = fmr_op_init,
- .ro_destroy = fmr_op_destroy,
+ .ro_init_mr = fmr_op_init_mr,
+ .ro_release_mr = fmr_op_release_mr,
.ro_displayname = "fmr",
};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c0947544babe..892b5e1d9b09 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -73,29 +73,71 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-static struct workqueue_struct *frwr_recovery_wq;
-
-#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM)
+bool
+frwr_is_supported(struct rpcrdma_ia *ia)
+{
+ struct ib_device_attr *attrs = &ia->ri_device->attrs;
+
+ if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+ goto out_not_supported;
+ if (attrs->max_fast_reg_page_list_len == 0)
+ goto out_not_supported;
+ return true;
+
+out_not_supported:
+ pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
+ ia->ri_device->name);
+ return false;
+}
-int
-frwr_alloc_recovery_wq(void)
+static int
+frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
{
- frwr_recovery_wq = alloc_workqueue("frwr_recovery",
- FRWR_RECOVERY_WQ_FLAGS, 0);
- return !frwr_recovery_wq ? -ENOMEM : 0;
+ unsigned int depth = ia->ri_max_frmr_depth;
+ struct rpcrdma_frmr *f = &r->frmr;
+ int rc;
+
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
+ if (IS_ERR(f->fr_mr))
+ goto out_mr_err;
+
+ r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL);
+ if (!r->mw_sg)
+ goto out_list_err;
+
+ sg_init_table(r->mw_sg, depth);
+ init_completion(&f->fr_linv_done);
+ return 0;
+
+out_mr_err:
+ rc = PTR_ERR(f->fr_mr);
+ dprintk("RPC: %s: ib_alloc_mr status %i\n",
+ __func__, rc);
+ return rc;
+
+out_list_err:
+ rc = -ENOMEM;
+ dprintk("RPC: %s: sg allocation failure\n",
+ __func__);
+ ib_dereg_mr(f->fr_mr);
+ return rc;
}
-void
-frwr_destroy_recovery_wq(void)
+static void
+frwr_op_release_mr(struct rpcrdma_mw *r)
{
- struct workqueue_struct *wq;
+ int rc;
- if (!frwr_recovery_wq)
- return;
+ /* Ensure MW is not on any rl_registered list */
+ if (!list_empty(&r->mw_list))
+ list_del(&r->mw_list);
- wq = frwr_recovery_wq;
- frwr_recovery_wq = NULL;
- destroy_workqueue(wq);
+ rc = ib_dereg_mr(r->frmr.fr_mr);
+ if (rc)
+ pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
+ r, rc);
+ kfree(r->mw_sg);
+ kfree(r);
}
static int
@@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
return 0;
}
-static void
-__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_frmr *f = &mw->frmr;
- int rc;
-
- rc = __frwr_reset_mr(ia, mw);
- ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
- if (rc)
- return;
-
- rpcrdma_put_mw(r_xprt, mw);
-}
-
-/* Deferred reset of a single FRMR. Generate a fresh rkey by
- * replacing the MR.
+/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
*
* There's no recovery if this fails. The FRMR is abandoned, but
* remains in rb_all. It will be cleaned up when the transport is
* destroyed.
*/
static void
-__frwr_recovery_worker(struct work_struct *work)
-{
- struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
- mw_work);
-
- __frwr_reset_and_unmap(r->mw_xprt, r);
- return;
-}
-
-/* A broken MR was discovered in a context that can't sleep.
- * Defer recovery to the recovery worker.
- */
-static void
-__frwr_queue_recovery(struct rpcrdma_mw *r)
-{
- INIT_WORK(&r->mw_work, __frwr_recovery_worker);
- queue_work(frwr_recovery_wq, &r->mw_work);
-}
-
-static int
-__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
- unsigned int depth)
+frwr_op_recover_mr(struct rpcrdma_mw *mw)
{
- struct rpcrdma_frmr *f = &r->frmr;
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int rc;
- f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
- if (IS_ERR(f->fr_mr))
- goto out_mr_err;
-
- f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
- if (!f->fr_sg)
- goto out_list_err;
-
- sg_init_table(f->fr_sg, depth);
-
- init_completion(&f->fr_linv_done);
-
- return 0;
+ rc = __frwr_reset_mr(ia, mw);
+ ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (rc)
+ goto out_release;
-out_mr_err:
- rc = PTR_ERR(f->fr_mr);
- dprintk("RPC: %s: ib_alloc_mr status %i\n",
- __func__, rc);
- return rc;
+ rpcrdma_put_mw(r_xprt, mw);
+ r_xprt->rx_stats.mrs_recovered++;
+ return;
-out_list_err:
- rc = -ENOMEM;
- dprintk("RPC: %s: sg allocation failure\n",
- __func__);
- ib_dereg_mr(f->fr_mr);
- return rc;
-}
+out_release:
+ pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw);
+ r_xprt->rx_stats.mrs_orphaned++;
-static void
-__frwr_release(struct rpcrdma_mw *r)
-{
- int rc;
+ spin_lock(&r_xprt->rx_buf.rb_mwlock);
+ list_del(&mw->mw_all);
+ spin_unlock(&r_xprt->rx_buf.rb_mwlock);
- rc = ib_dereg_mr(r->frmr.fr_mr);
- if (rc)
- dprintk("RPC: %s: ib_dereg_mr status %i\n",
- __func__, rc);
- kfree(r->frmr.fr_sg);
+ frwr_op_release_mr(mw);
}
static int
@@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
complete_all(&frmr->fr_linv_done);
}
-static int
-frwr_op_init(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
- int i;
-
- spin_lock_init(&buf->rb_mwlock);
- INIT_LIST_HEAD(&buf->rb_mws);
- INIT_LIST_HEAD(&buf->rb_all);
-
- i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
- i += 2; /* head + tail */
- i *= buf->rb_max_requests; /* one set for each RPC slot */
- dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
-
- while (i--) {
- struct rpcrdma_mw *r;
- int rc;
-
- r = kzalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- rc = __frwr_init(r, pd, device, depth);
- if (rc) {
- kfree(r);
- return rc;
- }
-
- r->mw_xprt = r_xprt;
- list_add(&r->mw_list, &buf->rb_mws);
- list_add(&r->mw_all, &buf->rb_all);
- }
-
- return 0;
-}
-
-/* Post a FAST_REG Work Request to register a memory region
+/* Post a REG_MR Work Request to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
static int
frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing)
+ int nsegs, bool writing, struct rpcrdma_mw **out)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct ib_device *device = ia->ri_device;
- enum dma_data_direction direction = rpcrdma_data_dir(writing);
- struct rpcrdma_mr_seg *seg1 = seg;
struct rpcrdma_mw *mw;
struct rpcrdma_frmr *frmr;
struct ib_mr *mr;
@@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int rc, i, n, dma_nents;
u8 key;
- mw = seg1->rl_mw;
- seg1->rl_mw = NULL;
+ mw = NULL;
do {
if (mw)
- __frwr_queue_recovery(mw);
+ rpcrdma_defer_mr_recovery(mw);
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
- return -ENOMEM;
+ return -ENOBUFS;
} while (mw->frmr.fr_state != FRMR_IS_INVALID);
frmr = &mw->frmr;
frmr->fr_state = FRMR_IS_VALID;
@@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (nsegs > ia->ri_max_frmr_depth)
nsegs = ia->ri_max_frmr_depth;
-
for (i = 0; i < nsegs;) {
if (seg->mr_page)
- sg_set_page(&frmr->fr_sg[i],
+ sg_set_page(&mw->mw_sg[i],
seg->mr_page,
seg->mr_len,
offset_in_page(seg->mr_offset));
else
- sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
+ sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
seg->mr_len);
++seg;
@@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
- frmr->fr_nents = i;
- frmr->fr_dir = direction;
-
- dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
- if (!dma_nents) {
- pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
- __func__, frmr->fr_sg, frmr->fr_nents);
- return -ENOMEM;
- }
+ mw->mw_nents = i;
+ mw->mw_dir = rpcrdma_data_dir(writing);
+ if (i == 0)
+ goto out_dmamap_err;
- n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
- if (unlikely(n != frmr->fr_nents)) {
- pr_err("RPC: %s: failed to map mr %p (%u/%u)\n",
- __func__, frmr->fr_mr, n, frmr->fr_nents);
- rc = n < 0 ? n : -EINVAL;
- goto out_senderr;
- }
+ dma_nents = ib_dma_map_sg(ia->ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (!dma_nents)
+ goto out_dmamap_err;
+
+ n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE);
+ if (unlikely(n != mw->mw_nents))
+ goto out_mapmr_err;
dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
- __func__, mw, frmr->fr_nents, mr->length);
+ __func__, mw, mw->mw_nents, mr->length);
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
@@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (rc)
goto out_senderr;
- seg1->rl_mw = mw;
- seg1->mr_rkey = mr->rkey;
- seg1->mr_base = mr->iova;
- seg1->mr_nsegs = frmr->fr_nents;
- seg1->mr_len = mr->length;
+ mw->mw_handle = mr->rkey;
+ mw->mw_length = mr->length;
+ mw->mw_offset = mr->iova;
+
+ *out = mw;
+ return mw->mw_nents;
- return frmr->fr_nents;
+out_dmamap_err:
+ pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
+ mw->mw_sg, mw->mw_nents);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
+
+out_mapmr_err:
+ pr_err("rpcrdma: failed to map mr %p (%u/%u)\n",
+ frmr->fr_mr, n, mw->mw_nents);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
out_senderr:
- dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- __frwr_queue_recovery(mw);
- return rc;
+ pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
+ rpcrdma_defer_mr_recovery(mw);
+ return -ENOTCONN;
}
static struct ib_send_wr *
-__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
+__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
{
- struct rpcrdma_mw *mw = seg->rl_mw;
struct rpcrdma_frmr *f = &mw->frmr;
struct ib_send_wr *invalidate_wr;
@@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
*
* Sleeps until it is safe for the host CPU to access the
* previously mapped memory regions.
+ *
+ * Caller ensures that req->rl_registered is not empty.
*/
static void
frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mr_seg *seg;
- unsigned int i, nchunks;
+ struct rpcrdma_mw *mw, *tmp;
struct rpcrdma_frmr *f;
- struct rpcrdma_mw *mw;
int rc;
dprintk("RPC: %s: req %p\n", __func__, req);
@@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* Chain the LOCAL_INV Work Requests and post them with
* a single ib_post_send() call.
*/
+ f = NULL;
invalidate_wrs = pos = prev = NULL;
- seg = NULL;
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
-
- pos = __frwr_prepare_linv_wr(seg);
+ list_for_each_entry(mw, &req->rl_registered, mw_list) {
+ pos = __frwr_prepare_linv_wr(mw);
<