summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/infiniband/hw/hfi1/chip.c2
-rw-r--r--drivers/infiniband/hw/hfi1/chip.h1
-rw-r--r--drivers/infiniband/hw/hfi1/common.h4
-rw-r--r--drivers/infiniband/hw/hfi1/driver.c58
-rw-r--r--drivers/infiniband/hw/hfi1/hfi.h19
-rw-r--r--drivers/infiniband/hw/hfi1/init.c10
-rw-r--r--drivers/infiniband/hw/hfi1/qp.c5
-rw-r--r--drivers/infiniband/hw/hfi1/qp.h2
-rw-r--r--drivers/infiniband/hw/hfi1/rc.c542
-rw-r--r--drivers/infiniband/hw/hfi1/rc.h50
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.c2719
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.h196
-rw-r--r--drivers/infiniband/hw/hfi1/trace.c52
-rw-r--r--drivers/infiniband/hw/hfi1/trace_ibhdrs.h2
-rw-r--r--drivers/infiniband/hw/hfi1/trace_rc.h48
-rw-r--r--drivers/infiniband/hw/hfi1/trace_tid.h758
-rw-r--r--drivers/infiniband/hw/hfi1/trace_tx.h12
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.h1
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.c176
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.h61
-rw-r--r--drivers/infiniband/hw/qib/qib_rc.c7
-rw-r--r--drivers/infiniband/sw/rdmavt/qp.c13
-rw-r--r--drivers/infiniband/sw/rdmavt/rc.c13
-rw-r--r--include/rdma/ib_hdrs.h9
-rw-r--r--include/rdma/rdma_vt.h12
-rw-r--r--include/rdma/rdmavt_qp.h18
-rw-r--r--include/rdma/tid_rdma_defs.h52
27 files changed, 4669 insertions, 173 deletions
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 4d40311f082e..612f04190ed8 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -4253,6 +4253,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
access_sw_pio_drain),
[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
access_sw_kmem_wait),
+[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL,
+ hfi1_access_sw_tid_wait),
[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
access_sw_send_schedule),
[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index ba3d99e6e33b..6c27c1c6a868 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -927,6 +927,7 @@ enum {
C_SW_PIO_WAIT,
C_SW_PIO_DRAIN,
C_SW_KMEM_WAIT,
+ C_SW_TID_WAIT,
C_SW_SEND_SCHED,
C_SDMA_DESC_FETCHED_CNT,
C_SDMA_INT_CNT,
diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
index 40d3cfb58bd1..7310a5dba420 100644
--- a/drivers/infiniband/hw/hfi1/common.h
+++ b/drivers/infiniband/hw/hfi1/common.h
@@ -340,6 +340,10 @@ struct diag_pkt {
#define HFI1_PSM_IOC_BASE_SEQ 0x0
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define HFI1_KDETH_BTH_SEQ_SHIFT 11
+#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1)
+
static inline __u64 rhf_to_cpu(const __le32 *rbuf)
{
return __le64_to_cpu(*((__le64 *)rbuf));
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index a8ad70730203..2a9d2912f5db 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -1575,25 +1575,32 @@ drop:
return -EINVAL;
}
-void handle_eflags(struct hfi1_packet *packet)
+static void show_eflags_errs(struct hfi1_packet *packet)
{
struct hfi1_ctxtdata *rcd = packet->rcd;
u32 rte = rhf_rcv_type_err(packet->rhf);
+ dd_dev_err(rcd->dd,
+ "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+ rcd->ctxt, packet->rhf,
+ packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+ packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+ packet->rhf & RHF_DC_ERR ? "dc " : "",
+ packet->rhf & RHF_TID_ERR ? "tid " : "",
+ packet->rhf & RHF_LEN_ERR ? "len " : "",
+ packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+ packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
+ packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+ rte);
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
rcv_hdrerr(rcd, rcd->ppd, packet);
if (rhf_err_flags(packet->rhf))
- dd_dev_err(rcd->dd,
- "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
- rcd->ctxt, packet->rhf,
- packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
- packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
- packet->rhf & RHF_DC_ERR ? "dc " : "",
- packet->rhf & RHF_TID_ERR ? "tid " : "",
- packet->rhf & RHF_LEN_ERR ? "len " : "",
- packet->rhf & RHF_ECC_ERR ? "ecc " : "",
- packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
- packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
- rte);
+ show_eflags_errs(packet);
}
/*
@@ -1699,11 +1706,14 @@ static int kdeth_process_expected(struct hfi1_packet *packet)
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
- if (unlikely(rhf_err_flags(packet->rhf)))
- handle_eflags(packet);
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ struct hfi1_ctxtdata *rcd = packet->rcd;
- dd_dev_err(packet->rcd->dd,
- "Unhandled expected packet received. Dropping.\n");
+ if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_kdeth_expected_rcv(packet);
return RHF_RCV_CONTINUE;
}
@@ -1712,11 +1722,17 @@ static int kdeth_process_eager(struct hfi1_packet *packet)
hfi1_setup_9B_packet(packet);
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
- if (unlikely(rhf_err_flags(packet->rhf)))
- handle_eflags(packet);
- dd_dev_err(packet->rcd->dd,
- "Unhandled eager packet received. Dropping.\n");
+ trace_hfi1_rcvhdr(packet);
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
+ show_eflags_errs(packet);
+ if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_kdeth_eager_rcv(packet);
return RHF_RCV_CONTINUE;
}
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 9aa0357e17b7..6582184cc985 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -198,6 +198,14 @@ struct exp_tid_set {
};
typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+struct tid_queue {
+ struct list_head queue_head;
+ /* queue head for QP TID resource waiters */
+ u32 enqueue; /* count of tid enqueues */
+ u32 dequeue; /* count of tid dequeues */
+};
+
struct hfi1_ctxtdata {
/* rcvhdrq base, needs mmap before useful */
void *rcvhdrq;
@@ -291,6 +299,12 @@ struct hfi1_ctxtdata {
/* PSM Specific fields */
/* lock protecting all Expected TID data */
struct mutex exp_mutex;
+ /* lock protecting all Expected TID data of kernel contexts */
+ spinlock_t exp_lock;
+ /* Queue for QP's waiting for HW TID flows */
+ struct tid_queue flow_queue;
+ /* Queue for QP's waiting for HW receive array entries */
+ struct tid_queue rarr_queue;
/* when waiting for rcv or pioavail */
wait_queue_head_t wait;
/* uuid from PSM */
@@ -323,6 +337,9 @@ struct hfi1_ctxtdata {
*/
u8 subctxt_cnt;
+ /* Bit mask to track free TID RDMA HW flows */
+ unsigned long flow_mask;
+ struct tid_flow_state flows[RXE_NUM_TID_FLOWS];
};
/**
@@ -2103,7 +2120,7 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK |
#endif
HFI1_PKT_USER_SC_INTEGRITY;
- else
+ else if (ctxt_type != SC_KERNEL)
base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
/* turn on send-side job key checks if !A0 */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index a8dbd0f191f5..d13304f7340d 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -370,6 +370,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
mutex_init(&rcd->exp_mutex);
+ spin_lock_init(&rcd->exp_lock);
+ INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
+ INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
@@ -472,6 +475,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
GFP_KERNEL, numa);
if (!rcd->opstats)
goto bail;
+
+ /* Initialize TID flow generations for the context */
+ hfi1_kern_init_ctxt_generations(rcd);
}
*context = rcd;
@@ -771,6 +777,8 @@ static void enable_chip(struct hfi1_devdata *dd)
rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+ if (HFI1_CAP_IS_KSET(TID_RDMA))
+ rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
hfi1_rcvctrl(dd, rcvmask, rcd);
sc_enable(rcd->sc);
hfi1_rcd_put(rcd);
@@ -1589,7 +1597,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
if (rcd) {
- hfi1_clear_tids(rcd);
+ hfi1_free_ctxt_rcv_groups(rcd);
hfi1_free_ctxt(rcd);
}
}
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index f822f92b415f..acdd9eba189b 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -319,6 +319,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
switch (qp->ibqp.qp_type) {
case IB_QPT_RC:
+ hfi1_setup_tid_rdma_wqe(qp, wqe);
case IB_QPT_UC:
if (wqe->length > 0x80000000U)
return -EINVAL;
@@ -738,6 +739,7 @@ void flush_qp_waiters(struct rvt_qp *qp)
{
lockdep_assert_held(&qp->s_lock);
flush_iowait(qp);
+ hfi1_tid_rdma_flush_wait(qp);
}
void stop_send_queue(struct rvt_qp *qp)
@@ -745,6 +747,8 @@ void stop_send_queue(struct rvt_qp *qp)
struct hfi1_qp_priv *priv = qp->priv;
iowait_cancel_work(&priv->s_iowait);
+ if (cancel_work_sync(&priv->tid_rdma.trigger_work))
+ rvt_put_qp(qp);
}
void quiesce_qp(struct rvt_qp *qp)
@@ -758,6 +762,7 @@ void quiesce_qp(struct rvt_qp *qp)
void notify_qp_reset(struct rvt_qp *qp)
{
+ hfi1_qp_kern_exp_rcv_clear_all(qp);
qp->r_adefered = 0;
clear_ahg(qp);
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index 7adb6dff6813..ce25a27aa4a1 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -63,11 +63,13 @@ extern const struct rvt_operation_params hfi1_post_parms[];
* HFI1_S_AHG_VALID - ahg header valid on chip
* HFI1_S_AHG_CLEAR - have send engine clear ahg state
* HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain
+ * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource
* HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1
*/
#define HFI1_S_AHG_VALID 0x80000000
#define HFI1_S_AHG_CLEAR 0x40000000
#define HFI1_S_WAIT_PIO_DRAIN 0x20000000
+#define HFI1_S_WAIT_TID_SPACE 0x10000000
#define HFI1_S_MIN_BIT_MASK 0x01000000
/*
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 092d5eba980f..6c9ef572fc69 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -51,28 +51,48 @@
#include "hfi.h"
#include "qp.h"
+#include "rc.h"
#include "verbs_txreq.h"
#include "trace.h"
-/* cut down ridiculously long IB macro names */
-#define OP(x) RC_OP(x)
-
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
- struct rvt_swqe *wqe,
- struct hfi1_ibport *ibp);
-
-static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
- u32 psn, u32 pmtu)
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+ u8 *prev_ack, bool *scheduled)
+ __must_hold(&qp->s_lock)
{
- u32 len;
-
- len = delta_psn(psn, wqe->psn) * pmtu;
- ss->sge = wqe->sg_list[0];
- ss->sg_list = wqe->sg_list + 1;
- ss->num_sge = wqe->wr.num_sge;
- ss->total_len = wqe->length;
- rvt_skip_sge(ss, len, false);
- return wqe->length - len;
+ struct rvt_ack_entry *e = NULL;
+ u8 i, p;
+ bool s = true;
+
+ for (i = qp->r_head_ack_queue; ; i = p) {
+ if (i == qp->s_tail_ack_queue)
+ s = false;
+ if (i)
+ p = i - 1;
+ else
+ p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+ if (p == qp->r_head_ack_queue) {
+ e = NULL;
+ break;
+ }
+ e = &qp->s_ack_queue[p];
+ if (!e->opcode) {
+ e = NULL;
+ break;
+ }
+ if (cmp_psn(psn, e->psn) >= 0) {
+ if (p == qp->s_tail_ack_queue &&
+ cmp_psn(psn, e->lpsn) <= 0)
+ s = false;
+ break;
+ }
+ }
+ if (prev)
+ *prev = p;
+ if (prev_ack)
+ *prev_ack = i;
+ if (scheduled)
+ *scheduled = s;
+ return e;
}
/**
@@ -92,13 +112,16 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
{
struct rvt_ack_entry *e;
u32 hwords;
- u32 len;
- u32 bth0, bth2;
+ u32 len = 0;
+ u32 bth0 = 0, bth2 = 0;
u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
int middle = 0;
u32 pmtu = qp->pmtu;
struct hfi1_qp_priv *priv = qp->priv;
+ bool last_pkt;
+ u32 delta;
+ trace_hfi1_rsp_make_rc_ack(qp, 0);
lockdep_assert_held(&qp->s_lock);
/* Don't send an ACK if we aren't supposed to. */
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
@@ -170,6 +193,26 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
hwords++;
qp->s_ack_rdma_psn = e->psn;
bth2 = mask_psn(qp->s_ack_rdma_psn++);
+ } else if (e->opcode == TID_OP(READ_REQ)) {
+ /*
+ * If a TID RDMA read response is being resent and
+ * we haven't seen the duplicate request yet,
+ * then stop sending the remaining responses the
+ * responder has seen until the requester re-sends it.
+ */
+ len = e->rdma_sge.sge_length;
+ if (len && !e->rdma_sge.mr) {
+ qp->s_tail_ack_queue = qp->r_head_ack_queue;
+ goto bail;
+ }
+ /* Copy SGE state in case we need to resend */
+ ps->s_txreq->mr = e->rdma_sge.mr;
+ if (ps->s_txreq->mr)
+ rvt_get_mr(ps->s_txreq->mr);
+ qp->s_ack_rdma_sge.sge = e->rdma_sge;
+ qp->s_ack_rdma_sge.num_sge = 1;
+ qp->s_ack_state = TID_OP(READ_RESP);
+ goto read_resp;
} else {
/* COMPARE_SWAP or FETCH_ADD */
ps->s_txreq->ss = NULL;
@@ -207,6 +250,28 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
bth2 = mask_psn(qp->s_ack_rdma_psn++);
break;
+ case TID_OP(READ_RESP):
+read_resp:
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ ps->s_txreq->ss = &qp->s_ack_rdma_sge;
+ delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
+ &bth1, &bth2, &len,
+ &last_pkt);
+ if (delta == 0)
+ goto error_qp;
+ hwords += delta;
+ if (last_pkt) {
+ e->sent = 1;
+ /*
+ * Increment qp->s_tail_ack_queue through s_ack_state
+ * transition.
+ */
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ }
+ break;
+ case TID_OP(READ_REQ):
+ goto bail;
+
default:
normal:
/*
@@ -236,7 +301,14 @@ normal:
ps->s_txreq->hdr_dwords = hwords;
hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
return 1;
-
+error_qp:
+ spin_unlock_irqrestore(&qp->s_lock, ps->flags);
+ spin_lock_irqsave(&qp->r_lock, ps->flags);
+ spin_lock(&qp->s_lock);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, ps->flags);
+ spin_lock_irqsave(&qp->s_lock, ps->flags);
bail:
qp->s_ack_state = OP(ACKNOWLEDGE);
/*
@@ -263,17 +335,22 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
struct ib_other_headers *ohdr;
- struct rvt_sge_state *ss;
+ struct rvt_sge_state *ss = NULL;
struct rvt_swqe *wqe;
- u32 hwords;
- u32 len;
- u32 bth0 = 0, bth2;
+ struct hfi1_swqe_priv *wpriv;
+ struct tid_rdma_request *req = NULL;
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ u32 hwords = 5;
+ u32 len = 0;
+ u32 bth0 = 0, bth2 = 0;
u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
u32 pmtu = qp->pmtu;
char newreq;
int middle = 0;
int delta;
+ struct tid_rdma_flow *flow = NULL;
+ trace_hfi1_sender_make_rc_req(qp);
lockdep_assert_held(&qp->s_lock);
ps->s_txreq = get_txreq(ps->dev, qp);
if (!ps->s_txreq)
@@ -314,8 +391,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
}
clear_ahg(qp);
wqe = rvt_get_swqe_ptr(qp, qp->s_last);
- rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
- IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+ hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
/* will get called again */
goto done_free_tx;
}
@@ -334,6 +411,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
/* Send a request. */
wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+check_s_state:
switch (qp->s_state) {
default:
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
@@ -355,9 +433,13 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
/*
* If a fence is requested, wait for previous
* RDMA read and atomic operations to finish.
+ * However, there is no need to guard against
+ * TID RDMA READ after TID RDMA READ.
*/
if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
- qp->s_num_rd_atomic) {
+ qp->s_num_rd_atomic &&
+ (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
+ priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
qp->s_flags |= RVT_S_WAIT_FENCE;
goto bail;
}
@@ -402,6 +484,15 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
len = wqe->length;
ss = &qp->s_sge;
bth2 = mask_psn(qp->s_psn);
+
+ /*
+ * Interlock between various IB requests and TID RDMA
+ * if necessary.
+ */
+ if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
+ hfi1_tid_rdma_wqe_interlock(qp, wqe))
+ goto bail;
+
switch (wqe->wr.opcode) {
case IB_WR_SEND:
case IB_WR_SEND_WITH_IMM:
@@ -483,16 +574,14 @@ no_flow_control:
* Don't allow more operations to be started
* than the QP limits allow.
*/
- if (newreq) {
- if (qp->s_num_rd_atomic >=
- qp->s_max_rd_atomic) {
- qp->s_flags |= RVT_S_WAIT_RDMAR;
- goto bail;
- }
- qp->s_num_rd_atomic++;
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
- qp->s_lsn++;
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
}
+ qp->s_num_rd_atomic++;
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
put_ib_reth_vaddr(
wqe->rdma_wr.remote_addr,
&ohdr->u.rc.reth);
@@ -508,20 +597,92 @@ no_flow_control:
qp->s_cur = 0;
break;
+ case IB_WR_TID_RDMA_READ:
+ trace_hfi1_tid_read_sender_make_req(qp, newreq);
+ wpriv = wqe->priv;
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_make_req_read(qp, newreq,
+ wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ delta = cmp_psn(qp->s_psn, wqe->psn);
+
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow. We could get here under
+ * three conditions; (1) It's a new request; (2) We are
+ * sending the second or later segment of a request,
+ * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
+ * when the last segment of a previous request is
+ * received just before this; (3) We are re-sending a
+ * request.
+ */
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+ if (newreq) {
+ struct tid_rdma_flow *flow =
+ &req->flows[req->setup_head];
+
+ /*
+ * Set up s_sge as it is needed for TID
+ * allocation. However, if the pages have been
+ * walked and mapped, skip it. An earlier try
+ * has failed to allocate the TID entries.
+ */
+ if (!flow->npagesets) {
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ req->isge = 0;
+ req->clear_tail = req->setup_head;
+ req->flow_idx = req->setup_head;
+ req->state = TID_REQUEST_ACTIVE;
+ }
+ } else if (delta == 0) {
+ /* Re-send a request */
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_pending = 0;
+ req->flow_idx = req->clear_tail;
+ req->state = TID_REQUEST_RESEND;
+ }
+ req->s_next_psn = qp->s_psn;
+ /* Read one segment at a time */
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
+ &bth1, &bth2,
+ &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
case IB_WR_ATOMIC_CMP_AND_SWP:
case IB_WR_ATOMIC_FETCH_AND_ADD:
/*
* Don't allow more operations to be started
* than the QP limits allow.
*/
- if (newreq) {
- if (qp->s_num_rd_atomic >=
- qp->s_max_rd_atomic) {
- qp->s_flags |= RVT_S_WAIT_RDMAR;
- goto bail;
- }
- qp->s_num_rd_atomic++;
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
}
+ qp->s_num_rd_atomic++;
/* FALLTHROUGH */
case IB_WR_OPFN:
@@ -555,11 +716,13 @@ no_flow_control:
default:
goto bail;
}
- qp->s_sge.sge = wqe->sg_list[0];
- qp->s_sge.sg_list = wqe->sg_list + 1;
- qp->s_sge.num_sge = wqe->wr.num_sge;
- qp->s_sge.total_len = wqe->length;
- qp->s_len = wqe->length;
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ }
if (newreq) {
qp->s_tail++;
if (qp->s_tail >= qp->s_size)
@@ -567,6 +730,8 @@ no_flow_control:
}
if (wqe->wr.opcode == IB_WR_RDMA_READ)
qp->s_psn = wqe->lpsn + 1;
+ else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ qp->s_psn = req->s_next_psn;
else
qp->s_psn++;
break;
@@ -683,6 +848,103 @@ no_flow_control:
if (qp->s_cur == qp->s_size)
qp->s_cur = 0;
break;
+ case TID_OP(READ_RESP):
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ goto bail;
+ /* This is used to restart a TID read request */
+ req = wqe_to_tid_req(wqe);
+ wpriv = wqe->priv;
+ /*
+ * Back down. The field qp->s_psn has been set to the psn with
+ * which the request should be restart. It's OK to use division
+ * as this is on the retry path.
+ */
+ req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;
+
+ /*
+ * The following function need to be redefined to return the
+ * status to make sure that we find the flow. At the same
+ * time, we can use the req->state change to check if the
+ * call succeeds or not.
+ */
+ req->state = TID_REQUEST_RESEND;
+ hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+ if (req->state != TID_REQUEST_ACTIVE) {
+ /*
+ * Failed to find the flow. Release all allocated tid
+ * resources.
+ */
+ hfi1_kern_exp_rcv_clear_all(req);
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
+ goto bail;
+ }
+ req->state = TID_REQUEST_RESEND;
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ flow = &req->flows[req->flow_idx];
+ len -= flow->sent;
+ req->s_next_psn = flow->flow_state.ib_lpsn + 1;
+ delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
+ &bth2, &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ qp->s_psn = req->s_next_psn;
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
+ case TID_OP(READ_REQ):
+ req = wqe_to_tid_req(wqe);
+ delta = cmp_psn(qp->s_psn, wqe->psn);
+ /*
+ * If the current WR is not TID RDMA READ, or this is the start
+ * of a new request, we need to change the qp->s_state so that
+ * the request can be set up properly.
+ */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
+ qp->s_cur == qp->s_tail) {
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ if (delta == 0 || qp->s_cur == qp->s_tail)
+ goto check_s_state;
+ else
+ goto bail;
+ }
+
+ /* Rate limiting */
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+
+ wpriv = wqe->priv;
+ /* Read one segment at a time */
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
+ &bth2, &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ qp->s_psn = req->s_next_psn;
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
}
qp->s_sending_hpsn = bth2;
delta = delta_psn(bth2, wqe->psn);
@@ -951,6 +1213,43 @@ void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn)
}
/**
+ * update_num_rd_atomic - update the qp->s_num_rd_atomic
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ * @wqe: the wqe
+ *
+ * This is called from reset_psn() to update qp->s_num_rd_atomic
+ * for the current wqe.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn,
+ struct rvt_swqe *wqe)
+{
+ u32 opcode = wqe->wr.opcode;
+
+ if (opcode == IB_WR_RDMA_READ ||
+ opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ qp->s_num_rd_atomic++;
+ } else if (opcode == IB_WR_TID_RDMA_READ) {
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (cmp_psn(psn, wqe->lpsn) <= 0) {
+ u32 cur_seg;
+
+ cur_seg = (psn - wqe->psn) / priv->pkts_ps;
+ req->ack_pending = cur_seg - req->comp_seg;
+ priv->pending_tid_r_segs += req->ack_pending;
+ qp->s_num_rd_atomic += req->ack_pending;
+ } else {
+ priv->pending_tid_r_segs += req->total_segs;
+ qp->s_num_rd_atomic += req->total_segs;
+ }
+ }
+}
+
+/**
* reset_psn - reset the QP state to send starting from PSN
* @qp: the QP
* @psn: the packet sequence number to restart at
@@ -964,9 +1263,12 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
u32 n = qp->s_acked;
struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
u32 opcode;
+ struct hfi1_qp_priv *priv = qp->priv;
lockdep_assert_held(&qp->s_lock);
qp->s_cur = n;
+ priv->pending_tid_r_segs = 0;
+ qp->s_num_rd_atomic = 0;
/*
* If we are starting the request from the beginning,
@@ -976,9 +1278,9 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(SEND_LAST);
goto done;
}
+ update_num_rd_atomic(qp, psn, wqe);
/* Find the work request opcode corresponding to the given PSN. */
- opcode = wqe->wr.opcode;
for (;;) {
int diff;
@@ -988,8 +1290,11 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
break;
wqe = rvt_get_swqe_ptr(qp, n);
diff = cmp_psn(psn, wqe->psn);
- if (diff < 0)
+ if (diff < 0) {
+ /* Point wqe back to the previous one*/
+ wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
break;
+ }
qp->s_cur = n;
/*
* If we are starting the request from the beginning,
@@ -999,8 +1304,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(SEND_LAST);
goto done;
}
- opcode = wqe->wr.opcode;
+
+ update_num_rd_atomic(qp, psn, wqe);
}
+ opcode = wqe->wr.opcode;
/*
* Set the state to restart in the middle