From 99f80d2f5fb6d4165186390ecba83952803b667b Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Oct 2016 06:14:39 -0700 Subject: IB/hfi1: Optimize lkey validation structures Profiling shows that the key validation is susceptible to cache line trading when accessing the lkey table. Fix by separating out the read mostly fields from the write fields. In addition the shift amount, which is function of the lkey table size, is precomputed and stored with the table pointer. Since both the shift and table pointer are in the same read mostly cacheline, this saves a cache line in this hot path. Reviewed-by: Sebastian Sanchez Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdmavt_mr.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h index 6b3c6c8b6b77..de59de28b6a2 100644 --- a/include/rdma/rdmavt_mr.h +++ b/include/rdma/rdmavt_mr.h @@ -90,11 +90,15 @@ struct rvt_mregion { #define RVT_MAX_LKEY_TABLE_BITS 23 struct rvt_lkey_table { - spinlock_t lock; /* protect changes in this struct */ - u32 next; /* next unused index (speeds search) */ - u32 gen; /* generation count */ + /* read mostly fields */ u32 max; /* size of the table */ + u32 shift; /* lkey/rkey shift */ struct rvt_mregion __rcu **table; + /* writeable fields */ + /* protect changes in this struct */ + spinlock_t lock ____cacheline_aligned_in_smp; + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ }; /* -- cgit v1.2.3 From be5d740bdc5448a8a542f8aa84d88d65fe88e486 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 17 Oct 2016 04:19:07 -0700 Subject: IB/rdmvat: Organize hot path calldowns into a single cacheline Save a cacheline by having hot path calldowns together. Reviewed-by: Sebastian Sanchez Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdma_vt.h | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index e31502107a58..861e23eaebda 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -185,6 +185,27 @@ struct rvt_driver_provided { * check_support() for details. */ + /* hot path calldowns in a single cacheline */ + + /* + * Give the driver a notice that there is send work to do. It is up to + * the driver to generally push the packets out, this just queues the + * work with the driver. There are two variants here. The no_lock + * version requires the s_lock not to be held. The other assumes the + * s_lock is held. + */ + void (*schedule_send)(struct rvt_qp *qp); + void (*schedule_send_no_lock)(struct rvt_qp *qp); + + /* Driver specific work request checking */ + int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); + + /* + * Sometimes rdmavt needs to kick the driver's send progress. That is + * done by this call back. + */ + void (*do_send)(struct rvt_qp *qp); + /* Passed to ib core registration. Callback to create syfs files */ int (*port_callback)(struct ib_device *, u8, struct kobject *); @@ -222,22 +243,6 @@ struct rvt_driver_provided { */ void (*notify_qp_reset)(struct rvt_qp *qp); - /* - * Give the driver a notice that there is send work to do. It is up to - * the driver to generally push the packets out, this just queues the - * work with the driver. There are two variants here. The no_lock - * version requires the s_lock not to be held. The other assumes the - * s_lock is held. - */ - void (*schedule_send)(struct rvt_qp *qp); - void (*schedule_send_no_lock)(struct rvt_qp *qp); - - /* - * Sometimes rdmavt needs to kick the driver's send progress. That is - * done by this call back. - */ - void (*do_send)(struct rvt_qp *qp); - /* * Get a path mtu from the driver based on qp attributes. */ @@ -324,9 +329,6 @@ struct rvt_driver_provided { void (*modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); - /* Driver specific work request checking */ - int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); - /* Notify driver a mad agent has been created */ void (*notify_create_mad_agent)(struct rvt_dev_info *rdi, int port_idx); @@ -355,12 +357,12 @@ struct rvt_dev_info { /* post send table */ const struct rvt_operation_params *post_parms; - struct rvt_mregion __rcu *dma_mr; - struct rvt_lkey_table lkey_table; - /* Driver specific helper functions */ struct rvt_driver_provided driver_f; + struct rvt_mregion __rcu *dma_mr; + struct rvt_lkey_table lkey_table; + /* Internal use */ int n_pds_allocated; spinlock_t n_pds_lock; /* Protect pd allocated count */ -- cgit v1.2.3 From 850d8fd765079d223d47de89f1f03ab95d1036cb Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 10 Nov 2016 11:30:56 +0200 Subject: IB/mlx4: Handle IPv4 header when demultiplexing MAD When MAD arrives to the hypervisor, we need to identify which slave it should be sent by destination GID. When L3 protocol is IPv4 the GRH is replaced by an IPv4 header. This patch detects when IPv4 header needs to be parsed instead of GRH. Fixes: b6ffaeffaea4 ('mlx4: In RoCE allow guests to have multiple GIDS') Signed-off-by: Moni Shoua Signed-off-by: Daniel Jurgens Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5ad43a487745..467a4b476e29 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2580,6 +2580,24 @@ void ib_dealloc_pd(struct ib_pd *pd); */ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); +/** + * ib_get_gids_from_rdma_hdr - Get sgid and dgid from GRH or IPv4 header + * work completion. + * @hdr: the L3 header to parse + * @net_type: type of header to parse + * @sgid: place to store source gid + * @dgid: place to store destination gid + */ +int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, + enum rdma_network_type net_type, + union ib_gid *sgid, union ib_gid *dgid); + +/** + * ib_get_rdma_header_version - Get the header version + * @hdr: the L3 header to parse + */ +int ib_get_rdma_header_version(const union rdma_network_hdr *hdr); + /** * ib_init_ah_from_wc - Initializes address handle attributes from a * work completion. @@ -3357,4 +3375,5 @@ int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, void ib_drain_rq(struct ib_qp *qp); void ib_drain_sq(struct ib_qp *qp); void ib_drain_qp(struct ib_qp *qp); + #endif /* IB_VERBS_H */ -- cgit v1.2.3 From 4d4099584c2c4dca6c04d78ded4cc81f50cc3634 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 19 Oct 2016 20:13:07 +0300 Subject: IB/hns: Move HNS RoCE user vendor structures This patch moves HNS vendor's specific structures to common UAPI folder which will be visible to all consumers. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/Kbuild | 1 + include/uapi/rdma/hns-abi.h | 54 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 include/uapi/rdma/hns-abi.h (limited to 'include') diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild index f14ab7ff5fee..b54f10d3c1f7 100644 --- a/include/uapi/rdma/Kbuild +++ b/include/uapi/rdma/Kbuild @@ -14,3 +14,4 @@ header-y += mlx5-abi.h header-y += mthca-abi.h header-y += nes-abi.h header-y += ocrdma-abi.h +header-y += hns-abi.h diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h new file mode 100644 index 000000000000..5d7401963e35 --- /dev/null +++ b/include/uapi/rdma/hns-abi.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef HNS_ABI_USER_H +#define HNS_ABI_USER_H + +#include + +struct hns_roce_ib_create_cq { + __u64 buf_addr; +}; + +struct hns_roce_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; + __u8 reserved[5]; +}; + +struct hns_roce_ib_alloc_ucontext_resp { + __u32 qp_tab_size; +}; +#endif /* HNS_ABI_USER_H */ -- cgit v1.2.3 From b1226c7db1d997fa6955cd3b54ba333bd0d8a29c Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Sun, 2 Oct 2016 19:10:21 -0700 Subject: vmxnet3: Move PCI Id to pci_ids.h The VMXNet3 PCI Id will be shared with our paravirtual RDMA driver. Moved it to the shared location in pci_ids.h. Suggested-by: Leon Romanovsky Acked-by: Bjorn Helgaas Reviewed-by: Yuval Shaia Signed-off-by: Adit Ranadive Signed-off-by: Doug Ledford --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index c58752fe16c4..98bb455302cf 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2251,6 +2251,7 @@ #define PCI_DEVICE_ID_RASTEL_2PORT 0x2000 #define PCI_VENDOR_ID_VMWARE 0x15ad +#define PCI_DEVICE_ID_VMWARE_VMXNET3 0x07b0 #define PCI_VENDOR_ID_ZOLTRIX 0x15b0 #define PCI_DEVICE_ID_ZOLTRIX_2BD0 0x2bd0 -- cgit v1.2.3 From e730139b3464cc740c33131c872f7d173744ef11 Mon Sep 17 00:00:00 2001 From: Jakub Pawlak Date: Wed, 7 Dec 2016 19:32:41 -0800 Subject: IB/hfi1: Disable header suppression for short packets For the received packets with payload less or equal 8DWS RxDmaDataFifoRdUncErr is not reported. There is set RHF.EccErr if the header is not suppressed. When such packet is detected on the send side the header suppression mechanism is disabled by clearing SH bit in the packet header. Reviewed-by: Mitko Haralanov Signed-off-by: Jakub Pawlak Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/uapi/rdma/hfi/hfi1_user.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index d15e7289d835..587b7360e820 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -75,7 +75,7 @@ * may not be implemented; the user code must deal with this if it * cares, or it must abort after initialization reports the difference. */ -#define HFI1_USER_SWMINOR 2 +#define HFI1_USER_SWMINOR 3 /* * We will encode the major/minor inside a single 32bit version number. -- cgit v1.2.3 From f2dc9cdce83c4aac2f8b6c803b0327df1c7d44a6 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 7 Dec 2016 19:34:06 -0800 Subject: IB/rdmavt: Add a send completion helper This is for use by client drivers to drive send completions into a CQ. A new exported table allows for the mapping of ib_wr_opcode into a ib_wc_opcode. Reviewed-by: Ashutosh Dixit Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdmavt_qp.h | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 2c5183ef0243..d78e99cf6c11 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -51,6 +51,7 @@ #include #include #include +#include /* * Atomic bit definitions for r_aflags. */ @@ -527,7 +528,44 @@ static inline void rvt_qp_wqe_unreserve( } } -extern const int ib_rvt_state_ops[]; +extern const enum ib_wc_opcode ib_rvt_wc_opcode[]; + +/** + * rvt_qp_swqe_complete() - insert send completion + * @qp - the qp + * @wqe - the send wqe + * @status - completion status + * + * Insert a send completion into the completion + * queue if the qp indicates it should be done. + * + * See IBTA 10.7.3.1 for info on completion + * control. + */ +static inline void rvt_qp_swqe_complete( + struct rvt_qp *qp, + struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) + return; + if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS) { + struct ib_wc wc; + + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr.wr_id; + wc.status = status; + wc.opcode = ib_rvt_wc_opcode[wqe->wr.opcode]; + wc.qp = &qp->ibqp; + wc.byte_len = wqe->length; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); + } +} + +extern const int ib_rvt_state_ops[]; struct rvt_dev_info; int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); -- cgit v1.2.3 From f6475223b12b3a38298976506edb042e13158798 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 7 Dec 2016 19:34:25 -0800 Subject: IB/rdmavt: Add swqe mr deref helper Add a helper to release mr references held by an swqe. Reviewed-by: Brian Welty Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdmavt_qp.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index d78e99cf6c11..04facda681e0 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -485,6 +485,23 @@ static inline void rvt_put_qp(struct rvt_qp *qp) wake_up(&qp->wait); } +/** + * rvt_put_swqe - drop mr refs held by swqe + * @wqe - the send wqe + * + * This drops any mr references held by the swqe + */ +static inline void rvt_put_swqe(struct rvt_swqe *wqe) +{ + int i; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct rvt_sge *sge = &wqe->sg_list[i]; + + rvt_put_mr(sge->mr); + } +} + /** * rvt_qp_wqe_reserve - reserve operation * @qp - the rvt qp -- cgit v1.2.3 From 5dc806052a9fc8c44e111646f9a9f436877749d0 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 7 Dec 2016 19:34:37 -0800 Subject: IB/rdmavt, IB/hfi1, IB/qib: Add inlines for mtu division Add rvt_div_round_up_mtu() and rvt_div_mtu() routines to do the computation based on the pmtu and the log_pmtu. Change divides in qib, hfi1 to use the new inlines. Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdmavt_qp.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 04facda681e0..f3dbd157ae5c 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -582,7 +582,29 @@ static inline void rvt_qp_swqe_complete( } } -extern const int ib_rvt_state_ops[]; +/** + * @qp - the qp pair + * @len - the length + * + * Perform a shift based mtu round up divide + */ +static inline u32 rvt_div_round_up_mtu(struct rvt_qp *qp, u32 len) +{ + return (len + qp->pmtu - 1) >> qp->log_pmtu; +} + +/** + * @qp - the qp pair + * @len - the length + * + * Perform a shift based mtu divide + */ +static inline u32 rvt_div_mtu(struct rvt_qp *qp, u32 len) +{ + return len >> qp->log_pmtu; +} + +extern const int ib_rvt_state_ops[]; struct rvt_dev_info; int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); -- cgit v1.2.3 From c226dc22ec4904340e3e14a536983cda3dbe7914 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 31 Oct 2016 12:15:20 +0200 Subject: net/mlx5: Report multi packet WQE capabilities Multi packet WQE enables sending multiple fix sized packets using a single WQE. The exposed field reports such HW support. Signed-off-by: Bodong Wang Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2632cb2caf10..0779ad2e8f51 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -576,7 +576,7 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 self_lb_en_modifiable[0x1]; u8 reserved_at_9[0x2]; u8 max_lso_cap[0x5]; - u8 reserved_at_10[0x2]; + u8 multi_pkt_send_wqe[0x2]; u8 wqe_inline_mode[0x2]; u8 rss_ind_tbl_cap[0x4]; u8 reg_umr_sq[0x1]; -- cgit v1.2.3 From 191ded4a4d991acf17207e0b4370fef070bce3e9 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Mon, 31 Oct 2016 12:15:21 +0200 Subject: IB/mlx5: Report mlx5 multi packet WQE caps during query The capabilities whether hardware support multi packet WQE or not is exposed to user space through query_device by uhw. Signed-off-by: Bodong Wang Reviewed-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/mlx5-abi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index f5d0f4e83b59..93d6b9fe8e78 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -129,6 +129,8 @@ struct mlx5_ib_query_device_resp { __u32 response_length; struct mlx5_ib_tso_caps tso_caps; struct mlx5_ib_rss_caps rss_caps; + __u32 mlx5_ib_support_multi_pkt_send_wqes; + __u32 reserved; }; struct mlx5_ib_create_cq { -- cgit v1.2.3 From 7e43a2a5bae39fedaa7cce21d637e0c8d96d8e54 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Mon, 31 Oct 2016 12:16:44 +0200 Subject: IB/mlx5: Report mlx5 CQE compression caps during query The capabilities include: - Max number of compressed and aggregated CQEs in a single session, while zero means unsupported. - For Responder, there are two formats of mini CQE: mini CQE with Rx hash and mini CQE with checksum. They're mutual exclusive. Signed-off-by: Bodong Wang Reviewed-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/mlx5-abi.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 93d6b9fe8e78..6649d13a2dbb 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -124,11 +124,23 @@ struct mlx5_ib_rss_caps { __u8 reserved[7]; }; +enum mlx5_ib_cqe_comp_res_format { + MLX5_IB_CQE_RES_FORMAT_HASH = 1 << 0, + MLX5_IB_CQE_RES_FORMAT_CSUM = 1 << 1, + MLX5_IB_CQE_RES_RESERVED = 1 << 2, +}; + +struct mlx5_ib_cqe_comp_caps { + __u32 max_num; + __u32 supported_format; /* enum mlx5_ib_cqe_comp_res_format */ +}; + struct mlx5_ib_query_device_resp { __u32 comp_mask; __u32 response_length; struct mlx5_ib_tso_caps tso_caps; struct mlx5_ib_rss_caps rss_caps; + struct mlx5_ib_cqe_comp_caps cqe_comp_caps; __u32 mlx5_ib_support_multi_pkt_send_wqes; __u32 reserved; }; -- cgit v1.2.3 From 1cbe6fc86ccfe05a910be4883da7c7bd28c190fe Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Mon, 31 Oct 2016 12:16:45 +0200 Subject: IB/mlx5: Add support for CQE compressing CQE compressing reduces PCI overhead by coalescing and compressing multiple CQEs into a single merged CQE. Successful compressing improves message rate especially for small packet traffic. CQE compressing is supported for all 64B CQE formats (with certain limitations) generated by RQ/Responder or by SQ/Requestor. Signed-off-by: Bodong Wang Reviewed-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/mlx5-abi.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 6649d13a2dbb..b0a41c8dc1fb 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -149,7 +149,9 @@ struct mlx5_ib_create_cq { __u64 buf_addr; __u64 db_addr; __u32 cqe_size; - __u32 reserved; /* explicit padding (optional on i386) */ + __u8 cqe_comp_en; + __u8 cqe_comp_res_format; + __u16 reserved; /* explicit padding (optional on i386) */ }; struct mlx5_ib_create_cq_resp { -- cgit v1.2.3 From 0dbf3332b7b683db33a385a3ce9baab157e3ff9a Mon Sep 17 00:00:00 2001 From: Moses Reuben Date: Mon, 14 Nov 2016 19:04:47 +0200 Subject: IB/core: Add flow spec tunneling support In order to support tunneling, that can be used by the QP, both struct ib_flow_spec_tunnel and struct ib_flow_tunnel_filter can be used to more IP or UDP based tunneling protocols (e.g NVGRE, GRE, etc). IB_FLOW_SPEC_VXLAN_TUNNEL type flow specification is added to use this functionality and match specific Vxlan packets. In similar to IPv6, we check overflow of the vni value by comparing with the maximum size. Signed-off-by: Moses Reuben Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 467a4b476e29..b8213818de89 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1599,7 +1599,8 @@ enum ib_flow_spec_type { IB_FLOW_SPEC_IPV6 = 0x31, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, - IB_FLOW_SPEC_UDP = 0x41 + IB_FLOW_SPEC_UDP = 0x41, + IB_FLOW_SPEC_VXLAN_TUNNEL = 0x50, }; #define IB_FLOW_SPEC_LAYER_MASK 0xF0 #define IB_FLOW_SPEC_SUPPORT_LAYERS 4 @@ -1707,6 +1708,21 @@ struct ib_flow_spec_tcp_udp { struct ib_flow_tcp_udp_filter mask; }; +struct ib_flow_tunnel_filter { + __be32 tunnel_id; + u8 real_sz[0]; +}; + +/* ib_flow_spec_tunnel describes the Vxlan tunnel + * the tunnel_id from val has the vni value + */ +struct ib_flow_spec_tunnel { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_tunnel_filter val; + struct ib_flow_tunnel_filter mask; +}; + union ib_flow_spec { struct { enum ib_flow_spec_type type; @@ -1717,6 +1733,7 @@ union ib_flow_spec { struct ib_flow_spec_ipv4 ipv4; struct ib_flow_spec_tcp_udp tcp_udp; struct ib_flow_spec_ipv6 ipv6; + struct ib_flow_spec_tunnel tunnel; }; struct ib_flow_attr { -- cgit v1.2.3 From 76bd23b34204cad78f48aec4cef38869a66aa594 Mon Sep 17 00:00:00 2001 From: Moses Reuben Date: Mon, 14 Nov 2016 19:04:48 +0200 Subject: IB/core: Align structure ib_flow_spec_type Aligned the structure ib_flow_spec_type indentation, after adding a new definition. Signed-off-by: Moses Reuben Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b8213818de89..4bc748fddcac 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1592,14 +1592,14 @@ enum ib_flow_attr_type { /* Supported steering header types */ enum ib_flow_spec_type { /* L2 headers*/ - IB_FLOW_SPEC_ETH = 0x20, - IB_FLOW_SPEC_IB = 0x22, + IB_FLOW_SPEC_ETH = 0x20, + IB_FLOW_SPEC_IB = 0x22, /* L3 header*/ - IB_FLOW_SPEC_IPV4 = 0x30, - IB_FLOW_SPEC_IPV6 = 0x31, + IB_FLOW_SPEC_IPV4 = 0x30, + IB_FLOW_SPEC_IPV6 = 0x31, /* L4 headers*/ - IB_FLOW_SPEC_TCP = 0x40, - IB_FLOW_SPEC_UDP = 0x41, + IB_FLOW_SPEC_TCP = 0x40, + IB_FLOW_SPEC_UDP = 0x41, IB_FLOW_SPEC_VXLAN_TUNNEL = 0x50, }; #define IB_FLOW_SPEC_LAYER_MASK 0xF0 -- cgit v1.2.3 From a0cb4c759af12943806564294fa53ab08cb7cf93 Mon Sep 17 00:00:00 2001 From: Moses Reuben Date: Mon, 14 Nov 2016 19:04:49 +0200 Subject: IB/uverbs: Add support for Vxlan protocol Add ib_uverbs_flow_spec_tunnel to define the rule to match Vxlan, the type, size, reserved fields are identical to rest of the protocols, and are used to identify the spec. The tunnel id is the vni value of the Vxlan protocol, and it is used as part of the steering rule, it is limited by the mask. The steering rule configured on the hardware does a match according to vni and other protocols. In the same way as rest of the protocols that we match, the uniq field's of each protocol are represented on the val and the mask. Signed-off-by: Moses Reuben Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/ib_user_verbs.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 25225ebbc7d5..90ba5e88ec00 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -908,6 +908,23 @@ struct ib_uverbs_flow_spec_ipv6 { struct ib_uverbs_flow_ipv6_filter mask; }; +struct ib_uverbs_flow_tunnel_filter { + __be32 tunnel_id; +}; + +struct ib_uverbs_flow_spec_tunnel { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_tunnel_filter val; + struct ib_uverbs_flow_tunnel_filter mask; +}; + struct ib_uverbs_flow_attr { __u32 type; __u16 size; -- cgit v1.2.3 From fbf46860b19ddb485f00bef1ad1a43aabc9f71ad Mon Sep 17 00:00:00 2001 From: Moses Reuben Date: Mon, 14 Nov 2016 19:04:51 +0200 Subject: IB/core: Introduce inner flow steering For a tunneled packet which contains external and internal headers, we refer to the external headers as "outer fields" and the internal headers as "inner fields". Example of a tunneled packet: { L2 | L3 | L4 | tunnel header | L2 | L3 | l4 | data } | | | | | | | { outer fields }{ inner fields } This patch introduces a new flag for flow steering rules - IB_FLOW_SPEC_INNER - which specifies that the rule applies to the inner fields, rather than to the outer fields of the packet. Signed-off-by: Moses Reuben Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4bc748fddcac..6d0dd6525e14 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1601,9 +1601,10 @@ enum ib_flow_spec_type { IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41, IB_FLOW_SPEC_VXLAN_TUNNEL = 0x50, + IB_FLOW_SPEC_INNER = 0x100, }; #define IB_FLOW_SPEC_LAYER_MASK 0xF0 -#define IB_FLOW_SPEC_SUPPORT_LAYERS 4 +#define IB_FLOW_SPEC_SUPPORT_LAYERS 8 /* Flow steering rule priority is set according to it's domain. * Lower domain value means higher priority. @@ -1631,7 +1632,7 @@ struct ib_flow_eth_filter { }; struct ib_flow_spec_eth { - enum ib_flow_spec_type type; + u32 type; u16 size; struct ib_flow_eth_filter val; struct ib_flow_eth_filter mask; @@ -1645,7 +1646,7 @@ struct ib_flow_ib_filter { }; struct ib_flow_spec_ib { - enum ib_flow_spec_type type; + u32 type; u16 size; struct ib_flow_ib_filter val; struct ib_flow_ib_filter mask; @@ -1670,7 +1671,7 @@ struct ib_flow_ipv4_filter { }; struct ib_flow_spec_ipv4 { - enum ib_flow_spec_type type; + u32 type; u16 size; struct ib_flow_ipv4_filter val; struct ib_flow_ipv4_filter mask; @@ -1688,7 +1689,7 @@ struct ib_flow_ipv6_filter { }; struct ib_flow_spec_ipv6 { - enum ib_flow_spec_type type; + u32 type; u16 size; struct ib_flow_ipv6_filter val; struct ib_flow_ipv6_filter mask; @@ -1702,7 +1703,7 @@ struct ib_flow_tcp_udp_filter { }; struct ib_flow_spec_tcp_udp { - enum ib_flow_spec_type type; + u32 type; u16 size; struct ib_flow_tcp_udp_filter val; struct ib_flow_tcp_udp_filter mask; @@ -1717,7 +1718,7 @@ struct ib_flow_tunnel_filter { * the tunnel_id from val has the vni value */ struct ib_flow_spec_tunnel { - enum ib_flow_spec_type type; + u32 type; u16 size; struct ib_flow_tunnel_filter val; struct ib_flow_tunnel_filter mask; @@ -1725,7 +1726,7 @@ struct ib_flow_spec_tunnel { union ib_flow_spec { struct { - enum ib_flow_spec_type type; + u32 type; u16 size; }; struct ib_flow_spec_eth eth; -- cgit v1.2.3 From c90ea9d8e51196d9c528e57d9ab09ee7d41f0ba0 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 23 Nov 2016 08:23:22 +0200 Subject: IB/core: Change ib_resolve_eth_dmac to use it in create AH The function ib_resolve_eth_dmac() requires struct qp_attr * and qp_attr_mask as parameters while the function might be useful to resolve dmac for address handles. This patch changes the signature of the function so it can be used in the flow of creating an address handle. Signed-off-by: Moni Shoua Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6d0dd6525e14..0c6f973e407c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3394,4 +3394,6 @@ void ib_drain_rq(struct ib_qp *qp); void ib_drain_sq(struct ib_qp *qp); void ib_drain_qp(struct ib_qp *qp); +int ib_resolve_eth_dmac(struct ib_device *device, + struct ib_ah_attr *ah_attr); #endif /* IB_VERBS_H */ -- cgit v1.2.3 From 6ad279c5a2e55bf2bd100b4222090d4717de88d5 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 23 Nov 2016 08:23:23 +0200 Subject: IB/mlx5: Report that device has udata response in create_ah To make mlx5 user driver aware of whether kernel driver returns dmac in user data response add a new flag that will be returned back to user-space through alloc_ucontext. Signed-off-by: Moni Shoua Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/mlx5-abi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index b0a41c8dc1fb..ac28729a1245 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -82,6 +82,7 @@ enum mlx5_ib_alloc_ucontext_resp_mask { enum mlx5_user_cmds_supp_uhw { MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE = 1 << 0, + MLX5_USER_CMDS_SUPP_UHW_CREATE_AH = 1 << 1, }; struct mlx5_ib_alloc_ucontext_resp { -- cgit v1.2.3 From 477864c8fcd953e5a988073ca5be18bb7fd93410 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 23 Nov 2016 08:23:24 +0200 Subject: IB/core: Let create_ah return extended response to user Add struct ib_udata to the signature of create_ah callback that is implemented by IB device drivers. This allows HW drivers to return extra data to the userspace library. This patch prepares the ground for mlx5 driver to resolve destination mac address for a given GID and return it to userspace. This patch was previously submitted by Knut Omang as a part of the patch set to support Oracle's Infiniband HCA (SIF). Signed-off-by: Knut Omang Signed-off-by: Moni Shoua Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0c6f973e407c..73417a22ee4d 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1951,7 +1951,8 @@ struct ib_device { struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd); struct ib_ah * (*create_ah)(struct ib_pd *pd, - struct ib_ah_attr *ah_attr); + struct ib_ah_attr *ah_attr, + struct ib_udata *udata); int (*modify_ah)(struct ib_ah *ah, struct ib_ah_attr *ah_attr); int (*query_ah)(struct ib_ah *ah, -- cgit v1.2.3 From 5097e71f3edafad3e7d8d4f9c4a137d9aad0fae2 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 23 Nov 2016 08:23:25 +0200 Subject: IB/mlx5: Use kernel driver to help userspace create ah Resolving a MAC address for a given IP address in userspace is inefficient. This patch lets mlx5 user driver using the kernel driver to resolve the mac and get the answer in the private section of the response. Signed-off-by: Moni Shoua Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/mlx5-abi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index ac28729a1245..3ebf3db24c34 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -249,6 +249,12 @@ struct mlx5_ib_create_wq { __u32 reserved; }; +struct mlx5_ib_create_ah_resp { + __u32 response_length; + __u8 dmac[ETH_ALEN]; + __u8 reserved[6]; +}; + struct mlx5_ib_create_wq_resp { __u32 response_length; __u32 reserved; -- cgit v1.2.3 From d949167d68b304c0a00331cf33ef49a29b65d85f Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Thu, 1 Dec 2016 13:43:13 +0200 Subject: IB/mlx5: Report mlx5 packet pacing capabilities when querying device Enable mlx5 based hardware to report packet pacing capabilities from kernel to user space. Packet pacing allows to limit the rate to any number between the maximum and minimum, based on user settings. The capabilities are exposed to user space through query_device by uhw. The following capabilities are reported: 1. The maximum and minimum rate limit in kbps supported by packet pacing. 2. Bitmap showing which QP types are supported by packet pacing operation. Signed-off-by: Bodong Wang Reviewed-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/mlx5-abi.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 3ebf3db24c34..fae6cdaeb56d 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -136,12 +136,25 @@ struct mlx5_ib_cqe_comp_caps { __u32 supported_format; /* enum mlx5_ib_cqe_comp_res_format */ }; +struct mlx5_packet_pacing_caps { + __u32 qp_rate_limit_min; + __u32 qp_rate_limit_max; /* In kpbs */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_RAW_PACKET + */ + __u32 supported_qpts; + __u32 reserved; +}; + struct mlx5_ib_query_device_resp { __u32 comp_mask; __u32 response_length; struct mlx5_ib_tso_caps tso_caps; struct mlx5_ib_rss_caps rss_caps; struct mlx5_ib_cqe_comp_caps cqe_comp_caps; + struct mlx5_packet_pacing_caps packet_pacing_caps; __u32 mlx5_ib_support_multi_pkt_send_wqes; __u32 reserved; }; -- cgit v1.2.3 From 528e5a1bd3f0e9b760cb3a1062fce7513712a15d Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Thu, 1 Dec 2016 13:43:14 +0200 Subject: IB/core: Support rate limit for packet pacing Add new member rate_limit to ib_qp_attr which holds the packet pacing rate in kbps, 0 means unlimited. IB_QP_RATE_LIMIT is added to ib_attr_mask and could be used by RAW QPs when changing QP state from RTR to RTS, RTS to RTS. Signed-off-by: Bodong Wang Reviewed-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 73417a22ee4d..8029d2a51f14 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1102,6 +1102,7 @@ enum ib_qp_attr_mask { IB_QP_RESERVED2 = (1<<22), IB_QP_RESERVED3 = (1<<23), IB_QP_RESERVED4 = (1<<24), + IB_QP_RATE_LIMIT = (1<<25), }; enum ib_qp_state { @@ -1151,6 +1152,7 @@ struct ib_qp_attr { u8 rnr_retry; u8 alt_port_num; u8 alt_timeout; + u32 rate_limit; }; enum ib_wr_opcode { -- cgit v1.2.3 From 189aba99e70030cfb56bd8f199bc5b077a1bc6ff Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Thu, 1 Dec 2016 13:43:15 +0200 Subject: IB/uverbs: Extend modify_qp and support packet pacing An new uverbs command ib_uverbs_ex_modify_qp is added to support more QP attributes. User driver should choose to call the legacy/extended API based on input mask. IB_USER_LAST_QP_ATTR_MASK is added to indicated the maximum bit position which supports legacy ib_uverbs_modify_qp. IB_USER_LEGACY_LAST_QP_ATTR_MASK indicates the maximum bit position which supports ib_uverbs_ex_modify_qp, the value of this mask should be updated if new mask is added later. Along with this change, rate_limit is supported by the extended command, user driver could use it to control packet packing. Signed-off-by: Bodong Wang Reviewed-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/ib_user_verbs.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 90ba5e88ec00..dfdfe4e92d31 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -37,6 +37,7 @@ #define IB_USER_VERBS_H #include +#include /* * Increment this value if any changes that break userspace ABI @@ -93,6 +94,7 @@ enum { IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, IB_USER_VERBS_EX_CMD_CREATE_CQ = IB_USER_VERBS_CMD_CREATE_CQ, IB_USER_VERBS_EX_CMD_CREATE_QP = IB_USER_VERBS_CMD_CREATE_QP, + IB_USER_VERBS_EX_CMD_MODIFY_QP = IB_USER_VERBS_CMD_MODIFY_QP, IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, IB_USER_VERBS_EX_CMD_DESTROY_FLOW, IB_USER_VERBS_EX_CMD_CREATE_WQ, @@ -545,6 +547,14 @@ enum { IB_UVERBS_CREATE_QP_SUP_COMP_MASK = IB_UVERBS_CREATE_QP_MASK_IND_TABLE, }; +enum { + IB_USER_LEGACY_LAST_QP_ATTR_MASK = IB_QP_DEST_QPN +}; + +enum { + IB_USER_LAST_QP_ATTR_MASK = IB_QP_RATE_LIMIT +}; + struct ib_uverbs_ex_create_qp { __u64 user_handle; __u32 pd_handle; @@ -684,9 +694,20 @@ struct ib_uverbs_modify_qp { __u64 driver_data[0]; }; +struct ib_uverbs_ex_modify_qp { + struct ib_uverbs_modify_qp base; + __u32 rate_limit; + __u32 reserved; +}; + struct ib_uverbs_modify_qp_resp { }; +struct ib_uverbs_ex_modify_qp_resp { + __u32 comp_mask; + __u32 response_length; +}; + struct ib_uverbs_destroy_qp { __u64 response; __u32 qp_handle; -- cgit v1.2.3 From 9fa240bbfc4200b080c8fad12579659c2c2f36b5 Mon Sep 17 00:00:00 2001 From: Hal Rosenstock Date: Tue, 18 Oct 2016 13:20:29 -0400 Subject: IB/mad: Eliminate redundant SM class version defines for OPA and rename class version define to indicate SM rather than SMP or SMI Signed-off-by: Hal Rosenstock Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- include/rdma/ib_mad.h | 2 +- include/rdma/opa_smi.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index c8a773ffe23b..981214b3790c 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -46,7 +46,7 @@ #define IB_MGMT_BASE_VERSION 1 #define OPA_MGMT_BASE_VERSION 0x80 -#define OPA_SMP_CLASS_VERSION 0x80 +#define OPA_SM_CLASS_VERSION 0x80 /* Management classes */ #define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01 diff --git a/include/rdma/opa_smi.h b/include/rdma/opa_smi.h index 4a529ef47995..f7896117936e 100644 --- a/include/rdma/opa_smi.h +++ b/include/rdma/opa_smi.h @@ -44,8 +44,6 @@ #define OPA_MAX_SLS 32 #define OPA_MAX_SCS 32 -#define OPA_SMI_CLASS_VERSION 0x80 - #define OPA_LID_PERMISSIVE cpu_to_be32(0xFFFFFFFF) struct opa_smp { -- cgit v1.2.3 From 77a5db13153906a7e00740b10b2730e53385c5a8 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 26 Oct 2016 12:36:40 -0700 Subject: rdma_cm: add rdma_reject_msg() helper function rdma_reject_msg() returns a pointer to a string message associated with the transport reject reason codes. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Steve Wise Reviewed-by: Bart Van Assche Signed-off-by: Doug Ledford --- include/rdma/ib_cm.h | 6 ++++++ include/rdma/iw_cm.h | 6 ++++++ include/rdma/rdma_cm.h | 8 ++++++++ 3 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index 92a7d85917b4..b49258b16f4e 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -603,4 +603,10 @@ struct ib_cm_sidr_rep_param { int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_param *param); +/** + * ibcm_reject_msg - return a pointer to a reject message string. + * @reason: Value returned in the REJECT event status field. + */ +const char *__attribute_const__ ibcm_reject_msg(int reason); + #endif /* IB_CM_H */ diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 6d0065c322b7..5cd7701db148 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -253,4 +253,10 @@ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt); int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask); +/** + * iwcm_reject_msg - return a pointer to a reject message string. + * @reason: Value returned in the REJECT event status field. + */ +const char *__attribute_const__ iwcm_reject_msg(int reason); + #endif /* IW_CM_H */ diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 81fb1d15e8bb..f11a768be06b 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -388,4 +388,12 @@ int rdma_set_afonly(struct rdma_cm_id *id, int afonly); */ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr); +/** + * rdma_reject_msg - return a pointer to a reject message string. + * @id: Communication identifier that received the REJECT event. + * @reason: Value returned in the REJECT event status field. + */ +const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, + int reason); + #endif /* RDMA_CM_H */ -- cgit v1.2.3 From 5042a73d3e9de7bcc2a31adea08ee95bbce998dc Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 26 Oct 2016 12:36:47 -0700 Subject: rdma_cm: add rdma_is_consumer_reject() helper function Return true if the peer consumer application rejected the connection attempt. Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Steve Wise Reviewed-by: Bart Van Assche Signed-off-by: Doug Ledford --- include/rdma/rdma_cm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index f11a768be06b..62039c2fd951 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -395,5 +395,12 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr); */ const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, int reason); +/** + * rdma_is_consumer_reject - return true if the consumer rejected the connect + * request. + * @id: Communication identifier that received the REJECT event. + * @reason: Value returned in the REJECT event status field. + */ +bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason); #endif /* RDMA_CM_H */ -- cgit v1.2.3 From 5f24410408fd093734ce758f2fe3a66fe543de22 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 26 Oct 2016 12:36:47 -0700 Subject: rdma_cm: add rdma_consumer_reject_data helper function rdma_consumer_reject_data() will return the private data pointer and length if any is available. Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- include/rdma/rdma_cm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 62039c2fd951..d3968b561f86 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -403,4 +403,14 @@ const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, */ bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason); +/** + * rdma_consumer_reject_data - return the consumer reject private data and + * length, if any. + * @id: Communication identifier that received the REJECT event. + * @ev: RDMA CM reject event. + * @data_len: Pointer to the resulting length of the consumer data. + */ +const void *rdma_consumer_reject_data(struct rdma_cm_id *id, + struct rdma_cm_event *ev, u8 *data_len); + #endif /* RDMA_CM_H */ -- cgit v1.2.3 From 35493294df1a1b06268a4e7b77b7a323772779f3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 27 Oct 2016 10:51:17 -0600 Subject: rdma UAPI: Use __kernel_sockaddr_storage The kernel side is #ifdef'd to this type, and the UAPI header should use it directly. It has slightly different alignment requirments from the usual user space version. Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/rdma_user_cm.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 01923d463673..d71da36e3cd6 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -110,7 +110,7 @@ struct rdma_ucm_bind { __u32 id; __u16 addr_size; __u16 reserved; - struct sockaddr_storage addr; + struct __kernel_sockaddr_storage addr; }; struct rdma_ucm_resolve_ip { @@ -126,8 +126,8 @@ struct rdma_ucm_resolve_addr { __u16 src_size; __u16 dst_size; __u32 reserved; - struct sockaddr_storage src_addr; - struct sockaddr_storage dst_addr; + struct __kernel_sockaddr_storage src_addr; + struct __kernel_sockaddr_storage dst_addr; }; struct rdma_ucm_resolve_route { @@ -164,8 +164,8 @@ struct rdma_ucm_query_addr_resp { __u16 pkey; __u16 src_size; __u16 dst_size; - struct sockaddr_storage src_addr; - struct sockaddr_storage dst_addr; + struct __kernel_sockaddr_storage src_addr; + struct __kernel_sockaddr_storage dst_addr; }; struct rdma_ucm_query_path_resp { @@ -257,7 +257,7 @@ struct rdma_ucm_join_mcast { __u32 id; __u16 addr_size; __u16 join_flags; - struct sockaddr_storage addr; + struct __kernel_sockaddr_storage addr; }; struct rdma_ucm_get_event { -- cgit v1.2.3 From 29c8d9eba550c6d73d17cc1618a9f5f2a7345aa1 Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Sun, 2 Oct 2016 19:10:22 -0700 Subject: IB: Add vmw_pvrdma driver This patch series adds a driver for a paravirtual RDMA device. The device is developed for VMware's Virtual Machines and allows existing RDMA applications to continue to use existing Verbs API when deployed in VMs on ESXi. We recently did a presentation in the OFA Workshop [1] regarding this device. Description and RDMA Support ============================ The virtual device is exposed as a dual function PCIe device. One part is a virtual network device (VMXNet3) which provides networking properties like MAC, IP addresses to the RDMA part of the device. The networking properties are used to register GIDs required by RDMA applications to communicate. These patches add support and the all required infrastructure for letting applications use such a device. We support the mandatory Verbs API as well as the base memory management extensions (Local Inv, Send with Inv and Fast Register Work Requests). We currently support both Reliable Connected and Unreliable Datagram QPs but do not support Shared Receive Queues (SRQs). Also, we support the following types of Work Requests: o Send/Receive (with or without Immediate Data) o RDMA Write (with or without Immediate Data) o RDMA Read o Local Invalidate o Send with Invalidate o Fast Register Work Requests This version only adds support for version 1 of RoCE. We will add RoCEv2 support in a future patch. We do support registration of both MAC-based and IP-based GIDs. I have also created a git tree for our user-level driver [2]. Testing ======= We have tested this internally for various types of Guest OS - Red Hat, Centos, Ubuntu 12.04/14.04/16.04, Oracle Enterprise Linux, SLES 12 using backported versions of this driver. The tests included several runs of the performance tests (included with OFED), Intel MPI PingPong benchmark on OpenMPI, krping for FRWRs. Mellanox has been kind enough to test the backported version of the driver internally on their hardware using a VMware provided ESX build. I have also applied and tested this with Doug's k.o/for-4.9 branch (commit 5603910b). Note, that this patch series should be applied all together. I split out the commits so that it may be easier to review. PVRDMA Resources ================ [1] OFA Workshop Presentation - https://openfabrics.org/images/eventpresos/2016presentations/102parardma.pdf [2] Libpvrdma User-level library - http://git.openfabrics.org/?p=~aditr/libpvrdma.git;a=summary Reviewed-by: Jorgen Hansen Reviewed-by: George Zhang Reviewed-by: Aditya Sarwade Reviewed-by: Bryan Tan Reviewed-by: Leon Romanovsky Signed-off-by: Adit Ranadive Signed-off-by: Doug Ledford --- include/uapi/rdma/Kbuild | 1 + include/uapi/rdma/vmw_pvrdma-abi.h | 289 +++++++++++++++++++++++++++++++++++++ 2 files changed, 290 insertions(+) create mode 100644 include/uapi/rdma/vmw_pvrdma-abi.h (limited to 'include') diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild index f14ab7ff5fee..6a8a934c540c 100644 --- a/include/uapi/rdma/Kbuild +++ b/include/uapi/rdma/Kbuild @@ -14,3 +14,4 @@ header-y += mlx5-abi.h header-y += mthca-abi.h header-y += nes-abi.h header-y += ocrdma-abi.h +header-y += vmw_pvrdma-abi.h diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h new file mode 100644 index 000000000000..5016abc9ee97 --- /dev/null +++ b/include/uapi/rdma/vmw_pvrdma-abi.h @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __VMW_PVRDMA_ABI_H__ +#define __VMW_PVRDMA_ABI_H__ + +#include + +#define PVRDMA_UVERBS_ABI_VERSION 3 /* ABI Version. */ +#define PVRDMA_UAR_HANDLE_MASK 0x00FFFFFF /* Bottom 24 bits. */ +#define PVRDMA_UAR_QP_OFFSET 0 /* QP doorbell. */ +#define PVRDMA_UAR_QP_SEND BIT(30) /* Send bit. */ +#define PVRDMA_UAR_QP_RECV BIT(31) /* Recv bit. */ +#define PVRDMA_UAR_CQ_OFFSET 4 /* CQ doorbell. */ +#define PVRDMA_UAR_CQ_ARM_SOL BIT(29) /* Arm solicited bit. */ +#define PVRDMA_UAR_CQ_ARM BIT(30) /* Arm bit. */ +#define PVRDMA_UAR_CQ_POLL BIT(31) /* Poll bit. */ + +enum pvrdma_wr_opcode { + PVRDMA_WR_RDMA_WRITE, + PVRDMA_WR_RDMA_WRITE_WITH_IMM, + PVRDMA_WR_SEND, + PVRDMA_WR_SEND_WITH_IMM, + PVRDMA_WR_RDMA_READ, + PVRDMA_WR_ATOMIC_CMP_AND_SWP, + PVRDMA_WR_ATOMIC_FETCH_AND_ADD, + PVRDMA_WR_LSO, + PVRDMA_WR_SEND_WITH_INV, + PVRDMA_WR_RDMA_READ_WITH_INV, + PVRDMA_WR_LOCAL_INV, + PVRDMA_WR_FAST_REG_MR, + PVRDMA_WR_MASKED_ATOMIC_CMP_AND_SWP, + PVRDMA_WR_MASKED_ATOMIC_FETCH_AND_ADD, + PVRDMA_WR_BIND_MW, + PVRDMA_WR_REG_SIG_MR, +}; + +enum pvrdma_wc_status { + PVRDMA_WC_SUCCESS, + PVRDMA_WC_LOC_LEN_ERR, + PVRDMA_WC_LOC_QP_OP_ERR, + PVRDMA_WC_LOC_EEC_OP_ERR, + PVRDMA_WC_LOC_PROT_ERR, + PVRDMA_WC_WR_FLUSH_ERR, + PVRDMA_WC_MW_BIND_ERR, + PVRDMA_WC_BAD_RESP_ERR, + PVRDMA_WC_LOC_ACCESS_ERR, + PVRDMA_WC_REM_INV_REQ_ERR, + PVRDMA_WC_REM_ACCESS_ERR, + PVRDMA_WC_REM_OP_ERR, + PVRDMA_WC_RETRY_EXC_ERR, + PVRDMA_WC_RNR_RETRY_EXC_ERR, + PVRDMA_WC_LOC_RDD_VIOL_ERR, + PVRDMA_WC_REM_INV_RD_REQ_ERR, + PVRDMA_WC_REM_ABORT_ERR, + PVRDMA_WC_INV_EECN_ERR, + PVRDMA_WC_INV_EEC_STATE_ERR, + PVRDMA_WC_FATAL_ERR, + PVRDMA_WC_RESP_TIMEOUT_ERR, + PVRDMA_WC_GENERAL_ERR, +}; + +enum pvrdma_wc_opcode { + PVRDMA_WC_SEND, + PVRDMA_WC_RDMA_WRITE, + PVRDMA_WC_RDMA_READ, + PVRDMA_WC_COMP_SWAP, + PVRDMA_WC_FETCH_ADD, + PVRDMA_WC_BIND_MW, + PVRDMA_WC_LSO, + PVRDMA_WC_LOCAL_INV, + PVRDMA_WC_FAST_REG_MR, + PVRDMA_WC_MASKED_COMP_SWAP, + PVRDMA_WC_MASKED_FETCH_ADD, + PVRDMA_WC_RECV = 1 << 7, + PVRDMA_WC_RECV_RDMA_WITH_IMM, +}; + +enum pvrdma_wc_flags { + PVRDMA_WC_GRH = 1 << 0, + PVRDMA_WC_WITH_IMM = 1 << 1, + PVRDMA_WC_WITH_INVALIDATE = 1 << 2, + PVRDMA_WC_IP_CSUM_OK = 1 << 3, + PVRDMA_WC_WITH_SMAC = 1 << 4, + PVRDMA_WC_WITH_VLAN = 1 << 5, + PVRDMA_WC_FLAGS_MAX = PVRDMA_WC_WITH_VLAN, +}; + +struct pvrdma_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 reserved; +}; + +struct pvrdma_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct pvrdma_create_cq { + __u64 buf_addr; + __u32 buf_size; + __u32 reserved; +}; + +struct pvrdma_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct pvrdma_resize_cq { + __u64 buf_addr; + __u32 buf_size; + __u32 reserved; +}; + +struct pvrdma_create_srq { + __u64 buf_addr; +}; + +struct pvrdma_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct pvrdma_create_qp { + __u64 rbuf_addr; + __u64 sbuf_addr; + __u32 rbuf_size; + __u32 sbuf_size; + __u64 qp_addr; +}; + +/* PVRDMA masked atomic compare and swap */ +struct pvrdma_ex_cmp_swap { + __u64 swap_val; + __u64 compare_val; + __u64 swap_mask; + __u64 compare_mask; +}; + +/* PVRDMA masked atomic fetch and add */ +struct pvrdma_ex_fetch_add { + __u64 add_val; + __u64 field_boundary; +}; + +/* PVRDMA address vector. */ +struct pvrdma_av { + __u32 port_pd; + __u32 sl_tclass_flowlabel; + __u8 dgid[16]; + __u8 src_path_bits; + __u8 gid_index; + __u8 stat_rate; + __u8 hop_limit; + __u8 dmac[6]; + __u8 reserved[6]; +}; + +/* PVRDMA scatter/gather entry */ +struct pvrdma_sge { + __u64 addr; + __u32 length; + __u32 lkey; +}; + +/* PVRDMA receive queue work request */ +struct pvrdma_rq_wqe_hdr { + __u64 wr_id; /* wr id */ + __u32 num_sge; /* size of s/g array */ + __u32 total_len; /* reserved */ +}; +/* Use pvrdma_sge (ib_sge) for receive queue s/g array elements. */ + +/* PVRDMA send queue work request */ +struct pvrdma_sq_wqe_hdr { + __u64 wr_id; /* wr id */ + __u32 num_sge; /* size of s/g array */ + __u32 total_len; /* reserved */ + __u32 opcode; /* operation type */ + __u32 send_flags; /* wr flags */ + union { + __u32 imm_data; + __u32 invalidate_rkey; + } ex; + __u32 reserved; + union { + struct { + __u64 remote_addr; + __u32 rkey; + __u8 reserved[4]; + } rdma; + struct { + __u64 remote_addr; + __u64 compare_add; + __u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __u64 remote_addr; + __u32 log_arg_sz; + __u32 rkey; + union { + struct pvrdma_ex_cmp_swap cmp_swap; + struct pvrdma_ex_fetch_add fetch_add; + } wr_data; + } masked_atomics; + struct { + __u64 iova_start; + __u64 pl_pdir_dma; + __u32 page_shift; + __u32 page_list_len; + __u32 length; + __u32 access_flags; + __u32 rkey; + } fast_reg; + struct { + __u32 remote_qpn; + __u32 remote_qkey; + struct pvrdma_av av; + } ud; + } wr; +}; +/* Use pvrdma_sge (ib_sge) for send queue s/g array elements. */ + +/* Completion queue element. */ +struct pvrdma_cqe { + __u64 wr_id; + __u64 qp; + __u32 opcode; + __u32 status; + __u32 byte_len; + __u32 imm_data; + __u32 src_qp; + __u32 wc_flags; + __u32 vendor_err; + __u16 pkey_index; + __u16 slid; + __u8 sl; + __u8 dlid_path_bits; + __u8 port_num; + __u8 smac[6]; + __u8 reserved2[7]; /* Pad to next power of 2 (64). */ +}; + +#endif /* __VMW_PVRDMA_ABI_H__ */ -- cgit v1.2.3