diff options
66 files changed, 4323 insertions, 3012 deletions
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 1eb9b1294a63..dbfd854c32c9 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -58,19 +58,13 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, return false; } -static inline u32 rdma_rw_max_sge(struct ib_device *dev, - enum dma_data_direction dir) -{ - return dir == DMA_TO_DEVICE ? - dev->attrs.max_sge : dev->attrs.max_sge_rd; -} - static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) { /* arbitrary limit to avoid allocating gigantic resources */ return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256); } +/* Caller must have zero-initialized *reg. */ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, u32 sg_cnt, u32 offset) @@ -114,6 +108,7 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { + struct rdma_rw_reg_ctx *prev = NULL; u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); int i, j, ret = 0, count = 0; @@ -125,7 +120,6 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, } for (i = 0; i < ctx->nr_ops; i++) { - struct rdma_rw_reg_ctx *prev = i ? &ctx->reg[i - 1] : NULL; struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; u32 nents = min(sg_cnt, pages_per_mr); @@ -162,9 +156,13 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, sg_cnt -= nents; for (j = 0; j < nents; j++) sg = sg_next(sg); + prev = reg; offset = 0; } + if (prev) + prev->wr.wr.next = NULL; + ctx->type = RDMA_RW_MR; return count; @@ -181,7 +179,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_device *dev = qp->pd->device; - u32 max_sge = rdma_rw_max_sge(dev, dir); + u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge : + qp->max_read_sge; struct ib_sge *sge; u32 total_len = 0, i, j; @@ -205,11 +204,10 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, rdma_wr->wr.opcode = IB_WR_RDMA_READ; rdma_wr->remote_addr = remote_addr + total_len; rdma_wr->rkey = rkey; + rdma_wr->wr.num_sge = nr_sge; rdma_wr->wr.sg_list = sge; for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { - rdma_wr->wr.num_sge++; - sge->addr = ib_sg_dma_address(dev, sg) + offset; sge->length = ib_sg_dma_len(dev, sg) - offset; sge->lkey = qp->pd->local_dma_lkey; @@ -220,8 +218,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, offset = 0; } - if (i + 1 < ctx->nr_ops) - rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr; + rdma_wr->wr.next = i + 1 < ctx->nr_ops ? + &ctx->map.wrs[i + 1].wr : NULL; } ctx->type = RDMA_RW_MULTI_WR; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 2e813edcddab..f2b776efab3a 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -825,6 +825,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, } } + /* + * Note: all hw drivers guarantee that max_send_sge is lower than + * the device RDMA WRITE SGE limit but not all hw drivers ensure that + * max_send_sge <= max_sge_rd. + */ + qp->max_write_sge = qp_init_attr->cap.max_send_sge; + qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, + device->attrs.max_sge_rd); + return qp; } EXPORT_SYMBOL(ib_create_qp); diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig index f846fd51b85b..f6ea0881765a 100644 --- a/drivers/infiniband/hw/hfi1/Kconfig +++ b/drivers/infiniband/hw/hfi1/Kconfig @@ -1,8 +1,9 @@ config INFINIBAND_HFI1 tristate "Intel OPA Gen1 support" - depends on X86_64 && INFINIBAND_RDMAVT + depends on X86_64 && INFINIBAND_RDMAVT && I2C select MMU_NOTIFIER select CRC32 + select I2C_ALGOBIT ---help--- This is a low-level driver for Intel OPA Gen1 adapter. config HFI1_DEBUG_SDMA_ORDER diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 9b5382c94b0c..0cf97a09b64b 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ eprom.o file_ops.o firmware.o \ init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \ - qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \ + qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \ uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ verbs_txreq.o hfi1-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 14d7eeb09be6..79575ee873f2 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -47,12 +47,18 @@ #include <linux/topology.h> #include <linux/cpumask.h> #include <linux/module.h> +#include <linux/cpumask.h> #include "hfi.h" #include "affinity.h" #include "sdma.h" #include "trace.h" +struct hfi1_affinity_node_list node_affinity = { + .list = LIST_HEAD_INIT(node_affinity.list), + .lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock), +}; + /* Name of IRQ types, indexed by enum irq_type */ static const char * const irq_type_names[] = { "SDMA", @@ -61,6 +67,9 @@ static const char * const irq_type_names[] = { "OTHER", }; +/* Per NUMA node count of HFI devices */ +static unsigned int *hfi1_per_node_cntr; + static inline void init_cpu_mask_set(struct cpu_mask_set *set) { cpumask_clear(&set->mask); @@ -69,47 +78,136 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set) } /* Initialize non-HT cpu cores mask */ -int init_real_cpu_mask(struct hfi1_devdata *dd) +void init_real_cpu_mask(void) { - struct hfi1_affinity *info; int possible, curr_cpu, i, ht; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - - cpumask_clear(&info->real_cpu_mask); + cpumask_clear(&node_affinity.real_cpu_mask); /* Start with cpu online mask as the real cpu mask */ - cpumask_copy(&info->real_cpu_mask, cpu_online_mask); + cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask); /* * Remove HT cores from the real cpu mask. Do this in two steps below. */ - possible = cpumask_weight(&info->real_cpu_mask); + possible = cpumask_weight(&node_affinity.real_cpu_mask); ht = cpumask_weight(topology_sibling_cpumask( - cpumask_first(&info->real_cpu_mask))); + cpumask_first(&node_affinity.real_cpu_mask))); /* * Step 1. Skip over the first N HT siblings and use them as the * "real" cores. Assumes that HT cores are not enumerated in * succession (except in the single core case). */ - curr_cpu = cpumask_first(&info->real_cpu_mask); + curr_cpu = cpumask_first(&node_affinity.real_cpu_mask); for (i = 0; i < possible / ht; i++) - curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask); /* * Step 2. Remove the remaining HT siblings. Use cpumask_next() to * skip any gaps. */ for (; i < possible; i++) { - cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask); - curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask); + } +} + +int node_affinity_init(void) +{ + int node; + struct pci_dev *dev = NULL; + const struct pci_device_id *ids = hfi1_pci_tbl; + + cpumask_clear(&node_affinity.proc.used); + cpumask_copy(&node_affinity.proc.mask, cpu_online_mask); + + node_affinity.proc.gen = 0; + node_affinity.num_core_siblings = + cpumask_weight(topology_sibling_cpumask( + cpumask_first(&node_affinity.proc.mask) + )); + node_affinity.num_online_nodes = num_online_nodes(); + node_affinity.num_online_cpus = num_online_cpus(); + + /* + * The real cpu mask is part of the affinity struct but it has to be + * initialized early. It is needed to calculate the number of user + * contexts in set_up_context_variables(). + */ + init_real_cpu_mask(); + + hfi1_per_node_cntr = kcalloc(num_possible_nodes(), + sizeof(*hfi1_per_node_cntr), GFP_KERNEL); + if (!hfi1_per_node_cntr) + return -ENOMEM; + + while (ids->vendor) { + dev = NULL; + while ((dev = pci_get_device(ids->vendor, ids->device, dev))) { + node = pcibus_to_node(dev->bus); + if (node < 0) + node = numa_node_id(); + + hfi1_per_node_cntr[node]++; + } + ids++; } - dd->affinity = info; return 0; } +void node_affinity_destroy(void) +{ + struct list_head *pos, *q; + struct hfi1_affinity_node *entry; + + spin_lock(&node_affinity.lock); + list_for_each_safe(pos, q, &node_affinity.list) { + entry = list_entry(pos, struct hfi1_affinity_node, + list); + list_del(pos); + kfree(entry); + } + spin_unlock(&node_affinity.lock); + kfree(hfi1_per_node_cntr); +} + +static struct hfi1_affinity_node *node_affinity_allocate(int node) +{ + struct hfi1_affinity_node *entry; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + entry->node = node; + INIT_LIST_HEAD(&entry->list); + + return entry; +} + +/* + * It appends an entry to the list. + * It *must* be called with node_affinity.lock held. + */ +static void node_affinity_add_tail(struct hfi1_affinity_node *entry) +{ + list_add_tail(&entry->list, &node_affinity.list); +} + +/* It must be called with node_affinity.lock held */ +static struct hfi1_affinity_node *node_affinity_lookup(int node) +{ + struct list_head *pos; + struct hfi1_affinity_node *entry; + + list_for_each(pos, &node_affinity.list) { + entry = list_entry(pos, struct hfi1_affinity_node, list); + if (entry->node == node) + return entry; + } + + return NULL; +} + /* * Interrupt affinity. * @@ -121,10 +219,10 @@ int init_real_cpu_mask(struct hfi1_devdata *dd) * to the node relative 1 as necessary. * */ -void hfi1_dev_affinity_init(struct hfi1_devdata *dd) +int hfi1_dev_affinity_init(struct hfi1_devdata *dd) { int node = pcibus_to_node(dd->pcidev->bus); - struct hfi1_affinity *info = dd->affinity; + struct hfi1_affinity_node *entry; const struct cpumask *local_mask; int curr_cpu, possible, i; @@ -132,56 +230,93 @@ void hfi1_dev_affinity_init(struct hfi1_devdata *dd) node = numa_node_id(); dd->node = node; - spin_lock_init(&info->lock); - - init_cpu_mask_set(&info->def_intr); - init_cpu_mask_set(&info->rcv_intr); - init_cpu_mask_set(&info->proc); - local_mask = cpumask_of_node(dd->node); if (cpumask_first(local_mask) >= nr_cpu_ids) local_mask = topology_core_cpumask(0); - /* Use the "real" cpu mask of this node as the default */ - cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask); - - /* fill in the receive list */ - possible = cpumask_weight(&info->def_intr.mask); - curr_cpu = cpumask_first(&info->def_intr.mask); - if (possible == 1) { - /* only one CPU, everyone will use it */ - cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask); - } else { - /* - * Retain the first CPU in the default list for the control - * context. - */ - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - /* - * Remove the remaining kernel receive queues from - * the default list and add them to the receive list. - */ - for (i = 0; i < dd->n_krcv_queues - 1; i++) { - cpumask_clear_cpu(curr_cpu, &info->def_intr.mask); - cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask); - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - if (curr_cpu >= nr_cpu_ids) - break; + + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); + + /* + * If this is the first time this NUMA node's affinity is used, + * create an entry in the global affinity structure and initialize it. + */ + if (!entry) { + entry = node_affinity_allocate(node); + if (!entry) { + dd_dev_err(dd, + "Unable to allocate global affinity node\n"); + return -ENOMEM; } - } + init_cpu_mask_set(&entry->def_intr); + init_cpu_mask_set(&entry->rcv_intr); + cpumask_clear(&entry->general_intr_mask); + /* Use the "real" cpu mask of this node as the default */ + cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, + local_mask); + + /* fill in the receive list */ + possible = cpumask_weight(&entry->def_intr.mask); + curr_cpu = cpumask_first(&entry->def_intr.mask); + + if (possible == 1) { + /* only one CPU, everyone will use it */ + cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask); + cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); + } else { + /* + * The general/control context will be the first CPU in + * the default list, so it is removed from the default + * list and added to the general interrupt list. + */ + cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask); + cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); + curr_cpu = cpumask_next(curr_cpu, + &entry->def_intr.mask); - cpumask_copy(&info->proc.mask, cpu_online_mask); -} + /* + * Remove the remaining kernel receive queues from + * the default list and add them to the receive list. + */ + for (i = 0; + i < (dd->n_krcv_queues - 1) * + hfi1_per_node_cntr[dd->node]; + i++) { + cpumask_clear_cpu(curr_cpu, + &entry->def_intr.mask); + cpumask_set_cpu(curr_cpu, + &entry->rcv_intr.mask); + curr_cpu = cpumask_next(curr_cpu, + &entry->def_intr.mask); + if (curr_cpu >= nr_cpu_ids) + break; + } -void hfi1_dev_affinity_free(struct hfi1_devdata *dd) -{ - kfree(dd->affinity); + /* + * If there ends up being 0 CPU cores leftover for SDMA + * engines, use the same CPU cores as general/control + * context. + */ + if (cpumask_weight(&entry->def_intr.mask) == 0) + cpumask_copy(&entry->def_intr.mask, + &entry->general_intr_mask); + } + + spin_lock(&node_affinity.lock); + node_affinity_add_tail(entry); + spin_unlock(&node_affinity.lock); + } + + return 0; } int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) { int ret; cpumask_var_t diff; - struct cpu_mask_set *set; + struct hfi1_affinity_node *entry; + struct cpu_mask_set *set = NULL; struct sdma_engine *sde = NULL; struct hfi1_ctxtdata *rcd = NULL; char extra[64]; @@ -194,22 +329,25 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) if (!ret) return -ENOMEM; + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); + switch (msix->type) { case IRQ_SDMA: sde = (struct sdma_engine *)msix->arg; scnprintf(extra, 64, "engine %u", sde->this_idx); - /* fall through */ + set = &entry->def_intr; + break; case IRQ_GENERAL: - set = &dd->affinity->def_intr; + cpu = cpumask_first(&entry->general_intr_mask); break; case IRQ_RCVCTXT: rcd = (struct hfi1_ctxtdata *)msix->arg; - if (rcd->ctxt == HFI1_CTRL_CTXT) { - set = &dd->affinity->def_intr; - cpu = cpumask_first(&set->mask); - } else { - set = &dd->affinity->rcv_intr; - } + if (rcd->ctxt == HFI1_CTRL_CTXT) + cpu = cpumask_first(&entry->general_intr_mask); + else + set = &entry->rcv_intr; scnprintf(extra, 64, "ctxt %u", rcd->ctxt); break; default: @@ -218,12 +356,12 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) } /* - * The control receive context is placed on a particular CPU, which - * is set above. Skip accounting for it. Everything else finds its - * CPU here. + * The general and control contexts are placed on a particular + * CPU, which is set above. Skip accounting for it. Everything else + * finds its CPU here. */ - if (cpu == -1) { - spin_lock(&dd->affinity->lock); + if (cpu == -1 && set) { + spin_lock(&node_affinity.lock); if (cpumask_equal(&set->mask, &set->used)) { /* * We've used up all the CPUs, bump up the generation @@ -235,7 +373,7 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) cpumask_andnot(diff, &set->mask, &set->used); cpu = cpumask_first(diff); cpumask_set_cpu(cpu, &set->used); - spin_unlock(&dd->affinity->lock); + spin_unlock(&node_affinity.lock); } switch (msix->type) { @@ -263,43 +401,84 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd, { struct cpu_mask_set *set = NULL; struct hfi1_ctxtdata *rcd; + struct hfi1_affinity_node *entry; + + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); switch (msix->type) { case IRQ_SDMA: + set = &entry->def_intr; + break; case IRQ_GENERAL: - set = &dd->affinity->def_intr; + /* Don't do accounting for general contexts */ break; case IRQ_RCVCTXT: rcd = (struct hfi1_ctxtdata *)msix->arg; - /* only do accounting for non control contexts */ + /* Don't do accounting for control contexts */ if (rcd->ctxt != HFI1_CTRL_CTXT) - set = &dd->affinity->rcv_intr; + set = &entry->rcv_intr; break; default: return; } if (set) { - spin_lock(&dd->affinity->lock); + spin_lock(&node_affinity.lock); cpumask_andnot(&set->used, &set->used, &msix->mask); if (cpumask_empty(&set->used) && set->gen) { |
