66 files changed, 4323 insertions, 3012 deletions
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 1eb9b1294a63..dbfd854c32c9 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -58,19 +58,13 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
 	return false;
 }
 
-static inline u32 rdma_rw_max_sge(struct ib_device *dev,
-		enum dma_data_direction dir)
-{
-	return dir == DMA_TO_DEVICE ?
-		dev->attrs.max_sge : dev->attrs.max_sge_rd;
-}
-
 static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
 {
 	/* arbitrary limit to avoid allocating gigantic resources */
 	return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
 }
 
+/* Caller must have zero-initialized *reg. */
 static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
 		struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
 		u32 sg_cnt, u32 offset)
@@ -114,6 +108,7 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
+	struct rdma_rw_reg_ctx *prev = NULL;
 	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
 	int i, j, ret = 0, count = 0;
 
@@ -125,7 +120,6 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 	}
 
 	for (i = 0; i < ctx->nr_ops; i++) {
-		struct rdma_rw_reg_ctx *prev = i ? &ctx->reg[i - 1] : NULL;
 		struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
 		u32 nents = min(sg_cnt, pages_per_mr);
 
@@ -162,9 +156,13 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		sg_cnt -= nents;
 		for (j = 0; j < nents; j++)
 			sg = sg_next(sg);
+		prev = reg;
 		offset = 0;
 	}
 
+	if (prev)
+		prev->wr.wr.next = NULL;
+
 	ctx->type = RDMA_RW_MR;
 	return count;
 
@@ -181,7 +179,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
 	struct ib_device *dev = qp->pd->device;
-	u32 max_sge = rdma_rw_max_sge(dev, dir);
+	u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
+		      qp->max_read_sge;
 	struct ib_sge *sge;
 	u32 total_len = 0, i, j;
 
@@ -205,11 +204,10 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 			rdma_wr->wr.opcode = IB_WR_RDMA_READ;
 		rdma_wr->remote_addr = remote_addr + total_len;
 		rdma_wr->rkey = rkey;
+		rdma_wr->wr.num_sge = nr_sge;
 		rdma_wr->wr.sg_list = sge;
 
 		for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
-			rdma_wr->wr.num_sge++;
-
 			sge->addr = ib_sg_dma_address(dev, sg) + offset;
 			sge->length = ib_sg_dma_len(dev, sg) - offset;
 			sge->lkey = qp->pd->local_dma_lkey;
@@ -220,8 +218,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 			offset = 0;
 		}
 
-		if (i + 1 < ctx->nr_ops)
-			rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr;
+		rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
+			&ctx->map.wrs[i + 1].wr : NULL;
 	}
 
 	ctx->type = RDMA_RW_MULTI_WR;
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 2e813edcddab..f2b776efab3a 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -825,6 +825,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 		}
 	}
 
+	/*
+	 * Note: all hw drivers guarantee that max_send_sge is lower than
+	 * the device RDMA WRITE SGE limit but not all hw drivers ensure that
+	 * max_send_sge <= max_sge_rd.
+	 */
+	qp->max_write_sge = qp_init_attr->cap.max_send_sge;
+	qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
+				 device->attrs.max_sge_rd);
+
 	return qp;
 }
 EXPORT_SYMBOL(ib_create_qp);
diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig
index f846fd51b85b..f6ea0881765a 100644
--- a/drivers/infiniband/hw/hfi1/Kconfig
+++ b/drivers/infiniband/hw/hfi1/Kconfig
@@ -1,8 +1,9 @@
 config INFINIBAND_HFI1
 	tristate "Intel OPA Gen1 support"
-	depends on X86_64 && INFINIBAND_RDMAVT
+	depends on X86_64 && INFINIBAND_RDMAVT && I2C
 	select MMU_NOTIFIER
 	select CRC32
+	select I2C_ALGOBIT
 	---help---
 	This is a low-level driver for Intel OPA Gen1 adapter.
 config HFI1_DEBUG_SDMA_ORDER
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
index 9b5382c94b0c..0cf97a09b64b 100644
--- a/drivers/infiniband/hw/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
 hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
 	eprom.o file_ops.o firmware.o \
 	init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
-	qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \
+	qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \
 	uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
 	verbs_txreq.o
 hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index 14d7eeb09be6..79575ee873f2 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -47,12 +47,18 @@
 #include <linux/topology.h>
 #include <linux/cpumask.h>
 #include <linux/module.h>
+#include <linux/cpumask.h>
 
 #include "hfi.h"
 #include "affinity.h"
 #include "sdma.h"
 #include "trace.h"
 
+struct hfi1_affinity_node_list node_affinity = {
+	.list = LIST_HEAD_INIT(node_affinity.list),
+	.lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock),
+};
+
 /* Name of IRQ types, indexed by enum irq_type */
 static const char * const irq_type_names[] = {
 	"SDMA",
@@ -61,6 +67,9 @@ static const char * const irq_type_names[] = {
 	"OTHER",
 };
 
+/* Per NUMA node count of HFI devices */
+static unsigned int *hfi1_per_node_cntr;
+
 static inline void init_cpu_mask_set(struct cpu_mask_set *set)
 {
 	cpumask_clear(&set->mask);
@@ -69,47 +78,136 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set)
 }
 
 /* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *dd)
+void init_real_cpu_mask(void)
 {
-	struct hfi1_affinity *info;
 	int possible, curr_cpu, i, ht;
 
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	cpumask_clear(&info->real_cpu_mask);
+	cpumask_clear(&node_affinity.real_cpu_mask);
 
 	/* Start with cpu online mask as the real cpu mask */
-	cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
+	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
 
 	/*
 	 * Remove HT cores from the real cpu mask.  Do this in two steps below.
 	 */
-	possible = cpumask_weight(&info->real_cpu_mask);
+	possible = cpumask_weight(&node_affinity.real_cpu_mask);
 	ht = cpumask_weight(topology_sibling_cpumask(
-					cpumask_first(&info->real_cpu_mask)));
+				cpumask_first(&node_affinity.real_cpu_mask)));
 	/*
 	 * Step 1.  Skip over the first N HT siblings and use them as the
 	 * "real" cores.  Assumes that HT cores are not enumerated in
 	 * succession (except in the single core case).
 	 */
-	curr_cpu = cpumask_first(&info->real_cpu_mask);
+	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
 	for (i = 0; i < possible / ht; i++)
-		curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
 	/*
 	 * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
 	 * skip any gaps.
 	 */
 	for (; i < possible; i++) {
-		cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
-		curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
+		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
+	}
+}
+
+int node_affinity_init(void)
+{
+	int node;
+	struct pci_dev *dev = NULL;
+	const struct pci_device_id *ids = hfi1_pci_tbl;
+
+	cpumask_clear(&node_affinity.proc.used);
+	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
+
+	node_affinity.proc.gen = 0;
+	node_affinity.num_core_siblings =
+				cpumask_weight(topology_sibling_cpumask(
+					cpumask_first(&node_affinity.proc.mask)
+					));
+	node_affinity.num_online_nodes = num_online_nodes();
+	node_affinity.num_online_cpus = num_online_cpus();
+
+	/*
+	 * The real cpu mask is part of the affinity struct but it has to be
+	 * initialized early. It is needed to calculate the number of user
+	 * contexts in set_up_context_variables().
+	 */
+	init_real_cpu_mask();
+
+	hfi1_per_node_cntr = kcalloc(num_possible_nodes(),
+				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
+	if (!hfi1_per_node_cntr)
+		return -ENOMEM;
+
+	while (ids->vendor) {
+		dev = NULL;
+		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
+			node = pcibus_to_node(dev->bus);
+			if (node < 0)
+				node = numa_node_id();
+
+			hfi1_per_node_cntr[node]++;
+		}
+		ids++;
 	}
 
-	dd->affinity = info;
 	return 0;
 }
 
+void node_affinity_destroy(void)
+{
+	struct list_head *pos, *q;
+	struct hfi1_affinity_node *entry;
+
+	spin_lock(&node_affinity.lock);
+	list_for_each_safe(pos, q, &node_affinity.list) {
+		entry = list_entry(pos, struct hfi1_affinity_node,
+				   list);
+		list_del(pos);
+		kfree(entry);
+	}
+	spin_unlock(&node_affinity.lock);
+	kfree(hfi1_per_node_cntr);
+}
+
+static struct hfi1_affinity_node *node_affinity_allocate(int node)
+{
+	struct hfi1_affinity_node *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return NULL;
+	entry->node = node;
+	INIT_LIST_HEAD(&entry->list);
+
+	return entry;
+}
+
+/*
+ * It appends an entry to the list.
+ * It *must* be called with node_affinity.lock held.
+ */
+static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
+{
+	list_add_tail(&entry->list, &node_affinity.list);
+}
+
+/* It must be called with node_affinity.lock held */
+static struct hfi1_affinity_node *node_affinity_lookup(int node)
+{
+	struct list_head *pos;
+	struct hfi1_affinity_node *entry;
+
+	list_for_each(pos, &node_affinity.list) {
+		entry = list_entry(pos, struct hfi1_affinity_node, list);
+		if (entry->node == node)
+			return entry;
+	}
+
+	return NULL;
+}
+
 /*
  * Interrupt affinity.
  *
@@ -121,10 +219,10 @@ int init_real_cpu_mask(struct hfi1_devdata *dd)
  * to the node relative 1 as necessary.
  *
  */
-void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
+int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 {
 	int node = pcibus_to_node(dd->pcidev->bus);
-	struct hfi1_affinity *info = dd->affinity;
+	struct hfi1_affinity_node *entry;
 	const struct cpumask *local_mask;
 	int curr_cpu, possible, i;
 
@@ -132,56 +230,93 @@ void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 		node = numa_node_id();
 	dd->node = node;
 
-	spin_lock_init(&info->lock);
-
-	init_cpu_mask_set(&info->def_intr);
-	init_cpu_mask_set(&info->rcv_intr);
-	init_cpu_mask_set(&info->proc);
-
 	local_mask = cpumask_of_node(dd->node);
 	if (cpumask_first(local_mask) >= nr_cpu_ids)
 		local_mask = topology_core_cpumask(0);
-	/* Use the "real" cpu mask of this node as the default */
-	cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
-
-	/*  fill in the receive list */
-	possible = cpumask_weight(&info->def_intr.mask);
-	curr_cpu = cpumask_first(&info->def_intr.mask);
-	if (possible == 1) {
-		/*  only one CPU, everyone will use it */
-		cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-	} else {
-		/*
-		 * Retain the first CPU in the default list for the control
-		 * context.
-		 */
-		curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-		/*
-		 * Remove the remaining kernel receive queues from
-		 * the default list and add them to the receive list.
-		 */
-		for (i = 0; i < dd->n_krcv_queues - 1; i++) {
-			cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
-			cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-			curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-			if (curr_cpu >= nr_cpu_ids)
-				break;
+
+	spin_lock(&node_affinity.lock);
+	entry = node_affinity_lookup(dd->node);
+	spin_unlock(&node_affinity.lock);
+
+	/*
+	 * If this is the first time this NUMA node's affinity is used,
+	 * create an entry in the global affinity structure and initialize it.
+	 */
+	if (!entry) {
+		entry = node_affinity_allocate(node);
+		if (!entry) {
+			dd_dev_err(dd,
+				   "Unable to allocate global affinity node\n");
+			return -ENOMEM;
 		}
-	}
+		init_cpu_mask_set(&entry->def_intr);
+		init_cpu_mask_set(&entry->rcv_intr);
+		cpumask_clear(&entry->general_intr_mask);
+		/* Use the "real" cpu mask of this node as the default */
+		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
+			    local_mask);
+
+		/* fill in the receive list */
+		possible = cpumask_weight(&entry->def_intr.mask);
+		curr_cpu = cpumask_first(&entry->def_intr.mask);
+
+		if (possible == 1) {
+			/* only one CPU, everyone will use it */
+			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
+			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
+		} else {
+			/*
+			 * The general/control context will be the first CPU in
+			 * the default list, so it is removed from the default
+			 * list and added to the general interrupt list.
+			 */
+			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
+			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
+			curr_cpu = cpumask_next(curr_cpu,
+						&entry->def_intr.mask);
 
-	cpumask_copy(&info->proc.mask, cpu_online_mask);
-}
+			/*
+			 * Remove the remaining kernel receive queues from
+			 * the default list and add them to the receive list.
+			 */
+			for (i = 0;
+			     i < (dd->n_krcv_queues - 1) *
+				  hfi1_per_node_cntr[dd->node];
+			     i++) {
+				cpumask_clear_cpu(curr_cpu,
+						  &entry->def_intr.mask);
+				cpumask_set_cpu(curr_cpu,
+						&entry->rcv_intr.mask);
+				curr_cpu = cpumask_next(curr_cpu,
+							&entry->def_intr.mask);
+				if (curr_cpu >= nr_cpu_ids)
+					break;
+			}
 
-void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
-{
-	kfree(dd->affinity);
+			/*
+			 * If there ends up being 0 CPU cores leftover for SDMA
+			 * engines, use the same CPU cores as general/control
+			 * context.
+			 */
+			if (cpumask_weight(&entry->def_intr.mask) == 0)
+				cpumask_copy(&entry->def_intr.mask,
+					     &entry->general_intr_mask);
+		}
+
+		spin_lock(&node_affinity.lock);
+		node_affinity_add_tail(entry);
+		spin_unlock(&node_affinity.lock);
+	}
+
+	return 0;
 }
 
 int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 {
 	int ret;
 	cpumask_var_t diff;
-	struct cpu_mask_set *set;
+	struct hfi1_affinity_node *entry;
+	struct cpu_mask_set *set = NULL;
 	struct sdma_engine *sde = NULL;
 	struct hfi1_ctxtdata *rcd = NULL;
 	char extra[64];
@@ -194,22 +329,25 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 	if (!ret)
 		return -ENOMEM;
 
+	spin_lock(&node_affinity.lock);
+	entry = node_affinity_lookup(dd->node);
+	spin_unlock(&node_affinity.lock);
+
 	switch (msix->type) {
 	case IRQ_SDMA:
 		sde = (struct sdma_engine *)msix->arg;
 		scnprintf(extra, 64, "engine %u", sde->this_idx);
-		/* fall through */
+		set = &entry->def_intr;
+		break;
 	case IRQ_GENERAL:
-		set = &dd->affinity->def_intr;
+		cpu = cpumask_first(&entry->general_intr_mask);
 		break;
 	case IRQ_RCVCTXT:
 		rcd = (struct hfi1_ctxtdata *)msix->arg;
-		if (rcd->ctxt == HFI1_CTRL_CTXT) {
-			set = &dd->affinity->def_intr;
-			cpu = cpumask_first(&set->mask);
-		} else {
-			set = &dd->affinity->rcv_intr;
-		}
+		if (rcd->ctxt == HFI1_CTRL_CTXT)
+			cpu = cpumask_first(&entry->general_intr_mask);
+		else
+			set = &entry->rcv_intr;
 		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
 		break;
 	default:
@@ -218,12 +356,12 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 	}
 
 	/*
-	 * The control receive context is placed on a particular CPU, which
-	 * is set above.  Skip accounting for it.  Everything else finds its
-	 * CPU here.
+	 * The general and control contexts are placed on a particular
+	 * CPU, which is set above. Skip accounting for it. Everything else
+	 * finds its CPU here.
 	 */
-	if (cpu == -1) {
-		spin_lock(&dd->affinity->lock);
+	if (cpu == -1 && set) {
+		spin_lock(&node_affinity.lock);
 		if (cpumask_equal(&set->mask, &set->used)) {
 			/*
 			 * We've used up all the CPUs, bump up the generation
@@ -235,7 +373,7 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 		cpumask_andnot(diff, &set->mask, &set->used);
 		cpu = cpumask_first(diff);
 		cpumask_set_cpu(cpu, &set->used);
-		spin_unlock(&dd->affinity->lock);
+		spin_unlock(&node_affinity.lock);
 	}
 
 	switch (msix->type) {
@@ -263,43 +401,84 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
 {
 	struct cpu_mask_set *set = NULL;
 	struct hfi1_ctxtdata *rcd;
+	struct hfi1_affinity_node *entry;
+
+	spin_lock(&node_affinity.lock);
+	entry = node_affinity_lookup(dd->node);
+	spin_unlock(&node_affinity.lock);
 
 	switch (msix->type) {
 	case IRQ_SDMA:
+		set = &entry->def_intr;
+		break;
 	case IRQ_GENERAL:
-		set = &dd->affinity->def_intr;
+		/* Don't do accounting for general contexts */
 		break;
 	case IRQ_RCVCTXT:
 		rcd = (struct hfi1_ctxtdata *)msix->arg;
-		/* only do accounting for non control contexts */
+		/* Don't do accounting for control contexts */
 		if (rcd->ctxt != HFI1_CTRL_CTXT)
-			set = &dd->affinity->rcv_intr;
+			set = &entry->rcv_intr;
 		break;
 	default:
 		return;
 	}
 
 	if (set) {
-		spin_lock(&dd->affinity->lock);
+		spin_lock(&node_affinity.lock);
 		cpumask_andnot(&set->used, &se