62 files changed, 26781 insertions, 0 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
new file mode 100644
index 000000000000..3cc3ff0cccb1
--- /dev/null
+++ b/drivers/infiniband/Kconfig
@@ -0,0 +1,14 @@
+menu "InfiniBand support"
+
+config INFINIBAND
+	tristate "InfiniBand support"
+	---help---
+	  Core support for InfiniBand (IB).  Make sure to also select
+	  any protocols you wish to use as well as drivers for your
+	  InfiniBand hardware.
+
+source "drivers/infiniband/hw/mthca/Kconfig"
+
+source "drivers/infiniband/ulp/ipoib/Kconfig"
+
+endmenu
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
new file mode 100644
index 000000000000..d256cf798218
--- /dev/null
+++ b/drivers/infiniband/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_INFINIBAND)		+= core/
+obj-$(CONFIG_INFINIBAND_MTHCA)		+= hw/mthca/
+obj-$(CONFIG_INFINIBAND_IPOIB)		+= ulp/ipoib/
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
new file mode 100644
index 000000000000..d2dbfb52c0a3
--- /dev/null
+++ b/drivers/infiniband/core/Makefile
@@ -0,0 +1,12 @@
+EXTRA_CFLAGS += -Idrivers/infiniband/include
+
+obj-$(CONFIG_INFINIBAND) +=	ib_core.o ib_mad.o ib_sa.o ib_umad.o
+
+ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
+				device.o fmr_pool.o cache.o
+
+ib_mad-y :=			mad.o smi.o agent.o
+
+ib_sa-y :=			sa_query.o
+
+ib_umad-y :=			user_mad.o
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
new file mode 100644
index 000000000000..7aee5ebf3f01
--- /dev/null
+++ b/drivers/infiniband/core/agent.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: agent.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <linux/dma-mapping.h>
+
+#include <asm/bug.h>
+
+#include <ib_smi.h>
+
+#include "smi.h"
+#include "agent_priv.h"
+#include "mad_priv.h"
+#include "agent.h"
+
+spinlock_t ib_agent_port_list_lock;
+static LIST_HEAD(ib_agent_port_list);
+
+/*
+ * Caller must hold ib_agent_port_list_lock
+ */
+static inline struct ib_agent_port_private *
+__ib_get_agent_port(struct ib_device *device, int port_num,
+		    struct ib_mad_agent *mad_agent)
+{
+	struct ib_agent_port_private *entry;
+
+	BUG_ON(!(!!device ^ !!mad_agent));  /* Exactly one MUST be (!NULL) */
+
+	if (device) {
+		list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+			if (entry->smp_agent->device == device &&
+			    entry->port_num == port_num)
+				return entry;
+		}
+	} else {
+		list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+			if ((entry->smp_agent == mad_agent) ||
+			    (entry->perf_mgmt_agent == mad_agent))
+				return entry;
+		}
+	}
+	return NULL;
+}
+
+static inline struct ib_agent_port_private *
+ib_get_agent_port(struct ib_device *device, int port_num,
+		  struct ib_mad_agent *mad_agent)
+{
+	struct ib_agent_port_private *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+	entry = __ib_get_agent_port(device, port_num, mad_agent);
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+	return entry;
+}
+
+int smi_check_local_dr_smp(struct ib_smp *smp,
+			   struct ib_device *device,
+			   int port_num)
+{
+	struct ib_agent_port_private *port_priv;
+
+	if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		return 1;
+	port_priv = ib_get_agent_port(device, port_num, NULL);
+	if (!port_priv) {
+		printk(KERN_DEBUG SPFX "smi_check_local_dr_smp %s port %d "
+		       "not open\n",
+		       device->name, port_num);
+		return 1;
+	}
+
+	return smi_check_local_smp(port_priv->smp_agent, smp);
+}
+
+static int agent_mad_send(struct ib_mad_agent *mad_agent,
+			  struct ib_agent_port_private *port_priv,
+			  struct ib_mad_private *mad_priv,
+			  struct ib_grh *grh,
+			  struct ib_wc *wc)
+{
+	struct ib_agent_send_wr *agent_send_wr;
+	struct ib_sge gather_list;
+	struct ib_send_wr send_wr;
+	struct ib_send_wr *bad_send_wr;
+	struct ib_ah_attr ah_attr;
+	unsigned long flags;
+	int ret = 1;
+
+	agent_send_wr = kmalloc(sizeof(*agent_send_wr), GFP_KERNEL);
+	if (!agent_send_wr)
+		goto out;
+	agent_send_wr->mad = mad_priv;
+
+	/* PCI mapping */
+	gather_list.addr = dma_map_single(mad_agent->device->dma_device,
+					  &mad_priv->mad,
+					  sizeof(mad_priv->mad),
+					  DMA_TO_DEVICE);
+	gather_list.length = sizeof(mad_priv->mad);
+	gather_list.lkey = (*port_priv->mr).lkey;
+
+	send_wr.next = NULL;
+	send_wr.opcode = IB_WR_SEND;
+	send_wr.sg_list = &gather_list;
+	send_wr.num_sge = 1;
+	send_wr.wr.ud.remote_qpn = wc->src_qp; /* DQPN */
+	send_wr.wr.ud.timeout_ms = 0;
+	send_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+	ah_attr.dlid = wc->slid;
+	ah_attr.port_num = mad_agent->port_num;
+	ah_attr.src_path_bits = wc->dlid_path_bits;
+	ah_attr.sl = wc->sl;
+	ah_attr.static_rate = 0;
+	ah_attr.ah_flags = 0; /* No GRH */
+	if (mad_priv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT) {
+		if (wc->wc_flags & IB_WC_GRH) {
+			ah_attr.ah_flags = IB_AH_GRH;
+			/* Should sgid be looked up ? */
+			ah_attr.grh.sgid_index = 0;
+			ah_attr.grh.hop_limit = grh->hop_limit;
+			ah_attr.grh.flow_label = be32_to_cpup(
+				&grh->version_tclass_flow)  & 0xfffff;
+			ah_attr.grh.traffic_class = (be32_to_cpup(
+				&grh->version_tclass_flow) >> 20) & 0xff;
+			memcpy(ah_attr.grh.dgid.raw,
+			       grh->sgid.raw,
+			       sizeof(ah_attr.grh.dgid));
+		}
+	}
+
+	agent_send_wr->ah = ib_create_ah(mad_agent->qp->pd, &ah_attr);
+	if (IS_ERR(agent_send_wr->ah)) {
+		printk(KERN_ERR SPFX "No memory for address handle\n");
+		kfree(agent_send_wr);
+		goto out;
+	}
+
+	send_wr.wr.ud.ah = agent_send_wr->ah;
+	if (mad_priv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT) {
+		send_wr.wr.ud.pkey_index = wc->pkey_index;
+		send_wr.wr.ud.remote_qkey = IB_QP1_QKEY;
+	} else { 	/* for SMPs */
+		send_wr.wr.ud.pkey_index = 0;
+		send_wr.wr.ud.remote_qkey = 0;
+	}
+	send_wr.wr.ud.mad_hdr = &mad_priv->mad.mad.mad_hdr;
+	send_wr.wr_id = (unsigned long)agent_send_wr;
+
+	pci_unmap_addr_set(agent_send_wr, mapping, gather_list.addr);
+
+	/* Send */
+	spin_lock_irqsave(&port_priv->send_list_lock, flags);
+	if (ib_post_send_mad(mad_agent, &send_wr, &bad_send_wr)) {
+		spin_unlock_irqrestore(&port_priv->send_list_lock, flags);
+		dma_unmap_single(mad_agent->device->dma_device,
+				 pci_unmap_addr(agent_send_wr, mapping),
+				 sizeof(mad_priv->mad),
+				 DMA_TO_DEVICE);
+		ib_destroy_ah(agent_send_wr->ah);
+		kfree(agent_send_wr);
+	} else {
+		list_add_tail(&agent_send_wr->send_list,
+			      &port_priv->send_posted_list);
+		spin_unlock_irqrestore(&port_priv->send_list_lock, flags);
+		ret = 0;
+	}
+
+out:
+	return ret;
+}
+
+int agent_send(struct ib_mad_private *mad,
+	       struct ib_grh *grh,
+	       struct ib_wc *wc,
+	       struct ib_device *device,
+	       int port_num)
+{
+	struct ib_agent_port_private *port_priv;
+	struct ib_mad_agent *mad_agent;
+
+	port_priv = ib_get_agent_port(device, port_num, NULL);
+	if (!port_priv) {
+		printk(KERN_DEBUG SPFX "agent_send %s port %d not open\n",
+		       device->name, port_num);
+		return 1;
+	}
+
+	/* Get mad agent based on mgmt_class in MAD */
+	switch (mad->mad.mad.mad_hdr.mgmt_class) {
+		case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+		case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+			mad_agent = port_priv->smp_agent;
+			break;
+		case IB_MGMT_CLASS_PERF_MGMT:
+			mad_agent = port_priv->perf_mgmt_agent;
+			break;
+		default:
+			return 1;
+	}
+
+	return agent_mad_send(mad_agent, port_priv, mad, grh, wc);
+}
+
+static void agent_send_handler(struct ib_mad_agent *mad_agent,
+			       struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_agent_port_private	*port_priv;
+	struct ib_agent_send_wr		*agent_send_wr;
+	unsigned long			flags;
+
+	/* Find matching MAD agent */
+	port_priv = ib_get_agent_port(NULL, 0, mad_agent);
+	if (!port_priv) {
+		printk(KERN_ERR SPFX "agent_send_handler: no matching MAD "
+		       "agent %p\n", mad_agent);
+		return;
+	}
+
+	agent_send_wr = (struct ib_agent_send_wr *)(unsigned long)mad_send_wc->wr_id;
+	spin_lock_irqsave(&port_priv->send_list_lock, flags);
+	/* Remove completed send from posted send MAD list */
+	list_del(&agent_send_wr->send_list);
+	spin_unlock_irqrestore(&port_priv->send_list_lock, flags);
+
+	/* Unmap PCI */
+	dma_unmap_single(mad_agent->device->dma_device,
+			 pci_unmap_addr(agent_send_wr, mapping),
+			 sizeof(agent_send_wr->mad->mad),
+			 DMA_TO_DEVICE);
+
+	ib_destroy_ah(agent_send_wr->ah);
+
+	/* Release allocated memory */
+	kmem_cache_free(ib_mad_cache, agent_send_wr->mad);
+	kfree(agent_send_wr);
+}
+
+int ib_agent_port_open(struct ib_device *device, int port_num)
+{
+	int ret;
+	struct ib_agent_port_private *port_priv;
+	unsigned long flags;
+
+	/* First, check if port already open for SMI */
+	port_priv = ib_get_agent_port(device, port_num, NULL);
+	if (port_priv) {
+		printk(KERN_DEBUG SPFX "%s port %d already open\n",
+		       device->name, port_num);
+		return 0;
+	}
+
+	/* Create new device info */
+	port_priv = kmalloc(sizeof *port_priv, GFP_KERNEL);
+	if (!port_priv) {
+		printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n");
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	memset(port_priv, 0, sizeof *port_priv);
+	port_priv->port_num = port_num;
+	spin_lock_init(&port_priv->send_list_lock);
+	INIT_LIST_HEAD(&port_priv->send_posted_list);
+
+	/* Obtain send only MAD agent for SM class (SMI QP) */
+	port_priv->smp_agent = ib_register_mad_agent(device, port_num,
+						     IB_QPT_SMI,
+						     NULL, 0,
+						    &agent_send_handler,
+						     NULL, NULL);
+
+	if (IS_ERR(port_priv->smp_agent)) {
+		ret = PTR_ERR(port_priv->smp_agent);
+		goto error2;
+	}
+
+	/* Obtain send only MAD agent for PerfMgmt class (GSI QP) */
+	port_priv->perf_mgmt_agent = ib_register_mad_agent(device, port_num,
+							   IB_QPT_GSI,
+							   NULL, 0,
+							  &agent_send_handler,
+							   NULL, NULL);
+	if (IS_ERR(port_priv->perf_mgmt_agent)) {
+		ret = PTR_ERR(port_priv->perf_mgmt_agent);
+		goto error3;
+	}
+
+	port_priv->mr = ib_get_dma_mr(port_priv->smp_agent->qp->pd,
+				      IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(port_priv->mr)) {
+		printk(KERN_ERR SPFX "Couldn't get DMA MR\n");
+		ret = PTR_ERR(port_priv->mr);
+		goto error4;
+	}
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+	list_add_tail(&port_priv->port_list, &ib_agent_port_list);
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+	return 0;
+
+error4:
+	ib_unregister_mad_agent(port_priv->perf_mgmt_agent);
+error3:
+	ib_unregister_mad_agent(port_priv->smp_agent);
+error2:
+	kfree(port_priv);
+error1:
+	return ret;
+}
+
+int ib_agent_port_close(struct ib_device *device, int port_num)
+{
+	struct ib_agent_port_private *port_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+	port_priv = __ib_get_agent_port(device, port_num, NULL);
+	if (port_priv == NULL) {
+		spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+		printk(KERN_ERR SPFX "Port %d not found\n", port_num);
+		return -ENODEV;
+	}
+	list_del(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+	ib_dereg_mr(port_priv->mr);
+
+	ib_unregister_mad_agent(port_priv->perf_mgmt_agent);
+	ib_unregister_mad_agent(port_priv->smp_agent);
+	kfree(port_priv);
+
+	return 0;
+}
diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h
new file mode 100644
index 000000000000..d9426842254a
--- /dev/null
+++ b/drivers/infiniband/core/agent.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: agent.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef __AGENT_H_
+#define __AGENT_H_
+
+extern spinlock_t ib_agent_port_list_lock;
+
+extern int ib_agent_port_open(struct ib_device *device,
+			      int port_num);
+
+extern int ib_agent_port_close(struct ib_device *device, int port_num);
+
+extern int agent_send(struct ib_mad_private *mad,
+		      struct ib_grh *grh,
+		      struct ib_wc *wc,
+		      struct ib_device *device,
+		      int port_num);
+
+#endif	/* __AGENT_H_ */
diff --git a/drivers/infiniband/core/agent_priv.h b/drivers/infiniband/core/agent_priv.h
new file mode 100644
index 000000000000..17a0cce5813c
--- /dev/null
+++ b/drivers/infiniband/core/agent_priv.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: agent_priv.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef __IB_AGENT_PRIV_H__
+#define __IB_AGENT_PRIV_H__
+
+#include <linux/pci.h>
+
+#define SPFX "ib_agent: "
+
+struct ib_agent_send_wr {
+	struct list_head send_list;
+	struct ib_ah *ah;
+	struct ib_mad_private *mad;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+struct ib_agent_port_private {
+	struct list_head port_list;
+	struct list_head send_posted_list;
+	spinlock_t send_list_lock;
+	int port_num;
+	struct ib_mad_agent *smp_agent;	      /* SM class */
+	struct ib_mad_agent *perf_mgmt_agent; /* PerfMgmt class */
+	struct ib_mr *mr;
+};
+
+#endif	/* __IB_AGENT_PRIV_H__ */
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
new file mode 100644
index 000000000000..3042360c97e1
--- /dev/null
+++ b/drivers/infiniband/core/cache.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: cache.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+#include <ib_cache.h>
+
+#include "core_priv.h"
+
+struct ib_pkey_cache {
+	int             table_len;
+	u16             table[0];
+};
+
+struct ib_gid_cache {
+	int             table_len;
+	union ib_gid    table[0];
+};
+
+struct ib_update_work {
+	struct work_struct work;
+	struct ib_device  *device;
+	u8                 port_num;
+};
+
+static inline int start_port(struct ib_device *device)
+{
+	return device->node_type == IB_NODE_SWITCH ? 0 : 1;
+}
+
+static inline int end_port(struct ib_device *device)
+{
+	return device->node_type == IB_NODE_SWITCH ? 0 : device->phys_port_cnt;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+		      u8                port_num,
+		      int               index,
+		      union ib_gid     *gid)
+{
+	struct ib_gid_cache *cache;
+	unsigned long flags;
+	int ret = 0;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.gid_cache[port_num - start_port(device)];
+
+	if (index < 0 || index >= cache->table_len)
+		ret = -EINVAL;
+	else
+		*gid = cache->table[index];
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+		       union ib_gid	*gid,
+		       u8               *port_num,
+		       u16              *index)
+{
+	struct ib_gid_cache *cache;
+	unsigned long flags;
+	int p, i;
+	int ret = -ENOENT;
+
+	*port_num = -1;
+	if (index)
+		*index = -1;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		cache = device->cache.gid_cache[p];
+		for (i = 0; i < cache->table_len; ++i) {
+			if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
+				*port_num = p + start_port(device);
+				if (index)
+					*index = i;
+				ret = 0;
+				goto found;
+			}
+		}
+	}
+found:
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_gid);
+
+int ib_get_cached_pkey(struct ib_device *device,
+		       u8                port_num,
+		       int               index,
+		       u16              *pkey)
+{
+	struct ib_pkey_cache *cache;
+	unsigned long flags;
+	int ret = 0;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+	if (index < 0 || index >= cache->table_len)
+		ret = -EINVAL;
+	else
+		*pkey = cache->table[index];
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_pkey);
+
+int ib_find_cached_pkey(struct ib_device *device,
+			u8                port_num,
+			u16               pkey,
+			u16              *index)
+{
+	struct ib_pkey_cache *cache;
+	unsigned long flags;
+	int i;
+	int ret = -ENOENT;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+	*index = -1;
+
+	for (i = 0; i < cache->table_len; ++i)
+		if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+			*index = i;
+			ret = 0;
+			break;
+		}
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+static void ib_cache_update(struct ib_device *device,
+			    u8                port)
+{
+	struct ib_port_attr       *tprops = NULL;
+	struct ib_pkey_cache      *pkey_cache = NULL, *old_pkey_cache;
+	struct ib_gid_cache       *gid_cache = NULL, *old_gid_cache;
+	int                        i;
+	int                        ret;
+
+	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+	if (!tprops)
+		return;
+
+	ret = ib_query_port(device, port, tprops);
+	if (ret) {
+		printk(KERN_WARNING "ib_query_port failed (%d) for %s\n",
+		       ret, device->name);
+		goto err;
+	}
+
+	pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len *
+			     sizeof *pkey_cache->table, GFP_KERNEL);
+	if (!pkey_cache)
+		goto err;
+
+	pkey_cache->table_len = tprops->pkey_tbl_len;
+
+	gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
+			    sizeof *gid_cache->table, GFP_KERNEL);
+	if (!gid_cache)
+		goto err;
+
+	gid_cache->table_len = tprops->gid_tbl_len;
+
+	for (i = 0; i < pkey_cache->table_len; ++i) {
+		ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
+		if (ret) {
+			printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n",
+			       ret, device->name, i);
+			goto err;
+		}
+	}
+
+	for (i = 0; i < gid_cache->table_len; ++i) {
+		ret = ib_query_gid(device, port, i, gid_cache->table + i);
+		if (ret) {
+			printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+			       ret, device->name, i);
+			goto err;
+		}
+	}
+
+	write_lock_irq(&device->cache.lock);
+
+	old_pkey_cache = device->cache.pkey_cache[port - start_port(device)];
+	old_gid_cache  = device->cache.gid_cache [port - start_port(device)];
+
+	device->cache.pkey_cache[port - start_port(device)] = pkey_cache;
+	device->cache.gid_cache [port - start_port(device)] = gid_cache;
+
+	write_unlock_irq(&device->cache.lock);
+
+	kfree(old_pkey_cache);
+	kfree(old_gid_cache);
+	kfree(tprops);
+	return;
+
+err:
+	kfree(pkey_cache);
+	kfree(gid_cache);
+	kfree(tprops);
+}
+
+static void ib_cache_task(void *work_ptr)
+{
+	struct ib_update_work *work = work_ptr;
+
+	ib_cache_update(work->device, work->port_num);
+	kfree(work);
+}
+
+static void ib_cache_event(struct ib_event_handler *handler,
+			   struct ib_event *event)
+{
+	struct ib_update_work *work;
+
+	if (event->event == IB_EVENT_PORT_ERR    ||
+	    event->event == IB_EVENT_PORT_ACTIVE ||
+	    event->event == IB_EVENT_LID_CHANGE  ||
+	    event->event == IB_EVENT_PKEY_CHANGE ||
+	    event->event == IB_EVENT_SM_CHANGE) {
+		work = kmalloc(sizeof *work, GFP_ATOMIC);
+		if (work) {
+			INIT_WORK(&work->work, ib_cache_task, work);
+			work->device   = event->device;
+			work->port_num = event->element.port_num;
+			schedule_work(&work->work);
+		}
+	}
+}
+
+static void ib_cache_setup_one(struct ib_device *device)
+{
+	int p;
+
+	rwlock_init(&device->cache.lock);
+
+	device->cache.pkey_cache =
+		kmalloc(sizeof *device->cache.pkey_cache *
+			(end_port(device) - start_port(device) + 1), GFP_KERNEL);
+	device->cache.gid_cache =
+		kmalloc(sizeof *device->cache.pkey_cache *
+			(end_port(device) - start_port(device) + 1), GFP_KERNEL);
+
+	if (!device->cache.pkey_cache || !device->cache.gid_cache) {
+		printk(KERN_WARNING "Couldn't allocate cache "
+		       "for %s\n", device->name);
+		goto err;
+	}
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		device->cache.pkey_cache[p] = NULL;
+		device->cache.gid_cache [p] = NULL;
+		ib_cache_update(device, p + start_port(device));
+	}
+
+	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
+			      device, ib_cache_event);
+	if (ib_register_event_handler(&device->cache.event_handler))
+		goto err_cache;
+
+	return;
+
+err_cache:
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		kfree(device->cache.pkey_cache[p]);
+		kfree(device->cache.gid_cache[p]);
+	}
+
+err:
+	kfree(device->cache.pkey_cache);
+	kfree(device->cache.gid_cache);
+}
+
+static void ib_cache_cleanup_one(struct ib_device *device)
+{
+	int p;
+
+	ib_unregister_event_handler(&device->cache.event_handler);
+	flush_scheduled_work();
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		kfree(device->cache.pkey_cache[p]);
+		kfree(device->cache.gid_cache[p]);
+	}
+
+	kfree(device->cache.pkey_cache);
+	kfree(device->cache.gid_cache);
+}
+
+static struct ib_client cache_client = {
+	.name   = "cache",
+	.add    = ib_cache_setup_one,
+	.remove = ib_cache_cleanup_one
+};
+
+int __init ib_cache_setup(void)
+{
+	return ib_register_client(&cache_client);
+}
+
+void __exit ib_cache_cleanup(void)
+{
+	ib_unregister_client(&cache_client);
+}
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
new file mode 100644
index 000000000000..797049626ff6
--- /dev/null
+++ b/drivers/infiniband/core/core_priv.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: core_priv.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef _CORE_PRIV_H
+#define _CORE_PRIV_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <ib_verbs.h>
+
+int  ib_device_register_sysfs(struct ib_device *device);
+void ib_device_unregister_sysfs(struct ib_device *device);
+
+int  ib_sysfs_setup(void);
+void ib_sysfs_cleanup(void);
+
+int  ib_cache_setup(void);
+void ib_cache_cleanup(void);
+
+#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
new file mode 100644
index 000000000000..9197e92d708a
--- /dev/null
+++ b/drivers/infiniband/core/device.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: device.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+
+#include <asm/semaphore.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("core kernel InfiniBand API");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_client_data {
+	struct list_head  list;
+	struct ib_client *client;
+	void *            data;
+};
+
+static LIST_HEAD(device_list);
+static LIST_HEAD(client_list);
+
+/*
+ * device_sem protects access to both device_list and client_list.
+ * There's no real point to using multiple locks or something fancier
+ * like an rwsem: we always access both lists, and we're always
+ * modifying one list or the other list.  In any case this is not a
+ * hot path so there's no point in trying to optimize.
+ */
+static DECLARE_MUTEX(device_sem);
+
+static int ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+	static const struct {
+		size_t offset;
+		char  *name;
+	} mandatory_table[] = {
+		IB_MANDATORY_FUNC(query_device),
+		IB_MANDATORY_FUNC(query_port),
+		IB_MANDATORY_FUNC(query_pkey),
+		IB_MANDATORY_FUNC(query_gid),
+		IB_MANDATORY_FUNC(alloc_pd),
+		IB_MANDATORY_FUNC(dealloc_pd),
+		IB_MANDATORY_FUNC(create_ah),
+		IB_MANDATORY_FUNC(destroy_ah),
+		IB_MANDATORY_FUNC(create_qp),
+		IB_MANDATORY_FUNC(modify_qp),
+		IB_MANDATORY_FUNC(destroy_qp),
+		IB_MANDATORY_FUNC(post_send),
+		IB_MANDATORY_FUNC(post_recv),
+		IB_MANDATORY_FUNC(create_cq),
+		IB_MANDATORY_FUNC(destroy_cq),
+		IB_MANDATORY_FUNC(poll_cq),
+		IB_MANDATORY_FUNC(req_notify_cq),
+		IB_MANDATORY_FUNC(get_dma_mr),
+		IB_MANDATORY_FUNC(dereg_mr)
+	};
+	int i;
+
+	for (i = 0; i < sizeof mandatory_table / sizeof mandatory_table[0]; ++i) {
+		if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
+			printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
+			       device->name, mandatory_table[i].name);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+	struct ib_device *device;
+
+	list_for_each_entry(device, &device_list, core_list)
+		if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
+			return device;
+
+	return NULL;
+}
+
+
+static int alloc_name(char *name)
+{
+	long *inuse;
+	char buf[IB_DEVICE_NAME_MAX];
+	struct ib_device *device;
+	int i;
+
+	inuse = (long *) get_zeroed_page(GFP_KERNEL);
+	if (!inuse)
+		return -ENOMEM;
+
+	list_for_each_entry(device, &device_list, core_list) {
+		if (!sscanf(device->name, name, &i))
+			continue;
+		if (i < 0 || i >= PAGE_SIZE * 8)
+			continue;
+		snprintf(buf, sizeof buf, name, i);
+		if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+			set_bit(i, inuse);
+	}
+
+	i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
+	free_page((unsigned long) inuse);
+	snprintf(buf, sizeof buf, name, i);
+
+	if (__ib_device_get_by_name(buf))
+		return -ENFILE;
+
+	strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+	return 0;
+}
+
+/**
+ * ib_alloc_device - allocate an IB device struct
+ * @size:size of structure to allocate
+ *
+ * Low-level drivers should use ib_alloc_device() to allocate &struct
+ * ib_device.  @size is the size of the structure to be allocated,
+ * including any private data used by the low-level driver.
+ * ib_dealloc_device() must be used to free structures allocated with
+ * ib_alloc_device().
+ */
+struct ib_device *ib_alloc_device(size_t size)
+{
+	void *dev;
+
+	BUG_ON(size < sizeof (struct ib_device));
+
+	dev = kmalloc(size, GFP_KERNEL);
+	if (!dev)
+		return NULL;
+
+	memset(dev, 0, size);
+
+	return dev;
+}
+EXPORT_SYMBOL(ib_alloc_device);
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device:structure to free
+ *
+ * Free a structure allocated with ib_alloc_device().
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+	if (device->reg_state == IB_DEV_UNINITIALIZED) {
+		kfree(device);
+		return;
+	}
+
+	BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
+
+	ib_device_unregister_sysfs(device);
+}
+EXPORT_SYMBOL(ib_dealloc_device);
+
+static int add_client_context(struct ib_device *device, struct ib_client *client)
+{
+	struct ib_client_data *context;
+	unsigned long flags;
+
+	context = kmalloc(sizeof *context, GFP_KERNEL);
+	if (!context) {
+		printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n",
+		       device->name, client->name);
+		return -ENOMEM;
+	}
+
+	context->client = client;
+	context->data   = NULL;
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_add(&context->list, &device->client_data_list);
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	return 0;
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core.  All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ */
+int ib_register_device(struct ib_device *device)
+{
+	int ret;
+
+	down(&device_sem);
+
+	if (strchr(device->name, '%')) {
+		ret = alloc_name(device->name);
+		if (ret)
+			goto out;
+	}
+
+	if (ib_device_check_mandatory(device)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&device->event_handler_list);
+	INIT_LIST_HEAD(&device->client_data_list);
+	spin_lock_init(&device->event_handler_lock);
+	spin_lock_init(&device->client_data_lock);
+
+	ret = ib_device_register_sysfs(device);
+	if (ret) {
+		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
+		       device->name);
+		goto out;
+	}
+
+	list_add_tail(&device->core_list, &device_list);
+
+	device->reg_state = IB_DEV_REGISTERED;
+
+	{
+		struct ib_client *client;
+
+		list_for_each_entry(client, &client_list, list)
+			if (client->add && !add_client_context(device, client))
+				client->add(device);
+	}
+
+ out:
+	up(&device_sem);
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_device);
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device:Device to unregister
+ *
+ * Unregister an IB device.  All clients will receive a remove callback.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+	struct ib_client *client;
+	struct ib_client_data *context, *tmp;
+	unsigned long flags;
+
+	down(&device_sem);
+
+	list_for_each_entry_reverse(client, &client_list, list)
+		if (client->remove)
+			client->remove(device);
+
+	list_del(&device->core_list);
+
+	up(&device_sem);
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+		kfree(context);
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	device->reg_state = IB_DEV_UNREGISTERED;
+}
+EXPORT_SYMBOL(ib_unregister_device);
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal.  When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered).  In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ */
+int ib_register_client(struct ib_client *client)
+{
+	struct ib_device *device;
+
+	down(&device_sem);
+
+	list_add_tail(&client->list, &client_list);
+	list_for_each_entry(device, &device_list, core_list)
+		if (client->add && !add_client_context(device, client))
+			client->add(device);
+
+	up(&device_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_register_client);
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration.  When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+	struct ib_client_data *context, *tmp;
+	struct ib_device *device;
+	unsigned long flags;
+
+	down(&device_sem);
+
+	list_for_each_entry(device, &device_list, core_list) {
+		if (client->remove)
+			client->remove(device);
+
+		spin_lock_irqsave(&device->client_data_lock, flags);
+		list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+			if (context->client == client) {
+				list_del(&context->list);
+				kfree(context);
+			}
+		spin_unlock_irqrestore(&device->client_data_lock, flags);
+	}
+	list_del(&client->list);
+
+	up(&device_sem);
+}
+EXPORT_SYMBOL(ib_unregister_client);
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns client context set with
+ * ib_set_client_data().
+ */
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+{
+	struct ib_client_data *context;
+	void *ret = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry(context, &device->client_data_list, list)
+		if (context->client == client) {
+			ret = context->data;
+			break;
+		}
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_client_data);
+
+/**
+ * ib_set_client_data - Get IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * ib_set_client_data() sets client context that can be retrieved with
+ * ib_get_client_data().
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+			void *data)
+{
+	struct ib_client_data *context;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry(context, &device->client_data_list, list)
+		if (context->client == client) {
+			context->data = data;
+			goto out;
+		}
+
+	printk(KERN_WARNING "No client context found for %s/%s\n",
+	       device->name, client->name);
+
+out:
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+}
+EXPORT_SYMBOL(ib_set_client_data);
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * ib_register_event_handler() registers an event handler that will be
+ * called back when asynchronous IB events occur (as defined in
+ * chapter 11 of the InfiniBand Architecture Specification).  This
+ * callback may occur in interrupt context.
+ */
+int ib_register_event_handler  (struct ib_event_handler *event_handler)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+	list_add_tail(&event_handler->list,
+		      &event_handler->device->event_handler_list);
+	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_register_event_handler);
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Unregister an event handler registered with
+ * ib_register_event_handler().
+ */
+int ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+	list_del(&event_handler->list);
+	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_unregister_event_handler);
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.
+ */
+void ib_dispatch_event(struct ib_event *event)
+{
+	unsigned long flags;
+	struct ib_event_handler *handler;
+
+	spin_lock_irqsave(&event->device->event_handler_lock, flags);
+
+	list_for_each_entry(handler, &event->device->event_handler_list, list)
+		handler->handler(handler, event);
+
+	spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
+}
+EXPORT_SYMBOL(ib_dispatch_event);
+
+/**
+ * ib_query_device - Query IB device attributes
+ * @device:Device to query
+ * @device_attr:Device attributes
+ *
+ * ib_query_device() returns the attributes of a device through the
+ * @device_attr pointer.
+ */
+int ib_query_device(struct ib_device *device,
+		    struct ib_device_attr *device_attr)
+{
+	return device->query_device(device, device_attr);
+}
+EXPORT_SYMBOL(ib_query_device);
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Port attributes
+ *
+ * ib_query_port() returns the attributes of a port through the
+ * @port_attr pointer.
+ */
+int ib_query_port(struct ib_device *device,
+		  u8 port_num,
+		  struct ib_port_attr *port_attr)
+{
+	return device->query_port(device, port_num, port_attr);
+}
+EXPORT_SYMBOL(ib_query_port);
+
+/**
+ * ib_query_gid - Get GID table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:GID table index to query
+ * @gid:Returned GID
+ *
+ * ib_query_gid() fetches the specified GID table entry.
+ */
+int ib_query_gid(struct ib_device *device,
+		 u8 port_num, int index, union ib_gid *gid)
+{
+	return device->query_gid(device, port_num, index, gid);
+}
+EXPORT_SYMBOL(ib_query_gid);
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * ib_query_pkey() fetches the specified P_Key table entry.
+ */
+int ib_query_pkey(struct ib_device *device,
+		  u8 port_num, u16 index, u16 *pkey)
+{
+	return device->query_pkey(device, port_num, index, pkey);
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * ib_modify_device() changes a device's attributes as specified by
+ * the @device_modify_mask and @device_modify structure.
+ */
+int ib_modify_device(struct ib_device *device,
+		     int device_modify_mask,
+		     struct ib_device_modify *device_modify)
+{
+	return device->modify_device(device, device_modify_mask,
+				     device_modify);
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ *   to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * ib_modify_port() changes a port's attributes as specified by the
+ * @port_modify_mask and @port_modify structure.
+ */
+int ib_modify_port(struct ib_device *device,
+		   u8 port_num, int port_modify_mask,
+		   struct ib_port_modify *port_modify)
+{
+	return device->modify_port(device, port_num, port_modify_mask,
+				   port_modify);
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+static int __init ib_core_init(void)
+{
+	int ret;
+
+	ret = ib_sysfs_setup();
+	if (ret)
+		printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+
+	ret = ib_cache_setup();
+	if (ret) {
+		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+		ib_sysfs_cleanup();
+	}
+
+	return ret;
+}
+
+static void __exit ib_core_cleanup(void)
+{
+	ib_cache_cleanup();
+	ib_sysfs_cleanup();
+}
+
+module_init(ib_core_init);
+module_exit(ib_core_cleanup);
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
new file mode 100644
index 000000000000..2e9469f18926
--- /dev/null
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: fmr_pool.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/kthread.h>
+
+#include <ib_fmr_pool.h>
+
+#include "core_priv.h"
+
+enum {
+	IB_FMR_MAX_REMAPS = 32,
+
+	IB_FMR_HASH_BITS  = 8,
+	IB_FMR_HASH_SIZE  = 1 << IB_FMR_HASH_BITS,
+	IB_FMR_HASH_MASK  = IB_FMR_HASH_SIZE - 1
+};
+
+/*
+ * If an FMR is not in use, then the list member will point to either
+ * its pool's free_list (if the FMR can be mapped again; that is,
+ * remap_count < IB_FMR_MAX_REMAPS) or its pool's dirty_list (if the
+ * FMR needs to be unmapped before being remapped).  In either of
+ * these cases it is a bug if the ref_count is not 0.  In other words,
+ * if ref_count is > 0, then the list member must not be linked into
+ * either free_list or dirty_list.
+ *
+ * The cache_node member is used to link the FMR into a cache bucket
+ * (if caching is enabled).  This is independent of the reference
+ * count of the FMR.  When a valid FMR is released, its ref_count is
+ * decremented, and if ref_count reaches 0, the FMR is placed in
+ * either free_list or dirty_list as appropriate.  However, it is not
+ * removed from the cache and may be "revived" if a call to
+ * ib_fmr_register_physical() occurs before the FMR is remapped.  In
+ * this case we just increment the ref_count and remove the FMR from
+ * free_list/dirty_list.
+ *
+ * Before we remap an FMR from free_list, we remove it from the cache
+ * (to prevent another user from obtaining a stale FMR).  When an FMR
+ * is released, we add it to the tail of the free list, so that our
+ * cache eviction policy is "least recently used."
+ *
+ * All manipulation of ref_count, list and cache_node is protected by
+ * pool_lock to maintain consistency.
+ */
+
+struct ib_fmr_pool {
+	spinlock_t                pool_lock;
+
+	int                       pool_size;
+	int                       max_pages;
+	int                       dirty_watermark;
+	int                       dirty_len;
+	struct list_head          free_list;
+	struct list_head          dirty_list;
+	struct hlist_head        *cache_bucket;
+
+	void                     (*flush_function)(struct ib_fmr_pool *pool,
+						   void *              arg);
+	void                     *flush_arg;
+
+	struct task_struct       *thread;
+
+	atomic_t                  req_ser;
+	atomic_t                  flush_ser;
+
+	wait_queue_head_t         force_wait;
+};
+
+static inline u32 ib_fmr_hash(u64 first_page)
+{
+	return jhash_2words((u32) first_page,
+			    (u32) (first_page >> 32),
+			    0);
+}
+
+/* Caller must hold pool_lock */
+static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
+						      u64 *page_list,
+						      int  page_list_len,
+						      u64  io_virtual_address)
+{
+	struct hlist_head *bucket;
+	struct ib_pool_fmr *fmr;
+	struct hlist_node *pos;
+
+	if (!pool->cache_bucket)
+		return NULL;
+
+	bucket = pool->cache_bucket + ib_fmr_hash(*page_list);
+
+	hlist_for_each_entry(fmr, pos, bucket, cache_node)
+		if (io_virtual_address == fmr->io_virtual_address &&
+		    page_list_len      == fmr->page_list_len      &&
+		    !memcmp(page_list, fmr->page_list,
+			    page_list_len * sizeof *page_list))
+			return fmr;
+
+	return NULL;
+}
+
+static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
+{
+	int                 ret;
+	struct ib_pool_fmr *fmr;
+	LIST_HEAD(unmap_list);
+	LIST_HEAD(fmr_list);
+
+	spin_lock_irq(&pool->pool_lock);
+
+	list_for_each_entry(fmr, &pool->dirty_list, list) {
+		hlist_del_init(&fmr->cache_node);
+		fmr->remap_count = 0;
+		list_add_tail(&fmr->fmr->list, &fmr_list);
+
+#ifdef DEBUG
+		if (fmr->ref_count !=0) {
+			printk(KERN_WARNING "Unmapping FMR 0x%08x with ref count %d",
+			       fmr, fmr->ref_count);
+		}
+#endif
+	}
+
+	list_splice(&pool->dirty_list, &unmap_list);
+	INIT_LIST_HEAD(&pool->dirty_list);
+	pool->dirty_len = 0;
+
+	spin_unlock_irq(&pool->pool_lock);
+
+	if (list_empty(&unmap_list)) {
+		return;
+	}
+
+	ret = ib_unmap_fmr(&fmr_list);
+	if (ret)
+		printk(KERN_WARNING "ib_unmap_fmr returned %d", ret);
+
+	spin_lock_irq(&pool->pool_lock);
+	list_splice(&unmap_list, &pool->free_list);
+	spin_unlock_irq(&pool->pool_lock);
+}
+
+static int ib_fmr_cleanup_thread(void *pool_ptr)
+{
+	struct ib_fmr_pool *pool = pool_ptr;
+
+	do {
+		if (pool->dirty_len >= pool->dirty_watermark ||
+		    atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
+			ib_fmr_batch_release(pool);
+
+			atomic_inc(&pool->flush_ser);
+			wake_up_interruptible(&pool->force_wait);
+
+			if (pool->flush_function)
+				pool->flush_function(pool, pool->flush_arg);
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (pool->dirty_len < pool->dirty_watermark &&
+		    atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
+		    !kthread_should_stop())
+			schedule();
+		__set_current_state(TASK_RUNNING);
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
+/**
+ * ib_create_fmr_pool - Create an FMR pool
+ * @pd:Protection domain for FMRs
+ * @params:FMR pool parameters
+ *
+ * Create a pool of FMRs.  Return value is pointer to new pool or
+ * error code if creation failed.
+ */
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
+				       struct ib_fmr_pool_param *params)
+{
+	struct ib_device   *device;
+	struct ib_fmr_pool *pool;
+	int i;
+	int ret;
+
+	if (!params)
+		return ERR_PTR(-EINVAL);
+
+	device = pd->device;
+	if (!device->alloc_fmr    || !device->dealloc_fmr  ||
+	    !device->map_phys_fmr || !device->unmap_fmr) {
+		printk(KERN_WARNING "Device %s does not support fast memory regions",
+		       device->name);
+		return ERR_PTR(-ENOSYS);
+	}
+
+	pool = kmalloc(sizeof *pool, GFP_KERNEL);
+	if (!pool) {
+		printk(KERN_WARNING "couldn't allocate pool struct");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pool->cache_bucket   = NULL;
+
+	pool->flush_function = params->flush_function;
+	pool->flush_arg      = params->flush_arg;
+
+	INIT_LIST_HEAD(&pool->free_list);
+	INIT_LIST_HEAD(&pool->dirty_list);
+
+	if (params->cache) {
+		pool->cache_bucket =
+			kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
+				GFP_KERNEL);
+		if (!pool->cache_bucket) {
+			printk(KERN_WARNING "Failed to allocate cache in pool");
+			ret = -ENOMEM;
+			goto out_free_pool;
+		}
+
+		for (i = 0; i < IB_FMR_HASH_SIZE; ++i)
+			INIT_HLIST_HEAD(pool->cache_bucket + i);
+	}
+
+	pool->pool_size       = 0;
+	pool->max_pages       = params->max_pages_per_fmr;
+	pool->dirty_watermark = params->dirty_watermark;
+	pool->dirty_len       = 0;
+	spin_lock_init(&pool->pool_lock);
+	atomic_set(&pool->req_ser,   0);
+	atomic_set(&pool->flush_ser, 0);
+	init_waitqueue_head(&pool->force_wait);
+
+	pool->thread = kthread_create(ib_fmr_cleanup_thread,
+				      pool,
+				      "ib_fmr(%s)",
+				      device->name);
+	if (IS_ERR(pool->thread)) {
+		printk(KERN_WARNING "couldn't start cleanup thread");
+		ret = PTR_ERR(pool->thread);
+		goto out_free_pool;
+	}
+
+	{
+		struct ib_pool_fmr *fmr;
+		struct ib_fmr_attr attr = {
+			.max_pages = params->max_pages_per_fmr,
+			.max_maps  = IB_FMR_MAX_REMAPS,
+			.page_size = PAGE_SHIFT
+		};
+
+		for (i = 0; i < params->pool_size; ++i) {
+			fmr = kmalloc(sizeof *fmr + params->max_pages_per_fmr * sizeof (u64),
+				      GFP_KERNEL);
+			if (!fmr) {
+				printk(KERN_WARNING "failed to allocate fmr struct "
+				       "for FMR %d", i);
+				goto out_fail;
+			}
+
+			fmr->pool             = pool;
+			fmr->remap_count      = 0;
+			fmr->ref_count        = 0;
+			INIT_HLIST_NODE(&fmr->cache_node);
+
+			fmr->fmr = ib_alloc_fmr(pd, params->access, &attr);
+			if (IS_ERR(fmr->fmr)) {
+				printk(KERN_WARNING "fmr_create failed for FMR %d", i);
+				kfree(fmr);
+				goto out_fail;
+			}
+
+			list_add_tail(&fmr->list, &pool->free_list);
+			++pool->pool_size;
+		}
+	}
+
+	return pool;
+
+ out_free_pool:
+	kfree(pool->cache_bucket);
+	kfree(pool);
+
+	return ERR_PTR(ret);
+
+ out_fail:
+	ib_destroy_fmr_pool(pool);
+
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_fmr_pool);
+
+/**
+ * ib_destroy_fmr_pool - Free FMR pool
+ * @pool:FMR pool to free
+ *
+ * Destroy an FMR pool and free all associated resources.
+ */
+int ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
+{
+	struct ib_pool_fmr *fmr;
+	struct ib_pool_fmr *tmp;
+	int                 i;
+
+	kthread_stop(pool->thread);
+	ib_fmr_batch_release(pool);
+
+	i = 0;
+	list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) {
+		ib_dealloc_fmr(fmr->fmr);
+		list_del(&fmr->list);
+		kfree(fmr);
+		++i;
+	}
+
+	if (i < pool->pool_size)
+		printk(KERN_WARNING "pool still has %d regions registered",
+		       pool->pool_size - i);
+
+	kfree(pool->cache_bucket);
+	kfree(pool);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_destroy_fmr_pool);
+
+/**
+ * ib_flush_fmr_pool - Invalidate all unmapped FMRs
+ * @pool:FMR pool to flush
+ *
+ * Ensure that all unmapped FMRs are fully invalidated.
+ */
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
+{
+	int serial;
+
+	atomic_inc(&pool->req_ser);
+	/*
+	 * It's OK if someone else bumps req_ser again here -- we'll
+	 * just wait a little longer.
+	 */
+	serial = atomic_read(&pool->req_ser);
+
+	wake_up_process(pool->thread);
+
+	if (wait_event_interruptible(pool->force_wait,
+				     atomic_read(&pool->flush_ser) -
+				     atomic_read(&pool->req_ser) >= 0))
+		return -EINTR;
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_flush_fmr_pool);
+
+/**
+ * ib_fmr_pool_map_phys -
+ * @pool:FMR pool to allocate FMR from
+ * @page_list:List of pages to map
+ * @list_len:Number of pages in @page_list
+ * @io_virtual_address:I/O virtual address for new FMR
+ *
+ * Map an FMR from an FMR pool.
+ */
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+					 u64                *page_list,
+					 int                 list_len,
+					 u64                *io_virtual_address)
+{
+	struct ib_fmr_pool *pool = pool_handle;
+	struct ib_pool_fmr *fmr;
+	unsigned long       flags;
+	int                 result;
+
+	if (list_len < 1 || list_len > pool->max_pages)
+		return ERR_PTR(-EINVAL);
+
+	spin_lock_irqsave(&pool->pool_lock, flags);
+	fmr = ib_fmr_cache_lookup(pool,
+				  page_list,
+				  list_len,
+				  *io_virtual_address);
+	if (fmr) {
+		/* found in cache */
+		++fmr->ref_count;
+		if (fmr->ref_count == 1) {
+			list_del(&fmr->list);
+		}
+
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+		return fmr;
+	}
+
+	if (list_empty(&pool->free_list)) {
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+		return ERR_PTR(-EAGAIN);
+	}
+
+	fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
+	list_del(&fmr->list);
+	hlist_del_init(&fmr->cache_node);
+	spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+	result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
+				 *io_virtual_address);
+
+	if (result) {
+		spin_lock_irqsave(&pool->pool_lock, flags);
+		list_add(&fmr->list, &pool->free_list);
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+		printk(KERN_WARNING "fmr_map returns %d",
+		       result);
+
+		return ERR_PTR(result);
+	}
+
+	++fmr->remap_count;
+	fmr->ref_count = 1;
+
+	if (pool->cache_bucket) {
+		fmr->io_virtual_address = *io_virtual_address;
+		fmr->page_list_len      = list_len;
+		memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list));
+
+		spin_lock_irqsave(&pool->pool_lock, flags);
+		hlist_add_head(&fmr->cache_node,
+			       pool->cache_bucket + ib_fmr_hash(fmr->page_list[0]));
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+	}
+
+	return fmr;
+}
+EXPORT_SYMBOL(ib_fmr_pool_map_phys);
+
+/**
+ * ib_fmr_pool_unmap - Unmap FMR
+ * @fmr:FMR to unmap
+ *
+ * Unmap an FMR.  The FMR mapping may remain valid until the FMR is
+ * reused (or until ib_flush_fmr_pool() is called).
+ */
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
+{
+	struct ib_fmr_pool *pool;
+	unsigned long flags;
+
+	pool = fmr->pool;
+
+	spin_lock_irqsave(&pool->pool_lock, flags);
+
+	--fmr->ref_count;
+	if (!fmr->ref_count) {
+		if (fmr->remap_count < IB_FMR_MAX_REMAPS) {
+			list_add_tail(&fmr->list, &pool->free_list);
+		} else {
+			list_add_tail(&fmr->list, &pool->dirty_list);
+			++pool->dirty_len;
+			wake_up_process(pool->thread);
+		}
+	}
+
+#ifdef DEBUG
+	if (fmr->ref_count < 0)
+		printk(KERN_WARNING "FMR %p has ref count %d < 0",
+		       fmr, fmr->ref_count);
+#endif
+
+	spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_fmr_pool_unmap);
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
new file mode 100644
index 000000000000..4ec7fff29b5d
--- /dev/null
+++ b/drivers/infiniband/core/mad.c
@@ -0,0 +1,2714 @@
+/*
+ * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mad.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+
+#include <ib_mad.h>
+
+#include "mad_priv.h"
+#include "smi.h"
+#include "agent.h"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("kernel IB MAD API");
+MODULE_AUTHOR("Hal Rosenstock");
+MODULE_AUTHOR("Sean Hefty");
+
+
+kmem_cache_t *ib_mad_cache;
+static struct list_head ib_mad_port_list;
+static u32 ib_mad_client_id = 0;
+
+/* Port list lock */
+static spinlock_t ib_mad_port_list_lock;
+
+
+/* Forward declarations */
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+			 struct ib_mad_reg_req *mad_reg_req);
+static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
+static struct ib_mad_agent_private *find_mad_agent(
+					struct ib_mad_port_private *port_priv,
+					struct ib_mad *mad, int solicited);
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+				    struct ib_mad_private *mad);
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
+static void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+				    struct ib_mad_send_wc *mad_send_wc);
+static void timeout_sends(void *data);
+static void cancel_sends(void *data);
+static void local_completions(void *data);
+static int solicited_mad(struct ib_mad *mad);
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			      struct ib_mad_agent_private *agent_priv,
+			      u8 mgmt_class);
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			   struct ib_mad_agent_private *agent_priv);
+
+/*
+ * Returns a ib_mad_port_private structure or NULL for a device/port
+ * Assumes ib_mad_port_list_lock is being held
+ */
+static inline struct ib_mad_port_private *
+__ib_get_mad_port(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *entry;
+
+	list_for_each_entry(entry, &ib_mad_port_list, port_list) {
+		if (entry->device == device && entry->port_num == port_num)
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * Wrapper function to return a ib_mad_port_private structure or NULL
+ * for a device/port
+ */
+static inline struct ib_mad_port_private *
+ib_get_mad_port(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	entry = __ib_get_mad_port(device, port_num);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	return entry;
+}
+
+static inline u8 convert_mgmt_class(u8 mgmt_class)
+{
+	/* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */
+	return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ?
+		0 : mgmt_class;
+}
+
+static int get_spl_qp_index(enum ib_qp_type qp_type)
+{
+	switch (qp_type)
+	{
+	case IB_QPT_SMI:
+		return 0;
+	case IB_QPT_GSI:
+		return 1;
+	default:
+		return -1;
+	}
+}
+
+static int vendor_class_index(u8 mgmt_class)
+{
+	return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START;
+}
+
+static int is_vendor_class(u8 mgmt_class)
+{
+	if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) ||
+	    (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END))
+		return 0;
+	return 1;
+}
+
+static int is_vendor_oui(char *oui)
+{
+	if (oui[0] || oui[1] || oui[2])
+		return 1;
+	return 0;
+}
+
+static int is_vendor_method_in_use(
+		struct ib_mad_mgmt_vendor_class *vendor_class,
+		struct ib_mad_reg_req *mad_reg_req)
+{
+	struct ib_mad_mgmt_method_table *method;
+	int i;
+
+	for (i = 0; i < MAX_MGMT_OUI; i++) {
+		if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) {
+			method = vendor_class->method_table[i];
+			if (method) {
+				if (method_in_use(&method, mad_reg_req))
+					return 1;
+				else
+					break;
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * ib_register_mad_agent - Register to send/receive MADs
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   struct ib_mad_reg_req *mad_reg_req,
+					   u8 rmpp_version,
+					   ib_mad_send_handler send_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_reg_req *reg_req = NULL;
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+	struct ib_mad_mgmt_vendor_class *vendor_class;
+	struct ib_mad_mgmt_method_table *method;
+	int ret2, qpn;
+	unsigned long flags;
+	u8 mgmt_class, vclass;
+
+	/* Validate parameters */
+	qpn = get_spl_qp_index(qp_type);
+	if (qpn == -1)
+		goto error1;
+
+	if (rmpp_version)
+		goto error1;	/* XXX: until RMPP implemented */
+
+	/* Validate MAD registration request if supplied */
+	if (mad_reg_req) {
+		if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION)
+			goto error1;
+		if (!recv_handler)
+			goto error1;
+		if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
+			/*
+			 * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
+			 * one in this range currently allowed
+			 */
+			if (mad_reg_req->mgmt_class !=
+			    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+				goto error1;
+		} else if (mad_reg_req->mgmt_class == 0) {
+			/*
+			 * Class 0 is reserved in IBA and is used for
+			 * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+			 */
+			goto error1;
+		} else if (is_vendor_class(mad_reg_req->mgmt_class)) {
+			/*
+			 * If class is in "new" vendor range,
+			 * ensure supplied OUI is not zero
+			 */
+			if (!is_vendor_oui(mad_reg_req->oui))
+				goto error1;
+		}
+		/* Make sure class supplied is consistent with QP type */
+		if (qp_type == IB_QPT_SMI) {
+			if ((mad_reg_req->mgmt_class !=
+					IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
+			    (mad_reg_req->mgmt_class !=
+					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+				goto error1;
+		} else {
+			if ((mad_reg_req->mgmt_class ==
+					IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+			    (mad_reg_req->mgmt_class ==
+					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+				goto error1;
+		}
+	} else {
+		/* No registration request supplied */
+		if (!send_handler)
+			goto error1;
+	}
+
+	/* Validate device and port */
+	port_priv = ib_get_mad_port(device, port_num);
+	if (!port_priv) {
+		ret = ERR_PTR(-ENODEV);
+		goto error1;
+	}
+
+	/* Allocate structures */
+	mad_agent_priv = kmalloc(sizeof *mad_agent_priv, GFP_KERNEL);
+	if (!mad_agent_priv) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error1;
+	}
+
+	if (mad_reg_req) {
+		reg_req = kmalloc(sizeof *reg_req, GFP_KERNEL);
+		if (!reg_req) {
+			ret = ERR_PTR(-ENOMEM);
+			goto error2;
+		}
+		/* Make a copy of the MAD registration request */
+		memcpy(reg_req, mad_reg_req, sizeof *reg_req);
+	}
+
+	/* Now, fill in the various structures */
+	memset(mad_agent_priv, 0, sizeof *mad_agent_priv);
+	mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
+	mad_agent_priv->reg_req = reg_req;
+	mad_agent_priv->rmpp_version = rmpp_version;
+	mad_agent_priv->agent.device = device;
+	mad_agent_priv->agent.recv_handler = recv_handler;
+	mad_agent_priv->agent.send_handler = send_handler;
+	mad_agent_priv->agent.context = context;
+	mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
+	mad_agent_priv->agent.port_num = port_num;
+
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+	mad_agent_priv->agent.hi_tid = ++ib_mad_client_id;
+
+	/*
+	 * Make sure MAD registration (if supplied)
+	 * is non overlapping with any existing ones
+	 */
+	if (mad_reg_req) {
+		mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class);
+		if (!is_vendor_class(mgmt_class)) {
+			class = port_priv->version[mad_reg_req->
+						   mgmt_class_version].class;
+			if (class) {
+				method = class->method_table[mgmt_class];
+				if (method) {
+					if (method_in_use(&method,
+							   mad_reg_req))
+						goto error3;
+				}
+			}
+			ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv,
+						  mgmt_class);
+		} else {
+			/* "New" vendor class range */
+			vendor = port_priv->version[mad_reg_req->
+						    mgmt_class_version].vendor;
+			if (vendor) {
+				vclass = vendor_class_index(mgmt_class);
+				vendor_class = vendor->vendor_class[vclass];
+				if (vendor_class) {
+					if (is_vendor_method_in_use(
+							vendor_class,
+							mad_reg_req))
+						goto error3;
+				}
+			}
+			ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv);
+		}
+		if (ret2) {
+			ret = ERR_PTR(ret2);
+			goto error3;
+		}
+	}
+
+	/* Add mad agent into port's agent list */
+	list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	spin_lock_init(&mad_agent_priv->lock);
+	INIT_LIST_HEAD(&mad_agent_priv->send_list);
+	INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+	INIT_WORK(&mad_agent_priv->timed_work, timeout_sends, mad_agent_priv);
+	INIT_LIST_HEAD(&mad_agent_priv->local_list);
+	INIT_WORK(&mad_agent_priv->local_work, local_completions,
+		   mad_agent_priv);
+	INIT_LIST_HEAD(&mad_agent_priv->canceled_list);
+	INIT_WORK(&mad_agent_priv->canceled_work, cancel_sends, mad_agent_priv);
+	atomic_set(&mad_agent_priv->refcount, 1);
+	init_waitqueue_head(&mad_agent_priv->wait);
+
+	return &mad_agent_priv->agent;
+
+error3:
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+	kfree(reg_req);
+error2:
+	kfree(mad_agent_priv);
+error1:
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_agent);
+
+static inline int is_snooping_sends(int mad_snoop_flags)
+{
+	return (mad_snoop_flags &
+		(/*IB_MAD_SNOOP_POSTED_SENDS |
+		 IB_MAD_SNOOP_RMPP_SENDS |*/
+		 IB_MAD_SNOOP_SEND_COMPLETIONS /*|
+		 IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/));
+}
+
+static inline int is_snooping_recvs(int mad_snoop_flags)
+{
+	return (mad_snoop_flags &
+		(IB_MAD_SNOOP_RECVS /*|
+		 IB_MAD_SNOOP_RMPP_RECVS*/));
+}
+
+static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
+				struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	struct ib_mad_snoop_private **new_snoop_table;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	/* Check for empty slot in array. */
+	for (i = 0; i < qp_info->snoop_table_size; i++)
+		if (!qp_info->snoop_table[i])
+			break;
+
+	if (i == qp_info->snoop_table_size) {
+		/* Grow table. */
+		new_snoop_table = kmalloc(sizeof mad_snoop_priv *
+					  qp_info->snoop_table_size + 1,
+					  GFP_ATOMIC);
+		if (!new_snoop_table) {
+			i = -ENOMEM;
+			goto out;
+		}
+		if (qp_info->snoop_table) {
+			memcpy(new_snoop_table, qp_info->snoop_table,
+			       sizeof mad_snoop_priv *
+			       qp_info->snoop_table_size);
+			kfree(qp_info->snoop_table);
+		}
+		qp_info->snoop_table = new_snoop_table;
+		qp_info->snoop_table_size++;
+	}
+	qp_info->snoop_table[i] = mad_snoop_priv;
+	atomic_inc(&qp_info->snoop_count);
+out:
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+	return i;
+}
+
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   int mad_snoop_flags,
+					   ib_mad_snoop_handler snoop_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent *ret;
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	int qpn;
+
+	/* Validate parameters */
+	if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) ||
+	    (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) {
+		ret = ERR_PTR(-EINVAL);
+		goto error1;
+	}
+	qpn = get_spl_qp_index(qp_type);
+	if (qpn == -1) {
+		ret = ERR_PTR(-EINVAL);
+		goto error1;
+	}
+	port_priv = ib_get_mad_port(device, port_num);
+	if (!port_priv) {
+		ret = ERR_PTR(-ENODEV);
+		goto error1;
+	}
+	/* Allocate structures */
+	mad_snoop_priv = kmalloc(sizeof *mad_snoop_priv, GFP_KERNEL);
+	if (!mad_snoop_priv) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error1;
+	}
+
+	/* Now, fill in the various structures */
+	memset(mad_snoop_priv, 0, sizeof *mad_snoop_priv);
+	mad_snoop_priv->qp_info = &port_priv->qp_info[qpn];
+	mad_snoop_priv->agent.device = device;
+	mad_snoop_priv->agent.recv_handler = recv_handler;
+	mad_snoop_priv->agent.snoop_handler = snoop_handler;
+	mad_snoop_priv->agent.context = context;
+	mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp;
+	mad_snoop_priv->agent.port_num = port_num;
+	mad_snoop_priv->mad_snoop_flags = mad_snoop_flags;
+	init_waitqueue_head(&mad_snoop_priv->wait);
+	mad_snoop_priv->snoop_index = register_snoop_agent(
+						&port_priv->qp_info[qpn],
+						mad_snoop_priv);
+	if (mad_snoop_priv->snoop_index < 0) {
+		ret = ERR_PTR(mad_snoop_priv->snoop_index);
+		goto error2;
+	}
+
+	atomic_set(&mad_snoop_priv->refcount, 1);
+	return &mad_snoop_priv->agent;
+
+error2:
+	kfree(mad_snoop_priv);
+error1:
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_snoop);
+
+static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+
+	/* Note that we could still be handling received MADs */
+
+	/*
+	 * Canceling all sends results in dropping received response
+	 * MADs, preventing us from queuing additional work
+	 */
+	cancel_mads(mad_agent_priv);
+
+	port_priv = mad_agent_priv->qp_info->port_priv;
+
+	cancel_delayed_work(&mad_agent_priv->timed_work);
+	flush_workqueue(port_priv->wq);
+
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+	remove_mad_reg_req(mad_agent_priv);
+	list_del(&mad_agent_priv->agent_list);
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	/* XXX: Cleanup pending RMPP receives for this agent */
+
+	atomic_dec(&mad_agent_priv->refcount);
+	wait_event(mad_agent_priv->wait,
+		   !atomic_read(&mad_agent_priv->refcount));
+
+	if (mad_agent_priv->reg_req)
+		kfree(mad_agent_priv->reg_req);
+	kfree(mad_agent_priv);
+}
+
+static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	struct ib_mad_qp_info *qp_info;
+	unsigned long flags;
+
+	qp_info = mad_snoop_priv->qp_info;
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL;
+	atomic_dec(&qp_info->snoop_count);
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+
+	atomic_dec(&mad_snoop_priv->refcount);
+	wait_event(mad_snoop_priv->wait,
+		   !atomic_read(&mad_snoop_priv->refcount));
+
+	kfree(mad_snoop_priv);
+}
+
+/*
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_snoop_private *mad_snoop_priv;
+
+	/* If the TID is zero, the agent can only snoop. */
+	if (mad_agent->hi_tid) {
+		mad_agent_priv = container_of(mad_agent,
+					      struct ib_mad_agent_private,
+					      agent);
+		unregister_mad_agent(mad_agent_priv);
+	} else {
+		mad_snoop_priv = container_of(mad_agent,
+					      struct ib_mad_snoop_private,
+					      agent);
+		unregister_mad_snoop(mad_snoop_priv);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ib_unregister_mad_agent);
+
+static void dequeue_mad(struct ib_mad_list_head *mad_list)
+{
+	struct ib_mad_queue *mad_queue;
+	unsigned long flags;
+
+	BUG_ON(!mad_list->mad_queue);
+	mad_queue = mad_list->mad_queue;
+	spin_lock_irqsave(&mad_queue->lock, flags);
+	list_del(&mad_list->list);
+	mad_queue->count--;
+	spin_unlock_irqrestore(&mad_queue->lock, flags);
+}
+
+static void snoop_send(struct ib_mad_qp_info *qp_info,
+		       struct ib_send_wr *send_wr,
+		       struct ib_mad_send_wc *mad_send_wc,
+		       int mad_snoop_flags)
+{
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	for (i = 0; i < qp_info->snoop_table_size; i++) {
+		mad_snoop_priv = qp_info->snoop_table[i];
+		if (!mad_snoop_priv ||
+		    !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+			continue;
+
+		atomic_inc(&mad_snoop_priv->refcount);
+		spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+		mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent,
+						    send_wr, mad_send_wc);
+		if (atomic_dec_and_test(&mad_snoop_priv->refcount))
+			wake_up(&mad_snoop_priv->wait);
+		spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	}
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void snoop_recv(struct ib_mad_qp_info *qp_info,
+		       struct ib_mad_recv_wc *mad_recv_wc,
+		       int mad_snoop_flags)
+{
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	for (i = 0; i < qp_info->snoop_table_size; i++) {
+		mad_snoop_priv = qp_info->snoop_table[i];
+		if (!mad_snoop_priv ||
+		    !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+			continue;
+
+		atomic_inc(&mad_snoop_priv->refcount);
+		spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+		mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent,
+						   mad_recv_wc);
+		if (atomic_dec_and_test(&mad_snoop_priv->refcount))
+			wake_up(&mad_snoop_priv->wait);
+		spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	}
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void build_smp_wc(u64 wr_id, u16 slid, u16 pkey_index, u8 port_num,
+			 struct ib_wc *wc)
+{
+	memset(wc, 0, sizeof *wc);
+	wc->wr_id = wr_id;
+	wc->status = IB_WC_SUCCESS;
+	wc->opcode = IB_WC_RECV;
+	wc->pkey_index = pkey_index;
+	wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh);
+	wc->src_qp = IB_QP0;
+	wc->qp_num = IB_QP0;
+	wc->slid = slid;
+	wc->sl = 0;
+	wc->dlid_path_bits = 0;
+	wc->port_num = port_num;
+}
+
+/*
+ * Return 0 if SMP is to be sent
+ * Return 1 if SMP was consumed locally (whether or not solicited)
+ * Return < 0 if error
+ */
+static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
+				  struct ib_smp *smp,
+				  struct ib_send_wr *send_wr)
+{
+	int ret, solicited;
+	unsigned long flags;
+	struct ib_mad_local_private *local;
+	struct ib_mad_private *mad_priv;
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent_private *recv_mad_agent = NULL;
+	struct ib_device *device = mad_agent_priv->agent.device;
+	u8 port_num = mad_agent_priv->agent.port_num;
+	struct ib_wc mad_wc;
+
+	if (!smi_handle_dr_smp_send(smp, device->node_type, port_num)) {
+		ret = -EINVAL;
+		printk(KERN_ERR PFX "Invalid directed route\n");
+		goto out;
+	}
+	/* Check to post send on QP or process locally */
+	ret = smi_check_local_dr_smp(smp, device, port_num);
+	if (!ret || !device->process_mad)
+		goto out;
+
+	local = kmalloc(sizeof *local, GFP_ATOMIC);
+	if (!local) {
+		ret = -ENOMEM;
+		printk(KERN_ERR PFX "No memory for ib_mad_local_private\n");
+		goto out;
+	}
+	local->mad_priv = NULL;
+	local->recv_mad_agent = NULL;
+	mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC);
+	if (!mad_priv) {
+		ret = -ENOMEM;
+		printk(KERN_ERR PFX "No memory for local response MAD\n");
+		kfree(local);
+		goto out;
+	}
+
+	build_smp_wc(send_wr->wr_id, smp->dr_slid, send_wr->wr.ud.pkey_index,
+		     send_wr->wr.ud.port_num, &mad_wc);
+
+	/* No GRH for DR SMP */
+	ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
+				  (struct ib_mad *)smp,
+				  (struct ib_mad *)&mad_priv->mad);
+	switch (ret)
+	{
+	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
+		/*
+		 * See if response is solicited and
+		 * there is a recv handler
+		 */
+		if (solicited_mad(&mad_priv->mad.mad) &&
+		    mad_agent_priv->agent.recv_handler) {
+			local->mad_priv = mad_priv;
+			local->recv_mad_agent = mad_agent_priv;
+			/*
+			 * Reference MAD agent until receive
+			 * side of local completion handled
+			 */
+			atomic_inc(&mad_agent_priv->refcount);
+		} else
+			kmem_cache_free(ib_mad_cache, mad_priv);
+		break;
+	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
+		kmem_cache_free(ib_mad_cache, mad_priv);
+		break;
+	case IB_MAD_RESULT_SUCCESS:
+		/* Treat like an incoming receive MAD */
+		solicited = solicited_mad(&mad_priv->mad.mad);
+		port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
+					    mad_agent_priv->agent.port_num);
+		if (port_priv) {
+			mad_priv->mad.mad.mad_hdr.tid =
+				((struct ib_mad *)smp)->mad_hdr.tid;
+			recv_mad_agent = find_mad_agent(port_priv,
+						       &mad_priv->mad.mad,
+							solicited);
+		}
+		if (!port_priv || !recv_mad_agent) {
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			kfree(local);
+			ret = 0;
+			goto out;
+		}
+		local->mad_priv = mad_priv;
+		local->recv_mad_agent = recv_mad_agent;
+		break;
+	default:
+		kmem_cache_free(ib_mad_cache, mad_priv);
+		kfree(local);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	local->send_wr = *send_wr;
+	local->send_wr.sg_list = local->sg_list;
+	memcpy(local->sg_list, send_wr->sg_list,
+	       sizeof *send_wr->sg_list * send_wr->num_sge);
+	local->send_wr.next = NULL;
+	local->tid = send_wr->wr.ud.mad_hdr->tid;
+	local->wr_id = send_wr->wr_id;
+	/* Reference MAD agent until send side of local completion handled */
+	atomic_inc(&mad_agent_priv->refcount);
+	/* Queue local completion to local list */
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+	queue_work(mad_agent_priv->qp_info->port_priv->wq,
+		  &mad_agent_priv->local_work);
+	ret = 1;
+out:
+	return ret;
+}
+
+static int ib_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+		       struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_qp_info *qp_info;
+	struct ib_send_wr *bad_send_wr;
+	unsigned long flags;
+	int ret;
+
+	/* Replace user's WR ID with our own to find WR upon completion */
+	qp_info = mad_agent_priv->qp_info;
+	mad_send_wr->wr_id = mad_send_wr->send_wr.wr_id;
+	mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
+	mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+
+	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+	if (qp_info->send_queue.count++ < qp_info->send_queue.max_active) {
+		list_add_tail(&mad_send_wr->mad_list.list,
+			      &qp_info->send_queue.list);
+		spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+		ret = ib_post_send(mad_agent_priv->agent.qp,
+				   &mad_send_wr->send_wr, &bad_send_wr);
+		if (ret) {
+			printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret);
+			dequeue_mad(&mad_send_wr->mad_list);
+		}
+	} else {
+		list_add_tail(&mad_send_wr->mad_list.list,
+			      &qp_info->overflow_list);
+		spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+		ret = 0;
+	}
+	return ret;
+}
+
+/*
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ *  with the registered client
+ */
+int ib_post_send_mad(struct ib_mad_agent *mad_agent,
+		     struct ib_send_wr *send_wr,
+		     struct ib_send_wr **bad_send_wr)
+{
+	int ret = -EINVAL;
+	struct ib_mad_agent_private *mad_agent_priv;
+
+	/* Validate supplied parameters */
+	if (!bad_send_wr)
+		goto error1;
+
+	if (!mad_agent || !send_wr)
+		goto error2;
+
+	if (!mad_agent->send_handler)
+		goto error2;
+
+	mad_agent_priv = container_of(mad_agent,
+				      struct ib_mad_agent_private,
+				      agent);
+
+	/* Walk list of send WRs and post each on send list */
+	while (send_wr) {
+		unsigned long			flags;
+		struct ib_send_wr		*next_send_wr;
+		struct ib_mad_send_wr_private	*mad_send_wr;
+		struct ib_smp			*smp;
+
+		/* Validate more parameters */
+		if (send_wr->num_sge > IB_MAD_SEND_REQ_MAX_SG)
+			goto error2;
+
+		if (send_wr->wr.ud.timeout_ms && !mad_agent->recv_handler)
+			goto error2;
+
+		if (!send_wr->wr.ud.mad_hdr) {
+			printk(KERN_ERR PFX "MAD header must be supplied "
+			       "in WR %p\n", send_wr);
+			goto error2;
+		}
+
+		/*
+		 * Save pointer to next work request to post in case the
+		 * current one completes, and the user modifies the work
+		 * request associated with the completion
+		 */
+		next_send_wr = (struct ib_send_wr *)send_wr->next;
+
+		smp = (struct ib_smp *)send_wr->wr.ud.mad_hdr;
+		if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+			ret = handle_outgoing_dr_smp(mad_agent_priv, smp,
+						     send_wr);
+			if (ret < 0)		/* error */
+				goto error2;
+			else if (ret == 1)	/* locally consumed */
+				goto next;
+		}
+
+		/* Allocate MAD send WR tracking structure */
+		mad_send_wr = kmalloc(sizeof *mad_send_wr, GFP_ATOMIC);
+		if (!mad_send_wr) {
+			printk(KERN_ERR PFX "No memory for "
+			       "ib_mad_send_wr_private\n");
+			ret = -ENOMEM;
+			goto error2;
+		}
+
+		mad_send_wr->send_wr = *send_wr;
+		mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
+		memcpy(mad_send_wr->sg_list, send_wr->sg_list,
+		       sizeof *send_wr->sg_list * send_wr->num_sge);
+		mad_send_wr->send_wr.next = NULL;
+		mad_send_wr->tid = send_wr->wr.ud.mad_hdr->tid;
+		mad_send_wr->agent = mad_agent;
+		/* Timeout will be updated after send completes */
+		mad_send_wr->timeout = msecs_to_jiffies(send_wr->wr.
+							ud.timeout_ms);
+		mad_send_wr->retry = 0;
+		/* One reference for each work request to QP + response */
+		mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
+		mad_send_wr->status = IB_WC_SUCCESS;
+
+		/* Reference MAD agent until send completes */
+		atomic_inc(&mad_agent_priv->refcount);
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		list_add_tail(&mad_send_wr->agent_list,
+			      &mad_agent_priv->send_list);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		ret = ib_send_mad(mad_agent_priv, mad_send_wr);
+		if (ret) {
+			/* Fail send request */
+			spin_lock_irqsave(&mad_agent_priv->lock, flags);
+			list_del(&mad_send_wr->agent_list);
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			atomic_dec(&mad_agent_priv->refcount);
+			goto error2;
+		}
+next:
+		send_wr = next_send_wr;
+	}
+	return 0;
+
+error2:
+	*bad_send_wr = send_wr;
+error1:
+	return ret;
+}
+EXPORT_SYMBOL(ib_post_send_mad);
+
+/*
+ * ib_free_recv_mad - Returns data buffers used to receive
+ *  a MAD to the access layer
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_recv_buf *entry;
+	struct ib_mad_private_header *mad_priv_hdr;
+	struct ib_mad_private *priv;
+
+	mad_priv_hdr = container_of(mad_recv_wc,
+				    struct ib_mad_private_header,
+				    recv_wc);
+	priv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+
+	/*
+	 * Walk receive buffer list associated with this WC
+	 * No need to remove them from list of receive buffers
+	 */
+	list_for_each_entry(entry, &mad_recv_wc->recv_buf.list, list) {
+		/* Free previous receive buffer */
+		kmem_cache_free(ib_mad_cache, priv);
+		mad_priv_hdr = container_of(mad_recv_wc,
+					    struct ib_mad_private_header,
+					    recv_wc);
+		priv = container_of(mad_priv_hdr, struct ib_mad_private,
+				    header);
+	}
+
+	/* Free last buffer */
+	kmem_cache_free(ib_mad_cache, priv);
+}
+EXPORT_SYMBOL(ib_free_recv_mad);
+
+void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc,
+			  void *buf)
+{
+	printk(KERN_ERR PFX "ib_coalesce_recv_mad() not implemented yet\n");
+}
+EXPORT_SYMBOL(ib_coalesce_recv_mad);
+
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+					u8 rmpp_version,
+					ib_mad_send_handler send_handler,
+					ib_mad_recv_handler recv_handler,
+					void *context)
+{
+	return ERR_PTR(-EINVAL);	/* XXX: for now */
+}
+EXPORT_SYMBOL(ib_redirect_mad_qp);
+
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+		      struct ib_wc *wc)
+{
+	printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n");
+	return 0;
+}
+EXPORT_SYMBOL(ib_process_mad_wc);
+
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+			 struct ib_mad_reg_req *mad_reg_req)
+{
+	int i;
+
+	for (i = find_first_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS);
+	     i < IB_MGMT_MAX_METHODS;
+	     i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+			       1+i)) {
+		if ((*method)->agent[i]) {
+			printk(KERN_ERR PFX "Method %d already in use\n", i);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
+{
+	/* Allocate management method table */
+	*method = kmalloc(sizeof **method, GFP_ATOMIC);
+	if (!*method) {
+		printk(KERN_ERR PFX "No memory for "
+		       "ib_mad_mgmt_method_table\n");
+		return -ENOMEM;
+	}
+	/* Clear management method table */
+	memset(*method, 0, sizeof **method);
+
+	return 0;
+}
+
+/*
+ * Check to see if there are any methods still in use
+ */
+static int check_method_table(struct ib_mad_mgmt_method_table *method)
+{
+	int i;
+
+	for (i = 0; i < IB_MGMT_MAX_METHODS; i++)
+		if (method->agent[i])
+			return 1;
+	return 0;
+}
+
+/*
+ * Check to see if there are any method tables for this class still in use
+ */
+static int check_class_table(struct ib_mad_mgmt_class_table *class)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_CLASS; i++)
+		if (class->method_table[i])
+			return 1;
+	return 0;
+}
+
+static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_OUI; i++)
+		if (vendor_class->method_table[i])
+			return 1;
+	return 0;
+}
+
+static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
+			   char *oui)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_OUI; i++)
+                /* Is there matching OUI for this vendor class ? */
+                if (!memcmp(vendor_class->oui[i], oui, 3))
+			return i;
+
+	return -1;
+}
+
+static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++)
+		if (vendor->vendor_class[i])
+			return 1;
+
+	return 0;
+}
+
+static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method,
+				     struct ib_mad_agent_private *agent)
+{
+	int i;
+
+	/* Remove any methods for this mad agent */
+	for (i = 0; i < IB_MGMT_MAX_METHODS; i++) {
+		if (method->agent[i] == agent) {
+			method->agent[i] = NULL;
+		}
+	}
+}
+
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			      struct ib_mad_agent_private *agent_priv,
+			      u8 mgmt_class)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_class_table **class;
+	struct ib_mad_mgmt_method_table **method;
+	int i, ret;
+
+	port_priv = agent_priv->qp_info->port_priv;
+	class = &port_priv->version[mad_reg_req->mgmt_class_version].class;
+	if (!*class) {
+		/* Allocate management class table for "new" class version */
+		*class = kmalloc(sizeof **class, GFP_ATOMIC);
+		if (!*class) {
+			printk(KERN_ERR PFX "No memory for "
+			       "ib_mad_mgmt_class_table\n");
+			ret = -ENOMEM;
+			goto error1;
+		}
+		/* Clear management class table */
+		memset(*class, 0, sizeof(**class));
+		/* Allocate method table for this management class */
+		method = &(*class)->method_table[mgmt_class];
+		if ((ret = allocate_method_table(method)))
+			goto error2;
+	} else {
+		method = &(*class)->method_table[mgmt_class];
+		if (!*method) {
+			/* Allocate method table for this management class */
+			if ((ret = allocate_method_table(method)))
+				goto error1;
+		}
+	}
+
+	/* Now, make sure methods are not already in use */
+	if (method_in_use(method, mad_reg_req))
+		goto error3;
+
+	/* Finally, add in methods being registered */
+	for (i = find_first_bit(mad_reg_req->method_mask,
+				IB_MGMT_MAX_METHODS);
+	     i < IB_MGMT_MAX_METHODS;
+	     i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+			       1+i)) {
+		(*method)->agent[i] = agent_priv;
+	}
+	return 0;
+
+error3:
+	/* Remove any methods for this mad agent */
+	remove_methods_mad_agent(*method, agent_priv);
+	/* Now, check to see if there are any methods in use */
+	if (!check_method_table(*method)) {
+		/* If not, release management method table */
+		kfree(*method);
+		*method = NULL;
+	}
+	ret = -EINVAL;
+	goto error1;
+error2:
+	kfree(*class);
+	*class = NULL;
+error1:
+	return ret;
+}
+
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			   struct ib_mad_agent_private *agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_vendor_class_table **vendor_table;
+	struct ib_mad_mgmt_vendor_class_table *vendor = NULL;
+	struct ib_mad_mgmt_vendor_class *vendor_class = NULL;
+	struct ib_mad_mgmt_method_table **method;
+	int i, ret = -ENOMEM;
+	u8 vclass;
+
+	/* "New" vendor (with OUI) class */
+	vclass = vendor_class_index(mad_reg_req->mgmt_class);
+	port_priv = agent_priv->qp_info->port_priv;
+	vendor_table = &port_priv->version[
+				mad_reg_req->mgmt_class_version].vendor;
+	if (!*vendor_table) {
+		/* Allocate mgmt vendor class table for "new" class version */
+		vendor = kmalloc(sizeof *vendor, GFP_ATOMIC);
+		if (!vendor) {
+			printk(KERN_ERR PFX "No memory for "
+			       "ib_mad_mgmt_vendor_class_table\n");
+			goto error1;
+		}
+		/* Clear management vendor class table */
+		memset(vendor, 0, sizeof(*vendor));
+		*vendor_table = vendor;
+	}
+	if (!(*vendor_table)->vendor_class[vclass]) {
+		/* Allocate table for this management vendor class */
+		vendor_class = kmalloc(sizeof *vendor_class, GFP_ATOMIC);
+		if (!vendor_class) {
+			printk(KERN_ERR PFX "No memory for "
+			       "ib_mad_mgmt_vendor_class\n");
+			goto error2;
+		}
+		memset(vendor_class, 0, sizeof(*vendor_class));
+		(*vendor_table)->vendor_class[vclass] = vendor_class;
+	}
+	for (i = 0; i < MAX_MGMT_OUI; i++) {
+		/* Is there matching OUI for this vendor class ? */
+		if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i],
+			    mad_reg_req->oui, 3)) {
+			method = &(*vendor_table)->vendor_class[
+						vclass]->method_table[i];
+			BUG_ON(!*method);
+			goto check_in_use;
+		}
+	}
+	for (i = 0; i < MAX_MGMT_OUI; i++) {
+		/* OUI slot available ? */
+		if (!is_vendor_oui((*vendor_table)->vendor_class[
+				vclass]->oui[i])) {
+			method = &(*vendor_table)->vendor_class[
+				vclass]->method_table[i];
+			BUG_ON(*method);
+			/* Allocate method table for this OUI */
+			if ((ret = allocate_method_table(method)))
+				goto error3;
+			memcpy((*vendor_table)->vendor_class[vclass]->oui[i],
+			       mad_reg_req->oui, 3);
+			goto check_in_use;
+		}
+	}
+	printk(KERN_ERR PFX "All OUI slots in use\n");
+	goto error3;
+
+check_in_use:
+	/* Now, make sure methods are not already in use */
+	if (method_in_use(method, mad_reg_req))
+		goto error4;
+
+	/* Finally, add in methods being registered */
+	for (i = find_first_bit(mad_reg_req->method_mask,
+				IB_MGMT_MAX_METHODS);
+	     i < IB_MGMT_MAX_METHODS;
+	     i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+			       1+i)) {
+		(*method)->agent[i] = agent_priv;
+	}
+	return 0;
+
+error4:
+	/* Remove any methods for this mad agent */
+	remove_methods_mad_agent(*method, agent_priv);
+	/* Now, check to see if there are any methods in use */
+	if (!check_method_table(*method)) {
+		/* If not, release management method table */
+		kfree(*method);
+		*method = NULL;
+	}
+	ret = -EINVAL;
+error3:
+	if (vendor_class) {
+		(*vendor_table)->vendor_class[vclass] = NULL;
+		kfree(vendor_class);
+	}
+error2:
+	if (vendor) {
+		*vendor_table = NULL;
+		kfree(vendor);
+	}
+error1:
+	return ret;
+}
+
+static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_method_table *method;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+	struct ib_mad_mgmt_vendor_class *vendor_class;
+	int index;
+	u8 mgmt_class;
+
+	/*
+	 * Was MAD registration request supplied
+	 * with original registration ?
+	 */
+	if (!agent_priv->reg_req) {
+		goto out;
+	}
+
+	port_priv = agent_priv->qp_info->port_priv;
+	mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class);
+	class = port_priv->version[
+			agent_priv->reg_req->mgmt_class_version].class;
+	if (!class)
+		goto vendor_check;
+
+	method = class->method_table[mgmt_class];
+	if (method) {
+		/* Remove any methods for this mad agent */
+		remove_methods_mad_agent(method, agent_priv);
+		/* Now, check to see if there are any methods still in use */
+		if (!check_method_table(method)) {
+			/* If not, release management method table */
+			 kfree(method);
+			 class->method_table[mgmt_class] = NULL;
+			 /* Any management classes left ? */
+			if (!check_class_table(class)) {
+				/* If not, release management class table */
+				kfree(class);
+				port_priv->version[
+					agent_priv->reg_req->
+					mgmt_class_version].class = NULL;
+			}
+		}
+	}
+
+vendor_check:
+	if (!is_vendor_class(mgmt_class))
+		goto out;
+
+	/* normalize mgmt_class to vendor range 2 */
+	mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class);
+	vendor = port_priv->version[
+			agent_priv->reg_req->mgmt_class_version].vendor;
+
+	if (!vendor)
+		goto out;
+
+	vendor_class = vendor->vendor_class[mgmt_class];
+	if (vendor_class) {
+		index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui);
+		if (index < 0)
+			goto out;
+		method = vendor_class->method_table[index];
+		if (method) {
+			/* Remove any methods for this mad agent */
+			remove_methods_mad_agent(method, agent_priv);
+			/*
+			 * Now, check to see if there are
+			 * any methods still in use
+			 */
+			if (!check_method_table(method)) {
+				/* If not, release management method table */
+				kfree(method);
+				vendor_class->method_table[index] = NULL;
+				memset(vendor_class->oui[index], 0, 3);
+				/* Any OUIs left ? */
+				if (!check_vendor_class(vendor_class)) {
+					/* If not, release vendor class table */
+					kfree(vendor_class);
+					vendor->vendor_class[mgmt_class] = NULL;
+					/* Any other vendor classes left ? */
+					if (!check_vendor_table(vendor)) {
+						kfree(vendor);
+						port_priv->version[
+							agent_priv->reg_req->
+							mgmt_class_version].
+							vendor = NULL;
+					}
+				}
+			}
+		}
+	}
+
+out:
+	return;
+}
+
+static int response_mad(struct ib_mad *mad)
+{
+	/* Trap represses are responses although response bit is reset */
+	return ((mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) ||
+		(mad->mad_hdr.method & IB_MGMT_METHOD_RESP));
+}
+
+static int solicited_mad(struct ib_mad *mad)
+{
+	/* CM MADs are never solicited */
+	if (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CM) {
+		return 0;
+	}
+
+	/* XXX: Determine whether MAD is using RMPP */
+
+	/* Not using RMPP */
+	/* Is this MAD a response to a previous MAD ? */
+	return response_mad(mad);
+}
+
+static struct ib_mad_agent_private *
+find_mad_agent(struct ib_mad_port_private *port_priv,
+	       struct ib_mad *mad,
+	       int solicited)
+{
+	struct ib_mad_agent_private *mad_agent = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+
+	/*
+	 * Whether MAD was solicited determines type of routing to
+	 * MAD client.
+	 */
+	if (solicited) {
+		u32 hi_tid;
+		struct ib_mad_agent_private *entry;
+
+		/*
+		 * Routing is based on high 32 bits of transaction ID
+		 * of MAD.
+		 */
+		hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32;
+		list_for_each_entry(entry, &port_priv->agent_list,
+				    agent_list) {
+			if (entry->agent.hi_tid == hi_tid) {
+				mad_agent = entry;
+				break;
+			}
+		}
+	} else {
+		struct ib_mad_mgmt_class_table *class;
+		struct ib_mad_mgmt_method_table *method;
+		struct ib_mad_mgmt_vendor_class_table *vendor;
+		struct ib_mad_mgmt_vendor_class *vendor_class;
+		struct ib_vendor_mad *vendor_mad;
+		int index;
+
+		/*
+		 * Routing is based on version, class, and method
+		 * For "newer" vendor MADs, also based on OUI
+		 */
+		if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION)
+			goto out;
+		if (!is_vendor_class(mad->mad_hdr.mgmt_class)) {
+			class = port_priv->version[
+					mad->mad_hdr.class_version].class;
+			if (!class)
+				goto out;
+			method = class->method_table[convert_mgmt_class(
+							mad->mad_hdr.mgmt_class)];
+			if (method)
+				mad_agent = method->agent[mad->mad_hdr.method &
+							  ~IB_MGMT_METHOD_RESP];
+		} else {
+			vendor = port_priv->version[
+					mad->mad_hdr.class_version].vendor;
+			if (!vendor)
+				goto out;
+			vendor_class = vendor->vendor_class[vendor_class_index(
+						mad->mad_hdr.mgmt_class)];
+			if (!vendor_class)
+				goto out;
+			/* Find matching OUI */
+			vendor_mad = (struct ib_vendor_mad *)mad;
+			index = find_vendor_oui(vendor_class, vendor_mad->oui);
+			if (index == -1)
+				goto out;
+			method = vendor_class->method_table[index];
+			if (method) {
+				mad_agent = method->agent[mad->mad_hdr.method &
+							  ~IB_MGMT_METHOD_RESP];
+			}
+		}
+	}
+
+	if (mad_agent) {
+		if (mad_agent->agent.recv_handler)
+			atomic_inc(&mad_agent->refcount);
+		else {
+			printk(KERN_NOTICE PFX "No receive handler for client "
+			       "%p on port %d\n",
+			       &mad_agent->agent, port_priv->port_num);
+			mad_agent = NULL;
+		}
+	}
+out:
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	return mad_agent;
+}
+
+static int validate_mad(struct ib_mad *mad, u32 qp_num)
+{
+	int valid = 0;
+
+	/* Make sure MAD base version is understood */
+	if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
+		printk(KERN_ERR PFX "MAD received with unsupported base "
+		       "version %d\n", mad->mad_hdr.base_version);
+		goto out;
+	}
+
+	/* Filter SMI packets sent to other than QP0 */
+	if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+	    (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+		if (qp_num == 0)
+			valid = 1;
+	} else {
+		/* Filter GSI packets sent to QP0 */
+		if (qp_num != 0)
+			valid = 1;
+	}
+
+out:
+	return valid;
+}
+
+/*
+ * Return start of fully reassembled MAD, or NULL, if MAD isn't assembled yet
+ */
+static struct ib_mad_private *
+reassemble_recv(struct ib_mad_agent_private *mad_agent_priv,
+		struct ib_mad_private *recv)
+{
+	/* Until we have RMPP, all receives are reassembled!... */
+	INIT_LIST_HEAD(&recv->header.recv_wc.recv_buf.list);
+	return recv;
+}
+
+static struct ib_mad_send_wr_private*
+find_send_req(struct ib_mad_agent_private *mad_agent_priv,
+	      u64 tid)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+			    agent_list) {
+		if (mad_send_wr->tid == tid)
+			return mad_send_wr;
+	}
+
+	/*
+	 * It's possible to receive the response before we've
+	 * been notified that the send has completed
+	 */
+	list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
+			    agent_list) {
+		if (mad_send_wr->tid == tid && mad_send_wr->timeout) {
+			/* Verify request has not been canceled */
+			return (mad_send_wr->status == IB_WC_SUCCESS) ?
+				mad_send_wr : NULL;
+		}
+	}
+	return NULL;
+}
+
+static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
+				 struct ib_mad_private *recv,
+				 int solicited)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	unsigned long flags;
+
+	/* Fully reassemble receive before processing */
+	recv = reassemble_recv(mad_agent_priv, recv);
+	if (!recv) {
+		if (atomic_dec_and_test(&mad_agent_priv->refcount))
+			wake_up(&mad_agent_priv->wait);
+		return;
+	}
+
+	/* Complete corresponding request */
+	if (solicited) {
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		mad_send_wr = find_send_req(mad_agent_priv,
+					    recv->mad.mad.mad_hdr.tid);
+		if (!mad_send_wr) {
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			ib_free_recv_mad(&recv->header.recv_wc);
+			if (atomic_dec_and_test(&mad_agent_priv->refcount))
+				wake_up(&mad_agent_priv->wait);
+			return;
+		}
+		/* Timeout = 0 means that we won't wait for a response */
+		mad_send_wr->timeout = 0;
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		/* Defined behavior is to complete response before request */
+		recv->header.recv_wc.wc->wr_id = mad_send_wr->wr_id;
+		mad_agent_priv->agent.recv_handler(
+						&mad_agent_priv->agent,
+						&recv->header.recv_wc);
+		atomic_dec(&mad_agent_priv->refcount);
+
+		mad_send_wc.status = IB_WC_SUCCESS;
+		mad_send_wc.vendor_err = 0;
+		mad_send_wc.wr_id = mad_send_wr->wr_id;
+		ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+	} else {
+		mad_agent_priv->agent.recv_handler(
+						&mad_agent_priv->agent,
+						&recv->header.recv_wc);
+		if (atomic_dec_and_test(&mad_agent_priv->refcount))
+			wake_up(&mad_agent_priv->wait);
+	}
+}
+
+static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
+				     struct ib_wc *wc)
+{
+	struct ib_mad_qp_info *qp_info;
+	struct ib_mad_private_header *mad_priv_hdr;
+	struct ib_mad_private *recv, *response;
+	struct ib_mad_list_head *mad_list;
+	struct ib_mad_agent_private *mad_agent;
+	int solicited;
+
+	response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+	if (!response)
+		printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory "
+		       "for response buffer\n");
+
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	qp_info = mad_list->mad_queue->qp_info;
+	dequeue_mad(mad_list);
+
+	mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
+				    mad_list);
+	recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+	dma_unmap_single(port_priv->device->dma_device,
+			 pci_unmap_addr(&recv->header, mapping),
+			 sizeof(struct ib_mad_private) -
+			 sizeof(struct ib_mad_private_header),
+			 DMA_FROM_DEVICE);
+
+	/* Setup MAD receive work completion from "normal" work completion */
+	recv->header.recv_wc.wc = wc;
+	recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+	recv->header.recv_wc.recv_buf.mad = &recv->mad.mad;
+	recv->header.recv_wc.recv_buf.grh = &recv->grh;
+
+	if (atomic_read(&qp_info->snoop_count))
+		snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
+
+	/* Validate MAD */
+	if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num))
+		goto out;
+
+	if (recv->mad.mad.mad_hdr.mgmt_class ==
+	    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		if (!smi_handle_dr_smp_recv(&recv->mad.smp,
+					    port_priv->device->node_type,
+					    port_priv->port_num,
+					    port_priv->device->phys_port_cnt))
+			goto out;
+		if (!smi_check_forward_dr_smp(&recv->mad.smp))
+			goto local;
+		if (!smi_handle_dr_smp_send(&recv->mad.smp,
+					    port_priv->device->node_type,
+					    port_priv->port_num))
+			goto out;
+		if (!smi_check_local_dr_smp(&recv->mad.smp,
+					    port_priv->device,
+					    port_priv->port_num))
+			goto out;
+	}
+
+local:
+	/* Give driver "right of first refusal" on incoming MAD */
+	if (port_priv->device->process_mad) {
+		int ret;
+
+		if (!response) {
+			printk(KERN_ERR PFX "No memory for response MAD\n");
+			/*
+			 * Is it better to assume that
+			 * it wouldn't be processed ?
+			 */
+			goto out;
+		}
+
+		ret = port_priv->device->process_mad(port_priv->device, 0,
+						     port_priv->port_num,
+						     wc, &recv->grh,
+						     &recv->mad.mad,
+						     &response->mad.mad);
+		if (ret & IB_MAD_RESULT_SUCCESS) {
+			if (ret & IB_MAD_RESULT_CONSUMED)
+				goto out;
+			if (ret & IB_MAD_RESULT_REPLY) {
+				/* Send response */
+				if (!agent_send(response, &recv->grh, wc,
+						port_priv->device,
+						port_priv->port_num))
+					response = NULL;
+				goto out;
+			}
+		}
+	}
+
+	/* Determine corresponding MAD agent for incoming receive MAD */
+	solicited = solicited_mad(&recv->mad.mad);
+	mad_agent = find_mad_agent(port_priv, &recv->mad.mad, solicited);
+	if (mad_agent) {
+		ib_mad_complete_recv(mad_agent, recv, solicited);
+		/*
+		 * recv is freed up in error cases in ib_mad_complete_recv
+		 * or via recv_handler in ib_mad_complete_recv()
+		 */
+		recv = NULL;
+	}
+
+out:
+	/* Post another receive request for this QP */
+	if (response) {
+		ib_mad_post_receive_mads(qp_info, response);
+		if (recv)
+			kmem_cache_free(ib_mad_cache, recv);
+	} else
+		ib_mad_post_receive_mads(qp_info, recv);
+}
+
+static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	unsigned long delay;
+
+	if (list_empty(&mad_agent_priv->wait_list)) {
+		cancel_delayed_work(&mad_agent_priv->timed_work);
+	} else {
+		mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+					 struct ib_mad_send_wr_private,
+					 agent_list);
+
+		if (time_after(mad_agent_priv->timeout,
+			       mad_send_wr->timeout)) {
+			mad_agent_priv->timeout = mad_send_wr->timeout;
+			cancel_delayed_work(&mad_agent_priv->timed_work);
+			delay = mad_send_wr->timeout - jiffies;
+			if ((long)delay <= 0)
+				delay = 1;
+			queue_delayed_work(mad_agent_priv->qp_info->
+					   port_priv->wq,
+					   &mad_agent_priv->timed_work, delay);
+		}
+	}
+}
+
+static void wait_for_response(struct ib_mad_agent_private *mad_agent_priv,
+			      struct ib_mad_send_wr_private *mad_send_wr )
+{
+	struct ib_mad_send_wr_private *temp_mad_send_wr;
+	struct list_head *list_item;
+	unsigned long delay;
+
+	list_del(&mad_send_wr->agent_list);
+
+	delay = mad_send_wr->timeout;
+	mad_send_wr->timeout += jiffies;
+
+	list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
+		temp_mad_send_wr = list_entry(list_item,
+					      struct ib_mad_send_wr_private,
+					      agent_list);
+		if (time_after(mad_send_wr->timeout,
+			       temp_mad_send_wr->timeout))
+			break;
+	}
+	list_add(&mad_send_wr->agent_list, list_item);
+
+	/* Reschedule a work item if we have a shorter timeout */
+	if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) {
+		cancel_delayed_work(&mad_agent_priv->timed_work);
+		queue_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+				   &mad_agent_priv->timed_work, delay);
+	}
+}
+
+/*
+ * Process a send work completion
+ */
+static void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+				    struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_agent_private	*mad_agent_priv;
+	unsigned long			flags;
+
+	mad_agent_priv = container_of(mad_send_wr->agent,
+				      struct ib_mad_agent_private, agent);
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	if (mad_send_wc->status != IB_WC_SUCCESS &&
+	    mad_send_wr->status == IB_WC_SUCCESS) {
+		mad_send_wr->status = mad_send_wc->status;
+		mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+	}
+
+	if (--mad_send_wr->refcount > 0) {
+		if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
+		    mad_send_wr->status == IB_WC_SUCCESS) {
+			wait_for_response(mad_agent_priv, mad_send_wr);
+		}
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		return;
+	}
+
+	/* Remove send from MAD agent and notify client of completion */
+	list_del(&mad_send_wr->agent_list);
+	adjust_timeout(mad_agent_priv);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+	if (mad_send_wr->status != IB_WC_SUCCESS )
+		mad_send_wc->status = mad_send_wr->status;
+	mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+					    mad_send_wc);
+
+	/* Release reference on agent taken when sending */
+	if (atomic_dec_and_test(&mad_agent_priv->refcount))
+		wake_up(&mad_agent_priv->wait);
+
+	kfree(mad_send_wr);
+}
+
+static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
+				     struct ib_wc *wc)
+{
+	struct ib_mad_send_wr_private	*mad_send_wr, *queued_send_wr;
+	struct ib_mad_list_head		*mad_list;
+	struct ib_mad_qp_info		*qp_info;
+	struct ib_mad_queue		*send_queue;
+	struct ib_send_wr		*bad_send_wr;
+	unsigned long flags;
+	int ret;
+
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+				   mad_list);
+	send_queue = mad_list->mad_queue;
+	qp_info = send_queue->qp_info;
+
+retry:
+	queued_send_wr = NULL;
+	spin_lock_irqsave(&send_queue->lock, flags);
+	list_del(&mad_list->list);
+
+	/* Move queued send to the send queue */
+	if (send_queue->count-- > send_queue->max_active) {
+		mad_list = container_of(qp_info->overflow_list.next,
+					struct ib_mad_list_head, list);
+		queued_send_wr = container_of(mad_list,
+					struct ib_mad_send_wr_private,
+					mad_list);
+		list_del(&mad_list->list);
+		list_add_tail(&mad_list->list, &send_queue->list);
+	}
+	spin_unlock_irqrestore(&send_queue->lock, flags);
+
+	/* Restore client wr_id in WC and complete send */
+	wc->wr_id = mad_send_wr->wr_id;
+	if (atomic_read(&qp_info->snoop_count))
+		snoop_send(qp_info, &mad_send_wr->send_wr,
+			   (struct ib_mad_send_wc *)wc,
+			   IB_MAD_SNOOP_SEND_COMPLETIONS);
+	ib_mad_complete_send_wr(mad_send_wr, (struct ib_mad_send_wc *)wc);
+
+	if (queued_send_wr) {
+		ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
+				&bad_send_wr);
+		if (ret) {
+			printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret);
+			mad_send_wr = queued_send_wr;
+			wc->status = IB_WC_LOC_QP_OP_ERR;
+			goto retry;
+		}
+	}
+}
+
+static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_list_head *mad_list;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+	list_for_each_entry(mad_list, &qp_info->send_queue.list, list) {
+		mad_send_wr = container_of(mad_list,
+					   struct ib_mad_send_wr_private,
+					   mad_list);
+		mad_send_wr->retry = 1;
+	}
+	spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+}
+
+static void mad_error_handler(struct ib_mad_port_private *port_priv,
+			      struct ib_wc *wc)
+{
+	struct ib_mad_list_head *mad_list;
+	struct ib_mad_qp_info *qp_info;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	int ret;
+
+	/* Determine if failure was a send or receive */
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	qp_info = mad_list->mad_queue->qp_info;
+	if (mad_list->mad_queue == &qp_info->recv_queue)
+		/*
+		 * Receive errors indicate that the QP has entered the error
+		 * state - error handling/shutdown code will cleanup
+		 */
+		return;
+
+	/*
+	 * Send errors will transition the QP to SQE - move
+	 * QP to RTS and repost flushed work requests
+	 */
+	mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+				   mad_list);
+	if (wc->status == IB_WC_WR_FLUSH_ERR) {
+		if (mad_send_wr->retry) {
+			/* Repost send */
+			struct ib_send_wr *bad_send_wr;
+
+			mad_send_wr->retry = 0;
+			ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr,
+					&bad_send_wr);
+			if (ret)
+				ib_mad_send_done_handler(port_priv, wc);
+		} else
+			ib_mad_send_done_handler(port_priv, wc);
+	} else {
+		struct ib_qp_attr *attr;
+
+		/* Transition QP to RTS and fail offending send */
+		attr = kmalloc(sizeof *attr, GFP_KERNEL);
+		if (attr) {
+			attr->qp_state = IB_QPS_RTS;
+			attr->cur_qp_state = IB_QPS_SQE;
+			ret = ib_modify_qp(qp_info->qp, attr,
+					   IB_QP_STATE | IB_QP_CUR_STATE);
+			kfree(attr);
+			if (ret)
+				printk(KERN_ERR PFX "mad_error_handler - "
+				       "ib_modify_qp to RTS : %d\n", ret);
+			else
+				mark_sends_for_retry(qp_info);
+		}
+		ib_mad_send_done_handler(port_priv, wc);
+	}
+}
+
+/*
+ * IB MAD completion callback
+ */
+static void ib_mad_completion_handler(void *data)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_wc wc;
+
+	port_priv = (struct ib_mad_port_private *)data;
+	ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+
+	while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
+		if (wc.status == IB_WC_SUCCESS) {
+			switch (wc.opcode) {
+			case IB_WC_SEND:
+				ib_mad_send_done_handler(port_priv, &wc);
+				break;
+			case IB_WC_RECV:
+				ib_mad_recv_done_handler(port_priv, &wc);
+				break;
+			default:
+				BUG_ON(1);
+				break;
+			}
+		} else
+			mad_error_handler(port_priv, &wc);
+	}
+}
+
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+	unsigned long flags;
+	struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	struct list_head cancel_list;
+
+	INIT_LIST_HEAD(&cancel_list);
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+				 &mad_agent_priv->send_list, agent_list) {
+		if (mad_send_wr->status == IB_WC_SUCCESS) {
+ 			mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+			mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+		}
+	}
+
+	/* Empty wait list to prevent receives from finding a request */
+	list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+	/* Report all cancelled requests */
+	mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+	mad_send_wc.vendor_err = 0;
+
+	list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+				 &cancel_list, agent_list) {
+		mad_send_wc.wr_id = mad_send_wr->wr_id;
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+
+		list_del(&mad_send_wr->agent_list);
+		kfree(mad_send_wr);
+		atomic_dec(&mad_agent_priv->refcount);
+	}
+}
+
+static struct ib_mad_send_wr_private*
+find_send_by_wr_id(struct ib_mad_agent_private *mad_agent_priv,
+		   u64 wr_id)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+			    agent_list) {
+		if (mad_send_wr->wr_id == wr_id)
+			return mad_send_wr;
+	}
+
+	list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
+			    agent_list) {
+		if (mad_send_wr->wr_id == wr_id)
+			return mad_send_wr;
+	}
+	return NULL;
+}
+
+void cancel_sends(void *data)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	unsigned long flags;
+
+	mad_agent_priv = data;
+
+	mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+	mad_send_wc.vendor_err = 0;
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	while (!list_empty(&mad_agent_priv->canceled_list)) {
+		mad_send_wr = list_entry(mad_agent_priv->canceled_list.next,
+					 struct ib_mad_send_wr_private,
+					 agent_list);
+
+		list_del(&mad_send_wr->agent_list);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		mad_send_wc.wr_id = mad_send_wr->wr_id;
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+
+		kfree(mad_send_wr);
+		if (atomic_dec_and_test(&mad_agent_priv->refcount))
+			wake_up(&mad_agent_priv->wait);
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	}
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+		  u64 wr_id)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	unsigned long flags;
+
+	mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+				      agent);
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	mad_send_wr = find_send_by_wr_id(mad_agent_priv, wr_id);
+	if (!mad_send_wr) {
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		goto out;
+	}
+
+	if (mad_send_wr->status == IB_WC_SUCCESS)
+		mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+
+	if (mad_send_wr->refcount != 0) {
+		mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		goto out;
+	}
+
+	list_del(&mad_send_wr->agent_list);
+	list_add_tail(&mad_send_wr->agent_list, &mad_agent_priv->canceled_list);
+	adjust_timeout(mad_agent_priv);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+	queue_work(mad_agent_priv->qp_info->port_priv->wq,
+		   &mad_agent_priv->canceled_work);
+out:
+	return;
+}
+EXPORT_SYMBOL(ib_cancel_mad);
+
+static void local_completions(void *data)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_local_private *local;
+	struct ib_mad_agent_private *recv_mad_agent;
+	unsigned long flags;
+	struct ib_wc wc;
+	struct ib_mad_send_wc mad_send_wc;
+
+	mad_agent_priv = (struct ib_mad_agent_private *)data;
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	while (!list_empty(&mad_agent_priv->local_list)) {
+		local = list_entry(mad_agent_priv->local_list.next,
+				   struct ib_mad_local_private,
+				   completion_list);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		if (local->mad_priv) {
+			recv_mad_agent = local->recv_mad_agent;
+			if (!recv_mad_agent) {
+				printk(KERN_ERR PFX "No receive MAD agent for local completion\n");
+				kmem_cache_free(ib_mad_cache, local->mad_priv);
+				goto local_send_completion;
+			}
+
+			/*
+			 * Defined behavior is to complete response
+			 * before request
+			 */
+			build_smp_wc(local->wr_id, IB_LID_PERMISSIVE,
+				     0 /* pkey index */,
+				     recv_mad_agent->agent.port_num, &wc);
+
+			local->mad_priv->header.recv_wc.wc = &wc;
+			local->mad_priv->header.recv_wc.mad_len =
+						sizeof(struct ib_mad);
+			INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.recv_buf.list);
+			local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
+			local->mad_priv->header.recv_wc.recv_buf.mad =
+						&local->mad_priv->mad.mad;
+			if (atomic_read(&recv_mad_agent->qp_info->snoop_count))
+				snoop_recv(recv_mad_agent->qp_info,
+					  &local->mad_priv->header.recv_wc,
+					   IB_MAD_SNOOP_RECVS);
+			recv_mad_agent->agent.recv_handler(
+						&recv_mad_agent->agent,
+						&local->mad_priv->header.recv_wc);
+			spin_lock_irqsave(&recv_mad_agent->lock, flags);
+			atomic_dec(&recv_mad_agent->refcount);
+			spin_unlock_irqrestore(&recv_mad_agent->lock, flags);
+		}
+
+local_send_completion:
+		/* Complete send */
+		mad_send_wc.status = IB_WC_SUCCESS;
+		mad_send_wc.vendor_err = 0;
+		mad_send_wc.wr_id = local->wr_id;
+		if (atomic_read(&mad_agent_priv->qp_info->snoop_count))
+			snoop_send(mad_agent_priv->qp_info, &local->send_wr,
+				  &mad_send_wc,
+				   IB_MAD_SNOOP_SEND_COMPLETIONS);
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		list_del(&local->completion_list);
+		atomic_dec(&mad_agent_priv->refcount);
+		kfree(local);
+	}
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void timeout_sends(void *data)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	unsigned long flags, delay;
+
+	mad_agent_priv = (struct ib_mad_agent_private *)data;
+
+	mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
+	mad_send_wc.vendor_err = 0;
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	while (!list_empty(&mad_agent_priv->wait_list)) {
+		mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+					 struct ib_mad_send_wr_private,
+					 agent_list);
+
+		if (time_after(mad_send_wr->timeout, jiffies)) {
+			delay = mad_send_wr->timeout - jiffies;
+			if ((long)delay <= 0)
+				delay = 1;
+			queue_delayed_work(mad_agent_priv->qp_info->
+					   port_priv->wq,
+					   &mad_agent_priv->timed_work, delay);
+			break;
+		}
+
+		list_del(&mad_send_wr->agent_list);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		mad_send_wc.wr_id = mad_send_wr->wr_id;
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+
+		kfree(mad_send_wr);
+		atomic_dec(&mad_agent_priv->refcount);
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	}
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_thread_completion_handler(struct ib_cq *cq)
+{
+	struct ib_mad_port_private *port_priv = cq->cq_context;
+
+	queue_work(port_priv->wq, &port_priv->work);
+}
+
+/*
+ * Allocate receive MADs and post receive WRs for them
+ */
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+				    struct ib_mad_private *mad)
+{
+	unsigned long flags;
+	int post, ret;
+	struct ib_mad_private *mad_priv;
+	struct ib_sge sg_list;
+	struct ib_recv_wr recv_wr, *bad_recv_wr;
+	struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+
+	/* Initialize common scatter list fields */
+	sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
+	sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+
+	/* Initialize common receive WR fields */
+	recv_wr.next = NULL;
+	recv_wr.sg_list = &sg_list;
+	recv_wr.num_sge = 1;
+
+	do {
+		/* Allocate and map receive buffer */
+		if (mad) {
+			mad_priv = mad;
+			mad = NULL;
+		} else {
+			mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+			if (!mad_priv) {
+				printk(KERN_ERR PFX "No memory for receive buffer\n");
+				ret = -ENOMEM;
+				break;
+			}
+		}
+		sg_list.addr = dma_map_single(qp_info->port_priv->
+						device->dma_device,
+					&mad_priv->grh,
+					sizeof *mad_priv -
+						sizeof mad_priv->header,
+					DMA_FROM_DEVICE);
+		pci_unmap_addr_set(&mad_priv->header, mapping, sg_list.addr);
+		recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
+		mad_priv->header.mad_list.mad_queue = recv_queue;
+
+		/* Post receive WR */
+		spin_lock_irqsave(&recv_queue->lock, flags);
+		post = (++recv_queue->count < recv_queue->max_active);
+		list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+		spin_unlock_irqrestore(&recv_queue->lock, flags);
+		ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
+		if (ret) {
+			spin_lock_irqsave(&recv_queue->lock, flags);
+			list_del(&mad_priv->header.mad_list.list);
+			recv_queue->count--;
+			spin_unlock_irqrestore(&recv_queue->lock, flags);
+			dma_unmap_single(qp_info->port_priv->device->dma_device,
+					 pci_unmap_addr(&mad_priv->header,
+							mapping),
+					 sizeof *mad_priv -
+					   sizeof mad_priv->header,
+					 DMA_FROM_DEVICE);
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
+			break;
+		}
+	} while (post);
+
+	return ret;
+}
+
+/*
+ * Return all the posted receive MADs
+ */
+static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
+{
+	struct ib_mad_private_header *mad_priv_hdr;
+	struct ib_mad_private *recv;
+	struct ib_mad_list_head *mad_list;
+
+	while (!list_empty(&qp_info->recv_queue.list)) {
+
+		mad_list = list_entry(qp_info->recv_queue.list.next,
+				      struct ib_mad_list_head, list);
+		mad_priv_hdr = container_of(mad_list,
+					    struct ib_mad_private_header,
+					    mad_list);
+		recv = container_of(mad_priv_hdr, struct ib_mad_private,
+				    header);
+
+		/* Remove from posted receive MAD list */
+		list_del(&mad_list->list);
+
+		/* Undo PCI mapping */
+		dma_unmap_single(qp_info->port_priv->device->dma_device,
+				 pci_unmap_addr(&recv->header, mapping),
+				 sizeof(struct ib_mad_private) -
+				 sizeof(struct ib_mad_private_header),
+				 DMA_FROM_DEVICE);
+		kmem_cache_free(ib_mad_cache, recv);
+	}
+
+	qp_info->recv_queue.count = 0;
+}
+
+/*
+ * Start the port
+ */
+static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
+{
+	int ret, i;
+	struct ib_qp_attr *attr;
+	struct ib_qp *qp;
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ 	if (!attr) {
+		printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+		qp = port_priv->qp_info[i].qp;
+		/*
+		 * PKey index for QP1 is irrelevant but
+		 * one is needed for the Reset to Init transition
+		 */
+		attr->qp_state = IB_QPS_INIT;
+		attr->pkey_index = 0;
+		attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE |
+					     IB_QP_PKEY_INDEX | IB_QP_QKEY);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't change QP%d state to "
+			       "INIT: %d\n", i, ret);
+			goto out;
+		}
+
+		attr->qp_state = IB_QPS_RTR;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't change QP%d state to "
+			       "RTR: %d\n", i, ret);
+			goto out;
+		}
+
+		attr->qp_state = IB_QPS_RTS;
+		attr->sq_psn = IB_MAD_SEND_Q_PSN;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't change QP%d state to "
+			       "RTS: %d\n", i, ret);
+			goto out;
+		}
+	}
+
+	ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+	if (ret) {
+		printk(KERN_ERR PFX "Failed to request completion "
+		       "notification: %d\n", ret);
+		goto out;
+	}
+
+	for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+		ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't post receive WRs\n");
+			goto out;
+		}
+	}
+out:
+	kfree(attr);
+	return ret;
+}
+
+static void qp_event_handler(struct ib_event *event, void *qp_context)
+{
+	struct ib_mad_qp_info	*qp_info = qp_context;
+
+	/* It's worse than that! He's dead, Jim! */
+	printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n",
+		event->event, qp_info->qp->qp_num);
+}
+
+static void init_mad_queue(struct ib_mad_qp_info *qp_info,
+			   struct ib_mad_queue *mad_queue)
+{
+	mad_queue->qp_info = qp_info;
+	mad_queue->count = 0;
+	spin_lock_init(&mad_queue->lock);
+	INIT_LIST_HEAD(&mad_queue->list);
+}
+
+static void init_mad_qp(struct ib_mad_port_private *port_priv,
+			struct ib_mad_qp_info *qp_info)
+{
+	qp_info->port_priv = port_priv;
+	init_mad_queue(qp_info, &qp_info->send_queue);
+	init_mad_queue(qp_info, &qp_info->recv_queue);
+	INIT_LIST_HEAD(&qp_info->overflow_list);
+	spin_lock_init(&qp_info->snoop_lock);
+	qp_info->snoop_table = NULL;
+	qp_info->snoop_table_size = 0;
+	atomic_set(&qp_info->snoop_count, 0);
+}
+
+static int create_mad_qp(struct ib_mad_qp_info *qp_info,
+			 enum ib_qp_type qp_type)
+{
+	struct ib_qp_init_attr	qp_init_attr;
+	int ret;
+
+	memset(&qp_init_attr, 0, sizeof qp_init_attr);
+	qp_init_attr.send_cq = qp_info->port_priv->cq;
+	qp_init_attr.recv_cq = qp_info->port_priv->cq;
+	qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+	qp_init_attr.cap.max_send_wr = IB_MAD_QP_SEND_SIZE;
+	qp_init_attr.cap.max_recv_wr = IB_MAD_QP_RECV_SIZE;
+	qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG;
+	qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG;
+	qp_init_attr.qp_type = qp_type;
+	qp_init_attr.port_num = qp_info->port_priv->port_num;
+	qp_init_attr.qp_context = qp_info;
+	qp_init_attr.event_handler = qp_event_handler;
+	qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr);
+	if (IS_ERR(qp_info->qp)) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n",
+		       get_spl_qp_index(qp_type));
+		ret = PTR_ERR(qp_info->qp);
+		goto error;
+	}
+	/* Use minimum queue sizes unless the CQ is resized */
+	qp_info->send_queue.max_active = IB_MAD_QP_SEND_SIZE;
+	qp_info->recv_queue.max_active = IB_MAD_QP_RECV_SIZE;
+	return 0;
+
+error:
+	return ret;
+}
+
+static void destroy_mad_qp(struct ib_mad_qp_info *qp_info)
+{
+	ib_destroy_qp(qp_info->qp);
+	if (qp_info->snoop_table)
+		kfree(qp_info->snoop_table);
+}
+
+/*
+ * Open the port
+ * Create the QP, PD, MR, and CQ if needed
+ */
+static int ib_mad_port_open(struct ib_device *device,
+			    int port_num)
+{
+	int ret, cq_size;
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+	char name[sizeof "ib_mad123"];
+
+	/* First, check if port already open at MAD layer */
+	port_priv = ib_get_mad_port(device, port_num);
+	if (port_priv) {
+		printk(KERN_DEBUG PFX "%s port %d already open\n",
+		       device->name, port_num);
+		return 0;
+	}
+
+	/* Create new device info */
+	port_priv = kmalloc(sizeof *port_priv, GFP_KERNEL);
+	if (!port_priv) {
+		printk(KERN_ERR PFX "No memory for ib_mad_port_private\n");
+		return -ENOMEM;
+	}
+	memset(port_priv, 0, sizeof *port_priv);
+	port_priv->device = device;
+	port_priv->port_num = port_num;
+	spin_lock_init(&port_priv->reg_lock);
+	INIT_LIST_HEAD(&port_priv->agent_list);
+	init_mad_qp(port_priv, &port_priv->qp_info[0]);
+	init_mad_qp(port_priv, &port_priv->qp_info[1]);
+
+	cq_size = (IB_MAD_QP_SEND_SIZE + IB_MAD_QP_RECV_SIZE) * 2;
+	port_priv->cq = ib_create_cq(port_priv->device,
+				     (ib_comp_handler)
+					ib_mad_thread_completion_handler,
+				     NULL, port_priv, cq_size);
+	if (IS_ERR(port_priv->cq)) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n");
+		ret = PTR_ERR(port_priv->cq);
+		goto error3;
+	}
+
+	port_priv->pd = ib_alloc_pd(device);
+	if (IS_ERR(port_priv->pd)) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad PD\n");
+		ret = PTR_ERR(port_priv->pd);
+		goto error4;
+	}
+
+	port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(port_priv->mr)) {
+		printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n");
+		ret = PTR_ERR(port_priv->mr);
+		goto error5;
+	}
+
+	ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
+	if (ret)
+		goto error6;
+	ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+	if (ret)
+		goto error7;
+
+	snprintf(name, sizeof name, "ib_mad%d", port_num);
+	port_priv->wq = create_singlethread_workqueue(name);
+	if (!port_priv->wq) {
+		ret = -ENOMEM;
+		goto error8;
+	}
+	INIT_WORK(&port_priv->work, ib_mad_completion_handler, port_priv);
+
+	ret = ib_mad_port_start(port_priv);
+	if (ret) {
+		printk(KERN_ERR PFX "Couldn't start port\n");
+		goto error9;
+	}
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	list_add_tail(&port_priv->port_list, &ib_mad_port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+	return 0;
+
+error9:
+	destroy_workqueue(port_priv->wq);
+error8:
+	destroy_mad_qp(&port_priv->qp_info[1]);
+error7:
+	destroy_mad_qp(&port_priv->qp_info[0]);
+error6:
+	ib_dereg_mr(port_priv->mr);
+error5:
+	ib_dealloc_pd(port_priv->pd);
+error4:
+	ib_destroy_cq(port_priv->cq);
+	cleanup_recv_queue(&port_priv->qp_info[1]);
+	cleanup_recv_queue(&port_priv->qp_info[0]);
+error3:
+	kfree(port_priv);
+
+	return ret;
+}
+
+/*
+ * Close the port
+ * If there are no classes using the port, free the port
+ * resources (CQ, MR, PD, QP) and remove the port's info structure
+ */
+static int ib_mad_port_close(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	port_priv = __ib_get_mad_port(device, port_num);
+	if (port_priv == NULL) {
+		spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+		printk(KERN_ERR PFX "Port %d not found\n", port_num);
+		return -ENODEV;
+	}
+	list_del(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	/* Stop processing completions. */
+	flush_workqueue(port_priv->wq);
+	destroy_workqueue(port_priv->wq);
+	destroy_mad_qp(&port_priv->qp_info[1]);
+	destroy_mad_qp(&port_priv->qp_info[0]);
+	ib_dereg_mr(port_priv->mr);
+	ib_dealloc_pd(port_priv->pd);
+	ib_destroy_cq(port_priv->cq);
+	cleanup_recv_queue(&port_priv->qp_info[1]);
+	cleanup_recv_queue(&port_priv->qp_info[0]);
+	/* XXX: Handle deallocation of MAD registration tables */
+
+	kfree(port_priv);
+
+	return 0;
+}
+
+static void ib_mad_init_device(struct ib_device *device)
+{
+	int ret, num_ports, cur_port, i, ret2;
+
+	if (device->node_type == IB_NODE_SWITCH) {
+		num_ports = 1;
+		cur_port = 0;
+	} else {
+		num_ports = device->phys_port_cnt;
+		cur_port = 1;
+	}
+	for (i = 0; i < num_ports; i++, cur_port++) {
+		ret = ib_mad_port_open(device, cur_port);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't open %s port %d\n",
+			       device->name, cur_port);
+			goto error_device_open;
+		}
+		ret = ib_agent_port_open(device, cur_port);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't open %s port %d "
+			       "for agents\n",
+			       device->name, cur_port);
+			goto error_device_open;
+		}
+	}
+
+	goto error_device_query;
+
+error_device_open:
+	while (i > 0) {
+		cur_port--;
+		ret2 = ib_agent_port_close(device, cur_port);
+		if (ret2) {
+			printk(KERN_ERR PFX "Couldn't close %s port %d "
+			       "for agents\n",
+			       device->name, cur_port);
+		}
+		ret2 = ib_mad_port_close(device, cur_port);
+		if (ret2) {
+			printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+			       device->name, cur_port);
+		}
+		i--;
+	}
+
+error_device_query:
+	return;
+}
+
+static void ib_mad_remove_device(struct ib_device *device)
+{
+	int ret = 0, i, num_ports, cur_port, ret2;
+
+	if (device->node_type == IB_NODE_SWITCH) {
+		num_ports = 1;
+		cur_port = 0;
+	} else {
+		num_ports = device->phys_port_cnt;
+		cur_port = 1;
+	}
+	for (i = 0; i < num_ports; i++, cur_port++) {
+		ret2 = ib_agent_port_close(device, cur_port);
+		if (ret2) {
+			printk(KERN_ERR PFX "Couldn't close %s port %d "
+			       "for agents\n",
+			       device->name, cur_port);
+			if (!ret)
+				ret = ret2;
+		}
+		ret2 = ib_mad_port_close(device, cur_port);
+		if (ret2) {
+			printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+			       device->name, cur_port);
+			if (!ret)
+				ret = ret2;
+		}
+	}
+}
+
+static struct ib_client mad_client = {
+	.name   = "mad",
+	.add = ib_mad_init_device,
+	.remove = ib_mad_remove_device
+};
+
+static int __init ib_mad_init_module(void)
+{
+	int ret;
+
+	spin_lock_init(&ib_mad_port_list_lock);
+	spin_lock_init(&ib_agent_port_list_lock);
+
+	ib_mad_cache = kmem_cache_create("ib_mad",
+					 sizeof(struct ib_mad_private),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+					 NULL,
+					 NULL);
+	if (!ib_mad_cache) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad cache\n");
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	INIT_LIST_HEAD(&ib_mad_port_list);
+
+	if (ib_register_client(&mad_client)) {
+		printk(KERN_ERR PFX "Couldn't register ib_mad client\n");
+		ret = -EINVAL;
+		goto error2;
+	}
+
+	return 0;
+
+error2:
+	kmem_cache_destroy(ib_mad_cache);
+error1:
+	return ret;
+}
+
+static void __exit ib_mad_cleanup_module(void)
+{
+	ib_unregister_client(&mad_client);
+
+	if (kmem_cache_destroy(ib_mad_cache)) {
+		printk(KERN_DEBUG PFX "Failed to destroy ib_mad cache\n");
+	}
+}
+
+module_init(ib_mad_init_module);
+module_exit(ib_mad_cleanup_module);
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
new file mode 100644
index 000000000000..4ba9f726bf1d
--- /dev/null
+++ b/drivers/infiniband/core/mad_priv.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mad_priv.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef __IB_MAD_PRIV_H__
+#define __IB_MAD_PRIV_H__
+
+#include <linux/pci.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#include <ib_mad.h>
+#include <ib_smi.h>
+
+
+#define PFX "ib_mad: "
+
+#define IB_MAD_QPS_CORE		2 /* Always QP0 and QP1 as a minimum */
+
+/* QP and CQ parameters */
+#define IB_MAD_QP_SEND_SIZE	128
+#define IB_MAD_QP_RECV_SIZE	512
+#define IB_MAD_SEND_REQ_MAX_SG	2
+#define IB_MAD_RECV_REQ_MAX_SG	1
+
+#define IB_MAD_SEND_Q_PSN	0
+
+/* Registration table sizes */
+#define MAX_MGMT_CLASS		80
+#define MAX_MGMT_VERSION	8
+#define MAX_MGMT_OUI		8
+#define MAX_MGMT_VENDOR_RANGE2	(IB_MGMT_CLASS_VENDOR_RANGE2_END - \
+				IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
+
+struct ib_mad_list_head {
+	struct list_head list;
+	struct ib_mad_queue *mad_queue;
+};
+
+struct ib_mad_private_header {
+	struct ib_mad_list_head mad_list;
+	struct ib_mad_recv_wc recv_wc;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+} __attribute__ ((packed));
+
+struct ib_mad_private {
+	struct ib_mad_private_header header;
+	struct ib_grh grh;
+	union {
+		struct ib_mad mad;
+		struct ib_rmpp_mad rmpp_mad;
+		struct ib_smp smp;
+	} mad;
+} __attribute__ ((packed));
+
+struct ib_mad_agent_private {
+	struct list_head agent_list;
+	struct ib_mad_agent agent;
+	struct ib_mad_reg_req *reg_req;
+	struct ib_mad_qp_info *qp_info;
+
+	spinlock_t lock;
+	struct list_head send_list;
+	struct list_head wait_list;
+	struct work_struct timed_work;
+	unsigned long timeout;
+	struct list_head local_list;
+	struct work_struct local_work;
+	struct list_head canceled_list;
+	struct work_struct canceled_work;
+
+	atomic_t refcount;
+	wait_queue_head_t wait;
+	u8 rmpp_version;
+};
+
+struct ib_mad_snoop_private {
+	struct ib_mad_agent agent;
+	struct ib_mad_qp_info *qp_info;
+	int snoop_index;
+	int mad_snoop_flags;
+	atomic_t refcount;
+	wait_queue_head_t wait;
+};
+
+struct ib_mad_send_wr_private {
+	struct ib_mad_list_head mad_list;
+	struct list_head agent_list;
+	struct ib_mad_agent *agent;
+	struct ib_send_wr send_wr;
+	struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+	u64 wr_id;			/* client WR ID */
+	u64 tid;
+	unsigned long timeout;
+	int retry;
+	int refcount;
+	enum ib_wc_status status;
+};
+
+struct ib_mad_local_private {
+	struct list_head completion_list;
+	struct ib_mad_private *mad_priv;
+	struct ib_mad_agent_private *recv_mad_agent;
+	struct ib_send_wr send_wr;
+	struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+	u64 wr_id;			/* client WR ID */
+	u64 tid;
+};
+
+struct ib_mad_mgmt_method_table {
+	struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS];
+};
+
+struct ib_mad_mgmt_class_table {
+	struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS];
+};
+
+struct ib_mad_mgmt_vendor_class {
+	u8	oui[MAX_MGMT_OUI][3];
+	struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI];
+};
+
+struct ib_mad_mgmt_vendor_class_table {
+	struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2];
+};
+
+struct ib_mad_mgmt_version_table {
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+};
+
+struct ib_mad_queue {
+	spinlock_t lock;
+	struct list_head list;
+	int count;
+	int max_active;
+	struct ib_mad_qp_info *qp_info;
+};
+
+struct ib_mad_qp_info {
+	struct ib_mad_port_private *port_priv;
+	struct ib_qp *qp;
+	struct ib_mad_queue send_queue;
+	struct ib_mad_queue recv_queue;
+	struct list_head overflow_list;
+	spinlock_t snoop_lock;
+	struct ib_mad_snoop_private **snoop_table;
+	int snoop_table_size;
+	atomic_t snoop_count;
+};
+
+struct ib_mad_port_private {
+	struct list_head port_list;
+	struct ib_device *device;
+	int port_num;
+	struct ib_cq *cq;
+	struct ib_pd *pd;
+	struct ib_mr *mr;
+
+	spinlock_t reg_lock;
+	struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
+	struct list_head agent_list;
+	struct workqueue_struct *wq;
+	struct work_struct work;
+	struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+};
+
+extern kmem_cache_t *ib_mad_cache;
+
+#endif	/* __IB_MAD_PRIV_H__ */
diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c
new file mode 100644
index 000000000000..5f15feffeae2
--- /dev/null
+++ b/drivers/infiniband/core/packer.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: packer.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <ib_pack.h>
+
+static u64 value_read(int offset, int size, void *structure)
+{
+	switch (size) {
+	case 1: return                *(u8  *) (structure + offset);
+	case 2: return be16_to_cpup((__be16 *) (structure + offset));
+	case 4: return be32_to_cpup((__be32 *) (structure + offset));
+	case 8: return be64_to_cpup((__be64 *) (structure + offset));
+	default:
+		printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+		return 0;
+	}
+}
+
+/**
+ * ib_pack - Pack a structure into a buffer
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @structure:Structure to pack from
+ * @buf:Buffer to pack into
+ *
+ * ib_pack() packs a list of structure fields into a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_pack(const struct ib_field        *desc,
+	     int                           desc_len,
+	     void                         *structure,
+	     void                         *buf)
+{
+	int i;
+
+	for (i = 0; i < desc_len; ++i) {
+		if (desc[i].size_bits <= 32) {
+			int shift;
+			u32 val;
+			__be32 mask;
+			__be32 *addr;
+
+			shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+			if (desc[i].struct_size_bytes)
+				val = value_read(desc[i].struct_offset_bytes,
+						 desc[i].struct_size_bytes,
+						 structure) << shift;
+			else
+				val = 0;
+
+			mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift);
+			addr = (__be32 *) buf + desc[i].offset_words;
+			*addr = (*addr & ~mask) | (cpu_to_be32(val) & mask);
+		} else if (desc[i].size_bits <= 64) {
+			int shift;
+			u64 val;
+			__be64 mask;
+			__be64 *addr;
+
+			shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+			if (desc[i].struct_size_bytes)
+				val = value_read(desc[i].struct_offset_bytes,
+						 desc[i].struct_size_bytes,
+						 structure) << shift;
+			else
+				val = 0;
+
+			mask = cpu_to_be64(((1ull << desc[i].size_bits) - 1) << shift);
+			addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words);
+			*addr = (*addr & ~mask) | (cpu_to_be64(val) & mask);
+		} else {
+			if (desc[i].offset_bits % 8 ||
+			    desc[i].size_bits   % 8) {
+				printk(KERN_WARNING "Structure field %s of size %d "
+				       "bits is not byte-aligned\n",
+				       desc[i].field_name, desc[i].size_bits);
+			}
+
+			if (desc[i].struct_size_bytes)
+				memcpy(buf + desc[i].offset_words * 4 +
+				       desc[i].offset_bits / 8,
+				       structure + desc[i].struct_offset_bytes,
+				       desc[i].size_bits / 8);
+			else
+				memset(buf + desc[i].offset_words * 4 +
+				       desc[i].offset_bits / 8,
+				       0,
+				       desc[i].size_bits / 8);
+		}
+	}
+}
+EXPORT_SYMBOL(ib_pack);
+
+static void value_write(int offset, int size, u64 val, void *structure)
+{
+	switch (size * 8) {
+	case 8:  *(    u8 *) (structure + offset) = val; break;
+	case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break;
+	case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break;
+	case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break;
+	default:
+		printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+	}
+}
+
+/**
+ * ib_unpack - Unpack a buffer into a structure
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @buf:Buffer to unpack from
+ * @structure:Structure to unpack into
+ *
+ * ib_pack() unpacks a list of structure fields from a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_unpack(const struct ib_field        *desc,
+	       int                           desc_len,
+	       void                         *buf,
+	       void                         *structure)
+{
+	int i;
+
+	for (i = 0; i < desc_len; ++i) {
+		if (!desc[i].struct_size_bytes)
+			continue;
+
+		if (desc[i].size_bits <= 32) {
+			int shift;
+			u32  val;
+			u32  mask;
+			__be32 *addr;
+
+			shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+			mask = ((1ull << desc[i].size_bits) - 1) << shift;
+			addr = (__be32 *) buf + desc[i].offset_words;
+			val = (be32_to_cpup(addr) & mask) >> shift;
+			value_write(desc[i].struct_offset_bytes,
+				    desc[i].struct_size_bytes,
+				    val,
+				    structure);
+		} else if (desc[i].size_bits <= 64) {
+			int shift;
+			u64  val;
+			u64  mask;
+			__be64 *addr;
+
+			shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+			mask = ((1ull << desc[i].size_bits) - 1) << shift;
+			addr = (__be64 *) buf + desc[i].offset_words;
+			val = (be64_to_cpup(addr) & mask) >> shift;
+			value_write(desc[i].struct_offset_bytes,
+				    desc[i].struct_size_bytes,
+				    val,
+				    structure);
+		} else {
+			if (desc[i].offset_bits % 8 ||
+			    desc[i].size_bits   % 8) {
+				printk(KERN_WARNING "Structure field %s of size %d "
+				       "bits is not byte-aligned\n",
+				       desc[i].field_name, desc[i].size_bits);
+			}
+
+			memcpy(structure + desc[i].struct_offset_bytes,
+			       buf + desc[i].offset_words * 4 +
+			       desc[i].offset_bits / 8,
+			       desc[i].size_bits / 8);
+		}
+	}
+}
+EXPORT_SYMBOL(ib_unpack);
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
new file mode 100644
index 000000000000..d4233ee61c35
--- /dev/null
+++ b/drivers/infiniband/core/sa_query.c
@@ -0,0 +1,866 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: sa_query.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/idr.h>
+
+#include <ib_pack.h>
+#include <ib_sa.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand subnet administration query support");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * These two structures must be packed because they have 64-bit fields
+ * that are only 32-bit aligned.  64-bit architectures will lay them
+ * out wrong otherwise.  (And unfortunately they are sent on the wire
+ * so we can't change the layout)
+ */
+struct ib_sa_hdr {
+	u64			sm_key;
+	u16			attr_offset;
+	u16			reserved;
+	ib_sa_comp_mask		comp_mask;
+} __attribute__ ((packed));
+
+struct ib_sa_mad {
+	struct ib_mad_hdr	mad_hdr;
+	struct ib_rmpp_hdr	rmpp_hdr;
+	struct ib_sa_hdr	sa_hdr;
+	u8			data[200];
+} __attribute__ ((packed));
+
+struct ib_sa_sm_ah {
+	struct ib_ah        *ah;
+	struct kref          ref;
+};
+
+struct ib_sa_port {
+	struct ib_mad_agent *agent;
+	struct ib_mr        *mr;
+	struct ib_sa_sm_ah  *sm_ah;
+	struct work_struct   update_task;
+	spinlock_t           ah_lock;
+	u8                   port_num;
+};
+
+struct ib_sa_device {
+	int                     start_port, end_port;
+	struct ib_event_handler event_handler;
+	struct ib_sa_port port[0];
+};
+
+struct ib_sa_query {
+	void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+	void (*release)(struct ib_sa_query *);
+	struct ib_sa_port  *port;
+	struct ib_sa_mad   *mad;
+	struct ib_sa_sm_ah *sm_ah;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+	int                 id;
+};
+
+struct ib_sa_path_query {
+	void (*callback)(int, struct ib_sa_path_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+struct ib_sa_mcmember_query {
+	void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+static void ib_sa_add_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device);
+
+static struct ib_client sa_client = {
+	.name   = "sa",
+	.add    = ib_sa_add_one,
+	.remove = ib_sa_remove_one
+};
+
+static spinlock_t idr_lock;
+static DEFINE_IDR(query_idr);
+
+static spinlock_t tid_lock;
+static u32 tid;
+
+enum {
+	IB_SA_ATTR_CLASS_PORTINFO    = 0x01,
+	IB_SA_ATTR_NOTICE	     = 0x02,
+	IB_SA_ATTR_INFORM_INFO	     = 0x03,
+	IB_SA_ATTR_NODE_REC	     = 0x11,
+	IB_SA_ATTR_PORT_INFO_REC     = 0x12,
+	IB_SA_ATTR_SL2VL_REC	     = 0x13,
+	IB_SA_ATTR_SWITCH_REC	     = 0x14,
+	IB_SA_ATTR_LINEAR_FDB_REC    = 0x15,
+	IB_SA_ATTR_RANDOM_FDB_REC    = 0x16,
+	IB_SA_ATTR_MCAST_FDB_REC     = 0x17,
+	IB_SA_ATTR_SM_INFO_REC	     = 0x18,
+	IB_SA_ATTR_LINK_REC	     = 0x20,
+	IB_SA_ATTR_GUID_INFO_REC     = 0x30,
+	IB_SA_ATTR_SERVICE_REC	     = 0x31,
+	IB_SA_ATTR_PARTITION_REC     = 0x33,
+	IB_SA_ATTR_RANGE_REC	     = 0x34,
+	IB_SA_ATTR_PATH_REC	     = 0x35,
+	IB_SA_ATTR_VL_ARB_REC	     = 0x36,
+	IB_SA_ATTR_MC_GROUP_REC	     = 0x37,
+	IB_SA_ATTR_MC_MEMBER_REC     = 0x38,
+	IB_SA_ATTR_TRACE_REC	     = 0x39,
+	IB_SA_ATTR_MULTI_PATH_REC    = 0x3a,
+	IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b
+};
+
+#define PATH_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_path_rec, field),		\
+	.struct_size_bytes   = sizeof ((struct ib_sa_path_rec *) 0)->field,	\
+	.field_name          = "sa_path_rec:" #field
+
+static const struct ib_field path_rec_table[] = {
+	{ RESERVED,
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ PATH_REC_FIELD(dgid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ PATH_REC_FIELD(sgid),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ PATH_REC_FIELD(dlid),
+	  .offset_words = 10,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(slid),
+	  .offset_words = 10,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(raw_traffic),
+	  .offset_words = 11,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 11,
+	  .offset_bits  = 1,
+	  .size_bits    = 3 },
+	{ PATH_REC_FIELD(flow_label),
+	  .offset_words = 11,
+	  .offset_bits  = 4,
+	  .size_bits    = 20 },
+	{ PATH_REC_FIELD(hop_limit),
+	  .offset_words = 11,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ PATH_REC_FIELD(traffic_class),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ PATH_REC_FIELD(reversible),
+	  .offset_words = 12,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ PATH_REC_FIELD(numb_path),
+	  .offset_words = 12,
+	  .offset_bits  = 9,
+	  .size_bits    = 7 },
+	{ PATH_REC_FIELD(pkey),
+	  .offset_words = 12,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 13,
+	  .offset_bits  = 0,
+	  .size_bits    = 12 },
+	{ PATH_REC_FIELD(sl),
+	  .offset_words = 13,
+	  .offset_bits  = 12,
+	  .size_bits    = 4 },
+	{ PATH_REC_FIELD(mtu_selector),
+	  .offset_words = 13,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(mtu),
+	  .offset_words = 13,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(rate_selector),
+	  .offset_words = 13,
+	  .offset_bits  = 24,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(rate),
+	  .offset_words = 13,
+	  .offset_bits  = 26,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(packet_life_time_selector),
+	  .offset_words = 14,
+	  .offset_bits  = 0,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(packet_life_time),
+	  .offset_words = 14,
+	  .offset_bits  = 2,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(preference),
+	  .offset_words = 14,
+	  .offset_bits  = 8,
+	  .size_bits    = 8 },
+	{ RESERVED,
+	  .offset_words = 14,
+	  .offset_bits  = 16,
+	  .size_bits    = 48 },
+};
+
+#define MCMEMBER_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field),	\
+	.struct_size_bytes   = sizeof ((struct ib_sa_mcmember_rec *) 0)->field,	\
+	.field_name          = "sa_mcmember_rec:" #field
+
+static const struct ib_field mcmember_rec_table[] = {
+	{ MCMEMBER_REC_FIELD(mgid),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ MCMEMBER_REC_FIELD(port_gid),
+	  .offset_words = 4,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ MCMEMBER_REC_FIELD(qkey),
+	  .offset_words = 8,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ MCMEMBER_REC_FIELD(mlid),
+	  .offset_words = 9,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ MCMEMBER_REC_FIELD(mtu_selector),
+	  .offset_words = 9,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(mtu),
+	  .offset_words = 9,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(traffic_class),
+	  .offset_words = 9,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ MCMEMBER_REC_FIELD(pkey),
+	  .offset_words = 10,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ MCMEMBER_REC_FIELD(rate_selector),
+	  .offset_words = 10,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(rate),
+	  .offset_words = 10,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(packet_life_time_selector),
+	  .offset_words = 10,
+	  .offset_bits  = 24,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(packet_life_time),
+	  .offset_words = 10,
+	  .offset_bits  = 26,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(sl),
+	  .offset_words = 11,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(flow_label),
+	  .offset_words = 11,
+	  .offset_bits  = 4,
+	  .size_bits    = 20 },
+	{ MCMEMBER_REC_FIELD(hop_limit),
+	  .offset_words = 11,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ MCMEMBER_REC_FIELD(scope),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(join_state),
+	  .offset_words = 12,
+	  .offset_bits  = 4,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(proxy_join),
+	  .offset_words = 12,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 12,
+	  .offset_bits  = 9,
+	  .size_bits    = 23 },
+};
+
+static void free_sm_ah(struct kref *kref)
+{
+	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
+
+	ib_destroy_ah(sm_ah->ah);
+	kfree(sm_ah);
+}
+
+static void update_sm_ah(void *port_ptr)
+{
+	struct ib_sa_port *port = port_ptr;
+	struct ib_sa_sm_ah *new_ah, *old_ah;
+	struct ib_port_attr port_attr;
+	struct ib_ah_attr   ah_attr;
+
+	if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
+		printk(KERN_WARNING "Couldn't query port\n");
+		return;
+	}
+
+	new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL);
+	if (!new_ah) {
+		printk(KERN_WARNING "Couldn't allocate new SM AH\n");
+		return;
+	}
+
+	kref_init(&new_ah->ref);
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid     = port_attr.sm_lid;
+	ah_attr.sl       = port_attr.sm_sl;
+	ah_attr.port_num = port->port_num;
+
+	new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
+	if (IS_ERR(new_ah->ah)) {
+		printk(KERN_WARNING "Couldn't create new SM AH\n");
+		kfree(new_ah);
+		return;
+	}
+
+	spin_lock_irq(&port->ah_lock);
+	old_ah = port->sm_ah;
+	port->sm_ah = new_ah;
+	spin_unlock_irq(&port->ah_lock);
+
+	if (old_ah)
+		kref_put(&old_ah->ref, free_sm_ah);
+}
+
+static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event)
+{
+	if (event->event == IB_EVENT_PORT_ERR    ||
+	    event->event == IB_EVENT_PORT_ACTIVE ||
+	    event->event == IB_EVENT_LID_CHANGE  ||
+	    event->event == IB_EVENT_PKEY_CHANGE ||
+	    event->event == IB_EVENT_SM_CHANGE) {
+		struct ib_sa_device *sa_dev =
+			ib_get_client_data(event->device, &sa_client);
+
+		schedule_work(&sa_dev->port[event->element.port_num -
+					    sa_dev->start_port].update_task);
+	}
+}
+
+/**
+ * ib_sa_cancel_query - try to cancel an SA query
+ * @id:ID of query to cancel
+ * @query:query pointer to cancel
+ *
+ * Try to cancel an SA query.  If the id and query don't match up or
+ * the query has already completed, nothing is done.  Otherwise the
+ * query is canceled and will complete with a status of -EINTR.
+ */
+void ib_sa_cancel_query(int id, struct ib_sa_query *query)
+{
+	unsigned long flags;
+	struct ib_mad_agent *agent;
+
+	spin_lock_irqsave(&idr_lock, flags);
+	if (idr_find(&query_idr, id) != query) {
+		spin_unlock_irqrestore(&idr_lock, flags);
+		return;
+	}
+	agent = query->port->agent;
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	ib_cancel_mad(agent, id);
+}
+EXPORT_SYMBOL(ib_sa_cancel_query);
+
+static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
+{
+	unsigned long flags;
+
+	memset(mad, 0, sizeof *mad);
+
+	mad->mad_hdr.base_version  = IB_MGMT_BASE_VERSION;
+	mad->mad_hdr.mgmt_class    = IB_MGMT_CLASS_SUBN_ADM;
+	mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+
+	spin_lock_irqsave(&tid_lock, flags);
+	mad->mad_hdr.tid           =
+		cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++);
+	spin_unlock_irqrestore(&tid_lock, flags);
+}
+
+static int send_mad(struct ib_sa_query *query, int timeout_ms)
+{
+	struct ib_sa_port *port = query->port;
+	unsigned long flags;
+	int ret;
+	struct ib_sge      gather_list;
+	struct ib_send_wr *bad_wr, wr = {
+		.opcode      = IB_WR_SEND,
+		.sg_list     = &gather_list,
+		.num_sge     = 1,
+		.send_flags  = IB_SEND_SIGNALED,
+		.wr	     = {
+			 .ud = {
+				 .mad_hdr     = &query->mad->mad_hdr,
+				 .remote_qpn  = 1,
+				 .remote_qkey = IB_QP1_QKEY,
+				 .timeout_ms  = timeout_ms
+			 }
+		 }
+	};
+
+retry:
+	if (!idr_pre_get(&query_idr, GFP_ATOMIC))
+		return -ENOMEM;
+	spin_lock_irqsave(&idr_lock, flags);
+	ret = idr_get_new(&query_idr, query, &query->id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+	if (ret == -EAGAIN)
+		goto retry;
+	if (ret)
+		return ret;
+
+	wr.wr_id = query->id;
+
+	spin_lock_irqsave(&port->ah_lock, flags);
+	kref_get(&port->sm_ah->ref);
+	query->sm_ah = port->sm_ah;
+	wr.wr.ud.ah  = port->sm_ah->ah;
+	spin_unlock_irqrestore(&port->ah_lock, flags);
+
+	gather_list.addr   = dma_map_single(port->agent->device->dma_device,
+					    query->mad,
+					    sizeof (struct ib_sa_mad),
+					    DMA_TO_DEVICE);
+	gather_list.length = sizeof (struct ib_sa_mad);
+	gather_list.lkey   = port->mr->lkey;
+	pci_unmap_addr_set(query, mapping, gather_list.addr);
+
+	ret = ib_post_send_mad(port->agent, &wr, &bad_wr);
+	if (ret) {
+		dma_unmap_single(port->agent->device->dma_device,
+				 pci_unmap_addr(query, mapping),
+				 sizeof (struct ib_sa_mad),
+				 DMA_TO_DEVICE);
+		kref_put(&query->sm_ah->ref, free_sm_ah);
+		spin_lock_irqsave(&idr_lock, flags);
+		idr_remove(&query_idr, query->id);
+		spin_unlock_irqrestore(&idr_lock, flags);
+	}
+
+	return ret;
+}
+
+static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
+				    int status,
+				    struct ib_sa_mad *mad)
+{
+	struct ib_sa_path_query *query =
+		container_of(sa_query, struct ib_sa_path_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_path_rec rec;
+
+		ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(sa_query->mad);
+	kfree(container_of(sa_query, struct ib_sa_path_query, sa_query));
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path.  The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query.  The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code.  Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_device *device, u8 port_num,
+		       struct ib_sa_path_rec *rec,
+		       ib_sa_comp_mask comp_mask,
+		       int timeout_ms, int gfp_mask,
+		       void (*callback)(int status,
+					struct ib_sa_path_rec *resp,
+					void *context),
+		       void *context,
+		       struct ib_sa_query **sa_query)
+{
+	struct ib_sa_path_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port   = &sa_dev->port[port_num - sa_dev->start_port];
+	struct ib_mad_agent *agent  = port->agent;
+	int ret;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+	query->sa_query.mad = kmalloc(sizeof *query->sa_query.mad, gfp_mask);
+	if (!query->sa_query.mad) {
+		kfree(query);
+		return -ENOMEM;
+	}
+
+	query->callback = callback;
+	query->context  = context;
+
+	init_mad(query->sa_query.mad, agent);
+
+	query->sa_query.callback              = ib_sa_path_rec_callback;
+	query->sa_query.release               = ib_sa_path_rec_release;
+	query->sa_query.port                  = port;
+	query->sa_query.mad->mad_hdr.method   = IB_MGMT_METHOD_GET;
+	query->sa_query.mad->mad_hdr.attr_id  = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+	query->sa_query.mad->sa_hdr.comp_mask = comp_mask;
+
+	ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table),
+		rec, query->sa_query.mad->data);
+
+	*sa_query = &query->sa_query;
+	ret = send_mad(&query->sa_query, timeout_ms);
+	if (ret) {
+		*sa_query = NULL;
+		kfree(query->sa_query.mad);
+		kfree(query);
+	}
+
+	return ret ? ret : query->sa_query.id;
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
+
+static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
+					int status,
+					struct ib_sa_mad *mad)
+{
+	struct ib_sa_mcmember_query *query =
+		container_of(sa_query, struct ib_sa_mcmember_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_mcmember_rec rec;
+
+		ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(sa_query->mad);
+	kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query));
+}
+
+int ib_sa_mcmember_rec_query(struct ib_device *device, u8 port_num,
+			     u8 method,
+			     struct ib_sa_mcmember_rec *rec,
+			     ib_sa_comp_mask comp_mask,
+			     int timeout_ms, int gfp_mask,
+			     void (*callback)(int status,
+					      struct ib_sa_mcmember_rec *resp,
+					      void *context),
+			     void *context,
+			     struct ib_sa_query **sa_query)
+{
+	struct ib_sa_mcmember_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port   = &sa_dev->port[port_num - sa_dev->start_port];
+	struct ib_mad_agent *agent  = port->agent;
+	int ret;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+	query->sa_query.mad = kmalloc(sizeof *query->sa_query.mad, gfp_mask);
+	if (!query->sa_query.mad) {
+		kfree(query);
+		return -ENOMEM;
+	}
+
+	query->callback = callback;
+	query->context  = context;
+
+	init_mad(query->sa_query.mad, agent);
+
+	query->sa_query.callback              = ib_sa_mcmember_rec_callback;
+	query->sa_query.release               = ib_sa_mcmember_rec_release;
+	query->sa_query.port                  = port;
+	query->sa_query.mad->mad_hdr.method   = method;
+	query->sa_query.mad->mad_hdr.attr_id  = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+	query->sa_query.mad->sa_hdr.comp_mask = comp_mask;
+
+	ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+		rec, query->sa_query.mad->data);
+
+	*sa_query = &query->sa_query;
+	ret = send_mad(&query->sa_query, timeout_ms);
+	if (ret) {
+		*sa_query = NULL;
+		kfree(query->sa_query.mad);
+		kfree(query);
+	}
+
+	return ret ? ret : query->sa_query.id;
+}
+EXPORT_SYMBOL(ib_sa_mcmember_rec_query);
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_sa_query *query;
+	unsigned long flags;
+
+	spin_lock_irqsave(&idr_lock, flags);
+	query = idr_find(&query_idr, mad_send_wc->wr_id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	if (!query)
+		return;
+
+	switch (mad_send_wc->status) {
+	case IB_WC_SUCCESS:
+		/* No callback -- already got recv */
+		break;
+	case IB_WC_RESP_TIMEOUT_ERR:
+		query->callback(query, -ETIMEDOUT, NULL);
+		break;
+	case IB_WC_WR_FLUSH_ERR:
+		query->callback(query, -EINTR, NULL);
+		break;
+	default:
+		query->callback(query, -EIO, NULL);
+		break;
+	}
+
+	dma_unmap_single(agent->device->dma_device,
+			 pci_unmap_addr(query, mapping),
+			 sizeof (struct ib_sa_mad),
+			 DMA_TO_DEVICE);
+	kref_put(&query->sm_ah->ref, free_sm_ah);
+
+	query->release(query);
+
+	spin_lock_irqsave(&idr_lock, flags);
+	idr_remove(&query_idr, mad_send_wc->wr_id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+}
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_sa_query *query;
+	unsigned long flags;
+
+	spin_lock_irqsave(&idr_lock, flags);
+	query = idr_find(&query_idr, mad_recv_wc->wc->wr_id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	if (query) {
+		if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
+			query->callback(query,
+					mad_recv_wc->recv_buf.mad->mad_hdr.status ?
+					-EINVAL : 0,
+					(struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+		else
+			query->callback(query, -EIO, NULL);
+	}
+
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+static void ib_sa_add_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev;
+	int s, e, i;
+
+	if (device->node_type == IB_NODE_SWITCH)
+		s = e = 0;
+	else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	sa_dev = kmalloc(sizeof *sa_dev +
+			 (e - s + 1) * sizeof (struct ib_sa_port),
+			 GFP_KERNEL);
+	if (!sa_dev)
+		return;
+
+	sa_dev->start_port = s;
+	sa_dev->end_port   = e;
+
+	for (i = 0; i <= e - s; ++i) {
+		sa_dev->port[i].mr       = NULL;
+		sa_dev->port[i].sm_ah    = NULL;
+		sa_dev->port[i].port_num = i + s;
+		spin_lock_init(&sa_dev->port[i].ah_lock);
+
+		sa_dev->port[i].agent =
+			ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+					      NULL, 0, send_handler,
+					      recv_handler, sa_dev);
+		if (IS_ERR(sa_dev->port[i].agent))
+			goto err;
+
+		sa_dev->port[i].mr = ib_get_dma_mr(sa_dev->port[i].agent->qp->pd,
+						   IB_ACCESS_LOCAL_WRITE);
+		if (IS_ERR(sa_dev->port[i].mr)) {
+			ib_unregister_mad_agent(sa_dev->port[i].agent);
+			goto err;
+		}
+
+		INIT_WORK(&sa_dev->port[i].update_task,
+			  update_sm_ah, &sa_dev->port[i]);
+	}
+
+	ib_set_client_data(device, &sa_client, sa_dev);
+
+	/*
+	 * We register our event handler after everything is set up,
+	 * and then update our cached info after the event handler is
+	 * registered to avoid any problems if a port changes state
+	 * during our initialization.
+	 */
+
+	INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
+	if (ib_register_event_handler(&sa_dev->event_handler))
+		goto err;
+
+	for (i = 0; i <= e - s; ++i)
+		update_sm_ah(&sa_dev->port[i]);
+
+	return;
+
+err:
+	while (--i >= 0) {
+		ib_dereg_mr(sa_dev->port[i].mr);
+		ib_unregister_mad_agent(sa_dev->port[i].agent);
+	}
+
+	kfree(sa_dev);
+
+	return;
+}
+
+static void ib_sa_remove_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	int i;
+
+	if (!sa_dev)
+		return;
+
+	ib_unregister_event_handler(&sa_dev->event_handler);
+
+	for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
+		ib_unregister_mad_agent(sa_dev->port[i].agent);
+		kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
+	}
+
+	kfree(sa_dev);
+}
+
+static int __init ib_sa_init(void)
+{
+	int ret;
+
+	spin_lock_init(&idr_lock);
+	spin_lock_init(&tid_lock);
+
+	get_random_bytes(&tid, sizeof tid);
+
+	ret = ib_register_client(&sa_client);
+	if (ret)
+		printk(KERN_ERR "Couldn't register ib_sa client\n");
+
+	return ret;
+}
+
+static void __exit ib_sa_cleanup(void)
+{
+	ib_unregister_client(&sa_client);
+}
+
+module_init(ib_sa_init);
+module_exit(ib_sa_cleanup);
diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c
new file mode 100644
index 000000000000..b4b284324a33
--- /dev/null
+++ b/drivers/infiniband/core/smi.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: smi.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <ib_smi.h>
+#include "smi.h"
+
+/*
+ * Fixup a directed route SMP for sending
+ * Return 0 if the SMP should be discarded
+ */
+int smi_handle_dr_smp_send(struct ib_smp *smp,
+			   u8 node_type,
+			   int port_num)
+{
+	u8 hop_ptr, hop_cnt;
+
+	hop_ptr = smp->hop_ptr;
+	hop_cnt = smp->hop_cnt;
+
+	/* See section 14.2.2.2, Vol 1 IB spec */
+	if (!ib_get_smp_direction(smp)) {
+		/* C14-9:1 */
+		if (hop_cnt && hop_ptr == 0) {
+			smp->hop_ptr++;
+			return (smp->initial_path[smp->hop_ptr] ==
+				port_num);
+		}
+
+		/* C14-9:2 */
+		if (hop_ptr && hop_ptr < hop_cnt) {
+			if (node_type != IB_NODE_SWITCH)
+				return 0;
+
+			/* smp->return_path set when received */
+			smp->hop_ptr++;
+			return (smp->initial_path[smp->hop_ptr] ==
+				port_num);
+		}
+
+		/* C14-9:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == hop_cnt) {
+			/* smp->return_path set when received */
+			smp->hop_ptr++;
+			return (node_type == IB_NODE_SWITCH ||
+				smp->dr_dlid == IB_LID_PERMISSIVE);
+		}
+
+		/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+		/* C14-9:5 -- Fail unreasonable hop pointer */
+		return (hop_ptr == hop_cnt + 1);
+
+	} else {
+		/* C14-13:1 */
+		if (hop_cnt && hop_ptr == hop_cnt + 1) {
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num);
+		}
+
+		/* C14-13:2 */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+			if (node_type != IB_NODE_SWITCH)
+				return 0;
+
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num);
+		}
+
+		/* C14-13:3 -- at the end of the DR segment of path */
+		if (hop_ptr == 1) {
+			smp->hop_ptr--;
+			/* C14-13:3 -- SMPs destined for SM shouldn't be here */
+			return (node_type == IB_NODE_SWITCH ||
+				smp->dr_slid == IB_LID_PERMISSIVE);
+		}
+
+		/* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
+		if (hop_ptr == 0)
+			return 1;
+
+		/* C14-13:5 -- Check for unreasonable hop pointer */
+		return 0;
+	}
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return 0 if the SMP should be dropped
+ */
+int smi_handle_dr_smp_recv(struct ib_smp *smp,
+			   u8 node_type,
+			   int port_num,
+			   int phys_port_cnt)
+{
+	u8 hop_ptr, hop_cnt;
+
+	hop_ptr = smp->hop_ptr;
+	hop_cnt = smp->hop_cnt;
+
+	/* See section 14.2.2.2, Vol 1 IB spec */
+	if (!ib_get_smp_direction(smp)) {
+		/* C14-9:1 -- sender should have incremented hop_ptr */
+		if (hop_cnt && hop_ptr == 0)
+			return 0;
+
+		/* C14-9:2 -- intermediate hop */
+		if (hop_ptr && hop_ptr < hop_cnt) {
+			if (node_type != IB_NODE_SWITCH)
+				return 0;
+
+			smp->return_path[hop_ptr] = port_num;
+			/* smp->hop_ptr updated when sending */
+			return (smp->initial_path[hop_ptr+1] <= phys_port_cnt);
+		}
+
+		/* C14-9:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == hop_cnt) {
+			if (hop_cnt)
+				smp->return_path[hop_ptr] = port_num;
+			/* smp->hop_ptr updated when sending */
+
+			return (node_type == IB_NODE_SWITCH ||
+				smp->dr_dlid == IB_LID_PERMISSIVE);
+		}
+
+		/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+		/* C14-9:5 -- fail unreasonable hop pointer */
+		return (hop_ptr == hop_cnt + 1);
+
+	} else {
+
+		/* C14-13:1 */
+		if (hop_cnt && hop_ptr == hop_cnt + 1) {
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num);
+		}
+
+		/* C14-13:2 */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+			if (node_type != IB_NODE_SWITCH)
+				return 0;
+
+			/* smp->hop_ptr updated when sending */
+			return (smp->return_path[hop_ptr-1] <= phys_port_cnt);
+		}
+
+		/* C14-13:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == 1) {
+			if (smp->dr_slid == IB_LID_PERMISSIVE) {
+				/* giving SMP to SM - update hop_ptr */
+				smp->hop_ptr--;
+				return 1;
+			}
+			/* smp->hop_ptr updated when sending */
+			return (node_type == IB_NODE_SWITCH);
+		}
+
+		/* C14-13:4 -- hop_ptr = 0 -> give to SM */
+		/* C14-13:5 -- Check for unreasonable hop pointer */
+		return (hop_ptr == 0);
+	}
+}
+
+/*
+ * Return 1 if the received DR SMP should be forwarded to the send queue
+ * Return 0 if the SMP should be completed up the stack
+ */
+int smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+	u8 hop_ptr, hop_cnt;
+
+	hop_ptr = smp->hop_ptr;
+	hop_cnt = smp->hop_cnt;
+
+	if (!ib_get_smp_direction(smp)) {
+		/* C14-9:2 -- intermediate hop */
+		if (hop_ptr && hop_ptr < hop_cnt)
+			return 1;
+
+		/* C14-9:3 -- at the end of the DR segment of path */
+		if (hop_ptr == hop_cnt)
+			return (smp->dr_dlid == IB_LID_PERMISSIVE);
+
+		/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+		if (hop_ptr == hop_cnt + 1)
+			return 1;
+	} else {
+		/* C14-13:2 */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt)
+			return 1;
+
+		/* C14-13:3 -- at the end of the DR segment of path */
+		if (hop_ptr == 1)
+			return (smp->dr_slid != IB_LID_PERMISSIVE);
+	}
+	return 0;
+}
diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h
new file mode 100644
index 000000000000..db25503a0736
--- /dev/null
+++ b/drivers/infiniband/core/smi.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: smi.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef __SMI_H_
+#define __SMI_H_
+
+int smi_handle_dr_smp_recv(struct ib_smp *smp,
+			   u8 node_type,
+			   int port_num,
+			   int phys_port_cnt);
+extern int smi_check_forward_dr_smp(struct ib_smp *smp);
+extern int smi_handle_dr_smp_send(struct ib_smp *smp,
+				  u8 node_type,
+				  int port_num);
+extern int smi_check_local_dr_smp(struct ib_smp *smp,
+				  struct ib_device *device,
+				  int port_num);
+
+/*
+ * Return 1 if the SMP should be handled by the local SMA/SM via process_mad
+ */
+static inline int smi_check_local_smp(struct ib_mad_agent *mad_agent,
+                         	      struct ib_smp *smp)
+{
+	/* C14-9:3 -- We're at the end of the DR segment of path */
+	/* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+	return ((mad_agent->device->process_mad &&
+		!ib_get_smp_direction(smp) &&
+		(smp->hop_ptr == smp->hop_cnt + 1)));
+}
+
+#endif	/* __SMI_H_ */
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
new file mode 100644
index 000000000000..3a413f72ff6d
--- /dev/null
+++ b/drivers/infiniband/core/sysfs.c
@@ -0,0 +1,762 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: sysfs.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include "core_priv.h"
+
+#include <ib_mad.h>
+
+struct ib_port {
+	struct kobject         kobj;
+	struct ib_device      *ibdev;
+	struct attribute_group gid_group;
+	struct attribute     **gid_attr;
+	struct attribute_group pkey_group;
+	struct attribute     **pkey_attr;
+	u8                     port_num;
+};
+
+struct port_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf);
+	ssize_t (*store)(struct ib_port *, struct port_attribute *,
+			 const char *buf, size_t count);
+};
+
+#define PORT_ATTR(_name, _mode, _show, _store) \
+struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define PORT_ATTR_RO(_name) \
+struct port_attribute port_attr_##_name = __ATTR_RO(_name)
+
+struct port_table_attribute {
+	struct port_attribute attr;
+	int                   index;
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct port_attribute *port_attr =
+		container_of(attr, struct port_attribute, attr);
+	struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+	if (!port_attr->show)
+		return 0;
+
+	return port_attr->show(p, port_attr, buf);
+}
+
+static struct sysfs_ops port_sysfs_ops = {
+	.show = port_attr_show
+};
+
+static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
+			  char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	static const char *state_name[] = {
+		[IB_PORT_NOP]		= "NOP",
+		[IB_PORT_DOWN]		= "DOWN",
+		[IB_PORT_INIT]		= "INIT",
+		[IB_PORT_ARMED]		= "ARMED",
+		[IB_PORT_ACTIVE]	= "ACTIVE",
+		[IB_PORT_ACTIVE_DEFER]	= "ACTIVE_DEFER"
+	};
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%d: %s\n", attr.state,
+		       attr.state >= 0 && attr.state <= ARRAY_SIZE(state_name) ?
+		       state_name[attr.state] : "UNKNOWN");
+}
+
+static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
+			char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%x\n", attr.lid);
+}
+
+static ssize_t lid_mask_count_show(struct ib_port *p,
+				   struct port_attribute *unused,
+				   char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%d\n", attr.lmc);
+}
+
+static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
+			   char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%x\n", attr.sm_lid);
+}
+
+static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
+			  char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%d\n", attr.sm_sl);
+}
+
+static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
+			     char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+}
+
+static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
+			 char *buf)
+{
+	struct ib_port_attr attr;
+	char *speed = "";
+	int rate;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	switch (attr.active_speed) {
+	case 2: speed = " DDR"; break;
+	case 4: speed = " QDR"; break;
+	}
+
+	rate = 25 * ib_width_enum_to_int(attr.active_width) * attr.active_speed;
+	if (rate < 0)
+		return -EINVAL;
+
+	return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+		       rate / 10, rate % 10 ? ".5" : "",
+		       ib_width_enum_to_int(attr.active_width), speed);
+}
+
+static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
+			       char *buf)
+{
+	struct ib_port_attr attr;
+
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	switch (attr.phys_state) {
+	case 1:  return sprintf(buf, "1: Sleep\n");
+	case 2:  return sprintf(buf, "2: Polling\n");
+	case 3:  return sprintf(buf, "3: Disabled\n");
+	case 4:  return sprintf(buf, "4: PortConfigurationTraining\n");
+	case 5:  return sprintf(buf, "5: LinkUp\n");
+	case 6:  return sprintf(buf, "6: LinkErrorRecovery\n");
+	case 7:  return sprintf(buf, "7: Phy Test\n");
+	default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+	}
+}
+
+static PORT_ATTR_RO(state);
+static PORT_ATTR_RO(lid);
+static PORT_ATTR_RO(lid_mask_count);
+static PORT_ATTR_RO(sm_lid);
+static PORT_ATTR_RO(sm_sl);
+static PORT_ATTR_RO(cap_mask);
+static PORT_ATTR_RO(rate);
+static PORT_ATTR_RO(phys_state);
+
+static struct attribute *port_default_attrs[] = {
+	&port_attr_state.attr,
+	&port_attr_lid.attr,
+	&port_attr_lid_mask_count.attr,
+	&port_attr_sm_lid.attr,
+	&port_attr_sm_sl.attr,
+	&port_attr_cap_mask.attr,
+	&port_attr_rate.attr,
+	&port_attr_phys_state.attr,
+	NULL
+};
+
+static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
+			     char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	union ib_gid gid;
+	ssize_t ret;
+
+	ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+		       be16_to_cpu(((u16 *) gid.raw)[0]),
+		       be16_to_cpu(((u16 *) gid.raw)[1]),
+		       be16_to_cpu(((u16 *) gid.raw)[2]),
+		       be16_to_cpu(((u16 *) gid.raw)[3]),
+		       be16_to_cpu(((u16 *) gid.raw)[4]),
+		       be16_to_cpu(((u16 *) gid.raw)[5]),
+		       be16_to_cpu(((u16 *) gid.raw)[6]),
+		       be16_to_cpu(((u16 *) gid.raw)[7]));
+}
+
+static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
+			      char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	u16 pkey;
+	ssize_t ret;
+
+	ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%04x\n", pkey);
+}
+
+#define PORT_PMA_ATTR(_name, _counter, _width, _offset)			\
+struct port_table_attribute port_pma_attr_##_name = {			\
+	.attr  = __ATTR(_name, S_IRUGO, show_pma_counter, NULL),	\
+	.index = (_offset) | ((_width) << 16) | ((_counter) << 24)	\
+}
+
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+				char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	int offset = tab_attr->index & 0xffff;
+	int width  = (tab_attr->index >> 16) & 0xff;
+	struct ib_mad *in_mad  = NULL;
+	struct ib_mad *out_mad = NULL;
+	ssize_t ret;
+
+	if (!p->ibdev->process_mad)
+		return sprintf(buf, "N/A (no PMA)\n");
+
+	in_mad  = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	memset(in_mad, 0, sizeof *in_mad);
+	in_mad->mad_hdr.base_version  = 1;
+	in_mad->mad_hdr.mgmt_class    = IB_MGMT_CLASS_PERF_MGMT;
+	in_mad->mad_hdr.class_version = 1;
+	in_mad->mad_hdr.method        = IB_MGMT_METHOD_GET;
+	in_mad->mad_hdr.attr_id       = cpu_to_be16(0x12); /* PortCounters */
+
+	in_mad->data[41] = p->port_num;	/* PortSelect field */
+
+	if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
+		 p->port_num, NULL, NULL, in_mad, out_mad) &
+	     (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
+	    (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	switch (width) {
+	case 4:
+		ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >>
+					    (offset % 4)) & 0xf);
+		break;
+	case 8:
+		ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]);
+		break;
+	case 16:
+		ret = sprintf(buf, "%u\n",
+			      be16_to_cpup((u16 *)(out_mad->data + 40 + offset / 8)));
+		break;
+	case 32:
+		ret = sprintf(buf, "%u\n",
+			      be32_to_cpup((u32 *)(out_mad->data + 40 + offset / 8)));
+		break;
+	default:
+		ret = 0;
+	}
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+
+	return ret;
+}
+
+static PORT_PMA_ATTR(symbol_error		    ,  0, 16,  32);
+static PORT_PMA_ATTR(link_error_recovery	    ,  1,  8,  48);
+static PORT_PMA_ATTR(link_downed		    ,  2,  8,  56);
+static PORT_PMA_ATTR(port_rcv_errors		    ,  3, 16,  64);
+static PORT_PMA_ATTR(port_rcv_remote_physical_errors,  4, 16,  80);
+static PORT_PMA_ATTR(port_rcv_switch_relay_errors   ,  5, 16,  96);
+static PORT_PMA_ATTR(port_xmit_discards		    ,  6, 16, 112);
+static PORT_PMA_ATTR(port_xmit_constraint_errors    ,  7,  8, 128);
+static PORT_PMA_ATTR(port_rcv_constraint_errors	    ,  8,  8, 136);
+static PORT_PMA_ATTR(local_link_integrity_errors    ,  9,  4, 152);
+static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10,  4, 156);
+static PORT_PMA_ATTR(VL15_dropped		    , 11, 16, 176);
+static PORT_PMA_ATTR(port_xmit_data		    , 12, 32, 192);
+static PORT_PMA_ATTR(port_rcv_data		    , 13, 32, 224);
+static PORT_PMA_ATTR(port_xmit_packets		    , 14, 32, 256);
+static PORT_PMA_ATTR(port_rcv_packets		    , 15, 32, 288);
+
+static struct attribute *pma_attrs[] = {
+	&port_pma_attr_symbol_error.attr.attr,
+	&port_pma_attr_link_error_recovery.attr.attr,
+	&port_pma_attr_link_downed.attr.attr,
+	&port_pma_attr_port_rcv_errors.attr.attr,
+	&port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+	&port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+	&port_pma_attr_port_xmit_discards.attr.attr,
+	&port_pma_attr_port_xmit_constraint_errors.attr.attr,
+	&port_pma_attr_port_rcv_constraint_errors.attr.attr,
+	&port_pma_attr_local_link_integrity_errors.attr.attr,
+	&port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+	&port_pma_attr_VL15_dropped.attr.attr,
+	&port_pma_attr_port_xmit_data.attr.attr,
+	&port_pma_attr_port_rcv_data.attr.attr,
+	&port_pma_attr_port_xmit_packets.attr.attr,
+	&port_pma_attr_port_rcv_packets.attr.attr,
+	NULL
+};
+
+static struct attribute_group pma_group = {
+	.name  = "counters",
+	.attrs  = pma_attrs
+};
+
+static void ib_port_release(struct kobject *kobj)
+{
+	struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+	struct attribute *a;
+	int i;
+
+	for (i = 0; (a = p->gid_attr[i]); ++i) {
+		kfree(a->name);
+		kfree(a);
+	}
+
+	for (i = 0; (a = p->pkey_attr[i]); ++i) {
+		kfree(a->name);
+		kfree(a);
+	}
+
+	kfree(p->gid_attr);
+	kfree(p);
+}
+
+static struct kobj_type port_type = {
+	.release       = ib_port_release,
+	.sysfs_ops     = &port_sysfs_ops,
+	.default_attrs = port_default_attrs
+};
+
+static void ib_device_release(struct class_device *cdev)
+{
+	struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
+
+	kfree(dev);
+}
+
+static int ib_device_hotplug(struct class_device *cdev, char **envp,
+			     int num_envp, char *buf, int size)
+{
+	struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
+	int i = 0, len = 0;
+
+	if (add_hotplug_env_var(envp, num_envp, &i, buf, size, &len,
+				"NAME=%s", dev->name))
+		return -ENOMEM;
+
+	/*
+	 * It might be nice to pass the node GUID to hotplug, but
+	 * right now the only way to get it is to query the device
+	 * provider, and this can crash during device removal because
+	 * we are will be running after driver removal has started.
+	 * We could add a node_guid field to struct ib_device, or we
+	 * could just let the hotplug script read the node GUID from
+	 * sysfs when devices are added.
+	 */
+
+	envp[i] = NULL;
+	return 0;
+}
+
+static int alloc_group(struct attribute ***attr,
+		       ssize_t (*show)(struct ib_port *,
+				       struct port_attribute *, char *buf),
+		       int len)
+{
+	struct port_table_attribute ***tab_attr =
+		(struct port_table_attribute ***) attr;
+	int i;
+	int ret;
+
+	*tab_attr = kmalloc((1 + len) * sizeof *tab_attr, GFP_KERNEL);
+	if (!*tab_attr)
+		return -ENOMEM;
+
+	memset(*tab_attr, 0, (1 + len) * sizeof *tab_attr);
+
+	for (i = 0; i < len; ++i) {
+		(*tab_attr)[i] = kmalloc(sizeof *(*tab_attr)[i], GFP_KERNEL);
+		if (!(*tab_attr)[i]) {
+			ret = -ENOMEM;
+			goto err;
+		}
+		memset((*tab_attr)[i], 0, sizeof *(*tab_attr)[i]);
+		(*tab_attr)[i]->attr.attr.name = kmalloc(8, GFP_KERNEL);
+		if (!(*tab_attr)[i]->attr.attr.name) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		if (snprintf((*tab_attr)[i]->attr.attr.name, 8, "%d", i) >= 8) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		(*tab_attr)[i]->attr.attr.mode  = S_IRUGO;
+		(*tab_attr)[i]->attr.attr.owner = THIS_MODULE;
+		(*tab_attr)[i]->attr.show       = show;
+		(*tab_attr)[i]->index           = i;
+	}
+
+	return 0;
+
+err:
+	for (i = 0; i < len; ++i) {
+		if ((*tab_attr)[i])
+			kfree((*tab_attr)[i]->attr.attr.name);
+		kfree((*tab_attr)[i]);
+	}
+
+	kfree(*tab_attr);
+
+	return ret;
+}
+
+static int add_port(struct ib_device *device, int port_num)
+{
+	struct ib_port *p;
+	struct ib_port_attr attr;
+	int i;
+	int ret;
+
+	ret = ib_query_port(device, port_num, &attr);
+	if (ret)
+		return ret;
+
+	p = kmalloc(sizeof *p, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	memset(p, 0, sizeof *p);
+
+	p->ibdev      = device;
+	p->port_num   = port_num;
+	p->kobj.ktype = &port_type;
+
+	p->kobj.parent = kobject_get(&device->ports_parent);
+	if (!p->kobj.parent) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	ret = kobject_set_name(&p->kobj, "%d", port_num);
+	if (ret)
+		goto err_put;
+
+	ret = kobject_register(&p->kobj);
+	if (ret)
+		goto err_put;
+
+	ret = sysfs_create_group(&p->kobj, &pma_group);
+	if (ret)
+		goto err_put;
+
+	ret = alloc_group(&p->gid_attr, show_port_gid, attr.gid_tbl_len);
+	if (ret)
+		goto err_remove_pma;
+
+	p->gid_group.name  = "gids";
+	p->gid_group.attrs = p->gid_attr;
+
+	ret = sysfs_create_group(&p->kobj, &p->gid_group);
+	if (ret)
+		goto err_free_gid;
+
+	ret = alloc_group(&p->pkey_attr, show_port_pkey, attr.pkey_tbl_len);
+	if (ret)
+		goto err_remove_gid;
+
+	p->pkey_group.name  = "pkeys";
+	p->pkey_group.attrs = p->pkey_attr;
+
+	ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+	if (ret)
+		goto err_free_pkey;
+
+	list_add_tail(&p->kobj.entry, &device->port_list);
+
+	return 0;
+
+err_free_pkey:
+	for (i = 0; i < attr.pkey_tbl_len; ++i) {
+		kfree(p->pkey_attr[i]->name);
+		kfree(p->pkey_attr[i]);
+	}
+
+	kfree(p->pkey_attr);
+
+err_remove_gid:
+	sysfs_remove_group(&p->kobj, &p->gid_group);
+
+err_free_gid:
+	for (i = 0; i < attr.gid_tbl_len; ++i) {
+		kfree(p->gid_attr[i]->name);
+		kfree(p->gid_attr[i]);
+	}
+
+	kfree(p->gid_attr);
+
+err_remove_pma:
+	sysfs_remove_group(&p->kobj, &pma_group);
+
+err_put:
+	kobject_put(&device->ports_parent);
+
+err:
+	kfree(p);
+	return ret;
+}
+
+static ssize_t show_node_type(struct class_device *cdev, char *buf)
+{
+	struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
+
+	switch (dev->node_type) {
+	case IB_NODE_CA:     return sprintf(buf, "%d: CA\n", dev->node_type);
+	case IB_NODE_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
+	case IB_NODE_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
+	default:             return sprintf(buf, "%d: <unknown>\n", dev->node_type);
+	}
+}
+
+static ssize_t show_sys_image_guid(struct class_device *cdev, char *buf)
+{
+	struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
+	struct ib_device_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_device(dev, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+		       be16_to_cpu(((u16 *) &attr.sys_image_guid)[0]),
+		       be16_to_cpu(((u16 *) &attr.sys_image_guid)[1]),
+		       be16_to_cpu(((u16 *) &attr.sys_image_guid)[2]),
+		       be16_to_cpu(((u16 *) &attr.sys_image_guid)[3]));
+}
+
+static ssize_t show_node_guid(struct class_device *cdev, char *buf)
+{
+	struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
+	struct ib_device_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_device(dev, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+		       be16_to_cpu(((u16 *) &attr.node_guid)[0]),
+		       be16_to_cpu(((u16 *) &attr.node_guid)[1]),
+		       be16_to_cpu(((u16 *) &attr.node_guid)[2]),
+		       be16_to_cpu(((u16 *) &attr.node_guid)[3]));
+}
+
+static CLASS_DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
+static CLASS_DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
+static CLASS_DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
+
+static struct class_device_attribute *ib_class_attributes[] = {
+	&class_device_attr_node_type,
+	&class_device_attr_sys_image_guid,
+	&class_device_attr_node_guid
+};
+
+static struct class ib_class = {
+	.name    = "infiniband",
+	.release = ib_device_release,
+	.hotplug = ib_device_hotplug,
+};
+
+int ib_device_register_sysfs(struct ib_device *device)
+{
+	struct class_device *class_dev = &device->class_dev;
+	int ret;
+	int i;
+
+	class_dev->class      = &ib_class;
+	class_dev->class_data = device;
+	strlcpy(class_dev->class_id, device->name, BUS_ID_SIZE);
+
+	INIT_LIST_HEAD(&device->port_list);
+
+	ret = class_device_register(class_dev);
+	if (ret)
+		goto err;
+
+	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
+		ret = class_device_create_file(class_dev, ib_class_attributes[i]);
+		if (ret)
+			goto err_unregister;
+	}
+
+	device->ports_parent.parent = kobject_get(&class_dev->kobj);
+	if (!device->ports_parent.parent) {
+		ret = -EBUSY;
+		goto err_unregister;
+	}
+	ret = kobject_set_name(&device->ports_parent, "ports");
+	if (ret)
+		goto err_put;
+	ret = kobject_register(&device->ports_parent);
+	if (ret)
+		goto err_put;
+
+	if (device->node_type == IB_NODE_SWITCH) {
+		ret = add_port(device, 0);
+		if (ret)
+			goto err_put;
+	} else {
+		int i;
+
+		for (i = 1; i <= device->phys_port_cnt; ++i) {
+			ret = add_port(device, i);
+			if (ret)
+				goto err_put;
+		}
+	}
+
+	return 0;
+
+err_put:
+	{
+		struct kobject *p, *t;
+		struct ib_port *port;
+
+		list_for_each_entry_safe(p, t, &device->port_list, entry) {
+			list_del(&p->entry);
+			port = container_of(p, struct ib_port, kobj);
+			sysfs_remove_group(p, &pma_group);
+			sysfs_remove_group(p, &port->pkey_group);
+			sysfs_remove_group(p, &port->gid_group);
+			kobject_unregister(p);
+		}
+	}
+
+	kobject_put(&class_dev->kobj);
+
+err_unregister:
+	class_device_unregister(class_dev);
+
+err:
+	return ret;
+}
+
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+	struct kobject *p, *t;
+	struct ib_port *port;
+
+	list_for_each_entry_safe(p, t, &device->port_list, entry) {
+		list_del(&p->entry);
+		port = container_of(p, struct ib_port, kobj);
+		sysfs_remove_group(p, &pma_group);
+		sysfs_remove_group(p, &port->pkey_group);
+		sysfs_remove_group(p, &port->gid_group);
+		kobject_unregister(p);
+	}
+
+	kobject_unregister(&device->ports_parent);
+	class_device_unregister(&device->class_dev);
+}
+
+int ib_sysfs_setup(void)
+{
+	return class_register(&ib_class);
+}
+
+void ib_sysfs_cleanup(void)
+{
+	class_unregister(&ib_class);
+}
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
new file mode 100644
index 000000000000..dc4eb1db5e96
--- /dev/null
+++ b/drivers/infiniband/core/ud_header.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ud_header.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/errno.h>
+
+#include <ib_pack.h>
+
+#define STRUCT_FIELD(header, field) \
+	.struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field),      \
+	.struct_size_bytes   = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \
+	.field_name          = #header ":" #field
+
+static const struct ib_field lrh_table[]  = {
+	{ STRUCT_FIELD(lrh, virtual_lane),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(lrh, link_version),
+	  .offset_words = 0,
+	  .offset_bits  = 4,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(lrh, service_level),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 4 },
+	{ RESERVED,
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(lrh, link_next_header),
+	  .offset_words = 0,
+	  .offset_bits  = 14,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(lrh, destination_lid),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 5 },
+	{ STRUCT_FIELD(lrh, packet_length),
+	  .offset_words = 1,
+	  .offset_bits  = 5,
+	  .size_bits    = 11 },
+	{ STRUCT_FIELD(lrh, source_lid),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 }
+};
+
+static const struct ib_field grh_table[]  = {
+	{ STRUCT_FIELD(grh, ip_version),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(grh, traffic_class),
+	  .offset_words = 0,
+	  .offset_bits  = 4,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, flow_label),
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 20 },
+	{ STRUCT_FIELD(grh, payload_length),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(grh, next_header),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, hop_limit),
+	  .offset_words = 1,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, source_gid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ STRUCT_FIELD(grh, destination_gid),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 }
+};
+
+static const struct ib_field bth_table[]  = {
+	{ STRUCT_FIELD(bth, opcode),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(bth, solicited_event),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ STRUCT_FIELD(bth, mig_req),
+	  .offset_words = 0,
+	  .offset_bits  = 9,
+	  .size_bits    = 1 },
+	{ STRUCT_FIELD(bth, pad_count),
+	  .offset_words = 0,
+	  .offset_bits  = 10,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(bth, transport_header_version),
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(bth, pkey),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(bth, destination_qpn),
+	  .offset_words = 1,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 },
+	{ STRUCT_FIELD(bth, ack_req),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 2,
+	  .offset_bits  = 1,
+	  .size_bits    = 7 },
+	{ STRUCT_FIELD(bth, psn),
+	  .offset_words = 2,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 }
+};
+
+static const struct ib_field deth_table[] = {
+	{ STRUCT_FIELD(deth, qkey),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(deth, source_qpn),
+	  .offset_words = 1,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 }
+};
+
+/**
+ * ib_ud_header_init - Initialize UD header structure
+ * @payload_bytes:Length of packet payload
+ * @grh_present:GRH flag (if non-zero, GRH will be included)
+ * @header:Structure to initialize
+ *
+ * ib_ud_header_init() initializes the lrh.link_version, lrh.link_next_header,
+ * lrh.packet_length, grh.ip_version, grh.payload_length,
+ * grh.next_header, bth.opcode, bth.pad_count and
+ * bth.transport_header_version fields of a &struct ib_ud_header given
+ * the payload length and whether a GRH will be included.
+ */
+void ib_ud_header_init(int     		    payload_bytes,
+		       int    		    grh_present,
+		       struct ib_ud_header *header)
+{
+	int header_len;
+
+	memset(header, 0, sizeof *header);
+
+	header_len =
+		IB_LRH_BYTES  +
+		IB_BTH_BYTES  +
+		IB_DETH_BYTES;
+	if (grh_present) {
+		header_len += IB_GRH_BYTES;
+	}
+
+	header->lrh.link_version     = 0;
+	header->lrh.link_next_header =
+		grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
+	header->lrh.packet_length    = (IB_LRH_BYTES     +
+					IB_BTH_BYTES     +
+					IB_DETH_BYTES    +
+					payload_bytes    +
+					4                + /* ICRC     */
+					3) / 4;            /* round up */
+
+	header->grh_present          = grh_present;
+	if (grh_present) {
+		header->lrh.packet_length  += IB_GRH_BYTES / 4;
+
+		header->grh.ip_version      = 6;
+		header->grh.payload_length  =
+			cpu_to_be16((IB_BTH_BYTES     +
+				     IB_DETH_BYTES    +
+				     payload_bytes    +
+				     4                + /* ICRC     */
+				     3) & ~3);          /* round up */
+		header->grh.next_header     = 0x1b;
+	}
+
+	cpu_to_be16s(&header->lrh.packet_length);
+
+	if (header->immediate_present)
+		header->bth.opcode           = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+	else
+		header->bth.opcode           = IB_OPCODE_UD_SEND_ONLY;
+	header->bth.pad_count                = (4 - payload_bytes) & 3;
+	header->bth.transport_header_version = 0;
+}
+EXPORT_SYMBOL(ib_ud_header_init);
+
+/**
+ * ib_ud_header_pack - Pack UD header struct into wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() packs the UD header structure @header into wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_pack(struct ib_ud_header *header,
+		      void                *buf)
+{
+	int len = 0;
+
+	ib_pack(lrh_table, ARRAY_SIZE(lrh_table),
+		&header->lrh, buf);
+	len += IB_LRH_BYTES;
+
+	if (header->grh_present) {
+		ib_pack(grh_table, ARRAY_SIZE(grh_table),
+			&header->grh, buf + len);
+		len += IB_GRH_BYTES;
+	}
+
+	ib_pack(bth_table, ARRAY_SIZE(bth_table),
+		&header->bth, buf + len);
+	len += IB_BTH_BYTES;
+
+	ib_pack(deth_table, ARRAY_SIZE(deth_table),
+		&header->deth, buf + len);
+	len += IB_DETH_BYTES;
+
+	if (header->immediate_present) {
+		memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data);
+		len += sizeof header->immediate_data;
+	}
+
+	return len;
+}
+EXPORT_SYMBOL(ib_ud_header_pack);
+
+/**
+ * ib_ud_header_unpack - Unpack UD header struct from wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() unpacks the UD header structure @header from wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_unpack(void                *buf,
+			struct ib_ud_header *header)
+{
+	ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
+		  buf, &header->lrh);
+	buf += IB_LRH_BYTES;
+
+	if (header->lrh.link_version != 0) {
+		printk(KERN_WARNING "Invalid LRH.link_version %d\n",
+		       header->lrh.link_version);
+		return -EINVAL;
+	}
+
+	switch (header->lrh.link_next_header) {
+	case IB_LNH_IBA_LOCAL:
+		header->grh_present = 0;
+		break;
+
+	case IB_LNH_IBA_GLOBAL:
+		header->grh_present = 1;
+		ib_unpack(grh_table, ARRAY_SIZE(grh_table),
+			  buf, &header->grh);
+		buf += IB_GRH_BYTES;
+
+		if (header->grh.ip_version != 6) {
+			printk(KERN_WARNING "Invalid GRH.ip_version %d\n",
+			       header->grh.ip_version);
+			return -EINVAL;
+		}
+		if (header->grh.next_header != 0x1b) {
+			printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n",
+			       header->grh.next_header);
+			return -EINVAL;
+		}
+		break;
+
+	default:
+		printk(KERN_WARNING "Invalid LRH.link_next_header %d\n",
+		       header->lrh.link_next_header);
+		return -EINVAL;
+	}
+
+	ib_unpack(bth_table, ARRAY_SIZE(bth_table),
+		  buf, &header->bth);
+	buf += IB_BTH_BYTES;
+
+	switch (header->bth.opcode) {
+	case IB_OPCODE_UD_SEND_ONLY:
+		header->immediate_present = 0;
+		break;
+	case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
+		header->immediate_present = 1;
+		break;
+	default:
+		printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n",
+		       header->bth.opcode);
+		return -EINVAL;
+	}
+
+	if (header->bth.transport_header_version != 0) {
+		printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n",
+		       header->bth.transport_header_version);
+		return -EINVAL;
+	}
+
+	ib_unpack(deth_table, ARRAY_SIZE(deth_table),
+		  buf, &header->deth);
+	buf += IB_DETH_BYTES;
+
+	if (header->immediate_present)
+		memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
new file mode 100644
index 000000000000..54b4f33b0bf9
--- /dev/null
+++ b/drivers/infiniband/core/user_mad.c
@@ -0,0 +1,840 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: user_mad.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/poll.h>
+#include <linux/rwsem.h>
+#include <linux/kref.h>
+
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <ib_mad.h>
+#include <ib_user_mad.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+	IB_UMAD_MAX_PORTS  = 64,
+	IB_UMAD_MAX_AGENTS = 32,
+
+	IB_UMAD_MAJOR      = 231,
+	IB_UMAD_MINOR_BASE = 0
+};
+
+struct ib_umad_port {
+	int                    devnum;
+	struct cdev            dev;
+	struct class_device    class_dev;
+
+	int                    sm_devnum;
+	struct cdev            sm_dev;
+	struct class_device    sm_class_dev;
+	struct semaphore       sm_sem;
+
+	struct ib_device      *ib_dev;
+	struct ib_umad_device *umad_dev;
+	u8                     port_num;
+};
+
+struct ib_umad_device {
+	int                  start_port, end_port;
+	struct kref          ref;
+	struct ib_umad_port  port[0];
+};
+
+struct ib_umad_file {
+	struct ib_umad_port *port;
+	spinlock_t           recv_lock;
+	struct list_head     recv_list;
+	wait_queue_head_t    recv_wait;
+	struct rw_semaphore  agent_mutex;
+	struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS];
+	struct ib_mr        *mr[IB_UMAD_MAX_AGENTS];
+};
+
+struct ib_umad_packet {
+	struct ib_user_mad mad;
+	struct ib_ah      *ah;
+	struct list_head   list;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+static spinlock_t map_lock;
+static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS * 2);
+
+static void ib_umad_add_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device);
+
+static int queue_packet(struct ib_umad_file *file,
+			struct ib_mad_agent *agent,
+			struct ib_umad_packet *packet)
+{
+	int ret = 1;
+
+	down_read(&file->agent_mutex);
+	for (packet->mad.id = 0;
+	     packet->mad.id < IB_UMAD_MAX_AGENTS;
+	     packet->mad.id++)
+		if (agent == file->agent[packet->mad.id]) {
+			spin_lock_irq(&file->recv_lock);
+			list_add_tail(&packet->list, &file->recv_list);
+			spin_unlock_irq(&file->recv_lock);
+			wake_up_interruptible(&file->recv_wait);
+			ret = 0;
+			break;
+		}
+
+	up_read(&file->agent_mutex);
+
+	return ret;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *send_wc)
+{
+	struct ib_umad_file *file = agent->context;
+	struct ib_umad_packet *packet =
+		(void *) (unsigned long) send_wc->wr_id;
+
+	dma_unmap_single(agent->device->dma_device,
+			 pci_unmap_addr(packet, mapping),
+			 sizeof packet->mad.data,
+			 DMA_TO_DEVICE);
+	ib_destroy_ah(packet->ah);
+
+	if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
+		packet->mad.status = ETIMEDOUT;
+
+		if (!queue_packet(file, agent, packet))
+			return;
+	}
+
+	kfree(packet);
+}
+
+static void recv_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_umad_file *file = agent->context;
+	struct ib_umad_packet *packet;
+
+	if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+		goto out;
+
+	packet = kmalloc(sizeof *packet, GFP_KERNEL);
+	if (!packet)
+		goto out;
+
+	memset(packet, 0, sizeof *packet);
+
+	memcpy(packet->mad.data, mad_recv_wc->recv_buf.mad, sizeof packet->mad.data);
+	packet->mad.status        = 0;
+	packet->mad.qpn 	  = cpu_to_be32(mad_recv_wc->wc->src_qp);
+	packet->mad.lid 	  = cpu_to_be16(mad_recv_wc->wc->slid);
+	packet->mad.sl  	  = mad_recv_wc->wc->sl;
+	packet->mad.path_bits 	  = mad_recv_wc->wc->dlid_path_bits;
+	packet->mad.grh_present   = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
+	if (packet->mad.grh_present) {
+		/* XXX parse GRH */
+		packet->mad.gid_index 	  = 0;
+		packet->mad.hop_limit 	  = 0;
+		packet->mad.traffic_class = 0;
+		memset(packet->mad.gid, 0, 16);
+		packet->mad.flow_label 	  = 0;
+	}
+
+	if (queue_packet(file, agent, packet))
+		kfree(packet);
+
+out:
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+static ssize_t ib_umad_read(struct file *filp, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_packet *packet;
+	ssize_t ret;
+
+	if (count < sizeof (struct ib_user_mad))
+		return -EINVAL;
+
+	spin_lock_irq(&file->recv_lock);
+
+	while (list_empty(&file->recv_list)) {
+		spin_unlock_irq(&file->recv_lock);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->recv_wait,
+					     !list_empty(&file->recv_list)))
+			return -ERESTARTSYS;
+
+		spin_lock_irq(&file->recv_lock);
+	}
+
+	packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
+	list_del(&packet->list);
+
+	spin_unlock_irq(&file->recv_lock);
+
+	if (copy_to_user(buf, &packet->mad, sizeof packet->mad))
+		ret = -EFAULT;
+	else
+		ret = sizeof packet->mad;
+
+	kfree(packet);
+	return ret;
+}
+
+static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_packet *packet;
+	struct ib_mad_agent *agent;
+	struct ib_ah_attr ah_attr;
+	struct ib_sge      gather_list;
+	struct ib_send_wr *bad_wr, wr = {
+		.opcode      = IB_WR_SEND,
+		.sg_list     = &gather_list,
+		.num_sge     = 1,
+		.send_flags  = IB_SEND_SIGNALED,
+	};
+	u8 method;
+	u64 *tid;
+	int ret;
+
+	if (count < sizeof (struct ib_user_mad))
+		return -EINVAL;
+
+	packet = kmalloc(sizeof *packet, GFP_KERNEL);
+	if (!packet)
+		return -ENOMEM;
+
+	if (copy_from_user(&packet->mad, buf, sizeof packet->mad)) {
+		kfree(packet);
+		return -EFAULT;
+	}
+
+	if (packet->mad.id < 0 || packet->mad.id >= IB_UMAD_MAX_AGENTS) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	down_read(&file->agent_mutex);
+
+	agent = file->agent[packet->mad.id];
+	if (!agent) {
+		ret = -EINVAL;
+		goto err_up;
+	}
+
+	/*
+	 * If userspace is generating a request that will generate a
+	 * response, we need to make sure the high-order part of the
+	 * transaction ID matches the agent being used to send the
+	 * MAD.
+	 */
+	method = ((struct ib_mad_hdr *) packet->mad.data)->method;
+
+	if (!(method & IB_MGMT_METHOD_RESP)       &&
+	    method != IB_MGMT_METHOD_TRAP_REPRESS &&
+	    method != IB_MGMT_METHOD_SEND) {
+		tid = &((struct ib_mad_hdr *) packet->mad.data)->tid;
+		*tid = cpu_to_be64(((u64) agent->hi_tid) << 32 |
+				   (be64_to_cpup(tid) & 0xffffffff));
+	}
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid          = be16_to_cpu(packet->mad.lid);
+	ah_attr.sl            = packet->mad.sl;
+	ah_attr.src_path_bits = packet->mad.path_bits;
+	ah_attr.port_num      = file->port->port_num;
+	if (packet->mad.grh_present) {
+		ah_attr.ah_flags = IB_AH_GRH;
+		memcpy(ah_attr.grh.dgid.raw, packet->mad.gid, 16);
+		ah_attr.grh.flow_label 	   = packet->mad.flow_label;
+		ah_attr.grh.hop_limit  	   = packet->mad.hop_limit;
+		ah_attr.grh.traffic_class  = packet->mad.traffic_class;
+	}
+
+	packet->ah = ib_create_ah(agent->qp->pd, &ah_attr);
+	if (IS_ERR(packet->ah)) {
+		ret = PTR_ERR(packet->ah);
+		goto err_up;
+	}
+
+	gather_list.addr = dma_map_single(agent->device->dma_device,
+					  packet->mad.data,
+					  sizeof packet->mad.data,
+					  DMA_TO_DEVICE);
+	gather_list.length = sizeof packet->mad.data;
+	gather_list.lkey   = file->mr[packet->mad.id]->lkey;
+	pci_unmap_addr_set(packet, mapping, gather_list.addr);
+
+	wr.wr.ud.mad_hdr     = (struct ib_mad_hdr *) packet->mad.data;
+	wr.wr.ud.ah          = packet->ah;
+	wr.wr.ud.remote_qpn  = be32_to_cpu(packet->mad.qpn);
+	wr.wr.ud.remote_qkey = be32_to_cpu(packet->mad.qkey);
+	wr.wr.ud.timeout_ms  = packet->mad.timeout_ms;
+
+	wr.wr_id            = (unsigned long) packet;
+
+	ret = ib_post_send_mad(agent, &wr, &bad_wr);
+	if (ret) {
+		dma_unmap_single(agent->device->dma_device,
+				 pci_unmap_addr(packet, mapping),
+				 sizeof packet->mad.data,
+				 DMA_TO_DEVICE);
+		goto err_up;
+	}
+
+	up_read(&file->agent_mutex);
+
+	return sizeof packet->mad;
+
+err_up:
+	up_read(&file->agent_mutex);
+
+err:
+	kfree(packet);
+	return ret;
+}
+
+static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
+{
+	struct ib_umad_file *file = filp->private_data;
+
+	/* we will always be able to post a MAD send */
+	unsigned int mask = POLLOUT | POLLWRNORM;
+
+	poll_wait(filp, &file->recv_wait, wait);
+
+	if (!list_empty(&file->recv_list))
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static int ib_umad_reg_agent(struct ib_umad_file *file, unsigned long arg)
+{
+	struct ib_user_mad_reg_req ureq;
+	struct ib_mad_reg_req req;
+	struct ib_mad_agent *agent;
+	int agent_id;
+	int ret;
+
+	down_write(&file->agent_mutex);
+
+	if (copy_from_user(&ureq, (void __user *) arg, sizeof ureq)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (ureq.qpn != 0 && ureq.qpn != 1) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+		if (!file->agent[agent_id])
+			goto found;
+
+	ret = -ENOMEM;
+	goto out;
+
+found:
+	req.mgmt_class         = ureq.mgmt_class;
+	req.mgmt_class_version = ureq.mgmt_class_version;
+	memcpy(req.method_mask, ureq.method_mask, sizeof req.method_mask);
+	memcpy(req.oui,         ureq.oui,         sizeof req.oui);
+
+	agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+				      ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+				      &req, 0, send_handler, recv_handler,
+				      file);
+	if (IS_ERR(agent)) {
+		ret = PTR_ERR(agent);
+		goto out;
+	}
+
+	file->agent[agent_id] = agent;
+
+	file->mr[agent_id] = ib_get_dma_mr(agent->qp->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(file->mr[agent_id])) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	if (put_user(agent_id,
+		     (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+		ret = -EFAULT;
+		goto err_mr;
+	}
+
+	ret = 0;
+	goto out;
+
+err_mr:
+	ib_dereg_mr(file->mr[agent_id]);
+
+err:
+	file->agent[agent_id] = NULL;
+	ib_unregister_mad_agent(agent);
+
+out:
+	up_write(&file->agent_mutex);
+	return ret;
+}
+
+static int ib_umad_unreg_agent(struct ib_umad_file *file, unsigned long arg)
+{
+	u32 id;
+	int ret = 0;
+
+	down_write(&file->agent_mutex);
+
+	if (get_user(id, (u32 __user *) arg)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !file->agent[id]) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ib_dereg_mr(file->mr[id]);
+	ib_unregister_mad_agent(file->agent[id]);
+	file->agent[id] = NULL;
+
+out:
+	up_write(&file->agent_mutex);
+	return ret;
+}
+
+static long ib_umad_ioctl(struct file *filp,
+			 unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case IB_USER_MAD_REGISTER_AGENT:
+		return ib_umad_reg_agent(filp->private_data, arg);
+	case IB_USER_MAD_UNREGISTER_AGENT:
+		return ib_umad_unreg_agent(filp->private_data, arg);
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+static int ib_umad_open(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port =
+		container_of(inode->i_cdev, struct ib_umad_port, dev);
+	struct ib_umad_file *file;
+
+	file = kmalloc(sizeof *file, GFP_KERNEL);
+	if (!file)
+		return -ENOMEM;
+
+	memset(file, 0, sizeof *file);
+
+	spin_lock_init(&file->recv_lock);
+	init_rwsem(&file->agent_mutex);
+	INIT_LIST_HEAD(&file->recv_list);
+	init_waitqueue_head(&file->recv_wait);
+
+	file->port = port;
+	filp->private_data = file;
+
+	return 0;
+}
+
+static int ib_umad_close(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_file *file = filp->private_data;
+	int i;
+
+	for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+		if (file->agent[i]) {
+			ib_dereg_mr(file->mr[i]);
+			ib_unregister_mad_agent(file->agent[i]);
+		}
+
+	kfree(file);
+
+	return 0;
+}
+
+static struct file_operations umad_fops = {
+	.owner 	        = THIS_MODULE,
+	.read 	        = ib_umad_read,
+	.write 	        = ib_umad_write,
+	.poll 	        = ib_umad_poll,
+	.unlocked_ioctl = ib_umad_ioctl,
+	.compat_ioctl   = ib_umad_ioctl,
+	.open 	        = ib_umad_open,
+	.release        = ib_umad_close
+};
+
+static int ib_umad_sm_open(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port =
+		container_of(inode->i_cdev, struct ib_umad_port, sm_dev);
+	struct ib_port_modify props = {
+		.set_port_cap_mask = IB_PORT_SM
+	};
+	int ret;
+
+	if (filp->f_flags & O_NONBLOCK) {
+		if (down_trylock(&port->sm_sem))
+			return -EAGAIN;
+	} else {
+		if (down_interruptible(&port->sm_sem))
+			return -ERESTARTSYS;
+	}
+
+	ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+	if (ret) {
+		up(&port->sm_sem);
+		return ret;
+	}
+
+	filp->private_data = port;
+
+	return 0;
+}
+
+static int ib_umad_sm_close(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port = filp->private_data;
+	struct ib_port_modify props = {
+		.clr_port_cap_mask = IB_PORT_SM
+	};
+	int ret;
+
+	ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+	up(&port->sm_sem);
+
+	return ret;
+}
+
+static struct file_operations umad_sm_fops = {
+	.owner 	 = THIS_MODULE,
+	.open 	 = ib_umad_sm_open,
+	.release = ib_umad_sm_close
+};
+
+static struct ib_client umad_client = {
+	.name   = "umad",
+	.add    = ib_umad_add_one,
+	.remove = ib_umad_remove_one
+};
+
+static ssize_t show_dev(struct class_device *class_dev, char *buf)
+{
+	struct ib_umad_port *port = class_get_devdata(class_dev);
+
+	if (class_dev == &port->class_dev)
+		return print_dev_t(buf, port->dev.dev);
+	else
+		return print_dev_t(buf, port->sm_dev.dev);
+}
+static CLASS_DEVICE_ATTR(dev, S_IRUGO, show_dev, NULL);
+
+static ssize_t show_ibdev(struct class_device *class_dev, char *buf)
+{
+	struct ib_umad_port *port = class_get_devdata(class_dev);
+
+	return sprintf(buf, "%s\n", port->ib_dev->name);
+}
+static CLASS_DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_port(struct class_device *class_dev, char *buf)
+{
+	struct ib_umad_port *port = class_get_devdata(class_dev);
+
+	return sprintf(buf, "%d\n", port->port_num);
+}
+static CLASS_DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+static void ib_umad_release_dev(struct kref *ref)
+{
+	struct ib_umad_device *dev =
+		container_of(ref, struct ib_umad_device, ref);
+
+	kfree(dev);
+}
+
+static void ib_umad_release_port(struct class_device *class_dev)
+{
+	struct ib_umad_port *port = class_get_devdata(class_dev);
+
+	if (class_dev == &port->class_dev) {
+		cdev_del(&port->dev);
+		clear_bit(port->devnum, dev_map);
+	} else {
+		cdev_del(&port->sm_dev);
+		clear_bit(port->sm_devnum, dev_map);
+	}
+
+	kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+}
+
+static struct class umad_class = {
+	.name    = "infiniband_mad",
+	.release = ib_umad_release_port
+};
+
+static ssize_t show_abi_version(struct class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION);
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static int ib_umad_init_port(struct ib_device *device, int port_num,
+			     struct ib_umad_port *port)
+{
+	spin_lock(&map_lock);
+	port->devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+	if (port->devnum >= IB_UMAD_MAX_PORTS) {
+		spin_unlock(&map_lock);
+		return -1;
+	}
+	port->sm_devnum = find_next_zero_bit(dev_map, IB_UMAD_MAX_PORTS * 2, IB_UMAD_MAX_PORTS);
+	if (port->sm_devnum >= IB_UMAD_MAX_PORTS * 2) {
+		spin_unlock(&map_lock);
+		return -1;
+	}
+	set_bit(port->devnum, dev_map);
+	set_bit(port->sm_devnum, dev_map);
+	spin_unlock(&map_lock);
+
+	port->ib_dev   = device;
+	port->port_num = port_num;
+	init_MUTEX(&port->sm_sem);
+
+	cdev_init(&port->dev, &umad_fops);
+	port->dev.owner = THIS_MODULE;
+	kobject_set_name(&port->dev.kobj, "umad%d", port->devnum);
+	if (cdev_add(&port->dev, base_dev + port->devnum, 1))
+		return -1;
+
+	port->class_dev.class = &umad_class;
+	port->class_dev.dev   = device->dma_device;
+
+	snprintf(port->class_dev.class_id, BUS_ID_SIZE, "umad%d", port->devnum);
+
+	if (class_device_register(&port->class_dev))
+		goto err_cdev;
+
+	class_set_devdata(&port->class_dev, port);
+	kref_get(&port->umad_dev->ref);
+
+	if (class_device_create_file(&port->class_dev, &class_device_attr_dev))
+		goto err_class;
+	if (class_device_create_file(&port->class_dev, &class_device_attr_ibdev))
+		goto err_class;
+	if (class_device_create_file(&port->class_dev, &class_device_attr_port))
+		goto err_class;
+
+	cdev_init(&port->sm_dev, &umad_sm_fops);
+	port->sm_dev.owner = THIS_MODULE;
+	kobject_set_name(&port->dev.kobj, "issm%d", port->sm_devnum - IB_UMAD_MAX_PORTS);
+	if (cdev_add(&port->sm_dev, base_dev + port->sm_devnum, 1))
+		return -1;
+
+	port->sm_class_dev.class = &umad_class;
+	port->sm_class_dev.dev   = device->dma_device;
+
+	snprintf(port->sm_class_dev.class_id, BUS_ID_SIZE, "issm%d", port->sm_devnum - IB_UMAD_MAX_PORTS);
+
+	if (class_device_register(&port->sm_class_dev))
+		goto err_sm_cdev;
+
+	class_set_devdata(&port->sm_class_dev, port);
+	kref_get(&port->umad_dev->ref);
+
+	if (class_device_create_file(&port->sm_class_dev, &class_device_attr_dev))
+		goto err_sm_class;
+	if (class_device_create_file(&port->sm_class_dev, &class_device_attr_ibdev))
+		goto err_sm_class;
+	if (class_device_create_file(&port->sm_class_dev, &class_device_attr_port))
+		goto err_sm_class;
+
+	return 0;
+
+err_sm_class:
+	class_device_unregister(&port->sm_class_dev);
+
+err_sm_cdev:
+	cdev_del(&port->sm_dev);
+
+err_class:
+	class_device_unregister(&port->class_dev);
+
+err_cdev:
+	cdev_del(&port->dev);
+	clear_bit(port->devnum, dev_map);
+
+	return -1;
+}
+
+static void ib_umad_add_one(struct ib_device *device)
+{
+	struct ib_umad_device *umad_dev;
+	int s, e, i;
+
+	if (device->node_type == IB_NODE_SWITCH)
+		s = e = 0;
+	else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	umad_dev = kmalloc(sizeof *umad_dev +
+			   (e - s + 1) * sizeof (struct ib_umad_port),
+			   GFP_KERNEL);
+	if (!umad_dev)
+		return;
+
+	memset(umad_dev, 0, sizeof *umad_dev +
+	       (e - s + 1) * sizeof (struct ib_umad_port));
+
+	kref_init(&umad_dev->ref);
+
+	umad_dev->start_port = s;
+	umad_dev->end_port   = e;
+
+	for (i = s; i <= e; ++i) {
+		umad_dev->port[i - s].umad_dev = umad_dev;
+
+		if (ib_umad_init_port(device, i, &umad_dev->port[i - s]))
+			goto err;
+	}
+
+	ib_set_client_data(device, &umad_client, umad_dev);
+
+	return;
+
+err:
+	while (--i >= s) {
+		class_device_unregister(&umad_dev->port[i - s].class_dev);
+		class_device_unregister(&umad_dev->port[i - s].sm_class_dev);
+	}
+
+	kref_put(&umad_dev->ref, ib_umad_release_dev);
+}
+
+static void ib_umad_remove_one(struct ib_device *device)
+{
+	struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+	int i;
+
+	if (!umad_dev)
+		return;
+
+	for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) {
+		class_device_unregister(&umad_dev->port[i].class_dev);
+		class_device_unregister(&umad_dev->port[i].sm_class_dev);
+	}
+
+	kref_put(&umad_dev->ref, ib_umad_release_dev);
+}
+
+static int __init ib_umad_init(void)
+{
+	int ret;
+
+	spin_lock_init(&map_lock);
+
+	ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
+				     "infiniband_mad");
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't register device number\n");
+		goto out;
+	}
+
+	ret = class_register(&umad_class);
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n");
+		goto out_chrdev;
+	}
+
+	ret = class_create_file(&umad_class, &class_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n");
+		goto out_class;
+	}
+
+	ret = ib_register_client(&umad_client);
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't register ib_umad client\n");
+		goto out_class;
+	}
+
+	return 0;
+
+out_class:
+	class_unregister(&umad_class);
+
+out_chrdev:
+	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+
+out:
+	return ret;
+}
+
+static void __exit ib_umad_cleanup(void)
+{
+	ib_unregister_client(&umad_client);
+	class_unregister(&umad_class);
+	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+}
+
+module_init(ib_umad_init);
+module_exit(ib_umad_cleanup);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
new file mode 100644
index 000000000000..7c08ed0cd7dd
--- /dev/null
+++ b/drivers/infiniband/core/verbs.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: verbs.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+
+#include <ib_verbs.h>
+
+/* Protection domains */
+
+struct ib_pd *ib_alloc_pd(struct ib_device *device)
+{
+	struct ib_pd *pd;
+
+	pd = device->alloc_pd(device);
+
+	if (!IS_ERR(pd)) {
+		pd->device = device;
+		atomic_set(&pd->usecnt, 0);
+	}
+
+	return pd;
+}
+EXPORT_SYMBOL(ib_alloc_pd);
+
+int ib_dealloc_pd(struct ib_pd *pd)
+{
+	if (atomic_read(&pd->usecnt))
+		return -EBUSY;
+
+	return pd->device->dealloc_pd(pd);
+}
+EXPORT_SYMBOL(ib_dealloc_pd);
+
+/* Address handles */
+
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	struct ib_ah *ah;
+
+	ah = pd->device->create_ah(pd, ah_attr);
+
+	if (!IS_ERR(ah)) {
+		ah->device = pd->device;
+		ah->pd     = pd;
+		atomic_inc(&pd->usecnt);
+	}
+
+	return ah;
+}
+EXPORT_SYMBOL(ib_create_ah);
+
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	return ah->device->modify_ah ?
+		ah->device->modify_ah(ah, ah_attr) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_ah);
+
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	return ah->device->query_ah ?
+		ah->device->query_ah(ah, ah_attr) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_ah);
+
+int ib_destroy_ah(struct ib_ah *ah)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	pd = ah->pd;
+	ret = ah->device->destroy_ah(ah);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_destroy_ah);
+
+/* Queue pairs */
+
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+			   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct ib_qp *qp;
+
+	qp = pd->device->create_qp(pd, qp_init_attr);
+
+	if (!IS_ERR(qp)) {
+		qp->device     	  = pd->device;
+		qp->pd         	  = pd;
+		qp->send_cq    	  = qp_init_attr->send_cq;
+		qp->recv_cq    	  = qp_init_attr->recv_cq;
+		qp->srq	       	  = qp_init_attr->srq;
+		qp->event_handler = qp_init_attr->event_handler;
+		qp->qp_context    = qp_init_attr->qp_context;
+		qp->qp_type	  = qp_init_attr->qp_type;
+		atomic_inc(&pd->usecnt);
+		atomic_inc(&qp_init_attr->send_cq->usecnt);
+		atomic_inc(&qp_init_attr->recv_cq->usecnt);
+		if (qp_init_attr->srq)
+			atomic_inc(&qp_init_attr->srq->usecnt);
+	}
+
+	return qp;
+}
+EXPORT_SYMBOL(ib_create_qp);
+
+int ib_modify_qp(struct ib_qp *qp,
+		 struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask)
+{
+	return qp->device->modify_qp(qp, qp_attr, qp_attr_mask);
+}
+EXPORT_SYMBOL(ib_modify_qp);
+
+int ib_query_qp(struct ib_qp *qp,
+		struct ib_qp_attr *qp_attr,
+		int qp_attr_mask,
+		struct ib_qp_init_attr *qp_init_attr)
+{
+	return qp->device->query_qp ?
+		qp->device->query_qp(qp, qp_attr, qp_attr_mask, qp_init_attr) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+int ib_destroy_qp(struct ib_qp *qp)
+{
+	struct ib_pd *pd;
+	struct ib_cq *scq, *rcq;
+	struct ib_srq *srq;
+	int ret;
+
+	pd  = qp->pd;
+	scq = qp->send_cq;
+	rcq = qp->recv_cq;
+	srq = qp->srq;
+
+	ret = qp->device->destroy_qp(qp);
+	if (!ret) {
+		atomic_dec(&pd->usecnt);
+		atomic_dec(&scq->usecnt);
+		atomic_dec(&rcq->usecnt);
+		if (srq)
+			atomic_dec(&srq->usecnt);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_destroy_qp);
+
+/* Completion queues */
+
+struct ib_cq *ib_create_cq(struct ib_device *device,
+			   ib_comp_handler comp_handler,
+			   void (*event_handler)(struct ib_event *, void *),
+			   void *cq_context, int cqe)
+{
+	struct ib_cq *cq;
+
+	cq = device->create_cq(device, cqe);
+
+	if (!IS_ERR(cq)) {
+		cq->device        = device;
+		cq->comp_handler  = comp_handler;
+		cq->event_handler = event_handler;
+		cq->cq_context    = cq_context;
+		atomic_set(&cq->usecnt, 0);
+	}
+
+	return cq;
+}
+EXPORT_SYMBOL(ib_create_cq);
+
+int ib_destroy_cq(struct ib_cq *cq)
+{
+	if (atomic_read(&cq->usecnt))
+		return -EBUSY;
+
+	return cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_destroy_cq);
+
+int ib_resize_cq(struct ib_cq *cq,
+                 int           cqe)
+{
+	int ret;
+
+	if (!cq->device->resize_cq)
+		return -ENOSYS;
+
+	ret = cq->device->resize_cq(cq, &cqe);
+	if (!ret)
+		cq->cqe = cqe;
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+	struct ib_mr *mr;
+
+	mr = pd->device->get_dma_mr(pd, mr_access_flags);
+
+	if (!IS_ERR(mr)) {
+		mr->device = pd->device;
+		mr->pd     = pd;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_get_dma_mr);
+
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+			     struct ib_phys_buf *phys_buf_array,
+			     int num_phys_buf,
+			     int mr_access_flags,
+			     u64 *iova_start)
+{
+	struct ib_mr *mr;
+
+	mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
+				     mr_access_flags, iova_start);
+
+	if (!IS_ERR(mr)) {
+		mr->device = pd->device;
+		mr->pd     = pd;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_reg_phys_mr);
+
+int ib_rereg_phys_mr(struct ib_mr *mr,
+		     int mr_rereg_mask,
+		     struct ib_pd *pd,
+		     struct ib_phys_buf *phys_buf_array,
+		     int num_phys_buf,
+		     int mr_access_flags,
+		     u64 *iova_start)
+{
+	struct ib_pd *old_pd;
+	int ret;
+
+	if (!mr->device->rereg_phys_mr)
+		return -ENOSYS;
+
+	if (atomic_read(&mr->usecnt))
+		return -EBUSY;
+
+	old_pd = mr->pd;
+
+	ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
+					phys_buf_array, num_phys_buf,
+					mr_access_flags, iova_start);
+
+	if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
+		atomic_dec(&old_pd->usecnt);
+		atomic_inc(&pd->usecnt);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_rereg_phys_mr);
+
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+	return mr->device->query_mr ?
+		mr->device->query_mr(mr, mr_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_mr);
+
+int ib_dereg_mr(struct ib_mr *mr)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	if (atomic_read(&mr->usecnt))
+		return -EBUSY;
+
+	pd = mr->pd;
+	ret = mr->device->dereg_mr(mr);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_dereg_mr);
+
+/* Memory windows */
+
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
+{
+	struct ib_mw *mw;
+
+	if (!pd->device->alloc_mw)
+		return ERR_PTR(-ENOSYS);
+
+	mw = pd->device->alloc_mw(pd);
+	if (!IS_ERR(mw)) {
+		mw->device = pd->device;
+		mw->pd     = pd;
+		atomic_inc(&pd->usecnt);
+	}
+
+	return mw;
+}
+EXPORT_SYMBOL(ib_alloc_mw);
+
+int ib_dealloc_mw(struct ib_mw *mw)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	pd = mw->pd;
+	ret = mw->device->dealloc_mw(mw);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_mw);
+
+/* "Fast" memory regions */
+
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+			    int mr_access_flags,
+			    struct ib_fmr_attr *fmr_attr)
+{
+	struct ib_fmr *fmr;
+
+	if (!pd->device->alloc_fmr)
+		return ERR_PTR(-ENOSYS);
+
+	fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+	if (!IS_ERR(fmr)) {
+		fmr->device = pd->device;
+		fmr->pd     = pd;
+		atomic_inc(&pd->usecnt);
+	}
+
+	return fmr;
+}
+EXPORT_SYMBOL(ib_alloc_fmr);
+
+int ib_unmap_fmr(struct list_head *fmr_list)
+{
+	struct ib_fmr *fmr;
+
+	if (list_empty(fmr_list))
+		return 0;
+
+	fmr = list_entry(fmr_list->next, struct ib_fmr, list);
+	return fmr->device->unmap_fmr(fmr_list);
+}
+EXPORT_SYMBOL(ib_unmap_fmr);
+
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	pd = fmr->pd;
+	ret = fmr->device->dealloc_fmr(fmr);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_fmr);
+
+/* Multicast groups */
+
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+	return qp->device->attach_mcast ?
+		qp->device->attach_mcast(qp, gid, lid) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_attach_mcast);
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+	return qp->device->detach_mcast ?
+		qp->device->detach_mcast(qp, gid, lid) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_detach_mcast);
diff --git a/drivers/infiniband/hw/mthca/Kconfig b/drivers/infiniband/hw/mthca/Kconfig
new file mode 100644
index 000000000000..e88be85b3d5c
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/Kconfig
@@ -0,0 +1,16 @@
+config INFINIBAND_MTHCA
+	tristate "Mellanox HCA support"
+	depends on PCI && INFINIBAND
+	---help---
+	  This is a low-level driver for Mellanox InfiniHost host
+	  channel adapters (HCAs), including the MT23108 PCI-X HCA
+	  ("Tavor") and the MT25208 PCI Express HCA ("Arbel").
+
+config INFINIBAND_MTHCA_DEBUG
+	bool "Verbose debugging output"
+	depends on INFINIBAND_MTHCA
+	default n
+	---help---
+	  This option causes the mthca driver produce a bunch of debug
+	  messages.  Select this is you are developing the driver or
+	  trying to diagnose a problem.
diff --git a/drivers/infiniband/hw/mthca/Makefile b/drivers/infiniband/hw/mthca/Makefile
new file mode 100644
index 000000000000..5dcbd43073e2
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/Makefile
@@ -0,0 +1,12 @@
+EXTRA_CFLAGS += -Idrivers/infiniband/include
+
+ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
+EXTRA_CFLAGS += -DDEBUG
+endif
+
+obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o
+
+ib_mthca-y :=	mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \
+		mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \
+		mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \
+		mthca_provider.o mthca_memfree.o mthca_uar.o
diff --git a/drivers/infiniband/hw/mthca/mthca_allocator.c b/drivers/infiniband/hw/mthca/mthca_allocator.c
new file mode 100644
index 000000000000..b1db48dd91d6
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_allocator.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_allocator.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+
+#include "mthca_dev.h"
+
+/* Trivial bitmap-based allocator */
+u32 mthca_alloc(struct mthca_alloc *alloc)
+{
+	u32 obj;
+
+	spin_lock(&alloc->lock);
+	obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last);
+	if (obj >= alloc->max) {
+		alloc->top = (alloc->top + alloc->max) & alloc->mask;
+		obj = find_first_zero_bit(alloc->table, alloc->max);
+	}
+
+	if (obj < alloc->max) {
+		set_bit(obj, alloc->table);
+		obj |= alloc->top;
+	} else
+		obj = -1;
+
+	spin_unlock(&alloc->lock);
+
+	return obj;
+}
+
+void mthca_free(struct mthca_alloc *alloc, u32 obj)
+{
+	obj &= alloc->max - 1;
+	spin_lock(&alloc->lock);
+	clear_bit(obj, alloc->table);
+	alloc->last = min(alloc->last, obj);
+	alloc->top = (alloc->top + alloc->max) & alloc->mask;
+	spin_unlock(&alloc->lock);
+}
+
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+		     u32 reserved)
+{
+	int i;
+
+	/* num must be a power of 2 */
+	if (num != 1 << (ffs(num) - 1))
+		return -EINVAL;
+
+	alloc->last = 0;
+	alloc->top  = 0;
+	alloc->max  = num;
+	alloc->mask = mask;
+	spin_lock_init(&alloc->lock);
+	alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof (long),
+			       GFP_KERNEL);
+	if (!alloc->table)
+		return -ENOMEM;
+
+	bitmap_zero(alloc->table, num);
+	for (i = 0; i < reserved; ++i)
+		set_bit(i, alloc->table);
+
+	return 0;
+}
+
+void mthca_alloc_cleanup(struct mthca_alloc *alloc)
+{
+	kfree(alloc->table);
+}
+
+/*
+ * Array of pointers with lazy allocation of leaf pages.  Callers of
+ * _get, _set and _clear methods must use a lock or otherwise
+ * serialize access to the array.
+ */
+
+void *mthca_array_get(struct mthca_array *array, int index)
+{
+	int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+	if (array->page_list[p].page) {
+		int i = index & (PAGE_SIZE / sizeof (void *) - 1);
+		return array->page_list[p].page[i];
+	} else
+		return NULL;
+}
+
+int mthca_array_set(struct mthca_array *array, int index, void *value)
+{
+	int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+	/* Allocate with GFP_ATOMIC because we'll be called with locks held. */
+	if (!array->page_list[p].page)
+		array->page_list[p].page = (void **) get_zeroed_page(GFP_ATOMIC);
+
+	if (!array->page_list[p].page)
+		return -ENOMEM;
+
+	array->page_list[p].page[index & (PAGE_SIZE / sizeof (void *) - 1)] =
+		value;
+	++array->page_list[p].used;
+
+	return 0;
+}
+
+void mthca_array_clear(struct mthca_array *array, int index)
+{
+	int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+	if (--array->page_list[p].used == 0) {
+		free_page((unsigned long) array->page_list[p].page);
+		array->page_list[p].page = NULL;
+	}
+
+	if (array->page_list[p].used < 0)
+		pr_debug("Array %p index %d page %d with ref count %d < 0\n",
+			 array, index, p, array->page_list[p].used);
+}
+
+int mthca_array_init(struct mthca_array *array, int nent)
+{
+	int npage = (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE;
+	int i;
+
+	array->page_list = kmalloc(npage * sizeof *array->page_list, GFP_KERNEL);
+	if (!array->page_list)
+		return -ENOMEM;
+
+	for (i = 0; i < npage; ++i) {
+		array->page_list[i].page = NULL;
+		array->page_list[i].used = 0;
+	}
+
+	return 0;
+}
+
+void mthca_array_cleanup(struct mthca_array *array, int nent)
+{
+	int i;
+
+	for (i = 0; i < (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+		free_page((unsigned long) array->page_list[i].page);
+
+	kfree(array->page_list);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
new file mode 100644
index 000000000000..426d32778e9c
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_av.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/init.h>
+
+#include <ib_verbs.h>
+#include <ib_cache.h>
+
+#include "mthca_dev.h"
+
+struct mthca_av {
+	u32 port_pd;
+	u8  reserved1;
+	u8  g_slid;
+	u16 dlid;
+	u8  reserved2;
+	u8  gid_index;
+	u8  msg_sr;
+	u8  hop_limit;
+	u32 sl_tclass_flowlabel;
+	u32 dgid[4];
+};
+
+int mthca_create_ah(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct ib_ah_attr *ah_attr,
+		    struct mthca_ah *ah)
+{
+	u32 index = -1;
+	struct mthca_av *av = NULL;
+
+	ah->type = MTHCA_AH_PCI_POOL;
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		ah->av   = kmalloc(sizeof *ah->av, GFP_KERNEL);
+		if (!ah->av)
+			return -ENOMEM;
+
+		ah->type = MTHCA_AH_KMALLOC;
+		av       = ah->av;
+	} else if (!atomic_read(&pd->sqp_count) &&
+		 !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		index = mthca_alloc(&dev->av_table.alloc);
+
+		/* fall back to allocate in host memory */
+		if (index == -1)
+			goto on_hca_fail;
+
+		av = kmalloc(sizeof *av, GFP_KERNEL);
+		if (!av)
+			goto on_hca_fail;
+
+		ah->type = MTHCA_AH_ON_HCA;
+		ah->avdma  = dev->av_table.ddr_av_base +
+			index * MTHCA_AV_SIZE;
+	}
+
+on_hca_fail:
+	if (ah->type == MTHCA_AH_PCI_POOL) {
+		ah->av = pci_pool_alloc(dev->av_table.pool,
+					SLAB_KERNEL, &ah->avdma);
+		if (!ah->av)
+			return -ENOMEM;
+
+		av = ah->av;
+	}
+
+	ah->key = pd->ntmr.ibmr.lkey;
+
+	memset(av, 0, MTHCA_AV_SIZE);
+
+	av->port_pd = cpu_to_be32(pd->pd_num | (ah_attr->port_num << 24));
+	av->g_slid  = ah_attr->src_path_bits;
+	av->dlid    = cpu_to_be16(ah_attr->dlid);
+	av->msg_sr  = (3 << 4) | /* 2K message */
+		ah_attr->static_rate;
+	av->sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		av->g_slid |= 0x80;
+		av->gid_index = (ah_attr->port_num - 1) * dev->limits.gid_table_len +
+			ah_attr->grh.sgid_index;
+		av->hop_limit = ah_attr->grh.hop_limit;
+		av->sl_tclass_flowlabel |=
+			cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+				    ah_attr->grh.flow_label);
+		memcpy(av->dgid, ah_attr->grh.dgid.raw, 16);
+	} else {
+		/* Arbel workaround -- low byte of GID must be 2 */
+		av->dgid[3] = cpu_to_be32(2);
+	}
+
+	if (0) {
+		int j;
+
+		mthca_dbg(dev, "Created UDAV at %p/%08lx:\n",
+			  av, (unsigned long) ah->avdma);
+		for (j = 0; j < 8; ++j)
+			printk(KERN_DEBUG "  [%2x] %08x\n",
+			       j * 4, be32_to_cpu(((u32 *) av)[j]));
+	}
+
+	if (ah->type == MTHCA_AH_ON_HCA) {
+		memcpy_toio(dev->av_table.av_map + index * MTHCA_AV_SIZE,
+			    av, MTHCA_AV_SIZE);
+		kfree(av);
+	}
+
+	return 0;
+}
+
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah)
+{
+	switch (ah->type) {
+	case MTHCA_AH_ON_HCA:
+		mthca_free(&dev->av_table.alloc,
+ 			   (ah->avdma - dev->av_table.ddr_av_base) /
+			   MTHCA_AV_SIZE);
+		break;
+
+	case MTHCA_AH_PCI_POOL:
+		pci_pool_free(dev->av_table.pool, ah->av, ah->avdma);
+		break;
+
+	case MTHCA_AH_KMALLOC:
+		kfree(ah->av);
+		break;
+	}
+
+	return 0;
+}
+
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+		  struct ib_ud_header *header)
+{
+	if (ah->type == MTHCA_AH_ON_HCA)
+		return -EINVAL;
+
+	header->lrh.service_level   = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28;
+	header->lrh.destination_lid = ah->av->dlid;
+	header->lrh.source_lid      = ah->av->g_slid & 0x7f;
+	if (ah->av->g_slid & 0x80) {
+		header->grh_present = 1;
+		header->grh.traffic_class =
+			(be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff;
+		header->grh.flow_label    =
+			ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+		ib_get_cached_gid(&dev->ib_dev,
+				  be32_to_cpu(ah->av->port_pd) >> 24,
+				  ah->av->gid_index,
+				  &header->grh.source_gid);
+		memcpy(header->grh.destination_gid.raw,
+		       ah->av->dgid, 16);
+	} else {
+		header->grh_present = 0;
+	}
+
+	return 0;
+}
+
+int __devinit mthca_init_av_table(struct mthca_dev *dev)
+{
+	int err;
+
+	if (dev->hca_type == ARBEL_NATIVE)
+		return 0;
+
+	err = mthca_alloc_init(&dev->av_table.alloc,
+			       dev->av_table.num_ddr_avs,
+			       dev->av_table.num_ddr_avs - 1,
+			       0);
+	if (err)
+		return err;
+
+	dev->av_table.pool = pci_pool_create("mthca_av", dev->pdev,
+					     MTHCA_AV_SIZE,
+					     MTHCA_AV_SIZE, 0);
+	if (!dev->av_table.pool)
+		goto out_free_alloc;
+
+	if (!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		dev->av_table.av_map = ioremap(pci_resource_start(dev->pdev, 4) +
+					       dev->av_table.ddr_av_base -
+					       dev->ddr_start,
+					       dev->av_table.num_ddr_avs *
+					       MTHCA_AV_SIZE);
+		if (!dev->av_table.av_map)
+			goto out_free_pool;
+	} else
+		dev->av_table.av_map = NULL;
+
+	return 0;
+
+ out_free_pool:
+	pci_pool_destroy(dev->av_table.pool);
+
+ out_free_alloc:
+	mthca_alloc_cleanup(&dev->av_table.alloc);
+	return -ENOMEM;
+}
+
+void __devexit mthca_cleanup_av_table(struct mthca_dev *dev)
+{
+	if (dev->hca_type == ARBEL_NATIVE)
+		return;
+
+	if (dev->av_table.av_map)
+		iounmap(dev->av_table.av_map);
+	pci_pool_destroy(dev->av_table.pool);
+	mthca_alloc_cleanup(&dev->av_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
new file mode 100644
index 000000000000..9def0981f630
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -0,0 +1,1767 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_cmd.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+#include <asm/io.h>
+#include <ib_mad.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+#define CMD_POLL_TOKEN 0xffff
+
+enum {
+	HCR_IN_PARAM_OFFSET    = 0x00,
+	HCR_IN_MODIFIER_OFFSET = 0x08,
+	HCR_OUT_PARAM_OFFSET   = 0x0c,
+	HCR_TOKEN_OFFSET       = 0x14,
+	HCR_STATUS_OFFSET      = 0x18,
+
+	HCR_OPMOD_SHIFT        = 12,
+	HCA_E_BIT              = 22,
+	HCR_GO_BIT             = 23
+};
+
+enum {
+	/* initialization and general commands */
+	CMD_SYS_EN          = 0x1,
+	CMD_SYS_DIS         = 0x2,
+	CMD_MAP_FA          = 0xfff,
+	CMD_UNMAP_FA        = 0xffe,
+	CMD_RUN_FW          = 0xff6,
+	CMD_MOD_STAT_CFG    = 0x34,
+	CMD_QUERY_DEV_LIM   = 0x3,
+	CMD_QUERY_FW        = 0x4,
+	CMD_ENABLE_LAM      = 0xff8,
+	CMD_DISABLE_LAM     = 0xff7,
+	CMD_QUERY_DDR       = 0x5,
+	CMD_QUERY_ADAPTER   = 0x6,
+	CMD_INIT_HCA        = 0x7,
+	CMD_CLOSE_HCA       = 0x8,
+	CMD_INIT_IB         = 0x9,
+	CMD_CLOSE_IB        = 0xa,
+	CMD_QUERY_HCA       = 0xb,
+	CMD_SET_IB          = 0xc,
+	CMD_ACCESS_DDR      = 0x2e,
+	CMD_MAP_ICM         = 0xffa,
+	CMD_UNMAP_ICM       = 0xff9,
+	CMD_MAP_ICM_AUX     = 0xffc,
+	CMD_UNMAP_ICM_AUX   = 0xffb,
+	CMD_SET_ICM_SIZE    = 0xffd,
+
+	/* TPT commands */
+	CMD_SW2HW_MPT 	    = 0xd,
+	CMD_QUERY_MPT 	    = 0xe,
+	CMD_HW2SW_MPT 	    = 0xf,
+	CMD_READ_MTT        = 0x10,
+	CMD_WRITE_MTT       = 0x11,
+	CMD_SYNC_TPT        = 0x2f,
+
+	/* EQ commands */
+	CMD_MAP_EQ          = 0x12,
+	CMD_SW2HW_EQ 	    = 0x13,
+	CMD_HW2SW_EQ 	    = 0x14,
+	CMD_QUERY_EQ        = 0x15,
+
+	/* CQ commands */
+	CMD_SW2HW_CQ 	    = 0x16,
+	CMD_HW2SW_CQ 	    = 0x17,
+	CMD_QUERY_CQ 	    = 0x18,
+	CMD_RESIZE_CQ       = 0x2c,
+
+	/* SRQ commands */
+	CMD_SW2HW_SRQ 	    = 0x35,
+	CMD_HW2SW_SRQ 	    = 0x36,
+	CMD_QUERY_SRQ       = 0x37,
+
+	/* QP/EE commands */
+	CMD_RST2INIT_QPEE   = 0x19,
+	CMD_INIT2RTR_QPEE   = 0x1a,
+	CMD_RTR2RTS_QPEE    = 0x1b,
+	CMD_RTS2RTS_QPEE    = 0x1c,
+	CMD_SQERR2RTS_QPEE  = 0x1d,
+	CMD_2ERR_QPEE       = 0x1e,
+	CMD_RTS2SQD_QPEE    = 0x1f,
+	CMD_SQD2SQD_QPEE    = 0x38,
+	CMD_SQD2RTS_QPEE    = 0x20,
+	CMD_ERR2RST_QPEE    = 0x21,
+	CMD_QUERY_QPEE      = 0x22,
+	CMD_INIT2INIT_QPEE  = 0x2d,
+	CMD_SUSPEND_QPEE    = 0x32,
+	CMD_UNSUSPEND_QPEE  = 0x33,
+	/* special QPs and management commands */
+	CMD_CONF_SPECIAL_QP = 0x23,
+	CMD_MAD_IFC         = 0x24,
+
+	/* multicast commands */
+	CMD_READ_MGM        = 0x25,
+	CMD_WRITE_MGM       = 0x26,
+	CMD_MGID_HASH       = 0x27,
+
+	/* miscellaneous commands */
+	CMD_DIAG_RPRT       = 0x30,
+	CMD_NOP             = 0x31,
+
+	/* debug commands */
+	CMD_QUERY_DEBUG_MSG = 0x2a,
+	CMD_SET_DEBUG_MSG   = 0x2b,
+};
+
+/*
+ * According to Mellanox code, FW may be starved and never complete
+ * commands.  So we can't use strict timeouts described in PRM -- we
+ * just arbitrarily select 60 seconds for now.
+ */
+#if 0
+/*
+ * Round up and add 1 to make sure we get the full wait time (since we
+ * will be starting in the middle of a jiffy)
+ */
+enum {
+	CMD_TIME_CLASS_A = (HZ + 999) / 1000 + 1,
+	CMD_TIME_CLASS_B = (HZ +  99) /  100 + 1,
+	CMD_TIME_CLASS_C = (HZ +   9) /   10 + 1
+};
+#else
+enum {
+	CMD_TIME_CLASS_A = 60 * HZ,
+	CMD_TIME_CLASS_B = 60 * HZ,
+	CMD_TIME_CLASS_C = 60 * HZ
+};
+#endif
+
+enum {
+	GO_BIT_TIMEOUT = HZ * 10
+};
+
+struct mthca_cmd_context {
+	struct completion done;
+	struct timer_list timer;
+	int               result;
+	int               next;
+	u64               out_param;
+	u16               token;
+	u8                status;
+};
+
+static inline int go_bit(struct mthca_dev *dev)
+{
+	return readl(dev->hcr + HCR_STATUS_OFFSET) &
+		swab32(1 << HCR_GO_BIT);
+}
+
+static int mthca_cmd_post(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 out_param,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  u16 token,
+			  int event)
+{
+	int err = 0;
+
+	if (down_interruptible(&dev->cmd.hcr_sem))
+		return -EINTR;
+
+	if (event) {
+		unsigned long end = jiffies + GO_BIT_TIMEOUT;
+
+		while (go_bit(dev) && time_before(jiffies, end)) {
+			set_current_state(TASK_RUNNING);
+			schedule();
+		}
+	}
+
+	if (go_bit(dev)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	/*
+	 * We use writel (instead of something like memcpy_toio)
+	 * because writes of less than 32 bits to the HCR don't work
+	 * (and some architectures such as ia64 implement memcpy_toio
+	 * in terms of writeb).
+	 */
+	__raw_writel(cpu_to_be32(in_param >> 32),           dev->hcr + 0 * 4);
+	__raw_writel(cpu_to_be32(in_param & 0xfffffffful),  dev->hcr + 1 * 4);
+	__raw_writel(cpu_to_be32(in_modifier),              dev->hcr + 2 * 4);
+	__raw_writel(cpu_to_be32(out_param >> 32),          dev->hcr + 3 * 4);
+	__raw_writel(cpu_to_be32(out_param & 0xfffffffful), dev->hcr + 4 * 4);
+	__raw_writel(cpu_to_be32(token << 16),              dev->hcr + 5 * 4);
+
+	/* __raw_writel may not order writes. */
+	wmb();
+
+	__raw_writel(cpu_to_be32((1 << HCR_GO_BIT)                |
+				 (event ? (1 << HCA_E_BIT) : 0)   |
+				 (op_modifier << HCR_OPMOD_SHIFT) |
+				 op),                       dev->hcr + 6 * 4);
+
+out:
+	up(&dev->cmd.hcr_sem);
+	return err;
+}
+
+static int mthca_cmd_poll(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 *out_param,
+			  int out_is_imm,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  unsigned long timeout,
+			  u8 *status)
+{
+	int err = 0;
+	unsigned long end;
+
+	if (down_interruptible(&dev->cmd.poll_sem))
+		return -EINTR;
+
+	err = mthca_cmd_post(dev, in_param,
+			     out_param ? *out_param : 0,
+			     in_modifier, op_modifier,
+			     op, CMD_POLL_TOKEN, 0);
+	if (err)
+		goto out;
+
+	end = timeout + jiffies;
+	while (go_bit(dev) && time_before(jiffies, end)) {
+		set_current_state(TASK_RUNNING);
+		schedule();
+	}
+
+	if (go_bit(dev)) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (out_is_imm) {
+		memcpy_fromio(out_param, dev->hcr + HCR_OUT_PARAM_OFFSET, sizeof (u64));
+		be64_to_cpus(out_param);
+	}
+
+	*status = be32_to_cpu(__raw_readl(dev->hcr + HCR_STATUS_OFFSET)) >> 24;
+
+out:
+	up(&dev->cmd.poll_sem);
+	return err;
+}
+
+void mthca_cmd_event(struct mthca_dev *dev,
+		     u16 token,
+		     u8  status,
+		     u64 out_param)
+{
+	struct mthca_cmd_context *context =
+		&dev->cmd.context[token & dev->cmd.token_mask];
+
+	/* previously timed out command completing at long last */
+	if (token != context->token)
+		return;
+
+	context->result    = 0;
+	context->status    = status;
+	context->out_param = out_param;
+
+	context->token += dev->cmd.token_mask + 1;
+
+	complete(&context->done);
+}
+
+static void event_timeout(unsigned long context_ptr)
+{
+	struct mthca_cmd_context *context =
+		(struct mthca_cmd_context *) context_ptr;
+
+	context->result = -EBUSY;
+	complete(&context->done);
+}
+
+static int mthca_cmd_wait(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 *out_param,
+			  int out_is_imm,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  unsigned long timeout,
+			  u8 *status)
+{
+	int err = 0;
+	struct mthca_cmd_context *context;
+
+	if (down_interruptible(&dev->cmd.event_sem))
+		return -EINTR;
+
+	spin_lock(&dev->cmd.context_lock);
+	BUG_ON(dev->cmd.free_head < 0);
+	context = &dev->cmd.context[dev->cmd.free_head];
+	dev->cmd.free_head = context->next;
+	spin_unlock(&dev->cmd.context_lock);
+
+	init_completion(&context->done);
+
+	err = mthca_cmd_post(dev, in_param,
+			     out_param ? *out_param : 0,
+			     in_modifier, op_modifier,
+			     op, context->token, 1);
+	if (err)
+		goto out;
+
+	context->timer.expires  = jiffies + timeout;
+	add_timer(&context->timer);
+
+	wait_for_completion(&context->done);
+	del_timer_sync(&context->timer);
+
+	err = context->result;
+	if (err)
+		goto out;
+
+	*status = context->status;
+	if (*status)
+		mthca_dbg(dev, "Command %02x completed with status %02x\n",
+			  op, *status);
+
+	if (out_is_imm)
+		*out_param = context->out_param;
+
+out:
+	spin_lock(&dev->cmd.context_lock);
+	context->next = dev->cmd.free_head;
+	dev->cmd.free_head = context - dev->cmd.context;
+	spin_unlock(&dev->cmd.context_lock);
+
+	up(&dev->cmd.event_sem);
+	return err;
+}
+
+/* Invoke a command with an output mailbox */
+static int mthca_cmd_box(struct mthca_dev *dev,
+			 u64 in_param,
+			 u64 out_param,
+			 u32 in_modifier,
+			 u8 op_modifier,
+			 u16 op,
+			 unsigned long timeout,
+			 u8 *status)
+{
+	if (dev->cmd.use_events)
+		return mthca_cmd_wait(dev, in_param, &out_param, 0,
+				      in_modifier, op_modifier, op,
+				      timeout, status);
+	else
+		return mthca_cmd_poll(dev, in_param, &out_param, 0,
+				      in_modifier, op_modifier, op,
+				      timeout, status);
+}
+
+/* Invoke a command with no output parameter */
+static int mthca_cmd(struct mthca_dev *dev,
+		     u64 in_param,
+		     u32 in_modifier,
+		     u8 op_modifier,
+		     u16 op,
+		     unsigned long timeout,
+		     u8 *status)
+{
+	return mthca_cmd_box(dev, in_param, 0, in_modifier,
+			     op_modifier, op, timeout, status);
+}
+
+/*
+ * Invoke a command with an immediate output parameter (and copy the
+ * output into the caller's out_param pointer after the command
+ * executes).
+ */
+static int mthca_cmd_imm(struct mthca_dev *dev,
+			 u64 in_param,
+			 u64 *out_param,
+			 u32 in_modifier,
+			 u8 op_modifier,
+			 u16 op,
+			 unsigned long timeout,
+			 u8 *status)
+{
+	if (dev->cmd.use_events)
+		return mthca_cmd_wait(dev, in_param, out_param, 1,
+				      in_modifier, op_modifier, op,
+				      timeout, status);
+	else
+		return mthca_cmd_poll(dev, in_param, out_param, 1,
+				      in_modifier, op_modifier, op,
+				      timeout, status);
+}
+
+/*
+ * Switch to using events to issue FW commands (should be called after
+ * event queue to command events has been initialized).
+ */
+int mthca_cmd_use_events(struct mthca_dev *dev)
+{
+	int i;
+
+	dev->cmd.context = kmalloc(dev->cmd.max_cmds *
+				   sizeof (struct mthca_cmd_context),
+				   GFP_KERNEL);
+	if (!dev->cmd.context)
+		return -ENOMEM;
+
+	for (i = 0; i < dev->cmd.max_cmds; ++i) {
+		dev->cmd.context[i].token = i;
+		dev->cmd.context[i].next = i + 1;
+		init_timer(&dev->cmd.context[i].timer);
+		dev->cmd.context[i].timer.data     =
+			(unsigned long) &dev->cmd.context[i];
+		dev->cmd.context[i].timer.function = event_timeout;
+	}
+
+	dev->cmd.context[dev->cmd.max_cmds - 1].next = -1;
+	dev->cmd.free_head = 0;
+
+	sema_init(&dev->cmd.event_sem, dev->cmd.max_cmds);
+	spin_lock_init(&dev->cmd.context_lock);
+
+	for (dev->cmd.token_mask = 1;
+	     dev->cmd.token_mask < dev->cmd.max_cmds;
+	     dev->cmd.token_mask <<= 1)
+		; /* nothing */
+	--dev->cmd.token_mask;
+
+	dev->cmd.use_events = 1;
+	down(&dev->cmd.poll_sem);
+
+	return 0;
+}
+
+/*
+ * Switch back to polling (used when shutting down the device)
+ */
+void mthca_cmd_use_polling(struct mthca_dev *dev)
+{
+	int i;
+
+	dev->cmd.use_events = 0;
+
+	for (i = 0; i < dev->cmd.max_cmds; ++i)
+		down(&dev->cmd.event_sem);
+
+	kfree(dev->cmd.context);
+
+	up(&dev->cmd.poll_sem);
+}
+
+int mthca_SYS_EN(struct mthca_dev *dev, u8 *status)
+{
+	u64 out;
+	int ret;
+
+	ret = mthca_cmd_imm(dev, 0, &out, 0, 0, CMD_SYS_EN, HZ, status);
+
+	if (*status == MTHCA_CMD_STAT_DDR_MEM_ERR)
+		mthca_warn(dev, "SYS_EN DDR error: syn=%x, sock=%d, "
+			   "sladdr=%d, SPD source=%s\n",
+			   (int) (out >> 6) & 0xf, (int) (out >> 4) & 3,
+			   (int) (out >> 1) & 7, (int) out & 1 ? "NVMEM" : "DIMM");
+
+	return ret;
+}
+
+int mthca_SYS_DIS(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, HZ, status);
+}
+
+static int mthca_map_cmd(struct mthca_dev *dev, u16 op, struct mthca_icm *icm,
+			 u64 virt, u8 *status)
+{
+	u32 *inbox;
+	dma_addr_t indma;
+	struct mthca_icm_iter iter;
+	int lg;
+	int nent = 0;
+	int i;
+	int err = 0;
+	int ts = 0, tc = 0;
+
+	inbox = pci_alloc_consistent(dev->pdev, PAGE_SIZE, &indma);
+	if (!inbox)
+		return -ENOMEM;
+
+	memset(inbox, 0, PAGE_SIZE);
+
+	for (mthca_icm_first(icm, &iter);
+	     !mthca_icm_last(&iter);
+	     mthca_icm_next(&iter)) {
+		/*
+		 * We have to pass pages that are aligned to their
+		 * size, so find the least significant 1 in the
+		 * address or size and use that as our log2 size.
+		 */
+		lg = ffs(mthca_icm_addr(&iter) | mthca_icm_size(&iter)) - 1;
+		if (lg < 12) {
+			mthca_warn(dev, "Got FW area not aligned to 4K (%llx/%lx).\n",
+				   (unsigned long long) mthca_icm_addr(&iter),
+				   mthca_icm_size(&iter));
+			err = -EINVAL;
+			goto out;
+		}
+		for (i = 0; i < mthca_icm_size(&iter) / (1 << lg); ++i, ++nent) {
+			if (virt != -1) {
+				*((__be64 *) (inbox + nent * 4)) =
+					cpu_to_be64(virt);
+				virt += 1 << lg;
+			}
+
+			*((__be64 *) (inbox + nent * 4 + 2)) =
+				cpu_to_be64((mthca_icm_addr(&iter) +
+					     (i << lg)) | (lg - 12));
+			ts += 1 << (lg - 10);
+			++tc;
+
+			if (nent == PAGE_SIZE / 16) {
+				err = mthca_cmd(dev, indma, nent, 0, op,
+						CMD_TIME_CLASS_B, status);
+				if (err || *status)
+					goto out;
+				nent = 0;
+			}
+		}
+	}
+
+	if (nent)
+		err = mthca_cmd(dev, indma, nent, 0, op,
+				CMD_TIME_CLASS_B, status);
+
+	switch (op) {
+	case CMD_MAP_FA:
+		mthca_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts);
+		break;
+	case CMD_MAP_ICM_AUX:
+		mthca_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts);
+		break;
+	case CMD_MAP_ICM:
+		mthca_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n",
+			  tc, ts, (unsigned long long) virt - (ts << 10));
+		break;
+	}
+
+out:
+	pci_free_consistent(dev->pdev, PAGE_SIZE, inbox, indma);
+	return err;
+}
+
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status)
+{
+	return mthca_map_cmd(dev, CMD_MAP_FA, icm, -1, status);
+}
+
+int mthca_UNMAP_FA(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_FA, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_RUN_FW(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_RUN_FW, CMD_TIME_CLASS_A, status);
+}
+
+int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status)
+{
+	u32 *outbox;
+	dma_addr_t outdma;
+	int err = 0;
+	u8 lg;
+
+#define QUERY_FW_OUT_SIZE             0x100
+#define QUERY_FW_VER_OFFSET            0x00
+#define QUERY_FW_MAX_CMD_OFFSET        0x0f
+#define QUERY_FW_ERR_START_OFFSET      0x30
+#define QUERY_FW_ERR_SIZE_OFFSET       0x38
+
+#define QUERY_FW_START_OFFSET          0x20
+#define QUERY_FW_END_OFFSET            0x28
+
+#define QUERY_FW_SIZE_OFFSET           0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET   0x20
+#define QUERY_FW_EQ_ARM_BASE_OFFSET    0x40
+#define QUERY_FW_EQ_SET_CI_BASE_OFFSET 0x48
+
+	outbox = pci_alloc_consistent(dev->pdev, QUERY_FW_OUT_SIZE, &outdma);
+	if (!outbox) {
+		return -ENOMEM;
+	}
+
+	err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_FW,
+			    CMD_TIME_CLASS_A, status);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(dev->fw_ver,   outbox, QUERY_FW_VER_OFFSET);
+	/*
+	 * FW subminor version is at more signifant bits than minor
+	 * version, so swap here.
+	 */
+	dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) |
+		((dev->fw_ver & 0xffff0000ull) >> 16) |
+		((dev->fw_ver & 0x0000ffffull) << 16);
+
+	MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+	dev->cmd.max_cmds = 1 << lg;
+
+	mthca_dbg(dev, "FW version %012llx, max commands %d\n",
+		  (unsigned long long) dev->fw_ver, dev->cmd.max_cmds);
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		MTHCA_GET(dev->fw.arbel.fw_pages,       outbox, QUERY_FW_SIZE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.clr_int_base,   outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.eq_arm_base,    outbox, QUERY_FW_EQ_ARM_BASE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.eq_set_ci_base, outbox, QUERY_FW_EQ_SET_CI_BASE_OFFSET);
+		mthca_dbg(dev, "FW size %d KB\n", dev->fw.arbel.fw_pages << 2);
+
+		/*
+		 * Arbel page size is always 4 KB; round up number of
+		 * system pages needed.
+		 */
+		dev->fw.arbel.fw_pages =
+			(dev->fw.arbel.fw_pages + (1 << (PAGE_SHIFT - 12)) - 1) >>
+			(PAGE_SHIFT - 12);
+
+		mthca_dbg(dev, "Clear int @ %llx, EQ arm @ %llx, EQ set CI @ %llx\n",
+			  (unsigned long long) dev->fw.arbel.clr_int_base,
+			  (unsigned long long) dev->fw.arbel.eq_arm_base,
+			  (unsigned long long) dev->fw.arbel.eq_set_ci_base);
+	} else {
+		MTHCA_GET(dev->fw.tavor.fw_start, outbox, QUERY_FW_START_OFFSET);
+		MTHCA_GET(dev->fw.tavor.fw_end,   outbox, QUERY_FW_END_OFFSET);
+
+		mthca_dbg(dev, "FW size %d KB (start %llx, end %llx)\n",
+			  (int) ((dev->fw.tavor.fw_end - dev->fw.tavor.fw_start) >> 10),
+			  (unsigned long long) dev->fw.tavor.fw_start,
+			  (unsigned long long) dev->fw.tavor.fw_end);
+	}
+
+out:
+	pci_free_consistent(dev->pdev, QUERY_FW_OUT_SIZE, outbox, outdma);
+	return err;
+}
+
+int mthca_ENABLE_LAM(struct mthca_dev *dev, u8 *status)
+{
+	u8 info;
+	u32 *outbox;
+	dma_addr_t outdma;
+	int err = 0;
+
+#define ENABLE_LAM_OUT_SIZE         0x100
+#define ENABLE_LAM_START_OFFSET     0x00
+#define ENABLE_LAM_END_OFFSET       0x08
+#define ENABLE_LAM_INFO_OFFSET      0x13
+
+#define ENABLE_LAM_INFO_HIDDEN_FLAG (1 << 4)
+#define ENABLE_LAM_INFO_ECC_MASK    0x3
+
+	outbox = pci_alloc_consistent(dev->pdev, ENABLE_LAM_OUT_SIZE, &outdma);
+	if (!outbox)
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_ENABLE_LAM,
+			    CMD_TIME_CLASS_C, status);
+
+	if (err)
+		goto out;
+
+	if (*status == MTHCA_CMD_STAT_LAM_NOT_PRE)
+		goto out;
+
+	MTHCA_GET(dev->ddr_start, outbox, ENABLE_LAM_START_OFFSET);
+	MTHCA_GET(dev->ddr_end,   outbox, ENABLE_LAM_END_OFFSET);
+	MTHCA_GET(info,           outbox, ENABLE_LAM_INFO_OFFSET);
+
+	if (!!(info & ENABLE_LAM_INFO_HIDDEN_FLAG) !=
+	    !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		mthca_info(dev, "FW reports that HCA-attached memory "
+			   "is %s hidden; does not match PCI config\n",
+			   (info & ENABLE_LAM_INFO_HIDDEN_FLAG) ?
+			   "" : "not");
+	}
+	if (info & ENABLE_LAM_INFO_HIDDEN_FLAG)
+		mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+	mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+		  (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+		  (unsigned long long) dev->ddr_start,
+		  (unsigned long long) dev->ddr_end);
+
+out:
+	pci_free_consistent(dev->pdev, ENABLE_LAM_OUT_SIZE, outbox, outdma);
+	return err;
+}
+
+int mthca_DISABLE_LAM(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C, status);
+}
+
+int mthca_QUERY_DDR(struct mthca_dev *dev, u8 *status)
+{
+	u8 info;
+	u32 *outbox;
+	dma_addr_t outdma;
+	int err = 0;
+
+#define QUERY_DDR_OUT_SIZE         0x100
+#define QUERY_DDR_START_OFFSET     0x00
+#define QUERY_DDR_END_OFFSET       0x08
+#define QUERY_DDR_INFO_OFFSET      0x13
+
+#define QUERY_DDR_INFO_HIDDEN_FLAG (1 << 4)
+#define QUERY_DDR_INFO_ECC_MASK    0x3
+
+	outbox = pci_alloc_consistent(dev->pdev, QUERY_DDR_OUT_SIZE, &outdma);
+	if (!outbox)
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_DDR,
+			    CMD_TIME_CLASS_A, status);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(dev->ddr_start, outbox, QUERY_DDR_START_OFFSET);
+	MTHCA_GET(dev->ddr_end,   outbox, QUERY_DDR_END_OFFSET);
+	MTHCA_GET(info,           outbox, QUERY_DDR_INFO_OFFSET);
+
+	if (!!(info & QUERY_DDR_INFO_HIDDEN_FLAG) !=
+	    !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		mthca_info(dev, "FW reports that HCA-attached memory "
+			   "is %s hidden; does not match PCI config\n",
+			   (info & QUERY_DDR_INFO_HIDDEN_FLAG) ?
+			   "" : "not");
+	}
+	if (info & QUERY_DDR_INFO_HIDDEN_FLAG)
+		mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+	mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+		  (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+		  (unsigned long long) dev->ddr_start,
+		  (unsigned long long) dev->ddr_end);
+
+out:
+	pci_free_consistent(dev->pdev, QUERY_DDR_OUT_SIZE, outbox, outdma);
+	return err;
+}
+
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+			struct mthca_dev_lim *dev_lim, u8 *status)
+{
+	u32 *outbox;
+	dma_addr_t outdma;
+	u8 field;
+	u16 size;
+	int err;
+
+#define QUERY_DEV_LIM_OUT_SIZE             0x100
+#define QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET     0x10
+#define QUERY_DEV_LIM_MAX_QP_SZ_OFFSET      0x11
+#define QUERY_DEV_LIM_RSVD_QP_OFFSET        0x12
+#define QUERY_DEV_LIM_MAX_QP_OFFSET         0x13
+#define QUERY_DEV_LIM_RSVD_SRQ_OFFSET       0x14
+#define QUERY_DEV_LIM_MAX_SRQ_OFFSET        0x15
+#define QUERY_DEV_LIM_RSVD_EEC_OFFSET       0x16
+#define QUERY_DEV_LIM_MAX_EEC_OFFSET        0x17
+#define QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET      0x19
+#define QUERY_DEV_LIM_RSVD_CQ_OFFSET        0x1a
+#define QUERY_DEV_LIM_MAX_CQ_OFFSET         0x1b
+#define QUERY_DEV_LIM_MAX_MPT_OFFSET        0x1d
+#define QUERY_DEV_LIM_RSVD_EQ_OFFSET        0x1e
+#define QUERY_DEV_LIM_MAX_EQ_OFFSET         0x1f
+#define QUERY_DEV_LIM_RSVD_MTT_OFFSET       0x20
+#define QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET     0x21
+#define QUERY_DEV_LIM_RSVD_MRW_OFFSET       0x22
+#define QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET    0x23
+#define QUERY_DEV_LIM_MAX_AV_OFFSET         0x27
+#define QUERY_DEV_LIM_MAX_REQ_QP_OFFSET     0x29
+#define QUERY_DEV_LIM_MAX_RES_QP_OFFSET     0x2b
+#define QUERY_DEV_LIM_MAX_RDMA_OFFSET       0x2f
+#define QUERY_DEV_LIM_RSZ_SRQ_OFFSET        0x33
+#define QUERY_DEV_LIM_ACK_DELAY_OFFSET      0x35
+#define QUERY_DEV_LIM_MTU_WIDTH_OFFSET      0x36
+#define QUERY_DEV_LIM_VL_PORT_OFFSET        0x37
+#define QUERY_DEV_LIM_MAX_GID_OFFSET        0x3b
+#define QUERY_DEV_LIM_MAX_PKEY_OFFSET       0x3f
+#define QUERY_DEV_LIM_FLAGS_OFFSET          0x44
+#define QUERY_DEV_LIM_RSVD_UAR_OFFSET       0x48
+#define QUERY_DEV_LIM_UAR_SZ_OFFSET         0x49
+#define QUERY_DEV_LIM_PAGE_SZ_OFFSET        0x4b
+#define QUERY_DEV_LIM_MAX_SG_OFFSET         0x51
+#define QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET    0x52
+#define QUERY_DEV_LIM_MAX_SG_RQ_OFFSET      0x55
+#define QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET 0x56
+#define QUERY_DEV_LIM_MAX_QP_MCG_OFFSET     0x61
+#define QUERY_DEV_LIM_RSVD_MCG_OFFSET       0x62
+#define QUERY_DEV_LIM_MAX_MCG_OFFSET        0x63
+#define QUERY_DEV_LIM_RSVD_PD_OFFSET        0x64
+#define QUERY_DEV_LIM_MAX_PD_OFFSET         0x65
+#define QUERY_DEV_LIM_RSVD_RDD_OFFSET       0x66
+#define QUERY_DEV_LIM_MAX_RDD_OFFSET        0x67
+#define QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET   0x80
+#define QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET   0x82
+#define QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET  0x84
+#define QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET  0x86
+#define QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET   0x88
+#define QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET   0x8a
+#define QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET   0x8c
+#define QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET   0x8e
+#define QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET   0x90
+#define QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET   0x92
+#define QUERY_DEV_LIM_PBL_SZ_OFFSET         0x96
+#define QUERY_DEV_LIM_BMME_FLAGS_OFFSET     0x97
+#define QUERY_DEV_LIM_RSVD_LKEY_OFFSET      0x98
+#define QUERY_DEV_LIM_LAMR_OFFSET           0x9f
+#define QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET     0xa0
+
+	outbox = pci_alloc_consistent(dev->pdev, QUERY_DEV_LIM_OUT_SIZE, &outdma);
+	if (!outbox)
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_DEV_LIM,
+			    CMD_TIME_CLASS_A, status);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET);
+	dev_lim->max_srq_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET);
+	dev_lim->max_qp_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET);
+	dev_lim->reserved_qps = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET);
+	dev_lim->max_qps = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_SRQ_OFFSET);
+	dev_lim->reserved_srqs = 1 << (field >> 4);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_OFFSET);
+	dev_lim->max_srqs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EEC_OFFSET);
+	dev_lim->reserved_eecs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EEC_OFFSET);
+	dev_lim->max_eecs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET);
+	dev_lim->max_cq_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_CQ_OFFSET);
+	dev_lim->reserved_cqs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_OFFSET);
+	dev_lim->max_cqs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MPT_OFFSET);
+	dev_lim->max_mpts = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EQ_OFFSET);
+	dev_lim->reserved_eqs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET);
+	dev_lim->max_eqs = 1 << (field & 0x7);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET);
+	dev_lim->reserved_mtts = 1 << (field >> 4);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET);
+	dev_lim->max_mrw_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET);
+	dev_lim->reserved_mrws = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET);
+	dev_lim->max_mtt_seg = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_REQ_QP_OFFSET);
+	dev_lim->max_requester_per_qp = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RES_QP_OFFSET);
+	dev_lim->max_responder_per_qp = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDMA_OFFSET);
+	dev_lim->max_rdma_global = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_ACK_DELAY_OFFSET);
+	dev_lim->local_ca_ack_delay = field & 0x1f;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MTU_WIDTH_OFFSET);
+	dev_lim->max_mtu        = field >> 4;
+	dev_lim->max_port_width = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_VL_PORT_OFFSET);
+	dev_lim->max_vl    = field >> 4;
+	dev_lim->num_ports = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET);
+	dev_lim->max_gids = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET);
+	dev_lim->max_pkeys = 1 << (field & 0xf);
+	MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_UAR_OFFSET);
+	dev_lim->reserved_uars = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_UAR_SZ_OFFSET);
+	dev_lim->uar_size = 1 << ((field & 0x3f) + 20);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_PAGE_SZ_OFFSET);
+	dev_lim->min_page_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_OFFSET);
+	dev_lim->max_sg = field;
+
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET);
+	dev_lim->max_desc_sz = size;
+
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_MCG_OFFSET);
+	dev_lim->max_qp_per_mcg = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MCG_OFFSET);
+	dev_lim->reserved_mgms = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MCG_OFFSET);
+	dev_lim->max_mcgs = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_PD_OFFSET);
+	dev_lim->reserved_pds = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PD_OFFSET);
+	dev_lim->max_pds = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_RDD_OFFSET);
+	dev_lim->reserved_rdds = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDD_OFFSET);
+	dev_lim->max_rdds = 1 << (field & 0x3f);
+
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET);
+	dev_lim->eec_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET);
+	dev_lim->qpc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET);
+	dev_lim->eeec_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET);
+	dev_lim->eqpc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET);
+	dev_lim->eqc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET);
+	dev_lim->cqc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET);
+	dev_lim->srq_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET);
+	dev_lim->uar_scratch_entry_sz = size;
+
+	mthca_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+		  dev_lim->max_qps, dev_lim->reserved_qps, dev_lim->qpc_entry_sz);
+	mthca_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+		  dev_lim->max_cqs, dev_lim->reserved_cqs, dev_lim->cqc_entry_sz);
+	mthca_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+		  dev_lim->max_eqs, dev_lim->reserved_eqs, dev_lim->eqc_entry_sz);
+	mthca_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+		  dev_lim->reserved_mrws, dev_lim->reserved_mtts);
+	mthca_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+		  dev_lim->max_pds, dev_lim->reserved_pds, dev_lim->reserved_uars);
+	mthca_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+		  dev_lim->max_pds, dev_lim->reserved_mgms);
+
+	mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags);
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET);
+		dev_lim->hca.arbel.resize_srq = field & 1;
+		MTHCA_GET(size, outbox, QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET);
+		dev_lim->mtt_seg_sz = size;
+		MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET);
+		dev_lim->mpt_entry_sz = size;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET);
+		dev_lim->hca.arbel.max_pbl_sz = 1 << (field & 0x3f);
+		MTHCA_GET(dev_lim->hca.arbel.bmme_flags, outbox,
+			  QUERY_DEV_LIM_BMME_FLAGS_OFFSET);
+		MTHCA_GET(dev_lim->hca.arbel.reserved_lkey, outbox,
+			  QUERY_DEV_LIM_RSVD_LKEY_OFFSET);
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_LAMR_OFFSET);
+		dev_lim->hca.arbel.lam_required = field & 1;
+		MTHCA_GET(dev_lim->hca.arbel.max_icm_sz, outbox,
+			  QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET);
+
+		if (dev_lim->hca.arbel.bmme_flags & 1)
+			mthca_dbg(dev, "Base MM extensions: yes "
+				  "(flags %d, max PBL %d, rsvd L_Key %08x)\n",
+				  dev_lim->hca.arbel.bmme_flags,
+				  dev_lim->hca.arbel.max_pbl_sz,
+				  dev_lim->hca.arbel.reserved_lkey);
+		else
+			mthca_dbg(dev, "Base MM extensions: no\n");
+
+		mthca_dbg(dev, "Max ICM size %lld MB\n",
+			  (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20);
+	} else {
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET);
+		dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f);
+		dev_lim->mtt_seg_sz   = MTHCA_MTT_SEG_SIZE;
+		dev_lim->mpt_entry_sz = MTHCA_MPT_ENTRY_SIZE;
+	}
+
+out:
+	pci_free_consistent(dev->pdev, QUERY_DEV_LIM_OUT_SIZE, outbox, outdma);
+	return err;
+}
+
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+			struct mthca_adapter *adapter, u8 *status)
+{
+	u32 *outbox;
+	dma_addr_t outdma;
+	int err;
+
+#define QUERY_ADAPTER_OUT_SIZE             0x100
+#define QUERY_ADAPTER_VENDOR_ID_OFFSET     0x00
+#define QUERY_ADAPTER_DEVICE_ID_OFFSET     0x04
+#define QUERY_ADAPTER_REVISION_ID_OFFSET   0x08
+#define QUERY_ADAPTER_INTA_PIN_OFFSET      0x10
+
+	outbox = pci_alloc_consistent(dev->pdev, QUERY_ADAPTER_OUT_SIZE, &outdma);
+	if (!outbox)
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_ADAPTER,
+			    CMD_TIME_CLASS_A, status);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(adapter->vendor_id, outbox, QUERY_ADAPTER_VENDOR_ID_OFFSET);
+	MTHCA_GET(adapter->device_id, outbox, QUERY_ADAPTER_DEVICE_ID_OFFSET);
+	MTHCA_GET(adapter->revision_id, outbox, QUERY_ADAPTER_REVISION_ID_OFFSET);
+	MTHCA_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+out:
+	pci_free_consistent(dev->pdev, QUERY_DEV_LIM_OUT_SIZE, outbox, outdma);
+	return err;
+}
+
+int mthca_INIT_HCA(struct mthca_dev *dev,
+		   struct mthca_init_hca_param *param,
+		   u8 *status)
+{
+	u32 *inbox;
+	dma_addr_t indma;
+	int err;
+
+#define INIT_HCA_IN_SIZE             	 0x200
+#define INIT_HCA_FLAGS_OFFSET        	 0x014
+#define INIT_HCA_QPC_OFFSET          	 0x020
+#define  INIT_HCA_QPC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x10)
+#define  INIT_HCA_LOG_QP_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x17)
+#define  INIT_HCA_EEC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x20)
+#define  INIT_HCA_LOG_EEC_OFFSET     	 (INIT_HCA_QPC_OFFSET + 0x27)
+#define  INIT_HCA_SRQC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x28)
+#define  INIT_HCA_LOG_SRQ_OFFSET     	 (INIT_HCA_QPC_OFFSET + 0x2f)
+#define  INIT_HCA_CQC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x30)
+#define  INIT_HCA_LOG_CQ_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x37)
+#define  INIT_HCA_EQPC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x40)
+#define  INIT_HCA_EEEC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x50)
+#define  INIT_HCA_EQC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x60)
+#define  INIT_HCA_LOG_EQ_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x67)
+#define  INIT_HCA_RDB_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x70)
+#define INIT_HCA_UDAV_OFFSET         	 0x0b0
+#define  INIT_HCA_UDAV_LKEY_OFFSET   	 (INIT_HCA_UDAV_OFFSET + 0x0)
+#define  INIT_HCA_UDAV_PD_OFFSET     	 (INIT_HCA_UDAV_OFFSET + 0x4)
+#define INIT_HCA_MCAST_OFFSET        	 0x0c0
+#define  INIT_HCA_MC_BASE_OFFSET         (INIT_HCA_MCAST_OFFSET + 0x00)
+#define  INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define  INIT_HCA_MC_HASH_SZ_OFFSET      (INIT_HCA_MCAST_OFFSET + 0x16)
+#define  INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define INIT_HCA_TPT_OFFSET              0x0f0
+#define  INIT_HCA_MPT_BASE_OFFSET        (INIT_HCA_TPT_OFFSET + 0x00)
+#define  INIT_HCA_MTT_SEG_SZ_OFFSET      (INIT_HCA_TPT_OFFSET + 0x09)
+#define  INIT_HCA_LOG_MPT_SZ_OFFSET      (INIT_HCA_TPT_OFFSET + 0x0b)
+#define  INIT_HCA_MTT_BASE_OFFSET        (INIT_HCA_TPT_OFFSET + 0x10)
+#define INIT_HCA_UAR_OFFSET              0x120
+#define  INIT_HCA_UAR_BASE_OFFSET        (INIT_HCA_UAR_OFFSET + 0x00)
+#define  INIT_HCA_UARC_SZ_OFFSET         (INIT_HCA_UAR_OFFSET + 0x09)
+#define  INIT_HCA_LOG_UAR_SZ_OFFSET      (INIT_HCA_UAR_OFFSET + 0x0a)
+#define  INIT_HCA_UAR_PAGE_SZ_OFFSET     (INIT_HCA_UAR_OFFSET + 0x0b)
+#define  INIT_HCA_UAR_SCATCH_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x10)
+#define  INIT_HCA_UAR_CTX_BASE_OFFSET    (INIT_HCA_UAR_OFFSET + 0x18)
+
+	inbox = pci_alloc_consistent(dev->pdev, INIT_HCA_IN_SIZE, &indma);
+	if (!inbox)
+		return -ENOMEM;
+
+	memset(inbox, 0, INIT_HCA_IN_SIZE);
+
+#if defined(__LITTLE_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+	/* Check port for UD address vector: */
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1);
+
+	/* We leave wqe_quota, responder_exu, etc as 0 (default) */
+
+	/* QPC/EEC/CQC/EQC/RDB attributes */
+
+	MTHCA_PUT(inbox, param->qpc_base,     INIT_HCA_QPC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_qps,  INIT_HCA_LOG_QP_OFFSET);
+	MTHCA_PUT(inbox, param->eec_base,     INIT_HCA_EEC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_eecs, INIT_HCA_LOG_EEC_OFFSET);
+	MTHCA_PUT(inbox, param->srqc_base,    INIT_HCA_SRQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET);
+	MTHCA_PUT(inbox, param->cqc_base,     INIT_HCA_CQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_cqs,  INIT_HCA_LOG_CQ_OFFSET);
+	MTHCA_PUT(inbox, param->eqpc_base,    INIT_HCA_EQPC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->eeec_base,    INIT_HCA_EEEC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->eqc_base,     INIT_HCA_EQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_eqs,  INIT_HCA_LOG_EQ_OFFSET);
+	MTHCA_PUT(inbox, param->rdb_base,     INIT_HCA_RDB_BASE_OFFSET);
+
+	/* UD AV attributes */
+
+	/* multicast attributes */
+
+	MTHCA_PUT(inbox, param->mc_base,         INIT_HCA_MC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->mc_hash_sz,      INIT_HCA_MC_HASH_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+
+	/* TPT attributes */
+
+	MTHCA_PUT(inbox, param->mpt_base,   INIT_HCA_MPT_BASE_OFFSET);
+	if (dev->hca_type != ARBEL_NATIVE)
+		MTHCA_PUT(inbox, param->mtt_seg_sz, INIT_HCA_MTT_SEG_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->mtt_base,   INIT_HCA_MTT_BASE_OFFSET);
+
+	/* UAR attributes */
+	{
+		u8 uar_page_sz = PAGE_SHIFT - 12;
+		MTHCA_PUT(inbox, uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET);
+	}
+
+	MTHCA_PUT(inbox, param->uar_scratch_base, INIT_HCA_UAR_SCATCH_BASE_OFFSET);
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		MTHCA_PUT(inbox, param->log_uarc_sz, INIT_HCA_UARC_SZ_OFFSET);
+		MTHCA_PUT(inbox, param->log_uar_sz,  INIT_HCA_LOG_UAR_SZ_OFFSET);
+		MTHCA_PUT(inbox, param->uarc_base,   INIT_HCA_UAR_CTX_BASE_OFFSET);
+	}
+
+	err = mthca_cmd(dev, indma, 0, 0, CMD_INIT_HCA,
+			HZ, status);
+
+	pci_free_consistent(dev->pdev, INIT_HCA_IN_SIZE, inbox, indma);
+	return err;
+}
+
+int mthca_INIT_IB(struct mthca_dev *dev,
+		  struct mthca_init_ib_param *param,
+		  int port, u8 *status)
+{
+	u32 *inbox;
+	dma_addr_t indma;
+	int err;
+	u32 flags;
+
+#define INIT_IB_IN_SIZE          56
+#define INIT_IB_FLAGS_OFFSET     0x00
+#define INIT_IB_FLAG_SIG         (1 << 18)
+#define INIT_IB_FLAG_NG          (1 << 17)
+#define INIT_IB_FLAG_G0          (1 << 16)
+#define INIT_IB_FLAG_1X          (1 << 8)
+#define INIT_IB_FLAG_4X          (1 << 9)
+#define INIT_IB_FLAG_12X         (1 << 11)
+#define INIT_IB_VL_SHIFT         4
+#define INIT_IB_MTU_SHIFT        12
+#define INIT_IB_MAX_GID_OFFSET   0x06
+#define INIT_IB_MAX_PKEY_OFFSET  0x0a
+#define INIT_IB_GUID0_OFFSET     0x10
+#define INIT_IB_NODE_GUID_OFFSET 0x18
+#define INIT_IB_SI_GUID_OFFSET   0x20
+
+	inbox = pci_alloc_consistent(dev->pdev, INIT_IB_IN_SIZE, &indma);
+	if (!inbox)
+		return -ENOMEM;
+
+	memset(inbox, 0, INIT_IB_IN_SIZE);
+
+	flags = 0;
+	flags |= param->enable_1x     ? INIT_IB_FLAG_1X  : 0;
+	flags |= param->enable_4x     ? INIT_IB_FLAG_4X  : 0;
+	flags |= param->set_guid0     ? INIT_IB_FLAG_G0  : 0;
+	flags |= param->set_node_guid ? INIT_IB_FLAG_NG  : 0;
+	flags |= param->set_si_guid   ? INIT_IB_FLAG_SIG : 0;
+	flags |= param->vl_cap << INIT_IB_VL_SHIFT;
+	flags |= param->mtu_cap << INIT_IB_MTU_SHIFT;
+	MTHCA_PUT(inbox, flags, INIT_IB_FLAGS_OFFSET);
+
+	MTHCA_PUT(inbox, param->gid_cap,   INIT_IB_MAX_GID_OFFSET);
+	MTHCA_PUT(inbox, param->pkey_cap,  INIT_IB_MAX_PKEY_OFFSET);
+	MTHCA_PUT(inbox, param->guid0,     INIT_IB_GUID0_OFFSET);
+	MTHCA_PUT(inbox, param->node_guid, INIT_IB_NODE_GUID_OFFSET);
+	MTHCA_PUT(inbox, param->si_guid,   INIT_IB_SI_GUID_OFFSET);
+
+	err = mthca_cmd(dev, indma, port, 0, CMD_INIT_IB,
+			CMD_TIME_CLASS_A, status);
+
+	pci_free_consistent(dev->pdev, INIT_HCA_IN_SIZE, inbox, indma);
+	return err;
+}
+
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port, u8 *status)
+{
+	return mthca_cmd(dev, 0, port, 0, CMD_CLOSE_IB, HZ, status);
+}
+
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, panic, CMD_CLOSE_HCA, HZ, status);
+}
+
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+		 int port, u8 *status)
+{
+	u32 *inbox;
+	dma_addr_t indma;
+	int err;
+	u32 flags = 0;
+
+#define SET_IB_IN_SIZE         0x40
+#define SET_IB_FLAGS_OFFSET    0x00
+#define SET_IB_FLAG_SIG        (1 << 18)
+#define SET_IB_FLAG_RQK        (1 <<  0)
+#define SET_IB_CAP_MASK_OFFSET 0x04
+#define SET_IB_SI_GUID_OFFSET  0x08
+
+	inbox = pci_alloc_consistent(dev->pdev, SET_IB_IN_SIZE, &indma);
+	if (!inbox)
+		return -ENOMEM;
+
+	memset(inbox, 0, SET_IB_IN_SIZE);
+
+	flags |= param->set_si_guid     ? SET_IB_FLAG_SIG : 0;
+	flags |= param->reset_qkey_viol ? SET_IB_FLAG_RQK : 0;
+	MTHCA_PUT(inbox, flags, SET_IB_FLAGS_OFFSET);
+
+	MTHCA_PUT(inbox, param->cap_mask, SET_IB_CAP_MASK_OFFSET);
+	MTHCA_PUT(inbox, param->si_guid,  SET_IB_SI_GUID_OFFSET);
+
+	err = mthca_cmd(dev, indma, port, 0, CMD_SET_IB,
+			CMD_TIME_CLASS_B, status);
+
+	pci_free_consistent(dev->pdev, INIT_HCA_IN_SIZE, inbox, indma);
+	return err;
+}
+
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt, u8 *status)
+{
+	return mthca_map_cmd(dev, CMD_MAP_ICM, icm, virt, status);
+}
+
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt, u8 *status)
+{
+	u64 *inbox;
+	dma_addr_t indma;
+	int err;
+
+	inbox = pci_alloc_consistent(dev->pdev, 16, &indma);
+	if (!inbox)
+		return -ENOMEM;
+
+	inbox[0] = cpu_to_be64(virt);
+	inbox[1] = cpu_to_be64(dma_addr);
+
+	err = mthca_cmd(dev, indma, 1, 0, CMD_MAP_ICM, CMD_TIME_CLASS_B, status);
+
+	pci_free_consistent(dev->pdev, 16, inbox, indma);
+
+	if (!err)
+		mthca_dbg(dev, "Mapped page at %llx for ICM.\n",
+			  (unsigned long long) virt);
+
+	return err;
+}
+
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count, u8 *status)
+{
+	mthca_dbg(dev, "Unmapping %d pages at %llx from ICM.\n",
+		  page_count, (unsigned long long) virt);
+
+	return mthca_cmd(dev, virt, page_count, 0, CMD_UNMAP_ICM, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status)
+{
+	return mthca_map_cmd(dev, CMD_MAP_ICM_AUX, icm, -1, status);
+}
+
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_ICM_AUX, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages,
+		       u8 *status)
+{
+	int ret = mthca_cmd_imm(dev, icm_size, aux_pages, 0, 0, CMD_SET_ICM_SIZE,
+				CMD_TIME_CLASS_A, status);
+
+	if (ret || status)
+		return ret;
+
+	/*
+	 * Arbel page size is always 4 KB; round up number of system
+	 * pages needed.
+	 */
+	*aux_pages = (*aux_pages + (1 << (PAGE_SHIFT - 12)) - 1) >> (PAGE_SHIFT - 12);
+
+	return 0;
+}
+
+int mthca_SW2HW_MPT(struct mthca_dev *dev, void *mpt_entry,
+		    int mpt_index, u8 *status)
+{
+	dma_addr_t indma;
+	int err;
+
+	indma = pci_map_single(dev->pdev, mpt_entry,
+			       MTHCA_MPT_ENTRY_SIZE,
+			       PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd(dev, indma, mpt_index, 0, CMD_SW2HW_MPT,
+			CMD_TIME_CLASS_B, status);
+
+	pci_unmap_single(dev->pdev, indma,
+			 MTHCA_MPT_ENTRY_SIZE, PCI_DMA_TODEVICE);
+	return err;
+}
+
+int mthca_HW2SW_MPT(struct mthca_dev *dev, void *mpt_entry,
+		    int mpt_index, u8 *status)
+{
+	dma_addr_t outdma = 0;
+	int err;
+
+	if (mpt_entry) {
+		outdma = pci_map_single(dev->pdev, mpt_entry,
+					MTHCA_MPT_ENTRY_SIZE,
+					PCI_DMA_FROMDEVICE);
+		if (pci_dma_mapping_error(outdma))
+			return -ENOMEM;
+	}
+
+	err = mthca_cmd_box(dev, 0, outdma, mpt_index, !mpt_entry,
+			    CMD_HW2SW_MPT,
+			    CMD_TIME_CLASS_B, status);
+
+	if (mpt_entry)
+		pci_unmap_single(dev->pdev, outdma,
+				 MTHCA_MPT_ENTRY_SIZE,
+				 PCI_DMA_FROMDEVICE);
+	return err;
+}
+
+int mthca_WRITE_MTT(struct mthca_dev *dev, u64 *mtt_entry,
+		    int num_mtt, u8 *status)
+{
+	dma_addr_t indma;
+	int err;
+
+	indma = pci_map_single(dev->pdev, mtt_entry,
+			       (num_mtt + 2) * 8,
+			       PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd(dev, indma, num_mtt, 0, CMD_WRITE_MTT,
+			CMD_TIME_CLASS_B, status);
+
+	pci_unmap_single(dev->pdev, indma,
+			 (num_mtt + 2) * 8, PCI_DMA_TODEVICE);
+	return err;
+}
+
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+		 int eq_num, u8 *status)
+{
+	mthca_dbg(dev, "%s mask %016llx for eqn %d\n",
+		  unmap ? "Clearing" : "Setting",
+		  (unsigned long long) event_mask, eq_num);
+	return mthca_cmd(dev, event_mask, (unmap << 31) | eq_num,
+			 0, CMD_MAP_EQ, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_SW2HW_EQ(struct mthca_dev *dev, void *eq_context,
+		   int eq_num, u8 *status)
+{
+	dma_addr_t indma;
+	int err;
+
+	indma = pci_map_single(dev->pdev, eq_context,
+			       MTHCA_EQ_CONTEXT_SIZE,
+			       PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd(dev, indma, eq_num, 0, CMD_SW2HW_EQ,
+			CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, indma,
+			 MTHCA_EQ_CONTEXT_SIZE, PCI_DMA_TODEVICE);
+	return err;
+}
+
+int mthca_HW2SW_EQ(struct mthca_dev *dev, void *eq_context,
+		   int eq_num, u8 *status)
+{
+	dma_addr_t outdma = 0;
+	int err;
+
+	outdma = pci_map_single(dev->pdev, eq_context,
+				MTHCA_EQ_CONTEXT_SIZE,
+				PCI_DMA_FROMDEVICE);
+	if (pci_dma_mapping_error(outdma))
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, eq_num, 0,
+			    CMD_HW2SW_EQ,
+			    CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, outdma,
+			 MTHCA_EQ_CONTEXT_SIZE,
+			 PCI_DMA_FROMDEVICE);
+	return err;
+}
+
+int mthca_SW2HW_CQ(struct mthca_dev *dev, void *cq_context,
+		   int cq_num, u8 *status)
+{
+	dma_addr_t indma;
+	int err;
+
+	indma = pci_map_single(dev->pdev, cq_context,
+			       MTHCA_CQ_CONTEXT_SIZE,
+			       PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd(dev, indma, cq_num, 0, CMD_SW2HW_CQ,
+			CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, indma,
+			 MTHCA_CQ_CONTEXT_SIZE, PCI_DMA_TODEVICE);
+	return err;
+}
+
+int mthca_HW2SW_CQ(struct mthca_dev *dev, void *cq_context,
+		   int cq_num, u8 *status)
+{
+	dma_addr_t outdma = 0;
+	int err;
+
+	outdma = pci_map_single(dev->pdev, cq_context,
+				MTHCA_CQ_CONTEXT_SIZE,
+				PCI_DMA_FROMDEVICE);
+	if (pci_dma_mapping_error(outdma))
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, cq_num, 0,
+			    CMD_HW2SW_CQ,
+			    CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, outdma,
+			 MTHCA_CQ_CONTEXT_SIZE,
+			 PCI_DMA_FROMDEVICE);
+	return err;
+}
+
+int mthca_MODIFY_QP(struct mthca_dev *dev, int trans, u32 num,
+		    int is_ee, void *qp_context, u32 optmask,
+		    u8 *status)
+{
+	static const u16 op[] = {
+		[MTHCA_TRANS_RST2INIT]  = CMD_RST2INIT_QPEE,
+		[MTHCA_TRANS_INIT2INIT] = CMD_INIT2INIT_QPEE,
+		[MTHCA_TRANS_INIT2RTR]  = CMD_INIT2RTR_QPEE,
+		[MTHCA_TRANS_RTR2RTS]   = CMD_RTR2RTS_QPEE,
+		[MTHCA_TRANS_RTS2RTS]   = CMD_RTS2RTS_QPEE,
+		[MTHCA_TRANS_SQERR2RTS] = CMD_SQERR2RTS_QPEE,
+		[MTHCA_TRANS_ANY2ERR]   = CMD_2ERR_QPEE,
+		[MTHCA_TRANS_RTS2SQD]   = CMD_RTS2SQD_QPEE,
+		[MTHCA_TRANS_SQD2SQD]   = CMD_SQD2SQD_QPEE,
+		[MTHCA_TRANS_SQD2RTS]   = CMD_SQD2RTS_QPEE,
+		[MTHCA_TRANS_ANY2RST]   = CMD_ERR2RST_QPEE
+	};
+	u8 op_mod = 0;
+
+	dma_addr_t indma;
+	int err;
+
+	if (trans < 0 || trans >= ARRAY_SIZE(op))
+		return -EINVAL;
+
+	if (trans == MTHCA_TRANS_ANY2RST) {
+		indma  = 0;
+		op_mod = 3;	/* don't write outbox, any->reset */
+
+		/* For debugging */
+		qp_context = pci_alloc_consistent(dev->pdev, MTHCA_QP_CONTEXT_SIZE,
+						  &indma);
+		op_mod = 2;	/* write outbox, any->reset */
+	} else {
+		indma = pci_map_single(dev->pdev, qp_context,
+				       MTHCA_QP_CONTEXT_SIZE,
+				       PCI_DMA_TODEVICE);
+		if (pci_dma_mapping_error(indma))
+			return -ENOMEM;
+
+		if (0) {
+			int i;
+			mthca_dbg(dev, "Dumping QP context:\n");
+			printk("  opt param mask: %08x\n", be32_to_cpup(qp_context));
+			for (i = 0; i < 0x100 / 4; ++i) {
+				if (i % 8 == 0)
+					printk("  [%02x] ", i * 4);
+				printk(" %08x", be32_to_cpu(((u32 *) qp_context)[i + 2]));
+				if ((i + 1) % 8 == 0)
+					printk("\n");
+			}
+		}
+	}
+
+	if (trans == MTHCA_TRANS_ANY2RST) {
+		err = mthca_cmd_box(dev, 0, indma, (!!is_ee << 24) | num,
+				    op_mod, op[trans], CMD_TIME_CLASS_C, status);
+
+		if (0) {
+			int i;
+			mthca_dbg(dev, "Dumping QP context:\n");
+			printk(" %08x\n", be32_to_cpup(qp_context));
+			for (i = 0; i < 0x100 / 4; ++i) {
+				if (i % 8 == 0)
+					printk("[%02x] ", i * 4);
+				printk(" %08x", be32_to_cpu(((u32 *) qp_context)[i + 2]));
+				if ((i + 1) % 8 == 0)
+					printk("\n");
+			}
+		}
+
+	} else
+		err = mthca_cmd(dev, indma, (!!is_ee << 24) | num,
+				op_mod, op[trans], CMD_TIME_CLASS_C, status);
+
+	if (trans != MTHCA_TRANS_ANY2RST)
+		pci_unmap_single(dev->pdev, indma,
+				 MTHCA_QP_CONTEXT_SIZE, PCI_DMA_TODEVICE);
+	else
+		pci_free_consistent(dev->pdev, MTHCA_QP_CONTEXT_SIZE,
+				    qp_context, indma);
+	return err;
+}
+
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+		   void *qp_context, u8 *status)
+{
+	dma_addr_t outdma = 0;
+	int err;
+
+	outdma = pci_map_single(dev->pdev, qp_context,
+				MTHCA_QP_CONTEXT_SIZE,
+				PCI_DMA_FROMDEVICE);
+	if (pci_dma_mapping_error(outdma))
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, (!!is_ee << 24) | num, 0,
+			    CMD_QUERY_QPEE,
+			    CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, outdma,
+			 MTHCA_QP_CONTEXT_SIZE,
+			 PCI_DMA_FROMDEVICE);
+	return err;
+}
+
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn,
+			  u8 *status)
+{
+	u8 op_mod;
+
+	switch (type) {
+	case IB_QPT_SMI:
+		op_mod = 0;
+		break;
+	case IB_QPT_GSI:
+		op_mod = 1;
+		break;
+	case IB_QPT_RAW_IPV6:
+		op_mod = 2;
+		break;
+	case IB_QPT_RAW_ETY:
+		op_mod = 3;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return mthca_cmd(dev, 0, qpn, op_mod, CMD_CONF_SPECIAL_QP,
+			 CMD_TIME_CLASS_B, status);
+}
+
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+		  int port, struct ib_wc* in_wc, struct ib_grh* in_grh,
+		  void *in_mad, void *response_mad, u8 *status)
+{
+	void *box;
+	dma_addr_t dma;
+	int err;
+	u32 in_modifier = port;
+	u8 op_modifier = 0;
+
+#define MAD_IFC_BOX_SIZE      0x400
+#define MAD_IFC_MY_QPN_OFFSET 0x100
+#define MAD_IFC_RQPN_OFFSET   0x104
+#define MAD_IFC_SL_OFFSET     0x108
+#define MAD_IFC_G_PATH_OFFSET 0x109
+#define MAD_IFC_RLID_OFFSET   0x10a
+#define MAD_IFC_PKEY_OFFSET   0x10e
+#define MAD_IFC_GRH_OFFSET    0x140
+
+	box = pci_alloc_consistent(dev->pdev, MAD_IFC_BOX_SIZE, &dma);
+	if (!box)
+		return -ENOMEM;
+
+	memcpy(box, in_mad, 256);
+
+	/*
+	 * Key check traps can't be generated unless we have in_wc to
+	 * tell us where to send the trap.
+	 */
+	if (ignore_mkey || !in_wc)
+		op_modifier |= 0x1;
+	if (ignore_bkey || !in_wc)
+		op_modifier |= 0x2;
+
+	if (in_wc) {
+		u8 val;
+
+		memset(box + 256, 0, 256);
+
+		MTHCA_PUT(box, in_wc->qp_num, 	  MAD_IFC_MY_QPN_OFFSET);
+		MTHCA_PUT(box, in_wc->src_qp, 	  MAD_IFC_RQPN_OFFSET);
+
+		val = in_wc->sl << 4;
+		MTHCA_PUT(box, val,               MAD_IFC_SL_OFFSET);
+
+		val = in_wc->dlid_path_bits |
+			(in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+		MTHCA_PUT(box, val,               MAD_IFC_GRH_OFFSET);
+
+		MTHCA_PUT(box, in_wc->slid,       MAD_IFC_RLID_OFFSET);
+		MTHCA_PUT(box, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET);
+
+		if (in_grh)
+			memcpy((u8 *) box + MAD_IFC_GRH_OFFSET, in_grh, 40);
+
+		op_modifier |= 0x10;
+
+		in_modifier |= in_wc->slid << 16;
+	}
+
+	err = mthca_cmd_box(dev, dma, dma + 512, in_modifier, op_modifier,
+			    CMD_MAD_IFC, CMD_TIME_CLASS_C, status);
+
+	if (!err && !*status)
+		memcpy(response_mad, box + 512, 256);
+
+	pci_free_consistent(dev->pdev, MAD_IFC_BOX_SIZE, box, dma);
+	return err;
+}
+
+int mthca_READ_MGM(struct mthca_dev *dev, int index, void *mgm,
+		   u8 *status)
+{
+	dma_addr_t outdma = 0;
+	int err;
+
+	outdma = pci_map_single(dev->pdev, mgm,
+				MTHCA_MGM_ENTRY_SIZE,
+				PCI_DMA_FROMDEVICE);
+	if (pci_dma_mapping_error(outdma))
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, index, 0,
+			    CMD_READ_MGM,
+			    CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, outdma,
+			 MTHCA_MGM_ENTRY_SIZE,
+			 PCI_DMA_FROMDEVICE);
+	return err;
+}
+
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index, void *mgm,
+		    u8 *status)
+{
+	dma_addr_t indma;
+	int err;
+
+	indma = pci_map_single(dev->pdev, mgm,
+			       MTHCA_MGM_ENTRY_SIZE,
+			       PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd(dev, indma, index, 0, CMD_WRITE_MGM,
+			CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, indma,
+			 MTHCA_MGM_ENTRY_SIZE, PCI_DMA_TODEVICE);
+	return err;
+}
+
+int mthca_MGID_HASH(struct mthca_dev *dev, void *gid, u16 *hash,
+		    u8 *status)
+{
+	dma_addr_t indma;
+	u64 imm;
+	int err;
+
+	indma = pci_map_single(dev->pdev, gid, 16, PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd_imm(dev, indma, &imm, 0, 0, CMD_MGID_HASH,
+			    CMD_TIME_CLASS_A, status);
+	*hash = imm;
+
+	pci_unmap_single(dev->pdev, indma, 16, PCI_DMA_TODEVICE);
+	return err;
+}
+
+int mthca_NOP(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0x1f, 0, CMD_NOP, msecs_to_jiffies(100), status);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h
new file mode 100644
index 000000000000..a8bc6aa36ff1
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_cmd.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef MTHCA_CMD_H
+#define MTHCA_CMD_H
+
+#include <ib_verbs.h>
+
+#define MTHCA_CMD_MAILBOX_ALIGN 16UL
+#define MTHCA_CMD_MAILBOX_EXTRA (MTHCA_CMD_MAILBOX_ALIGN - 1)
+
+enum {
+	/* command completed successfully: */
+	MTHCA_CMD_STAT_OK 	      = 0x00,
+	/* Internal error (such as a bus error) occurred while processing command: */
+	MTHCA_CMD_STAT_INTERNAL_ERR   = 0x01,
+	/* Operation/command not supported or opcode modifier not supported: */
+	MTHCA_CMD_STAT_BAD_OP 	      = 0x02,
+	/* Parameter not supported or parameter out of range: */
+	MTHCA_CMD_STAT_BAD_PARAM      = 0x03,
+	/* System not enabled or bad system state: */
+	MTHCA_CMD_STAT_BAD_SYS_STATE  = 0x04,
+	/* Attempt to access reserved or unallocaterd resource: */
+	MTHCA_CMD_STAT_BAD_RESOURCE   = 0x05,
+	/* Requested resource is currently executing a command, or is otherwise busy: */
+	MTHCA_CMD_STAT_RESOURCE_BUSY  = 0x06,
+	/* memory error: */
+	MTHCA_CMD_STAT_DDR_MEM_ERR    = 0x07,
+	/* Required capability exceeds device limits: */
+	MTHCA_CMD_STAT_EXCEED_LIM     = 0x08,
+	/* Resource is not in the appropriate state or ownership: */
+	MTHCA_CMD_STAT_BAD_RES_STATE  = 0x09,
+	/* Index out of range: */
+	MTHCA_CMD_STAT_BAD_INDEX      = 0x0a,
+	/* FW image corrupted: */
+	MTHCA_CMD_STAT_BAD_NVMEM      = 0x0b,
+	/* Attempt to modify a QP/EE which is not in the presumed state: */
+	MTHCA_CMD_STAT_BAD_QPEE_STATE = 0x10,
+	/* Bad segment parameters (Address/Size): */
+	MTHCA_CMD_STAT_BAD_SEG_PARAM  = 0x20,
+	/* Memory Region has Memory Windows bound to: */
+	MTHCA_CMD_STAT_REG_BOUND      = 0x21,
+	/* HCA local attached memory not present: */
+	MTHCA_CMD_STAT_LAM_NOT_PRE    = 0x22,
+        /* Bad management packet (silently discarded): */
+	MTHCA_CMD_STAT_BAD_PKT 	      = 0x30,
+        /* More outstanding CQEs in CQ than new CQ size: */
+	MTHCA_CMD_STAT_BAD_SIZE       = 0x40
+};
+
+enum {
+	MTHCA_TRANS_INVALID = 0,
+	MTHCA_TRANS_RST2INIT,
+	MTHCA_TRANS_INIT2INIT,
+	MTHCA_TRANS_INIT2RTR,
+	MTHCA_TRANS_RTR2RTS,
+	MTHCA_TRANS_RTS2RTS,
+	MTHCA_TRANS_SQERR2RTS,
+	MTHCA_TRANS_ANY2ERR,
+	MTHCA_TRANS_RTS2SQD,
+	MTHCA_TRANS_SQD2SQD,
+	MTHCA_TRANS_SQD2RTS,
+	MTHCA_TRANS_ANY2RST,
+};
+
+enum {
+	DEV_LIM_FLAG_RC                 = 1 << 0,
+	DEV_LIM_FLAG_UC                 = 1 << 1,
+	DEV_LIM_FLAG_UD                 = 1 << 2,
+	DEV_LIM_FLAG_RD                 = 1 << 3,
+	DEV_LIM_FLAG_RAW_IPV6           = 1 << 4,
+	DEV_LIM_FLAG_RAW_ETHER          = 1 << 5,
+	DEV_LIM_FLAG_SRQ                = 1 << 6,
+	DEV_LIM_FLAG_BAD_PKEY_CNTR      = 1 << 8,
+	DEV_LIM_FLAG_BAD_QKEY_CNTR      = 1 << 9,
+	DEV_LIM_FLAG_MW                 = 1 << 16,
+	DEV_LIM_FLAG_AUTO_PATH_MIG      = 1 << 17,
+	DEV_LIM_FLAG_ATOMIC             = 1 << 18,
+	DEV_LIM_FLAG_RAW_MULTI          = 1 << 19,
+	DEV_LIM_FLAG_UD_AV_PORT_ENFORCE = 1 << 20,
+	DEV_LIM_FLAG_UD_MULTI           = 1 << 21,
+};
+
+struct mthca_dev_lim {
+	int max_srq_sz;
+	int max_qp_sz;
+	int reserved_qps;
+	int max_qps;
+	int reserved_srqs;
+	int max_srqs;
+	int reserved_eecs;
+	int max_eecs;
+	int max_cq_sz;
+	int reserved_cqs;
+	int max_cqs;
+	int max_mpts;
+	int reserved_eqs;
+	int max_eqs;
+	int reserved_mtts;
+	int max_mrw_sz;
+	int reserved_mrws;
+	int max_mtt_seg;
+	int max_requester_per_qp;
+	int max_responder_per_qp;
+	int max_rdma_global;
+	int local_ca_ack_delay;
+	int max_mtu;
+	int max_port_width;
+	int max_vl;
+	int num_ports;
+	int max_gids;
+	int max_pkeys;
+	u32 flags;
+	int reserved_uars;
+	int uar_size;
+	int min_page_sz;
+	int max_sg;
+	int max_desc_sz;
+	int max_qp_per_mcg;
+	int reserved_mgms;
+	int max_mcgs;
+	int reserved_pds;
+	int max_pds;
+	int reserved_rdds;
+	int max_rdds;
+	int eec_entry_sz;
+	int qpc_entry_sz;
+	int eeec_entry_sz;
+	int eqpc_entry_sz;
+	int eqc_entry_sz;
+	int cqc_entry_sz;
+	int srq_entry_sz;
+	int uar_scratch_entry_sz;
+	int mtt_seg_sz;
+	int mpt_entry_sz;
+	union {
+		struct {
+			int max_avs;
+		} tavor;
+		struct {
+			int resize_srq;
+			int max_pbl_sz;
+			u8  bmme_flags;
+			u32 reserved_lkey;
+			int lam_required;
+			u64 max_icm_sz;
+		} arbel;
+	} hca;
+};
+
+struct mthca_adapter {
+	u32 vendor_id;
+	u32 device_id;
+	u32 revision_id;
+	u8  inta_pin;
+};
+
+struct mthca_init_hca_param {
+	u64 qpc_base;
+	u64 eec_base;
+	u64 srqc_base;
+	u64 cqc_base;
+	u64 eqpc_base;
+	u64 eeec_base;
+	u64 eqc_base;
+	u64 rdb_base;
+	u64 mc_base;
+	u64 mpt_base;
+	u64 mtt_base;
+	u64 uar_scratch_base;
+	u64 uarc_base;
+	u16 log_mc_entry_sz;
+	u16 mc_hash_sz;
+	u8  log_num_qps;
+	u8  log_num_eecs;
+	u8  log_num_srqs;
+	u8  log_num_cqs;
+	u8  log_num_eqs;
+	u8  log_mc_table_sz;
+	u8  mtt_seg_sz;
+	u8  log_mpt_sz;
+	u8  log_uar_sz;
+	u8  log_uarc_sz;
+};
+
+struct mthca_init_ib_param {
+	int enable_1x;
+	int enable_4x;
+	int vl_cap;
+	int mtu_cap;
+	u16 gid_cap;
+	u16 pkey_cap;
+	int set_guid0;
+	u64 guid0;
+	int set_node_guid;
+	u64 node_guid;
+	int set_si_guid;
+	u64 si_guid;
+};
+
+struct mthca_set_ib_param {
+	int set_si_guid;
+	int reset_qkey_viol;
+	u64 si_guid;
+	u32 cap_mask;
+};
+
+int mthca_cmd_use_events(struct mthca_dev *dev);
+void mthca_cmd_use_polling(struct mthca_dev *dev);
+void mthca_cmd_event(struct mthca_dev *dev, u16 token,
+		     u8  status, u64 out_param);
+
+int mthca_SYS_EN(struct mthca_dev *dev, u8 *status);
+int mthca_SYS_DIS(struct mthca_dev *dev, u8 *status);
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status);
+int mthca_UNMAP_FA(struct mthca_dev *dev, u8 *status);
+int mthca_RUN_FW(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status);
+int mthca_ENABLE_LAM(struct mthca_dev *dev, u8 *status);
+int mthca_DISABLE_LAM(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_DDR(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+			struct mthca_dev_lim *dev_lim, u8 *status);
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+			struct mthca_adapter *adapter, u8 *status);
+int mthca_INIT_HCA(struct mthca_dev *dev,
+		   struct mthca_init_hca_param *param,
+		   u8 *status);
+int mthca_INIT_IB(struct mthca_dev *dev,
+		  struct mthca_init_ib_param *param,
+		  int port, u8 *status);
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port, u8 *status);
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic, u8 *status);
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+		 int port, u8 *status);
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt, u8 *status);
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt, u8 *status);
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count, u8 *status);
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status);
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev, u8 *status);
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages,
+		       u8 *status);
+int mthca_SW2HW_MPT(struct mthca_dev *dev, void *mpt_entry,
+		    int mpt_index, u8 *status);
+int mthca_HW2SW_MPT(struct mthca_dev *dev, void *mpt_entry,
+		    int mpt_index, u8 *status);
+int mthca_WRITE_MTT(struct mthca_dev *dev, u64 *mtt_entry,
+		    int num_mtt, u8 *status);
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+		 int eq_num, u8 *status);
+int mthca_SW2HW_EQ(struct mthca_dev *dev, void *eq_context,
+		   int eq_num, u8 *status);
+int mthca_HW2SW_EQ(struct mthca_dev *dev, void *eq_context,
+		   int eq_num, u8 *status);
+int mthca_SW2HW_CQ(struct mthca_dev *dev, void *cq_context,
+		   int cq_num, u8 *status);
+int mthca_HW2SW_CQ(struct mthca_dev *dev, void *cq_context,
+		   int cq_num, u8 *status);
+int mthca_MODIFY_QP(struct mthca_dev *dev, int trans, u32 num,
+		    int is_ee, void *qp_context, u32 optmask,
+		    u8 *status);
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+		   void *qp_context, u8 *status);
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn,
+			  u8 *status);
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+		  int port, struct ib_wc* in_wc, struct ib_grh* in_grh,
+		  void *in_mad, void *response_mad, u8 *status);
+int mthca_READ_MGM(struct mthca_dev *dev, int index, void *mgm,
+		   u8 *status);
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index, void *mgm,
+		    u8 *status);
+int mthca_MGID_HASH(struct mthca_dev *dev, void *gid, u16 *hash,
+		    u8 *status);
+int mthca_NOP(struct mthca_dev *dev, u8 *status);
+
+#define MAILBOX_ALIGN(x) ((void *) ALIGN((unsigned long) (x), MTHCA_CMD_MAILBOX_ALIGN))
+
+#endif /* MTHCA_CMD_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_config_reg.h b/drivers/infiniband/hw/mthca/mthca_config_reg.h
new file mode 100644
index 000000000000..b4bfbbfe2c3d
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_config_reg.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_config_reg.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef MTHCA_CONFIG_REG_H
+#define MTHCA_CONFIG_REG_H
+
+#include <asm/page.h>
+
+#define MTHCA_HCR_BASE         0x80680
+#define MTHCA_HCR_SIZE         0x0001c
+#define MTHCA_ECR_BASE         0x80700
+#define MTHCA_ECR_SIZE         0x00008
+#define MTHCA_ECR_CLR_BASE     0x80708
+#define MTHCA_ECR_CLR_SIZE     0x00008
+#define MTHCA_MAP_ECR_SIZE     (MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE)
+#define MTHCA_CLR_INT_BASE     0xf00d8
+#define MTHCA_CLR_INT_SIZE     0x00008
+#define MTHCA_EQ_SET_CI_SIZE   (8 * 32)
+
+#endif /* MTHCA_CONFIG_REG_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
new file mode 100644
index 000000000000..5dead2df7eb0
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -0,0 +1,918 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_cq.c 1369 2004-12-20 16:17:07Z roland $
+ */
+
+#include <linux/init.h>
+#include <linux/hardirq.h>
+
+#include <ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+enum {
+	MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE
+};
+
+enum {
+	MTHCA_CQ_ENTRY_SIZE = 0x20
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_cq_context {
+	u32 flags;
+	u64 start;
+	u32 logsize_usrpage;
+	u32 error_eqn;		/* Tavor only */
+	u32 comp_eqn;
+	u32 pd;
+	u32 lkey;
+	u32 last_notified_index;
+	u32 solicit_producer_index;
+	u32 consumer_index;
+	u32 producer_index;
+	u32 cqn;
+	u32 ci_db;		/* Arbel only */
+	u32 state_db;		/* Arbel only */
+	u32 reserved;
+} __attribute__((packed));
+
+#define MTHCA_CQ_STATUS_OK          ( 0 << 28)
+#define MTHCA_CQ_STATUS_OVERFLOW    ( 9 << 28)
+#define MTHCA_CQ_STATUS_WRITE_FAIL  (10 << 28)
+#define MTHCA_CQ_FLAG_TR            ( 1 << 18)
+#define MTHCA_CQ_FLAG_OI            ( 1 << 17)
+#define MTHCA_CQ_STATE_DISARMED     ( 0 <<  8)
+#define MTHCA_CQ_STATE_ARMED        ( 1 <<  8)
+#define MTHCA_CQ_STATE_ARMED_SOL    ( 4 <<  8)
+#define MTHCA_EQ_STATE_FIRED        (10 <<  8)
+
+enum {
+	MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe
+};
+
+enum {
+	SYNDROME_LOCAL_LENGTH_ERR 	 = 0x01,
+	SYNDROME_LOCAL_QP_OP_ERR  	 = 0x02,
+	SYNDROME_LOCAL_EEC_OP_ERR 	 = 0x03,
+	SYNDROME_LOCAL_PROT_ERR   	 = 0x04,
+	SYNDROME_WR_FLUSH_ERR     	 = 0x05,
+	SYNDROME_MW_BIND_ERR      	 = 0x06,
+	SYNDROME_BAD_RESP_ERR     	 = 0x10,
+	SYNDROME_LOCAL_ACCESS_ERR 	 = 0x11,
+	SYNDROME_REMOTE_INVAL_REQ_ERR 	 = 0x12,
+	SYNDROME_REMOTE_ACCESS_ERR 	 = 0x13,
+	SYNDROME_REMOTE_OP_ERR     	 = 0x14,
+	SYNDROME_RETRY_EXC_ERR 		 = 0x15,
+	SYNDROME_RNR_RETRY_EXC_ERR 	 = 0x16,
+	SYNDROME_LOCAL_RDD_VIOL_ERR 	 = 0x20,
+	SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21,
+	SYNDROME_REMOTE_ABORTED_ERR 	 = 0x22,
+	SYNDROME_INVAL_EECN_ERR 	 = 0x23,
+	SYNDROME_INVAL_EEC_STATE_ERR 	 = 0x24
+};
+
+struct mthca_cqe {
+	u32 my_qpn;
+	u32 my_ee;
+	u32 rqpn;
+	u16 sl_g_mlpath;
+	u16 rlid;
+	u32 imm_etype_pkey_eec;
+	u32 byte_cnt;
+	u32 wqe;
+	u8  opcode;
+	u8  is_send;
+	u8  reserved;
+	u8  owner;
+};
+
+struct mthca_err_cqe {
+	u32 my_qpn;
+	u32 reserved1[3];
+	u8  syndrome;
+	u8  reserved2;
+	u16 db_cnt;
+	u32 reserved3;
+	u32 wqe;
+	u8  opcode;
+	u8  reserved4[2];
+	u8  owner;
+};
+
+#define MTHCA_CQ_ENTRY_OWNER_SW      (0 << 7)
+#define MTHCA_CQ_ENTRY_OWNER_HW      (1 << 7)
+
+#define MTHCA_TAVOR_CQ_DB_INC_CI       (1 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT      (2 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL  (3 << 24)
+#define MTHCA_TAVOR_CQ_DB_SET_CI       (4 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24)
+
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL  (1 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT      (2 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24)
+
+static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
+{
+	if (cq->is_direct)
+		return cq->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE);
+	else
+		return cq->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf
+			+ (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE;
+}
+
+static inline struct mthca_cqe *cqe_sw(struct mthca_cq *cq, int i)
+{
+	struct mthca_cqe *cqe = get_cqe(cq, i);
+	return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe;
+}
+
+static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
+{
+	return cqe_sw(cq, cq->cons_index & cq->ibcq.cqe);
+}
+
+static inline void set_cqe_hw(struct mthca_cqe *cqe)
+{
+	cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;
+}
+
+/*
+ * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index
+ * should be correct before calling update_cons_index().
+ */
+static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
+				     int incr)
+{
+	u32 doorbell[2];
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		*cq->set_ci_db = cpu_to_be32(cq->cons_index);
+		wmb();
+	} else {
+		doorbell[0] = cpu_to_be32(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn);
+		doorbell[1] = cpu_to_be32(incr - 1);
+
+		mthca_write64(doorbell,
+			      dev->kar + MTHCA_CQ_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+}
+
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn)
+{
+	struct mthca_cq *cq;
+
+	cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+
+	if (!cq) {
+		mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	++cq->arm_sn;
+
+	cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+}
+
+void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn)
+{
+	struct mthca_cq *cq;
+	struct mthca_cqe *cqe;
+	int prod_index;
+	int nfreed = 0;
+
+	spin_lock_irq(&dev->cq_table.lock);
+	cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+	if (cq)
+		atomic_inc(&cq->refcount);
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	if (!cq)
+		return;
+
+	spin_lock_irq(&cq->lock);
+
+	/*
+	 * First we need to find the current producer index, so we
+	 * know where to start cleaning from.  It doesn't matter if HW
+	 * adds new entries after this loop -- the QP we're worried
+	 * about is already in RESET, so the new entries won't come
+	 * from our QP and therefore don't need to be checked.
+	 */
+	for (prod_index = cq->cons_index;
+	     cqe_sw(cq, prod_index & cq->ibcq.cqe);
+	     ++prod_index)
+		if (prod_index == cq->cons_index + cq->ibcq.cqe)
+			break;
+
+	if (0)
+		mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n",
+			  qpn, cqn, cq->cons_index, prod_index);
+
+	/*
+	 * Now sweep backwards through the CQ, removing CQ entries
+	 * that match our QP by copying older entries on top of them.
+	 */
+	while (prod_index > cq->cons_index) {
+		cqe = get_cqe(cq, (prod_index - 1) & cq->ibcq.cqe);
+		if (cqe->my_qpn == cpu_to_be32(qpn))
+			++nfreed;
+		else if (nfreed)
+			memcpy(get_cqe(cq, (prod_index - 1 + nfreed) &
+				       cq->ibcq.cqe),
+			       cqe,
+			       MTHCA_CQ_ENTRY_SIZE);
+		--prod_index;
+	}
+
+	if (nfreed) {
+		wmb();
+		cq->cons_index += nfreed;
+		update_cons_index(dev, cq, nfreed);
+	}
+
+	spin_unlock_irq(&cq->lock);
+	if (atomic_dec_and_test(&cq->refcount))
+		wake_up(&cq->wait);
+}
+
+static int handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
+			    struct mthca_qp *qp, int wqe_index, int is_send,
+			    struct mthca_err_cqe *cqe,
+			    struct ib_wc *entry, int *free_cqe)
+{
+	int err;
+	int dbd;
+	u32 new_wqe;
+
+	if (1 && cqe->syndrome != SYNDROME_WR_FLUSH_ERR) {
+		int j;
+
+		mthca_dbg(dev, "%x/%d: error CQE -> QPN %06x, WQE @ %08x\n",
+			  cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
+			  be32_to_cpu(cqe->wqe));
+
+		for (j = 0; j < 8; ++j)
+			printk(KERN_DEBUG "  [%2x] %08x\n",
+			       j * 4, be32_to_cpu(((u32 *) cqe)[j]));
+	}
+
+	/*
+	 * For completions in error, only work request ID, status (and
+	 * freed resource count for RD) have to be set.
+	 */
+	switch (cqe->syndrome) {
+	case SYNDROME_LOCAL_LENGTH_ERR:
+		entry->status = IB_WC_LOC_LEN_ERR;
+		break;
+	case SYNDROME_LOCAL_QP_OP_ERR:
+		entry->status = IB_WC_LOC_QP_OP_ERR;
+		break;
+	case SYNDROME_LOCAL_EEC_OP_ERR:
+		entry->status = IB_WC_LOC_EEC_OP_ERR;
+		break;
+	case SYNDROME_LOCAL_PROT_ERR:
+		entry->status = IB_WC_LOC_PROT_ERR;
+		break;
+	case SYNDROME_WR_FLUSH_ERR:
+		entry->status = IB_WC_WR_FLUSH_ERR;
+		break;
+	case SYNDROME_MW_BIND_ERR:
+		entry->status = IB_WC_MW_BIND_ERR;
+		break;
+	case SYNDROME_BAD_RESP_ERR:
+		entry->status = IB_WC_BAD_RESP_ERR;
+		break;
+	case SYNDROME_LOCAL_ACCESS_ERR:
+		entry->status = IB_WC_LOC_ACCESS_ERR;
+		break;
+	case SYNDROME_REMOTE_INVAL_REQ_ERR:
+		entry->status = IB_WC_REM_INV_REQ_ERR;
+		break;
+	case SYNDROME_REMOTE_ACCESS_ERR:
+		entry->status = IB_WC_REM_ACCESS_ERR;
+		break;
+	case SYNDROME_REMOTE_OP_ERR:
+		entry->status = IB_WC_REM_OP_ERR;
+		break;
+	case SYNDROME_RETRY_EXC_ERR:
+		entry->status = IB_WC_RETRY_EXC_ERR;
+		break;
+	case SYNDROME_RNR_RETRY_EXC_ERR:
+		entry->status = IB_WC_RNR_RETRY_EXC_ERR;
+		break;
+	case SYNDROME_LOCAL_RDD_VIOL_ERR:
+		entry->status = IB_WC_LOC_RDD_VIOL_ERR;
+		break;
+	case SYNDROME_REMOTE_INVAL_RD_REQ_ERR:
+		entry->status = IB_WC_REM_INV_RD_REQ_ERR;
+		break;
+	case SYNDROME_REMOTE_ABORTED_ERR:
+		entry->status = IB_WC_REM_ABORT_ERR;
+		break;
+	case SYNDROME_INVAL_EECN_ERR:
+		entry->status = IB_WC_INV_EECN_ERR;
+		break;
+	case SYNDROME_INVAL_EEC_STATE_ERR:
+		entry->status = IB_WC_INV_EEC_STATE_ERR;
+		break;
+	default:
+		entry->status = IB_WC_GENERAL_ERR;
+		break;
+	}
+
+	err = mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe);
+	if (err)
+		return err;
+
+	/*
+	 * If we're at the end of the WQE chain, or we've used up our
+	 * doorbell count, free the CQE.  Otherwise just update it for
+	 * the next poll operation.
+	 */
+	if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd))
+		return 0;
+
+	cqe->db_cnt   = cpu_to_be16(be16_to_cpu(cqe->db_cnt) - dbd);
+	cqe->wqe      = new_wqe;
+	cqe->syndrome = SYNDROME_WR_FLUSH_ERR;
+
+	*free_cqe = 0;
+
+	return 0;
+}
+
+static void dump_cqe(struct mthca_cqe *cqe)
+{
+	int j;
+
+	for (j = 0; j < 8; ++j)
+		printk(KERN_DEBUG "  [%2x] %08x\n",
+		       j * 4, be32_to_cpu(((u32 *) cqe)[j]));
+}
+
+static inline int mthca_poll_one(struct mthca_dev *dev,
+				 struct mthca_cq *cq,
+				 struct mthca_qp **cur_qp,
+				 int *freed,
+				 struct ib_wc *entry)
+{
+	struct mthca_wq *wq;
+	struct mthca_cqe *cqe;
+	int wqe_index;
+	int is_error;
+	int is_send;
+	int free_cqe = 1;
+	int err = 0;
+
+	cqe = next_cqe_sw(cq);
+	if (!cqe)
+		return -EAGAIN;
+
+	/*
+	 * Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rmb();
+
+	if (0) {
+		mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
+			  cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
+			  be32_to_cpu(cqe->wqe));
+
+		dump_cqe(cqe);
+	}
+
+	is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+		MTHCA_ERROR_CQE_OPCODE_MASK;
+	is_send  = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;
+
+	if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
+		/*
+		 * We do not have to take the QP table lock here,
+		 * because CQs will be locked while QPs are removed
+		 * from the table.
+		 */
+		*cur_qp = mthca_array_get(&dev->qp_table.qp,
+					  be32_to_cpu(cqe->my_qpn) &
+					  (dev->limits.num_qps - 1));
+		if (!*cur_qp) {
+			mthca_warn(dev, "CQ entry for unknown QP %06x\n",
+				   be32_to_cpu(cqe->my_qpn) & 0xffffff);
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	entry->qp_num = (*cur_qp)->qpn;
+
+	if (is_send) {
+		wq = &(*cur_qp)->sq;
+		wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset)
+			     >> wq->wqe_shift);
+		entry->wr_id = (*cur_qp)->wrid[wqe_index +
+					       (*cur_qp)->rq.max];
+	} else {
+		wq = &(*cur_qp)->rq;
+		wqe_index = be32_to_cpu(cqe->wqe) >> wq->wqe_shift;
+		entry->wr_id = (*cur_qp)->wrid[wqe_index];
+	}
+
+	if (wq->last_comp < wqe_index)
+		wq->tail += wqe_index - wq->last_comp;
+	else
+		wq->tail += wqe_index + wq->max - wq->last_comp;
+
+	wq->last_comp = wqe_index;
+
+	if (0)
+		mthca_dbg(dev, "%s completion for QP %06x, index %d (nr %d)\n",
+			  is_send ? "Send" : "Receive",
+			  (*cur_qp)->qpn, wqe_index, wq->max);
+
+	if (is_error) {
+		err = handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send,
+				       (struct mthca_err_cqe *) cqe,
+				       entry, &free_cqe);
+		goto out;
+	}
+
+	if (is_send) {
+		entry->opcode = IB_WC_SEND; /* XXX */
+	} else {
+		entry->byte_len = be32_to_cpu(cqe->byte_cnt);
+		switch (cqe->opcode & 0x1f) {
+		case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
+		case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
+			entry->wc_flags = IB_WC_WITH_IMM;
+			entry->imm_data = cqe->imm_etype_pkey_eec;
+			entry->opcode = IB_WC_RECV;
+			break;
+		case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+		case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
+			entry->wc_flags = IB_WC_WITH_IMM;
+			entry->imm_data = cqe->imm_etype_pkey_eec;
+			entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+			break;
+		default:
+			entry->wc_flags = 0;
+			entry->opcode = IB_WC_RECV;
+			break;
+		}
+		entry->slid 	   = be16_to_cpu(cqe->rlid);
+		entry->sl   	   = be16_to_cpu(cqe->sl_g_mlpath) >> 12;
+		entry->src_qp 	   = be32_to_cpu(cqe->rqpn) & 0xffffff;
+		entry->dlid_path_bits = be16_to_cpu(cqe->sl_g_mlpath) & 0x7f;
+		entry->pkey_index  = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16;
+		entry->wc_flags   |= be16_to_cpu(cqe->sl_g_mlpath) & 0x80 ?
+					IB_WC_GRH : 0;
+	}
+
+	entry->status = IB_WC_SUCCESS;
+
+ out:
+	if (likely(free_cqe)) {
+		set_cqe_hw(cqe);
+		++(*freed);
+		++cq->cons_index;
+	}
+
+	return err;
+}
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+		  struct ib_wc *entry)
+{
+	struct mthca_dev *dev = to_mdev(ibcq->device);
+	struct mthca_cq *cq = to_mcq(ibcq);
+	struct mthca_qp *qp = NULL;
+	unsigned long flags;
+	int err = 0;
+	int freed = 0;
+	int npolled;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	for (npolled = 0; npolled < num_entries; ++npolled) {
+		err = mthca_poll_one(dev, cq, &qp,
+				     &freed, entry + npolled);
+		if (err)
+			break;
+	}
+
+	if (freed) {
+		wmb();
+		update_cons_index(dev, cq, freed);
+	}
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return err == 0 || err == -EAGAIN ? npolled : err;
+}
+
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify)
+{
+	u32 doorbell[2];
+
+	doorbell[0] = cpu_to_be32((notify == IB_CQ_SOLICITED ?
+				   MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
+				   MTHCA_TAVOR_CQ_DB_REQ_NOT)      |
+				  to_mcq(cq)->cqn);
+	doorbell[1] = 0xffffffff;
+
+	mthca_write64(doorbell,
+		      to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock));
+
+	return 0;
+}
+
+int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
+{
+	struct mthca_cq *cq = to_mcq(ibcq);
+	u32 doorbell[2];
+	u32 sn;
+	u32 ci;
+
+	sn = cq->arm_sn & 3;
+	ci = cpu_to_be32(cq->cons_index);
+
+	doorbell[0] = ci;
+	doorbell[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
+				  (notify == IB_CQ_SOLICITED ? 1 : 2));
+
+	mthca_write_db_rec(doorbell, cq->arm_db);
+
+	/*
+	 * Make sure that the doorbell record in host memory is
+	 * written before ringing the doorbell via PCI MMIO.
+	 */
+	wmb();
+
+	doorbell[0] = cpu_to_be32((sn << 28)                       |
+				  (notify == IB_CQ_SOLICITED ?
+				   MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
+				   MTHCA_ARBEL_CQ_DB_REQ_NOT)      |
+				  cq->cqn);
+	doorbell[1] = ci;
+
+	mthca_write64(doorbell,
+		      to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock));
+
+	return 0;
+}
+
+static void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq *cq)
+{
+	int i;
+	int size;
+
+	if (cq->is_direct)
+		pci_free_consistent(dev->pdev,
+				    (cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE,
+				    cq->queue.direct.buf,
+				    pci_unmap_addr(&cq->queue.direct,
+						   mapping));
+	else {
+		size = (cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE;
+		for (i = 0; i < (size + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+			if (cq->queue.page_list[i].buf)
+				pci_free_consistent(dev->pdev, PAGE_SIZE,
+						    cq->queue.page_list[i].buf,
+						    pci_unmap_addr(&cq->queue.page_list[i],
+								   mapping));
+
+		kfree(cq->queue.page_list);
+	}
+}
+
+static int mthca_alloc_cq_buf(struct mthca_dev *dev, int size,
+			      struct mthca_cq *cq)
+{
+	int err = -ENOMEM;
+	int npages, shift;
+	u64 *dma_list = NULL;
+	dma_addr_t t;
+	int i;
+
+	if (size <= MTHCA_MAX_DIRECT_CQ_SIZE) {
+		cq->is_direct = 1;
+		npages        = 1;
+		shift         = get_order(size) + PAGE_SHIFT;
+
+		cq->queue.direct.buf = pci_alloc_consistent(dev->pdev,
+							    size, &t);
+		if (!cq->queue.direct.buf)
+			return -ENOMEM;
+
+		pci_unmap_addr_set(&cq->queue.direct, mapping, t);
+
+		memset(cq->queue.direct.buf, 0, size);
+
+		while (t & ((1 << shift) - 1)) {
+			--shift;
+			npages *= 2;
+		}
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			goto err_free;
+
+		for (i = 0; i < npages; ++i)
+			dma_list[i] = t + i * (1 << shift);
+	} else {
+		cq->is_direct = 0;
+		npages        = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		shift         = PAGE_SHIFT;
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			return -ENOMEM;
+
+		cq->queue.page_list = kmalloc(npages * sizeof *cq->queue.page_list,
+					      GFP_KERNEL);
+		if (!cq->queue.page_list)
+			goto err_out;
+
+		for (i = 0; i < npages; ++i)
+			cq->queue.page_list[i].buf = NULL;
+
+		for (i = 0; i < npages; ++i) {
+			cq->queue.page_list[i].buf =
+				pci_alloc_consistent(dev->pdev, PAGE_SIZE, &t);
+			if (!cq->queue.page_list[i].buf)
+				goto err_free;
+
+			dma_list[i] = t;
+			pci_unmap_addr_set(&cq->queue.page_list[i], mapping, t);
+
+			memset(cq->queue.page_list[i].buf, 0, PAGE_SIZE);
+		}
+	}
+
+	err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
+				  dma_list, shift, npages,
+				  0, size,
+				  MTHCA_MPT_FLAG_LOCAL_WRITE |
+				  MTHCA_MPT_FLAG_LOCAL_READ,
+				  &cq->mr);
+	if (err)
+		goto err_free;
+
+	kfree(dma_list);
+
+	return 0;
+
+err_free:
+	mthca_free_cq_buf(dev, cq);
+
+err_out:
+	kfree(dma_list);
+
+	return err;
+}
+
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+		  struct mthca_cq *cq)
+{
+	int size = nent * MTHCA_CQ_ENTRY_SIZE;
+	void *mailbox = NULL;
+	struct mthca_cq_context *cq_context;
+	int err = -ENOMEM;
+	u8 status;
+	int i;
+
+	might_sleep();
+
+	cq->ibcq.cqe = nent - 1;
+
+	cq->cqn = mthca_alloc(&dev->cq_table.alloc);
+	if (cq->cqn == -1)
+		return -ENOMEM;
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		cq->arm_sn = 1;
+
+		err = mthca_table_get(dev, dev->cq_table.table, cq->cqn);
+		if (err)
+			goto err_out;
+
+		err = -ENOMEM;
+
+		cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI,
+						     cq->cqn, &cq->set_ci_db);
+		if (cq->set_ci_db_index < 0)
+			goto err_out_icm;
+
+		cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM,
+						  cq->cqn, &cq->arm_db);
+		if (cq->arm_db_index < 0)
+			goto err_out_ci;
+	}
+
+	mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox)
+		goto err_out_mailbox;
+
+	cq_context = MAILBOX_ALIGN(mailbox);
+
+	err = mthca_alloc_cq_buf(dev, size, cq);
+	if (err)
+		goto err_out_mailbox;
+
+	for (i = 0; i < nent; ++i)
+		set_cqe_hw(get_cqe(cq, i));
+
+	spin_lock_init(&cq->lock);
+	atomic_set(&cq->refcount, 1);
+	init_waitqueue_head(&cq->wait);
+
+	memset(cq_context, 0, sizeof *cq_context);
+	cq_context->flags           = cpu_to_be32(MTHCA_CQ_STATUS_OK      |
+						  MTHCA_CQ_STATE_DISARMED |
+						  MTHCA_CQ_FLAG_TR);
+	cq_context->start           = cpu_to_be64(0);
+	cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24 |
+						  dev->driver_uar.index);
+	cq_context->error_eqn       = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
+	cq_context->comp_eqn        = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
+	cq_context->pd              = cpu_to_be32(dev->driver_pd.pd_num);
+	cq_context->lkey            = cpu_to_be32(cq->mr.ibmr.lkey);
+	cq_context->cqn             = cpu_to_be32(cq->cqn);
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		cq_context->ci_db    = cpu_to_be32(cq->set_ci_db_index);
+		cq_context->state_db = cpu_to_be32(cq->arm_db_index);
+	}
+
+	err = mthca_SW2HW_CQ(dev, cq_context, cq->cqn, &status);
+	if (err) {
+		mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err);
+		goto err_out_free_mr;
+	}
+
+	if (status) {
+		mthca_warn(dev, "SW2HW_CQ returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+		goto err_out_free_mr;
+	}
+
+	spin_lock_irq(&dev->cq_table.lock);
+	if (mthca_array_set(&dev->cq_table.cq,
+			    cq->cqn & (dev->limits.num_cqs - 1),
+			    cq)) {
+		spin_unlock_irq(&dev->cq_table.lock);
+		goto err_out_free_mr;
+	}
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	cq->cons_index = 0;
+
+	kfree(mailbox);
+
+	return 0;
+
+err_out_free_mr:
+	mthca_free_mr(dev, &cq->mr);
+	mthca_free_cq_buf(dev, cq);
+
+err_out_mailbox:
+	kfree(mailbox);
+
+	mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+
+err_out_ci:
+	mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+
+err_out_icm:
+	mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+
+err_out:
+	mthca_free(&dev->cq_table.alloc, cq->cqn);
+
+	return err;
+}
+
+void mthca_free_cq(struct mthca_dev *dev,
+		   struct mthca_cq *cq)
+{
+	void *mailbox;
+	int err;
+	u8 status;
+
+	might_sleep();
+
+	mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox) {
+		mthca_warn(dev, "No memory for mailbox to free CQ.\n");
+		return;
+	}
+
+	err = mthca_HW2SW_CQ(dev, MAILBOX_ALIGN(mailbox), cq->cqn, &status);
+	if (err)
+		mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err);
+	else if (status)
+		mthca_warn(dev, "HW2SW_CQ returned status 0x%02x\n",
+			   status);
+
+	if (0) {
+		u32 *ctx = MAILBOX_ALIGN(mailbox);
+		int j;
+
+		printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
+		       cq->cqn, cq->cons_index, !!next_cqe_sw(cq));
+		for (j = 0; j < 16; ++j)
+			printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
+	}
+
+	spin_lock_irq(&dev->cq_table.lock);
+	mthca_array_clear(&dev->cq_table.cq,
+			  cq->cqn & (dev->limits.num_cqs - 1));
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
+		synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
+	else
+		synchronize_irq(dev->pdev->irq);
+
+	atomic_dec(&cq->refcount);
+	wait_event(cq->wait, !atomic_read(&cq->refcount));
+
+	mthca_free_mr(dev, &cq->mr);
+	mthca_free_cq_buf(dev, cq);
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM,    cq->arm_db_index);
+		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+		mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+	}
+
+	mthca_free(&dev->cq_table.alloc, cq->cqn);
+	kfree(mailbox);
+}
+
+int __devinit mthca_init_cq_table(struct mthca_dev *dev)
+{
+	int err;
+
+	spin_lock_init(&dev->cq_table.lock);
+
+	err = mthca_alloc_init(&dev->cq_table.alloc,
+			       dev->limits.num_cqs,
+			       (1 << 24) - 1,
+			       dev->limits.reserved_cqs);
+	if (err)
+		return err;
+
+	err = mthca_array_init(&dev->cq_table.cq,
+			       dev->limits.num_cqs);
+	if (err)
+		mthca_alloc_cleanup(&dev->cq_table.alloc);
+
+	return err;
+}
+
+void __devexit mthca_cleanup_cq_table(struct mthca_dev *dev)
+{
+	mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs);
+	mthca_alloc_cleanup(&dev->cq_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
new file mode 100644
index 000000000000..56b2bfb5adb1
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_dev.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef MTHCA_DEV_H
+#define MTHCA_DEV_H
+
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <asm/semaphore.h>
+
+#include "mthca_provider.h"
+#include "mthca_doorbell.h"
+
+#define DRV_NAME	"ib_mthca"
+#define PFX		DRV_NAME ": "
+#define DRV_VERSION	"0.06-pre"
+#define DRV_RELDATE	"November 8, 2004"
+
+/* Types of supported HCA */
+enum {
+	TAVOR,			/* MT23108                        */
+	ARBEL_COMPAT,		/* MT25208 in Tavor compat mode   */
+	ARBEL_NATIVE		/* MT25208 with extended features */
+};
+
+enum {
+	MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
+	MTHCA_FLAG_SRQ        = 1 << 2,
+	MTHCA_FLAG_MSI        = 1 << 3,
+	MTHCA_FLAG_MSI_X      = 1 << 4,
+	MTHCA_FLAG_NO_LAM     = 1 << 5
+};
+
+enum {
+	MTHCA_MAX_PORTS = 2
+};
+
+enum {
+	MTHCA_EQ_CONTEXT_SIZE =  0x40,
+	MTHCA_CQ_CONTEXT_SIZE =  0x40,
+	MTHCA_QP_CONTEXT_SIZE = 0x200,
+	MTHCA_RDB_ENTRY_SIZE  =  0x20,
+	MTHCA_AV_SIZE         =  0x20,
+	MTHCA_MGM_ENTRY_SIZE  =  0x40,
+
+	/* Arbel FW gives us these, but we need them for Tavor */
+	MTHCA_MPT_ENTRY_SIZE  =  0x40,
+	MTHCA_MTT_SEG_SIZE    =  0x40,
+};
+
+enum {
+	MTHCA_EQ_CMD,
+	MTHCA_EQ_ASYNC,
+	MTHCA_EQ_COMP,
+	MTHCA_NUM_EQ
+};
+
+struct mthca_cmd {
+	int                       use_events;
+	struct semaphore          hcr_sem;
+	struct semaphore 	  poll_sem;
+	struct semaphore 	  event_sem;
+	int              	  max_cmds;
+	spinlock_t                context_lock;
+	int                       free_head;
+	struct mthca_cmd_context *context;
+	u16                       token_mask;
+};
+
+struct mthca_limits {
+	int      num_ports;
+	int      vl_cap;
+	int      mtu_cap;
+	int      gid_table_len;
+	int      pkey_table_len;
+	int      local_ca_ack_delay;
+	int      num_uars;
+	int      max_sg;
+	int      num_qps;
+	int      reserved_qps;
+	int      num_srqs;
+	int      reserved_srqs;
+	int      num_eecs;
+	int      reserved_eecs;
+	int      num_cqs;
+	int      reserved_cqs;
+	int      num_eqs;
+	int      reserved_eqs;
+	int      num_mpts;
+	int      num_mtt_segs;
+	int      mtt_seg_size;
+	int      reserved_mtts;
+	int      reserved_mrws;
+	int      reserved_uars;
+	int      num_mgms;
+	int      num_amgms;
+	int      reserved_mcgs;
+	int      num_pds;
+	int      reserved_pds;
+};
+
+struct mthca_alloc {
+	u32            last;
+	u32            top;
+	u32            max;
+	u32            mask;
+	spinlock_t     lock;
+	unsigned long *table;
+};
+
+struct mthca_array {
+	struct {
+		void    **page;
+		int       used;
+	} *page_list;
+};
+
+struct mthca_uar_table {
+	struct mthca_alloc alloc;
+	u64                uarc_base;
+	int                uarc_size;
+};
+
+struct mthca_pd_table {
+	struct mthca_alloc alloc;
+};
+
+struct mthca_mr_table {
+	struct mthca_alloc      mpt_alloc;
+	int                     max_mtt_order;
+	unsigned long         **mtt_buddy;
+	u64                     mtt_base;
+	struct mthca_icm_table *mtt_table;
+	struct mthca_icm_table *mpt_table;
+};
+
+struct mthca_eq_table {
+	struct mthca_alloc alloc;
+	void __iomem      *clr_int;
+	u32                clr_mask;
+	u32                arm_mask;
+	struct mthca_eq    eq[MTHCA_NUM_EQ];
+	u64                icm_virt;
+	struct page       *icm_page;
+	dma_addr_t         icm_dma;
+	int                have_irq;
+	u8                 inta_pin;
+};
+
+struct mthca_cq_table {
+	struct mthca_alloc 	alloc;
+	spinlock_t         	lock;
+	struct mthca_array      cq;
+	struct mthca_icm_table *table;
+};
+
+struct mthca_qp_table {
+	struct mthca_alloc     	alloc;
+	u32                    	rdb_base;
+	int                    	rdb_shift;
+	int                    	sqp_start;
+	spinlock_t             	lock;
+	struct mthca_array     	qp;
+	struct mthca_icm_table *qp_table;
+	struct mthca_icm_table *eqp_table;
+};
+
+struct mthca_av_table {
+	struct pci_pool   *pool;
+	int                num_ddr_avs;
+	u64                ddr_av_base;
+	void __iomem      *av_map;
+	struct mthca_alloc alloc;
+};
+
+struct mthca_mcg_table {
+	struct semaphore   	sem;
+	struct mthca_alloc 	alloc;
+	struct mthca_icm_table *table;
+};
+
+struct mthca_dev {
+	struct ib_device  ib_dev;
+	struct pci_dev   *pdev;
+
+	int          	 hca_type;
+	unsigned long	 mthca_flags;
+	unsigned long    device_cap_flags;
+
+	u32              rev_id;
+
+	/* firmware info */
+	u64              fw_ver;
+	union {
+		struct {
+			u64 fw_start;
+			u64 fw_end;
+		}        tavor;
+		struct {
+			u64 clr_int_base;
+			u64 eq_arm_base;
+			u64 eq_set_ci_base;
+			struct mthca_icm *fw_icm;
+			struct mthca_icm *aux_icm;
+			u16 fw_pages;
+		}        arbel;
+	}                fw;
+
+	u64              ddr_start;
+	u64              ddr_end;
+
+	MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock)
+	struct semaphore cap_mask_mutex;
+
+	void __iomem    *hcr;
+	void __iomem    *kar;
+	void __iomem    *clr_base;
+	union {
+		struct {
+			void __iomem *ecr_base;
+		} tavor;
+		struct {
+			void __iomem *eq_arm;
+			void __iomem *eq_set_ci_base;
+		} arbel;
+	} eq_regs;
+
+	struct mthca_cmd    cmd;
+	struct mthca_limits limits;
+
+	struct mthca_uar_table uar_table;
+	struct mthca_pd_table  pd_table;
+	struct mthca_mr_table  mr_table;
+	struct mthca_eq_table  eq_table;
+	struct mthca_cq_table  cq_table;
+	struct mthca_qp_table  qp_table;
+	struct mthca_av_table  av_table;
+	struct mthca_mcg_table mcg_table;
+
+	struct mthca_uar       driver_uar;
+	struct mthca_db_table *db_tab;
+	struct mthca_pd        driver_pd;
+	struct mthca_mr        driver_mr;
+
+	struct ib_mad_agent  *send_agent[MTHCA_MAX_PORTS][2];
+	struct ib_ah         *sm_ah[MTHCA_MAX_PORTS];
+	spinlock_t            sm_lock;
+};
+
+#define mthca_dbg(mdev, format, arg...) \
+	dev_dbg(&mdev->pdev->dev, format, ## arg)
+#define mthca_err(mdev, format, arg...) \
+	dev_err(&mdev->pdev->dev, format, ## arg)
+#define mthca_info(mdev, format, arg...) \
+	dev_info(&mdev->pdev->dev, format, ## arg)
+#define mthca_warn(mdev, format, arg...) \
+	dev_warn(&mdev->pdev->dev, format, ## arg)
+
+extern void __buggy_use_of_MTHCA_GET(void);
+extern void __buggy_use_of_MTHCA_PUT(void);
+
+#define MTHCA_GET(dest, source, offset)                               \
+	do {                                                          \
+		void *__p = (char *) (source) + (offset);             \
+		switch (sizeof (dest)) {                              \
+			case 1: (dest) = *(u8 *) __p;       break;    \
+			case 2: (dest) = be16_to_cpup(__p); break;    \
+			case 4: (dest) = be32_to_cpup(__p); break;    \
+			case 8: (dest) = be64_to_cpup(__p); break;    \
+			default: __buggy_use_of_MTHCA_GET();          \
+		}                                                     \
+	} while (0)
+
+#define MTHCA_PUT(dest, source, offset)                               \
+	do {                                                          \
+		__typeof__(source) *__p =                             \
+			(__typeof__(source) *) ((char *) (dest) + (offset)); \
+		switch (sizeof(source)) {                             \
+			case 1: *__p = (source);            break;    \
+			case 2: *__p = cpu_to_be16(source); break;    \
+			case 4: *__p = cpu_to_be32(source); break;    \
+			case 8: *__p = cpu_to_be64(source); break;    \
+			default: __buggy_use_of_MTHCA_PUT();          \
+		}                                                     \
+	} while (0)
+
+int mthca_reset(struct mthca_dev *mdev);
+
+u32 mthca_alloc(struct mthca_alloc *alloc);
+void mthca_free(struct mthca_alloc *alloc, u32 obj);
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+		     u32 reserved);
+void mthca_alloc_cleanup(struct mthca_alloc *alloc);
+void *mthca_array_get(struct mthca_array *array, int index);
+int mthca_array_set(struct mthca_array *array, int index, void *value);
+void mthca_array_clear(struct mthca_array *array, int index);
+int mthca_array_init(struct mthca_array *array, int nent);
+void mthca_array_cleanup(struct mthca_array *array, int nent);
+
+int mthca_init_uar_table(struct mthca_dev *dev);
+int mthca_init_pd_table(struct mthca_dev *dev);
+int mthca_init_mr_table(struct mthca_dev *dev);
+int mthca_init_eq_table(struct mthca_dev *dev);
+int mthca_init_cq_table(struct mthca_dev *dev);
+int mthca_init_qp_table(struct mthca_dev *dev);
+int mthca_init_av_table(struct mthca_dev *dev);
+int mthca_init_mcg_table(struct mthca_dev *dev);
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev);
+void mthca_cleanup_pd_table(struct mthca_dev *dev);
+void mthca_cleanup_mr_table(struct mthca_dev *dev);
+void mthca_cleanup_eq_table(struct mthca_dev *dev);
+void mthca_cleanup_cq_table(struct mthca_dev *dev);
+void mthca_cleanup_qp_table(struct mthca_dev *dev);
+void mthca_cleanup_av_table(struct mthca_dev *dev);
+void mthca_cleanup_mcg_table(struct mthca_dev *dev);
+
+int mthca_register_device(struct mthca_dev *dev);
+void mthca_unregister_device(struct mthca_dev *dev);
+
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
+
+int mthca_pd_alloc(struct mthca_dev *dev, struct mthca_pd *pd);
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd);
+
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+			   u32 access, struct mthca_mr *mr);
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+			u64 *buffer_list, int buffer_size_shift,
+			int list_len, u64 iova, u64 total_size,
+			u32 access, struct mthca_mr *mr);
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr);
+
+int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt);
+void mthca_unmap_eq_icm(struct mthca_dev *dev);
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+		  struct ib_wc *entry);
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify);
+int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify);
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+		  struct mthca_cq *cq);
+void mthca_free_cq(struct mthca_dev *dev,
+		   struct mthca_cq *cq);
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn);
+void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn);
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+		    enum ib_event_type event_type);
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask);
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr);
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr);
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr);
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr);
+int mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+		       int index, int *dbd, u32 *new_wqe);
+int mthca_alloc_qp(struct mthca_dev *dev,
+		   struct mthca_pd *pd,
+		   struct mthca_cq *send_cq,
+		   struct mthca_cq *recv_cq,
+		   enum ib_qp_type type,
+		   enum ib_sig_type send_policy,
+		   struct mthca_qp *qp);
+int mthca_alloc_sqp(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct mthca_cq *send_cq,
+		    struct mthca_cq *recv_cq,
+		    enum ib_sig_type send_policy,
+		    int qpn,
+		    int port,
+		    struct mthca_sqp *sqp);
+void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
+int mthca_create_ah(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct ib_ah_attr *ah_attr,
+		    struct mthca_ah *ah);
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah);
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+		  struct ib_ud_header *header);
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int mthca_process_mad(struct ib_device *ibdev,
+		      int mad_flags,
+		      u8 port_num,
+		      struct ib_wc *in_wc,
+		      struct ib_grh *in_grh,
+		      struct ib_mad *in_mad,
+		      struct ib_mad *out_mad);
+int mthca_create_agents(struct mthca_dev *dev);
+void mthca_free_agents(struct mthca_dev *dev);
+
+static inline struct mthca_dev *to_mdev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct mthca_dev, ib_dev);
+}
+
+#endif /* MTHCA_DEV_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_doorbell.h b/drivers/infiniband/hw/mthca/mthca_doorbell.h
new file mode 100644
index 000000000000..78b183cab54c
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_doorbell.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_doorbell.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/types.h>
+
+#define MTHCA_RD_DOORBELL      0x00
+#define MTHCA_SEND_DOORBELL    0x10
+#define MTHCA_RECEIVE_DOORBELL 0x18
+#define MTHCA_CQ_DOORBELL      0x20
+#define MTHCA_EQ_DOORBELL      0x28
+
+#if BITS_PER_LONG == 64
+/*
+ * Assume that we can just write a 64-bit doorbell atomically.  s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name)
+#define MTHCA_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
+#define MTHCA_GET_DOORBELL_LOCK(ptr)      (NULL)
+
+static inline void mthca_write64(u32 val[2], void __iomem *dest,
+				 spinlock_t *doorbell_lock)
+{
+	__raw_writeq(*(u64 *) val, dest);
+}
+
+static inline void mthca_write_db_rec(u32 val[2], u32 *db)
+{
+	*(u64 *) db = *(u64 *) val;
+}
+
+#else
+
+/*
+ * Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MTHCA_INIT_DOORBELL_LOCK(ptr)     spin_lock_init(ptr)
+#define MTHCA_GET_DOORBELL_LOCK(ptr)      (ptr)
+
+static inline void mthca_write64(u32 val[2], void __iomem *dest,
+				 spinlock_t *doorbell_lock)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(doorbell_lock, flags);
+	__raw_writel(val[0], dest);
+	__raw_writel(val[1], dest + 4);
+	spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+static inline void mthca_write_db_rec(u32 val[2], u32 *db)
+{
+	db[0] = val[0];
+	wmb();
+	db[1] = val[1];
+}
+
+#endif
diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c
new file mode 100644
index 000000000000..623daab5c92b
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -0,0 +1,964 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_eq.c 1382 2004-12-24 02:21:02Z roland $
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_config_reg.h"
+
+enum {
+	MTHCA_NUM_ASYNC_EQE = 0x80,
+	MTHCA_NUM_CMD_EQE   = 0x80,
+	MTHCA_EQ_ENTRY_SIZE = 0x20
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_eq_context {
+	u32 flags;
+	u64 start;
+	u32 logsize_usrpage;
+	u32 tavor_pd;		/* reserved for Arbel */
+	u8  reserved1[3];
+	u8  intr;
+	u32 arbel_pd;		/* lost_count for Tavor */
+	u32 lkey;
+	u32 reserved2[2];
+	u32 consumer_index;
+	u32 producer_index;
+	u32 reserved3[4];
+} __attribute__((packed));
+
+#define MTHCA_EQ_STATUS_OK          ( 0 << 28)
+#define MTHCA_EQ_STATUS_OVERFLOW    ( 9 << 28)
+#define MTHCA_EQ_STATUS_WRITE_FAIL  (10 << 28)
+#define MTHCA_EQ_OWNER_SW           ( 0 << 24)
+#define MTHCA_EQ_OWNER_HW           ( 1 << 24)
+#define MTHCA_EQ_FLAG_TR            ( 1 << 18)
+#define MTHCA_EQ_FLAG_OI            ( 1 << 17)
+#define MTHCA_EQ_STATE_ARMED        ( 1 <<  8)
+#define MTHCA_EQ_STATE_FIRED        ( 2 <<  8)
+#define MTHCA_EQ_STATE_ALWAYS_ARMED ( 3 <<  8)
+#define MTHCA_EQ_STATE_ARBEL        ( 8 <<  8)
+
+enum {
+	MTHCA_EVENT_TYPE_COMP       	    = 0x00,
+	MTHCA_EVENT_TYPE_PATH_MIG   	    = 0x01,
+	MTHCA_EVENT_TYPE_COMM_EST   	    = 0x02,
+	MTHCA_EVENT_TYPE_SQ_DRAINED 	    = 0x03,
+	MTHCA_EVENT_TYPE_SRQ_LAST_WQE       = 0x13,
+	MTHCA_EVENT_TYPE_CQ_ERROR   	    = 0x04,
+	MTHCA_EVENT_TYPE_WQ_CATAS_ERROR     = 0x05,
+	MTHCA_EVENT_TYPE_EEC_CATAS_ERROR    = 0x06,
+	MTHCA_EVENT_TYPE_PATH_MIG_FAILED    = 0x07,
+	MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+	MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR    = 0x11,
+	MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR    = 0x12,
+	MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR  = 0x08,
+	MTHCA_EVENT_TYPE_PORT_CHANGE        = 0x09,
+	MTHCA_EVENT_TYPE_EQ_OVERFLOW        = 0x0f,
+	MTHCA_EVENT_TYPE_ECC_DETECT         = 0x0e,
+	MTHCA_EVENT_TYPE_CMD                = 0x0a
+};
+
+#define MTHCA_ASYNC_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_PATH_MIG)           | \
+				(1ULL << MTHCA_EVENT_TYPE_COMM_EST)           | \
+				(1ULL << MTHCA_EVENT_TYPE_SQ_DRAINED)         | \
+				(1ULL << MTHCA_EVENT_TYPE_CQ_ERROR)           | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_CATAS_ERROR)     | \
+				(1ULL << MTHCA_EVENT_TYPE_EEC_CATAS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_PATH_MIG_FAILED)    | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR)  | \
+				(1ULL << MTHCA_EVENT_TYPE_PORT_CHANGE)        | \
+				(1ULL << MTHCA_EVENT_TYPE_ECC_DETECT))
+#define MTHCA_SRQ_EVENT_MASK    (1ULL << MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_SRQ_LAST_WQE)
+#define MTHCA_CMD_EVENT_MASK    (1ULL << MTHCA_EVENT_TYPE_CMD)
+
+#define MTHCA_EQ_DB_INC_CI     (1 << 24)
+#define MTHCA_EQ_DB_REQ_NOT    (2 << 24)
+#define MTHCA_EQ_DB_DISARM_CQ  (3 << 24)
+#define MTHCA_EQ_DB_SET_CI     (4 << 24)
+#define MTHCA_EQ_DB_ALWAYS_ARM (5 << 24)
+
+struct mthca_eqe {
+	u8 reserved1;
+	u8 type;
+	u8 reserved2;
+	u8 subtype;
+	union {
+		u32 raw[6];
+		struct {
+			u32 cqn;
+		} __attribute__((packed)) comp;
+		struct {
+			u16 reserved1;
+			u16 token;
+			u32 reserved2;
+			u8  reserved3[3];
+			u8  status;
+			u64 out_param;
+		} __attribute__((packed)) cmd;
+		struct {
+			u32 qpn;
+		} __attribute__((packed)) qp;
+		struct {
+			u32 cqn;
+			u32 reserved1;
+			u8  reserved2[3];
+			u8  syndrome;
+		} __attribute__((packed)) cq_err;
+		struct {
+			u32 reserved1[2];
+			u32 port;
+		} __attribute__((packed)) port_change;
+	} event;
+	u8 reserved3[3];
+	u8 owner;
+} __attribute__((packed));
+
+#define  MTHCA_EQ_ENTRY_OWNER_SW      (0 << 7)
+#define  MTHCA_EQ_ENTRY_OWNER_HW      (1 << 7)
+
+static inline u64 async_mask(struct mthca_dev *dev)
+{
+	return dev->mthca_flags & MTHCA_FLAG_SRQ ?
+		MTHCA_ASYNC_EVENT_MASK | MTHCA_SRQ_EVENT_MASK :
+		MTHCA_ASYNC_EVENT_MASK;
+}
+
+static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+	u32 doorbell[2];
+
+	doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_SET_CI | eq->eqn);
+	doorbell[1] = cpu_to_be32(ci & (eq->nent - 1));
+
+	/*
+	 * This barrier makes sure that all updates to ownership bits
+	 * done by set_eqe_hw() hit memory before the consumer index
+	 * is updated.  set_eq_ci() allows the HCA to possibly write
+	 * more EQ entries, and we want to avoid the exceedingly
+	 * unlikely possibility of the HCA writing an entry and then
+	 * having set_eqe_hw() overwrite the owner field.
+	 */
+	wmb();
+	mthca_write64(doorbell,
+		      dev->kar + MTHCA_EQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+	/* See comment in tavor_set_eq_ci() above. */
+	wmb();
+	__raw_writel(cpu_to_be32(ci), dev->eq_regs.arbel.eq_set_ci_base +
+		     eq->eqn * 8);
+	/* We still want ordering, just not swabbing, so add a barrier */
+	mb();
+}
+
+static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+	if (dev->hca_type == ARBEL_NATIVE)
+		arbel_set_eq_ci(dev, eq, ci);
+	else
+		tavor_set_eq_ci(dev, eq, ci);
+}
+
+static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn)
+{
+	u32 doorbell[2];
+
+	doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_REQ_NOT | eqn);
+	doorbell[1] = 0;
+
+	mthca_write64(doorbell,
+		      dev->kar + MTHCA_EQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask)
+{
+	writel(eqn_mask, dev->eq_regs.arbel.eq_arm);
+}
+
+static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn)
+{
+	if (dev->hca_type != ARBEL_NATIVE) {
+		u32 doorbell[2];
+
+		doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_DISARM_CQ | eqn);
+		doorbell[1] = cpu_to_be32(cqn);
+
+		mthca_write64(doorbell,
+			      dev->kar + MTHCA_EQ_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+}
+
+static inline struct mthca_eqe *get_eqe(struct mthca_eq *eq, u32 entry)
+{
+	unsigned long off = (entry & (eq->nent - 1)) * MTHCA_EQ_ENTRY_SIZE;
+	return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE;
+}
+
+static inline struct mthca_eqe* next_eqe_sw(struct mthca_eq *eq)
+{
+	struct mthca_eqe* eqe;
+	eqe = get_eqe(eq, eq->cons_index);
+	return (MTHCA_EQ_ENTRY_OWNER_HW & eqe->owner) ? NULL : eqe;
+}
+
+static inline void set_eqe_hw(struct mthca_eqe *eqe)
+{
+	eqe->owner =  MTHCA_EQ_ENTRY_OWNER_HW;
+}
+
+static void port_change(struct mthca_dev *dev, int port, int active)
+{
+	struct ib_event record;
+
+	mthca_dbg(dev, "Port change to %s for port %d\n",
+		  active ? "active" : "down", port);
+
+	record.device = &dev->ib_dev;
+	record.event  = active ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+	record.element.port_num = port;
+
+	ib_dispatch_event(&record);
+}
+
+static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq)
+{
+	struct mthca_eqe *eqe;
+	int disarm_cqn;
+	int  eqes_found = 0;
+
+	while ((eqe = next_eqe_sw(eq))) {
+		int set_ci = 0;
+
+		/*
+		 * Make sure we read EQ entry contents after we've
+		 * checked the ownership bit.
+		 */
+		rmb();
+
+		switch (eqe->type) {
+		case MTHCA_EVENT_TYPE_COMP:
+			disarm_cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+			disarm_cq(dev, eq->eqn, disarm_cqn);
+			mthca_cq_event(dev, disarm_cqn);
+			break;
+
+		case MTHCA_EVENT_TYPE_PATH_MIG:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_PATH_MIG);
+			break;
+
+		case MTHCA_EVENT_TYPE_COMM_EST:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_COMM_EST);
+			break;
+
+		case MTHCA_EVENT_TYPE_SQ_DRAINED:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_SQ_DRAINED);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_CATAS_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_FATAL);
+			break;
+
+		case MTHCA_EVENT_TYPE_PATH_MIG_FAILED:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_PATH_MIG_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_REQ_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_ACCESS_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_CMD:
+			mthca_cmd_event(dev,
+					be16_to_cpu(eqe->event.cmd.token),
+					eqe->event.cmd.status,
+					be64_to_cpu(eqe->event.cmd.out_param));
+			/*
+			 * cmd_event() may add more commands.
+			 * The card will think the queue has overflowed if
+			 * we don't tell it we've been processing events.
+			 */
+			set_ci = 1;
+			break;
+
+		case MTHCA_EVENT_TYPE_PORT_CHANGE:
+			port_change(dev,
+				    (be32_to_cpu(eqe->event.port_change.port) >> 28) & 3,
+				    eqe->subtype == 0x4);
+			break;
+
+		case MTHCA_EVENT_TYPE_CQ_ERROR:
+			mthca_warn(dev, "CQ %s on CQN %08x\n",
+				   eqe->event.cq_err.syndrome == 1 ?
+				   "overrun" : "access violation",
+				   be32_to_cpu(eqe->event.cq_err.cqn));
+			break;
+
+		case MTHCA_EVENT_TYPE_EQ_OVERFLOW:
+			mthca_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+			break;
+
+		case MTHCA_EVENT_TYPE_EEC_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_ECC_DETECT:
+		default:
+			mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n",
+				   eqe->type, eqe->subtype, eq->eqn);
+			break;
+		};
+
+		set_eqe_hw(eqe);
+		++eq->cons_index;
+		eqes_found = 1;
+
+		if (unlikely(set_ci)) {
+			/*
+			 * Conditional on hca_type is OK here because
+			 * this is a rare case, not the fast path.
+			 */
+			set_eq_ci(dev, eq, eq->cons_index);
+			set_ci = 0;
+		}
+	}
+
+	/*
+	 * Rely on caller to set consumer index so that we don't have
+	 * to test hca_type in our interrupt handling fast path.
+	 */
+	return eqes_found;
+}
+
+static irqreturn_t mthca_tavor_interrupt(int irq, void *dev_ptr, struct pt_regs *regs)
+{
+	struct mthca_dev *dev = dev_ptr;
+	u32 ecr;
+	int i;
+
+	if (dev->eq_table.clr_mask)
+		writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+	ecr = readl(dev->eq_regs.tavor.ecr_base + 4);
+	if (ecr) {
+		writel(ecr, dev->eq_regs.tavor.ecr_base +
+		       MTHCA_ECR_CLR_BASE - MTHCA_ECR_BASE + 4);
+
+		for (i = 0; i < MTHCA_NUM_EQ; ++i)
+			if (ecr & dev->eq_table.eq[i].eqn_mask &&
+			    mthca_eq_int(dev, &dev->eq_table.eq[i])) {
+				tavor_set_eq_ci(dev, &dev->eq_table.eq[i],
+						dev->eq_table.eq[i].cons_index);
+				tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+			}
+	}
+
+	return IRQ_RETVAL(ecr);
+}
+
+static irqreturn_t mthca_tavor_msi_x_interrupt(int irq, void *eq_ptr,
+					 struct pt_regs *regs)
+{
+	struct mthca_eq  *eq  = eq_ptr;
+	struct mthca_dev *dev = eq->dev;
+
+	mthca_eq_int(dev, eq);
+	tavor_set_eq_ci(dev, eq, eq->cons_index);
+	tavor_eq_req_not(dev, eq->eqn);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t mthca_arbel_interrupt(int irq, void *dev_ptr, struct pt_regs *regs)
+{
+	struct mthca_dev *dev = dev_ptr;
+	int work = 0;
+	int i;
+
+	if (dev->eq_table.clr_mask)
+		writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		if (mthca_eq_int(dev, &dev->eq_table.eq[i])) {
+			work = 1;
+			arbel_set_eq_ci(dev, &dev->eq_table.eq[i],
+					dev->eq_table.eq[i].cons_index);
+		}
+
+	arbel_eq_req_not(dev, dev->eq_table.arm_mask);
+
+	return IRQ_RETVAL(work);
+}
+
+static irqreturn_t mthca_arbel_msi_x_interrupt(int irq, void *eq_ptr,
+					       struct pt_regs *regs)
+{
+	struct mthca_eq  *eq  = eq_ptr;
+	struct mthca_dev *dev = eq->dev;
+
+	mthca_eq_int(dev, eq);
+	arbel_set_eq_ci(dev, eq, eq->cons_index);
+	arbel_eq_req_not(dev, eq->eqn_mask);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+static int __devinit mthca_create_eq(struct mthca_dev *dev,
+				     int nent,
+				     u8 intr,
+				     struct mthca_eq *eq)
+{
+	int npages = (nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) /
+		PAGE_SIZE;
+	u64 *dma_list = NULL;
+	dma_addr_t t;
+	void *mailbox = NULL;
+	struct mthca_eq_context *eq_context;
+	int err = -ENOMEM;
+	int i;
+	u8 status;
+
+	/* Make sure EQ size is aligned to a power of 2 size. */
+	for (i = 1; i < nent; i <<= 1)
+		; /* nothing */
+	nent = i;
+
+	eq->dev = dev;
+
+	eq->page_list = kmalloc(npages * sizeof *eq->page_list,
+				GFP_KERNEL);
+	if (!eq->page_list)
+		goto err_out;
+
+	for (i = 0; i < npages; ++i)
+		eq->page_list[i].buf = NULL;
+
+	dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+	if (!dma_list)
+		goto err_out_free;
+
+	mailbox = kmalloc(sizeof *eq_context + MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox)
+		goto err_out_free;
+	eq_context = MAILBOX_ALIGN(mailbox);
+
+	for (i = 0; i < npages; ++i) {
+		eq->page_list[i].buf = pci_alloc_consistent(dev->pdev,
+							    PAGE_SIZE, &t);
+		if (!eq->page_list[i].buf)
+			goto err_out_free;
+
+		dma_list[i] = t;
+		pci_unmap_addr_set(&eq->page_list[i], mapping, t);
+
+		memset(eq->page_list[i].buf, 0, PAGE_SIZE);
+	}
+
+	for (i = 0; i < nent; ++i)
+		set_eqe_hw(get_eqe(eq, i));
+
+	eq->eqn = mthca_alloc(&dev->eq_table.alloc);
+	if (eq->eqn == -1)
+		goto err_out_free;
+
+	err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
+				  dma_list, PAGE_SHIFT, npages,
+				  0, npages * PAGE_SIZE,
+				  MTHCA_MPT_FLAG_LOCAL_WRITE |
+				  MTHCA_MPT_FLAG_LOCAL_READ,
+				  &eq->mr);
+	if (err)
+		goto err_out_free_eq;
+
+	eq->nent = nent;
+
+	memset(eq_context, 0, sizeof *eq_context);
+	eq_context->flags           = cpu_to_be32(MTHCA_EQ_STATUS_OK   |
+						  MTHCA_EQ_OWNER_HW    |
+						  MTHCA_EQ_STATE_ARMED |
+						  MTHCA_EQ_FLAG_TR);
+	if (dev->hca_type == ARBEL_NATIVE)
+		eq_context->flags  |= cpu_to_be32(MTHCA_EQ_STATE_ARBEL);
+
+	eq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24);
+	if (dev->hca_type == ARBEL_NATIVE) {
+		eq_context->arbel_pd = cpu_to_be32(dev->driver_pd.pd_num);
+	} else {
+		eq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
+		eq_context->tavor_pd         = cpu_to_be32(dev->driver_pd.pd_num);
+	}
+	eq_context->intr            = intr;
+	eq_context->lkey            = cpu_to_be32(eq->mr.ibmr.lkey);
+
+	err = mthca_SW2HW_EQ(dev, eq_context, eq->eqn, &status);
+	if (err) {
+		mthca_warn(dev, "SW2HW_EQ failed (%d)\n", err);
+		goto err_out_free_mr;
+	}
+	if (status) {
+		mthca_warn(dev, "SW2HW_EQ returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+		goto err_out_free_mr;
+	}
+
+	kfree(dma_list);
+	kfree(mailbox);
+
+	eq->eqn_mask   = swab32(1 << eq->eqn);
+	eq->cons_index = 0;
+
+	dev->eq_table.arm_mask |= eq->eqn_mask;
+
+	mthca_dbg(dev, "Allocated EQ %d with %d entries\n",
+		  eq->eqn, nent);
+
+	return err;
+
+ err_out_free_mr:
+	mthca_free_mr(dev, &eq->mr);
+
+ err_out_free_eq:
+	mthca_free(&dev->eq_table.alloc, eq->eqn);
+
+ err_out_free:
+	for (i = 0; i < npages; ++i)
+		if (eq->page_list[i].buf)
+			pci_free_consistent(dev->pdev, PAGE_SIZE,
+					    eq->page_list[i].buf,
+					    pci_unmap_addr(&eq->page_list[i],
+							   mapping));
+
+	kfree(eq->page_list);
+	kfree(dma_list);
+	kfree(mailbox);
+
+ err_out:
+	return err;
+}
+
+static void mthca_free_eq(struct mthca_dev *dev,
+			  struct mthca_eq *eq)
+{
+	void *mailbox = NULL;
+	int err;
+	u8 status;
+	int npages = (eq->nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) /
+		PAGE_SIZE;
+	int i;
+
+	mailbox = kmalloc(sizeof (struct mthca_eq_context) + MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox)
+		return;
+
+	err = mthca_HW2SW_EQ(dev, MAILBOX_ALIGN(mailbox),
+			     eq->eqn, &status);
+	if (err)
+		mthca_warn(dev, "HW2SW_EQ failed (%d)\n", err);
+	if (status)
+		mthca_warn(dev, "HW2SW_EQ returned status 0x%02x\n",
+			   status);
+
+	dev->eq_table.arm_mask &= ~eq->eqn_mask;
+
+	if (0) {
+		mthca_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+		for (i = 0; i < sizeof (struct mthca_eq_context) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpup(MAILBOX_ALIGN(mailbox) + i * 4));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	mthca_free_mr(dev, &eq->mr);
+	for (i = 0; i < npages; ++i)
+		pci_free_consistent(dev->pdev, PAGE_SIZE,
+				    eq->page_list[i].buf,
+				    pci_unmap_addr(&eq->page_list[i], mapping));
+
+	kfree(eq->page_list);
+	kfree(mailbox);
+}
+
+static void mthca_free_irqs(struct mthca_dev *dev)
+{
+	int i;
+
+	if (dev->eq_table.have_irq)
+		free_irq(dev->pdev->irq, dev);
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		if (dev->eq_table.eq[i].have_irq)
+			free_irq(dev->eq_table.eq[i].msi_x_vector,
+				 dev->eq_table.eq + i);
+}
+
+static int __devinit mthca_map_reg(struct mthca_dev *dev,
+				   unsigned long offset, unsigned long size,
+				   void __iomem **map)
+{
+	unsigned long base = pci_resource_start(dev->pdev, 0);
+
+	if (!request_mem_region(base + offset, size, DRV_NAME))
+		return -EBUSY;
+
+	*map = ioremap(base + offset, size);
+	if (!*map) {
+		release_mem_region(base + offset, size);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void mthca_unmap_reg(struct mthca_dev *dev, unsigned long offset,
+			    unsigned long size, void __iomem *map)
+{
+	unsigned long base = pci_resource_start(dev->pdev, 0);
+
+	release_mem_region(base + offset, size);
+	iounmap(map);
+}
+
+static int __devinit mthca_map_eq_regs(struct mthca_dev *dev)
+{
+	unsigned long mthca_base;
+
+	mthca_base = pci_resource_start(dev->pdev, 0);
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		/*
+		 * We assume that the EQ arm and EQ set CI registers
+		 * fall within the first BAR.  We can't trust the
+		 * values firmware gives us, since those addresses are
+		 * valid on the HCA's side of the PCI bus but not
+		 * necessarily the host side.
+		 */
+		if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+				  dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+				  &dev->clr_base)) {
+			mthca_err(dev, "Couldn't map interrupt clear register, "
+				  "aborting.\n");
+			return -ENOMEM;
+		}
+
+		/*
+		 * Add 4 because we limit ourselves to EQs 0 ... 31,
+		 * so we only need the low word of the register.
+		 */
+		if (mthca_map_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+					dev->fw.arbel.eq_arm_base) + 4, 4,
+				  &dev->eq_regs.arbel.eq_arm)) {
+			mthca_err(dev, "Couldn't map interrupt clear register, "
+				  "aborting.\n");
+			mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+					dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+					dev->clr_base);
+			return -ENOMEM;
+		}
+
+		if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+				  dev->fw.arbel.eq_set_ci_base,
+				  MTHCA_EQ_SET_CI_SIZE,
+				  &dev->eq_regs.arbel.eq_set_ci_base)) {
+			mthca_err(dev, "Couldn't map interrupt clear register, "
+				  "aborting.\n");
+			mthca_unmap_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+					      dev->fw.arbel.eq_arm_base) + 4, 4,
+					dev->eq_regs.arbel.eq_arm);
+			mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+					dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+					dev->clr_base);
+			return -ENOMEM;
+		}
+	} else {
+		if (mthca_map_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+				  &dev->clr_base)) {
+			mthca_err(dev, "Couldn't map interrupt clear register, "
+				  "aborting.\n");
+			return -ENOMEM;
+		}
+
+		if (mthca_map_reg(dev, MTHCA_ECR_BASE,
+				  MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE,
+				  &dev->eq_regs.tavor.ecr_base)) {
+			mthca_err(dev, "Couldn't map ecr register, "
+				  "aborting.\n");
+			mthca_unmap_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+					dev->clr_base);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+
+}
+
+static void __devexit mthca_unmap_eq_regs(struct mthca_dev *dev)
+{
+	if (dev->hca_type == ARBEL_NATIVE) {
+		mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+				dev->fw.arbel.eq_set_ci_base,
+				MTHCA_EQ_SET_CI_SIZE,
+				dev->eq_regs.arbel.eq_set_ci_base);
+		mthca_unmap_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+				      dev->fw.arbel.eq_arm_base) + 4, 4,
+				dev->eq_regs.arbel.eq_arm);
+		mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+				dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+				dev->clr_base);
+	} else {
+		mthca_unmap_reg(dev, MTHCA_ECR_BASE,
+				MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE,
+				dev->eq_regs.tavor.ecr_base);
+		mthca_unmap_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+				dev->clr_base);
+	}
+}
+
+int __devinit mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt)
+{
+	int ret;
+	u8 status;
+
+	/*
+	 * We assume that mapping one page is enough for the whole EQ
+	 * context table.  This is fine with all current HCAs, because
+	 * we only use 32 EQs and each EQ uses 32 bytes of context
+	 * memory, or 1 KB total.
+	 */
+	dev->eq_table.icm_virt = icm_virt;
+	dev->eq_table.icm_page = alloc_page(GFP_HIGHUSER);
+	if (!dev->eq_table.icm_page)
+		return -ENOMEM;
+	dev->eq_table.icm_dma  = pci_map_page(dev->pdev, dev->eq_table.icm_page, 0,
+					      PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+	if (pci_dma_mapping_error(dev->eq_table.icm_dma)) {
+		__free_page(dev->eq_table.icm_page);
+		return -ENOMEM;
+	}
+
+	ret = mthca_MAP_ICM_page(dev, dev->eq_table.icm_dma, icm_virt, &status);
+	if (!ret && status)
+		ret = -EINVAL;
+	if (ret) {
+		pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+			       PCI_DMA_BIDIRECTIONAL);
+		__free_page(dev->eq_table.icm_page);
+	}
+
+	return ret;
+}
+
+void __devexit mthca_unmap_eq_icm(struct mthca_dev *dev)
+{
+	u8 status;
+
+	mthca_UNMAP_ICM(dev, dev->eq_table.icm_virt, PAGE_SIZE / 4096, &status);
+	pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+		       PCI_DMA_BIDIRECTIONAL);
+	__free_page(dev->eq_table.icm_page);
+}
+
+int __devinit mthca_init_eq_table(struct mthca_dev *dev)
+{
+	int err;
+	u8 status;
+	u8 intr;
+	int i;
+
+	err = mthca_alloc_init(&dev->eq_table.alloc,
+			       dev->limits.num_eqs,
+			       dev->limits.num_eqs - 1,
+			       dev->limits.reserved_eqs);
+	if (err)
+		return err;
+
+	err = mthca_map_eq_regs(dev);
+	if (err)
+		goto err_out_free;
+
+	if (dev->mthca_flags & MTHCA_FLAG_MSI ||
+	    dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+		dev->eq_table.clr_mask = 0;
+	} else {
+		dev->eq_table.clr_mask =
+			swab32(1 << (dev->eq_table.inta_pin & 31));
+		dev->eq_table.clr_int  = dev->clr_base +
+			(dev->eq_table.inta_pin < 31 ? 4 : 0);
+	}
+
+	dev->eq_table.arm_mask = 0;
+
+	intr = (dev->mthca_flags & MTHCA_FLAG_MSI) ?
+		128 : dev->eq_table.inta_pin;
+
+	err = mthca_create_eq(dev, dev->limits.num_cqs,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_COMP]);
+	if (err)
+		goto err_out_unmap;
+
+	err = mthca_create_eq(dev, MTHCA_NUM_ASYNC_EQE,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 129 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+	if (err)
+		goto err_out_comp;
+
+	err = mthca_create_eq(dev, MTHCA_NUM_CMD_EQE,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 130 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_CMD]);
+	if (err)
+		goto err_out_async;
+
+	if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+		static const char *eq_name[] = {
+			[MTHCA_EQ_COMP]  = DRV_NAME " (comp)",
+			[MTHCA_EQ_ASYNC] = DRV_NAME " (async)",
+			[MTHCA_EQ_CMD]   = DRV_NAME " (cmd)"
+		};
+
+		for (i = 0; i < MTHCA_NUM_EQ; ++i) {
+			err = request_irq(dev->eq_table.eq[i].msi_x_vector,
+					  dev->hca_type == ARBEL_NATIVE ?
+					  mthca_arbel_msi_x_interrupt :
+					  mthca_tavor_msi_x_interrupt,
+					  0, eq_name[i], dev->eq_table.eq + i);
+			if (err)
+				goto err_out_cmd;
+			dev->eq_table.eq[i].have_irq = 1;
+		}
+	} else {
+		err = request_irq(dev->pdev->irq,
+				  dev->hca_type == ARBEL_NATIVE ?
+				  mthca_arbel_interrupt :
+				  mthca_tavor_interrupt,
+				  SA_SHIRQ, DRV_NAME, dev);
+		if (err)
+			goto err_out_cmd;
+		dev->eq_table.have_irq = 1;
+	}
+
+	err = mthca_MAP_EQ(dev, async_mask(dev),
+			   0, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, &status);
+	if (err)
+		mthca_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+			   dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, err);
+	if (status)
+		mthca_warn(dev, "MAP_EQ for async EQ %d returned status 0x%02x\n",
+			   dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, status);
+
+	err = mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+			   0, dev->eq_table.eq[MTHCA_EQ_CMD].eqn, &status);
+	if (err)
+		mthca_warn(dev, "MAP_EQ for cmd EQ %d failed (%d)\n",
+			   dev->eq_table.eq[MTHCA_EQ_CMD].eqn, err);
+	if (status)
+		mthca_warn(dev, "MAP_EQ for cmd EQ %d returned status 0x%02x\n",
+			   dev->eq_table.eq[MTHCA_EQ_CMD].eqn, status);
+
+	for (i = 0; i < MTHCA_EQ_CMD; ++i)
+		if (dev->hca_type == ARBEL_NATIVE)
+			arbel_eq_req_not(dev, dev->eq_table.eq[i].eqn_mask);
+		else
+			tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+
+	return 0;
+
+err_out_cmd:
+	mthca_free_irqs(dev);
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_CMD]);
+
+err_out_async:
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+
+err_out_comp:
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_COMP]);
+
+err_out_unmap:
+	mthca_unmap_eq_regs(dev);
+
+err_out_free:
+	mthca_alloc_cleanup(&dev->eq_table.alloc);
+	return err;
+}
+
+void __devexit mthca_cleanup_eq_table(struct mthca_dev *dev)
+{
+	u8 status;
+	int i;
+
+	mthca_free_irqs(dev);
+
+	mthca_MAP_EQ(dev, async_mask(dev),
+		     1, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, &status);
+	mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+		     1, dev->eq_table.eq[MTHCA_EQ_CMD].eqn, &status);
+
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		mthca_free_eq(dev, &dev->eq_table.eq[i]);
+
+	mthca_unmap_eq_regs(dev);
+
+	mthca_alloc_cleanup(&dev->eq_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
new file mode 100644
index 000000000000..7df223642015
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_mad.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <ib_verbs.h>
+#include <ib_mad.h>
+#include <ib_smi.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+	MTHCA_VENDOR_CLASS1 = 0x9,
+	MTHCA_VENDOR_CLASS2 = 0xa
+};
+
+struct mthca_trap_mad {
+	struct ib_mad *mad;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+static void update_sm_ah(struct mthca_dev *dev,
+			 u8 port_num, u16 lid, u8 sl)
+{
+	struct ib_ah *new_ah;
+	struct ib_ah_attr ah_attr;
+	unsigned long flags;
+
+	if (!dev->send_agent[port_num - 1][0])
+		return;
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid     = lid;
+	ah_attr.sl       = sl;
+	ah_attr.port_num = port_num;
+
+	new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+			      &ah_attr);
+	if (IS_ERR(new_ah))
+		return;
+
+	spin_lock_irqsave(&dev->sm_lock, flags);
+	if (dev->sm_ah[port_num - 1])
+		ib_destroy_ah(dev->sm_ah[port_num - 1]);
+	dev->sm_ah[port_num - 1] = new_ah;
+	spin_unlock_irqrestore(&dev->sm_lock, flags);
+}
+
+/*
+ * Snoop SM MADs for port info and P_Key table sets, so we can
+ * synthesize LID change and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev,
+		      u8 port_num,
+		      struct ib_mad *mad)
+{
+	struct ib_event event;
+
+	if ((mad->mad_hdr.mgmt_class  == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     mad->mad_hdr.mgmt_class  == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    mad->mad_hdr.method     == IB_MGMT_METHOD_SET) {
+		if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
+			update_sm_ah(to_mdev(ibdev), port_num,
+				     be16_to_cpup((__be16 *) (mad->data + 58)),
+				     (*(u8 *) (mad->data + 76)) & 0xf);
+
+			event.device           = ibdev;
+			event.event            = IB_EVENT_LID_CHANGE;
+			event.element.port_num = port_num;
+			ib_dispatch_event(&event);
+		}
+
+		if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+			event.device           = ibdev;
+			event.event            = IB_EVENT_PKEY_CHANGE;
+			event.element.port_num = port_num;
+			ib_dispatch_event(&event);
+		}
+	}
+}
+
+static void forward_trap(struct mthca_dev *dev,
+			 u8 port_num,
+			 struct ib_mad *mad)
+{
+	int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	struct mthca_trap_mad *tmad;
+	struct ib_sge      gather_list;
+	struct ib_send_wr *bad_wr, wr = {
+		.opcode      = IB_WR_SEND,
+		.sg_list     = &gather_list,
+		.num_sge     = 1,
+		.send_flags  = IB_SEND_SIGNALED,
+		.wr	     = {
+			 .ud = {
+				 .remote_qpn  = qpn,
+				 .remote_qkey = qpn ? IB_QP1_QKEY : 0,
+				 .timeout_ms  = 0
+			 }
+		 }
+	};
+	struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+	int ret;
+	unsigned long flags;
+
+	if (agent) {
+		tmad = kmalloc(sizeof *tmad, GFP_KERNEL);
+		if (!tmad)
+			return;
+
+		tmad->mad = kmalloc(sizeof *tmad->mad, GFP_KERNEL);
+		if (!tmad->mad) {
+			kfree(tmad);
+			return;
+		}
+
+		memcpy(tmad->mad, mad, sizeof *mad);
+
+		wr.wr.ud.mad_hdr = &tmad->mad->mad_hdr;
+		wr.wr_id         = (unsigned long) tmad;
+
+		gather_list.addr   = dma_map_single(agent->device->dma_device,
+						    tmad->mad,
+						    sizeof *tmad->mad,
+						    DMA_TO_DEVICE);
+		gather_list.length = sizeof *tmad->mad;
+		gather_list.lkey   = to_mpd(agent->qp->pd)->ntmr.ibmr.lkey;
+		pci_unmap_addr_set(tmad, mapping, gather_list.addr);
+
+		/*
+		 * We rely here on the fact that MLX QPs don't use the
+		 * address handle after the send is posted (this is
+		 * wrong following the IB spec strictly, but we know
+		 * it's OK for our devices).
+		 */
+		spin_lock_irqsave(&dev->sm_lock, flags);
+		wr.wr.ud.ah      = dev->sm_ah[port_num - 1];
+		if (wr.wr.ud.ah)
+			ret = ib_post_send_mad(agent, &wr, &bad_wr);
+		else
+			ret = -EINVAL;
+		spin_unlock_irqrestore(&dev->sm_lock, flags);
+
+		if (ret) {
+			dma_unmap_single(agent->device->dma_device,
+					 pci_unmap_addr(tmad, mapping),
+					 sizeof *tmad->mad,
+					 DMA_TO_DEVICE);
+			kfree(tmad->mad);
+			kfree(tmad);
+		}
+	}
+}
+
+int mthca_process_mad(struct ib_device *ibdev,
+		      int mad_flags,
+		      u8 port_num,
+		      struct ib_wc *in_wc,
+		      struct ib_grh *in_grh,
+		      struct ib_mad *in_mad,
+		      struct ib_mad *out_mad)
+{
+	int err;
+	u8 status;
+	u16 slid = in_wc ? in_wc->slid : IB_LID_PERMISSIVE;
+
+	/* Forward locally generated traps to the SM */
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP &&
+	    slid == 0) {
+		forward_trap(to_mdev(ibdev), port_num, in_mad);
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+	}
+
+	/*
+	 * Only handle SM gets, sets and trap represses for SM class
+	 *
+	 * Only handle PMA and Mellanox vendor-specific class gets and
+	 * sets for other classes.
+	 */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
+			return IB_MAD_RESULT_SUCCESS;
+
+		/*
+		 * Don't process SMInfo queries or vendor-specific
+		 * MADs -- the SMA can't handle them.
+		 */
+		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+		    ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) ==
+		     IB_SMP_ATTR_VENDOR_MASK))
+			return IB_MAD_RESULT_SUCCESS;
+	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+		   in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1     ||
+		   in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) {
+		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
+			return IB_MAD_RESULT_SUCCESS;
+	} else
+		return IB_MAD_RESULT_SUCCESS;
+
+	err = mthca_MAD_IFC(to_mdev(ibdev),
+			    mad_flags & IB_MAD_IGNORE_MKEY,
+			    mad_flags & IB_MAD_IGNORE_BKEY,
+			    port_num, in_wc, in_grh, in_mad, out_mad,
+			    &status);
+	if (err) {
+		mthca_err(to_mdev(ibdev), "MAD_IFC failed\n");
+		return IB_MAD_RESULT_FAILURE;
+	}
+	if (status == MTHCA_CMD_STAT_BAD_PKT)
+		return IB_MAD_RESULT_SUCCESS;
+	if (status) {
+		mthca_err(to_mdev(ibdev), "MAD_IFC returned status %02x\n",
+			  status);
+		return IB_MAD_RESULT_FAILURE;
+	}
+
+	if (!out_mad->mad_hdr.status)
+		smp_snoop(ibdev, port_num, in_mad);
+
+	/* set return bit in status of directed route responses */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+		/* no response for trap repress */
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	struct mthca_trap_mad *tmad =
+		(void *) (unsigned long) mad_send_wc->wr_id;
+
+	dma_unmap_single(agent->device->dma_device,
+			 pci_unmap_addr(tmad, mapping),
+			 sizeof *tmad->mad,
+			 DMA_TO_DEVICE);
+	kfree(tmad->mad);
+	kfree(tmad);
+}
+
+int mthca_create_agents(struct mthca_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	int p, q;
+
+	spin_lock_init(&dev->sm_lock);
+
+	for (p = 0; p < dev->limits.num_ports; ++p)
+		for (q = 0; q <= 1; ++q) {
+			agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+						      q ? IB_QPT_GSI : IB_QPT_SMI,
+						      NULL, 0, send_handler,
+						      NULL, NULL);
+			if (IS_ERR(agent))
+				goto err;
+			dev->send_agent[p][q] = agent;
+		}
+
+	return 0;
+
+err:
+	for (p = 0; p < dev->limits.num_ports; ++p)
+		for (q = 0; q <= 1; ++q)
+			if (dev->send_agent[p][q])
+				ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+	return PTR_ERR(agent);
+}
+
+void mthca_free_agents(struct mthca_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	int p, q;
+
+	for (p = 0; p < dev->limits.num_ports; ++p) {
+		for (q = 0; q <= 1; ++q) {
+			agent = dev->send_agent[p][q];
+			dev->send_agent[p][q] = NULL;
+			ib_unregister_mad_agent(agent);
+		}
+
+		if (dev->sm_ah[p])
+			ib_destroy_ah(dev->sm_ah[p]);
+	}
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
new file mode 100644
index 000000000000..9e782bc1c38d
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -0,0 +1,1123 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_main.c 1396 2004-12-28 04:10:27Z roland $
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_profile.h"
+#include "mthca_memfree.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x = 0;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+static int msi = 0;
+module_param(msi, int, 0444);
+MODULE_PARM_DESC(msi, "attempt to use MSI if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+#define msi_x (0)
+#define msi   (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+static const char mthca_version[] __devinitdata =
+	"ib_mthca: Mellanox InfiniBand HCA driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static struct mthca_profile default_profile = {
+	.num_qp     = 1 << 16,
+	.rdb_per_qp = 4,
+	.num_cq     = 1 << 16,
+	.num_mcg    = 1 << 13,
+	.num_mpt    = 1 << 17,
+	.num_mtt    = 1 << 20,
+	.num_udav   = 1 << 15,	/* Tavor only */
+	.uarc_size  = 1 << 18,	/* Arbel only */
+};
+
+static int __devinit mthca_tune_pci(struct mthca_dev *mdev)
+{
+	int cap;
+	u16 val;
+
+	/* First try to max out Read Byte Count */
+	cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX);
+	if (cap) {
+		if (pci_read_config_word(mdev->pdev, cap + PCI_X_CMD, &val)) {
+			mthca_err(mdev, "Couldn't read PCI-X command register, "
+				  "aborting.\n");
+			return -ENODEV;
+		}
+		val = (val & ~PCI_X_CMD_MAX_READ) | (3 << 2);
+		if (pci_write_config_word(mdev->pdev, cap + PCI_X_CMD, val)) {
+			mthca_err(mdev, "Couldn't write PCI-X command register, "
+				  "aborting.\n");
+			return -ENODEV;
+		}
+	} else if (mdev->hca_type == TAVOR)
+		mthca_info(mdev, "No PCI-X capability, not setting RBC.\n");
+
+	cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP);
+	if (cap) {
+		if (pci_read_config_word(mdev->pdev, cap + PCI_EXP_DEVCTL, &val)) {
+			mthca_err(mdev, "Couldn't read PCI Express device control "
+				  "register, aborting.\n");
+			return -ENODEV;
+		}
+		val = (val & ~PCI_EXP_DEVCTL_READRQ) | (5 << 12);
+		if (pci_write_config_word(mdev->pdev, cap + PCI_EXP_DEVCTL, val)) {
+			mthca_err(mdev, "Couldn't write PCI Express device control "
+				  "register, aborting.\n");
+			return -ENODEV;
+		}
+	} else if (mdev->hca_type == ARBEL_NATIVE ||
+		   mdev->hca_type == ARBEL_COMPAT)
+		mthca_info(mdev, "No PCI Express capability, "
+			   "not setting Max Read Request Size.\n");
+
+	return 0;
+}
+
+static int __devinit mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim)
+{
+	int err;
+	u8 status;
+
+	err = mthca_QUERY_DEV_LIM(mdev, dev_lim, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+		return err;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_DEV_LIM returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+	if (dev_lim->min_page_sz > PAGE_SIZE) {
+		mthca_err(mdev, "HCA minimum page size of %d bigger than "
+			  "kernel PAGE_SIZE of %ld, aborting.\n",
+			  dev_lim->min_page_sz, PAGE_SIZE);
+		return -ENODEV;
+	}
+	if (dev_lim->num_ports > MTHCA_MAX_PORTS) {
+		mthca_err(mdev, "HCA has %d ports, but we only support %d, "
+			  "aborting.\n",
+			  dev_lim->num_ports, MTHCA_MAX_PORTS);
+		return -ENODEV;
+	}
+
+	mdev->limits.num_ports      	= dev_lim->num_ports;
+	mdev->limits.vl_cap             = dev_lim->max_vl;
+	mdev->limits.mtu_cap            = dev_lim->max_mtu;
+	mdev->limits.gid_table_len  	= dev_lim->max_gids;
+	mdev->limits.pkey_table_len 	= dev_lim->max_pkeys;
+	mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay;
+	mdev->limits.max_sg             = dev_lim->max_sg;
+	mdev->limits.reserved_qps       = dev_lim->reserved_qps;
+	mdev->limits.reserved_srqs      = dev_lim->reserved_srqs;
+	mdev->limits.reserved_eecs      = dev_lim->reserved_eecs;
+	mdev->limits.reserved_cqs       = dev_lim->reserved_cqs;
+	mdev->limits.reserved_eqs       = dev_lim->reserved_eqs;
+	mdev->limits.reserved_mtts      = dev_lim->reserved_mtts;
+	mdev->limits.reserved_mrws      = dev_lim->reserved_mrws;
+	mdev->limits.reserved_uars      = dev_lim->reserved_uars;
+	mdev->limits.reserved_pds       = dev_lim->reserved_pds;
+
+	/* IB_DEVICE_RESIZE_MAX_WR not supported by driver.
+	   May be doable since hardware supports it for SRQ.
+
+	   IB_DEVICE_N_NOTIFY_CQ is supported by hardware but not by driver.
+
+	   IB_DEVICE_SRQ_RESIZE is supported by hardware but SRQ is not
+	   supported by driver. */
+	mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+		IB_DEVICE_PORT_ACTIVE_EVENT |
+		IB_DEVICE_SYS_IMAGE_GUID |
+		IB_DEVICE_RC_RNR_NAK_GEN;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR)
+		mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR)
+		mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI)
+		mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG)
+		mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE)
+		mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_SRQ)
+		mdev->mthca_flags |= MTHCA_FLAG_SRQ;
+
+	return 0;
+}
+
+static int __devinit mthca_init_tavor(struct mthca_dev *mdev)
+{
+	u8 status;
+	int err;
+	struct mthca_dev_lim        dev_lim;
+	struct mthca_profile        profile;
+	struct mthca_init_hca_param init_hca;
+	struct mthca_adapter        adapter;
+
+	err = mthca_SYS_EN(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "SYS_EN command failed, aborting.\n");
+		return err;
+	}
+	if (status) {
+		mthca_err(mdev, "SYS_EN returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+
+	err = mthca_QUERY_FW(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_FW command failed, aborting.\n");
+		goto err_disable;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_FW returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_disable;
+	}
+	err = mthca_QUERY_DDR(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_DDR command failed, aborting.\n");
+		goto err_disable;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_DDR returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_disable;
+	}
+
+	err = mthca_dev_lim(mdev, &dev_lim);
+
+	profile = default_profile;
+	profile.num_uar   = dev_lim.uar_size / PAGE_SIZE;
+	profile.uarc_size = 0;
+
+	err = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+	if (err < 0)
+		goto err_disable;
+
+	err = mthca_INIT_HCA(mdev, &init_hca, &status);
+	if (err) {
+		mthca_err(mdev, "INIT_HCA command failed, aborting.\n");
+		goto err_disable;
+	}
+	if (status) {
+		mthca_err(mdev, "INIT_HCA returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_disable;
+	}
+
+	err = mthca_QUERY_ADAPTER(mdev, &adapter, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_ADAPTER command failed, aborting.\n");
+		goto err_close;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_ADAPTER returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_close;
+	}
+
+	mdev->eq_table.inta_pin = adapter.inta_pin;
+	mdev->rev_id            = adapter.revision_id;
+
+	return 0;
+
+err_close:
+	mthca_CLOSE_HCA(mdev, 0, &status);
+
+err_disable:
+	mthca_SYS_DIS(mdev, &status);
+
+	return err;
+}
+
+static int __devinit mthca_load_fw(struct mthca_dev *mdev)
+{
+	u8 status;
+	int err;
+
+	/* FIXME: use HCA-attached memory for FW if present */
+
+	mdev->fw.arbel.fw_icm =
+		mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages,
+				GFP_HIGHUSER | __GFP_NOWARN);
+	if (!mdev->fw.arbel.fw_icm) {
+		mthca_err(mdev, "Couldn't allocate FW area, aborting.\n");
+		return -ENOMEM;
+	}
+
+	err = mthca_MAP_FA(mdev, mdev->fw.arbel.fw_icm, &status);
+	if (err) {
+		mthca_err(mdev, "MAP_FA command failed, aborting.\n");
+		goto err_free;
+	}
+	if (status) {
+		mthca_err(mdev, "MAP_FA returned status 0x%02x, aborting.\n", status);
+		err = -EINVAL;
+		goto err_free;
+	}
+	err = mthca_RUN_FW(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "RUN_FW command failed, aborting.\n");
+		goto err_unmap_fa;
+	}
+	if (status) {
+		mthca_err(mdev, "RUN_FW returned status 0x%02x, aborting.\n", status);
+		err = -EINVAL;
+		goto err_unmap_fa;
+	}
+
+	return 0;
+
+err_unmap_fa:
+	mthca_UNMAP_FA(mdev, &status);
+
+err_free:
+	mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+	return err;
+}
+
+static int __devinit mthca_init_icm(struct mthca_dev *mdev,
+				    struct mthca_dev_lim *dev_lim,
+				    struct mthca_init_hca_param *init_hca,
+				    u64 icm_size)
+{
+	u64 aux_pages;
+	u8 status;
+	int err;
+
+	err = mthca_SET_ICM_SIZE(mdev, icm_size, &aux_pages, &status);
+	if (err) {
+		mthca_err(mdev, "SET_ICM_SIZE command failed, aborting.\n");
+		return err;
+	}
+	if (status) {
+		mthca_err(mdev, "SET_ICM_SIZE returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+
+	mthca_dbg(mdev, "%lld KB of HCA context requires %lld KB aux memory.\n",
+		  (unsigned long long) icm_size >> 10,
+		  (unsigned long long) aux_pages << 2);
+
+	mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages,
+						 GFP_HIGHUSER | __GFP_NOWARN);
+	if (!mdev->fw.arbel.aux_icm) {
+		mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n");
+		return -ENOMEM;
+	}
+
+	err = mthca_MAP_ICM_AUX(mdev, mdev->fw.arbel.aux_icm, &status);
+	if (err) {
+		mthca_err(mdev, "MAP_ICM_AUX command failed, aborting.\n");
+		goto err_free_aux;
+	}
+	if (status) {
+		mthca_err(mdev, "MAP_ICM_AUX returned status 0x%02x, aborting.\n", status);
+		err = -EINVAL;
+		goto err_free_aux;
+	}
+
+	err = mthca_map_eq_icm(mdev, init_hca->eqc_base);
+	if (err) {
+		mthca_err(mdev, "Failed to map EQ context memory, aborting.\n");
+		goto err_unmap_aux;
+	}
+
+	mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base,
+							 init_hca->mtt_seg_sz,
+							 mdev->limits.num_mtt_segs,
+							 mdev->limits.reserved_mtts, 1);
+	if (!mdev->mr_table.mtt_table) {
+		mthca_err(mdev, "Failed to map MTT context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_eq;
+	}
+
+	mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base,
+							 dev_lim->mpt_entry_sz,
+							 mdev->limits.num_mpts,
+							 mdev->limits.reserved_mrws, 1);
+	if (!mdev->mr_table.mpt_table) {
+		mthca_err(mdev, "Failed to map MPT context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_mtt;
+	}
+
+	mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base,
+							dev_lim->qpc_entry_sz,
+							mdev->limits.num_qps,
+							mdev->limits.reserved_qps, 0);
+	if (!mdev->qp_table.qp_table) {
+		mthca_err(mdev, "Failed to map QP context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_mpt;
+	}
+
+	mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base,
+							 dev_lim->eqpc_entry_sz,
+							 mdev->limits.num_qps,
+							 mdev->limits.reserved_qps, 0);
+	if (!mdev->qp_table.eqp_table) {
+		mthca_err(mdev, "Failed to map EQP context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_qp;
+	}
+
+	mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base,
+						     dev_lim->cqc_entry_sz,
+						     mdev->limits.num_cqs,
+						     mdev->limits.reserved_cqs, 0);
+	if (!mdev->cq_table.table) {
+		mthca_err(mdev, "Failed to map CQ context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_eqp;
+	}
+
+	/*
+	 * It's not strictly required, but for simplicity just map the
+	 * whole multicast group table now.  The table isn't very big
+	 * and it's a lot easier than trying to track ref counts.
+	 */
+	mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base,
+						      MTHCA_MGM_ENTRY_SIZE,
+						      mdev->limits.num_mgms +
+						      mdev->limits.num_amgms,
+						      mdev->limits.num_mgms +
+						      mdev->limits.num_amgms,
+						      0);
+	if (!mdev->mcg_table.table) {
+		mthca_err(mdev, "Failed to map MCG context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_cq;
+	}
+
+	return 0;
+
+err_unmap_cq:
+	mthca_free_icm_table(mdev, mdev->cq_table.table);
+
+err_unmap_eqp:
+	mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+
+err_unmap_qp:
+	mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+
+err_unmap_mpt:
+	mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+
+err_unmap_mtt:
+	mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+
+err_unmap_eq:
+	mthca_unmap_eq_icm(mdev);
+
+err_unmap_aux:
+	mthca_UNMAP_ICM_AUX(mdev, &status);
+
+err_free_aux:
+	mthca_free_icm(mdev, mdev->fw.arbel.aux_icm);
+
+	return err;
+}
+
+static int __devinit mthca_init_arbel(struct mthca_dev *mdev)
+{
+	struct mthca_dev_lim        dev_lim;
+	struct mthca_profile        profile;
+	struct mthca_init_hca_param init_hca;
+	struct mthca_adapter        adapter;
+	u64 icm_size;
+	u8 status;
+	int err;
+
+	err = mthca_QUERY_FW(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_FW command failed, aborting.\n");
+		return err;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_FW returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+
+	err = mthca_ENABLE_LAM(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "ENABLE_LAM command failed, aborting.\n");
+		return err;
+	}
+	if (status == MTHCA_CMD_STAT_LAM_NOT_PRE) {
+		mthca_dbg(mdev, "No HCA-attached memory (running in MemFree mode)\n");
+		mdev->mthca_flags |= MTHCA_FLAG_NO_LAM;
+	} else if (status) {
+		mthca_err(mdev, "ENABLE_LAM returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+
+	err = mthca_load_fw(mdev);
+	if (err) {
+		mthca_err(mdev, "Failed to start FW, aborting.\n");
+		goto err_disable;
+	}
+
+	err = mthca_dev_lim(mdev, &dev_lim);
+	if (err) {
+		mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+		goto err_stop_fw;
+	}
+
+	profile = default_profile;
+	profile.num_uar  = dev_lim.uar_size / PAGE_SIZE;
+	profile.num_udav = 0;
+
+	icm_size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+	if ((int) icm_size < 0) {
+		err = icm_size;
+		goto err_stop_fw;
+	}
+
+	err = mthca_init_icm(mdev, &dev_lim, &init_hca, icm_size);
+	if (err)
+		goto err_stop_fw;
+
+	err = mthca_INIT_HCA(mdev, &init_hca, &status);
+	if (err) {
+		mthca_err(mdev, "INIT_HCA command failed, aborting.\n");
+		goto err_free_icm;
+	}
+	if (status) {
+		mthca_err(mdev, "INIT_HCA returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_free_icm;
+	}
+
+	err = mthca_QUERY_ADAPTER(mdev, &adapter, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_ADAPTER command failed, aborting.\n");
+		goto err_free_icm;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_ADAPTER returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_free_icm;
+	}
+
+	mdev->eq_table.inta_pin = adapter.inta_pin;
+	mdev->rev_id            = adapter.revision_id;
+
+	return 0;
+
+err_free_icm:
+	mthca_free_icm_table(mdev, mdev->cq_table.table);
+	mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+	mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+	mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+	mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+	mthca_unmap_eq_icm(mdev);
+
+	mthca_UNMAP_ICM_AUX(mdev, &status);
+	mthca_free_icm(mdev, mdev->fw.arbel.aux_icm);
+
+err_stop_fw:
+	mthca_UNMAP_FA(mdev, &status);
+	mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+
+err_disable:
+	if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+		mthca_DISABLE_LAM(mdev, &status);
+
+	return err;
+}
+
+static int __devinit mthca_init_hca(struct mthca_dev *mdev)
+{
+	if (mdev->hca_type == ARBEL_NATIVE)
+		return mthca_init_arbel(mdev);
+	else
+		return mthca_init_tavor(mdev);
+}
+
+static int __devinit mthca_setup_hca(struct mthca_dev *dev)
+{
+	int err;
+	u8 status;
+
+	MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock);
+
+	err = mthca_init_uar_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "user access region table, aborting.\n");
+		return err;
+	}
+
+	err = mthca_uar_alloc(dev, &dev->driver_uar);
+	if (err) {
+		mthca_err(dev, "Failed to allocate driver access region, "
+			  "aborting.\n");
+		goto err_uar_table_free;
+	}
+
+	dev->kar = ioremap(dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!dev->kar) {
+		mthca_err(dev, "Couldn't map kernel access region, "
+			  "aborting.\n");
+		err = -ENOMEM;
+		goto err_uar_free;
+	}
+
+	err = mthca_init_pd_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "protection domain table, aborting.\n");
+		goto err_kar_unmap;
+	}
+
+	err = mthca_init_mr_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "memory region table, aborting.\n");
+		goto err_pd_table_free;
+	}
+
+	err = mthca_pd_alloc(dev, &dev->driver_pd);
+	if (err) {
+		mthca_err(dev, "Failed to create driver PD, "
+			  "aborting.\n");
+		goto err_mr_table_free;
+	}
+
+	err = mthca_init_eq_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "event queue table, aborting.\n");
+		goto err_pd_free;
+	}
+
+	err = mthca_cmd_use_events(dev);
+	if (err) {
+		mthca_err(dev, "Failed to switch to event-driven "
+			  "firmware commands, aborting.\n");
+		goto err_eq_table_free;
+	}
+
+	err = mthca_NOP(dev, &status);
+	if (err || status) {
+		mthca_err(dev, "NOP command failed to generate interrupt, aborting.\n");
+		if (dev->mthca_flags & (MTHCA_FLAG_MSI | MTHCA_FLAG_MSI_X))
+			mthca_err(dev, "Try again with MSI/MSI-X disabled.\n");
+		else
+			mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+
+		goto err_cmd_poll;
+	}
+
+	mthca_dbg(dev, "NOP command IRQ test passed\n");
+
+	err = mthca_init_cq_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "completion queue table, aborting.\n");
+		goto err_cmd_poll;
+	}
+
+	err = mthca_init_qp_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "queue pair table, aborting.\n");
+		goto err_cq_table_free;
+	}
+
+	err = mthca_init_av_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "address vector table, aborting.\n");
+		goto err_qp_table_free;
+	}
+
+	err = mthca_init_mcg_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "multicast group table, aborting.\n");
+		goto err_av_table_free;
+	}
+
+	return 0;
+
+err_av_table_free:
+	mthca_cleanup_av_table(dev);
+
+err_qp_table_free:
+	mthca_cleanup_qp_table(dev);
+
+err_cq_table_free:
+	mthca_cleanup_cq_table(dev);
+
+err_cmd_poll:
+	mthca_cmd_use_polling(dev);
+
+err_eq_table_free:
+	mthca_cleanup_eq_table(dev);
+
+err_pd_free:
+	mthca_pd_free(dev, &dev->driver_pd);
+
+err_mr_table_free:
+	mthca_cleanup_mr_table(dev);
+
+err_pd_table_free:
+	mthca_cleanup_pd_table(dev);
+
+err_kar_unmap:
+	iounmap(dev->kar);
+
+err_uar_free:
+	mthca_uar_free(dev, &dev->driver_uar);
+
+err_uar_table_free:
+	mthca_cleanup_uar_table(dev);
+	return err;
+}
+
+static int __devinit mthca_request_regions(struct pci_dev *pdev,
+					   int ddr_hidden)
+{
+	int err;
+
+	/*
+	 * We can't just use pci_request_regions() because the MSI-X
+	 * table is right in the middle of the first BAR.  If we did
+	 * pci_request_region and grab all of the first BAR, then
+	 * setting up MSI-X would fail, since the PCI core wants to do
+	 * request_mem_region on the MSI-X vector table.
+	 *
+	 * So just request what we need right now, and request any
+	 * other regions we need when setting up EQs.
+	 */
+	if (!request_mem_region(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE,
+				MTHCA_HCR_SIZE, DRV_NAME))
+		return -EBUSY;
+
+	err = pci_request_region(pdev, 2, DRV_NAME);
+	if (err)
+		goto err_bar2_failed;
+
+	if (!ddr_hidden) {
+		err = pci_request_region(pdev, 4, DRV_NAME);
+		if (err)
+			goto err_bar4_failed;
+	}
+
+	return 0;
+
+err_bar4_failed:
+	pci_release_region(pdev, 2);
+
+err_bar2_failed:
+	release_mem_region(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE,
+			   MTHCA_HCR_SIZE);
+
+	return err;
+}
+
+static void mthca_release_regions(struct pci_dev *pdev,
+				  int ddr_hidden)
+{
+	if (!ddr_hidden)
+		pci_release_region(pdev, 4);
+
+	pci_release_region(pdev, 2);
+
+	release_mem_region(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE,
+			   MTHCA_HCR_SIZE);
+}
+
+static int __devinit mthca_enable_msi_x(struct mthca_dev *mdev)
+{
+	struct msix_entry entries[3];
+	int err;
+
+	entries[0].entry = 0;
+	entries[1].entry = 1;
+	entries[2].entry = 2;
+
+	err = pci_enable_msix(mdev->pdev, entries, ARRAY_SIZE(entries));
+	if (err) {
+		if (err > 0)
+			mthca_info(mdev, "Only %d MSI-X vectors available, "
+				   "not using MSI-X\n", err);
+		return err;
+	}
+
+	mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector;
+	mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector;
+	mdev->eq_table.eq[MTHCA_EQ_CMD  ].msi_x_vector = entries[2].vector;
+
+	return 0;
+}
+
+static void mthca_close_hca(struct mthca_dev *mdev)
+{
+	u8 status;
+
+	mthca_CLOSE_HCA(mdev, 0, &status);
+
+	if (mdev->hca_type == ARBEL_NATIVE) {
+		mthca_free_icm_table(mdev, mdev->cq_table.table);
+		mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+		mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+		mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+		mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+		mthca_unmap_eq_icm(mdev);
+
+		mthca_UNMAP_ICM_AUX(mdev, &status);
+		mthca_free_icm(mdev, mdev->fw.arbel.aux_icm);
+
+		mthca_UNMAP_FA(mdev, &status);
+		mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+
+		if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+			mthca_DISABLE_LAM(mdev, &status);
+	} else
+		mthca_SYS_DIS(mdev, &status);
+}
+
+static int __devinit mthca_init_one(struct pci_dev *pdev,
+				    const struct pci_device_id *id)
+{
+	static int mthca_version_printed = 0;
+	static int mthca_memfree_warned = 0;
+	int ddr_hidden = 0;
+	int err;
+	struct mthca_dev *mdev;
+
+	if (!mthca_version_printed) {
+		printk(KERN_INFO "%s", mthca_version);
+		++mthca_version_printed;
+	}
+
+	printk(KERN_INFO PFX "Initializing %s (%s)\n",
+	       pci_pretty_name(pdev), pci_name(pdev));
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device, "
+			"aborting.\n");
+		return err;
+	}
+
+	/*
+	 * Check for BARs.  We expect 0: 1MB, 2: 8MB, 4: DDR (may not
+	 * be present)
+	 */
+	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+	    pci_resource_len(pdev, 0) != 1 << 20) {
+		dev_err(&pdev->dev, "Missing DCS, aborting.");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM) ||
+	    pci_resource_len(pdev, 2) != 1 << 23) {
+		dev_err(&pdev->dev, "Missing UAR, aborting.");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM))
+		ddr_hidden = 1;
+
+	err = mthca_request_regions(pdev, ddr_hidden);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot obtain PCI resources, "
+			"aborting.\n");
+		goto err_disable_pdev;
+	}
+
+	pci_set_master(pdev);
+
+	err = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+		err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
+			goto err_free_res;
+		}
+	}
+	err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit "
+			 "consistent PCI DMA mask.\n");
+		err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
+		if (err) {
+			dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, "
+				"aborting.\n");
+			goto err_free_res;
+		}
+	}
+
+	mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev);
+	if (!mdev) {
+		dev_err(&pdev->dev, "Device struct alloc failed, "
+			"aborting.\n");
+		err = -ENOMEM;
+		goto err_free_res;
+	}
+
+	mdev->pdev     = pdev;
+	mdev->hca_type = id->driver_data;
+
+	if (mdev->hca_type == ARBEL_NATIVE && !mthca_memfree_warned++)
+		mthca_warn(mdev, "Warning: native MT25208 mode support is incomplete.  "
+			   "Your HCA may not work properly.\n");
+
+	if (ddr_hidden)
+		mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;
+
+	/*
+	 * Now reset the HCA before we touch the PCI capabilities or
+	 * attempt a firmware command, since a boot ROM may have left
+	 * the HCA in an undefined state.
+	 */
+	err = mthca_reset(mdev);
+	if (err) {
+		mthca_err(mdev, "Failed to reset HCA, aborting.\n");
+		goto err_free_dev;
+	}
+
+	if (msi_x && !mthca_enable_msi_x(mdev))
+		mdev->mthca_flags |= MTHCA_FLAG_MSI_X;
+	if (msi && !(mdev->mthca_flags & MTHCA_FLAG_MSI_X) &&
+	    !pci_enable_msi(pdev))
+		mdev->mthca_flags |= MTHCA_FLAG_MSI;
+
+	sema_init(&mdev->cmd.hcr_sem, 1);
+	sema_init(&mdev->cmd.poll_sem, 1);
+	mdev->cmd.use_events = 0;
+
+	mdev->hcr = ioremap(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE, MTHCA_HCR_SIZE);
+	if (!mdev->hcr) {
+		mthca_err(mdev, "Couldn't map command register, "
+			  "aborting.\n");
+		err = -ENOMEM;
+		goto err_free_dev;
+	}
+
+	err = mthca_tune_pci(mdev);
+	if (err)
+		goto err_iounmap;
+
+	err = mthca_init_hca(mdev);
+	if (err)
+		goto err_iounmap;
+
+	err = mthca_setup_hca(mdev);
+	if (err)
+		goto err_close;
+
+	err = mthca_register_device(mdev);
+	if (err)
+		goto err_cleanup;
+
+	err = mthca_create_agents(mdev);
+	if (err)
+		goto err_unregister;
+
+	pci_set_drvdata(pdev, mdev);
+
+	return 0;
+
+err_unregister:
+	mthca_unregister_device(mdev);
+
+err_cleanup:
+	mthca_cleanup_mcg_table(mdev);
+	mthca_cleanup_av_table(mdev);
+	mthca_cleanup_qp_table(mdev);
+	mthca_cleanup_cq_table(mdev);
+	mthca_cmd_use_polling(mdev);
+	mthca_cleanup_eq_table(mdev);
+
+	mthca_pd_free(mdev, &mdev->driver_pd);
+
+	mthca_cleanup_mr_table(mdev);
+	mthca_cleanup_pd_table(mdev);
+	mthca_cleanup_uar_table(mdev);
+
+err_close:
+	mthca_close_hca(mdev);
+
+err_iounmap:
+	iounmap(mdev->hcr);
+
+err_free_dev:
+	if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+		pci_disable_msix(pdev);
+	if (mdev->mthca_flags & MTHCA_FLAG_MSI)
+		pci_disable_msi(pdev);
+
+	ib_dealloc_device(&mdev->ib_dev);
+
+err_free_res:
+	mthca_release_regions(pdev, ddr_hidden);
+
+err_disable_pdev:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return err;
+}
+
+static void __devexit mthca_remove_one(struct pci_dev *pdev)
+{
+	struct mthca_dev *mdev = pci_get_drvdata(pdev);
+	u8 status;
+	int p;
+
+	if (mdev) {
+		mthca_free_agents(mdev);
+		mthca_unregister_device(mdev);
+
+		for (p = 1; p <= mdev->limits.num_ports; ++p)
+			mthca_CLOSE_IB(mdev, p, &status);
+
+		mthca_cleanup_mcg_table(mdev);
+		mthca_cleanup_av_table(mdev);
+		mthca_cleanup_qp_table(mdev);
+		mthca_cleanup_cq_table(mdev);
+		mthca_cmd_use_polling(mdev);
+		mthca_cleanup_eq_table(mdev);
+
+		mthca_pd_free(mdev, &mdev->driver_pd);
+
+		mthca_cleanup_mr_table(mdev);
+		mthca_cleanup_pd_table(mdev);
+
+		iounmap(mdev->kar);
+		mthca_uar_free(mdev, &mdev->driver_uar);
+		mthca_cleanup_uar_table(mdev);
+
+		mthca_close_hca(mdev);
+
+		iounmap(mdev->hcr);
+
+		if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+			pci_disable_msix(pdev);
+		if (mdev->mthca_flags & MTHCA_FLAG_MSI)
+			pci_disable_msi(pdev);
+
+		ib_dealloc_device(&mdev->ib_dev);
+		mthca_release_regions(pdev, mdev->mthca_flags &
+				      MTHCA_FLAG_DDR_HIDDEN);
+		pci_disable_device(pdev);
+		pci_set_drvdata(pdev, NULL);
+	}
+}
+
+static struct pci_device_id mthca_pci_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR),
+	  .driver_data = TAVOR },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR),
+	  .driver_data = TAVOR },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+	  .driver_data = ARBEL_COMPAT },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+	  .driver_data = ARBEL_COMPAT },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL),
+	  .driver_data = ARBEL_NATIVE },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL),
+	  .driver_data = ARBEL_NATIVE },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mthca_pci_table);
+
+static struct pci_driver mthca_driver = {
+	.name		= "ib_mthca",
+	.id_table	= mthca_pci_table,
+	.probe		= mthca_init_one,
+	.remove		= __devexit_p(mthca_remove_one)
+};
+
+static int __init mthca_init(void)
+{
+	int ret;
+
+	ret = pci_register_driver(&mthca_driver);
+	return ret < 0 ? ret : 0;
+}
+
+static void __exit mthca_cleanup(void)
+{
+	pci_unregister_driver(&mthca_driver);
+}
+
+module_init(mthca_init);
+module_exit(mthca_cleanup);
diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c
new file mode 100644
index 000000000000..70a6553a588e
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mcg.c
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_mcg.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/init.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+	MTHCA_QP_PER_MGM = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2)
+};
+
+struct mthca_mgm {
+	u32 next_gid_index;
+	u32 reserved[3];
+	u8  gid[16];
+	u32 qp[MTHCA_QP_PER_MGM];
+};
+
+static const u8 zero_gid[16];	/* automatically initialized to 0 */
+
+/*
+ * Caller must hold MCG table semaphore.  gid and mgm parameters must
+ * be properly aligned for command interface.
+ *
+ *  Returns 0 unless a firmware command error occurs.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * if GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_mgm(struct mthca_dev *dev,
+		    u8 *gid, struct mthca_mgm *mgm,
+		    u16 *hash, int *prev, int *index)
+{
+	void *mailbox;
+	u8 *mgid;
+	int err;
+	u8 status;
+
+	mailbox = kmalloc(16 + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+	if (!mailbox)
+		return -ENOMEM;
+	mgid = MAILBOX_ALIGN(mailbox);
+
+	memcpy(mgid, gid, 16);
+
+	err = mthca_MGID_HASH(dev, mgid, hash, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "MGID_HASH returned status %02x\n", status);
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (0)
+		mthca_dbg(dev, "Hash for %04x:%04x:%04x:%04x:"
+			  "%04x:%04x:%04x:%04x is %04x\n",
+			  be16_to_cpu(((u16 *) gid)[0]), be16_to_cpu(((u16 *) gid)[1]),
+			  be16_to_cpu(((u16 *) gid)[2]), be16_to_cpu(((u16 *) gid)[3]),
+			  be16_to_cpu(((u16 *) gid)[4]), be16_to_cpu(((u16 *) gid)[5]),
+			  be16_to_cpu(((u16 *) gid)[6]), be16_to_cpu(((u16 *) gid)[7]),
+			  *hash);
+
+	*index = *hash;
+	*prev  = -1;
+
+	do {
+		err = mthca_READ_MGM(dev, *index, mgm, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "READ_MGM returned status %02x\n", status);
+			return -EINVAL;
+		}
+
+		if (!memcmp(mgm->gid, zero_gid, 16)) {
+			if (*index != *hash) {
+				mthca_err(dev, "Found zero MGID in AMGM.\n");
+				err = -EINVAL;
+			}
+			goto out;
+		}
+
+		if (!memcmp(mgm->gid, gid, 16))
+			goto out;
+
+		*prev = *index;
+		*index = be32_to_cpu(mgm->next_gid_index) >> 5;
+	} while (*index);
+
+	*index = -1;
+
+ out:
+	kfree(mailbox);
+	return err;
+}
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	void *mailbox;
+	struct mthca_mgm *mgm;
+	u16 hash;
+	int index, prev;
+	int link = 0;
+	int i;
+	int err;
+	u8 status;
+
+	mailbox = kmalloc(sizeof *mgm + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+	if (!mailbox)
+		return -ENOMEM;
+	mgm = MAILBOX_ALIGN(mailbox);
+
+	if (down_interruptible(&dev->mcg_table.sem))
+		return -EINTR;
+
+	err = find_mgm(dev, gid->raw, mgm, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index != -1) {
+		if (!memcmp(mgm->gid, zero_gid, 16))
+			memcpy(mgm->gid, gid->raw, 16);
+	} else {
+		link = 1;
+
+		index = mthca_alloc(&dev->mcg_table.alloc);
+		if (index == -1) {
+			mthca_err(dev, "No AMGM entries left\n");
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = mthca_READ_MGM(dev, index, mgm, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "READ_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+
+		memcpy(mgm->gid, gid->raw, 16);
+		mgm->next_gid_index = 0;
+	}
+
+	for (i = 0; i < MTHCA_QP_PER_MGM; ++i)
+		if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) {
+			mgm->qp[i] = cpu_to_be32(ibqp->qp_num | (1 << 31));
+			break;
+		}
+
+	if (i == MTHCA_QP_PER_MGM) {
+		mthca_err(dev, "MGM at index %x is full.\n", index);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = mthca_WRITE_MGM(dev, index, mgm, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+		err = -EINVAL;
+	}
+
+	if (!link)
+		goto out;
+
+	err = mthca_READ_MGM(dev, prev, mgm, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "READ_MGM returned status %02x\n", status);
+		err = -EINVAL;
+		goto out;
+	}
+
+	mgm->next_gid_index = cpu_to_be32(index << 5);
+
+	err = mthca_WRITE_MGM(dev, prev, mgm, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+		err = -EINVAL;
+	}
+
+ out:
+	up(&dev->mcg_table.sem);
+	kfree(mailbox);
+	return err;
+}
+
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	void *mailbox;
+	struct mthca_mgm *mgm;
+	u16 hash;
+	int prev, index;
+	int i, loc;
+	int err;
+	u8 status;
+
+	mailbox = kmalloc(sizeof *mgm + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+	if (!mailbox)
+		return -ENOMEM;
+	mgm = MAILBOX_ALIGN(mailbox);
+
+	if (down_interruptible(&dev->mcg_table.sem))
+		return -EINTR;
+
+	err = find_mgm(dev, gid->raw, mgm, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index == -1) {
+		mthca_err(dev, "MGID %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x "
+			  "not found\n",
+			  be16_to_cpu(((u16 *) gid->raw)[0]),
+			  be16_to_cpu(((u16 *) gid->raw)[1]),
+			  be16_to_cpu(((u16 *) gid->raw)[2]),
+			  be16_to_cpu(((u16 *) gid->raw)[3]),
+			  be16_to_cpu(((u16 *) gid->raw)[4]),
+			  be16_to_cpu(((u16 *) gid->raw)[5]),
+			  be16_to_cpu(((u16 *) gid->raw)[6]),
+			  be16_to_cpu(((u16 *) gid->raw)[7]));
+		err = -EINVAL;
+		goto out;
+	}
+
+	for (loc = -1, i = 0; i < MTHCA_QP_PER_MGM; ++i) {
+		if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31)))
+			loc = i;
+		if (!(mgm->qp[i] & cpu_to_be32(1 << 31)))
+			break;
+	}
+
+	if (loc == -1) {
+		mthca_err(dev, "QP %06x not found in MGM\n", ibqp->qp_num);
+		err = -EINVAL;
+		goto out;
+	}
+
+	mgm->qp[loc]   = mgm->qp[i - 1];
+	mgm->qp[i - 1] = 0;
+
+	err = mthca_WRITE_MGM(dev, index, mgm, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (i != 1)
+		goto out;
+
+	goto out;
+
+	if (prev == -1) {
+		/* Remove entry from MGM */
+		if (be32_to_cpu(mgm->next_gid_index) >> 5) {
+			err = mthca_READ_MGM(dev,
+					     be32_to_cpu(mgm->next_gid_index) >> 5,
+					     mgm, &status);
+			if (err)
+				goto out;
+			if (status) {
+				mthca_err(dev, "READ_MGM returned status %02x\n",
+					  status);
+				err = -EINVAL;
+				goto out;
+			}
+		} else
+			memset(mgm->gid, 0, 16);
+
+		err = mthca_WRITE_MGM(dev, index, mgm, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+	} else {
+		/* Remove entry from AMGM */
+		index = be32_to_cpu(mgm->next_gid_index) >> 5;
+		err = mthca_READ_MGM(dev, prev, mgm, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "READ_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+
+		mgm->next_gid_index = cpu_to_be32(index << 5);
+
+		err = mthca_WRITE_MGM(dev, prev, mgm, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+ out:
+	up(&dev->mcg_table.sem);
+	kfree(mailbox);
+	return err;
+}
+
+int __devinit mthca_init_mcg_table(struct mthca_dev *dev)
+{
+	int err;
+
+	err = mthca_alloc_init(&dev->mcg_table.alloc,
+			       dev->limits.num_amgms,
+			       dev->limits.num_amgms - 1,
+			       0);
+	if (err)
+		return err;
+
+	init_MUTEX(&dev->mcg_table.sem);
+
+	return 0;
+}
+
+void __devexit mthca_cleanup_mcg_table(struct mthca_dev *dev)
+{
+	mthca_alloc_cleanup(&dev->mcg_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
new file mode 100644
index 000000000000..7730b5960616
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#include "mthca_memfree.h"
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk.
+ */
+enum {
+	MTHCA_ICM_ALLOC_SIZE   = 1 << 18,
+	MTHCA_TABLE_CHUNK_SIZE = 1 << 18
+};
+
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm)
+{
+	struct mthca_icm_chunk *chunk, *tmp;
+	int i;
+
+	if (!icm)
+		return;
+
+	list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
+		if (chunk->nsg > 0)
+			pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+				     PCI_DMA_BIDIRECTIONAL);
+
+		for (i = 0; i < chunk->npages; ++i)
+			__free_pages(chunk->mem[i].page,
+				     get_order(chunk->mem[i].length));
+
+		kfree(chunk);
+	}
+
+	kfree(icm);
+}
+
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+				  unsigned int gfp_mask)
+{
+	struct mthca_icm *icm;
+	struct mthca_icm_chunk *chunk = NULL;
+	int cur_order;
+
+	icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+	if (!icm)
+		return icm;
+
+	icm->refcount = 0;
+	INIT_LIST_HEAD(&icm->chunk_list);
+
+	cur_order = get_order(MTHCA_ICM_ALLOC_SIZE);
+
+	while (npages > 0) {
+		if (!chunk) {
+			chunk = kmalloc(sizeof *chunk,
+					gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+			if (!chunk)
+				goto fail;
+
+			chunk->npages = 0;
+			chunk->nsg    = 0;
+			list_add_tail(&chunk->list, &icm->chunk_list);
+		}
+
+		while (1 << cur_order > npages)
+			--cur_order;
+
+		chunk->mem[chunk->npages].page = alloc_pages(gfp_mask, cur_order);
+		if (chunk->mem[chunk->npages].page) {
+			chunk->mem[chunk->npages].length = PAGE_SIZE << cur_order;
+			chunk->mem[chunk->npages].offset = 0;
+
+			if (++chunk->npages == MTHCA_ICM_CHUNK_LEN) {
+				chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+							chunk->npages,
+							PCI_DMA_BIDIRECTIONAL);
+
+				if (chunk->nsg <= 0)
+					goto fail;
+
+				chunk = NULL;
+			}
+
+			npages -= 1 << cur_order;
+		} else {
+			--cur_order;
+			if (cur_order < 0)
+				goto fail;
+		}
+	}
+
+	if (chunk) {
+		chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+					chunk->npages,
+					PCI_DMA_BIDIRECTIONAL);
+
+		if (chunk->nsg <= 0)
+			goto fail;
+	}
+
+	return icm;
+
+fail:
+	mthca_free_icm(dev, icm);
+	return NULL;
+}
+
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+	int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+	int ret = 0;
+	u8 status;
+
+	down(&table->mutex);
+
+	if (table->icm[i]) {
+		++table->icm[i]->refcount;
+		goto out;
+	}
+
+	table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+					(table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+					__GFP_NOWARN);
+	if (!table->icm[i]) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (mthca_MAP_ICM(dev, table->icm[i], table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+			  &status) || status) {
+		mthca_free_icm(dev, table->icm[i]);
+		table->icm[i] = NULL;
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	++table->icm[i]->refcount;
+
+out:
+	up(&table->mutex);
+	return ret;
+}
+
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+	int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+	u8 status;
+
+	down(&table->mutex);
+
+	if (--table->icm[i]->refcount == 0) {
+		mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+				MTHCA_TABLE_CHUNK_SIZE >> 12, &status);
+		mthca_free_icm(dev, table->icm[i]);
+		table->icm[i] = NULL;
+	}
+
+	up(&table->mutex);
+}
+
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+					      u64 virt, int obj_size,
+					      int nobj, int reserved,
+					      int use_lowmem)
+{
+	struct mthca_icm_table *table;
+	int num_icm;
+	int i;
+	u8 status;
+
+	num_icm = obj_size * nobj / MTHCA_TABLE_CHUNK_SIZE;
+
+	table = kmalloc(sizeof *table + num_icm * sizeof *table->icm, GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	table->virt     = virt;
+	table->num_icm  = num_icm;
+	table->num_obj  = nobj;
+	table->obj_size = obj_size;
+	table->lowmem   = use_lowmem;
+	init_MUTEX(&table->mutex);
+
+	for (i = 0; i < num_icm; ++i)
+		table->icm[i] = NULL;
+
+	for (i = 0; i * MTHCA_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
+		table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+						(use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+						__GFP_NOWARN);
+		if (!table->icm[i])
+			goto err;
+		if (mthca_MAP_ICM(dev, table->icm[i], virt + i * MTHCA_TABLE_CHUNK_SIZE,
+				  &status) || status) {
+			mthca_free_icm(dev, table->icm[i]);
+			table->icm[i] = NULL;
+			goto err;
+		}
+
+		/*
+		 * Add a reference to this ICM chunk so that it never
+		 * gets freed (since it contains reserved firmware objects).
+		 */
+		++table->icm[i]->refcount;
+	}
+
+	return table;
+
+err:
+	for (i = 0; i < num_icm; ++i)
+		if (table->icm[i]) {
+			mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE,
+					MTHCA_TABLE_CHUNK_SIZE >> 12, &status);
+			mthca_free_icm(dev, table->icm[i]);
+		}
+
+	kfree(table);
+
+	return NULL;
+}
+
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table)
+{
+	int i;
+	u8 status;
+
+	for (i = 0; i < table->num_icm; ++i)
+		if (table->icm[i]) {
+			mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+					MTHCA_TABLE_CHUNK_SIZE >> 12, &status);
+			mthca_free_icm(dev, table->icm[i]);
+		}
+
+	kfree(table);
+}
+
+static u64 mthca_uarc_virt(struct mthca_dev *dev, int page)
+{
+	return dev->uar_table.uarc_base +
+		dev->driver_uar.index * dev->uar_table.uarc_size +
+		page * 4096;
+}
+
+int mthca_alloc_db(struct mthca_dev *dev, int type, u32 qn, u32 **db)
+{
+	int group;
+	int start, end, dir;
+	int i, j;
+	struct mthca_db_page *page;
+	int ret = 0;
+	u8 status;
+
+	down(&dev->db_tab->mutex);
+
+	switch (type) {
+	case MTHCA_DB_TYPE_CQ_ARM:
+	case MTHCA_DB_TYPE_SQ:
+		group = 0;
+		start = 0;
+		end   = dev->db_tab->max_group1;
+		dir   = 1;
+		break;
+
+	case MTHCA_DB_TYPE_CQ_SET_CI:
+	case MTHCA_DB_TYPE_RQ:
+	case MTHCA_DB_TYPE_SRQ:
+		group = 1;
+		start = dev->db_tab->npages - 1;
+		end   = dev->db_tab->min_group2;
+		dir   = -1;
+		break;
+
+	default:
+		return -1;
+	}
+
+	for (i = start; i != end; i += dir)
+		if (dev->db_tab->page[i].db_rec &&
+		    !bitmap_full(dev->db_tab->page[i].used,
+				 MTHCA_DB_REC_PER_PAGE)) {
+			page = dev->db_tab->page + i;
+			goto found;
+		}
+
+	if (dev->db_tab->max_group1 >= dev->db_tab->min_group2 - 1) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	page = dev->db_tab->page + end;
+	page->db_rec = dma_alloc_coherent(&dev->pdev->dev, 4096,
+					  &page->mapping, GFP_KERNEL);
+	if (!page->db_rec) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	memset(page->db_rec, 0, 4096);
+
+	ret = mthca_MAP_ICM_page(dev, page->mapping, mthca_uarc_virt(dev, i), &status);
+	if (!ret && status)
+		ret = -EINVAL;
+	if (ret) {
+		dma_free_coherent(&dev->pdev->dev, 4096,
+				  page->db_rec, page->mapping);
+		goto out;
+	}
+
+	bitmap_zero(page->used, MTHCA_DB_REC_PER_PAGE);
+	if (group == 0)
+		++dev->db_tab->max_group1;
+	else
+		--dev->db_tab->min_group2;
+
+found:
+	j = find_first_zero_bit(page->used, MTHCA_DB_REC_PER_PAGE);
+	set_bit(j, page->used);
+
+	if (group == 1)
+		j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+
+	ret = i * MTHCA_DB_REC_PER_PAGE + j;
+
+	page->db_rec[j] = cpu_to_be64((qn << 8) | (type << 5));
+
+	*db = (u32 *) &page->db_rec[j];
+
+out:
+	up(&dev->db_tab->mutex);
+
+	return ret;
+}
+
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index)
+{
+	int i, j;
+	struct mthca_db_page *page;
+	u8 status;
+
+	i = db_index / MTHCA_DB_REC_PER_PAGE;
+	j = db_index % MTHCA_DB_REC_PER_PAGE;
+
+	page = dev->db_tab->page + i;
+
+	down(&dev->db_tab->mutex);
+
+	page->db_rec[j] = 0;
+	if (i >= dev->db_tab->min_group2)
+		j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+	clear_bit(j, page->used);
+
+	if (bitmap_empty(page->used, MTHCA_DB_REC_PER_PAGE) &&
+	    i >= dev->db_tab->max_group1 - 1) {
+		mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, i), 1, &status);
+
+		dma_free_coherent(&dev->pdev->dev, 4096,
+				  page->db_rec, page->mapping);
+		page->db_rec = NULL;
+
+		if (i == dev->db_tab->max_group1) {
+			--dev->db_tab->max_group1;
+			/* XXX may be able to unmap more pages now */
+		}
+		if (i == dev->db_tab->min_group2)
+			++dev->db_tab->min_group2;
+	}
+
+	up(&dev->db_tab->mutex);
+}
+
+int mthca_init_db_tab(struct mthca_dev *dev)
+{
+	int i;
+
+	if (dev->hca_type != ARBEL_NATIVE)
+		return 0;
+
+	dev->db_tab = kmalloc(sizeof *dev->db_tab, GFP_KERNEL);
+	if (!dev->db_tab)
+		return -ENOMEM;
+
+	init_MUTEX(&dev->db_tab->mutex);
+
+	dev->db_tab->npages     = dev->uar_table.uarc_size / PAGE_SIZE;
+	dev->db_tab->max_group1 = 0;
+	dev->db_tab->min_group2 = dev->db_tab->npages - 1;
+
+	dev->db_tab->page = kmalloc(dev->db_tab->npages *
+				    sizeof *dev->db_tab->page,
+				    GFP_KERNEL);
+	if (!dev->db_tab->page) {
+		kfree(dev->db_tab);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < dev->db_tab->npages; ++i)
+		dev->db_tab->page[i].db_rec = NULL;
+
+	return 0;
+}
+
+void mthca_cleanup_db_tab(struct mthca_dev *dev)
+{
+	int i;
+	u8 status;
+
+	if (dev->hca_type != ARBEL_NATIVE)
+		return;
+
+	/*
+	 * Because we don't always free our UARC pages when they
+	 * become empty to make mthca_free_db() simpler we need to
+	 * make a sweep through the doorbell pages and free any
+	 * leftover pages now.
+	 */
+	for (i = 0; i < dev->db_tab->npages; ++i) {
+		if (!dev->db_tab->page[i].db_rec)
+			continue;
+
+		if (!bitmap_empty(dev->db_tab->page[i].used, MTHCA_DB_REC_PER_PAGE))
+			mthca_warn(dev, "Kernel UARC page %d not empty\n", i);
+
+		mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, i), 1, &status);
+
+		dma_free_coherent(&dev->pdev->dev, 4096,
+				  dev->db_tab->page[i].db_rec,
+				  dev->db_tab->page[i].mapping);
+	}
+
+	kfree(dev->db_tab->page);
+	kfree(dev->db_tab);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h
new file mode 100644
index 000000000000..a8fa97e140f5
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#ifndef MTHCA_MEMFREE_H
+#define MTHCA_MEMFREE_H
+
+#include <linux/list.h>
+#include <linux/pci.h>
+
+#include <asm/semaphore.h>
+
+#define MTHCA_ICM_CHUNK_LEN \
+	((256 - sizeof (struct list_head) - 2 * sizeof (int)) /		\
+	 (sizeof (struct scatterlist)))
+
+struct mthca_icm_chunk {
+	struct list_head   list;
+	int                npages;
+	int                nsg;
+	struct scatterlist mem[MTHCA_ICM_CHUNK_LEN];
+};
+
+struct mthca_icm {
+	struct list_head chunk_list;
+	int              refcount;
+};
+
+struct mthca_icm_table {
+	u64               virt;
+	int               num_icm;
+	int               num_obj;
+	int               obj_size;
+	int               lowmem;
+	struct semaphore  mutex;
+	struct mthca_icm *icm[0];
+};
+
+struct mthca_icm_iter {
+	struct mthca_icm       *icm;
+	struct mthca_icm_chunk *chunk;
+	int                     page_idx;
+};
+
+struct mthca_dev;
+
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+				  unsigned int gfp_mask);
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm);
+
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+					      u64 virt, int obj_size,
+					      int nobj, int reserved,
+					      int use_lowmem);
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table);
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+
+static inline void mthca_icm_first(struct mthca_icm *icm,
+				   struct mthca_icm_iter *iter)
+{
+	iter->icm      = icm;
+	iter->chunk    = list_empty(&icm->chunk_list) ?
+		NULL : list_entry(icm->chunk_list.next,
+				  struct mthca_icm_chunk, list);
+	iter->page_idx = 0;
+}
+
+static inline int mthca_icm_last(struct mthca_icm_iter *iter)
+{
+	return !iter->chunk;
+}
+
+static inline void mthca_icm_next(struct mthca_icm_iter *iter)
+{
+	if (++iter->page_idx >= iter->chunk->nsg) {
+		if (iter->chunk->list.next == &iter->icm->chunk_list) {
+			iter->chunk = NULL;
+			return;
+		}
+
+		iter->chunk = list_entry(iter->chunk->list.next,
+					 struct mthca_icm_chunk, list);
+		iter->page_idx = 0;
+	}
+}
+
+static inline dma_addr_t mthca_icm_addr(struct mthca_icm_iter *iter)
+{
+	return sg_dma_address(&iter->chunk->mem[iter->page_idx]);
+}
+
+static inline unsigned long mthca_icm_size(struct mthca_icm_iter *iter)
+{
+	return sg_dma_len(&iter->chunk->mem[iter->page_idx]);
+}
+
+enum {
+	MTHCA_DB_REC_PER_PAGE = 4096 / 8
+};
+
+struct mthca_db_page {
+	DECLARE_BITMAP(used, MTHCA_DB_REC_PER_PAGE);
+	u64       *db_rec;
+	dma_addr_t mapping;
+};
+
+struct mthca_db_table {
+	int 	       	      npages;
+	int 	       	      max_group1;
+	int 	       	      min_group2;
+	struct mthca_db_page *page;
+	struct semaphore      mutex;
+};
+
+enum {
+	MTHCA_DB_TYPE_INVALID   = 0x0,
+	MTHCA_DB_TYPE_CQ_SET_CI = 0x1,
+	MTHCA_DB_TYPE_CQ_ARM    = 0x2,
+	MTHCA_DB_TYPE_SQ        = 0x3,
+	MTHCA_DB_TYPE_RQ        = 0x4,
+	MTHCA_DB_TYPE_SRQ       = 0x5,
+	MTHCA_DB_TYPE_GROUP_SEP = 0x7
+};
+
+int mthca_init_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_db_tab(struct mthca_dev *dev);
+int mthca_alloc_db(struct mthca_dev *dev, int type, u32 qn, u32 **db);
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index);
+
+#endif /* MTHCA_MEMFREE_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
new file mode 100644
index 000000000000..80a0cd97881b
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_mr.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/*
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_mpt_entry {
+	u32 flags;
+	u32 page_size;
+	u32 key;
+	u32 pd;
+	u64 start;
+	u64 length;
+	u32 lkey;
+	u32 window_count;
+	u32 window_count_limit;
+	u64 mtt_seg;
+	u32 mtt_sz;		/* Arbel only */
+	u32 reserved[2];
+} __attribute__((packed));
+
+#define MTHCA_MPT_FLAG_SW_OWNS       (0xfUL << 28)
+#define MTHCA_MPT_FLAG_MIO           (1 << 17)
+#define MTHCA_MPT_FLAG_BIND_ENABLE   (1 << 15)
+#define MTHCA_MPT_FLAG_PHYSICAL      (1 <<  9)
+#define MTHCA_MPT_FLAG_REGION        (1 <<  8)
+
+#define MTHCA_MTT_FLAG_PRESENT       1
+
+/*
+ * Buddy allocator for MTT segments (currently not very efficient
+ * since it doesn't keep a free list and just searches linearly
+ * through the bitmaps)
+ */
+
+static u32 mthca_alloc_mtt(struct mthca_dev *dev, int order)
+{
+	int o;
+	int m;
+	u32 seg;
+
+	spin_lock(&dev->mr_table.mpt_alloc.lock);
+
+	for (o = order; o <= dev->mr_table.max_mtt_order; ++o) {
+		m = 1 << (dev->mr_table.max_mtt_order - o);
+		seg = find_first_bit(dev->mr_table.mtt_buddy[o], m);
+		if (seg < m)
+			goto found;
+	}
+
+	spin_unlock(&dev->mr_table.mpt_alloc.lock);
+	return -1;
+
+ found:
+	clear_bit(seg, dev->mr_table.mtt_buddy[o]);
+
+	while (o > order) {
+		--o;
+		seg <<= 1;
+		set_bit(seg ^ 1, dev->mr_table.mtt_buddy[o]);
+	}
+
+	spin_unlock(&dev->mr_table.mpt_alloc.lock);
+
+	seg <<= order;
+
+	return seg;
+}
+
+static void mthca_free_mtt(struct mthca_dev *dev, u32 seg, int order)
+{
+	seg >>= order;
+
+	spin_lock(&dev->mr_table.mpt_alloc.lock);
+
+	while (test_bit(seg ^ 1, dev->mr_table.mtt_buddy[order])) {
+		clear_bit(seg ^ 1, dev->mr_table.mtt_buddy[order]);
+		seg >>= 1;
+		++order;
+	}
+
+	set_bit(seg, dev->mr_table.mtt_buddy[order]);
+
+	spin_unlock(&dev->mr_table.mpt_alloc.lock);
+}
+
+static inline u32 hw_index_to_key(struct mthca_dev *dev, u32 ind)
+{
+	if (dev->hca_type == ARBEL_NATIVE)
+		return (ind >> 24) | (ind << 8);
+	else
+		return ind;
+}
+
+static inline u32 key_to_hw_index(struct mthca_dev *dev, u32 key)
+{
+	if (dev->hca_type == ARBEL_NATIVE)
+		return (key << 24) | (key >> 8);
+	else
+		return key;
+}
+
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+			   u32 access, struct mthca_mr *mr)
+{
+	void *mailbox;
+	struct mthca_mpt_entry *mpt_entry;
+	u32 key;
+	int err;
+	u8 status;
+
+	might_sleep();
+
+	mr->order = -1;
+	key = mthca_alloc(&dev->mr_table.mpt_alloc);
+	if (key == -1)
+		return -ENOMEM;
+	mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+	mailbox = kmalloc(sizeof *mpt_entry + MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox) {
+		mthca_free(&dev->mr_table.mpt_alloc, mr->ibmr.lkey);
+		return -ENOMEM;
+	}
+	mpt_entry = MAILBOX_ALIGN(mailbox);
+
+	mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
+				       MTHCA_MPT_FLAG_MIO         |
+				       MTHCA_MPT_FLAG_PHYSICAL    |
+				       MTHCA_MPT_FLAG_REGION      |
+				       access);
+	mpt_entry->page_size = 0;
+	mpt_entry->key       = cpu_to_be32(key);
+	mpt_entry->pd        = cpu_to_be32(pd);
+	mpt_entry->start     = 0;
+	mpt_entry->length    = ~0ULL;
+
+	memset(&mpt_entry->lkey, 0,
+	       sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey));
+
+	err = mthca_SW2HW_MPT(dev, mpt_entry,
+			      key & (dev->limits.num_mpts - 1),
+			      &status);
+	if (err)
+		mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+	else if (status) {
+		mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+	}
+
+	kfree(mailbox);
+	return err;
+}
+
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+			u64 *buffer_list, int buffer_size_shift,
+			int list_len, u64 iova, u64 total_size,
+			u32 access, struct mthca_mr *mr)
+{
+	void *mailbox;
+	u64 *mtt_entry;
+	struct mthca_mpt_entry *mpt_entry;
+	u32 key;
+	int err = -ENOMEM;
+	u8 status;
+	int i;
+
+	might_sleep();
+	WARN_ON(buffer_size_shift >= 32);
+
+	key = mthca_alloc(&dev->mr_table.mpt_alloc);
+	if (key == -1)
+		return -ENOMEM;
+	mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+	for (i = dev->limits.mtt_seg_size / 8, mr->order = 0;
+	     i < list_len;
+	     i <<= 1, ++mr->order)
+		; /* nothing */
+
+	mr->first_seg = mthca_alloc_mtt(dev, mr->order);
+	if (mr->first_seg == -1)
+		goto err_out_mpt_free;
+
+	/*
+	 * If list_len is odd, we add one more dummy entry for
+	 * firmware efficiency.
+	 */
+	mailbox = kmalloc(max(sizeof *mpt_entry,
+			      (size_t) 8 * (list_len + (list_len & 1) + 2)) +
+			  MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox)
+		goto err_out_free_mtt;
+
+	mtt_entry = MAILBOX_ALIGN(mailbox);
+
+	mtt_entry[0] = cpu_to_be64(dev->mr_table.mtt_base +
+				   mr->first_seg * dev->limits.mtt_seg_size);
+	mtt_entry[1] = 0;
+	for (i = 0; i < list_len; ++i)
+		mtt_entry[i + 2] = cpu_to_be64(buffer_list[i] |
+					       MTHCA_MTT_FLAG_PRESENT);
+	if (list_len & 1) {
+		mtt_entry[i + 2] = 0;
+		++list_len;
+	}
+
+	if (0) {
+		mthca_dbg(dev, "Dumping MPT entry\n");
+		for (i = 0; i < list_len + 2; ++i)
+			printk(KERN_ERR "[%2d] %016llx\n",
+			       i, (unsigned long long) be64_to_cpu(mtt_entry[i]));
+	}
+
+	err = mthca_WRITE_MTT(dev, mtt_entry, list_len, &status);
+	if (err) {
+		mthca_warn(dev, "WRITE_MTT failed (%d)\n", err);
+		goto err_out_mailbox_free;
+	}
+	if (status) {
+		mthca_warn(dev, "WRITE_MTT returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+		goto err_out_mailbox_free;
+	}
+
+	mpt_entry = MAILBOX_ALIGN(mailbox);
+
+	mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
+				       MTHCA_MPT_FLAG_MIO         |
+				       MTHCA_MPT_FLAG_REGION      |
+				       access);
+
+	mpt_entry->page_size = cpu_to_be32(buffer_size_shift - 12);
+	mpt_entry->key       = cpu_to_be32(key);
+	mpt_entry->pd        = cpu_to_be32(pd);
+	mpt_entry->start     = cpu_to_be64(iova);
+	mpt_entry->length    = cpu_to_be64(total_size);
+	memset(&mpt_entry->lkey, 0,
+	       sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey));
+	mpt_entry->mtt_seg   = cpu_to_be64(dev->mr_table.mtt_base +
+					   mr->first_seg * dev->limits.mtt_seg_size);
+
+	if (0) {
+		mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+		for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpu(((u32 *) mpt_entry)[i]));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	err = mthca_SW2HW_MPT(dev, mpt_entry,
+			      key & (dev->limits.num_mpts - 1),
+			      &status);
+	if (err)
+		mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+	else if (status) {
+		mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+	}
+
+	kfree(mailbox);
+	return err;
+
+ err_out_mailbox_free:
+	kfree(mailbox);
+
+ err_out_free_mtt:
+	mthca_free_mtt(dev, mr->first_seg, mr->order);
+
+ err_out_mpt_free:
+	mthca_free(&dev->mr_table.mpt_alloc, mr->ibmr.lkey);
+	return err;
+}
+
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
+{
+	int err;
+	u8 status;
+
+	might_sleep();
+
+	err = mthca_HW2SW_MPT(dev, NULL,
+			      key_to_hw_index(dev, mr->ibmr.lkey) &
+			      (dev->limits.num_mpts - 1),
+			      &status);
+	if (err)
+		mthca_warn(dev, "HW2SW_MPT failed (%d)\n", err);
+	else if (status)
+		mthca_warn(dev, "HW2SW_MPT returned status 0x%02x\n",
+			   status);
+
+	if (mr->order >= 0)
+		mthca_free_mtt(dev, mr->first_seg, mr->order);
+
+	mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, mr->ibmr.lkey));
+}
+
+int __devinit mthca_init_mr_table(struct mthca_dev *dev)
+{
+	int err;
+	int i, s;
+
+	err = mthca_alloc_init(&dev->mr_table.mpt_alloc,
+			       dev->limits.num_mpts,
+			       ~0, dev->limits.reserved_mrws);
+	if (err)
+		return err;
+
+	err = -ENOMEM;
+
+	for (i = 1, dev->mr_table.max_mtt_order = 0;
+	     i < dev->limits.num_mtt_segs;
+	     i <<= 1, ++dev->mr_table.max_mtt_order)
+		; /* nothing */
+
+	dev->mr_table.mtt_buddy = kmalloc((dev->mr_table.max_mtt_order + 1) *
+					  sizeof (long *),
+					  GFP_KERNEL);
+	if (!dev->mr_table.mtt_buddy)
+		goto err_out;
+
+	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
+		dev->mr_table.mtt_buddy[i] = NULL;
+
+	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i) {
+		s = BITS_TO_LONGS(1 << (dev->mr_table.max_mtt_order - i));
+		dev->mr_table.mtt_buddy[i] = kmalloc(s * sizeof (long),
+						     GFP_KERNEL);
+		if (!dev->mr_table.mtt_buddy[i])
+			goto err_out_free;
+		bitmap_zero(dev->mr_table.mtt_buddy[i],
+			    1 << (dev->mr_table.max_mtt_order - i));
+	}
+
+	set_bit(0, dev->mr_table.mtt_buddy[dev->mr_table.max_mtt_order]);
+
+	for (i = 0; i < dev->mr_table.max_mtt_order; ++i)
+		if (1 << i >= dev->limits.reserved_mtts)
+			break;
+
+	if (i == dev->mr_table.max_mtt_order) {
+		mthca_err(dev, "MTT table of order %d is "
+			  "too small.\n", i);
+		goto err_out_free;
+	}
+
+	(void) mthca_alloc_mtt(dev, i);
+
+	return 0;
+
+ err_out_free:
+	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
+		kfree(dev->mr_table.mtt_buddy[i]);
+
+ err_out:
+	mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+
+	return err;
+}
+
+void __devexit mthca_cleanup_mr_table(struct mthca_dev *dev)
+{
+	int i;
+
+	/* XXX check if any MRs are still allocated? */
+	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
+		kfree(dev->mr_table.mtt_buddy[i]);
+	kfree(dev->mr_table.mtt_buddy);
+	mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_pd.c b/drivers/infiniband/hw/mthca/mthca_pd.c
new file mode 100644
index 000000000000..ea66847e4ea3
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_pd.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_pd.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+
+int mthca_pd_alloc(struct mthca_dev *dev, struct mthca_pd *pd)
+{
+	int err;
+
+	might_sleep();
+
+	atomic_set(&pd->sqp_count, 0);
+	pd->pd_num = mthca_alloc(&dev->pd_table.alloc);
+	if (pd->pd_num == -1)
+		return -ENOMEM;
+
+	err = mthca_mr_alloc_notrans(dev, pd->pd_num,
+				     MTHCA_MPT_FLAG_LOCAL_READ |
+				     MTHCA_MPT_FLAG_LOCAL_WRITE,
+				     &pd->ntmr);
+	if (err)
+		mthca_free(&dev->pd_table.alloc, pd->pd_num);
+
+	return err;
+}
+
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd)
+{
+	might_sleep();
+	mthca_free_mr(dev, &pd->ntmr);
+	mthca_free(&dev->pd_table.alloc, pd->pd_num);
+}
+
+int __devinit mthca_init_pd_table(struct mthca_dev *dev)
+{
+	return mthca_alloc_init(&dev->pd_table.alloc,
+				dev->limits.num_pds,
+				(1 << 24) - 1,
+				dev->limits.reserved_pds);
+}
+
+void __devexit mthca_cleanup_pd_table(struct mthca_dev *dev)
+{
+	/* XXX check if any PDs are still allocated? */
+	mthca_alloc_cleanup(&dev->pd_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c
new file mode 100644
index 000000000000..7881a8a919ca
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_profile.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_profile.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#include "mthca_profile.h"
+
+enum {
+	MTHCA_RES_QP,
+	MTHCA_RES_EEC,
+	MTHCA_RES_SRQ,
+	MTHCA_RES_CQ,
+	MTHCA_RES_EQP,
+	MTHCA_RES_EEEC,
+	MTHCA_RES_EQ,
+	MTHCA_RES_RDB,
+	MTHCA_RES_MCG,
+	MTHCA_RES_MPT,
+	MTHCA_RES_MTT,
+	MTHCA_RES_UAR,
+	MTHCA_RES_UDAV,
+	MTHCA_RES_UARC,
+	MTHCA_RES_NUM
+};
+
+enum {
+	MTHCA_NUM_EQS = 32,
+	MTHCA_NUM_PDS = 1 << 15
+};
+
+u64 mthca_make_profile(struct mthca_dev *dev,
+		       struct mthca_profile *request,
+		       struct mthca_dev_lim *dev_lim,
+		       struct mthca_init_hca_param *init_hca)
+{
+	struct mthca_resource {
+		u64 size;
+		u64 start;
+		int type;
+		int num;
+		int log_num;
+	};
+
+	u64 mem_base, mem_avail;
+	u64 total_size = 0;
+	struct mthca_resource *profile;
+	struct mthca_resource tmp;
+	int i, j;
+
+	profile = kmalloc(MTHCA_RES_NUM * sizeof *profile, GFP_KERNEL);
+	if (!profile)
+		return -ENOMEM;
+
+	memset(profile, 0, MTHCA_RES_NUM * sizeof *profile);
+
+	profile[MTHCA_RES_QP].size   = dev_lim->qpc_entry_sz;
+	profile[MTHCA_RES_EEC].size  = dev_lim->eec_entry_sz;
+	profile[MTHCA_RES_SRQ].size  = dev_lim->srq_entry_sz;
+	profile[MTHCA_RES_CQ].size   = dev_lim->cqc_entry_sz;
+	profile[MTHCA_RES_EQP].size  = dev_lim->eqpc_entry_sz;
+	profile[MTHCA_RES_EEEC].size = dev_lim->eeec_entry_sz;
+	profile[MTHCA_RES_EQ].size   = dev_lim->eqc_entry_sz;
+	profile[MTHCA_RES_RDB].size  = MTHCA_RDB_ENTRY_SIZE;
+	profile[MTHCA_RES_MCG].size  = MTHCA_MGM_ENTRY_SIZE;
+	profile[MTHCA_RES_MPT].size  = dev_lim->mpt_entry_sz;
+	profile[MTHCA_RES_MTT].size  = dev_lim->mtt_seg_sz;
+	profile[MTHCA_RES_UAR].size  = dev_lim->uar_scratch_entry_sz;
+	profile[MTHCA_RES_UDAV].size = MTHCA_AV_SIZE;
+	profile[MTHCA_RES_UARC].size = request->uarc_size;
+
+	profile[MTHCA_RES_QP].num    = request->num_qp;
+	profile[MTHCA_RES_EQP].num   = request->num_qp;
+	profile[MTHCA_RES_RDB].num   = request->num_qp * request->rdb_per_qp;
+	profile[MTHCA_RES_CQ].num    = request->num_cq;
+	profile[MTHCA_RES_EQ].num    = MTHCA_NUM_EQS;
+	profile[MTHCA_RES_MCG].num   = request->num_mcg;
+	profile[MTHCA_RES_MPT].num   = request->num_mpt;
+	profile[MTHCA_RES_MTT].num   = request->num_mtt;
+	profile[MTHCA_RES_UAR].num   = request->num_uar;
+	profile[MTHCA_RES_UARC].num  = request->num_uar;
+	profile[MTHCA_RES_UDAV].num  = request->num_udav;
+
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		profile[i].type     = i;
+		profile[i].log_num  = max(ffs(profile[i].num) - 1, 0);
+		profile[i].size    *= profile[i].num;
+		if (dev->hca_type == ARBEL_NATIVE)
+			profile[i].size = max(profile[i].size, (u64) PAGE_SIZE);
+	}
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		mem_base  = 0;
+		mem_avail = dev_lim->hca.arbel.max_icm_sz;
+	} else {
+		mem_base  = dev->ddr_start;
+		mem_avail = dev->fw.tavor.fw_start - dev->ddr_start;
+	}
+
+	/*
+	 * Sort the resources in decreasing order of size.  Since they
+	 * all have sizes that are powers of 2, we'll be able to keep
+	 * resources aligned to their size and pack them without gaps
+	 * using the sorted order.
+	 */
+	for (i = MTHCA_RES_NUM; i > 0; --i)
+		for (j = 1; j < i; ++j) {
+			if (profile[j].size > profile[j - 1].size) {
+				tmp            = profile[j];
+				profile[j]     = profile[j - 1];
+				profile[j - 1] = tmp;
+			}
+		}
+
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		if (profile[i].size) {
+			profile[i].start = mem_base + total_size;
+			total_size      += profile[i].size;
+		}
+		if (total_size > mem_avail) {
+			mthca_err(dev, "Profile requires 0x%llx bytes; "
+				  "won't in 0x%llx bytes of context memory.\n",
+				  (unsigned long long) total_size,
+				  (unsigned long long) mem_avail);
+			kfree(profile);
+			return -ENOMEM;
+		}
+
+		if (profile[i].size)
+			mthca_dbg(dev, "profile[%2d]--%2d/%2d @ 0x%16llx "
+				  "(size 0x%8llx)\n",
+				  i, profile[i].type, profile[i].log_num,
+				  (unsigned long long) profile[i].start,
+				  (unsigned long long) profile[i].size);
+	}
+
+	if (dev->hca_type == ARBEL_NATIVE)
+		mthca_dbg(dev, "HCA context memory: reserving %d KB\n",
+			  (int) (total_size >> 10));
+	else
+		mthca_dbg(dev, "HCA memory: allocated %d KB/%d KB (%d KB free)\n",
+			  (int) (total_size >> 10), (int) (mem_avail >> 10),
+			  (int) ((mem_avail - total_size) >> 10));
+
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		switch (profile[i].type) {
+		case MTHCA_RES_QP:
+			dev->limits.num_qps   = profile[i].num;
+			init_hca->qpc_base    = profile[i].start;
+			init_hca->log_num_qps = profile[i].log_num;
+			break;
+		case MTHCA_RES_EEC:
+			dev->limits.num_eecs   = profile[i].num;
+			init_hca->eec_base     = profile[i].start;
+			init_hca->log_num_eecs = profile[i].log_num;
+			break;
+		case MTHCA_RES_SRQ:
+			dev->limits.num_srqs   = profile[i].num;
+			init_hca->srqc_base    = profile[i].start;
+			init_hca->log_num_srqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_CQ:
+			dev->limits.num_cqs   = profile[i].num;
+			init_hca->cqc_base    = profile[i].start;
+			init_hca->log_num_cqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_EQP:
+			init_hca->eqpc_base = profile[i].start;
+			break;
+		case MTHCA_RES_EEEC:
+			init_hca->eeec_base = profile[i].start;
+			break;
+		case MTHCA_RES_EQ:
+			dev->limits.num_eqs   = profile[i].num;
+			init_hca->eqc_base    = profile[i].start;
+			init_hca->log_num_eqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_RDB:
+			for (dev->qp_table.rdb_shift = 0;
+			     profile[MTHCA_RES_QP].num << dev->qp_table.rdb_shift <
+				     profile[i].num;
+			     ++dev->qp_table.rdb_shift)
+				; /* nothing */
+			dev->qp_table.rdb_base    = (u32) profile[i].start;
+			init_hca->rdb_base        = profile[i].start;
+			break;
+		case MTHCA_RES_MCG:
+			dev->limits.num_mgms      = profile[i].num >> 1;
+			dev->limits.num_amgms     = profile[i].num >> 1;
+			init_hca->mc_base         = profile[i].start;
+			init_hca->log_mc_entry_sz = ffs(MTHCA_MGM_ENTRY_SIZE) - 1;
+			init_hca->log_mc_table_sz = profile[i].log_num;
+			init_hca->mc_hash_sz      = 1 << (profile[i].log_num - 1);
+			break;
+		case MTHCA_RES_MPT:
+			dev->limits.num_mpts = profile[i].num;
+			init_hca->mpt_base   = profile[i].start;
+			init_hca->log_mpt_sz = profile[i].log_num;
+			break;
+		case MTHCA_RES_MTT:
+			dev->limits.num_mtt_segs = profile[i].num;
+			dev->limits.mtt_seg_size = dev_lim->mtt_seg_sz;
+			dev->mr_table.mtt_base   = profile[i].start;
+			init_hca->mtt_base       = profile[i].start;
+			init_hca->mtt_seg_sz     = ffs(dev_lim->mtt_seg_sz) - 7;
+			break;
+		case MTHCA_RES_UAR:
+			dev->limits.num_uars       = profile[i].num;
+			init_hca->uar_scratch_base = profile[i].start;
+			break;
+		case MTHCA_RES_UDAV:
+			dev->av_table.ddr_av_base = profile[i].start;
+			dev->av_table.num_ddr_avs = profile[i].num;
+			break;
+		case MTHCA_RES_UARC:
+			dev->uar_table.uarc_size = request->uarc_size;
+			dev->uar_table.uarc_base = profile[i].start;
+			init_hca->uarc_base   	 = profile[i].start;
+			init_hca->log_uarc_sz 	 = ffs(request->uarc_size) - 13;
+			init_hca->log_uar_sz  	 = ffs(request->num_uar) - 1;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * PDs don't take any HCA memory, but we assign them as part
+	 * of the HCA profile anyway.
+	 */
+	dev->limits.num_pds = MTHCA_NUM_PDS;
+
+	kfree(profile);
+	return total_size;
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.h b/drivers/infiniband/hw/mthca/mthca_profile.h
new file mode 100644
index 000000000000..daaf7999486c
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_profile.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_profile.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef MTHCA_PROFILE_H
+#define MTHCA_PROFILE_H
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+struct mthca_profile {
+	int num_qp;
+	int rdb_per_qp;
+	int num_cq;
+	int num_mcg;
+	int num_mpt;
+	int num_mtt;
+	int num_udav;
+	int num_uar;
+	int uarc_size;
+};
+
+u64 mthca_make_profile(struct mthca_dev *mdev,
+		       struct mthca_profile *request,
+		       struct mthca_dev_lim *dev_lim,
+		       struct mthca_init_hca_param *init_hca);
+
+#endif /* MTHCA_PROFILE_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
new file mode 100644
index 000000000000..bbf74cf43343
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_provider.c 1397 2004-12-28 05:09:00Z roland $
+ */
+
+#include <ib_smi.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+static int mthca_query_device(struct ib_device *ibdev,
+			      struct ib_device_attr *props)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	struct mthca_dev* mdev = to_mdev(ibdev);
+
+	u8 status;
+
+	in_mad  = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	props->fw_ver              = mdev->fw_ver;
+
+	memset(in_mad, 0, sizeof *in_mad);
+	in_mad->base_version       = 1;
+	in_mad->mgmt_class     	   = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	in_mad->class_version  	   = 1;
+	in_mad->method         	   = IB_MGMT_METHOD_GET;
+	in_mad->attr_id   	   = IB_SMP_ATTR_NODE_INFO;
+
+	err = mthca_MAD_IFC(mdev, 1, 1,
+			    1, NULL, NULL, in_mad, out_mad,
+			    &status);
+	if (err)
+		goto out;
+	if (status) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	props->device_cap_flags = mdev->device_cap_flags;
+	props->vendor_id        = be32_to_cpup((u32 *) (out_mad->data + 36)) &
+		0xffffff;
+	props->vendor_part_id   = be16_to_cpup((u16 *) (out_mad->data + 30));
+	props->hw_ver           = be16_to_cpup((u16 *) (out_mad->data + 32));
+	memcpy(&props->sys_image_guid, out_mad->data +  4, 8);
+	memcpy(&props->node_guid,      out_mad->data + 12, 8);
+
+	err = 0;
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mthca_query_port(struct ib_device *ibdev,
+			    u8 port, struct ib_port_attr *props)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	u8 status;
+
+	in_mad  = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(in_mad, 0, sizeof *in_mad);
+	in_mad->base_version       = 1;
+	in_mad->mgmt_class     	   = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	in_mad->class_version  	   = 1;
+	in_mad->method         	   = IB_MGMT_METHOD_GET;
+	in_mad->attr_id   	   = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod           = cpu_to_be32(port);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+			    port, NULL, NULL, in_mad, out_mad,
+			    &status);
+	if (err)
+		goto out;
+	if (status) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	props->lid               = be16_to_cpup((u16 *) (out_mad->data + 16));
+	props->lmc               = out_mad->data[34] & 0x7;
+	props->sm_lid            = be16_to_cpup((u16 *) (out_mad->data + 18));
+	props->sm_sl             = out_mad->data[36] & 0xf;
+	props->state             = out_mad->data[32] & 0xf;
+	props->phys_state        = out_mad->data[33] >> 4;
+	props->port_cap_flags    = be32_to_cpup((u32 *) (out_mad->data + 20));
+	props->gid_tbl_len       = to_mdev(ibdev)->limits.gid_table_len;
+	props->pkey_tbl_len      = to_mdev(ibdev)->limits.pkey_table_len;
+	props->qkey_viol_cntr    = be16_to_cpup((u16 *) (out_mad->data + 48));
+	props->active_width      = out_mad->data[31] & 0xf;
+	props->active_speed      = out_mad->data[35] >> 4;
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mthca_modify_port(struct ib_device *ibdev,
+			     u8 port, int port_modify_mask,
+			     struct ib_port_modify *props)
+{
+	struct mthca_set_ib_param set_ib;
+	struct ib_port_attr attr;
+	int err;
+	u8 status;
+
+	if (down_interruptible(&to_mdev(ibdev)->cap_mask_mutex))
+		return -ERESTARTSYS;
+
+	err = mthca_query_port(ibdev, port, &attr);
+	if (err)
+		goto out;
+
+	set_ib.set_si_guid     = 0;
+	set_ib.reset_qkey_viol = !!(port_modify_mask & IB_PORT_RESET_QKEY_CNTR);
+
+	set_ib.cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+		~props->clr_port_cap_mask;
+
+	err = mthca_SET_IB(to_mdev(ibdev), &set_ib, port, &status);
+	if (err)
+		goto out;
+	if (status) {
+		err = -EINVAL;
+		goto out;
+	}
+
+out:
+	up(&to_mdev(ibdev)->cap_mask_mutex);
+	return err;
+}
+
+static int mthca_query_pkey(struct ib_device *ibdev,
+			    u8 port, u16 index, u16 *pkey)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	u8 status;
+
+	in_mad  = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(in_mad, 0, sizeof *in_mad);
+	in_mad->base_version       = 1;
+	in_mad->mgmt_class     	   = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	in_mad->class_version  	   = 1;
+	in_mad->method         	   = IB_MGMT_METHOD_GET;
+	in_mad->attr_id   	   = IB_SMP_ATTR_PKEY_TABLE;
+	in_mad->attr_mod           = cpu_to_be32(index / 32);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+			    port, NULL, NULL, in_mad, out_mad,
+			    &status);
+	if (err)
+		goto out;
+	if (status) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	*pkey = be16_to_cpu(((u16 *) out_mad->data)[index % 32]);
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mthca_query_gid(struct ib_device *ibdev, u8 port,
+			   int index, union ib_gid *gid)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	u8 status;
+
+	in_mad  = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(in_mad, 0, sizeof *in_mad);
+	in_mad->base_version       = 1;
+	in_mad->mgmt_class     	   = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	in_mad->class_version  	   = 1;
+	in_mad->method         	   = IB_MGMT_METHOD_GET;
+	in_mad->attr_id   	   = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod           = cpu_to_be32(port);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+			    port, NULL, NULL, in_mad, out_mad,
+			    &status);
+	if (err)
+		goto out;
+	if (status) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	memcpy(gid->raw, out_mad->data + 8, 8);
+
+	memset(in_mad, 0, sizeof *in_mad);
+	in_mad->base_version       = 1;
+	in_mad->mgmt_class     	   = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	in_mad->class_version  	   = 1;
+	in_mad->method         	   = IB_MGMT_METHOD_GET;
+	in_mad->attr_id   	   = IB_SMP_ATTR_GUID_INFO;
+	in_mad->attr_mod           = cpu_to_be32(index / 8);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+			    port, NULL, NULL, in_mad, out_mad,
+			    &status);
+	if (err)
+		goto out;
+	if (status) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	memcpy(gid->raw + 8, out_mad->data + (index % 8) * 16, 8);
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev)
+{
+	struct mthca_pd *pd;
+	int err;
+
+	pd = kmalloc(sizeof *pd, GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	err = mthca_pd_alloc(to_mdev(ibdev), pd);
+	if (err) {
+		kfree(pd);
+		return ERR_PTR(err);
+	}
+
+	return &pd->ibpd;
+}
+
+static int mthca_dealloc_pd(struct ib_pd *pd)
+{
+	mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
+	kfree(pd);
+
+	return 0;
+}
+
+static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
+				     struct ib_ah_attr *ah_attr)
+{
+	int err;
+	struct mthca_ah *ah;
+
+	ah = kmalloc(sizeof *ah, GFP_KERNEL);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah);
+	if (err) {
+		kfree(ah);
+		return ERR_PTR(err);
+	}
+
+	return &ah->ibah;
+}
+
+static int mthca_ah_destroy(struct ib_ah *ah)
+{
+	mthca_destroy_ah(to_mdev(ah->device), to_mah(ah));
+	kfree(ah);
+
+	return 0;
+}
+
+static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
+				     struct ib_qp_init_attr *init_attr)
+{
+	struct mthca_qp *qp;
+	int err;
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_UD:
+	{
+		qp = kmalloc(sizeof *qp, GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		qp->sq.max    = init_attr->cap.max_send_wr;
+		qp->rq.max    = init_attr->cap.max_recv_wr;
+		qp->sq.max_gs = init_attr->cap.max_send_sge;
+		qp->rq.max_gs = init_attr->cap.max_recv_sge;
+
+		err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd),
+				     to_mcq(init_attr->send_cq),
+				     to_mcq(init_attr->recv_cq),
+				     init_attr->qp_type, init_attr->sq_sig_type,
+				     qp);
+		qp->ibqp.qp_num = qp->qpn;
+		break;
+	}
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	{
+		qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		qp->sq.max    = init_attr->cap.max_send_wr;
+		qp->rq.max    = init_attr->cap.max_recv_wr;
+		qp->sq.max_gs = init_attr->cap.max_send_sge;
+		qp->rq.max_gs = init_attr->cap.max_recv_sge;
+
+		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+		err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd),
+				      to_mcq(init_attr->send_cq),
+				      to_mcq(init_attr->recv_cq),
+				      init_attr->sq_sig_type,
+				      qp->ibqp.qp_num, init_attr->port_num,
+				      to_msqp(qp));
+		break;
+	}
+	default:
+		/* Don't support raw QPs */
+		return ERR_PTR(-ENOSYS);
+	}
+
+	if (err) {
+		kfree(qp);
+		return ERR_PTR(err);
+	}
+
+        init_attr->cap.max_inline_data = 0;
+
+	return &qp->ibqp;
+}
+
+static int mthca_destroy_qp(struct ib_qp *qp)
+{
+	mthca_free_qp(to_mdev(qp->device), to_mqp(qp));
+	kfree(qp);
+	return 0;
+}
+
+static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries)
+{
+	struct mthca_cq *cq;
+	int nent;
+	int err;
+
+	cq = kmalloc(sizeof *cq, GFP_KERNEL);
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
+
+	for (nent = 1; nent <= entries; nent <<= 1)
+		; /* nothing */
+
+	err = mthca_init_cq(to_mdev(ibdev), nent, cq);
+	if (err) {
+		kfree(cq);
+		cq = ERR_PTR(err);
+	}
+
+	return &cq->ibcq;
+}
+
+static int mthca_destroy_cq(struct ib_cq *cq)
+{
+	mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
+	kfree(cq);
+
+	return 0;
+}
+
+static inline u32 convert_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MTHCA_MPT_FLAG_ATOMIC       : 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ? MTHCA_MPT_FLAG_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ? MTHCA_MPT_FLAG_REMOTE_READ  : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? MTHCA_MPT_FLAG_LOCAL_WRITE  : 0) |
+	       MTHCA_MPT_FLAG_LOCAL_READ;
+}
+
+static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct mthca_mr *mr;
+	int err;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	err = mthca_mr_alloc_notrans(to_mdev(pd->device),
+				     to_mpd(pd)->pd_num,
+				     convert_access(acc), mr);
+
+	if (err) {
+		kfree(mr);
+		return ERR_PTR(err);
+	}
+
+	return &mr->ibmr;
+}
+
+static struct ib_mr *mthca_reg_phys_mr(struct ib_pd       *pd,
+				       struct ib_phys_buf *buffer_list,
+				       int                 num_phys_buf,
+				       int                 acc,
+				       u64                *iova_start)
+{
+	struct mthca_mr *mr;
+	u64 *page_list;
+	u64 total_size;
+	u64 mask;
+	int shift;
+	int npages;
+	int err;
+	int i, j, n;
+
+	/* First check that we have enough alignment */
+	if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK))
+		return ERR_PTR(-EINVAL);
+
+	if (num_phys_buf > 1 &&
+	    ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK))
+		return ERR_PTR(-EINVAL);
+
+	mask = 0;
+	total_size = 0;
+	for (i = 0; i < num_phys_buf; ++i) {
+		if (buffer_list[i].addr & ~PAGE_MASK)
+			return ERR_PTR(-EINVAL);
+		if (i != 0 && i != num_phys_buf - 1 &&
+		    (buffer_list[i].size & ~PAGE_MASK))
+			return ERR_PTR(-EINVAL);
+
+		total_size += buffer_list[i].size;
+		if (i > 0)
+			mask |= buffer_list[i].addr;
+	}
+
+	/* Find largest page shift we can use to cover buffers */
+	for (shift = PAGE_SHIFT; shift < 31; ++shift)
+		if (num_phys_buf > 1) {
+			if ((1ULL << shift) & mask)
+				break;
+		} else {
+			if (1ULL << shift >=
+			    buffer_list[0].size +
+			    (buffer_list[0].addr & ((1ULL << shift) - 1)))
+				break;
+		}
+
+	buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
+	buffer_list[0].addr &= ~0ull << shift;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	npages = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+
+	if (!npages)
+		return &mr->ibmr;
+
+	page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL);
+	if (!page_list) {
+		kfree(mr);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	n = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		for (j = 0;
+		     j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+		     ++j)
+			page_list[n++] = buffer_list[i].addr + ((u64) j << shift);
+
+	mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) "
+		  "in PD %x; shift %d, npages %d.\n",
+		  (unsigned long long) buffer_list[0].addr,
+		  (unsigned long long) *iova_start,
+		  to_mpd(pd)->pd_num,
+		  shift, npages);
+
+	err = mthca_mr_alloc_phys(to_mdev(pd->device),
+				  to_mpd(pd)->pd_num,
+				  page_list, shift, npages,
+				  *iova_start, total_size,
+				  convert_access(acc), mr);
+
+	if (err) {
+		kfree(mr);
+		return ERR_PTR(err);
+	}
+
+	kfree(page_list);
+	return &mr->ibmr;
+}
+
+static int mthca_dereg_mr(struct ib_mr *mr)
+{
+	mthca_free_mr(to_mdev(mr->device), to_mmr(mr));
+	kfree(mr);
+	return 0;
+}
+
+static ssize_t show_rev(struct class_device *cdev, char *buf)
+{
+	struct mthca_dev *dev = container_of(cdev, struct mthca_dev, ib_dev.class_dev);
+	return sprintf(buf, "%x\n", dev->rev_id);
+}
+
+static ssize_t show_fw_ver(struct class_device *cdev, char *buf)
+{
+	struct mthca_dev *dev = container_of(cdev, struct mthca_dev, ib_dev.class_dev);
+	return sprintf(buf, "%x.%x.%x\n", (int) (dev->fw_ver >> 32),
+		       (int) (dev->fw_ver >> 16) & 0xffff,
+		       (int) dev->fw_ver & 0xffff);
+}
+
+static ssize_t show_hca(struct class_device *cdev, char *buf)
+{
+	struct mthca_dev *dev = container_of(cdev, struct mthca_dev, ib_dev.class_dev);
+	switch (dev->hca_type) {
+	case TAVOR:        return sprintf(buf, "MT23108\n");
+	case ARBEL_COMPAT: return sprintf(buf, "MT25208 (MT23108 compat mode)\n");
+	case ARBEL_NATIVE: return sprintf(buf, "MT25208\n");
+	default:           return sprintf(buf, "unknown\n");
+	}
+}
+
+static CLASS_DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
+static CLASS_DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
+static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
+
+static struct class_device_attribute *mthca_class_attributes[] = {
+	&class_device_attr_hw_rev,
+	&class_device_attr_fw_ver,
+	&class_device_attr_hca_type
+};
+
+int mthca_register_device(struct mthca_dev *dev)
+{
+	int ret;
+	int i;
+
+	strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX);
+	dev->ib_dev.node_type            = IB_NODE_CA;
+	dev->ib_dev.phys_port_cnt        = dev->limits.num_ports;
+	dev->ib_dev.dma_device           = &dev->pdev->dev;
+	dev->ib_dev.class_dev.dev        = &dev->pdev->dev;
+	dev->ib_dev.query_device         = mthca_query_device;
+	dev->ib_dev.query_port           = mthca_query_port;
+	dev->ib_dev.modify_port          = mthca_modify_port;
+	dev->ib_dev.query_pkey           = mthca_query_pkey;
+	dev->ib_dev.query_gid            = mthca_query_gid;
+	dev->ib_dev.alloc_pd             = mthca_alloc_pd;
+	dev->ib_dev.dealloc_pd           = mthca_dealloc_pd;
+	dev->ib_dev.create_ah            = mthca_ah_create;
+	dev->ib_dev.destroy_ah           = mthca_ah_destroy;
+	dev->ib_dev.create_qp            = mthca_create_qp;
+	dev->ib_dev.modify_qp            = mthca_modify_qp;
+	dev->ib_dev.destroy_qp           = mthca_destroy_qp;
+	dev->ib_dev.create_cq            = mthca_create_cq;
+	dev->ib_dev.destroy_cq           = mthca_destroy_cq;
+	dev->ib_dev.poll_cq              = mthca_poll_cq;
+	dev->ib_dev.get_dma_mr           = mthca_get_dma_mr;
+	dev->ib_dev.reg_phys_mr          = mthca_reg_phys_mr;
+	dev->ib_dev.dereg_mr             = mthca_dereg_mr;
+	dev->ib_dev.attach_mcast         = mthca_multicast_attach;
+	dev->ib_dev.detach_mcast         = mthca_multicast_detach;
+	dev->ib_dev.process_mad          = mthca_process_mad;
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
+		dev->ib_dev.post_send     = mthca_arbel_post_send;
+		dev->ib_dev.post_recv     = mthca_arbel_post_receive;
+	} else {
+		dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
+		dev->ib_dev.post_send     = mthca_tavor_post_send;
+		dev->ib_dev.post_recv     = mthca_tavor_post_receive;
+	}
+
+	init_MUTEX(&dev->cap_mask_mutex);
+
+	ret = ib_register_device(&dev->ib_dev);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(mthca_class_attributes); ++i) {
+		ret = class_device_create_file(&dev->ib_dev.class_dev,
+					       mthca_class_attributes[i]);
+		if (ret) {
+			ib_unregister_device(&dev->ib_dev);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+void mthca_unregister_device(struct mthca_dev *dev)
+{
+	ib_unregister_device(&dev->ib_dev);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h
new file mode 100644
index 000000000000..0598f3905d9a
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_provider.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef MTHCA_PROVIDER_H
+#define MTHCA_PROVIDER_H
+
+#include <ib_verbs.h>
+#include <ib_pack.h>
+
+#define MTHCA_MPT_FLAG_ATOMIC        (1 << 14)
+#define MTHCA_MPT_FLAG_REMOTE_WRITE  (1 << 13)
+#define MTHCA_MPT_FLAG_REMOTE_READ   (1 << 12)
+#define MTHCA_MPT_FLAG_LOCAL_WRITE   (1 << 11)
+#define MTHCA_MPT_FLAG_LOCAL_READ    (1 << 10)
+
+struct mthca_buf_list {
+	void *buf;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+struct mthca_uar {
+	unsigned long pfn;
+	int           index;
+};
+
+struct mthca_mr {
+	struct ib_mr ibmr;
+	int order;
+	u32 first_seg;
+};
+
+struct mthca_pd {
+	struct ib_pd    ibpd;
+	u32             pd_num;
+	atomic_t        sqp_count;
+	struct mthca_mr ntmr;
+};
+
+struct mthca_eq {
+	struct mthca_dev      *dev;
+	int                    eqn;
+	u32                    eqn_mask;
+	u32                    cons_index;
+	u16                    msi_x_vector;
+	u16                    msi_x_entry;
+	int                    have_irq;
+	int                    nent;
+	struct mthca_buf_list *page_list;
+	struct mthca_mr        mr;
+};
+
+struct mthca_av;
+
+enum mthca_ah_type {
+	MTHCA_AH_ON_HCA,
+	MTHCA_AH_PCI_POOL,
+	MTHCA_AH_KMALLOC
+};
+
+struct mthca_ah {
+	struct ib_ah       ibah;
+	enum mthca_ah_type type;
+	u32                key;
+	struct mthca_av   *av;
+	dma_addr_t         avdma;
+};
+
+/*
+ * Quick description of our CQ/QP locking scheme:
+ *
+ * We have one global lock that protects dev->cq/qp_table.  Each
+ * struct mthca_cq/qp also has its own lock.  An individual qp lock
+ * may be taken inside of an individual cq lock.  Both cqs attached to
+ * a qp may be locked, with the send cq locked first.  No other
+ * nesting should be done.
+ *
+ * Each struct mthca_cq/qp also has an atomic_t ref count.  The
+ * pointer from the cq/qp_table to the struct counts as one reference.
+ * This reference also is good for access through the consumer API, so
+ * modifying the CQ/QP etc doesn't need to take another reference.
+ * Access because of a completion being polled does need a reference.
+ *
+ * Finally, each struct mthca_cq/qp has a wait_queue_head_t for the
+ * destroy function to sleep on.
+ *
+ * This means that access from the consumer API requires nothing but
+ * taking the struct's lock.
+ *
+ * Access because of a completion event should go as follows:
+ * - lock cq/qp_table and look up struct
+ * - increment ref count in struct
+ * - drop cq/qp_table lock
+ * - lock struct, do your thing, and unlock struct
+ * - decrement ref count; if zero, wake up waiters
+ *
+ * To destroy a CQ/QP, we can do the following:
+ * - lock cq/qp_table, remove pointer, unlock cq/qp_table lock
+ * - decrement ref count
+ * - wait_event until ref count is zero
+ *
+ * It is the consumer's responsibilty to make sure that no QP
+ * operations (WQE posting or state modification) are pending when the
+ * QP is destroyed.  Also, the consumer must make sure that calls to
+ * qp_modify are serialized.
+ *
+ * Possible optimizations (wait for profile data to see if/where we
+ * have locks bouncing between CPUs):
+ * - split cq/qp table lock into n separate (cache-aligned) locks,
+ *   indexed (say) by the page in the table
+ * - split QP struct lock into three (one for common info, one for the
+ *   send queue and one for the receive queue)
+ */
+
+struct mthca_cq {
+	struct ib_cq           ibcq;
+	spinlock_t             lock;
+	atomic_t               refcount;
+	int                    cqn;
+	u32                    cons_index;
+	int                    is_direct;
+
+	/* Next fields are Arbel only */
+	int                    set_ci_db_index;
+	u32                   *set_ci_db;
+	int                    arm_db_index;
+	u32                   *arm_db;
+	int                    arm_sn;
+
+	union {
+		struct mthca_buf_list direct;
+		struct mthca_buf_list *page_list;
+	}                      queue;
+	struct mthca_mr        mr;
+	wait_queue_head_t      wait;
+};
+
+struct mthca_wq {
+	spinlock_t lock;
+	int        max;
+	unsigned   next_ind;
+	unsigned   last_comp;
+	unsigned   head;
+	unsigned   tail;
+	void      *last;
+	int        max_gs;
+	int        wqe_shift;
+
+	int        db_index;	/* Arbel only */
+	u32       *db;
+};
+
+struct mthca_qp {
+	struct ib_qp           ibqp;
+	atomic_t               refcount;
+	u32                    qpn;
+	int                    is_direct;
+	u8                     transport;
+	u8                     state;
+	u8                     atomic_rd_en;
+	u8                     resp_depth;
+
+	struct mthca_mr        mr;
+
+	struct mthca_wq        rq;
+	struct mthca_wq        sq;
+	enum ib_sig_type       sq_policy;
+	int                    send_wqe_offset;
+
+	u64                   *wrid;
+	union {
+		struct mthca_buf_list direct;
+		struct mthca_buf_list *page_list;
+	}                      queue;
+
+	wait_queue_head_t      wait;
+};
+
+struct mthca_sqp {
+	struct mthca_qp qp;
+	int             port;
+	int             pkey_index;
+	u32             qkey;
+	u32             send_psn;
+	struct ib_ud_header ud_header;
+	int             header_buf_size;
+	void           *header_buf;
+	dma_addr_t      header_dma;
+};
+
+static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct mthca_mr, ibmr);
+}
+
+static inline struct mthca_pd *to_mpd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct mthca_pd, ibpd);
+}
+
+static inline struct mthca_ah *to_mah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct mthca_ah, ibah);
+}
+
+static inline struct mthca_cq *to_mcq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct mthca_cq, ibcq);
+}
+
+static inline struct mthca_qp *to_mqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct mthca_qp, ibqp);
+}
+
+static inline struct mthca_sqp *to_msqp(struct mthca_qp *qp)
+{
+	return container_of(qp, struct mthca_sqp, qp);
+}
+
+#endif /* MTHCA_PROVIDER_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
new file mode 100644
index 000000000000..7e4bbbd31f07
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -0,0 +1,2056 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_qp.c 1355 2004-12-17 15:23:43Z roland $
+ */
+
+#include <linux/init.h>
+
+#include <ib_verbs.h>
+#include <ib_cache.h>
+#include <ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+enum {
+	MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE,
+	MTHCA_ACK_REQ_FREQ       = 10,
+	MTHCA_FLIGHT_LIMIT       = 9,
+	MTHCA_UD_HEADER_SIZE     = 72 /* largest UD header possible */
+};
+
+enum {
+	MTHCA_QP_STATE_RST  = 0,
+	MTHCA_QP_STATE_INIT = 1,
+	MTHCA_QP_STATE_RTR  = 2,
+	MTHCA_QP_STATE_RTS  = 3,
+	MTHCA_QP_STATE_SQE  = 4,
+	MTHCA_QP_STATE_SQD  = 5,
+	MTHCA_QP_STATE_ERR  = 6,
+	MTHCA_QP_STATE_DRAINING = 7
+};
+
+enum {
+	MTHCA_QP_ST_RC 	= 0x0,
+	MTHCA_QP_ST_UC 	= 0x1,
+	MTHCA_QP_ST_RD 	= 0x2,
+	MTHCA_QP_ST_UD 	= 0x3,
+	MTHCA_QP_ST_MLX = 0x7
+};
+
+enum {
+	MTHCA_QP_PM_MIGRATED = 0x3,
+	MTHCA_QP_PM_ARMED    = 0x0,
+	MTHCA_QP_PM_REARM    = 0x1
+};
+
+enum {
+	/* qp_context flags */
+	MTHCA_QP_BIT_DE  = 1 <<  8,
+	/* params1 */
+	MTHCA_QP_BIT_SRE = 1 << 15,
+	MTHCA_QP_BIT_SWE = 1 << 14,
+	MTHCA_QP_BIT_SAE = 1 << 13,
+	MTHCA_QP_BIT_SIC = 1 <<  4,
+	MTHCA_QP_BIT_SSC = 1 <<  3,
+	/* params2 */
+	MTHCA_QP_BIT_RRE = 1 << 15,
+	MTHCA_QP_BIT_RWE = 1 << 14,
+	MTHCA_QP_BIT_RAE = 1 << 13,
+	MTHCA_QP_BIT_RIC = 1 <<  4,
+	MTHCA_QP_BIT_RSC = 1 <<  3
+};
+
+struct mthca_qp_path {
+	u32 port_pkey;
+	u8  rnr_retry;
+	u8  g_mylmc;
+	u16 rlid;
+	u8  ackto;
+	u8  mgid_index;
+	u8  static_rate;
+	u8  hop_limit;
+	u32 sl_tclass_flowlabel;
+	u8  rgid[16];
+} __attribute__((packed));
+
+struct mthca_qp_context {
+	u32 flags;
+	u32 tavor_sched_queue;	/* Reserved on Arbel */
+	u8  mtu_msgmax;
+	u8  rq_size_stride;	/* Reserved on Tavor */
+	u8  sq_size_stride;	/* Reserved on Tavor */
+	u8  rlkey_arbel_sched_queue;	/* Reserved on Tavor */
+	u32 usr_page;
+	u32 local_qpn;
+	u32 remote_qpn;
+	u32 reserved1[2];
+	struct mthca_qp_path pri_path;
+	struct mthca_qp_path alt_path;
+	u32 rdd;
+	u32 pd;
+	u32 wqe_base;
+	u32 wqe_lkey;
+	u32 params1;
+	u32 reserved2;
+	u32 next_send_psn;
+	u32 cqn_snd;
+	u32 snd_wqe_base_l;	/* Next send WQE on Tavor */
+	u32 snd_db_index;	/* (debugging only entries) */
+	u32 last_acked_psn;
+	u32 ssn;
+	u32 params2;
+	u32 rnr_nextrecvpsn;
+	u32 ra_buff_indx;
+	u32 cqn_rcv;
+	u32 rcv_wqe_base_l;	/* Next recv WQE on Tavor */
+	u32 rcv_db_index;	/* (debugging only entries) */
+	u32 qkey;
+	u32 srqn;
+	u32 rmsn;
+	u16 rq_wqe_counter;	/* reserved on Tavor */
+	u16 sq_wqe_counter;	/* reserved on Tavor */
+	u32 reserved3[18];
+} __attribute__((packed));
+
+struct mthca_qp_param {
+	u32 opt_param_mask;
+	u32 reserved1;
+	struct mthca_qp_context context;
+	u32 reserved2[62];
+} __attribute__((packed));
+
+enum {
+	MTHCA_QP_OPTPAR_ALT_ADDR_PATH     = 1 << 0,
+	MTHCA_QP_OPTPAR_RRE               = 1 << 1,
+	MTHCA_QP_OPTPAR_RAE               = 1 << 2,
+	MTHCA_QP_OPTPAR_RWE               = 1 << 3,
+	MTHCA_QP_OPTPAR_PKEY_INDEX        = 1 << 4,
+	MTHCA_QP_OPTPAR_Q_KEY             = 1 << 5,
+	MTHCA_QP_OPTPAR_RNR_TIMEOUT       = 1 << 6,
+	MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7,
+	MTHCA_QP_OPTPAR_SRA_MAX           = 1 << 8,
+	MTHCA_QP_OPTPAR_RRA_MAX           = 1 << 9,
+	MTHCA_QP_OPTPAR_PM_STATE          = 1 << 10,
+	MTHCA_QP_OPTPAR_PORT_NUM          = 1 << 11,
+	MTHCA_QP_OPTPAR_RETRY_COUNT       = 1 << 12,
+	MTHCA_QP_OPTPAR_ALT_RNR_RETRY     = 1 << 13,
+	MTHCA_QP_OPTPAR_ACK_TIMEOUT       = 1 << 14,
+	MTHCA_QP_OPTPAR_RNR_RETRY         = 1 << 15,
+	MTHCA_QP_OPTPAR_SCHED_QUEUE       = 1 << 16
+};
+
+enum {
+	MTHCA_OPCODE_NOP            = 0x00,
+	MTHCA_OPCODE_RDMA_WRITE     = 0x08,
+	MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09,
+	MTHCA_OPCODE_SEND           = 0x0a,
+	MTHCA_OPCODE_SEND_IMM       = 0x0b,
+	MTHCA_OPCODE_RDMA_READ      = 0x10,
+	MTHCA_OPCODE_ATOMIC_CS      = 0x11,
+	MTHCA_OPCODE_ATOMIC_FA      = 0x12,
+	MTHCA_OPCODE_BIND_MW        = 0x18,
+	MTHCA_OPCODE_INVALID        = 0xff
+};
+
+enum {
+	MTHCA_NEXT_DBD       = 1 << 7,
+	MTHCA_NEXT_FENCE     = 1 << 6,
+	MTHCA_NEXT_CQ_UPDATE = 1 << 3,
+	MTHCA_NEXT_EVENT_GEN = 1 << 2,
+	MTHCA_NEXT_SOLICIT   = 1 << 1,
+
+	MTHCA_MLX_VL15       = 1 << 17,
+	MTHCA_MLX_SLR        = 1 << 16
+};
+
+struct mthca_next_seg {
+	u32 nda_op;		/* [31:6] next WQE [4:0] next opcode */
+	u32 ee_nds;		/* [31:8] next EE  [7] DBD [6] F [5:0] next WQE size */
+	u32 flags;		/* [3] CQ [2] Event [1] Solicit */
+	u32 imm;		/* immediate data */
+};
+
+struct mthca_tavor_ud_seg {
+	u32 reserved1;
+	u32 lkey;
+	u64 av_addr;
+	u32 reserved2[4];
+	u32 dqpn;
+	u32 qkey;
+	u32 reserved3[2];
+};
+
+struct mthca_arbel_ud_seg {
+	u32 av[8];
+	u32 dqpn;
+	u32 qkey;
+	u32 reserved[2];
+};
+
+struct mthca_bind_seg {
+	u32 flags;		/* [31] Atomic [30] rem write [29] rem read */
+	u32 reserved;
+	u32 new_rkey;
+	u32 lkey;
+	u64 addr;
+	u64 length;
+};
+
+struct mthca_raddr_seg {
+	u64 raddr;
+	u32 rkey;
+	u32 reserved;
+};
+
+struct mthca_atomic_seg {
+	u64 swap_add;
+	u64 compare;
+};
+
+struct mthca_data_seg {
+	u32 byte_count;
+	u32 lkey;
+	u64 addr;
+};
+
+struct mthca_mlx_seg {
+	u32 nda_op;
+	u32 nds;
+	u32 flags;		/* [17] VL15 [16] SLR [14:12] static rate
+				   [11:8] SL [3] C [2] E */
+	u16 rlid;
+	u16 vcrc;
+};
+
+static const u8 mthca_opcode[] = {
+	[IB_WR_SEND]                 = MTHCA_OPCODE_SEND,
+	[IB_WR_SEND_WITH_IMM]        = MTHCA_OPCODE_SEND_IMM,
+	[IB_WR_RDMA_WRITE]           = MTHCA_OPCODE_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM]  = MTHCA_OPCODE_RDMA_WRITE_IMM,
+	[IB_WR_RDMA_READ]            = MTHCA_OPCODE_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP]   = MTHCA_OPCODE_ATOMIC_CS,
+	[IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
+};
+
+static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	return qp->qpn >= dev->qp_table.sqp_start &&
+		qp->qpn <= dev->qp_table.sqp_start + 3;
+}
+
+static int is_qp0(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	return qp->qpn >= dev->qp_table.sqp_start &&
+		qp->qpn <= dev->qp_table.sqp_start + 1;
+}
+
+static void *get_recv_wqe(struct mthca_qp *qp, int n)
+{
+	if (qp->is_direct)
+		return qp->queue.direct.buf + (n << qp->rq.wqe_shift);
+	else
+		return qp->queue.page_list[(n << qp->rq.wqe_shift) >> PAGE_SHIFT].buf +
+			((n << qp->rq.wqe_shift) & (PAGE_SIZE - 1));
+}
+
+static void *get_send_wqe(struct mthca_qp *qp, int n)
+{
+	if (qp->is_direct)
+		return qp->queue.direct.buf + qp->send_wqe_offset +
+			(n << qp->sq.wqe_shift);
+	else
+		return qp->queue.page_list[(qp->send_wqe_offset +
+					    (n << qp->sq.wqe_shift)) >>
+					   PAGE_SHIFT].buf +
+			((qp->send_wqe_offset + (n << qp->sq.wqe_shift)) &
+			 (PAGE_SIZE - 1));
+}
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+		    enum ib_event_type event_type)
+{
+	struct mthca_qp *qp;
+	struct ib_event event;
+
+	spin_lock(&dev->qp_table.lock);
+	qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1));
+	if (qp)
+		atomic_inc(&qp->refcount);
+	spin_unlock(&dev->qp_table.lock);
+
+	if (!qp) {
+		mthca_warn(dev, "Async event for bogus QP %08x\n", qpn);
+		return;
+	}
+
+	event.device      = &dev->ib_dev;
+	event.event       = event_type;
+	event.element.qp  = &qp->ibqp;
+	if (qp->ibqp.event_handler)
+		qp->ibqp.event_handler(&event, qp->ibqp.qp_context);
+
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+}
+
+static int to_mthca_state(enum ib_qp_state ib_state)
+{
+	switch (ib_state) {
+	case IB_QPS_RESET: return MTHCA_QP_STATE_RST;
+	case IB_QPS_INIT:  return MTHCA_QP_STATE_INIT;
+	case IB_QPS_RTR:   return MTHCA_QP_STATE_RTR;
+	case IB_QPS_RTS:   return MTHCA_QP_STATE_RTS;
+	case IB_QPS_SQD:   return MTHCA_QP_STATE_SQD;
+	case IB_QPS_SQE:   return MTHCA_QP_STATE_SQE;
+	case IB_QPS_ERR:   return MTHCA_QP_STATE_ERR;
+	default:                return -1;
+	}
+}
+
+enum { RC, UC, UD, RD, RDEE, MLX, NUM_TRANS };
+
+static int to_mthca_st(int transport)
+{
+	switch (transport) {
+	case RC:  return MTHCA_QP_ST_RC;
+	case UC:  return MTHCA_QP_ST_UC;
+	case UD:  return MTHCA_QP_ST_UD;
+	case RD:  return MTHCA_QP_ST_RD;
+	case MLX: return MTHCA_QP_ST_MLX;
+	default:  return -1;
+	}
+}
+
+static const struct {
+	int trans;
+	u32 req_param[NUM_TRANS];
+	u32 opt_param[NUM_TRANS];
+} state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_INIT]  = {
+			.trans = MTHCA_TRANS_RST2INIT,
+			.req_param = {
+				[UD]  = (IB_QP_PKEY_INDEX |
+					 IB_QP_PORT       |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_PKEY_INDEX |
+					 IB_QP_PORT       |
+					 IB_QP_ACCESS_FLAGS),
+				[MLX] = (IB_QP_PKEY_INDEX |
+					 IB_QP_QKEY),
+			},
+			/* bug-for-bug compatibility with VAPI: */
+			.opt_param = {
+				[MLX] = IB_QP_PORT
+			}
+		},
+	},
+	[IB_QPS_INIT]  = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_INIT]  = {
+			.trans = MTHCA_TRANS_INIT2INIT,
+			.opt_param = {
+				[UD]  = (IB_QP_PKEY_INDEX |
+					 IB_QP_PORT       |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_PKEY_INDEX |
+					 IB_QP_PORT       |
+					 IB_QP_ACCESS_FLAGS),
+				[MLX] = (IB_QP_PKEY_INDEX |
+					 IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_RTR]   = {
+			.trans = MTHCA_TRANS_INIT2RTR,
+			.req_param = {
+				[RC]  = (IB_QP_AV                  |
+					 IB_QP_PATH_MTU            |
+					 IB_QP_DEST_QPN            |
+					 IB_QP_RQ_PSN              |
+					 IB_QP_MAX_DEST_RD_ATOMIC  |
+					 IB_QP_MIN_RNR_TIMER),
+			},
+			.opt_param = {
+				[UD]  = (IB_QP_PKEY_INDEX |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_ALT_PATH     |
+					 IB_QP_ACCESS_FLAGS |
+					 IB_QP_PKEY_INDEX),
+				[MLX] = (IB_QP_PKEY_INDEX |
+					 IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_RTR]   = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_RTS]   = {
+			.trans = MTHCA_TRANS_RTR2RTS,
+			.req_param = {
+				[UD]  = IB_QP_SQ_PSN,
+				[RC]  = (IB_QP_TIMEOUT           |
+					 IB_QP_RETRY_CNT         |
+					 IB_QP_RNR_RETRY         |
+					 IB_QP_SQ_PSN            |
+					 IB_QP_MAX_QP_RD_ATOMIC),
+				[MLX] = IB_QP_SQ_PSN,
+			},
+			.opt_param = {
+				[UD]  = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_CUR_STATE             |
+					 IB_QP_ALT_PATH              |
+					 IB_QP_ACCESS_FLAGS          |
+					 IB_QP_PKEY_INDEX            |
+					 IB_QP_MIN_RNR_TIMER         |
+					 IB_QP_PATH_MIG_STATE),
+				[MLX] = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_RTS]   = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_RTS]   = {
+			.trans = MTHCA_TRANS_RTS2RTS,
+			.opt_param = {
+				[UD]  = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_ACCESS_FLAGS          |
+					 IB_QP_ALT_PATH              |
+					 IB_QP_PATH_MIG_STATE        |
+					 IB_QP_MIN_RNR_TIMER),
+				[MLX] = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_SQD]   = {
+			.trans = MTHCA_TRANS_RTS2SQD,
+		},
+	},
+	[IB_QPS_SQD]   = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_RTS]   = {
+			.trans = MTHCA_TRANS_SQD2RTS,
+			.opt_param = {
+				[UD]  = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_CUR_STATE             |
+					 IB_QP_ALT_PATH              |
+					 IB_QP_ACCESS_FLAGS          |
+					 IB_QP_MIN_RNR_TIMER         |
+					 IB_QP_PATH_MIG_STATE),
+				[MLX] = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_SQD]   = {
+			.trans = MTHCA_TRANS_SQD2SQD,
+			.opt_param = {
+				[UD]  = (IB_QP_PKEY_INDEX            |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_AV                    |
+					 IB_QP_TIMEOUT               |
+					 IB_QP_RETRY_CNT             |
+					 IB_QP_RNR_RETRY             |
+					 IB_QP_MAX_QP_RD_ATOMIC      |
+					 IB_QP_MAX_DEST_RD_ATOMIC    |
+					 IB_QP_CUR_STATE             |
+					 IB_QP_ALT_PATH              |
+					 IB_QP_ACCESS_FLAGS          |
+					 IB_QP_PKEY_INDEX            |
+					 IB_QP_MIN_RNR_TIMER         |
+					 IB_QP_PATH_MIG_STATE),
+				[MLX] = (IB_QP_PKEY_INDEX            |
+					 IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_SQE]   = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_RTS]   = {
+			.trans = MTHCA_TRANS_SQERR2RTS,
+			.opt_param = {
+				[UD]  = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_CUR_STATE             |
+					 IB_QP_MIN_RNR_TIMER),
+				[MLX] = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_ERR] = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR }
+	}
+};
+
+static void store_attrs(struct mthca_sqp *sqp, struct ib_qp_attr *attr,
+			int attr_mask)
+{
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		sqp->pkey_index = attr->pkey_index;
+	if (attr_mask & IB_QP_QKEY)
+		sqp->qkey = attr->qkey;
+	if (attr_mask & IB_QP_SQ_PSN)
+		sqp->send_psn = attr->sq_psn;
+}
+
+static void init_port(struct mthca_dev *dev, int port)
+{
+	int err;
+	u8 status;
+	struct mthca_init_ib_param param;
+
+	memset(&param, 0, sizeof param);
+
+	param.enable_1x = 1;
+	param.enable_4x = 1;
+	param.vl_cap    = dev->limits.vl_cap;
+	param.mtu_cap   = dev->limits.mtu_cap;
+	param.gid_cap   = dev->limits.gid_table_len;
+	param.pkey_cap  = dev->limits.pkey_table_len;
+
+	err = mthca_INIT_IB(dev, &param, port, &status);
+	if (err)
+		mthca_warn(dev, "INIT_IB failed, return code %d.\n", err);
+	if (status)
+		mthca_warn(dev, "INIT_IB returned status %02x.\n", status);
+}
+
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	void *mailbox = NULL;
+	struct mthca_qp_param *qp_param;
+	struct mthca_qp_context *qp_context;
+	u32 req_param, opt_param;
+	u8 status;
+	int err;
+
+	if (attr_mask & IB_QP_CUR_STATE) {
+		if (attr->cur_qp_state != IB_QPS_RTR &&
+		    attr->cur_qp_state != IB_QPS_RTS &&
+		    attr->cur_qp_state != IB_QPS_SQD &&
+		    attr->cur_qp_state != IB_QPS_SQE)
+			return -EINVAL;
+		else
+			cur_state = attr->cur_qp_state;
+	} else {
+		spin_lock_irq(&qp->sq.lock);
+		spin_lock(&qp->rq.lock);
+		cur_state = qp->state;
+		spin_unlock(&qp->rq.lock);
+		spin_unlock_irq(&qp->sq.lock);
+	}
+
+	if (attr_mask & IB_QP_STATE) {
+               if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR)
+			return -EINVAL;
+		new_state = attr->qp_state;
+	} else
+		new_state = cur_state;
+
+	if (state_table[cur_state][new_state].trans == MTHCA_TRANS_INVALID) {
+		mthca_dbg(dev, "Illegal QP transition "
+			  "%d->%d\n", cur_state, new_state);
+		return -EINVAL;
+	}
+
+	req_param = state_table[cur_state][new_state].req_param[qp->transport];
+	opt_param = state_table[cur_state][new_state].opt_param[qp->transport];
+
+	if ((req_param & attr_mask) != req_param) {
+		mthca_dbg(dev, "QP transition "
+			  "%d->%d missing req attr 0x%08x\n",
+			  cur_state, new_state,
+			  req_param & ~attr_mask);
+		return -EINVAL;
+	}
+
+	if (attr_mask & ~(req_param | opt_param | IB_QP_STATE)) {
+		mthca_dbg(dev, "QP transition (transport %d) "
+			  "%d->%d has extra attr 0x%08x\n",
+			  qp->transport,
+			  cur_state, new_state,
+			  attr_mask & ~(req_param | opt_param |
+						 IB_QP_STATE));
+		return -EINVAL;
+	}
+
+	mailbox = kmalloc(sizeof (*qp_param) + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+	if (!mailbox)
+		return -ENOMEM;
+	qp_param = MAILBOX_ALIGN(mailbox);
+	qp_context = &qp_param->context;
+	memset(qp_param, 0, sizeof *qp_param);
+
+	qp_context->flags      = cpu_to_be32((to_mthca_state(new_state) << 28) |
+					     (to_mthca_st(qp->transport) << 16));
+	qp_context->flags     |= cpu_to_be32(MTHCA_QP_BIT_DE);
+	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+		qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+	else {
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE);
+		switch (attr->path_mig_state) {
+		case IB_MIG_MIGRATED:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+			break;
+		case IB_MIG_REARM:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11);
+			break;
+		case IB_MIG_ARMED:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11);
+			break;
+		}
+	}
+
+	/* leave tavor_sched_queue as 0 */
+
+	if (qp->transport == MLX || qp->transport == UD)
+		qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11;
+	else if (attr_mask & IB_QP_PATH_MTU)
+		qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		qp_context->rq_size_stride =
+			((ffs(qp->rq.max) - 1) << 3) | (qp->rq.wqe_shift - 4);
+		qp_context->sq_size_stride =
+			((ffs(qp->sq.max) - 1) << 3) | (qp->sq.wqe_shift - 4);
+	}
+
+	/* leave arbel_sched_queue as 0 */
+
+	qp_context->usr_page   = cpu_to_be32(dev->driver_uar.index);
+	qp_context->local_qpn  = cpu_to_be32(qp->qpn);
+	if (attr_mask & IB_QP_DEST_QPN) {
+		qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+	}
+
+	if (qp->transport == MLX)
+		qp_context->pri_path.port_pkey |=
+			cpu_to_be32(to_msqp(qp)->port << 24);
+	else {
+		if (attr_mask & IB_QP_PORT) {
+			qp_context->pri_path.port_pkey |=
+				cpu_to_be32(attr->port_num << 24);
+			qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM);
+		}
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		qp_context->pri_path.port_pkey |=
+			cpu_to_be32(attr->pkey_index);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX);
+	}
+
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		qp_context->pri_path.rnr_retry = attr->rnr_retry << 5;
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY);
+	}
+
+	if (attr_mask & IB_QP_AV) {
+		qp_context->pri_path.g_mylmc     = attr->ah_attr.src_path_bits & 0x7f;
+		qp_context->pri_path.rlid        = cpu_to_be16(attr->ah_attr.dlid);
+		qp_context->pri_path.static_rate = (!!attr->ah_attr.static_rate) << 3;
+		if (attr->ah_attr.ah_flags & IB_AH_GRH) {
+			qp_context->pri_path.g_mylmc |= 1 << 7;
+			qp_context->pri_path.mgid_index = attr->ah_attr.grh.sgid_index;
+			qp_context->pri_path.hop_limit = attr->ah_attr.grh.hop_limit;
+			qp_context->pri_path.sl_tclass_flowlabel =
+				cpu_to_be32((attr->ah_attr.sl << 28)                |
+					    (attr->ah_attr.grh.traffic_class << 20) |
+					    (attr->ah_attr.grh.flow_label));
+			memcpy(qp_context->pri_path.rgid,
+			       attr->ah_attr.grh.dgid.raw, 16);
+		} else {
+			qp_context->pri_path.sl_tclass_flowlabel =
+				cpu_to_be32(attr->ah_attr.sl << 28);
+		}
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH);
+	}
+
+	if (attr_mask & IB_QP_TIMEOUT) {
+		qp_context->pri_path.ackto = attr->timeout;
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT);
+	}
+
+	/* XXX alt_path */
+
+	/* leave rdd as 0 */
+	qp_context->pd         = cpu_to_be32(to_mpd(ibqp->pd)->pd_num);
+	/* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */
+	qp_context->wqe_lkey   = cpu_to_be32(qp->mr.ibmr.lkey);
+	qp_context->params1    = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) |
+					     (MTHCA_FLIGHT_LIMIT << 24) |
+					     MTHCA_QP_BIT_SRE           |
+					     MTHCA_QP_BIT_SWE           |
+					     MTHCA_QP_BIT_SAE);
+	if (qp->sq_policy == IB_SIGNAL_ALL_WR)
+		qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC);
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT);
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+		qp_context->params1 |= cpu_to_be32(min(attr->max_dest_rd_atomic ?
+						       ffs(attr->max_dest_rd_atomic) - 1 : 0,
+						       7) << 21);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX);
+	}
+
+	if (attr_mask & IB_QP_SQ_PSN)
+		qp_context->next_send_psn = cpu_to_be32(attr->sq_psn);
+	qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn);
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		qp_context->snd_wqe_base_l = cpu_to_be32(qp->send_wqe_offset);
+		qp_context->snd_db_index   = cpu_to_be32(qp->sq.db_index);
+	}
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS) {
+		/*
+		 * Only enable RDMA/atomics if we have responder
+		 * resources set to a non-zero value.
+		 */
+		if (qp->resp_depth) {
+			qp_context->params2 |=
+				cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE ?
+					    MTHCA_QP_BIT_RWE : 0);
+			qp_context->params2 |=
+				cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_READ ?
+					    MTHCA_QP_BIT_RRE : 0);
+			qp_context->params2 |=
+				cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC ?
+					    MTHCA_QP_BIT_RAE : 0);
+		}
+
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
+							MTHCA_QP_OPTPAR_RRE |
+							MTHCA_QP_OPTPAR_RAE);
+
+		qp->atomic_rd_en = attr->qp_access_flags;
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+		u8 rra_max;
+
+		if (qp->resp_depth && !attr->max_rd_atomic) {
+			/*
+			 * Lowering our responder resources to zero.
+			 * Turn off RDMA/atomics as responder.
+			 * (RWE/RRE/RAE in params2 already zero)
+			 */
+			qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
+								MTHCA_QP_OPTPAR_RRE |
+								MTHCA_QP_OPTPAR_RAE);
+		}
+
+		if (!qp->resp_depth && attr->max_rd_atomic) {
+			/*
+			 * Increasing our responder resources from
+			 * zero.  Turn on RDMA/atomics as appropriate.
+			 */
+			qp_context->params2 |=
+				cpu_to_be32(qp->atomic_rd_en & IB_ACCESS_REMOTE_WRITE ?
+					    MTHCA_QP_BIT_RWE : 0);
+			qp_context->params2 |=
+				cpu_to_be32(qp->atomic_rd_en & IB_ACCESS_REMOTE_READ ?
+					    MTHCA_QP_BIT_RRE : 0);
+			qp_context->params2 |=
+				cpu_to_be32(qp->atomic_rd_en & IB_ACCESS_REMOTE_ATOMIC ?
+					    MTHCA_QP_BIT_RAE : 0);
+
+			qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
+								MTHCA_QP_OPTPAR_RRE |
+								MTHCA_QP_OPTPAR_RAE);
+		}
+
+		for (rra_max = 0;
+		     1 << rra_max < attr->max_rd_atomic &&
+			     rra_max < dev->qp_table.rdb_shift;
+		     ++rra_max)
+			; /* nothing */
+
+		qp_context->params2      |= cpu_to_be32(rra_max << 21);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX);
+
+		qp->resp_depth = attr->max_rd_atomic;
+	}
+
+	qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC);
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+		qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT);
+	}
+	if (attr_mask & IB_QP_RQ_PSN)
+		qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+	qp_context->ra_buff_indx =
+		cpu_to_be32(dev->qp_table.rdb_base +
+			    ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE <<
+			     dev->qp_table.rdb_shift));
+
+	qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn);
+
+	if (dev->hca_type == ARBEL_NATIVE)
+		qp_context->rcv_db_index   = cpu_to_be32(qp->rq.db_index);
+
+	if (attr_mask & IB_QP_QKEY) {
+		qp_context->qkey = cpu_to_be32(attr->qkey);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY);
+	}
+
+	err = mthca_MODIFY_QP(dev, state_table[cur_state][new_state].trans,
+			      qp->qpn, 0, qp_param, 0, &status);
+	if (status) {
+		mthca_warn(dev, "modify QP %d returned status %02x.\n",
+			   state_table[cur_state][new_state].trans, status);
+		err = -EINVAL;
+	}
+
+	if (!err)
+		qp->state = new_state;
+
+	kfree(mailbox);
+
+	if (is_sqp(dev, qp))
+		store_attrs(to_msqp(qp), attr, attr_mask);
+
+	/*
+	 * If we are moving QP0 to RTR, bring the IB link up; if we
+	 * are moving QP0 to RESET or ERROR, bring the link back down.
+	 */
+	if (is_qp0(dev, qp)) {
+		if (cur_state != IB_QPS_RTR &&
+		    new_state == IB_QPS_RTR)
+			init_port(dev, to_msqp(qp)->port);
+
+		if (cur_state != IB_QPS_RESET &&
+		    cur_state != IB_QPS_ERR &&
+		    (new_state == IB_QPS_RESET ||
+		     new_state == IB_QPS_ERR))
+			mthca_CLOSE_IB(dev, to_msqp(qp)->port, &status);
+	}
+
+	return err;
+}
+
+/*
+ * Allocate and register buffer for WQEs.  qp->rq.max, sq.max,
+ * rq.max_gs and sq.max_gs must all be assigned.
+ * mthca_alloc_wqe_buf will calculate rq.wqe_shift and
+ * sq.wqe_shift (as well as send_wqe_offset, is_direct, and
+ * queue)
+ */
+static int mthca_alloc_wqe_buf(struct mthca_dev *dev,
+			       struct mthca_pd *pd,
+			       struct mthca_qp *qp)
+{
+	int size;
+	int i;
+	int npages, shift;
+	dma_addr_t t;
+	u64 *dma_list = NULL;
+	int err = -ENOMEM;
+
+	size = sizeof (struct mthca_next_seg) +
+		qp->rq.max_gs * sizeof (struct mthca_data_seg);
+
+	for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
+	     qp->rq.wqe_shift++)
+		; /* nothing */
+
+	size = sizeof (struct mthca_next_seg) +
+		qp->sq.max_gs * sizeof (struct mthca_data_seg);
+	switch (qp->transport) {
+	case MLX:
+		size += 2 * sizeof (struct mthca_data_seg);
+		break;
+	case UD:
+		if (dev->hca_type == ARBEL_NATIVE)
+			size += sizeof (struct mthca_arbel_ud_seg);
+		else
+			size += sizeof (struct mthca_tavor_ud_seg);
+		break;
+	default:
+		/* bind seg is as big as atomic + raddr segs */
+		size += sizeof (struct mthca_bind_seg);
+	}
+
+	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
+	     qp->sq.wqe_shift++)
+		; /* nothing */
+
+	qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift,
+				    1 << qp->sq.wqe_shift);
+	size = PAGE_ALIGN(qp->send_wqe_offset +
+			  (qp->sq.max << qp->sq.wqe_shift));
+
+	qp->wrid = kmalloc((qp->rq.max + qp->sq.max) * sizeof (u64),
+			   GFP_KERNEL);
+	if (!qp->wrid)
+		goto err_out;
+
+	if (size <= MTHCA_MAX_DIRECT_QP_SIZE) {
+		qp->is_direct = 1;
+		npages = 1;
+		shift = get_order(size) + PAGE_SHIFT;
+
+		if (0)
+			mthca_dbg(dev, "Creating direct QP of size %d (shift %d)\n",
+				  size, shift);
+
+		qp->queue.direct.buf = pci_alloc_consistent(dev->pdev, size, &t);
+		if (!qp->queue.direct.buf)
+			goto err_out;
+
+		pci_unmap_addr_set(&qp->queue.direct, mapping, t);
+
+		memset(qp->queue.direct.buf, 0, size);
+
+		while (t & ((1 << shift) - 1)) {
+			--shift;
+			npages *= 2;
+		}
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			goto err_out_free;
+
+		for (i = 0; i < npages; ++i)
+			dma_list[i] = t + i * (1 << shift);
+	} else {
+		qp->is_direct = 0;
+		npages = size / PAGE_SIZE;
+		shift = PAGE_SHIFT;
+
+		if (0)
+			mthca_dbg(dev, "Creating indirect QP with %d pages\n", npages);
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			goto err_out;
+
+		qp->queue.page_list = kmalloc(npages *
+					      sizeof *qp->queue.page_list,
+					      GFP_KERNEL);
+		if (!qp->queue.page_list)
+			goto err_out;
+
+		for (i = 0; i < npages; ++i) {
+			qp->queue.page_list[i].buf =
+				pci_alloc_consistent(dev->pdev, PAGE_SIZE, &t);
+			if (!qp->queue.page_list[i].buf)
+				goto err_out_free;
+
+			memset(qp->queue.page_list[i].buf, 0, PAGE_SIZE);
+
+			pci_unmap_addr_set(&qp->queue.page_list[i], mapping, t);
+			dma_list[i] = t;
+		}
+	}
+
+	err = mthca_mr_alloc_phys(dev, pd->pd_num, dma_list, shift,
+				  npages, 0, size,
+				  MTHCA_MPT_FLAG_LOCAL_READ,
+				  &qp->mr);
+	if (err)
+		goto err_out_free;
+
+	kfree(dma_list);
+	return 0;
+
+ err_out_free:
+	if (qp->is_direct) {
+		pci_free_consistent(dev->pdev, size,
+				    qp->queue.direct.buf,
+				    pci_unmap_addr(&qp->queue.direct, mapping));
+	} else
+		for (i = 0; i < npages; ++i) {
+			if (qp->queue.page_list[i].buf)
+				pci_free_consistent(dev->pdev, PAGE_SIZE,
+						    qp->queue.page_list[i].buf,
+						    pci_unmap_addr(&qp->queue.page_list[i],
+								   mapping));
+
+		}
+
+ err_out:
+	kfree(qp->wrid);
+	kfree(dma_list);
+	return err;
+}
+
+static int mthca_alloc_memfree(struct mthca_dev *dev,
+			       struct mthca_qp *qp)
+{
+	int ret = 0;
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn);
+		if (ret)
+			return ret;
+
+		ret = mthca_table_get(dev, dev->qp_table.eqp_table, qp->qpn);
+		if (ret)
+			goto err_qpc;
+
+		qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ,
+						 qp->qpn, &qp->rq.db);
+		if (qp->rq.db_index < 0) {
+			ret = -ENOMEM;
+			goto err_eqpc;
+		}
+
+		qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ,
+						 qp->qpn, &qp->sq.db);
+		if (qp->sq.db_index < 0) {
+			ret = -ENOMEM;
+			goto err_rq_db;
+		}
+	}
+
+	return 0;
+
+err_rq_db:
+	mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+
+err_eqpc:
+	mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+
+err_qpc:
+	mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+
+	return ret;
+}
+
+static void mthca_free_memfree(struct mthca_dev *dev,
+			       struct mthca_qp *qp)
+{
+	if (dev->hca_type == ARBEL_NATIVE) {
+		mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index);
+		mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+		mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+		mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+	}
+}
+
+static void mthca_wq_init(struct mthca_wq* wq)
+{
+	spin_lock_init(&wq->lock);
+	wq->next_ind  = 0;
+	wq->last_comp = wq->max - 1;
+	wq->head      = 0;
+	wq->tail      = 0;
+	wq->last      = NULL;
+}
+
+static int mthca_alloc_qp_common(struct mthca_dev *dev,
+				 struct mthca_pd *pd,
+				 struct mthca_cq *send_cq,
+				 struct mthca_cq *recv_cq,
+				 enum ib_sig_type send_policy,
+				 struct mthca_qp *qp)
+{
+	struct mthca_next_seg *wqe;
+	int ret;
+	int i;
+
+	atomic_set(&qp->refcount, 1);
+	qp->state    	 = IB_QPS_RESET;
+	qp->atomic_rd_en = 0;
+	qp->resp_depth   = 0;
+	qp->sq_policy    = send_policy;
+	mthca_wq_init(&qp->sq);
+	mthca_wq_init(&qp->rq);
+
+	ret = mthca_alloc_memfree(dev, qp);
+	if (ret)
+		return ret;
+
+	ret = mthca_alloc_wqe_buf(dev, pd, qp);
+	if (ret) {
+		mthca_free_memfree(dev, qp);
+		return ret;
+	}
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		for (i = 0; i < qp->rq.max; ++i) {
+			wqe = get_recv_wqe(qp, i);
+			wqe->nda_op = cpu_to_be32(((i + 1) & (qp->rq.max - 1)) <<
+						  qp->rq.wqe_shift);
+			wqe->ee_nds = cpu_to_be32(1 << (qp->rq.wqe_shift - 4));
+		}
+
+		for (i = 0; i < qp->sq.max; ++i) {
+			wqe = get_send_wqe(qp, i);
+			wqe->nda_op = cpu_to_be32((((i + 1) & (qp->sq.max - 1)) <<
+						   qp->sq.wqe_shift) +
+						  qp->send_wqe_offset);
+		}
+	}
+
+	return 0;
+}
+
+static void mthca_align_qp_size(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	int i;
+
+	if (dev->hca_type != ARBEL_NATIVE)
+		return;
+
+	for (i = 0; 1 << i < qp->rq.max; ++i)
+		; /* nothing */
+
+	qp->rq.max = 1 << i;
+
+	for (i = 0; 1 << i < qp->sq.max; ++i)
+		; /* nothing */
+
+	qp->sq.max = 1 << i;
+}
+
+int mthca_alloc_qp(struct mthca_dev *dev,
+		   struct mthca_pd *pd,
+		   struct mthca_cq *send_cq,
+		   struct mthca_cq *recv_cq,
+		   enum ib_qp_type type,
+		   enum ib_sig_type send_policy,
+		   struct mthca_qp *qp)
+{
+	int err;
+
+	mthca_align_qp_size(dev, qp);
+
+	switch (type) {
+	case IB_QPT_RC: qp->transport = RC; break;
+	case IB_QPT_UC: qp->transport = UC; break;
+	case IB_QPT_UD: qp->transport = UD; break;
+	default: return -EINVAL;
+	}
+
+	qp->qpn = mthca_alloc(&dev->qp_table.alloc);
+	if (qp->qpn == -1)
+		return -ENOMEM;
+
+	err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+				    send_policy, qp);
+	if (err) {
+		mthca_free(&dev->qp_table.alloc, qp->qpn);
+		return err;
+	}
+
+	spin_lock_irq(&dev->qp_table.lock);
+	mthca_array_set(&dev->qp_table.qp,
+			qp->qpn & (dev->limits.num_qps - 1), qp);
+	spin_unlock_irq(&dev->qp_table.lock);
+
+	return 0;
+}
+
+int mthca_alloc_sqp(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct mthca_cq *send_cq,
+		    struct mthca_cq *recv_cq,
+		    enum ib_sig_type send_policy,
+		    int qpn,
+		    int port,
+		    struct mthca_sqp *sqp)
+{
+	int err = 0;
+	u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1;
+
+	mthca_align_qp_size(dev, &sqp->qp);
+
+	sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE;
+	sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size,
+					     &sqp->header_dma, GFP_KERNEL);
+	if (!sqp->header_buf)
+		return -ENOMEM;
+
+	spin_lock_irq(&dev->qp_table.lock);
+	if (mthca_array_get(&dev->qp_table.qp, mqpn))
+		err = -EBUSY;
+	else
+		mthca_array_set(&dev->qp_table.qp, mqpn, sqp);
+	spin_unlock_irq(&dev->qp_table.lock);
+
+	if (err)
+		goto err_out;
+
+	sqp->port = port;
+	sqp->qp.qpn       = mqpn;
+	sqp->qp.transport = MLX;
+
+	err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+				    send_policy, &sqp->qp);
+	if (err)
+		goto err_out_free;
+
+	atomic_inc(&pd->sqp_count);
+
+	return 0;
+
+ err_out_free:
+	/*
+	 * Lock CQs here, so that CQ polling code can do QP lookup
+	 * without taking a lock.
+	 */
+	spin_lock_irq(&send_cq->lock);
+	if (send_cq != recv_cq)
+		spin_lock(&recv_cq->lock);
+
+	spin_lock(&dev->qp_table.lock);
+	mthca_array_clear(&dev->qp_table.qp, mqpn);
+	spin_unlock(&dev->qp_table.lock);
+
+	if (send_cq != recv_cq)
+		spin_unlock(&recv_cq->lock);
+	spin_unlock_irq(&send_cq->lock);
+
+ err_out:
+	dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size,
+			  sqp->header_buf, sqp->header_dma);
+
+	return err;
+}
+
+void mthca_free_qp(struct mthca_dev *dev,
+		   struct mthca_qp *qp)
+{
+	u8 status;
+	int size;
+	int i;
+	struct mthca_cq *send_cq;
+	struct mthca_cq *recv_cq;
+
+	send_cq = to_mcq(qp->ibqp.send_cq);
+	recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+	/*
+	 * Lock CQs here, so that CQ polling code can do QP lookup
+	 * without taking a lock.
+	 */
+	spin_lock_irq(&send_cq->lock);
+	if (send_cq != recv_cq)
+		spin_lock(&recv_cq->lock);
+
+	spin_lock(&dev->qp_table.lock);
+	mthca_array_clear(&dev->qp_table.qp,
+			  qp->qpn & (dev->limits.num_qps - 1));
+	spin_unlock(&dev->qp_table.lock);
+
+	if (send_cq != recv_cq)
+		spin_unlock(&recv_cq->lock);
+	spin_unlock_irq(&send_cq->lock);
+
+	atomic_dec(&qp->refcount);
+	wait_event(qp->wait, !atomic_read(&qp->refcount));
+
+	if (qp->state != IB_QPS_RESET)
+		mthca_MODIFY_QP(dev, MTHCA_TRANS_ANY2RST, qp->qpn, 0, NULL, 0, &status);
+
+	mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq)->cqn, qp->qpn);
+	if (qp->ibqp.send_cq != qp->ibqp.recv_cq)
+		mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq)->cqn, qp->qpn);
+
+	mthca_free_mr(dev, &qp->mr);
+
+	size = PAGE_ALIGN(qp->send_wqe_offset +
+			  (qp->sq.max << qp->sq.wqe_shift));
+
+	if (qp->is_direct) {
+		pci_free_consistent(dev->pdev, size,
+				    qp->queue.direct.buf,
+				    pci_unmap_addr(&qp->queue.direct, mapping));
+	} else {
+		for (i = 0; i < size / PAGE_SIZE; ++i) {
+			pci_free_consistent(dev->pdev, PAGE_SIZE,
+					    qp->queue.page_list[i].buf,
+					    pci_unmap_addr(&qp->queue.page_list[i],
+							   mapping));
+		}
+	}
+
+	kfree(qp->wrid);
+
+	mthca_free_memfree(dev, qp);
+
+	if (is_sqp(dev, qp)) {
+		atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count));
+		dma_free_coherent(&dev->pdev->dev,
+				  to_msqp(qp)->header_buf_size,
+				  to_msqp(qp)->header_buf,
+				  to_msqp(qp)->header_dma);
+	} else
+		mthca_free(&dev->qp_table.alloc, qp->qpn);
+}
+
+/* Create UD header for an MLX send and build a data segment for it */
+static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp,
+			    int ind, struct ib_send_wr *wr,
+			    struct mthca_mlx_seg *mlx,
+			    struct mthca_data_seg *data)
+{
+	int header_size;
+	int err;
+
+	ib_ud_header_init(256, /* assume a MAD */
+			  sqp->ud_header.grh_present,
+			  &sqp->ud_header);
+
+	err = mthca_read_ah(dev, to_mah(wr->wr.ud.ah), &sqp->ud_header);
+	if (err)
+		return err;
+	mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1);
+	mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) |
+				  (sqp->ud_header.lrh.destination_lid == 0xffff ?
+				   MTHCA_MLX_SLR : 0) |
+				  (sqp->ud_header.lrh.service_level << 8));
+	mlx->rlid = sqp->ud_header.lrh.destination_lid;
+	mlx->vcrc = 0;
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+		sqp->ud_header.immediate_present = 0;
+		break;
+	case IB_WR_SEND_WITH_IMM:
+		sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		sqp->ud_header.immediate_present = 1;
+		sqp->ud_header.immediate_data = wr->imm_data;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
+	if (sqp->ud_header.lrh.destination_lid == 0xffff)
+		sqp->ud_header.lrh.source_lid = 0xffff;
+	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+	if (!sqp->qp.ibqp.qp_num)
+		ib_get_cached_pkey(&dev->ib_dev, sqp->port,
+				   sqp->pkey_index,
+				   &sqp->ud_header.bth.pkey);
+	else
+		ib_get_cached_pkey(&dev->ib_dev, sqp->port,
+				   wr->wr.ud.pkey_index,
+				   &sqp->ud_header.bth.pkey);
+	cpu_to_be16s(&sqp->ud_header.bth.pkey);
+	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+	sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+					       sqp->qkey : wr->wr.ud.remote_qkey);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+	header_size = ib_ud_header_pack(&sqp->ud_header,
+					sqp->header_buf +
+					ind * MTHCA_UD_HEADER_SIZE);
+
+	data->byte_count = cpu_to_be32(header_size);
+	data->lkey       = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey);
+	data->addr       = cpu_to_be64(sqp->header_dma +
+				       ind * MTHCA_UD_HEADER_SIZE);
+
+	return 0;
+}
+
+static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq,
+				    struct ib_cq *ib_cq)
+{
+	unsigned cur;
+	struct mthca_cq *cq;
+
+	cur = wq->head - wq->tail;
+	if (likely(cur + nreq < wq->max))
+		return 0;
+
+	cq = to_mcq(ib_cq);
+	spin_lock(&cq->lock);
+	cur = wq->head - wq->tail;
+	spin_unlock(&cq->lock);
+
+	return cur + nreq >= wq->max;
+}
+
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	void *wqe;
+	void *prev_wqe;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+	int size;
+	int size0 = 0;
+	u32 f0 = 0;
+	int ind;
+	u8 op0 = 0;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	/* XXX check that state is OK to post send */
+
+	ind = qp->sq.next_ind;
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+			mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->sq.head, qp->sq.tail,
+					qp->sq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_send_wqe(qp, ind);
+		prev_wqe = qp->sq.last;
+		qp->sq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->nda_op = 0;
+		((struct mthca_next_seg *) wqe)->ee_nds = 0;
+		((struct mthca_next_seg *) wqe)->flags =
+			((wr->send_flags & IB_SEND_SIGNALED) ?
+			 cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+			((wr->send_flags & IB_SEND_SOLICITED) ?
+			 cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0)   |
+			cpu_to_be32(1);
+		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+			((struct mthca_next_seg *) wqe)->flags = wr->imm_data;
+
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		switch (qp->transport) {
+		case RC:
+			switch (wr->opcode) {
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				((struct mthca_raddr_seg *) wqe)->raddr =
+					cpu_to_be64(wr->wr.atomic.remote_addr);
+				((struct mthca_raddr_seg *) wqe)->rkey =
+					cpu_to_be32(wr->wr.atomic.rkey);
+				((struct mthca_raddr_seg *) wqe)->reserved = 0;
+
+				wqe += sizeof (struct mthca_raddr_seg);
+
+				if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+					((struct mthca_atomic_seg *) wqe)->swap_add =
+						cpu_to_be64(wr->wr.atomic.swap);
+					((struct mthca_atomic_seg *) wqe)->compare =
+						cpu_to_be64(wr->wr.atomic.compare_add);
+				} else {
+					((struct mthca_atomic_seg *) wqe)->swap_add =
+						cpu_to_be64(wr->wr.atomic.compare_add);
+					((struct mthca_atomic_seg *) wqe)->compare = 0;
+				}
+
+				wqe += sizeof (struct mthca_atomic_seg);
+				size += sizeof (struct mthca_raddr_seg) / 16 +
+					sizeof (struct mthca_atomic_seg);
+				break;
+
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+			case IB_WR_RDMA_READ:
+				((struct mthca_raddr_seg *) wqe)->raddr =
+					cpu_to_be64(wr->wr.rdma.remote_addr);
+				((struct mthca_raddr_seg *) wqe)->rkey =
+					cpu_to_be32(wr->wr.rdma.rkey);
+				((struct mthca_raddr_seg *) wqe)->reserved = 0;
+				wqe += sizeof (struct mthca_raddr_seg);
+				size += sizeof (struct mthca_raddr_seg) / 16;
+				break;
+
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+
+			break;
+
+		case UD:
+			((struct mthca_tavor_ud_seg *) wqe)->lkey =
+				cpu_to_be32(to_mah(wr->wr.ud.ah)->key);
+			((struct mthca_tavor_ud_seg *) wqe)->av_addr =
+				cpu_to_be64(to_mah(wr->wr.ud.ah)->avdma);
+			((struct mthca_tavor_ud_seg *) wqe)->dqpn =
+				cpu_to_be32(wr->wr.ud.remote_qpn);
+			((struct mthca_tavor_ud_seg *) wqe)->qkey =
+				cpu_to_be32(wr->wr.ud.remote_qkey);
+
+			wqe += sizeof (struct mthca_tavor_ud_seg);
+			size += sizeof (struct mthca_tavor_ud_seg) / 16;
+			break;
+
+		case MLX:
+			err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+					       wqe - sizeof (struct mthca_next_seg),
+					       wqe);
+			if (err) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+			break;
+		}
+
+		if (wr->num_sge > qp->sq.max_gs) {
+			mthca_err(dev, "too many gathers\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32(wr->sg_list[i].length);
+			((struct mthca_data_seg *) wqe)->lkey =
+				cpu_to_be32(wr->sg_list[i].lkey);
+			((struct mthca_data_seg *) wqe)->addr =
+				cpu_to_be64(wr->sg_list[i].addr);
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		/* Add one more inline data segment for ICRC */
+		if (qp->transport == MLX) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32((1 << 31) | 4);
+			((u32 *) wqe)[1] = 0;
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+		if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+			mthca_err(dev, "opcode invalid\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (prev_wqe) {
+			((struct mthca_next_seg *) prev_wqe)->nda_op =
+				cpu_to_be32(((ind << qp->sq.wqe_shift) +
+					     qp->send_wqe_offset) |
+					    mthca_opcode[wr->opcode]);
+			wmb();
+			((struct mthca_next_seg *) prev_wqe)->ee_nds =
+				cpu_to_be32((size0 ? 0 : MTHCA_NEXT_DBD) | size);
+		}
+
+		if (!size0) {
+			size0 = size;
+			op0   = mthca_opcode[wr->opcode];
+		}
+
+		++ind;
+		if (unlikely(ind >= qp->sq.max))
+			ind -= qp->sq.max;
+	}
+
+out:
+	if (likely(nreq)) {
+		u32 doorbell[2];
+
+		doorbell[0] = cpu_to_be32(((qp->sq.next_ind << qp->sq.wqe_shift) +
+					   qp->send_wqe_offset) | f0 | op0);
+		doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0);
+
+		wmb();
+
+		mthca_write64(doorbell,
+			      dev->kar + MTHCA_SEND_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+
+	qp->sq.next_ind = ind;
+	qp->sq.head    += nreq;
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+	return err;
+}
+
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+	int size;
+	int size0 = 0;
+	int ind;
+	void *wqe;
+	void *prev_wqe;
+
+	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	/* XXX check that state is OK to post receive */
+
+	ind = qp->rq.next_ind;
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+			mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->rq.head, qp->rq.tail,
+					qp->rq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_recv_wqe(qp, ind);
+		prev_wqe = qp->rq.last;
+		qp->rq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->nda_op = 0;
+		((struct mthca_next_seg *) wqe)->ee_nds =
+			cpu_to_be32(MTHCA_NEXT_DBD);
+		((struct mthca_next_seg *) wqe)->flags = 0;
+
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32(wr->sg_list[i].length);
+			((struct mthca_data_seg *) wqe)->lkey =
+				cpu_to_be32(wr->sg_list[i].lkey);
+			((struct mthca_data_seg *) wqe)->addr =
+				cpu_to_be64(wr->sg_list[i].addr);
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		qp->wrid[ind] = wr->wr_id;
+
+		if (likely(prev_wqe)) {
+			((struct mthca_next_seg *) prev_wqe)->nda_op =
+				cpu_to_be32((ind << qp->rq.wqe_shift) | 1);
+			wmb();
+			((struct mthca_next_seg *) prev_wqe)->ee_nds =
+				cpu_to_be32(MTHCA_NEXT_DBD | size);
+		}
+
+		if (!size0)
+			size0 = size;
+
+		++ind;
+		if (unlikely(ind >= qp->rq.max))
+			ind -= qp->rq.max;
+	}
+
+out:
+	if (likely(nreq)) {
+		u32 doorbell[2];
+
+		doorbell[0] = cpu_to_be32((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
+		doorbell[1] = cpu_to_be32((qp->qpn << 8) | nreq);
+
+		wmb();
+
+		mthca_write64(doorbell,
+			      dev->kar + MTHCA_RECEIVE_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+
+	qp->rq.next_ind = ind;
+	qp->rq.head    += nreq;
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+	return err;
+}
+
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	void *wqe;
+	void *prev_wqe;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+	int size;
+	int size0 = 0;
+	u32 f0 = 0;
+	int ind;
+	u8 op0 = 0;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	/* XXX check that state is OK to post send */
+
+	ind = qp->sq.head & (qp->sq.max - 1);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+			mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->sq.head, qp->sq.tail,
+					qp->sq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_send_wqe(qp, ind);
+		prev_wqe = qp->sq.last;
+		qp->sq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->flags =
+			((wr->send_flags & IB_SEND_SIGNALED) ?
+			 cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+			((wr->send_flags & IB_SEND_SOLICITED) ?
+			 cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0)   |
+			cpu_to_be32(1);
+		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+			((struct mthca_next_seg *) wqe)->flags = wr->imm_data;
+
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		switch (qp->transport) {
+		case UD:
+			memcpy(((struct mthca_arbel_ud_seg *) wqe)->av,
+			       to_mah(wr->wr.ud.ah)->av, MTHCA_AV_SIZE);
+			((struct mthca_arbel_ud_seg *) wqe)->dqpn =
+				cpu_to_be32(wr->wr.ud.remote_qpn);
+			((struct mthca_arbel_ud_seg *) wqe)->qkey =
+				cpu_to_be32(wr->wr.ud.remote_qkey);
+
+			wqe += sizeof (struct mthca_arbel_ud_seg);
+			size += sizeof (struct mthca_arbel_ud_seg) / 16;
+			break;
+
+		case MLX:
+			err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+					       wqe - sizeof (struct mthca_next_seg),
+					       wqe);
+			if (err) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+			break;
+		}
+
+		if (wr->num_sge > qp->sq.max_gs) {
+			mthca_err(dev, "too many gathers\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32(wr->sg_list[i].length);
+			((struct mthca_data_seg *) wqe)->lkey =
+				cpu_to_be32(wr->sg_list[i].lkey);
+			((struct mthca_data_seg *) wqe)->addr =
+				cpu_to_be64(wr->sg_list[i].addr);
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		/* Add one more inline data segment for ICRC */
+		if (qp->transport == MLX) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32((1 << 31) | 4);
+			((u32 *) wqe)[1] = 0;
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+		if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+			mthca_err(dev, "opcode invalid\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (likely(prev_wqe)) {
+			((struct mthca_next_seg *) prev_wqe)->nda_op =
+				cpu_to_be32(((ind << qp->sq.wqe_shift) +
+					     qp->send_wqe_offset) |
+					    mthca_opcode[wr->opcode]);
+			wmb();
+			((struct mthca_next_seg *) prev_wqe)->ee_nds =
+				cpu_to_be32(MTHCA_NEXT_DBD | size);
+		}
+
+		if (!size0) {
+			size0 = size;
+			op0   = mthca_opcode[wr->opcode];
+		}
+
+		++ind;
+		if (unlikely(ind >= qp->sq.max))
+			ind -= qp->sq.max;
+	}
+
+out:
+	if (likely(nreq)) {
+		u32 doorbell[2];
+
+		doorbell[0] = cpu_to_be32((nreq << 24)                  |
+					  ((qp->sq.head & 0xffff) << 8) |
+					  f0 | op0);
+		doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0);
+
+		qp->sq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+		*qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff);
+
+		/*
+		 * Make sure doorbell record is written before we
+		 * write MMIO send doorbell.
+		 */
+		wmb();
+		mthca_write64(doorbell,
+			      dev->kar + MTHCA_SEND_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+	return err;
+}
+
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int ind;
+	int i;
+	void *wqe;
+
+ 	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	/* XXX check that state is OK to post receive */
+
+	ind = qp->rq.head & (qp->rq.max - 1);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+			mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->rq.head, qp->rq.tail,
+					qp->rq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_recv_wqe(qp, ind);
+
+		((struct mthca_next_seg *) wqe)->flags = 0;
+
+		wqe += sizeof (struct mthca_next_seg);
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32(wr->sg_list[i].length);
+			((struct mthca_data_seg *) wqe)->lkey =
+				cpu_to_be32(wr->sg_list[i].lkey);
+			((struct mthca_data_seg *) wqe)->addr =
+				cpu_to_be64(wr->sg_list[i].addr);
+			wqe += sizeof (struct mthca_data_seg);
+		}
+
+		if (i < qp->rq.max_gs) {
+			((struct mthca_data_seg *) wqe)->byte_count = 0;
+			((struct mthca_data_seg *) wqe)->lkey = cpu_to_be32(0x100);
+			((struct mthca_data_seg *) wqe)->addr = 0;
+		}
+
+		qp->wrid[ind] = wr->wr_id;
+
+		++ind;
+		if (unlikely(ind >= qp->rq.max))
+			ind -= qp->rq.max;
+	}
+out:
+	if (likely(nreq)) {
+		qp->rq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+		*qp->rq.db = cpu_to_be32(qp->rq.head & 0xffff);
+	}
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+	return err;
+}
+
+int mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+		       int index, int *dbd, u32 *new_wqe)
+{
+	struct mthca_next_seg *next;
+
+	if (is_send)
+		next = get_send_wqe(qp, index);
+	else
+		next = get_recv_wqe(qp, index);
+
+	if (dev->hca_type == ARBEL_NATIVE)
+		*dbd = 1;
+	else
+		*dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD));
+	if (next->ee_nds & cpu_to_be32(0x3f))
+		*new_wqe = (next->nda_op & cpu_to_be32(~0x3f)) |
+			(next->ee_nds & cpu_to_be32(0x3f));
+	else
+		*new_wqe = 0;
+
+	return 0;
+}
+
+int __devinit mthca_init_qp_table(struct mthca_dev *dev)
+{
+	int err;
+	u8 status;
+	int i;
+
+	spin_lock_init(&dev->qp_table.lock);
+
+	/*
+	 * We reserve 2 extra QPs per port for the special QPs.  The
+	 * special QP for port 1 has to be even, so round up.
+	 */
+	dev->qp_table.sqp_start = (dev->limits.reserved_qps + 1) & ~1UL;
+	err = mthca_alloc_init(&dev->qp_table.alloc,
+			       dev->limits.num_qps,
+			       (1 << 24) - 1,
+			       dev->qp_table.sqp_start +
+			       MTHCA_MAX_PORTS * 2);
+	if (err)
+		return err;
+
+	err = mthca_array_init(&dev->qp_table.qp,
+			       dev->limits.num_qps);
+	if (err) {
+		mthca_alloc_cleanup(&dev->qp_table.alloc);
+		return err;
+	}
+
+	for (i = 0; i < 2; ++i) {
+		err = mthca_CONF_SPECIAL_QP(dev, i ? IB_QPT_GSI : IB_QPT_SMI,
+					    dev->qp_table.sqp_start + i * 2,
+					    &status);
+		if (err)
+			goto err_out;
+		if (status) {
+			mthca_warn(dev, "CONF_SPECIAL_QP returned "
+				   "status %02x, aborting.\n",
+				   status);
+			err = -EINVAL;
+			goto err_out;
+		}
+	}
+	return 0;
+
+ err_out:
+	for (i = 0; i < 2; ++i)
+		mthca_CONF_SPECIAL_QP(dev, i, 0, &status);
+
+	mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps);
+	mthca_alloc_cleanup(&dev->qp_table.alloc);
+
+	return err;
+}
+
+void __devexit mthca_cleanup_qp_table(struct mthca_dev *dev)
+{
+	int i;
+	u8 status;
+
+	for (i = 0; i < 2; ++i)
+		mthca_CONF_SPECIAL_QP(dev, i, 0, &status);
+
+	mthca_alloc_cleanup(&dev->qp_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c
new file mode 100644
index 000000000000..ce3fff7d02b7
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_reset.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_reset.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+int mthca_reset(struct mthca_dev *mdev)
+{
+	int i;
+	int err = 0;
+	u32 *hca_header    = NULL;
+	u32 *bridge_header = NULL;
+	struct pci_dev *bridge = NULL;
+
+#define MTHCA_RESET_OFFSET 0xf0010
+#define MTHCA_RESET_VALUE  swab32(1)
+
+	/*
+	 * Reset the chip.  This is somewhat ugly because we have to
+	 * save off the PCI header before reset and then restore it
+	 * after the chip reboots.  We skip config space offsets 22
+	 * and 23 since those have a special meaning.
+	 *
+	 * To make matters worse, for Tavor (PCI-X HCA) we have to
+	 * find the associated bridge device and save off its PCI
+	 * header as well.
+	 */
+
+	if (mdev->hca_type == TAVOR) {
+		/* Look for the bridge -- its device ID will be 2 more
+		   than HCA's device ID. */
+		while ((bridge = pci_get_device(mdev->pdev->vendor,
+						mdev->pdev->device + 2,
+						bridge)) != NULL) {
+			if (bridge->hdr_type    == PCI_HEADER_TYPE_BRIDGE &&
+			    bridge->subordinate == mdev->pdev->bus) {
+				mthca_dbg(mdev, "Found bridge: %s (%s)\n",
+					  pci_pretty_name(bridge), pci_name(bridge));
+				break;
+			}
+		}
+
+		if (!bridge) {
+			/*
+			 * Didn't find a bridge for a Tavor device --
+			 * assume we're in no-bridge mode and hope for
+			 * the best.
+			 */
+			mthca_warn(mdev, "No bridge found for %s (%s)\n",
+				  pci_pretty_name(mdev->pdev), pci_name(mdev->pdev));
+		}
+
+	}
+
+	/* For Arbel do we need to save off the full 4K PCI Express header?? */
+	hca_header = kmalloc(256, GFP_KERNEL);
+	if (!hca_header) {
+		err = -ENOMEM;
+		mthca_err(mdev, "Couldn't allocate memory to save HCA "
+			  "PCI header, aborting.\n");
+		goto out;
+	}
+
+	for (i = 0; i < 64; ++i) {
+		if (i == 22 || i == 23)
+			continue;
+		if (pci_read_config_dword(mdev->pdev, i * 4, hca_header + i)) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't save HCA "
+				  "PCI header, aborting.\n");
+			goto out;
+		}
+	}
+
+	if (bridge) {
+		bridge_header = kmalloc(256, GFP_KERNEL);
+		if (!bridge_header) {
+			err = -ENOMEM;
+			mthca_err(mdev, "Couldn't allocate memory to save HCA "
+				  "bridge PCI header, aborting.\n");
+			goto out;
+		}
+
+		for (i = 0; i < 64; ++i) {
+			if (i == 22 || i == 23)
+				continue;
+			if (pci_read_config_dword(bridge, i * 4, bridge_header + i)) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't save HCA bridge "
+					  "PCI header, aborting.\n");
+				goto out;
+			}
+		}
+	}
+
+	/* actually hit reset */
+	{
+		void __iomem *reset = ioremap(pci_resource_start(mdev->pdev, 0) +
+					      MTHCA_RESET_OFFSET, 4);
+
+		if (!reset) {
+			err = -ENOMEM;
+			mthca_err(mdev, "Couldn't map HCA reset register, "
+				  "aborting.\n");
+			goto out;
+		}
+
+		writel(MTHCA_RESET_VALUE, reset);
+		iounmap(reset);
+	}
+
+	/* Docs say to wait one second before accessing device */
+	msleep(1000);
+
+	/* Now wait for PCI device to start responding again */
+	{
+		u32 v;
+		int c = 0;
+
+		for (c = 0; c < 100; ++c) {
+			if (pci_read_config_dword(bridge ? bridge : mdev->pdev, 0, &v)) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't access HCA after reset, "
+					  "aborting.\n");
+				goto out;
+			}
+
+			if (v != 0xffffffff)
+				goto good;
+
+			msleep(100);
+		}
+
+		err = -ENODEV;
+		mthca_err(mdev, "PCI device did not come back after reset, "
+			  "aborting.\n");
+		goto out;
+	}
+
+good:
+	/* Now restore the PCI headers */
+	if (bridge) {
+		/*
+		 * Bridge control register is at 0x3e, so we'll
+		 * naturally restore it last in this loop.
+		 */
+		for (i = 0; i < 16; ++i) {
+			if (i * 4 == PCI_COMMAND)
+				continue;
+
+			if (pci_write_config_dword(bridge, i * 4, bridge_header[i])) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't restore HCA bridge reg %x, "
+					  "aborting.\n", i);
+				goto out;
+			}
+		}
+
+		if (pci_write_config_dword(bridge, PCI_COMMAND,
+					   bridge_header[PCI_COMMAND / 4])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, "
+				  "aborting.\n");
+			goto out;
+		}
+	}
+
+	for (i = 0; i < 16; ++i) {
+		if (i * 4 == PCI_COMMAND)
+			continue;
+
+		if (pci_write_config_dword(mdev->pdev, i * 4, hca_header[i])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA reg %x, "
+				  "aborting.\n", i);
+			goto out;
+		}
+	}
+
+	if (pci_write_config_dword(mdev->pdev, PCI_COMMAND,
+				   hca_header[PCI_COMMAND / 4])) {
+		err = -ENODEV;
+		mthca_err(mdev, "Couldn't restore HCA COMMAND, "
+			  "aborting.\n");
+		goto out;
+	}
+
+out:
+	if (bridge)
+		pci_dev_put(bridge);
+	kfree(bridge_header);
+	kfree(hca_header);
+
+	return err;
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_uar.c b/drivers/infiniband/hw/mthca/mthca_uar.c
new file mode 100644
index 000000000000..1c8791ded6ff
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_uar.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#include "mthca_dev.h"
+#include "mthca_memfree.h"
+
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+	uar->index = mthca_alloc(&dev->uar_table.alloc);
+	if (uar->index == -1)
+		return -ENOMEM;
+
+	uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index;
+
+	return 0;
+}
+
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+	mthca_free(&dev->uar_table.alloc, uar->index);
+}
+
+int mthca_init_uar_table(struct mthca_dev *dev)
+{
+	int ret;
+
+	ret = mthca_alloc_init(&dev->uar_table.alloc,
+			       dev->limits.num_uars,
+			       dev->limits.num_uars - 1,
+			       dev->limits.reserved_uars);
+	if (ret)
+		return ret;
+
+	ret = mthca_init_db_tab(dev);
+	if (ret)
+		mthca_alloc_cleanup(&dev->uar_table.alloc);
+
+	return ret;
+}
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev)
+{
+	mthca_cleanup_db_tab(dev);
+
+	/* XXX check if any UARs are still allocated? */
+	mthca_alloc_cleanup(&dev->uar_table.alloc);
+}
diff --git a/drivers/infiniband/include/ib_cache.h b/drivers/infiniband/include/ib_cache.h
new file mode 100644
index 000000000000..44ef6bb9b9df
--- /dev/null
+++ b/drivers/infiniband/include/ib_cache.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_cache.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef _IB_CACHE_H
+#define _IB_CACHE_H
+
+#include <ib_verbs.h>
+
+/**
+ * ib_get_cached_gid - Returns a cached GID table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached GID table to query.
+ * @gid: The GID value found at the specified index.
+ *
+ * ib_get_cached_gid() fetches the specified GID table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_gid(struct ib_device    *device,
+		      u8                   port_num,
+		      int                  index,
+		      union ib_gid        *gid);
+
+/**
+ * ib_find_cached_gid - Returns the port number and GID table index where
+ *   a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the cached GID table where the GID was found.  This
+ *   parameter may be NULL.
+ *
+ * ib_find_cached_gid() searches for the specified GID value in
+ * the local software cache.
+ */
+int ib_find_cached_gid(struct ib_device *device,
+		       union ib_gid	*gid,
+		       u8               *port_num,
+		       u16              *index);
+
+/**
+ * ib_get_cached_pkey - Returns a cached PKey table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached PKey table to query.
+ * @pkey: The PKey value found at the specified index.
+ *
+ * ib_get_cached_pkey() fetches the specified PKey table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_pkey(struct ib_device    *device_handle,
+		       u8                   port_num,
+		       int                  index,
+		       u16                 *pkey);
+
+/**
+ * ib_find_cached_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the cached PKey table where the PKey was found.
+ *
+ * ib_find_cached_pkey() searches the specified PKey table in
+ * the local software cache.
+ */
+int ib_find_cached_pkey(struct ib_device    *device,
+			u8                   port_num,
+			u16                  pkey,
+			u16                 *index);
+
+#endif /* _IB_CACHE_H */
diff --git a/drivers/infiniband/include/ib_fmr_pool.h b/drivers/infiniband/include/ib_fmr_pool.h
new file mode 100644
index 000000000000..e8769657cbbb
--- /dev/null
+++ b/drivers/infiniband/include/ib_fmr_pool.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_fmr_pool.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#if !defined(IB_FMR_POOL_H)
+#define IB_FMR_POOL_H
+
+#include <ib_verbs.h>
+
+struct ib_fmr_pool;
+
+/**
+ * struct ib_fmr_pool_param - Parameters for creating FMR pool
+ * @max_pages_per_fmr:Maximum number of pages per map request.
+ * @access:Access flags for FMRs in pool.
+ * @pool_size:Number of FMRs to allocate for pool.
+ * @dirty_watermark:Flush is triggered when @dirty_watermark dirty
+ *     FMRs are present.
+ * @flush_function:Callback called when unmapped FMRs are flushed and
+ *     more FMRs are possibly available for mapping
+ * @flush_arg:Context passed to user's flush function.
+ * @cache:If set, FMRs may be reused after unmapping for identical map
+ *     requests.
+ */
+struct ib_fmr_pool_param {
+	int                     max_pages_per_fmr;
+	enum ib_access_flags    access;
+	int                     pool_size;
+	int                     dirty_watermark;
+	void                  (*flush_function)(struct ib_fmr_pool *pool,
+						void *              arg);
+	void                   *flush_arg;
+	unsigned                cache:1;
+};
+
+struct ib_pool_fmr {
+	struct ib_fmr      *fmr;
+	struct ib_fmr_pool *pool;
+	struct list_head    list;
+	struct hlist_node   cache_node;
+	int                 ref_count;
+	int                 remap_count;
+	u64                 io_virtual_address;
+	int                 page_list_len;
+	u64                 page_list[0];
+};
+
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
+				       struct ib_fmr_pool_param *params);
+
+int ib_destroy_fmr_pool(struct ib_fmr_pool *pool);
+
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool);
+
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+					 u64                *page_list,
+					 int                 list_len,
+					 u64                *io_virtual_address);
+
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr);
+
+#endif /* IB_FMR_POOL_H */
diff --git a/drivers/infiniband/include/ib_mad.h b/drivers/infiniband/include/ib_mad.h
new file mode 100644
index 000000000000..4a6bf6763a97
--- /dev/null
+++ b/drivers/infiniband/include/ib_mad.h
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_mad.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#if !defined( IB_MAD_H )
+#define IB_MAD_H
+
+#include <ib_verbs.h>
+
+/* Management base version */
+#define IB_MGMT_BASE_VERSION			1
+
+/* Management classes */
+#define IB_MGMT_CLASS_SUBN_LID_ROUTED		0x01
+#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE	0x81
+#define IB_MGMT_CLASS_SUBN_ADM			0x03
+#define IB_MGMT_CLASS_PERF_MGMT			0x04
+#define IB_MGMT_CLASS_BM			0x05
+#define IB_MGMT_CLASS_DEVICE_MGMT		0x06
+#define IB_MGMT_CLASS_CM			0x07
+#define IB_MGMT_CLASS_SNMP			0x08
+#define IB_MGMT_CLASS_VENDOR_RANGE2_START	0x30
+#define IB_MGMT_CLASS_VENDOR_RANGE2_END		0x4F
+
+/* Management methods */
+#define IB_MGMT_METHOD_GET			0x01
+#define IB_MGMT_METHOD_SET			0x02
+#define IB_MGMT_METHOD_GET_RESP			0x81
+#define IB_MGMT_METHOD_SEND			0x03
+#define IB_MGMT_METHOD_TRAP			0x05
+#define IB_MGMT_METHOD_REPORT			0x06
+#define IB_MGMT_METHOD_REPORT_RESP		0x86
+#define IB_MGMT_METHOD_TRAP_REPRESS		0x07
+
+#define IB_MGMT_METHOD_RESP			0x80
+
+#define IB_MGMT_MAX_METHODS			128
+
+#define IB_QP0		0
+#define IB_QP1		__constant_htonl(1)
+#define IB_QP1_QKEY	0x80010000
+
+struct ib_grh {
+	u32		version_tclass_flow;
+	u16		paylen;
+	u8		next_hdr;
+	u8		hop_limit;
+	union ib_gid	sgid;
+	union ib_gid	dgid;
+} __attribute__ ((packed));
+
+struct ib_mad_hdr {
+	u8	base_version;
+	u8	mgmt_class;
+	u8	class_version;
+	u8	method;
+	u16	status;
+	u16	class_specific;
+	u64	tid;
+	u16	attr_id;
+	u16	resv;
+	u32	attr_mod;
+} __attribute__ ((packed));
+
+struct ib_rmpp_hdr {
+	u8	rmpp_version;
+	u8	rmpp_type;
+	u8	rmpp_rtime_flags;
+	u8	rmpp_status;
+	u32	seg_num;
+	u32	paylen_newwin;
+} __attribute__ ((packed));
+
+struct ib_mad {
+	struct ib_mad_hdr	mad_hdr;
+	u8			data[232];
+} __attribute__ ((packed));
+
+struct ib_rmpp_mad {
+	struct ib_mad_hdr	mad_hdr;
+	struct ib_rmpp_hdr	rmpp_hdr;
+	u8			data[220];
+} __attribute__ ((packed));
+
+struct ib_vendor_mad {
+	struct ib_mad_hdr	mad_hdr;
+	struct ib_rmpp_hdr	rmpp_hdr;
+	u8			reserved;
+	u8			oui[3];
+	u8			data[216];
+} __attribute__ ((packed));
+
+struct ib_mad_agent;
+struct ib_mad_send_wc;
+struct ib_mad_recv_wc;
+
+/**
+ * ib_mad_send_handler - callback handler for a sent MAD.
+ * @mad_agent: MAD agent that sent the MAD.
+ * @mad_send_wc: Send work completion information on the sent MAD.
+ */
+typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent,
+				    struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_snoop_handler - Callback handler for snooping sent MADs.
+ * @mad_agent: MAD agent that snooped the MAD.
+ * @send_wr: Work request information on the sent MAD.
+ * @mad_send_wc: Work completion information on the sent MAD.  Valid
+ *   only for snooping that occurs on a send completion.
+ *
+ * Clients snooping MADs should not modify data referenced by the @send_wr
+ * or @mad_send_wc.
+ */
+typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
+				     struct ib_send_wr *send_wr,
+				     struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_recv_handler - callback handler for a received MAD.
+ * @mad_agent: MAD agent requesting the received MAD.
+ * @mad_recv_wc: Received work completion information on the received MAD.
+ *
+ * MADs received in response to a send request operation will be handed to
+ * the user after the send operation completes.  All data buffers given
+ * to registered agents through this routine are owned by the receiving
+ * client, except for snooping agents.  Clients snooping MADs should not
+ * modify the data referenced by @mad_recv_wc.
+ */
+typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
+				    struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_mad_agent - Used to track MAD registration with the access layer.
+ * @device: Reference to device registration is on.
+ * @qp: Reference to QP used for sending and receiving MADs.
+ * @recv_handler: Callback handler for a received MAD.
+ * @send_handler: Callback handler for a sent MAD.
+ * @snoop_handler: Callback handler for snooped sent MADs.
+ * @context: User-specified context associated with this registration.
+ * @hi_tid: Access layer assigned transaction ID for this client.
+ *   Unsolicited MADs sent by this client will have the upper 32-bits
+ *   of their TID set to this value.
+ * @port_num: Port number on which QP is registered
+ */
+struct ib_mad_agent {
+	struct ib_device	*device;
+	struct ib_qp		*qp;
+	ib_mad_recv_handler	recv_handler;
+	ib_mad_send_handler	send_handler;
+	ib_mad_snoop_handler	snoop_handler;
+	void			*context;
+	u32			hi_tid;
+	u8			port_num;
+};
+
+/**
+ * ib_mad_send_wc - MAD send completion information.
+ * @wr_id: Work request identifier associated with the send MAD request.
+ * @status: Completion status.
+ * @vendor_err: Optional vendor error information returned with a failed
+ *   request.
+ */
+struct ib_mad_send_wc {
+	u64			wr_id;
+	enum ib_wc_status	status;
+	u32			vendor_err;
+};
+
+/**
+ * ib_mad_recv_buf - received MAD buffer information.
+ * @list: Reference to next data buffer for a received RMPP MAD.
+ * @grh: References a data buffer containing the global route header.
+ *   The data refereced by this buffer is only valid if the GRH is
+ *   valid.
+ * @mad: References the start of the received MAD.
+ */
+struct ib_mad_recv_buf {
+	struct list_head	list;
+	struct ib_grh		*grh;
+	struct ib_mad		*mad;
+};
+
+/**
+ * ib_mad_recv_wc - received MAD information.
+ * @wc: Completion information for the received data.
+ * @recv_buf: Specifies the location of the received data buffer(s).
+ * @mad_len: The length of the received MAD, without duplicated headers.
+ *
+ * For received response, the wr_id field of the wc is set to the wr_id
+ *   for the corresponding send request.
+ */
+struct ib_mad_recv_wc {
+	struct ib_wc		*wc;
+	struct ib_mad_recv_buf	recv_buf;
+	int			mad_len;
+};
+
+/**
+ * ib_mad_reg_req - MAD registration request
+ * @mgmt_class: Indicates which management class of MADs should be receive
+ *   by the caller.  This field is only required if the user wishes to
+ *   receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version: Indicates which version of MADs for the given
+ *   management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ *   in the range from 0x30 to 0x4f. Otherwise not used.
+ * @method_mask: The caller will receive unsolicited MADs for any method
+ *   where @method_mask = 1.
+ */
+struct ib_mad_reg_req {
+	u8	mgmt_class;
+	u8	mgmt_class_version;
+	u8	oui[3];
+	DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS);
+};
+
+/**
+ * ib_register_mad_agent - Register to send/receive MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP to access.  Must be either
+ *   IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_reg_req: Specifies which unsolicited MADs should be received
+ *   by the caller.  This parameter may be NULL if the caller only
+ *   wishes to receive solicited responses.
+ * @rmpp_version: If set, indicates that the client will send
+ *   and receive MADs that contain the RMPP header for the given version.
+ *   If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ *   request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ *   MAD.
+ * @context: User specified context associated with the registration.
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   struct ib_mad_reg_req *mad_reg_req,
+					   u8 rmpp_version,
+					   ib_mad_send_handler send_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context);
+
+enum ib_mad_snoop_flags {
+	/*IB_MAD_SNOOP_POSTED_SENDS	   = 1,*/
+	/*IB_MAD_SNOOP_RMPP_SENDS	   = (1<<1),*/
+	IB_MAD_SNOOP_SEND_COMPLETIONS	   = (1<<2),
+	/*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/
+	IB_MAD_SNOOP_RECVS		   = (1<<4)
+	/*IB_MAD_SNOOP_RMPP_RECVS	   = (1<<5),*/
+	/*IB_MAD_SNOOP_REDIRECTED_QPS	   = (1<<6)*/
+};
+
+/**
+ * ib_register_mad_snoop - Register to snoop sent and received MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP traffic to snoop.  Must be either
+ *   IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_snoop_flags: Specifies information where snooping occurs.
+ * @send_handler: The callback routine invoked for a snooped send.
+ * @recv_handler: The callback routine invoked for a snooped receive.
+ * @context: User specified context associated with the registration.
+ */
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   int mad_snoop_flags,
+					   ib_mad_snoop_handler snoop_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context);
+
+/**
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services.
+ * @mad_agent: Corresponding MAD registration request to deregister.
+ *
+ * After invoking this routine, MAD services are no longer usable by the
+ * client on the associated QP.
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent);
+
+/**
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ *   with the registered client.
+ * @mad_agent: Specifies the associated registration to post the send to.
+ * @send_wr: Specifies the information needed to send the MAD(s).
+ * @bad_send_wr: Specifies the MAD on which an error was encountered.
+ *
+ * Sent MADs are not guaranteed to complete in the order that they were posted.
+ */
+int ib_post_send_mad(struct ib_mad_agent *mad_agent,
+		     struct ib_send_wr *send_wr,
+		     struct ib_send_wr **bad_send_wr);
+
+/**
+ * ib_coalesce_recv_mad - Coalesces received MAD data into a single buffer.
+ * @mad_recv_wc: Work completion information for a received MAD.
+ * @buf: User-provided data buffer to receive the coalesced buffers.  The
+ *   referenced buffer should be at least the size of the mad_len specified
+ *   by @mad_recv_wc.
+ *
+ * This call copies a chain of received RMPP MADs into a single data buffer,
+ * removing duplicated headers.
+ */
+void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc,
+			  void *buf);
+
+/**
+ * ib_free_recv_mad - Returns data buffers used to receive a MAD to the
+ *   access layer.
+ * @mad_recv_wc: Work completion information for a received MAD.
+ *
+ * Clients receiving MADs through their ib_mad_recv_handler must call this
+ * routine to return the work completion buffers to the access layer.
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_cancel_mad - Cancels an outstanding send MAD operation.
+ * @mad_agent: Specifies the registration associated with sent MAD.
+ * @wr_id: Indicates the work request identifier of the MAD to cancel.
+ *
+ * MADs will be returned to the user through the corresponding
+ * ib_mad_send_handler.
+ */
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+		   u64 wr_id);
+
+/**
+ * ib_redirect_mad_qp - Registers a QP for MAD services.
+ * @qp: Reference to a QP that requires MAD services.
+ * @rmpp_version: If set, indicates that the client will send
+ *   and receive MADs that contain the RMPP header for the given version.
+ *   If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ *   request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ *   MAD.
+ * @context: User specified context associated with the registration.
+ *
+ * Use of this call allows clients to use MAD services, such as RMPP,
+ * on user-owned QPs.  After calling this routine, users may send
+ * MADs on the specified QP by calling ib_mad_post_send.
+ */
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+					u8 rmpp_version,
+					ib_mad_send_handler send_handler,
+					ib_mad_recv_handler recv_handler,
+					void *context);
+
+/**
+ * ib_process_mad_wc - Processes a work completion associated with a
+ *   MAD sent or received on a redirected QP.
+ * @mad_agent: Specifies the registered MAD service using the redirected QP.
+ * @wc: References a work completion associated with a sent or received
+ *   MAD segment.
+ *
+ * This routine is used to complete or continue processing on a MAD request.
+ * If the work completion is associated with a send operation, calling
+ * this routine is required to continue an RMPP transfer or to wait for a
+ * corresponding response, if it is a request.  If the work completion is
+ * associated with a receive operation, calling this routine is required to
+ * process an inbound or outbound RMPP transfer, or to match a response MAD
+ * with its corresponding request.
+ */
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+		      struct ib_wc *wc);
+
+#endif /* IB_MAD_H */
diff --git a/drivers/infiniband/include/ib_pack.h b/drivers/infiniband/include/ib_pack.h
new file mode 100644
index 000000000000..fe480f3e8654
--- /dev/null
+++ b/drivers/infiniband/include/ib_pack.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_pack.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef IB_PACK_H
+#define IB_PACK_H
+
+#include <ib_verbs.h>
+
+enum {
+	IB_LRH_BYTES  = 8,
+	IB_GRH_BYTES  = 40,
+	IB_BTH_BYTES  = 12,
+	IB_DETH_BYTES = 8
+};
+
+struct ib_field {
+	size_t struct_offset_bytes;
+	size_t struct_size_bytes;
+	int    offset_words;
+	int    offset_bits;
+	int    size_bits;
+	char  *field_name;
+};
+
+#define RESERVED \
+	.field_name          = "reserved"
+
+/*
+ * This macro cleans up the definitions of constants for BTH opcodes.
+ * It is used to define constants such as IB_OPCODE_UD_SEND_ONLY,
+ * which becomes IB_OPCODE_UD + IB_OPCODE_SEND_ONLY, and this gives
+ * the correct value.
+ *
+ * In short, user code should use the constants defined using the
+ * macro rather than worrying about adding together other constants.
+*/
+#define IB_OPCODE(transport, op) \
+	IB_OPCODE_ ## transport ## _ ## op = \
+		IB_OPCODE_ ## transport + IB_OPCODE_ ## op
+
+enum {
+	/* transport types -- just used to define real constants */
+	IB_OPCODE_RC                                = 0x00,
+	IB_OPCODE_UC                                = 0x20,
+	IB_OPCODE_RD                                = 0x40,
+	IB_OPCODE_UD                                = 0x60,
+
+	/* operations -- just used to define real constants */
+	IB_OPCODE_SEND_FIRST                        = 0x00,
+	IB_OPCODE_SEND_MIDDLE                       = 0x01,
+	IB_OPCODE_SEND_LAST                         = 0x02,
+	IB_OPCODE_SEND_LAST_WITH_IMMEDIATE          = 0x03,
+	IB_OPCODE_SEND_ONLY                         = 0x04,
+	IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE          = 0x05,
+	IB_OPCODE_RDMA_WRITE_FIRST                  = 0x06,
+	IB_OPCODE_RDMA_WRITE_MIDDLE                 = 0x07,
+	IB_OPCODE_RDMA_WRITE_LAST                   = 0x08,
+	IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE    = 0x09,
+	IB_OPCODE_RDMA_WRITE_ONLY                   = 0x0a,
+	IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE    = 0x0b,
+	IB_OPCODE_RDMA_READ_REQUEST                 = 0x0c,
+	IB_OPCODE_RDMA_READ_RESPONSE_FIRST          = 0x0d,
+	IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE         = 0x0e,
+	IB_OPCODE_RDMA_READ_RESPONSE_LAST           = 0x0f,
+	IB_OPCODE_RDMA_READ_RESPONSE_ONLY           = 0x10,
+	IB_OPCODE_ACKNOWLEDGE                       = 0x11,
+	IB_OPCODE_ATOMIC_ACKNOWLEDGE                = 0x12,
+	IB_OPCODE_COMPARE_SWAP                      = 0x13,
+	IB_OPCODE_FETCH_ADD                         = 0x14,
+
+	/* real constants follow -- see comment about above IB_OPCODE()
+	   macro for more details */
+
+	/* RC */
+	IB_OPCODE(RC, SEND_FIRST),
+	IB_OPCODE(RC, SEND_MIDDLE),
+	IB_OPCODE(RC, SEND_LAST),
+	IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RC, SEND_ONLY),
+	IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RC, RDMA_WRITE_FIRST),
+	IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
+	IB_OPCODE(RC, RDMA_WRITE_LAST),
+	IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RC, RDMA_WRITE_ONLY),
+	IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RC, RDMA_READ_REQUEST),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
+	IB_OPCODE(RC, ACKNOWLEDGE),
+	IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
+	IB_OPCODE(RC, COMPARE_SWAP),
+	IB_OPCODE(RC, FETCH_ADD),
+
+	/* UC */
+	IB_OPCODE(UC, SEND_FIRST),
+	IB_OPCODE(UC, SEND_MIDDLE),
+	IB_OPCODE(UC, SEND_LAST),
+	IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(UC, SEND_ONLY),
+	IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(UC, RDMA_WRITE_FIRST),
+	IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
+	IB_OPCODE(UC, RDMA_WRITE_LAST),
+	IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(UC, RDMA_WRITE_ONLY),
+	IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+
+	/* RD */
+	IB_OPCODE(RD, SEND_FIRST),
+	IB_OPCODE(RD, SEND_MIDDLE),
+	IB_OPCODE(RD, SEND_LAST),
+	IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RD, SEND_ONLY),
+	IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RD, RDMA_WRITE_FIRST),
+	IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
+	IB_OPCODE(RD, RDMA_WRITE_LAST),
+	IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RD, RDMA_WRITE_ONLY),
+	IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RD, RDMA_READ_REQUEST),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
+	IB_OPCODE(RD, ACKNOWLEDGE),
+	IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
+	IB_OPCODE(RD, COMPARE_SWAP),
+	IB_OPCODE(RD, FETCH_ADD),
+
+	/* UD */
+	IB_OPCODE(UD, SEND_ONLY),
+	IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
+};
+
+enum {
+	IB_LNH_RAW        = 0,
+	IB_LNH_IP         = 1,
+	IB_LNH_IBA_LOCAL  = 2,
+	IB_LNH_IBA_GLOBAL = 3
+};
+
+struct ib_unpacked_lrh {
+	u8        virtual_lane;
+	u8        link_version;
+	u8        service_level;
+	u8        link_next_header;
+	__be16    destination_lid;
+	__be16    packet_length;
+	__be16    source_lid;
+};
+
+struct ib_unpacked_grh {
+	u8    	     ip_version;
+	u8    	     traffic_class;
+	__be32 	     flow_label;
+	__be16       payload_length;
+	u8    	     next_header;
+	u8    	     hop_limit;
+	union ib_gid source_gid;
+	union ib_gid destination_gid;
+};
+
+struct ib_unpacked_bth {
+	u8           opcode;
+	u8           solicited_event;
+	u8           mig_req;
+	u8           pad_count;
+	u8           transport_header_version;
+	__be16       pkey;
+	__be32       destination_qpn;
+	u8           ack_req;
+	__be32       psn;
+};
+
+struct ib_unpacked_deth {
+	__be32       qkey;
+	__be32       source_qpn;
+};
+
+struct ib_ud_header {
+	struct ib_unpacked_lrh  lrh;
+	int                     grh_present;
+	struct ib_unpacked_grh  grh;
+	struct ib_unpacked_bth  bth;
+	struct ib_unpacked_deth deth;
+	int            		immediate_present;
+	__be32         		immediate_data;
+};
+
+void ib_pack(const struct ib_field        *desc,
+	     int                           desc_len,
+	     void                         *structure,
+	     void                         *buf);
+
+void ib_unpack(const struct ib_field        *desc,
+	       int                           desc_len,
+	       void                         *buf,
+	       void                         *structure);
+
+void ib_ud_header_init(int     		   payload_bytes,
+		       int    		   grh_present,
+		       struct ib_ud_header *header);
+
+int ib_ud_header_pack(struct ib_ud_header *header,
+		      void                *buf);
+
+int ib_ud_header_unpack(void                *buf,
+			struct ib_ud_header *header);
+
+#endif /* IB_PACK_H */
diff --git a/drivers/infiniband/include/ib_sa.h b/drivers/infiniband/include/ib_sa.h
new file mode 100644
index 000000000000..f4f747707b30
--- /dev/null
+++ b/drivers/infiniband/include/ib_sa.h
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_sa.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef IB_SA_H
+#define IB_SA_H
+
+#include <linux/compiler.h>
+
+#include <ib_verbs.h>
+#include <ib_mad.h>
+
+enum {
+	IB_SA_CLASS_VERSION	= 2,	/* IB spec version 1.1/1.2 */
+
+	IB_SA_METHOD_DELETE	= 0x15
+};
+
+enum ib_sa_selector {
+	IB_SA_GTE  = 0,
+	IB_SA_LTE  = 1,
+	IB_SA_EQ   = 2,
+	/*
+	 * The meaning of "best" depends on the attribute: for
+	 * example, for MTU best will return the largest available
+	 * MTU, while for packet life time, best will return the
+	 * smallest available life time.
+	 */
+	IB_SA_BEST = 3
+};
+
+enum ib_sa_rate {
+	IB_SA_RATE_2_5_GBPS = 2,
+	IB_SA_RATE_5_GBPS   = 5,
+	IB_SA_RATE_10_GBPS  = 3,
+	IB_SA_RATE_20_GBPS  = 6,
+	IB_SA_RATE_30_GBPS  = 4,
+	IB_SA_RATE_40_GBPS  = 7,
+	IB_SA_RATE_60_GBPS  = 8,
+	IB_SA_RATE_80_GBPS  = 9,
+	IB_SA_RATE_120_GBPS = 10
+};
+
+static inline int ib_sa_rate_enum_to_int(enum ib_sa_rate rate)
+{
+	switch (rate) {
+	case IB_SA_RATE_2_5_GBPS: return  1;
+	case IB_SA_RATE_5_GBPS:   return  2;
+	case IB_SA_RATE_10_GBPS:  return  4;
+	case IB_SA_RATE_20_GBPS:  return  8;
+	case IB_SA_RATE_30_GBPS:  return 12;
+	case IB_SA_RATE_40_GBPS:  return 16;
+	case IB_SA_RATE_60_GBPS:  return 24;
+	case IB_SA_RATE_80_GBPS:  return 32;
+	case IB_SA_RATE_120_GBPS: return 48;
+	default: 	          return -1;
+	}
+}
+
+typedef u64 __bitwise ib_sa_comp_mask;
+
+#define IB_SA_COMP_MASK(n)	((__force ib_sa_comp_mask) cpu_to_be64(1ull << n))
+
+/*
+ * Structures for SA records are named "struct ib_sa_xxx_rec."  No
+ * attempt is made to pack structures to match the physical layout of
+ * SA records in SA MADs; all packing and unpacking is handled by the
+ * SA query code.
+ *
+ * For a record with structure ib_sa_xxx_rec, the naming convention
+ * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we
+ * never use different abbreviations or otherwise change the spelling
+ * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY).
+ *
+ * Reserved rows are indicated with comments to help maintainability.
+ */
+
+/* reserved:								 0 */
+/* reserved:								 1 */
+#define IB_SA_PATH_REC_DGID				IB_SA_COMP_MASK( 2)
+#define IB_SA_PATH_REC_SGID				IB_SA_COMP_MASK( 3)
+#define IB_SA_PATH_REC_DLID				IB_SA_COMP_MASK( 4)
+#define IB_SA_PATH_REC_SLID				IB_SA_COMP_MASK( 5)
+#define IB_SA_PATH_REC_RAW_TRAFFIC			IB_SA_COMP_MASK( 6)
+/* reserved:								 7 */
+#define IB_SA_PATH_REC_FLOW_LABEL       		IB_SA_COMP_MASK( 8)
+#define IB_SA_PATH_REC_HOP_LIMIT			IB_SA_COMP_MASK( 9)
+#define IB_SA_PATH_REC_TRAFFIC_CLASS			IB_SA_COMP_MASK(10)
+#define IB_SA_PATH_REC_REVERSIBLE			IB_SA_COMP_MASK(11)
+#define IB_SA_PATH_REC_NUMB_PATH			IB_SA_COMP_MASK(12)
+#define IB_SA_PATH_REC_PKEY				IB_SA_COMP_MASK(13)
+/* reserved:								14 */
+#define IB_SA_PATH_REC_SL				IB_SA_COMP_MASK(15)
+#define IB_SA_PATH_REC_MTU_SELECTOR			IB_SA_COMP_MASK(16)
+#define IB_SA_PATH_REC_MTU				IB_SA_COMP_MASK(17)
+#define IB_SA_PATH_REC_RATE_SELECTOR			IB_SA_COMP_MASK(18)
+#define IB_SA_PATH_REC_RATE				IB_SA_COMP_MASK(19)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR	IB_SA_COMP_MASK(20)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME			IB_SA_COMP_MASK(21)
+#define IB_SA_PATH_REC_PREFERENCE			IB_SA_COMP_MASK(22)
+
+struct ib_sa_path_rec {
+	/* reserved */
+	/* reserved */
+	union ib_gid dgid;
+	union ib_gid sgid;
+	u16          dlid;
+	u16          slid;
+	int          raw_traffic;
+	/* reserved */
+	u32          flow_label;
+	u8           hop_limit;
+	u8           traffic_class;
+	int          reversible;
+	u8           numb_path;
+	u16          pkey;
+	/* reserved */
+	u8           sl;
+	u8           mtu_selector;
+	enum ib_mtu  mtu;
+	u8           rate_selector;
+	u8           rate;
+	u8           packet_life_time_selector;
+	u8           packet_life_time;
+	u8           preference;
+};
+
+#define IB_SA_MCMEMBER_REC_MGID				IB_SA_COMP_MASK( 0)
+#define IB_SA_MCMEMBER_REC_PORT_GID			IB_SA_COMP_MASK( 1)
+#define IB_SA_MCMEMBER_REC_QKEY				IB_SA_COMP_MASK( 2)
+#define IB_SA_MCMEMBER_REC_MLID				IB_SA_COMP_MASK( 3)
+#define IB_SA_MCMEMBER_REC_MTU_SELECTOR			IB_SA_COMP_MASK( 4)
+#define IB_SA_MCMEMBER_REC_MTU				IB_SA_COMP_MASK( 5)
+#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS		IB_SA_COMP_MASK( 6)
+#define IB_SA_MCMEMBER_REC_PKEY				IB_SA_COMP_MASK( 7)
+#define IB_SA_MCMEMBER_REC_RATE_SELECTOR		IB_SA_COMP_MASK( 8)
+#define IB_SA_MCMEMBER_REC_RATE				IB_SA_COMP_MASK( 9)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR	IB_SA_COMP_MASK(10)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME		IB_SA_COMP_MASK(11)
+#define IB_SA_MCMEMBER_REC_SL				IB_SA_COMP_MASK(12)
+#define IB_SA_MCMEMBER_REC_FLOW_LABEL			IB_SA_COMP_MASK(13)
+#define IB_SA_MCMEMBER_REC_HOP_LIMIT			IB_SA_COMP_MASK(14)
+#define IB_SA_MCMEMBER_REC_SCOPE			IB_SA_COMP_MASK(15)
+#define IB_SA_MCMEMBER_REC_JOIN_STATE			IB_SA_COMP_MASK(16)
+#define IB_SA_MCMEMBER_REC_PROXY_JOIN			IB_SA_COMP_MASK(17)
+
+struct ib_sa_mcmember_rec {
+	union ib_gid mgid;
+	union ib_gid port_gid;
+	u32          qkey;
+	u16          mlid;
+	u8           mtu_selector;
+	enum         ib_mtu mtu;
+	u8           traffic_class;
+	u16          pkey;
+	u8 	     rate_selector;
+	u8 	     rate;
+	u8 	     packet_life_time_selector;
+	u8 	     packet_life_time;
+	u8           sl;
+	u32          flow_label;
+	u8           hop_limit;
+	u8           scope;
+	u8           join_state;
+	int          proxy_join;
+};
+
+struct ib_sa_query;
+
+void ib_sa_cancel_query(int id, struct ib_sa_query *query);
+
+int ib_sa_path_rec_get(struct ib_device *device, u8 port_num,
+		       struct ib_sa_path_rec *rec,
+		       ib_sa_comp_mask comp_mask,
+		       int timeout_ms, int gfp_mask,
+		       void (*callback)(int status,
+					struct ib_sa_path_rec *resp,
+					void *context),
+		       void *context,
+		       struct ib_sa_query **query);
+
+int ib_sa_mcmember_rec_query(struct ib_device *device, u8 port_num,
+			     u8 method,
+			     struct ib_sa_mcmember_rec *rec,
+			     ib_sa_comp_mask comp_mask,
+			     int timeout_ms, int gfp_mask,
+			     void (*callback)(int status,
+					      struct ib_sa_mcmember_rec *resp,
+					      void *context),
+			     void *context,
+			     struct ib_sa_query **query);
+
+/**
+ * ib_sa_mcmember_rec_set - Start an MCMember set query
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:MCMember Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send an MCMember Set query to the SA (eg to join a multicast
+ * group).  The callback function will be called when the query
+ * completes (or fails); status is 0 for a successful response, -EINTR
+ * if the query is canceled, -ETIMEDOUT is the query timed out, or
+ * -EIO if an error occurred sending the query.  The resp parameter of
+ * the callback is only valid if status is 0.
+ *
+ * If the return value of ib_sa_mcmember_rec_set() is negative, it is
+ * an error code.  Otherwise it is a query ID that can be used to
+ * cancel the query.
+ */
+static inline int
+ib_sa_mcmember_rec_set(struct ib_device *device, u8 port_num,
+		       struct ib_sa_mcmember_rec *rec,
+		       ib_sa_comp_mask comp_mask,
+		       int timeout_ms, int gfp_mask,
+		       void (*callback)(int status,
+					struct ib_sa_mcmember_rec *resp,
+					void *context),
+		       void *context,
+		       struct ib_sa_query **query)
+{
+	return ib_sa_mcmember_rec_query(device, port_num,
+					IB_MGMT_METHOD_SET,
+					rec, comp_mask,
+					timeout_ms, gfp_mask, callback,
+					context, query);
+}
+
+/**
+ * ib_sa_mcmember_rec_delete - Start an MCMember delete query
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:MCMember Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send an MCMember Delete query to the SA (eg to leave a multicast
+ * group).  The callback function will be called when the query
+ * completes (or fails); status is 0 for a successful response, -EINTR
+ * if the query is canceled, -ETIMEDOUT is the query timed out, or
+ * -EIO if an error occurred sending the query.  The resp parameter of
+ * the callback is only valid if status is 0.
+ *
+ * If the return value of ib_sa_mcmember_rec_delete() is negative, it
+ * is an error code.  Otherwise it is a query ID that can be used to
+ * cancel the query.
+ */
+static inline int
+ib_sa_mcmember_rec_delete(struct ib_device *device, u8 port_num,
+			  struct ib_sa_mcmember_rec *rec,
+			  ib_sa_comp_mask comp_mask,
+			  int timeout_ms, int gfp_mask,
+			  void (*callback)(int status,
+					   struct ib_sa_mcmember_rec *resp,
+					   void *context),
+			  void *context,
+			  struct ib_sa_query **query)
+{
+	return ib_sa_mcmember_rec_query(device, port_num,
+					IB_SA_METHOD_DELETE,
+					rec, comp_mask,
+					timeout_ms, gfp_mask, callback,
+					context, query);
+}
+
+
+#endif /* IB_SA_H */
diff --git a/drivers/infiniband/include/ib_smi.h b/drivers/infiniband/include/ib_smi.h
new file mode 100644
index 000000000000..ca8216514963
--- /dev/null
+++ b/drivers/infiniband/include/ib_smi.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_smi.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#if !defined( IB_SMI_H )
+#define IB_SMI_H
+
+#include <ib_mad.h>
+
+#define IB_LID_PERMISSIVE			0xFFFF
+
+#define IB_SMP_DATA_SIZE			64
+#define IB_SMP_MAX_PATH_HOPS			64
+
+struct ib_smp {
+	u8	base_version;
+	u8	mgmt_class;
+	u8	class_version;
+	u8	method;
+	u16	status;
+	u8	hop_ptr;
+	u8	hop_cnt;
+	u64	tid;
+	u16	attr_id;
+	u16	resv;
+	u32	attr_mod;
+	u64	mkey;
+	u16	dr_slid;
+	u16	dr_dlid;
+	u8	reserved[28];
+	u8	data[IB_SMP_DATA_SIZE];
+	u8	initial_path[IB_SMP_MAX_PATH_HOPS];
+	u8	return_path[IB_SMP_MAX_PATH_HOPS];
+} __attribute__ ((packed));
+
+#define IB_SMP_DIRECTION			__constant_htons(0x8000)
+
+/* Subnet management attributes */
+#define IB_SMP_ATTR_NOTICE			__constant_htons(0x0002)
+#define IB_SMP_ATTR_NODE_DESC			__constant_htons(0x0010)
+#define IB_SMP_ATTR_NODE_INFO			__constant_htons(0x0011)
+#define IB_SMP_ATTR_SWITCH_INFO			__constant_htons(0x0012)
+#define IB_SMP_ATTR_GUID_INFO			__constant_htons(0x0014)
+#define IB_SMP_ATTR_PORT_INFO			__constant_htons(0x0015)
+#define IB_SMP_ATTR_PKEY_TABLE			__constant_htons(0x0016)
+#define IB_SMP_ATTR_SL_TO_VL_TABLE		__constant_htons(0x0017)
+#define IB_SMP_ATTR_VL_ARB_TABLE		__constant_htons(0x0018)
+#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE	__constant_htons(0x0019)
+#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE	__constant_htons(0x001A)
+#define IB_SMP_ATTR_MCAST_FORWARD_TABLE		__constant_htons(0x001B)
+#define IB_SMP_ATTR_SM_INFO			__constant_htons(0x0020)
+#define IB_SMP_ATTR_VENDOR_DIAG			__constant_htons(0x0030)
+#define IB_SMP_ATTR_LED_INFO			__constant_htons(0x0031)
+#define IB_SMP_ATTR_VENDOR_MASK			__constant_htons(0xFF00)
+
+static inline u8
+ib_get_smp_direction(struct ib_smp *smp)
+{
+	return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION);
+}
+
+#endif /* IB_SMI_H */
diff --git a/drivers/infiniband/include/ib_user_mad.h b/drivers/infiniband/include/ib_user_mad.h
new file mode 100644
index 000000000000..06ad4a6075fa
--- /dev/null
+++ b/drivers/infiniband/include/ib_user_mad.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_user_mad.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef IB_USER_MAD_H
+#define IB_USER_MAD_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_MAD_ABI_VERSION	2
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ */
+
+/**
+ * ib_user_mad - MAD packet
+ * @data - Contents of MAD
+ * @id - ID of agent MAD received with/to be sent with
+ * @status - 0 on successful receive, ETIMEDOUT if no response
+ *   received (transaction ID in data[] will be set to TID of original
+ *   request) (ignored on send)
+ * @timeout_ms - Milliseconds to wait for response (unset on receive)
+ * @qpn - Remote QP number received from/to be sent to
+ * @qkey - Remote Q_Key to be sent with (unset on receive)
+ * @lid - Remote lid received from/to be sent to
+ * @sl - Service level received with/to be sent with
+ * @path_bits - Local path bits received with/to be sent with
+ * @grh_present - If set, GRH was received/should be sent
+ * @gid_index - Local GID index to send with (unset on receive)
+ * @hop_limit - Hop limit in GRH
+ * @traffic_class - Traffic class in GRH
+ * @gid - Remote GID in GRH
+ * @flow_label - Flow label in GRH
+ *
+ * All multi-byte quantities are stored in network (big endian) byte order.
+ */
+struct ib_user_mad {
+	__u8	data[256];
+	__u32	id;
+	__u32	status;
+	__u32	timeout_ms;
+	__u32	qpn;
+	__u32   qkey;
+	__u16	lid;
+	__u8	sl;
+	__u8	path_bits;
+	__u8	grh_present;
+	__u8	gid_index;
+	__u8	hop_limit;
+	__u8	traffic_class;
+	__u8	gid[16];
+	__u32	flow_label;
+};
+
+/**
+ * ib_user_mad_reg_req - MAD registration request
+ * @id - Set by the kernel; used to identify agent in future requests.
+ * @qpn - Queue pair number; must be 0 or 1.
+ * @method_mask - The caller will receive unsolicited MADs for any method
+ *   where @method_mask = 1.
+ * @mgmt_class - Indicates which management class of MADs should be receive
+ *   by the caller.  This field is only required if the user wishes to
+ *   receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version - Indicates which version of MADs for the given
+ *   management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ *   in the range from 0x30 to 0x4f. Otherwise not used.
+ */
+struct ib_user_mad_reg_req {
+	__u32	id;
+	__u32	method_mask[4];
+	__u8	qpn;
+	__u8	mgmt_class;
+	__u8	mgmt_class_version;
+	__u8    oui[3];
+};
+
+#define IB_IOCTL_MAGIC		0x1b
+
+#define IB_USER_MAD_REGISTER_AGENT	_IOWR(IB_IOCTL_MAGIC, 1, \
+					      struct ib_user_mad_reg_req)
+
+#define IB_USER_MAD_UNREGISTER_AGENT	_IOW(IB_IOCTL_MAGIC, 2, __u32)
+
+#endif /* IB_USER_MAD_H */
diff --git a/drivers/infiniband/include/ib_verbs.h b/drivers/infiniband/include/ib_verbs.h
new file mode 100644
index 000000000000..cf01f044a223
--- /dev/null
+++ b/drivers/infiniband/include/ib_verbs.h
@@ -0,0 +1,1252 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_verbs.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#if !defined(IB_VERBS_H)
+#define IB_VERBS_H
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <asm/atomic.h>
+
+union ib_gid {
+	u8	raw[16];
+	struct {
+		u64	subnet_prefix;
+		u64	interface_id;
+	} global;
+};
+
+enum ib_node_type {
+	IB_NODE_CA 	= 1,
+	IB_NODE_SWITCH,
+	IB_NODE_ROUTER
+};
+
+enum ib_device_cap_flags {
+	IB_DEVICE_RESIZE_MAX_WR		= 1,
+	IB_DEVICE_BAD_PKEY_CNTR		= (1<<1),
+	IB_DEVICE_BAD_QKEY_CNTR		= (1<<2),
+	IB_DEVICE_RAW_MULTI		= (1<<3),
+	IB_DEVICE_AUTO_PATH_MIG		= (1<<4),
+	IB_DEVICE_CHANGE_PHY_PORT	= (1<<5),
+	IB_DEVICE_UD_AV_PORT_ENFORCE	= (1<<6),
+	IB_DEVICE_CURR_QP_STATE_MOD	= (1<<7),
+	IB_DEVICE_SHUTDOWN_PORT		= (1<<8),
+	IB_DEVICE_INIT_TYPE		= (1<<9),
+	IB_DEVICE_PORT_ACTIVE_EVENT	= (1<<10),
+	IB_DEVICE_SYS_IMAGE_GUID	= (1<<11),
+	IB_DEVICE_RC_RNR_NAK_GEN	= (1<<12),
+	IB_DEVICE_SRQ_RESIZE		= (1<<13),
+	IB_DEVICE_N_NOTIFY_CQ		= (1<<14),
+};
+
+enum ib_atomic_cap {
+	IB_ATOMIC_NONE,
+	IB_ATOMIC_HCA,
+	IB_ATOMIC_GLOB
+};
+
+struct ib_device_attr {
+	u64			fw_ver;
+	u64			node_guid;
+	u64			sys_image_guid;
+	u64			max_mr_size;
+	u64			page_size_cap;
+	u32			vendor_id;
+	u32			vendor_part_id;
+	u32			hw_ver;
+	int			max_qp;
+	int			max_qp_wr;
+	int			device_cap_flags;
+	int			max_sge;
+	int			max_sge_rd;
+	int			max_cq;
+	int			max_cqe;
+	int			max_mr;
+	int			max_pd;
+	int			max_qp_rd_atom;
+	int			max_ee_rd_atom;
+	int			max_res_rd_atom;
+	int			max_qp_init_rd_atom;
+	int			max_ee_init_rd_atom;
+	enum ib_atomic_cap	atomic_cap;
+	int			max_ee;
+	int			max_rdd;
+	int			max_mw;
+	int			max_raw_ipv6_qp;
+	int			max_raw_ethy_qp;
+	int			max_mcast_grp;
+	int			max_mcast_qp_attach;
+	int			max_total_mcast_qp_attach;
+	int			max_ah;
+	int			max_fmr;
+	int			max_map_per_fmr;
+	int			max_srq;
+	int			max_srq_wr;
+	int			max_srq_sge;
+	u16			max_pkeys;
+	u8			local_ca_ack_delay;
+};
+
+enum ib_mtu {
+	IB_MTU_256  = 1,
+	IB_MTU_512  = 2,
+	IB_MTU_1024 = 3,
+	IB_MTU_2048 = 4,
+	IB_MTU_4096 = 5
+};
+
+static inline int ib_mtu_enum_to_int(enum ib_mtu mtu)
+{
+	switch (mtu) {
+	case IB_MTU_256:  return  256;
+	case IB_MTU_512:  return  512;
+	case IB_MTU_1024: return 1024;
+	case IB_MTU_2048: return 2048;
+	case IB_MTU_4096: return 4096;
+	default: 	  return -1;
+	}
+}
+
+enum ib_port_state {
+	IB_PORT_NOP		= 0,
+	IB_PORT_DOWN		= 1,
+	IB_PORT_INIT		= 2,
+	IB_PORT_ARMED		= 3,
+	IB_PORT_ACTIVE		= 4,
+	IB_PORT_ACTIVE_DEFER	= 5
+};
+
+enum ib_port_cap_flags {
+	IB_PORT_SM				= 1 <<  1,
+	IB_PORT_NOTICE_SUP			= 1 <<  2,
+	IB_PORT_TRAP_SUP			= 1 <<  3,
+	IB_PORT_OPT_IPD_SUP                     = 1 <<  4,
+	IB_PORT_AUTO_MIGR_SUP			= 1 <<  5,
+	IB_PORT_SL_MAP_SUP			= 1 <<  6,
+	IB_PORT_MKEY_NVRAM			= 1 <<  7,
+	IB_PORT_PKEY_NVRAM			= 1 <<  8,
+	IB_PORT_LED_INFO_SUP			= 1 <<  9,
+	IB_PORT_SM_DISABLED			= 1 << 10,
+	IB_PORT_SYS_IMAGE_GUID_SUP		= 1 << 11,
+	IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP	= 1 << 12,
+	IB_PORT_CM_SUP				= 1 << 16,
+	IB_PORT_SNMP_TUNNEL_SUP			= 1 << 17,
+	IB_PORT_REINIT_SUP			= 1 << 18,
+	IB_PORT_DEVICE_MGMT_SUP			= 1 << 19,
+	IB_PORT_VENDOR_CLASS_SUP		= 1 << 20,
+	IB_PORT_DR_NOTICE_SUP			= 1 << 21,
+	IB_PORT_CAP_MASK_NOTICE_SUP		= 1 << 22,
+	IB_PORT_BOOT_MGMT_SUP			= 1 << 23,
+	IB_PORT_LINK_LATENCY_SUP		= 1 << 24,
+	IB_PORT_CLIENT_REG_SUP			= 1 << 25
+};
+
+enum ib_port_width {
+	IB_WIDTH_1X	= 1,
+	IB_WIDTH_4X	= 2,
+	IB_WIDTH_8X	= 4,
+	IB_WIDTH_12X	= 8
+};
+
+static inline int ib_width_enum_to_int(enum ib_port_width width)
+{
+	switch (width) {
+	case IB_WIDTH_1X:  return  1;
+	case IB_WIDTH_4X:  return  4;
+	case IB_WIDTH_8X:  return  8;
+	case IB_WIDTH_12X: return 12;
+	default: 	  return -1;
+	}
+}
+
+struct ib_port_attr {
+	enum ib_port_state	state;
+	enum ib_mtu		max_mtu;
+	enum ib_mtu		active_mtu;
+	int			gid_tbl_len;
+	u32			port_cap_flags;
+	u32			max_msg_sz;
+	u32			bad_pkey_cntr;
+	u32			qkey_viol_cntr;
+	u16			pkey_tbl_len;
+	u16			lid;
+	u16			sm_lid;
+	u8			lmc;
+	u8			max_vl_num;
+	u8			sm_sl;
+	u8			subnet_timeout;
+	u8			init_type_reply;
+	u8			active_width;
+	u8			active_speed;
+	u8                      phys_state;
+};
+
+enum ib_device_modify_flags {
+	IB_DEVICE_MODIFY_SYS_IMAGE_GUID	= 1
+};
+
+struct ib_device_modify {
+	u64	sys_image_guid;
+};
+
+enum ib_port_modify_flags {
+	IB_PORT_SHUTDOWN		= 1,
+	IB_PORT_INIT_TYPE		= (1<<2),
+	IB_PORT_RESET_QKEY_CNTR		= (1<<3)
+};
+
+struct ib_port_modify {
+	u32	set_port_cap_mask;
+	u32	clr_port_cap_mask;
+	u8	init_type;
+};
+
+enum ib_event_type {
+	IB_EVENT_CQ_ERR,
+	IB_EVENT_QP_FATAL,
+	IB_EVENT_QP_REQ_ERR,
+	IB_EVENT_QP_ACCESS_ERR,
+	IB_EVENT_COMM_EST,
+	IB_EVENT_SQ_DRAINED,
+	IB_EVENT_PATH_MIG,
+	IB_EVENT_PATH_MIG_ERR,
+	IB_EVENT_DEVICE_FATAL,
+	IB_EVENT_PORT_ACTIVE,
+	IB_EVENT_PORT_ERR,
+	IB_EVENT_LID_CHANGE,
+	IB_EVENT_PKEY_CHANGE,
+	IB_EVENT_SM_CHANGE
+};
+
+struct ib_event {
+	struct ib_device	*device;
+	union {
+		struct ib_cq	*cq;
+		struct ib_qp	*qp;
+		u8		port_num;
+	} element;
+	enum ib_event_type	event;
+};
+
+struct ib_event_handler {
+	struct ib_device *device;
+	void            (*handler)(struct ib_event_handler *, struct ib_event *);
+	struct list_head  list;
+};
+
+#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler)		\
+	do {							\
+		(_ptr)->device  = _device;			\
+		(_ptr)->handler = _handler;			\
+		INIT_LIST_HEAD(&(_ptr)->list);			\
+	} while (0)
+
+struct ib_global_route {
+	union ib_gid	dgid;
+	u32		flow_label;
+	u8		sgid_index;
+	u8		hop_limit;
+	u8		traffic_class;
+};
+
+enum {
+	IB_MULTICAST_QPN = 0xffffff
+};
+
+enum ib_ah_flags {
+	IB_AH_GRH	= 1
+};
+
+struct ib_ah_attr {
+	struct ib_global_route	grh;
+	u16			dlid;
+	u8			sl;
+	u8			src_path_bits;
+	u8			static_rate;
+	u8			ah_flags;
+	u8			port_num;
+};
+
+enum ib_wc_status {
+	IB_WC_SUCCESS,
+	IB_WC_LOC_LEN_ERR,
+	IB_WC_LOC_QP_OP_ERR,
+	IB_WC_LOC_EEC_OP_ERR,
+	IB_WC_LOC_PROT_ERR,
+	IB_WC_WR_FLUSH_ERR,
+	IB_WC_MW_BIND_ERR,
+	IB_WC_BAD_RESP_ERR,
+	IB_WC_LOC_ACCESS_ERR,
+	IB_WC_REM_INV_REQ_ERR,
+	IB_WC_REM_ACCESS_ERR,
+	IB_WC_REM_OP_ERR,
+	IB_WC_RETRY_EXC_ERR,
+	IB_WC_RNR_RETRY_EXC_ERR,
+	IB_WC_LOC_RDD_VIOL_ERR,
+	IB_WC_REM_INV_RD_REQ_ERR,
+	IB_WC_REM_ABORT_ERR,
+	IB_WC_INV_EECN_ERR,
+	IB_WC_INV_EEC_STATE_ERR,
+	IB_WC_FATAL_ERR,
+	IB_WC_RESP_TIMEOUT_ERR,
+	IB_WC_GENERAL_ERR
+};
+
+enum ib_wc_opcode {
+	IB_WC_SEND,
+	IB_WC_RDMA_WRITE,
+	IB_WC_RDMA_READ,
+	IB_WC_COMP_SWAP,
+	IB_WC_FETCH_ADD,
+	IB_WC_BIND_MW,
+/*
+ * Set value of IB_WC_RECV so consumers can test if a completion is a
+ * receive by testing (opcode & IB_WC_RECV).
+ */
+	IB_WC_RECV			= 1 << 7,
+	IB_WC_RECV_RDMA_WITH_IMM
+};
+
+enum ib_wc_flags {
+	IB_WC_GRH		= 1,
+	IB_WC_WITH_IMM		= (1<<1)
+};
+
+struct ib_wc {
+	u64			wr_id;
+	enum ib_wc_status	status;
+	enum ib_wc_opcode	opcode;
+	u32			vendor_err;
+	u32			byte_len;
+	__be32			imm_data;
+	u32			qp_num;
+	u32			src_qp;
+	int			wc_flags;
+	u16			pkey_index;
+	u16			slid;
+	u8			sl;
+	u8			dlid_path_bits;
+	u8			port_num;	/* valid only for DR SMPs on switches */
+};
+
+enum ib_cq_notify {
+	IB_CQ_SOLICITED,
+	IB_CQ_NEXT_COMP
+};
+
+struct ib_qp_cap {
+	u32	max_send_wr;
+	u32	max_recv_wr;
+	u32	max_send_sge;
+	u32	max_recv_sge;
+	u32	max_inline_data;
+};
+
+enum ib_sig_type {
+	IB_SIGNAL_ALL_WR,
+	IB_SIGNAL_REQ_WR
+};
+
+enum ib_qp_type {
+	/*
+	 * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+	 * here (and in that order) since the MAD layer uses them as
+	 * indices into a 2-entry table.
+	 */
+	IB_QPT_SMI,
+	IB_QPT_GSI,
+
+	IB_QPT_RC,
+	IB_QPT_UC,
+	IB_QPT_UD,
+	IB_QPT_RAW_IPV6,
+	IB_QPT_RAW_ETY
+};
+
+struct ib_qp_init_attr {
+	void                  (*event_handler)(struct ib_event *, void *);
+	void		       *qp_context;
+	struct ib_cq	       *send_cq;
+	struct ib_cq	       *recv_cq;
+	struct ib_srq	       *srq;
+	struct ib_qp_cap	cap;
+	enum ib_sig_type	sq_sig_type;
+	enum ib_qp_type		qp_type;
+	u8			port_num; /* special QP types only */
+};
+
+enum ib_rnr_timeout {
+	IB_RNR_TIMER_655_36 =  0,
+	IB_RNR_TIMER_000_01 =  1,
+	IB_RNR_TIMER_000_02 =  2,
+	IB_RNR_TIMER_000_03 =  3,
+	IB_RNR_TIMER_000_04 =  4,
+	IB_RNR_TIMER_000_06 =  5,
+	IB_RNR_TIMER_000_08 =  6,
+	IB_RNR_TIMER_000_12 =  7,
+	IB_RNR_TIMER_000_16 =  8,
+	IB_RNR_TIMER_000_24 =  9,
+	IB_RNR_TIMER_000_32 = 10,
+	IB_RNR_TIMER_000_48 = 11,
+	IB_RNR_TIMER_000_64 = 12,
+	IB_RNR_TIMER_000_96 = 13,
+	IB_RNR_TIMER_001_28 = 14,
+	IB_RNR_TIMER_001_92 = 15,
+	IB_RNR_TIMER_002_56 = 16,
+	IB_RNR_TIMER_003_84 = 17,
+	IB_RNR_TIMER_005_12 = 18,
+	IB_RNR_TIMER_007_68 = 19,
+	IB_RNR_TIMER_010_24 = 20,
+	IB_RNR_TIMER_015_36 = 21,
+	IB_RNR_TIMER_020_48 = 22,
+	IB_RNR_TIMER_030_72 = 23,
+	IB_RNR_TIMER_040_96 = 24,
+	IB_RNR_TIMER_061_44 = 25,
+	IB_RNR_TIMER_081_92 = 26,
+	IB_RNR_TIMER_122_88 = 27,
+	IB_RNR_TIMER_163_84 = 28,
+	IB_RNR_TIMER_245_76 = 29,
+	IB_RNR_TIMER_327_68 = 30,
+	IB_RNR_TIMER_491_52 = 31
+};
+
+enum ib_qp_attr_mask {
+	IB_QP_STATE			= 1,
+	IB_QP_CUR_STATE			= (1<<1),
+	IB_QP_EN_SQD_ASYNC_NOTIFY	= (1<<2),
+	IB_QP_ACCESS_FLAGS		= (1<<3),
+	IB_QP_PKEY_INDEX		= (1<<4),
+	IB_QP_PORT			= (1<<5),
+	IB_QP_QKEY			= (1<<6),
+	IB_QP_AV			= (1<<7),
+	IB_QP_PATH_MTU			= (1<<8),
+	IB_QP_TIMEOUT			= (1<<9),
+	IB_QP_RETRY_CNT			= (1<<10),
+	IB_QP_RNR_RETRY			= (1<<11),
+	IB_QP_RQ_PSN			= (1<<12),
+	IB_QP_MAX_QP_RD_ATOMIC		= (1<<13),
+	IB_QP_ALT_PATH			= (1<<14),
+	IB_QP_MIN_RNR_TIMER		= (1<<15),
+	IB_QP_SQ_PSN			= (1<<16),
+	IB_QP_MAX_DEST_RD_ATOMIC	= (1<<17),
+	IB_QP_PATH_MIG_STATE		= (1<<18),
+	IB_QP_CAP			= (1<<19),
+	IB_QP_DEST_QPN			= (1<<20)
+};
+
+enum ib_qp_state {
+	IB_QPS_RESET,
+	IB_QPS_INIT,
+	IB_QPS_RTR,
+	IB_QPS_RTS,
+	IB_QPS_SQD,
+	IB_QPS_SQE,
+	IB_QPS_ERR
+};
+
+enum ib_mig_state {
+	IB_MIG_MIGRATED,
+	IB_MIG_REARM,
+	IB_MIG_ARMED
+};
+
+struct ib_qp_attr {
+	enum ib_qp_state	qp_state;
+	enum ib_qp_state	cur_qp_state;
+	enum ib_mtu		path_mtu;
+	enum ib_mig_state	path_mig_state;
+	u32			qkey;
+	u32			rq_psn;
+	u32			sq_psn;
+	u32			dest_qp_num;
+	int			qp_access_flags;
+	struct ib_qp_cap	cap;
+	struct ib_ah_attr	ah_attr;
+	struct ib_ah_attr	alt_ah_attr;
+	u16			pkey_index;
+	u16			alt_pkey_index;
+	u8			en_sqd_async_notify;
+	u8			sq_draining;
+	u8			max_rd_atomic;
+	u8			max_dest_rd_atomic;
+	u8			min_rnr_timer;
+	u8			port_num;
+	u8			timeout;
+	u8			retry_cnt;
+	u8			rnr_retry;
+	u8			alt_port_num;
+	u8			alt_timeout;
+};
+
+enum ib_wr_opcode {
+	IB_WR_RDMA_WRITE,
+	IB_WR_RDMA_WRITE_WITH_IMM,
+	IB_WR_SEND,
+	IB_WR_SEND_WITH_IMM,
+	IB_WR_RDMA_READ,
+	IB_WR_ATOMIC_CMP_AND_SWP,
+	IB_WR_ATOMIC_FETCH_AND_ADD
+};
+
+enum ib_send_flags {
+	IB_SEND_FENCE		= 1,
+	IB_SEND_SIGNALED	= (1<<1),
+	IB_SEND_SOLICITED	= (1<<2),
+	IB_SEND_INLINE		= (1<<3)
+};
+
+struct ib_sge {
+	u64	addr;
+	u32	length;
+	u32	lkey;
+};
+
+struct ib_send_wr {
+	struct ib_send_wr      *next;
+	u64			wr_id;
+	struct ib_sge	       *sg_list;
+	int			num_sge;
+	enum ib_wr_opcode	opcode;
+	int			send_flags;
+	u32			imm_data;
+	union {
+		struct {
+			u64	remote_addr;
+			u32	rkey;
+		} rdma;
+		struct {
+			u64	remote_addr;
+			u64	compare_add;
+			u64	swap;
+			u32	rkey;
+		} atomic;
+		struct {
+			struct ib_ah *ah;
+			struct ib_mad_hdr *mad_hdr;
+			u32	remote_qpn;
+			u32	remote_qkey;
+			int	timeout_ms; /* valid for MADs only */
+			u16	pkey_index; /* valid for GSI only */
+			u8	port_num;   /* valid for DR SMPs on switch only */
+		} ud;
+	} wr;
+};
+
+struct ib_recv_wr {
+	struct ib_recv_wr      *next;
+	u64			wr_id;
+	struct ib_sge	       *sg_list;
+	int			num_sge;
+};
+
+enum ib_access_flags {
+	IB_ACCESS_LOCAL_WRITE	= 1,
+	IB_ACCESS_REMOTE_WRITE	= (1<<1),
+	IB_ACCESS_REMOTE_READ	= (1<<2),
+	IB_ACCESS_REMOTE_ATOMIC	= (1<<3),
+	IB_ACCESS_MW_BIND	= (1<<4)
+};
+
+struct ib_phys_buf {
+	u64      addr;
+	u64      size;
+};
+
+struct ib_mr_attr {
+	struct ib_pd	*pd;
+	u64		device_virt_addr;
+	u64		size;
+	int		mr_access_flags;
+	u32		lkey;
+	u32		rkey;
+};
+
+enum ib_mr_rereg_flags {
+	IB_MR_REREG_TRANS	= 1,
+	IB_MR_REREG_PD		= (1<<1),
+	IB_MR_REREG_ACCESS	= (1<<2)
+};
+
+struct ib_mw_bind {
+	struct ib_mr   *mr;
+	u64		wr_id;
+	u64		addr;
+	u32		length;
+	int		send_flags;
+	int		mw_access_flags;
+};
+
+struct ib_fmr_attr {
+	int	max_pages;
+	int	max_maps;
+	u8	page_size;
+};
+
+struct ib_pd {
+	struct ib_device *device;
+	atomic_t          usecnt; /* count all resources */
+};
+
+struct ib_ah {
+	struct ib_device	*device;
+	struct ib_pd		*pd;
+};
+
+typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
+
+struct ib_cq {
+	struct ib_device *device;
+	ib_comp_handler   comp_handler;
+	void             (*event_handler)(struct ib_event *, void *);
+	void *            cq_context;
+	int               cqe;
+	atomic_t          usecnt; /* count number of work queues */
+};
+
+struct ib_srq {
+	struct ib_device	*device;
+	struct ib_pd		*pd;
+	void			*srq_context;
+	atomic_t		usecnt;
+};
+
+struct ib_qp {
+	struct ib_device       *device;
+	struct ib_pd	       *pd;
+	struct ib_cq	       *send_cq;
+	struct ib_cq	       *recv_cq;
+	struct ib_srq	       *srq;
+	void                  (*event_handler)(struct ib_event *, void *);
+	void		       *qp_context;
+	u32			qp_num;
+	enum ib_qp_type		qp_type;
+};
+
+struct ib_mr {
+	struct ib_device *device;
+	struct ib_pd     *pd;
+	u32		  lkey;
+	u32		  rkey;
+	atomic_t          usecnt; /* count number of MWs */
+};
+
+struct ib_mw {
+	struct ib_device	*device;
+	struct ib_pd		*pd;
+	u32			rkey;
+};
+
+struct ib_fmr {
+	struct ib_device	*device;
+	struct ib_pd		*pd;
+	struct list_head	list;
+	u32			lkey;
+	u32			rkey;
+};
+
+struct ib_mad;
+struct ib_grh;
+
+enum ib_process_mad_flags {
+	IB_MAD_IGNORE_MKEY	= 1,
+	IB_MAD_IGNORE_BKEY	= 2,
+	IB_MAD_IGNORE_ALL	= IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY
+};
+
+enum ib_mad_result {
+	IB_MAD_RESULT_FAILURE  = 0,      /* (!SUCCESS is the important flag) */
+	IB_MAD_RESULT_SUCCESS  = 1 << 0, /* MAD was successfully processed   */
+	IB_MAD_RESULT_REPLY    = 1 << 1, /* Reply packet needs to be sent    */
+	IB_MAD_RESULT_CONSUMED = 1 << 2  /* Packet consumed: stop processing */
+};
+
+#define IB_DEVICE_NAME_MAX 64
+
+struct ib_cache {
+	rwlock_t                lock;
+	struct ib_event_handler event_handler;
+	struct ib_pkey_cache  **pkey_cache;
+	struct ib_gid_cache   **gid_cache;
+};
+
+struct ib_device {
+	struct device                *dma_device;
+
+	char                          name[IB_DEVICE_NAME_MAX];
+
+	struct list_head              event_handler_list;
+	spinlock_t                    event_handler_lock;
+
+	struct list_head              core_list;
+	struct list_head              client_data_list;
+	spinlock_t                    client_data_lock;
+
+	struct ib_cache               cache;
+
+	u32                           flags;
+
+	int		           (*query_device)(struct ib_device *device,
+						   struct ib_device_attr *device_attr);
+	int		           (*query_port)(struct ib_device *device,
+						 u8 port_num,
+						 struct ib_port_attr *port_attr);
+	int		           (*query_gid)(struct ib_device *device,
+						u8 port_num, int index,
+						union ib_gid *gid);
+	int		           (*query_pkey)(struct ib_device *device,
+						 u8 port_num, u16 index, u16 *pkey);
+	int		           (*modify_device)(struct ib_device *device,
+						    int device_modify_mask,
+						    struct ib_device_modify *device_modify);
+	int		           (*modify_port)(struct ib_device *device,
+						  u8 port_num, int port_modify_mask,
+						  struct ib_port_modify *port_modify);
+	struct ib_pd *             (*alloc_pd)(struct ib_device *device);
+	int                        (*dealloc_pd)(struct ib_pd *pd);
+	struct ib_ah *             (*create_ah)(struct ib_pd *pd,
+						struct ib_ah_attr *ah_attr);
+	int                        (*modify_ah)(struct ib_ah *ah,
+						struct ib_ah_attr *ah_attr);
+	int                        (*query_ah)(struct ib_ah *ah,
+					       struct ib_ah_attr *ah_attr);
+	int                        (*destroy_ah)(struct ib_ah *ah);
+	struct ib_qp *             (*create_qp)(struct ib_pd *pd,
+						struct ib_qp_init_attr *qp_init_attr);
+	int                        (*modify_qp)(struct ib_qp *qp,
+						struct ib_qp_attr *qp_attr,
+						int qp_attr_mask);
+	int                        (*query_qp)(struct ib_qp *qp,
+					       struct ib_qp_attr *qp_attr,
+					       int qp_attr_mask,
+					       struct ib_qp_init_attr *qp_init_attr);
+	int                        (*destroy_qp)(struct ib_qp *qp);
+	int                        (*post_send)(struct ib_qp *qp,
+						struct ib_send_wr *send_wr,
+						struct ib_send_wr **bad_send_wr);
+	int                        (*post_recv)(struct ib_qp *qp,
+						struct ib_recv_wr *recv_wr,
+						struct ib_recv_wr **bad_recv_wr);
+	struct ib_cq *             (*create_cq)(struct ib_device *device,
+						int cqe);
+	int                        (*destroy_cq)(struct ib_cq *cq);
+	int                        (*resize_cq)(struct ib_cq *cq, int *cqe);
+	int                        (*poll_cq)(struct ib_cq *cq, int num_entries,
+					      struct ib_wc *wc);
+	int                        (*peek_cq)(struct ib_cq *cq, int wc_cnt);
+	int                        (*req_notify_cq)(struct ib_cq *cq,
+						    enum ib_cq_notify cq_notify);
+	int                        (*req_ncomp_notif)(struct ib_cq *cq,
+						      int wc_cnt);
+	struct ib_mr *             (*get_dma_mr)(struct ib_pd *pd,
+						 int mr_access_flags);
+	struct ib_mr *             (*reg_phys_mr)(struct ib_pd *pd,
+						  struct ib_phys_buf *phys_buf_array,
+						  int num_phys_buf,
+						  int mr_access_flags,
+						  u64 *iova_start);
+	int                        (*query_mr)(struct ib_mr *mr,
+					       struct ib_mr_attr *mr_attr);
+	int                        (*dereg_mr)(struct ib_mr *mr);
+	int                        (*rereg_phys_mr)(struct ib_mr *mr,
+						    int mr_rereg_mask,
+						    struct ib_pd *pd,
+						    struct ib_phys_buf *phys_buf_array,
+						    int num_phys_buf,
+						    int mr_access_flags,
+						    u64 *iova_start);
+	struct ib_mw *             (*alloc_mw)(struct ib_pd *pd);
+	int                        (*bind_mw)(struct ib_qp *qp,
+					      struct ib_mw *mw,
+					      struct ib_mw_bind *mw_bind);
+	int                        (*dealloc_mw)(struct ib_mw *mw);
+	struct ib_fmr *	           (*alloc_fmr)(struct ib_pd *pd,
+						int mr_access_flags,
+						struct ib_fmr_attr *fmr_attr);
+	int		           (*map_phys_fmr)(struct ib_fmr *fmr,
+						   u64 *page_list, int list_len,
+						   u64 iova);
+	int		           (*unmap_fmr)(struct list_head *fmr_list);
+	int		           (*dealloc_fmr)(struct ib_fmr *fmr);
+	int                        (*attach_mcast)(struct ib_qp *qp,
+						   union ib_gid *gid,
+						   u16 lid);
+	int                        (*detach_mcast)(struct ib_qp *qp,
+						   union ib_gid *gid,
+						   u16 lid);
+	int                        (*process_mad)(struct ib_device *device,
+						  int process_mad_flags,
+						  u8 port_num,
+						  struct ib_wc *in_wc,
+						  struct ib_grh *in_grh,
+						  struct ib_mad *in_mad,
+						  struct ib_mad *out_mad);
+
+	struct class_device          class_dev;
+	struct kobject               ports_parent;
+	struct list_head             port_list;
+
+	enum {
+		IB_DEV_UNINITIALIZED,
+		IB_DEV_REGISTERED,
+		IB_DEV_UNREGISTERED
+	}                            reg_state;
+
+	u8                           node_type;
+	u8                           phys_port_cnt;
+};
+
+struct ib_client {
+	char  *name;
+	void (*add)   (struct ib_device *);
+	void (*remove)(struct ib_device *);
+
+	struct list_head list;
+};
+
+struct ib_device *ib_alloc_device(size_t size);
+void ib_dealloc_device(struct ib_device *device);
+
+int ib_register_device   (struct ib_device *device);
+void ib_unregister_device(struct ib_device *device);
+
+int ib_register_client   (struct ib_client *client);
+void ib_unregister_client(struct ib_client *client);
+
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
+void  ib_set_client_data(struct ib_device *device, struct ib_client *client,
+			 void *data);
+
+int ib_register_event_handler  (struct ib_event_handler *event_handler);
+int ib_unregister_event_handler(struct ib_event_handler *event_handler);
+void ib_dispatch_event(struct ib_event *event);
+
+int ib_query_device(struct ib_device *device,
+		    struct ib_device_attr *device_attr);
+
+int ib_query_port(struct ib_device *device,
+		  u8 port_num, struct ib_port_attr *port_attr);
+
+int ib_query_gid(struct ib_device *device,
+		 u8 port_num, int index, union ib_gid *gid);
+
+int ib_query_pkey(struct ib_device *device,
+		  u8 port_num, u16 index, u16 *pkey);
+
+int ib_modify_device(struct ib_device *device,
+		     int device_modify_mask,
+		     struct ib_device_modify *device_modify);
+
+int ib_modify_port(struct ib_device *device,
+		   u8 port_num, int port_modify_mask,
+		   struct ib_port_modify *port_modify);
+
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ */
+struct ib_pd *ib_alloc_pd(struct ib_device *device);
+
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ */
+int ib_dealloc_pd(struct ib_pd *pd);
+
+/**
+ * ib_create_ah - Creates an address handle for the given address vector.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_modify_ah - Modifies the address vector associated with an address
+ *   handle.
+ * @ah: The address handle to modify.
+ * @ah_attr: The new address vector attributes to associate with the
+ *   address handle.
+ */
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_query_ah - Queries the address vector associated with an address
+ *   handle.
+ * @ah: The address handle to query.
+ * @ah_attr: The address vector attributes associated with the address
+ *   handle.
+ */
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_destroy_ah - Destroys an address handle.
+ * @ah: The address handle to destroy.
+ */
+int ib_destroy_ah(struct ib_ah *ah);
+
+/**
+ * ib_create_qp - Creates a QP associated with the specified protection
+ *   domain.
+ * @pd: The protection domain associated with the QP.
+ * @qp_init_attr: A list of initial attributes required to create the QP.
+ */
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+			   struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_modify_qp - Modifies the attributes for the specified QP and then
+ *   transitions the QP to the given state.
+ * @qp: The QP to modify.
+ * @qp_attr: On input, specifies the QP attributes to modify.  On output,
+ *   the current values of selected QP attributes are returned.
+ * @qp_attr_mask: A bit-mask used to specify which attributes of the QP
+ *   are being modified.
+ */
+int ib_modify_qp(struct ib_qp *qp,
+		 struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask);
+
+/**
+ * ib_query_qp - Returns the attribute list and current values for the
+ *   specified QP.
+ * @qp: The QP to query.
+ * @qp_attr: The attributes of the specified QP.
+ * @qp_attr_mask: A bit-mask used to select specific attributes to query.
+ * @qp_init_attr: Additional attributes of the selected QP.
+ *
+ * The qp_attr_mask may be used to limit the query to gathering only the
+ * selected attributes.
+ */
+int ib_query_qp(struct ib_qp *qp,
+		struct ib_qp_attr *qp_attr,
+		int qp_attr_mask,
+		struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_destroy_qp - Destroys the specified QP.
+ * @qp: The QP to destroy.
+ */
+int ib_destroy_qp(struct ib_qp *qp);
+
+/**
+ * ib_post_send - Posts a list of work requests to the send queue of
+ *   the specified QP.
+ * @qp: The QP to post the work request on.
+ * @send_wr: A list of work requests to post on the send queue.
+ * @bad_send_wr: On an immediate failure, this parameter will reference
+ *   the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_send(struct ib_qp *qp,
+			       struct ib_send_wr *send_wr,
+			       struct ib_send_wr **bad_send_wr)
+{
+	return qp->device->post_send(qp, send_wr, bad_send_wr);
+}
+
+/**
+ * ib_post_recv - Posts a list of work requests to the receive queue of
+ *   the specified QP.
+ * @qp: The QP to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ *   the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_recv(struct ib_qp *qp,
+			       struct ib_recv_wr *recv_wr,
+			       struct ib_recv_wr **bad_recv_wr)
+{
+	return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
+}
+
+/**
+ * ib_create_cq - Creates a CQ on the specified device.
+ * @device: The device on which to create the CQ.
+ * @comp_handler: A user-specified callback that is invoked when a
+ *   completion event occurs on the CQ.
+ * @event_handler: A user-specified callback that is invoked when an
+ *   asynchronous event not associated with a completion occurs on the CQ.
+ * @cq_context: Context associated with the CQ returned to the user via
+ *   the associated completion and event handlers.
+ * @cqe: The minimum size of the CQ.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+struct ib_cq *ib_create_cq(struct ib_device *device,
+			   ib_comp_handler comp_handler,
+			   void (*event_handler)(struct ib_event *, void *),
+			   void *cq_context, int cqe);
+
+/**
+ * ib_resize_cq - Modifies the capacity of the CQ.
+ * @cq: The CQ to resize.
+ * @cqe: The minimum size of the CQ.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+int ib_resize_cq(struct ib_cq *cq, int cqe);
+
+/**
+ * ib_destroy_cq - Destroys the specified CQ.
+ * @cq: The CQ to destroy.
+ */
+int ib_destroy_cq(struct ib_cq *cq);
+
+/**
+ * ib_poll_cq - poll a CQ for completion(s)
+ * @cq:the CQ being polled
+ * @num_entries:maximum number of completions to return
+ * @wc:array of at least @num_entries &struct ib_wc where completions
+ *   will be returned
+ *
+ * Poll a CQ for (possibly multiple) completions.  If the return value
+ * is < 0, an error occurred.  If the return value is >= 0, it is the
+ * number of completions returned.  If the return value is
+ * non-negative and < num_entries, then the CQ was emptied.
+ */
+static inline int ib_poll_cq(struct ib_cq *cq, int num_entries,
+			     struct ib_wc *wc)
+{
+	return cq->device->poll_cq(cq, num_entries, wc);
+}
+
+/**
+ * ib_peek_cq - Returns the number of unreaped completions currently
+ *   on the specified CQ.
+ * @cq: The CQ to peek.
+ * @wc_cnt: A minimum number of unreaped completions to check for.
+ *
+ * If the number of unreaped completions is greater than or equal to wc_cnt,
+ * this function returns wc_cnt, otherwise, it returns the actual number of
+ * unreaped completions.
+ */
+int ib_peek_cq(struct ib_cq *cq, int wc_cnt);
+
+/**
+ * ib_req_notify_cq - Request completion notification on a CQ.
+ * @cq: The CQ to generate an event for.
+ * @cq_notify: If set to %IB_CQ_SOLICITED, completion notification will
+ *   occur on the next solicited event. If set to %IB_CQ_NEXT_COMP,
+ *   notification will occur on the next completion.
+ */
+static inline int ib_req_notify_cq(struct ib_cq *cq,
+				   enum ib_cq_notify cq_notify)
+{
+	return cq->device->req_notify_cq(cq, cq_notify);
+}
+
+/**
+ * ib_req_ncomp_notif - Request completion notification when there are
+ *   at least the specified number of unreaped completions on the CQ.
+ * @cq: The CQ to generate an event for.
+ * @wc_cnt: The number of unreaped completions that should be on the
+ *   CQ before an event is generated.
+ */
+static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt)
+{
+	return cq->device->req_ncomp_notif ?
+		cq->device->req_ncomp_notif(cq, wc_cnt) :
+		-ENOSYS;
+}
+
+/**
+ * ib_get_dma_mr - Returns a memory region for system memory that is
+ *   usable for DMA.
+ * @pd: The protection domain associated with the memory region.
+ * @mr_access_flags: Specifies the memory access rights.
+ */
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+
+/**
+ * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
+ *   by an HCA.
+ * @pd: The protection domain associated assigned to the registered region.
+ * @phys_buf_array: Specifies a list of physical buffers to use in the
+ *   memory region.
+ * @num_phys_buf: Specifies the size of the phys_buf_array.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+			     struct ib_phys_buf *phys_buf_array,
+			     int num_phys_buf,
+			     int mr_access_flags,
+			     u64 *iova_start);
+
+/**
+ * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
+ *   Conceptually, this call performs the functions deregister memory region
+ *   followed by register physical memory region.  Where possible,
+ *   resources are reused instead of deallocated and reallocated.
+ * @mr: The memory region to modify.
+ * @mr_rereg_mask: A bit-mask used to indicate which of the following
+ *   properties of the memory region are being modified.
+ * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
+ *   the new protection domain to associated with the memory region,
+ *   otherwise, this parameter is ignored.
+ * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ *   field specifies a list of physical buffers to use in the new
+ *   translation, otherwise, this parameter is ignored.
+ * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ *   field specifies the size of the phys_buf_array, otherwise, this
+ *   parameter is ignored.
+ * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
+ *   field specifies the new memory access rights, otherwise, this
+ *   parameter is ignored.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+int ib_rereg_phys_mr(struct ib_mr *mr,
+		     int mr_rereg_mask,
+		     struct ib_pd *pd,
+		     struct ib_phys_buf *phys_buf_array,
+		     int num_phys_buf,
+		     int mr_access_flags,
+		     u64 *iova_start);
+
+/**
+ * ib_query_mr - Retrieves information about a specific memory region.
+ * @mr: The memory region to retrieve information about.
+ * @mr_attr: The attributes of the specified memory region.
+ */
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
+
+/**
+ * ib_dereg_mr - Deregisters a memory region and removes it from the
+ *   HCA translation table.
+ * @mr: The memory region to deregister.
+ */
+int ib_dereg_mr(struct ib_mr *mr);
+
+/**
+ * ib_alloc_mw - Allocates a memory window.
+ * @pd: The protection domain associated with the memory window.
+ */
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd);
+
+/**
+ * ib_bind_mw - Posts a work request to the send queue of the specified
+ *   QP, which binds the memory window to the given address range and
+ *   remote access attributes.
+ * @qp: QP to post the bind work request on.
+ * @mw: The memory window to bind.
+ * @mw_bind: Specifies information about the memory window, including
+ *   its address range, remote access rights, and associated memory region.
+ */
+static inline int ib_bind_mw(struct ib_qp *qp,
+			     struct ib_mw *mw,
+			     struct ib_mw_bind *mw_bind)
+{
+	/* XXX reference counting in corresponding MR? */
+	return mw->device->bind_mw ?
+		mw->device->bind_mw(qp, mw, mw_bind) :
+		-ENOSYS;
+}
+
+/**
+ * ib_dealloc_mw - Deallocates a memory window.
+ * @mw: The memory window to deallocate.
+ */
+int ib_dealloc_mw(struct ib_mw *mw);
+
+/**
+ * ib_alloc_fmr - Allocates a unmapped fast memory region.
+ * @pd: The protection domain associated with the unmapped region.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @fmr_attr: Attributes of the unmapped region.
+ *
+ * A fast memory region must be mapped before it can be used as part of
+ * a work request.
+ */
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+			    int mr_access_flags,
+			    struct ib_fmr_attr *fmr_attr);
+
+/**
+ * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region.
+ * @fmr: The fast memory region to associate with the pages.
+ * @page_list: An array of physical pages to map to the fast memory region.
+ * @list_len: The number of pages in page_list.
+ * @iova: The I/O virtual address to use with the mapped region.
+ */
+static inline int ib_map_phys_fmr(struct ib_fmr *fmr,
+				  u64 *page_list, int list_len,
+				  u64 iova)
+{
+	return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova);
+}
+
+/**
+ * ib_unmap_fmr - Removes the mapping from a list of fast memory regions.
+ * @fmr_list: A linked list of fast memory regions to unmap.
+ */
+int ib_unmap_fmr(struct list_head *fmr_list);
+
+/**
+ * ib_dealloc_fmr - Deallocates a fast memory region.
+ * @fmr: The fast memory region to deallocate.
+ */
+int ib_dealloc_fmr(struct ib_fmr *fmr);
+
+/**
+ * ib_attach_mcast - Attaches the specified QP to a multicast group.
+ * @qp: QP to attach to the multicast group.  The QP must be type
+ *   IB_QPT_UD.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ *
+ * In order to send and receive multicast packets, subnet
+ * administration must have created the multicast group and configured
+ * the fabric appropriately.  The port associated with the specified
+ * QP must also be a member of the multicast group.
+ */
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+/**
+ * ib_detach_mcast - Detaches the specified QP from a multicast group.
+ * @qp: QP to detach from the multicast group.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ */
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+#endif /* IB_VERBS_H */
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
new file mode 100644
index 000000000000..8d2e04cac68e
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -0,0 +1,33 @@
+config INFINIBAND_IPOIB
+	tristate "IP-over-InfiniBand"
+	depends on INFINIBAND && NETDEVICES && INET
+	---help---
+	  Support for the IP-over-InfiniBand protocol (IPoIB). This
+	  transports IP packets over InfiniBand so you can use your IB
+	  device as a fancy NIC.
+
+	  The IPoIB protocol is defined by the IETF ipoib working
+	  group: <http://www.ietf.org/html.charters/ipoib-charter.html>.
+
+config INFINIBAND_IPOIB_DEBUG
+	bool "IP-over-InfiniBand debugging"
+	depends on INFINIBAND_IPOIB
+	---help---
+	  This option causes debugging code to be compiled into the
+	  IPoIB driver.  The output can be turned on via the
+	  debug_level and mcast_debug_level module parameters (which
+	  can also be set after the driver is loaded through sysfs).
+
+	  This option also creates an "ipoib_debugfs," which can be
+	  mounted to expose debugging information about IB multicast
+	  groups used by the IPoIB driver.
+
+config INFINIBAND_IPOIB_DEBUG_DATA
+	bool "IP-over-InfiniBand data path debugging"
+	depends on INFINIBAND_IPOIB_DEBUG
+	---help---
+	  This option compiles debugging code into the the data path
+	  of the IPoIB driver.  The output can be turned on via the
+	  data_debug_level module parameter; however, even with output
+	  turned off, this debugging code will have some performance
+	  impact.
diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile
new file mode 100644
index 000000000000..394bc08abc6f
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/Makefile
@@ -0,0 +1,11 @@
+EXTRA_CFLAGS += -Idrivers/infiniband/include
+
+obj-$(CONFIG_INFINIBAND_IPOIB)			+= ib_ipoib.o
+
+ib_ipoib-y					:= ipoib_main.o \
+						   ipoib_ib.o \
+						   ipoib_multicast.o \
+						   ipoib_verbs.o \
+						   ipoib_vlan.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG)	+= ipoib_fs.o
+
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
new file mode 100644
index 000000000000..04c98f54e9c4
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib.h 1358 2004-12-17 22:00:11Z roland $
+ */
+
+#ifndef _IPOIB_H
+#define _IPOIB_H
+
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/workqueue.h>
+#include <linux/pci.h>
+#include <linux/config.h>
+#include <linux/kref.h>
+#include <linux/if_infiniband.h>
+
+#include <net/neighbour.h>
+
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+#include <ib_verbs.h>
+#include <ib_pack.h>
+#include <ib_sa.h>
+
+/* constants */
+
+enum {
+	IPOIB_PACKET_SIZE         = 2048,
+	IPOIB_BUF_SIZE 		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
+
+	IPOIB_ENCAP_LEN 	  = 4,
+
+	IPOIB_RX_RING_SIZE 	  = 128,
+	IPOIB_TX_RING_SIZE 	  = 64,
+
+	IPOIB_NUM_WC 		  = 4,
+
+	IPOIB_MAX_PATH_REC_QUEUE  = 3,
+	IPOIB_MAX_MCAST_QUEUE     = 3,
+
+	IPOIB_FLAG_OPER_UP 	  = 0,
+	IPOIB_FLAG_ADMIN_UP 	  = 1,
+	IPOIB_PKEY_ASSIGNED 	  = 2,
+	IPOIB_PKEY_STOP 	  = 3,
+	IPOIB_FLAG_SUBINTERFACE   = 4,
+	IPOIB_MCAST_RUN 	  = 5,
+	IPOIB_STOP_REAPER         = 6,
+
+	IPOIB_MAX_BACKOFF_SECONDS = 16,
+
+	IPOIB_MCAST_FLAG_FOUND 	  = 0,	/* used in set_multicast_list */
+	IPOIB_MCAST_FLAG_SENDONLY = 1,
+	IPOIB_MCAST_FLAG_BUSY 	  = 2,	/* joining or already joined */
+	IPOIB_MCAST_FLAG_ATTACHED = 3,
+};
+
+/* structs */
+
+struct ipoib_header {
+	u16 proto;
+	u16 reserved;
+};
+
+struct ipoib_pseudoheader {
+	u8  hwaddr[INFINIBAND_ALEN];
+};
+
+struct ipoib_mcast;
+
+struct ipoib_buf {
+	struct sk_buff *skb;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+/*
+ * Device private locking: tx_lock protects members used in TX fast
+ * path (and we use LLTX so upper layers don't do extra locking).
+ * lock protects everything else.  lock nests inside of tx_lock (ie
+ * tx_lock must be acquired first if needed).
+ */
+struct ipoib_dev_priv {
+	spinlock_t lock;
+
+	struct net_device *dev;
+
+	unsigned long flags;
+
+	struct semaphore mcast_mutex;
+	struct semaphore vlan_mutex;
+
+	struct rb_root  path_tree;
+	struct list_head path_list;
+
+	struct ipoib_mcast *broadcast;
+	struct list_head multicast_list;
+	struct rb_root multicast_tree;
+
+	struct work_struct pkey_task;
+	struct work_struct mcast_task;
+	struct work_struct flush_task;
+	struct work_struct restart_task;
+	struct work_struct ah_reap_task;
+
+	struct ib_device *ca;
+	u8            	  port;
+	u16           	  pkey;
+	struct ib_pd  	 *pd;
+	struct ib_mr  	 *mr;
+	struct ib_cq  	 *cq;
+	struct ib_qp  	 *qp;
+	u32           	  qkey;
+
+	union ib_gid local_gid;
+	u16          local_lid;
+	u8           local_rate;
+
+	unsigned int admin_mtu;
+	unsigned int mcast_mtu;
+
+	struct ipoib_buf *rx_ring;
+
+	spinlock_t        tx_lock;
+	struct ipoib_buf *tx_ring;
+	unsigned          tx_head;
+	unsigned          tx_tail;
+	struct ib_sge     tx_sge;
+	struct ib_send_wr tx_wr;
+
+	struct ib_wc ibwc[IPOIB_NUM_WC];
+
+	struct list_head dead_ahs;
+
+	struct ib_event_handler event_handler;
+
+	struct net_device_stats stats;
+
+	struct net_device *parent;
+	struct list_head child_intfs;
+	struct list_head list;
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+	struct list_head fs_list;
+	struct dentry *mcg_dentry;
+#endif
+};
+
+struct ipoib_ah {
+	struct net_device *dev;
+	struct ib_ah      *ah;
+	struct list_head   list;
+	struct kref        ref;
+	unsigned           last_send;
+};
+
+struct ipoib_path {
+	struct net_device    *dev;
+	struct ib_sa_path_rec pathrec;
+	struct ipoib_ah      *ah;
+	struct sk_buff_head   queue;
+
+	struct list_head      neigh_list;
+
+	int                   query_id;
+	struct ib_sa_query   *query;
+	struct completion     done;
+
+	struct rb_node        rb_node;
+	struct list_head      list;
+};
+
+struct ipoib_neigh {
+	struct ipoib_ah    *ah;
+	struct sk_buff_head queue;
+
+	struct neighbour   *neighbour;
+
+	struct list_head    list;
+};
+
+static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh)
+{
+	return (struct ipoib_neigh **) (neigh->ha + 24 -
+					(offsetof(struct neighbour, ha) & 4));
+}
+
+extern struct workqueue_struct *ipoib_workqueue;
+
+/* functions */
+
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+
+struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
+				 struct ib_pd *pd, struct ib_ah_attr *attr);
+void ipoib_free_ah(struct kref *kref);
+static inline void ipoib_put_ah(struct ipoib_ah *ah)
+{
+	kref_put(&ah->ref, ipoib_free_ah);
+}
+
+int ipoib_add_pkey_attr(struct net_device *dev);
+
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+		struct ipoib_ah *address, u32 qpn);
+void ipoib_reap_ah(void *dev_ptr);
+
+void ipoib_flush_paths(struct net_device *dev);
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
+
+int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
+void ipoib_ib_dev_flush(void *dev);
+void ipoib_ib_dev_cleanup(struct net_device *dev);
+
+int ipoib_ib_dev_open(struct net_device *dev);
+int ipoib_ib_dev_up(struct net_device *dev);
+int ipoib_ib_dev_down(struct net_device *dev);
+int ipoib_ib_dev_stop(struct net_device *dev);
+
+int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
+void ipoib_dev_cleanup(struct net_device *dev);
+
+void ipoib_mcast_join_task(void *dev_ptr);
+void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid,
+		      struct sk_buff *skb);
+
+void ipoib_mcast_restart_task(void *dev_ptr);
+int ipoib_mcast_start_thread(struct net_device *dev);
+int ipoib_mcast_stop_thread(struct net_device *dev);
+
+void ipoib_mcast_dev_down(struct net_device *dev);
+void ipoib_mcast_dev_flush(struct net_device *dev);
+
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
+void ipoib_mcast_iter_free(struct ipoib_mcast_iter *iter);
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+				  union ib_gid *gid,
+				  unsigned long *created,
+				  unsigned int *queuelen,
+				  unsigned int *complete,
+				  unsigned int *send_only);
+
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
+		       union ib_gid *mgid);
+int ipoib_mcast_detach(struct net_device *dev, u16 mlid,
+		       union ib_gid *mgid);
+
+int ipoib_qp_create(struct net_device *dev);
+int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
+void ipoib_transport_dev_cleanup(struct net_device *dev);
+
+void ipoib_event(struct ib_event_handler *handler,
+		 struct ib_event *record);
+
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey);
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
+
+void ipoib_pkey_poll(void *dev);
+int ipoib_pkey_dev_delay_open(struct net_device *dev);
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+int ipoib_create_debug_file(struct net_device *dev);
+void ipoib_delete_debug_file(struct net_device *dev);
+int ipoib_register_debugfs(void);
+void ipoib_unregister_debugfs(void);
+#else
+static inline int ipoib_create_debug_file(struct net_device *dev) { return 0; }
+static inline void ipoib_delete_debug_file(struct net_device *dev) { }
+static inline int ipoib_register_debugfs(void) { return 0; }
+static inline void ipoib_unregister_debugfs(void) { }
+#endif
+
+
+#define ipoib_printk(level, priv, format, arg...)	\
+	printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg)
+#define ipoib_warn(priv, format, arg...)		\
+	ipoib_printk(KERN_WARNING, priv, format , ## arg)
+
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+extern int ipoib_debug_level;
+
+#define ipoib_dbg(priv, format, arg...)			\
+	do {					        \
+		if (ipoib_debug_level > 0)			\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#define ipoib_dbg_mcast(priv, format, arg...)		\
+	do {					        \
+		if (mcast_debug_level > 0)		\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+#define ipoib_dbg(priv, format, arg...)			\
+	do { (void) (priv); } while (0)
+#define ipoib_dbg_mcast(priv, format, arg...)		\
+	do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+#define ipoib_dbg_data(priv, format, arg...)		\
+	do {					        \
+		if (data_debug_level > 0)		\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+#define ipoib_dbg_data(priv, format, arg...)		\
+	do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+
+
+#define IPOIB_GID_FMT		"%x:%x:%x:%x:%x:%x:%x:%x"
+
+#define IPOIB_GID_ARG(gid)	be16_to_cpup((__be16 *) ((gid).raw +  0)), \
+				be16_to_cpup((__be16 *) ((gid).raw +  2)), \
+				be16_to_cpup((__be16 *) ((gid).raw +  4)), \
+				be16_to_cpup((__be16 *) ((gid).raw +  6)), \
+				be16_to_cpup((__be16 *) ((gid).raw +  8)), \
+				be16_to_cpup((__be16 *) ((gid).raw + 10)), \
+				be16_to_cpup((__be16 *) ((gid).raw + 12)), \
+				be16_to_cpup((__be16 *) ((gid).raw + 14))
+
+#endif /* _IPOIB_H */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
new file mode 100644
index 000000000000..044f2c78ef15
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_fs.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+
+#include "ipoib.h"
+
+enum {
+	IPOIB_MAGIC = 0x49504942 /* "IPIB" */
+};
+
+static DECLARE_MUTEX(ipoib_fs_mutex);
+static struct dentry *ipoib_root;
+static struct super_block *ipoib_sb;
+static LIST_HEAD(ipoib_device_list);
+
+static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct ipoib_mcast_iter *iter;
+	loff_t n = *pos;
+
+	iter = ipoib_mcast_iter_init(file->private);
+	if (!iter)
+		return NULL;
+
+	while (n--) {
+		if (ipoib_mcast_iter_next(iter)) {
+			ipoib_mcast_iter_free(iter);
+			return NULL;
+		}
+	}
+
+	return iter;
+}
+
+static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr,
+				   loff_t *pos)
+{
+	struct ipoib_mcast_iter *iter = iter_ptr;
+
+	(*pos)++;
+
+	if (ipoib_mcast_iter_next(iter)) {
+		ipoib_mcast_iter_free(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+
+static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct ipoib_mcast_iter *iter = iter_ptr;
+	char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"];
+	union ib_gid mgid;
+	int i, n;
+	unsigned long created;
+	unsigned int queuelen, complete, send_only;
+
+	if (iter) {
+		ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen,
+				      &complete, &send_only);
+
+		for (n = 0, i = 0; i < sizeof mgid / 2; ++i) {
+			n += sprintf(gid_buf + n, "%x",
+				     be16_to_cpu(((u16 *)mgid.raw)[i]));
+			if (i < sizeof mgid / 2 - 1)
+				gid_buf[n++] = ':';
+		}
+	}
+
+	seq_printf(file, "GID: %*s", -(1 + (int) sizeof gid_buf), gid_buf);
+
+	seq_printf(file,
+		   " created: %10ld queuelen: %4d complete: %d send_only: %d\n",
+		   created, queuelen, complete, send_only);
+
+	return 0;
+}
+
+static struct seq_operations ipoib_seq_ops = {
+	.start = ipoib_mcg_seq_start,
+	.next  = ipoib_mcg_seq_next,
+	.stop  = ipoib_mcg_seq_stop,
+	.show  = ipoib_mcg_seq_show,
+};
+
+static int ipoib_mcg_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &ipoib_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->u.generic_ip;
+
+	return 0;
+}
+
+static struct file_operations ipoib_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ipoib_mcg_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static struct inode *ipoib_get_inode(void)
+{
+	struct inode *inode = new_inode(ipoib_sb);
+
+	if (inode) {
+		inode->i_mode 	 = S_IFREG | S_IRUGO;
+		inode->i_uid 	 = 0;
+		inode->i_gid 	 = 0;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks  = 0;
+		inode->i_atime 	 = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_fop     = &ipoib_fops;
+	}
+
+	return inode;
+}
+
+static int __ipoib_create_debug_file(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct dentry *dentry;
+	struct inode *inode;
+	char name[IFNAMSIZ + sizeof "_mcg"];
+
+	snprintf(name, sizeof name, "%s_mcg", dev->name);
+
+	dentry = d_alloc_name(ipoib_root, name);
+	if (!dentry)
+		return -ENOMEM;
+
+	inode = ipoib_get_inode();
+	if (!inode) {
+		dput(dentry);
+		return -ENOMEM;
+	}
+
+	inode->u.generic_ip = dev;
+	priv->mcg_dentry = dentry;
+
+	d_add(dentry, inode);
+
+	return 0;
+}
+
+int ipoib_create_debug_file(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	down(&ipoib_fs_mutex);
+
+	list_add_tail(&priv->fs_list, &ipoib_device_list);
+
+	if (!ipoib_sb) {
+		up(&ipoib_fs_mutex);
+		return 0;
+	}
+
+	up(&ipoib_fs_mutex);
+
+	return __ipoib_create_debug_file(dev);
+}
+
+void ipoib_delete_debug_file(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	down(&ipoib_fs_mutex);
+	list_del(&priv->fs_list);
+	if (!ipoib_sb) {
+		up(&ipoib_fs_mutex);
+		return;
+	}
+	up(&ipoib_fs_mutex);
+
+	if (priv->mcg_dentry) {
+		d_drop(priv->mcg_dentry);
+		simple_unlink(ipoib_root->d_inode, priv->mcg_dentry);
+	}
+}
+
+static int ipoib_fill_super(struct super_block *sb, void *data, int silent)
+{
+	static struct tree_descr ipoib_files[] = {
+		{ "" }
+	};
+	struct ipoib_dev_priv *priv;
+	int ret;
+
+	ret = simple_fill_super(sb, IPOIB_MAGIC, ipoib_files);
+	if (ret)
+		return ret;
+
+	ipoib_root = sb->s_root;
+
+	down(&ipoib_fs_mutex);
+
+	ipoib_sb = sb;
+
+	list_for_each_entry(priv, &ipoib_device_list, fs_list) {
+		ret = __ipoib_create_debug_file(priv->dev);
+		if (ret)
+			break;
+	}
+
+	up(&ipoib_fs_mutex);
+
+	return ret;
+}
+
+static struct super_block *ipoib_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_single(fs_type, flags, data, ipoib_fill_super);
+}
+
+static void ipoib_kill_sb(struct super_block *sb)
+{
+	down(&ipoib_fs_mutex);
+	ipoib_sb = NULL;
+	up(&ipoib_fs_mutex);
+
+	kill_litter_super(sb);
+}
+
+static struct file_system_type ipoib_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ipoib_debugfs",
+	.get_sb		= ipoib_get_sb,
+	.kill_sb	= ipoib_kill_sb,
+};
+
+int ipoib_register_debugfs(void)
+{
+	return register_filesystem(&ipoib_fs_type);
+}
+
+void ipoib_unregister_debugfs(void)
+{
+	unregister_filesystem(&ipoib_fs_type);
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
new file mode 100644
index 000000000000..c5a1d45e0ac5
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_ib.c 1386 2004-12-27 16:23:17Z roland $
+ */
+
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+
+#include <ib_cache.h>
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+static int data_debug_level;
+
+module_param(data_debug_level, int, 0644);
+MODULE_PARM_DESC(data_debug_level,
+		 "Enable data path debug tracing if > 0");
+#endif
+
+#define	IPOIB_OP_RECV	(1ul << 31)
+
+static DECLARE_MUTEX(pkey_sem);
+
+struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
+				 struct ib_pd *pd, struct ib_ah_attr *attr)
+{
+	struct ipoib_ah *ah;
+
+	ah = kmalloc(sizeof *ah, GFP_KERNEL);
+	if (!ah)
+		return NULL;
+
+	ah->dev       = dev;
+	ah->last_send = 0;
+	kref_init(&ah->ref);
+
+	ah->ah = ib_create_ah(pd, attr);
+	if (IS_ERR(ah->ah)) {
+		kfree(ah);
+		ah = NULL;
+	} else
+		ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah);
+
+	return ah;
+}
+
+void ipoib_free_ah(struct kref *kref)
+{
+	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
+	struct ipoib_dev_priv *priv = netdev_priv(ah->dev);
+
+	unsigned long flags;
+
+	if (ah->last_send <= priv->tx_tail) {
+		ipoib_dbg(priv, "Freeing ah %p\n", ah->ah);
+		ib_destroy_ah(ah->ah);
+		kfree(ah);
+	} else {
+		spin_lock_irqsave(&priv->lock, flags);
+		list_add_tail(&ah->list, &priv->dead_ahs);
+		spin_unlock_irqrestore(&priv->lock, flags);
+	}
+}
+
+static inline int ipoib_ib_receive(struct ipoib_dev_priv *priv,
+				   unsigned int wr_id,
+				   dma_addr_t addr)
+{
+	struct ib_sge list = {
+		.addr    = addr,
+		.length  = IPOIB_BUF_SIZE,
+		.lkey    = priv->mr->lkey,
+	};
+	struct ib_recv_wr param = {
+		.wr_id 	    = wr_id | IPOIB_OP_RECV,
+		.sg_list    = &list,
+		.num_sge    = 1,
+	};
+	struct ib_recv_wr *bad_wr;
+
+	return ib_post_recv(priv->qp, &param, &bad_wr);
+}
+
+static int ipoib_ib_post_receive(struct net_device *dev, int id)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct sk_buff *skb;
+	dma_addr_t addr;
+	int ret;
+
+	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
+	if (!skb) {
+		ipoib_warn(priv, "failed to allocate receive buffer\n");
+
+		priv->rx_ring[id].skb = NULL;
+		return -ENOMEM;
+	}
+	skb_reserve(skb, 4);	/* 16 byte align IP header */
+	priv->rx_ring[id].skb = skb;
+	addr = dma_map_single(priv->ca->dma_device,
+			      skb->data, IPOIB_BUF_SIZE,
+			      DMA_FROM_DEVICE);
+	pci_unmap_addr_set(&priv->rx_ring[id], mapping, addr);
+
+	ret = ipoib_ib_receive(priv, id, addr);
+	if (ret) {
+		ipoib_warn(priv, "ipoib_ib_receive failed for buf %d (%d)\n",
+			   id, ret);
+		dma_unmap_single(priv->ca->dma_device, addr,
+				 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+		dev_kfree_skb_any(skb);
+		priv->rx_ring[id].skb = NULL;
+	}
+
+	return ret;
+}
+
+static int ipoib_ib_post_receives(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) {
+		if (ipoib_ib_post_receive(dev, i)) {
+			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+static void ipoib_ib_handle_wc(struct net_device *dev,
+			       struct ib_wc *wc)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned int wr_id = wc->wr_id;
+
+	ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
+		       wr_id, wc->opcode, wc->status);
+
+	if (wr_id & IPOIB_OP_RECV) {
+		wr_id &= ~IPOIB_OP_RECV;
+
+		if (wr_id < IPOIB_RX_RING_SIZE) {
+			struct sk_buff *skb = priv->rx_ring[wr_id].skb;
+
+			priv->rx_ring[wr_id].skb = NULL;
+
+			dma_unmap_single(priv->ca->dma_device,
+					 pci_unmap_addr(&priv->rx_ring[wr_id],
+							mapping),
+					 IPOIB_BUF_SIZE,
+					 DMA_FROM_DEVICE);
+
+			if (wc->status != IB_WC_SUCCESS) {
+				if (wc->status != IB_WC_WR_FLUSH_ERR)
+					ipoib_warn(priv, "failed recv event "
+						   "(status=%d, wrid=%d vend_err %x)\n",
+						   wc->status, wr_id, wc->vendor_err);
+				dev_kfree_skb_any(skb);
+				return;
+			}
+
+			ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+				       wc->byte_len, wc->slid);
+
+			skb_put(skb, wc->byte_len);
+			skb_pull(skb, IB_GRH_BYTES);
+
+			if (wc->slid != priv->local_lid ||
+			    wc->src_qp != priv->qp->qp_num) {
+				skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+
+				skb_pull(skb, IPOIB_ENCAP_LEN);
+
+				dev->last_rx = jiffies;
+				++priv->stats.rx_packets;
+				priv->stats.rx_bytes += skb->len;
+
+				skb->dev = dev;
+				/* XXX get correct PACKET_ type here */
+				skb->pkt_type = PACKET_HOST;
+				netif_rx_ni(skb);
+			} else {
+				ipoib_dbg_data(priv, "dropping loopback packet\n");
+				dev_kfree_skb_any(skb);
+			}
+
+			/* repost receive */
+			if (ipoib_ib_post_receive(dev, wr_id))
+				ipoib_warn(priv, "ipoib_ib_post_receive failed "
+					   "for buf %d\n", wr_id);
+		} else
+			ipoib_warn(priv, "completion event with wrid %d\n",
+				   wr_id);
+
+	} else {
+		struct ipoib_buf *tx_req;
+		unsigned long flags;
+
+		if (wr_id >= IPOIB_TX_RING_SIZE) {
+			ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
+				   wr_id, IPOIB_TX_RING_SIZE);
+			return;
+		}
+
+		ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
+
+		tx_req = &priv->tx_ring[wr_id];
+
+		dma_unmap_single(priv->ca->dma_device,
+				 pci_unmap_addr(tx_req, mapping),
+				 tx_req->skb->len,
+				 DMA_TO_DEVICE);
+
+		++priv->stats.tx_packets;
+		priv->stats.tx_bytes += tx_req->skb->len;
+
+		dev_kfree_skb_any(tx_req->skb);
+
+		spin_lock_irqsave(&priv->tx_lock, flags);
+		++priv->tx_tail;
+		if (netif_queue_stopped(dev) &&
+		    priv->tx_head - priv->tx_tail <= IPOIB_TX_RING_SIZE / 2)
+			netif_wake_queue(dev);
+		spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+		if (wc->status != IB_WC_SUCCESS &&
+		    wc->status != IB_WC_WR_FLUSH_ERR)
+			ipoib_warn(priv, "failed send event "
+				   "(status=%d, wrid=%d vend_err %x)\n",
+				   wc->status, wr_id, wc->vendor_err);
+	}
+}
+
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+{
+	struct net_device *dev = (struct net_device *) dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int n, i;
+
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	do {
+		n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc);
+		for (i = 0; i < n; ++i)
+			ipoib_ib_handle_wc(dev, priv->ibwc + i);
+	} while (n == IPOIB_NUM_WC);
+}
+
+static inline int post_send(struct ipoib_dev_priv *priv,
+			    unsigned int wr_id,
+			    struct ib_ah *address, u32 qpn,
+			    dma_addr_t addr, int len)
+{
+	struct ib_send_wr *bad_wr;
+
+	priv->tx_sge.addr             = addr;
+	priv->tx_sge.length           = len;
+
+	priv->tx_wr.wr_id 	      = wr_id;
+	priv->tx_wr.wr.ud.remote_qpn  = qpn;
+	priv->tx_wr.wr.ud.ah 	      = address;
+
+	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+}
+
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+		struct ipoib_ah *address, u32 qpn)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_buf *tx_req;
+	dma_addr_t addr;
+
+	if (skb->len > dev->mtu + INFINIBAND_ALEN) {
+		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+			   skb->len, dev->mtu + INFINIBAND_ALEN);
+		++priv->stats.tx_dropped;
+		++priv->stats.tx_errors;
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
+		       skb->len, address, qpn);
+
+	/*
+	 * We put the skb into the tx_ring _before_ we call post_send()
+	 * because it's entirely possible that the completion handler will
+	 * run before we execute anything after the post_send().  That
+	 * means we have to make sure everything is properly recorded and
+	 * our state is consistent before we call post_send().
+	 */
+	tx_req = &priv->tx_ring[priv->tx_head & (IPOIB_TX_RING_SIZE - 1)];
+	tx_req->skb = skb;
+	addr = dma_map_single(priv->ca->dma_device, skb->data, skb->len,
+			      DMA_TO_DEVICE);
+	pci_unmap_addr_set(tx_req, mapping, addr);
+
+	if (unlikely(post_send(priv, priv->tx_head & (IPOIB_TX_RING_SIZE - 1),
+			       address->ah, qpn, addr, skb->len))) {
+		ipoib_warn(priv, "post_send failed\n");
+		++priv->stats.tx_errors;
+		dma_unmap_single(priv->ca->dma_device, addr, skb->len,
+				 DMA_TO_DEVICE);
+		dev_kfree_skb_any(skb);
+	} else {
+		dev->trans_start = jiffies;
+
+		address->last_send = priv->tx_head;
+		++priv->tx_head;
+
+		if (priv->tx_head - priv->tx_tail == IPOIB_TX_RING_SIZE) {
+			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+			netif_stop_queue(dev);
+		}
+	}
+}
+
+static void __ipoib_reap_ah(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_ah *ah, *tah;
+	LIST_HEAD(remove_list);
+
+	spin_lock_irq(&priv->lock);
+	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
+		if (ah->last_send <= priv->tx_tail) {
+			list_del(&ah->list);
+			list_add_tail(&ah->list, &remove_list);
+		}
+	spin_unlock_irq(&priv->lock);
+
+	list_for_each_entry_safe(ah, tah, &remove_list, list) {
+		ipoib_dbg(priv, "Reaping ah %p\n", ah->ah);
+		ib_destroy_ah(ah->ah);
+		kfree(ah);
+	}
+}
+
+void ipoib_reap_ah(void *dev_ptr)
+{
+	struct net_device *dev = dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	__ipoib_reap_ah(dev);
+
+	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
+}
+
+int ipoib_ib_dev_open(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+
+	ret = ipoib_qp_create(dev);
+	if (ret) {
+		ipoib_warn(priv, "ipoib_qp_create returned %d\n", ret);
+		return -1;
+	}
+
+	ret = ipoib_ib_post_receives(dev);
+	if (ret) {
+		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
+		return -1;
+	}
+
+	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
+	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
+
+	return 0;
+}
+
+int ipoib_ib_dev_up(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+
+	return ipoib_mcast_start_thread(dev);
+}
+
+int ipoib_ib_dev_down(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "downing ib_dev\n");
+
+	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+	netif_carrier_off(dev);
+
+	/* Shutdown the P_Key thread if still active */
+	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+		down(&pkey_sem);
+		set_bit(IPOIB_PKEY_STOP, &priv->flags);
+		cancel_delayed_work(&priv->pkey_task);
+		up(&pkey_sem);
+		flush_workqueue(ipoib_workqueue);
+	}
+
+	ipoib_mcast_stop_thread(dev);
+
+	/*
+	 * Flush the multicast groups first so we stop any multicast joins. The
+	 * completion thread may have already died and we may deadlock waiting
+	 * for the completion thread to finish some multicast joins.
+	 */
+	ipoib_mcast_dev_flush(dev);
+
+	/* Delete broadcast and local addresses since they will be recreated */
+	ipoib_mcast_dev_down(dev);
+
+	ipoib_flush_paths(dev);
+
+	return 0;
+}
+
+static int recvs_pending(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int pending = 0;
+	int i;
+
+	for (i = 0; i < IPOIB_RX_RING_SIZE; ++i)
+		if (priv->rx_ring[i].skb)
+			++pending;
+
+	return pending;
+}
+
+int ipoib_ib_dev_stop(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_attr qp_attr;
+	int attr_mask;
+	unsigned long begin;
+	struct ipoib_buf *tx_req;
+	int i;
+
+	/* Kill the existing QP and allocate a new one */
+	qp_attr.qp_state = IB_QPS_ERR;
+	attr_mask        = IB_QP_STATE;
+	if (ib_modify_qp(priv->qp, &qp_attr, attr_mask))
+		ipoib_warn(priv, "Failed to modify QP to ERROR state\n");
+
+	/* Wait for all sends and receives to complete */
+	begin = jiffies;
+
+	while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
+		if (time_after(jiffies, begin + 5 * HZ)) {
+			ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
+				   priv->tx_head - priv->tx_tail, recvs_pending(dev));
+
+			/*
+			 * assume the HW is wedged and just free up
+			 * all our pending work requests.
+			 */
+			while (priv->tx_tail < priv->tx_head) {
+				tx_req = &priv->tx_ring[priv->tx_tail &
+							(IPOIB_TX_RING_SIZE - 1)];
+				dma_unmap_single(priv->ca->dma_device,
+						 pci_unmap_addr(tx_req, mapping),
+						 tx_req->skb->len,
+						 DMA_TO_DEVICE);
+				dev_kfree_skb_any(tx_req->skb);
+				++priv->tx_tail;
+			}
+
+			for (i = 0; i < IPOIB_RX_RING_SIZE; ++i)
+				if (priv->rx_ring[i].skb) {
+					dma_unmap_single(priv->ca->dma_device,
+							 pci_unmap_addr(&priv->rx_ring[i],
+									mapping),
+							 IPOIB_BUF_SIZE,
+							 DMA_FROM_DEVICE);
+					dev_kfree_skb_any(priv->rx_ring[i].skb);
+					priv->rx_ring[i].skb = NULL;
+				}
+
+			goto timeout;
+		}
+
+		msleep(1);
+	}
+
+	ipoib_dbg(priv, "All sends and receives done.\n");
+
+timeout:
+	qp_attr.qp_state = IB_QPS_RESET;
+	attr_mask        = IB_QP_STATE;
+	if (ib_modify_qp(priv->qp, &qp_attr, attr_mask))
+		ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+
+	/* Wait for all AHs to be reaped */
+	set_bit(IPOIB_STOP_REAPER, &priv->flags);
+	cancel_delayed_work(&priv->ah_reap_task);
+	flush_workqueue(ipoib_workqueue);
+
+	begin = jiffies;
+
+	while (!list_empty(&priv->dead_ahs)) {
+		__ipoib_reap_ah(dev);
+
+		if (time_after(jiffies, begin + HZ)) {
+			ipoib_warn(priv, "timing out; will leak address handles\n");
+			break;
+		}
+
+		msleep(1);
+	}
+
+	return 0;
+}
+
+int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	priv->ca = ca;
+	priv->port = port;
+	priv->qp = NULL;
+
+	if (ipoib_transport_dev_init(dev, ca)) {
+		printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
+		return -ENODEV;
+	}
+
+	if (dev->flags & IFF_UP) {
+		if (ipoib_ib_dev_open(dev)) {
+			ipoib_transport_dev_cleanup(dev);
+			return -ENODEV;
+		}
+	}
+
+	return 0;
+}
+
+void ipoib_ib_dev_flush(void *_dev)
+{
+	struct net_device *dev = (struct net_device *)_dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv;
+
+	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		return;
+
+	ipoib_dbg(priv, "flushing\n");
+
+	ipoib_ib_dev_down(dev);
+
+	/*
+	 * The device could have been brought down between the start and when
+	 * we get here, don't bring it back up if it's not configured up
+	 */
+	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		ipoib_ib_dev_up(dev);
+
+	/* Flush any child interfaces too */
+	list_for_each_entry(cpriv, &priv->child_intfs, list)
+		ipoib_ib_dev_flush(&cpriv->dev);
+}
+
+void ipoib_ib_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "cleaning up ib_dev\n");
+
+	ipoib_mcast_stop_thread(dev);
+
+	/* Delete the broadcast address and the local address */
+	ipoib_mcast_dev_down(dev);
+
+	ipoib_transport_dev_cleanup(dev);
+}
+
+/*
+ * Delayed P_Key Assigment Interim Support
+ *
+ * The following is initial implementation of delayed P_Key assigment
+ * mechanism. It is using the same approach implemented for the multicast
+ * group join. The single goal of this implementation is to quickly address
+ * Bug #2507. This implementation will probably be removed when the P_Key
+ * change async notification is available.
+ */
+int ipoib_open(struct net_device *dev);
+
+static void ipoib_pkey_dev_check_presence(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	u16 pkey_index = 0;
+
+	if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	else
+		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+}
+
+void ipoib_pkey_poll(void *dev_ptr)
+{
+	struct net_device *dev = dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_pkey_dev_check_presence(dev);
+
+	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+		ipoib_open(dev);
+	else {
+		down(&pkey_sem);
+		if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
+			queue_delayed_work(ipoib_workqueue,
+					   &priv->pkey_task,
+					   HZ);
+		up(&pkey_sem);
+	}
+}
+
+int ipoib_pkey_dev_delay_open(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	/* Look for the interface pkey value in the IB Port P_Key table and */
+	/* set the interface pkey assigment flag                            */
+	ipoib_pkey_dev_check_presence(dev);
+
+	/* P_Key value not assigned yet - start polling */
+	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+		down(&pkey_sem);
+		clear_bit(IPOIB_PKEY_STOP, &priv->flags);
+		queue_delayed_work(ipoib_workqueue,
+				   &priv->pkey_task,
+				   HZ);
+		up(&pkey_sem);
+		return 1;
+	}
+
+	return 0;
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
new file mode 100644
index 000000000000..5a3b5c6a4494
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -0,0 +1,1103 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $
+ */
+
+#include "ipoib.h"
+
+#include <linux/version.h>
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <linux/if_arp.h>	/* For ARPHRD_xxx */
+
+#include <linux/ip.h>
+#include <linux/in.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+int ipoib_debug_level;
+
+module_param_named(debug_level, ipoib_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+#endif
+
+static const u8 ipv4_bcast_addr[] = {
+	0x00, 0xff, 0xff, 0xff,
+	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
+};
+
+struct workqueue_struct *ipoib_workqueue;
+
+static void ipoib_add_one(struct ib_device *device);
+static void ipoib_remove_one(struct ib_device *device);
+
+static struct ib_client ipoib_client = {
+	.name   = "ipoib",
+	.add    = ipoib_add_one,
+	.remove = ipoib_remove_one
+};
+
+int ipoib_open(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "bringing up interface\n");
+
+	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+	if (ipoib_pkey_dev_delay_open(dev))
+		return 0;
+
+	if (ipoib_ib_dev_open(dev))
+		return -EINVAL;
+
+	if (ipoib_ib_dev_up(dev))
+		return -EINVAL;
+
+	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+		struct ipoib_dev_priv *cpriv;
+
+		/* Bring up any child interfaces too */
+		down(&priv->vlan_mutex);
+		list_for_each_entry(cpriv, &priv->child_intfs, list) {
+			int flags;
+
+			flags = cpriv->dev->flags;
+			if (flags & IFF_UP)
+				continue;
+
+			dev_change_flags(cpriv->dev, flags | IFF_UP);
+		}
+		up(&priv->vlan_mutex);
+	}
+
+	netif_start_queue(dev);
+
+	return 0;
+}
+
+static int ipoib_stop(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "stopping interface\n");
+
+	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+	netif_stop_queue(dev);
+
+	ipoib_ib_dev_down(dev);
+	ipoib_ib_dev_stop(dev);
+
+	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+		struct ipoib_dev_priv *cpriv;
+
+		/* Bring down any child interfaces too */
+		down(&priv->vlan_mutex);
+		list_for_each_entry(cpriv, &priv->child_intfs, list) {
+			int flags;
+
+			flags = cpriv->dev->flags;
+			if (!(flags & IFF_UP))
+				continue;
+
+			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
+		}
+		up(&priv->vlan_mutex);
+	}
+
+	return 0;
+}
+
+static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+		return -EINVAL;
+
+	priv->admin_mtu = new_mtu;
+
+	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+
+	return 0;
+}
+
+static struct ipoib_path *__path_find(struct net_device *dev,
+				      union ib_gid *gid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node *n = priv->path_tree.rb_node;
+	struct ipoib_path *path;
+	int ret;
+
+	while (n) {
+		path = rb_entry(n, struct ipoib_path, rb_node);
+
+		ret = memcmp(gid->raw, path->pathrec.dgid.raw,
+			     sizeof (union ib_gid));
+
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			return path;
+	}
+
+	return NULL;
+}
+
+static int __path_add(struct net_device *dev, struct ipoib_path *path)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node **n = &priv->path_tree.rb_node;
+	struct rb_node *pn = NULL;
+	struct ipoib_path *tpath;
+	int ret;
+
+	while (*n) {
+		pn = *n;
+		tpath = rb_entry(pn, struct ipoib_path, rb_node);
+
+		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
+			     sizeof (union ib_gid));
+		if (ret < 0)
+			n = &pn->rb_left;
+		else if (ret > 0)
+			n = &pn->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&path->rb_node, pn, n);
+	rb_insert_color(&path->rb_node, &priv->path_tree);
+
+	list_add_tail(&path->list, &priv->path_list);
+
+	return 0;
+}
+
+static void path_free(struct net_device *dev, struct ipoib_path *path)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh *neigh, *tn;
+	struct sk_buff *skb;
+	unsigned long flags;
+
+	while ((skb = __skb_dequeue(&path->queue)))
+		dev_kfree_skb_irq(skb);
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
+		/*
+		 * It's safe to call ipoib_put_ah() inside priv->lock
+		 * here, because we know that path->ah will always
+		 * hold one more reference, so ipoib_put_ah() will
+		 * never do more than decrement the ref count.
+		 */
+		if (neigh->ah)
+			ipoib_put_ah(neigh->ah);
+		*to_ipoib_neigh(neigh->neighbour) = NULL;
+		neigh->neighbour->ops->destructor = NULL;
+		kfree(neigh);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	if (path->ah)
+		ipoib_put_ah(path->ah);
+
+	kfree(path);
+}
+
+void ipoib_flush_paths(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path, *tp;
+	LIST_HEAD(remove_list);
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_splice(&priv->path_list, &remove_list);
+	INIT_LIST_HEAD(&priv->path_list);
+
+	list_for_each_entry(path, &remove_list, list)
+		rb_erase(&path->rb_node, &priv->path_tree);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	list_for_each_entry_safe(path, tp, &remove_list, list) {
+		if (path->query)
+			ib_sa_cancel_query(path->query_id, path->query);
+		wait_for_completion(&path->done);
+		path_free(dev, path);
+	}
+}
+
+static void path_rec_completion(int status,
+				struct ib_sa_path_rec *pathrec,
+				void *path_ptr)
+{
+	struct ipoib_path *path = path_ptr;
+	struct net_device *dev = path->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_ah *ah = NULL;
+	struct ipoib_neigh *neigh;
+	struct sk_buff_head skqueue;
+	struct sk_buff *skb;
+	unsigned long flags;
+
+	if (pathrec)
+		ipoib_dbg(priv, "PathRec LID 0x%04x for GID " IPOIB_GID_FMT "\n",
+			  be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid));
+	else
+		ipoib_dbg(priv, "PathRec status %d for GID " IPOIB_GID_FMT "\n",
+			  status, IPOIB_GID_ARG(path->pathrec.dgid));
+
+	skb_queue_head_init(&skqueue);
+
+	if (!status) {
+		struct ib_ah_attr av = {
+			.dlid 	       = be16_to_cpu(pathrec->dlid),
+			.sl 	       = pathrec->sl,
+			.port_num      = priv->port
+		};
+
+		if (ib_sa_rate_enum_to_int(pathrec->rate) > 0)
+			av.static_rate = (2 * priv->local_rate -
+					  ib_sa_rate_enum_to_int(pathrec->rate) - 1) /
+				(priv->local_rate ? priv->local_rate : 1);
+
+		ipoib_dbg(priv, "static_rate %d for local port %dX, path %dX\n",
+			  av.static_rate, priv->local_rate,
+			  ib_sa_rate_enum_to_int(pathrec->rate));
+
+		ah = ipoib_create_ah(dev, priv->pd, &av);
+	}
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	path->ah = ah;
+
+	if (ah) {
+		path->pathrec = *pathrec;
+
+		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
+			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
+
+		while ((skb = __skb_dequeue(&path->queue)))
+			__skb_queue_tail(&skqueue, skb);
+
+		list_for_each_entry(neigh, &path->neigh_list, list) {
+			kref_get(&path->ah->ref);
+			neigh->ah = path->ah;
+
+			while ((skb = __skb_dequeue(&neigh->queue)))
+				__skb_queue_tail(&skqueue, skb);
+		}
+	} else
+		path->query = NULL;
+
+	complete(&path->done);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	while ((skb = __skb_dequeue(&skqueue))) {
+		skb->dev = dev;
+		if (dev_queue_xmit(skb))
+			ipoib_warn(priv, "dev_queue_xmit failed "
+				   "to requeue packet\n");
+	}
+}
+
+static struct ipoib_path *path_rec_create(struct net_device *dev,
+					  union ib_gid *gid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path;
+
+	path = kmalloc(sizeof *path, GFP_ATOMIC);
+	if (!path)
+		return NULL;
+
+	path->dev          = dev;
+	path->pathrec.dlid = 0;
+	path->ah           = NULL;
+
+	skb_queue_head_init(&path->queue);
+
+	INIT_LIST_HEAD(&path->neigh_list);
+	path->query = NULL;
+	init_completion(&path->done);
+
+	memcpy(path->pathrec.dgid.raw, gid->raw, sizeof (union ib_gid));
+	path->pathrec.sgid      = priv->local_gid;
+	path->pathrec.pkey      = cpu_to_be16(priv->pkey);
+	path->pathrec.numb_path = 1;
+
+	return path;
+}
+
+static int path_rec_start(struct net_device *dev,
+			  struct ipoib_path *path)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "Start path record lookup for " IPOIB_GID_FMT "\n",
+		  IPOIB_GID_ARG(path->pathrec.dgid));
+
+	path->query_id =
+		ib_sa_path_rec_get(priv->ca, priv->port,
+				   &path->pathrec,
+				   IB_SA_PATH_REC_DGID		|
+				   IB_SA_PATH_REC_SGID		|
+				   IB_SA_PATH_REC_NUMB_PATH	|
+				   IB_SA_PATH_REC_PKEY,
+				   1000, GFP_ATOMIC,
+				   path_rec_completion,
+				   path, &path->query);
+	if (path->query_id < 0) {
+		ipoib_warn(priv, "ib_sa_path_rec_get failed\n");
+		path->query = NULL;
+		return path->query_id;
+	}
+
+	return 0;
+}
+
+static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path;
+	struct ipoib_neigh *neigh;
+
+	neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
+	if (!neigh) {
+		++priv->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	skb_queue_head_init(&neigh->queue);
+	neigh->neighbour = skb->dst->neighbour;
+	*to_ipoib_neigh(skb->dst->neighbour) = neigh;
+
+	/*
+	 * We can only be called from ipoib_start_xmit, so we're
+	 * inside tx_lock -- no need to save/restore flags.
+	 */
+	spin_lock(&priv->lock);
+
+	path = __path_find(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4));
+	if (!path) {
+		path = path_rec_create(dev,
+				       (union ib_gid *) (skb->dst->neighbour->ha + 4));
+		if (!path)
+			goto err;
+
+		__path_add(dev, path);
+	}
+
+	list_add_tail(&neigh->list, &path->neigh_list);
+
+	if (path->pathrec.dlid) {
+		kref_get(&path->ah->ref);
+		neigh->ah = path->ah;
+
+		ipoib_send(dev, skb, path->ah,
+			   be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
+	} else {
+		neigh->ah  = NULL;
+		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+			__skb_queue_tail(&neigh->queue, skb);
+		} else {
+			++priv->stats.tx_dropped;
+			dev_kfree_skb_any(skb);
+		}
+
+		if (!path->query && path_rec_start(dev, path))
+			goto err;
+	}
+
+	spin_unlock(&priv->lock);
+	return;
+
+err:
+	*to_ipoib_neigh(skb->dst->neighbour) = NULL;
+	list_del(&neigh->list);
+	neigh->neighbour->ops->destructor = NULL;
+	kfree(neigh);
+
+	++priv->stats.tx_dropped;
+	dev_kfree_skb_any(skb);
+
+	spin_unlock(&priv->lock);
+}
+
+static void path_lookup(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
+
+	/* Look up path record for unicasts */
+	if (skb->dst->neighbour->ha[4] != 0xff) {
+		neigh_add_path(skb, dev);
+		return;
+	}
+
+	/* Add in the P_Key for multicasts */
+	skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff;
+	skb->dst->neighbour->ha[9] = priv->pkey & 0xff;
+	ipoib_mcast_send(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4), skb);
+}
+
+static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
+			     struct ipoib_pseudoheader *phdr)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path;
+
+	/*
+	 * We can only be called from ipoib_start_xmit, so we're
+	 * inside tx_lock -- no need to save/restore flags.
+	 */
+	spin_lock(&priv->lock);
+
+	path = __path_find(dev, (union ib_gid *) (phdr->hwaddr + 4));
+	if (!path) {
+		path = path_rec_create(dev,
+				       (union ib_gid *) (phdr->hwaddr + 4));
+		if (path) {
+			/* put pseudoheader back on for next time */
+			skb_push(skb, sizeof *phdr);
+			__skb_queue_tail(&path->queue, skb);
+
+			if (path_rec_start(dev, path)) {
+				spin_unlock(&priv->lock);
+				path_free(dev, path);
+				return;
+			} else
+				__path_add(dev, path);
+		} else {
+			++priv->stats.tx_dropped;
+			dev_kfree_skb_any(skb);
+		}
+
+		spin_unlock(&priv->lock);
+		return;
+	}
+
+	if (path->pathrec.dlid) {
+		ipoib_dbg(priv, "Send unicast ARP to %04x\n",
+			  be16_to_cpu(path->pathrec.dlid));
+
+		ipoib_send(dev, skb, path->ah,
+			   be32_to_cpup((__be32 *) phdr->hwaddr));
+	} else if ((path->query || !path_rec_start(dev, path)) &&
+		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		/* put pseudoheader back on for next time */
+		skb_push(skb, sizeof *phdr);
+		__skb_queue_tail(&path->queue, skb);
+	} else {
+		++priv->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+	}
+
+	spin_unlock(&priv->lock);
+}
+
+static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh *neigh;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (!spin_trylock(&priv->tx_lock)) {
+		local_irq_restore(flags);
+		return NETDEV_TX_LOCKED;
+	}
+
+	/*
+	 * Check if our queue is stopped.  Since we have the LLTX bit
+	 * set, we can't rely on netif_stop_queue() preventing our
+	 * xmit function from being called with a full queue.
+	 */
+	if (unlikely(netif_queue_stopped(dev))) {
+		spin_unlock_irqrestore(&priv->tx_lock, flags);
+		return NETDEV_TX_BUSY;
+	}
+
+	if (skb->dst && skb->dst->neighbour) {
+		if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
+			path_lookup(skb, dev);
+			goto out;
+		}
+
+		neigh = *to_ipoib_neigh(skb->dst->neighbour);
+
+		if (likely(neigh->ah)) {
+			ipoib_send(dev, skb, neigh->ah,
+				   be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
+			goto out;
+		}
+
+		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+			spin_lock(&priv->lock);
+			__skb_queue_tail(&neigh->queue, skb);
+			spin_unlock(&priv->lock);
+		} else {
+			++priv->stats.tx_dropped;
+			dev_kfree_skb_any(skb);
+		}
+	} else {
+		struct ipoib_pseudoheader *phdr =
+			(struct ipoib_pseudoheader *) skb->data;
+		skb_pull(skb, sizeof *phdr);
+
+		if (phdr->hwaddr[4] == 0xff) {
+			/* Add in the P_Key for multicast*/
+			phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+			phdr->hwaddr[9] = priv->pkey & 0xff;
+
+			ipoib_mcast_send(dev, (union ib_gid *) (phdr->hwaddr + 4), skb);
+		} else {
+			/* unicast GID -- should be ARP reply */
+
+			if (be16_to_cpup((u16 *) skb->data) != ETH_P_ARP) {
+				ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x "
+					   IPOIB_GID_FMT "\n",
+					   skb->dst ? "neigh" : "dst",
+					   be16_to_cpup((u16 *) skb->data),
+					   be32_to_cpup((u32 *) phdr->hwaddr),
+					   IPOIB_GID_ARG(*(union ib_gid *) (phdr->hwaddr + 4)));
+				dev_kfree_skb_any(skb);
+				++priv->stats.tx_dropped;
+				goto out;
+			}
+
+			unicast_arp_send(skb, dev, phdr);
+		}
+	}
+
+out:
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+	return NETDEV_TX_OK;
+}
+
+static struct net_device_stats *ipoib_get_stats(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	return &priv->stats;
+}
+
+static void ipoib_timeout(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_warn(priv, "transmit timeout: latency %ld\n",
+		   jiffies - dev->trans_start);
+	/* XXX reset QP, etc. */
+}
+
+static int ipoib_hard_header(struct sk_buff *skb,
+			     struct net_device *dev,
+			     unsigned short type,
+			     void *daddr, void *saddr, unsigned len)
+{
+	struct ipoib_header *header;
+
+	header = (struct ipoib_header *) skb_push(skb, sizeof *header);
+
+	header->proto = htons(type);
+	header->reserved = 0;
+
+	/*
+	 * If we don't have a neighbour structure, stuff the
+	 * destination address onto the front of the skb so we can
+	 * figure out where to send the packet later.
+	 */
+	if (!skb->dst || !skb->dst->neighbour) {
+		struct ipoib_pseudoheader *phdr =
+			(struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
+		memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
+	}
+
+	return 0;
+}
+
+static void ipoib_set_mcast_list(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	schedule_work(&priv->restart_task);
+}
+
+static void ipoib_neigh_destructor(struct neighbour *n)
+{
+	struct ipoib_neigh *neigh;
+	struct ipoib_dev_priv *priv = netdev_priv(n->dev);
+	unsigned long flags;
+	struct ipoib_ah *ah = NULL;
+
+	ipoib_dbg(priv,
+		  "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
+		  be32_to_cpup((__be32 *) n->ha),
+		  IPOIB_GID_ARG(*((union ib_gid *) (n->ha + 4))));
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	neigh = *to_ipoib_neigh(n);
+	if (neigh) {
+		if (neigh->ah)
+			ah = neigh->ah;
+		list_del(&neigh->list);
+		*to_ipoib_neigh(n) = NULL;
+		kfree(neigh);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	if (ah)
+		ipoib_put_ah(ah);
+}
+
+static int ipoib_neigh_setup(struct neighbour *neigh)
+{
+	/*
+	 * Is this kosher?  I can't find anybody in the kernel that
+	 * sets neigh->destructor, so we should be able to set it here
+	 * without trouble.
+	 */
+	neigh->ops->destructor = ipoib_neigh_destructor;
+
+	return 0;
+}
+
+static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
+{
+	parms->neigh_setup = ipoib_neigh_setup;
+
+	return 0;
+}
+
+int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	/* Allocate RX/TX "rings" to hold queued skbs */
+
+	priv->rx_ring =	kmalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_buf),
+				GFP_KERNEL);
+	if (!priv->rx_ring) {
+		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
+		       ca->name, IPOIB_RX_RING_SIZE);
+		goto out;
+	}
+	memset(priv->rx_ring, 0,
+	       IPOIB_RX_RING_SIZE * sizeof (struct ipoib_buf));
+
+	priv->tx_ring = kmalloc(IPOIB_TX_RING_SIZE * sizeof (struct ipoib_buf),
+				GFP_KERNEL);
+	if (!priv->tx_ring) {
+		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
+		       ca->name, IPOIB_TX_RING_SIZE);
+		goto out_rx_ring_cleanup;
+	}
+	memset(priv->tx_ring, 0,
+	       IPOIB_TX_RING_SIZE * sizeof (struct ipoib_buf));
+
+	/* priv->tx_head & tx_tail are already 0 */
+
+	if (ipoib_ib_dev_init(dev, ca, port))
+		goto out_tx_ring_cleanup;
+
+	return 0;
+
+out_tx_ring_cleanup:
+	kfree(priv->tx_ring);
+
+out_rx_ring_cleanup:
+	kfree(priv->rx_ring);
+
+out:
+	return -ENOMEM;
+}
+
+void ipoib_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
+
+	ipoib_delete_debug_file(dev);
+
+	/* Delete any child interfaces first */
+	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+		unregister_netdev(cpriv->dev);
+		ipoib_dev_cleanup(cpriv->dev);
+		free_netdev(cpriv->dev);
+	}
+
+	ipoib_ib_dev_cleanup(dev);
+
+	if (priv->rx_ring) {
+		kfree(priv->rx_ring);
+		priv->rx_ring = NULL;
+	}
+
+	if (priv->tx_ring) {
+		kfree(priv->tx_ring);
+		priv->tx_ring = NULL;
+	}
+}
+
+static void ipoib_setup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	dev->open 		 = ipoib_open;
+	dev->stop 		 = ipoib_stop;
+	dev->change_mtu 	 = ipoib_change_mtu;
+	dev->hard_start_xmit 	 = ipoib_start_xmit;
+	dev->get_stats 		 = ipoib_get_stats;
+	dev->tx_timeout 	 = ipoib_timeout;
+	dev->hard_header 	 = ipoib_hard_header;
+	dev->set_multicast_list  = ipoib_set_mcast_list;
+	dev->neigh_setup         = ipoib_neigh_setup_dev;
+
+	dev->watchdog_timeo 	 = HZ;
+
+	dev->rebuild_header 	 = NULL;
+	dev->set_mac_address 	 = NULL;
+	dev->header_cache_update = NULL;
+
+	dev->flags              |= IFF_BROADCAST | IFF_MULTICAST;
+
+	/*
+	 * We add in INFINIBAND_ALEN to allow for the destination
+	 * address "pseudoheader" for skbs without neighbour struct.
+	 */
+	dev->hard_header_len 	 = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
+	dev->addr_len 		 = INFINIBAND_ALEN;
+	dev->type 		 = ARPHRD_INFINIBAND;
+	dev->tx_queue_len 	 = IPOIB_TX_RING_SIZE * 2;
+	dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
+
+	/* MTU will be reset when mcast join happens */
+	dev->mtu 		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
+	priv->mcast_mtu 	 = priv->admin_mtu = dev->mtu;
+
+	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
+
+	netif_carrier_off(dev);
+
+	SET_MODULE_OWNER(dev);
+
+	priv->dev = dev;
+
+	spin_lock_init(&priv->lock);
+	spin_lock_init(&priv->tx_lock);
+
+	init_MUTEX(&priv->mcast_mutex);
+	init_MUTEX(&priv->vlan_mutex);
+
+	INIT_LIST_HEAD(&priv->path_list);
+	INIT_LIST_HEAD(&priv->child_intfs);
+	INIT_LIST_HEAD(&priv->dead_ahs);
+	INIT_LIST_HEAD(&priv->multicast_list);
+
+	INIT_WORK(&priv->pkey_task,    ipoib_pkey_poll,          priv->dev);
+	INIT_WORK(&priv->mcast_task,   ipoib_mcast_join_task,    priv->dev);
+	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush,       priv->dev);
+	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task, priv->dev);
+	INIT_WORK(&priv->ah_reap_task, ipoib_reap_ah,            priv->dev);
+}
+
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
+{
+	struct net_device *dev;
+
+	dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
+			   ipoib_setup);
+	if (!dev)
+		return NULL;
+
+	return netdev_priv(dev);
+}
+
+static ssize_t show_pkey(struct class_device *cdev, char *buf)
+{
+	struct ipoib_dev_priv *priv =
+		netdev_priv(container_of(cdev, struct net_device, class_dev));
+
+	return sprintf(buf, "0x%04x\n", priv->pkey);
+}
+static CLASS_DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
+
+static ssize_t create_child(struct class_device *cdev,
+			    const char *buf, size_t count)
+{
+	int pkey;
+	int ret;
+
+	if (sscanf(buf, "%i", &pkey) != 1)
+		return -EINVAL;
+
+	if (pkey < 0 || pkey > 0xffff)
+		return -EINVAL;
+
+	ret = ipoib_vlan_add(container_of(cdev, struct net_device, class_dev),
+			     pkey);
+
+	return ret ? ret : count;
+}
+static CLASS_DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child);
+
+static ssize_t delete_child(struct class_device *cdev,
+			    const char *buf, size_t count)
+{
+	int pkey;
+	int ret;
+
+	if (sscanf(buf, "%i", &pkey) != 1)
+		return -EINVAL;
+
+	if (pkey < 0 || pkey > 0xffff)
+		return -EINVAL;
+
+	ret = ipoib_vlan_delete(container_of(cdev, struct net_device, class_dev),
+				pkey);
+
+	return ret ? ret : count;
+
+}
+static CLASS_DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child);
+
+int ipoib_add_pkey_attr(struct net_device *dev)
+{
+	return class_device_create_file(&dev->class_dev,
+					&class_device_attr_pkey);
+}
+
+static struct net_device *ipoib_add_port(const char *format,
+					 struct ib_device *hca, u8 port)
+{
+	struct ipoib_dev_priv *priv;
+	int result = -ENOMEM;
+
+	priv = ipoib_intf_alloc(format);
+	if (!priv)
+		goto alloc_mem_failed;
+
+	SET_NETDEV_DEV(priv->dev, hca->dma_device);
+
+	result = ib_query_pkey(hca, port, 0, &priv->pkey);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
+		       hca->name, port, result);
+		goto alloc_mem_failed;
+	}
+
+	priv->dev->broadcast[8] = priv->pkey >> 8;
+	priv->dev->broadcast[9] = priv->pkey & 0xff;
+
+	result = ib_query_gid(hca, port, 0, &priv->local_gid);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
+		       hca->name, port, result);
+		goto alloc_mem_failed;
+	} else
+		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+
+	result = ipoib_dev_init(priv->dev, hca, port);
+	if (result < 0) {
+		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
+		       hca->name, port, result);
+		goto device_init_failed;
+	}
+
+	INIT_IB_EVENT_HANDLER(&priv->event_handler,
+			      priv->ca, ipoib_event);
+	result = ib_register_event_handler(&priv->event_handler);
+	if (result < 0) {
+		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
+		       "port %d (ret = %d)\n",
+		       hca->name, port, result);
+		goto event_failed;
+	}
+
+	result = register_netdev(priv->dev);
+	if (result) {
+		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
+		       hca->name, port, result);
+		goto register_failed;
+	}
+
+	if (ipoib_create_debug_file(priv->dev))
+		goto debug_failed;
+
+	if (ipoib_add_pkey_attr(priv->dev))
+		goto sysfs_failed;
+	if (class_device_create_file(&priv->dev->class_dev,
+				     &class_device_attr_create_child))
+		goto sysfs_failed;
+	if (class_device_create_file(&priv->dev->class_dev,
+				     &class_device_attr_delete_child))
+		goto sysfs_failed;
+
+	return priv->dev;
+
+sysfs_failed:
+	ipoib_delete_debug_file(priv->dev);
+
+debug_failed:
+	unregister_netdev(priv->dev);
+
+register_failed:
+	ib_unregister_event_handler(&priv->event_handler);
+
+event_failed:
+	ipoib_dev_cleanup(priv->dev);
+
+device_init_failed:
+	free_netdev(priv->dev);
+
+alloc_mem_failed:
+	return ERR_PTR(result);
+}
+
+static void ipoib_add_one(struct ib_device *device)
+{
+	struct list_head *dev_list;
+	struct net_device *dev;
+	struct ipoib_dev_priv *priv;
+	int s, e, p;
+
+	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
+	if (!dev_list)
+		return;
+
+	INIT_LIST_HEAD(dev_list);
+
+	if (device->node_type == IB_NODE_SWITCH) {
+		s = 0;
+		e = 0;
+	} else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	for (p = s; p <= e; ++p) {
+		dev = ipoib_add_port("ib%d", device, p);
+		if (!IS_ERR(dev)) {
+			priv = netdev_priv(dev);
+			list_add_tail(&priv->list, dev_list);
+		}
+	}
+
+	ib_set_client_data(device, &ipoib_client, dev_list);
+}
+
+static void ipoib_remove_one(struct ib_device *device)
+{
+	struct ipoib_dev_priv *priv, *tmp;
+	struct list_head *dev_list;
+
+	dev_list = ib_get_client_data(device, &ipoib_client);
+
+	list_for_each_entry_safe(priv, tmp, dev_list, list) {
+		ib_unregister_event_handler(&priv->event_handler);
+
+		unregister_netdev(priv->dev);
+		ipoib_dev_cleanup(priv->dev);
+		free_netdev(priv->dev);
+	}
+}
+
+static int __init ipoib_init_module(void)
+{
+	int ret;
+
+	ret = ipoib_register_debugfs();
+	if (ret)
+		return ret;
+
+	/*
+	 * We create our own workqueue mainly because we want to be
+	 * able to flush it when devices are being removed.  We can't
+	 * use schedule_work()/flush_scheduled_work() because both
+	 * unregister_netdev() and linkwatch_event take the rtnl lock,
+	 * so flush_scheduled_work() can deadlock during device
+	 * removal.
+	 */
+	ipoib_workqueue = create_singlethread_workqueue("ipoib");
+	if (!ipoib_workqueue) {
+		ret = -ENOMEM;
+		goto err_fs;
+	}
+
+	ret = ib_register_client(&ipoib_client);
+	if (ret)
+		goto err_wq;
+
+	return 0;
+
+err_fs:
+	ipoib_unregister_debugfs();
+
+err_wq:
+	destroy_workqueue(ipoib_workqueue);
+
+	return ret;
+}
+
+static void __exit ipoib_cleanup_module(void)
+{
+	ipoib_unregister_debugfs();
+	ib_unregister_client(&ipoib_client);
+	destroy_workqueue(ipoib_workqueue);
+}
+
+module_init(ipoib_init_module);
+module_exit(ipoib_cleanup_module);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
new file mode 100644
index 000000000000..f46932dc81c9
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -0,0 +1,991 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_multicast.c 1362 2004-12-18 15:56:29Z roland $
+ */
+
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/delay.h>
+#include <linux/completion.h>
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+static int mcast_debug_level;
+
+module_param(mcast_debug_level, int, 0644);
+MODULE_PARM_DESC(mcast_debug_level,
+		 "Enable multicast debug tracing if > 0");
+#endif
+
+static DECLARE_MUTEX(mcast_mutex);
+
+/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
+struct ipoib_mcast {
+	struct ib_sa_mcmember_rec mcmember;
+	struct ipoib_ah          *ah;
+
+	struct rb_node    rb_node;
+	struct list_head  list;
+	struct completion done;
+
+	int                 query_id;
+	struct ib_sa_query *query;
+
+	unsigned long created;
+	unsigned long backoff;
+
+	unsigned long flags;
+	unsigned char logcount;
+
+	struct list_head  neigh_list;
+
+	struct sk_buff_head pkt_queue;
+
+	struct net_device *dev;
+};
+
+struct ipoib_mcast_iter {
+	struct net_device *dev;
+	union ib_gid       mgid;
+	unsigned long      created;
+	unsigned int       queuelen;
+	unsigned int       complete;
+	unsigned int       send_only;
+};
+
+static void ipoib_mcast_free(struct ipoib_mcast *mcast)
+{
+	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh *neigh, *tmp;
+	unsigned long flags;
+	LIST_HEAD(ah_list);
+	struct ipoib_ah *ah, *tah;
+
+	ipoib_dbg_mcast(netdev_priv(dev),
+			"deleting multicast group " IPOIB_GID_FMT "\n",
+			IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) {
+		if (neigh->ah)
+			list_add_tail(&neigh->ah->list, &ah_list);
+		*to_ipoib_neigh(neigh->neighbour) = NULL;
+		neigh->neighbour->ops->destructor = NULL;
+		kfree(neigh);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	list_for_each_entry_safe(ah, tah, &ah_list, list)
+		ipoib_put_ah(ah);
+
+	if (mcast->ah)
+		ipoib_put_ah(mcast->ah);
+
+	while (!skb_queue_empty(&mcast->pkt_queue)) {
+		struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
+
+		skb->dev = dev;
+		dev_kfree_skb_any(skb);
+	}
+
+	kfree(mcast);
+}
+
+static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
+					     int can_sleep)
+{
+	struct ipoib_mcast *mcast;
+
+	mcast = kmalloc(sizeof (*mcast), can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+	if (!mcast)
+		return NULL;
+
+	memset(mcast, 0, sizeof (*mcast));
+
+	init_completion(&mcast->done);
+
+	mcast->dev = dev;
+	mcast->created = jiffies;
+	mcast->backoff = HZ;
+	mcast->logcount = 0;
+
+	INIT_LIST_HEAD(&mcast->list);
+	INIT_LIST_HEAD(&mcast->neigh_list);
+	skb_queue_head_init(&mcast->pkt_queue);
+
+	mcast->ah    = NULL;
+	mcast->query = NULL;
+
+	return mcast;
+}
+
+static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, union ib_gid *mgid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node *n = priv->multicast_tree.rb_node;
+
+	while (n) {
+		struct ipoib_mcast *mcast;
+		int ret;
+
+		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+		ret = memcmp(mgid->raw, mcast->mcmember.mgid.raw,
+			     sizeof (union ib_gid));
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			return mcast;
+	}
+
+	return NULL;
+}
+
+static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL;
+
+	while (*n) {
+		struct ipoib_mcast *tmcast;
+		int ret;
+
+		pn = *n;
+		tmcast = rb_entry(pn, struct ipoib_mcast, rb_node);
+
+		ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw,
+			     sizeof (union ib_gid));
+		if (ret < 0)
+			n = &pn->rb_left;
+		else if (ret > 0)
+			n = &pn->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&mcast->rb_node, pn, n);
+	rb_insert_color(&mcast->rb_node, &priv->multicast_tree);
+
+	return 0;
+}
+
+static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
+				   struct ib_sa_mcmember_rec *mcmember)
+{
+	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+
+	mcast->mcmember = *mcmember;
+
+	/* Set the cached Q_Key before we attach if it's the broadcast group */
+	if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
+		    sizeof (union ib_gid))) {
+		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
+		priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+	}
+
+	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+		if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+			ipoib_warn(priv, "multicast group " IPOIB_GID_FMT
+				   " already attached\n",
+				   IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+			return 0;
+		}
+
+		ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
+					 &mcast->mcmember.mgid);
+		if (ret < 0) {
+			ipoib_warn(priv, "couldn't attach QP to multicast group "
+				   IPOIB_GID_FMT "\n",
+				   IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+			clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
+			return ret;
+		}
+	}
+
+	{
+		struct ib_ah_attr av = {
+			.dlid	       = be16_to_cpu(mcast->mcmember.mlid),
+			.port_num      = priv->port,
+			.sl	       = mcast->mcmember.sl,
+			.ah_flags      = IB_AH_GRH,
+			.grh	       = {
+				.flow_label    = be32_to_cpu(mcast->mcmember.flow_label),
+				.hop_limit     = mcast->mcmember.hop_limit,
+				.sgid_index    = 0,
+				.traffic_class = mcast->mcmember.traffic_class
+			}
+		};
+
+		av.grh.dgid = mcast->mcmember.mgid;
+
+		if (ib_sa_rate_enum_to_int(mcast->mcmember.rate) > 0)
+			av.static_rate = (2 * priv->local_rate -
+					  ib_sa_rate_enum_to_int(mcast->mcmember.rate) - 1) /
+				(priv->local_rate ? priv->local_rate : 1);
+
+		ipoib_dbg_mcast(priv, "static_rate %d for local port %dX, mcmember %dX\n",
+				av.static_rate, priv->local_rate,
+				ib_sa_rate_enum_to_int(mcast->mcmember.rate));
+
+		mcast->ah = ipoib_create_ah(dev, priv->pd, &av);
+		if (!mcast->ah) {
+			ipoib_warn(priv, "ib_address_create failed\n");
+		} else {
+			ipoib_dbg_mcast(priv, "MGID " IPOIB_GID_FMT
+					" AV %p, LID 0x%04x, SL %d\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid),
+					mcast->ah->ah,
+					be16_to_cpu(mcast->mcmember.mlid),
+					mcast->mcmember.sl);
+		}
+	}
+
+	/* actually send any queued packets */
+	while (!skb_queue_empty(&mcast->pkt_queue)) {
+		struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
+
+		skb->dev = dev;
+
+		if (!skb->dst || !skb->dst->neighbour) {
+			/* put pseudoheader back on for next time */
+			skb_push(skb, sizeof (struct ipoib_pseudoheader));
+		}
+
+		if (dev_queue_xmit(skb))
+			ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
+	}
+
+	return 0;
+}
+
+static void
+ipoib_mcast_sendonly_join_complete(int status,
+				   struct ib_sa_mcmember_rec *mcmember,
+				   void *mcast_ptr)
+{
+	struct ipoib_mcast *mcast = mcast_ptr;
+	struct net_device *dev = mcast->dev;
+
+	if (!status)
+		ipoib_mcast_join_finish(mcast, mcmember);
+	else {
+		if (mcast->logcount++ < 20)
+			ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for "
+					IPOIB_GID_FMT ", status %d\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid), status);
+
+		/* Flush out any queued packets */
+		while (!skb_queue_empty(&mcast->pkt_queue)) {
+			struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
+
+			skb->dev = dev;
+
+			dev_kfree_skb_any(skb);
+		}
+
+		/* Clear the busy flag so we try again */
+		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+	}
+
+	complete(&mcast->done);
+}
+
+static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
+{
+	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_sa_mcmember_rec rec = {
+#if 0				/* Some SMs don't support send-only yet */
+		.join_state = 4
+#else
+		.join_state = 1
+#endif
+	};
+	int ret = 0;
+
+	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+		ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
+		return -ENODEV;
+	}
+
+	if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
+		ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
+		return -EBUSY;
+	}
+
+	rec.mgid     = mcast->mcmember.mgid;
+	rec.port_gid = priv->local_gid;
+	rec.pkey     = be16_to_cpu(priv->pkey);
+
+	ret = ib_sa_mcmember_rec_set(priv->ca, priv->port, &rec,
+				     IB_SA_MCMEMBER_REC_MGID		|
+				     IB_SA_MCMEMBER_REC_PORT_GID	|
+				     IB_SA_MCMEMBER_REC_PKEY		|
+				     IB_SA_MCMEMBER_REC_JOIN_STATE,
+				     1000, GFP_ATOMIC,
+				     ipoib_mcast_sendonly_join_complete,
+				     mcast, &mcast->query);
+	if (ret < 0) {
+		ipoib_warn(priv, "ib_sa_mcmember_rec_set failed (ret = %d)\n",
+			   ret);
+	} else {
+		ipoib_dbg_mcast(priv, "no multicast record for " IPOIB_GID_FMT
+				", starting join\n",
+				IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+		mcast->query_id = ret;
+	}
+
+	return ret;
+}
+
+static void ipoib_mcast_join_complete(int status,
+				      struct ib_sa_mcmember_rec *mcmember,
+				      void *mcast_ptr)
+{
+	struct ipoib_mcast *mcast = mcast_ptr;
+	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg_mcast(priv, "join completion for " IPOIB_GID_FMT
+			" (status %d)\n",
+			IPOIB_GID_ARG(mcast->mcmember.mgid), status);
+
+	if (!status && !ipoib_mcast_join_finish(mcast, mcmember)) {
+		mcast->backoff = HZ;
+		down(&mcast_mutex);
+		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+			queue_work(ipoib_workqueue, &priv->mcast_task);
+		up(&mcast_mutex);
+		complete(&mcast->done);
+		return;
+	}
+
+	if (status == -EINTR) {
+		complete(&mcast->done);
+		return;
+	}
+
+	if (status && mcast->logcount++ < 20) {
+		if (status == -ETIMEDOUT || status == -EINTR) {
+			ipoib_dbg_mcast(priv, "multicast join failed for " IPOIB_GID_FMT
+					", status %d\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid),
+					status);
+		} else {
+			ipoib_warn(priv, "multicast join failed for "
+				   IPOIB_GID_FMT ", status %d\n",
+				   IPOIB_GID_ARG(mcast->mcmember.mgid),
+				   status);
+		}
+	}
+
+	mcast->backoff *= 2;
+	if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+		mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+	mcast->query = NULL;
+
+	down(&mcast_mutex);
+	if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) {
+		if (status == -ETIMEDOUT)
+			queue_work(ipoib_workqueue, &priv->mcast_task);
+		else
+			queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
+					   mcast->backoff * HZ);
+	} else
+		complete(&mcast->done);
+	up(&mcast_mutex);
+
+	return;
+}
+
+static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
+			     int create)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_sa_mcmember_rec rec = {
+		.join_state = 1
+	};
+	ib_sa_comp_mask comp_mask;
+	int ret = 0;
+
+	ipoib_dbg_mcast(priv, "joining MGID " IPOIB_GID_FMT "\n",
+			IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+	rec.mgid     = mcast->mcmember.mgid;
+	rec.port_gid = priv->local_gid;
+	rec.pkey     = be16_to_cpu(priv->pkey);
+
+	comp_mask =
+		IB_SA_MCMEMBER_REC_MGID		|
+		IB_SA_MCMEMBER_REC_PORT_GID	|
+		IB_SA_MCMEMBER_REC_PKEY		|
+		IB_SA_MCMEMBER_REC_JOIN_STATE;
+
+	if (create) {
+		comp_mask |=
+			IB_SA_MCMEMBER_REC_QKEY		|
+			IB_SA_MCMEMBER_REC_SL		|
+			IB_SA_MCMEMBER_REC_FLOW_LABEL	|
+			IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+
+		rec.qkey	  = priv->broadcast->mcmember.qkey;
+		rec.sl		  = priv->broadcast->mcmember.sl;
+		rec.flow_label	  = priv->broadcast->mcmember.flow_label;
+		rec.traffic_class = priv->broadcast->mcmember.traffic_class;
+	}
+
+	ret = ib_sa_mcmember_rec_set(priv->ca, priv->port, &rec, comp_mask,
+				     mcast->backoff * 1000, GFP_ATOMIC,
+				     ipoib_mcast_join_complete,
+				     mcast, &mcast->query);
+
+	if (ret < 0) {
+		ipoib_warn(priv, "ib_sa_mcmember_rec_set failed, status %d\n", ret);
+
+		mcast->backoff *= 2;
+		if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+			mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+		down(&mcast_mutex);
+		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+			queue_delayed_work(ipoib_workqueue,
+					   &priv->mcast_task,
+					   mcast->backoff);
+		up(&mcast_mutex);
+	} else
+		mcast->query_id = ret;
+}
+
+void ipoib_mcast_join_task(void *dev_ptr)
+{
+	struct net_device *dev = dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+		return;
+
+	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
+		ipoib_warn(priv, "ib_gid_entry_get() failed\n");
+	else
+		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+	{
+		struct ib_port_attr attr;
+
+		if (!ib_query_port(priv->ca, priv->port, &attr)) {
+			priv->local_lid  = attr.lid;
+			priv->local_rate = attr.active_speed *
+				ib_width_enum_to_int(attr.active_width);
+		} else
+			ipoib_warn(priv, "ib_query_port failed\n");
+	}
+
+	if (!priv->broadcast) {
+		priv->broadcast = ipoib_mcast_alloc(dev, 1);
+		if (!priv->broadcast) {
+			ipoib_warn(priv, "failed to allocate broadcast group\n");
+			down(&mcast_mutex);
+			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+				queue_delayed_work(ipoib_workqueue,
+						   &priv->mcast_task, HZ);
+			up(&mcast_mutex);
+			return;
+		}
+
+		memcpy(priv->broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
+		       sizeof (union ib_gid));
+
+		spin_lock_irq(&priv->lock);
+		__ipoib_mcast_add(dev, priv->broadcast);
+		spin_unlock_irq(&priv->lock);
+	}
+
+	if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+		ipoib_mcast_join(dev, priv->broadcast, 0);
+		return;
+	}
+
+	while (1) {
+		struct ipoib_mcast *mcast = NULL;
+
+		spin_lock_irq(&priv->lock);
+		list_for_each_entry(mcast, &priv->multicast_list, list) {
+			if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
+			    && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
+			    && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+				/* Found the next unjoined group */
+				break;
+			}
+		}
+		spin_unlock_irq(&priv->lock);
+
+		if (&mcast->list == &priv->multicast_list) {
+			/* All done */
+			break;
+		}
+
+		ipoib_mcast_join(dev, mcast, 1);
+		return;
+	}
+
+	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
+		IPOIB_ENCAP_LEN;
+	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+
+	ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
+
+	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+	netif_carrier_on(dev);
+}
+
+int ipoib_mcast_start_thread(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg_mcast(priv, "starting multicast thread\n");
+
+	down(&mcast_mutex);
+	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
+		queue_work(ipoib_workqueue, &priv->mcast_task);
+	up(&mcast_mutex);
+
+	return 0;
+}
+
+int ipoib_mcast_stop_thread(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_mcast *mcast;
+
+	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
+
+	down(&mcast_mutex);
+	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+	cancel_delayed_work(&priv->mcast_task);
+	up(&mcast_mutex);
+
+	flush_workqueue(ipoib_workqueue);
+
+	if (priv->broadcast && priv->broadcast->query) {
+		ib_sa_cancel_query(priv->broadcast->query_id, priv->broadcast->query);
+		priv->broadcast->query = NULL;
+		ipoib_dbg_mcast(priv, "waiting for bcast\n");
+		wait_for_completion(&priv->broadcast->done);
+	}
+
+	list_for_each_entry(mcast, &priv->multicast_list, list) {
+		if (mcast->query) {
+			ib_sa_cancel_query(mcast->query_id, mcast->query);
+			mcast->query = NULL;
+			ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid));
+			wait_for_completion(&mcast->done);
+		}
+	}
+
+	return 0;
+}
+
+static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_sa_mcmember_rec rec = {
+		.join_state = 1
+	};
+	int ret = 0;
+
+	if (!test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags))
+		return 0;
+
+	ipoib_dbg_mcast(priv, "leaving MGID " IPOIB_GID_FMT "\n",
+			IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+	rec.mgid     = mcast->mcmember.mgid;
+	rec.port_gid = priv->local_gid;
+	rec.pkey     = be16_to_cpu(priv->pkey);
+
+	/* Remove ourselves from the multicast group */
+	ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid),
+				 &mcast->mcmember.mgid);
+	if (ret)
+		ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret);
+
+	/*
+	 * Just make one shot at leaving and don't wait for a reply;
+	 * if we fail, too bad.
+	 */
+	ret = ib_sa_mcmember_rec_delete(priv->ca, priv->port, &rec,
+					IB_SA_MCMEMBER_REC_MGID		|
+					IB_SA_MCMEMBER_REC_PORT_GID	|
+					IB_SA_MCMEMBER_REC_PKEY		|
+					IB_SA_MCMEMBER_REC_JOIN_STATE,
+					0, GFP_ATOMIC, NULL,
+					mcast, &mcast->query);
+	if (ret < 0)
+		ipoib_warn(priv, "ib_sa_mcmember_rec_delete failed "
+			   "for leave (result = %d)\n", ret);
+
+	return 0;
+}
+
+void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid,
+		      struct sk_buff *skb)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_mcast *mcast;
+
+	/*
+	 * We can only be called from ipoib_start_xmit, so we're
+	 * inside tx_lock -- no need to save/restore flags.
+	 */
+	spin_lock(&priv->lock);
+
+	mcast = __ipoib_mcast_find(dev, mgid);
+	if (!mcast) {
+		/* Let's create a new send only group now */
+		ipoib_dbg_mcast(priv, "setting up send only multicast group for "
+				IPOIB_GID_FMT "\n", IPOIB_GID_ARG(*mgid));
+
+		mcast = ipoib_mcast_alloc(dev, 0);
+		if (!mcast) {
+			ipoib_warn(priv, "unable to allocate memory for "
+				   "multicast structure\n");
+			dev_kfree_skb_any(skb);
+			goto out;
+		}
+
+		set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
+		mcast->mcmember.mgid = *mgid;
+		__ipoib_mcast_add(dev, mcast);
+		list_add_tail(&mcast->list, &priv->multicast_list);
+	}
+
+	if (!mcast->ah) {
+		if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
+			skb_queue_tail(&mcast->pkt_queue, skb);
+		else
+			dev_kfree_skb_any(skb);
+
+		if (mcast->query)
+			ipoib_dbg_mcast(priv, "no address vector, "
+					"but multicast join already started\n");
+		else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+			ipoib_mcast_sendonly_join(mcast);
+
+		/*
+		 * If lookup completes between here and out:, don't
+		 * want to send packet twice.
+		 */
+		mcast = NULL;
+	}
+
+out:
+	if (mcast && mcast->ah) {
+		if (skb->dst            &&
+		    skb->dst->neighbour &&
+		    !*to_ipoib_neigh(skb->dst->neighbour)) {
+			struct ipoib_neigh *neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
+
+			if (neigh) {
+				kref_get(&mcast->ah->ref);
+				neigh->ah  	= mcast->ah;
+				neigh->neighbour = skb->dst->neighbour;
+				*to_ipoib_neigh(skb->dst->neighbour) = neigh;
+				list_add_tail(&neigh->list, &mcast->neigh_list);
+			}
+		}
+
+		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
+	}
+
+	spin_unlock(&priv->lock);
+}
+
+void ipoib_mcast_dev_flush(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	LIST_HEAD(remove_list);
+	struct ipoib_mcast *mcast, *tmcast, *nmcast;
+	unsigned long flags;
+
+	ipoib_dbg_mcast(priv, "flushing multicast list\n");
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+		nmcast = ipoib_mcast_alloc(dev, 0);
+		if (nmcast) {
+			nmcast->flags =
+				mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY);
+
+			nmcast->mcmember.mgid = mcast->mcmember.mgid;
+
+			/* Add the new group in before the to-be-destroyed group */
+			list_add_tail(&nmcast->list, &mcast->list);
+			list_del_init(&mcast->list);
+
+			rb_replace_node(&mcast->rb_node, &nmcast->rb_node,
+					&priv->multicast_tree);
+
+			list_add_tail(&mcast->list, &remove_list);
+		} else {
+			ipoib_warn(priv, "could not reallocate multicast group "
+				   IPOIB_GID_FMT "\n",
+				   IPOIB_GID_ARG(mcast->mcmember.mgid));
+		}
+	}
+
+	if (priv->broadcast) {
+		nmcast = ipoib_mcast_alloc(dev, 0);
+		if (nmcast) {
+			nmcast->mcmember.mgid = priv->broadcast->mcmember.mgid;
+
+			rb_replace_node(&priv->broadcast->rb_node,
+					&nmcast->rb_node,
+					&priv->multicast_tree);
+
+			list_add_tail(&priv->broadcast->list, &remove_list);
+		}
+
+		priv->broadcast = nmcast;
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+		ipoib_mcast_leave(dev, mcast);
+		ipoib_mcast_free(mcast);
+	}
+}
+
+void ipoib_mcast_dev_down(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned long flags;
+
+	/* Delete broadcast since it will be recreated */
+	if (priv->broadcast) {
+		ipoib_dbg_mcast(priv, "deleting broadcast group\n");
+
+		spin_lock_irqsave(&priv->lock, flags);
+		rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree);
+		spin_unlock_irqrestore(&priv->lock, flags);
+		ipoib_mcast_leave(dev, priv->broadcast);
+		ipoib_mcast_free(priv->broadcast);
+		priv->broadcast = NULL;
+	}
+}
+
+void ipoib_mcast_restart_task(void *dev_ptr)
+{
+	struct net_device *dev = dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct dev_mc_list *mclist;
+	struct ipoib_mcast *mcast, *tmcast;
+	LIST_HEAD(remove_list);
+	unsigned long flags;
+
+	ipoib_dbg_mcast(priv, "restarting multicast task\n");
+
+	ipoib_mcast_stop_thread(dev);
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	/*
+	 * Unfortunately, the networking core only gives us a list of all of
+	 * the multicast hardware addresses. We need to figure out which ones
+	 * are new and which ones have been removed
+	 */
+
+	/* Clear out the found flag */
+	list_for_each_entry(mcast, &priv->multicast_list, list)
+		clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+
+	/* Mark all of the entries that are found or don't exist */
+	for (mclist = dev->mc_list; mclist; mclist = mclist->next) {
+		union ib_gid mgid;
+
+		memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid);
+
+		/* Add in the P_Key */
+		mgid.raw[4] = (priv->pkey >> 8) & 0xff;
+		mgid.raw[5] = priv->pkey & 0xff;
+
+		mcast = __ipoib_mcast_find(dev, &mgid);
+		if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+			struct ipoib_mcast *nmcast;
+
+			/* Not found or send-only group, let's add a new entry */
+			ipoib_dbg_mcast(priv, "adding multicast entry for mgid "
+					IPOIB_GID_FMT "\n", IPOIB_GID_ARG(mgid));
+
+			nmcast = ipoib_mcast_alloc(dev, 0);
+			if (!nmcast) {
+				ipoib_warn(priv, "unable to allocate memory for multicast structure\n");
+				continue;
+			}
+
+			set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);
+
+			nmcast->mcmember.mgid = mgid;
+
+			if (mcast) {
+				/* Destroy the send only entry */
+				list_del(&mcast->list);
+				list_add_tail(&mcast->list, &remove_list);
+
+				rb_replace_node(&mcast->rb_node,
+						&nmcast->rb_node,
+						&priv->multicast_tree);
+			} else
+				__ipoib_mcast_add(dev, nmcast);
+
+			list_add_tail(&nmcast->list, &priv->multicast_list);
+		}
+
+		if (mcast)
+			set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+	}
+
+	/* Remove all of the entries don't exist anymore */
+	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+		if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
+		    !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+			ipoib_dbg_mcast(priv, "deleting multicast group " IPOIB_GID_FMT "\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+			rb_erase(&mcast->rb_node, &priv->multicast_tree);
+
+			/* Move to the remove list */
+			list_del(&mcast->list);
+			list_add_tail(&mcast->list, &remove_list);
+		}
+	}
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	/* We have to cancel outside of the spinlock */
+	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+		ipoib_mcast_leave(mcast->dev, mcast);
+		ipoib_mcast_free(mcast);
+	}
+
+	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		ipoib_mcast_start_thread(dev);
+}
+
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev)
+{
+	struct ipoib_mcast_iter *iter;
+
+	iter = kmalloc(sizeof *iter, GFP_KERNEL);
+	if (!iter)
+		return NULL;
+
+	iter->dev = dev;
+	memset(iter->mgid.raw, 0, sizeof iter->mgid);
+
+	if (ipoib_mcast_iter_next(iter)) {
+		ipoib_mcast_iter_free(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+void ipoib_mcast_iter_free(struct ipoib_mcast_iter *iter)
+{
+	kfree(iter);
+}
+
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
+	struct rb_node *n;
+	struct ipoib_mcast *mcast;
+	int ret = 1;
+
+	spin_lock_irq(&priv->lock);
+
+	n = rb_first(&priv->multicast_tree);
+
+	while (n) {
+		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+		if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw,
+			   sizeof (union ib_gid)) < 0) {
+			iter->mgid      = mcast->mcmember.mgid;
+			iter->created   = mcast->created;
+			iter->queuelen  = skb_queue_len(&mcast->pkt_queue);
+			iter->complete  = !!mcast->ah;
+			iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY));
+
+			ret = 0;
+
+			break;
+		}
+
+		n = rb_next(n);
+	}
+
+	spin_unlock_irq(&priv->lock);
+
+	return ret;
+}
+
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+			   union ib_gid *mgid,
+			   unsigned long *created,
+			   unsigned int *queuelen,
+			   unsigned int *complete,
+			   unsigned int *send_only)
+{
+	*mgid      = iter->mgid;
+	*created   = iter->created;
+	*queuelen  = iter->queuelen;
+	*complete  = iter->complete;
+	*send_only = iter->send_only;
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
new file mode 100644
index 000000000000..4933edf062c2
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <ib_cache.h>
+
+#include "ipoib.h"
+
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_attr *qp_attr;
+	int attr_mask;
+	int ret;
+	u16 pkey_index;
+
+	ret = -ENOMEM;
+	qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+	if (!qp_attr)
+		goto out;
+
+	if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+		ret = -ENXIO;
+		goto out;
+	}
+	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+	/* set correct QKey for QP */
+	qp_attr->qkey = priv->qkey;
+	attr_mask = IB_QP_QKEY;
+	ret = ib_modify_qp(priv->qp, qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
+		goto out;
+	}
+
+	/* attach QP to multicast group */
+	down(&priv->mcast_mutex);
+	ret = ib_attach_mcast(priv->qp, mgid, mlid);
+	up(&priv->mcast_mutex);
+	if (ret)
+		ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret);
+
+out:
+	kfree(qp_attr);
+	return ret;
+}
+
+int ipoib_mcast_detach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+
+	down(&priv->mcast_mutex);
+	ret = ib_detach_mcast(priv->qp, mgid, mlid);
+	up(&priv->mcast_mutex);
+	if (ret)
+		ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
+
+	return ret;
+}
+
+int ipoib_qp_create(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+	u16 pkey_index;
+	struct ib_qp_attr qp_attr;
+	int attr_mask;
+
+	/*
+	 * Search through the port P_Key table for the requested pkey value.
+	 * The port has to be assigned to the respective IB partition in
+	 * advance.
+	 */
+	ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index);
+	if (ret) {
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+		return ret;
+	}
+	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	qp_attr.qkey = 0;
+	qp_attr.port_num = priv->port;
+	qp_attr.pkey_index = pkey_index;
+	attr_mask =
+	    IB_QP_QKEY |
+	    IB_QP_PORT |
+	    IB_QP_PKEY_INDEX |
+	    IB_QP_STATE;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	/* Can't set this in a INIT->RTR transition */
+	attr_mask &= ~IB_QP_PORT;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	qp_attr.sq_psn = 0;
+	attr_mask |= IB_QP_SQ_PSN;
+	attr_mask &= ~IB_QP_PKEY_INDEX;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	return 0;
+
+out_fail:
+	ib_destroy_qp(priv->qp);
+	priv->qp = NULL;
+
+	return -EINVAL;
+}
+
+int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_init_attr init_attr = {
+		.cap = {
+			.max_send_wr  = IPOIB_TX_RING_SIZE,
+			.max_recv_wr  = IPOIB_RX_RING_SIZE,
+			.max_send_sge = 1,
+			.max_recv_sge = 1
+		},
+		.sq_sig_type = IB_SIGNAL_ALL_WR,
+		.qp_type     = IB_QPT_UD
+	};
+
+	priv->pd = ib_alloc_pd(priv->ca);
+	if (IS_ERR(priv->pd)) {
+		printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
+		return -ENODEV;
+	}
+
+	priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev,
+				IPOIB_TX_RING_SIZE + IPOIB_RX_RING_SIZE + 1);
+	if (IS_ERR(priv->cq)) {
+		printk(KERN_WARNING "%s: failed to create CQ\n", ca->name);
+		goto out_free_pd;
+	}
+
+	if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
+		goto out_free_cq;
+
+	priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(priv->mr)) {
+		printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
+		goto out_free_cq;
+	}
+
+	init_attr.send_cq = priv->cq;
+	init_attr.recv_cq = priv->cq,
+
+	priv->qp = ib_create_qp(priv->pd, &init_attr);
+	if (IS_ERR(priv->qp)) {
+		printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
+		goto out_free_mr;
+	}
+
+	priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
+	priv->dev->dev_addr[2] = (priv->qp->qp_num >>  8) & 0xff;
+	priv->dev->dev_addr[3] = (priv->qp->qp_num      ) & 0xff;
+
+	priv->tx_sge.lkey 	= priv->mr->lkey;
+
+	priv->tx_wr.opcode 	= IB_WR_SEND;
+	priv->tx_wr.sg_list 	= &priv->tx_sge;
+	priv->tx_wr.num_sge 	= 1;
+	priv->tx_wr.send_flags 	= IB_SEND_SIGNALED;
+
+	return 0;
+
+out_free_mr:
+	ib_dereg_mr(priv->mr);
+
+out_free_cq:
+	ib_destroy_cq(priv->cq);
+
+out_free_pd:
+	ib_dealloc_pd(priv->pd);
+	return -ENODEV;
+}
+
+void ipoib_transport_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (priv->qp) {
+		if (ib_destroy_qp(priv->qp))
+			ipoib_warn(priv, "ib_qp_destroy failed\n");
+
+		priv->qp = NULL;
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	}
+
+	if (ib_dereg_mr(priv->mr))
+		ipoib_warn(priv, "ib_dereg_mr failed\n");
+
+	if (ib_destroy_cq(priv->cq))
+		ipoib_warn(priv, "ib_cq_destroy failed\n");
+
+	if (ib_dealloc_pd(priv->pd))
+		ipoib_warn(priv, "ib_dealloc_pd failed\n");
+}
+
+void ipoib_event(struct ib_event_handler *handler,
+		 struct ib_event *record)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(handler, struct ipoib_dev_priv, event_handler);
+
+	if (record->event == IB_EVENT_PORT_ACTIVE ||
+	    record->event == IB_EVENT_LID_CHANGE  ||
+	    record->event == IB_EVENT_SM_CHANGE) {
+		ipoib_dbg(priv, "Port active event\n");
+		schedule_work(&priv->flush_task);
+	}
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
new file mode 100644
index 000000000000..94b8ea812fef
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_vlan.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+
+#include <asm/uaccess.h>
+
+#include "ipoib.h"
+
+static ssize_t show_parent(struct class_device *class_dev, char *buf)
+{
+	struct net_device *dev =
+		container_of(class_dev, struct net_device, class_dev);
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	return sprintf(buf, "%s\n", priv->parent->name);
+}
+static CLASS_DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL);
+
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
+{
+	struct ipoib_dev_priv *ppriv, *priv;
+	char intf_name[IFNAMSIZ];
+	int result;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	ppriv = netdev_priv(pdev);
+
+	down(&ppriv->vlan_mutex);
+
+	/*
+	 * First ensure this isn't a duplicate. We check the parent device and
+	 * then all of the child interfaces to make sure the Pkey doesn't match.
+	 */
+	if (ppriv->pkey == pkey) {
+		result = -ENOTUNIQ;
+		goto err;
+	}
+
+	list_for_each_entry(priv, &ppriv->child_intfs, list) {
+		if (priv->pkey == pkey) {
+			result = -ENOTUNIQ;
+			goto err;
+		}
+	}
+
+	snprintf(intf_name, sizeof intf_name, "%s.%04x",
+		 ppriv->dev->name, pkey);
+	priv = ipoib_intf_alloc(intf_name);
+	if (!priv) {
+		result = -ENOMEM;
+		goto err;
+	}
+
+	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
+
+	priv->pkey = pkey;
+
+	memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
+	priv->dev->broadcast[8] = pkey >> 8;
+	priv->dev->broadcast[9] = pkey & 0xff;
+
+	result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port);
+	if (result < 0) {
+		ipoib_warn(ppriv, "failed to initialize subinterface: "
+			   "device %s, port %d",
+			   ppriv->ca->name, ppriv->port);
+		goto device_init_failed;
+	}
+
+	result = register_netdev(priv->dev);
+	if (result) {
+		ipoib_warn(priv, "failed to initialize; error %i", result);
+		goto register_failed;
+	}
+
+	priv->parent = ppriv->dev;
+
+	if (ipoib_create_debug_file(priv->dev))
+		goto debug_failed;
+
+	if (ipoib_add_pkey_attr(priv->dev))
+		goto sysfs_failed;
+
+	if (class_device_create_file(&priv->dev->class_dev,
+				     &class_device_attr_parent))
+		goto sysfs_failed;
+
+	list_add_tail(&priv->list, &ppriv->child_intfs);
+
+	up(&ppriv->vlan_mutex);
+
+	return 0;
+
+sysfs_failed:
+	ipoib_delete_debug_file(priv->dev);
+
+debug_failed:
+	unregister_netdev(priv->dev);
+
+register_failed:
+	ipoib_dev_cleanup(priv->dev);
+
+device_init_failed:
+	free_netdev(priv->dev);
+
+err:
+	up(&ppriv->vlan_mutex);
+	return result;
+}
+
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
+{
+	struct ipoib_dev_priv *ppriv, *priv, *tpriv;
+	int ret = -ENOENT;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	ppriv = netdev_priv(pdev);
+
+	down(&ppriv->vlan_mutex);
+	list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) {
+		if (priv->pkey == pkey) {
+			unregister_netdev(priv->dev);
+			ipoib_dev_cleanup(priv->dev);
+
+			list_del(&priv->list);
+
+			kfree(priv);
+
+			ret = 0;
+			break;
+		}
+	}
+	up(&ppriv->vlan_mutex);
+
+	return ret;
+}